def process_classification_dataset(name):
    """Load a classification dataset and encode categorical features numerically.

    Reads ``datasets/<name>/full_train`` and ``datasets/<name>/test`` as
    tab-separated text, plus the column description ``datasets/<name>/pool.cd``.
    The training rows are shuffled with a fixed seed, the categorical columns
    are replaced by a ``LeaveOneOutEncoder`` fitted on the training data, and
    the target column is dropped from both feature matrices.

    Side effect: reseeds NumPy's global random generator with 42.

    Args:
        name: Name of the dataset directory under ``datasets``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, enc)``. ``X_train`` and
        ``X_test`` are float64 arrays passed through ``np.nan_to_num``;
        ``y_train`` and ``y_test`` are flattened label columns exactly as
        loaded from the files (object dtype, not converted); ``enc`` is the
        fitted encoder.

    Raises:
        KeyError: If pool.cd declares neither a 'Label' nor a 'Target'
            column, or has no 'Categ' columns.
    """
    data_dir = os.path.join('datasets', name)
    train_file = os.path.join(data_dir, 'full_train')
    test_file = os.path.join(data_dir, 'test')
    cd_file = os.path.join(data_dir, 'pool.cd')

    # Everything is loaded as objects so categorical values survive parsing.
    train = np.loadtxt(train_file, delimiter="\t", dtype="object")
    test = np.loadtxt(test_file, delimiter="\t", dtype="object")
    cd = read_cd(cd_file, data_file=train_file)
    column_types = cd['column_type_to_indices']

    # Target can be called 'Label' or 'Target' in pool.cd. Only a missing key
    # triggers the fallback; any other error is a real failure and propagates.
    try:
        label_ind = column_types['Label']
    except KeyError:
        label_ind = column_types['Target']

    np.random.seed(42)  # fix random seed so the shuffle is reproducible
    train = np.random.permutation(train)

    y_train = train[:, label_ind].reshape(-1)
    y_test = test[:, label_ind].reshape(-1)

    cat_features = column_types['Categ']  # features to be replaced

    enc = LeaveOneOutEncoder(cols=cat_features,
                             return_df=False,
                             random_state=10,
                             sigma=0.3)

    # The encoder sees the full matrix (target column included); the target
    # column is removed only after encoding.
    transformed_train = enc.fit_transform(train, y_train).astype("float64")
    X_train = np.delete(transformed_train, label_ind, 1)

    transformed_test = enc.transform(test).astype("float64")
    X_test = np.delete(transformed_test, label_ind, 1)

    return np.nan_to_num(X_train), y_train, np.nan_to_num(X_test), y_test, enc
Exemple #2
0
def _load_evaluation_data(name, alg):
    """Load the in-domain test set and the out-of-domain (OOD) set for ``alg``.

    For ``"rf"`` both sets are float arrays encoded with the encoder returned
    by ``process_classification_dataset``; for every other algorithm they are
    ``Pool`` objects built from the raw files.

    Returns:
        Tuple ``(test_pool, ood_test_pool, ood_size, y_test)``.
    """
    cd_path = "datasets/" + name + "/pool.cd"
    test_path = "datasets/" + name + "/test"
    ood_path = "datasets/ood/" + name

    if alg == "rf":
        _, _, test_pool, y_test, enc = process_classification_dataset(name)

        # process ood data
        cd = read_cd(cd_path, data_file=test_path)
        column_types = cd['column_type_to_indices']
        # Target can be called 'Label' or 'Target' in pool.cd.
        try:
            label_ind = column_types['Label']
        except KeyError:
            label_ind = column_types['Target']

        ood_test_pool = np.loadtxt(ood_path, delimiter="\t", dtype="object")
        ood_test_pool = enc.transform(ood_test_pool).astype("float64")
        # NOTE(review): unlike the in-domain test features, the OOD features
        # are not passed through np.nan_to_num -- confirm this is intended.
        ood_test_pool = np.delete(ood_test_pool, label_ind, 1)
        return test_pool, ood_test_pool, len(ood_test_pool), y_test

    test_pool = Pool(data=test_path, column_description=cd_path)
    ood_test_pool = Pool(data=ood_path, column_description=cd_path)
    return (test_pool, ood_test_pool, ood_test_pool.num_row(),
            test_pool.get_label())


def _fill_ensemble_metrics(values, all_preds, all_preds_ood, y_test_norm,
                           domain_labels):
    """Store error, NLL, PRR and OOD-AUC metrics of an ensemble in ``values``.

    ``all_preds`` / ``all_preds_ood`` hold the per-member class probabilities
    for the in-domain and OOD sets; they are averaged over axis 0 to get the
    ensemble prediction.
    """
    preds_proba = np.mean(all_preds, axis=0)
    preds = np.argmax(preds_proba, axis=1)
    values["error"] = (preds != y_test_norm).astype(int)
    values["nll"] = nll_class(y_test_norm, preds_proba)

    # Knowledge uncertainty is total uncertainty minus data uncertainty.
    TU = entropy_of_expected_class(all_preds)
    DU = expected_entropy_class(all_preds)
    KU = TU - DU

    TU_ood = entropy_of_expected_class(all_preds_ood)
    DU_ood = expected_entropy_class(all_preds_ood)
    KU_ood = TU_ood - DU_ood

    values["TU_prr"] = prr_class(y_test_norm, preds_proba, TU, False)
    values["KU_prr"] = prr_class(y_test_norm, preds_proba, KU, False)

    values["TU_auc"] = ood_detect(domain_labels, TU, TU_ood, mode="ROC")
    values["KU_auc"] = ood_detect(domain_labels, KU, KU_ood, mode="ROC")


def aggregate_results(name,
                      modes=("single", "ens", "virt"),
                      algorithms=('sgb-fixed', 'sglb-fixed'),
                      num_models=10):
    """Compute evaluation metrics for every (mode, algorithm) pair on a dataset.

    Args:
        name: Dataset name; data is read from ``datasets/<name>/`` and the
            out-of-domain set from ``datasets/ood/<name>``.
        modes: Evaluation modes to run. ``"single"`` uses model 0 alone,
            ``"ens"`` averages ``num_models`` models, ``"virt"`` builds a
            virtual ensemble from model 0. Any other value yields an empty
            result entry.
        algorithms: Algorithm names passed to ``load_model``. ``"rf"`` is
            evaluated on encoded arrays, everything else on ``Pool`` objects.
        num_models: Number of ensemble members loaded in ``"ens"`` mode.

    Returns:
        ``np.ndarray`` built from one mapping per evaluated pair, in loop
        order (modes outer, algorithms inner). Each mapping has the keys
        ``"error"``, ``"nll"``, ``"TU_prr"``, ``"KU_prr"``, ``"TU_auc"`` and
        ``"KU_auc"``. The ``"virt"`` mode is skipped for ``"sgb"`` and
        ``"sgb-fixed"``, so those pairs contribute no entry.
    """
    results = []  # metric values for all algorithms and all modes

    for mode in modes:
        for alg in algorithms:
            # We do not evaluate virtual sgb models; skip before loading any
            # data so the skipped pair costs nothing.
            if mode == "virt" and alg in ("sgb", "sgb-fixed"):
                continue

            # Data is reloaded for every pair on purpose: the "rf" path
            # reseeds NumPy's global RNG, and caching would change that.
            test_pool, ood_test_pool, ood_size, y_test = _load_evaluation_data(
                name, alg)

            test_size = len(y_test)
            # 0 marks in-domain rows, 1 marks out-of-domain rows.
            domain_labels = np.concatenate(
                [np.zeros(test_size), np.ones(ood_size)])

            y_test_norm = normalize_test_labels(y_test)

            # No default factory is given, so this behaves like a plain dict.
            values = defaultdict()

            if mode == "single":
                # use 0th model from ensemble as a single model
                model = load_model(name, alg, 0)
                preds = model.predict(test_pool)
                preds_proba = model.predict_proba(test_pool)
                in_measure = entropy(preds_proba)

                # NOTE(review): compared against the raw labels, not
                # y_test_norm -- presumably predict() returns original
                # labels; confirm.
                values["error"] = (preds != y_test).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)
                values["TU_prr"] = prr_class(y_test_norm, preds_proba,
                                             in_measure, False)
                # A single model has no knowledge-uncertainty estimate.
                values["KU_prr"] = float("nan")
                values["KU_auc"] = float("nan")

                ood_preds_proba = model.predict_proba(ood_test_pool)
                out_measure = entropy(ood_preds_proba)
                values["TU_auc"] = ood_detect(domain_labels,
                                              in_measure,
                                              out_measure,
                                              mode="ROC")

            elif mode == "ens":
                all_preds = []  # predictions of all models in ensemble
                all_preds_ood = []

                for i in range(num_models):
                    model = load_model(name, alg, i)
                    all_preds.append(model.predict_proba(test_pool))
                    all_preds_ood.append(model.predict_proba(ood_test_pool))

                _fill_ensemble_metrics(values, np.array(all_preds),
                                       np.array(all_preds_ood), y_test_norm,
                                       domain_labels)

            elif mode == "virt":
                # generate virtual ensemble from 0th model
                model = load_model(name, alg, 0)

                all_preds = virtual_ensembles_predict(test_pool, model, alg)
                all_preds_ood = virtual_ensembles_predict(
                    ood_test_pool, model, alg)

                _fill_ensemble_metrics(values, all_preds, all_preds_ood,
                                       y_test_norm, domain_labels)

            results.append(values)

    return np.array(results)
Exemple #3
0
def load_pool_features_as_df(pool_file, cd_file):
    """Load the features of a pool file together with its categorical indices.

    The column description ``cd_file`` is parsed with ``read_cd`` (column
    types canonized) and handed to ``load_dataset_as_dataframe`` to read
    ``pool_file``.

    Args:
        pool_file: Path to the dataset file.
        cd_file: Path to the column description file for ``pool_file``.

    Returns:
        Tuple ``(features, cat_feature_indices)``: the ``'features'`` entry of
        the loaded dataset (presumably a DataFrame, going by the loader's
        name) and the ``'cat_feature_indices'`` entry of the parsed column
        description.
    """
    cd_info = read_cd(cd_file,
                      data_file=pool_file,
                      canonize_column_types=True)
    dataset = load_dataset_as_dataframe(pool_file, cd_info)

    features = dataset['features']
    cat_feature_indices = cd_info['cat_feature_indices']
    return (features, cat_feature_indices)