def process_classification_dataset(name):
    # Convert categorical features to numerical values with leave-one-out encoding.
    data_dir = os.path.join('datasets', name)
    train_file = os.path.join(data_dir, 'full_train')
    test_file = os.path.join(data_dir, 'test')
    cd_file = os.path.join(data_dir, 'pool.cd')
    train = np.loadtxt(train_file, delimiter="\t", dtype="object")
    test = np.loadtxt(test_file, delimiter="\t", dtype="object")
    cd = read_cd(cd_file, data_file=train_file)

    # Target column can be called 'Label' or 'Target' in pool.cd
    try:
        label_ind = cd['column_type_to_indices']['Label']
    except KeyError:
        label_ind = cd['column_type_to_indices']['Target']

    np.random.seed(42)  # fix random seed
    train = np.random.permutation(train)

    y_train = train[:, label_ind].reshape(-1)
    y_test = test[:, label_ind].reshape(-1)

    cat_features = cd['column_type_to_indices']['Categ']  # features to be replaced
    enc = LeaveOneOutEncoder(cols=cat_features, return_df=False,
                             random_state=10, sigma=0.3)
    transformed_train = enc.fit_transform(train, y_train).astype("float64")
    X_train = np.delete(transformed_train, label_ind, 1)  # remove target column
    transformed_test = enc.transform(test).astype("float64")
    X_test = np.delete(transformed_test, label_ind, 1)  # remove target column
    return np.nan_to_num(X_train), y_train, np.nan_to_num(X_test), y_test, enc
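# Hedged usage sketch (not part of the original pipeline): the encoded arrays
# returned above are exactly what the "rf" branch of aggregate_results() consumes.
# The helper name `train_rf_baseline` and the forest hyperparameters below are
# illustrative assumptions, not the repository's API.
def train_rf_baseline(name, n_estimators=100):
    from sklearn.ensemble import RandomForestClassifier
    X_train, y_train, X_test, y_test, _ = process_classification_dataset(name)
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    clf.fit(X_train, y_train)
    # predict_proba gives the class probabilities used for the entropy-based
    # uncertainty measures downstream.
    return clf, clf.predict_proba(X_test), y_test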
def aggregate_results(name, modes=["single", "ens", "virt"],
                      algorithms=['sgb-fixed', 'sglb-fixed'], num_models=10):
    results = []  # metric values for all algorithms and all folds
    for mode in modes:
        for alg in algorithms:
            if alg == "rf":
                # Random forest works on the leave-one-out encoded numpy arrays.
                train_pool, y_train, test_pool, y_test, enc = \
                    process_classification_dataset(name)
                # Process the out-of-domain (OOD) data with the same encoder.
                cd = read_cd("datasets/" + name + "/pool.cd",
                             data_file="datasets/" + name + "/test")
                try:
                    label_ind = cd['column_type_to_indices']['Label']
                except KeyError:
                    label_ind = cd['column_type_to_indices']['Target']
                ood_test_pool = np.loadtxt("datasets/ood/" + name,
                                           delimiter="\t", dtype="object")
                ood_test_pool = enc.transform(ood_test_pool).astype("float64")
                ood_test_pool = np.delete(ood_test_pool, label_ind, 1)
                ood_size = len(ood_test_pool)
            else:
                test_pool = Pool(data="datasets/" + name + "/test",
                                 column_description="datasets/" + name + "/pool.cd")
                ood_test_pool = Pool(data="datasets/ood/" + name,
                                     column_description="datasets/" + name + "/pool.cd")
                ood_size = ood_test_pool.num_row()
                y_test = test_pool.get_label()

            test_size = len(y_test)
            domain_labels = np.concatenate([np.zeros(test_size), np.ones(ood_size)])
            y_test_norm = normalize_test_labels(y_test)
            values = defaultdict()  # metric values for all folds for given algorithm

            if mode == "single":
                # Use the 0th model from the ensemble as a single model.
                model = load_model(name, alg, 0)
                preds = model.predict(test_pool)
                preds_proba = model.predict_proba(test_pool)
                values["error"] = (preds != y_test).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)
                values["TU_prr"] = prr_class(y_test_norm, preds_proba,
                                             entropy(preds_proba), False)
                # A single model cannot separate knowledge uncertainty.
                values["KU_prr"] = float("nan")
                values["KU_auc"] = float("nan")
                ood_preds_proba = model.predict_proba(ood_test_pool)
                in_measure = entropy(preds_proba)
                out_measure = entropy(ood_preds_proba)
                values["TU_auc"] = ood_detect(domain_labels, in_measure,
                                              out_measure, mode="ROC")

            if mode == "ens":
                all_preds = []  # predictions of all models in ensemble
                all_preds_ood = []
                for i in range(num_models):
                    model = load_model(name, alg, i)
                    all_preds.append(model.predict_proba(test_pool))
                    all_preds_ood.append(model.predict_proba(ood_test_pool))
                all_preds = np.array(all_preds)
                preds_proba = np.mean(all_preds, axis=0)
                all_preds_ood = np.array(all_preds_ood)
                preds = np.argmax(preds_proba, axis=1)
                values["error"] = (preds != y_test_norm).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)
                # Total uncertainty (TU), data uncertainty (DU) and
                # knowledge uncertainty (KU = TU - DU).
                TU = entropy_of_expected_class(all_preds)
                DU = expected_entropy_class(all_preds)
                KU = TU - DU
                TU_ood = entropy_of_expected_class(all_preds_ood)
                DU_ood = expected_entropy_class(all_preds_ood)
                KU_ood = TU_ood - DU_ood
                values["TU_prr"] = prr_class(y_test_norm, preds_proba, TU, False)
                values["KU_prr"] = prr_class(y_test_norm, preds_proba, KU, False)
                values["TU_auc"] = ood_detect(domain_labels, TU, TU_ood, mode="ROC")
                values["KU_auc"] = ood_detect(domain_labels, KU, KU_ood, mode="ROC")

            if mode == "virt":
                if alg in ["sgb", "sgb-fixed"]:
                    # we do not evaluate the virtual sgb model
                    continue
                # Generate a virtual ensemble from the 0th model.
                model = load_model(name, alg, 0)
                all_preds = virtual_ensembles_predict(test_pool, model, alg)
                preds_proba = np.mean(all_preds, axis=0)
                preds = np.argmax(preds_proba, axis=1)
                values["error"] = (preds != y_test_norm).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)
                TU = entropy_of_expected_class(all_preds)
                DU = expected_entropy_class(all_preds)
                KU = TU - DU
                all_preds_ood = virtual_ensembles_predict(ood_test_pool, model, alg)
                TU_ood = entropy_of_expected_class(all_preds_ood)
                DU_ood = expected_entropy_class(all_preds_ood)
                KU_ood = TU_ood - DU_ood
                values["TU_prr"] = prr_class(y_test_norm, preds_proba, TU, False)
                values["KU_prr"] = prr_class(y_test_norm, preds_proba, KU, False)
                values["TU_auc"] = ood_detect(domain_labels, TU, TU_ood, mode="ROC")
                values["KU_auc"] = ood_detect(domain_labels, KU, KU_ood, mode="ROC")

            if mode == "virt" and alg in ["sgb", "sgb-fixed"]:
                # we do not evaluate the virtual sgb model
                continue
            results.append(values)
    return np.array(results)
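# Hedged usage sketch: aggregate_results() returns a numpy array of dicts, one
# per (mode, algorithm) combination, whose "error" and "nll" entries are
# per-instance arrays and whose PRR/AUC entries are scalars. The helper name
# `summarize_results` is an illustrative assumption, not the repository's API.
def summarize_results(results):
    summary = []
    for values in results:
        summary.append({
            "error_rate": float(np.mean(values["error"])),
            "nll": float(np.mean(values["nll"])),
            "TU_prr": values["TU_prr"],
            "KU_prr": values["KU_prr"],
            "TU_auc": values["TU_auc"],
            "KU_auc": values["KU_auc"],
        })
    return summary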
def load_pool_features_as_df(pool_file, cd_file):
    columns_metadata = read_cd(cd_file, data_file=pool_file,
                               canonize_column_types=True)
    data = load_dataset_as_dataframe(pool_file, columns_metadata)
    return data['features'], columns_metadata['cat_feature_indices']
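# Hedged usage sketch: a minimal driver, assuming a dataset layout of
# datasets/<name>/{full_train,test,pool.cd} plus datasets/ood/<name>. The
# dataset name "adult" is only an illustrative placeholder.
if __name__ == "__main__":
    features_df, cat_idx = load_pool_features_as_df("datasets/adult/test",
                                                    "datasets/adult/pool.cd")
    print("features:", features_df.shape, "categorical columns:", cat_idx)
    results = aggregate_results("adult")
    print("collected", len(results), "metric dictionaries")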