def datasets_table():
    """Print one line per valid crossvalidation dataset: '<name> <rows>x<cols>'.

    Fix over the original: the dataset was validated (and therefore loaded)
    twice per file — once in the filtering list comprehension and once inside
    the loop. Validate each file exactly once.
    """
    files = experiments_helper.find_crossvalidation_datasets()
    for file in files:
        validated = experiments_helper.validate_dataset(file, [2, 3, 5, 10])
        if validated is None:
            continue
        dataset, _ = validated
        # Dataset name is the folder that contains the file.
        basefolder = os.path.basename(os.path.dirname(file))
        print(basefolder + " " + str(dataset.shape[0]) + "x" + str(dataset.shape[1]))
def test_crossvalidation():
    """Evaluate hill-climbing GBN models on every valid dataset.

    For each dataset, accumulate the validation / BIC / BGe log-likelihood
    scores over the evaluation folds and write one CSV row per dataset to
    results_hc_gbn.csv.
    """
    files = experiments_helper.find_crossvalidation_datasets()
    folds = experiments_helper.TRAINING_FOLDS
    patience = experiments_helper.PATIENCE

    header_cols = ["GBN_Validation_" + str(f) + "_" + str(p)
                   for f in folds for p in patience]
    string_file = "Dataset," + ','.join(header_cols) + ",BIC,BGe"
    print(string_file)

    for file in files:
        validated = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        if validated is None:
            continue
        dataset, result_folder = validated

        n_eval = experiments_helper.EVALUATION_FOLDS
        # Per-fold scores; NaN marks folds that were never filled.
        validation_gaussian = np.full((n_eval, len(folds), len(patience)), np.nan)
        bic_gaussian = np.full((n_eval,), np.nan)
        bge_gaussian = np.full((n_eval,), np.nan)

        splitter = KFold(n_eval, shuffle=True,
                         random_state=experiments_helper.SEED)
        for idx_fold, (train_indices, test_indices) in enumerate(
                splitter.split(dataset)):
            train_dataset = dataset.iloc[train_indices, :]
            test_dataset = dataset.iloc[test_indices, :]
            validation_gaussian[idx_fold] = test_validation_gaussian(
                train_dataset, test_dataset, folds, patience,
                result_folder, idx_fold)
            bic_gaussian[idx_fold] = test_bic_gaussian(
                train_dataset, test_dataset, result_folder, idx_fold)
            bge_gaussian[idx_fold] = test_bge_gaussian(
                train_dataset, test_dataset, result_folder, idx_fold)

        # Aggregate across evaluation folds.
        sum_validation_gaussian = validation_gaussian.sum(axis=0)
        sum_bic = bic_gaussian.sum(axis=0)
        sum_bge = bge_gaussian.sum(axis=0)

        basefolder = os.path.basename(os.path.dirname(file))
        new_line = basefolder
        for idx_f in range(len(folds)):
            for idx_p in range(len(patience)):
                new_line += "," + str(sum_validation_gaussian[idx_f, idx_p])
        new_line += "," + str(sum_bic)
        new_line += "," + str(sum_bge)
        print(new_line)
        string_file += '\n' + new_line

    with open('results_hc_gbn.csv', 'w') as f:
        f.write(string_file)
def train_crossvalidation_file(file, folds, patience):
    """Train validation / BIC / BGe Gaussian models for one dataset file,
    parallelizing across the evaluation folds with a process pool.

    Parameters
    ----------
    file : path of the dataset to train on.
    folds : training fold counts forwarded to run_validation_gaussian.
    patience : patience values forwarded to run_validation_gaussian.

    Fix over the original: the BIC and BGe KFold splits used
    ``random_state=0`` while the validation split used
    ``experiments_helper.SEED``.  The evaluation code uses SEED for all three,
    so if SEED != 0 the BIC/BGe models were trained on different folds than
    they were later evaluated on.  All three trainings now share the same
    SEED-based splits, computed once.
    """
    x = experiments_helper.validate_dataset(file, folds)
    if x is None:
        return
    dataset, result_folder = x

    if not os.path.exists(result_folder):
        os.mkdir(result_folder)

    print(file)

    # One shared set of (fold index, train indices) pairs for all trainings.
    kf = KFold(experiments_helper.EVALUATION_FOLDS, shuffle=True,
               random_state=experiments_helper.SEED)
    splits = [(idx_fold, train_indices)
              for idx_fold, (train_indices, _) in enumerate(kf.split(dataset))]

    with mp.Pool(processes=experiments_helper.EVALUATION_FOLDS) as p:
        p.starmap(run_validation_gaussian,
                  [(dataset.iloc[train_indices, :], folds, patience,
                    result_folder, idx_fold)
                   for idx_fold, train_indices in splits])

    with mp.Pool(processes=experiments_helper.EVALUATION_FOLDS) as p:
        p.starmap(run_bic_gaussian,
                  [(dataset.iloc[train_indices, :], result_folder, idx_fold)
                   for idx_fold, train_indices in splits])

    with mp.Pool(processes=experiments_helper.EVALUATION_FOLDS) as p:
        p.starmap(run_bge_gaussian,
                  [(dataset.iloc[train_indices, :], result_folder, idx_fold)
                   for idx_fold, train_indices in splits])
def print_datasets():
    """Print one LaTeX table row per valid dataset: name & instances & variables."""
    for path in experiments_helper.find_crossvalidation_datasets():
        basefolder = os.path.basename(os.path.dirname(path))
        validated = experiments_helper.validate_dataset(
            path, experiments_helper.TRAINING_FOLDS)
        if validated is None:
            continue
        df, _ = validated
        print("{} & {} & {}\\\\".format(basefolder, df.shape[0], df.shape[1]))
def num_instances_df():
    """Return a DataFrame with one row per valid dataset.

    Columns: "Dataset" (result-folder basename) and "N" (number of instances).
    """
    rows = []
    for file in experiments_helper.find_crossvalidation_datasets():
        validated = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        if validated is None:
            continue
        dataset, result_folder = validated
        rows.append((os.path.basename(result_folder), dataset.shape[0]))
    return pd.DataFrame({"Dataset": [name for name, _ in rows],
                         "N": [n for _, n in rows]})
def test_crossvalidation():
    """Evaluate PC-learned SPBN-CKDE models (LinearCorrelation and RCoT
    independence tests) on every valid dataset and write the aggregated
    scores to results_pc_spbn_ckde.csv."""
    files = experiments_helper.find_crossvalidation_datasets()
    folds = experiments_helper.TRAINING_FOLDS
    patience = experiments_helper.PATIENCE

    lc_cols = ["SPBN_CKDE_PC_LC_" + str(f) + "_" + str(p)
               for f in folds for p in patience]
    rcot_cols = ["SPBN_CKDE_PC_RCOT_" + str(f) + "_" + str(p)
                 for f in folds for p in patience]
    string_file = "Dataset," + ','.join(lc_cols) + "," + ','.join(rcot_cols)
    print(string_file)

    for file in files:
        validated = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        if validated is None:
            continue
        dataset, result_folder = validated

        n_eval = experiments_helper.EVALUATION_FOLDS
        shape = (n_eval, len(folds), len(patience))
        spbn_lc_score = np.full(shape, np.nan)
        spbn_rcot_score = np.full(shape, np.nan)

        splitter = KFold(n_eval, shuffle=True,
                         random_state=experiments_helper.SEED)
        for idx_fold, (train_indices, test_indices) in enumerate(
                splitter.split(dataset)):
            train_dataset = dataset.iloc[train_indices, :]
            test_dataset = dataset.iloc[test_indices, :]
            spbn_lc_score[idx_fold] = test_pc_lc_spbn(
                train_dataset, test_dataset, folds, patience,
                result_folder, idx_fold)
            spbn_rcot_score[idx_fold] = test_pc_rcot_spbn(
                train_dataset, test_dataset, folds, patience,
                result_folder, idx_fold)

        sum_spbn_lc_score = spbn_lc_score.sum(axis=0)
        sum_spbn_rcot_score = spbn_rcot_score.sum(axis=0)

        basefolder = os.path.basename(os.path.dirname(file))
        new_line = basefolder
        # LC columns first, then RCoT — same order as the header.
        for idx_f in range(len(folds)):
            for idx_p in range(len(patience)):
                new_line += "," + str(sum_spbn_lc_score[idx_f, idx_p])
        for idx_f in range(len(folds)):
            for idx_p in range(len(patience)):
                new_line += "," + str(sum_spbn_rcot_score[idx_f, idx_p])
        print(new_line)
        string_file += '\n' + new_line

    with open('results_pc_spbn_ckde.csv', 'w') as f:
        f.write(string_file)
def test_crossvalidation():
    """Evaluate PC-learned GBN models (LinearCorrelation and RCoT tests) on
    every valid dataset and write the aggregated scores to results_pc_gbn.csv."""
    files = experiments_helper.find_crossvalidation_datasets()
    string_file = "Dataset,GBN_PC_LC,GBN_PC_RCOT"
    print(string_file)

    for file in files:
        validated = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        if validated is None:
            continue
        dataset, result_folder = validated

        n_eval = experiments_helper.EVALUATION_FOLDS
        gbn_lc_score = np.full((n_eval,), np.nan)
        gbn_rcot_score = np.full((n_eval,), np.nan)

        splitter = KFold(n_eval, shuffle=True,
                         random_state=experiments_helper.SEED)
        for idx_fold, (train_indices, test_indices) in enumerate(
                splitter.split(dataset)):
            train_dataset = dataset.iloc[train_indices, :]
            test_dataset = dataset.iloc[test_indices, :]
            gbn_lc_score[idx_fold] = test_pc_lc_gbn(
                train_dataset, test_dataset, result_folder, idx_fold)
            gbn_rcot_score[idx_fold] = test_pc_rcot_gbn(
                train_dataset, test_dataset, result_folder, idx_fold)

        sum_gbn_lc_score = gbn_lc_score.sum(axis=0)
        sum_gbn_rcot_score = gbn_rcot_score.sum(axis=0)

        basefolder = os.path.basename(os.path.dirname(file))
        new_line = (basefolder + "," + str(sum_gbn_lc_score) + "," +
                    str(sum_gbn_rcot_score))
        print(new_line)
        string_file += '\n' + new_line

    with open('results_pc_gbn.csv', 'w') as f:
        f.write(string_file)
def kdeness_ckde():
    """Plot, per dataset, the ratio of CKDE-type variables in the final SPBN
    models learned by HC, PC-PLC and PC-RCoT, and save it to plots/kdeness.tex.

    Fix over the original: the three model-counting loops were byte-identical
    except for one path segment — they are collapsed into a single loop over
    the three result subfolders, with the per-folder counting extracted into
    the `_count_ckde` helper.
    """
    folds = experiments_helper.TRAINING_FOLDS
    patience = experiments_helper.PATIENCE
    files = experiments_helper.find_crossvalidation_datasets()
    valid_files = [f for f in files
                   if experiments_helper.validate_dataset(f, folds) is not None]

    def _count_ckde(models_folder):
        # Count CKDE nodes in the last (lexicographically final) saved model.
        all_models = sorted(glob.glob(models_folder + '/*.pickle'))
        final_model = load(all_models[-1])
        return sum(map(lambda kv: kv[1] == NodeType.CKDE,
                       final_model.node_types().items()))

    # Result subfolder templates, in the same order as `algorithms` below.
    subfolders = ['/HillClimbing/SPBN_CKDE/Validation_',
                  '/PC/SPBN_CKDE/LinearCorrelation/Validation_',
                  '/PC/SPBN_CKDE/RCoT/Validation_']

    # (dataset, training fold count, algorithm, evaluation fold) -> #CKDE nodes
    n_ckde = np.full((len(valid_files), len(folds), 3, 10), np.nan)
    datasets = []
    n_vars = []
    for idx_file, file in enumerate(valid_files):
        # Already known to be valid — validate_dataset cannot return None here.
        dataset, result_folder = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        datasets.append(os.path.basename(os.path.dirname(file)))
        n_vars.append(dataset.shape[1])

        for idx_algorithm, subfolder in enumerate(subfolders):
            for idx_f, f in enumerate(experiments_helper.TRAINING_FOLDS):
                for idx_fold in range(10):
                    models_folder = (result_folder + subfolder + str(f) +
                                     '_5/' + str(idx_fold))
                    n_ckde[idx_file, idx_f, idx_algorithm, idx_fold] = \
                        _count_ckde(models_folder)

    # Average over evaluation folds; one column per (fold count, algorithm).
    mean_ckde = np.mean(n_ckde, axis=3).reshape(len(valid_files), -1)
    algorithms = ["HC", "PC-PLC", "PC-RCoT"]
    names = ["CKDE_" + str(f) + "_" + algorithm
             for f in folds for algorithm in algorithms]
    df = pd.DataFrame(mean_ckde, columns=names, index=datasets)
    df['n_vars'] = n_vars
    for f in folds:
        for algorithm in algorithms:
            df['%CKDE_' + str(f) + "_" + algorithm] = \
                df.loc[:, 'CKDE_' + str(f) + "_" + algorithm] / df.loc[:, 'n_vars']

    # Grouped bar chart: one group per dataset, one bar per (fold, algorithm).
    N = df.shape[0]
    ind = np.arange(N)
    num_bars = len(folds) * len(algorithms)
    width = (1 - 0.3) / num_bars

    fig = plt.figure()
    ax = fig.add_subplot(111)
    offset = 0
    b = []
    color = {algorithms[0]: "#729CF5",
             algorithms[1]: "#FFB346",
             algorithms[2]: "#B5EA7F"}
    for f in folds:
        for algorithm in algorithms:
            t = ax.bar(ind + width * offset,
                       df['%CKDE_' + str(f) + "_" + algorithm].to_numpy(),
                       width, align='edge', linewidth=0.5,
                       edgecolor="black", color=color[algorithm])
            offset += 1
            b.append(t)

    ax.set_ylabel('Ratio of CKDE variables')
    ax.set_xticks(ind + (1 - 0.3) / 2)
    ax.set_xticklabels(df.index)
    ax.tick_params(axis='x', rotation=90)
    plt.legend([t[0] for t in b], algorithms)
    tikzplotlib.save("plots/kdeness.tex", standalone=True,
                     axis_width="25cm", axis_height="10cm")