Example 1
def datasets_table():
    # Print the name and size (instances x variables) of every valid dataset.
    files = experiments_helper.find_crossvalidation_datasets()
    valid_files = [
        f for f in files
        if experiments_helper.validate_dataset(f, [2, 3, 5, 10]) is not None
    ]

    for file in valid_files:
        dataset, result_folder = experiments_helper.validate_dataset(
            file, [2, 3, 5, 10])
        basefolder = os.path.basename(os.path.dirname(file))

        print(basefolder + " " + str(dataset.shape[0]) + "x" +
              str(dataset.shape[1]))
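
All of the examples on this page lean on a project-specific experiments_helper module that is not shown here. The sketch below is only an assumption inferred from how the examples call it, so the snippets can be read in isolation; the constant values and the directory layout are guesses, not the project's actual code.

import glob
import os

import pandas as pd

# Hypothetical stand-in for experiments_helper (assumed values and layout).
TRAINING_FOLDS = [2, 3, 5, 10]   # assumed, matching the folds used in Example 1
PATIENCE = [0, 5]                # assumed
EVALUATION_FOLDS = 10            # assumed
SEED = 0                         # assumed


def find_crossvalidation_datasets():
    # Assumed layout: one CSV file per dataset folder under ./data/.
    return sorted(glob.glob('data/*/*.csv'))


def validate_dataset(file, folds):
    # Returns (DataFrame, result_folder) when the dataset is usable for the
    # requested fold counts, or None so callers can skip it.
    dataset = pd.read_csv(file)
    if dataset.shape[0] < max(folds):
        return None
    result_folder = os.path.join('results',
                                 os.path.basename(os.path.dirname(file)))
    return dataset, result_folder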
Example 2
def test_crossvalidation():
    # Evaluate hill-climbing GBNs (validation score per fold/patience setting,
    # BIC and BGe) on each evaluation fold and write the summed per-fold
    # results to results_hc_gbn.csv.
    files = experiments_helper.find_crossvalidation_datasets()

    folds = experiments_helper.TRAINING_FOLDS
    patience = experiments_helper.PATIENCE

    string_file = "Dataset," + ','.join(["GBN_Validation_"+ str(f) + "_" + str(p) for f in folds for p in patience]) +\
                    ",BIC,BGe"

    print(string_file)
    for file in files:
        x = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        if x is None:
            continue
        else:
            dataset, result_folder = x

        validation_gaussian = np.full(
            (experiments_helper.EVALUATION_FOLDS, len(folds), len(patience)),
            np.nan)
        bic_gaussian = np.full((experiments_helper.EVALUATION_FOLDS, ), np.nan)
        bge_gaussian = np.full((experiments_helper.EVALUATION_FOLDS, ), np.nan)

        for (idx_fold, (train_indices, test_indices)) in enumerate(
                KFold(experiments_helper.EVALUATION_FOLDS,
                      shuffle=True,
                      random_state=experiments_helper.SEED).split(dataset)):
            train_dataset = dataset.iloc[train_indices, :]
            test_dataset = dataset.iloc[test_indices, :]

            validation_gaussian[idx_fold] = test_validation_gaussian(
                train_dataset, test_dataset, folds, patience, result_folder,
                idx_fold)
            bic_gaussian[idx_fold] = test_bic_gaussian(train_dataset,
                                                       test_dataset,
                                                       result_folder, idx_fold)
            bge_gaussian[idx_fold] = test_bge_gaussian(train_dataset,
                                                       test_dataset,
                                                       result_folder, idx_fold)

        sum_validation_gaussian = validation_gaussian.sum(axis=0)
        sum_bic = bic_gaussian.sum(axis=0)
        sum_bge = bge_gaussian.sum(axis=0)

        basefolder = os.path.basename(os.path.dirname(file))
        new_line = basefolder

        for idx_f, f in enumerate(folds):
            for idx_p, p in enumerate(patience):
                new_line += "," + str(sum_validation_gaussian[idx_f, idx_p])

        new_line += "," + str(sum_bic)
        new_line += "," + str(sum_bge)
        print(new_line)

        string_file += '\n' + new_line

    with open('results_hc_gbn.csv', 'w') as f:
        f.write(string_file)
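
The CSV produced above can be read back with pandas to compare the configurations per dataset. A minimal sketch; the file name and header come from the code above, the rest is illustrative:

import pandas as pd

# Load the summed per-fold scores and report, for each dataset, the
# best-scoring column (estimator configuration).
results = pd.read_csv('results_hc_gbn.csv', index_col='Dataset')
print(results.idxmax(axis=1))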
Example 3
def train_crossvalidation_file(file, folds, patience):
    # Train the models for a single dataset, one process per evaluation fold.
    x = experiments_helper.validate_dataset(file, folds)
    if x is None:
        return
    else:
        dataset, result_folder = x

    if not os.path.exists(result_folder):
        os.mkdir(result_folder)

    print(file)

    # Each estimator is trained on the training split of every evaluation fold
    # in parallel. All three KFold splits use the same seed so the folds
    # coincide across estimators (and with the evaluation scripts).
    with mp.Pool(processes=experiments_helper.EVALUATION_FOLDS) as p:
        p.starmap(run_validation_gaussian,
                  [(dataset.iloc[train_indices, :], folds, patience, result_folder, idx_fold)
                   for (idx_fold, (train_indices, test_indices)) in
                   enumerate(KFold(experiments_helper.EVALUATION_FOLDS, shuffle=True,
                                   random_state=experiments_helper.SEED).split(dataset))])

    with mp.Pool(processes=experiments_helper.EVALUATION_FOLDS) as p:
        p.starmap(run_bic_gaussian,
                  [(dataset.iloc[train_indices, :], result_folder, idx_fold)
                   for (idx_fold, (train_indices, test_indices)) in
                   enumerate(KFold(experiments_helper.EVALUATION_FOLDS, shuffle=True,
                                   random_state=experiments_helper.SEED).split(dataset))])

    with mp.Pool(processes=experiments_helper.EVALUATION_FOLDS) as p:
        p.starmap(run_bge_gaussian,
                  [(dataset.iloc[train_indices, :], result_folder, idx_fold)
                   for (idx_fold, (train_indices, test_indices)) in
                   enumerate(KFold(experiments_helper.EVALUATION_FOLDS, shuffle=True,
                                   random_state=experiments_helper.SEED).split(dataset))])
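
Because this function spins up multiprocessing pools, the entry point should sit behind a __main__ guard (required on spawn-based platforms). A possible driver, assuming the constants exposed by experiments_helper:

if __name__ == '__main__':
    # Hypothetical driver: train every valid dataset with the same fold and
    # patience grid used by the evaluation scripts on this page.
    for file in experiments_helper.find_crossvalidation_datasets():
        train_crossvalidation_file(file,
                                   experiments_helper.TRAINING_FOLDS,
                                   experiments_helper.PATIENCE)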
Example 4
def print_datasets():
    # Print one LaTeX table row (name & instances & variables) per valid dataset.
    files = experiments_helper.find_crossvalidation_datasets()

    for file in files:
        basefolder = os.path.basename(os.path.dirname(file))
        x = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        if x is None:
            continue

        dataset, _ = x
        print("{} & {} & {}\\\\".format(basefolder, dataset.shape[0],
                                        dataset.shape[1]))
Example 5
def num_instances_df():
    files = experiments_helper.find_crossvalidation_datasets()
    names = []
    num_instances = []
    for file in files:
        x = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        if x is not None:
            dataset, result_folder = x

            names.append(os.path.basename(result_folder))
            num_instances.append(dataset.shape[0])

    return pd.DataFrame({"Dataset": names, "N": num_instances})
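
One possible use of the returned frame is to order the datasets by size before reporting; purely illustrative:

# Sort the datasets from fewest to most instances.
df = num_instances_df()
print(df.sort_values('N').to_string(index=False))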
Example 6
def test_crossvalidation():
    # Evaluate PC-learned semiparametric (CKDE) networks with the LinearCorrelation
    # and RCoT independence tests, writing the summed per-fold scores to
    # results_pc_spbn_ckde.csv.
    files = experiments_helper.find_crossvalidation_datasets()

    folds = experiments_helper.TRAINING_FOLDS
    patience = experiments_helper.PATIENCE

    string_file = "Dataset," + ','.join(["SPBN_CKDE_PC_LC_" + str(f) + "_" + str(p) for f in folds for p in patience]) +\
                        "," + ','.join(["SPBN_CKDE_PC_RCOT_" + str(f) + "_" + str(p) for f in folds for p in patience])

    print(string_file)
    for file in files:
        x = experiments_helper.validate_dataset(file, experiments_helper.TRAINING_FOLDS)
        if x is None:
            continue
        else:
            dataset, result_folder = x

        spbn_lc_score = np.full((experiments_helper.EVALUATION_FOLDS, len(folds), len(patience)), np.nan)
        spbn_rcot_score = np.full((experiments_helper.EVALUATION_FOLDS, len(folds), len(patience)), np.nan)

        for (idx_fold, (train_indices, test_indices)) in enumerate(KFold(experiments_helper.EVALUATION_FOLDS, shuffle=True, 
                                                                   random_state=experiments_helper.SEED).split(dataset)):
            train_dataset = dataset.iloc[train_indices,:]
            test_dataset = dataset.iloc[test_indices,:]

            spbn_lc_score[idx_fold] = test_pc_lc_spbn(train_dataset, test_dataset, folds, patience,
                                                             result_folder, idx_fold)
            spbn_rcot_score[idx_fold] = test_pc_rcot_spbn(train_dataset, test_dataset, folds, patience,
                                                             result_folder, idx_fold)

        sum_spbn_lc_score = spbn_lc_score.sum(axis=0)
        sum_spbn_rcot_score = spbn_rcot_score.sum(axis=0)

        basefolder = os.path.basename(os.path.dirname(file))
        new_line = basefolder

        for idx_f, f in enumerate(folds):
            for idx_p, p in enumerate(patience):
                new_line += "," + str(sum_spbn_lc_score[idx_f, idx_p])
        for idx_f, f in enumerate(folds):
            for idx_p, p in enumerate(patience):
                new_line += "," + str(sum_spbn_rcot_score[idx_f, idx_p])

        print(new_line)

        string_file += '\n' + new_line

    with open('results_pc_spbn_ckde.csv', 'w') as f:
        f.write(string_file)
Example 7
def test_crossvalidation():
    # Evaluate PC-learned Gaussian networks with the LinearCorrelation and RCoT
    # independence tests, writing the summed per-fold scores to results_pc_gbn.csv.
    files = experiments_helper.find_crossvalidation_datasets()

    string_file = "Dataset,GBN_PC_LC,GBN_PC_RCOT"

    print(string_file)
    for file in files:
        x = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        if x is None:
            continue
        else:
            dataset, result_folder = x

        gbn_lc_score = np.full((experiments_helper.EVALUATION_FOLDS, ), np.nan)
        gbn_rcot_score = np.full((experiments_helper.EVALUATION_FOLDS, ),
                                 np.nan)

        for (idx_fold, (train_indices, test_indices)) in enumerate(
                KFold(experiments_helper.EVALUATION_FOLDS,
                      shuffle=True,
                      random_state=experiments_helper.SEED).split(dataset)):
            train_dataset = dataset.iloc[train_indices, :]
            test_dataset = dataset.iloc[test_indices, :]

            gbn_lc_score[idx_fold] = test_pc_lc_gbn(train_dataset,
                                                    test_dataset,
                                                    result_folder, idx_fold)
            gbn_rcot_score[idx_fold] = test_pc_rcot_gbn(
                train_dataset, test_dataset, result_folder, idx_fold)

        sum_gbn_lc_score = gbn_lc_score.sum(axis=0)
        sum_gbn_rcot_score = gbn_rcot_score.sum(axis=0)

        basefolder = os.path.basename(os.path.dirname(file))
        new_line = basefolder + "," + str(sum_gbn_lc_score) + "," + str(
            sum_gbn_rcot_score)

        print(new_line)

        string_file += '\n' + new_line

    with open('results_pc_gbn.csv', 'w') as f:
        f.write(string_file)
Example 8
def kdeness_ckde():
    # For every dataset, plot the ratio of CKDE (kernel) nodes in the final SPBN
    # models learned by hill-climbing and by PC (LinearCorrelation and RCoT).
    folds = experiments_helper.TRAINING_FOLDS
    patience = experiments_helper.PATIENCE

    files = experiments_helper.find_crossvalidation_datasets()
    valid_files = [
        f for f in files
        if experiments_helper.validate_dataset(f, folds) is not None
    ]

    # Shape: (dataset, training-fold configuration,
    #         algorithm [HC, PC-LinearCorrelation, PC-RCoT], fold index).
    n_ckde = np.full((len(valid_files), len(folds), 3, 10), np.nan)
    datasets = []
    n_vars = []
    for idx_file, file in enumerate(valid_files):
        x = experiments_helper.validate_dataset(
            file, experiments_helper.TRAINING_FOLDS)
        dataset, result_folder = x

        basefolder = os.path.basename(os.path.dirname(file))
        datasets.append(basefolder)
        n_vars.append(dataset.shape[1])

        # The three learning configurations whose final models are inspected;
        # the order matches the algorithm axis of n_ckde.
        model_subfolders = [
            '/HillClimbing/SPBN_CKDE/Validation_',
            '/PC/SPBN_CKDE/LinearCorrelation/Validation_',
            '/PC/SPBN_CKDE/RCoT/Validation_',
        ]

        for idx_algorithm, subfolder in enumerate(model_subfolders):
            for idx_f, f in enumerate(folds):
                for idx_fold in range(10):
                    models_folder = result_folder + subfolder + str(f) + '_5/' + str(idx_fold)
                    all_models = sorted(glob.glob(models_folder + '/*.pickle'))
                    final_model = load(all_models[-1])

                    # Count the CKDE nodes in the last (final) model of this fold.
                    n_ckde[idx_file, idx_f, idx_algorithm, idx_fold] = sum(
                        node_type == NodeType.CKDE
                        for node_type in final_model.node_types().values())

    mean_ckde = np.mean(n_ckde, axis=3).reshape(len(valid_files), -1)
    algorithms = ["HC", "PC-PLC", "PC-RCoT"]
    names = [
        "CKDE_" + str(f) + "_" + algorithm for f in folds
        for algorithm in algorithms
    ]

    df = pd.DataFrame(mean_ckde, columns=names, index=datasets)
    df['n_vars'] = n_vars
    for f in folds:
        for algorithm in algorithms:
            df['%CKDE_' + str(f) + "_" +
               algorithm] = df.loc[:, 'CKDE_' + str(f) + "_" +
                                   algorithm] / df.loc[:, 'n_vars']

    N = df.shape[0]
    ind = np.arange(N)
    num_bars = len(folds) * len(algorithms)
    width = (1 - 0.3) / num_bars

    fig = plt.figure()
    ax = fig.add_subplot(111)

    offset = 0

    b = []

    color = {
        algorithms[0]: "#729CF5",
        algorithms[1]: "#FFB346",
        algorithms[2]: "#B5EA7F"
    }
    for f in folds:
        for algorithm in algorithms:
            t = ax.bar(ind + width * offset,
                       df['%CKDE_' + str(f) + "_" + algorithm].to_numpy(),
                       width,
                       align='edge',
                       linewidth=0.5,
                       edgecolor="black",
                       color=color[algorithm])
            offset += 1
            b.append(t)

    ax.set_ylabel('Ratio of CKDE variables')
    ax.set_xticks(ind + (1 - 0.3) / 2)
    ax.set_xticklabels(df.index)
    ax.tick_params(axis='x', rotation=90)

    plt.legend([t[0] for t in b], algorithms)
    tikzplotlib.save("plots/kdeness.tex",
                     standalone=True,
                     axis_width="25cm",
                     axis_height="10cm")