Example #1
0
def run_validation_spbn(train_data, folds, patience, result_folder, idx_fold):
    hc = GreedyHillClimbing()
    pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])

    for k in folds:
        vl = ValidatedLikelihood(train_data, k=k, seed=0)

        for p in patience:
            fold_folder = result_folder + '/HillClimbing/SPBN_CKDE/Validation_' + str(
                k) + '_' + str(p) + '/' + str(idx_fold)
            pathlib.Path(fold_folder).mkdir(parents=True, exist_ok=True)

            if os.path.exists(fold_folder + '/end.lock'):
                continue

            cb_save = SaveModel(fold_folder)
            node_types = [(name, NodeType.CKDE)
                          for name in train_data.columns.values]
            start_model = SemiparametricBN(list(train_data.columns.values),
                                           node_types)
            bn = hc.estimate(pool,
                             vl,
                             start_model,
                             callback=cb_save,
                             patience=p,
                             verbose=True)
            iters = sorted(glob.glob(fold_folder + '/*.pickle'))
            last_file = os.path.basename(iters[-1])
            number = int(os.path.splitext(last_file)[0])
            bn.save(fold_folder + '/' + str(number + 1).zfill(6) + ".pickle")
            with open(fold_folder + '/end.lock', 'w') as f:
                pass
small_results = pd.DataFrame()
medium_results = pd.DataFrame()
large_results = pd.DataFrame()

for n in experiments_helper.INSTANCES:
    df = pd.read_csv('data/small_' + str(n) + ".csv")

    executions = np.empty((800, ))
    for i in range(800):
        if i % 10 == 0:
            print(str(i) + " executions")
        vl = ValidatedLikelihood(df, k=10, seed=i)
        start_model = SemiparametricBN(list(df.columns.values))
        hc = GreedyHillClimbing()
        pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])

        start = time.time()
        bn = hc.estimate(pool, vl, start_model, patience=0)
        end = time.time()

        executions[i] = end - start

    small_results['SPBN_' + str(n)] = pd.Series(executions,
                                                name="SPBN_" + str(n))
    print("Small " + str(n) + " -- Time: " + str(executions.mean()) +
          ", std: " + str(np.std(executions, ddof=1)))

    df = pd.read_csv('data/medium_' + str(n) + ".csv")

    executions = np.empty((200, ))