Example #1
0
def run_validation_spbn(train_data, folds, patience, result_folder, idx_fold):
    """Learn an SPBN by hill climbing for every (k, patience) combination.

    For each ``k`` in ``folds`` a ``ValidatedLikelihood`` score (seed 0) is
    built on ``train_data``; for each ``p`` in ``patience`` a greedy hill
    climbing search over arc and node-type operators is started from a
    model whose nodes are all of type ``NodeType.CKDE``.

    Results go to
    ``<result_folder>/HillClimbing/SPBN_CKDE/Validation_<k>_<p>/<idx_fold>``.
    A combination whose folder already holds ``end.lock`` is skipped.

    Args:
        train_data: Training data; its ``columns.values`` name the nodes.
        folds: Iterable of ``k`` values for the validated likelihood.
        patience: Iterable of patience values for the search.
        result_folder: Root output folder (string path).
        idx_fold: Identifier of the outer fold, used as last path component.
    """
    search = GreedyHillClimbing()
    operators = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])

    for k in folds:
        score = ValidatedLikelihood(train_data, k=k, seed=0)

        for p in patience:
            out_dir = (result_folder + '/HillClimbing/SPBN_CKDE/Validation_'
                       + str(k) + '_' + str(p) + '/' + str(idx_fold))
            pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

            lock_path = out_dir + '/end.lock'
            if not os.path.exists(lock_path):
                # NOTE(review): SaveModel presumably writes numbered
                # ``.pickle`` snapshots into out_dir; the numbering logic
                # below relies on at least one such file existing.
                snapshot_cb = SaveModel(out_dir)

                all_ckde = [(name, NodeType.CKDE)
                            for name in train_data.columns.values]
                initial = SemiparametricBN(list(train_data.columns.values),
                                           all_ckde)

                learned = search.estimate(operators,
                                          score,
                                          initial,
                                          callback=snapshot_cb,
                                          patience=p,
                                          verbose=True)

                # Store the returned model under the number following the
                # highest-numbered snapshot already in the folder.
                snapshots = sorted(glob.glob(out_dir + '/*.pickle'))
                stem, _ = os.path.splitext(os.path.basename(snapshots[-1]))
                next_id = str(int(stem) + 1).zfill(6)
                learned.save(out_dir + '/' + next_id + ".pickle")

                # An empty lock file marks this combination as finished.
                with open(lock_path, 'w'):
                    pass
def find_node_types(df, dag, model_folder, type_of_dag_string, patience):
    """Search node types on a fixed graph for every patience value.

    A single ``ValidatedLikelihood`` (k=10, seed ``experiments_helper.SEED``)
    is built on ``df``. For each ``p`` in ``patience`` a greedy hill climbing
    search restricted to ``ChangeNodeTypeSet`` is started from a
    ``SemiparametricBN`` built on ``dag`` with every column of ``df`` typed
    as ``NodeType.CKDE``.

    Results go to ``<model_folder>/PC/SPBN_CKDE/<type_of_dag_string>/<p>``.
    A patience value whose folder already holds ``end.lock`` is skipped.

    Args:
        df: Data used for scoring; its ``columns.values`` name the nodes.
        dag: Graph passed to ``SemiparametricBN`` as the starting structure.
        model_folder: Root output folder (string path).
        type_of_dag_string: Path component identifying how ``dag`` was made.
        patience: Iterable of patience values for the search.
    """
    score = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)

    search = GreedyHillClimbing()
    type_operators = ChangeNodeTypeSet()

    for p in patience:
        out_dir = (model_folder + '/PC/SPBN_CKDE/' + type_of_dag_string
                   + '/' + str(p))
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

        lock_path = out_dir + '/end.lock'
        if not os.path.exists(lock_path):
            # NOTE(review): SaveModel presumably writes numbered ``.pickle``
            # snapshots into out_dir; the numbering below relies on them.
            snapshot_cb = SaveModel(out_dir)

            all_ckde = [(name, NodeType.CKDE) for name in df.columns.values]
            initial = SemiparametricBN(dag, all_ckde)

            learned = search.estimate(type_operators,
                                      score,
                                      initial,
                                      callback=snapshot_cb,
                                      patience=p)

            # Store the returned model under the number following the
            # highest-numbered snapshot already in the folder.
            snapshots = sorted(glob.glob(out_dir + '/*.pickle'))
            stem, _ = os.path.splitext(os.path.basename(snapshots[-1]))
            next_id = str(int(stem) + 1).zfill(6)
            learned.save(out_dir + '/' + next_id + ".pickle")

            # An empty lock file marks this patience value as finished.
            with open(lock_path, 'w'):
                pass
Example #3
0
def remove_crossvalidated_nan(dataset, folds):
    """Collect columns that have (near-)zero variance in some training fold.

    The dataset is split with a shuffled ``KFold`` (``EVALUATION_FOLDS``
    splits, ``random_state=SEED``). For the training part of every split and
    every ``k`` in ``folds``, the inner training folds exposed by
    ``ValidatedLikelihood(...).cv_lik.cv`` are inspected, and any column
    whose variance is close to zero (``np.isclose``) is recorded.

    Despite the name, nothing is removed here: the function only returns the
    column names.

    Args:
        dataset: Data indexed positionally with ``iloc``.
        folds: Iterable of ``k`` values for the inner validation splits.

    Returns:
        set: Names of the columns found to be (near-)constant in any fold.
    """
    constant_columns = set()

    # Outer split: folds used for performance evaluation.
    outer_cv = KFold(EVALUATION_FOLDS, shuffle=True, random_state=SEED)
    for train_indices, _ in outer_cv.split(dataset):
        outer_train = dataset.iloc[train_indices, :]

        # Inner split: folds used by the validated likelihood.
        for k in folds:
            score = ValidatedLikelihood(outer_train, k=k, seed=SEED)
            for inner_train, _ in score.cv_lik.cv:
                inner_frame = inner_train.to_pandas()
                is_constant = np.isclose(inner_frame.var(), 0)
                constant_columns.update(
                    inner_frame.columns[is_constant].tolist())

    return constant_columns
def run_pc_lc_spbn(train_data, folds, patience, result_folder, idx_fold):
    """Search node types on the stored PC (linear correlation) graph.

    The graph is loaded from ``<result_folder>/PC/graph-lc-<idx_fold>.pickle``
    and converted with ``to_dag()``; if that raises ``ValueError``,
    ``to_approximate_dag()`` is used instead. For each ``k`` in ``folds`` and
    each ``p`` in ``patience`` a greedy hill climbing search restricted to
    ``ChangeNodeTypeSet`` is started from a ``SemiparametricBN`` on that DAG
    with every node typed as ``NodeType.CKDE``.

    Results go to ``<result_folder>/PC/SPBN_CKDE/LinearCorrelation/
    Validation_<k>_<p>/<idx_fold>``. A combination whose folder already
    holds ``end.lock`` is skipped.

    Args:
        train_data: Training data used by the validated likelihood.
        folds: Iterable of ``k`` values for the validated likelihood.
        patience: Iterable of patience values for the search.
        result_folder: Root folder for both the stored graph and the output.
        idx_fold: Identifier of the outer fold.
    """
    search = GreedyHillClimbing()
    type_operators = ChangeNodeTypeSet()

    stored_graph = load(result_folder + '/PC/graph-lc-' + str(idx_fold)
                        + ".pickle")

    try:
        dag = stored_graph.to_dag()
    except ValueError:
        dag = stored_graph.to_approximate_dag()

    for k in folds:
        score = ValidatedLikelihood(train_data, k=k,
                                    seed=experiments_helper.SEED)

        for p in patience:
            out_dir = (result_folder
                       + '/PC/SPBN_CKDE/LinearCorrelation/Validation_'
                       + str(k) + '_' + str(p) + '/' + str(idx_fold))
            pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

            lock_path = out_dir + '/end.lock'
            if not os.path.exists(lock_path):
                # NOTE(review): SaveModel presumably writes numbered
                # ``.pickle`` snapshots into out_dir; the numbering below
                # relies on at least one such file existing.
                snapshot_cb = SaveModel(out_dir)

                all_ckde = [(name, NodeType.CKDE) for name in dag.nodes()]
                initial = SemiparametricBN(dag, all_ckde)

                learned = search.estimate(type_operators,
                                          score,
                                          initial,
                                          callback=snapshot_cb,
                                          patience=p,
                                          verbose=True)

                # Store the returned model under the number following the
                # highest-numbered snapshot already in the folder.
                snapshots = sorted(glob.glob(out_dir + '/*.pickle'))
                stem, _ = os.path.splitext(os.path.basename(snapshots[-1]))
                next_id = str(int(stem) + 1).zfill(6)
                learned.save(out_dir + '/' + next_id + ".pickle")

                # An empty lock file marks this combination as finished.
                with open(lock_path, 'w'):
                    pass
def run_validation_kdebn(train_data, folds, patience, result_folder, idx_fold):
    """Learn a KDE network by hill climbing for every (k, patience) pair.

    For each ``k`` in ``folds`` a ``ValidatedLikelihood`` score (seed 0) is
    built on ``train_data``; for each ``p`` in ``patience`` a greedy hill
    climbing search over arc operators is started from a ``KDENetwork``
    defined on the columns of ``train_data``.

    Results go to
    ``<result_folder>/HillClimbing/KDEBN/Validation_<k>_<p>/<idx_fold>``.
    A combination whose folder already holds ``end.lock`` is skipped.

    Args:
        train_data: Training data; its ``columns.values`` name the nodes.
        folds: Iterable of ``k`` values for the validated likelihood.
        patience: Iterable of patience values for the search.
        result_folder: Root output folder (string path).
        idx_fold: Identifier of the outer fold, used as last path component.
    """
    search = GreedyHillClimbing()
    arc_operators = ArcOperatorSet()

    for k in folds:
        score = ValidatedLikelihood(train_data, k=k, seed=0)

        for p in patience:
            out_dir = (result_folder + '/HillClimbing/KDEBN/Validation_'
                       + str(k) + '_' + str(p) + '/' + str(idx_fold))
            pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

            lock_path = out_dir + '/end.lock'
            if not os.path.exists(lock_path):
                # NOTE(review): SaveModel presumably writes numbered
                # ``.pickle`` snapshots into out_dir; the numbering below
                # relies on at least one such file existing.
                snapshot_cb = SaveModel(out_dir)
                initial = KDENetwork(list(train_data.columns.values))

                learned = search.estimate(arc_operators,
                                          score,
                                          initial,
                                          callback=snapshot_cb,
                                          patience=p,
                                          verbose=True)

                # Store the returned model under the number following the
                # highest-numbered snapshot already in the folder.
                snapshots = sorted(glob.glob(out_dir + '/*.pickle'))
                stem, _ = os.path.splitext(os.path.basename(snapshots[-1]))
                next_id = str(int(stem) + 1).zfill(6)
                learned.save(out_dir + '/' + next_id + ".pickle")

                # An empty lock file marks this combination as finished.
                with open(lock_path, 'w'):
                    pass
import time

patience = experiments_helper.PATIENCE

small_results = pd.DataFrame()
medium_results = pd.DataFrame()
large_results = pd.DataFrame()

# Number of timed hill-climbing runs per instance size.
N_EXECUTIONS = 800

# Time a full SPBN hill-climbing search (patience=0) on each "small" data set.
# Only the call to hc.estimate() is timed; score/model/operator construction
# is rebuilt for every run but kept outside the measured interval.
for n in experiments_helper.INSTANCES:
    df = pd.read_csv('data/small_' + str(n) + ".csv")

    executions = np.empty((N_EXECUTIONS, ))
    for i in range(N_EXECUTIONS):
        if i % 10 == 0:
            print(str(i) + " executions")
        # A different seed per run gives a different validation split.
        vl = ValidatedLikelihood(df, k=10, seed=i)
        start_model = SemiparametricBN(list(df.columns.values))
        hc = GreedyHillClimbing()
        pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])

        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments the way
        # time.time() differences can.
        start = time.perf_counter()
        bn = hc.estimate(pool, vl, start_model, patience=0)
        end = time.perf_counter()

        executions[i] = end - start

    small_results['SPBN_' + str(n)] = pd.Series(executions,
                                                name="SPBN_" + str(n))
    print("Small " + str(n) + " -- Time: " + str(executions.mean()) +
          ", std: " + str(np.std(executions, ddof=1)))
from pybnesian.learning.algorithms import GreedyHillClimbing
from pybnesian.learning.algorithms.callbacks import SaveModel
from pybnesian.learning.operators import OperatorPool, ArcOperatorSet, ChangeNodeTypeSet
from pybnesian.learning.scores import ValidatedLikelihood
import pathlib
import os
import experiments_helper

# Learn an SPBN by greedy hill climbing for every data set, instance size and
# patience value listed in experiments_helper, writing the models under
# models/<dataset>/<instances>/HillClimbing/SPBN/<patience>.
# NOTE(review): pd, glob and SemiparametricBN are used below but are not among
# the imports visible just above -- confirm they are imported elsewhere.
hc = GreedyHillClimbing()
pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])

for d in experiments_helper.DATASETS:
    for i in experiments_helper.INSTANCES:
        df = pd.read_csv(d + "_"  + str(i) + '.csv')

        vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)

        for p in experiments_helper.PATIENCE:
            result_folder = 'models/' + d + '/' + str(i) + '/HillClimbing/SPBN/' + str(p)
            pathlib.Path(result_folder).mkdir(parents=True, exist_ok=True)

            # A lock file marks a configuration that already finished.
            if os.path.exists(result_folder + '/end.lock'):
                continue

            cb_save = SaveModel(result_folder)
            start_model = SemiparametricBN(list(df.columns.values))
            bn = hc.estimate(pool, vl, start_model, callback=cb_save, patience=p)

            # Store the returned model under the number following the
            # highest-numbered snapshot already in the folder.
            iters = sorted(glob.glob(result_folder + '/*.pickle'))
            last_file = os.path.basename(iters[-1])
            number = int(os.path.splitext(last_file)[0])
            bn.save(result_folder + '/' + str(number+1).zfill(6) + ".pickle")

            # Create the lock checked above; without it every rerun would
            # repeat the whole search for configurations already completed.
            with open(result_folder + '/end.lock', 'w'):
                pass
Example #8
0
import pandas as pd
from pybnesian import load
from pybnesian.factors import NodeType
from pybnesian.learning.scores import ValidatedLikelihood
from pybnesian.models import SemiparametricBN
import glob

# Report training, validation and test scores for the initial all-CKDE model
# and for every model snapshot stored by the hill-climbing run.
df_10000 = pd.read_csv('synthetic_10000.csv')
df_test = pd.read_csv('synthetic_test.csv')

# Sorted so the snapshots are reported in file-name order.
models = sorted(glob.glob('models/10000/HillClimbing/SPBN_CKDE/0/*.pickle'))

vl = ValidatedLikelihood(df_10000, k=10, seed=0)

node_types = [(name, NodeType.CKDE) for name in df_10000.columns.values]
start_model = SemiparametricBN(list(df_10000.columns.values), node_types)

print("Start model")
print("\tTraining score: " + str(vl.score(start_model)))
print("\tValidation score: " + str(vl.vscore(start_model)))

# The test log-likelihood needs a model fitted on the training data.
start_model.fit(df_10000)
print("\tTest score: " + str(start_model.slogl(df_test)))

for m in models:
    bn = load(m)
    print("Model " + m)
    print("\tTraining score: " + str(vl.score(bn)))
    print("\tValidation score: " + str(vl.vscore(bn)))

    # Fit and report the test score exactly as done for the start model;
    # previously the model was fitted but the result was never used.
    bn.fit(df_10000)
    print("\tTest score: " + str(bn.slogl(df_test)))
Example #9
0
                     axis_width="20.5cm",
                     axis_height="5cm")

    plt.figure()

    good = pd.concat(
        [atrium11.iloc[labels0_11, :], atrium12.iloc[labels0_12, :]])
    mid = pd.concat(
        [atrium11.iloc[labels1_11, :], atrium12.iloc[labels1_12, :]])
    poor = pd.concat(
        [atrium11.iloc[labels2_11, :], atrium12.iloc[labels2_12, :]])

    hc = GreedyHillClimbing()
    pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])
    start_model = SemiparametricBN(list(good.columns.values))
    vl = ValidatedLikelihood(good, k=10)
    good_model = hc.estimate(pool, vl, start_model)
    good_model.fit(good)

    hc = GreedyHillClimbing()
    pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])
    start_model = SemiparametricBN(list(mid.columns.values))
    vl = ValidatedLikelihood(mid, k=10)
    mid_model = hc.estimate(pool, vl, start_model)
    mid_model.fit(mid)

    hc = GreedyHillClimbing()
    pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])
    start_model = SemiparametricBN(list(poor.columns.values))
    vl = ValidatedLikelihood(poor, k=10)
    poor_model = hc.estimate(pool, vl, start_model)
def _pdag_to_dag(graph):
    """Return ``graph.to_dag()``, or ``graph.to_approximate_dag()`` on ValueError."""
    try:
        return graph.to_dag()
    except ValueError:
        return graph.to_approximate_dag()


def _time_pc_runs(pc, independence_test, n_exec):
    """Time ``pc.estimate(independence_test)`` n_exec times.

    Returns the array of durations (seconds) and the graph of the last run.
    """
    durations = np.empty((n_exec,))
    graph = None
    for i in range(n_exec):
        start = time.perf_counter()
        graph = pc.estimate(independence_test)
        durations[i] = time.perf_counter() - start
    return durations, graph


def _time_node_type_runs(df, dag, n_exec):
    """Time a node-type-only hill climbing search on ``dag`` n_exec times.

    The search objects, score and start model are rebuilt for every run, but
    only the call to ``estimate`` is inside the measured interval.
    """
    durations = np.empty((n_exec,))
    for i in range(n_exec):
        hc = GreedyHillClimbing()
        change_node_type = ChangeNodeTypeSet()
        vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)
        start_model = SemiparametricBN(dag)

        start = time.perf_counter()
        hc.estimate(change_node_type, vl, start_model, patience=0)
        durations[i] = time.perf_counter() - start
    return durations


def test_pc_time(df, n_exec):
    """Benchmark PC graph learning and the subsequent node-type search.

    Four phases are timed, each ``n_exec`` times, and the mean of each phase
    is printed:

    * ``PC-LC-Graph``: PC with the ``LinearCorrelation`` test.
    * ``PC-LC-NodeType``: node-type hill climbing on the resulting DAG.
    * ``PC-RCoT-Graph``: PC with the ``RCoT`` test.
    * ``PC-RCoT-NodeType``: node-type hill climbing on the resulting DAG.

    Intervals are measured with ``time.perf_counter()`` and every phase uses
    its own timing array, so no column of the result shares a buffer with a
    later phase.

    Args:
        df: Data passed to the independence tests and the score.
        n_exec: Number of timed repetitions per phase; must be at least 1.

    Returns:
        pandas.DataFrame: One column of durations (seconds) per phase.

    Raises:
        ValueError: If ``n_exec`` is smaller than 1 (no graph would be
            available for the node-type phases).
    """
    if n_exec < 1:
        raise ValueError("n_exec must be at least 1")

    local_results = pd.DataFrame()

    pc = PC()
    lc = LinearCorrelation(df)

    executions, graph_lc = _time_pc_runs(pc, lc, n_exec)
    local_results['PC-LC-Graph'] = pd.Series(executions, name="PC-LC-Graph")
    print("LC Graph Time: " + str(executions.mean()))

    dag = _pdag_to_dag(graph_lc)
    executions = _time_node_type_runs(df, dag, n_exec)
    local_results['PC-LC-NodeType'] = pd.Series(executions, name="PC-LC-NodeType")
    print("LC HC NodeType Time: " + str(executions.mean()))

    rcot = RCoT(df)

    executions, graph_rcot = _time_pc_runs(pc, rcot, n_exec)
    local_results['PC-RCoT-Graph'] = pd.Series(executions, name="PC-RCoT-Graph")
    print("RCoT Graph Time: " + str(executions.mean()))

    # The DAG conversion does not depend on the repetition, so it is done
    # once here, mirroring the linear-correlation phase above.
    dag = _pdag_to_dag(graph_rcot)
    executions = _time_node_type_runs(df, dag, n_exec)
    local_results['PC-RCoT-NodeType'] = pd.Series(executions, name="PC-RCoT-NodeType")
    print("RCoT HC NodeType Time: " + str(executions.mean()))

    return local_results