def run_validation_spbn(train_data, folds, patience, result_folder, idx_fold):
    """Learn semiparametric networks by hill climbing over arcs and node types.

    For every combination of a validation fold count ``k`` (from ``folds``)
    and a patience value ``p`` (from ``patience``), a model whose nodes all
    start as ``NodeType.CKDE`` is optimised with ``GreedyHillClimbing`` using
    a ``ValidatedLikelihood`` score (seed fixed to 0).  Results go to
    ``<result_folder>/HillClimbing/SPBN_CKDE/Validation_<k>_<p>/<idx_fold>``.

    A combination whose folder already contains ``end.lock`` is skipped, so
    an interrupted run can be resumed.

    Args:
        train_data: Training data; must expose ``columns.values``.
        folds: Iterable of ``k`` values for the validated likelihood.
        patience: Iterable of patience values passed to the search.
        result_folder: Root output folder (string).
        idx_fold: Identifier of the outer fold, used as the last path part.
    """
    searcher = GreedyHillClimbing()
    operators = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])

    for k in folds:
        score = ValidatedLikelihood(train_data, k=k, seed=0)

        for p in patience:
            out_dir = ''.join([result_folder,
                               '/HillClimbing/SPBN_CKDE/Validation_',
                               str(k), '_', str(p), '/', str(idx_fold)])
            pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

            lock_path = out_dir + '/end.lock'
            if os.path.exists(lock_path):
                # This configuration already ran to completion.
                continue

            save_callback = SaveModel(out_dir)
            ckde_types = [(column, NodeType.CKDE)
                          for column in train_data.columns.values]
            initial_model = SemiparametricBN(list(train_data.columns.values),
                                             ckde_types)

            learned = searcher.estimate(operators, score, initial_model,
                                        callback=save_callback, patience=p,
                                        verbose=True)

            # The pickles already in the folder are expected to be named with
            # an integer stem (presumably written by the SaveModel callback);
            # the final model is stored under the next number.
            snapshots = sorted(glob.glob(out_dir + '/*.pickle'))
            last_stem = os.path.splitext(os.path.basename(snapshots[-1]))[0]
            learned.save(out_dir + '/' + str(int(last_stem) + 1).zfill(6) + ".pickle")

            # An empty marker file flags this configuration as finished.
            open(lock_path, 'w').close()
def find_node_types(df, dag, model_folder, type_of_dag_string, patience):
    """Search the node types of a fixed graph for each patience value.

    Starting from ``dag`` with every column of ``df`` typed as
    ``NodeType.CKDE``, a ``GreedyHillClimbing`` search restricted to
    ``ChangeNodeTypeSet`` is run under a 10-fold ``ValidatedLikelihood``.
    Output is written to
    ``<model_folder>/PC/SPBN_CKDE/<type_of_dag_string>/<p>``; a folder that
    already holds ``end.lock`` is skipped.

    Args:
        df: Data used for scoring; must expose ``columns.values``.
        dag: Graph structure passed to ``SemiparametricBN``.
        model_folder: Root output folder (string).
        type_of_dag_string: Sub-folder name identifying how ``dag`` was built.
        patience: Iterable of patience values passed to the search.
    """
    score = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)
    searcher = GreedyHillClimbing()
    type_operators = ChangeNodeTypeSet()

    for p in patience:
        out_dir = ''.join([model_folder, '/PC/SPBN_CKDE/',
                           type_of_dag_string, '/', str(p)])
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

        lock_path = out_dir + '/end.lock'
        if os.path.exists(lock_path):
            # Already completed in a previous run.
            continue

        save_callback = SaveModel(out_dir)
        ckde_types = [(column, NodeType.CKDE) for column in df.columns.values]
        initial_model = SemiparametricBN(dag, ckde_types)

        learned = searcher.estimate(type_operators, score, initial_model,
                                    callback=save_callback, patience=p)

        # Store the final model under the number following the last pickle
        # found in the folder (file stems are parsed as integers).
        snapshots = sorted(glob.glob(out_dir + '/*.pickle'))
        last_stem = os.path.splitext(os.path.basename(snapshots[-1]))[0]
        learned.save(out_dir + '/' + str(int(last_stem) + 1).zfill(6) + ".pickle")

        # An empty marker file flags this patience value as finished.
        open(lock_path, 'w').close()
def remove_crossvalidated_nan(dataset, folds):
    """Collect columns whose variance is close to zero in any training fold.

    The dataset is split with a shuffled ``KFold`` of ``EVALUATION_FOLDS``
    folds (seeded with ``SEED``).  For every outer training partition and
    every ``k`` in ``folds``, the inner training folds exposed by
    ``ValidatedLikelihood(...).cv_lik.cv`` are inspected, and any column whose
    variance is close to zero (``np.isclose(var, 0)``) is recorded.

    Nothing is removed here; the caller receives the column names.

    Args:
        dataset: DataFrame supporting ``iloc`` row selection.
        folds: Iterable of ``k`` values for the inner validation split.

    Returns:
        set: Names of the columns flagged in at least one inner training fold.
    """
    flagged_columns = set()
    outer_cv = KFold(EVALUATION_FOLDS, shuffle=True, random_state=SEED)

    # Outer loop: performance cross-validation.
    for train_indices, _ in outer_cv.split(dataset):
        outer_train = dataset.iloc[train_indices, :]

        # Inner loop: validation cross-validation.
        for k in folds:
            score = ValidatedLikelihood(outer_train, k=k, seed=SEED)
            for inner_train, _ in score.cv_lik.cv:
                fold_frame = inner_train.to_pandas()
                near_constant = np.isclose(fold_frame.var(), 0)
                flagged_columns.update(fold_frame.columns[near_constant].tolist())

    return flagged_columns
def run_pc_lc_spbn(train_data, folds, patience, result_folder, idx_fold):
    """Search node types on the stored PC (linear correlation) graph.

    The partially directed graph saved at
    ``<result_folder>/PC/graph-lc-<idx_fold>.pickle`` is loaded and turned
    into a DAG (``to_dag()``, falling back to ``to_approximate_dag()`` when a
    ``ValueError`` is raised).  For each ``k`` in ``folds`` and ``p`` in
    ``patience`` a ``GreedyHillClimbing`` search restricted to
    ``ChangeNodeTypeSet`` starts from that DAG with all nodes typed as
    ``NodeType.CKDE``.  Output goes to
    ``<result_folder>/PC/SPBN_CKDE/LinearCorrelation/Validation_<k>_<p>/<idx_fold>``;
    folders already containing ``end.lock`` are skipped.

    Args:
        train_data: Training data used by the validated likelihood.
        folds: Iterable of ``k`` values for the validated likelihood.
        patience: Iterable of patience values passed to the search.
        result_folder: Root output folder (string).
        idx_fold: Identifier of the outer fold.
    """
    searcher = GreedyHillClimbing()
    type_operators = ChangeNodeTypeSet()

    partial_graph = load(result_folder + '/PC/graph-lc-' + str(idx_fold) + ".pickle")
    try:
        dag = partial_graph.to_dag()
    except ValueError:
        # No exact extension was produced; use the approximate one instead.
        dag = partial_graph.to_approximate_dag()

    for k in folds:
        score = ValidatedLikelihood(train_data, k=k, seed=experiments_helper.SEED)

        for p in patience:
            out_dir = ''.join([result_folder,
                               '/PC/SPBN_CKDE/LinearCorrelation/Validation_',
                               str(k), '_', str(p), '/', str(idx_fold)])
            pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

            lock_path = out_dir + '/end.lock'
            if os.path.exists(lock_path):
                # This configuration already ran to completion.
                continue

            save_callback = SaveModel(out_dir)
            ckde_types = [(node, NodeType.CKDE) for node in dag.nodes()]
            initial_model = SemiparametricBN(dag, ckde_types)

            learned = searcher.estimate(type_operators, score, initial_model,
                                        callback=save_callback, patience=p,
                                        verbose=True)

            # Store the final model under the number following the last
            # pickle found in the folder (file stems are parsed as integers).
            snapshots = sorted(glob.glob(out_dir + '/*.pickle'))
            last_stem = os.path.splitext(os.path.basename(snapshots[-1]))[0]
            learned.save(out_dir + '/' + str(int(last_stem) + 1).zfill(6) + ".pickle")

            # An empty marker file flags this configuration as finished.
            open(lock_path, 'w').close()
def run_validation_kdebn(train_data, folds, patience, result_folder, idx_fold):
    """Learn KDE networks by greedy hill climbing over arcs.

    For each ``k`` in ``folds`` and ``p`` in ``patience`` an empty
    ``KDENetwork`` over the columns of ``train_data`` is optimised with
    ``GreedyHillClimbing`` and ``ArcOperatorSet`` under a
    ``ValidatedLikelihood`` score (seed fixed to 0).  Output goes to
    ``<result_folder>/HillClimbing/KDEBN/Validation_<k>_<p>/<idx_fold>``;
    folders already containing ``end.lock`` are skipped.

    Args:
        train_data: Training data; must expose ``columns.values``.
        folds: Iterable of ``k`` values for the validated likelihood.
        patience: Iterable of patience values passed to the search.
        result_folder: Root output folder (string).
        idx_fold: Identifier of the outer fold.
    """
    searcher = GreedyHillClimbing()
    arc_operators = ArcOperatorSet()

    for k in folds:
        score = ValidatedLikelihood(train_data, k=k, seed=0)

        for p in patience:
            out_dir = ''.join([result_folder,
                               '/HillClimbing/KDEBN/Validation_',
                               str(k), '_', str(p), '/', str(idx_fold)])
            pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

            lock_path = out_dir + '/end.lock'
            if os.path.exists(lock_path):
                # This configuration already ran to completion.
                continue

            save_callback = SaveModel(out_dir)
            initial_model = KDENetwork(list(train_data.columns.values))

            learned = searcher.estimate(arc_operators, score, initial_model,
                                        callback=save_callback, patience=p,
                                        verbose=True)

            # Store the final model under the number following the last
            # pickle found in the folder (file stems are parsed as integers).
            snapshots = sorted(glob.glob(out_dir + '/*.pickle'))
            last_stem = os.path.splitext(os.path.basename(snapshots[-1]))[0]
            learned.save(out_dir + '/' + str(int(last_stem) + 1).zfill(6) + ".pickle")

            # An empty marker file flags this configuration as finished.
            open(lock_path, 'w').close()
import time

# Timing benchmark: measures how long a greedy hill-climbing search for a
# semiparametric Bayesian network takes on the "small" datasets.

patience = experiments_helper.PATIENCE

# One column per dataset size; each column holds the per-run durations
# (seconds).  Only `small_results` is filled in this part of the script.
small_results = pd.DataFrame()
medium_results = pd.DataFrame()
large_results = pd.DataFrame()

# Number of timed repetitions per dataset size.
N_EXECUTIONS = 800

for n in experiments_helper.INSTANCES:
    df = pd.read_csv('data/small_' + str(n) + ".csv")

    executions = np.empty((N_EXECUTIONS, ))
    for i in range(N_EXECUTIONS):
        if i % 10 == 0:
            print(str(i) + " executions")

        # All objects are rebuilt for every repetition, outside the timed
        # region; the repetition index is used as the seed of the score.
        vl = ValidatedLikelihood(df, k=10, seed=i)
        start_model = SemiparametricBN(list(df.columns.values))
        hc = GreedyHillClimbing()
        pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])

        # perf_counter() is monotonic and high resolution, so the measured
        # interval is not affected by system clock adjustments the way
        # time.time() can be.
        start = time.perf_counter()
        bn = hc.estimate(pool, vl, start_model, patience=0)
        end = time.perf_counter()

        executions[i] = end - start

    small_results['SPBN_' + str(n)] = pd.Series(executions, name="SPBN_" + str(n))
    print("Small " + str(n) + " -- Time: " + str(executions.mean()) + ", std: " + str(np.std(executions, ddof=1)))
from pybnesian.learning.algorithms import GreedyHillClimbing
from pybnesian.learning.algorithms.callbacks import SaveModel
from pybnesian.learning.operators import OperatorPool, ArcOperatorSet, ChangeNodeTypeSet
from pybnesian.learning.scores import ValidatedLikelihood
import pathlib
import os
import experiments_helper

# Learns a semiparametric Bayesian network with greedy hill climbing for
# every (dataset, instance count, patience) combination and stores the
# result under models/<dataset>/<instances>/HillClimbing/SPBN/<patience>.

hc = GreedyHillClimbing()
pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])

for d in experiments_helper.DATASETS:
    for i in experiments_helper.INSTANCES:
        df = pd.read_csv(d + "_" + str(i) + '.csv')
        # The same score object is shared by all patience values of a dataset.
        vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)

        for p in experiments_helper.PATIENCE:
            result_folder = 'models/' + d + '/' + str(i) + '/HillClimbing/SPBN/' + str(p)
            pathlib.Path(result_folder).mkdir(parents=True, exist_ok=True)

            lock_path = result_folder + '/end.lock'
            if os.path.exists(lock_path):
                # A lock file marks a configuration finished by an earlier run.
                continue

            cb_save = SaveModel(result_folder)
            start_model = SemiparametricBN(list(df.columns.values))
            bn = hc.estimate(pool, vl, start_model, callback=cb_save, patience=p)

            # Save the final model under the number following the last pickle
            # found in the folder (file stems are parsed as integers).
            iters = sorted(glob.glob(result_folder + '/*.pickle'))
            last_file = os.path.basename(iters[-1])
            number = int(os.path.splitext(last_file)[0])
            final_name = str(number + 1).zfill(6) + ".pickle"
            bn.save(result_folder + '/' + final_name)
import pandas as pd
from pybnesian import load
from pybnesian.factors import NodeType
from pybnesian.learning.scores import ValidatedLikelihood
from pybnesian.models import SemiparametricBN
import glob

# Prints the training and validation scores of the all-CKDE start model and
# of every model pickled during a hill-climbing run on the 10000-instance
# synthetic dataset.


def _print_cv_scores(score, model):
    """Print the training and validation score of ``model`` under ``score``."""
    print("\tTraining score: " + str(score.score(model)))
    print("\tValidation score: " + str(score.vscore(model)))


df_10000 = pd.read_csv('synthetic_10000.csv')
df_test = pd.read_csv('synthetic_test.csv')

models = sorted(glob.glob('models/10000/HillClimbing/SPBN_CKDE/0/*.pickle'))

vl = ValidatedLikelihood(df_10000, k=10, seed=0)

# Start model: no graph given, every node typed as CKDE.
node_types = [(name, NodeType.CKDE) for name in df_10000.columns.values]
start_model = SemiparametricBN(list(df_10000.columns.values), node_types)

print("Start model")
_print_cv_scores(vl, start_model)
# The model is fitted on the full training data before the test
# log-likelihood is computed.
start_model.fit(df_10000)
print("\tTest score: " + str(start_model.slogl(df_test)))

for m in models:
    bn = load(m)
    print("Model " + m)
    _print_cv_scores(vl, bn)
    bn.fit(df_10000)
# NOTE(review): this fragment starts inside a call whose opening is outside
# the visible source (only its trailing keyword arguments `axis_width` and
# `axis_height` are shown), then opens a new matplotlib figure.
#
# The rest builds three datasets -- `good`, `mid` and `poor` -- by
# concatenating the rows of `atrium11` and `atrium12` selected positionally
# (`iloc`) by the label arrays `labels0_*`, `labels1_*` and `labels2_*`.
# For each dataset a SemiparametricBN over its columns is learned with
# GreedyHillClimbing, using an OperatorPool of ArcOperatorSet and
# ChangeNodeTypeSet and a 10-fold ValidatedLikelihood (no seed is passed).
# `good_model` and `mid_model` are then fitted on their own data; the fit of
# `poor_model` is not visible in this fragment.
axis_width="20.5cm", axis_height="5cm") plt.figure() good = pd.concat( [atrium11.iloc[labels0_11, :], atrium12.iloc[labels0_12, :]]) mid = pd.concat( [atrium11.iloc[labels1_11, :], atrium12.iloc[labels1_12, :]]) poor = pd.concat( [atrium11.iloc[labels2_11, :], atrium12.iloc[labels2_12, :]]) hc = GreedyHillClimbing() pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()]) start_model = SemiparametricBN(list(good.columns.values)) vl = ValidatedLikelihood(good, k=10) good_model = hc.estimate(pool, vl, start_model) good_model.fit(good) hc = GreedyHillClimbing() pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()]) start_model = SemiparametricBN(list(mid.columns.values)) vl = ValidatedLikelihood(mid, k=10) mid_model = hc.estimate(pool, vl, start_model) mid_model.fit(mid) hc = GreedyHillClimbing() pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()]) start_model = SemiparametricBN(list(poor.columns.values)) vl = ValidatedLikelihood(poor, k=10) poor_model = hc.estimate(pool, vl, start_model)
def test_pc_time(df, n_exec):
    """Time PC graph learning and the following node-type search.

    Four measurements are taken, each repeated ``n_exec`` times:

    * ``PC-LC-Graph``: ``PC.estimate`` with a ``LinearCorrelation`` test.
    * ``PC-LC-NodeType``: hill climbing over node types on the DAG obtained
      from the last linear-correlation graph.
    * ``PC-RCoT-Graph``: ``PC.estimate`` with an ``RCoT`` test.
    * ``PC-RCoT-NodeType``: hill climbing over node types on the DAG obtained
      from the last RCoT graph.

    The mean of every measurement is printed as it completes.

    Args:
        df: Dataset passed to the independence tests and to the score.
        n_exec: Number of repetitions of each measurement (at least 1).

    Returns:
        pandas.DataFrame: One column per measurement (named as listed above)
        holding the ``n_exec`` durations in seconds.

    Raises:
        ValueError: If ``n_exec`` is smaller than 1; without any run there is
            no learned graph to continue with.
    """
    if n_exec < 1:
        raise ValueError("n_exec must be at least 1, got " + str(n_exec))

    local_results = pd.DataFrame()
    pc = PC()

    lc = LinearCorrelation(df)
    executions, graph_lc = _time_pc_estimate(pc, lc, n_exec)
    local_results['PC-LC-Graph'] = pd.Series(executions, name="PC-LC-Graph")
    print("LC Graph Time: " + str(executions.mean()))

    executions = _time_node_type_search(df, _pdag_to_dag(graph_lc), n_exec)
    local_results['PC-LC-NodeType'] = pd.Series(executions, name="PC-LC-NodeType")
    print("LC HC NodeType Time: " + str(executions.mean()))

    rcot = RCoT(df)
    executions, graph_rcot = _time_pc_estimate(pc, rcot, n_exec)
    local_results['PC-RCoT-Graph'] = pd.Series(executions, name="PC-RCoT-Graph")
    print("RCoT Graph Time: " + str(executions.mean()))

    # The DAG does not change between repetitions, so it is built once here,
    # exactly as for the linear-correlation graph.
    executions = _time_node_type_search(df, _pdag_to_dag(graph_rcot), n_exec)
    local_results['PC-RCoT-NodeType'] = pd.Series(executions, name="PC-RCoT-NodeType")
    print("RCoT HC NodeType Time: " + str(executions.mean()))

    return local_results


def _pdag_to_dag(pdag):
    """Return ``pdag.to_dag()``, or ``to_approximate_dag()`` on ValueError."""
    try:
        return pdag.to_dag()
    except ValueError:
        return pdag.to_approximate_dag()


def _time_pc_estimate(pc, independence_test, n_exec):
    """Time ``pc.estimate(independence_test)``; return (durations, last graph)."""
    durations = np.empty((n_exec,))
    graph = None
    for i in range(n_exec):
        # perf_counter() is monotonic, unlike the adjustable wall clock.
        start = time.perf_counter()
        graph = pc.estimate(independence_test)
        durations[i] = time.perf_counter() - start
    return durations, graph


def _time_node_type_search(df, dag, n_exec):
    """Time a node-type-only hill climbing search on ``dag``; return durations."""
    durations = np.empty((n_exec,))
    for i in range(n_exec):
        # Fresh search objects for every repetition, built outside the timed
        # region so only the estimate() call is measured.
        hc = GreedyHillClimbing()
        change_node_type = ChangeNodeTypeSet()
        vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)
        start_model = SemiparametricBN(dag)

        start = time.perf_counter()
        hc.estimate(change_node_type, vl, start_model, patience=0)
        durations[i] = time.perf_counter() - start
    return durations