def run_validation_spbn(train_data, folds, patience, result_folder, idx_fold):
    """Learn semiparametric BNs by greedy hill climbing over arcs and node types.

    For every cross-validation fold count in ``folds`` and every patience value
    in ``patience``, learns an SPBN starting from an all-CKDE model, saving
    intermediate checkpoints to disk. A run whose output folder contains
    ``end.lock`` is assumed finished and is skipped, making the sweep resumable.

    Parameters:
        train_data: training DataFrame (one column per network node).
        folds: iterable of k values for the validated-likelihood score.
        patience: iterable of hill-climbing patience values.
        result_folder: root folder for results.
        idx_fold: index of the outer CV fold (used in the output path).
    """
    hc = GreedyHillClimbing()
    pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])
    # Invariant across all (k, p) combinations: every node starts as CKDE.
    column_names = list(train_data.columns.values)
    node_types = [(name, NodeType.CKDE) for name in column_names]
    for k in folds:
        # FIX: use the shared experiment seed instead of a hard-coded 0, so the
        # validation split is consistent with the other learning routines in
        # this file (find_node_types, run_pc_lc_spbn) that all use
        # experiments_helper.SEED.
        vl = ValidatedLikelihood(train_data, k=k, seed=experiments_helper.SEED)
        for p in patience:
            fold_folder = result_folder + '/HillClimbing/SPBN_CKDE/Validation_' + str(
                k) + '_' + str(p) + '/' + str(idx_fold)
            pathlib.Path(fold_folder).mkdir(parents=True, exist_ok=True)
            # 'end.lock' marks a completed run: skip so re-runs only do missing work.
            if os.path.exists(fold_folder + '/end.lock'):
                continue
            cb_save = SaveModel(fold_folder)
            start_model = SemiparametricBN(column_names, node_types)
            bn = hc.estimate(pool, vl, start_model, callback=cb_save,
                             patience=p, verbose=True)
            # Save the final model one index past the last checkpoint written
            # by the SaveModel callback (checkpoints are zero-padded numbers).
            iters = sorted(glob.glob(fold_folder + '/*.pickle'))
            last_file = os.path.basename(iters[-1])
            number = int(os.path.splitext(last_file)[0])
            bn.save(fold_folder + '/' + str(number + 1).zfill(6) + ".pickle")
            # Empty lock file signals completion for future invocations.
            with open(fold_folder + '/end.lock', 'w') as f:
                pass
def find_node_types(df, dag, model_folder, type_of_dag_string, patience):
    """Search node types for a fixed DAG via greedy hill climbing.

    For each patience value, runs a hill-climbing search restricted to
    node-type changes (the arcs of ``dag`` stay fixed), starting from an
    all-CKDE semiparametric network. Checkpoints go to disk and an
    ``end.lock`` file marks finished runs so they are skipped on re-entry.
    """
    score = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)
    climber = GreedyHillClimbing()
    type_ops = ChangeNodeTypeSet()
    for pat in patience:
        out_dir = f"{model_folder}/PC/SPBN_CKDE/{type_of_dag_string}/{pat}"
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
        # A lock file means this configuration already completed: skip it.
        if os.path.exists(out_dir + '/end.lock'):
            continue
        saver = SaveModel(out_dir)
        ckde_types = [(col, NodeType.CKDE) for col in df.columns.values]
        initial = SemiparametricBN(dag, ckde_types)
        learned = climber.estimate(type_ops, score, initial,
                                   callback=saver, patience=pat)
        # Persist the final model one index past the newest checkpoint.
        checkpoints = sorted(glob.glob(out_dir + '/*.pickle'))
        newest = os.path.basename(checkpoints[-1])
        idx = int(os.path.splitext(newest)[0])
        learned.save(out_dir + '/' + f"{idx + 1:06d}" + ".pickle")
        with open(out_dir + '/end.lock', 'w'):
            pass
def run_pc_lc_spbn(train_data, folds, patience, result_folder, idx_fold):
    """Learn node types for the PC (LinearCorrelation) graph of one CV fold.

    Loads the PDAG found by PC for fold ``idx_fold``, extends it to a DAG
    (falling back to an approximate extension when no exact one exists) and,
    for every (k, patience) combination, hill-climbs over node types from an
    all-CKDE model. Runs whose folder contains ``end.lock`` are skipped.
    """
    climber = GreedyHillClimbing()
    type_ops = ChangeNodeTypeSet()
    pdag = load(f"{result_folder}/PC/graph-lc-{idx_fold}.pickle")
    try:
        dag = pdag.to_dag()
    except ValueError:
        dag = pdag.to_approximate_dag()
    for k in folds:
        score = ValidatedLikelihood(train_data, k=k,
                                    seed=experiments_helper.SEED)
        for pat in patience:
            out_dir = (f"{result_folder}/PC/SPBN_CKDE/LinearCorrelation/"
                       f"Validation_{k}_{pat}/{idx_fold}")
            pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
            # A lock file means this configuration already completed: skip it.
            if os.path.exists(out_dir + '/end.lock'):
                continue
            saver = SaveModel(out_dir)
            ckde_types = [(node, NodeType.CKDE) for node in dag.nodes()]
            initial = SemiparametricBN(dag, ckde_types)
            learned = climber.estimate(type_ops, score, initial,
                                       callback=saver, patience=pat,
                                       verbose=True)
            # Persist the final model one index past the newest checkpoint.
            checkpoints = sorted(glob.glob(out_dir + '/*.pickle'))
            idx = int(os.path.splitext(os.path.basename(checkpoints[-1]))[0])
            learned.save(out_dir + '/' + f"{idx + 1:06d}" + ".pickle")
            with open(out_dir + '/end.lock', 'w'):
                pass
# Timing benchmark: learn an SPBN from scratch with greedy hill climbing
# (arcs + node types) on datasets of increasing size, recording per-run
# wall-clock seconds. NOTE(review): this chunk ends mid-script — the
# 'medium' benchmark loop presumably continues below the visible region.
small_results = pd.DataFrame()
medium_results = pd.DataFrame()
large_results = pd.DataFrame()
for n in experiments_helper.INSTANCES:
    # --- small datasets: 800 timed repetitions each ---
    df = pd.read_csv('data/small_' + str(n) + ".csv")
    executions = np.empty((800, ))
    for i in range(800):
        # Progress indicator every 10 runs.
        if i % 10 == 0:
            print(str(i) + " executions")
        # Fresh score/model/search objects each repetition; seed=i varies the
        # validation split per run. Setup is NOT timed — only hc.estimate is.
        vl = ValidatedLikelihood(df, k=10, seed=i)
        start_model = SemiparametricBN(list(df.columns.values))
        hc = GreedyHillClimbing()
        pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])
        start = time.time()
        bn = hc.estimate(pool, vl, start_model, patience=0)
        end = time.time()
        executions[i] = end - start
    small_results['SPBN_' + str(n)] = pd.Series(executions,
                                                name="SPBN_" + str(n))
    # Sample standard deviation (ddof=1) over the 800 runs.
    print("Small " + str(n) + " -- Time: " + str(executions.mean()) +
          ", std: " + str(np.std(executions, ddof=1)))
    # --- medium datasets: 200 repetitions (loop continues past this chunk) ---
    df = pd.read_csv('data/medium_' + str(n) + ".csv")
    executions = np.empty((200, ))
import glob import pandas as pd from pybnesian import load from pybnesian.factors import NodeType from pybnesian.models import SemiparametricBN from pybnesian.learning.algorithms import GreedyHillClimbing from pybnesian.learning.algorithms.callbacks import SaveModel from pybnesian.learning.operators import OperatorPool, ArcOperatorSet, ChangeNodeTypeSet from pybnesian.learning.scores import ValidatedLikelihood import pathlib import os import experiments_helper hc = GreedyHillClimbing() change_node = ChangeNodeTypeSet() for d in experiments_helper.DATASETS: for i in experiments_helper.INSTANCES: df = pd.read_csv(d + "_" + str(i) + '.csv') pdag_lc = load('models/' + d + '/' + str(i) + '/PC/graph-lc.pickle') try: dag_lc = pdag_lc.to_dag() except ValueError: dag_lc = pdag_lc.to_approximate_dag() vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED) for p in experiments_helper.PATIENCE: result_folder = 'models/' + d + '/' + str(
def test_pc_time(df, n_exec):
    """Benchmark PC-based structure learning on ``df``.

    Times four stages, ``n_exec`` repetitions each:
      1. PC graph search with a LinearCorrelation independence test,
      2. node-type hill climbing on the resulting DAG,
      3. PC graph search with an RCoT independence test,
      4. node-type hill climbing on that DAG.

    Only the code between the ``time.time()`` pairs is measured; score/model
    construction is excluded from the timings.

    Returns:
        DataFrame with one column of per-run wall-clock seconds per stage.
    """
    local_results = pd.DataFrame()
    pc = PC()

    # --- 1) PC graph, LinearCorrelation test ---
    lc = LinearCorrelation(df)
    executions = np.empty((n_exec,))
    for i in range(n_exec):
        start = time.time()
        graph_lc = pc.estimate(lc)
        end = time.time()
        executions[i] = end - start
    local_results['PC-LC-Graph'] = pd.Series(executions, name="PC-LC-Graph")
    print("LC Graph Time: " + str(executions.mean()))

    # PDAG -> DAG; fall back to an approximate extension when no exact one exists.
    try:
        dag = graph_lc.to_dag()
    except ValueError:
        dag = graph_lc.to_approximate_dag()

    # --- 2) node-type hill climbing on the LC DAG ---
    executions = np.empty((n_exec,))
    for i in range(n_exec):
        # Fresh search objects each repetition so no state carries over;
        # none of this setup is inside the timed region.
        hc = GreedyHillClimbing()
        change_node_type = ChangeNodeTypeSet()
        vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)
        start_model = SemiparametricBN(dag)
        start = time.time()
        bn = hc.estimate(change_node_type, vl, start_model, patience=0)
        end = time.time()
        executions[i] = end - start
    local_results['PC-LC-NodeType'] = pd.Series(executions, name="PC-LC-NodeType")
    print("LC HC NodeType Time: " + str(executions.mean()))

    # --- 3) PC graph, RCoT test --- (reuses the executions buffer; it is
    # fully overwritten each stage)
    rcot = RCoT(df)
    for i in range(n_exec):
        start = time.time()
        graph_rcot = pc.estimate(rcot)
        end = time.time()
        executions[i] = end - start
    local_results['PC-RCoT-Graph'] = pd.Series(executions, name="PC-RCoT-Graph")
    print("RCoT Graph Time: " + str(executions.mean()))

    # FIX: the PDAG->DAG conversion is loop-invariant, so hoist it out of the
    # repetition loop (mirroring the LC section above). It sat outside the
    # timed region before, so the reported numbers are unchanged.
    try:
        dag = graph_rcot.to_dag()
    except ValueError:
        dag = graph_rcot.to_approximate_dag()

    # --- 4) node-type hill climbing on the RCoT DAG ---
    for i in range(n_exec):
        hc = GreedyHillClimbing()
        change_node_type = ChangeNodeTypeSet()
        vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)
        start_model = SemiparametricBN(dag)
        start = time.time()
        bn = hc.estimate(change_node_type, vl, start_model, patience=0)
        end = time.time()
        executions[i] = end - start
    local_results['PC-RCoT-NodeType'] = pd.Series(executions, name="PC-RCoT-NodeType")
    print("RCoT HC NodeType Time: " + str(executions.mean()))

    return local_results