Exemple #1
0
def run_validation_spbn(train_data, folds, patience, result_folder, idx_fold):
    """Learn an SPBN by hill climbing for every (k, patience) combination.

    For each ``k`` in ``folds`` a ``ValidatedLikelihood`` score is built on
    ``train_data``; for each ``p`` in ``patience`` a greedy hill climbing is
    run from a model whose nodes are all typed ``NodeType.CKDE``, using both
    arc operators and node-type changes.

    Output goes to
    ``<result_folder>/HillClimbing/SPBN_CKDE/Validation_<k>_<p>/<idx_fold>``.
    A combination whose folder already contains ``end.lock`` is skipped, so
    an interrupted experiment can be re-launched.

    Args:
        train_data: data set; its ``columns.values`` give the node names.
        folds: iterable of ``k`` values handed to ``ValidatedLikelihood``.
        patience: iterable of patience values handed to ``hc.estimate``.
        result_folder: base output directory (string).
        idx_fold: identifier used as the last path component.
    """
    climber = GreedyHillClimbing()
    operators = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])

    for k in folds:
        # NOTE(review): the seed is the literal 0 here while the sibling
        # functions use experiments_helper.SEED -- confirm this is intended.
        score = ValidatedLikelihood(train_data, k=k, seed=0)

        for p in patience:
            out_dir = '/'.join([
                result_folder,
                'HillClimbing',
                'SPBN_CKDE',
                'Validation_' + str(k) + '_' + str(p),
                str(idx_fold),
            ])
            pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

            lock_path = out_dir + '/end.lock'
            if os.path.exists(lock_path):
                # This combination was completed by a previous run.
                continue

            saver = SaveModel(out_dir)
            columns = list(train_data.columns.values)
            initial_types = [(column, NodeType.CKDE) for column in columns]
            initial_model = SemiparametricBN(columns, initial_types)

            learned = climber.estimate(operators,
                                       score,
                                       initial_model,
                                       callback=saver,
                                       patience=p,
                                       verbose=True)

            # The final model is stored under the index following the highest
            # numbered pickle already present in the folder.
            saved = sorted(glob.glob(out_dir + '/*.pickle'))
            newest_name = os.path.basename(saved[-1])
            newest_index = int(os.path.splitext(newest_name)[0])
            learned.save(out_dir + '/' + str(newest_index + 1).zfill(6) +
                         ".pickle")

            # An empty lock file marks the combination as finished.
            open(lock_path, 'w').close()
def find_node_types(df, dag, model_folder, type_of_dag_string, patience):
    """Search node types on a fixed graph for every patience value.

    A ``ValidatedLikelihood`` score (``k=10``, seed
    ``experiments_helper.SEED``) is built once on ``df``. For each ``p`` in
    ``patience`` a greedy hill climbing restricted to ``ChangeNodeTypeSet``
    is run from ``SemiparametricBN(dag, ...)`` with every column of ``df``
    typed ``NodeType.CKDE``.

    Output goes to ``<model_folder>/PC/SPBN_CKDE/<type_of_dag_string>/<p>``.
    A patience value whose folder already contains ``end.lock`` is skipped.

    Args:
        df: data set; its ``columns.values`` give the node names.
        dag: graph passed to ``SemiparametricBN`` as the starting structure.
        model_folder: base output directory (string).
        type_of_dag_string: path component identifying the graph variant.
        patience: iterable of patience values handed to ``hc.estimate``.
    """
    score = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)

    climber = GreedyHillClimbing()
    type_operators = ChangeNodeTypeSet()

    for p in patience:
        out_dir = '/'.join(
            [model_folder, 'PC', 'SPBN_CKDE', type_of_dag_string, str(p)])
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

        lock_path = out_dir + '/end.lock'
        if os.path.exists(lock_path):
            # This patience value was completed by a previous run.
            continue

        saver = SaveModel(out_dir)
        initial_types = [(column, NodeType.CKDE)
                         for column in df.columns.values]
        initial_model = SemiparametricBN(dag, initial_types)

        learned = climber.estimate(type_operators,
                                   score,
                                   initial_model,
                                   callback=saver,
                                   patience=p)

        # The final model is stored under the index following the highest
        # numbered pickle already present in the folder.
        saved = sorted(glob.glob(out_dir + '/*.pickle'))
        newest_name = os.path.basename(saved[-1])
        newest_index = int(os.path.splitext(newest_name)[0])
        learned.save(out_dir + '/' + str(newest_index + 1).zfill(6) +
                     ".pickle")

        # An empty lock file marks the patience value as finished.
        open(lock_path, 'w').close()
def run_pc_lc_spbn(train_data, folds, patience, result_folder, idx_fold):
    """Search node types on the stored PC graph for every (k, patience) pair.

    The graph is loaded from
    ``<result_folder>/PC/graph-lc-<idx_fold>.pickle`` and converted with
    ``to_dag()``; if that raises ``ValueError`` the result of
    ``to_approximate_dag()`` is used instead. For each ``k`` in ``folds`` and
    ``p`` in ``patience`` a greedy hill climbing restricted to
    ``ChangeNodeTypeSet`` is run from that DAG with all nodes typed
    ``NodeType.CKDE``.

    Output goes to ``<result_folder>/PC/SPBN_CKDE/LinearCorrelation/``
    ``Validation_<k>_<p>/<idx_fold>``. A combination whose folder already
    contains ``end.lock`` is skipped.

    Args:
        train_data: data set handed to ``ValidatedLikelihood``.
        folds: iterable of ``k`` values handed to ``ValidatedLikelihood``.
        patience: iterable of patience values handed to ``hc.estimate``.
        result_folder: base directory for both the input graph and outputs.
        idx_fold: identifier used in the graph file name and output path.
    """
    climber = GreedyHillClimbing()
    type_operators = ChangeNodeTypeSet()

    graph_file = '/'.join(
        [result_folder, 'PC', 'graph-lc-' + str(idx_fold) + ".pickle"])
    pdag = load(graph_file)

    try:
        dag = pdag.to_dag()
    except ValueError:
        # Fall back to the approximate conversion when to_dag() fails.
        dag = pdag.to_approximate_dag()

    for k in folds:
        score = ValidatedLikelihood(train_data,
                                    k=k,
                                    seed=experiments_helper.SEED)

        for p in patience:
            out_dir = '/'.join([
                result_folder,
                'PC',
                'SPBN_CKDE',
                'LinearCorrelation',
                'Validation_' + str(k) + '_' + str(p),
                str(idx_fold),
            ])
            pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

            lock_path = out_dir + '/end.lock'
            if os.path.exists(lock_path):
                # This combination was completed by a previous run.
                continue

            saver = SaveModel(out_dir)

            initial_types = [(node, NodeType.CKDE) for node in dag.nodes()]
            initial_model = SemiparametricBN(dag, initial_types)

            learned = climber.estimate(type_operators,
                                       score,
                                       initial_model,
                                       callback=saver,
                                       patience=p,
                                       verbose=True)

            # The final model is stored under the index following the highest
            # numbered pickle already present in the folder.
            saved = sorted(glob.glob(out_dir + '/*.pickle'))
            newest_name = os.path.basename(saved[-1])
            newest_index = int(os.path.splitext(newest_name)[0])
            learned.save(out_dir + '/' + str(newest_index + 1).zfill(6) +
                         ".pickle")

            # An empty lock file marks the combination as finished.
            open(lock_path, 'w').close()
# Timing tables. Only small_results is filled in the visible part of this
# script; medium_results / large_results are presumably filled the same way
# further down -- TODO confirm.
small_results = pd.DataFrame()
medium_results = pd.DataFrame()
large_results = pd.DataFrame()

for n in experiments_helper.INSTANCES:
    # --- "small" data set with n instances ---
    df = pd.read_csv('data/small_' + str(n) + ".csv")

    # One elapsed-time entry (seconds) per repetition.
    executions = np.empty((800, ))
    for i in range(800):
        # Progress message every 10 repetitions.
        if i % 10 == 0:
            print(str(i) + " executions")
        # The repetition index is used as the seed, so every repetition gets
        # a differently seeded score. All objects are rebuilt outside the
        # timed region so only hc.estimate() is measured.
        vl = ValidatedLikelihood(df, k=10, seed=i)
        start_model = SemiparametricBN(list(df.columns.values))
        hc = GreedyHillClimbing()
        pool = OperatorPool([ArcOperatorSet(), ChangeNodeTypeSet()])

        # Timed region: the hill-climbing search only.
        start = time.time()
        bn = hc.estimate(pool, vl, start_model, patience=0)
        end = time.time()

        executions[i] = end - start

    # Store the 800 timings as column "SPBN_<n>" and report the mean and the
    # sample standard deviation (ddof=1).
    small_results['SPBN_' + str(n)] = pd.Series(executions,
                                                name="SPBN_" + str(n))
    print("Small " + str(n) + " -- Time: " + str(executions.mean()) +
          ", std: " + str(np.std(executions, ddof=1)))

    # --- "medium" data set with n instances (200 repetitions) ---
    df = pd.read_csv('data/medium_' + str(n) + ".csv")

    executions = np.empty((200, ))
Exemple #5
0
import glob
import pandas as pd
from pybnesian import load
from pybnesian.factors import NodeType
from pybnesian.models import SemiparametricBN
from pybnesian.learning.algorithms import GreedyHillClimbing
from pybnesian.learning.algorithms.callbacks import SaveModel
from pybnesian.learning.operators import OperatorPool, ArcOperatorSet, ChangeNodeTypeSet
from pybnesian.learning.scores import ValidatedLikelihood
import pathlib
import os
import experiments_helper

hc = GreedyHillClimbing()
change_node = ChangeNodeTypeSet()

for d in experiments_helper.DATASETS:
    for i in experiments_helper.INSTANCES:
        df = pd.read_csv(d + "_" + str(i) + '.csv')

        pdag_lc = load('models/' + d + '/' + str(i) + '/PC/graph-lc.pickle')

        try:
            dag_lc = pdag_lc.to_dag()
        except ValueError:
            dag_lc = pdag_lc.to_approximate_dag()

        vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)

        for p in experiments_helper.PATIENCE:
            result_folder = 'models/' + d + '/' + str(
def test_pc_time(df, n_exec):
    """Time PC graph learning and node-type hill climbing on ``df``.

    Four measurements are taken, each repeated ``n_exec`` times:

    * ``PC-LC-Graph``: ``PC().estimate`` with a ``LinearCorrelation`` test.
    * ``PC-LC-NodeType``: hill climbing over ``ChangeNodeTypeSet`` starting
      from the DAG obtained from the last LC graph.
    * ``PC-RCoT-Graph``: ``PC().estimate`` with an ``RCoT`` test.
    * ``PC-RCoT-NodeType``: the same hill climbing on the RCoT DAG.

    Only the ``estimate`` calls are inside the timed regions; building the
    score, operators and start model is excluded.

    Args:
        df: data set handed to ``LinearCorrelation``, ``RCoT`` and
            ``ValidatedLikelihood``.
        n_exec: number of repetitions per measurement; must be at least 1.

    Returns:
        A ``pandas.DataFrame`` with the four columns listed above and one row
        per repetition, holding elapsed seconds.

    Raises:
        ValueError: if ``n_exec`` is smaller than 1 (no graph would be
            available for the node-type measurements).
    """
    if n_exec < 1:
        raise ValueError("n_exec must be at least 1, got " + str(n_exec))

    def to_dag(pdag):
        # Use the approximate conversion when the exact one fails.
        try:
            return pdag.to_dag()
        except ValueError:
            return pdag.to_approximate_dag()

    def time_graph_learning(independence_test):
        # Returns the last learned graph and a fresh array of timings.
        timings = np.empty((n_exec,))
        graph = None
        for i in range(n_exec):
            start = time.perf_counter()
            graph = pc.estimate(independence_test)
            end = time.perf_counter()
            timings[i] = end - start
        return graph, timings

    def time_node_type_search(dag):
        # Returns a fresh array of timings for the node-type hill climbing.
        timings = np.empty((n_exec,))
        for i in range(n_exec):
            # Rebuilt on every repetition, outside the timed region, so each
            # run starts from freshly constructed objects.
            hc = GreedyHillClimbing()
            change_node_type = ChangeNodeTypeSet()
            vl = ValidatedLikelihood(df, k=10, seed=experiments_helper.SEED)
            start_model = SemiparametricBN(dag)

            start = time.perf_counter()
            hc.estimate(change_node_type, vl, start_model, patience=0)
            end = time.perf_counter()
            timings[i] = end - start
        return timings

    local_results = pd.DataFrame()

    pc = PC()
    lc = LinearCorrelation(df)

    graph_lc, executions = time_graph_learning(lc)
    local_results['PC-LC-Graph'] = pd.Series(executions, name="PC-LC-Graph")
    print("LC Graph Time: " + str(executions.mean()))

    executions = time_node_type_search(to_dag(graph_lc))
    local_results['PC-LC-NodeType'] = pd.Series(executions,
                                                name="PC-LC-NodeType")
    print("LC HC NodeType Time: " + str(executions.mean()))

    rcot = RCoT(df)

    graph_rcot, executions = time_graph_learning(rcot)
    local_results['PC-RCoT-Graph'] = pd.Series(executions,
                                               name="PC-RCoT-Graph")
    print("RCoT Graph Time: " + str(executions.mean()))

    # The DAG conversion does not depend on the repetition, so it is done
    # once, as for the LC graph.
    executions = time_node_type_search(to_dag(graph_rcot))
    local_results['PC-RCoT-NodeType'] = pd.Series(executions,
                                                  name="PC-RCoT-NodeType")
    print("RCoT HC NodeType Time: " + str(executions.mean()))

    return local_results