Exemple #1
0
def hillclimbsearch(df,
                    scoretype='bic',
                    black_list=None,
                    white_list=None,
                    max_indegree=None,
                    verbose=3):
    """Learn a DAG from data using hill-climb search.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations, one column per variable.
    scoretype : str
        Scoring metric name passed to SetScoringType (e.g. 'bic').
    black_list : list of edge tuples, optional
        Edges that may not appear in the learned DAG.
    white_list : list of edge tuples, optional
        If given, only these edges are considered.
    max_indegree : int, optional
        Maximum number of parents per node.
    verbose : int
        Verbosity level (not used in this function).

    Returns
    -------
    dict
        'model': the estimated DAG; 'model_edges': its edge list.
    """
    out = dict()
    # Scoring metric for the search.
    scoring_method = SetScoringType(df, scoretype)
    # Search algorithm.
    model = HillClimbSearch(df, scoring_method=scoring_method)
    # Compute the best DAG.
    try:
        # black_list/white_list keywords exist only in pgmpy > v0.1.9.
        best_model = model.estimate(max_indegree=max_indegree,
                                    black_list=black_list,
                                    white_list=white_list)
    except TypeError:
        # Older pgmpy (<= v0.1.9): estimate() rejects the black_list /
        # white_list keywords, so retry without them. Narrowed from the
        # original bare `except:` which silently hid unrelated errors.
        best_model = model.estimate(max_indegree=max_indegree)

    # Store results
    out['model'] = best_model
    out['model_edges'] = best_model.edges()
    return out
Exemple #2
0
def _hillclimbsearch(df,
                     scoretype='bic',
                     black_list=None,
                     white_list=None,
                     max_indegree=None,
                     verbose=3):
    """Learn a DAG from data using hill-climb search (internal variant).

    Same contract as hillclimbsearch(): returns a dict with 'model'
    (the estimated DAG) and 'model_edges' (its edge list). Additionally
    reports at verbose >= 3 when edge restrictions are in effect.
    """
    out = dict()
    # Scoring metric for the search.
    scoring_method = _SetScoringType(df, scoretype)
    # Search algorithm.
    model = HillClimbSearch(df, scoring_method=scoring_method)
    # Compute the best DAG.
    try:
        if ((black_list is not None) or (white_list is not None)):
            if verbose >= 3:
                print(
                    '[BNLEARN][STRUCTURE LEARNING] black_list and/or white_list are incorporated..'
                )
        # black_list/white_list keywords exist only in pgmpy > v0.1.9.
        best_model = model.estimate(max_indegree=max_indegree,
                                    black_list=black_list,
                                    white_list=white_list)
    except TypeError:
        # Older pgmpy (<= v0.1.9): estimate() rejects the black_list /
        # white_list keywords, so retry without them. Narrowed from the
        # original bare `except:` which silently hid unrelated errors.
        best_model = model.estimate(max_indegree=max_indegree)

    # Store results
    out['model'] = best_model
    out['model_edges'] = best_model.edges()
    return out
def generateDiffAndRarityModel(h0Diff, h0Rarity):
    """Estimate Bayesian-network structures for the difference and rarity
    feature sets and show their correlation heatmaps.

    Parameters
    ----------
    h0Diff, h0Rarity : 2-D arrays with 9 feature columns each.

    Returns
    -------
    (diffModel, rarityModel) : the two DAGs found by hill-climb search.
    """
    # Correlation matrices (columns are the variables, hence rowvar=False).
    h0DiffCorrelation = np.corrcoef(h0Diff, rowvar=False)
    h0RarityCorrelation = np.corrcoef(h0Rarity, rowvar=False)

    # Labelled data frames for pgmpy.
    h0Diff = pd.DataFrame(h0Diff, columns=['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9'])
    h0Rarity = pd.DataFrame(h0Rarity, columns=['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8', 'r9'])

    print('\nestimating PGM\n')
    # Hill-climb structure search on the difference features.
    hc = HillClimbSearch(h0Diff)
    diffModel = hc.estimate(max_indegree=40)
    print('difference model:\n', diffModel.edges())

    print('\nplotting heatmap for h0Diff correlation\n')
    sns.heatmap(h0DiffCorrelation, annot=True)
    plt.show()

    # Hill-climb structure search on the rarity features.
    hc = HillClimbSearch(h0Rarity)
    rarityModel = hc.estimate(max_indegree=20)
    print('rarity model:\n', rarityModel.edges())

    print('\nplotting heatmap for h0Rarity correlation\n')
    sns.heatmap(h0RarityCorrelation, annot=True)
    plt.show()

    # BUG FIX: the original returned the undefined names h0DiffModel /
    # h0RarityModel (NameError); return the models actually estimated.
    return diffModel, rarityModel
    def pgm_generate(self, target, data, pgm_stats, subnodes, child=None):
        """Construct and fit a Bayesian-network explanation around `target`.

        Parameters
        ----------
        target : node id coercible to int; the node being explained.
        data : pandas.DataFrame of observations; columns are node ids.
        pgm_stats : unused here (kept for interface compatibility).
        subnodes : iterable of node ids to include in the explanation.
        child : if None, the structure is learned without the target and
            the target's Markov-blanket nodes are wired to it afterwards;
            otherwise the structure is learned on the generalized data
            with the target included.

        Returns
        -------
        A fitted BayesianModel over the (generalized) subnode data.
        """
        # Normalise every node id to its string form.
        subnodes = [str(int(node)) for node in subnodes]
        target = str(int(target))
        subnodes_no_target = [node for node in subnodes if node != target]
        data.columns = data.columns.astype(str)

        # Markov blanket of the target among the remaining subnodes.
        MK_blanket = self.search_MK(data, target, subnodes_no_target.copy())

        if child is None:  # bug fix: compare to None with 'is', not '=='
            # Learn structure over everything except the target...
            est = HillClimbSearch(data[subnodes_no_target],
                                  scoring_method=BicScore(data))
            pgm_no_target = est.estimate()
            # ...then connect each Markov-blanket node to the target.
            for node in MK_blanket:
                if node != target:
                    pgm_no_target.add_edge(node, target)

            # Copy the learned skeleton into a fresh BayesianModel.
            pgm_explanation = BayesianModel()
            for node in pgm_no_target.nodes():
                pgm_explanation.add_node(node)
            for edge in pgm_no_target.edges():
                pgm_explanation.add_edge(edge[0], edge[1])

            # Fit CPDs on generalized (discretized) data.
            data_ex = data[subnodes].copy()
            data_ex[target] = data[target].apply(self.generalize_target)
            for node in subnodes_no_target:
                data_ex[node] = data[node].apply(self.generalize_others)
            pgm_explanation.fit(data_ex)
        else:
            # Generalize first, then learn the structure with the target
            # included in the data.
            data_ex = data[subnodes].copy()
            data_ex[target] = data[target].apply(self.generalize_target)
            for node in subnodes_no_target:
                data_ex[node] = data[node].apply(self.generalize_others)

            est = HillClimbSearch(data_ex, scoring_method=BicScore(data_ex))
            pgm_w_target_explanation = est.estimate()

            # Copy the learned skeleton into a fresh BayesianModel.
            pgm_explanation = BayesianModel()
            for node in pgm_w_target_explanation.nodes():
                pgm_explanation.add_node(node)
            for edge in pgm_w_target_explanation.edges():
                pgm_explanation.add_edge(edge[0], edge[1])

            # Fit CPDs on the same generalized data (rebuilt here exactly
            # as the original did; estimate() does not modify data_ex).
            data_ex = data[subnodes].copy()
            data_ex[target] = data[target].apply(self.generalize_target)
            for node in subnodes_no_target:
                data_ex[node] = data[node].apply(self.generalize_others)
            pgm_explanation.fit(data_ex)

        return pgm_explanation
class TimeHillClimbAlarmModel:
    """asv-style benchmark: time HillClimbSearch on data simulated from
    the 'alarm' example network."""

    # Benchmark timeout, in seconds.
    timeout = 600.0

    def setup(self):
        # Draw 10k samples from the reference 'alarm' model; the fixed
        # seed keeps the benchmark deterministic across runs.
        alarm = get_example_model('alarm')
        samples = alarm.simulate(n_samples=int(1e4),
                                 seed=42,
                                 show_progress=False)
        self.scoring_method = K2Score(samples)
        self.est = HillClimbSearch(data=samples)

    def time_hillclimb(self):
        # The timed operation: K2-scored structure search with bounded
        # in-degree and a capped iteration budget.
        self.est.estimate(max_indegree=4,
                          scoring_method=self.scoring_method,
                          max_iter=int(1e4))
Exemple #6
0
def create_BN_model(data):
    """Learn a Bayesian network from `data` with hill-climb search,
    fit its CPDs by maximum likelihood, write it to 'model_pgmpy.bif',
    and return (model, elapsed_time).
    """
    print("Structure learning")
    start_time = datetime.now()
    print("Start time: ", start_time)

    # Hill-climb structure search over the data.
    searcher = HillClimbSearch(data)
    best_model = searcher.estimate()
    print(best_model.edges())

    # Build the Bayesian network from the learned edges.
    model = BayesianModel(best_model.edges())

    print('Fitting the model...')
    # Parameter estimation via Maximum Likelihood.
    model.fit(data)

    end_time = datetime.now()
    print("End time: ", end_time)

    # Persist the fitted network in BIF format.
    BIFWriter(model).write_bif('model_pgmpy.bif')

    if model.check_model():
        print(
            "Your network structure and CPD's are correctly defined. The probabilities in the columns sum to 1. Hill Climb worked fine!"
        )
    else:
        print("not good")
    return (model, end_time - start_time)
Exemple #7
0
    def pgm_generate(self, target, data, stats, subnodes):
        """Explain `target` with a Bayesian network over `subnodes`.

        Nodes whose p-value in `stats` is below 0.05 form the Markov
        blanket and are wired directly to the target after a BIC-scored
        hill-climb search over the remaining subnodes. The resulting
        model is fitted on generalized (discretized) data.
        """
        # Markov-blanket candidates: p-value < 0.05 and present in subnodes.
        pvalues = pd.Series(stats, name='p-values')
        significant = pvalues[pvalues < 0.05]
        MK_blanket = [node for node in significant.index if node in subnodes]

        others = [node for node in subnodes if node != target]

        # Structure search without the target, then attach the blanket.
        searcher = HillClimbSearch(data[others], scoring_method=BicScore(data))
        skeleton = searcher.estimate()
        for node in MK_blanket:
            if node != target:
                skeleton.add_edge(node, target)

        # Rebuild the learned graph as a BayesianModel.
        pgm_explanation = BayesianModel()
        for node in skeleton.nodes():
            pgm_explanation.add_node(node)
        for src, dst in skeleton.edges():
            pgm_explanation.add_edge(src, dst)

        # Fit CPDs on generalized data.
        data_ex = data[subnodes].copy()
        data_ex[target] = data[target].apply(self.generalize_target)
        for node in others:
            data_ex[node] = data[node].apply(self.generalize_others)
        pgm_explanation.fit(data_ex)

        return pgm_explanation
def Hybrid(dataset: pd.DataFrame):
    """MMHC hybrid structure learning: MMPC skeleton + hill-climb orientation.

    Re-runs the hill-climb phase with exponentially growing max_iter
    budgets and records the BDeu score after each run.

    Returns the final model's edges and [iteration budgets, scores].
    """
    from pgmpy.estimators import MmhcEstimator
    from pgmpy.estimators import HillClimbSearch
    from pgmpy.estimators import BDeuScore, K2Score, BicScore
    from pgmpy.models import BayesianModel

    # Phase 1: constraint-based skeleton. mmpc takes significance_level
    # (default 0.01), the desired Type 1 error probability of falsely
    # rejecting independence — lower values accept fewer dependencies,
    # yielding a sparser graph.
    mmhc = MmhcEstimator(dataset)
    skeleton = mmhc.mmpc()
    print("Part 1) Skeleton: ", skeleton.edges())

    # Phase 2: BDeu-scored hill climbing restricted to the skeleton.
    hc = HillClimbSearch(dataset, scoring_method=BDeuScore(dataset, equivalent_sample_size=5))
    bdeu = BDeuScore(dataset, equivalent_sample_size=5)

    budgets = [2**i for i in range(20)]
    eval_list = []
    for budget in budgets:
        dag = hc.estimate(tabu_length=10, white_list=skeleton.to_directed().edges(), max_iter=budget)
        model = BayesianModel(dag.edges())
        print(bdeu.score(model))
        eval_list.append(bdeu.score(model))

    print("Part 2) Model:    ", model.edges())
    return model.edges(), [budgets, eval_list]
def learnedStructureModel():
    """Learn a BIC-scored network over writer-comparison feature pairs,
    fit it with a BDeu prior, and evaluate P(h | features) on test data.
    """
    # trainingData, testingData = differenceBetweenFeatures(True)
    trainingInputs, trainingOutputs, testingInputs, testingOutputs = \
        gtd.formSameWriterDiffWriterInputOutputFeaturePairs(5, True)

    # Shared column layout: 18 feature columns plus the hypothesis 'h'
    # (note there is deliberately no 'f10').
    columns = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
               'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19',
               'h']

    trainingData = pd.DataFrame(
        data=np.concatenate((trainingInputs, trainingOutputs), axis=1),
        columns=columns)
    testingData = pd.DataFrame(
        data=np.concatenate((testingInputs, testingOutputs), axis=1),
        columns=columns)

    # BIC-scored hill-climb search with a capped in-degree.
    searcher = HillClimbSearch(trainingData, scoring_method=BicScore(trainingData))
    model = searcher.estimate(max_indegree=20)

    # Explicit state space of every variable, so unseen states still get
    # a CPD column.
    state_names = {
        'f1': [0, 1, 2, 3],
        'f2': [0, 1, 2, 3, 4],
        'f3': [0, 1, 2],
        'f4': [0, 1, 2, 3, 4],
        'f5': [0, 1, 2, 3],
        'f6': [0, 1, 2, 3],
        'f7': [0, 1, 2, 3],
        'f8': [0, 1, 2, 3, 4],
        'f9': [0, 1, 2],
        'f11': [0, 1, 2, 3],
        'f12': [0, 1, 2, 3, 4],
        'f13': [0, 1, 2],
        'f14': [0, 1, 2, 3, 4],
        'f15': [0, 1, 2, 3],
        'f16': [0, 1, 2, 3],
        'f17': [0, 1, 2, 3],
        'f18': [0, 1, 2, 3, 4],
        'f19': [0, 1, 2],
        'h': [0, 1]
    }

    # Fit CPDs with a BDeu prior.
    model.fit(trainingData,
              estimator=BayesianEstimator,
              prior_type='BDeu',
              state_names=state_names)

    print(model.edges())

    # Evidence is every feature column (everything except 'h').
    evidenceNodes = columns[:-1]
    evaluateModel(model, testingData, 'h', evidenceNodes)
Exemple #10
0
def build_structure(data):
    """Learn a DAG adjacency matrix from observational data.

    Runs BIC-scored hill-climb search over a DataFrame view of `data`,
    encodes the learned edges as a 0/1 adjacency matrix (row = source,
    column = destination), saves it to 'dataset/DAG.npy', and returns it.
    """
    frame = pd.DataFrame(data)
    searcher = HillClimbSearch(frame, scoring_method=BicScore(frame))
    learned = searcher.estimate()

    n_vars = data.shape[1]
    DAG = np.zeros((n_vars, n_vars), np.int64)
    for src, dst in learned.edges():
        DAG[src, dst] = 1

    np.save('dataset/DAG.npy', DAG)
    return DAG
Exemple #11
0
def bei_ye_si():
    """Train a Bayesian network on the Titanic data set and print the
    prediction accuracy for 'Survived' on a held-out split.

    Reads whitespace-delimited records from '泰坦尼克号.txt' (Titanic);
    the first 800 rows are used for training, the rest for testing.
    """
    warnings.filterwarnings("ignore")
    # Console banner: "the algorithm now running is Bayesian network".
    print('现在进行的算法是贝叶斯网络')
    f = open('泰坦尼克号.txt')
    dataset = pd.read_table(f, delim_whitespace=True)
    # Train/test split at row 800.
    train = dataset[:800]
    test = dataset[800:]
    # BIC-scored hill-climb structure search.
    hc = HillClimbSearch(train, scoring_method=BicScore(train))
    best_model = hc.estimate()
    best_model.fit(train, estimator=BayesianEstimator,
                   prior_type="BDeu")  # default equivalent_sample_size=5
    predict_data = test.drop(columns=['Survived'], axis=1)
    y_pred = best_model.predict(predict_data)
    print(
        (y_pred['Survived'] == test['Survived']).sum() / len(test))  # test-set accuracy
Exemple #12
0
    def estimate(self,
                 scoring_method=None,
                 tabu_length=10,
                 significance_level=0.01):
        """MMHC-style estimation: learn a skeleton with MMPC, then orient
        its edges via hill-climb search restricted to the skeleton.

        scoring_method defaults to BDeu with equivalent sample size 10.
        Returns the model found by the hill-climb phase.
        """
        if scoring_method is None:
            scoring_method = BDeuScore(self.data, equivalent_sample_size=10)

        # Phase 1: constraint-based skeleton discovery.
        skeleton = self.mmpc(significance_level)

        # Phase 2: score-based search, white-listed to skeleton edges.
        searcher = HillClimbSearch(self.data, scoring_method=scoring_method)
        return searcher.estimate(white_list=skeleton.to_directed().edges(),
                                 tabu_length=tabu_length)
def Hill_Climbing(dataset: pd.DataFrame):
    """Plain hill-climb structure learning scored with BDeu.

    Re-runs the search with exponentially growing max_iter budgets and
    records the BDeu score of the model after each run.

    Returns the final model's edges and [iteration budgets, scores].
    """
    # from pgmpy.estimators import ExhaustiveSearch
    from pgmpy.estimators import HillClimbSearch
    from pgmpy.estimators import BDeuScore, K2Score, BicScore
    from pgmpy.models import BayesianModel

    bdeu = BDeuScore(dataset, equivalent_sample_size=5)

    hc = HillClimbSearch(dataset, scoring_method=BDeuScore(dataset, equivalent_sample_size=5))
    budgets = [2**i for i in range(20)]
    eval_list = []
    for budget in budgets:
        dag = hc.estimate(tabu_length=10, max_iter=budget)
        model = BayesianModel(dag.edges())
        print(bdeu.score(model))
        eval_list.append(bdeu.score(model))

    return model.edges(), [budgets, eval_list]
Exemple #14
0
def scoreStructureLearn(data,
                        search='HillClimbSearch',
                        scoring_method='BicScore'):
    """Score-and-search structure learning.

    Parameters
    ----------
    data : pandas.DataFrame of observations.
    search : str
        'HillClimbSearch' (default) or 'ExhaustiveSearch'.
    scoring_method : str
        'BicScore', 'K2Score' or 'BdeuScore'.

    Returns
    -------
    The best model found by the chosen search strategy.

    Raises
    ------
    ValueError if scoring_method is not one of the supported names.
    """
    if scoring_method == 'BicScore':
        scoring_method_tmp = BicScore(data)
    elif scoring_method == 'K2Score':
        scoring_method_tmp = K2Score(data)
    elif scoring_method == 'BdeuScore':
        scoring_method_tmp = BdeuScore(data, equivalent_sample_size=5)
    else:
        # Previously an unknown name fell through and raised a confusing
        # NameError below; fail fast with a clear message instead.
        raise ValueError("unknown scoring_method: %r" % (scoring_method,))
    if search == 'HillClimbSearch':
        es = HillClimbSearch(data, scoring_method=scoring_method_tmp)
    else:
        # Any other value falls back to exhaustive search, as before.
        es = ExhaustiveSearch(data, scoring_method=scoring_method_tmp)
    best_model = es.estimate()
    return best_model
Exemple #15
0
    def learn_structure(self, method, scoring_method, log=True):
        """(4) Build the structure of the data.

        Parameters
        ----------
        method : str
            The technique used to search for the structure:
            -> scoring_approx     - approximated (hill-climb) search
            -> scoring_exhaustive - exhaustive search with scoring method
            -> constraint         - constraint-based technique
        scoring_method : str
            One of 'K2', 'bic', 'bdeu' (ignored by 'constraint').
        log : bool
            True to print debug information to the console.

        Raises
        ------
        ValueError for an unknown method, or an unknown scoring_method
        when a scoring-based search is requested.
        """
        # Scoring metric for the local search of the structure. An unknown
        # name previously left `scores` unbound and caused a NameError at
        # use; mark it None and fail fast only if it is actually needed,
        # so the 'constraint' path keeps working regardless.
        if scoring_method == "K2":
            scores = K2Score(self.data)
        elif scoring_method == "bic":
            scores = BicScore(self.data)
        elif scoring_method == "bdeu":
            scores = BdeuScore(self.data)
        else:
            scores = None

        # Select the actual search strategy.
        if method == "scoring_approx":
            if scores is None:
                raise ValueError("unknown scoring_method: %r" % (scoring_method,))
            est = HillClimbSearch(self.data, scores)
        elif method == "scoring_exhaustive":
            if scores is None:
                raise ValueError("unknown scoring_method: %r" % (scoring_method,))
            est = ExhaustiveSearch(self.data, scores)
        elif method == "constraint":
            est = ConstraintBasedEstimator(self.data)
        else:
            # Previously an unknown method left `est` unbound (NameError).
            raise ValueError("unknown method: %r" % (method,))

        self.best_model = est.estimate()
        # REMOVE all nodes not connected to anything else.
        self.eliminate_isolated_nodes()

        # Persist the learned edges.
        for edge in self.best_model.edges_iter():
            self.file_writer.write_txt(str(edge))

        self.log("Method used for structural learning: " + method, log)
        self.log("Search terminated", log)
Exemple #16
0
def main():
    """Infer a Dynamic Bayesian Network over gene data: estimate the B_0
    (initial) network and the B_transition network via hill-climb search,
    then fit and print the local probability models (CPDs).

    NOTE(review): Python 2 syntax (print statements). The HillClimbSearch /
    BicScore signatures used here (`genes` positional, `labels`, `bk1`,
    `weight`, `estimate_dynamic`) are not standard pgmpy — this appears to
    target a modified/forked pgmpy; confirm before porting.
    """
    # readData presumably returns the expression table plus a background-
    # knowledge string passed to BicScore as bk1 — TODO confirm.
    data, string = readData()
    genes = np.array(data.columns[1:])
    labels = np.array(data.columns)

    bayesianModel = BayesianModel()
    transitionModel = DBN()

    # Both networks share the same gene nodes.
    bayesianModel.add_nodes_from(genes)
    transitionModel.add_nodes_from(genes)

    # Split raw data into slices for the B_0 and transition networks.
    bData, tData = getData(data, labels)
    
    print "\nDynamic Bayesian Network inference", 
    print "\nB_0 network relations:  "
    
    # Hill-climb search for the initial (B_0) network structure.
    hcb = HillClimbSearch(bData, genes, scoring_method=BicScore(bData, labels, bk1=string, weight=4))
    best_model_b = hcb.estimate(start=bayesianModel, tabu_length=15, max_indegree=2)
    print(best_model_b.edges())

    printOutputB(best_model_b)

    print "\nLocal Probability Model: "
    # Fit CPDs for the B_0 network and print them.
    best_model_b.fit(bData, BayesianEstimator)
    for cpd in best_model_b.get_cpds():
        print(cpd)

    print "\nB_transition network relations: "

    # Hill-climb search for the transition network structure.
    hct = HillClimbSearch(tData, genes, scoring_method=BicScore(tData, labels, bk1=string, weight=4))
    best_model_t = hct.estimate_dynamic(start=transitionModel, tabu_length=15, max_indegree=2)
    print(best_model_t.edges())

    printOutputT(best_model_t)

    print "\nLocal Probability Model: "
    # Fit CPDs for the transition network and print them.
    best_model_t.fit(tData, BayesianEstimator)
    for cpd in best_model_t.get_cpds():
        print(cpd)
Exemple #17
0
# Time: 2020/12/21 15:38
import pandas as pd
import networkx as nx
from matplotlib import pyplot as plt
from pgmpy.models import BayesianModel
from pgmpy.estimators import HillClimbSearch
from pgmpy.estimators import BicScore

# Load the training data and build a BIC scorer for it.
data = pd.read_csv(
    r'C:\Users\haomiaowu\Desktop\BN-Cheminformatics\Train-clear.csv')
bic = BicScore(data)

# Hill-climb structure search. Reuse the `bic` scorer built above — the
# original constructed a second, identical BicScore inline and left `bic`
# unused.
hs = HillClimbSearch(data, scoring_method=bic)
best_model = hs.estimate()
print(best_model.edges())

# Draw the learned DAG.
nx.draw(
    best_model,
    with_labels=True,
    node_size=1000,
    font_weight='bold',
    node_color='y',
)

plt.show()
Exemple #18
0
def main():
	#Fetching features data
	features_data = pd.read_csv(fileloc_features)
	features_data_f = features_data.add_prefix('f')
	features_data_g = features_data.add_prefix('g')
	#Seen Training Data
	seen_traindata = pd.read_csv(fileloc_seen_training, usecols = ['left','right','label'])
	#seen_traindata_f = pd.read_csv(fileloc_seen_training, usecols = ['left','label'])
	#seen_traindata_g = pd.read_csv(fileloc_seen_training, usecols = ['right','label'])
	seen_traindata_merged_f = seen_traindata.merge(features_data_f, left_on = 'left', right_on = 'fimagename')
	seen_traindata_merged_g = seen_traindata.merge(features_data_g, left_on = 'right', right_on = 'gimagename')
	seen_traindata_merged_f = seen_traindata_merged_f.drop(['left', 'right','fimagename','label'], axis = 1)
	seen_traindata_merged_g = seen_traindata_merged_g.drop(['left', 'right','gimagename','label'], axis = 1)
	seen_features_traindata_final = pd.concat([seen_traindata_merged_f, seen_traindata_merged_g], axis = 1)
	seen_label_traindata_final = seen_traindata.loc[:, 'label']
	seen_traindata_final = pd.concat([seen_features_traindata_final, seen_label_traindata_final], axis = 1)
	seen_traindata_final.replace([np.inf, -np.inf], np.nan)
	seen_traindata_final.dropna(inplace=True)  
	seen_traindata_final = seen_traindata_final.astype(int)
	seen_traindata_final_NDArray = seen_traindata_final.values 
	#Seen Validation Data
	seen_validationdata = pd.read_csv(fileloc_seen_validation, usecols = ['left','right','label'])
	#seen_validationdata_f = pd.read_csv(fileloc_seen_validation, usecols = ['left','label'])
	#seen_validationdata_g = pd.read_csv(fileloc_seen_validation, usecols = ['right','label'])
	seen_validationdata_merged_f = seen_validationdata.merge(features_data_f, left_on = 'left', right_on = 'fimagename')
	seen_validationdata_merged_g = seen_validationdata.merge(features_data_g, left_on = 'right', right_on = 'gimagename')
	seen_validationdata_merged_f = seen_validationdata_merged_f.drop(['left', 'right','fimagename','label'], axis = 1)
	seen_validationdata_merged_g = seen_validationdata_merged_g.drop(['left', 'right','gimagename','label'], axis = 1)
	seen_features_validationdata_final = pd.concat([seen_validationdata_merged_f, seen_validationdata_merged_g], axis = 1)
	seen_label_validationdata_final = seen_validationdata.loc[:, 'label']
	seen_validationdata_final = pd.concat([seen_features_validationdata_final, seen_label_validationdata_final], axis = 1)
	seen_validationdata_final.replace([np.inf, -np.inf], np.nan)
	seen_validationdata_final.dropna(inplace=True)
	seen_validationdata_final = seen_validationdata_final.astype(int)
	seen_validationdata_final_NDArray = seen_validationdata_final.values
	#Shuffled Training Data
	shuffled_traindata = pd.read_csv(fileloc_shuffled_training, usecols = ['left','right','label'])
	#shuffled_traindata_f = pd.read_csv(fileloc_shuffled_training, usecols = ['left','label'])
	#shuffled_traindata_g = pd.read_csv(fileloc_shuffled_training, usecols = ['right','label'])
	shuffled_traindata_merged_f = shuffled_traindata.merge(features_data_f, left_on = 'left', right_on = 'fimagename')
	shuffled_traindata_merged_g = shuffled_traindata.merge(features_data_g, left_on = 'right', right_on = 'gimagename')
	shuffled_traindata_merged_f = shuffled_traindata_merged_f.drop(['left', 'right','fimagename','label'], axis = 1)
	shuffled_traindata_merged_g = shuffled_traindata_merged_g.drop(['left', 'right','gimagename','label'], axis = 1)
	shuffled_features_traindata_final = pd.concat([shuffled_traindata_merged_f, shuffled_traindata_merged_g], axis = 1)
	shuffled_label_traindata_final = shuffled_traindata.loc[:, 'label']
	shuffled_traindata_final = pd.concat([shuffled_features_traindata_final, shuffled_label_traindata_final], axis = 1)
	shuffled_traindata_final.replace([np.inf, -np.inf], np.nan)
	shuffled_traindata_final.dropna(inplace=True)
	shuffled_traindata_final = shuffled_traindata_final.astype(int)
	shuffled_traindata_final_NDArray = shuffled_traindata_final.values
	#Shuffled Validation Data
	shuffled_validationdata = pd.read_csv(fileloc_shuffled_validation, usecols = ['left','right','label'])
	#shuffled_validationdata_f = pd.read_csv(fileloc_shuffled_validation, usecols = ['left','label'])
	#shuffled_validationdata_g = pd.read_csv(fileloc_shuffled_validation, usecols = ['right','label'])
	shuffled_validationdata_merged_f = shuffled_validationdata.merge(features_data_f, left_on = 'left', right_on = 'fimagename')
	shuffled_validationdata_merged_g = shuffled_validationdata.merge(features_data_g, left_on = 'right', right_on = 'gimagename')
	shuffled_validationdata_merged_f = shuffled_validationdata_merged_f.drop(['left', 'right','fimagename','label'], axis = 1)
	shuffled_validationdata_merged_g = shuffled_validationdata_merged_g.drop(['left', 'right','gimagename','label'], axis = 1)
	shuffled_features_validationdata_final = pd.concat([shuffled_validationdata_merged_f, shuffled_validationdata_merged_g], axis = 1)
	shuffled_label_validationdata_final = shuffled_validationdata.loc[:, 'label']
	shuffled_validationdata_final = pd.concat([shuffled_features_validationdata_final, shuffled_label_validationdata_final], axis = 1)
	shuffled_validationdata_final.replace([np.inf, -np.inf], np.nan)
	shuffled_validationdata_final.dropna(inplace=True)
	shuffled_validationdata_final = shuffled_validationdata_final.astype(int)
	shuffled_validationdata_final_NDArray = shuffled_validationdata_final.values
	#Unseen Training Data
	unseen_traindata = pd.read_csv(fileloc_unseen_training, usecols = ['left','right','label'])
	#unseen_traindata_f = pd.read_csv(fileloc_unseen_training, usecols = ['left','label'])
	#unseen_traindata_g = pd.read_csv(fileloc_unseen_training, usecols = ['right','label'])
	unseen_traindata_merged_f = unseen_traindata.merge(features_data_f, left_on = 'left', right_on = 'fimagename')
	unseen_traindata_merged_g = unseen_traindata.merge(features_data_g, left_on = 'right', right_on = 'gimagename')
	unseen_traindata_merged_f = unseen_traindata_merged_f.drop(['left', 'right','fimagename','label'], axis = 1)
	unseen_traindata_merged_g = unseen_traindata_merged_g.drop(['left', 'right','gimagename','label'], axis = 1)
	unseen_features_traindata_final = pd.concat([unseen_traindata_merged_f, unseen_traindata_merged_g], axis = 1)
	unseen_label_traindata_final = unseen_traindata.loc[:, 'label']
	unseen_traindata_final = pd.concat([unseen_features_traindata_final, unseen_label_traindata_final], axis = 1)
	unseen_traindata_final.replace([np.inf, -np.inf], np.nan)
	unseen_traindata_final.dropna(inplace=True)
	unseen_traindata_final = unseen_traindata_final.astype(int)
	unseen_traindata_final_NDArray = unseen_traindata_final.values
	#Unseen Validation Data
	unseen_validationdata = pd.read_csv(fileloc_unseen_validation, usecols = ['left','right','label'])
	#unseen_validationdata_f = pd.read_csv(fileloc_unseen_validation, usecols = ['left','label'])
	#unseen_validationdata_g = pd.read_csv(fileloc_unseen_validation, usecols = ['right','label'])
	unseen_validationdata_merged_f = unseen_validationdata.merge(features_data_f, left_on = 'left', right_on = 'fimagename')
	unseen_validationdata_merged_g = unseen_validationdata.merge(features_data_g, left_on = 'right', right_on = 'gimagename')
	unseen_validationdata_merged_f = unseen_validationdata_merged_f.drop(['left', 'right','fimagename','label'], axis = 1)
	unseen_validationdata_merged_g = unseen_validationdata_merged_g.drop(['left', 'right','gimagename','label'], axis = 1)
	unseen_features_validationdata_final = pd.concat([unseen_validationdata_merged_f, unseen_validationdata_merged_g], axis = 1)
	unseen_label_validationdata_final = unseen_validationdata.loc[:, 'label']
	unseen_validationdata_final = pd.concat([unseen_features_validationdata_final, unseen_label_validationdata_final], axis = 1)
	unseen_validationdata_final.replace([np.inf, -np.inf], np.nan)
	unseen_validationdata_final.dropna(inplace=True)
	unseen_validationdata_final = unseen_validationdata_final.astype(int)
	unseen_validationdata_final_NDArray = unseen_validationdata_final.values
	#Creating base models
	featureNamesList = ["pen_pressure","letter_spacing","size","dimension","is_lowercase","is_continuous","slantness","tilt","entry_stroke_a", "staff_of_a","formation_n","staff_of_d","exit_stroke_d","word_formation","constancy"]
	features_only_data = features_data[featureNamesList]
	initial_hcs = HillClimbSearch(features_only_data)
	initial_model = initial_hcs.estimate()
	#print(initial_model.edges())
	print("Hill Climb Done")
	basemodel = BayesianModel([('fpen_pressure', 'fis_lowercase'), ('fpen_pressure', 'fletter_spacing'), ('fsize', 'fslantness'), ('fsize', 'fpen_pressure'), 
								('fsize', 'fstaff_of_d'), ('fsize', 'fletter_spacing'), ('fsize', 'fexit_stroke_d'), ('fsize', 'fentry_stroke_a'), 
								('fdimension', 'fsize'), ('fdimension', 'fis_continuous'), ('fdimension', 'fslantness'), ('fdimension', 'fpen_pressure'), 
								('fis_lowercase', 'fstaff_of_a'), ('fis_lowercase', 'fexit_stroke_d'), ('fis_continuous', 'fexit_stroke_d'), ('fis_continuous', 'fletter_spacing'), 
								('fis_continuous', 'fentry_stroke_a'), ('fis_continuous', 'fstaff_of_a'), ('fis_continuous', 'fis_lowercase'), ('fslantness', 'fis_continuous'), 
								('fslantness', 'ftilt'), ('fentry_stroke_a', 'fpen_pressure'), ('fformation_n', 'fconstancy'), ('fformation_n', 'fword_formation'), ('fformation_n', 'fdimension'), 
								('fformation_n', 'fstaff_of_d'), ('fformation_n', 'fis_continuous'), ('fformation_n', 'fsize'), ('fformation_n', 'fstaff_of_a'), ('fstaff_of_d', 'fis_continuous'), 
								('fstaff_of_d', 'fexit_stroke_d'), ('fstaff_of_d', 'fis_lowercase'), ('fstaff_of_d', 'fslantness'), ('fstaff_of_d', 'fentry_stroke_a'), 
								('fword_formation', 'fdimension'), ('fword_formation', 'fstaff_of_a'), ('fword_formation', 'fsize'), ('fword_formation', 'fstaff_of_d'), 
								('fword_formation', 'fconstancy'), ('fconstancy', 'fstaff_of_a'), ('fconstancy', 'fletter_spacing'), ('fconstancy', 'fdimension'), 
								('gpen_pressure', 'gis_lowercase'), ('gpen_pressure', 'gletter_spacing'), ('gsize', 'gslantness'), ('gsize', 'gpen_pressure'), 
								('gsize', 'gstaff_of_d'), ('gsize', 'gletter_spacing'), ('gsize', 'gexit_stroke_d'), ('gsize', 'gentry_stroke_a'), ('gdimension', 'gsize'), 
								('gdimension', 'gis_continuous'), ('gdimension', 'gslantness'), ('gdimension', 'gpen_pressure'), ('gis_lowercase', 'gstaff_of_a'), 
								('gis_lowercase', 'gexit_stroke_d'), ('gis_continuous', 'gexit_stroke_d'), ('gis_continuous', 'gletter_spacing'), ('gis_continuous', 'gentry_stroke_a'), 
								('gis_continuous', 'gstaff_of_a'), ('gis_continuous', 'gis_lowercase'), ('gslantness', 'gis_continuous'), ('gslantness', 'gtilt'), 
								('gentry_stroke_a', 'gpen_pressure'), ('gformation_n', 'gconstancy'), ('gformation_n', 'gword_formation'), ('gformation_n', 'gdimension'), 
								('gformation_n', 'gstaff_of_d'), ('gformation_n', 'gis_continuous'), ('gformation_n', 'gsize'), ('gformation_n', 'gstaff_of_a'), ('gstaff_of_d', 'gis_continuous'), 
								('gstaff_of_d', 'gexit_stroke_d'), ('gstaff_of_d', 'gis_lowercase'), ('gstaff_of_d', 'gslantness'), ('gstaff_of_d', 'gentry_stroke_a'), 
								('gword_formation', 'gdimension'), ('gword_formation', 'gstaff_of_a'), ('gword_formation', 'gsize'), ('gword_formation', 'gstaff_of_d'), 
								('gword_formation', 'gconstancy'), ('gconstancy', 'gstaff_of_a'), ('gconstancy', 'gletter_spacing'), ('gconstancy', 'gdimension'),
								('fis_continuous', 'label'), ('fword_formation','label'),
								('gis_continuous', 'label'), ('gword_formation','label')])
	model_seen = basemodel.copy()
	model_shuffled = basemodel.copy()
	model_unseen = basemodel.copy()
	accuracies = {}
	#Training Seen Model
	model_seen.fit(seen_traindata_final)
	estimator_seen = BayesianEstimator(model_seen, seen_traindata_final)
	cpds=[]
	for featureName in featureNamesList :
		cpd = estimator_seen.estimate_cpd('f'+featureName)
		cpds.append(cpd)
		cpd = estimator_seen.estimate_cpd('g'+featureName)
		cpds.append(cpd)
	cpd = estimator_seen.estimate_cpd('label')
	cpds.append(cpd)
	model_seen.add_cpds(*cpds)
	print("CPDs Calculated")
	#Testing Seen Model - Training
	model_seen_ve = VariableElimination(model_seen)
	model_seen_traindata_predictions = []
	for i in range(seen_traindata_final_NDArray.shape[0]):
		evidenceDic = {}
		for index, featureName in enumerate(featureNamesList): 
			evidenceDic['f'+featureName]=(seen_traindata_final_NDArray[i,index]-1)
			evidenceDic['g'+featureName]=(seen_traindata_final_NDArray[i+15,index]-1)
		temp = model_seen_ve.map_query(variables=['label'],evidence=evidenceDic)
		model_seen_traindata_predictions.append(temp['label'])
	correctCnt = 0
	for i in range(len(model_seen_traindata_predictions)):
	    if(int(model_seen_traindata_predictions[i]) == int(seen_traindata_final_NDArray[i,30])):
	        correctCnt+=1
	accuracies["seen_train"]=correctCnt/len(model_seen_traindata_predictions)*100
	print("Bayesian Model Accuracy for Seen Training Data = "+str(accuracies["seen_train"]))
	#Testing Seen Model - Validation
	model_seen_ve = VariableElimination(model_seen)
	model_seen_validationdata_predictions = []
	for i in range(seen_validationdata_final_NDArray.shape[0]):
		evidenceDic = {}
		for index, featureName in enumerate(featureNamesList): 
			evidenceDic['f'+featureName]=seen_validationdata_final_NDArray[i,index]-1
			evidenceDic['g'+featureName]=seen_validationdata_final_NDArray[i+15,index]-1
		temp = model_seen_ve.map_query(variables=['label'],evidence=evidenceDic)
		model_seen_validationdata_predictions.append(temp['label'])
	correctCnt = 0
	for i in range(len(model_seen_validationdata_predictions)):
	    if(int(model_seen_validationdata_predictions[i]) == int(seen_validationdata_final_NDArray[i,30])):
	        correctCnt+=1
	accuracies["seen_validation"]=correctCnt/len(model_seen_validationdata_predictions)*100
	print("Bayesian Model Accuracy for Seen Validation Data = "+str(accuracies["seen_validation"]))
	#Training Shuffled Model
	model_shuffled.fit(shuffled_traindata_final)
	estimator_shuffled = BayesianEstimator(model_shuffled, shuffled_traindata_final)
	cpds=[]
	for featureName in featureNamesList :
		cpd = estimator_shuffled.estimate_cpd('f'+featureName)
		cpds.append(cpd)
		cpd = estimator_shuffled.estimate_cpd('g'+featureName)
		cpds.append(cpd)
	cpd = estimator_shuffled.estimate_cpd('label')
	cpds.append(cpd)
	model_shuffled.add_cpds(*cpds)
	#Testing Shuffled Model - Training
	model_shuffled_ve = VariableElimination(model_shuffled)
	model_shuffled_traindata_predictions = []
	for i in range(shuffled_traindata_final_NDArray.shape[0]):
		evidenceDic = {}
		for index, featureName in enumerate(featureNamesList): 
			evidenceDic['f'+featureName]=shuffled_traindata_final_NDArray[i,index]-1
			evidenceDic['g'+featureName]=shuffled_traindata_final_NDArray[i+15,index]-1
		temp = model_shuffled_ve.map_query(variables=['label'],evidence=evidenceDic)
		model_shuffled_traindata_predictions.append(temp['label'])
	correctCnt = 0
	for i in range(len(model_shuffled_traindata_predictions)):
	    if(int(model_shuffled_traindata_predictions[i]) == int(shuffled_traindata_final_NDArray[i,30])):
	        correctCnt+=1
	accuracies["shuffled_train"]=correctCnt/len(model_shuffled_traindata_predictions)*100
	print("Bayesian Model Accuracy for Shuffled Training Data = "+str(accuracies["shuffled_train"]))
	#Testing Shuffled Model - Validation
	model_shuffled_ve = VariableElimination(model_shuffled)
	model_shuffled_validationdata_predictions = []
	for i in range(shuffled_validationdata_final_NDArray.shape[0]):
		evidenceDic = {}
		for index, featureName in enumerate(featureNamesList): 
			evidenceDic['f'+featureName]=shuffled_validationdata_final_NDArray[i,index]-1
			evidenceDic['g'+featureName]=shuffled_validationdata_final_NDArray[i+15,index]-1
	temp = model_shuffled_ve.map_query(variables=['label'],evidence=evidenceDic)
	model_shuffled_validationdata_predictions.append(temp['label'])
	correctCnt = 0
	for i in range(len(model_shuffled_validationdata_predictions)):
	    if(int(model_shuffled_validationdata_predictions[i]) == int(shuffled_validationdata_final_NDArray[i,30])):
	        correctCnt+=1
	accuracies["shuffled_validation"]=correctCnt/len(model_shuffled_validationdata_predictions)*100
	print("Bayesian Model Accuracy for Shuffled Validation Data = "+str(accuracies["shuffled_validation"]))
	#Training Unseen Model
	model_unseen.fit(unseen_traindata_final)
	estimator_unseen = BayesianEstimator(model_unseen, unseen_traindata_final)
	cpds=[]
	for featureName in featureNamesList :
		cpd = estimator_unseen.estimate_cpd('f'+featureName)
		cpds.append(cpd)
		cpd = estimator_unseen.estimate_cpd('g'+featureName)
		cpds.append(cpd)
	cpd = estimator_unseen.estimate_cpd('label')
	cpds.append(cpd)
	model_unseen.add_cpds(*cpds)
	#Testing Unseen Model - Training
	model_unseen_ve = VariableElimination(model_unseen)
	model_unseen_traindata_predictions = []
	for i in range(unseen_traindata_final_NDArray.shape[0]):
		evidenceDic = {}
		for index, featureName in enumerate(featureNamesList): 
			evidenceDic['f'+featureName]=unseen_traindata_final_NDArray[i,index]-1
			evidenceDic['g'+featureName]=unseen_traindata_final_NDArray[i+15,index]-1
		temp = model_unseen_ve.map_query(variables=['label'],evidence=evidenceDic)
		model_unseen_traindata_predictions.append(temp['label'])
	correctCnt = 0
	for i in range(len(model_unseen_traindata_predictions)):
	    if(int(model_unseen_traindata_predictions[i]) == int(unseen_traindata_final_NDArray[i,30])):
	        correctCnt+=1
	accuracies["unseen_train"]=correctCnt/len(model_unseen_traindata_predictions)*100
	print("Bayesian Model Accuracy for Unseen Training Data = "+str(accuracies["unseen_train"]))
	#Testing Unseen Model - Validation
	model_unseen_ve = VariableElimination(model_unseen)
	model_unseen_validationdata_predictions = []
	for i in range(unseen_validationdata_final_NDArray.shape[0]):
		evidenceDic = {}
		for index, featureName in enumerate(featureNamesList): 
			evidenceDic['f'+featureName]=unseen_validationdata_final_NDArray[i,index]-1
			evidenceDic['g'+featureName]=unseen_validationdata_final_NDArray[i+15,index]-1
	temp = model_unseen_ve.map_query(variables=['label'],evidence=evidenceDic)
	model_unseen_validationdata_predictions.append(temp['label'])
	correctCnt = 0
	for i in range(len(model_unseen_validationdata_predictions)):
	    if(int(model_unseen_validationdata_predictions[i]) == int(unseen_validationdata_final_NDArray[i,30])):
	        correctCnt+=1
	accuracies["unseen_validation"]=correctCnt/len(model_unseen_validationdata_predictions)*100
	print("Bayesian Model Accuracy for Unseen Validation Data = "+str(accuracies["unseen_validation"]))
class TestBaseEstimator(unittest.TestCase):
    """Unit tests for pgmpy's HillClimbSearch structure estimator.

    Exercises the private `_legal_operations` generator (candidate
    add/remove/flip moves with their score deltas) and the public
    `estimate` entry point, on synthetic data and the Titanic dataset.
    """

    def setUp(self):
        """Build fixtures: random data where column C duplicates B, plus Titanic data."""
        # 5000 rows of ints in [0, 5); C copies B exactly, so structure
        # learning should connect B and C (in either direction).
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = HillClimbSearch(self.rand_data, scoring_method=K2Score(self.rand_data))
        # model1: three disconnected nodes; model2: same plus edge A->B.
        self.model1 = BayesianModel()
        self.model1.add_nodes_from(['A', 'B', 'C'])
        self.model2 = self.model1.copy()
        self.model2.add_edge('A', 'B')

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv('pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data1 = self.titanic_data[["Survived", "Sex", "Pclass", "Age", "Embarked"]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)

    def test_legal_operations(self):
        """All expected add/remove/flip moves on model2 are proposed."""
        model2_legal_ops = list(self.est_rand._legal_operations(self.model2))
        # Reference (operation, score-delta) pairs.  Only the operation set
        # is asserted below — the data is random, so the deltas vary per run.
        model2_legal_ops_ref = [(('+', ('C', 'A')), -28.15602208305154),
                                (('+', ('A', 'C')), -28.155467430966382),
                                (('+', ('C', 'B')), 7636.947544933631),
                                (('+', ('B', 'C')), 7937.805375579936),
                                (('-', ('A', 'B')), 28.155467430966382),
                                (('flip', ('A', 'B')), -0.0005546520851567038)]
        self.assertSetEqual(set([op for op, score in model2_legal_ops]),
                            set([op for op, score in model2_legal_ops_ref]))

    def test_legal_operations_titanic(self):
        """Move counts honour tabu lists and max_indegree constraints."""
        est = self.est_titanic1
        start_model = BayesianModel([("Survived", "Sex"),
                                     ("Pclass", "Age"),
                                     ("Pclass", "Embarked")])

        legal_ops = est._legal_operations(start_model)
        self.assertEqual(len(list(legal_ops)), 20)

        # Tabu'ed operations must be excluded from the proposals.
        tabu_list = [('-', ("Survived", "Sex")),
                     ('-', ("Survived", "Pclass")),
                     ('flip', ("Age", "Pclass"))]
        legal_ops_tabu = est._legal_operations(start_model, tabu_list=tabu_list)
        self.assertEqual(len(list(legal_ops_tabu)), 18)

        # max_indegree=1 drops any move that would give a node two parents.
        legal_ops_indegree = est._legal_operations(start_model, max_indegree=1)
        self.assertEqual(len(list(legal_ops_indegree)), 11)

        # Both constraints combined: exact (move, delta) pairs are checked.
        legal_ops_both = est._legal_operations(start_model, tabu_list=tabu_list, max_indegree=1)
        legal_ops_both_ref = [(('+', ('Embarked', 'Survived')), 10.050632580087608),
                              (('+', ('Survived', 'Pclass')), 41.88868046549101),
                              (('+', ('Age', 'Survived')), -23.635716036430836),
                              (('+', ('Pclass', 'Survived')), 41.81314459373226),
                              (('+', ('Sex', 'Pclass')), 4.772261678792802),
                              (('-', ('Pclass', 'Age')), 11.546515590731815),
                              (('-', ('Pclass', 'Embarked')), -32.171482832532774),
                              (('flip', ('Pclass', 'Embarked')), 3.3563814191281836),
                              (('flip', ('Survived', 'Sex')), 0.039737027979640516)]
        self.assertSetEqual(set(legal_ops_both), set(legal_ops_both_ref))

    def test_estimate_rand(self):
        """estimate() recovers the B-C dependency regardless of start model."""
        est1 = self.est_rand.estimate()
        self.assertSetEqual(set(est1.nodes()), set(['A', 'B', 'C']))
        # NOTE(review): `edges() == [...]` assumes edges() returns a list
        # (older pgmpy); newer releases return an edge view — confirm version.
        self.assertTrue(est1.edges() == [('B', 'C')] or est1.edges() == [('C', 'B')])

        est2 = self.est_rand.estimate(start=BayesianModel([('A', 'B'), ('A', 'C')]))
        self.assertTrue(est2.edges() == [('B', 'C')] or est2.edges() == [('C', 'B')])

    def test_estimate_titanic(self):
        """estimate() finds the expected 3-edge structure on the Titanic subset."""
        self.assertSetEqual(set(self.est_titanic2.estimate().edges()),
                            set([('Survived', 'Pclass'), ('Sex', 'Pclass'), ('Sex', 'Survived')]))

    def tearDown(self):
        """Release fixtures.  NOTE(review): self.model2 is never deleted here."""
        del self.rand_data
        del self.est_rand
        del self.model1
        del self.titanic_data
        del self.titanic_data1
        del self.titanic_data2
        del self.est_titanic1
        del self.est_titanic2
# Wrap the raw records in a DataFrame for the pgmpy estimators below.
data2 = pd.DataFrame(data=raw_data2)

import time

# Time an exhaustive enumeration of all structures (feasible only for
# very small variable counts).
t0 = time.time()
# Uncomment below to perform exhaustive search
searcher = ExhaustiveSearch(data2, scoring_method=K2Score(data2))
search = searcher.all_scores()
print('time:', time.time() - t0)

# Uncomment for printout:
#for score, model in search:
#    print("{0}        {1}".format(score, model.edges()))

separator()

# Hill-climb structure search under K2 and BIC scores.
# NOTE(review): this searcher runs on data2 but scores with K2Score(data)
# — every other pairing below matches searcher data and scorer data, so
# this looks like a copy/paste slip; confirm which dataset was intended.
hcs = HillClimbSearch(data2, scoring_method=K2Score(data))
model = hcs.estimate()

hcs2 = HillClimbSearch(data2, scoring_method=K2Score(data2))
model2 = hcs2.estimate()

hcs_bic = HillClimbSearch(data, scoring_method=BicScore(data))
model_bic = hcs_bic.estimate()

hcs_bic2 = HillClimbSearch(data2, scoring_method=BicScore(data2))
model_bic2 = hcs_bic2.estimate()

# End of Task 6
Exemple #21
0
def task4():
	"""Build and score five candidate Bayesian networks for the AND data.

	Model 1 is learned automatically with hill-climb search under a K2
	score; models 2-5 are hand-edited structures.  Every model gets
	K2-prior CPDs, is appended to the module-level ``task4_bms`` list, and
	is scored with a K2 score computed on 1000 forward samples drawn from
	the fitted model itself.  The best-scoring model is stored in the
	global ``task4_best_bm``.
	"""
	global andRawData, task4_best_bm
	featureNames = ['f1','f2','f3','f4','f5','f6','f7','f8','f9']
	k2Scores = []
	andRawData_temp = pd.DataFrame(andRawData.values, columns=featureNames)

	def _fit_score_report(model_temp, num, structure_desc):
		# One shared implementation of the fit/score/report sequence that the
		# original repeated inline five times.  Output strings are identical
		# to the original's, byte for byte.
		estimator = BayesianEstimator(model_temp, andRawData_temp)
		for fx in featureNames:
			cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
			model_temp.add_cpds(cpd_fx)
		task4_bms.append(model_temp)
		print("\tModel " + str(num) + ": " + structure_desc + " is : " + str(model_temp.edges()))
		# Score on synthetic data sampled from the fitted model itself.
		k2Score = K2Score((BayesianModelSampling(model_temp)).forward_sample(size=1000))
		k2Scores_temp = k2Score.score(model_temp)
		k2Scores.append(k2Scores_temp)
		print("\tModel " + str(num) + ": K2 Accuracy Score is " + str(k2Scores_temp))

	# Model 1: structure found by hill-climb search under a K2 score.
	est = HillClimbSearch(andRawData_temp, scoring_method=K2Score(andRawData_temp))
	_fit_score_report(est.estimate(), 1, "Model through HillClimbSearch")
	# Models 2-4: manual variations of the hill-climb structure.
	_fit_score_report(BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'), ('f1', 'f7'), ('f5', 'f3'), ('f9', 'f8'), ('f1', 'f6'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f2')]), 2, "Manual Model based on HillClimbSearch")
	_fit_score_report(BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'), ('f5', 'f7'), ('f5', 'f3'), ('f9', 'f8'), ('f1', 'f2'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f2')]), 3, "Manual Model based on HillClimbSearch")
	_fit_score_report(BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f5', 'f7'), ('f5', 'f3'), ('f1', 'f2'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f8')]), 4, "Manual Model based on HillClimbSearch")
	# Model 5: built from intuition.  The original report line still labels
	# it "based on HillClimbSearch"; kept verbatim for output compatibility.
	_fit_score_report(BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f4', 'f7'), ('f1', 'f2'), ('f8', 'f5'), ('f9', 'f6'), ('f9', 'f8')]), 5, "Manual Model based on HillClimbSearch")

	# Pick the highest-scoring model; index computed once (original computed
	# `k2Scores.index(max(k2Scores))` twice).
	best_index = k2Scores.index(max(k2Scores))
	task4_best_bm = task4_bms[best_index]
	print("\tBest Bayesian Model with the highest accuracy score is thus Model " + str(1 + best_index))
class TestBaseEstimator(unittest.TestCase):
    """Unit tests for pgmpy's HillClimbSearch structure estimator.

    NOTE(review): this file contains two copies of TestBaseEstimator; at
    import time this later definition shadows the earlier one.
    """

    def setUp(self):
        """Build fixtures: random data where column C duplicates B, plus Titanic data."""
        # 5000 rows of ints in [0, 5); C copies B exactly, so structure
        # learning should connect B and C (in either direction).
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                      columns=list("AB"))
        self.rand_data["C"] = self.rand_data["B"]
        self.est_rand = HillClimbSearch(self.rand_data,
                                        scoring_method=K2Score(self.rand_data))
        # model1: three disconnected nodes; model2: same plus edge A->B.
        self.model1 = BayesianModel()
        self.model1.add_nodes_from(["A", "B", "C"])
        self.model2 = self.model1.copy()
        self.model2.add_edge("A", "B")

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            "pgmpy/tests/test_estimators/testdata/titanic_train.csv")
        self.titanic_data1 = self.titanic_data[[
            "Survived", "Sex", "Pclass", "Age", "Embarked"
        ]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)

    def test_legal_operations(self):
        """All expected add/remove/flip moves on model2 are proposed."""
        model2_legal_ops = list(self.est_rand._legal_operations(self.model2))
        # Reference (operation, score-delta) pairs.  Only the operation set
        # is asserted below — the data is random, so the deltas vary per run.
        model2_legal_ops_ref = [
            (("+", ("C", "A")), -28.15602208305154),
            (("+", ("A", "C")), -28.155467430966382),
            (("+", ("C", "B")), 7636.947544933631),
            (("+", ("B", "C")), 7937.805375579936),
            (("-", ("A", "B")), 28.155467430966382),
            (("flip", ("A", "B")), -0.0005546520851567038),
        ]
        self.assertSetEqual(
            set([op for op, score in model2_legal_ops]),
            set([op for op, score in model2_legal_ops_ref]),
        )

    def test_legal_operations_titanic(self):
        """Move counts honour tabu lists and max_indegree constraints."""
        est = self.est_titanic1
        start_model = BayesianModel([("Survived", "Sex"), ("Pclass", "Age"),
                                     ("Pclass", "Embarked")])

        legal_ops = est._legal_operations(start_model)
        self.assertEqual(len(list(legal_ops)), 20)

        # Tabu'ed operations must be excluded from the proposals.
        tabu_list = [
            ("-", ("Survived", "Sex")),
            ("-", ("Survived", "Pclass")),
            ("flip", ("Age", "Pclass")),
        ]
        legal_ops_tabu = est._legal_operations(start_model,
                                               tabu_list=tabu_list)
        self.assertEqual(len(list(legal_ops_tabu)), 18)

        # max_indegree=1 drops any move that would give a node two parents.
        legal_ops_indegree = est._legal_operations(start_model, max_indegree=1)
        self.assertEqual(len(list(legal_ops_indegree)), 11)

        # Both constraints combined: exact (move, delta) pairs are checked.
        legal_ops_both = est._legal_operations(start_model,
                                               tabu_list=tabu_list,
                                               max_indegree=1)
        legal_ops_both_ref = [
            (("+", ("Embarked", "Survived")), 10.050632580087608),
            (("+", ("Survived", "Pclass")), 41.88868046549101),
            (("+", ("Age", "Survived")), -23.635716036430836),
            (("+", ("Pclass", "Survived")), 41.81314459373226),
            (("+", ("Sex", "Pclass")), 4.772261678792802),
            (("-", ("Pclass", "Age")), 11.546515590731815),
            (("-", ("Pclass", "Embarked")), -32.171482832532774),
            (("flip", ("Pclass", "Embarked")), 3.3563814191281836),
            (("flip", ("Survived", "Sex")), 0.039737027979640516),
        ]
        self.assertSetEqual(set(legal_ops_both), set(legal_ops_both_ref))

    def test_estimate_rand(self):
        """estimate() recovers the B-C dependency regardless of start model."""
        est1 = self.est_rand.estimate()
        self.assertSetEqual(set(est1.nodes()), set(["A", "B", "C"]))
        # edges() is wrapped in list() here, so this copy works with both
        # list-returning and view-returning pgmpy versions.
        self.assertTrue(
            list(est1.edges()) == [("B", "C")]
            or list(est1.edges()) == [("C", "B")])

        est2 = self.est_rand.estimate(start=BayesianModel([("A",
                                                            "B"), ("A", "C")]))
        self.assertTrue(
            list(est2.edges()) == [("B", "C")]
            or list(est2.edges()) == [("C", "B")])

    def test_estimate_titanic(self):
        """estimate() finds the expected 3-edge structure on the Titanic subset."""
        self.assertSetEqual(
            set(self.est_titanic2.estimate().edges()),
            set([("Survived", "Pclass"), ("Sex", "Pclass"),
                 ("Sex", "Survived")]),
        )

    def tearDown(self):
        """Release fixtures.  NOTE(review): self.model2 is never deleted here."""
        del self.rand_data
        del self.est_rand
        del self.model1
        del self.titanic_data
        del self.titanic_data1
        del self.titanic_data2
        del self.est_titanic1
        del self.est_titanic2
Exemple #23
0
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import KBinsDiscretizer

# Load the auto-mpg dataset; the commented lines generate random data instead.
data = pd.read_csv("data/data_auto_mpg.csv")
# data = pd.DataFrame(np.random.randn(500, 5), columns=list('ABCDE'))
# data['F'] = data['A'] * data['B']

# Score-based structure search needs discrete variables: truncate every float
# column to int64.  A KBinsDiscretizer variant was tried and left commented.
for col in data.columns:
    if (data[col].dtype == np.float64 or data[col].dtype == np.float32):
        # bin_size = np.unique(data[col].values).shape[0]
        # kbins = KBinsDiscretizer(n_bins=bin_size, encode='ordinal', strategy='uniform').fit(data[col].values.reshape(-1,1))
        # data[col] = kbins.transform(data[col].values.reshape(-1,1)).astype(np.int64)
        data[col] = data[col].astype(np.int64)

# Keep only the first 10 columns to bound the search space.
data = data.iloc[:, :10]

print(data.dtypes)
print(data)

# NOTE(review): the "aq" prints below are leftover debugging progress markers.
print("aq")
est = HillClimbSearch(data, scoring_method=K2Score(data))
print("aq")
model = est.estimate(max_indegree=5)
print("aq")

print(model.edges)

# Render the learned DAG.
plt.figure()
nx.draw_networkx(model)
plt.show()
def main():
    """AND-dataset pipeline: learn a 9-node structure by hill climb, mirror
    it into a second copy, join both halves through a shared 'same' node,
    fit CPDs by maximum likelihood, and report inference accuracy on the
    positive and negative training pairs."""

    andPGM = PGM_t()
    print('loading features..')
    train_set, test_set = andPGM.load_features()
    print('loading features.. Done')
    # Bayesian network of 19 nodes, 9*2 variables of network given
    # Initial incomplete Bayesian model connected manually based on intuition
    print('Generating model.. ')
    initialModel = BayesianModel({})
    initialModel.add_nodes_from(andPGM.img_features.columns[1:10].tolist())
    initialModel.add_edges_from([('f6_a' , 'f2_a'),\
                             ('f3_a' , 'f4_a') ,\
                             ('f5_a' , 'f9_a') ,\
                             ('f4_a' , 'f7_a') ])

    # Use hill climb search algorithm to find network structure of initial 9 nodes
    hc = HillClimbSearch(data=andPGM.img_features.iloc[0:,1:10], \
                         scoring_method=BdeuScore(andPGM.img_features.iloc[0:,1:10], \
                                                  equivalent_sample_size=0.1*len(andPGM.img_features)), \
                         state_names = andPGM.states_9)
    # Get best estimated structure
    best_model = hc.estimate(start=initialModel)
    # Edges in the acquired graph
    print('model of 9 var: ', best_model.edges())

    # Create a Clone of generated Bayesian network structure
    # (each '_a' node is mirrored to a '_b' node with the same edges).
    clone_model = BayesianModel({})
    for edge in best_model.edges():
        new_edge = [edge[0][:-1] + 'b', edge[1][:-1] + 'b']
        clone_model.add_edges_from([new_edge])

    # Join together the Original and clone network through node 'same'
    # NOTE(review): `edges() + edges()` requires edges() to return a list
    # (older pgmpy); newer releases return edge views — confirm version.
    multinetModel = BayesianModel({})
    multinetModel.add_edges_from(best_model.edges() + clone_model.edges())
    multinetModel.add_node('same')
    multinetModel.add_edge('f5_a', 'same')
    multinetModel.add_edge('f9_a', 'same')
    multinetModel.add_edge('f5_b', 'same')
    multinetModel.add_edge('f9_b', 'same')
    print('Generating model.. Done')
    # Edges in the final structure
    print('Final model: ', multinetModel.edges())

    print('Fit data into model..')
    # fit the data to model to generate CPDs using maximum likelyhood estimation
    multinetModel.fit(data=train_set, state_names=andPGM.states_all)
    print('Fit data into model.. Done')
    print('CPDs generated: ')
    cpds = multinetModel.get_cpds()
    for cpd in cpds:
        print(cpd)
    # Inference using Variable Elimination
    print('Start inference..')
    inference = VariableElimination(multinetModel)
    # NOTE(review): rows with same == 0 are treated as the positive ("same")
    # class below — confirm the label encoding against load_features().
    train_set_same = train_set[train_set['same'] == 0]
    train_set_not_same = train_set[train_set['same'] == 1]

    # Accuracy of positive inferences
    acc_same = andPGM.chk_accuracy(
        train_set_same,
        inference,
        variables=train_set_same.columns[0:9].tolist(),
        evidence=train_set_same.columns[9:19].tolist())
    print('accuracy of positives ', acc_same)

    # Accuracy of negative inferences
    acc_nt_same = andPGM.chk_accuracy(
        train_set_not_same,
        inference,
        variables=train_set_not_same.columns[0:9].tolist(),
        evidence=train_set_not_same.columns[9:19].tolist())
    print('accuracy of negatives', acc_nt_same)
Exemple #25
0
	with open("network.csv", "wb") as f:
		writer = csv.writer(f)
		writer.writerows(best_model.edges())

def main():
	"""Learn a Bayesian network from temporal data with hill-climb search
	(BDeu score), fit CPDs with a Bayesian estimator, and print the result.

	Fixes over the original: Python 2 `print` statements (syntax errors
	under Python 3) converted to print() calls, and the mixed tab/space
	indentation that raised IndentationError is normalised to tabs.
	"""
	data = readData()
	# NOTE(review): `dataset` is read here but never assigned in this
	# function — presumably a module-level global set by readData();
	# confirm, otherwise this raises NameError.
	labels = np.array(dataset.columns)
	datasetNp = np.array(dataset)

	data = pd.DataFrame(datasetNp, columns=labels)
	n = labels.shape[0]

	output = np.chararray(3, itemsize=10)

	model = BayesianModel()
	# Original Python 2 `print "...",` suppressed the newline and emitted a
	# separating space before the next print; end=" " reproduces that.
	print("\nBayesian Network Inference with Temporal Data", end=" ")
	print("\nNetwork relations:  ")

	# Structure learning: hill climb under a BDeu score, bounded search.
	hc = HillClimbSearch(data, scoring_method=BdeuScore(data))
	best_model = hc.estimate(tabu_length=10, max_indegree=3)
	print(best_model.edges())

	# Parameter learning on the learned structure.
	best_model.fit(data, BayesianEstimator)
	for cpd in best_model.get_cpds():
		print(cpd)

	printOutput(best_model)

if __name__ == '__main__': # chamada da funcao principal
	main()
# Load the CASAS "twor.2009" activity sequences (numeric-converted CSV).
mydata = np.genfromtxt(
    r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\activities+time_ordered_withoutdatetime.csv',
    delimiter=",")
#pd.read_csv(r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\data.csv')
#print(mydata)
data = pd.DataFrame(mydata, columns=feature_names)  #['X', 'Y'])
print(data)

# Only BIC is enabled; BDeu and K2 were tried and left commented out.
list_of_scoring_methods = [
    BicScore(data),
    #BdeuScore(data),
    #K2Score(data)
]

# Time one hill-climb structure search per enabled scoring method.
for scoreMethod in list_of_scoring_methods:
    start_time = time.time()
    hc = HillClimbSearch(data, scoreMethod)
    best_model = hc.estimate()
    print(hc.scoring_method)
    print(best_model.edges())
    end_time = time.time()
    print("execution time in seconds:")
    print(end_time - start_time)

# Parameter learning uses `best_model` as it leaks out of the loop above,
# i.e. the structure from the LAST scoring method.
estimator = BayesianEstimator(best_model, data)
print(estimator.get_parameters(prior_type='K2'))  #, equivalent_sample_size=5)

#casas7_model = BayesianModel()
#casas7_model.fit(data, estimator=BayesianEstimator)#MaximumLikelihoodEstimator)
#print(casas7_model.get_cpds())
#casas7_model.get_n
# Load the Wisconsin breast-cancer data; `col_names` (defined above in this
# file) supplies the header names.
data = pd.read_csv('data/breast-cancer-wisconsin.data',
                   names=col_names.columns)
# Drop rows carrying the dataset's '?' missing-value marker.
data = data[data["bare_nuclei"] != '?']
data.set_index('id', inplace=True)  #stop the model from using id as a node

train, test = train_test_split(data, test_size=0.2, random_state=0)
Y_test = test['class']
test = test.drop(['class'], axis=1)

#convert labels to something that can be handled be sklearn's eval functions
labelencoder = LabelEncoder()
Y_test = labelencoder.fit_transform(Y_test.values.ravel())

### Greedy Structure Learning with Hill Climbing
# FIX: search over `train` instead of the full `data`.  The original passed
# `data` (which still contains the held-out test rows) to HillClimbSearch
# while scoring with BicScore(train): that leaks test data into structure
# learning and scores candidates on a different dataset than the one being
# searched.
hc = HillClimbSearch(train, scoring_method=BicScore(train))
hc_model = hc.estimate()

### Parameter Learning with Bayesian Estimation
hc_model.fit(train, estimator=BayesianEstimator, prior_type="BDeu")
### If the following for loop is un-commented the terminal will be flooded with CPDs
"""
for cpd in best_model.get_cpds():
    print(cpd)
"""

print()

### Another Method (it will throw errors about sample size - but it still runs and shouldn't be too messed up)
###Constraint Based Structure Learning
est = ConstraintBasedEstimator(train)
Exemple #28
0
import pandas as pd
import os

data = pd.read_csv("final_data.csv")

from pgmpy.estimators import BicScore, BdeuScore, ExhaustiveSearch, HillClimbSearch

# Structure learning: hill climb under a BDeu score with an equivalent
# sample size (prior strength) of 10.
est = HillClimbSearch(data,
                      scoring_method=BdeuScore(data,
                                               equivalent_sample_size=10))

best_model = est.estimate()
print("Structure found!")
print("Best Model")
print(best_model.edges())
'''
Parameters Estimation and CPD tables
'''
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

# Re-fit CPDs on a separate training file using the learned skeleton.
model = BayesianModel(best_model.edges())
data = pd.read_csv("final_training_set.csv")

estimator = BayesianEstimator(model, data)
parameters = estimator.get_parameters(prior_type='BDeu',
                                      equivalent_sample_size=10)

for cpd in parameters:
    model.add_cpds(cpd)
'''
Exemple #29
0
def _hillclimbsearch(df,
                     scoretype='bic',
                     black_list=None,
                     white_list=None,
                     max_indegree=None,
                     epsilon=1e-4,
                     max_iter=1e6,
                     bw_list_method='enforce',
                     verbose=3):
    """Learn a DAG from data with greedy hill-climb search.

    Wraps pgmpy's HillClimbSearch: starting from a disconnected DAG it
    repeatedly applies the single-edge change (add/remove/flip) that most
    improves the score until no change helps.  Only the structure is
    learned — no parametrization.  See Koller & Friedman, Probabilistic
    Graphical Models, Section 18.4.3.3 (page 818) for scoring details.

    When ``bw_list_method`` is ``'enforce'``, ``black_list``/``white_list``
    edge lists are passed through to the search to exclude or restrict
    candidate edges; otherwise the variables are assumed to be filtered
    already and the lists are ignored.  ``max_indegree`` caps the number of
    parents per node; ``epsilon`` and ``max_iter`` bound the search.

    Returns a dict with keys ``'model'`` (the estimated DAG) and
    ``'model_edges'`` (its edge list).
    """
    # Scorer chosen from the requested score type ('bic', 'k2', ...).
    score = _SetScoringType(df, scoretype)
    searcher = HillClimbSearch(df, scoring_method=score)

    if bw_list_method == 'enforce':
        constrained = (black_list is not None) or (white_list is not None)
        if constrained and verbose >= 3:
            print('[bnlearn]  >Enforcing nodes based on black_list and/or white_list.')
        dag = searcher.estimate(max_indegree=max_indegree,
                                epsilon=epsilon,
                                max_iter=max_iter,
                                black_list=black_list,
                                white_list=white_list)
    else:
        # Variables were already filtered upstream (or no filtering at all):
        # run the unconstrained search.
        dag = searcher.estimate(max_indegree=max_indegree,
                                epsilon=epsilon,
                                max_iter=max_iter)

    return {'model': dag, 'model_edges': dag.edges()}
from pgmpy.models import BayesianModel
from pgmpy.readwrite.BIF import BIFWriter
import pandas as pd
import numpy as np
from time import time
import graphviz as gv
import os

# Script: learn a Bayesian-network structure for the MSNBC click data with
# hill-climb search (BIC score), fit CPDs with a BDeu prior, and render the
# resulting DAG with graphviz.
# NOTE(review): HillClimbSearch, BicScore and BayesianEstimator are not
# imported in the visible import block above; import them here so this
# section is self-contained (a no-op if they were imported earlier in the
# file — TODO confirm against the full file).
from pgmpy.estimators import HillClimbSearch, BicScore, BayesianEstimator

train = pd.read_csv('../msnbcWithHeader.csv', sep=',')
# Keep only sparse sessions (row sum < 200 events) and binarize the counts.
train = train[train.sum(axis=1) < 200]
train[train > 1] = 1

train_start = time()
bic = BicScore(train)
hc = HillClimbSearch(train, scoring_method=bic)
best_model = hc.estimate(prog_bar=True)
edges = best_model.edges()
model = BayesianModel(edges)
# BDeu prior smooths the CPD estimates against zero counts.
model.fit(train, estimator=BayesianEstimator, prior_type="BDeu")
variables = model.nodes()

print(model.edges())
train_end = time() - train_start
print("train time " + str(train_end))

# Render the learned DAG to ../graph.png and open it in the viewer.
my_graph = gv.Digraph(format='png')
for node in variables:
    my_graph.node(node)
for edge in edges:
    my_graph.edge(edge[0], edge[1])
filename = my_graph.render('../graph', view=True)
# Exemple #31
# 0
    def learn(self, file1, file2):
        """Learn a Bayesian network that extends a user-supplied edge list
        with hill-climb structure search, then print each node's CPD.

        Parameters
        ----------
        file1 : str
            Path to a text file whose first line encodes an edge list,
            parsed by ``self.getegdes`` into a flat sequence
            ``[u1, v1, u2, v2, ...]``.
        file2 : str
            Path to a CSV file with one column per variable.

        Side effects: prints the final edge list and, for every node, a
        tab-separated line ``name, cardinality, CPT, parents, parent_cards``.
        """
        # Only the first line of file1 is needed; use `with` so the handle
        # is closed promptly (the original version leaked it).
        with open(file1, encoding="utf8") as f1:
            edges = self.getegdes(f1.readlines()[0])
        data = pd.read_csv(file2)

        # Seed graph G with the user-supplied (prior) edges.
        G = nx.DiGraph()
        for i in range(len(edges) // 2):
            G.add_edge(edges[2 * i], edges[2 * i + 1])

        # Structure learning: hill climbing scored by BIC.
        est = HillClimbSearch(data, scoring_method=BicScore(data))
        model = est.estimate()
        G_ = nx.DiGraph()
        G_.add_edges_from(model.edges())

        # Merge learned edges into G; an edge between two known nodes is
        # only added when it cannot close a directed cycle (no existing
        # path j -> i).
        for i, j in G_.edges():
            if i not in G.nodes() or j not in G.nodes():
                G.add_edge(i, j)
            elif not nx.has_path(G, j, i):
                G.add_edge(i, j)

        new_model = BayesianModel()
        new_model.add_edges_from(G.edges)
        G = new_model.copy()

        # Parameter learning with a K2 (uniform Dirichlet) prior.
        estimator = BayesianEstimator(G, data)

        edges = [str(i) for i in G.edges]
        print(edges)
        for i in G.nodes:
            cpd = estimator.estimate_cpd(i, prior_type="K2")
            nodeName = i
            values = dict(data[i].value_counts())
            valueNum = len(values)
            # Transpose so rows index parent configurations.
            CPT = np.transpose(cpd.values)
            # cpd.variables[0] is the node itself; the rest are its parents.
            sequence = cpd.variables[1::]
            card = []
            for x in sequence:
                s = len(dict(data[x].value_counts()))
                card.append(s)
            output = nodeName + '\t' + str(valueNum) + '\t' + str(
                CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
            print(output)