def hillclimbsearch(df, scoretype='bic', black_list=None, white_list=None, max_indegree=None, verbose=3):
    """Learn a DAG from *df* using pgmpy's HillClimbSearch.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations; one column per variable.
    scoretype : str
        Scoring-method name handed to ``SetScoringType`` (default 'bic').
    black_list, white_list : list or None
        Edge restrictions; only supported by pgmpy > v0.1.9.
    max_indegree : int or None
        Maximum number of parents per node.
    verbose : int
        Verbosity level (unused here; kept for API compatibility).

    Returns
    -------
    dict
        {'model': best DAG, 'model_edges': its edge list}.
    """
    out = dict()
    # Set scoring type
    scoring_method = SetScoringType(df, scoretype)
    # Set search algorithm
    model = HillClimbSearch(df, scoring_method=scoring_method)
    # Compute best DAG
    try:
        # black_list/white_list keywords work only for pgmpy > v0.1.9
        best_model = model.estimate(max_indegree=max_indegree, black_list=black_list, white_list=white_list)
    except TypeError:
        # BUGFIX: was a bare `except:` that silently swallowed *every* error.
        # Only the signature mismatch of older pgmpy (<= v0.1.9) is expected here.
        best_model = model.estimate(max_indegree=max_indegree)
    # Store
    out['model'] = best_model
    out['model_edges'] = best_model.edges()
    return out
def _hillclimbsearch(df, scoretype='bic', black_list=None, white_list=None, max_indegree=None, verbose=3):
    """Internal hill-climb structure learning (bnlearn wrapper).

    Same contract as ``hillclimbsearch``: returns a dict with the learned
    'model' and its 'model_edges'. When ``verbose >= 3`` a note is printed
    if black/white lists are in effect.
    """
    out = dict()
    # Set scoring type
    scoring_method = _SetScoringType(df, scoretype)
    # Set search algorithm
    model = HillClimbSearch(df, scoring_method=scoring_method)
    # Compute best DAG
    try:
        if (black_list is not None) or (white_list is not None):
            if verbose >= 3:
                print(
                    '[BNLEARN][STRUCTURE LEARNING] black_list and/or white_list are incorporated..'
                )
        # black_list/white_list keywords work only for pgmpy > v0.1.9
        best_model = model.estimate(max_indegree=max_indegree, black_list=black_list, white_list=white_list)
    except TypeError:
        # BUGFIX: was a bare `except:`. Only the old-pgmpy (<= v0.1.9)
        # signature mismatch is an expected failure; anything else should raise.
        best_model = model.estimate(max_indegree=max_indegree)
    # Store
    out['model'] = best_model
    out['model_edges'] = best_model.edges()
    return out
def generateDiffAndRarityModel(h0Diff, h0Rarity):
    """Estimate BN structures for the difference and rarity features.

    Computes correlation heatmaps for both arrays, runs hill-climb structure
    learning on each, and returns the two learned models.

    Parameters
    ----------
    h0Diff, h0Rarity : array-like, 9 columns each
        Feature matrices (columns become d1..d9 / r1..r9).

    Returns
    -------
    (diffModel, rarityModel) : learned DAGs from HillClimbSearch.estimate().
    """
    # correlation matrices (computed before the arrays are wrapped as DataFrames)
    h0DiffCorrelation = np.corrcoef(h0Diff, rowvar=False)
    h0RarityCorrelation = np.corrcoef(h0Rarity, rowvar=False)

    # converting to pandas data frames
    h0Diff = pd.DataFrame(h0Diff, columns=['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9'])
    h0Rarity = pd.DataFrame(h0Rarity, columns=['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8', 'r9'])

    print('\nestimating PGM\n')

    # hill-climb structure learning on the difference features
    hc = HillClimbSearch(h0Diff)
    diffModel = hc.estimate(max_indegree=40)
    print('difference model:\n', diffModel.edges())
    print('\nplotting heatmap for h0Diff correlation\n')
    sns.heatmap(h0DiffCorrelation, annot=True)
    plt.show()

    # hill-climb structure learning on the rarity features
    hc = HillClimbSearch(h0Rarity)
    rarityModel = hc.estimate(max_indegree=20)
    print('rarity model:\n', rarityModel.edges())
    print('\nplotting heatmap for h0Rarity correlation\n')
    sns.heatmap(h0RarityCorrelation, annot=True)
    plt.show()

    # BUGFIX: the original returned the undefined names
    # `h0DiffModel, h0RarityModel`, raising NameError at runtime.
    return diffModel, rarityModel
def pgm_generate(self, target, data, pgm_stats, subnodes, child=None):
    """Build and fit an explanatory Bayesian network around *target*.

    When ``child`` is None the structure is learned over the non-target
    subnodes only and the Markov-blanket nodes are wired into *target*;
    otherwise the structure is learned on the generalized data including
    the target.

    Parameters
    ----------
    target : node id (coerced to str)
    data : pandas.DataFrame of node features (columns coerced to str)
    pgm_stats : unused here; kept for interface compatibility with callers
    subnodes : iterable of node ids (coerced to str)
    child : flag selecting the structure-learning mode (None => MK-blanket mode)

    Returns
    -------
    BayesianModel fitted on the generalized data.
    """
    subnodes = [str(int(node)) for node in subnodes]
    target = str(int(target))
    subnodes_no_target = [node for node in subnodes if node != target]
    data.columns = data.columns.astype(str)

    MK_blanket = self.search_MK(data, target, subnodes_no_target.copy())

    # Generalized copy of the data. The original built this identical frame
    # up to three times; it is hoisted here and reused for both structure
    # learning (child mode) and CPD fitting.
    data_ex = data[subnodes].copy()
    data_ex[target] = data[target].apply(self.generalize_target)
    for node in subnodes_no_target:
        data_ex[node] = data[node].apply(self.generalize_others)

    if child is None:  # BUGFIX: was `child == None`; identity test is correct for None
        # Learn structure without the target, then connect the Markov blanket.
        est = HillClimbSearch(data[subnodes_no_target], scoring_method=BicScore(data))
        learned = est.estimate()
        for node in MK_blanket:
            if node != target:
                learned.add_edge(node, target)
    else:
        # Learn structure on the generalized data, target included.
        est = HillClimbSearch(data_ex, scoring_method=BicScore(data_ex))
        learned = est.estimate()

    # Create the pgm
    pgm_explanation = BayesianModel()
    for node in learned.nodes():
        pgm_explanation.add_node(node)
    for edge in learned.edges():
        pgm_explanation.add_edge(edge[0], edge[1])

    # Fit the pgm on the generalized data
    pgm_explanation.fit(data_ex)

    return pgm_explanation
class TimeHillClimbAlarmModel:
    """asv-style benchmark: hill-climb structure learning on 'alarm' samples."""

    # Abort the benchmark if a single run exceeds ten minutes.
    timeout = 600.0

    def setup(self):
        """Draw 10k samples from the reference 'alarm' network and prepare the estimator."""
        reference = get_example_model('alarm')
        samples = reference.simulate(n_samples=int(1e4), seed=42, show_progress=False)
        self.scoring_method = K2Score(samples)
        self.est = HillClimbSearch(data=samples)

    def time_hillclimb(self):
        """Timed payload: one bounded hill-climb search with a K2 score."""
        self.est.estimate(max_indegree=4, scoring_method=self.scoring_method, max_iter=int(1e4))
def create_BN_model(data):
    """Learn a Bayesian network from *data* and persist it as BIF.

    Structure is found by hill climbing, CPDs by maximum-likelihood fitting;
    the result is written to 'model_pgmpy.bif'.

    Returns
    -------
    (model, elapsed) : the fitted BayesianModel and the wall-clock duration.
    """
    print("Structure learning")
    start_time = datetime.now()
    print("Start time: ", start_time)

    # Structure learning via the hill-climb algorithm.
    searcher = HillClimbSearch(data)
    dag = searcher.estimate()
    print(dag.edges())

    model = BayesianModel(dag.edges())
    print('Fitting the model...')
    # CPDs estimated with Maximum Likelihood Estimation (pgmpy default).
    model.fit(data)

    end_time = datetime.now()
    print("End time: ", end_time)

    # Persist the fitted network in BIF format.
    BIFWriter(model).write_bif('model_pgmpy.bif')

    if model.check_model():
        print(
            "Your network structure and CPD's are correctly defined. The probabilities in the columns sum to 1. Hill Climb worked fine!"
        )
    else:
        print("not good")

    return model, end_time - start_time
def pgm_generate(self, target, data, stats, subnodes):
    """Build and fit an explanatory BN around *target*.

    Nodes whose p-value in *stats* is below 0.05 form the Markov blanket
    and are connected directly to the target after structure learning.

    Returns
    -------
    BayesianModel fitted on the generalized data.
    """
    pvalues = pd.Series(stats, name='p-values')
    significant = pvalues[pvalues < 0.05]
    MK_blanket = [n for n in significant.index if n in subnodes]

    others = [n for n in subnodes if n != target]

    # Learn a structure over the non-target nodes, BIC-scored.
    searcher = HillClimbSearch(data[others], scoring_method=BicScore(data))
    skeleton = searcher.estimate()

    # Wire every Markov-blanket node into the target.
    for n in MK_blanket:
        if n != target:
            skeleton.add_edge(n, target)

    # Copy the learned graph into a fresh BayesianModel.
    pgm_explanation = BayesianModel()
    for n in skeleton.nodes():
        pgm_explanation.add_node(n)
    for src, dst in skeleton.edges():
        pgm_explanation.add_edge(src, dst)

    # Generalize the data, then fit the CPDs.
    data_ex = data[subnodes].copy()
    data_ex[target] = data[target].apply(self.generalize_target)
    for n in others:
        data_ex[n] = data[n].apply(self.generalize_others)
    pgm_explanation.fit(data_ex)

    return pgm_explanation
def Hybrid(dataset: pd.DataFrame):
    """Hybrid MMHC structure learning: MMPC skeleton + hill-climb orientation.

    Returns
    -------
    (edges, [iter_list, eval_list]) : edges of the final model, plus the
    BDeu score obtained for each max_iter budget tried.
    """
    from pgmpy.estimators import MmhcEstimator
    from pgmpy.estimators import HillClimbSearch
    from pgmpy.estimators import BDeuScore, K2Score, BicScore
    from pgmpy.models import BayesianModel

    # Phase 1: constraint-based skeleton. significance_level (default 0.01)
    # bounds the Type-1 error of the independence tests — the lower it is,
    # the fewer dependencies are accepted and the sparser the graph.
    mmhc = MmhcEstimator(dataset)
    skeleton = mmhc.mmpc()
    print("Part 1) Skeleton: ", skeleton.edges())

    # Phase 2: orient the skeleton's edges with hill climbing (BDeu score).
    hc = HillClimbSearch(dataset, scoring_method=BDeuScore(dataset, equivalent_sample_size=5))
    bdeu = BDeuScore(dataset, equivalent_sample_size=5)

    # Track how the score evolves as the iteration budget grows.
    iter_list = [2**i for i in range(20)]
    eval_list = []
    for budget in iter_list:
        dag = hc.estimate(tabu_length=10,
                          white_list=skeleton.to_directed().edges(),
                          max_iter=budget)
        model = BayesianModel(dag.edges())
        score = bdeu.score(model)
        print(score)
        eval_list.append(score)

    print("Part 2) Model: ", model.edges())
    return model.edges(), [iter_list, eval_list]
def learnedStructureModel():
    """Learn a BN over writer-pair features and evaluate hypothesis node 'h'."""
    trainingInputs, trainingOutputs, testingInputs, testingOutputs = \
        gtd.formSameWriterDiffWriterInputOutputFeaturePairs(5, True)

    # Shared column layout: 18 discretized features plus the hypothesis 'h'.
    columns = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
               'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'h']
    trainingData = pd.DataFrame(
        data=np.concatenate((trainingInputs, trainingOutputs), axis=1),
        columns=columns)
    testingData = pd.DataFrame(
        data=np.concatenate((testingInputs, testingOutputs), axis=1),
        columns=columns)

    # BIC-scored hill climbing for the structure.
    searcher = HillClimbSearch(trainingData, scoring_method=BicScore(trainingData))
    model = searcher.estimate(max_indegree=20)

    # Explicit state spaces so the CPDs cover values absent from the sample.
    state_names = {
        'f1': [0, 1, 2, 3],
        'f2': [0, 1, 2, 3, 4],
        'f3': [0, 1, 2],
        'f4': [0, 1, 2, 3, 4],
        'f5': [0, 1, 2, 3],
        'f6': [0, 1, 2, 3],
        'f7': [0, 1, 2, 3],
        'f8': [0, 1, 2, 3, 4],
        'f9': [0, 1, 2],
        'f11': [0, 1, 2, 3],
        'f12': [0, 1, 2, 3, 4],
        'f13': [0, 1, 2],
        'f14': [0, 1, 2, 3, 4],
        'f15': [0, 1, 2, 3],
        'f16': [0, 1, 2, 3],
        'f17': [0, 1, 2, 3],
        'f18': [0, 1, 2, 3, 4],
        'f19': [0, 1, 2],
        'h': [0, 1]
    }

    # Fit CPDs with a BDeu prior.
    model.fit(trainingData, estimator=BayesianEstimator,
              prior_type='BDeu', state_names=state_names)
    print(model.edges())

    # Query P(h | all features): every column except 'h' is evidence.
    evidenceNodes = columns[:-1]
    evaluateModel(model, testingData, 'h', evidenceNodes)
def build_structure(data):
    """Learn a DAG from *data* and save it as an adjacency matrix.

    The matrix is written to 'dataset/DAG.npy' with DAG[i, j] == 1 for
    every learned edge i -> j.

    Returns
    -------
    numpy.ndarray of shape (n_vars, n_vars), dtype int64.
    """
    frame = pd.DataFrame(data)
    searcher = HillClimbSearch(frame, scoring_method=BicScore(frame))
    learned = searcher.estimate()

    n_vars = data.shape[1]
    DAG = np.zeros((n_vars, n_vars), np.int64)
    for src, dst in learned.edges():
        DAG[src, dst] = 1

    np.save('dataset/DAG.npy', DAG)
    return DAG
def bei_ye_si():
    """Train a Bayesian network on the Titanic data and print test accuracy.

    First 800 rows are used for training, the remainder for testing;
    structure is learned by BIC-scored hill climbing and CPDs by a
    Bayesian estimator with a BDeu prior.
    """
    warnings.filterwarnings("ignore")
    print('现在进行的算法是贝叶斯网络')
    # BUGFIX: the file handle was opened and never closed; a context
    # manager guarantees it is released after reading.
    with open('泰坦尼克号.txt') as f:
        dataset = pd.read_table(f, delim_whitespace=True)
    train = dataset[:800]
    test = dataset[800:]
    hc = HillClimbSearch(train, scoring_method=BicScore(train))
    best_model = hc.estimate()
    best_model.fit(train, estimator=BayesianEstimator,
                   prior_type="BDeu")  # default equivalent_sample_size=5
    predict_data = test.drop(columns=['Survived'], axis=1)
    y_pred = best_model.predict(predict_data)
    # Test-set accuracy: fraction of correctly predicted 'Survived' labels.
    print((y_pred['Survived'] == test['Survived']).sum() / len(test))
def estimate(self, scoring_method=None, tabu_length=10, significance_level=0.01):
    """Run the full MMHC procedure: MMPC skeleton, then scored orientation.

    Parameters
    ----------
    scoring_method : pgmpy score or None
        Defaults to a BDeu score with equivalent_sample_size=10.
    tabu_length : int
        Tabu-list length for the hill-climb phase.
    significance_level : float
        Type-1 error bound for the MMPC independence tests.

    Returns
    -------
    The DAG produced by hill climbing restricted to the skeleton.
    """
    if scoring_method is None:
        scoring_method = BDeuScore(self.data, equivalent_sample_size=10)

    skeleton = self.mmpc(significance_level)

    searcher = HillClimbSearch(self.data, scoring_method=scoring_method)
    return searcher.estimate(white_list=skeleton.to_directed().edges(),
                             tabu_length=tabu_length)
def Hill_Climbing(dataset: pd.DataFrame):
    """Hill-climb structure learning with a BDeu score over growing budgets.

    Returns
    -------
    (edges, [iter_list, eval_list]) : edges of the final model and the BDeu
    score obtained for each max_iter budget in iter_list.
    """
    from pgmpy.estimators import HillClimbSearch
    from pgmpy.estimators import BDeuScore, K2Score, BicScore
    from pgmpy.models import BayesianModel

    bdeu = BDeuScore(dataset, equivalent_sample_size=5)
    hc = HillClimbSearch(dataset,
                         scoring_method=BDeuScore(dataset, equivalent_sample_size=5))

    # Doubling iteration budgets; record the score reached under each.
    iter_list = [2**i for i in range(20)]
    eval_list = []
    for budget in iter_list:
        dag = hc.estimate(tabu_length=10, max_iter=budget)
        model = BayesianModel(dag.edges())
        score = bdeu.score(model)
        print(score)
        eval_list.append(score)

    return model.edges(), [iter_list, eval_list]
def scoreStructureLearn(data, search='HillClimbSearch', scoring_method='BicScore'):
    """Score-and-search structure learning.

    Parameters
    ----------
    data : pandas.DataFrame
    search : str
        'HillClimbSearch' or anything else for ExhaustiveSearch.
    scoring_method : str
        One of 'BicScore', 'K2Score', 'BdeuScore'.

    Returns
    -------
    The best model found by the chosen search algorithm.

    Raises
    ------
    ValueError
        For an unrecognized *scoring_method* (previously this fell through
        and crashed later with an UnboundLocalError).
    """
    if scoring_method == 'BicScore':
        scoring_method_tmp = BicScore(data)
    elif scoring_method == 'K2Score':
        scoring_method_tmp = K2Score(data)
    elif scoring_method == 'BdeuScore':
        scoring_method_tmp = BdeuScore(data, equivalent_sample_size=5)
    else:
        raise ValueError("unknown scoring_method: %r" % (scoring_method,))

    if search == 'HillClimbSearch':
        es = HillClimbSearch(data, scoring_method=scoring_method_tmp)
    else:
        es = ExhaustiveSearch(data, scoring_method=scoring_method_tmp)

    return es.estimate()
def learn_structure(self, method, scoring_method, log=True):
    '''
    (4) Method that builds the structure of the data
    -----------------
    Parameters:
    method          : The technique used to search for the structure
        -> scoring_approx     - To use an approximated search with scoring method
        -> scoring_exhaustive - To use an exhaustive search with scoring method
        -> constraint         - To use the constraint based technique
    scoring_method  : K2, bic, bdeu
    log             - "True" if you want to print debug information in the console

    Raises ValueError for an unrecognized method/scoring_method (previously
    these fell through and crashed later with an UnboundLocalError).
    '''
    # Select the scoring method for the local search of the structure.
    if scoring_method == "K2":
        scores = K2Score(self.data)
    elif scoring_method == "bic":
        scores = BicScore(self.data)
    elif scoring_method == "bdeu":
        scores = BdeuScore(self.data)
    else:
        raise ValueError("unknown scoring_method: %r" % (scoring_method,))

    # Select the actual search technique.
    if method == "scoring_approx":
        est = HillClimbSearch(self.data, scores)
    elif method == "scoring_exhaustive":
        est = ExhaustiveSearch(self.data, scores)
    elif method == "constraint":
        est = ConstraintBasedEstimator(self.data)
    else:
        raise ValueError("unknown method: %r" % (method,))

    self.best_model = est.estimate()
    self.eliminate_isolated_nodes(
    )  # REMOVE all nodes not connected to anything else

    # Persist the learned edges.
    for edge in self.best_model.edges_iter():
        self.file_writer.write_txt(str(edge))

    self.log("Method used for structural learning: " + method, log)
    #self.log("Training instances skipped: " + str(self.extractor.get_skipped_lines()), log)
    self.log("Search terminated", log)
def main():
    # NOTE: Python 2 code (print statements). Learns a Dynamic Bayesian
    # Network in two parts: the initial network B_0 and the transition
    # network B_transition, both via hill climbing with a BIC score that
    # incorporates background knowledge (`bk1=string` — presumably prior
    # edge knowledge from readData(); verify against readData's contract).
    data, string = readData()
    # Gene names are all columns except the first; labels keep every column.
    genes = np.array(data.columns[1:])
    labels = np.array(data.columns)
    bayesianModel = BayesianModel()
    transitionModel = DBN()
    bayesianModel.add_nodes_from(genes)
    transitionModel.add_nodes_from(genes)
    # Split the data into the B_0 slice and the transition slice.
    bData, tData = getData(data, labels)
    print "\nDynamic Bayesian Network inference",
    print "\nB_0 network relations: "
    # Hill climb for the initial network, seeded with the empty model.
    # NOTE(review): this HillClimbSearch/BicScore signature (extra `genes`,
    # `labels`, `bk1`, `weight` arguments) is not stock pgmpy — it appears
    # to target a customized fork; confirm before upgrading pgmpy.
    hcb = HillClimbSearch(bData, genes, scoring_method=BicScore(bData, labels, bk1=string, weight=4))
    best_model_b = hcb.estimate(start=bayesianModel, tabu_length=15, max_indegree=2)
    print(best_model_b.edges())
    printOutputB(best_model_b)
    print "\nLocal Probability Model: "
    # Fit CPDs for B_0 and print them.
    best_model_b.fit(bData, BayesianEstimator)
    for cpd in best_model_b.get_cpds():
        print(cpd)
    print "\nB_transition network relations: "
    # Hill climb for the transition network, seeded with the empty DBN.
    hct = HillClimbSearch(tData, genes, scoring_method=BicScore(tData, labels, bk1=string, weight=4))
    best_model_t = hct.estimate_dynamic(start=transitionModel, tabu_length=15, max_indegree=2)
    print(best_model_t.edges())
    printOutputT(best_model_t)
    print "\nLocal Probability Model: "
    # Fit CPDs for the transition network and print them.
    best_model_t.fit(tData, BayesianEstimator)
    for cpd in best_model_t.get_cpds():
        print(cpd)
# Time: 2020/12/21 15:38
import pandas as pd
import networkx as nx
from matplotlib import pyplot as plt
from pgmpy.models import BayesianModel
from pgmpy.estimators import HillClimbSearch
from pgmpy.estimators import BicScore

# Load the training data and learn a BN structure with BIC-scored hill climbing.
data = pd.read_csv(
    r'C:\Users\haomiaowu\Desktop\BN-Cheminformatics\Train-clear.csv')
bic = BicScore(data)
# BUGFIX: `bic` was constructed and then ignored while a second identical
# BicScore(data) was built inline; reuse the existing scorer.
hs = HillClimbSearch(data, scoring_method=bic)
best_model = hs.estimate()
print(best_model.edges())

# Draw the learned structure.
nx.draw(
    best_model,
    with_labels=True,
    node_size=1000,
    font_weight='bold',
    node_color='y',
)
plt.show()
def _prepare_pair_data(csv_path, features_f, features_g):
    """Load an image-pair CSV and join per-image features onto each pair.

    Returns (DataFrame, ndarray) — the cleaned integer frame (15 'f' feature
    columns, 15 'g' feature columns, then 'label') and its .values view.
    """
    pairs = pd.read_csv(csv_path, usecols=['left', 'right', 'label'])
    merged_f = pairs.merge(features_f, left_on='left', right_on='fimagename')
    merged_g = pairs.merge(features_g, left_on='right', right_on='gimagename')
    merged_f = merged_f.drop(['left', 'right', 'fimagename', 'label'], axis=1)
    merged_g = merged_g.drop(['left', 'right', 'gimagename', 'label'], axis=1)
    combined = pd.concat([merged_f, merged_g], axis=1)
    combined = pd.concat([combined, pairs.loc[:, 'label']], axis=1)
    # BUGFIX: replace() returns a new frame; the original discarded the
    # result, so +/-inf values survived into astype(int). Assign it back so
    # infinities become NaN and are dropped below.
    combined = combined.replace([np.inf, -np.inf], np.nan)
    combined.dropna(inplace=True)
    combined = combined.astype(int)
    return combined, combined.values


def _fit_cpds(model, train_df, feature_names):
    """Fit *model* on *train_df* and attach Bayesian-estimated CPDs in place."""
    model.fit(train_df)
    estimator = BayesianEstimator(model, train_df)
    cpds = []
    for name in feature_names:
        cpds.append(estimator.estimate_cpd('f' + name))
        cpds.append(estimator.estimate_cpd('g' + name))
    cpds.append(estimator.estimate_cpd('label'))
    model.add_cpds(*cpds)


def _accuracy(model, arr, feature_names):
    """MAP-predict 'label' for every row of *arr*; return percent correct.

    Row layout: columns 0-14 are 'f' features, 15-29 are 'g' features,
    column 30 is the label. Feature values are 1-based in the data and
    0-based in the CPDs, hence the -1.
    """
    engine = VariableElimination(model)
    predictions = []
    for i in range(arr.shape[0]):
        evidence = {}
        for j, name in enumerate(feature_names):
            evidence['f' + name] = arr[i, j] - 1
            # BUGFIX: the original read arr[i + 15, j] — an f-feature from a
            # *different row*. The g-features of row i live in columns 15..29.
            evidence['g' + name] = arr[i, j + 15] - 1
        result = engine.map_query(variables=['label'], evidence=evidence)
        predictions.append(result['label'])
    correct = 0
    for i in range(len(predictions)):
        if int(predictions[i]) == int(arr[i, 30]):
            correct += 1
    return correct / len(predictions) * 100


def main():
    """Train/evaluate handwriting-pair Bayesian models on seen, shuffled and unseen splits."""
    # Per-image handwriting features, duplicated with 'f' (left image) and
    # 'g' (right image) prefixes so a pair occupies one row.
    features_data = pd.read_csv(fileloc_features)
    features_data_f = features_data.add_prefix('f')
    features_data_g = features_data.add_prefix('g')

    # The six dataset splits (previously four near-identical inline copies).
    seen_traindata_final, seen_traindata_final_NDArray = \
        _prepare_pair_data(fileloc_seen_training, features_data_f, features_data_g)
    seen_validationdata_final, seen_validationdata_final_NDArray = \
        _prepare_pair_data(fileloc_seen_validation, features_data_f, features_data_g)
    shuffled_traindata_final, shuffled_traindata_final_NDArray = \
        _prepare_pair_data(fileloc_shuffled_training, features_data_f, features_data_g)
    shuffled_validationdata_final, shuffled_validationdata_final_NDArray = \
        _prepare_pair_data(fileloc_shuffled_validation, features_data_f, features_data_g)
    unseen_traindata_final, unseen_traindata_final_NDArray = \
        _prepare_pair_data(fileloc_unseen_training, features_data_f, features_data_g)
    unseen_validationdata_final, unseen_validationdata_final_NDArray = \
        _prepare_pair_data(fileloc_unseen_validation, features_data_f, features_data_g)

    # Creating base models
    featureNamesList = ["pen_pressure", "letter_spacing", "size", "dimension",
                        "is_lowercase", "is_continuous", "slantness", "tilt",
                        "entry_stroke_a", "staff_of_a", "formation_n",
                        "staff_of_d", "exit_stroke_d", "word_formation",
                        "constancy"]
    features_only_data = features_data[featureNamesList]
    initial_hcs = HillClimbSearch(features_only_data)
    initial_model = initial_hcs.estimate()
    #print(initial_model.edges())
    print("Hill Climb Done")

    # Hand-tuned structure shared by all three models: a per-image subgraph
    # duplicated for the 'f' and 'g' prefixes, plus four parents of 'label'.
    basemodel = BayesianModel([
        ('fpen_pressure', 'fis_lowercase'), ('fpen_pressure', 'fletter_spacing'),
        ('fsize', 'fslantness'), ('fsize', 'fpen_pressure'),
        ('fsize', 'fstaff_of_d'), ('fsize', 'fletter_spacing'),
        ('fsize', 'fexit_stroke_d'), ('fsize', 'fentry_stroke_a'),
        ('fdimension', 'fsize'), ('fdimension', 'fis_continuous'),
        ('fdimension', 'fslantness'), ('fdimension', 'fpen_pressure'),
        ('fis_lowercase', 'fstaff_of_a'), ('fis_lowercase', 'fexit_stroke_d'),
        ('fis_continuous', 'fexit_stroke_d'), ('fis_continuous', 'fletter_spacing'),
        ('fis_continuous', 'fentry_stroke_a'), ('fis_continuous', 'fstaff_of_a'),
        ('fis_continuous', 'fis_lowercase'), ('fslantness', 'fis_continuous'),
        ('fslantness', 'ftilt'), ('fentry_stroke_a', 'fpen_pressure'),
        ('fformation_n', 'fconstancy'), ('fformation_n', 'fword_formation'),
        ('fformation_n', 'fdimension'), ('fformation_n', 'fstaff_of_d'),
        ('fformation_n', 'fis_continuous'), ('fformation_n', 'fsize'),
        ('fformation_n', 'fstaff_of_a'), ('fstaff_of_d', 'fis_continuous'),
        ('fstaff_of_d', 'fexit_stroke_d'), ('fstaff_of_d', 'fis_lowercase'),
        ('fstaff_of_d', 'fslantness'), ('fstaff_of_d', 'fentry_stroke_a'),
        ('fword_formation', 'fdimension'), ('fword_formation', 'fstaff_of_a'),
        ('fword_formation', 'fsize'), ('fword_formation', 'fstaff_of_d'),
        ('fword_formation', 'fconstancy'), ('fconstancy', 'fstaff_of_a'),
        ('fconstancy', 'fletter_spacing'), ('fconstancy', 'fdimension'),
        ('gpen_pressure', 'gis_lowercase'), ('gpen_pressure', 'gletter_spacing'),
        ('gsize', 'gslantness'), ('gsize', 'gpen_pressure'),
        ('gsize', 'gstaff_of_d'), ('gsize', 'gletter_spacing'),
        ('gsize', 'gexit_stroke_d'), ('gsize', 'gentry_stroke_a'),
        ('gdimension', 'gsize'), ('gdimension', 'gis_continuous'),
        ('gdimension', 'gslantness'), ('gdimension', 'gpen_pressure'),
        ('gis_lowercase', 'gstaff_of_a'), ('gis_lowercase', 'gexit_stroke_d'),
        ('gis_continuous', 'gexit_stroke_d'), ('gis_continuous', 'gletter_spacing'),
        ('gis_continuous', 'gentry_stroke_a'), ('gis_continuous', 'gstaff_of_a'),
        ('gis_continuous', 'gis_lowercase'), ('gslantness', 'gis_continuous'),
        ('gslantness', 'gtilt'), ('gentry_stroke_a', 'gpen_pressure'),
        ('gformation_n', 'gconstancy'), ('gformation_n', 'gword_formation'),
        ('gformation_n', 'gdimension'), ('gformation_n', 'gstaff_of_d'),
        ('gformation_n', 'gis_continuous'), ('gformation_n', 'gsize'),
        ('gformation_n', 'gstaff_of_a'), ('gstaff_of_d', 'gis_continuous'),
        ('gstaff_of_d', 'gexit_stroke_d'), ('gstaff_of_d', 'gis_lowercase'),
        ('gstaff_of_d', 'gslantness'), ('gstaff_of_d', 'gentry_stroke_a'),
        ('gword_formation', 'gdimension'), ('gword_formation', 'gstaff_of_a'),
        ('gword_formation', 'gsize'), ('gword_formation', 'gstaff_of_d'),
        ('gword_formation', 'gconstancy'), ('gconstancy', 'gstaff_of_a'),
        ('gconstancy', 'gletter_spacing'), ('gconstancy', 'gdimension'),
        ('fis_continuous', 'label'), ('fword_formation', 'label'),
        ('gis_continuous', 'label'), ('gword_formation', 'label')])

    model_seen = basemodel.copy()
    model_shuffled = basemodel.copy()
    model_unseen = basemodel.copy()
    accuracies = {}

    # Seen model
    _fit_cpds(model_seen, seen_traindata_final, featureNamesList)
    print("CPDs Calculated")
    accuracies["seen_train"] = _accuracy(
        model_seen, seen_traindata_final_NDArray, featureNamesList)
    print("Bayesian Model Accuracy for Seen Training Data = " + str(accuracies["seen_train"]))
    accuracies["seen_validation"] = _accuracy(
        model_seen, seen_validationdata_final_NDArray, featureNamesList)
    print("Bayesian Model Accuracy for Seen Validation Data = " + str(accuracies["seen_validation"]))

    # Shuffled model
    _fit_cpds(model_shuffled, shuffled_traindata_final, featureNamesList)
    accuracies["shuffled_train"] = _accuracy(
        model_shuffled, shuffled_traindata_final_NDArray, featureNamesList)
    print("Bayesian Model Accuracy for Shuffled Training Data = " + str(accuracies["shuffled_train"]))
    accuracies["shuffled_validation"] = _accuracy(
        model_shuffled, shuffled_validationdata_final_NDArray, featureNamesList)
    print("Bayesian Model Accuracy for Shuffled Validation Data = " + str(accuracies["shuffled_validation"]))

    # Unseen model
    _fit_cpds(model_unseen, unseen_traindata_final, featureNamesList)
    accuracies["unseen_train"] = _accuracy(
        model_unseen, unseen_traindata_final_NDArray, featureNamesList)
    print("Bayesian Model Accuracy for Unseen Training Data = " + str(accuracies["unseen_train"]))
    accuracies["unseen_validation"] = _accuracy(
        model_unseen, unseen_validationdata_final_NDArray, featureNamesList)
    print("Bayesian Model Accuracy for Unseen Validation Data = " + str(accuracies["unseen_validation"]))
class TestBaseEstimator(unittest.TestCase):
    """Tests for HillClimbSearch structure learning (legacy pgmpy API where
    ``estimate()`` returns a model whose ``edges()`` is a plain list).

    The expected scores below are hard-coded reference values for K2 scoring;
    they pin the exact numeric behavior of ``_legal_operations``.
    """

    def setUp(self):
        # Random 2-column data plus C as an exact copy of B, so the only real
        # dependency a correct search should recover is between B and C.
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = HillClimbSearch(self.rand_data, scoring_method=K2Score(self.rand_data))
        # model1: three disconnected nodes; model2: same but with edge A->B.
        self.model1 = BayesianModel()
        self.model1.add_nodes_from(['A', 'B', 'C'])
        self.model2 = self.model1.copy()
        self.model2.add_edge('A', 'B')
        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv('pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data1 = self.titanic_data[["Survived", "Sex", "Pclass", "Age", "Embarked"]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)

    def test_legal_operations(self):
        """All single-edge operations (+/-/flip) and their score deltas on model2."""
        model2_legal_ops = list(self.est_rand._legal_operations(self.model2))
        model2_legal_ops_ref = [(('+', ('C', 'A')), -28.15602208305154),
                                (('+', ('A', 'C')), -28.155467430966382),
                                (('+', ('C', 'B')), 7636.947544933631),
                                (('+', ('B', 'C')), 7937.805375579936),
                                (('-', ('A', 'B')), 28.155467430966382),
                                (('flip', ('A', 'B')), -0.0005546520851567038)]
        # Only the set of operations is compared here, not the scores.
        self.assertSetEqual(set([op for op, score in model2_legal_ops]),
                            set([op for op, score in model2_legal_ops_ref]))

    def test_legal_operations_titanic(self):
        """Operation counts shrink as tabu list / max_indegree constraints are added."""
        est = self.est_titanic1
        start_model = BayesianModel([("Survived", "Sex"), ("Pclass", "Age"), ("Pclass", "Embarked")])
        legal_ops = est._legal_operations(start_model)
        self.assertEqual(len(list(legal_ops)), 20)
        tabu_list = [('-', ("Survived", "Sex")), ('-', ("Survived", "Pclass")), ('flip', ("Age", "Pclass"))]
        legal_ops_tabu = est._legal_operations(start_model, tabu_list=tabu_list)
        self.assertEqual(len(list(legal_ops_tabu)), 18)
        legal_ops_indegree = est._legal_operations(start_model, max_indegree=1)
        self.assertEqual(len(list(legal_ops_indegree)), 11)
        # With both constraints, the exact (operation, score) pairs are pinned.
        legal_ops_both = est._legal_operations(start_model, tabu_list=tabu_list, max_indegree=1)
        legal_ops_both_ref = [(('+', ('Embarked', 'Survived')), 10.050632580087608),
                              (('+', ('Survived', 'Pclass')), 41.88868046549101),
                              (('+', ('Age', 'Survived')), -23.635716036430836),
                              (('+', ('Pclass', 'Survived')), 41.81314459373226),
                              (('+', ('Sex', 'Pclass')), 4.772261678792802),
                              (('-', ('Pclass', 'Age')), 11.546515590731815),
                              (('-', ('Pclass', 'Embarked')), -32.171482832532774),
                              (('flip', ('Pclass', 'Embarked')), 3.3563814191281836),
                              (('flip', ('Survived', 'Sex')), 0.039737027979640516)]
        self.assertSetEqual(set(legal_ops_both), set(legal_ops_both_ref))

    def test_estimate_rand(self):
        """Search on the random data should recover only the B-C dependency,
        in either orientation, from both a default and a custom start model."""
        est1 = self.est_rand.estimate()
        self.assertSetEqual(set(est1.nodes()), set(['A', 'B', 'C']))
        self.assertTrue(est1.edges() == [('B', 'C')] or est1.edges() == [('C', 'B')])
        est2 = self.est_rand.estimate(start=BayesianModel([('A', 'B'), ('A', 'C')]))
        self.assertTrue(est2.edges() == [('B', 'C')] or est2.edges() == [('C', 'B')])

    def test_estimate_titanic(self):
        # Pin the exact structure learned on the 3-variable Titanic subset.
        self.assertSetEqual(set(self.est_titanic2.estimate().edges()),
                            set([('Survived', 'Pclass'), ('Sex', 'Pclass'), ('Sex', 'Survived')]))

    def tearDown(self):
        # Release the fixtures explicitly (large DataFrames).
        del self.rand_data
        del self.est_rand
        del self.model1
        del self.titanic_data
        del self.titanic_data1
        del self.titanic_data2
        del self.est_titanic1
        del self.est_titanic2
data2 = pd.DataFrame(data=raw_data2) import time t0 = time.time() # Uncomment below to perform exhaustive search searcher = ExhaustiveSearch(data2, scoring_method=K2Score(data2)) search = searcher.all_scores() print('time:', time.time() - t0) # Uncomment for printout: #for score, model in search: # print("{0} {1}".format(score, model.edges())) separator() hcs = HillClimbSearch(data2, scoring_method=K2Score(data)) model = hcs.estimate() hcs2 = HillClimbSearch(data2, scoring_method=K2Score(data2)) model2 = hcs2.estimate() hcs_bic = HillClimbSearch(data, scoring_method=BicScore(data)) model_bic = hcs_bic.estimate() hcs_bic2 = HillClimbSearch(data2, scoring_method=BicScore(data2)) model_bic2 = hcs_bic2.estimate() # End of Task 6
def task4():
    """Task 4: compare five Bayesian-network structures for the AND features.

    Model 1 is learned with hill-climb search (K2 score); Models 2-4 are
    hand-tuned variants of it and Model 5 is built from intuition.  Every
    candidate gets K2-prior CPDs, is appended to the global ``task4_bms``
    list, and is scored with K2Score on 1000 rows forward-sampled from
    itself; the best scorer is stored in the global ``task4_best_bm``.

    The original body repeated the fit/sample/score/print pipeline five
    times; it is factored into ``_fit_score_report``.  The Model 5 label is
    also corrected (it was printed as "based on HillClimbSearch" although the
    structure is the intuition-based one).
    """
    global andRawData, task4_best_bm
    feature_cols = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
    k2Scores = []
    andRawData_temp = pd.DataFrame(andRawData.values, columns=feature_cols)

    def _fit_score_report(model_temp, structure_msg, score_msg):
        # Shared pipeline: estimate CPDs with a K2 prior, register the model,
        # then score it on 1000 rows forward-sampled from itself.
        # NOTE(review): scoring a model on data sampled from itself is a
        # self-consistency check, not held-out accuracy.
        estimator = BayesianEstimator(model_temp, andRawData_temp)
        for fx in feature_cols:
            model_temp.add_cpds(estimator.estimate_cpd(fx, prior_type="K2"))
        task4_bms.append(model_temp)
        print(structure_msg + str(model_temp.edges()))
        sample = BayesianModelSampling(model_temp).forward_sample(size=1000)
        k2Scores_temp = K2Score(sample).score(model_temp)
        k2Scores.append(k2Scores_temp)
        print(score_msg + str(k2Scores_temp))

    # Model 1: structure found automatically by hill climbing.
    est = HillClimbSearch(andRawData_temp, scoring_method=K2Score(andRawData_temp))
    _fit_score_report(est.estimate(),
                      " Model 1: Model through HillClimbSearch is : ",
                      " Model 1: K2 Accuracy Score is ")

    # Model 2: manual variant of the hill-climb structure.
    _fit_score_report(
        BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'), ('f1', 'f7'),
                       ('f5', 'f3'), ('f9', 'f8'), ('f1', 'f6'), ('f9', 'f1'),
                       ('f9', 'f6'), ('f9', 'f2')]),
        " Model 2: Manual Model based on HillClimbSearch is : ",
        " Model 2: K2 Accuracy Score is ")

    # Model 3: manual variant of the hill-climb structure.
    _fit_score_report(
        BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'), ('f5', 'f7'),
                       ('f5', 'f3'), ('f9', 'f8'), ('f1', 'f2'), ('f9', 'f1'),
                       ('f9', 'f6'), ('f9', 'f2')]),
        " Model 3: Manual Model based on HillClimbSearch is : ",
        " Model 3: K2 Accuracy Score is ")

    # Model 4: sparser manual variant.
    _fit_score_report(
        BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f5', 'f7'), ('f5', 'f3'),
                       ('f1', 'f2'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f8')]),
        " Model 4: Manual Model based on HillClimbSearch is : ",
        " Model 4: K2 Accuracy Score is ")

    # Model 5: built from intuition (label corrected from the original, which
    # printed it as a HillClimbSearch-based model).
    _fit_score_report(
        BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f4', 'f7'), ('f1', 'f2'),
                       ('f8', 'f5'), ('f9', 'f6'), ('f9', 'f8')]),
        " Model 5: Manual Model based on Intuition is : ",
        " Model 5: K2 Accuracy Score is ")

    best_idx = k2Scores.index(max(k2Scores))
    # NOTE(review): indexing task4_bms with the local score index assumes
    # task4_bms was empty when task4() started — confirm at the call site.
    task4_best_bm = task4_bms[best_idx]
    print(" Best Bayesian Model with the highest accuracy score is thus Model " + str(1 + best_idx))
class TestBaseEstimator(unittest.TestCase):
    """Tests for HillClimbSearch structure learning (newer pgmpy API where
    ``edges()`` returns a view, hence the ``list(...)`` conversions below).

    The expected scores are hard-coded K2 reference values pinning the exact
    numeric behavior of ``_legal_operations``.
    """

    def setUp(self):
        # Random 2-column data plus C as an exact copy of B, so the only real
        # dependency a correct search should recover is between B and C.
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                      columns=list("AB"))
        self.rand_data["C"] = self.rand_data["B"]
        self.est_rand = HillClimbSearch(self.rand_data,
                                        scoring_method=K2Score(self.rand_data))
        # model1: three disconnected nodes; model2: same but with edge A->B.
        self.model1 = BayesianModel()
        self.model1.add_nodes_from(["A", "B", "C"])
        self.model2 = self.model1.copy()
        self.model2.add_edge("A", "B")
        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            "pgmpy/tests/test_estimators/testdata/titanic_train.csv")
        self.titanic_data1 = self.titanic_data[[
            "Survived", "Sex", "Pclass", "Age", "Embarked"
        ]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)

    def test_legal_operations(self):
        """All single-edge operations (+/-/flip) and their score deltas on model2."""
        model2_legal_ops = list(self.est_rand._legal_operations(self.model2))
        model2_legal_ops_ref = [
            (("+", ("C", "A")), -28.15602208305154),
            (("+", ("A", "C")), -28.155467430966382),
            (("+", ("C", "B")), 7636.947544933631),
            (("+", ("B", "C")), 7937.805375579936),
            (("-", ("A", "B")), 28.155467430966382),
            (("flip", ("A", "B")), -0.0005546520851567038),
        ]
        # Only the set of operations is compared here, not the scores.
        self.assertSetEqual(
            set([op for op, score in model2_legal_ops]),
            set([op for op, score in model2_legal_ops_ref]),
        )

    def test_legal_operations_titanic(self):
        """Operation counts shrink as tabu list / max_indegree constraints are added."""
        est = self.est_titanic1
        start_model = BayesianModel([("Survived", "Sex"), ("Pclass", "Age"),
                                     ("Pclass", "Embarked")])
        legal_ops = est._legal_operations(start_model)
        self.assertEqual(len(list(legal_ops)), 20)
        tabu_list = [
            ("-", ("Survived", "Sex")),
            ("-", ("Survived", "Pclass")),
            ("flip", ("Age", "Pclass")),
        ]
        legal_ops_tabu = est._legal_operations(start_model, tabu_list=tabu_list)
        self.assertEqual(len(list(legal_ops_tabu)), 18)
        legal_ops_indegree = est._legal_operations(start_model, max_indegree=1)
        self.assertEqual(len(list(legal_ops_indegree)), 11)
        # With both constraints, the exact (operation, score) pairs are pinned.
        legal_ops_both = est._legal_operations(start_model, tabu_list=tabu_list,
                                               max_indegree=1)
        legal_ops_both_ref = [
            (("+", ("Embarked", "Survived")), 10.050632580087608),
            (("+", ("Survived", "Pclass")), 41.88868046549101),
            (("+", ("Age", "Survived")), -23.635716036430836),
            (("+", ("Pclass", "Survived")), 41.81314459373226),
            (("+", ("Sex", "Pclass")), 4.772261678792802),
            (("-", ("Pclass", "Age")), 11.546515590731815),
            (("-", ("Pclass", "Embarked")), -32.171482832532774),
            (("flip", ("Pclass", "Embarked")), 3.3563814191281836),
            (("flip", ("Survived", "Sex")), 0.039737027979640516),
        ]
        self.assertSetEqual(set(legal_ops_both), set(legal_ops_both_ref))

    def test_estimate_rand(self):
        """Search on the random data should recover only the B-C dependency,
        in either orientation, from both a default and a custom start model."""
        est1 = self.est_rand.estimate()
        self.assertSetEqual(set(est1.nodes()), set(["A", "B", "C"]))
        self.assertTrue(
            list(est1.edges()) == [("B", "C")] or list(est1.edges()) == [("C", "B")])
        est2 = self.est_rand.estimate(start=BayesianModel([("A", "B"),
                                                           ("A", "C")]))
        self.assertTrue(
            list(est2.edges()) == [("B", "C")] or list(est2.edges()) == [("C", "B")])

    def test_estimate_titanic(self):
        # Pin the exact structure learned on the 3-variable Titanic subset.
        self.assertSetEqual(
            set(self.est_titanic2.estimate().edges()),
            set([("Survived", "Pclass"), ("Sex", "Pclass"), ("Sex", "Survived")]),
        )

    def tearDown(self):
        # Release the fixtures explicitly (large DataFrames).
        del self.rand_data
        del self.est_rand
        del self.model1
        del self.titanic_data
        del self.titanic_data1
        del self.titanic_data2
        del self.est_titanic1
        del self.est_titanic2
import matplotlib.pyplot as plt import networkx as nx from sklearn.preprocessing import KBinsDiscretizer data = pd.read_csv("data/data_auto_mpg.csv") # data = pd.DataFrame(np.random.randn(500, 5), columns=list('ABCDE')) # data['F'] = data['A'] * data['B'] for col in data.columns: if (data[col].dtype == np.float64 or data[col].dtype == np.float32): # bin_size = np.unique(data[col].values).shape[0] # kbins = KBinsDiscretizer(n_bins=bin_size, encode='ordinal', strategy='uniform').fit(data[col].values.reshape(-1,1)) # data[col] = kbins.transform(data[col].values.reshape(-1,1)).astype(np.int64) data[col] = data[col].astype(np.int64) data = data.iloc[:, :10] print(data.dtypes) print(data) print("aq") est = HillClimbSearch(data, scoring_method=K2Score(data)) print("aq") model = est.estimate(max_indegree=5) print("aq") print(model.edges) plt.figure() nx.draw_networkx(model) plt.show()
def main():
    """Build a two-half ('_a'/'_b') Bayesian multinet over handwriting
    features, fit it on the training split and report inference accuracy."""
    andPGM = PGM_t()
    print('loading features..')
    train_set, test_set = andPGM.load_features()
    print('loading features.. Done')
    # Bayesian network of 19 nodes, 9*2 variables of network given
    # Initial incomplete Bayesian model connected manually based on intuition
    print('Generating model.. ')
    initialModel = BayesianModel({})
    initialModel.add_nodes_from(andPGM.img_features.columns[1:10].tolist())
    initialModel.add_edges_from([('f6_a', 'f2_a'),
                                 ('f3_a', 'f4_a'),
                                 ('f5_a', 'f9_a'),
                                 ('f4_a', 'f7_a')])
    # Use hill climb search algorithm to find network structure of initial 9 nodes.
    # BDeu equivalent_sample_size is scaled to 10% of the dataset size.
    hc = HillClimbSearch(data=andPGM.img_features.iloc[0:, 1:10],
                         scoring_method=BdeuScore(andPGM.img_features.iloc[0:, 1:10],
                                                  equivalent_sample_size=0.1 * len(andPGM.img_features)),
                         state_names=andPGM.states_9)
    # Get best estimated structure, seeded with the manual edges above
    best_model = hc.estimate(start=initialModel)
    # Edges in the acquired graph
    print('model of 9 var: ', best_model.edges())
    # Create a clone of the learned structure with node names ending in 'b'
    # (each edge is rewritten feature-by-feature: e.g. 'f5_a' -> 'f5_b').
    clone_model = BayesianModel({})
    for edge in best_model.edges():
        new_edge = [edge[0][:-1] + 'b', edge[1][:-1] + 'b']
        clone_model.add_edges_from([new_edge])
    # Join together the original and clone networks through node 'same'
    multinetModel = BayesianModel({})
    multinetModel.add_edges_from(best_model.edges() + clone_model.edges())
    multinetModel.add_node('same')
    multinetModel.add_edge('f5_a', 'same')
    multinetModel.add_edge('f9_a', 'same')
    multinetModel.add_edge('f5_b', 'same')
    multinetModel.add_edge('f9_b', 'same')
    print('Generating model.. Done')
    # Edges in the final structure
    print('Final model: ', multinetModel.edges())
    print('Fit data into model..')
    # fit the data to model to generate CPDs using maximum likelyhood estimation
    multinetModel.fit(data=train_set, state_names=andPGM.states_all)
    print('Fit data into model.. Done')
    print('CPDs generated: ')
    cpds = multinetModel.get_cpds()
    for cpd in cpds:
        print(cpd)
    # Inference using Variable Elimination
    print('Start inference..')
    inference = VariableElimination(multinetModel)
    # NOTE(review): 'same' == 0 is treated as the positive ("same writer")
    # class here — confirm against PGM_t's label encoding.
    train_set_same = train_set[train_set['same'] == 0]
    train_set_not_same = train_set[train_set['same'] == 1]
    # Accuracy of positive inferences: predict the first half's features
    # (columns 0..8) given the second half plus 'same' (columns 9..18).
    acc_same = andPGM.chk_accuracy(
        train_set_same,
        inference,
        variables=train_set_same.columns[0:9].tolist(),
        evidence=train_set_same.columns[9:19].tolist())
    print('accuracy of positives ', acc_same)
    # Accuracy of negative inferences
    acc_nt_same = andPGM.chk_accuracy(
        train_set_not_same,
        inference,
        variables=train_set_not_same.columns[0:9].tolist(),
        evidence=train_set_not_same.columns[9:19].tolist())
    print('accuracy of negatives', acc_nt_same)
with open("network.csv", "wb") as f: writer = csv.writer(f) writer.writerows(best_model.edges()) def main(): data = readData() labels = np.array(dataset.columns) datasetNp = np.array(dataset) data = pd.DataFrame(datasetNp, columns=labels) n = labels.shape[0] output = np.chararray(3, itemsize=10) model = BayesianModel() print "\nBayesian Network Inference with Temporal Data", print "\nNetwork relations: " hc = HillClimbSearch(data, scoring_method=BdeuScore(data)) best_model = hc.estimate(tabu_length=10, max_indegree=3) print(best_model.edges()) best_model.fit(data, BayesianEstimator) for cpd in best_model.get_cpds(): print(cpd) printOutput(best_model) if __name__ == '__main__': # chamada da funcao principal main()
mydata = np.genfromtxt( r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\activities+time_ordered_withoutdatetime.csv', delimiter=",") #pd.read_csv(r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\data.csv') #print(mydata) data = pd.DataFrame(mydata, columns=feature_names) #['X', 'Y']) print(data) list_of_scoring_methods = [ BicScore(data), #BdeuScore(data), #K2Score(data) ] for scoreMethod in list_of_scoring_methods: start_time = time.time() hc = HillClimbSearch(data, scoreMethod) best_model = hc.estimate() print(hc.scoring_method) print(best_model.edges()) end_time = time.time() print("execution time in seconds:") print(end_time - start_time) estimator = BayesianEstimator(best_model, data) print(estimator.get_parameters(prior_type='K2')) #, equivalent_sample_size=5) #casas7_model = BayesianModel() #casas7_model.fit(data, estimator=BayesianEstimator)#MaximumLikelihoodEstimator) #print(casas7_model.get_cpds()) #casas7_model.get_n
data = pd.read_csv('data/breast-cancer-wisconsin.data', names=col_names.columns) data = data[data["bare_nuclei"] != '?'] data.set_index('id', inplace=True) #stop the model from using id as a node train, test = train_test_split(data, test_size=0.2, random_state=0) Y_test = test['class'] test = test.drop(['class'], axis=1) #convert labels to something that can be handled be sklearn's eval functions labelencoder = LabelEncoder() Y_test = labelencoder.fit_transform(Y_test.values.ravel()) ### Greedy Structure Learning with Hill Climbing hc = HillClimbSearch(data, scoring_method=BicScore(train)) hc_model = hc.estimate() ### Parameter Learning with Bayesian Estimation hc_model.fit(train, estimator=BayesianEstimator, prior_type="BDeu") ### If the following for loop is un-commented the terminal will be flooded with CPDs """ for cpd in best_model.get_cpds(): print(cpd) """ print() ### Another Method (it will throw errors about sample size - but it still runs and shouldn't be too messed up) ###Constraint Based Structure Learning est = ConstraintBasedEstimator(train)
import pandas as pd import os data = pd.read_csv("final_data.csv") from pgmpy.estimators import BicScore, BdeuScore, ExhaustiveSearch, HillClimbSearch est = HillClimbSearch(data, scoring_method=BdeuScore(data, equivalent_sample_size=10)) best_model = est.estimate() print("Structure found!") print("Best Model") print(best_model.edges()) ''' Parameters Estimation and CPD tables ''' from pgmpy.models import BayesianModel from pgmpy.estimators import BayesianEstimator model = BayesianModel(best_model.edges()) data = pd.read_csv("final_training_set.csv") estimator = BayesianEstimator(model, data) parameters = estimator.get_parameters(prior_type='BDeu', equivalent_sample_size=10) for cpd in parameters: model.add_cpds(cpd) '''
def _hillclimbsearch(df, scoretype='bic', black_list=None, white_list=None, max_indegree=None, epsilon=1e-4, max_iter=1e6, bw_list_method='enforce', verbose=3):
    """Heuristic hill climb search for DAGs, to learn network structure from data.

    Performs a local hill-climb search to estimate the DAG structure with
    optimal score, according to the scoring method built from `scoretype`.
    Starts from a disconnected DAG and proceeds by iteratively performing the
    single-edge manipulation (add/remove/flip) that maximally increases the
    score, terminating at a local maximum.  Only the structure is estimated,
    not the parametrization.  For details on scoring see Koller & Friedman,
    Probabilistic Graphical Models, Section 18.4.3.3 (page 818).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to learn the structure from.
    scoretype : str
        Scoring method name passed to `_SetScoringType` (default 'bic').
    black_list, white_list : list or None
        Edges to exclude from / restrict the search to.  Only used when
        `bw_list_method` is 'enforce'.
    max_indegree : int or None
        If given, only modifications keeping every node's parent count below
        this bound are considered.
    epsilon : float
        Minimal score improvement required to continue the search.
    max_iter : int
        Maximum number of hill-climb iterations.
    bw_list_method : str
        'enforce' passes the black/white lists to pgmpy; any other value
        assumes the variables were already filtered upstream.
    verbose : int
        Print progress messages when >= 3.

    Returns
    -------
    dict
        {'model': learned DAG, 'model_edges': its edges}.
    """
    out = dict()
    # Set scoring type
    scoring_method = _SetScoringType(df, scoretype)
    # Set search algorithm
    model = HillClimbSearch(df, scoring_method=scoring_method)

    # Compute best DAG.
    if bw_list_method == 'enforce':
        if (black_list is not None) or (white_list is not None):
            if verbose >= 3:
                print('[bnlearn] >Enforcing nodes based on black_list and/or white_list.')
        # BUGFIX: estimate() is now called unconditionally on this branch, so
        # best_model is assigned even when both lists are None (pgmpy treats
        # black_list=None / white_list=None as an unrestricted search).
        best_model = model.estimate(max_indegree=max_indegree, epsilon=epsilon, max_iter=max_iter, black_list=black_list, white_list=white_list)
    else:
        # At this point, variables are readily filtered based on bw_list_method
        # or not (if nothing defined).
        best_model = model.estimate(max_indegree=max_indegree, epsilon=epsilon, max_iter=max_iter)

    # Store
    out['model'] = best_model
    out['model_edges'] = best_model.edges()
    return out
from pgmpy.models import BayesianModel from pgmpy.readwrite.BIF import BIFWriter import pandas as pd import numpy as np from time import time import graphviz as gv import os train = pd.read_csv('../msnbcWithHeader.csv', sep=',') train = train[train.sum(axis=1) < 200] train[train > 1] = 1 train_start = time() bic = BicScore(train) hc = HillClimbSearch(train, scoring_method=bic) best_model = hc.estimate(prog_bar=True) edges = best_model.edges() model = BayesianModel(edges) model.fit(train, estimator=BayesianEstimator, prior_type="BDeu") variables = model.nodes() print(model.edges()) train_end = time() - train_start print("train time " + str(train_end)) my_graph = gv.Digraph(format='png') for node in variables: my_graph.node(node) for edge in edges: my_graph.edge(edge[0], edge[1]) filename = my_graph.render('../graph', view=True)
def learn(self, file1, file2):
    """Merge a hand-specified edge list with a structure learned from data,
    then print one tab-separated CPD line per node.

    Parameters
    ----------
    file1 : str
        Text file whose first line encodes edges; `self.getegdes` returns a
        flat list where consecutive pairs (2i, 2i+1) are (source, target).
    file2 : str
        CSV data file used for structure learning and CPD estimation.

    Side effects: prints the merged edge list and, per node, a line of
    "name<TAB>cardinality<TAB>CPT<TAB>parents<TAB>parent-cardinalities".
    NOTE(review): `f1` is opened but never closed.
    """
    f1 = open(file1, encoding="utf8")
    lines = f1.readlines()
    edges = self.getegdes(lines[0])
    data = pd.read_csv(file2)

    # Seed graph G with the manually specified edges (pairwise from the list).
    G = nx.DiGraph()
    for i in range(int(len(edges) / 2)):
        G.add_edge(edges[2 * i], edges[2 * i + 1])

    # Learn a structure from the data and merge its edges into G, skipping any
    # edge that would close a cycle (only added when no path j->i exists).
    est = HillClimbSearch(data, scoring_method=BicScore(data))
    model = est.estimate()
    G_ = nx.DiGraph()
    G_.add_edges_from(model.edges())
    for i, j in G_.edges():
        if i not in G.nodes() or j not in G.nodes():
            G.add_edge(i, j)
        elif not nx.has_path(G, j, i):
            G.add_edge(i, j)

    # Rebuild the merged graph as a pgmpy BayesianModel.
    new_model = BayesianModel()
    new_model.add_edges_from(G.edges)
    G = new_model.copy()

    # (A large block of commented-out experimental code — least-squares node
    # ordering over path constraints and a topological scoring pass — was
    # removed here; see version control history if needed.)

    # Estimate CPDs with a K2 prior and print the merged edge list.
    estimator = BayesianEstimator(G, data)
    edges = []
    for i in G.edges:
        edges.append(str(i))
    print(edges)
    for i in G.nodes:
        cpd = estimator.estimate_cpd(i, prior_type="K2")
        nodeName = i
        # Cardinality of this node = number of distinct observed values.
        values = dict(data[i].value_counts())
        valueNum = len(values)
        CPT = np.transpose(cpd.values)
        # CPT = cpd.values
        # Parents of the node (cpd.variables[0] is the node itself).
        sequence = cpd.variables[1::]
        card = []
        for x in sequence:
            s = len(dict(data[x].value_counts()))
            card.append(s)
        output = nodeName + '\t' + str(valueNum) + '\t' + str(
            CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
        print(output)