def learn(self, file1, file2):
    """Learn a Bayesian network that combines the expert edges listed in
    `file1` with the structure found by hill climbing on the data in `file2`,
    then print the estimated CPT of every node."""
    with open(file1, encoding="utf8") as f1:
        lines = f1.readlines()
    edges = self.getegdes(lines[0])  # parse the flat expert edge list [u0, v0, u1, v1, ...]
    data = pd.read_csv(file2)

    # Build the expert graph from the flat edge list.
    G = nx.DiGraph()
    for i in range(len(edges) // 2):
        G.add_edge(edges[2 * i], edges[2 * i + 1])

    # Learn a structure from the data and merge it into the expert graph,
    # skipping any edge that would create a directed cycle.
    est = HillClimbSearch(data, scoring_method=BicScore(data))
    model = est.estimate()
    G_ = nx.DiGraph()
    G_.add_edges_from(model.edges())
    for i, j in G_.edges():
        if i not in G.nodes() or j not in G.nodes():
            G.add_edge(i, j)
        elif not nx.has_path(G, j, i):
            G.add_edge(i, j)

    new_model = BayesianModel()
    new_model.add_edges_from(G.edges)
    G = new_model.copy()

    # Commented-out alternative that orders the nodes via a least-squares fit
    # of reachability constraints; kept for reference.
    # N = G.number_of_nodes()
    # B = np.zeros((N * (N - 1) // 2, N))
    # i = 0
    # y = []
    # k = 0
    # nodes = list(G.nodes._nodes.keys())
    # for i in range(len(nodes)):
    #     for j in range(i + 1, len(nodes)):
    #         if nx.has_path(G, nodes[i], nodes[j]):
    #             y.append(1)
    #             B[k, i] = 1
    #             B[k, j] = -1
    #         elif nx.has_path(G, nodes[j], nodes[i]):
    #             y.append(-1)
    #             B[k, i] = 1
    #             B[k, j] = -1
    #         else:
    #             y.append(0)
    #         k += 1
    #
    # W = np.eye(N, N)
    # est = HillClimbSearch(data, scoring_method=BicScore(data))
    # model = est.estimate()
    # G_ = nx.DiGraph()
    # G_.add_edges_from(model.edges())
    # queue = []
    # for node in G_.nodes():
    #     if G_.in_degree(node) == 0:
    #         queue.append(node)
    #         G.node[node]['s'] = N
    #     else:
    #         G.node[node]['s'] = N // 2
    # while len(queue) > 0:
    #     now = queue[0]
    #     l = list(G_._succ[now].keys())
    #     for i in l:
    #         G.node[i]['s'] = G.node[now]['s'] - 1
    #     queue += l
    #     queue.pop(0)
    #
    # phai = []
    # for node in G.nodes():
    #     phai.append(G.node[node]['s'])
    # miu1 = np.dot(np.transpose(B), B)
    # miu1 = np.linalg.pinv(miu1)
    # miu2 = np.dot(np.transpose(B), y)
    # miu2 = miu2 + phai
    # miu = np.dot(miu1, miu2)
    #
    # seq = miu.tolist()
    # seq = list(zip(seq, nodes))
    # seq = sorted(seq, key=lambda s: s[0])
    # seq = [x[1] for x in seq]
    # nx.draw(G)
    # plt.show()

    estimator = BayesianEstimator(G, data)
    edges = [str(e) for e in G.edges]
    print(edges)

    # Print one tab-separated line per node: name, cardinality, CPT,
    # parent sequence, and parent cardinalities.
    for node in G.nodes:
        cpd = estimator.estimate_cpd(node, prior_type="K2")
        values = dict(data[node].value_counts())
        valueNum = len(values)
        CPT = np.transpose(cpd.values)
        # CPT = cpd.values
        sequence = cpd.variables[1:]
        card = [len(dict(data[x].value_counts())) for x in sequence]
        output = (node + '\t' + str(valueNum) + '\t' + str(CPT.tolist()) +
                  '\t' + str(sequence) + '\t' + str(card))
        print(output)
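# A minimal usage sketch for learn() above. The enclosing class (which provides
# getegdes() for parsing the expert edge list) is not shown in this excerpt, so
# the class and file names here are hypothetical:
#
#   learner = StructureLearner()          # hypothetical enclosing class
#   learner.learn("expert_edges.txt",     # line 1 holds the flat expert edge list
#                 "samples.csv")          # discrete training data, one column per node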
def load_pair_data(path, features_data_f, features_data_g):
    """Read a left/right/label pair file, attach the f- and g-prefixed feature
    vectors of both images, and return (DataFrame, ndarray).

    The ndarray layout is: columns 0-14 f-features, 15-29 g-features, 30 label.
    """
    pairs = pd.read_csv(path, usecols=['left', 'right', 'label'])
    merged_f = pairs.merge(features_data_f, left_on='left', right_on='fimagename')
    merged_g = pairs.merge(features_data_g, left_on='right', right_on='gimagename')
    merged_f = merged_f.drop(['left', 'right', 'fimagename', 'label'], axis=1)
    merged_g = merged_g.drop(['left', 'right', 'gimagename', 'label'], axis=1)
    final = pd.concat([merged_f, merged_g, pairs.loc[:, 'label']], axis=1)
    # replace() is not in-place; the original code discarded its result.
    final = final.replace([np.inf, -np.inf], np.nan)
    final.dropna(inplace=True)
    final = final.astype(int)
    return final, final.values


def train_model(basemodel, traindata, featureNamesList):
    """Fit a copy of the base structure on traindata and attach Bayesian-estimated CPDs."""
    model = basemodel.copy()
    model.fit(traindata)
    estimator = BayesianEstimator(model, traindata)
    cpds = []
    for featureName in featureNamesList:
        cpds.append(estimator.estimate_cpd('f' + featureName))
        cpds.append(estimator.estimate_cpd('g' + featureName))
    cpds.append(estimator.estimate_cpd('label'))
    model.add_cpds(*cpds)
    return model


def evaluate(model, arr, featureNamesList):
    """Predict the label of every row of arr by MAP inference and return accuracy (%)."""
    ve = VariableElimination(model)
    predictions = []
    for i in range(arr.shape[0]):
        evidence = {}
        for index, featureName in enumerate(featureNamesList):
            evidence['f' + featureName] = arr[i, index] - 1
            # g-features live in columns 15-29 of the same row; the original
            # indexed arr[i + 15, index], i.e. a different row, which was a bug.
            evidence['g' + featureName] = arr[i, index + 15] - 1
        result = ve.map_query(variables=['label'], evidence=evidence)
        predictions.append(result['label'])
    correct = sum(int(p) == int(arr[i, 30]) for i, p in enumerate(predictions))
    return correct / len(predictions) * 100


def main():
    # Fetching features data
    features_data = pd.read_csv(fileloc_features)
    features_data_f = features_data.add_prefix('f')
    features_data_g = features_data.add_prefix('g')

    # Seen / Shuffled / Unseen training and validation data
    seen_traindata_final, seen_traindata_final_NDArray = load_pair_data(
        fileloc_seen_training, features_data_f, features_data_g)
    seen_validationdata_final, seen_validationdata_final_NDArray = load_pair_data(
        fileloc_seen_validation, features_data_f, features_data_g)
    shuffled_traindata_final, shuffled_traindata_final_NDArray = load_pair_data(
        fileloc_shuffled_training, features_data_f, features_data_g)
    shuffled_validationdata_final, shuffled_validationdata_final_NDArray = load_pair_data(
        fileloc_shuffled_validation, features_data_f, features_data_g)
    unseen_traindata_final, unseen_traindata_final_NDArray = load_pair_data(
        fileloc_unseen_training, features_data_f, features_data_g)
    unseen_validationdata_final, unseen_validationdata_final_NDArray = load_pair_data(
        fileloc_unseen_validation, features_data_f, features_data_g)

    # Creating base models
    featureNamesList = ["pen_pressure", "letter_spacing", "size", "dimension",
                        "is_lowercase", "is_continuous", "slantness", "tilt",
                        "entry_stroke_a", "staff_of_a", "formation_n", "staff_of_d",
                        "exit_stroke_d", "word_formation", "constancy"]
    features_only_data = features_data[featureNamesList]
    initial_hcs = HillClimbSearch(features_only_data)
    initial_model = initial_hcs.estimate()
    # print(initial_model.edges())
    print("Hill Climb Done")

    # The hand-fixed structure repeats the same 42 edges for the f- and
    # g-prefixed copies of the features, plus four edges into the label node.
    base_edges = [
        ('pen_pressure', 'is_lowercase'), ('pen_pressure', 'letter_spacing'),
        ('size', 'slantness'), ('size', 'pen_pressure'), ('size', 'staff_of_d'),
        ('size', 'letter_spacing'), ('size', 'exit_stroke_d'),
        ('size', 'entry_stroke_a'), ('dimension', 'size'),
        ('dimension', 'is_continuous'), ('dimension', 'slantness'),
        ('dimension', 'pen_pressure'), ('is_lowercase', 'staff_of_a'),
        ('is_lowercase', 'exit_stroke_d'), ('is_continuous', 'exit_stroke_d'),
        ('is_continuous', 'letter_spacing'), ('is_continuous', 'entry_stroke_a'),
        ('is_continuous', 'staff_of_a'), ('is_continuous', 'is_lowercase'),
        ('slantness', 'is_continuous'), ('slantness', 'tilt'),
        ('entry_stroke_a', 'pen_pressure'), ('formation_n', 'constancy'),
        ('formation_n', 'word_formation'), ('formation_n', 'dimension'),
        ('formation_n', 'staff_of_d'), ('formation_n', 'is_continuous'),
        ('formation_n', 'size'), ('formation_n', 'staff_of_a'),
        ('staff_of_d', 'is_continuous'), ('staff_of_d', 'exit_stroke_d'),
        ('staff_of_d', 'is_lowercase'), ('staff_of_d', 'slantness'),
        ('staff_of_d', 'entry_stroke_a'), ('word_formation', 'dimension'),
        ('word_formation', 'staff_of_a'), ('word_formation', 'size'),
        ('word_formation', 'staff_of_d'), ('word_formation', 'constancy'),
        ('constancy', 'staff_of_a'), ('constancy', 'letter_spacing'),
        ('constancy', 'dimension'),
    ]
    edges = [(prefix + u, prefix + v) for prefix in ('f', 'g') for u, v in base_edges]
    edges += [('fis_continuous', 'label'), ('fword_formation', 'label'),
              ('gis_continuous', 'label'), ('gword_formation', 'label')]
    basemodel = BayesianModel(edges)

    accuracies = {}
    datasets = [
        ("seen", seen_traindata_final, seen_traindata_final_NDArray,
         seen_validationdata_final_NDArray),
        ("shuffled", shuffled_traindata_final, shuffled_traindata_final_NDArray,
         shuffled_validationdata_final_NDArray),
        ("unseen", unseen_traindata_final, unseen_traindata_final_NDArray,
         unseen_validationdata_final_NDArray),
    ]
    for name, traindata, train_arr, validation_arr in datasets:
        # Training
        model = train_model(basemodel, traindata, featureNamesList)
        print("CPDs Calculated")
        # Testing - Training
        accuracies[name + "_train"] = evaluate(model, train_arr, featureNamesList)
        print("Bayesian Model Accuracy for " + name.capitalize() +
              " Training Data = " + str(accuracies[name + "_train"]))
        # Testing - Validation
        accuracies[name + "_validation"] = evaluate(model, validation_arr, featureNamesList)
        print("Bayesian Model Accuracy for " + name.capitalize() +
              " Validation Data = " + str(accuracies[name + "_validation"]))
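# Standard entry point, assuming this file is run as a script and that the
# fileloc_* path variables referenced in main() are defined at module level
# elsewhere in the file.
if __name__ == "__main__":
    main()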
class TestBaseEstimator(unittest.TestCase):
    def setUp(self):
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                      columns=list("AB"))
        self.rand_data["C"] = self.rand_data["B"]
        self.est_rand = HillClimbSearch(self.rand_data,
                                        scoring_method=K2Score(self.rand_data))
        self.model1 = BayesianModel()
        self.model1.add_nodes_from(["A", "B", "C"])
        self.model2 = self.model1.copy()
        self.model2.add_edge("A", "B")

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            "pgmpy/tests/test_estimators/testdata/titanic_train.csv")
        self.titanic_data1 = self.titanic_data[
            ["Survived", "Sex", "Pclass", "Age", "Embarked"]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)

    def test_legal_operations(self):
        model2_legal_ops = list(self.est_rand._legal_operations(self.model2))
        model2_legal_ops_ref = [
            (("+", ("C", "A")), -28.15602208305154),
            (("+", ("A", "C")), -28.155467430966382),
            (("+", ("C", "B")), 7636.947544933631),
            (("+", ("B", "C")), 7937.805375579936),
            (("-", ("A", "B")), 28.155467430966382),
            (("flip", ("A", "B")), -0.0005546520851567038),
        ]
        self.assertSetEqual(
            set([op for op, score in model2_legal_ops]),
            set([op for op, score in model2_legal_ops_ref]),
        )

    def test_legal_operations_titanic(self):
        est = self.est_titanic1
        start_model = BayesianModel([("Survived", "Sex"), ("Pclass", "Age"),
                                     ("Pclass", "Embarked")])
        legal_ops = est._legal_operations(start_model)
        self.assertEqual(len(list(legal_ops)), 20)

        tabu_list = [
            ("-", ("Survived", "Sex")),
            ("-", ("Survived", "Pclass")),
            ("flip", ("Age", "Pclass")),
        ]
        legal_ops_tabu = est._legal_operations(start_model, tabu_list=tabu_list)
        self.assertEqual(len(list(legal_ops_tabu)), 18)

        legal_ops_indegree = est._legal_operations(start_model, max_indegree=1)
        self.assertEqual(len(list(legal_ops_indegree)), 11)

        legal_ops_both = est._legal_operations(start_model, tabu_list=tabu_list,
                                               max_indegree=1)
        legal_ops_both_ref = [
            (("+", ("Embarked", "Survived")), 10.050632580087608),
            (("+", ("Survived", "Pclass")), 41.88868046549101),
            (("+", ("Age", "Survived")), -23.635716036430836),
            (("+", ("Pclass", "Survived")), 41.81314459373226),
            (("+", ("Sex", "Pclass")), 4.772261678792802),
            (("-", ("Pclass", "Age")), 11.546515590731815),
            (("-", ("Pclass", "Embarked")), -32.171482832532774),
            (("flip", ("Pclass", "Embarked")), 3.3563814191281836),
            (("flip", ("Survived", "Sex")), 0.039737027979640516),
        ]
        self.assertSetEqual(set(legal_ops_both), set(legal_ops_both_ref))

    def test_estimate_rand(self):
        est1 = self.est_rand.estimate()
        self.assertSetEqual(set(est1.nodes()), set(["A", "B", "C"]))
        self.assertTrue(list(est1.edges()) == [("B", "C")] or
                        list(est1.edges()) == [("C", "B")])

        est2 = self.est_rand.estimate(start=BayesianModel([("A", "B"), ("A", "C")]))
        self.assertTrue(list(est2.edges()) == [("B", "C")] or
                        list(est2.edges()) == [("C", "B")])

    def test_estimate_titanic(self):
        self.assertSetEqual(
            set(self.est_titanic2.estimate().edges()),
            set([("Survived", "Pclass"), ("Sex", "Pclass"), ("Sex", "Survived")]),
        )

    def tearDown(self):
        del self.rand_data
        del self.est_rand
        del self.model1
        del self.titanic_data
        del self.titanic_data1
        del self.titanic_data2
        del self.est_titanic1
        del self.est_titanic2
class TestBayesianModelMethods(unittest.TestCase):
    def setUp(self):
        self.G = BayesianModel([("a", "d"), ("b", "d"), ("d", "e"), ("b", "c")])
        self.G1 = BayesianModel([("diff", "grade"), ("intel", "grade")])
        diff_cpd = TabularCPD("diff", 2, values=[[0.2], [0.8]])
        intel_cpd = TabularCPD("intel", 3, values=[[0.5], [0.3], [0.2]])
        grade_cpd = TabularCPD(
            "grade",
            3,
            values=[
                [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                [0.8, 0.8, 0.8, 0.8, 0.8, 0.8],
            ],
            evidence=["diff", "intel"],
            evidence_card=[2, 3],
        )
        self.G1.add_cpds(diff_cpd, intel_cpd, grade_cpd)
        self.G2 = BayesianModel([("d", "g"), ("g", "l"), ("i", "g"), ("i", "l")])

    def test_moral_graph(self):
        moral_graph = self.G.moralize()
        self.assertListEqual(sorted(moral_graph.nodes()), ["a", "b", "c", "d", "e"])
        expected = [("a", "b"), ("a", "d"), ("b", "c"), ("d", "b"), ("e", "d")]
        for edge in moral_graph.edges():
            self.assertTrue(edge in expected or (edge[1], edge[0]) in expected)

    def test_moral_graph_with_edge_present_over_parents(self):
        G = BayesianModel([("a", "d"), ("d", "e"), ("b", "d"), ("b", "c"), ("a", "b")])
        moral_graph = G.moralize()
        self.assertListEqual(sorted(moral_graph.nodes()), ["a", "b", "c", "d", "e"])
        expected = [("a", "b"), ("c", "b"), ("d", "a"), ("d", "b"), ("d", "e")]
        for edge in moral_graph.edges():
            self.assertTrue(edge in expected or (edge[1], edge[0]) in expected)

    def test_get_ancestors_of_success(self):
        ancestors1 = self.G2._get_ancestors_of("g")
        ancestors2 = self.G2._get_ancestors_of("d")
        ancestors3 = self.G2._get_ancestors_of(["i", "l"])
        self.assertEqual(ancestors1, {"d", "i", "g"})
        self.assertEqual(ancestors2, {"d"})
        self.assertEqual(ancestors3, {"g", "i", "l", "d"})

    def test_get_ancestors_of_failure(self):
        self.assertRaises(ValueError, self.G2._get_ancestors_of, "h")

    def test_get_cardinality(self):
        self.assertDictEqual(self.G1.get_cardinality(),
                             {"diff": 2, "intel": 3, "grade": 3})

    def test_get_cardinality_with_node(self):
        self.assertEqual(self.G1.get_cardinality("diff"), 2)
        self.assertEqual(self.G1.get_cardinality("intel"), 3)
        self.assertEqual(self.G1.get_cardinality("grade"), 3)

    def test_local_independencies(self):
        self.assertEqual(self.G.local_independencies("a"),
                         Independencies(["a", ["b", "c"]]))
        self.assertEqual(self.G.local_independencies("c"),
                         Independencies(["c", ["a", "d", "e"], "b"]))
        self.assertEqual(self.G.local_independencies("d"),
                         Independencies(["d", "c", ["b", "a"]]))
        self.assertEqual(self.G.local_independencies("e"),
                         Independencies(["e", ["c", "b", "a"], "d"]))
        self.assertEqual(self.G.local_independencies("b"),
                         Independencies(["b", "a"]))
        self.assertEqual(self.G1.local_independencies("grade"), Independencies())

    def test_get_independencies(self):
        chain = BayesianModel([("X", "Y"), ("Y", "Z")])
        self.assertEqual(chain.get_independencies(),
                         Independencies(("X", "Z", "Y"), ("Z", "X", "Y")))
        fork = BayesianModel([("Y", "X"), ("Y", "Z")])
        self.assertEqual(fork.get_independencies(),
                         Independencies(("X", "Z", "Y"), ("Z", "X", "Y")))
        collider = BayesianModel([("X", "Y"), ("Z", "Y")])
        self.assertEqual(collider.get_independencies(),
                         Independencies(("X", "Z"), ("Z", "X")))

    def test_is_imap(self):
        val = [
            0.01, 0.01, 0.08, 0.006, 0.006, 0.048, 0.004, 0.004, 0.032,
            0.04, 0.04, 0.32, 0.024, 0.024, 0.192, 0.016, 0.016, 0.128,
        ]
        JPD = JointProbabilityDistribution(["diff", "intel", "grade"], [2, 3, 3], val)
        fac = DiscreteFactor(["diff", "intel", "grade"], [2, 3, 3], val)
        self.assertTrue(self.G1.is_imap(JPD))
        self.assertRaises(TypeError, self.G1.is_imap, fac)

    def test_markov_blanket(self):
        G = DAG([
            ("x", "y"), ("z", "y"), ("y", "w"), ("y", "v"), ("u", "w"),
            ("s", "v"), ("w", "t"), ("w", "m"), ("v", "n"), ("v", "q"),
        ])
        self.assertEqual(set(G.get_markov_blanket("y")),
                         set(["s", "w", "x", "u", "z", "v"]))

    def test_get_immoralities(self):
        G = BayesianModel([("x", "y"), ("z", "y"), ("x", "z"), ("w", "y")])
        self.assertEqual(G.get_immoralities(), {("w", "x"), ("w", "z")})
        G1 = BayesianModel([("x", "y"), ("z", "y"), ("z", "x"), ("w", "y")])
        self.assertEqual(G1.get_immoralities(), {("w", "x"), ("w", "z")})
        G2 = BayesianModel([("x", "y"), ("z", "y"), ("x", "z"), ("w", "y"), ("w", "x")])
        self.assertEqual(G2.get_immoralities(), {("w", "z")})

    def test_is_iequivalent(self):
        G = BayesianModel([("x", "y"), ("z", "y"), ("x", "z"), ("w", "y")])
        self.assertRaises(TypeError, G.is_iequivalent, MarkovModel())
        G1 = BayesianModel([("V", "W"), ("W", "X"), ("X", "Y"), ("Z", "Y")])
        G2 = BayesianModel([("W", "V"), ("X", "W"), ("X", "Y"), ("Z", "Y")])
        self.assertTrue(G1.is_iequivalent(G2))
        G3 = BayesianModel([("W", "V"), ("W", "X"), ("Y", "X"), ("Z", "Y")])
        self.assertFalse(G3.is_iequivalent(G2))

    def test_copy(self):
        model_copy = self.G1.copy()
        self.assertEqual(sorted(self.G1.nodes()), sorted(model_copy.nodes()))
        self.assertEqual(sorted(self.G1.edges()), sorted(model_copy.edges()))
        self.assertNotEqual(id(self.G1.get_cpds("diff")),
                            id(model_copy.get_cpds("diff")))

        self.G1.remove_cpds("diff")
        diff_cpd = TabularCPD("diff", 2, values=[[0.3], [0.7]])
        self.G1.add_cpds(diff_cpd)
        self.assertNotEqual(self.G1.get_cpds("diff"), model_copy.get_cpds("diff"))

        self.G1.remove_node("intel")
        self.assertNotEqual(sorted(self.G1.nodes()), sorted(model_copy.nodes()))
        self.assertNotEqual(sorted(self.G1.edges()), sorted(model_copy.edges()))

    def test_remove_node(self):
        self.G1.remove_node("diff")
        self.assertEqual(sorted(self.G1.nodes()), sorted(["grade", "intel"]))
        self.assertRaises(ValueError, self.G1.get_cpds, "diff")

    def test_remove_nodes_from(self):
        self.G1.remove_nodes_from(["diff", "grade"])
        self.assertEqual(sorted(self.G1.nodes()), sorted(["intel"]))
        self.assertRaises(ValueError, self.G1.get_cpds, "diff")
        self.assertRaises(ValueError, self.G1.get_cpds, "grade")

    def tearDown(self):
        del self.G
        del self.G1
class DecomposableModel:
    """
    A class for learning decomposable models from coarsened kDGs.

    Attributes:
        alpha (int): The equivalent sample size of the Dirichlet uniform prior.
        data (np.array): An np.array which holds the data set.
        num_vars (int): The number of variables.
        data_frame (pd.DataFrame): The data frame version of data.
        var_states (list): For each i in range(num_vars), var_states[i] is the
            number of variable states observed.
        bdeu (pgmpy.BDeuScore): A BDeuScore object with uniform prior alpha used
            to score networks.
        maxtree (nx.Graph): A copy of the maximal spanning tree learned from the
            BDeu score of the data.
        undirected (nx.Graph): A networkx graph which stores the learned
            undirected models.
        directed (pgmpy.BayesianModel): A pgmpy BayesianModel which stores the
            learned directed model via minimal I-map.
    """

    def __init__(self, fileName, alpha=1):
        """
        Build the empty graph model with n=data.shape[1] vertices.
        Set other attributes for use in model learning.

        :param fileName (string): The path of the .csv file containing the data.
        :param alpha (int): The equivalent sample size of the Dirichlet uniform prior.
        """
        self.alpha = alpha
        self.data = np.genfromtxt(fileName, delimiter=',')
        self.num_vars = self.data.shape[1]
        self.data_frame = pd.DataFrame(self.data, columns=list(range(self.num_vars)))
        self.var_states = []
        for index in range(len(self.data_frame.columns)):
            self.var_states.append(self.data_frame[index].nunique())
        self.bdeu = BDeuScore(self.data_frame, equivalent_sample_size=alpha)
        self.undirected = nx.Graph()
        self.undirected.add_nodes_from(range(self.num_vars))
        self.maxtree = self.undirected.copy()
        self.directed = BayesianModel()

    def mst(self):
        """Build the maximum spanning tree for the given data and alpha.
        The result is kept in the maxtree attribute (and copied to undirected
        if no model has been learned yet).
        """
        complete_graph = nx.Graph()
        lexicographic_edges = list(combinations(list(range(self.num_vars)), 2))
        weight_list = self.get_weight_list(lexicographic_edges)
        for index, edge in enumerate(lexicographic_edges):
            complete_graph.add_edge(edge[0], edge[1], weight=-1 * weight_list[index])
        edges = list(nx.algorithms.tree.minimum_spanning_edges(
            complete_graph, algorithm='kruskal', data=False))
        self.maxtree.add_edges_from(edges)
        if nx.classes.function.is_empty(self.directed):
            self.undirected = self.maxtree.copy()

    def get_score(self, V):
        """
        The score function. For alpha=0 it computes the empirical entropy of a
        given set of variables. For alpha>0 it computes the BDeu score with
        equivalent sample size alpha associated with a set of variables:

            BDeu(V) = sum_v [ ln Gamma(N_v + alpha/r_V) - ln Gamma(alpha/r_V) ]

        where N_v denotes the number of cases of the dataset in which the
        variables V take the joint value v, and r_V denotes the cardinality
        (the number of different joint values) of the set of variables V.

        Remarks: this method only works for discrete random variables.

        :param V (list): A subset of variables given by their indices, list(int).
        :return score (float): The BDeu score of variables V given the data set D.
        """
        if len(V) == 0:
            return (scp.special.loggamma(self.alpha)
                    - scp.special.loggamma(self.num_vars + self.alpha))
        # Obtain the cardinality of V
        r = [self.var_states[i] for i in V]
        rV = np.prod(r)
        # Get the frequencies of every observed configuration
        N = np.unique(self.data[:, V], return_counts=True, axis=0)[1]
        if self.alpha > 0:
            # One term per observed configuration; the (rV - len(N)) unobserved
            # configurations of V contribute zero.
            lG = (scp.special.loggamma(self.alpha / rV)
                  - scp.special.loggamma(N + self.alpha / rV))
            return np.sum(lG)
        else:
            # Compute the entropy (natural log) from the empirical probabilities
            # of the observed configurations. N holds integer counts, so an
            # in-place "/=" would raise a casting error; rebind instead.
            N = N / self.data.shape[0]
            return -np.sum(N * np.log(N))

    def get_weight_list(self, edges):
        """
        Return the scores for edges which could be added to the empty graph.

        :param edges (list(tuple)): The list of edges to consider adding to the empty graph.
        :return weight_list (list): The list of weights corresponding to each edge in edges.
        """
        weight_list = []
        for edge in edges:
            weight_list.append(self.get_score([edge[0]]) + self.get_score([edge[1]])
                               - self.get_score(list(edge)) - self.get_score([]))
        return weight_list

    def get_objective(self, dict_of_vars):
        """
        In the ILP model formulation of https://opt-ml.org/oldopt/papers/OPT2015_paper_36,
        this defines the vector w_{u,v|S}. That is, each element is the BDeu score
        related to adding edge (u,v) given separator S.

        :param dict_of_vars (dict): A dictionary which maps each candidate {u,v|S}
            to an index i in w.
        :return w (np.array): The vector of weights given the dictionary elements.
            That is, w[i] = score({u,v|S}).
        """
        w = np.zeros(len(dict_of_vars.keys()))
        for key in dict_of_vars.keys():
            to_scoreUVS = list({key[0][0], key[0][1]}.union(key[1]))
            to_scoreUS = list({key[0][0]}.union(key[1]))
            to_scoreVS = list({key[0][1]}.union(key[1]))
            to_scoreS = list(key[1])
            w[dict_of_vars[key]] = (self.get_score(to_scoreUS) + self.get_score(to_scoreVS)
                                    - self.get_score(to_scoreUVS) - self.get_score(to_scoreS))
        return w

    def learn(self, k_max=np.inf, l_max=np.inf, max_time=1000.0, max_time_gurobi=500.0,
              maximal=False, coarsen=False, extra_iters=np.inf):
        """Learn kDGs (or MkDGs) by the coarsening procedure proposed in
        https://opt-ml.org/oldopt/papers/OPT2015_paper_36. The learned graph is
        placed in the undirected attribute.

        :param k_max (int): The maximal clique size of the learned network.
        :param l_max (int): The maximal length of an edge which may be considered
            for addition to the model.
        :param max_time (float): The maximum running time of the coarsening algorithm.
        :param max_time_gurobi (float): The maximum running time for each call to Gurobi.
        :param maximal (bool): Whether to learn a kDG (False) or an MkDG (True).

        Remarks: max_time is only checked after each coarsening step.
        Therefore, the actual running time may be longer than what is specified
        by max_time.
        """
        if nx.classes.function.is_empty(self.maxtree):
            self.mst()
        if coarsen:
            current_model = self.undirected
            iter = 1
        else:
            current_model = self.maxtree
            iter = 0
        sol = [1]
        current_max_clique = 2
        start = time.time()
        end = 0
        while np.linalg.norm(sol, ord=1) > 0.01 and (end - start) < max_time:
            edge_list = ch.get_sorted_edges(current_model)
            cliques = ch.chain_of_cliques(current_model)
            if not maximal:
                # Track the current clique number and stop once k_max is reached.
                for clique in cliques:
                    if len(clique) > current_max_clique:
                        current_max_clique = len(clique)
                if current_max_clique == k_max:
                    break
            # Build the ILP over candidate edges {u,v|S}.
            sep_list = ch.get_separators(cliques)
            comps = ch.get_comp_list(current_model, sep_list)
            sep_to_comps = dict(zip(sep_list, comps))
            e_to_seps = ch.get_seps_for_cand_edges(comps, sep_list)
            list_vars = co.generate_dec_variables(e_to_seps, l_max)
            dict_of_vars = {uvS: ind for ind, uvS in enumerate(list_vars)}
            c = self.get_objective(dict_of_vars)
            m = gp.Model("iterate" + str(current_max_clique))
            x = m.addMVar(shape=len(c), vtype=GRB.BINARY, name="x")
            m.setObjective(c @ x, GRB.MAXIMIZE)
            m.Params.MIPFocus = 1
            m.Params.timeLimit = max_time_gurobi
            A1, b1 = co.type1(e_to_seps, dict_of_vars)
            m.addConstr(A1 @ x <= b1, name="c1")
            A2u, b2u, A2v, b2v = co.type2(e_to_seps, dict_of_vars, edge_list)
            m.addConstr(A2u @ x <= b2u, name="c2u")
            m.addConstr(A2v @ x <= b2v, name="c2v")
            # A3, b3, r = co.type3(sep_to_comps, dict_of_vars)
            if maximal:
                A3, b3, r = co.type3(sep_to_comps, dict_of_vars)
                m.addConstr(A3[0:r, :] @ x == b3[0:r], name="c3eq")
                m.addConstr(A3[r:len(b3), :] @ x <= b3[r:len(b3)], name="c3ineq")
                current_max_clique += 1
            elif coarsen:
                # In coarsening mode the ILP is rebuilt with non-breaking
                # components and equality-constrained type 3 constraints.
                sep_list = ch.get_separators(cliques)
                comps = ch.get_nb_comp_list(current_model, sep_list)
                sep_to_comps = dict(zip(sep_list, comps))
                e_to_seps = ch.get_seps_for_cand_edges(comps, sep_list)
                list_vars = co.generate_dec_variables(e_to_seps, l_max)
                dict_of_vars = {uvS: ind for ind, uvS in enumerate(list_vars)}
                c = self.get_objective(dict_of_vars)
                m = gp.Model("iterate" + str(current_max_clique))
                x = m.addMVar(shape=len(c), vtype=GRB.BINARY, name="x")
                m.setObjective(c @ x, GRB.MAXIMIZE)
                m.Params.MIPFocus = 1
                m.Params.timeLimit = max_time_gurobi
                A1, b1 = co.type1(e_to_seps, dict_of_vars)
                m.addConstr(A1 @ x <= b1, name="c1")
                A2u, b2u, A2v, b2v = co.type2(e_to_seps, dict_of_vars, edge_list)
                m.addConstr(A2u @ x <= b2u, name="c2u")
                m.addConstr(A2v @ x <= b2v, name="c2v")
                A3, b3, r = co.type3(sep_to_comps, dict_of_vars)
                m.addConstr(A3[0:r, :] @ x == b3[0:r], name="c3eq")
                m.addConstr(A3[r:len(b3), :] @ x <= b3[r:len(b3)], name="c3ineq")
            else:
                A3, b3, r = co.type3(sep_to_comps, dict_of_vars)
                m.addConstr(A3 @ x <= b3, name="c3")
            m.optimize()
            sol = x.X
            new_edges = ch.update_graph(sol, dict_of_vars)
            e = edge_list + new_edges
            current_model = nx.Graph(e)
            end = time.time()
            if iter == extra_iters:
                break
            iter += 1
        self.undirected = current_model

    def to_bn(self, use_mst=False):
        """
        Given the undirected graph, compute the directed model corresponding to
        the minimal I-map. The directed model is placed in the directed attribute.

        Remarks: This method supports models which are not connected.

        :param use_mst (bool): To direct a pre-learned model in the undirected
            attribute, use use_mst=False. To direct the maximum spanning tree
            learned by self.mst, use use_mst=True.
        """
        if use_mst:
            edges = list(self.maxtree.edges())
            vertices = list(self.maxtree.nodes)
            mm = MarkovModel()
            mm.add_nodes_from(vertices)
            mm.add_edges_from(edges)
            bm = mm.to_bayesian_model()
            self.directed = bm
        else:
            # Generate connected components
            connected = list(nx.algorithms.components.connected_components(self.undirected))
            # Hold the directed version of each connected component
            connected_comps = list()
            # If the graph is not completely connected
            if len(connected) > 1:
                self.directed = BayesianModel()
                for comp in connected:
                    # If the connected component is not a single vertex
                    if len(comp) > 1:
                        edges_to_add = list()
                        temp = MarkovModel()
                        for vert1 in comp:
                            neighbours = list(self.undirected.neighbors(vert1))
                            for vert2 in neighbours:
                                if ((vert1, vert2) not in edges_to_add
                                        and (vert2, vert1) not in edges_to_add):
                                    edges_to_add.append((vert1, vert2))
                        temp.add_nodes_from(comp)
                        temp.add_edges_from(edges_to_add)
                        temp_bn = temp.to_bayesian_model()
                        connected_comps.append(temp_bn)
                    else:
                        self.directed.add_nodes_from(comp)
                for bn in connected_comps:
                    self.directed.add_nodes_from(list(bn.nodes))
                    self.directed.add_edges_from(list(bn.edges))
            else:
                # If the graph is completely connected, just add all edges to the Markov model
                edges = list(self.undirected.edges())
                vertices = list(self.undirected.nodes)
                mm = MarkovModel()
                mm.add_nodes_from(vertices)
                mm.add_edges_from(edges)
                bm = mm.to_bayesian_model()
                self.directed = bm

    def fic(self):
        """Obtain the maximal set of length-1 edges with the maximum total weight
        that can be added to the current structure. This is equivalent to adding
        edges of length 1 with the ILP formulation with the equality condition in
        type 3 constraints.

        :return: The edges that can be added across the different separators,
            list((u, v)) with u < v.
        """
        current_model = self.undirected
        cliques = ch.chain_of_cliques(current_model)
        sep_list = ch.get_separators(cliques)
        comps = ch.get_nb_comp_list(current_model, sep_list)
        sep_to_comp = dict(zip(sep_list, comps))
        edges = list()
        for S in sep_to_comp:
            # Compute the weights and select the best edges among different
            # connected components in the mantle
            n = len(sep_to_comp[S])
            E = np.ndarray(shape=(n, n, 2), dtype=int)
            W = np.zeros(shape=(n, n), dtype=float)
            for i in range(n - 1):
                for j in range(i + 1, n):
                    wMax = -np.inf
                    uMax = -1
                    vMax = -1
                    for u in sep_to_comp[S][i]:
                        for v in sep_to_comp[S][j]:
                            # Replace this function by the function in which you compute the score
                            w = (self.get_score([u] + list(S)) + self.get_score([v] + list(S))
                                 - self.get_score([u, v] + list(S)) - self.get_score(list(S)))
                            if w > wMax:
                                wMax = w
                                uMax = u
                                vMax = v
                    E[i, j] = [uMax, vMax]
                    W[i, j] = wMax
                    E[j, i] = [uMax, vMax]
                    W[j, i] = wMax
            # Construct the maximum weighted spanning tree coarser than the given
            # forest (Prim's algorithm with adjacency lists)
            unvisited = list(range(1, n))
            maxU = np.zeros(n).astype(int)
            maxW = W[0, :]
            for l in range(n - 1):
                ind = np.argmax(maxW[unvisited])
                v = unvisited[ind]
                u = maxU[v]
                if E[u, v][0] < E[u, v][1]:
                    e = (E[u, v][0], E[u, v][1])
                else:
                    e = (E[u, v][1], E[u, v][0])
                edges.append((e, S))
                del unvisited[ind]
                for w in unvisited:
                    if maxW[w] < W[v, w]:
                        maxW[w] = W[v, w]
                        maxU[w] = v
        to_add = [item[0] for item in edges]
        return to_add

    def coarsen(self):
        """Coarsen the current model in self.undirected by using the procedure
        described in "Efficient approximation of probability distributions with
        k-order decomposable models".
        This approach is called Forced Iterative Coarsening (FIC) in
        "Learning decomposable models by coarsening".
        """
        edges_to_add = self.fic()
        self.undirected.add_edges_from(edges_to_add)

    def greedy_learn(self, k_max=np.inf, time_limit=np.inf):
        """An algorithm for learning kDGs using a greedy hill-climbing approach.
        The code is influenced by the design of the hill-climbing approach used
        in the pgmpy package
        (https://pgmpy.org/_modules/pgmpy/estimators/HillClimbSearch.html).
        However, we only consider edge additions which maintain chordality.
        The learned models are placed in the undirected and directed attributes.

        :param k_max (int): The maximal clique size of the graph to learn.
        :param time_limit: The time limit of the search.
        """
        initial_graph = CliqueTree()
        start = time.time()
        # Generate the MST if it has not been built yet
        if nx.classes.function.is_empty(self.maxtree):
            self.mst()
        for edge in list(self.get_model_mst().edges()):
            initial_graph.add_edge(edge[0], edge[1])
        # The best bn learned so far is the MST
        self.to_bn(use_mst=True)
        best_bn = self.get_model_directed()
        # Flag which controls when the algorithm finishes
        add_edge = True
        if k_max > 2:
            while add_edge:
                running_time = time.time() - start
                best_score_delta = (0, None)
                if running_time > time_limit:
                    break
                current_bn = best_bn.copy()
                add_edge = False
                # Make a list of all possible edges that could be added to the graph
                diff = initial_graph.insertable
                for edge in diff:
                    running_time = time.time() - start
                    if running_time > time_limit:
                        break
                    # Tentatively add the edge to the graph
                    initial_graph.add_edge(edge[0], edge[1])
                    cliques = initial_graph.nodes_in_clique
                    # Check if the graph is chordal and has the right clique number
                    if DecomposableModel.get_clique_num(cliques) <= k_max:
                        # Get the score of the new graph
                        greedy_bn, greedy_score_delta = DecomposableModel.add_edge_to_bn(
                            edge, current_bn.copy(), self.bdeu)
                        # Is the score better than previous graphs?
                        if greedy_score_delta > best_score_delta[0]:
                            # If we can add an edge, the algorithm should
                            # continue looking for more edges
                            add_edge = True
                            best_score_delta = (greedy_score_delta, edge)
                            best_bn = greedy_bn
                    # After checking the candidate edge, remove it and keep looking
                    initial_graph.remove_edge(edge[0], edge[1])
                # Make the new initial graph the best graph from the previous search
                if add_edge:
                    initial_graph.add_edge(best_score_delta[1][0], best_score_delta[1][1])
        self.undirected = initial_graph.G
        self.directed = best_bn

    @staticmethod
    def get_clique_num(cliques):
        """
        A static method for greedy_learn which returns the maximal clique size of a model.

        :param cliques (dict): The dictionary given by the nodes_in_clique
            attribute of CliqueTree. The cliques are the dictionary keys.
        :return clique_number (int): The number of vertices in the largest clique.
        """
        clique_number = 0
        for key in cliques.keys():
            if len(cliques[key]) > clique_number:
                clique_number = len(cliques[key])
        return clique_number

    @staticmethod
    def add_edge_to_bn(edge, bn, BDeu):
        """
        A static method for greedy_learn which computes the local score delta for
        adding an edge. The correct direction of the edge is also determined for
        the directed model.

        :param edge (tuple(int)): The (undirected) edge to consider adding.
        :param bn (pgmpy.BayesianModel): The current bn model we wish to add edge to.
        :param BDeu (pgmpy.BDeuScore): The BDeu score object which calculates the local scores.
        :return bn (pgmpy.BayesianModel): The bn provided by the user with edge
            added in the correct orientation.
        :return score_delta (float): The best local score delta for adding the
            edge to the current bn.
        """
        # Bind local_score before the try block so it is always defined in the
        # exception handler (the original bound it inside the try).
        local_score = BDeu.local_score
        try:
            bn.add_edge(edge[0], edge[1])
            new_parents = bn.get_parents(edge[1])
            old_parents = list(set(new_parents) - {edge[0]})
            score_delta1 = local_score(edge[1], new_parents) - local_score(edge[1], old_parents)
            bn.remove_edge(edge[0], edge[1])
            bn.add_edge(edge[1], edge[0])
            new_parents = bn.get_parents(edge[0])
            old_parents = list(set(new_parents) - {edge[1]})
            score_delta2 = local_score(edge[0], new_parents) - local_score(edge[0], old_parents)
            bn.remove_edge(edge[1], edge[0])
            if score_delta1 > score_delta2:
                bn.add_edge(edge[0], edge[1])
                return bn, score_delta1
            else:
                bn.add_edge(edge[1], edge[0])
                return bn, score_delta2
        except ValueError:
            # One orientation introduces a cycle; fall back to whichever is legal.
            try:
                bn.add_edge(edge[0], edge[1])
                new_parents = bn.get_parents(edge[1])
                old_parents = list(set(new_parents) - {edge[0]})
                score_delta1 = local_score(edge[1], new_parents) - local_score(edge[1], old_parents)
                return bn, score_delta1
            except ValueError:
                bn.add_edge(edge[1], edge[0])
                new_parents = bn.get_parents(edge[0])
                old_parents = list(set(new_parents) - {edge[1]})
                score_delta2 = local_score(edge[0], new_parents) - local_score(edge[0], old_parents)
                return bn, score_delta2

    def get_model_directed(self):
        return self.directed.copy()

    def get_model_undirected(self):
        return self.undirected.copy()

    def get_model_mst(self):
        return self.maxtree.copy()

    def get_score_function(self):
        return self.bdeu
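# A minimal usage sketch for DecomposableModel (hypothetical file name). It
# assumes a discrete dataset stored as a comma-separated file and that the
# Gurobi and chordal-helper modules (gp, co, ch) imported by this file are
# available:
#
#   dm = DecomposableModel("data.csv", alpha=1)
#   dm.learn(k_max=3, max_time=600.0)   # ILP-based coarsening, cliques of size <= 3
#   dm.to_bn()                          # direct the learned chordal graph (minimal I-map)
#   bn = dm.get_model_directed()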
def edge_eval(x_c: str, x_n: str, model: BayesianModel, estimator: BDeuScore) -> float:
    """Evaluate the score of the model with an edge from x_c to x_n added."""
    copy = model.copy()
    copy.add_edge(x_c, x_n)
    return estimator.score(copy)
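# A minimal, self-contained sketch of edge_eval in use. The data is random and
# the column names are invented; BDeuScore.score evaluates the whole structure,
# so the returned value can be compared across candidate edges.
def _edge_eval_demo():
    import numpy as np
    import pandas as pd
    from pgmpy.models import BayesianModel
    from pgmpy.estimators import BDeuScore

    data = pd.DataFrame(np.random.randint(0, 2, size=(500, 3)), columns=list("ABC"))
    model = BayesianModel()
    model.add_nodes_from(data.columns)
    scorer = BDeuScore(data)
    # score of the current structure with the candidate edge A -> C added
    return edge_eval("A", "C", model, scorer)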
class ExactCounterfactual(object):
    """
    A class for performing exact counterfactual inference in both the standard
    and twin-network approaches.

    N.B.: for logging time, this relies on a custom edit of
    pgmpy.inference.ExactInference.VariableElimination, where the query also
    returns (as a second return value) the time it takes to perform factor
    marginalization.
    """

    def __init__(self, verbose=False, merge=False):
        """
        Initialize the class.

        Args:
            verbose: whether or not to automatically print the twin & standard inference times.
            merge: whether or not to perform node merging.
        """
        self.verbose = verbose
        self.merge = merge

    def construct(self, causal_model=None, G=None, df=None, n_samples=20000):
        """
        Construct the underlying pgmpy model.

        Args:
            causal_model: a structural causal model; if None, G and df must be given.
            G: a networkx graph describing the dependency relationships.
            df: a dataframe of samples from that graph, used to construct the
                conditional probability tables.
            n_samples: number of samples drawn from causal_model when df is not given.
        """
        if causal_model is None:
            assert G is not None and df is not None, "Must initialize G and df if no causal model is passed."
            self.G = G
            self.df = df
        else:
            self.scm = causal_model
            self.G = causal_model.G.copy()
            samples = causal_model.sample(n_samples)
            self.df = pd.DataFrame(samples, columns=causal_model.ordering)
        self.model = None  # reset
        self.twin_model = None  # reset
        self.counterfactual_model = None  # reset
        self._compile_model()

    def _compile_model(self):
        """
        Makes a pgmpy model out of a networkx graph and parameterizes its CPDs
        with CPTs estimated from the data.
        """
        self.model = BayesianModel(list(self.G.edges))
        self._construct_CPD()

    def create_twin_network(self, node_of_interest, observed, intervention):
        """
        Generate self.twin_model based on the current model, then merge nodes
        and eliminate nodes that are conditionally independent of the
        counterfactual node of interest.

        Args:
            node_of_interest: the node of interest to perform inference on.
            observed: a dictionary of {node: observed_value} to condition on.
            intervention: a dictionary of {node: intervention_value} to intervene on.
""" self.twin_model = self.model.copy() self.twin_model.add_nodes_from([ "{}tn".format(n) for n in list(self.twin_model.nodes) if len(list(self.model.predecessors(n))) != 0 ]) # add all non-noise nodes self.twin_model.add_edges_from([ ("{}tn".format(pa), "{}tn".format(ch)) for pa, ch in list(self.model.edges) if len(list(self.model.predecessors(pa))) != 0 ]) # add all non-noise edges self.twin_model.add_edges_from([ (pa, "{}tn".format(ch)) for pa, ch in list(self.model.edges) if len(list(self.model.predecessors(pa))) == 0 ]) #add all noise edges # merge nodes if merge flag is true if self.merge: self.merge_nodes(node_of_interest, intervention) # get appropriately ordered CPTs for new merged representation duplicate_cpts = [] for node in self.twin_model.nodes: if node[-2:] == "tn": # if in the twin network model node_parents = list(self.twin_model.predecessors(node)) non_twin_parents = [ pa.replace("tn", "") for pa in node_parents ] cpt = TabularCPD( node, 2, self.model.get_cpds( node[:-2]).reorder_parents(non_twin_parents), node_parents, len(node_parents) * [2]) duplicate_cpts.append(cpt) self.twin_model.add_cpds(*duplicate_cpts) # make model efficient modified_intervention = { n + "tn": intervention[n] for n in intervention } # modify for twin network syntax self.intervene(modified_intervention, twin=True) self._eliminate_conditionally_independent(node_of_interest, observed, intervention) def _construct_CPD(self, counterfactual=False, df=None): cpt_list = [] if df is None: df = self.df for node in self.G.nodes: cpt_list.append(self._get_node_CPT(node, df)) if counterfactual: self.counterfactual_model.add_cpds(*cpt_list) else: self.model.add_cpds(*cpt_list) self.df = None # erase df to make object pickleable, otherwise the object becomes unpicklable. (Important for parallel processing) def _get_node_CPT(self, node, df=None): parents = list(self.G.predecessors(node)) if len(parents) == 0: # if root node (latent) mu = df[node].mean() return TabularCPD(node, 2, values=[[1 - mu], [mu]]) elif len(parents) > 0: mus = df.groupby(parents)[node].mean().reset_index() uniques = mus[parents].drop_duplicates() parent_combos = list(product(*[[0, 1] for _ in parents])) appends = [] for combo in parent_combos: if not (uniques == np.array(combo) ).all(1).any(): # if value not enumerated in sample appends.append(list(combo) + [0.5]) # add an uninformative prior add_df = pd.DataFrame(appends, columns=parents + [node]) mus = pd.concat((mus, add_df), axis=0) mus = mus.sort_values(by=parents) mus = mus[node].values cpt = np.vstack((1. - mus, mus)) cpt = TabularCPD(node, 2, values=cpt, evidence=parents, evidence_card=len(parents) * [2]) return cpt def query(self, var, observed, counterfactual=False, twin=False): """ Run an arbitrary query by Variable Elimination. What is the analytic cost of this? You have to do K noise queries in a graph with K endog nodes + K exog nodes in normal CFI. In twin network inference, you have to do 1 query in a graph with 2K endog nodes + K exog nodes. Args: var: variable of interest, i.e. P(Var | Observed) observed: a dictionary of {node_name: observed_value} to condition on. counterfactual: if true, uses the counterfactual model. (self.counterfactual_model) twin: if true, uses the twin network model. 
                (self.twin_model).

        Returns:
            result: the query result.
            time_elapsed: the time spent on factor marginalization.
        """
        if not isinstance(var, list):
            var = [var]
        if twin:
            infer = VariableElimination(self.efficient_twin_model)
            result, time_elapsed = infer.query(var, evidence=observed, stopwatch=True)
            self.twin_inference_time = time_elapsed
        elif counterfactual:
            infer = VariableElimination(self.counterfactual_model)
            result, time_elapsed = infer.query(var, evidence=observed, stopwatch=True)
            self.standard_inference_time = self.joint_inference_time + time_elapsed
        else:
            infer = VariableElimination(self.model)
            result, time_elapsed = infer.query(var, evidence=observed, stopwatch=True)
        return result, time_elapsed

    def intervene(self, intervention, counterfactual=False, twin=False):
        """
        Performs the intervention on the BN object by setting the CPT to be
        deterministic and removing parents.

        Args:
            intervention: a dictionary of {node_name: intervention_value} to intervene on.
        """
        cpt_list = []
        if counterfactual and not twin:
            model = self.counterfactual_model
        elif twin and not counterfactual:
            model = self.twin_model
        else:
            model = self.model
        for node in intervention:
            if node in model.nodes:
                # do-calculus graph surgery: remove edges from parents
                parent_edges = [(pa, node) for pa in model.predecessors(node)]
                model.remove_edges_from(parent_edges)
                model.remove_node("U{}".format(node))
                # set a new deterministic CPT
                value = intervention[node]
                cpt = [[], []]
                cpt[value] = [1]
                cpt[int(not bool(value))] = [0]
                new_cpt = TabularCPD(node, 2, values=cpt)
                cpt_list.append(new_cpt)
        # override the existing CPTs
        model.add_cpds(*cpt_list)

    def abduction(self, observed, n_samples=None):
        # infer the latent joint and store the time it takes
        noise_nodes = [n for n in self.G.nodes
                       if len(list(self.G.predecessors(n))) == 0]
        new_joint, time_elapsed = self.query(noise_nodes, observed)
        self.joint_inference_time = time_elapsed
        new_joint = new_joint.values.ravel()

        # sample from the network with the new latent distribution
        ## sample from the joint
        dim = 2 ** len(noise_nodes)
        val_idx = np.arange(dim)
        # define the number of samples
        if n_samples is None:  # be careful with this!
            n_samples = min([30 * 2 ** (len(list(self.G.nodes)) - len(noise_nodes)), 100000])
        noise_sample_idx = np.random.choice(val_idx, size=n_samples, p=new_joint)
        vals = np.array(list(product(*[[0, 1] for _ in range(len(noise_nodes))])))
        noise_samples = vals[noise_sample_idx]

        ## intervene in the DAG
        self.scm.do({n: noise_samples[:, i] for i, n in enumerate(noise_nodes)})
        ## sample with these interventions
        counterfactual_samples = pd.DataFrame(self.scm.sample(n_samples),
                                              columns=self.scm.ordering)

        # construct CPTs with the new distribution
        self.counterfactual_model = self.model.copy()
        self._construct_CPD(counterfactual=True, df=counterfactual_samples)

    def exact_abduction_prediction(self, noi, ev, intn, n_joint_samples=30000):
        # sample from the exact joint distribution
        start = time.time()
        joint = self.query(self.scm._get_exog_nodes(), ev)[0]
        values = np.array(list(product(*[range(card) for card in joint.cardinality])))
        n_joint_samples = max([n_joint_samples, 30 * values.shape[0]])
        probabilities = joint.values.ravel()
        idx = np.random.choice(np.arange(values.shape[0]), size=n_joint_samples, p=probabilities)
        samples = values[idx]
        samples = {joint.variables[i]: samples[:, i] for i in range(len(joint.variables))}
        print(time.time() - start)
        # pass the joint samples
        self.scm.do(samples)
        # format the intervention
        if isinstance(intn[list(intn.keys())[0]], int):
            intn = {k: intn[k] * np.ones(n_joint_samples) for k in intn}
        self.scm.do(intn)
        # sample from the new model
        prediction = self.scm.sample(return_pandas=True)[noi]
        return prediction.mean()

    def enumerate_inference(self, noi, ev, intn, n_samples=30000):
        """
        Performs exact counterfactual inference by enumeration.
        """
        intn = {k: intn[k] * np.ones(n_samples) for k in intn}
        joint_sample, joint_prob = self.posterior_enumerate(ev)
        joint_samples = joint_sample[np.random.choice(np.arange(joint_sample.shape[0]),
                                                      p=joint_prob, size=n_samples)]
        joint_samples = {node: joint_samples[:, i]
                         for i, node in enumerate(self.scm._get_exog_nodes())}
        self.scm.do(joint_samples)
        self.scm.do(intn)
        prediction = self.scm.sample(return_pandas=True)[noi]
        return prediction.mean()

    def posterior_enumerate(self, evidence):
        """
        Inference via enumeration.
        """
        # set up the enumeration
        exog_nodes = self.scm._get_exog_nodes()
        endog_nodes = self.scm._get_endog_nodes()
        evidence_array = np.array([evidence[k] for k in endog_nodes if k in evidence])
        evidence_index = [i for i, v in enumerate(endog_nodes) if v in evidence]
        combinations = np.array(list(product(*[range(2) for _ in range(len(exog_nodes))])))
        probabilities = np.array([self.scm.G.nodes[node]['p'] for node in exog_nodes])
        prior = combinations * probabilities + (1 - combinations) * (1 - probabilities)

        def vector_compare(val_prob):
            joint_sample, prior = val_prob
            self.scm.do({exog_nodes[i]: joint_sample[i] for i in range(len(exog_nodes))})
            samp = self.scm.sample().flatten()
            if np.all(evidence_array == samp[evidence_index]):
                return np.prod(prior)
            return 0

        posterior = np.array([vector_compare(vp) for vp in zip(combinations, prior)])
        posterior = posterior / np.sum(posterior)
        return combinations, posterior

    def _generate_counterfactual_model(self, observed, intervention, n_samples=None):
        """
        Runs the standard counterfactual inference procedure and returns an
        intervened model with the posterior.

        Args:
            observed: a dictionary of {node: observed_value} to condition on.
            intervention: a dictionary of {node: intervention_value} to intervene on.
""" self.abduction(observed, n_samples) self.intervene(intervention, counterfactual=True) def standard_counterfactual_query(self, node_of_interest, observed, intervention, n_samples_for_approx=None): """ Query and sample from the counterfactual model. Args: observed: a dictionary of {node: observed_value} to condition on. intervention: a dictionary of {node: intervention_value} to intervene on. n_samples: number of samples to draw from the counterfactual world model. """ # infer latents and generate model, also initializes self.standard_inference_time self._generate_counterfactual_model(observed, intervention, n_samples=n_samples_for_approx) # then run the query ## for stability, pass in as evidence a deterministic value for the intervention node int_noise_node_values = { "U{}".format(k): intervention[k] for k in intervention } q, time_elapsed = self.query(node_of_interest, observed=int_noise_node_values, counterfactual=True) self.standard_inference_time = self.joint_inference_time + time_elapsed return q def merge_nodes(self, node_of_interest, intervention): """ Merge nodes in the Twin Counterfactual network. In place modifies `self.twin_model`. Works by giving children of the node to be eliminated to its factual counterpart. Operates topologically. """ # find every non-descendant of the intervention nodes nondescendant_sets = [] all_nodes = set([i for i in list(self.model.nodes) if i[0] != 'U']) for node in intervention: nondescendant_sets.append( all_nodes.difference(set(nx.descendants(self.model, node)))) dont_merge = [node_of_interest] + list(intervention.keys()) shared_nondescendants = set.intersection( *nondescendant_sets) - set(dont_merge) # now modify twin network to replace all _tn variables with their regular counterpart ordered_nondescendants = [ n for n in nx.topological_sort(self.model) if n in list(shared_nondescendants) ] for node in ordered_nondescendants: # start with the oldest nodes twin_node = node + "tn" tn_children = self.twin_model.successors(twin_node) self.twin_model.add_edges_from([(node, c) for c in tn_children]) self.twin_model.remove_node(twin_node) def _eliminate_conditionally_independent(self, node_of_interest, observed, intervention): """ Generate an "efficient" twin network model by removing nodes that are d-separated from the node of interest given observed and intervened variables. Args: node_of_interest: the node of interest in the query. observed: a dictionary of {node: observed_value} to condition on. intervention: a dictionary of {node: intervention_value} to intervene on. """ conditioned_on = list(observed) + list(intervention) self.efficient_twin_model = self.twin_model.copy() for node in [n for n in self.twin_model.nodes if n[-2:] == "tn"]: try: if not self.efficient_twin_model.is_active_trail( node, node_of_interest + "tn", observed=conditioned_on): self.efficient_twin_model.remove_node(node) except: pass def twin_counterfactual_query(self, node_of_interest, observed, intervention): """ Query and sample from the counterfactual model. Args: observed: a dictionary of {node: observed_value} to condition on. intervention: a dictionary of {node: intervention_value} to intervene on. n_samples: number of samples to draw from the counterfactual world model. 
""" self.create_twin_network(node_of_interest, observed, intervention) # then, create the twin network result, time_elapsed = self.query( node_of_interest + "tn", observed, twin=True) # log time it takes to do p(Vtn | E) return result def sample(self, n_samples=1, counterfactual=False, twin=False): """ Perform forward sampling from the model. Args: n_samples: the number of samples you'd like to return """ if counterfactual: model = self.counterfactual_model elif twin: model = self.twin_model else: model = self.model inference = BayesianModelSampling(model) return inference.forward_sample(size=n_samples, return_type='dataframe') def compare_times(self, node_of_interest, observed, intervention, n_samples_for_approx=None): """ Compare the times it takes to do inference in the standard and twin network counterfactual inference approaches. Args: node_of_interest: the node of interest to perform inference on. observed: a dictionary of {node: observed_value} to condition on. intervention: a dictionary of {node: intervention_value} to intervene on. """ try: with warnings.catch_warnings(): warnings.simplefilter("ignore") print("A. Performing Standard Counterfactual Inference.") self.standard_counterfactual_query(node_of_interest, observed, intervention, n_samples_for_approx) print("B. Performing Twin Network Counterfactual Inference.") # first, reset the graph network self.scm.G = self.scm.G_original.copy() self.twin_counterfactual_query(node_of_interest, observed, intervention) if self.verbose: print(self.standard_inference_time, self.twin_inference_time) return self except Exception as e: print(e) print((node_of_interest, observed, intervention)) return False # return False bool to indicate failed experiment.
class MyClass(object):
    def __init__(self, case):
        self.case = case
        self.results = []
        self.networx_test = nx.DiGraph()
        self.pgmpy_test = BayesianModel()
        self.networx = nx.DiGraph()
        self.pgmpy = BayesianModel()
        self.best_error = math.inf
        self.best_topology = [0, 0, nx.DiGraph]
        self.dictionary = []
        self.header = {}
        self.nodes_0 = []
        self.edges_0 = {}
        self.nodes = []
        self.edges = {}
        self.cpds = {}
        self.colors_dictionary = {}
        self.colors_table = []
        self.colors_cpd = []
        self.learning_data = {}
        self.number_of_colors = 0
        self._util = Utilities(case)
        self._lat = Lattices(self._util)

    def get_my_colors(self):
        evidence = []
        cardinality = []
        for i, node in enumerate(self.nodes):
            if 'BEN' in node[0] or 'MEM' in node[0]:
                evidence.append(node[0])
                cardinality.append(node[1]['cardinality'])
        self.colors_dictionary, self.colors_table, self.colors_cpd = \
            self.color_cpd('WORLD', 3, evidence, cardinality)
        self.number_of_colors = self.colors_table.shape[1]
        print('Number of colors : ', self.number_of_colors)
        print(self.colors_cpd)
        # print(self.colors_cpd.values)

    def color_cpd(self, var, card_var, evidence, cardinality):
        table = CPD.get_index_matrix(cardinality)
        colors = {}
        hi = 1
        lo = 0
        C = np.prod(cardinality)
        matrix = np.full((3, C), 1. / 3.)
        matrix[0] = [hi, lo, lo, hi, lo, lo, hi, lo, hi, lo, lo, hi, lo, lo, hi, lo]
        matrix[1] = [lo, hi, lo, lo, hi, lo, lo, hi, lo, hi, lo, lo, hi, lo, lo, hi]
        matrix[2] = [lo, lo, hi, lo, lo, hi, lo, lo, lo, lo, hi, lo, lo, hi, lo, lo]
        cpd = TabularCPD(variable=var, variable_card=card_var, values=matrix,
                         evidence=evidence, evidence_card=cardinality)
        for i, node in enumerate(evidence):
            colors.update({node: table[i]})
        return colors, table, cpd

    # def set_color(self, color):
    #     col = self.colors_table[:, color]
    #     for i in range(0, len(col)):
    #         node = 'BENS_' + str(i)
    #         self.pgmpy.get_cpds(node).values = CPD.RON_cpd(
    #             node, self.pgmpy.get_cardinality(node), mu=int(col[i])).values

    def add_edges(self, topology):
        self.networx.remove_edges_from(self.edges)
        self.edges = []
        shape = np.asarray(topology).shape
        # ''' let's first remove all void nodes ----> not necessary -----> delete this code? '''
        # nodes_to_remove = []
        # rows = np.sum(topology, axis=1)
        # columns = np.sum(topology, axis=0)
        # for row in range(0, len(rows)):
        #     if rows[row] == 0:
        #         nodes_to_remove.append('WORLD_' + str(row))
        # for column in range(0, len(columns)):
        #     if columns[column] == 0:
        #         nodes_to_remove.append('BENS_' + str(column))
        # self.networx.remove_nodes_from(nodes_to_remove)
        self.nodes = self.networx.nodes(data=True)
        for column in range(0, shape[1]):
            for row in range(0, shape[0]):
                if topology[row][column] == 1:
                    parent = 'BENS_' + str(column)
                    child = 'WORLD_' + str(row)
                    self.networx.add_edge(parent, child)
        self.edges = self.networx.edges()

    def add_dummy_cpds(self):
        for i, node in enumerate(self.nodes):
            cardinality = node[1]['cardinality']
            if ('BEN' in node[0]) or ('MEM' in node[0]):
                self.nodes[i][1]['cpd'] = CPD.create_fixed_parent(cardinality, modus='uniform')
            else:
                incoming_nodes = self.networx.in_edges(node[0])
                if len(incoming_nodes) == 0:
                    self.nodes[i][1]['cpd'] = CPD.create_random_child(cardinality, modus='orphan')
                    continue
                card_parent = []
                for m, n in enumerate(incoming_nodes):
                    par = self.networx.node[n[0]]['cardinality']
                    card_parent.append(par)
                self.nodes[i][1]['cpd'] = CPD.create_random_child(cardinality, card_parent)

    def create_learning_data(self):
        self.get_my_colors()
        self.learning_data = {}
        for i, node in enumerate(self.nodes):
            print('node in create learning data : ', node[0])
            if "BEN" in node[0]:
                self.learning_data.update({node[0]: self.colors_table[i].tolist()})
            if "WORLD" in node[0]:
                shape = self.colors_cpd.values.shape
                reshaped_cpd = self.colors_cpd.values.reshape(
                    shape[0], int(np.prod(shape) / shape[0]))
                for hue in range(0, 3):
                    if str(hue) in node[0]:
                        self.learning_data.update({node[0]: reshaped_cpd[hue, :].tolist()})
        print('Learning data')
        print(self.learning_data)

    def do_inference(self, models, expected_result):
        for key in models:
            err = models[key].process()

    def test_topology(self):
        self.networx_test = self.networx.copy()
        self.pgmpy_test = self.pgmpy.copy()
        model = {'main': GenerativeModel(SensoryInputVirtualPeepo(self), self.pgmpy_test)}
        expected_result = [0, 0, 0]
        ''' ------ going through all possible colors '''
        for color in range(0, self.number_of_colors):
            states = self.colors_table[:, color]
            shape = self.colors_cpd.values.shape
            reshaped_cpd = self.colors_cpd.values.reshape(
                shape[0], int(np.prod(shape) / shape[0]))
            expected_result = reshaped_cpd[:, int(color)]
            for i, pixel in enumerate(states):
                cardinality = self.pgmpy_test.get_cardinality('BENS_' + str(i))
                self.pgmpy_test.get_cpds('BENS_' + str(i)).values = \
                    CPD.create_fixed_parent(cardinality, state=int(pixel))
            self.do_inference(model, expected_result)

    def estimate_parameters(self):
        data = pd.DataFrame(data=self.learning_data)
        estimator = BayesianEstimator(self.pgmpy, data)
        for i, node in enumerate(self.nodes):
            if 'LAN' in node[0] or 'MOTOR' in node[0] or 'WORLD' in node[0]:
                self.pgmpy.get_cpds(node[0]).values = estimator.estimate_cpd(
                    'WORLD_0', prior_type='dirichlet', pseudo_counts=[2, 3]).values
                # print('cpd for ', node[0])
                # print(self.pgmpy.get_cpds(node[0]))

    def do_it(self):
        '''EXPLANATIONS'''
        self.networx_test, self.dictionary, self.header = self._util.get_network()
        self.networx = self.networx_test.copy()
        self.nodes = self.networx.nodes(data=True)
        self.create_learning_data()
        print('incoming panda data')
        print(self.learning_data)
        print('Dictionary : ', self.dictionary)

        ''' -------------- Construct all possible topologies. Option: restrain the
            number with the threshold: 0 -> all possible topologies, 100 -> only
            the fully connected topology. '''
        # setting the entropy at 50% -> only topologies with an entropy >= 0.5 will be considered
        possible_topologies = self._lat.get_possible_topologies(treshold=50)
        print("Possible topologies : ", len(possible_topologies))
        entropy = 0
        count = 0  # TEMPORARY
        ''' -------------- walk through all topologies '''
        for topology in possible_topologies:
            entropy = topology[1]
            if entropy == 0:
                continue  # safeguard
            topo = topology[0]
            # self.networx = self.networx_0.copy()
            edges = []
            parent = ''
            child = ''
            ''' ----------- For each topology, construct the edges and update the
                dummy CPDs (necessary, as the shape of the LENs' CPDs can change
                depending on the number of incoming nodes). '''
            self.add_edges(topo)
            self.add_dummy_cpds()
            ''' ----------- convert the DiGraph to pgmpy and check '''
            self.pgmpy = self._util.translate_digraph_to_pgmpy(self.networx)
            self.pgmpy.check_model()
            ''' ------------ ask pgmpy to guess the best CPDs of the LANs and LENs
                -> provide pgmpy with the learning data '''
            self.estimate_parameters()
            ''' -------------- test the constructed topology '''
            self.test_topology()
            ''' The following 4 lines are to be removed: they are only used to check
                whether the algorithms are correct regarding the edge building. '''
            count += 1
            # print('edges : ', self.edges)
            if count > 10:
                break
        print('Check -> number of processed topologies in loop : ', count)
        # print('My colors : ')
        # print(self.colors_table)
        # print(self.colors_cpd)
        '''TO DO
        ----------------------------------------------------
        a) add random cpds, convert to a pgmpy BN,
        b) embed the skeleton loop within the learning loop -> loop through all
           possible colors and the expected classification. For each skeleton with
           the possible color as BEN, make pgmpy guess the best cpds with the method
               class pgmpy.estimators.BayesianEstimator.BayesianEstimator(model, data, **kwargs)
               estimate_cpd(node, prior_type='BDeu', pseudo_counts=[], equivalent_sample_size=5)
           -> make inference and calculate the error (to be determined)
           -> log the error as a tuple (error, entropy of the skeleton)
        c) create output (graph?)
        '''
        ''' The methods have to be completed to cope with the general case, i.e.
            BENS, MEMS, LANS, MOTORs, WORLDs, but for the moment we just assume
            there are only BENs and WORLDs. '''
        # self.networx.add_edge('BENS_1', 'WORLD_1')
        # self.networx.node['BENS_1']['cpd'] = [0.8, 0.2]
        # self.networx.node['WORLD_2']['cpd'] = [[0.8, 0.2, 0.5, 0.3], [0.2, 0.8, 0.5, 0.7]]
        ''' If a best model has been found, save it -> first update the Utilities
            class object, then save it. '''
        # self._util.update_networkx(self.networx, self.dictionary, self.header)
        # self._util.save_network()
        # self._util.update_pgmpy(self.pgmpy, self.dictionary, self.header)
        # self._util.save_pgmpy_network()
        self.draw()
        return self.results

    def draw(self):
        '''TO REMOVE LATER'''
        plt.figure(figsize=(10, 5))
        pos = nx.circular_layout(self.networx, scale=2)
        # node_labels = nx.get_node_attributes(self.networx, 'cpd')
        nx.draw(self.networx, pos, node_size=1200, node_color='lightblue',
                linewidths=0.25, font_size=10, font_weight='bold', with_labels=True)
        plt.show()
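# Illustrative usage sketch for MyClass. Whatever identifier Utilities(case)
# expects must be passed as `case`; that interface is not shown in this file,
# so the value below is made up.
#
#   my_class = MyClass(case='color_recognition')
#   results = my_class.do_it()   # enumerate topologies, learn CPDs, test each one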
class TestBaseEstimator(unittest.TestCase):
    def setUp(self):
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = HillClimbSearch(self.rand_data, scoring_method=K2Score(self.rand_data))
        self.model1 = BayesianModel()
        self.model1.add_nodes_from(['A', 'B', 'C'])
        self.model2 = self.model1.copy()
        self.model2.add_edge('A', 'B')

        # link to dataset: https://www.kaggle.com/c/titanic/download/train.csv
        self.titanic_data = pd.read_csv('pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data1 = self.titanic_data[["Survived", "Sex", "Pclass", "Age", "Embarked"]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)

    def test_legal_operations(self):
        model2_legal_ops = list(self.est_rand._legal_operations(self.model2))
        model2_legal_ops_ref = [(('+', ('C', 'A')), -28.15602208305154),
                                (('+', ('A', 'C')), -28.155467430966382),
                                (('+', ('C', 'B')), 7636.947544933631),
                                (('+', ('B', 'C')), 7937.805375579936),
                                (('-', ('A', 'B')), 28.155467430966382),
                                (('flip', ('A', 'B')), -0.0005546520851567038)]
        self.assertSetEqual(set([op for op, score in model2_legal_ops]),
                            set([op for op, score in model2_legal_ops_ref]))

    def test_legal_operations_titanic(self):
        est = self.est_titanic1
        start_model = BayesianModel([("Survived", "Sex"),
                                     ("Pclass", "Age"),
                                     ("Pclass", "Embarked")])
        legal_ops = est._legal_operations(start_model)
        self.assertEqual(len(list(legal_ops)), 20)

        tabu_list = [('-', ("Survived", "Sex")),
                     ('-', ("Survived", "Pclass")),
                     ('flip', ("Age", "Pclass"))]
        legal_ops_tabu = est._legal_operations(start_model, tabu_list=tabu_list)
        self.assertEqual(len(list(legal_ops_tabu)), 18)

        legal_ops_indegree = est._legal_operations(start_model, max_indegree=1)
        self.assertEqual(len(list(legal_ops_indegree)), 11)

        legal_ops_both = est._legal_operations(start_model, tabu_list=tabu_list, max_indegree=1)
        legal_ops_both_ref = [(('+', ('Embarked', 'Survived')), 10.050632580087608),
                              (('+', ('Survived', 'Pclass')), 41.88868046549101),
                              (('+', ('Age', 'Survived')), -23.635716036430836),
                              (('+', ('Pclass', 'Survived')), 41.81314459373226),
                              (('+', ('Sex', 'Pclass')), 4.772261678792802),
                              (('-', ('Pclass', 'Age')), 11.546515590731815),
                              (('-', ('Pclass', 'Embarked')), -32.171482832532774),
                              (('flip', ('Pclass', 'Embarked')), 3.3563814191281836),
                              (('flip', ('Survived', 'Sex')), 0.039737027979640516)]
        self.assertSetEqual(set(legal_ops_both), set(legal_ops_both_ref))

    def test_estimate_rand(self):
        est1 = self.est_rand.estimate()
        self.assertSetEqual(set(est1.nodes()), set(['A', 'B', 'C']))
        self.assertTrue(est1.edges() == [('B', 'C')] or est1.edges() == [('C', 'B')])

        est2 = self.est_rand.estimate(start=BayesianModel([('A', 'B'), ('A', 'C')]))
        self.assertTrue(est2.edges() == [('B', 'C')] or est2.edges() == [('C', 'B')])

    def test_estimate_titanic(self):
        self.assertSetEqual(set(self.est_titanic2.estimate().edges()),
                            set([('Survived', 'Pclass'), ('Sex', 'Pclass'), ('Sex', 'Survived')]))

    def tearDown(self):
        del self.rand_data
        del self.est_rand
        del self.model1
        del self.titanic_data
        del self.titanic_data1
        del self.titanic_data2
        del self.est_titanic1
        del self.est_titanic2
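# Standard unittest entry point so the suite can also be run directly as a
# script (in addition to `python -m unittest`).
if __name__ == '__main__':
    unittest.main()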