def test_score_titanic(self):
    scorer = K2Score(self.titanic_data2)
    titanic = BayesianModel([("Sex", "Survived"), ("Pclass", "Survived")])
    self.assertAlmostEqual(scorer.score(titanic), -1891.0630673606006)
    titanic2 = BayesianModel([("Pclass", "Sex")])
    titanic2.add_nodes_from(["Sex", "Survived", "Pclass"])
    self.assertLess(scorer.score(titanic2), scorer.score(titanic))
def setup(self):
    model = get_example_model('alarm')
    samples = model.simulate(n_samples=int(1e4), seed=42, show_progress=False)
    self.scoring_method = K2Score(samples)
    self.est = HillClimbSearch(data=samples)
def __init__(self, data, scoring_method=None, **kwargs):
    """
    Class for heuristic hill climb searches for DAGs, to learn network
    structure from data. `estimate` attempts to find a model with optimal score.

    Parameters
    ----------
    data: pandas DataFrame object
        dataframe object where each column represents one variable.
        (If some values in the data are missing the data cells should be set
        to `numpy.NaN`. Note that pandas converts each column containing
        `numpy.NaN`s to dtype `float`.)

    scoring_method: Instance of a `StructureScore`-subclass (`K2Score` is used as default)
        An instance of `K2Score`, `BdeuScore`, or `BicScore`.
        This score is optimized during structure estimation by the `estimate`-method.

    state_names: dict (optional)
        A dict indicating, for each variable, the discrete set of states (or
        values) that the variable can take. If unspecified, the observed values
        in the data set are taken to be the only possible states.

    complete_samples_only: bool (optional, default `True`)
        Specifies how to deal with missing data, if present. If set to `True`
        all rows that contain `np.NaN` somewhere are ignored. If `False` then,
        for each variable, every row where neither the variable nor its parents
        are `np.NaN` is used. This sets the behavior of the `state_count`-method.
    """
    if scoring_method is not None:
        self.scoring_method = scoring_method
    else:
        self.scoring_method = K2Score(data, **kwargs)

    super(HillClimbSearch, self).__init__(data, **kwargs)
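# A minimal usage sketch of the constructor above (data and column names are
# illustrative, not taken from the original source):
import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch, K2Score

toy = pd.DataFrame(np.random.randint(0, 3, size=(1000, 3)), columns=list("XYZ"))
est = HillClimbSearch(toy, scoring_method=K2Score(toy))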
def estimate(self, tabu_length=100, max_indegree=2, black_list=None,
             epsilon=1e-4, max_iter=1e6, show_progress=True):
    # We will be using K2Score for this model
    score = K2Score(data=self.data)
    # local_score returns the score for a node given its parents.
    # It is evaluated on every iteration for all possible changes;
    # the search is greedy and picks the best available option.
    score_fn = score.local_score

    # Initialize a starting DAG. pgmpy provides a DAG class that adds
    # some functionality on top of nx.DiGraph.
    start_dag = DAG()
    start_dag.add_nodes_from(self.variables)

    # Set the edges we do not want to have in the graph
    if black_list is None:
        black_list = set()
    else:
        black_list = set(black_list)

    # max_indegree caps the number of parents any node may have.
    # The tabu list keeps track of recent changes so the search does not
    # immediately undo them.
    tabu_list = deque(maxlen=tabu_length)

    # Initialize the current model
    current_model = start_dag

    if show_progress:
        iteration = trange(int(max_iter))
    else:
        iteration = range(int(max_iter))

    for _ in iteration:
        # Get the best operation based on the K2 score via self._legal_operations
        best_operation, best_score_change = max(
            self._legal_operations(
                model=current_model,
                score=score_fn,
                tabu_list=tabu_list,
                max_indegree=max_indegree,
                black_list=black_list,
            ),
            key=lambda t: t[1],
        )

        if best_score_change < epsilon:
            break
        elif best_operation[0] == '+':
            current_model.add_edge(*best_operation[1])
            tabu_list.append(("-", best_operation[1]))
        elif best_operation[0] == '-':
            current_model.remove_edge(*best_operation[1])
            tabu_list.append(("+", best_operation[1]))
        elif best_operation[0] == 'flip':
            X, Y = best_operation[1]
            current_model.remove_edge(X, Y)
            current_model.add_edge(Y, X)
            tabu_list.append(best_operation)

    return current_model
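# Hypothetical invocation of the estimate() defined above, assuming `est` is
# the HillClimbSearch instance from the earlier sketch (the black_list edge
# is illustrative):
learned_dag = est.estimate(tabu_length=100, max_indegree=2,
                           black_list=[("X", "Y")], epsilon=1e-4,
                           show_progress=False)
print(learned_dag.edges())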
def _SetScoringType(df, scoretype, verbose=3):
    if verbose >= 3:
        print('[bnlearn] >Set scoring type at [%s]' % (scoretype))

    if scoretype == 'bic':
        scoring_method = BicScore(df)
    elif scoretype == 'k2':
        scoring_method = K2Score(df)
    elif scoretype == 'bdeu':
        scoring_method = BDeuScore(df, equivalent_sample_size=5)

    return (scoring_method)
def SetScoringType(df, scoretype, verbose=3):
    if verbose >= 3:
        print('[BNLEARN][STRUCTURE LEARNING] Set scoring type at [%s]' % (scoretype))

    if scoretype == 'bic':
        scoring_method = BicScore(df)
    elif scoretype == 'k2':
        scoring_method = K2Score(df)
    elif scoretype == 'bdeu':
        scoring_method = BdeuScore(df, equivalent_sample_size=5)

    return (scoring_method)
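# A short usage sketch for the helper above (`df` is any discrete pandas
# DataFrame). Note that an unrecognized scoretype would leave scoring_method
# unassigned and raise UnboundLocalError at the return:
scoring_method = SetScoringType(df, scoretype='k2')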
def scoreStructureLearn(data, search='HillClimbSearch', scoring_method='BicScore'):
    # Score-and-search based structure learning
    # search: HillClimbSearch, ExhaustiveSearch
    # scoring_method: 'BicScore', K2Score, BdeuScore
    if scoring_method == 'BicScore':
        scoring_method_tmp = BicScore(data)
    elif scoring_method == 'K2Score':
        scoring_method_tmp = K2Score(data)
    elif scoring_method == 'BdeuScore':
        scoring_method_tmp = BdeuScore(data, equivalent_sample_size=5)

    if search == 'HillClimbSearch':
        es = HillClimbSearch(data, scoring_method=scoring_method_tmp)
    else:
        es = ExhaustiveSearch(data, scoring_method=scoring_method_tmp)

    best_model = es.estimate()
    return best_model
def setUp(self):
    self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                  columns=list("AB"))
    self.rand_data["C"] = self.rand_data["B"]
    self.est_rand = HillClimbSearch(self.rand_data,
                                    scoring_method=K2Score(self.rand_data))
    self.model1 = BayesianModel()
    self.model1.add_nodes_from(["A", "B", "C"])
    self.model2 = self.model1.copy()
    self.model2.add_edge("A", "B")

    # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
    self.titanic_data = pd.read_csv(
        "pgmpy/tests/test_estimators/testdata/titanic_train.csv")
    self.titanic_data1 = self.titanic_data[[
        "Survived", "Sex", "Pclass", "Age", "Embarked"
    ]]
    self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
    self.est_titanic1 = HillClimbSearch(self.titanic_data1)
    self.est_titanic2 = HillClimbSearch(self.titanic_data2)
def learn_structure(self, method, scoring_method, log=True):
    '''
    (4) Method that builds the structure of the data
    -----------------
    Parameters:
    method : The technique used to search for the structure
        -> scoring_approx     - approximate (hill climb) search with a scoring method
        -> scoring_exhaustive - exhaustive search with a scoring method
        -> constraint         - constraint-based technique
    scoring_method : K2, bic, bdeu
    log : "True" if you want to print debug information in the console
    '''
    # Select the scoring method for the local search of the structure
    if scoring_method == "K2":
        scores = K2Score(self.data)
    elif scoring_method == "bic":
        scores = BicScore(self.data)
    elif scoring_method == "bdeu":
        scores = BdeuScore(self.data)

    # Select the actual search method
    if method == "scoring_approx":
        est = HillClimbSearch(self.data, scores)
    elif method == "scoring_exhaustive":
        est = ExhaustiveSearch(self.data, scores)
    elif method == "constraint":
        est = ConstraintBasedEstimator(self.data)

    self.best_model = est.estimate()
    self.eliminate_isolated_nodes()  # remove all nodes not connected to anything else

    for edge in self.best_model.edges_iter():
        self.file_writer.write_txt(str(edge))

    self.log("Method used for structural learning: " + method, log)
    # self.log("Training instances skipped: " + str(self.extractor.get_skipped_lines()), log)
    self.log("Search terminated", log)
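# Hypothetical call of the method above, assuming `net` is an instance of the
# surrounding class with `self.data` already loaded:
net.learn_structure(method="scoring_approx", scoring_method="K2", log=True)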
def scoreModels(h0Diff, h0Rarity):
    diffModel0 = [('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'), ('d3', 'd8'),
                  ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'), ('d9', 'd8')]
    diffModel1 = [('d2', 'd5'), ('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'),
                  ('d3', 'd8'), ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'),
                  ('d9', 'd8')]
    diffModel2 = [('d1', 'd2'), ('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'),
                  ('d3', 'd8'), ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'),
                  ('d9', 'd8')]

    print(' \nestimating K2/BIC score of difference structures\n')
    print('k2score model0: {0} BicScore model0: {1}'.format(
        K2Score(h0Diff).score(BayesianModel(diffModel0)),
        BicScore(h0Diff).score(BayesianModel(diffModel0))))
    print('k2score model1: {0} BicScore model1: {1}'.format(
        K2Score(h0Diff).score(BayesianModel(diffModel1)),
        BicScore(h0Diff).score(BayesianModel(diffModel1))))
    print('k2score model2: {0} BicScore model2: {1}'.format(
        K2Score(h0Diff).score(BayesianModel(diffModel2)),
        BicScore(h0Diff).score(BayesianModel(diffModel2))))

    rarityModel0 = [('r5', 'r9'), ('r5', 'r3'), ('r9', 'r1'), ('r8', 'r3'),
                    ('r6', 'r9'), ('r6', 'r3')]
    rarityModel1 = [('r6', 'r9'), ('r7', 'r9'), ('r3', 'r4'), ('r3', 'r5'),
                    ('r3', 'r9'), ('r2', 'r9'), ('r5', 'r9'), ('r9', 'r8'),
                    ('r9', 'r1')]
    rarityModel2 = [('r7', 'r9'), ('r4', 'r3'), ('r4', 'r9'), ('r1', 'r2'),
                    ('r1', 'r9'), ('r2', 'r9'), ('r5', 'r9'), ('r9', 'r8'),
                    ('r9', 'r6')]

    print(' \nestimating K2/BIC score of rarity structures\n')
    print('k2score model0: {0} BicScore model0: {1}'.format(
        K2Score(h0Rarity).score(BayesianModel(rarityModel0)),
        BicScore(h0Rarity).score(BayesianModel(rarityModel0))))
    print('k2score model1: {0} BicScore model1: {1}'.format(
        K2Score(h0Rarity).score(BayesianModel(rarityModel1)),
        BicScore(h0Rarity).score(BayesianModel(rarityModel1))))
    print('k2score model2: {0} BicScore model2: {1}'.format(
        K2Score(h0Rarity).score(BayesianModel(rarityModel2)),
        BicScore(h0Rarity).score(BayesianModel(rarityModel2))))
cpd1.append(p_64)
cpd1.append(p_36)
cpd1.append(p4)
model1.add_cpds(*cpd1)
print("------------------------------------------")
print("Edges of model1:", model1.edges())
print("Checking Model1:", model1.check_model())
print("------------------------------------------")

'''generate data for model1'''
inference = BayesianModelSampling(model1)
data = inference.forward_sample(size=3000, return_type='dataframe')
print("Data for model1:")
print(data)

k2 = K2Score(data)
print('Model1 K2 Score: ' + str(k2.score(model1)))

'''Inference'''
from pgmpy.inference import VariableElimination

infer = VariableElimination(model1)
print("Inference of x3:")
print(infer.query(['x3'])['x3'])
print("Inference of x5|x2:")
print(infer.query(['x5'], evidence={'x2': 1})['x5'])

'''Model2'''
model2 = BayesianModel([('x1', 'x2'), ('x1', 'x6'), ('x2', 'x5'),
                        ('x2', 'x3'), ('x6', 'x4')])
model2.add_cpds(p1, p_21, p_52, p_32, p_46, p_61)
def create_BN_model_using_BayesianEstimator(data):
    # data = pd.DataFrame(sensor_data, columns=feature_names)
    data = pd.DataFrame(data)
    # Alternative input (kept for reference):
    # data = read_data_from_file_remove_date_and_time(
    #     r"E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\sensor+PCA_n=5.csv",
    #     data_type='float')

    # Earlier timed runs of the same hill climb with other scores (kept for
    # reference; the BicScore run took ~2 hours without output):
    # hc = HillClimbSearch(data, scoring_method=BicScore(data))
    # hc = HillClimbSearch(data, scoring_method=BdeuScore(data))

    # structure learning
    print("structure learning")
    start_time = time.time()
    hc = HillClimbSearch(data, scoring_method=K2Score(data))  # alternatives: BicScore(data), BdeuScore(data)
    best_model = hc.estimate()
    print(hc.scoring_method)
    print(best_model.edges())
    end_time = time.time()
    print("execution time in seconds:{}".format(end_time - start_time))

    # parameter learning
    casas7_model = BayesianModel(best_model.edges())
    print("*******************")
    estimator = BayesianEstimator(casas7_model, data)
    # alternative: casas7_model.fit(data, estimator=BayesianEstimator, prior_type="K2")
    return estimator
import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch
from pgmpy.models import BayesianModel
from pgmpy.estimators import K2Score
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from pgmpy.inference import VariableElimination

feature_val1 = pd.read_csv('15features_f.csv')
'''1 pen_pressure    2 letter_spacing  3 size            4 dimension
   5 is_lowercase    6 is_continuous   7 slantness       8 tilt
   9 entry_stroke_a  10 staff_of_a     11 formation_n    12 staff_of_d
   13 exit_stroke_d  14 word_formation 15 constancy'''
hill = HillClimbSearch(feature_val1, scoring_method=K2Score(feature_val1))
f_model = hill.estimate()
print(f_model.edges())

feature_val2 = pd.read_csv('15features_g.csv')
hill1 = HillClimbSearch(feature_val2, scoring_method=K2Score(feature_val2))
g_model = hill1.estimate()
print(g_model.edges())

corr_mat = feature_val1.corr()
print(corr_mat)
corr_feature = set()
for i in range(len(corr_mat.columns)):
    for j in range(i):
        if abs(corr_mat.iloc[i, j]) > 0.2:
model5.add_edges_from([('x1', 'x2'), ('x1', 'x6'), ('x6', 'x4'),
                       ('x2', 'x3'), ('x3', 'x5')])
model5.add_cpds(cpd_x1, cpd_x1x2, cpd_x1x6, cpd_x6x4, cpd_x2x3, cpd_x3x5)
inference = BayesianModelSampling(model5)
# print(inference.forward_sample(size=1000, return_type='dataframe'))
data5 = inference.forward_sample(size=1000, return_type='dataframe')

# ##### Evaluating the models using K2 score on the generated data
# In[70]:
# Evaluating the models on the data sets generated by them
data = pd.concat([data1, data2, data3, data4, data5])
data.shape

k2 = K2Score(data)
print('Model 1 K2 Score: ' + str(k2.score(model1)))  # model 1 is the best model
print('Model 2 K2 Score: ' + str(k2.score(model2)))
print('Model 3 K2 Score: ' + str(k2.score(model3)))
print('Model 4 K2 Score: ' + str(k2.score(model4)))
print('Model 5 K2 Score: ' + str(k2.score(model5)))

# ##### Find the high and low probability patterns of 'th'
# In[153]:
# Finding the highest-frequency 'th' pattern
frequency = data.groupby(['x1', 'x2', 'x3', 'x4', 'x5', 'x6']).size().to_frame('count').reset_index()
models = [model1, model2]
[m.fit(data) for m in models]  # ML fit

STATE_NAMES = model1.cpds[0].state_names
print('\nState names:')
for s in STATE_NAMES:
    print(s, STATE_NAMES[s])

# Information for the curious:
# Structure scores: http://pgmpy.org/estimators.html#structure-score
# K2 score: for instance http://www.lx.it.pt/~asmc/pub/talks/09-TA/ta_pres.pdf
# Additive smoothing and pseudocount: https://en.wikipedia.org/wiki/Additive_smoothing
# Scoring functions: https://www.cs.helsinki.fi/u/bmmalone/probabilistic-models-spring-2014/ScoringFunctions.pdf
k2 = K2Score(data)
print('Structure scores:', [k2.score(m) for m in models])
separator()

print('\n\nExhaustive structure search based on structure scores:')
from pgmpy.estimators import ExhaustiveSearch
from pgmpy.estimators import HillClimbSearch
from pgmpy.estimators import BicScore

# Warning: Doing exhaustive search on a PGM with all 5 variables takes more
# time than you should have to wait. Hence, re-fit the models to data where
# some variable(s) have been removed for this assignment.
raw_data2 = {
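# A minimal sketch of the exhaustive search the warning above refers to,
# assuming `data2` is a reduced DataFrame holding only a few of the variables:
searcher = ExhaustiveSearch(data2, scoring_method=K2Score(data2))
best = searcher.estimate()
print(best.edges())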
def opt(self, file1, file2):
    f1 = open(file1, encoding="utf8")
    lines = f1.readlines()
    nodes = self.getegdes(lines[0])
    edges = self.getegdes(lines[1])
    data = pd.read_csv(file2)

    G = BayesianModel()
    G.add_nodes_from(nodes)
    for i in range(int(len(edges) / 2)):
        G.add_edge(edges[2 * i], edges[2 * i + 1])
    # nx.draw(G)
    # plt.show()

    k2 = K2Score(data).score(G)
    bic = BicScore(data).score(G)
    bdeu = BDeuScore(data).score(G)
    print(k2, ",", bic, ",", bdeu)

    est = HillClimbSearch(data, scoring_method=K2Score(data))
    model = est.estimate()
    model_edges = model.edges()
    G_ = nx.DiGraph()
    G_.add_edges_from(model_edges)
    G_copy = nx.DiGraph()
    G_copy.add_edges_from(G.edges)

    add = []
    add_mut = []
    delete = []
    delete_mut = []

    # Candidate edges to add: learned edges that are absent from G and whose
    # addition would not create a cycle.
    for edge in model_edges:
        node1 = edge[0]
        node2 = edge[1]
        if not nx.has_path(G, node2, node1):
            if not G.has_edge(node1, node2):
                this = (node1, node2)
                add.append(this)
                mut = mr.mutual_info_score(data[node1], data[node2])
                add_mut.append(mut)
    seq = list(zip(add_mut, add))
    seq = sorted(seq, key=lambda s: s[0], reverse=True)
    alpha = 0.015
    # if seq[0][0] > alpha:
    #     add = seq[0:1]
    add = seq[0:1]

    # Candidate edges to delete: existing edges ranked by mutual information.
    data_edges = []
    for edge in G.edges:
        node1 = edge[0]
        node2 = edge[1]
        mut = mr.mutual_info_score(data[node1], data[node2])
        delete_mut.append(mut)
        data_edges.append(edge)
    seq = list(zip(delete_mut, data_edges))
    seq = sorted(seq, key=lambda s: s[0])
    # if seq[0][0] < alpha:
    #     delete = seq[0:1]
    if len(edges) > 2:
        delete = seq[0:1]
    if len(add) > 0 and len(delete) > 0:
        if delete[0][0] > add[0][0]:
            delete = []

    print('add')
    for i in add:
        print(str(i[1]) + "," + str(i[0]))
    print('delete')
    for j in delete:
        print(str(j[1]) + "," + str(j[0]))

    print('cpt')
    estimator = BayesianEstimator(G, data)
    for i in G.nodes:
        cpd = estimator.estimate_cpd(i, prior_type="K2")
        nodeName = i
        values = dict(data[i].value_counts())
        valueNum = len(values)
        CPT = np.transpose(cpd.values)
        sequence = cpd.variables[1::]
        card = []
        for x in sequence:
            s = len(dict(data[x].value_counts()))
            card.append(s)
        output = nodeName + '\t' + str(valueNum) + '\t' + str(CPT.tolist()) \
                 + '\t' + str(sequence) + '\t' + str(card)
        print(output)

    print('mutual')
    output1 = []
    for i in range(int(len(edges) / 2)):
        mut = mr.mutual_info_score(data[edges[2 * i]], data[edges[2 * i + 1]])
        output1.append(mut)
    output2 = {}
    for node1 in G.nodes():
        d = {}
        for node2 in G.nodes():
            if node1 == node2:
                continue
            mut = mr.mutual_info_score(data[node1], data[node2])
            d[node2] = mut
        output2[node1] = d
    print(output1)
    print(output2)
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import KBinsDiscretizer

data = pd.read_csv("data/data_auto_mpg.csv")
# data = pd.DataFrame(np.random.randn(500, 5), columns=list('ABCDE'))
# data['F'] = data['A'] * data['B']

for col in data.columns:
    if data[col].dtype == np.float64 or data[col].dtype == np.float32:
        # bin_size = np.unique(data[col].values).shape[0]
        # kbins = KBinsDiscretizer(n_bins=bin_size, encode='ordinal', strategy='uniform').fit(data[col].values.reshape(-1, 1))
        # data[col] = kbins.transform(data[col].values.reshape(-1, 1)).astype(np.int64)
        data[col] = data[col].astype(np.int64)

data = data.iloc[:, :10]
print(data.dtypes)
print(data)

print("aq")
est = HillClimbSearch(data, scoring_method=K2Score(data))
print("aq")
model = est.estimate(max_indegree=5)
print("aq")
print(model.edges)

plt.figure()
nx.draw_networkx(model)
plt.show()
# feature_names.append("Person")
# print(feature_names)
# mydata = np.random.randint(low=0, high=2, size=(100, 6))
mydata = np.genfromtxt(r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\Aras\House A\CSV_Summery\Sequential\Day\occur\Whole_data.csv',
                       delimiter=",")
# alternative: pd.read_csv(r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\data.csv')
# print(mydata)

feature_names = [str(i) for i in range(1, 41)]
feature_names.append("Person")
feature_names.append("activity")
print(feature_names)

data = pd.DataFrame(mydata, columns=feature_names)  # ['X', 'Y']
print(data)

list_of_scoring_methods = [
    # BicScore(data),
    # BdeuScore(data),
    K2Score(data),
]

for scoreMethod in list_of_scoring_methods:
    start_time = time.time()
    hc = HillClimbSearch(data, scoreMethod)
    best_model = hc.estimate()
    print(hc.scoring_method)
    print(best_model.edges())
    end_time = time.time()
    print("execution time in seconds:")
    print(end_time - start_time)

# casas7_model = BayesianModel()
# casas7_model.fit(data, estimator=BayesianEstimator)  # or MaximumLikelihoodEstimator
def test_score(self):
    self.assertAlmostEqual(K2Score(self.d1).score(self.m1), -10.73813429536977)
    self.assertEqual(K2Score(self.d1).score(BayesianModel()), 0)
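# Aside (illustrative, not part of the test): K2 decomposes over families, so
# the total score can be recomputed from per-node local scores. With a
# DataFrame `d1` and model `m1` like the fixtures above:
# k2 = K2Score(d1)
# total = sum(k2.local_score(v, list(m1.predecessors(v))) for v in m1.nodes())
# `total` should match k2.score(m1) up to K2Score's (zero) structure prior.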
from pgmpy.estimators import ExhaustiveSearch, K2Score

if __name__ == '__main__':
    # fp = os.path.join('data', 'MTurk_Harvey.csv')
    # df = pd.read_csv(fp)
    # data = np.genfromtxt(fp, delimiter=",", dtype=float, skip_header=1)
    # x = data[:, :-1]
    # y = data[:, -1]

    # data = pd.DataFrame(np.random.randint(0, 5, size=(2500, 3)), columns=list('XYZ'))
    # data['sum'] = data.sum(axis=1)
    # print(data)
    # est = ConstraintBasedEstimator(data)
    # skel, sep_sets = est.estimate_skeleton()
    # print(skel.edges())

    # s = ExhaustiveSearch(pd.DataFrame(data={'Temperature': [23, 19],
    #                                         'Weather': ['sunny', 'cloudy'],
    #                                         'Humidity': [65, 75]}))
    # print(len(list(s.all_dags())))
    # for dag in s.all_dags():
    #     print(dag.edges())

    data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
    data['C'] = data['B']

    searcher = ExhaustiveSearch(data, scoring_method=K2Score(data))
    for score, model in searcher.all_scores():
        print(score)
        print(model.edges())
models = [model1, model2]
[m.fit(data) for m in models]  # ML fit

STATE_NAMES = model1.cpds[0].state_names
# print(model2.cpds[3])
print('\nState names:')
for s in STATE_NAMES:
    print(s, STATE_NAMES[s])

# Information for the curious:
# Structure scores: http://pgmpy.org/estimators.html#structure-score
# K2 score: for instance http://www.lx.it.pt/~asmc/pub/talks/09-TA/ta_pres.pdf
# Additive smoothing and pseudocount: https://en.wikipedia.org/wiki/Additive_smoothing
# Scoring functions: https://www.cs.helsinki.fi/u/bmmalone/probabilistic-models-spring-2014/ScoringFunctions.pdf
k2 = K2Score(data)
print('Structure scores:', [k2.score(m) for m in models])
separator()

print('\n\nExhaustive structure search based on structure scores:')
from pgmpy.estimators import ExhaustiveSearch, HillClimbSearch, BicScore

# Warning: Doing exhaustive search on a PGM with all 5 variables takes more
# time than you should have to wait. Hence, re-fit the models to data where
# some variable(s) have been removed for this assignment.
raw_data2 = {
    'age': data['age'],
    'avg_cs': data['avg_cs'],
# %% codecell
from pgmpy.estimators import BDeuScore, K2Score, BicScore

# Create random data sample with 3 variables, where Z is dependent on X, Y:
data: DataFrame = DataFrame(data=np.random.randint(low=0, high=4, size=(5000, 2)),
                            columns=list('XY'))
# Making Z dependent (in some arbitrary relation like addition) on X and Y
data['Z'] = data['X'] + data['Y']

# %% codecell
# Creating the scoring objects from this data:
bdeu: BDeuScore = BDeuScore(data, equivalent_sample_size=5)
k2: K2Score = K2Score(data=data)
bic: BicScore = BicScore(data=data)

# %% codecell
commonEvidenceModel: BayesianModel = BayesianModel([('X', 'Z'), ('Y', 'Z')])
drawGraph(commonEvidenceModel)

# %% codecell
commonCauseModel: BayesianModel = BayesianModel([('X', 'Z'), ('X', 'Y')])
drawGraph(commonCauseModel)

# %% codecell
bdeu.score(commonEvidenceModel)
# %% codecell
k2.score(commonEvidenceModel)
# %% codecell
bic.score(commonEvidenceModel)
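# %% codecell
# For comparison (not in the original cells): score the common-cause model too.
# Since Z was generated from X and Y, the common-evidence structure should
# score higher under K2:
print(k2.score(commonEvidenceModel), k2.score(commonCauseModel))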
            PRED[X_j] = NEW_PRED[i, Xj]
            X_mat = X_mat.difference(S)
            X_pred = X_pred.intersection(S)
            break


def pi(G, Xi):
    # Parents of Xi: all nodes p with an edge (p, Xi) in G
    return set([p for p, c in G.edges if c == Xi])


def beta(G, xi):
    pass


data = pd.read_csv("../data/asia.csv")
newData = data.copy()
for col in newData.columns:
    if newData[col].dtype == np.float64 or newData[col].dtype == np.float32:
        newData[col] = newData[col].astype(np.int64)
newData = newData.iloc[:, :7]

e_t = [1, 1, 1, 1, 1, 1]
G = HillClimbSearch(newData, scoring_method=K2Score(newData)).estimate(max_indegree=5)
MaxIndependentSet(data, e_t, G, pi)
import pandas as pd
from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch
from pgmpy.estimators import BDeuScore, BicScore, K2Score

## Structure learning
data = pd.read_csv('data.csv', encoding='gb18030')
df = pd.DataFrame(data)
bic = BicScore(df)
k2 = K2Score(df)
hc = HillClimbSearch(df, scoring_method=bic)
# hc = ExhaustiveSearch(df, k2)
model = hc.estimate()
for ee in model.edges():
    print(ee)

## Parameter learning
from pgmpy.models import BayesianModel

mod = BayesianModel(model.edges())
mod.fit(df)
for cpd in mod.get_cpds():
    print(cpd)
# print(mod.local_independencies('HA'))

## Model inference
from pgmpy.inference import VariableElimination, BeliefPropagation

cancer_infer = VariableElimination(mod)
q = cancer_infer.query(variables=['HA'])
print(q)