def estimate(self):
    """
    Exhaustively search for the best-scoring `BayesianModel` structure.

    Every DAG yielded by ``self.all_dags()`` is scored with
    ``self.scoring_method.score`` and the highest-scoring one is copied
    into a new `BayesianModel`. Only the network structure is estimated,
    no parametrization.

    Returns
    -------
    model: `BayesianModel` instance
        A `BayesianModel` with maximal score. Nodes and edges are added
        in sorted order.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from pgmpy.estimators import ExhaustiveSearch
    >>> # create random data sample with 3 variables, where B and C are identical:
    >>> data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
    >>> data['C'] = data['B']
    >>> est = ExhaustiveSearch(data)
    >>> best_model = est.estimate()
    >>> best_model
    <pgmpy.models.BayesianModel.BayesianModel object at 0x7f695c535470>
    >>> best_model.edges()
    [('B', 'C')]
    """
    candidates = self.all_dags()
    winner = max(candidates, key=self.scoring_method.score)

    # Copy the winning structure into a fresh model, in sorted order.
    result = BayesianModel()
    result.add_nodes_from(sorted(winner.nodes()))
    result.add_edges_from(sorted(winner.edges()))
    return result
def test_score_titanic(self):
    """BIC score: pin the reference structure's value and check a second structure scores lower."""
    bic = BicScore(self.titanic_data2)

    reference = BayesianModel([("Sex", "Survived"), ("Pclass", "Survived")])
    self.assertAlmostEqual(bic.score(reference), -1896.7250012840179)

    # Second structure: a single Pclass -> Sex edge, all three nodes present.
    alternative = BayesianModel([("Pclass", "Sex")])
    alternative.add_nodes_from(["Sex", "Survived", "Pclass"])
    self.assertLess(bic.score(alternative), bic.score(reference))
def Hill_Climb_Search(data, state_names):
    """
    Greedy hill-climbing structure search starting from an edgeless model.

    In each iteration every candidate yielded by
    ``operations(data, current_model, state_names)`` is inspected and the
    one with the largest positive score delta is applied to the model.
    The search stops when no candidate has a positive delta or the best
    delta is below ``epsilon``.

    Parameters
    ----------
    data:
        Data set, forwarded unchanged to ``operations``.
    state_names: dict
        Its keys become the nodes of the start model; it is also forwarded
        to ``operations``.

    Returns
    -------
    BayesianModel
        The model reached when the search stops.
    """
    epsilon = 1e-14

    current_model = BayesianModel()
    current_model.add_nodes_from(state_names.keys())

    while True:
        best_score_delta = 0
        best_operation = None
        for operation, score_delta in operations(data, current_model, state_names):
            if score_delta > best_score_delta:
                best_operation = operation
                best_score_delta = score_delta

        if best_operation is None or best_score_delta < epsilon:
            break

        kind = best_operation[0]
        if kind == 'add':
            current_model.add_edge(*best_operation[1])
        elif kind == 'delete':
            current_model.remove_edge(*best_operation[1])
        elif kind == 'reverse':
            X, Y = best_operation[1]
            current_model.remove_edge(X, Y)
            current_model.add_edge(Y, X)

        # NOTE(review): per-iteration placement of these diagnostics is
        # reconstructed from a whitespace-mangled source -- confirm.
        # Single-argument print() calls run on both Python 2 and 3 and emit
        # the same text as the former `print a, b` statements (which put one
        # space between items).
        print('Iteration:')
        print(current_model.edges())
        print('Score:  ' + str(best_score_delta))
        print('Best operation:  ' + str(best_operation))

    print(current_model.edges())
    return current_model
def get_model(self):
    """
    Returns an instance of Bayesian Model.

    The model receives the reader's variables as nodes, its edges, its
    model name, and one `TabularCPD` per entry of ``self.variable_CPD``.
    Finally the properties stored in ``self.variables`` are attached to
    the corresponding nodes.
    """
    network = BayesianModel()
    network.add_nodes_from(self.variables)
    network.add_edges_from(self.edges)
    network.name = self.model_name

    cpd_list = []
    for var, entry in self.variable_CPD.items():
        # 'CONDSET' / 'CARDINALITY' are optional; absent means no parents.
        if 'CONDSET' in entry:
            parents = entry['CONDSET']
        else:
            parents = []
        table = entry['DPIS']
        if 'CARDINALITY' in entry:
            parent_card = entry['CARDINALITY']
        else:
            parent_card = []
        n_states = len(self.variables[var]['STATES'])
        cpd_list.append(
            TabularCPD(var, n_states, table,
                       evidence=parents, evidence_card=parent_card)
        )
    network.add_cpds(*cpd_list)

    # NOTE(review): `Graph.node` is the legacy networkx accessor -- confirm
    # it exists in the networkx version this project targets.
    for var, properties in self.variables.items():
        network.node[var] = properties

    return network
def estimate(self):
    """
    Return the `BayesianModel` structure with the highest score.

    All DAGs from ``self.all_dags()`` are compared using
    ``self.scoring_method.score`` as the key; the best one is rebuilt as a
    `BayesianModel`. Exhaustively searches through all models. Only
    estimates network structure, no parametrization.

    Returns
    -------
    model: `BayesianModel` instance
        A `BayesianModel` with maximal score.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from pgmpy.estimators import ExhaustiveSearch
    >>> # create random data sample with 3 variables, where B and C are identical:
    >>> data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
    >>> data['C'] = data['B']
    >>> est = ExhaustiveSearch(data)
    >>> best_model = est.estimate()
    >>> best_model
    <pgmpy.models.BayesianModel.BayesianModel object at 0x7f695c535470>
    >>> best_model.edges()
    [('B', 'C')]
    """
    top_dag = max(self.all_dags(), key=self.scoring_method.score)
    ordered_nodes = sorted(top_dag.nodes())
    ordered_edges = sorted(top_dag.edges())

    estimated = BayesianModel()
    estimated.add_nodes_from(ordered_nodes)
    estimated.add_edges_from(ordered_edges)
    return estimated
def _train_bn(self):
    """
    Build a `BayesianModel` from ``self._dag`` and fit it.

    The model takes the DAG's edges and nodes and is fitted on the data
    returned by ``self._get_training_data()`` using `BayesianEstimator`
    with ``prior_type='BDeu'``.

    Returns
    -------
    BayesianModel
        The fitted model.
    """
    network = BayesianModel(self._dag.edges)
    # Add nodes explicitly so nodes without edges are part of the model too.
    network.add_nodes_from(self._dag.nodes)
    network.fit(
        self._get_training_data(),
        BayesianEstimator,
        prior_type='BDeu',
    )
    return network
def get_model(self):
    """
    Returns an instance of Bayesian Model.

    Nodes, edges and the model name come from the reader; one
    `TabularCPD` is built per entry of ``self.variable_CPD``. The
    per-variable properties are then stored on the nodes, through
    ``model.nodes`` when the networkx version string starts with "1" and
    through ``model._node`` otherwise.
    """
    network = BayesianModel()
    network.add_nodes_from(self.variables)
    network.add_edges_from(self.edges)
    network.name = self.model_name

    cpd_list = []
    for var, entry in self.variable_CPD.items():
        # "CONDSET" / "CARDINALITY" are optional; absent means no parents.
        if "CONDSET" in entry:
            parents = entry["CONDSET"]
        else:
            parents = []
        table = entry["DPIS"]
        if "CARDINALITY" in entry:
            parent_card = entry["CARDINALITY"]
        else:
            parent_card = []
        n_states = len(self.variables[var]["STATES"])
        cpd_list.append(
            TabularCPD(var, n_states, table,
                       evidence=parents, evidence_card=parent_card)
        )
    network.add_cpds(*cpd_list)

    legacy_networkx = nx.__version__.startswith("1")
    for var, properties in self.variables.items():
        if legacy_networkx:
            network.nodes[var] = properties
        else:
            network._node[var] = properties

    return network
def get_model(self):
    """
    Returns the fitted bayesian model

    The model is built from the reader's edges, network name and variable
    names; one `TabularCPD` is created per variable (in sorted order) and
    the ``name = value`` property strings are stored on the nodes.

    Raises
    ------
    AttributeError
        If any attribute access inside the construction fails, re-raised
        with a hint to parse states, edges, parents and network name first.

    Example
    ----------
    >>> from pgmpy.readwrite import BIFReader
    >>> reader = BIFReader("bif_test.bif")
    >>> reader.get_model()
    <pgmpy.models.BayesianModel.BayesianModel object at 0x7f20af154320>
    """
    try:
        model = BayesianModel(self.variable_edges)
        model.name = self.network_name
        model.add_nodes_from(self.variable_names)

        tabular_cpds = []
        for var in sorted(self.variable_cpds.keys()):
            values = self.variable_cpds[var]
            parents = self.variable_parents[var]
            cpd = TabularCPD(
                var,
                len(self.variable_states[var]),
                values,
                evidence=parents,
                evidence_card=[len(self.variable_states[evidence_var])
                               for evidence_var in parents],
            )
            tabular_cpds.append(cpd)
        model.add_cpds(*tabular_cpds)

        for node, properties in self.variable_properties.items():
            for prop in properties:
                # Split on the first '=' only, so a property value that
                # itself contains '=' no longer fails the two-name unpack.
                prop_name, prop_value = (
                    part.strip() for part in prop.split('=', 1)
                )
                # NOTE(review): `model.node` is the legacy networkx
                # accessor; if it is missing, the AttributeError is
                # reported with the generic message below -- confirm the
                # targeted networkx version.
                model.node[node][prop_name] = prop_value

        return model

    except AttributeError:
        raise AttributeError('First get states of variables, edges, parents and network name')
def test_score_titanic(self):
    """K2 score: pin the reference structure's value and check a second structure scores lower."""
    k2 = K2Score(self.titanic_data2)

    reference = BayesianModel([("Sex", "Survived"), ("Pclass", "Survived")])
    self.assertAlmostEqual(k2.score(reference), -1891.0630673606006)

    # Second structure: a single Pclass -> Sex edge, all three nodes present.
    alternative = BayesianModel([("Pclass", "Sex")])
    alternative.add_nodes_from(["Sex", "Survived", "Pclass"])
    self.assertLess(k2.score(alternative), k2.score(reference))
def test_score_titanic(self):
    """BDeu score (equivalent sample size 25): pin the reference value and check a second structure scores lower."""
    bdeu = BdeuScore(self.titanic_data2, equivalent_sample_size=25)

    reference = BayesianModel([("Sex", "Survived"), ("Pclass", "Survived")])
    self.assertAlmostEqual(bdeu.score(reference), -1892.7383393910427)

    # Second structure: a single Pclass -> Sex edge, all three nodes present.
    alternative = BayesianModel([("Pclass", "Sex")])
    alternative.add_nodes_from(["Sex", "Survived", "Pclass"])
    self.assertLess(bdeu.score(alternative), bdeu.score(reference))
def get_model(self):
    """
    Build a `BayesianModel` from the parsed network description.

    Nodes, edges and the network name are taken from the reader. One
    `TabularCPD` is created per entry of ``self.variable_CPD``, with the
    variable's parents as evidence and ``self.get_states()`` as state
    names. Non-``None`` ``name = value`` property strings from
    ``self.variable_property`` are stored as node attributes.

    Returns
    -------
    BayesianModel
    """
    model = BayesianModel()
    model.add_nodes_from(self.variables)
    model.add_edges_from(self.edge_list)
    model.name = self.network_name

    tabular_cpds = []
    for var, values in self.variable_CPD.items():
        parents = self.variable_parents[var]
        evidence_card = [
            len(self.variable_states[evidence_var]) for evidence_var in parents
        ]
        cpd = TabularCPD(
            var,
            len(self.variable_states[var]),
            values,
            evidence=parents,
            evidence_card=evidence_card,
            state_names=self.get_states(),
        )
        tabular_cpds.append(cpd)
    model.add_cpds(*tabular_cpds)

    for node, properties in self.variable_property.items():
        for prop in properties:
            if prop is None:
                continue
            # Split on the first "=" only, so a property value that itself
            # contains "=" no longer fails the two-name unpack.
            prop_name, prop_value = (
                part.strip() for part in prop.split("=", 1)
            )
            model.nodes[node][prop_name] = prop_value

    return model
def single_bayes_net(df, independent_vars, dependent_vars):
    """
    Fit a Bayesian network where every independent variable points to
    every dependent variable.

    Parameters
    ----------
    df:
        Data passed to ``BayesianModel.fit``.
    independent_vars:
        Variables added as nodes and used as edge sources.
    dependent_vars:
        Variables used as edge targets.

    Returns
    -------
    BayesianModel
        The fitted model.
    """
    network = BayesianModel()
    network.add_nodes_from(independent_vars)

    links = [
        (parent, child)
        for parent in independent_vars
        for child in dependent_vars
    ]
    for parent, child in links:
        network.add_edge(parent, child)

    network.fit(df)
    return network
def get_model(self):
    """
    Returns the model instance of the ProbModel.

    Nodes and edges are taken from ``self.probnet``; each entry of
    ``self.probnet["Potentials"]`` becomes a `TabularCPD`. Variable and
    edge properties are copied onto the model's nodes and edges.

    Return
    ---------------
    model: an instance of BayesianModel.

    Raises
    ------
    ValueError
        If the network type is not "BayesianNetwork".

    Examples
    -------
    >>> reader = ProbModelXMLReader()
    >>> reader.get_model()
    """
    if not self.probnet.get("type") == "BayesianNetwork":
        raise ValueError("Please specify only Bayesian Network.")

    network = BayesianModel()
    network.add_nodes_from(self.probnet["Variables"].keys())
    network.add_edges_from(self.probnet["edges"].keys())

    cpd_list = []
    for potential in self.probnet["Potentials"]:
        # The first key of the potential's "Variables" mapping is the
        # variable the CPD is built for; its value is used as the evidence.
        var = list(potential["Variables"].keys())[0]
        n_states = len(self.probnet["Variables"][var]["States"])
        parents = potential["Variables"][var]
        parent_card = [
            len(self.probnet["Variables"][parent]["States"])
            for parent in parents
        ]
        flat = np.array(list(map(float, potential["Values"].split())))
        table = flat.reshape((n_states, flat.size // n_states))
        cpd_list.append(TabularCPD(var, n_states, table, parents, parent_card))
    network.add_cpds(*cpd_list)

    for var in network.nodes():
        for prop_name, prop_value in self.probnet["Variables"][var].items():
            network.nodes[var][prop_name] = prop_value

    edges = network.edges()
    legacy_networkx = nx.__version__.startswith("1")
    for edge in edges:
        for prop_name, prop_value in self.probnet["edges"][edge].items():
            # networkx 1.x exposes edge data through `edge`, later versions
            # through `adj`.
            if legacy_networkx:
                network.edge[edge[0]][edge[1]][prop_name] = prop_value
            else:
                network.adj[edge[0]][edge[1]][prop_name] = prop_value

    return network
def create_bayes_net(file, keep_atts, edges):
    """
    Fit a Bayesian network with a given structure on attributes read from CSV.

    Parameters
    ----------
    file:
        Path (or buffer) handed to ``pd.read_csv``.
    keep_atts:
        Column selection applied to the loaded frame; the remaining columns
        become the network's nodes.
    edges:
        Edges defining the network structure.

    Returns
    -------
    BayesianModel
        Model whose CPDs were estimated from the selected columns.
    """
    attributes = pd.read_csv(file)[keep_atts]

    network = BayesianModel()
    network.add_nodes_from(attributes.columns)
    network.add_edges_from(edges)  # structure is supplied by the caller

    # fit() estimates the CPD tables for the given structure
    network.fit(attributes)
    return network
class TestBayesianModelFitPredict(unittest.TestCase):
    """Tests for ``BayesianModel.fit`` and ``BayesianModel.predict``."""

    def setUp(self):
        # One model with nodes only, one with a small connected structure.
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])

        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])

    def test_disconnected_fit(self):
        """Without edges each fitted CPD must equal the column's relative frequencies."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            # `.loc` replaces the `.ix` indexer, which pandas has removed.
            counts = values.loc[:, node].value_counts()
            value = counts / counts.sum()
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_connected_predict(self):
        """Predict the dropped column 'E' and compare with the recorded result."""
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        # With all columns still present, predict() is expected to refuse.
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(
            e_predict.values.ravel(),
            np.array([
                1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
                0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
                1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
                1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
                1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
                1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0
            ]))

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
class TestBayesianModelFitPredict(unittest.TestCase):
    """Tests for ``BayesianModel.fit`` and ``BayesianModel.predict``."""

    def setUp(self):
        # One model with nodes only, one with a small connected structure.
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])

        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])

    def test_disconnected_fit(self):
        """Without edges each fitted CPD must equal the column's relative frequencies."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            # `.loc` replaces the `.ix` indexer, which pandas has removed.
            column = values.loc[:, node]
            value = column.value_counts() / column.value_counts().sum()
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_connected_predict(self):
        """Predict the dropped column 'E' and compare with the recorded result."""
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        # With all columns still present, predict() is expected to refuse.
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        expected = np.array([
            1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
            0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
            0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
            0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
            1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
            1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
            1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
            1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
            1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0
        ])
        np_test.assert_array_equal(e_predict.values.ravel(), expected)

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
def create_model_and_inference():
    """
    Build a Bayesian network from ``dependencies.csv`` and fit it on random data.

    The dependency table is walked recursively from the
    ``'myproximus-usage'`` entry to collect edges, which are then reversed.
    Each node receives eleven randomly generated 0/1 values (columns
    ``'0'`` .. ``'10'``), which serve as the data for fitting. Edges
    rejected by ``add_edge`` are reported and skipped.

    Returns
    -------
    tuple
        ``(model, VariableElimination(model))``.
    """
    dep_df = pd.read_csv('dependencies.csv', sep=';')

    def connect(df, source, edgelist):
        """Recursively append (source, target) edges reachable from *source*."""
        source_df = df[df['Column2'] == source]
        # Values from the fourth column onwards are matched against
        # 'Column1' to find the target row.
        for col in source_df.iloc[0, 3:]:
            target_df = df[df['Column1'] == col]['Column2']
            if not target_df.empty:
                target = target_df.item()
                # Skip a link whose reverse is already recorded.
                if (target, source) not in edgelist:
                    edgelist.append((source, target))
                    connect(df, target, edgelist)

    edges = []
    connect(dep_df, 'myproximus-usage', edges)
    edges = [(t[1], t[0]) for t in edges]

    nodes = set(itertools.chain.from_iterable(edges))

    nodes_df = dep_df.iloc[:, 1].to_frame()
    nodes_df = nodes_df[nodes_df['Column2'].isin(nodes)]
    # Eleven random binary observation columns named '0' .. '10'.
    for sample in range(11):
        nodes_df[str(sample)] = pd.DataFrame(data=np.random.randint(0, 2, size=64).T)
    nodes_df = nodes_df.set_index('Column2').transpose()

    model = BayesianModel()
    model.add_nodes_from(nodes)
    for edge in edges:
        try:
            model.add_edge(edge[0], edge[1])
        except Exception:
            # Best effort: report and skip the edge. Unlike a bare `except:`
            # this no longer swallows KeyboardInterrupt / SystemExit.
            print('WARNING: tried to add edge which forms loop: ' + str(edge))

    model.fit(nodes_df, estimator=BayesianEstimator, prior_type="BDeu")

    draw_network(model.nodes(), model.edges(), {}, [])

    return model, VariableElimination(model)
def bayes_net_from_populational_data(data, independent_vars, dependent_vars):
    """
    Build a Bayesian network linking every independent variable to every
    dependent variable, with CPDs computed by `BayesNetHelper`.

    Parameters
    ----------
    data:
        Data forwarded to ``BayesNetHelper``.
    independent_vars, dependent_vars:
        Variable collections; combined with ``|`` to obtain all nodes.

    Returns
    -------
    BayesianModel
        Model with one CPD per variable.
    """
    network = BayesianModel()
    network.add_nodes_from(independent_vars)
    for parent in independent_vars:
        for child in dependent_vars:
            network.add_edge(parent, child)

    state_names = BayesNetHelper.get_state_names_from_df(
        data, independent_vars | dependent_vars)

    cpds = [
        BayesNetHelper.compute_cpd(network, node, data, state_names)
        for node in independent_vars | dependent_vars
    ]
    network.add_cpds(*cpds)
    return network
def fully_connected_model(nodes=None):
    """
    Build and fit a network linking every 'hypo' node to every
    observation or motor node.

    Parameters
    ----------
    nodes: list, optional
        Node names. When empty or ``None`` the default set
        ``[BOREDOM, DESIRE, MOBILE, MOTOR_HYPO, LEFT_ARM]`` is used.

    Returns
    -------
    BayesianModel
        Model fitted on ``TRAINING_DATA`` with `BayesianEstimator` and a
        BDeu prior.
    """
    if not nodes:
        nodes = [BOREDOM, DESIRE, MOBILE, MOTOR_HYPO, LEFT_ARM]

    network = BayesianModel()
    network.add_nodes_from(nodes)

    # Edge sources contain 'hypo'; targets contain 'obs' or 'motor'.
    for source in nodes:
        if 'hypo' not in source:
            continue
        for target in nodes:
            if 'obs' in target or 'motor' in target:
                network.add_edge(u=source, v=target)

    network.fit(TRAINING_DATA, estimator=BayesianEstimator, prior_type="BDeu")
    return network
def create_bayes_net():
    """
    Fit a Bayesian network with a fixed structure on the CelebA attributes.

    The attribute CSV is read, reduced to ``KEEP_ATTS`` and used both to
    define the nodes and to fit the model.

    Returns
    -------
    BayesianModel
        The fitted model.
    """
    attributes = pd.read_csv('../../data/list_attr_celeba.csv')[KEEP_ATTS]

    structure = [
        ('Young', 'Eyeglasses'),
        ('Young', 'Bald'),
        ('Young', 'Mustache'),
        ('Male', 'Mustache'),
        ('Male', 'Smiling'),
        ('Male', 'Wearing_Lipstick'),
        ('Young', 'Mouth_Slightly_Open'),
        ('Young', 'Narrow_Eyes'),
        ('Male', 'Narrow_Eyes'),
        ('Smiling', 'Narrow_Eyes'),
        ('Smiling', 'Mouth_Slightly_Open'),
        ('Young', 'Smiling'),
    ]

    network = BayesianModel()
    network.add_nodes_from(attributes.columns)
    network.add_edges_from(structure)
    network.fit(attributes)
    return network
def generate_approx_model_from_graph(ebunch, nodes, df):
    """
    Learn a pgmpy Bayesian model from the data in a pandas dataframe.

    The rows of the dataframe are shuffled first. Every variable that
    appears in an edge or in ``nodes`` is given the state names ``[0, 1]``.

    Parameters
    ----------
    ebunch:
        Edges of the model.
    nodes:
        Nodes of the model (added in addition to those implied by edges).
    df:
        Data used for fitting.

    Returns
    -------
    BayesianModel
        Model fitted with `SmoothedMaximumLikelihoodEstimator`.
    """
    shuffled = df.sample(frac=1)

    network = BayesianModel(ebunch)
    network.add_nodes_from(nodes)

    # All variables are treated as binary.
    binary_states = {}
    for edge in ebunch:
        for endpoint in (edge[0], edge[1]):
            binary_states[endpoint] = [0, 1]
    for name in nodes:
        binary_states[name] = [0, 1]

    network.fit(shuffled, state_names=binary_states,
                estimator=SmoothedMaximumLikelihoodEstimator)
    return network
def bif2bayesian(pathname, verbose=3):
    """
    Load a BIF file and return it as a `BayesianModel`.

    The file is parsed with ``readwrite.BIF.BIFReader``; edges, network
    name, variable names and one `TabularCPD` per variable (in sorted
    order) are transferred to the model. Variable properties are not
    transferred.

    Parameters
    ----------
    pathname:
        Path of the BIF file.
    verbose: int, optional
        A loading message is printed when ``verbose >= 3``.

    Returns
    -------
    BayesianModel

    Raises
    ------
    AttributeError
        If an attribute access during construction fails.
    """
    if verbose >= 3:
        print('[BNLEARN] Loading bif file <%s>' % (pathname))

    reader = readwrite.BIF.BIFReader(path=pathname)

    try:
        network = BayesianModel(reader.variable_edges)
        network.name = reader.network_name
        network.add_nodes_from(reader.variable_names)

        cpd_list = []
        for var in sorted(reader.variable_cpds.keys()):
            table = reader.variable_cpds[var]
            cpd_list.append(
                TabularCPD(
                    var,
                    len(reader.variable_states[var]),
                    table,
                    evidence=reader.variable_parents[var],
                    evidence_card=[
                        len(reader.variable_states[parent])
                        for parent in reader.variable_parents[var]
                    ],
                )
            )
        network.add_cpds(*cpd_list)

        return network

    except AttributeError:
        raise AttributeError(
            '[BNLEARN] First get states of variables, edges, parents and network names'
        )
def get_model(self):
    """
    Returns an instance of Bayesian Model or Markov Model.

    Variables are in the pattern var_0, var_1, var_2 where var_0 is the
    0th index variable, var_1 is the 1st index variable.

    For a 'BAYES' network each table becomes a `TabularCPD`; for a
    'MARKOV' network each table becomes a `DiscreteFactor`. Any other
    network type yields ``None``.

    Return
    ------
    model: an instance of Bayesian or Markov Model.

    Examples
    --------
    >>> reader = UAIReader('TestUAI.uai')
    >>> reader.get_model()
    """
    if self.network_type == 'BAYES':
        bayes_net = BayesianModel()
        bayes_net.add_nodes_from(self.variables)
        bayes_net.add_edges_from(self.edges)

        cpd_list = []
        for table in self.tables:
            child = table[0]
            n_states = int(self.domain[child])
            flat = np.array(list(map(float, table[1])))
            # One row per state of the child variable.
            matrix = flat.reshape(n_states, flat.size // n_states)
            cpd_list.append(TabularCPD(child, n_states, matrix))

        bayes_net.add_cpds(*cpd_list)
        return bayes_net

    if self.network_type == 'MARKOV':
        markov_net = MarkovModel(self.edges)

        factor_list = []
        for table in self.tables:
            scope = table[0]
            factor_list.append(
                DiscreteFactor(
                    variables=scope,
                    cardinality=[int(self.domain[var]) for var in scope],
                    values=list(map(float, table[1])),
                )
            )

        markov_net.add_factors(*factor_list)
        return markov_net

    return None
def createBayesGraph(graph_list, mapping, data):
    '''
    Create the Bayesian network graph and its CPD tables.

    graph_list - edges as "source,target" strings
    mapping    - node names; mapping[i] names column i of data
    data       - two-dimensional array indexed as data[:, i]

    Returns a tuple of:
    bayes_model - the fitted bayes model
    cpds_array - list with one dict per CPD, keyed by the string form of
                 the CPD's variable list, holding the parsed table rows
    categories_each_element - state names of each variable in the graph
    '''
    cpds_array = []
    categories_each_element = {}

    bayes_model = BayesianModel()
    bayes_model.add_nodes_from(list(mapping))
    for edge_text in graph_list:
        endpoints = edge_text.split(',')
        bayes_model.add_edge(endpoints[0], endpoints[1])

    columns = {}
    for idx in range(len(mapping)):
        columns[mapping[idx]] = data[:, idx]
    frame = pandas.DataFrame(data=columns)
    bayes_model.fit(frame)

    # Build the array that is returned to the client
    for cpd in bayes_model.get_cpds():
        for variable in cpd.state_names:
            categories_each_element[variable] = cpd.state_names[variable]

        # The printed table is split on '|': numeric cells are collected
        # into the current row, and a cell containing "-+" closes the row.
        rows = []
        current_row = []
        row_has_numbers = False
        for cell in str(cpd).split('|'):
            if is_number(cell):
                current_row.append(float(cell.strip()))
                row_has_numbers = True
            elif "-+" in cell and row_has_numbers == True:
                rows.append(current_row)
                current_row = []
                row_has_numbers = False

        cpds_array.append({str(list(cpd.variables)): rows})

    return (bayes_model, cpds_array, categories_each_element)
def join(reference_bayes, second_bayes, new_dependent_vars,
         new_independent_vars, ref_num_of_records, second_num_of_records):
    """
    Merge two Bayesian networks into a new one.

    Independent variables keep the CPD of whichever input network contains
    them (the reference network is checked first). For each dependent
    variable:

    * if it has no parents in the reference network, the second network's
      parents and CPD are used;
    * if both networks give it the same parents, the CPDs are combined by
      ``BayesNetHelper.calculate_weighted_cpds`` using the record counts;
    * if the second network's parents are a strict subset of the
      reference's, the reference CPD is used.

    Raises
    ------
    ValueError
        If the second network has a parent for a dependent variable that
        the reference network does not have.

    Returns
    -------
    BayesianModel
        The merged network.
    """
    final_bayes = BayesianModel()

    # all independent variables should stay the same
    final_bayes.add_nodes_from(new_independent_vars)
    final_bayes.add_cpds(*[
        reference_bayes.get_cpds(node=node)
        if node in reference_bayes.nodes else second_bayes.get_cpds(node=node)
        for node in new_independent_vars
    ])

    for node in new_dependent_vars:
        final_bayes.add_node(node)

        ref_parents = set()
        second_parents = set()
        if node in reference_bayes:
            ref_parents = set(reference_bayes.get_parents(node))
        if node in second_bayes:
            second_parents = set(second_bayes.get_parents(node))

        if not ref_parents:
            final_bayes.add_edges_from(
                [(parent, node) for parent in second_parents])
            final_bayes.add_cpds(second_bayes.get_cpds(node=node))
            continue

        final_bayes.add_edges_from(
            [(parent, node) for parent in ref_parents])
        if second_parents - ref_parents:
            # Implicit concatenation keeps source indentation out of the
            # message (backslash continuations inside the literal did not).
            raise ValueError(
                'This join can not be performed since the second '
                'distribution contains new independent variable(s) for '
                'node {}. Please consider dropping these new dependencies '
                'or switching reference distribution.'.format(str(node)))
        elif ref_parents == second_parents:
            new_cpd = BayesNetHelper.calculate_weighted_cpds(
                reference_bayes.get_cpds(node=node),
                second_bayes.get_cpds(node=node),
                ref_num_of_records, second_num_of_records)
            final_bayes.add_cpds(new_cpd)
        else:
            final_bayes.add_cpds(reference_bayes.get_cpds(node=node))

    return final_bayes
def create_bayes_net():
    """
    Fit a Bayesian network with a hand-written structure on the CelebA
    attribute table.

    Returns
    -------
    BayesianModel
        Model whose nodes are the ``KEEP_ATTS`` columns and whose CPDs were
        estimated from the data.
    """
    attributes = pd.read_csv('./data/list_attr_celeba.csv')
    attributes = attributes[KEEP_ATTS]

    # The structure is defined by hand; this part can't be automated.
    hand_picked_edges = [
        ('Young', 'Eyeglasses'),
        ('Young', 'Bald'),
        ('Young', 'Mustache'),
        ('Male', 'Mustache'),
        ('Male', 'Smiling'),
        ('Male', 'Wearing_Lipstick'),
        ('Young', 'Mouth_Slightly_Open'),
        ('Young', 'Narrow_Eyes'),
        ('Male', 'Narrow_Eyes'),
        ('Smiling', 'Narrow_Eyes'),
        ('Smiling', 'Mouth_Slightly_Open'),
        ('Young', 'Smiling'),
    ]

    net = BayesianModel()
    net.add_nodes_from(attributes.columns)
    net.add_edges_from(hand_picked_edges)

    # fit() estimates the CPD tables for the given structure
    net.fit(attributes)

    return net
def create_network(models, processes, files):
    """
    Build one `BayesianModel` per process, append it to *models*, write its
    graph as a PNG and print its cause/error CPDs.

    Parameters
    ----------
    models: list
        Output list; the model for process ``p`` is appended and then read
        back as ``models[p]``.
        NOTE(review): this presumably requires *models* to be empty on
        entry -- confirm with callers.
    processes:
        Indexable collection of process objects exposing ``get_errors()``
        and ``get_error(i)``.
    files: int
        Number of processes to handle (``range(files)``).

    NOTE(review): the loop nesting below was reconstructed from a
    whitespace-mangled source -- confirm against the original file.
    """
    for p in range(files):
        temp_model = BayesianModel()
        for e in range(len(processes[p].get_errors())):
            temp_error = processes[p].get_error(e)
            for c in range(len(temp_error.get_causes())):
                temp_cause = temp_error.get_cause(c)
                # Normalise the cause's occurrence probability by the
                # error's total cause probability and store it back.
                q = temp_cause.get_occ_prob(
                ) / temp_error.get_total_cause_prob()
                temp_cause.set_occ_prob(q)
                temp_model.add_nodes_from([temp_cause, temp_error])
                temp_model.add_edge(temp_cause, temp_error)
                # Binary CPD for the cause, built from q and 1 - q.
                temp_cause_cpd = TabularCPD(variable=temp_cause,
                                            variable_card=2,
                                            values=[[q, 1 - q]])
                temp_model.add_cpds(temp_cause_cpd)
            # Binary CPD for the error, conditioned on all of its causes.
            temp_error_cpd = TabularCPD(
                variable=temp_error,
                variable_card=2,
                values=get_initial_error_cpd(len(temp_error.get_causes())),
                evidence=temp_error.get_causes(),
                evidence_card=[2] * (len(temp_error.get_causes())))
            temp_model.add_cpds(temp_error_cpd)
            # Effects are added as nodes and edges only; no CPD is created
            # for them here.
            for f in range(len(temp_error.get_effects())):
                temp_effect = temp_error.get_effect(f)
                temp_model.add_nodes_from([temp_error, temp_effect])
                temp_model.add_edge(temp_error, temp_effect)
        models.append(temp_model)
        #plotting Failure Tree
        dot = to_pydot(models[p])
        with open('failure_tree_graph_%s.png' % processes[p], 'wb') as f:
            f.write(dot.create_png())
        #Sample output of CPDs for causes and errors
        for e in range(len(processes[p].get_errors())):
            for c in range(len(processes[p].get_error(e).get_causes())):
                print(
                    temp_model.get_cpds(
                        processes[p].get_error(e).get_cause(c)))
            print(temp_model.get_cpds(processes[p].get_error(e)))
def main():
    """
    Learn and print the initial and the transition network of a dynamic
    Bayesian network.

    Data is loaded with ``readData`` and split by ``getData``. For both
    parts a hill-climbing search with a BIC score is run, the resulting
    edges are printed, the model is fitted with `BayesianEstimator`, and
    its CPDs are printed.
    """
    data, string = readData()
    genes = np.array(data.columns[1:])
    labels = np.array(data.columns)

    bayesianModel = BayesianModel()
    transitionModel = DBN()
    bayesianModel.add_nodes_from(genes)
    transitionModel.add_nodes_from(genes)

    bData, tData = getData(data, labels)

    # All output goes through single-argument print() calls so the function
    # runs on both Python 2 and Python 3. The first call merges two former
    # statements (`print "...",` followed by `print "..."`), which printed
    # the two strings separated by one space.
    print("\nDynamic Bayesian Network inference \nB_0 network relations: ")

    hcb = HillClimbSearch(bData, genes, scoring_method=BicScore(bData, labels, bk1=string, weight=4))
    best_model_b = hcb.estimate(start=bayesianModel, tabu_length=15, max_indegree=2)
    print(best_model_b.edges())

    printOutputB(best_model_b)

    print("\nLocal Probability Model: ")
    best_model_b.fit(bData, BayesianEstimator)
    for cpd in best_model_b.get_cpds():
        print(cpd)

    print("\nB_transition network relations: ")

    hct = HillClimbSearch(tData, genes, scoring_method=BicScore(tData, labels, bk1=string, weight=4))
    best_model_t = hct.estimate_dynamic(start=transitionModel, tabu_length=15, max_indegree=2)
    print(best_model_t.edges())

    printOutputT(best_model_t)

    print("\nLocal Probability Model: ")
    best_model_t.fit(tData, BayesianEstimator)
    for cpd in best_model_t.get_cpds():
        print(cpd)
def evaluate_single_graph(df_samples, graph, bn_truth, nb_repeat=3):
    """
    Fit a candidate graph on samples and compare it with the ground truth.

    The candidate's edges (after ``remove_bidirected_edges``) are added to
    a model over the ground-truth nodes. If an edge is rejected it is tried
    in the opposite direction, and if that fails too the error is printed
    and the edge is skipped.

    Parameters
    ----------
    df_samples:
        Data used to fit the candidate model.
    graph:
        Candidate graph providing ``edges()``.
    bn_truth:
        Ground-truth network wrapper.
    nb_repeat: int, optional
        Number of repetitions averaged for the 'OD' and 'ID' metrics.

    Returns
    -------
    dict
        Metrics under the keys 'SID', 'SHD', 'OD' and 'ID'.
    """
    candidate = BayesianModel()
    candidate.add_nodes_from(bn_truth.causal_graph.nodes())

    for edge in remove_bidirected_edges(graph.edges()):
        try:
            candidate.add_edge(edge[0], edge[1])
        except Exception:
            # Forward direction rejected: try the reverse.
            try:
                candidate.add_edge(edge[1], edge[0])
            except Exception as err:
                print(err)

    candidate.fit(df_samples, estimator=BayesianEstimator)
    candidate.check_model()

    bn_test = BayesianNetwork(candidate)
    set_observe(bn_test.bn)
    set_observe(bn_truth.bn)
    bn_truth.set_state_names()
    bn_test.set_state_names()

    sid = SID(bn_truth.causal_graph, bn_test.causal_graph)
    shd = SHD(bn_truth.causal_graph, bn_test.causal_graph)
    observational = np.mean([
        ODist(bn_truth, bn_test, 1000, discrete=True)
        for _ in range(nb_repeat)
    ])
    interventional = np.mean([
        IDist(bn_truth, bn_test, 1000, discrete=True)
        for _ in range(nb_repeat)
    ])

    return {
        'SID': sid,
        'SHD': shd,
        'OD': observational,
        'ID': interventional,
    }
def reduce_model(self, evidence):
    """
    Reduce the continuous factors by the evidence and rebuild the model as
    a `BayesianModel` of tabular CPDs.

    Works on a deep copy of ``self.model``. Continuous factors are reduced
    only for evidence variables whose name contains ``"F("``. Each
    continuous factor is then converted to a CPD and every factor is added
    to a new model together with edges from the other scope variables to
    its own variable.

    Parameters
    ----------
    evidence: dict
        Mapping from variable to observed value.

    Returns
    -------
    tuple
        ``(new_model, additional_evidence)``, where ``additional_evidence``
        maps each variable whose name contains ``"same_reason"`` to 1.
    """
    working_copy = copy.deepcopy(self.model)

    continuous = []
    for factor in working_copy.factors:
        if isinstance(factor, ContinuousFactor):
            continuous.append(factor)

    for name, observed in evidence.items():
        for factor in continuous:
            # Only continuous values are reduced at this stage; the
            # inference algorithm deals with reducing binary variables.
            if name in factor.scope() and "F(" in name:
                factor.reduce([(name, observed)])

    rebuilt = BayesianModel()
    extra_evidence = {}
    for entry in working_copy.factors:
        if isinstance(entry, ContinuousFactor):
            if len(entry.scope()) == 1:
                entry = TabularCPD(
                    str(entry.scope()[0]), 2,
                    [[entry.assignment(0), entry.assignment(1)]])
            else:
                entry = to_CPD(entry)

        target = entry.variable
        for member in entry.scope():
            if target != member:
                rebuilt.add_edge(str(member), str(target))

        if "same_reason" in target:
            extra_evidence[target] = 1

        rebuilt.add_nodes_from([str(member) for member in entry.scope()])
        rebuilt.add_cpds(entry)

    return rebuilt, extra_evidence
def cal(self, file1, file2):
    """
    Compute mutual-information scores for a graph and write them to disk.

    Parameters
    ----------
    file1:
        Text file whose first line lists the nodes and whose second line
        lists the edges as a flat sequence of endpoints; both lines are
        parsed with ``self.getegdes``.
    file2:
        CSV file with one column per node.

    Two results are printed and written to ``mutual_output.txt``: a list
    with the mutual information of every edge, and a nested dict with the
    mutual information of every ordered pair of distinct nodes.
    """
    # Read the graph description inside `with` so the file is closed.
    with open(file1) as f1:
        lines = f1.readlines()
    nodes = self.getegdes(lines[0])
    edges = self.getegdes(lines[1])
    data = pd.read_csv(file2)

    # Consecutive entries of `edges` form (source, target) pairs.
    edge_count = len(edges) // 2

    G = BayesianModel()
    G.add_nodes_from(nodes)
    for i in range(edge_count):
        G.add_edge(edges[2 * i], edges[2 * i + 1])

    output1 = [
        mr.mutual_info_score(data[edges[2 * i]], data[edges[2 * i + 1]])
        for i in range(edge_count)
    ]

    output2 = {}
    for node1 in G.nodes():
        output2[node1] = {
            node2: mr.mutual_info_score(data[node1], data[node2])
            for node2 in G.nodes()
            if node1 != node2
        }

    print(output1)
    print(output2)
    with open('mutual_output.txt', 'w') as f:
        f.write(str(output1))
        f.write('\n')
        f.write(str(output2))
class TestGibbsSampling(unittest.TestCase):
    """Tests for `GibbsSampling` on a small Bayesian and a small Markov model."""

    def setUp(self):
        # A test Bayesian model
        diff_cpd = TabularCPD('diff', 2, [[0.6], [0.4]])
        intel_cpd = TabularCPD('intel', 2, [[0.7], [0.3]])
        grade_cpd = TabularCPD('grade', 3, [[0.3, 0.05, 0.9, 0.5], [0.4, 0.25, 0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
                               evidence=['diff', 'intel'], evidence_card=[2, 2])
        self.bayesian_model = BayesianModel()
        self.bayesian_model.add_nodes_from(['diff', 'intel', 'grade'])
        self.bayesian_model.add_edges_from([('diff', 'grade'), ('intel', 'grade')])
        self.bayesian_model.add_cpds(diff_cpd, intel_cpd, grade_cpd)

        # A test Markov model
        self.markov_model = MarkovModel([('A', 'B'), ('C', 'B'), ('B', 'D')])
        factor_ab = Factor(['A', 'B'], [2, 3], [1, 2, 3, 4, 5, 6])
        factor_cb = Factor(['C', 'B'], [4, 3], [3, 1, 4, 5, 7, 8, 1, 3, 10, 4, 5, 6])
        factor_bd = Factor(['B', 'D'], [3, 2], [5, 7, 2, 1, 9, 3])
        self.markov_model.add_factors(factor_ab, factor_cb, factor_bd)

        self.gibbs = GibbsSampling(self.bayesian_model)

    def tearDown(self):
        del self.bayesian_model
        del self.markov_model

    @patch('pgmpy.inference.Sampling.GibbsSampling._get_kernel_from_bayesian_model', autospec=True)
    @patch('pgmpy.models.MarkovChain.__init__', autospec=True)
    def test_init_bayesian_model(self, init, get_kernel):
        """Constructing from a Bayesian model calls the Bayesian kernel builder."""
        model = MagicMock(spec_set=BayesianModel)
        gibbs = GibbsSampling(model)
        init.assert_called_once_with(gibbs)
        get_kernel.assert_called_once_with(gibbs, model)

    @patch('pgmpy.inference.Sampling.GibbsSampling._get_kernel_from_markov_model', autospec=True)
    def test_init_markov_model(self, get_kernel):
        """Constructing from a Markov model calls the Markov kernel builder."""
        model = MagicMock(spec_set=MarkovModel)
        gibbs = GibbsSampling(model)
        get_kernel.assert_called_once_with(gibbs, model)

    def test_get_kernel_from_bayesian_model(self):
        gibbs = GibbsSampling()
        gibbs._get_kernel_from_bayesian_model(self.bayesian_model)
        self.assertListEqual(list(gibbs.variables), self.bayesian_model.nodes())
        self.assertDictEqual(gibbs.cardinalities, {'diff': 2, 'intel': 2, 'grade': 3})

    def test_get_kernel_from_markov_model(self):
        gibbs = GibbsSampling()
        gibbs._get_kernel_from_markov_model(self.markov_model)
        self.assertListEqual(list(gibbs.variables), self.markov_model.nodes())
        self.assertDictEqual(gibbs.cardinalities, {'A': 2, 'B': 3, 'C': 4, 'D': 2})

    def test_sample(self):
        """Two samples over three variables, each value within its variable's state range."""
        start_state = [State('diff', 0), State('intel', 0), State('grade', 0)]
        sample = self.gibbs.sample(start_state, 2)
        # assertEqual replaces the deprecated assertEquals alias, which
        # recent Python versions no longer provide.
        self.assertEqual(len(sample), 2)
        self.assertEqual(len(sample.columns), 3)
        self.assertIn('diff', sample.columns)
        self.assertIn('intel', sample.columns)
        self.assertIn('grade', sample.columns)
        self.assertTrue(set(sample['diff']).issubset({0, 1}))
        self.assertTrue(set(sample['intel']).issubset({0, 1}))
        self.assertTrue(set(sample['grade']).issubset({0, 1, 2}))

    @patch("pgmpy.inference.Sampling.GibbsSampling.random_state", autospec=True)
    def test_sample_less_arg(self, random_state):
        """Without a start state, sample() must obtain one from random_state()."""
        self.gibbs.state = None
        random_state.return_value = [State('diff', 0), State('intel', 0), State('grade', 0)]
        sample = self.gibbs.sample(size=2)
        random_state.assert_called_once_with(self.gibbs)
        self.assertEqual(len(sample), 2)

    def test_generate_sample(self):
        start_state = [State('diff', 0), State('intel', 0), State('grade', 0)]
        gen = self.gibbs.generate_sample(start_state, 2)
        samples = [sample for sample in gen]
        self.assertEqual(len(samples), 2)
        self.assertEqual({samples[0][0].var, samples[0][1].var, samples[0][2].var}, {'diff', 'intel', 'grade'})
        self.assertEqual({samples[1][0].var, samples[1][1].var, samples[1][2].var}, {'diff', 'intel', 'grade'})

    @patch("pgmpy.inference.Sampling.GibbsSampling.random_state", autospec=True)
    def test_generate_sample_less_arg(self, random_state):
        """Without a start state, generate_sample() must obtain one from random_state()."""
        self.gibbs.state = None
        gen = self.gibbs.generate_sample(size=2)
        samples = [sample for sample in gen]
        random_state.assert_called_once_with(self.gibbs)
        self.assertEqual(len(samples), 2)
ax_temp.bar(x, z, zs=y, zdir='y', alpha=0.6, color='r' * 4) ax_temp.set_xlabel('X') ax_temp.set_ylabel('Y') ax_temp.set_zlabel('Z') ax_temp.title.set_text(('Feature ' + str(mean_indices[counter]))) counter += 1 plt.show() # Learning naive bayes model from various subsets of data naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2]) naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 4]) naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 3, 4, 5]) # Splitting train and test data for PGM model temp_data = pd.concat([all_city_data, pd.DataFrame(all_city_label, columns=[13])], axis=1) pgm_train_set = temp_data.loc[0:700] pgm_test_set = temp_data.loc[700:] print(pgm_train_set) # Implementing PGM model on data # Using these features: 0: (age) 1: (sex) 2: (cp) pgm_model = BayesianModel() pgm_model.add_nodes_from([0, 1, 2, 13]) pgm_model.add_edges_from([(1, 13)]) pgm_model.fit(pgm_train_set.loc[:, [0, 1, 2, 13]]) pgm_test_set = pgm_test_set.loc[:, [0, 1, 2, 13]].drop(13, axis=1) print(pgm_test_set) print(pgm_model.get_cpds(13))
class TestBayesianModelFitPredict(unittest.TestCase):
    """Tests for ``BayesianModel.fit``, ``predict`` and ``predict_probability``."""

    def setUp(self):
        """Create the models and data frames shared by the tests below."""
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])

        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'),
                                              ('C', 'D'), ('B', 'E')])

        self.model2 = BayesianModel([('A', 'C'), ('B', 'C')])
        self.data1 = pd.DataFrame(data={'A': [0, 0, 1],
                                        'B': [0, 1, 0],
                                        'C': [1, 1, 0]})
        # ``np.nan`` is the canonical name; the ``np.NaN`` alias no longer
        # exists in NumPy 2.0.
        self.data2 = pd.DataFrame(data={'A': [0, np.nan, 1],
                                        'B': [0, 1, 0],
                                        'C': [1, 1, np.nan],
                                        'D': [np.nan, 'Y', np.nan]})

        # data_link - "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            'pgmpy/tests/test_estimators/testdata/titanic_train.csv', dtype=str)
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]

    def test_bayesian_fit(self):
        """Fit with ``BayesianEstimator`` and dirichlet pseudo counts [9, 3]."""
        self.model2.fit(self.data1, estimator=BayesianEstimator,
                        prior_type="dirichlet", pseudo_counts=[9, 3])
        # data1 has B == 0 twice and B == 1 once; the expected CPD is those
        # counts plus the pseudo counts, normalised: 11/15 and 4/15.
        self.assertEqual(self.model2.get_cpds('B'),
                         TabularCPD('B', 2, [[11.0 / 15], [4.0 / 15]]))

    def test_fit_missing_data(self):
        """Fit on a frame containing NaN with ``complete_samples_only=False``."""
        self.model2.fit(self.data2, state_names={'C': [0, 1]},
                        complete_samples_only=False)
        cpds = {
            TabularCPD('A', 2, [[0.5], [0.5]]),
            TabularCPD('B', 2, [[2. / 3], [1. / 3]]),
            TabularCPD('C', 2,
                       [[0, 0.5, 0.5, 0.5],
                        [1, 0.5, 0.5, 0.5]],
                       evidence=['A', 'B'], evidence_card=[2, 2]),
        }
        self.assertSetEqual(cpds, set(self.model2.get_cpds()))

    def test_disconnected_fit(self):
        """With no edges, each CPD must equal the column's relative frequencies."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            # ``.loc`` replaces the ``.ix`` indexer, which pandas 1.0 removed;
            # the columns are string labels so the selection is identical.
            counts = values.loc[:, node].value_counts()
            value = counts / counts.sum()
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_predict(self):
        """Predict each Titanic column in turn from the remaining two."""
        titanic = BayesianModel()
        titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
        titanic.fit(self.titanic_data2[500:])

        p1 = titanic.predict(self.titanic_data2[["Sex", "Pclass"]][:30])
        p2 = titanic.predict(self.titanic_data2[["Survived", "Pclass"]][:30])
        p3 = titanic.predict(self.titanic_data2[["Survived", "Sex"]][:30])

        p1_res = np.array([
            '0', '1', '0', '1', '0', '0', '0', '0', '0', '1',
            '0', '1', '0', '0', '0', '1', '0', '0', '0', '0',
            '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'])
        p2_res = np.array([
            'male', 'female', 'female', 'female', 'male',
            'male', 'male', 'male', 'female', 'female',
            'female', 'female', 'male', 'male', 'male',
            'female', 'male', 'female', 'male', 'female',
            'male', 'female', 'female', 'female', 'male',
            'female', 'male', 'male', 'female', 'male'])
        p3_res = np.array([
            '3', '1', '1', '1', '3', '3', '3', '3', '1', '1',
            '1', '1', '3', '3', '3', '1', '3', '1', '3', '1',
            '3', '1', '1', '1', '3', '1', '3', '3', '1', '3'])

        np_test.assert_array_equal(p1.values.ravel(), p1_res)
        np_test.assert_array_equal(p2.values.ravel(), p2_res)
        np_test.assert_array_equal(p3.values.ravel(), p3_res)

    def test_connected_predict(self):
        """``predict`` rejects complete rows and predicts 'E' once it is dropped."""
        np.random.seed(42)
        values = pd.DataFrame(
            np.array(np.random.randint(low=0, high=2, size=(1000, 5)), dtype=str),
            columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)

        # Every variable is still present, so there is nothing to predict.
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)

        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(
            e_predict.values.ravel(),
            np.array([
                1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
                0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
                1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
                1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
                1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
                1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0],
                dtype=str))

    def test_connected_predict_probability(self):
        """Check the probabilities predicted for 'E' on 20 held-out rows."""
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(100, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:80]
        predict_data = values[80:].copy()
        self.model_connected.fit(fit_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_prob = self.model_connected.predict_probability(predict_data)

        # Flattened result: one pair of probabilities per predicted row.
        np_test.assert_allclose(
            e_prob.values.ravel(),
            np.array([
                0.57894737, 0.42105263, 0.57894737, 0.42105263,
                0.57894737, 0.42105263, 0.5, 0.5,
                0.57894737, 0.42105263, 0.5, 0.5,
                0.57894737, 0.42105263, 0.57894737, 0.42105263,
                0.57894737, 0.42105263, 0.5, 0.5,
                0.57894737, 0.42105263, 0.57894737, 0.42105263,
                0.5, 0.5, 0.57894737, 0.42105263,
                0.57894737, 0.42105263, 0.5, 0.5,
                0.57894737, 0.42105263, 0.5, 0.5,
                0.5, 0.5, 0.5, 0.5]),
            atol=0)

    def test_predict_probability_errors(self):
        """``predict_probability`` raises ``ValueError`` on unsuitable frames."""
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(2, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:1]
        predict_data = values[1:].copy()
        self.model_connected.fit(fit_data)

        # All model variables are present in the frame.
        self.assertRaises(ValueError,
                          self.model_connected.predict_probability, predict_data)

        # The frame carries a column 'F' that is not a node of the model.
        predict_data = pd.DataFrame(np.random.randint(low=0, high=2, size=(1, 5)),
                                    columns=['A', 'B', 'C', 'F', 'E'])[:]
        self.assertRaises(ValueError,
                          self.model_connected.predict_probability, predict_data)

    def tearDown(self):
        """Drop the models built in ``setUp``."""
        del self.model_connected
        del self.model_disconnected
class TestBaseModelCreation(unittest.TestCase):
    """Tests for building a ``BayesianModel`` from nodes and edges.

    ``nodes()``, ``edges()`` and ``predecessors()`` results are wrapped in
    ``list(...)`` before being handed to ``assertListEqual``. That assertion
    insists on real ``list`` operands, and newer networkx releases return
    views/iterators instead of lists. The wrapping is a no-op on releases
    that already return lists.
    """

    def setUp(self):
        """Start every test from an empty model."""
        self.G = BayesianModel()

    def test_class_init_without_data(self):
        """An empty model is a networkx ``DiGraph``."""
        self.assertIsInstance(self.G, nx.DiGraph)

    def test_class_init_with_data_string(self):
        """Constructing from string edges registers the nodes and the edges."""
        self.g = BayesianModel([('a', 'b'), ('b', 'c')])
        self.assertListEqual(sorted(self.g.nodes()), ['a', 'b', 'c'])
        self.assertListEqual(hf.recursive_sorted(self.g.edges()),
                             [['a', 'b'], ['b', 'c']])

    def test_class_init_with_data_nonstring(self):
        """Non-string node labels are accepted by the constructor."""
        BayesianModel([(1, 2), (2, 3)])

    def test_add_node_string(self):
        """``add_node`` with a string label."""
        self.G.add_node('a')
        self.assertListEqual(list(self.G.nodes()), ['a'])

    def test_add_node_nonstring(self):
        """``add_node`` with a non-string label must not raise."""
        self.G.add_node(1)

    def test_add_nodes_from_string(self):
        """``add_nodes_from`` with string labels."""
        self.G.add_nodes_from(['a', 'b', 'c', 'd'])
        self.assertListEqual(sorted(self.G.nodes()), ['a', 'b', 'c', 'd'])

    def test_add_nodes_from_non_string(self):
        """``add_nodes_from`` with non-string labels must not raise."""
        self.G.add_nodes_from([1, 2, 3, 4])

    def test_add_edge_string(self):
        """``add_edge`` creates missing end points and keeps earlier edges."""
        self.G.add_edge('d', 'e')
        self.assertListEqual(sorted(self.G.nodes()), ['d', 'e'])
        self.assertListEqual(list(self.G.edges()), [('d', 'e')])
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edge('a', 'b')
        self.assertListEqual(hf.recursive_sorted(self.G.edges()),
                             [['a', 'b'], ['d', 'e']])

    def test_add_edge_nonstring(self):
        """``add_edge`` with non-string labels must not raise."""
        self.G.add_edge(1, 2)

    def test_add_edge_selfloop(self):
        """A self loop is rejected with ``ValueError``."""
        self.assertRaises(ValueError, self.G.add_edge, 'a', 'a')

    def test_add_edge_result_cycle(self):
        """An edge that would close a cycle is rejected with ``ValueError``."""
        self.G.add_edges_from([('a', 'b'), ('a', 'c')])
        self.assertRaises(ValueError, self.G.add_edge, 'c', 'a')

    def test_add_edges_from_string(self):
        """``add_edges_from`` accumulates nodes and edges across calls."""
        self.G.add_edges_from([('a', 'b'), ('b', 'c')])
        self.assertListEqual(sorted(self.G.nodes()), ['a', 'b', 'c'])
        self.assertListEqual(hf.recursive_sorted(self.G.edges()),
                             [['a', 'b'], ['b', 'c']])
        self.G.add_nodes_from(['d', 'e', 'f'])
        self.G.add_edges_from([('d', 'e'), ('e', 'f')])
        self.assertListEqual(sorted(self.G.nodes()),
                             ['a', 'b', 'c', 'd', 'e', 'f'])
        self.assertListEqual(
            hf.recursive_sorted(self.G.edges()),
            hf.recursive_sorted([('a', 'b'), ('b', 'c'),
                                 ('d', 'e'), ('e', 'f')]))

    def test_add_edges_from_nonstring(self):
        """``add_edges_from`` with non-string labels must not raise."""
        self.G.add_edges_from([(1, 2), (2, 3)])

    def test_add_edges_from_self_loop(self):
        """A self loop inside the edge list is rejected with ``ValueError``."""
        self.assertRaises(ValueError, self.G.add_edges_from, [('a', 'a')])

    def test_add_edges_from_result_cycle(self):
        """An edge list that forms a cycle is rejected with ``ValueError``."""
        self.assertRaises(ValueError, self.G.add_edges_from,
                          [('a', 'b'), ('b', 'c'), ('c', 'a')])

    def test_update_node_parents_bm_constructor(self):
        """Parents are correct for a model built through the constructor."""
        self.g = BayesianModel([('a', 'b'), ('b', 'c')])
        self.assertListEqual(list(self.g.predecessors('a')), [])
        self.assertListEqual(list(self.g.predecessors('b')), ['a'])
        self.assertListEqual(list(self.g.predecessors('c')), ['b'])

    def test_update_node_parents(self):
        """Parents are correct for a model built incrementally."""
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edges_from([('a', 'b'), ('b', 'c')])
        self.assertListEqual(list(self.G.predecessors('a')), [])
        self.assertListEqual(list(self.G.predecessors('b')), ['a'])
        self.assertListEqual(list(self.G.predecessors('c')), ['b'])

    def tearDown(self):
        """Discard the per-test model."""
        del self.G
class TestBaseEstimator(unittest.TestCase):
    """Tests for ``HillClimbSearch`` (legal operations and ``estimate``)."""

    def setUp(self):
        """Build estimators over a random frame and over the Titanic data."""
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                      columns=list('AB'))
        # Column C duplicates B, so the two are perfectly dependent.
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = HillClimbSearch(self.rand_data,
                                        scoring_method=K2Score(self.rand_data))

        self.model1 = BayesianModel()
        self.model1.add_nodes_from(['A', 'B', 'C'])
        self.model2 = self.model1.copy()
        self.model2.add_edge('A', 'B')

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            'pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data1 = self.titanic_data[
            ["Survived", "Sex", "Pclass", "Age", "Embarked"]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)

    def test_legal_operations(self):
        """The set of operations offered for ``model2`` matches the reference."""
        model2_legal_ops = list(self.est_rand._legal_operations(self.model2))
        # The data is random, so only the operations are compared; the scores
        # are kept purely as an illustration of one run.
        model2_legal_ops_ref = [
            (('+', ('C', 'A')), -28.15602208305154),
            (('+', ('A', 'C')), -28.155467430966382),
            (('+', ('C', 'B')), 7636.947544933631),
            (('+', ('B', 'C')), 7937.805375579936),
            (('-', ('A', 'B')), 28.155467430966382),
            (('flip', ('A', 'B')), -0.0005546520851567038),
        ]
        self.assertSetEqual({op for op, _ in model2_legal_ops},
                            {op for op, _ in model2_legal_ops_ref})

    def test_legal_operations_titanic(self):
        """``tabu_list`` and ``max_indegree`` restrict the offered operations."""
        est = self.est_titanic1
        start_model = BayesianModel([("Survived", "Sex"),
                                     ("Pclass", "Age"),
                                     ("Pclass", "Embarked")])

        legal_ops = est._legal_operations(start_model)
        self.assertEqual(len(list(legal_ops)), 20)

        tabu_list = [('-', ("Survived", "Sex")),
                     ('-', ("Survived", "Pclass")),
                     ('flip', ("Age", "Pclass"))]
        legal_ops_tabu = est._legal_operations(start_model, tabu_list=tabu_list)
        self.assertEqual(len(list(legal_ops_tabu)), 18)

        legal_ops_indegree = est._legal_operations(start_model, max_indegree=1)
        self.assertEqual(len(list(legal_ops_indegree)), 11)

        legal_ops_both = dict(est._legal_operations(start_model,
                                                    tabu_list=tabu_list,
                                                    max_indegree=1))
        legal_ops_both_ref = {
            ('+', ('Embarked', 'Survived')): 10.050632580087608,
            ('+', ('Survived', 'Pclass')): 41.88868046549101,
            ('+', ('Age', 'Survived')): -23.635716036430836,
            ('+', ('Pclass', 'Survived')): 41.81314459373226,
            ('+', ('Sex', 'Pclass')): 4.772261678792802,
            ('-', ('Pclass', 'Age')): 11.546515590731815,
            ('-', ('Pclass', 'Embarked')): -32.171482832532774,
            ('flip', ('Pclass', 'Embarked')): 3.3563814191281836,
            ('flip', ('Survived', 'Sex')): 0.039737027979640516,
        }
        # Operations must match exactly. Score deltas are floats, so they are
        # compared with a tolerance rather than bit-for-bit.
        self.assertSetEqual(set(legal_ops_both), set(legal_ops_both_ref))
        for operation, expected_delta in legal_ops_both_ref.items():
            self.assertAlmostEqual(legal_ops_both[operation], expected_delta)

    def test_estimate_rand(self):
        """Only the B/C dependency (in either direction) should be learned."""
        acceptable = [[('B', 'C')], [('C', 'B')]]

        est1 = self.est_rand.estimate()
        self.assertSetEqual(set(est1.nodes()), {'A', 'B', 'C'})
        # ``list(...)`` keeps the comparison meaningful when ``edges()``
        # returns a view instead of a list.
        self.assertIn(list(est1.edges()), acceptable)

        est2 = self.est_rand.estimate(
            start=BayesianModel([('A', 'B'), ('A', 'C')]))
        self.assertIn(list(est2.edges()), acceptable)

    def test_estimate_titanic(self):
        """The structure learned from the three Titanic columns is fixed."""
        self.assertSetEqual(set(self.est_titanic2.estimate().edges()),
                            {('Survived', 'Pclass'),
                             ('Sex', 'Pclass'),
                             ('Sex', 'Survived')})

    def tearDown(self):
        """Release everything created in ``setUp``."""
        del self.rand_data
        del self.est_rand
        del self.model1
        del self.model2
        del self.titanic_data
        del self.titanic_data1
        del self.titanic_data2
        del self.est_titanic1
        del self.est_titanic2
from pgmpy.models import BayesianModel from pgmpy.factors import TabularCPD # Creating the above bayesian network model = BayesianModel() model.add_nodes_from(['Rain', 'TrafficJam']) model.add_edge('Rain', 'TrafficJam') model.add_edge('Accident', 'TrafficJam') cpd_rain = TabularCPD('Rain', 2, [[0.4], [0.6]]) cpd_accident = TabularCPD('Accident', 2, [[0.2], [0.8]]) cpd_traffic_jam = TabularCPD('TrafficJam', 2, [[0.9, 0.6, 0.7, 0.1], [0.1, 0.4, 0.3, 0.9]], evidence=['Rain', 'Accident'], evidence_card=[2, 2]) model.add_cpds(cpd_rain, cpd_accident, cpd_traffic_jam) model.add_node('LongQueues') model.add_edge('TrafficJam', 'LongQueues') cpd_long_queues = TabularCPD('LongQueues', 2, [[0.9, 0.2], [0.1, 0.8]], evidence=['TrafficJam'], evidence_card=[2]) model.add_cpds(cpd_long_queues) model.add_nodes_from(['GettingUpLate', 'LateForSchool']) model.add_edges_from([('GettingUpLate', 'LateForSchool'), ('TrafficJam', 'LateForSchool')]) cpd_getting_up_late = TabularCPD('GettingUpLate', 2, [[0.6], [0.4]]) cpd_late_for_school = TabularCPD('LateForSchool', 2, [[0.9, 0.45, 0.8, 0.1], [0.1, 0.55, 0.2, 0.9]],
class TestBaseModelCreation(unittest.TestCase):
    """Checks node and edge bookkeeping of ``BayesianModel``.

    Graph accessors (``nodes()``, ``edges()``, ``predecessors()``) are
    materialised with ``list(...)`` wherever the result is passed to
    ``assertListEqual``. That assertion fails on non-list operands, and
    networkx 2.x returns views/iterators there. On networkx 1.x the extra
    ``list`` call just copies the list.
    """

    def setUp(self):
        """Create the empty model each test mutates."""
        self.G = BayesianModel()

    def test_class_init_without_data(self):
        """The model type derives from ``nx.DiGraph``."""
        self.assertIsInstance(self.G, nx.DiGraph)

    def test_class_init_with_data_string(self):
        """Edges passed to the constructor show up as nodes and edges."""
        self.g = BayesianModel([('a', 'b'), ('b', 'c')])
        self.assertListEqual(sorted(self.g.nodes()), ['a', 'b', 'c'])
        self.assertListEqual(hf.recursive_sorted(self.g.edges()),
                             [['a', 'b'], ['b', 'c']])

    def test_class_init_with_data_nonstring(self):
        """Integer labels are allowed in the constructor."""
        BayesianModel([(1, 2), (2, 3)])

    def test_add_node_string(self):
        """Adding one string node yields exactly that node."""
        self.G.add_node('a')
        self.assertListEqual(list(self.G.nodes()), ['a'])

    def test_add_node_nonstring(self):
        """Adding an integer node does not raise."""
        self.G.add_node(1)

    def test_add_nodes_from_string(self):
        """Adding several string nodes at once."""
        self.G.add_nodes_from(['a', 'b', 'c', 'd'])
        self.assertListEqual(sorted(self.G.nodes()), ['a', 'b', 'c', 'd'])

    def test_add_nodes_from_non_string(self):
        """Adding several integer nodes does not raise."""
        self.G.add_nodes_from([1, 2, 3, 4])

    def test_add_edge_string(self):
        """Edges add their end points; later additions keep earlier edges."""
        self.G.add_edge('d', 'e')
        self.assertListEqual(sorted(self.G.nodes()), ['d', 'e'])
        self.assertListEqual(list(self.G.edges()), [('d', 'e')])
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edge('a', 'b')
        self.assertListEqual(hf.recursive_sorted(self.G.edges()),
                             [['a', 'b'], ['d', 'e']])

    def test_add_edge_nonstring(self):
        """Adding an edge between integer nodes does not raise."""
        self.G.add_edge(1, 2)

    def test_add_edge_selfloop(self):
        """``add_edge`` raises ``ValueError`` for a self loop."""
        self.assertRaises(ValueError, self.G.add_edge, 'a', 'a')

    def test_add_edge_result_cycle(self):
        """``add_edge`` raises ``ValueError`` when the edge closes a cycle."""
        self.G.add_edges_from([('a', 'b'), ('a', 'c')])
        self.assertRaises(ValueError, self.G.add_edge, 'c', 'a')

    def test_add_edges_from_string(self):
        """Two successive ``add_edges_from`` calls build the union graph."""
        self.G.add_edges_from([('a', 'b'), ('b', 'c')])
        self.assertListEqual(sorted(self.G.nodes()), ['a', 'b', 'c'])
        self.assertListEqual(hf.recursive_sorted(self.G.edges()),
                             [['a', 'b'], ['b', 'c']])
        self.G.add_nodes_from(['d', 'e', 'f'])
        self.G.add_edges_from([('d', 'e'), ('e', 'f')])
        self.assertListEqual(sorted(self.G.nodes()),
                             ['a', 'b', 'c', 'd', 'e', 'f'])
        self.assertListEqual(hf.recursive_sorted(self.G.edges()),
                             hf.recursive_sorted([('a', 'b'), ('b', 'c'),
                                                  ('d', 'e'), ('e', 'f')]))

    def test_add_edges_from_nonstring(self):
        """Adding integer-labelled edges does not raise."""
        self.G.add_edges_from([(1, 2), (2, 3)])

    def test_add_edges_from_self_loop(self):
        """``add_edges_from`` raises ``ValueError`` for a self loop."""
        self.assertRaises(ValueError, self.G.add_edges_from, [('a', 'a')])

    def test_add_edges_from_result_cycle(self):
        """``add_edges_from`` raises ``ValueError`` for a cyclic edge list."""
        self.assertRaises(ValueError, self.G.add_edges_from,
                          [('a', 'b'), ('b', 'c'), ('c', 'a')])

    def test_update_node_parents_bm_constructor(self):
        """Predecessors of each node after constructing from an edge list."""
        self.g = BayesianModel([('a', 'b'), ('b', 'c')])
        self.assertListEqual(list(self.g.predecessors('a')), [])
        self.assertListEqual(list(self.g.predecessors('b')), ['a'])
        self.assertListEqual(list(self.g.predecessors('c')), ['b'])

    def test_update_node_parents(self):
        """Predecessors of each node after adding nodes, then edges."""
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edges_from([('a', 'b'), ('b', 'c')])
        self.assertListEqual(list(self.G.predecessors('a')), [])
        self.assertListEqual(list(self.G.predecessors('b')), ['a'])
        self.assertListEqual(list(self.G.predecessors('c')), ['b'])

    def tearDown(self):
        """Remove the model created in ``setUp``."""
        del self.G
def pdag_to_dag(pdag):
    """Orient a PDAG into a DAG without introducing new v-structures.

    The already directed edges of `pdag` are copied first. Then nodes are
    repeatedly peeled off `pdag`: a node qualifies when it has no directed
    outgoing edge and its undirected neighbors are connected to all of its
    other incoming neighbors. All edges still touching such a node are
    oriented towards it. If no node qualifies, no faithful extension exists;
    a warning is issued and the remaining edges are oriented arbitrarily,
    skipping any edge that `BayesianModel.add_edge` rejects.

    This is a static method.

    Parameters
    ----------
    pdag: DirectedGraph
        A directed acyclic graph pattern, consisting in (acyclic) directed
        edges as well as "undirected" edges, represented as both-way edges
        between nodes. The argument itself is not modified; a copy is used.

    Returns
    -------
    dag: BayesianModel
        A faithful orientation of pdag, if one exists. Otherwise any
        fully orientated DAG/BayesianModel with the structure of pdag.

    References
    ----------
    [1] Chickering, Learning Equivalence Classes of Bayesian-Network Structures,
        2002; See page 454 (last paragraph) for the algorithm pdag_to_dag
        http://www.jmlr.org/papers/volume2/chickering02a/chickering02a.pdf
    [2] Dor & Tarsi, A simple algorithm to construct a consistent extension
        of a partially oriented graph, 1992,
        http://ftp.cs.ucla.edu/pub/stat_ser/r185-dor-tarsi.pdf

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from pgmpy.base import DirectedGraph
    >>> from pgmpy.estimators import ConstraintBasedEstimator
    >>> data = pd.DataFrame(np.random.randint(0, 4, size=(5000, 3)), columns=list('ABD'))
    >>> data['C'] = data['A'] - data['B']
    >>> data['D'] += data['A']
    >>> c = ConstraintBasedEstimator(data)
    >>> pdag = c.skeleton_to_pdag(*c.estimate_skeleton())
    >>> pdag.edges()
    [('B', 'C'), ('D', 'A'), ('A', 'D'), ('A', 'C')]
    >>> c.pdag_to_dag(pdag).edges()
    [('B', 'C'), ('A', 'D'), ('A', 'C')]

    >>> # pdag_to_dag is static:
    ... pdag1 = DirectedGraph([('A', 'B'), ('C', 'B'), ('C', 'D'), ('D', 'C'), ('D', 'A'), ('A', 'D')])
    >>> ConstraintBasedEstimator.pdag_to_dag(pdag1).edges()
    [('D', 'C'), ('C', 'B'), ('A', 'B'), ('A', 'D')]

    >>> # example of a pdag with no faithful extension:
    ... pdag2 = DirectedGraph([('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')])
    >>> ConstraintBasedEstimator.pdag_to_dag(pdag2).edges()
    UserWarning: PDAG has no faithful extension (= no oriented DAG with the same v-structures as PDAG).
    Remaining undirected PDAG edges oriented arbitrarily.
    [('B', 'C'), ('A', 'B'), ('A', 'C')]
    """
    # Work on a copy: nodes are removed from it as they get resolved.
    pdag = pdag.copy()
    dag = BayesianModel()
    dag.add_nodes_from(pdag.nodes())

    # An edge without its reverse counterpart is already directed.
    for X, Y in pdag.edges():
        if not pdag.has_edge(Y, X):
            dag.add_edge(X, Y)

    while pdag.number_of_nodes() > 0:
        for X in pdag.nodes():
            outgoing = set(pdag.successors(X))
            # Kept as a list so edges are added to `dag` in graph order.
            incoming = list(pdag.predecessors(X))
            incoming_set = set(incoming)

            # (1) X must not have any purely directed outgoing edge.
            if outgoing - incoming_set:
                continue

            # (2) Every undirected neighbor must be linked to every other
            #     incoming neighbor of X.
            undirected = outgoing & incoming_set
            if undirected and not all(pdag.has_edge(Y, Z)
                                      for Z in incoming
                                      for Y in undirected
                                      if Y != Z):
                continue

            # X qualifies: point all of its remaining edges at X, drop it
            # from the working graph and rescan what is left.
            for Y in incoming:
                dag.add_edge(Y, X)
            pdag.remove_node(X)
            break
        else:
            # No node qualified in a full scan.
            warn("PDAG has no faithful extension (= no oriented DAG with the "
                 "same v-structures as PDAG). Remaining undirected PDAG edges "
                 "oriented arbitrarily.")
            for X, Y in pdag.edges():
                if not dag.has_edge(Y, X):
                    try:
                        dag.add_edge(X, Y)
                    except ValueError:
                        # Orientation rejected by the model; leave it out.
                        pass
            break

    return dag
# Bayesian network for students
from pgmpy.models import BayesianModel

# Variables of the student network.
student_nodes = ['difficulty', 'intelligence', 'grade', 'sat', 'letter']

# Directed dependencies as (parent, child) pairs.
student_edges = [
    ('difficulty', 'grade'),
    ('intelligence', 'grade'),
    ('intelligence', 'sat'),
    ('grade', 'letter'),
]

model = BayesianModel()

# Add nodes
model.add_nodes_from(student_nodes)
print(model.nodes())

# Add edges
model.add_edges_from(student_edges)
print(model.edges())