def estimate_cpd(self, node):
    """
    Method to estimate the CPD for a given variable.

    The counts returned by ``self.state_counts(node)`` are turned into a
    ``TabularCPD`` and normalized. A column whose counts are all zero (a
    configuration of the parents' states that was never observed) is filled
    with ones first, so that it normalizes to a uniform distribution.

    Parameters
    ----------
    node: int, string (any hashable python object)
        The name of the variable for which the CPD is to be estimated.

    Returns
    -------
    CPD: TabularCPD

    Examples
    --------
    >>> import pandas as pd
    >>> from pgmpy.models import BayesianModel
    >>> from pgmpy.estimators import MaximumLikelihoodEstimator
    >>> data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
    >>> model = BayesianModel([('A', 'C'), ('B', 'C')])
    >>> cpd_A = MaximumLikelihoodEstimator(model, data).estimate_cpd('A')
    >>> print(cpd_A)
    ╒══════╤══════════╕
    │ A(0) │ 0.666667 │
    ├──────┼──────────┤
    │ A(1) │ 0.333333 │
    ╘══════╧══════════╛
    >>> cpd_C = MaximumLikelihoodEstimator(model, data).estimate_cpd('C')
    >>> print(cpd_C)
    ╒══════╤══════╤══════╤══════╤══════╕
    │ A    │ A(0) │ A(0) │ A(1) │ A(1) │
    ├──────┼──────┼──────┼──────┼──────┤
    │ B    │ B(0) │ B(1) │ B(0) │ B(1) │
    ├──────┼──────┼──────┼──────┼──────┤
    │ C(0) │ 0.0  │ 0.0  │ 1.0  │ 0.5  │
    ├──────┼──────┼──────┼──────┼──────┤
    │ C(1) │ 1.0  │ 1.0  │ 0.0  │ 0.5  │
    ╘══════╧══════╧══════╧══════╧══════╛
    """
    # presumably a pandas DataFrame with one row per state of `node` and one
    # column per configuration of the parents' states -- verify against
    # `state_counts`.
    state_counts = self.state_counts(node)

    # if a column contains only `0`s (no states observed for some configuration
    # of parents' states) fill that column uniformly instead.
    # `.loc` is used because the `.ix` indexer was deprecated in pandas 0.20
    # and removed in pandas 1.0.
    unobserved_columns = (state_counts == 0).all()
    state_counts.loc[:, unobserved_columns] = 1

    # Parents are sorted so that the evidence order is deterministic.
    parents = sorted(self.model.get_parents(node))
    parents_cardinalities = [len(self.state_names[parent]) for parent in parents]
    node_cardinality = len(self.state_names[node])

    cpd = TabularCPD(node, node_cardinality, np.array(state_counts),
                     evidence=parents,
                     evidence_card=parents_cardinalities,
                     state_names=self.state_names)
    # Turn the raw counts into conditional probabilities (in place).
    cpd.normalize()
    return cpd
def estimate_cpd(self, node):
    """
    Method to estimate the CPD for a given variable.

    The counts returned by ``self.state_counts(node)`` are turned into a
    ``TabularCPD`` and normalized. A column whose counts are all zero (a
    configuration of the parents' states that was never observed) is filled
    with ones first, so that it normalizes to a uniform distribution.

    Parameters
    ----------
    node: int, string (any hashable python object)
        The name of the variable for which the CPD is to be estimated.

    Returns
    -------
    CPD: TabularCPD

    Examples
    --------
    >>> import pandas as pd
    >>> from pgmpy.models import BayesianModel
    >>> from pgmpy.estimators import MaximumLikelihoodEstimator
    >>> data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
    >>> model = BayesianModel([('A', 'C'), ('B', 'C')])
    >>> cpd_A = MaximumLikelihoodEstimator(model, data).estimate_cpd('A')
    >>> print(cpd_A)
    ╒══════╤══════════╕
    │ A(0) │ 0.666667 │
    ├──────┼──────────┤
    │ A(1) │ 0.333333 │
    ╘══════╧══════════╛
    >>> cpd_C = MaximumLikelihoodEstimator(model, data).estimate_cpd('C')
    >>> print(cpd_C)
    ╒══════╤══════╤══════╤══════╤══════╕
    │ A    │ A(0) │ A(0) │ A(1) │ A(1) │
    ├──────┼──────┼──────┼──────┼──────┤
    │ B    │ B(0) │ B(1) │ B(0) │ B(1) │
    ├──────┼──────┼──────┼──────┼──────┤
    │ C(0) │ 0.0  │ 0.0  │ 1.0  │ 0.5  │
    ├──────┼──────┼──────┼──────┼──────┤
    │ C(1) │ 1.0  │ 1.0  │ 0.0  │ 0.5  │
    ╘══════╧══════╧══════╧══════╧══════╛
    """
    # presumably a pandas DataFrame with one row per state of `node` and one
    # column per configuration of the parents' states -- verify against
    # `state_counts`.
    state_counts = self.state_counts(node)

    # if a column contains only `0`s (no states observed for some configuration
    # of parents' states) fill that column uniformly instead.
    # `.loc` is used because the `.ix` indexer was deprecated in pandas 0.20
    # and removed in pandas 1.0.
    unobserved_columns = (state_counts == 0).all()
    state_counts.loc[:, unobserved_columns] = 1

    # Parents are sorted so that the evidence order is deterministic.
    parents = sorted(self.model.get_parents(node))
    parents_cardinalities = [len(self.state_names[parent]) for parent in parents]
    node_cardinality = len(self.state_names[node])

    cpd = TabularCPD(node, node_cardinality, np.array(state_counts),
                     evidence=parents,
                     evidence_card=parents_cardinalities,
                     state_names=self.state_names)
    # Turn the raw counts into conditional probabilities (in place).
    cpd.normalize()
    return cpd
def estimate_cpd(self, node):
    """
    Estimate the CPD of `node` from the counts given by `self.state_counts`.

    A column whose counts are all zero (a configuration of the parents'
    states that was never observed) is filled with ones before normalizing,
    so it ends up as a uniform distribution.

    Parameters
    ----------
    node: int, string (any hashable python object)
        The name of the variable for which the CPD is to be estimated.

    Returns
    -------
    CPD: TabularCPD
        The normalized CPD of `node`, with the sorted parents of `node` as
        evidence.
    """
    # presumably a pandas DataFrame with one row per state of `node` and one
    # column per configuration of the parents' states -- verify against
    # `state_counts`.
    state_counts = self.state_counts(node)

    # Replace all-zero columns by ones. `.loc` is used because the `.ix`
    # indexer was deprecated in pandas 0.20 and removed in pandas 1.0.
    unobserved_columns = (state_counts == 0).all()
    state_counts.loc[:, unobserved_columns] = 1

    # Parents are sorted so that the evidence order is deterministic.
    parents = sorted(self.model.get_parents(node))
    parents_cardinalities = [len(self.state_names[parent]) for parent in parents]
    node_cardinality = len(self.state_names[node])

    cpd = TabularCPD(node, node_cardinality, np.array(state_counts),
                     evidence=parents,
                     evidence_card=parents_cardinalities,
                     state_names=self.state_names)
    # Turn the raw counts into conditional probabilities (in place).
    cpd.normalize()
    return cpd
def compute_cpd(model, node, data, state_names):
    """
    Build a normalized TabularCPD for `node` from aggregated data.

    This is a similar function to pgmpy BayesianModel.fit()
    (https://github.com/pgmpy/pgmpy).

    Parameters
    ----------
    model:
        Object exposing ``get_parents(node)``.
    node:
        Name of the variable whose CPD is computed; must be a column of
        `data` and a key of `state_names`.
    data: pandas.DataFrame
        Table that is grouped by `node` and its parents and summed.
        NOTE(review): presumably it holds exactly one numeric value column
        (the original comment calls it 'counts') besides the variable
        columns -- confirm against the caller.
    state_names: dict
        Maps each variable name to the ordered list of its states.

    Returns
    -------
    TabularCPD
        CPD of `node` given its sorted parents, normalized in place.
    """
    node_states = state_names[node]
    node_cardinality = len(node_states)

    # Parents are sorted so that the evidence order is deterministic.
    parents = sorted(model.get_parents(node))
    parents_states = [state_names[parent] for parent in parents]
    parents_cardinalities = [len(states) for states in parents_states]

    # Only the state names of the variables involved are handed to TabularCPD.
    state_name = {node: node_states}
    state_name.update(zip(parents, parents_states))

    if parents:
        # One row per state of `node`, one column per parent configuration.
        state_value_data = data.groupby([node] + parents).sum().unstack(parents)
        # Drop the outer column level (the name of the summed value column).
        state_value_data = state_value_data.droplevel(0, axis=1)

        if len(parents) > 1:
            column_index = pd.MultiIndex.from_product(parents_states, names=parents)
        else:
            column_index = pd.Index(parents_states[0], name=parents[0])

        # Align rows and columns with the declared state order. Previously
        # the reindexed table was overwritten by the raw one (and a single
        # parent was never reindexed), so unobserved or differently ordered
        # states gave a table that did not match the declared cardinalities.
        # NOTE(review): configurations absent from `data` stay NaN here, as
        # they already do in the parent-less branch below -- confirm whether
        # they should be treated as zero counts.
        state_values = state_value_data.reindex(index=node_states,
                                                columns=column_index)
    else:
        state_values = data.groupby([node]).sum().reindex(node_states)

    cpd = TabularCPD(
        node,
        node_cardinality,
        state_values,
        evidence=parents,
        evidence_card=parents_cardinalities,
        state_names=state_name,
    )
    cpd.normalize()
    return cpd
def estimate_cpd(self, node, prior_type='BDeu', pseudo_counts=[], equivalent_sample_size=5):
    """
    Method to estimate the CPD for a given variable.

    Parameters
    ----------
    node: int, string (any hashable python object)
        The name of the variable for which the CPD is to be estimated.

    prior_type: 'dirichlet', 'BDeu', 'K2',
        string indicating which type of prior to use for the model parameters.
        - If 'prior_type' is 'dirichlet', the following must be provided:
            'pseudo_counts' = dirichlet hyperparameters; a list or dict
             with a "virtual" count for each variable state.
             The virtual counts are added to the actual state counts found in the data.
             (if a list is provided, a lexicographic ordering of states is assumed;
             if a dict is provided, it must be keyed by the states of the variable)
        - If 'prior_type' is 'BDeu', then an 'equivalent_sample_size'
            must be specified instead of 'pseudo_counts'. This is equivalent to
            'prior_type=dirichlet' and using uniform 'pseudo_counts' of
            `equivalent_sample_size/(node_cardinality*np.prod(parents_cardinalities))`.
        - A prior_type of 'K2' is a shorthand for 'dirichlet' + setting every pseudo_count to 1,
            regardless of the cardinality of the variable.

    Returns
    -------
    CPD: TabularCPD

    Raises
    ------
    ValueError
        If 'prior_type' is not one of the supported values, if
        'pseudo_counts' does not hold one entry per state of the variable,
        or if a 'pseudo_counts' dict lacks an entry for one of the states.

    Examples
    --------
    >>> import pandas as pd
    >>> from pgmpy.models import BayesianModel
    >>> from pgmpy.estimators import BayesianEstimator
    >>> data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
    >>> model = BayesianModel([('A', 'C'), ('B', 'C')])
    >>> estimator = BayesianEstimator(model, data)
    >>> cpd_C = estimator.estimate_cpd('C', prior_type="dirichlet", pseudo_counts=[1, 2])
    >>> print(cpd_C)
    ╒══════╤══════╤══════╤══════╤════════════════════╕
    │ A    │ A(0) │ A(0) │ A(1) │ A(1)               │
    ├──────┼──────┼──────┼──────┼────────────────────┤
    │ B    │ B(0) │ B(1) │ B(0) │ B(1)               │
    ├──────┼──────┼──────┼──────┼────────────────────┤
    │ C(0) │ 0.25 │ 0.25 │ 0.5  │ 0.3333333333333333 │
    ├──────┼──────┼──────┼──────┼────────────────────┤
    │ C(1) │ 0.75 │ 0.75 │ 0.5  │ 0.6666666666666666 │
    ╘══════╧══════╧══════╧══════╧════════════════════╛
    """
    # NOTE: the `pseudo_counts=[]` default is kept for interface
    # compatibility; it is only ever rebound below, never mutated in place.
    node_states = self.state_names[node]
    node_cardinality = len(node_states)
    parents = sorted(self.model.get_parents(node))
    parents_cardinalities = [len(self.state_names[parent]) for parent in parents]

    if prior_type == 'K2':
        pseudo_counts = [1] * node_cardinality
    elif prior_type == 'BDeu':
        # np.prod of an empty list is 1.0, so a node without parents is fine.
        alpha = float(equivalent_sample_size) / (node_cardinality * np.prod(parents_cardinalities))
        pseudo_counts = [alpha] * node_cardinality
    elif prior_type == 'dirichlet':
        if len(pseudo_counts) != node_cardinality:
            raise ValueError("'pseudo_counts' should have length {0}".format(node_cardinality))
        if isinstance(pseudo_counts, dict):
            # Order the counts by state. Sorting the dict *values* (as was
            # done before) ordered the counts by magnitude and could attach
            # them to the wrong states.
            try:
                pseudo_counts = [pseudo_counts[state] for state in node_states]
            except KeyError as missing:
                raise ValueError(
                    "'pseudo_counts' has no entry for state {0!r}".format(missing.args[0]))
    else:
        raise ValueError("'prior_type' not specified")

    state_counts = self.state_counts(node)
    # Add the i-th pseudo count to every entry of the i-th row (state of
    # `node`); the double transpose makes the list broadcast along rows.
    bayesian_counts = (state_counts.T + pseudo_counts).T

    cpd = TabularCPD(node, node_cardinality, np.array(bayesian_counts),
                     evidence=parents,
                     evidence_card=parents_cardinalities,
                     state_names=self.state_names)
    # Turn the smoothed counts into conditional probabilities (in place).
    cpd.normalize()
    return cpd
def estimate_cpd_dynamic(self, node, temp=1, prior_type='BDeu', pseudo_counts=[], equivalent_sample_size=5):
    """
    Method to estimate the CPD for a given variable.

    Parameters
    ----------
    node: int, string (any hashable python object)
        The name of the variable for which the CPD is to be estimated.

    temp: integer = 0 or 1
        that represents the time of the node in the Dynamic Bayesian Network

    prior_type: 'dirichlet', 'BDeu', 'K2',
        string indicating which type of prior to use for the model parameters.
        - If 'prior_type' is 'dirichlet', the following must be provided:
            'pseudo_counts' = dirichlet hyperparameters; a list or dict
             with a "virtual" count for each variable state.
             The virtual counts are added to the actual state counts found in the data.
             (if a list is provided, a lexicographic ordering of states is assumed;
             if a dict is provided, it must be keyed by the states of the variable)
        - If 'prior_type' is 'BDeu', then an 'equivalent_sample_size'
            must be specified instead of 'pseudo_counts'. This is equivalent to
            'prior_type=dirichlet' and using uniform 'pseudo_counts' of
            `equivalent_sample_size/(node_cardinality*np.prod(parents_cardinalities))`.
        - A prior_type of 'K2' is a shorthand for 'dirichlet' + setting every pseudo_count to 1,
            regardless of the cardinality of the variable.

    Returns
    -------
    CPD: TabularCPD

    Raises
    ------
    ValueError
        If 'prior_type' is not one of the supported values, if
        'pseudo_counts' does not hold one entry per state of the variable,
        or if a 'pseudo_counts' dict lacks an entry for one of the states.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from pgmpy.models import DynamicBayesianNetwork as DBN
    >>> from pgmpy.estimators import BayesianEstimator
    >>> data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
    >>> transitionModel = DBN()
    >>> labels = np.array(data.columns)
    >>> transitionModel.add_nodes_from(labels)
    >>> transitionModel.add_edge(('A',0), ('B',1))
    >>> estimator = BayesianEstimator(transitionModel, data)
    >>> cpd_B = estimator.estimate_cpd_dynamic('B', prior_type="dirichlet", pseudo_counts=[1, 2])
    >>> print(cpd_B)
    +----------+--------+----------------+
    | ('A', 0) | A_0(0) | A_0(1)         |
    +----------+--------+----------------+
    | B(0)     | 0.4    | 0.333333333333 |
    +----------+--------+----------------+
    | B(1)     | 0.6    | 0.666666666667 |
    +----------+--------+----------------+
    """
    # NOTE: the `pseudo_counts=[]` default is kept for interface
    # compatibility; it is only ever rebound below, never mutated in place.
    node_states = self.state_names[node]
    node_cardinality = len(node_states)

    # Each parent is unpacked below as a (name, time) pair.
    parents = sorted(self.model.get_parents_dynamic(self.model, node, temp))
    # The loop variable no longer reuses the name `temp`: that shadowed the
    # parameter (and overwrote it on Python 2, where comprehension
    # variables leak into the enclosing scope).
    parents_cardinalities = [
        len(self.state_names[parent_name]) for parent_name, _time_slice in parents
    ]

    if prior_type == 'K2':
        pseudo_counts = [1] * node_cardinality
    elif prior_type == 'BDeu':
        # np.prod of an empty list is 1.0, so a node without parents is fine.
        alpha = float(equivalent_sample_size) / (
            node_cardinality * np.prod(parents_cardinalities))
        pseudo_counts = [alpha] * node_cardinality
    elif prior_type == 'dirichlet':
        if len(pseudo_counts) != node_cardinality:
            raise ValueError(
                "'pseudo_counts' should have length {0}".format(
                    node_cardinality))
        if isinstance(pseudo_counts, dict):
            # Order the counts by state. Sorting the dict *values* (as was
            # done before) ordered the counts by magnitude and could attach
            # them to the wrong states.
            try:
                pseudo_counts = [pseudo_counts[state] for state in node_states]
            except KeyError as missing:
                raise ValueError(
                    "'pseudo_counts' has no entry for state {0!r}".format(missing.args[0]))
    else:
        raise ValueError("'prior_type' not specified")

    tData = self.calculate_t_data(node, parents)
    state_counts = self.state_counts_dynamic(node, parents, tData)
    # Add the i-th pseudo count to every entry of the i-th row (state of
    # `node`); the double transpose makes the list broadcast along rows.
    bayesian_counts = (state_counts.T + pseudo_counts).T

    cpd = TabularCPD(node,
                     node_cardinality,
                     np.array(bayesian_counts),
                     evidence=parents,
                     evidence_card=parents_cardinalities,
                     state_names=self.state_names)
    # Turn the smoothed counts into conditional probabilities (in place).
    cpd.normalize()
    return cpd
def estimate_cpd(self, node, prior_type='BDeu', pseudo_counts=[], equivalent_sample_size=5):
    """
    Method to estimate the CPD for a given variable.

    Parameters
    ----------
    node: int, string (any hashable python object)
        The name of the variable for which the CPD is to be estimated.

    prior_type: 'dirichlet', 'BDeu', 'K2',
        string indicating which type of prior to use for the model parameters.
        - If 'prior_type' is 'dirichlet', the following must be provided:
            'pseudo_counts' = dirichlet hyperparameters; a list or dict
             with a "virtual" count for each variable state.
             The virtual counts are added to the actual state counts found in the data.
             (if a list is provided, a lexicographic ordering of states is assumed;
             if a dict is provided, it must be keyed by the states of the variable)
        - If 'prior_type' is 'BDeu', then an 'equivalent_sample_size'
            must be specified instead of 'pseudo_counts'. This is equivalent to
            'prior_type=dirichlet' and using uniform 'pseudo_counts' of
            `equivalent_sample_size/(node_cardinality*np.prod(parents_cardinalities))`.
        - A prior_type of 'K2' is a shorthand for 'dirichlet' + setting every pseudo_count to 1,
            regardless of the cardinality of the variable.

    Returns
    -------
    CPD: TabularCPD

    Raises
    ------
    ValueError
        If 'prior_type' is not one of the supported values, if
        'pseudo_counts' does not hold one entry per state of the variable,
        or if a 'pseudo_counts' dict lacks an entry for one of the states.

    Examples
    --------
    >>> import pandas as pd
    >>> from pgmpy.models import BayesianModel
    >>> from pgmpy.estimators import BayesianEstimator
    >>> data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
    >>> model = BayesianModel([('A', 'C'), ('B', 'C')])
    >>> estimator = BayesianEstimator(model, data)
    >>> cpd_C = estimator.estimate_cpd('C', prior_type="dirichlet", pseudo_counts=[1, 2])
    >>> print(cpd_C)
    ╒══════╤══════╤══════╤══════╤════════════════════╕
    │ A    │ A(0) │ A(0) │ A(1) │ A(1)               │
    ├──────┼──────┼──────┼──────┼────────────────────┤
    │ B    │ B(0) │ B(1) │ B(0) │ B(1)               │
    ├──────┼──────┼──────┼──────┼────────────────────┤
    │ C(0) │ 0.25 │ 0.25 │ 0.5  │ 0.3333333333333333 │
    ├──────┼──────┼──────┼──────┼────────────────────┤
    │ C(1) │ 0.75 │ 0.75 │ 0.5  │ 0.6666666666666666 │
    ╘══════╧══════╧══════╧══════╧════════════════════╛
    """
    # NOTE: the `pseudo_counts=[]` default is kept for interface
    # compatibility; it is only ever rebound below, never mutated in place.
    node_states = self.state_names[node]
    node_cardinality = len(node_states)
    parents = sorted(self.model.get_parents(node))
    parents_cardinalities = [
        len(self.state_names[parent]) for parent in parents
    ]

    if prior_type == 'K2':
        pseudo_counts = [1] * node_cardinality
    elif prior_type == 'BDeu':
        # np.prod of an empty list is 1.0, so a node without parents is fine.
        alpha = float(equivalent_sample_size) / (
            node_cardinality * np.prod(parents_cardinalities))
        pseudo_counts = [alpha] * node_cardinality
    elif prior_type == 'dirichlet':
        if len(pseudo_counts) != node_cardinality:
            raise ValueError(
                "'pseudo_counts' should have length {0}".format(
                    node_cardinality))
        if isinstance(pseudo_counts, dict):
            # Order the counts by state. Sorting the dict *values* (as was
            # done before) ordered the counts by magnitude and could attach
            # them to the wrong states.
            try:
                pseudo_counts = [pseudo_counts[state] for state in node_states]
            except KeyError as missing:
                raise ValueError(
                    "'pseudo_counts' has no entry for state {0!r}".format(missing.args[0]))
    else:
        raise ValueError("'prior_type' not specified")

    state_counts = self.state_counts(node)
    # Add the i-th pseudo count to every entry of the i-th row (state of
    # `node`); the double transpose makes the list broadcast along rows.
    bayesian_counts = (state_counts.T + pseudo_counts).T

    cpd = TabularCPD(node,
                     node_cardinality,
                     np.array(bayesian_counts),
                     evidence=parents,
                     evidence_card=parents_cardinalities,
                     state_names=self.state_names)
    # Turn the smoothed counts into conditional probabilities (in place).
    cpd.normalize()
    return cpd
# Table for variable 'x1' (4 states) with evidence 'x6' (5 states):
# rows 1..4 of t8_array, first five columns.
cpd_x6x1 = TabularCPD(
    'x1', 4,
    [t8_array[_row, 0:5] for _row in range(1, 5)],
    ['x6'], [5])

# Single-row tables for 'x6' and 'x2', taken from the first column of
# t8_array and t4_array respectively.
cpd_x6 = TabularCPD('x6', 5, [t8_array[0:5, 0]])
cpd_x2 = TabularCPD('x2', 5, [t4_array[0:5, 0]])

# Table for variable 'x2' (5 states) with evidence 'x6' (5 states):
# rows 5..9 of t8_array, first five columns.
cpd_x6x2 = TabularCPD(
    'x2', 5,
    [t8_array[_row, 0:5] for _row in range(5, 10)],
    ['x6'], [5])

# Normalizing the CPDs (the positional `True` is passed straight through to
# `normalize`, exactly as before; the call order is unchanged).
# NOTE(review): cpd_x6x2 is built above but is not normalized here --
# confirm whether that is intentional or handled elsewhere.
for _cpd_to_normalize in (
        cpd_x1x2, cpd_x1x4, cpd_x1x6, cpd_x1,
        cpd_x2x5, cpd_x5x2, cpd_x2x3,
        cpd_x3, cpd_x3x2, cpd_x3x6,
        cpd_x6x4, cpd_x4x6, cpd_x4x1,
        cpd_x6x1, cpd_x6, cpd_x2):
    _cpd_to_normalize.normalize(True)