def estimate(self):
        """
        Pick the best-scoring network structure by brute force.

        Every candidate yielded by `self.all_dags()` is ranked with
        `self.scoring_method.score`; the highest-ranked one is copied into a new
        `BayesianModel` with its nodes and edges inserted in sorted order.
        Only the structure is estimated, no parametrization is performed.

        Returns
        -------
        model: `BayesianModel` instance
            A `BayesianModel` with maximal score.

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.estimators import ExhaustiveSearch
        >>> # create random data sample with 3 variables, where B and C are identical:
        >>> data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        >>> data['C'] = data['B']
        >>> est = ExhaustiveSearch(data)
        >>> best_model = est.estimate()
        >>> best_model
        <pgmpy.models.BayesianModel.BayesianModel object at 0x7f695c535470>
        >>> best_model.edges()
        [('B', 'C')]
        """
        candidates = self.all_dags()
        winner = max(candidates, key=self.scoring_method.score)

        # Copy the winning structure into a fresh model, sorted so that the
        # insertion order does not depend on the search order.
        model = BayesianModel()
        model.add_nodes_from(sorted(winner.nodes()))
        model.add_edges_from(sorted(winner.edges()))
        return model
    def test_predict(self):
        """
        Fit a network where `Survived` has parents `Sex` and `Pclass`, then
        check `predict` on the first 30 rows with one of the three columns
        left out of the input each time.
        """
        net = BayesianModel()
        net.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
        net.fit(self.titanic_data2[500:])

        # Each input omits a different column; the expected arrays below hold
        # the values predicted for that omitted column.
        survived_pred = net.predict(self.titanic_data2[["Sex", "Pclass"]][:30])
        sex_pred = net.predict(self.titanic_data2[["Survived", "Pclass"]][:30])
        pclass_pred = net.predict(self.titanic_data2[["Survived", "Sex"]][:30])

        survived_expected = np.array([
            '0', '1', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0',
            '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
            '0', '0', '0', '0'
        ])
        sex_expected = np.array([
            'male', 'female', 'female', 'female', 'male', 'male', 'male',
            'male', 'female', 'female', 'female', 'female', 'male', 'male',
            'male', 'female', 'male', 'female', 'male', 'female', 'male',
            'female', 'female', 'female', 'male', 'female', 'male', 'male',
            'female', 'male'
        ])
        pclass_expected = np.array([
            '3', '1', '1', '1', '3', '3', '3', '3', '1', '1', '1', '1', '3',
            '3', '3', '1', '3', '1', '3', '1', '3', '1', '1', '1', '3', '1',
            '3', '3', '1', '3'
        ])

        checks = (
            (survived_pred, survived_expected),
            (sex_pred, sex_expected),
            (pclass_pred, pclass_expected),
        )
        for predicted, expected in checks:
            np_test.assert_array_equal(predicted.values.ravel(), expected)
Example #3
0
    def get_model(self):
        """
        Returns an instance of Bayesian Model.

        The model takes its nodes from `self.variables`, its edges from
        `self.edges` and its name from `self.model_name`. One `TabularCPD` is
        built per entry of `self.variable_CPD`: the table comes from the
        entry's "DPIS" key, the evidence variables from "CONDSET" and their
        cardinalities from "CARDINALITY" (both default to an empty list when
        absent). Finally the per-variable property dicts in `self.variables`
        are stored as the node attribute dicts of the model.

        Returns
        -------
        model: `BayesianModel` instance
        """
        model = BayesianModel()
        model.add_nodes_from(self.variables)
        model.add_edges_from(self.edges)
        model.name = self.model_name

        tabular_cpds = []
        for var, values in self.variable_CPD.items():
            evidence = values["CONDSET"] if "CONDSET" in values else []
            cpd = values["DPIS"]
            evidence_card = values[
                "CARDINALITY"] if "CARDINALITY" in values else []
            states = self.variables[var]["STATES"]
            cpd = TabularCPD(var,
                             len(states),
                             cpd,
                             evidence=evidence,
                             evidence_card=evidence_card)
            tabular_cpds.append(cpd)

        model.add_cpds(*tabular_cpds)

        # networkx 1.x keeps the node attribute dicts in `Graph.node`
        # (`Graph.nodes` is a method there, so item assignment on it fails);
        # networkx 2.x keeps them in the private `Graph._node` mapping.
        if nx.__version__.startswith("1"):
            node_attributes = model.node
        else:
            node_attributes = model._node
        for var, properties in self.variables.items():
            node_attributes[var] = properties

        return model
    def get_model(self):
        """
        Assemble a `BayesianModel` from the parsed network description.

        Nodes, edges and the model name are taken from `self.variables`,
        `self.edges` and `self.model_name`. Each entry of `self.variable_CPD`
        becomes a `TabularCPD`; the optional 'CONDSET' and 'CARDINALITY' keys
        supply the evidence variables and their cardinalities. The property
        dict of every variable is attached to its node afterwards.

        Returns
        -------
        model: `BayesianModel` instance
        """
        network = BayesianModel()
        network.add_nodes_from(self.variables)
        network.add_edges_from(self.edges)
        network.name = self.model_name

        built_cpds = []
        for name, spec in self.variable_CPD.items():
            # Variables without parents carry neither 'CONDSET' nor 'CARDINALITY'.
            parents = spec['CONDSET'] if 'CONDSET' in spec else []
            table = spec['DPIS']
            parent_card = spec['CARDINALITY'] if 'CARDINALITY' in spec else []
            state_count = len(self.variables[name]['STATES'])
            built_cpds.append(
                TabularCPD(name,
                           state_count,
                           table,
                           evidence=parents,
                           evidence_card=parent_card))

        network.add_cpds(*built_cpds)

        for name, attributes in self.variables.items():
            network.node[name] = attributes

        return network
Example #5
0
    def estimate(self):
        """
        Search all candidate structures and return the one with the top score.

        The candidates come from `self.all_dags()` and are compared using
        `self.scoring_method.score`. The winner's nodes and edges are copied,
        in sorted order, into a new `BayesianModel`. Only the network
        structure is estimated, no parametrization.

        Returns
        -------
        model: `BayesianModel` instance
            A `BayesianModel` with maximal score.

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.estimators import ExhaustiveSearch
        >>> # create random data sample with 3 variables, where B and C are identical:
        >>> data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        >>> data['C'] = data['B']
        >>> est = ExhaustiveSearch(data)
        >>> best_model = est.estimate()
        >>> best_model
        <pgmpy.models.BayesianModel.BayesianModel object at 0x7f695c535470>
        >>> best_model.edges()
        [('B', 'C')]
        """
        all_candidates = self.all_dags()
        top_dag = max(all_candidates, key=self.scoring_method.score)

        ordered_nodes = sorted(top_dag.nodes())
        ordered_edges = sorted(top_dag.edges())

        result = BayesianModel()
        result.add_nodes_from(ordered_nodes)
        result.add_edges_from(ordered_edges)
        return result
Example #6
0
    def get_model(self):
        """
        Build a `BayesianModel` from the parsed network.

        Nodes come from `self.variables`, edges from `self.edge_list` and the
        model name from `self.network_name`. For every entry of
        `self.variable_CPD` a `TabularCPD` is created whose evidence is
        `self.variable_parents[var]` and whose cardinalities are the lengths of
        the corresponding `self.variable_states` lists. Properties listed in
        `self.variable_property` are stored as node attributes; `None` entries
        are skipped.

        Returns
        -------
        model: `BayesianModel` instance

        Raises
        ------
        ValueError
            If a property string contains no "=" separator.
        """
        model = BayesianModel()
        model.add_nodes_from(self.variables)
        model.add_edges_from(self.edge_list)
        model.name = self.network_name

        tabular_cpds = []
        for var, values in self.variable_CPD.items():
            evidence_card = [
                len(self.variable_states[evidence_var])
                for evidence_var in self.variable_parents[var]
            ]
            cpd = TabularCPD(
                var,
                len(self.variable_states[var]),
                values,
                evidence=self.variable_parents[var],
                evidence_card=evidence_card,
                state_names=self.get_states(),
            )
            tabular_cpds.append(cpd)

        model.add_cpds(*tabular_cpds)

        for node, properties in self.variable_property.items():
            for prop in properties:
                if prop is None:
                    continue
                # Split on the first "=" only, so that a value which itself
                # contains "=" is kept intact instead of breaking the unpacking.
                prop_name, prop_value = (
                    part.strip() for part in prop.split("=", 1)
                )
                model.nodes[node][prop_name] = prop_value

        return model
Example #7
0
    def to_bayesian_model(self):
        """
        Creates a Bayesian Model which is a minimum I-Map for this markov model.

        The ordering of parents may not remain constant. It would depend on the
        ordering of variable in the junction tree (which is not constant) all the
        time.

        Returns
        -------
        bm: `BayesianModel` instance
            Contains only the edges derived from the junction tree; factors are
            not converted into CPDs.

        Examples
        --------
        >>> import numpy as np
        >>> from pgmpy.models import MarkovModel
        >>> from pgmpy.factors import Factor
        >>> mm = MarkovModel()
        >>> mm.add_nodes_from(['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'])
        >>> mm.add_edges_from([('x1', 'x3'), ('x1', 'x4'), ('x2', 'x4'),
        ...                    ('x2', 'x5'), ('x3', 'x6'), ('x4', 'x6'),
        ...                    ('x4', 'x7'), ('x5', 'x7')])
        >>> phi = [Factor(edge, [2, 2], np.random.rand(4)) for edge in mm.edges()]
        >>> mm.add_factors(*phi)
        >>> bm = mm.to_bayesian_model()
        """
        from pgmpy.models import BayesianModel

        bm = BayesianModel()
        var_clique_dict = defaultdict(tuple)
        var_order = []

        # Create a junction tree from the markov model.
        # Creation of clique tree involves triangulation, finding maximal cliques
        # and creating a tree from these cliques
        junction_tree = self.to_junction_tree()

        # create an ordering of the nodes based on the ordering of the clique
        # in which it appeared first.
        # `nodes()` is a list on networkx 1.x but a NodeView on 2.x, where
        # `nodes()[0]` is a key lookup rather than "first node"; taking the
        # first item of the iterator works on both.
        root_node = next(iter(junction_tree.nodes()))
        bfs_edges = nx.bfs_edges(junction_tree, root_node)
        for node in root_node:
            var_clique_dict[node] = root_node
            var_order.append(node)
        for edge in bfs_edges:
            clique_node = edge[1]
            for node in clique_node:
                # An empty tuple (the defaultdict default) means "not seen yet".
                if not var_clique_dict[node]:
                    var_clique_dict[node] = clique_node
                    var_order.append(node)

        # create a bayesian model by adding edges from parent of node to node as
        # par(x_i) = (var(c_k) - x_i) \cap {x_1, ..., x_{i-1}}
        for node_index, node in enumerate(var_order):
            node_parents = (set(var_clique_dict[node]) -
                            {node}).intersection(var_order[:node_index])
            bm.add_edges_from([(parent, node) for parent in node_parents])
            # TODO : Convert factor into CPDs
        return bm
    def to_bayesian_model(self):
        """
        Creates a Bayesian Model which is a minimum I-Map for this markov model.

        The ordering of parents may not remain constant. It would depend on the
        ordering of variable in the junction tree (which is not constant) all the
        time.

        Returns
        -------
        bm: `BayesianModel` instance
            Holds the derived edges only; the factors of this model are not
            converted into CPDs.

        Examples
        --------
        >>> import numpy as np
        >>> from pgmpy.models import MarkovModel
        >>> from pgmpy.factors.discrete import DiscreteFactor
        >>> mm = MarkovModel()
        >>> mm.add_nodes_from(['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'])
        >>> mm.add_edges_from([('x1', 'x3'), ('x1', 'x4'), ('x2', 'x4'),
        ...                    ('x2', 'x5'), ('x3', 'x6'), ('x4', 'x6'),
        ...                    ('x4', 'x7'), ('x5', 'x7')])
        >>> phi = [DiscreteFactor(edge, [2, 2], np.random.rand(4)) for edge in mm.edges()]
        >>> mm.add_factors(*phi)
        >>> bm = mm.to_bayesian_model()
        """
        from pgmpy.models import BayesianModel

        bm = BayesianModel()
        var_clique_dict = defaultdict(tuple)
        var_order = []

        # Create a junction tree from the markov model.
        # Creation of clique tree involves triangulation, finding maximal cliques
        # and creating a tree from these cliques
        junction_tree = self.to_junction_tree()

        # create an ordering of the nodes based on the ordering of the clique
        # in which it appeared first.
        # Positional indexing of `nodes()` only works on networkx 1.x (list);
        # on 2.x the NodeView treats `[0]` as a node key. Pulling the first
        # element from the iterator is valid on both versions.
        root_node = next(iter(junction_tree.nodes()))
        bfs_edges = nx.bfs_edges(junction_tree, root_node)
        for node in root_node:
            var_clique_dict[node] = root_node
            var_order.append(node)
        for edge in bfs_edges:
            clique_node = edge[1]
            for node in clique_node:
                # Unseen variables still map to the default empty tuple.
                if not var_clique_dict[node]:
                    var_clique_dict[node] = clique_node
                    var_order.append(node)

        # create a bayesian model by adding edges from parent of node to node as
        # par(x_i) = (var(c_k) - x_i) \cap {x_1, ..., x_{i-1}}
        for node_index, node in enumerate(var_order):
            earlier_nodes = var_order[:node_index]
            node_parents = (set(var_clique_dict[node]) - {node}).intersection(
                earlier_nodes)
            bm.add_edges_from([(parent, node) for parent in node_parents])
            # TODO : Convert factor into CPDs
        return bm
Example #9
0
    def get_model(self):
        """
        Returns the model instance of the ProbModel.

        Nodes and edges are read from `self.probnet["Variables"]` and
        `self.probnet["edges"]`, one `TabularCPD` is built per entry of
        `self.probnet["Potentials"]`, and the variable and edge property
        dicts are copied onto the corresponding nodes and edges.

        Returns
        -------
        model: an instance of BayesianModel.

        Raises
        ------
        ValueError
            If the type of `self.probnet` is not "BayesianNetwork".

        Examples
        --------
        >>> reader = ProbModelXMLReader()
        >>> reader.get_model()
        """
        if self.probnet.get("type") != "BayesianNetwork":
            raise ValueError("Please specify only Bayesian Network.")

        net = BayesianModel()
        net.add_nodes_from(self.probnet["Variables"].keys())
        net.add_edges_from(self.probnet["edges"].keys())

        cpd_list = []
        for potential in self.probnet["Potentials"]:
            # The first key of the potential's variable mapping is the child;
            # the value stored under it lists the evidence variables.
            child = list(potential["Variables"].keys())[0]
            child_states = self.probnet["Variables"][child]["States"]
            parents = potential["Variables"][child]
            parent_card = [
                len(self.probnet["Variables"][parent]["States"])
                for parent in parents
            ]
            flat = np.array(list(map(float, potential["Values"].split())))
            n_states = len(child_states)
            table = flat.reshape((n_states, flat.size // n_states))
            cpd_list.append(
                TabularCPD(child, len(child_states), table, parents, parent_card)
            )

        net.add_cpds(*cpd_list)

        for name in net.nodes():
            for key, value in self.probnet["Variables"][name].items():
                net.nodes[name][key] = value
        all_edges = net.edges()

        # Edge attributes live under `Graph.edge` on networkx 1.x and under
        # `Graph.adj` otherwise.
        if nx.__version__.startswith("1"):
            for src_dst in all_edges:
                for key, value in self.probnet["edges"][src_dst].items():
                    net.edge[src_dst[0]][src_dst[1]][key] = value
        else:
            for src_dst in all_edges:
                for key, value in self.probnet["edges"][src_dst].items():
                    net.adj[src_dst[0]][src_dst[1]][key] = value
        return net
Example #10
0
def train_model_idx(train_model) -> BayesianModel:
    """
    Build a copy of the `train_model` structure whose node names are integers.

    Every edge of `train_model` is re-created with its endpoints replaced by
    integers from 0 to 4, using the mapping:

    {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}

    Only edges are copied, so the result contains just the nodes that take
    part in at least one edge.
    """
    name_to_index = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}
    relabelled_edges = [
        (name_to_index[source], name_to_index[target])
        for source, target in train_model.edges
    ]
    indexed_model = BayesianModel()
    indexed_model.add_edges_from(relabelled_edges)
    return indexed_model
    def get_model(self):
        """
        Returns the fitted bayesian model

        Nodes, edges and the model name come from `self.variable_names`,
        `self.variable_edges` and `self.network_name`. A `TabularCPD` is built
        for each variable in `self.variable_cpds` (in sorted order) with state
        names for the variable and its parents. When `self.include_properties`
        is true, each "name = value" string in `self.variable_properties` is
        stored as a node attribute.

        Raises
        ------
        AttributeError
            If any attribute accessed while building the model is missing; the
            original error is attached as the cause.
        ValueError
            If a property string contains no "=" separator.

        Example
        ----------
        >>> from pgmpy.readwrite import BIFReader
        >>> reader = BIFReader("bif_test.bif")
        >>> reader.get_model()
        <pgmpy.models.BayesianModel.BayesianModel object at 0x7f20af154320>
        """
        try:
            model = BayesianModel()
            model.add_nodes_from(self.variable_names)
            model.add_edges_from(self.variable_edges)
            model.name = self.network_name

            tabular_cpds = []
            for var in sorted(self.variable_cpds.keys()):
                values = self.variable_cpds[var]
                parents = self.variable_parents[var]
                # State names for every parent plus the variable itself.
                sn = {p_var: self.variable_states[p_var] for p_var in parents}
                sn[var] = self.variable_states[var]
                cpd = TabularCPD(
                    var,
                    len(self.variable_states[var]),
                    values,
                    evidence=parents,
                    evidence_card=[
                        len(self.variable_states[evidence_var])
                        for evidence_var in parents
                    ],
                    state_names=sn,
                )
                tabular_cpds.append(cpd)

            model.add_cpds(*tabular_cpds)

            if self.include_properties:
                for node, properties in self.variable_properties.items():
                    for prop in properties:
                        # Split on the first "=" only so that values which
                        # themselves contain "=" do not break the unpacking.
                        prop_name, prop_value = (
                            part.strip() for part in prop.split("=", 1)
                        )
                        model.nodes[node][prop_name] = prop_value

            return model

        except AttributeError as err:
            raise AttributeError(
                "First get states of variables, edges, parents and network name"
            ) from err
Example #12
0
def create_bayes_net(file, keep_atts, edges):
    """
    Read attribute data from a CSV file and fit a Bayesian network to it.

    Parameters
    ----------
    file:
        Path (or anything `pd.read_csv` accepts) of the attribute table.
    keep_atts:
        Column selection applied to the loaded table; the selected columns
        become the nodes of the network.
    edges:
        Edges defining the structure of the network.

    Returns
    -------
    The `BayesianModel` after `fit` has been called on the selected columns.
    """
    selected = pd.read_csv(file)[keep_atts]

    net = BayesianModel()
    net.add_nodes_from(selected.columns)
    # The structure is supplied by the caller ...
    net.add_edges_from(edges)
    # ... and fit estimates the CPD tables for that structure.
    net.fit(selected)
    return net
Example #13
0
def make_bayes_net(load=False, subtree=True, modelsdir=MODEL_CPDS_DIR):
    """
    Build the Bayesian network over labels, or reload a previously saved one.

    Parameters
    ----------
    load:
        When true and the cached graph file exists, the graph is unpickled
        from it instead of being rebuilt.
    subtree:
        When true the labels come from `_subtree_labels()`, otherwise from the
        keys of the label dictionary returned by `load_label_data()`.
    modelsdir:
        Directory passed to `get_model_cpds` for the predicted-label CPDs.

    Returns
    -------
    The checked `BayesianModel`. A freshly built graph is also pickled to the
    cache file under `RUNNING_MODEL_DIR`.
    """
    print('Making bayes net')
    graph_file = RUNNING_MODEL_DIR + '/' + 'graph.p'
    if load and os.path.isfile(graph_file):
        print('Loading saved graph from file...')
        # NOTE(security): unpickling runs arbitrary code; the cache file must
        # only ever be one written by this function.
        with open(graph_file, 'rb') as cache:
            G = pickle.load(cache)
        G.check_model()
        return G

    print('loading data...')
    training_labels, go_dict = load_label_data()
    if subtree:
        labels_list = _subtree_labels()
        print(labels_list)
    else:
        labels_list = go_dict.keys()
    # One set for the repeated membership tests below.
    label_set = set(labels_list)

    print('adding nodes and edges...')
    G = BayesianModel()
    G.add_edges_from([(label, label + '_hat') for label in labels_list])
    obo_graph = obonet.read_obo(OBODB_FILE)
    for label in labels_list:
        # Link every known label returned by networkx.ancestors to `label`.
        for child in networkx.ancestors(obo_graph, label):
            if child in label_set:
                G.add_edge(child, label)

    # Honour the caller-supplied directory rather than the module default.
    predicted_cpds = get_model_cpds(labels_list=labels_list,
                                    modelsdir=modelsdir)
    for cpd in predicted_cpds:
        G.add_cpds(cpd)
    true_label_cpds = get_true_label_cpds(training_labels,
                                          go_dict,
                                          labels_list=labels_list)
    for cpd in true_label_cpds:
        G.add_cpds(cpd)

    # Collect first, remove afterwards: the graph must not change while its
    # nodes are being iterated.
    remove_list = [node for node in G.nodes() if G.get_cpds(node) is None]
    for node in remove_list:
        if node in G:
            G.remove_node(node)
    G.check_model()
    with open(graph_file, 'wb') as cache:
        pickle.dump(G, cache)
    return G
Example #14
0
def create_bayes_net():
    """
    Load the CelebA attribute table and fit a hand-specified Bayesian network.

    The columns listed in `KEEP_ATTS` become the nodes, the structure is the
    fixed edge list below, and `fit` is called on the selected columns.

    Returns
    -------
    The fitted `BayesianModel`.
    """
    attributes = pd.read_csv('../../data/list_attr_celeba.csv')[KEEP_ATTS]

    structure = [
        ('Young', 'Eyeglasses'),
        ('Young', 'Bald'),
        ('Young', 'Mustache'),
        ('Male', 'Mustache'),
        ('Male', 'Smiling'),
        ('Male', 'Wearing_Lipstick'),
        ('Young', 'Mouth_Slightly_Open'),
        ('Young', 'Narrow_Eyes'),
        ('Male', 'Narrow_Eyes'),
        ('Smiling', 'Narrow_Eyes'),
        ('Smiling', 'Mouth_Slightly_Open'),
        ('Young', 'Smiling'),
    ]

    net = BayesianModel()
    net.add_nodes_from(attributes.columns)
    net.add_edges_from(structure)
    net.fit(attributes)
    return net
Example #15
0
    def get_model(self):
        """
        Returns an instance of Bayesian Model or Markov Model.
        Variables are in the pattern var_0, var_1, var_2 where var_0 is
        0th index variable, var_1 is 1st index variable.

        For a 'BAYES' network one `TabularCPD` is built per table; for a
        'MARKOV' network one `DiscreteFactor` per table. Any other network
        type falls through and yields None.

        Return
        ------
        model: an instance of Bayesian or Markov Model.

        Examples
        --------
        >>> reader = UAIReader('TestUAI.uai')
        >>> reader.get_model()
        """
        if self.network_type == 'BAYES':
            bayes_net = BayesianModel()
            bayes_net.add_nodes_from(self.variables)
            bayes_net.add_edges_from(self.edges)

            cpd_list = []
            for table in self.tables:
                target = table[0]
                n_states = int(self.domain[target])
                flat = np.array(list(map(float, table[1])))
                # One row per state of the target variable.
                grid = flat.reshape(n_states, flat.size // n_states)
                cpd_list.append(TabularCPD(target, n_states, grid))

            bayes_net.add_cpds(*cpd_list)
            return bayes_net

        elif self.network_type == 'MARKOV':
            markov_net = MarkovModel(self.edges)

            factor_list = []
            for table in self.tables:
                scope = table[0]
                sizes = [int(self.domain[member]) for member in scope]
                numbers = list(map(float, table[1]))
                factor_list.append(
                    DiscreteFactor(variables=scope,
                                   cardinality=sizes,
                                   values=numbers))

            markov_net.add_factors(*factor_list)
            return markov_net
Example #16
0
    def join(reference_bayes, second_bayes, new_dependent_vars,
             new_independent_vars, ref_num_of_records, second_num_of_records):
        """
        Combine two Bayesian networks into a new one.

        Independent variables are copied unchanged, taking the CPD from
        `reference_bayes` when the node exists there and from `second_bayes`
        otherwise. For each dependent variable:

        * if it has no parents in `reference_bayes`, the parents and CPD from
          `second_bayes` are used;
        * if both networks give it the same parents, the two CPDs are merged
          with `BayesNetHelper.calculate_weighted_cpds`, using the two record
          counts;
        * if the parents in `second_bayes` are a strict subset of those in
          `reference_bayes`, the reference CPD is kept.

        Returns
        -------
        The newly assembled `BayesianModel`.

        Raises
        ------
        ValueError
            If `second_bayes` gives a dependent variable a parent that it does
            not have in `reference_bayes`.
        """
        final_bayes = BayesianModel()
        # all independent variables should stay the same
        final_bayes.add_nodes_from(new_independent_vars)
        final_bayes.add_cpds(*[
            reference_bayes.get_cpds(node=node) if node in
            reference_bayes.nodes else second_bayes.get_cpds(node=node)
            for node in new_independent_vars
        ])
        for node in new_dependent_vars:
            final_bayes.add_node(node)
            ref_parents = set()
            second_parents = set()
            if node in reference_bayes:
                ref_parents = set(reference_bayes.get_parents(node))
            if node in second_bayes:
                second_parents = set(second_bayes.get_parents(node))

            if not ref_parents:
                final_bayes.add_edges_from([(parent, node)
                                            for parent in second_parents])
                final_bayes.add_cpds(second_bayes.get_cpds(node=node))
                continue

            final_bayes.add_edges_from([(parent, node)
                                        for parent in ref_parents])
            if second_parents - ref_parents:
                # Adjacent literals are used instead of backslash
                # continuations, which would embed the source indentation
                # in the message text.
                raise ValueError(
                    'This join can not be performed since the second '
                    'distribution contains new independent variable(s) '
                    'for node {}. Please consider dropping these new '
                    'dependencies or switching reference distribution.'
                    .format(str(node)))
            if ref_parents == second_parents:
                new_cpd = BayesNetHelper.calculate_weighted_cpds(
                    reference_bayes.get_cpds(node=node),
                    second_bayes.get_cpds(node=node), ref_num_of_records,
                    second_num_of_records)
                final_bayes.add_cpds(new_cpd)
            else:
                final_bayes.add_cpds(reference_bayes.get_cpds(node=node))
        return final_bayes
Example #17
0
def create_bayes_net():
    """
    Fit a Bayesian network with a fixed, hand-written structure to CelebA
    attribute data.

    The table is read from './data/list_attr_celeba.csv' and restricted to
    `KEEP_ATTS`; those columns are the nodes of the network.

    Returns
    -------
    The `BayesianModel` after fitting.
    """
    table = pd.read_csv('./data/list_attr_celeba.csv')
    table = table[KEEP_ATTS]

    # The structure cannot be automated, so the edges are spelled out by hand.
    hand_picked_edges = [
        ('Young', 'Eyeglasses'), ('Young', 'Bald'),
        ('Young', 'Mustache'), ('Male', 'Mustache'),
        ('Male', 'Smiling'), ('Male', 'Wearing_Lipstick'),
        ('Young', 'Mouth_Slightly_Open'),
        ('Young', 'Narrow_Eyes'), ('Male', 'Narrow_Eyes'),
        ('Smiling', 'Narrow_Eyes'),
        ('Smiling', 'Mouth_Slightly_Open'),
        ('Young', 'Smiling'),
    ]

    model = BayesianModel()
    model.add_nodes_from(table.columns)
    model.add_edges_from(hand_picked_edges)

    # fit estimates the CPD tables for the structure given above
    model.fit(table)
    return model
Example #18
0
    def minimal_imap(self, order):
        """
        Returns a Bayesian Model which is minimal IMap of the Joint Probability Distribution
        considering the order of the variables.

        For each variable, every proper subset of its predecessors in `order`
        is tried; when `check_independence` accepts the subset, edges from the
        subset's members to the variable are added.

        Parameters
        ----------
        order: array-like
            The order of the random variables.

        Examples
        --------
        >>> import numpy as np
        >>> from pgmpy.factors import JointProbabilityDistribution
        >>> prob = JointProbabilityDistribution(['x1', 'x2', 'x3'], [2, 3, 2], np.ones(12)/12)
        >>> bayesian_model = prob.minimal_imap(order=['x2', 'x1', 'x3'])
        >>> bayesian_model
        <pgmpy.models.models.models at 0x7fd7440a9320>
        >>> bayesian_model.edges()
        [('x1', 'x3'), ('x2', 'x3')]
        """
        from pgmpy.models import BayesianModel

        imap = BayesianModel()
        for position in range(len(order)):
            predecessors = order[:position]
            # All subsets of the predecessors, smallest first.
            candidates = itertools.chain.from_iterable(
                itertools.combinations(predecessors, size)
                for size in range(len(predecessors) + 1))
            for candidate in candidates:
                # Only proper subsets are considered.
                if len(candidate) >= len(predecessors):
                    continue
                if self.check_independence([order[position]],
                                           set(predecessors) - set(candidate),
                                           candidate, True):
                    imap.add_edges_from([(parent, order[position])
                                         for parent in candidate])
        return imap
    def minimal_imap(self, order):
        """
        Returns a Bayesian Model which is minimal IMap of the Joint Probability Distribution
        considering the order of the variables.

        Variables are processed in the given order. For each one, the proper
        subsets of the variables preceding it are enumerated, and whenever
        `check_independence` holds for a subset, that subset's members become
        parents of the variable.

        Parameters
        ----------
        order: array-like
            The order of the random variables.

        Examples
        --------
        >>> import numpy as np
        >>> from pgmpy.factors import JointProbabilityDistribution
        >>> prob = JointProbabilityDistribution(['x1', 'x2', 'x3'], [2, 3, 2], np.ones(12)/12)
        >>> bayesian_model = prob.minimal_imap(order=['x2', 'x1', 'x3'])
        >>> bayesian_model
        <pgmpy.models.models.models at 0x7fd7440a9320>
        >>> bayesian_model.edges()
        [('x1', 'x3'), ('x2', 'x3')]
        """
        from pgmpy.models import BayesianModel

        def powerset(items):
            # Yield every combination of `items`, from the empty tuple upwards.
            for size in range(len(items) + 1):
                yield from itertools.combinations(items, size)

        graph = BayesianModel()
        for idx in range(len(order)):
            earlier = order[:idx]
            for chosen in powerset(earlier):
                is_proper_subset = len(chosen) < len(earlier)
                if is_proper_subset and self.check_independence(
                        [order[idx]], set(earlier) - set(chosen), chosen, True):
                    new_edges = [(member, order[idx]) for member in chosen]
                    graph.add_edges_from(new_edges)
        return graph
    def test_predict(self):
        """
        Train a `Sex`/`Pclass` -> `Survived` network and verify the output of
        `predict` for the first 30 rows, given each pair of the three columns.
        """
        model = BayesianModel()
        model.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
        model.fit(self.titanic_data2[500:])

        # One prediction per pair of supplied columns, in the same order as
        # the expected arrays further down.
        given_columns = (
            ["Sex", "Pclass"],
            ["Survived", "Pclass"],
            ["Survived", "Sex"],
        )
        predictions = [
            model.predict(self.titanic_data2[columns][:30])
            for columns in given_columns
        ]

        expected = (
            np.array(['0', '1', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0',
                      '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
                      '0', '0', '0', '0']),
            np.array(['male', 'female', 'female', 'female', 'male', 'male', 'male',
                      'male', 'female', 'female', 'female', 'female', 'male', 'male',
                      'male', 'female', 'male', 'female', 'male', 'female', 'male',
                      'female', 'female', 'female', 'male', 'female', 'male', 'male',
                      'female', 'male']),
            np.array(['3', '1', '1', '1', '3', '3', '3', '3', '1', '1', '1', '1', '3',
                      '3', '3', '1', '3', '1', '3', '1', '3', '1', '1', '1', '3', '1',
                      '3', '3', '1', '3']),
        )

        for frame, wanted in zip(predictions, expected):
            np_test.assert_array_equal(frame.values.ravel(), wanted)
Example #21
0
# Bayesian network for students: builds the structure only (nodes and edges),
# no CPDs are attached, and prints the nodes and edges as they are added.
from pgmpy.models import BayesianModel
model = BayesianModel()
# Add the five variables of the network as nodes
model.add_nodes_from(['difficulty', 'intelligence', 'grade', 'sat', 'letter'])
print(model.nodes())
# Add directed edges (parent, child): grade depends on difficulty and
# intelligence, sat on intelligence, and letter on grade
model.add_edges_from([('difficulty', 'grade'), ('intelligence', 'grade'), ('intelligence', 'sat'), ('grade', 'letter')])
print(model.edges())
def main():
    """
    Learn, assemble, fit and evaluate the two-sample "multinet" Bayesian model.

    Steps, in order:
      1. Load the train/test feature sets through ``PGM_t``.
      2. Seed a 9-node model with a few hand-picked edges and refine its
         structure with hill-climb search scored by BDeu.
      3. Clone the learned structure, renaming every node by replacing its last
         character with ``'b'``.
      4. Join the learned network and its clone through a common child node
         ``'same'``.
      5. Fit CPDs on the training set and print them.
      6. Run variable-elimination inference and print the accuracy obtained on
         the training rows where ``same == 0`` and where ``same == 1``.

    NOTE(review): ``test_set`` is loaded but never used; accuracies are computed
    on the training data only. Confirm whether that is intended.
    """
    andPGM = PGM_t()
    print('loading features..')
    train_set, test_set = andPGM.load_features()
    print('loading features.. Done')
    # Bayesian network of 19 nodes, 9*2 variables of network given
    # Initial incomplete Bayesian model connected manually based on intuition
    print('Generating model.. ')
    initialModel = BayesianModel({})
    initialModel.add_nodes_from(andPGM.img_features.columns[1:10].tolist())
    initialModel.add_edges_from([
        ('f6_a', 'f2_a'),
        ('f3_a', 'f4_a'),
        ('f5_a', 'f9_a'),
        ('f4_a', 'f7_a'),
    ])

    # Use hill climb search algorithm to find network structure of initial 9 nodes
    features_9 = andPGM.img_features.iloc[0:, 1:10]
    hc = HillClimbSearch(
        data=features_9,
        scoring_method=BdeuScore(
            features_9,
            equivalent_sample_size=0.1 * len(andPGM.img_features),
        ),
        state_names=andPGM.states_9,
    )
    # Get best estimated structure
    best_model = hc.estimate(start=initialModel)
    # Edges in the acquired graph
    print('model of 9 var: ', best_model.edges())

    # Create a clone of the generated Bayesian network structure: every node
    # name has its last character replaced by 'b'.
    clone_model = BayesianModel({})
    for edge in best_model.edges():
        new_edge = [edge[0][:-1] + 'b', edge[1][:-1] + 'b']
        clone_model.add_edges_from([new_edge])

    # Join together the original and clone network through node 'same'.
    # edges() returns a view (not a list) on networkx >= 2, and views cannot be
    # concatenated with `+`; materialise both sides so this works on any version.
    multinetModel = BayesianModel({})
    multinetModel.add_edges_from(list(best_model.edges()) + list(clone_model.edges()))
    multinetModel.add_node('same')
    multinetModel.add_edge('f5_a', 'same')
    multinetModel.add_edge('f9_a', 'same')
    multinetModel.add_edge('f5_b', 'same')
    multinetModel.add_edge('f9_b', 'same')
    print('Generating model.. Done')
    # Edges in the final structure
    print('Final model: ', multinetModel.edges())

    print('Fit data into model..')
    # Fit the data to the model to generate CPDs using maximum likelihood estimation
    multinetModel.fit(data=train_set, state_names=andPGM.states_all)
    print('Fit data into model.. Done')
    print('CPDs generated: ')
    cpds = multinetModel.get_cpds()
    for cpd in cpds:
        print(cpd)
    # Inference using Variable Elimination
    print('Start inference..')
    inference = VariableElimination(multinetModel)
    train_set_same = train_set[train_set['same'] == 0]
    train_set_not_same = train_set[train_set['same'] == 1]

    # Accuracy of positive inferences
    acc_same = andPGM.chk_accuracy(
        train_set_same,
        inference,
        variables=train_set_same.columns[0:9].tolist(),
        evidence=train_set_same.columns[9:19].tolist())
    print('accuracy of positives ', acc_same)

    # Accuracy of negative inferences
    acc_nt_same = andPGM.chk_accuracy(
        train_set_not_same,
        inference,
        variables=train_set_not_same.columns[0:9].tolist(),
        evidence=train_set_not_same.columns[9:19].tolist())
    print('accuracy of negatives', acc_nt_same)
        ax_temp.bar(x, z, zs=y, zdir='y', alpha=0.6, color='r' * 4)
        ax_temp.set_xlabel('X')
        ax_temp.set_ylabel('Y')
        ax_temp.set_zlabel('Z')
        ax_temp.title.set_text(('Feature ' + str(mean_indices[counter])))
        counter += 1
plt.show()

# Learning naive bayes model from various subsets of data
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2])
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 4])
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 3, 4, 5])

# Splitting train and test data for PGM model
temp_data = pd.concat([all_city_data, pd.DataFrame(all_city_label, columns=[13])], axis=1)
pgm_train_set = temp_data.loc[0:700]
# `.loc` slices are label-based and include BOTH endpoints, so `.loc[700:]`
# would put row 700 in the test set as well as the training set. Take the test
# rows positionally, starting right after the last training row.
pgm_test_set = temp_data.iloc[len(pgm_train_set):]
print(pgm_train_set)


# Implementing PGM model on data
# Using these features: 0: (age) 1: (sex) 2: (cp)
# Column 13 holds the label appended above; the only edge is 1 -> 13.
pgm_model = BayesianModel()
pgm_model.add_nodes_from([0, 1, 2, 13])
pgm_model.add_edges_from([(1, 13)])
pgm_model.fit(pgm_train_set.loc[:, [0, 1, 2, 13]])
# Drop the label column from the test rows.
pgm_test_set = pgm_test_set.loc[:, [0, 1, 2, 13]].drop(13, axis=1)
print(pgm_test_set)
print(pgm_model.get_cpds(13))
Example #24
0
# Root nodes: unconditional distributions over two states each.
cpd_rain = TabularCPD(variable='Rain', variable_card=2, values=[[0.4], [0.6]])
cpd_accident = TabularCPD(variable='Accident', variable_card=2, values=[[0.2], [0.8]])

# TrafficJam is conditioned on Rain and Accident (2 x 2 parent configurations).
cpd_traffic_jam = TabularCPD(
    variable='TrafficJam',
    variable_card=2,
    values=[
        [0.9, 0.6, 0.7, 0.1],
        [0.1, 0.4, 0.3, 0.9],
    ],
    evidence=['Rain', 'Accident'],
    evidence_card=[2, 2],
)
model.add_cpds(cpd_rain, cpd_accident, cpd_traffic_jam)

# Extend the network: TrafficJam -> LongQueues.
model.add_node('LongQueues')
model.add_edge('TrafficJam', 'LongQueues')
cpd_long_queues = TabularCPD(
    variable='LongQueues',
    variable_card=2,
    values=[
        [0.9, 0.2],
        [0.1, 0.8],
    ],
    evidence=['TrafficJam'],
    evidence_card=[2],
)
model.add_cpds(cpd_long_queues)

# Extend the network again: GettingUpLate and TrafficJam both feed LateForSchool.
model.add_nodes_from(['GettingUpLate', 'LateForSchool'])
model.add_edges_from([
    ('GettingUpLate', 'LateForSchool'),
    ('TrafficJam', 'LateForSchool'),
])
cpd_getting_up_late = TabularCPD(
    variable='GettingUpLate',
    variable_card=2,
    values=[[0.6], [0.4]],
)
cpd_late_for_school = TabularCPD(
    variable='LateForSchool',
    variable_card=2,
    values=[
        [0.9, 0.45, 0.8, 0.1],
        [0.1, 0.55, 0.2, 0.9],
    ],
    evidence=['GettingUpLate', 'TrafficJam'],
    evidence_card=[2, 2],
)
model.add_cpds(cpd_getting_up_late, cpd_late_for_school)

# Conversion from BayesianModel to MarkovModel is accomplished by
mm = model.to_markov_model()
mm.edges()
Example #25
0
class TestBaseModelCreation(unittest.TestCase):
    """
    Tests for building a ``BayesianModel`` graph: construction, adding nodes and
    edges, rejection of self-loops and cycles, and parent bookkeeping.

    ``nodes()``, ``edges()`` and ``predecessors()`` return lists on networkx 1.x
    but views/iterators on networkx >= 2, so every result that is compared with
    ``assertListEqual`` is first materialised with ``list()`` (a no-op on 1.x).
    """

    def setUp(self):
        self.G = BayesianModel()

    def test_class_init_without_data(self):
        self.assertIsInstance(self.G, nx.DiGraph)

    def test_class_init_with_data_string(self):
        self.g = BayesianModel([('a', 'b'), ('b', 'c')])
        self.assertListEqual(sorted(self.g.nodes()), ['a', 'b', 'c'])
        self.assertListEqual(hf.recursive_sorted(self.g.edges()),
                             [['a', 'b'], ['b', 'c']])

    def test_class_init_with_data_nonstring(self):
        # Only checks that non-string node labels do not raise.
        BayesianModel([(1, 2), (2, 3)])

    def test_add_node_string(self):
        self.G.add_node('a')
        self.assertListEqual(list(self.G.nodes()), ['a'])

    def test_add_node_nonstring(self):
        # Only checks that a non-string node label does not raise.
        self.G.add_node(1)

    def test_add_nodes_from_string(self):
        self.G.add_nodes_from(['a', 'b', 'c', 'd'])
        self.assertListEqual(sorted(self.G.nodes()), ['a', 'b', 'c', 'd'])

    def test_add_nodes_from_non_string(self):
        # Only checks that non-string node labels do not raise.
        self.G.add_nodes_from([1, 2, 3, 4])

    def test_add_edge_string(self):
        self.G.add_edge('d', 'e')
        self.assertListEqual(sorted(self.G.nodes()), ['d', 'e'])
        self.assertListEqual(list(self.G.edges()), [('d', 'e')])
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edge('a', 'b')
        self.assertListEqual(hf.recursive_sorted(self.G.edges()),
                             [['a', 'b'], ['d', 'e']])

    def test_add_edge_nonstring(self):
        # Only checks that non-string node labels do not raise.
        self.G.add_edge(1, 2)

    def test_add_edge_selfloop(self):
        self.assertRaises(ValueError, self.G.add_edge, 'a', 'a')

    def test_add_edge_result_cycle(self):
        self.G.add_edges_from([('a', 'b'), ('a', 'c')])
        self.assertRaises(ValueError, self.G.add_edge, 'c', 'a')

    def test_add_edges_from_string(self):
        self.G.add_edges_from([('a', 'b'), ('b', 'c')])
        self.assertListEqual(sorted(self.G.nodes()), ['a', 'b', 'c'])
        self.assertListEqual(hf.recursive_sorted(self.G.edges()),
                             [['a', 'b'], ['b', 'c']])
        self.G.add_nodes_from(['d', 'e', 'f'])
        self.G.add_edges_from([('d', 'e'), ('e', 'f')])
        self.assertListEqual(sorted(self.G.nodes()),
                             ['a', 'b', 'c', 'd', 'e', 'f'])
        self.assertListEqual(hf.recursive_sorted(self.G.edges()),
                             hf.recursive_sorted([('a', 'b'), ('b', 'c'),
                                                  ('d', 'e'), ('e', 'f')]))

    def test_add_edges_from_nonstring(self):
        # Only checks that non-string node labels do not raise.
        self.G.add_edges_from([(1, 2), (2, 3)])

    def test_add_edges_from_self_loop(self):
        self.assertRaises(ValueError, self.G.add_edges_from,
                          [('a', 'a')])

    def test_add_edges_from_result_cycle(self):
        self.assertRaises(ValueError, self.G.add_edges_from,
                          [('a', 'b'), ('b', 'c'), ('c', 'a')])

    def test_update_node_parents_bm_constructor(self):
        self.g = BayesianModel([('a', 'b'), ('b', 'c')])
        self.assertListEqual(list(self.g.predecessors('a')), [])
        self.assertListEqual(list(self.g.predecessors('b')), ['a'])
        self.assertListEqual(list(self.g.predecessors('c')), ['b'])

    def test_update_node_parents(self):
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edges_from([('a', 'b'), ('b', 'c')])
        self.assertListEqual(list(self.G.predecessors('a')), [])
        self.assertListEqual(list(self.G.predecessors('b')), ['a'])
        self.assertListEqual(list(self.G.predecessors('c')), ['b'])

    def tearDown(self):
        del self.G
Example #26
0
    for n in G.nodes():
        parents  = G.predecessors(n)
        parents_card = [cards.get(p,2) for p in parents]
        card = cards.get(n,2)
        values = np.full((card,np.prod(parents_card)),1.0/card)
        cpd = TabularCPD(n,card,values,parents,parents_card)
        G.add_cpds(cpd)
                    

    
# now I want to be able to do sampling and updating ...



# Three independent causes X1..X3 that all point at the single outcome Y.
G = BayesianModel()
G.add_edges_from([(cause, "Y") for cause in ("X1", "X2", "X3")])
set_uniform_cpds(G)

# idea

#1) sample a model

#2) sample a belief about the state of that model from the data we currently have
    # requires a posterior, not just maximum likelihood estimates.

    


#3) take best action according to that belief

#4) add result to data set
Example #27
0
class TestDirectedGraphCPDOperations(unittest.TestCase):
    """
    Tests for adding, removing and retrieving CPDs on a ``BayesianModel`` whose
    structure is ``diff -> grade <- intel``.
    """

    # Edges shared by every test in this class.
    _EDGES = (('diff', 'grade'), ('intel', 'grade'))

    def setUp(self):
        self.graph = BayesianModel()

    @staticmethod
    def _make_grade_cpd():
        """Return a random CPD for 'grade' conditioned on 'diff' and 'intel'."""
        return TabularCPD('grade', 2, np.random.rand(2, 4),
                          ['diff', 'intel'], [2, 2])

    def _install_three_cpds(self):
        """Create CPDs for diff/intel/grade, add edges and CPDs, return the CPDs."""
        diff_cpd = TabularCPD('diff', 2, np.random.rand(2, 1))
        intel_cpd = TabularCPD('intel', 2, np.random.rand(2, 1))
        grade_cpd = self._make_grade_cpd()
        self.graph.add_edges_from(list(self._EDGES))
        self.graph.add_cpds(diff_cpd, intel_cpd, grade_cpd)
        return diff_cpd, intel_cpd, grade_cpd

    def test_add_single_cpd(self):
        grade_cpd = self._make_grade_cpd()
        self.graph.add_edges_from(list(self._EDGES))
        self.graph.add_cpds(grade_cpd)
        self.assertListEqual(self.graph.get_cpds(), [grade_cpd])

    def test_add_multiple_cpds(self):
        diff_cpd, intel_cpd, grade_cpd = self._install_three_cpds()
        self.assertListEqual(self.graph.get_cpds(),
                             [diff_cpd, intel_cpd, grade_cpd])

    def test_remove_single_cpd(self):
        diff_cpd, intel_cpd, grade_cpd = self._install_three_cpds()
        self.graph.remove_cpds(diff_cpd)
        self.assertListEqual(self.graph.get_cpds(), [intel_cpd, grade_cpd])

    def test_remove_multiple_cpds(self):
        diff_cpd, intel_cpd, grade_cpd = self._install_three_cpds()
        self.graph.remove_cpds(diff_cpd, grade_cpd)
        self.assertListEqual(self.graph.get_cpds(), [intel_cpd])

    def test_remove_single_cpd_string(self):
        _, intel_cpd, grade_cpd = self._install_three_cpds()
        # CPDs may also be removed by the name of their variable.
        self.graph.remove_cpds('diff')
        self.assertListEqual(self.graph.get_cpds(), [intel_cpd, grade_cpd])

    def test_remove_multiple_cpds_string(self):
        _, intel_cpd, _ = self._install_three_cpds()
        self.graph.remove_cpds('diff', 'grade')
        self.assertListEqual(self.graph.get_cpds(), [intel_cpd])

    def test_get_cpd_for_node(self):
        expected = dict(zip(('diff', 'intel', 'grade'),
                            self._install_three_cpds()))
        for node in ('diff', 'intel', 'grade'):
            self.assertEqual(self.graph.get_cpds(node), expected[node])

    def test_get_cpd_raises_error(self):
        self._install_three_cpds()
        # 'sat' is not a node of the graph.
        self.assertRaises(ValueError, self.graph.get_cpds, 'sat')

    def tearDown(self):
        del self.graph
Example #28
0
class BayesianNetwork:
    """
    Base class for Bayesian Network (BN), a probabilistic weighted DAG where nodes represent variables,
    edges represent the causal relationships between variables.

    ``BayesianNetwork`` stores nodes with their possible states, edges and
    conditional probability distributions (CPDs) of each node.

    ``BayesianNetwork`` is built on top of the ``StructureModel``, which is an extension of ``networkx.DiGraph``
    (see :func:`causalnex.structure.structuremodel.StructureModel`).

    In order to define the ``BayesianNetwork``, users should provide a relevant ``StructureModel``.
    Once ``BayesianNetwork`` is initialised, no changes to the ``StructureModel`` can be made
    and CPDs can be learned from the data.

    The learned CPDs can be then used for likelihood estimation and predictions.

    Example:
    ::
        >>> # Create a Bayesian Network with a manually defined DAG.
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>                    ('rush_hour', 'traffic'),
        >>>                    ('weather', 'traffic')
        >>>                    ])
        >>> bn = BayesianNetwork(sm)
        >>> # A created ``BayesianNetwork`` stores nodes and edges defined by the ``StructureModel``
        >>> bn.nodes
        ['rush_hour', 'traffic', 'weather']
        >>>
        >>> bn.edges
        [('rush_hour', 'traffic'), ('weather', 'traffic')]
        >>> # A ``BayesianNetwork`` doesn't store any CPDs yet
        >>> bn.cpds
        {}
        >>>
        >>> # Learn the nodes' states from the data
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>                      'rush_hour': [True, False, False, False, True, False, True],
        >>>                      'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>                      'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>>                      })
        >>> bn = bn.fit_node_states(data)
        >>> bn.node_states
        {'rush_hour': {False, True}, 'weather': {'Bad', 'Good', 'Terrible'}, 'traffic': {'heavy', 'light'}}
        >>> # Learn the CPDs from the data
        >>> bn = bn.fit_cpds(data)
        >>> # Use the learned CPDs to make predictions on the unseen data
        >>> test_data = pd.DataFrame({
        >>>                           'rush_hour': [False, False, True, True],
        >>>                           'weather': ['Good', 'Bad', 'Good', 'Bad']
        >>>                           })
        >>> bn.predict(test_data, "traffic").to_dict()
        {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}}
        >>> bn.predict_probability(test_data, "traffic").to_dict()
        {'traffic_light': {0: 0.75, 1: 0.25, 2: 0.3333333333333333, 3: 0.3333333333333333},
         'traffic_heavy': {0: 0.25, 1: 0.75, 2: 0.6666666666666666, 3: 0.6666666666666666}}
    """

    def __init__(self, structure: StructureModel):
        """
        Build a ``BayesianNetwork`` on top of the DAG described by ``structure``.

        Args:
            structure: graph of causal relationships between variables. It must
                       form a single weakly connected component and contain no cycles.

        Raises:
            ValueError: if the structure has more than one weakly connected
                        component, or if it is not a directed acyclic graph.
        """
        # Connectivity is validated first, so a disconnected cyclic graph
        # reports the component problem rather than the cycle.
        component_count = nx.number_weakly_connected_components(structure)
        if component_count > 1:
            message = (
                f"The given structure has {component_count} separated graph components. "
                "Please make sure it has only one."
            )
            raise ValueError(message)

        is_dag = nx.is_directed_acyclic_graph(structure)
        if not is_dag:
            offending_cycle = nx.find_cycle(structure)
            raise ValueError(
                "The given structure is not acyclic. "
                f"Please review the following cycle: {offending_cycle}"
            )

        # Mapping `{node: {state: index}}`. The node_states setter assigns each
        # node's states consecutive integers starting at zero; this dict is the
        # state -> index lookup.
        self._node_states = {}  # type: Dict[str: Dict[Hashable, int]]
        self._structure = structure

        # pgmpy Bayesian Model mirroring the structure's edges; it holds the
        # CPDs and does the probability fitting.
        self._model = BayesianModel()
        self._model.add_edges_from(structure.edges)

    @property
    def structure(self) -> StructureModel:
        """
        ``StructureModel`` defining the DAG of the Bayesian Network.

        Returns:
            A ``StructureModel`` of the Bayesian Network.
        """
        return self._structure

    @property
    def nodes(self) -> List[str]:
        """
        List of all nodes contained within the Bayesian Network.

        Returns:
            A list of node names.
        """
        return list(self._model.nodes)

    @property
    def node_states(self) -> Dict[str, Set[Hashable]]:
        """
        Dictionary of all states that each node can take.

        Returns:
            A dictionary of node and its possible states, in format of `dict: {node: state}`.
        """
        return {node: set(states.keys()) for node, states in self._node_states.items()}

    @node_states.setter
    def node_states(self, nodes: Dict[str, Set[Hashable]]):
        """
        Set the list of nodes that are contained within the Bayesian Network.
        The states of all nodes must be provided.

        Args:
            nodes: A dictionary of node and its possible states, in format of `dict: {node: state}`.

        Raises:
            ValueError: if a node contains a None state.
            KeyError: if a node is missing.
        """
        missing_feature = set(self.nodes).difference(set(nodes.keys()))

        if missing_feature:
            raise KeyError(
                "The data does not cover all the features found in the Bayesian Network. "
                f"Please check the following features: {missing_feature}"
            )

        self._node_states = {}

        for node, states in nodes.items():
            if any(pd.isnull(list(states))):
                raise ValueError(f"node '{node}' contains None state")

            self._node_states[node] = {v: k for k, v in enumerate(sorted(states))}

    @property
    def edges(self) -> List[Tuple[str, str]]:
        """
        List of all edges contained within the Bayesian Network, as a Tuple(from_node, to_node).

        Returns:
            A list of all edges.
        """
        return list(self._model.edges)

    @property
    def cpds(self) -> Dict[str, pd.DataFrame]:
        """
        Conditional Probability Distributions of each node within the Bayesian Network.

        The row-index of each dataframe is all possible states for the node.
        The col-index of each dataframe is a MultiIndex that describes all possible permutations of parent states.

        For example, for a node :math:`P(A | B, D)`, where
        .. math::
            - A \\in \\text{{"a", "b", "c", "d"}}
            - B \\in \\text{{"x", "y", "z"}}
            - C \\in \\text{{False, True}}

        >>> b         x                   y               z
        >>> d     False     True      False True      False     True
        >>> a
        >>> a  0.265306  0.214286  0.066667  0.25  0.444444  0.000000
        >>> b  0.183673  0.214286  0.200000  0.25  0.222222  0.666667
        >>> c  0.285714  0.285714  0.400000  0.25  0.333333  0.333333
        >>> d  0.265306  0.285714  0.333333  0.25  0.000000  0.000000

        Returns:
            Conditional Probability Distributions of each node within the Bayesian Network.
        """
        cpds = {}

        for cpd in self._model.cpds:
            names = cpd.variables[1:]
            cols = [""]

            if names:
                cols = pd.MultiIndex.from_product(
                    [sorted(self._node_states[var].keys()) for var in names],
                    names=names,
                )

            cpds[cpd.variable] = pd.DataFrame(
                cpd.values.reshape(
                    len(self._node_states[cpd.variable]), max(1, len(cols))
                )
            )
            cpds[cpd.variable][cpd.variable] = sorted(
                self._node_states[cpd.variable].keys()
            )
            cpds[cpd.variable].set_index([cpd.variable], inplace=True)
            cpds[cpd.variable].columns = cols

        return cpds

    def set_cpd(self, node: str, df: pd.DataFrame) -> "BayesianNetwork":
        """
        Provide self-defined CPD to Bayesian Network

        Args:
            node: the node to add self-defined cpd.
            df: self-defined cpd in pandas DataFrame format.

        Returns:
            self

        Raises:
            IndexError: if the index names of the pandas DataFrame does not match the expected DataFrame.
            ValueError: if node does not exist in Bayesian Network or a bad cpd table is provided.
        """
        if node not in self.nodes:
            raise ValueError(f'Non-existing node "{node}"')

        # Check Table
        true_parents = {
            parent_node: self.node_states[parent_node]
            for parent_node in self._structure.predecessors(node)
        }
        table_parents = {
            name: set(df.columns.levels[i].values)
            for i, name in enumerate(df.columns.names)
        }
        if not (
            set(df.index.values) == self.node_states[node]
            and true_parents == table_parents
            and df.index.name == node
        ):
            raise IndexError("Wrong index values. Please check your indices")

        sorted_df = df.reindex(sorted(df.columns), axis=1)
        node_card = len(self.node_states[node])
        evidence, evidence_card = zip(
            *[(key, len(table_parents[key])) for key in sorted(table_parents.keys())]
        )
        tabular_cpd = TabularCPD(
            node,
            node_card,
            sorted_df.values,
            evidence=evidence,
            evidence_card=evidence_card,
        )
        model_copy = copy.deepcopy(self._model)
        model_copy.add_cpds(tabular_cpd)
        model_copy.check_model()

        self._model = model_copy
        return self

    def fit_node_states(self, df: pd.DataFrame) -> "BayesianNetwork":
        """
        Fit all states of nodes that can appear in the data.
        The dataframe provided should contain every possible state (values that can be taken) for every column.

        Args:
            df: data to fit node states from. Each column indicates a node and each row
                an observed combination of states.

        Returns:
            self

        Raises:
            ValueError: if dataframe contains any missing data.
        """
        self.node_states = {c: set(df[c].unique()) for c in df.columns}
        return self

    def _state_to_index(
        self,
        df: pd.DataFrame,
        nodes: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """
        Transforms all values in df to an integer, as defined by the mapping from fit_node_states.

        Args:
            df: data to transform
            nodes: list of nodes to map to index. None means all.

        Returns:
            The transformed dataframe.

        Raises:
            ValueError: if nodes have not been fit, or if column names do not match node names.
        """
        df.is_copy = False
        cols = nodes if nodes else df.columns

        for col in cols:
            df[col] = df[col].map(self._node_states[col])

        df.is_copy = True
        return df

    def fit_cpds(
        self,
        data: pd.DataFrame,
        method: str = "MaximumLikelihoodEstimator",
        bayes_prior: Optional[str] = None,
        equivalent_sample_size: Optional[int] = None,
    ) -> "BayesianNetwork":
        """
        Learn conditional probability distributions for all nodes in the Bayesian Network, conditioned on
        their incoming edges (parents).

        Args:
            data: dataframe containing one column per node in the Bayesian Network.
            method: how to fit probabilities. One of:
                    - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation;
                    - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior.
            bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of:
                         - "K2": shorthand for dirichlet where all pseudo_counts are 1
                                 regardless of variable cardinality;
                         - "BDeu": equivalent of using Dirichlet and using uniform 'pseudo_counts' of
                                   `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
                                   for each node. Use equivelant_sample_size.
            equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.

        Returns:
            self

        Raises:
            ValueError: if an invalid method or bayes_prior is specified.
        """
        state_names = {k: list(v.values()) for k, v in self._node_states.items()}

        transformed_data = data.copy(deep=True)  # type: pd.DataFrame
        transformed_data = self._state_to_index(transformed_data[self.nodes])

        if method == "MaximumLikelihoodEstimator":
            self._model.fit(
                data=transformed_data,
                estimator=MaximumLikelihoodEstimator,
                state_names=state_names,
            )

        elif method == "BayesianEstimator":
            valid_bayes_priors = ["BDeu", "K2"]
            if bayes_prior not in valid_bayes_priors:
                raise ValueError(
                    f"unrecognised bayes_prior, please use one of {valid_bayes_priors}"
                )

            self._model.fit(
                data=transformed_data,
                estimator=BayesianEstimator,
                prior_type=bayes_prior,
                equivalent_sample_size=equivalent_sample_size,
                state_names=state_names,
            )
        else:
            valid_methods = ["MaximumLikelihoodEstimator", "BayesianEstimator"]
            raise ValueError(f"unrecognised method, please use one of {valid_methods}")

        return self

    def fit_node_states_and_cpds(
        self,
        data: pd.DataFrame,
        method: str = "MaximumLikelihoodEstimator",
        bayes_prior: Optional[str] = None,
        equivalent_sample_size: Optional[int] = None,
    ) -> "BayesianNetwork":
        """
        Call `fit_node_states` and then `fit_cpds`.

        Args:
            data: dataframe containing one column per node in the Bayesian Network.
            method: how to fit probabilities. One of:
                    - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation;
                    - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior.
            bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of:
                         - "K2": shorthand for dirichlet where all pseudo_counts are 1
                                 regardless of variable cardinality;
                         - "BDeu": equivalent of using dirichlet and using uniform 'pseudo_counts' of
                                   `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
                                   for each node. Use equivelant_sample_size.
            equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.

        Returns:
            self
        """
        return self.fit_node_states(data).fit_cpds(
            data, method, bayes_prior, equivalent_sample_size
        )

    def add_node(
        self,
        node: str,
        edges_to_add: List[Tuple[str, str]],
        edges_to_remove: List[Tuple[str, str]],
    ) -> "BayesianNetwork":
        """
        Adding a latent variable to the structure model, as well as its corresponding edges

        Args:
            node: Name of the node
            edges_to_add: which edges to add to the structure
            edges_to_remove: which edges to remove from the structure

        Returns:
            self

        Raises:
            ValueError: If lv_name exists in the network or
                if `edges_to_add` include edges NOT containing the latent variable or
                if `edges_to_remove` include edges containing the latent variable
        """
        if any(node not in edges for edges in edges_to_add):
            raise ValueError(f"Should only add edges containing node '{node}'")
        if any(node in edges for edges in edges_to_remove):
            raise ValueError(f"Should only remove edges NOT containing node '{node}'")

        self._structure.add_edges_from(edges_to_add)
        self._structure.remove_edges_from(edges_to_remove)
        self._model.add_edges_from(edges_to_add)
        self._model.remove_edges_from(edges_to_remove)

        return self

    def fit_latent_cpds(  # pylint: disable=too-many-arguments
        self,
        lv_name: str,
        lv_states: List,
        data: pd.DataFrame,
        box_constraints: Optional[Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]] = None,
        priors: Optional[Dict[str, pd.DataFrame]] = None,
        initial_params: Union[str, Dict[str, pd.DataFrame]] = "random",
        non_missing_data_factor: int = 1,
        n_runs: int = 20,
        stopping_delta: float = 0.0,
    ) -> "BayesianNetwork":
        """
        Estimate the CPDs of a latent variable and of its Markov blanket with the EM algorithm.

        The states of the latent variable are registered on the network, the estimation is delegated to
        ``EMSingleLatentVariable`` and the resulting CPDs are added to the underlying model.

        Args:
            lv_name: Latent variable name
            lv_states: the states the LV can assume (non-empty list)
            data: dataframe, must contain all variables in the Markov Blanket of the latent variable. Include one
                column with the latent variable name, filled with np.nan for missing info about LV.
                If some data is present about the LV, create complete columns.
            box_constraints: minimum and maximum values for each model parameter. Specified with a dictionary mapping:
                - Node
                - two dataframes, in order: Min(P(Node|Par(Node))) and Max(P(Node|Par(Node)))
            priors: priors, provided as a mapping Node -> dataframe with Dirichlet priors for P(Node|Par(Node))
            initial_params: way to initialise parameters. Can be:
                - "random": random values (default)
                - "avg": uniform distributions everywhere. Not advised, as it may itself be a stationary point
                - a dictionary of dataframes, which is then used as the initialisation
            non_missing_data_factor:
                This is a weight added to the non-missing data samples. The effect is as if the amount of data
                provided was bigger. Empirically, helps to set the factor to 10 if the non missing data is ~1%
                of the dataset
            n_runs: max number of EM alternations
            stopping_delta: if max difference in current - last iteration CPDS < stopping_delta => convergence reached

        Returns:
            self

        Raises:
            ValueError: if the latent variable name is not a string or
                if the latent variable cannot be found in the network or
                if the latent variable states are not a non-empty list
        """
        if not isinstance(lv_name, str):
            raise ValueError(f"Invalid latent variable name '{lv_name}'")
        if lv_name not in self._structure:
            raise ValueError(f"Latent variable '{lv_name}' not added to the network")
        if not (isinstance(lv_states, list) and lv_states):
            raise ValueError(f"Latent variable '{lv_name}' contains no states")

        # Register the latent variable states as a state -> index mapping, in sorted order
        self._node_states[lv_name] = {
            state: index for index, state in enumerate(sorted(lv_states))
        }

        # Delegate the parameter estimation to the EM implementation
        em_estimator = EMSingleLatentVariable(
            sm=self.structure,
            data=data,
            lv_name=lv_name,
            node_states={
                name: sorted(states) for name, states in self.node_states.items()
            },
            initial_params=initial_params,
            box_constraints=box_constraints,
            priors=priors,
            non_missing_data_factor=non_missing_data_factor,
        )
        em_estimator.run(n_runs=n_runs, stopping_delta=stopping_delta)

        # Convert every estimated table and register it on the model
        self._model.add_cpds(
            *[pd_to_tabular_cpd(table) for table in em_estimator.cpds.values()]
        )

        return self

    def predict(self, data: pd.DataFrame, node: str) -> pd.DataFrame:
        """
        Predict the state of a node from input data, using the Bayesian Network.

        When every parent of ``node`` is a column of ``data`` the prediction is read directly from the
        CPD of ``node``; otherwise the prediction is delegated to the incomplete-data path.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column name {node}_prediction.
        """
        for parent in self._model.get_parents(node):
            if parent not in data.columns:
                # At least one parent is unobserved
                return self._predict_from_incomplete_data(data, node)

        return self._predict_from_complete_data(data, node)

    def _predict_from_complete_data(
        self,
        data: pd.DataFrame,
        node: str,
    ) -> pd.DataFrame:
        """
        Predict state of node given all parents of node exist within data.
        This method inspects the CPD of node directly, since all parent states are known.
        This avoids traversing the full network to compute marginals.
        This method is fast.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column named {node}_prediction.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame

        parents = sorted(self._model.get_parents(node))
        cpd = self.cpds[node]

        transformed_data[f"{node}_prediction"] = transformed_data.apply(
            lambda row: cpd[tuple(row[parent] for parent in parents)].idxmax()
            if parents
            else cpd[""].idxmax(),
            axis=1,
        )
        return transformed_data[[node + "_prediction"]]

    def _predict_from_incomplete_data(
        self,
        data: pd.DataFrame,
        node: str,
    ) -> pd.DataFrame:
        """
        Predict state of node when some parents of node do not exist within data.
        This method uses the pgmpy predict function, which predicts the most likely state for every node
        that is not contained within data.
        With incomplete data, pgmpy goes beyond parents in the network to determine the most likely predictions.
        This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column name {node}_prediction.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame
        self._state_to_index(transformed_data)

        # pgmpy will predict all missing data, so drop column we want to predict
        transformed_data = transformed_data.drop(columns=[node])
        predictions = self._model.predict(transformed_data)[[node]]

        return predictions.rename(columns={node: node + "_prediction"})

    def predict_probability(self, data: pd.DataFrame, node: str) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.

        The fast CPD lookup is used when every parent of ``node`` is a column of ``data``;
        otherwise the incomplete-data path is used.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """
        unobserved_parents = [
            parent
            for parent in self._model.get_parents(node)
            if parent not in data.columns
        ]
        if unobserved_parents:
            return self._predict_probability_from_incomplete_data(data, node)

        return self._predict_probability_from_complete_data(data, node)

    def _predict_probability_from_complete_data(
        self,
        data: pd.DataFrame,
        node: str,
    ) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.
        This method inspects the CPD of node directly, since all parent states are known.
        This avoids traversing the full network to compute marginals.
        This method is fast.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame

        parents = sorted(self._model.get_parents(node))
        cpd = self.cpds[node]

        def lookup_probability(row, s):
            """Retrieve probability from CPD"""
            if parents:
                return cpd[tuple(row[parent] for parent in parents)].loc[s]
            return cpd.at[s, ""]

        for state in self.node_states[node]:
            transformed_data[f"{node}_{state}"] = transformed_data.apply(
                lambda row, st=state: lookup_probability(row, st), axis=1
            )

        return transformed_data[[f"{node}_{state}" for state in self.node_states[node]]]

    def _predict_probability_from_incomplete_data(
        self,
        data: pd.DataFrame,
        node: str,
    ) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.
        This method uses the pgmpy predict_probability function, which predicts the probability
        of every state for every node that is not contained within data.
        With incomplete data, pgmpy goes beyond parents in the network to determine the most likely predictions.
        This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame
        self._state_to_index(transformed_data)

        # pgmpy will predict all missing data, so drop column we want to predict
        transformed_data = transformed_data.drop(columns=[node])

        probability = self._model.predict_probability(
            transformed_data
        )  # type: pd.DataFrame

        # keep only probabilities for the node we are interested in
        cols = []
        pattern = re.compile(f"^{node}_[0-9]+$")

        # disabled open pylint issue (https://github.com/PyCQA/pylint/issues/2962)
        for col in probability.columns:
            if pattern.match(col):
                cols.append(col)

        probability = probability[cols]
        probability.columns = cols
        return probability
Example #29
0
# Extends an existing `model` (created before this excerpt) with three more nodes
# and their CPDs: long_queues, getting_up_late and late_for_school.
# ------------------------------------------------------ ( traffic_jam -> long_queues )
model.add_node('long_queues')
model.add_edge('traffic_jam', 'long_queues')
# One value-table column per state of the single evidence variable; each column sums to 1.0.
cpd_long_queues = TabularCPD('long_queues', 2,
	                         [[0.9, 0.2],
	                          [0.1, 0.8]],
	                          evidence=['traffic_jam'],
	                          evidence_card=[2])
model.add_cpds(cpd_long_queues)


# ------------------------------------------------------ ( getting_up_late -> late_for_school )
# ------------------------------------------------------ ( traffic_jam -> late_for_school )
model.add_nodes_from(['getting_up_late',
	                  'late_for_school'])
model.add_edges_from([('getting_up_late', 'late_for_school'),
	                  ('traffic_jam', 'late_for_school')])

# getting_up_late has no evidence, so its table is a single column.
cpd_getting_up_late = TabularCPD('getting_up_late', 2,
	                             [[0.6], [0.4]])
# NOTE(review): the third column of the table below sums to 1.1 (0.8 + 0.3) while the
# other three columns sum to 1.0 — one of the two entries is presumably a typo and the
# model would likely fail validation; confirm the intended values (0.8/0.2 or 0.7/0.3).
cpd_late_for_school = TabularCPD('late_for_school', 2,
	                             [[0.9, 0.45, 0.8, 0.1],
	                              [0.1, 0.55, 0.3, 0.9]],
	                              evidence=['getting_up_late', 'traffic_jam'],
	                              evidence_card=[2, 2])

model.add_cpds(cpd_getting_up_late, cpd_late_for_school)
model.get_cpds()
# NOTE(review): the four string literals below are bare expression statements with no
# runtime effect. They look like pasted output of `get_cpds()` and list only four CPDs,
# none of the ones added above — confirm whether they should be a comment instead.
" [<TabularCPD representing P(rain: 2)                                               at fsjidfsjdfaskdf>, "
" [<TabularCPD representing P(accident: 2)                                           at fsxfgsdfgfsjdfaskdf>, "
" [<TabularCPD representing P(traffic_jam: 2 | rain:2, accident:2)                   at fsjidf234sjdfaskdf>, "
" [<TabularCPD representing P(long_queues: 2 | traffic_jam:2)                        at fsjidf234sjdfaskdf>, "
Example #30
0
class TestDirectedGraphCPDOperations(unittest.TestCase):
    """
    Checks adding, retrieving and removing CPDs on a ``BayesianModel``.

    Every test works on the graph ``diff -> grade <- intel``. The CPD values are random:
    only the bookkeeping of the CPDs is asserted, not their content.
    """

    def setUp(self):
        self.graph = BayesianModel()

    def tearDown(self):
        del self.graph

    @staticmethod
    def _root_cpd(variable):
        """Create a random binary CPD for a variable without evidence."""
        return TabularCPD(variable, 2, values=np.random.rand(2, 1))

    @staticmethod
    def _grade_cpd():
        """Create a random binary CPD for 'grade' given 'diff' and 'intel'."""
        return TabularCPD('grade',
                          2,
                          values=np.random.rand(2, 4),
                          evidence=['diff', 'intel'],
                          evidence_card=[2, 2])

    def _connect_grade(self):
        """Add the two edges pointing at 'grade' to the graph."""
        self.graph.add_edges_from([('diff', 'grade'), ('intel', 'grade')])

    def _populate(self):
        """Create the three CPDs, add the edges and register all CPDs on the graph."""
        diff = self._root_cpd('diff')
        intel = self._root_cpd('intel')
        grade = self._grade_cpd()
        self._connect_grade()
        self.graph.add_cpds(diff, intel, grade)
        return diff, intel, grade

    def test_add_single_cpd(self):
        grade = self._grade_cpd()
        self._connect_grade()
        self.graph.add_cpds(grade)
        self.assertListEqual(self.graph.get_cpds(), [grade])

    def test_add_multiple_cpds(self):
        diff, intel, grade = self._populate()
        self.assertListEqual(self.graph.get_cpds(), [diff, intel, grade])

    def test_remove_single_cpd(self):
        diff, intel, grade = self._populate()
        self.graph.remove_cpds(diff)
        self.assertListEqual(self.graph.get_cpds(), [intel, grade])

    def test_remove_multiple_cpds(self):
        diff, intel, grade = self._populate()
        self.graph.remove_cpds(diff, grade)
        self.assertListEqual(self.graph.get_cpds(), [intel])

    def test_remove_single_cpd_string(self):
        _, intel, grade = self._populate()
        self.graph.remove_cpds('diff')
        self.assertListEqual(self.graph.get_cpds(), [intel, grade])

    def test_remove_multiple_cpds_string(self):
        _, intel, _ = self._populate()
        self.graph.remove_cpds('diff', 'grade')
        self.assertListEqual(self.graph.get_cpds(), [intel])

    def test_get_cpd_for_node(self):
        expected = dict(zip(('diff', 'intel', 'grade'), self._populate()))
        for variable in ('diff', 'intel', 'grade'):
            self.assertEqual(self.graph.get_cpds(variable), expected[variable])

    def test_get_cpd_raises_error(self):
        self._populate()
        # 'sat' is not a node of the graph
        with self.assertRaises(ValueError):
            self.graph.get_cpds('sat')
Example #31
0
class BayesianNetwork:
    """
    Base class for Bayesian Network (BN), a probabilistic weighted DAG where nodes represent variables,
    edges represent the causal relationships between variables.

    ``BayesianNetwork`` stores nodes with their possible states, edges and
    conditional probability distributions (CPDs) of each node.

    ``BayesianNetwork`` is built on top of the ``StructureModel``, which is an extension of ``networkx.DiGraph``
    (see :func:`causalnex.structure.structuremodel.StructureModel`).

    In order to define the ``BayesianNetwork``, users should provide a relevant ``StructureModel``.
    Once ``BayesianNetwork`` is initialised, no changes to the ``StructureModel`` can be made
    and CPDs can be learned from the data.

    The learned CPDs can be then used for likelihood estimation and predictions.

    Example:
    ::
        >>> # Create a Bayesian Network with a manually defined DAG.
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>                    ('rush_hour', 'traffic'),
        >>>                    ('weather', 'traffic')
        >>>                    ])
        >>> bn = BayesianNetwork(sm)
        >>> # A created ``BayesianNetwork`` stores nodes and edges defined by the ``StructureModel``
        >>> bn.nodes
        ['rush_hour', 'traffic', 'weather']
        >>>
        >>> bn.edges
        [('rush_hour', 'traffic'), ('weather', 'traffic')]
        >>> # A ``BayesianNetwork`` doesn't store any CPDs yet
        >>> bn.cpds
        {}
        >>>
        >>> # Learn the nodes' states from the data
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>                      'rush_hour': [True, False, False, False, True, False, True],
        >>>                      'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>                      'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>>                      })
        >>> bn = bn.fit_node_states(data)
        >>> bn.node_states
        {'rush_hour': {False, True}, 'weather': {'Bad', 'Good', 'Terrible'}, 'traffic': {'heavy', 'light'}}
        >>> # Learn the CPDs from the data
        >>> bn = bn.fit_cpds(data)
        >>> # Use the learned CPDs to make predictions on the unseen data
        >>> test_data = pd.DataFrame({
        >>>                           'rush_hour': [False, False, True, True],
        >>>                           'weather': ['Good', 'Bad', 'Good', 'Bad']
        >>>                           })
        >>> bn.predict(test_data, "traffic").to_dict()
        {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}}
        >>> bn.predict_probability(test_data, "traffic").to_dict()
        {'traffic_light': {0: 0.75, 1: 0.25, 2: 0.3333333333333333, 3: 0.3333333333333333},
         'traffic_heavy': {0: 0.25, 1: 0.75, 2: 0.6666666666666666, 3: 0.6666666666666666}}
    """
    def __init__(self, structure: StructureModel):
        """
        Build a ``BayesianNetwork`` on top of the DAG described by a ``StructureModel``.

        Args:
            structure: a graph representing a causal relationship between variables.
                       In the structure
                           - cycles are not allowed;
                           - multiple (parallel) edges are not allowed;
                           - isolated nodes and multiple components are not allowed.

        Raises:
            ValueError: If the structure has more than one weakly connected component,
                or if it contains a cycle.
        """
        component_count = nx.number_weakly_connected_components(structure)
        if component_count > 1:
            message = (
                "The given structure has {n_components} separated graph components. "
                "Please make sure it has only one.")
            raise ValueError(message.format(n_components=component_count))

        if not nx.is_directed_acyclic_graph(structure):
            # Report one offending cycle to help the user fix the structure
            message = "The given structure is not acyclic. Please review the following cycle: {cycle}"
            raise ValueError(message.format(cycle=nx.find_cycle(structure)))

        self._structure = structure

        # _node_states maps every node to a `{state: index}` dict once states have been fitted.
        # Underlying libraries expect all states to be integers from zero, so this mapping
        # is used to convert state -> idx, and back from idx -> state as required.
        self._node_states = None  # type: Dict[str, Dict[Hashable, int]]

        # _model is a pgmpy Bayesian Model, used for probability fitting and predictions.
        pgmpy_model = BayesianModel()
        pgmpy_model.add_edges_from(structure.edges)
        self._model = pgmpy_model

    @property
    def structure(self) -> StructureModel:
        """
        ``StructureModel`` defining the DAG of the Bayesian Network.

        The stored object itself is returned, not a copy.

        Returns:
            A ``StructureModel`` of the Bayesian Network.
        """
        return self._structure

    @property
    def nodes(self) -> List[str]:
        """
        All nodes of the Bayesian Network, as reported by the underlying model.

        Returns:
            A new list of node names.
        """
        return [*self._model.nodes]

    @property
    def node_states(self) -> Dict[str, Set[Hashable]]:
        """
        All states that each node can take.

        Returns:
            A dictionary mapping each node to the set of its possible states.
        """
        states_by_node = {}
        for name, state_to_index in self._node_states.items():
            states_by_node[name] = set(state_to_index)
        return states_by_node

    @node_states.setter
    def node_states(self, nodes: Dict[str, Set[Hashable]]):
        """
        Set the possible states of the nodes of the Bayesian Network.
        The states of all nodes must be provided. Internally, the states of each node are
        sorted and stored as a `{state: index}` mapping.

        Args:
            nodes: A dictionary of node and its possible states, in format of `dict: {node: state}`.

        Raises:
            ValueError: if a node contains a null state.
            KeyError: if a node of the network is missing from `nodes`.
        """
        uncovered = set(self.nodes) - set(nodes)
        if uncovered:
            message = (
                "The data does not cover all the features found in the Bayesian Network. "
                "Please check the following features: {nodes}")
            raise KeyError(message.format(nodes=uncovered))

        for name, states in nodes.items():
            if any(pd.isnull(list(states))):
                raise ValueError(
                    "node '{node}' contains None state".format(node=name))

        # Only replace the stored mapping once every node has been encoded
        encoded = {}
        for name in nodes:
            encoded[name] = {
                state: position
                for position, state in enumerate(sorted(nodes[name]))
            }
        self._node_states = encoded

    @property
    def edges(self) -> List[Tuple[str, str]]:
        """
        All edges of the Bayesian Network, as reported by the underlying model.

        Returns:
            A new list of edges, each a Tuple(from_node, to_node).
        """
        return [*self._model.edges]

    @property
    def cpds(self) -> Dict[str, pd.DataFrame]:
        """
        Conditional Probability Distributions of each node within the Bayesian Network.

        The row-index of each dataframe is all possible states for the node, in sorted order.
        The col-index of each dataframe is a MultiIndex that describes all possible permutations of parent states;
        a node without parents has a single column named "".

        For example, for a node :math:`P(A | B, D)`, where
        .. math::
            - A \\in \\text{{"a", "b", "c", "d"}}
            - B \\in \\text{{"x", "y", "z"}}
            - D \\in \\text{{False, True}}

        >>> b         x                   y               z
        >>> d     False     True      False True      False     True
        >>> a
        >>> a  0.265306  0.214286  0.066667  0.25  0.444444  0.000000
        >>> b  0.183673  0.214286  0.200000  0.25  0.222222  0.666667
        >>> c  0.285714  0.285714  0.400000  0.25  0.333333  0.333333
        >>> d  0.265306  0.285714  0.333333  0.25  0.000000  0.000000

        Returns:
            A dictionary mapping each node to the dataframe of its Conditional Probability Distribution.
        """
        tables = {}
        for cpd in self._model.cpds:
            variable = cpd.variable
            parents = cpd.variables[1:]

            # One list of sorted states per parent; empty for a node without parents
            parent_states = [
                sorted(self._node_states[parent].keys()) for parent in parents
            ]
            if parent_states:
                cols = pd.MultiIndex.from_product(parent_states, names=parents)
            else:
                cols = [""]

            # Flatten the values into a (node states) x (parent state combinations) table
            table = pd.DataFrame(
                cpd.values.reshape(len(self._node_states[variable]),
                                   max(1, len(cols))))
            tables[variable] = table
            table[variable] = sorted(self._node_states[variable].keys())
            table.set_index([variable], inplace=True)
            table.columns = cols

        return tables

    def fit_node_states(self, df: pd.DataFrame) -> "BayesianNetwork":
        """
        Learn the possible states of every node from the unique values of each column.
        The dataframe provided should contain every possible state (values that can be taken) for every column.

        The collected states are assigned through ``node_states``, which performs the validation.

        Args:
            df: data to fit node states from. Each column indicates a node and each row
                an observed combination of states.

        Returns:
            self

        Raises:
            ValueError: if dataframe contains any missing data (raised when assigning ``node_states``).
        """
        observed_states = {}
        for column in df.columns:
            observed_states[column] = set(df[column].unique())
        self.node_states = observed_states

        return self

    def _state_to_index(self,
                        df: pd.DataFrame,
                        nodes: List[str] = None) -> pd.DataFrame:
        """
        Transforms all values in df to an integer, as defined by the mapping from fit_node_states.

        Args:
            df: data to transform
            nodes: list of nodes to map to index. None means all.

        Returns:
            The transformed dataframe.

        Raises:
            ValueError: if nodes have not been fit, or if column names do not match node names.
        """

        df.is_copy = False
        cols = nodes if nodes else df.columns
        for col in cols:
            df[col] = df[col].map(self._node_states[col])
        df.is_copy = True
        return df

    def fit_cpds(
        self,
        data: pd.DataFrame,
        method: str = "MaximumLikelihoodEstimator",
        bayes_prior: str = None,
        equivalent_sample_size: int = None,
    ) -> "BayesianNetwork":
        """
        Learn the conditional probability distribution of every node in the Bayesian Network,
        conditioned on its incoming edges (parents).

        The data is copied, restricted to the nodes of the network and converted to state indices
        before being passed to the underlying model.

        Args:
            data: dataframe containing one column per node in the Bayesian Network.
            method: how to fit probabilities. One of:
                    - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation;
                    - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior.
            bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of:
                         - "K2": shorthand for dirichlet where all pseudo_counts are 1
                                 regardless of variable cardinality;
                         - "BDeu": equivalent of using Dirichlet and using uniform 'pseudo_counts' of
                                   `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
                                   for each node. Use equivalent_sample_size.
            equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.

        Returns:
            self

        Raises:
            ValueError: if an invalid method or bayes_prior is specified.

        """
        # The model is fitted on state indices, so the state names are the indices themselves
        index_names = {}
        for node, state_to_index in self._node_states.items():
            index_names[node] = list(state_to_index.values())

        encoded = self._state_to_index(data.copy(deep=True)[self.nodes])

        fit_kwargs = {"data": encoded}
        if method == "MaximumLikelihoodEstimator":
            fit_kwargs["estimator"] = MaximumLikelihoodEstimator
        elif method == "BayesianEstimator":
            valid_bayes_priors = ["BDeu", "K2"]
            if bayes_prior not in valid_bayes_priors:
                raise ValueError(
                    "unrecognised bayes_prior, please use on of %s" %
                    " ".join(valid_bayes_priors))
            fit_kwargs["estimator"] = BayesianEstimator
            fit_kwargs["prior_type"] = bayes_prior
            fit_kwargs["equivalent_sample_size"] = equivalent_sample_size
        else:
            valid_methods = ["MaximumLikelihoodEstimator", "BayesianEstimator"]
            raise ValueError("unrecognised method, please use on of %s" %
                             " ".join(valid_methods))
        fit_kwargs["state_names"] = index_names

        self._model.fit(**fit_kwargs)
        return self

    def fit_node_states_and_cpds(
        self,
        data: pd.DataFrame,
        method: str = "MaximumLikelihoodEstimator",
        bayes_prior: str = None,
        equivalent_sample_size: int = None,
    ) -> "BayesianNetwork":
        """
        Convenience wrapper: call `fit_node_states` and then `fit_cpds` on the same data.

        Args:
            data: dataframe containing one column per node in the Bayesian Network.
            method: how to fit probabilities. One of:
                    - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation;
                    - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior.
            bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of:
                         - "K2": shorthand for dirichlet where all pseudo_counts are 1
                                 regardless of variable cardinality;
                         - "BDeu": equivalent of using dirichlet and using uniform 'pseudo_counts' of
                                   `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
                                   for each node. Use equivalent_sample_size.
            equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.

        Returns:
            self
        """
        # fit_cpds is called on whatever fit_node_states returns (the network itself)
        network = self.fit_node_states(data)
        return network.fit_cpds(data, method, bayes_prior, equivalent_sample_size)

    def predict(self, data: pd.DataFrame, node: str) -> pd.DataFrame:
        """
        Predict the state of a node from input data, using the Bayesian Network.

        The CPD of the node is inspected directly when all of its parents are columns of the data;
        otherwise the prediction is delegated to the incomplete-data path.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column name {node}_prediction.
        """
        parents = self._model.get_parents(node)
        some_parent_unobserved = any(parent not in data.columns
                                     for parent in parents)

        if some_parent_unobserved:
            return self._predict_from_incomplete_data(data, node)

        return self._predict_from_complete_data(data, node)

    def _predict_from_complete_data(self, data: pd.DataFrame,
                                    node: str) -> pd.DataFrame:
        """
        Predicts state of node given all parents of node exist within data.
        This method inspects the CPD of node directly, since all parent states are known.
        This avoids traversing the full network to compute marginals.
        This method is fast.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column named {node}_prediction.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame

        parents = sorted(self._model.get_parents(node))
        cpd = self.cpds[node]

        transformed_data["{node}_prediction".format(
            node=node)] = transformed_data.apply(
                lambda row: cpd[tuple(row[parent]
                                      for parent in parents)].idxmax()
                if parents else cpd[""].idxmax(),
                axis=1,
            )
        return transformed_data[[node + "_prediction"]]

    def _predict_from_incomplete_data(self, data: pd.DataFrame,
                                      node: str) -> pd.DataFrame:
        """
        Predicts state of node when some parents of node do not exist within data.
        This method uses the pgmpy predict function, which predicts the most likely state for every node
        that is not contained within data.
        With incomplete data, pgmpy goes beyond parents in the network to determine the most likely predictions.
        This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column name {node}_prediction.
        """

        transformed_data = deepcopy(data)  # type: pd.DataFrame
        self._state_to_index(transformed_data)

        # transformed_data.is_copy()

        # pgmpy will predict all missing data, so drop column we want to predict
        transformed_data = transformed_data.drop(columns=[node])

        predictions = self._model.predict(transformed_data)[[node]]

        return predictions.rename(columns={node: node + "_prediction"})

    def predict_probability(self, data: pd.DataFrame,
                            node: str) -> pd.DataFrame:
        """
        Predict the probability of every possible state of a node for each input row.

        Dispatches to the direct CPD lookup when every parent of `node` is a column
        of `data`, and to the full-network inference otherwise.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, containing one column per possible state,
            named {node}_{state}.
        """
        parents = self._model.get_parents(node)

        # Any parent column missing from the data forces inference over the network.
        if any(parent not in data.columns for parent in parents):
            return self._predict_probability_from_incomplete_data(data, node)

        return self._predict_probability_from_complete_data(data, node)

    def _predict_probability_from_complete_data(self, data: pd.DataFrame,
                                                node: str) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.
        This method inspects the CPD of node directly, since all parent states are known.
        This avoids traversing the full network to compute marginals.
        This method is fast.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame

        parents = sorted(self._model.get_parents(node))
        cpd = self.cpds[node]

        def lookup_probability(row, s):
            """Retrieve probability from CPD"""
            if parents:
                return cpd[tuple(row[parent] for parent in parents)].loc[s]
            return cpd.at[s, ""]

        for state in self.node_states[node]:
            transformed_data["{n}_{s}".format(
                n=node, s=state)] = transformed_data.apply(
                    lambda row, st=state: lookup_probability(row, st), axis=1)

        return transformed_data[[
            "{n}_{s}".format(n=node, s=state)
            for state in self.node_states[node]
        ]]

    def _predict_probability_from_incomplete_data(self, data: pd.DataFrame,
                                                  node: str) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.
        This method uses the pgmpy predict_probability function, which predicts the probability
        of every state for every node that is not contained within data.
        With incomplete data, pgmpy goes beyond parents in the network to determine the most likely predictions.
        This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame
        self._state_to_index(transformed_data)

        # pgmpy will predict all missing data, so drop column we want to predict
        transformed_data = transformed_data.drop(columns=[node])

        probability = self._model.predict_probability(
            transformed_data)  # type: pd.DataFrame

        # keep only probabilities for the node we are interested in
        cols = []
        pattern = re.compile("^{node}_[0-9]+$".format(node=node))
        # disabled open pylint issue (https://github.com/PyCQA/pylint/issues/2962)
        for col in probability.columns:
            if pattern.match(col):
                cols.append(col)
        probability = probability[cols]
        probability.columns = cols

        return probability
Example #32
0
# pr = {}
# data = pd.read_csv('data.csv') #"fisrm.csv"
# data_size = len(data)
# Build the network structure: three stages (i = 0, 1, 2), each with its own
# DI<i>, DFT<i>, RD<i> and DFO<i> nodes; TQ and OU are shared parents across stages.
model = BayesianModel()
list_edges = []
for i in range(3):
    list_edges += [('DI' + str(i), 'DFT' + str(i)),
                ('TQ', 'DFT' + str(i)),
                ('DI' + str(i), 'RD' + str(i)),
                ('DFT' + str(i), 'RD' + str(i)),
                ('RD' + str(i), 'DFO' + str(i)),
                ('OU', 'DFO' + str(i))]

# Chain the stages together (RD of one stage feeds DI of the next) and attach
# the DPQ and C parents to the first stage's DI node.
list_edges += [('RD0', 'DI1'), ('RD1', 'DI2'), ('DPQ', 'DI0'), ('C', 'DI0')]

model.add_edges_from(list_edges)
# NOTE(review): `data` is only defined in the commented-out lines above this block;
# confirm it is provided elsewhere before running.
model.fit(data, estimator_type = BayesianEstimator, prior_type = "BDeu", equivalent_sample_size = 10)
# Dump every edge, each followed by a blank line.
for edge in model.edges():
    print(edge)
    print("\n")
infer = VariableElimination(model)

nodes = model.nodes()
Distribution = {}

# NOTE(review): `pr` is only defined in a commented-out line above; confirm its source.
for key in pr.keys():
    # Indicator vector of length 5: entry i is 1 where pr[key] == i and 0 elsewhere.
    Distribution[key] = [1 - abs(np.sign(pr[key] - i)) for i in range(5)]
    # NOTE(review): requires `nodes` to support .remove() (i.e. be a mutable list) --
    # verify against the installed networkx version.
    nodes.remove(key)
    print('pr done')

for key in nodes:
Example #33
0
class TestGibbsSampling(unittest.TestCase):
    """Tests for GibbsSampling built from Bayesian and Markov models."""

    def setUp(self):
        """Create a small Bayesian model, a small Markov model and a sampler."""
        # A test Bayesian model
        diff_cpd = TabularCPD('diff', 2, [[0.6], [0.4]])
        intel_cpd = TabularCPD('intel', 2, [[0.7], [0.3]])
        grade_cpd = TabularCPD('grade', 3, [[0.3, 0.05, 0.9, 0.5], [0.4, 0.25, 0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
                               evidence=['diff', 'intel'], evidence_card=[2, 2])
        self.bayesian_model = BayesianModel()
        self.bayesian_model.add_nodes_from(['diff', 'intel', 'grade'])
        self.bayesian_model.add_edges_from([('diff', 'grade'), ('intel', 'grade')])
        self.bayesian_model.add_cpds(diff_cpd, intel_cpd, grade_cpd)

        # A test Markov model
        self.markov_model = MarkovModel([('A', 'B'), ('C', 'B'), ('B', 'D')])
        factor_ab = Factor(['A', 'B'], [2, 3], [1, 2, 3, 4, 5, 6])
        factor_cb = Factor(['C', 'B'], [4, 3], [3, 1, 4, 5, 7, 8, 1, 3, 10, 4, 5, 6])
        factor_bd = Factor(['B', 'D'], [3, 2], [5, 7, 2, 1, 9, 3])
        self.markov_model.add_factors(factor_ab, factor_cb, factor_bd)

        self.gibbs = GibbsSampling(self.bayesian_model)

    def tearDown(self):
        """Drop every fixture created in setUp, including the sampler."""
        del self.bayesian_model
        del self.markov_model
        del self.gibbs

    @patch('pgmpy.inference.Sampling.GibbsSampling._get_kernel_from_bayesian_model', autospec=True)
    @patch('pgmpy.models.MarkovChain.__init__', autospec=True)
    def test_init_bayesian_model(self, init, get_kernel):
        """A Bayesian model argument is routed to the Bayesian kernel builder."""
        model = MagicMock(spec_set=BayesianModel)
        gibbs = GibbsSampling(model)
        init.assert_called_once_with(gibbs)
        get_kernel.assert_called_once_with(gibbs, model)

    @patch('pgmpy.inference.Sampling.GibbsSampling._get_kernel_from_markov_model', autospec=True)
    def test_init_markov_model(self, get_kernel):
        """A Markov model argument is routed to the Markov kernel builder."""
        model = MagicMock(spec_set=MarkovModel)
        gibbs = GibbsSampling(model)
        get_kernel.assert_called_once_with(gibbs, model)

    def test_get_kernel_from_bayesian_model(self):
        """Variables and cardinalities are taken from the Bayesian model."""
        gibbs = GibbsSampling()
        gibbs._get_kernel_from_bayesian_model(self.bayesian_model)
        # list(...) on both sides: nodes() is not a list on networkx >= 2.
        self.assertListEqual(list(gibbs.variables), list(self.bayesian_model.nodes()))
        self.assertDictEqual(gibbs.cardinalities, {'diff': 2, 'intel': 2, 'grade': 3})

    def test_get_kernel_from_markov_model(self):
        """Variables and cardinalities are taken from the Markov model."""
        gibbs = GibbsSampling()
        gibbs._get_kernel_from_markov_model(self.markov_model)
        # list(...) on both sides: nodes() is not a list on networkx >= 2.
        self.assertListEqual(list(gibbs.variables), list(self.markov_model.nodes()))
        self.assertDictEqual(gibbs.cardinalities, {'A': 2, 'B': 3, 'C': 4, 'D': 2})

    def test_sample(self):
        """sample() returns one row per draw and one column per variable."""
        start_state = [State('diff', 0), State('intel', 0), State('grade', 0)]
        sample = self.gibbs.sample(start_state, 2)
        # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual(len(sample), 2)
        self.assertEqual(len(sample.columns), 3)
        self.assertIn('diff', sample.columns)
        self.assertIn('intel', sample.columns)
        self.assertIn('grade', sample.columns)
        self.assertTrue(set(sample['diff']).issubset({0, 1}))
        self.assertTrue(set(sample['intel']).issubset({0, 1}))
        self.assertTrue(set(sample['grade']).issubset({0, 1, 2}))

    @patch("pgmpy.inference.Sampling.GibbsSampling.random_state", autospec=True)
    def test_sample_less_arg(self, random_state):
        """Without a start state, sample() asks random_state() for one."""
        self.gibbs.state = None
        random_state.return_value = [State('diff', 0), State('intel', 0), State('grade', 0)]
        sample = self.gibbs.sample(size=2)
        random_state.assert_called_once_with(self.gibbs)
        self.assertEqual(len(sample), 2)

    def test_generate_sample(self):
        """generate_sample() yields `size` states covering all variables."""
        start_state = [State('diff', 0), State('intel', 0), State('grade', 0)]
        gen = self.gibbs.generate_sample(start_state, 2)
        samples = list(gen)
        self.assertEqual(len(samples), 2)
        self.assertEqual({samples[0][0].var, samples[0][1].var, samples[0][2].var}, {'diff', 'intel', 'grade'})
        self.assertEqual({samples[1][0].var, samples[1][1].var, samples[1][2].var}, {'diff', 'intel', 'grade'})

    @patch("pgmpy.inference.Sampling.GibbsSampling.random_state", autospec=True)
    def test_generate_sample_less_arg(self, random_state):
        """Without a start state, generate_sample() asks random_state() for one."""
        self.gibbs.state = None
        gen = self.gibbs.generate_sample(size=2)
        samples = list(gen)
        random_state.assert_called_once_with(self.gibbs)
        self.assertEqual(len(samples), 2)
Example #34
0
Created on Oct 27, 2017

@author: Adele
'''

import numpy as np
import pandas

# Load the passenger table and keep only the three modelled columns.
data = pandas.read_csv("kaggle.csv")
data2 = data[["Survived", "Sex", "Pclass"]]

# Random row-wise split: True marks a training row (drawn with probability 0.8).
intrain = np.random.rand(len(data2)) < 0.8
dtrain, dtest = data2[intrain], data2[~intrain]

from pgmpy.models import BayesianModel

# Sex and Pclass are both parents of Survived.
titanic = BayesianModel()
titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
titanic.fit(dtrain)

# Show every learned conditional probability table.
for cpd in titanic.get_cpds():
    print(cpd)

# Show the held-out evidence columns, then predict the missing Survived column
# (the prediction result itself is not printed or stored).
print(dtest[["Sex", "Pclass"]])
titanic.predict(dtest[["Sex", "Pclass"]])
    def setUp(self):
        """Build a six-node discrete model with CPDs and node properties, and an XBN writer for it."""

        def describe(description, ypos, xpos):
            """Return the property dict of one two-state discrete node."""
            return {
                "STATES": ["Present", "Absent"],
                "DESCRIPTION": description,
                "YPOS": ypos,
                "XPOS": xpos,
                "TYPE": "discrete",
            }

        # Insertion order of the nodes is kept exactly as c, a, b, e, f, d.
        nodes = {
            "c": describe("(c) Brain Tumor", "11935", "15250"),
            "a": describe("(a) Metastatic Cancer", "10465", "13495"),
            "b": describe("(b) Serum Calcium Increase", "11965", "11290"),
            "e": describe("(e) Papilledema", "13240", "17305"),
            "f": describe("(f) Asthma", "10489", "13440"),
            "d": describe("(d) Coma", "12985", "13960"),
        }

        model = BayesianModel()
        model.add_nodes_from(["a", "b", "c", "d", "e", "f"])
        model.add_edges_from([("b", "d"), ("a", "b"), ("a", "c"), ("c", "d"),
                              ("c", "e")])

        # Probability tables; CONDSET/CARDINALITY are present only for nodes with parents.
        cpd_distribution = {
            "a": {
                "TYPE": "discrete",
                "DPIS": np.array([[0.2, 0.8]]),
            },
            "e": {
                "TYPE": "discrete",
                "DPIS": np.array([[0.8, 0.2], [0.6, 0.4]]),
                "CONDSET": ["c"],
                "CARDINALITY": [2],
            },
            "f": {
                "TYPE": "discrete",
                "DPIS": np.array([[0.3, 0.7]]),
            },
            "b": {
                "TYPE": "discrete",
                "DPIS": np.array([[0.8, 0.2], [0.2, 0.8]]),
                "CONDSET": ["a"],
                "CARDINALITY": [2],
            },
            "c": {
                "TYPE": "discrete",
                "DPIS": np.array([[0.2, 0.8], [0.05, 0.95]]),
                "CONDSET": ["a"],
                "CARDINALITY": [2],
            },
            "d": {
                "TYPE": "discrete",
                "DPIS": np.array([[0.8, 0.2], [0.9, 0.1], [0.7, 0.3],
                                  [0.05, 0.95]]),
                "CONDSET": ["b", "c"],
                "CARDINALITY": [2, 2],
            },
        }

        # One TabularCPD per entry, in the dict's insertion order.
        tabular_cpds = [
            TabularCPD(var,
                       len(nodes[var]["STATES"]),
                       spec["DPIS"],
                       evidence=spec.get("CONDSET", []),
                       evidence_card=spec.get("CARDINALITY", []))
            for var, spec in cpd_distribution.items()
        ]
        model.add_cpds(*tabular_cpds)

        # Attach the node properties; the attribute written to depends on the
        # networkx major version.
        legacy_networkx = nx.__version__.startswith("1")
        for var, properties in nodes.items():
            if legacy_networkx:
                model.nodes[var] = properties
            else:
                model._node[var] = properties

        self.maxDiff = None
        self.writer = XMLBeliefNetwork.XBNWriter(model=model)
Example #36
0
class TestBaseModelCreation(unittest.TestCase):
    """Tests for building a BayesianModel: nodes, edges, cycle and self-loop checks.

    Results of nodes(), edges() and predecessors() are materialised with list()
    before assertListEqual, because they are views/iterators rather than lists on
    networkx >= 2; list() is a no-op copy where they already are lists.
    """

    def setUp(self):
        """Start every test from an empty model."""
        self.G = BayesianModel()

    def test_class_init_without_data(self):
        """An empty model is still a directed graph."""
        self.assertIsInstance(self.G, nx.DiGraph)

    def test_class_init_with_data_string(self):
        """Edges passed to the constructor create the nodes and edges."""
        self.g = BayesianModel([('a', 'b'), ('b', 'c')])
        self.assertListEqual(sorted(self.g.nodes()), ['a', 'b', 'c'])
        self.assertListEqual(hf.recursive_sorted(self.g.edges()),
                             [['a', 'b'], ['b', 'c']])

    def test_class_init_with_data_nonstring(self):
        """Non-string node labels are accepted by the constructor."""
        BayesianModel([(1, 2), (2, 3)])

    def test_add_node_string(self):
        """A single string node can be added."""
        self.G.add_node('a')
        self.assertListEqual(list(self.G.nodes()), ['a'])

    def test_add_node_nonstring(self):
        """A single non-string node can be added."""
        self.G.add_node(1)

    def test_add_nodes_from_string(self):
        """Several string nodes can be added at once."""
        self.G.add_nodes_from(['a', 'b', 'c', 'd'])
        self.assertListEqual(sorted(self.G.nodes()), ['a', 'b', 'c', 'd'])

    def test_add_nodes_from_non_string(self):
        """Several non-string nodes can be added at once."""
        self.G.add_nodes_from([1, 2, 3, 4])

    def test_add_edge_string(self):
        """Adding an edge creates its endpoints; later edges accumulate."""
        self.G.add_edge('d', 'e')
        self.assertListEqual(sorted(self.G.nodes()), ['d', 'e'])
        self.assertListEqual(list(self.G.edges()), [('d', 'e')])
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edge('a', 'b')
        self.assertListEqual(hf.recursive_sorted(self.G.edges()),
                             [['a', 'b'], ['d', 'e']])

    def test_add_edge_nonstring(self):
        """An edge between non-string nodes can be added."""
        self.G.add_edge(1, 2)

    def test_add_edge_selfloop(self):
        """A self-loop is rejected with ValueError."""
        self.assertRaises(ValueError, self.G.add_edge, 'a', 'a')

    def test_add_edge_result_cycle(self):
        """An edge that would close a cycle is rejected with ValueError."""
        self.G.add_edges_from([('a', 'b'), ('a', 'c')])
        self.assertRaises(ValueError, self.G.add_edge, 'c', 'a')

    def test_add_edges_from_string(self):
        """Edges added in bulk create their endpoints and accumulate."""
        self.G.add_edges_from([('a', 'b'), ('b', 'c')])
        self.assertListEqual(sorted(self.G.nodes()), ['a', 'b', 'c'])
        self.assertListEqual(hf.recursive_sorted(self.G.edges()),
                             [['a', 'b'], ['b', 'c']])
        self.G.add_nodes_from(['d', 'e', 'f'])
        self.G.add_edges_from([('d', 'e'), ('e', 'f')])
        self.assertListEqual(sorted(self.G.nodes()),
                             ['a', 'b', 'c', 'd', 'e', 'f'])
        self.assertListEqual(
            hf.recursive_sorted(self.G.edges()),
            hf.recursive_sorted([('a', 'b'), ('b', 'c'), ('d', 'e'),
                                 ('e', 'f')]))

    def test_add_edges_from_nonstring(self):
        """Edges between non-string nodes can be added in bulk."""
        self.G.add_edges_from([(1, 2), (2, 3)])

    def test_add_edges_from_self_loop(self):
        """A self-loop in a bulk add is rejected with ValueError."""
        self.assertRaises(ValueError, self.G.add_edges_from, [('a', 'a')])

    def test_add_edges_from_result_cycle(self):
        """A cycle within a bulk add is rejected with ValueError."""
        self.assertRaises(ValueError, self.G.add_edges_from, [('a', 'b'),
                                                              ('b', 'c'),
                                                              ('c', 'a')])

    def test_update_node_parents_bm_constructor(self):
        """Parents are tracked for edges given to the constructor."""
        self.g = BayesianModel([('a', 'b'), ('b', 'c')])
        self.assertListEqual(list(self.g.predecessors('a')), [])
        self.assertListEqual(list(self.g.predecessors('b')), ['a'])
        self.assertListEqual(list(self.g.predecessors('c')), ['b'])

    def test_update_node_parents(self):
        """Parents are tracked for edges added after construction."""
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edges_from([('a', 'b'), ('b', 'c')])
        self.assertListEqual(list(self.G.predecessors('a')), [])
        self.assertListEqual(list(self.G.predecessors('b')), ['a'])
        self.assertListEqual(list(self.G.predecessors('c')), ['b'])

    def tearDown(self):
        """Drop the per-test model."""
        del self.G
Example #37
0
from pgmpy.models import BayesianModel

# Directed edges (parent, child) of the network, listed in insertion order.
_EDGES = [
    ('BirthAsphyxia', 'Disease'),
    ('Disease', 'LVH'),
    ('LVH', 'LVHreport'),
    ('Disease', 'DuctFlow'),
    ('DuctFlow', 'HypDistrib'),
    ('HypDistrib', 'LowerBodyO2'),
    ('Disease', 'CardiacMixing'),
    ('CardiacMixing', 'HypoxiaInO2'),
    ('HypoxiaInO2', 'RUQO2'),
    ('Disease', 'LungParench'),
    ('LungParench', 'CO2'),
    ('CO2', 'CO2Report'),
    ('LungFlow', 'ChestXray'),
    ('ChestXray', 'XrayReport'),
    ('Disease', 'Sick'),
    ('Sick', 'Grunting'),
    ('Grunting', 'GruntingReport'),
    ('Sick', 'Age'),
    ('Disease', 'Age'),
]

G = BayesianModel()
G.add_edges_from(_EDGES)
print(G)

# Compute and print the independencies reported by the model for this structure.
indip = G.get_independencies()
print(indip)
Example #38
0
def evaluate_graphs(data, graph_folder, dst_folder, nnode_filter=40):
    """
    Evaluate every stored learned graph against the ground-truth network of its dataset.

    For each dataset whose node count equals `nnode_filter`, a ground-truth Bayesian
    network is fitted on the discretised samples, and each algorithm's pickled graph
    (if the file exists) is scored against it with `evaluate_single_graph`. All results
    are written as one JSON list to `<dst_folder>/results-n<nnode_filter>.json`.

    Args:
        data: iterable of dicts with keys 'filename' (of the form
            "<generator>_<nnodes>_<nb_samples>.<ext>"), 'dataset' and 'SCM'.
        graph_folder: directory holding the pickled graphs, one file per algorithm
            and dataset, named "<algo_name>_<filename>".
        dst_folder: directory the JSON results file is written to.
        nnode_filter: only datasets with exactly this many nodes are evaluated.

    Returns:
        None. The results are written to disk.
    """
    # Algorithms whose graphs label nodes "X<k>"; their labels are reduced to the
    # part after the first "X" so they match the ground-truth node names.
    x_prefixed_algorithms = ('GS', 'IAMB', 'MMPC')

    evaluation_points = []
    for dataset in data:
        print(dataset['filename'])
        generator, nnodes, nb_samples = dataset['filename'].split(
            '.')[0].split('_')

        if int(nnodes) != nnode_filter:
            continue

        df_samples = samples_to_df(dataset['dataset'],
                                   bins=3)  # Used 5 bins when 5 nodes
        scm = dataset['SCM']

        # Ground truth: the SCM's causal graph with string node labels, fitted on the samples.
        ground_truth = BayesianModel()
        ground_truth.add_nodes_from([str(i) for i in scm.causal_graph.nodes()])
        ground_truth.add_edges_from([(str(a), str(b))
                                     for a, b in scm.causal_graph.edges()])
        ground_truth.fit(df_samples, estimator=BayesianEstimator)
        ground_truth.check_model()
        bn_truth = BayesianNetwork(ground_truth)

        for algo_name, algo in algorithms.items():
            print(algo.__name__)

            algo_file = algo_name + '_' + dataset['filename']
            # os.path.join (as for the results file) so graph_folder works with or
            # without a trailing separator.
            algo_path = os.path.join(graph_folder, algo_file)

            if not os.path.isfile(algo_path):
                continue

            # SECURITY NOTE: unpickling executes arbitrary code; only point
            # graph_folder at files produced by a trusted run.
            with open(algo_path, 'rb') as f:
                graph = pickle.load(f)

            if 'Unnamed: 0' in graph.nodes():
                graph.remove_node('Unnamed: 0')  # problem with pickling pandas

            if algo.__name__ in x_prefixed_algorithms:
                mapping = {n: n.split('X')[1] for n in graph.nodes()}
                graph = nx.relabel_nodes(graph, mapping)

            eval_point = {
                'generator': generator,
                'nnodes': nnodes,
                'nb_samples': nb_samples,
                'algo': algo_name
            }

            eval_point.update(
                evaluate_single_graph(df_samples, graph, bn_truth))
            evaluation_points.append(eval_point)

    filename = os.path.join(dst_folder,
                            'results-n' + str(nnode_filter) + '.json')
    with open(filename, 'w') as f:
        f.write(json.dumps(evaluation_points, default=to_serializable))
Example #39
0
    def learn(self, file1, file2):
        """
        Combine prior edges with a learned structure, then estimate and print the CPDs.

        Steps:
          1. Parse the prior edge list from the first line of `file1`.
          2. Learn a structure from the data in `file2` with hill climbing and BIC score.
          3. Add each learned edge (i, j) to the prior graph if one of its endpoints is
             new, or if there is no path j -> i yet (so the edge cannot close a cycle).
          4. Estimate every node's CPD with a K2 prior and print one tab-separated line
             per node: name, number of observed values, CPT, parent sequence, parent
             cardinalities.

        Args:
            file1: path to a UTF-8 text file; its first line is parsed by
                `self.getegdes` into a flat sequence of alternating source/target nodes.
            file2: path to a CSV file with one column per node.

        Returns:
            None. The edge list and the per-node CPD lines are printed.
        """
        # Context manager so the edge file is closed once it has been read.
        with open(file1, encoding="utf8") as f1:
            lines = f1.readlines()
        edges = self.getegdes(lines[0])
        data = pd.read_csv(file2)

        # Prior graph: consecutive pairs of the flat sequence are (source, target).
        G = nx.DiGraph()
        for source, target in zip(edges[::2], edges[1::2]):
            G.add_edge(source, target)

        est = HillClimbSearch(data, scoring_method=BicScore(data))
        model = est.estimate()
        G_ = nx.DiGraph()
        G_.add_edges_from(model.edges())

        # Merge learned edges into the prior graph without introducing cycles.
        for i, j in G_.edges():
            if i not in G.nodes() or j not in G.nodes():
                G.add_edge(i, j)
            elif not nx.has_path(G, j, i):
                G.add_edge(i, j)

        new_model = BayesianModel()
        new_model.add_edges_from(G.edges)
        G = new_model.copy()

        estimator = BayesianEstimator(G, data)

        edges = [str(edge) for edge in G.edges]
        print(edges)
        for node in G.nodes:
            cpd = estimator.estimate_cpd(node, prior_type="K2")
            # Number of distinct values observed for this node in the data.
            value_num = len(dict(data[node].value_counts()))
            CPT = np.transpose(cpd.values)
            # Every CPD variable after the first, i.e. the conditioning variables.
            sequence = cpd.variables[1::]
            card = [len(dict(data[x].value_counts())) for x in sequence]
            output = '\t'.join([
                node,
                str(value_num),
                str(CPT.tolist()),
                str(sequence),
                str(card),
            ])
            print(output)