Example #1
    def test_estimate_skeleton(self):
        data = pd.DataFrame(np.random.randint(0, 2, size=(1000, 5)),
                            columns=list('ABCDE'))
        data['F'] = data['A'] + data['B'] + data['C']
        est = ConstraintBasedEstimator(data)
        skel, sep_sets = est.estimate_skeleton()
        self.assertTrue(
            self._edge_list_equal(skel.edges(), [('A', 'F'), ('B', 'F'),
                                                 ('C', 'F')]))

        sep_sets_ref = {
            frozenset({'D', 'F'}): (),
            frozenset({'D', 'B'}): (),
            frozenset({'A', 'C'}): (),
            frozenset({'D', 'E'}): (),
            frozenset({'E', 'F'}): (),
            frozenset({'E', 'C'}): (),
            frozenset({'E', 'B'}): (),
            frozenset({'D', 'C'}): (),
            frozenset({'A', 'B'}): (),
            frozenset({'A', 'E'}): (),
            frozenset({'B', 'C'}): (),
            frozenset({'A', 'D'}): ()
        }
        self.assertEqual(set(sep_sets.keys()), set(sep_sets_ref.keys()))
    def test_estimate(self):
        data = pd.DataFrame(np.random.randint(0, 3, size=(1000, 3)),
                            columns=list("XYZ"))
        data["sum"] = data.sum(axis=1)
        model = ConstraintBasedEstimator(data).estimate()
        self.assertSetEqual(set(model.edges()),
                            set([("Z", "sum"), ("X", "sum"), ("Y", "sum")]))
Example #3
    def test_build_skeleton(self):
        ind = Independencies(['B', 'C'], ['A', ['B', 'C'], 'D'])
        ind = ind.closure()
        skel1, sep_sets1 = ConstraintBasedEstimator.build_skeleton("ABCD", ind)
        self.assertTrue(
            self._edge_list_equal(skel1.edges(), [('A', 'D'), ('B', 'D'),
                                                  ('C', 'D')]))

        sep_sets_ref1 = {
            frozenset({'A', 'C'}): (),
            frozenset({'A', 'B'}): (),
            frozenset({'C', 'B'}): ()
        }
        self.assertEqual(sep_sets1, sep_sets_ref1)

        model = BayesianModel([('A', 'C'), ('B', 'C'), ('B', 'D'), ('C', 'E')])
        skel2, sep_sets2 = ConstraintBasedEstimator.build_skeleton(
            model.nodes(), model.get_independencies())
        self.assertTrue(
            self._edge_list_equal(skel2, [('D', 'B'), ('A', 'C'), ('B', 'C'),
                                          ('C', 'E')]))

        sep_sets_ref2 = {
            frozenset({'D', 'C'}): ('B', ),
            frozenset({'E', 'B'}): ('C', ),
            frozenset({'A', 'D'}): (),
            frozenset({'E', 'D'}): ('C', ),
            frozenset({'E', 'A'}): ('C', ),
            frozenset({'A', 'B'}): ()
        }
        # witnesses/separators might change on each run, so we cannot compare directly
        self.assertEqual(sep_sets2.keys(), sep_sets_ref2.keys())
        self.assertEqual([len(v) for v in sorted(sep_sets2.values())],
                         [len(v) for v in sorted(sep_sets_ref2.values())])
    def test_estimate_skeleton(self):
        data = pd.DataFrame(np.random.randint(0, 2, size=(1000, 5)),
                            columns=list("ABCDE"))
        data["F"] = data["A"] + data["B"] + data["C"]
        est = ConstraintBasedEstimator(data)
        skel, sep_sets = est.estimate_skeleton()
        self.assertTrue(
            self._edge_list_equal(skel.edges(), [("A", "F"), ("B", "F"),
                                                 ("C", "F")]))

        sep_sets_ref = {
            frozenset({"D", "F"}): (),
            frozenset({"D", "B"}): (),
            frozenset({"A", "C"}): (),
            frozenset({"D", "E"}): (),
            frozenset({"E", "F"}): (),
            frozenset({"E", "C"}): (),
            frozenset({"E", "B"}): (),
            frozenset({"D", "C"}): (),
            frozenset({"A", "B"}): (),
            frozenset({"A", "E"}): (),
            frozenset({"B", "C"}): (),
            frozenset({"A", "D"}): (),
        }
        self.assertEqual(set(sep_sets.keys()), set(sep_sets_ref.keys()))
Example #5
    def test_estimate(self):
        data = pd.DataFrame(np.random.randint(0, 3, size=(1000, 3)),
                            columns=list('XYZ'))
        data['sum'] = data.sum(axis=1)
        model = ConstraintBasedEstimator(data).estimate()
        self.assertSetEqual(set(model.edges()),
                            set([('Z', 'sum'), ('X', 'sum'), ('Y', 'sum')]))
    def test_estimate_skeleton2(self):
        data = pd.DataFrame(np.random.randint(0, 2, size=(1000, 3)), columns=list('XYZ'))
        data['X'] += data['Z']
        data['Y'] += data['Z']
        est = ConstraintBasedEstimator(data)
        skel, sep_sets = est.estimate_skeleton()

        self.assertTrue(self._edge_list_equal(skel.edges(), [('X', 'Z'), ('Y', 'Z')]))
        self.assertEqual(sep_sets, {frozenset(('X', 'Y')): ('Z',)})
Example #7
    def explain(self,
                num_samples=10,
                percentage=50,
                top_node=None,
                p_threshold=0.05,
                pred_threshold=0.1):

        num_nodes = self.X_feat.shape[0]
        if top_node is None:
            top_node = int(num_nodes / 20)

        # Round 1
        Samples = self.batch_perturb_features_on_node(int(num_samples / 2),
                                                      range(num_nodes),
                                                      percentage, p_threshold,
                                                      pred_threshold)

        data = pd.DataFrame(Samples)
        est = ConstraintBasedEstimator(data)

        p_values = []
        candidate_nodes = []

        target = num_nodes  # The entry for the graph classification data is at "num_nodes"
        for node in range(num_nodes):
            chi2, p = chi_square(node, target, [], data)
            p_values.append(p)

        number_candidates = int(top_node * 4)
        candidate_nodes = np.argpartition(
            p_values, number_candidates)[0:number_candidates]

        # Round 2
        Samples = self.batch_perturb_features_on_node(num_samples,
                                                      candidate_nodes,
                                                      percentage, p_threshold,
                                                      pred_threshold)
        data = pd.DataFrame(Samples)
        est = ConstraintBasedEstimator(data)

        p_values = []
        dependent_nodes = []

        target = num_nodes
        for node in range(num_nodes):
            chi2, p = chi_square(node, target, [], data)
            p_values.append(p)
            if p < p_threshold:
                dependent_nodes.append(node)

        top_p = np.min((top_node, num_nodes - 1))
        ind_top_p = np.argpartition(p_values, top_p)[0:top_p]
        pgm_nodes = list(ind_top_p)

        return pgm_nodes, p_values, candidate_nodes
    def test_estimate_skeleton2(self):
        data = pd.DataFrame(np.random.randint(0, 2, size=(1000, 3)),
                            columns=list("XYZ"))
        data["X"] += data["Z"]
        data["Y"] += data["Z"]
        est = ConstraintBasedEstimator(data)
        skel, sep_sets = est.estimate_skeleton()

        self.assertTrue(
            self._edge_list_equal(skel.edges(), [("X", "Z"), ("Y", "Z")]))
        self.assertEqual(sep_sets, {frozenset(("X", "Y")): ("Z", )})
Example #9
    def test_estimate_skeleton2(self):
        data = pd.DataFrame(np.random.randint(0, 2, size=(1000, 3)),
                            columns=list('XYZ'))
        data['X'] += data['Z']
        data['Y'] += data['Z']
        est = ConstraintBasedEstimator(data)
        skel, sep_sets = est.estimate_skeleton()

        self.assertTrue(
            self._edge_list_equal(skel.edges(), [('X', 'Z'), ('Y', 'Z')]))
        self.assertEqual(sep_sets, {frozenset(('X', 'Y')): ('Z', )})
    def test_estimate_from_independencies(self):
        ind = Independencies(['B', 'C'], ['A', ['B', 'C'], 'D'])
        ind = ind.closure()
        model = ConstraintBasedEstimator.estimate_from_independencies("ABCD", ind)

        self.assertSetEqual(set(model.edges()),
                            set([('B', 'D'), ('A', 'D'), ('C', 'D')]))

        model1 = BayesianModel([('A', 'C'), ('B', 'C'), ('B', 'D'), ('C', 'E')])
        model2 = ConstraintBasedEstimator.estimate_from_independencies(
                            model1.nodes(),
                            model1.get_independencies())

        self.assertTrue(set(model2.edges()) == set(model1.edges()) or
                        set(model2.edges()) == set([('B', 'C'), ('A', 'C'), ('C', 'E'), ('D', 'B')]))
def Constraint_based(dataset: pd.DataFrame):
    from pgmpy.estimators import ConstraintBasedEstimator

    est = ConstraintBasedEstimator(dataset)

    # Construct the DAG (skeleton -> PDAG -> DAG)
    skel, seperating_sets = est.estimate_skeleton(significance_level=0.01)
    print("Undirected edges:", skel.edges())

    pdag = est.skeleton_to_pdag(skel, seperating_sets)
    print("PDAG edges:", pdag.edges())

    model = est.pdag_to_dag(pdag)
    print("DAG edges:", model.edges())

    return model.edges()
    def test_pdag_to_dag(self):
        pdag1 = DirectedGraph([('A', 'B'), ('C', 'B'), ('C', 'D'), ('D', 'C'), ('D', 'A'), ('A', 'D')])
        dag1 = ConstraintBasedEstimator.pdag_to_dag(pdag1)
        self.assertTrue(('A', 'B') in dag1.edges() and
                        ('C', 'B') in dag1.edges() and
                        len(dag1.edges()) == 4)

        pdag2 = DirectedGraph([('B', 'C'), ('D', 'A'), ('A', 'D'), ('A', 'C')])
        dag2 = ConstraintBasedEstimator.pdag_to_dag(pdag2)
        self.assertTrue(set(dag2.edges()) == set([('B', 'C'), ('A', 'D'), ('A', 'C')]) or
                        set(dag2.edges()) == set([('B', 'C'), ('D', 'A'), ('A', 'C')]))

        pdag3 = DirectedGraph([('B', 'C'), ('D', 'C'), ('C', 'D'), ('A', 'C')])
        dag3 = ConstraintBasedEstimator.pdag_to_dag(pdag3)
        self.assertSetEqual(set([('B', 'C'), ('C', 'D'), ('A', 'C')]),
                            set(dag3.edges()))
Example #13
def _constraintsearch(df, significance_level=0.05, verbose=3):
    """Contrain search.

    test_conditional_independence() returns a tripel (chi2, p_value, sufficient_data),
    consisting in the computed chi2 test statistic, the p_value of the test, and a heuristig
    flag that indicates if the sample size was sufficient.
    The p_value is the probability of observing the computed chi2 statistic (or an even higher chi2 value),
    given the null hypothesis that X and Y are independent given Zs.
    This can be used to make independence judgements, at a given level of significance.
    """
    out = dict()
    # Set search algorithm
    model = ConstraintBasedEstimator(df)

    # Some checks for dependency
    #    print(_is_independent(est, 'Sprinkler', 'Rain', significance_level=significance_level))
    #    print(_is_independent(est, 'Cloudy', 'Rain', significance_level=significance_level))
    #    print(_is_independent(est, 'Sprinkler', 'Rain',  ['Wet_Grass'], significance_level=significance_level))
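    # Illustrative sketch (kept commented out; assumes the triple-returning form of
    # test_conditional_independence described in the docstring above): an explicit
    # independence judgement compares the reported p_value against the chosen cutoff.
    #    chi2, p_value, sufficient_data = model.test_conditional_independence('Sprinkler', 'Rain', ['Wet_Grass'])
    #    is_independent = p_value >= significance_level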
    """
    DAG (pattern) construction
    With a method for independence testing at hand, we can construct a DAG from the data set in three steps:
        1. Construct an undirected skeleton - `estimate_skeleton()`
        2. Orient compelled edges to obtain partially directed acyclid graph (PDAG; I-equivalence class of DAGs) - `skeleton_to_pdag()`
        3. Extend DAG pattern to a DAG by conservatively orienting the remaining edges in some way - `pdag_to_dag()`

        Step 1.&2. form the so-called PC algorithm, see [2], page 550. PDAGs are `DirectedGraph`s, that may contain both-way edges, to indicate that the orientation for the edge is not determined.
    """
    # Estimate using chi2
    skel, seperating_sets = model.estimate_skeleton(significance_level=significance_level)

    print("Undirected edges: ", skel.edges())
    pdag = model.skeleton_to_pdag(skel, seperating_sets)
    print("PDAG edges: ", pdag.edges())
    dag = model.pdag_to_dag(pdag)
    print("DAG edges: ", dag.edges())

    out['undirected'] = skel
    out['undirected_edges'] = skel.edges()
    out['pdag'] = pdag
    out['pdag_edges'] = pdag.edges()
    out['dag'] = dag
    out['dag_edges'] = dag.edges()

    # Search using "estimate()" method provides a shorthand for the three steps above and directly returns a "BayesianModel"
    best_model = model.estimate(significance_level=significance_level)
    out['model'] = best_model
    out['model_edges'] = best_model.edges()

    print(best_model.edges())
    """
    PC PDAG construction is only guaranteed to work under the assumption that the
    identified set of independencies is *faithful*, i.e. there exists a DAG that
    exactly corresponds to it. Spurious dependencies in the data set can cause
    the reported independencies to violate faithfulness. It can happen that the
    estimated PDAG does not have any faithful completions (i.e. edge orientations
    that do not introduce new v-structures). In that case a warning is issued.
    """
    return out
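
A minimal usage sketch for `_constraintsearch` on synthetic data; the column names, data-generating process, and sample size below are illustrative assumptions rather than part of the original project:

import numpy as np
import pandas as pd

df_demo = pd.DataFrame(np.random.randint(0, 2, size=(1000, 3)), columns=list('XYZ'))
df_demo['X'] += df_demo['Z']
df_demo['Y'] += df_demo['Z']
out_demo = _constraintsearch(df_demo, significance_level=0.05)
print(out_demo['model_edges'])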
Example #14
def pc(mat):
    data = pd.DataFrame(mat)
    c = ConstraintBasedEstimator(data)
    model = c.estimate()

    # kinda hacky, but can't find a more direct way
    # of getting the adj matrix
    g = nx.DiGraph()
    g.add_nodes_from(model.nodes())
    g.add_edges_from(model.edges())

    # specify nodelist to maintain ordering
    # consistent with dataframe
    # TODO: this is an unweighted adjacency matrix, but according to the paper
    # it might need to be weighted, given that edge signs are being checked
    return nx.adjacency_matrix(g, nodelist=data.columns).todense()
    def test_pdag_to_dag(self):
        pdag1 = nx.DiGraph([("A", "B"), ("C", "B"), ("C", "D"), ("D", "C"),
                            ("D", "A"), ("A", "D")])
        dag1 = ConstraintBasedEstimator.pdag_to_dag(pdag1)
        self.assertTrue(("A", "B") in dag1.edges()
                        and ("C", "B") in dag1.edges()
                        and len(dag1.edges()) == 4)

        pdag2 = nx.DiGraph([("B", "C"), ("D", "A"), ("A", "D"), ("A", "C")])
        dag2 = ConstraintBasedEstimator.pdag_to_dag(pdag2)
        self.assertTrue(
            set(dag2.edges()) == set([("B", "C"), ("A", "D"), ("A", "C")])
            or set(dag2.edges()) == set([("B", "C"), ("D", "A"), ("A", "C")]))

        pdag3 = nx.DiGraph([("B", "C"), ("D", "C"), ("C", "D"), ("A", "C")])
        dag3 = ConstraintBasedEstimator.pdag_to_dag(pdag3)
        self.assertSetEqual(set([("B", "C"), ("C", "D"), ("A", "C")]),
                            set(dag3.edges()))
Example #16
    def pdag2dag(self, edge_dict):
        pdag_edges = [(pi, n) for n, p in edge_dict.items() for pi in p]
        pdag = DAG(pdag_edges)
        dag_edges = ConstraintBasedEstimator.pdag_to_dag(pdag).edges()
        dag = dict([(n, set()) for n in range(len(edge_dict))])
        for e in dag_edges:
            dag[e[1]].add(e[0])

        return dag
    def test_estimate_from_independencies(self):
        ind = Independencies(["B", "C"], ["A", ["B", "C"], "D"])
        ind = ind.closure()
        model = ConstraintBasedEstimator.estimate_from_independencies(
            "ABCD", ind)

        self.assertSetEqual(set(model.edges()),
                            set([("B", "D"), ("A", "D"), ("C", "D")]))

        model1 = BayesianModel([("A", "C"), ("B", "C"), ("B", "D"),
                                ("C", "E")])
        model2 = ConstraintBasedEstimator.estimate_from_independencies(
            model1.nodes(), model1.get_independencies())

        self.assertTrue(
            set(model2.edges()) == set(model1.edges())
            or set(model2.edges()) == set([("B", "C"), ("A", "C"), ("C", "E"),
                                           ("D", "B")]))
Example #18
    def test_pdag_to_dag(self):
        pdag1 = DirectedGraph([('A', 'B'), ('C', 'B'), ('C', 'D'), ('D', 'C'),
                               ('D', 'A'), ('A', 'D')])
        dag1 = ConstraintBasedEstimator.pdag_to_dag(pdag1)
        self.assertTrue(('A', 'B') in dag1.edges()
                        and ('C', 'B') in dag1.edges()
                        and len(dag1.edges()) == 4)

        pdag2 = DirectedGraph([('B', 'C'), ('D', 'A'), ('A', 'D'), ('A', 'C')])
        dag2 = ConstraintBasedEstimator.pdag_to_dag(pdag2)
        self.assertTrue(
            set(dag2.edges()) == set([('B', 'C'), ('A', 'D'), ('A', 'C')])
            or set(dag2.edges()) == set([('B', 'C'), ('D', 'A'), ('A', 'C')]))

        pdag3 = DirectedGraph([('B', 'C'), ('D', 'C'), ('C', 'D'), ('A', 'C')])
        dag3 = ConstraintBasedEstimator.pdag_to_dag(pdag3)
        self.assertSetEqual(set([('B', 'C'), ('C', 'D'), ('A', 'C')]),
                            set(dag3.edges()))
Example #19
    def test_estimate_from_independencies(self):
        ind = Independencies(['B', 'C'], ['A', ['B', 'C'], 'D'])
        ind = ind.closure()
        model = ConstraintBasedEstimator.estimate_from_independencies(
            "ABCD", ind)

        self.assertSetEqual(set(model.edges()),
                            set([('B', 'D'), ('A', 'D'), ('C', 'D')]))

        model1 = BayesianModel([('A', 'C'), ('B', 'C'), ('B', 'D'),
                                ('C', 'E')])
        model2 = ConstraintBasedEstimator.estimate_from_independencies(
            model1.nodes(), model1.get_independencies())

        self.assertTrue(
            set(model2.edges()) == set(model1.edges())
            or set(model2.edges()) == set([('B', 'C'), ('A', 'C'), ('C', 'E'),
                                           ('D', 'B')]))
    def test_estimate_skeleton(self):
        data = pd.DataFrame(np.random.randint(0, 2, size=(1000, 5)), columns=list('ABCDE'))
        data['F'] = data['A'] + data['B'] + data['C']
        est = ConstraintBasedEstimator(data)
        skel, sep_sets = est.estimate_skeleton()
        self.assertTrue(self._edge_list_equal(skel.edges(),
                                              [('A', 'F'), ('B', 'F'), ('C', 'F')]))

        sep_sets_ref = {frozenset({'D', 'F'}): (),
                        frozenset({'D', 'B'}): (),
                        frozenset({'A', 'C'}): (),
                        frozenset({'D', 'E'}): (),
                        frozenset({'E', 'F'}): (),
                        frozenset({'E', 'C'}): (),
                        frozenset({'E', 'B'}): (),
                        frozenset({'D', 'C'}): (),
                        frozenset({'A', 'B'}): (),
                        frozenset({'A', 'E'}): (),
                        frozenset({'B', 'C'}): (),
                        frozenset({'A', 'D'}): ()}
        self.assertEqual(set(sep_sets.keys()), set(sep_sets_ref.keys()))
    def test_build_skeleton(self):
        ind = Independencies(['B', 'C'], ['A', ['B', 'C'], 'D'])
        ind = ind.closure()
        skel1, sep_sets1 = ConstraintBasedEstimator.build_skeleton("ABCD", ind)
        self.assertTrue(self._edge_list_equal(skel1.edges(), [('A', 'D'), ('B', 'D'), ('C', 'D')]))

        sep_sets_ref1 = {frozenset({'A', 'C'}): (), frozenset({'A', 'B'}): (), frozenset({'C', 'B'}): ()}
        self.assertEqual(sep_sets1, sep_sets_ref1)

        model = BayesianModel([('A', 'C'), ('B', 'C'), ('B', 'D'), ('C', 'E')])
        skel2, sep_sets2 = ConstraintBasedEstimator.build_skeleton(model.nodes(), model.get_independencies())
        self.assertTrue(self._edge_list_equal(skel2, [('D', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'E')]))

        sep_sets_ref2 = {frozenset({'D', 'C'}): ('B',),
                         frozenset({'E', 'B'}): ('C',),
                         frozenset({'A', 'D'}): (),
                         frozenset({'E', 'D'}): ('C',),
                         frozenset({'E', 'A'}): ('C',),
                         frozenset({'A', 'B'}): ()}
        # witnesses/separators might change on each run, so we cannot compare directly
        self.assertEqual(sep_sets2.keys(), sep_sets_ref2.keys())
        self.assertEqual([len(v) for v in sorted(sep_sets2.values())],
                         [len(v) for v in sorted(sep_sets_ref2.values())])
    def test_build_skeleton(self):
        ind = Independencies(["B", "C"], ["A", ["B", "C"], "D"])
        ind = ind.closure()
        skel1, sep_sets1 = ConstraintBasedEstimator.build_skeleton("ABCD", ind)
        self.assertTrue(
            self._edge_list_equal(skel1.edges(), [("A", "D"), ("B", "D"),
                                                  ("C", "D")]))

        sep_sets_ref1 = {
            frozenset({"A", "C"}): (),
            frozenset({"A", "B"}): (),
            frozenset({"C", "B"}): (),
        }
        self.assertEqual(sep_sets1, sep_sets_ref1)

        model = BayesianModel([("A", "C"), ("B", "C"), ("B", "D"), ("C", "E")])
        skel2, sep_sets2 = ConstraintBasedEstimator.build_skeleton(
            model.nodes(), model.get_independencies())
        self.assertTrue(
            self._edge_list_equal(skel2, [("D", "B"), ("A", "C"), ("B", "C"),
                                          ("C", "E")]))

        sep_sets_ref2 = {
            frozenset({"D", "C"}): ("B", ),
            frozenset({"E", "B"}): ("C", ),
            frozenset({"A", "D"}): (),
            frozenset({"E", "D"}): ("C", ),
            frozenset({"E", "A"}): ("C", ),
            frozenset({"A", "B"}): (),
        }
        # witnesses/separators might change on each run, so we cannot compare directly
        self.assertEqual(sep_sets2.keys(), sep_sets_ref2.keys())
        self.assertEqual(
            [len(v) for v in sorted(sep_sets2.values())],
            [len(v) for v in sorted(sep_sets_ref2.values())],
        )
Example #23
    def learn_structure(self, method, scoring_method, log=True):
        ''' (4)
        Method that learns the structure of the model from the data
        -----------------
        Parameters:
        method          : The technique used to search for the structure
            -> scoring_approx     - Use an approximate search with a scoring method
            -> scoring_exhaustive - Use an exhaustive search with a scoring method
            -> constraint         - Use the constraint-based technique
        scoring_method  : K2, bic, bdeu
        log             : "True" if you want to print debug information to the console
        '''

        #Select the scoring method for the local search of the structure
        if scoring_method == "K2":
            scores = K2Score(self.data)
        elif scoring_method == "bic":
            scores = BicScore(self.data)
        elif scoring_method == "bdeu":
            scores = BdeuScore(self.data)

        #Select the actual method
        if method == "scoring_approx":
            est = HillClimbSearch(self.data, scores)
        elif method == "scoring_exhaustive":
            est = ExhaustiveSearch(self.data, scores)
        elif method == "constraint":
            est = ConstraintBasedEstimator(self.data)

        self.best_model = est.estimate()
        self.eliminate_isolated_nodes()  # Remove all nodes not connected to anything else

        for edge in self.best_model.edges_iter():
            self.file_writer.write_txt(str(edge))

        self.log("Method used for structural learning: " + method, log)
        #self.log("Training instances skipped: " + str(self.extractor.get_skipped_lines()), log)
        self.log("Search terminated", log)
Example #24
    def test_skeleton_to_pdag(self):
        data = pd.DataFrame(np.random.randint(0, 3, size=(1000, 3)),
                            columns=list('ABD'))
        data['C'] = data['A'] - data['B']
        data['D'] += data['A']
        c = ConstraintBasedEstimator(data)
        pdag = c.skeleton_to_pdag(*c.estimate_skeleton())
        self.assertSetEqual(
            set(pdag.edges()),
            set([('B', 'C'), ('A', 'D'), ('A', 'C'), ('D', 'A')]))

        skel = UndirectedGraph([('A', 'B'), ('A', 'C')])
        sep_sets1 = {frozenset({'B', 'C'}): ()}
        self.assertSetEqual(set(c.skeleton_to_pdag(skel, sep_sets1).edges()),
                            set([('B', 'A'), ('C', 'A')]))

        sep_sets2 = {frozenset({'B', 'C'}): ('A', )}
        pdag2 = c.skeleton_to_pdag(skel, sep_sets2)
        self.assertSetEqual(
            set(c.skeleton_to_pdag(skel, sep_sets2).edges()),
            set([('A', 'B'), ('B', 'A'), ('A', 'C'), ('C', 'A')]))
    def test_skeleton_to_pdag(self):
        data = pd.DataFrame(np.random.randint(0, 3, size=(1000, 3)), columns=list('ABD'))
        data['C'] = data['A'] - data['B']
        data['D'] += data['A']
        c = ConstraintBasedEstimator(data)
        pdag = c.skeleton_to_pdag(*c.estimate_skeleton())
        self.assertSetEqual(set(pdag.edges()),
                            set([('B', 'C'), ('A', 'D'), ('A', 'C'), ('D', 'A')]))

        skel = UndirectedGraph([('A', 'B'), ('A', 'C')])
        sep_sets1 = {frozenset({'B', 'C'}): ()}
        self.assertSetEqual(set(c.skeleton_to_pdag(skel, sep_sets1).edges()),
                            set([('B', 'A'), ('C', 'A')]))

        sep_sets2 = {frozenset({'B', 'C'}): ('A',)}
        pdag2 = c.skeleton_to_pdag(skel, sep_sets2)
        self.assertSetEqual(set(c.skeleton_to_pdag(skel, sep_sets2).edges()),
                            set([('A', 'B'), ('B', 'A'), ('A', 'C'), ('C', 'A')]))
    def test_skeleton_to_pdag(self):
        data = pd.DataFrame(np.random.randint(0, 3, size=(1000, 3)),
                            columns=list("ABD"))
        data["C"] = data["A"] - data["B"]
        data["D"] += data["A"]
        c = ConstraintBasedEstimator(data)
        pdag = c.skeleton_to_pdag(*c.estimate_skeleton())
        self.assertSetEqual(
            set(pdag.edges()),
            set([("B", "C"), ("A", "D"), ("A", "C"), ("D", "A")]))

        skel = UndirectedGraph([("A", "B"), ("A", "C")])
        sep_sets1 = {frozenset({"B", "C"}): ()}
        self.assertSetEqual(
            set(c.skeleton_to_pdag(skel, sep_sets1).edges()),
            set([("B", "A"), ("C", "A")]),
        )

        sep_sets2 = {frozenset({"B", "C"}): ("A", )}
        pdag2 = c.skeleton_to_pdag(skel, sep_sets2)
        self.assertSetEqual(
            set(c.skeleton_to_pdag(skel, sep_sets2).edges()),
            set([("A", "B"), ("B", "A"), ("A", "C"), ("C", "A")]),
        )
hc = HillClimbSearch(train, scoring_method=BicScore(train))
hc_model = hc.estimate()

### Parameter Learning with Bayesian Estimation
hc_model.fit(train, estimator=BayesianEstimator, prior_type="BDeu")
### If the following for loop is uncommented, the terminal will be flooded with CPDs
"""
for cpd in best_model.get_cpds():
    print(cpd)
"""

print()

### Another Method (it will throw errors about sample size - but it still runs and shouldn't be too messed up)
### Constraint-Based Structure Learning
est = ConstraintBasedEstimator(train)

skel, seperating_sets = est.estimate_skeleton(significance_level=0.01)
print("Undirected edges: ", skel.edges())

pdag = est.skeleton_to_pdag(skel, seperating_sets)
print("PDAG edges:       ", pdag.edges())

cb_model = est.pdag_to_dag(pdag)
print("DAG edges:        ", cb_model.edges())

### Parameter learning with MLE
cb_model.fit(train, estimator=MaximumLikelihoodEstimator)

#Notice the significant difference in the connections that this version produces
#Print the final significant edges learned from constraint-based learning
Example #28
 seed(0)
 column_size = data.shape[1]
 random_columns = sample(range(column_size), 100)
 data = data.iloc[:, random_columns]
 #Delete invoices with all zeros from the data
 data = data[(data.T != 0).any()]
 row_size = data.shape[0]
 random_indices = sample(range(row_size), 2000)
 smallDF = data.iloc[random_indices, :]
 smallDF.shape
 PseudoCounts = {}
 #Pseudocounts are given (1,1) for uniform
 for productName in smallDF.columns:
     PseudoCounts[productName] = [1, 1]
 print('Existing network not found')
 est = ConstraintBasedEstimator(smallDF)
 print('Starting to estimate the model structure, might take a while...')
 start = time.time()
 model = est.estimate(significance_level=0.05)
 end = time.time()
 print('Time spent to estimate model structure {0}'.format(end - start))
 print('Edges of the model:')
 print(model.edges())
 print('Starting to estimate model parameters..')
 start = time.time()
 model.fit(smallDF,
           estimator=BayesianEstimator,
           prior_type='dirichlet',
           pseudo_counts=PseudoCounts)
 end = time.time()
 print('Time spent to estimate the model parameters {0}'.format(end - start))
    def learn_structure(self,
                        file_path,
                        algorithm="hc",
                        significance_level=0.05):
        """
        Employs `pgmpy` package's Bayesian Network structure learning algorithms to learn
        structure from a dataset. Saves a tabular version of the result as a CSV file.

        Arguments:
            algorithm: str, optional (default = 'hc')
                Determines whether the hill-climbing or Peter-Clark (PC) algorithm is employed.
                Two possible values: 'hc', 'pc'. Note: I found a bug in the pgmpy implementation
                halfway through this project. Don't use the 'pc' method.
            file_path: str, the absolute path to save the file to (e.g. "~/Desktop/BN_structure.csv")
            significance_level: float, optional (default = 0.05)
                Statistical significance cutoff for use in pruning the network when using the PC
                algorithm. Lower values produce sparser networks.

        Returns:
            None
        """
        self.structure_algorithm = algorithm

        if self.verbose:
            print(
                "Depending on the number of variables in your dataset, this might take some time..."
            )

        # Learn structure, using one of the algorithms
        np.random.seed(self.random_seed)

        if algorithm == "hc":

            # Filter out columns with zero correlation with target variable
            self.filtered_df = self._initial_filter()

            # Run HC algorithm
            self.structure_model = HillClimbSearch(
                self.filtered_df,
                scoring_method=BicScore(self.filtered_df)).estimate()

            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

            # Eliminate isolated subgraphs
            G = self.structure_model.to_undirected()

            connected_nodes = list(
                nx.algorithms.components.node_connected_component(
                    G, self.target_variable))

            disconnected_nodes = list(
                set(list(self.structure_model.nodes)) - set(connected_nodes))

            for node in disconnected_nodes:
                self.structure_model.remove_node(node)
                self.filtered_df.drop([node], axis=1, inplace=True)

            pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            ).to_csv(file_path, index=False)

        elif algorithm == "pc":
            self.filtered_df = self.df
            self.structure_model = ConstraintBasedEstimator(
                self.filtered_df).estimate(
                    significance_level=significance_level)

            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

            pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            ).to_csv(file_path, index=False)
class Bayes_Net(Core):
    """
    Methods to read_in data and learn the structure and conditional probability tables for
    a Bayesian Network, as well as assessing the strength of the causal influence of endogenous
    variables on the target variable of interest.


    Parameters
    ----------
    target_variable: str, name of the column containing the outcome variable.

    verbose: bool, optional (default = False). Determines whether the user will get verbose status updates.

    random_seed: int, optional.


    Attributes
    ----------
    verbose: boolean
        Whether verbose mode is activated

    target_variable: string
        Name of the target variable in the dataset

    df: pd.DataFrame
        pandas dataframe of input dataset

    structure_algorithm: string
        Name of the learning structure algo that was chosen

    structure_model: pgmpy.base.DAG.DAG
        Learned DAG but without conditional probability tables estimated

    bn_model: pgmpy.models.BayesianModel
        Proper, learned Bayesian Network with conditional probability tables estimated

    odds_ratios: pd.DataFrame
        DataFrame containing odds ratios for all interventions and levels


    Methods
    ----------
    read_data: (self, file_path, **kwargs)
        Reads in a dataset. Essentially a wrapper for pandas' `read_csv` function.

    learn_structure: (self, file_path, algorithm = 'hc')
        Learns the structure of a DAG from data. Saves structure as a CSV to disk.
        Note: this is technically not a Bayesian network yet, as the conditional
        probability tables have not been estimated.

    plot_network: (self, file_path, **kwargs)
        Plots the Bayesian Network (highlighting target variable) and saves PNG to disk.

    plot_causal_influence: (self, file_path)
        Uses belief propagation to perform inference and calculates odds ratios for how
        changes in intervention evidence will impact the target variable. A forest plot is
        produced from this.
    """
    def __init__(self, target_variable, random_seed=0, verbose=False):
        self.verbose = verbose
        self.target_variable = target_variable
        self.random_seed = random_seed

        # Validate the params
        self._validate_init_params()

        if self.verbose:
            print("Using the following params for Bayesian Network model:")
            pprint(self.get_params(), indent=4)

    def _validate_init_params(self):
        """
        Very basic checks that the params used when instantiating Bayes_Net look okay
        """
        # Checks for target_variable
        if not isinstance(self.target_variable, str):
            raise TypeError(
                f"target_variable parameter must be a string type, but found type {type(self.target_variable)}"
            )

        # Checks for verbose
        if not isinstance(self.verbose, bool):
            raise TypeError(
                f"verbose parameter must be a boolean type, but found type {type(self.verbose)}"
            )

        # Checks for random_seed
        if not isinstance(self.random_seed, (int, type(None))):
            raise TypeError(
                f"random_seed parameter must be an int, but found type {type(self.random_seed)}"
            )

        if (isinstance(self.random_seed, int)) and self.random_seed < 0:
            raise ValueError(f"random_seed parameter must be > 0")

    def read_data(self, file_path, **kwargs):
        """
        Wrapper for pandas `read_csv` function. Assumes file is CSV with a header row.

        Arguments:
            file_path: str, the absolute file path to the CSV file
            **kwargs: any additional keywords for pandas' `read_csv` function

        Returns:
            None
        """
        self.df = pd.read_csv(filepath_or_buffer=file_path, **kwargs)

        # Check that target variable is in the dataset
        if self.target_variable not in self.df:
            raise ValueError(
                "The target variable you specified isn't in the dataset!")

        if self.verbose:
            print("Successfully read in CSV")

        return None

    def _cramers_v(self, x, y):
        """
        Static method that calculates Cramer's V correlation between two categorical variables
        """
        confusion_matrix = pd.crosstab(x, y)
        chi2 = ss.chi2_contingency(confusion_matrix)[0]

        n = confusion_matrix.sum().sum()
        phi2 = chi2 / n
        r, k = confusion_matrix.shape
        phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))

        rcorr = r - ((r - 1)**2) / (n - 1)
        kcorr = k - ((k - 1)**2) / (n - 1)

        return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

    def _initial_filter(self):
        """
        Filters out nodes with zero correlation with target variable
        """

        relevant_vars = []

        for node in self.df.columns:
            if self._cramers_v(self.df[self.target_variable],
                               self.df[node]) > 0:
                relevant_vars.append(node)

        return self.df[relevant_vars]

    def learn_structure(self,
                        file_path,
                        algorithm="hc",
                        significance_level=0.05):
        """
        Employs `pgmpy` package's Bayesian Network structure learning algorithms to learn
        structure from a dataset. Saves a tabular version of the result as a CSV file.

        Arguments:
            algorithm: str, optional (default = 'hc')
                Determines whether the hill-climbing or Peter-Clark (PC) algorithm is employed.
                Two possible values: 'hc', 'pc'. Note: I found a bug in the pgmpy implementation
                halfway through this project. Don't use the 'pc' method.
            file_path: str, the absolute path to save the file to (e.g. "~/Desktop/BN_structure.csv")
            significance_level: float, optional (default = 0.05)
                Statistical significance cutoff for use in pruning the network when using the PC
                algorithm. Lower values produce sparser networks.

        Returns:
            None
        """
        self.structure_algorithm = algorithm

        if self.verbose:
            print(
                "Depending on the number of variables in your dataset, this might take some time..."
            )

        # Learn structure, using one of the algorithms
        np.random.seed(self.random_seed)

        if algorithm == "hc":

            # Filter out columns with zero correlation with target variable
            self.filtered_df = self._initial_filter()

            # Run HC algorithm
            self.structure_model = HillClimbSearch(
                self.filtered_df,
                scoring_method=BicScore(self.filtered_df)).estimate()

            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

            # Eliminate isolated subgraphs
            G = self.structure_model.to_undirected()

            connected_nodes = list(
                nx.algorithms.components.node_connected_component(
                    G, self.target_variable))

            disconnected_nodes = list(
                set(list(self.structure_model.nodes)) - set(connected_nodes))

            for node in disconnected_nodes:
                self.structure_model.remove_node(node)
                self.filtered_df.drop([node], axis=1, inplace=True)

            pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            ).to_csv(file_path, index=False)

        elif algorithm == "pc":
            self.filtered_df = self.df
            self.structure_model = ConstraintBasedEstimator(
                self.filtered_df).estimate(
                    significance_level=significance_level)

            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

            pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            ).to_csv(file_path, index=False)

    def plot_network(self, file_path, **kwargs):
        """
        Plots the learned structure, highlighting the target variable.

        Arguments:
            file_path: str, the absolute path to save the file to (e.g. "~/Desktop/plot.png")
            **kwargs: additional keyword arguments for networkx's draw function

        Returns:
            None
        """
        if self.verbose:
            print(
                f"Saving Bayesian Network plot to the following PNG file: {file_path}"
            )

        # Identify target variable so we can highlight it in the plot
        target_index = list(self.structure_model).index(self.target_variable)
        node_size_list = [300] * len(list(self.structure_model.nodes))
        node_color_list = ["#95ABDF"] * len(list(self.structure_model.nodes))
        node_size_list[target_index] = 1500
        node_color_list[target_index] = "#F09A9A"

        # Clear any existing pyplot fig, create plot, and save to disk
        plt.clf()
        nx.draw(
            self.structure_model,
            node_size=node_size_list,
            node_color=node_color_list,
            with_labels=True,
            **kwargs,
        )
        plt.savefig(expanduser(file_path), format="PNG", dpi=300)

    def _estimate_CPT(self):
        """
        Estimates the conditional probability tables associated with each node in the
        Bayesian Network.
        """

        self.bn_model = BayesianModel(list(self.structure_model.edges))
        self.cpt_model = BayesianEstimator(self.bn_model, self.filtered_df)

        for node in list(self.bn_model.nodes):
            self.bn_model.add_cpds(self.cpt_model.estimate_cpd(node))

    def plot_causal_influence(self, file_path):
        """
        Computes the odds of the target variable being value 1 over value 0 (i.e. the odds ratio)
        by iterating through all other network variables/nodes, changing their values,
        and observing how the probability of the target variable changes. Belief propagation
        is used for inference. A forest plot is produced from this and saved to disk.

        Arguments:
            file_path: str, the absolute path to save the file to (e.g. "~/Desktop/forest_plot.png")

        Returns:
            None
        """

        # Estimate CPTs
        self._estimate_CPT()

        if self.verbose:
            print(f"Calculating influence of all nodes on target node")

        if not self.bn_model.check_model():
            print("""
                There is a problem with your network structure. You have disconnected nodes
                or separated sub-networks. Please examine your network plot and re-learn your
                network structure with tweaked settings.
                """)
            return None

        if self.target_variable not in self.bn_model.nodes:
            print("""
                Your target variable has no parent nodes! Can't perform inference! Please examine
                your network plot and re-learn your network structure with tweaked settings.
                """)
            return None

        # Prep for belief propagation
        belief_propagation = BeliefPropagation(self.bn_model)
        belief_propagation.calibrate()

        # Iterate over all intervention nodes and values, calculating odds ratios w.r.t target variable
        overall_dict = {}

        variables_to_test = list(
            set(self.bn_model.nodes) - {self.target_variable})

        for node in variables_to_test:
            results = []
            for value in self.filtered_df[node].unique():
                prob = belief_propagation.query(
                    variables=[self.target_variable],
                    evidence={
                        node: value
                    },
                    show_progress=False,
                ).values
                results.append([node, value, prob[0], prob[1]])

            results_df = pd.DataFrame(
                results,
                columns=["node", "value", "probability_0", "probability_1"])
            results_df["odds_1"] = (results_df["probability_1"] /
                                    results_df["probability_0"])
            results_df = results_df.sort_values(
                "value", ascending=True, inplace=False).reset_index(drop=True)

            overall_dict[node] = results_df

        final_df_list = []

        for node, temp_df in overall_dict.items():
            first_value = temp_df["odds_1"].iloc[0]
            temp_df["odds_ratio"] = (temp_df["odds_1"] / first_value).round(3)
            final_df_list.append(temp_df)

        final_df = pd.concat(final_df_list)[["node", "value", "odds_ratio"]]
        self.odds_ratios = final_df

        if self.verbose:
            print(f"Saving forest plot to the following PNG file: {file_path}")

        # Clean up the dataframe of odds ratios so plot can have nice labels
        final_df2 = (pd.concat([
            final_df,
            final_df.groupby("node")["value"].apply(
                lambda x: x.shift(-1).iloc[-1]).reset_index(),
        ]).sort_values(by=["node", "value"],
                       ascending=True).reset_index(drop=True))
        final_df2["node"][final_df2["value"].isnull()] = np.nan
        final_df2["value"] = final_df2["value"].astype("Int32").astype(str)
        final_df2["value"].replace({np.nan: ""}, inplace=True)
        final_df3 = final_df2.reset_index(drop=True).reset_index()
        final_df3.rename(columns={"index": "vertical_index"}, inplace=True)
        final_df3["y_label"] = final_df3["node"] + " = " + final_df3["value"]
        final_df3["y_label"][final_df3["odds_ratio"] == 1.0] = (
            final_df3["y_label"] + " (ref)")
        final_df3["y_label"].fillna("", inplace=True)

        # Produce large plot
        plt.clf()
        plt.title(
            "Strength of Associations Between Interventions and Target Variable"
        )
        plt.scatter(
            x=final_df3["odds_ratio"],
            y=final_df3["vertical_index"],
            s=70,
            color="b",
            alpha=0.5,
        )
        plt.xlabel("Odds Ratio")
        plt.axvline(x=1.0, color="red", linewidth="1.5", linestyle="--")
        plt.yticks(final_df3["vertical_index"], final_df3["y_label"])

        for _, row in final_df3.iterrows():
            if not np.isnan(row["odds_ratio"]):
                plt.plot(
                    [0, row["odds_ratio"]],
                    [row["vertical_index"], row["vertical_index"]],
                    color="black",
                    linewidth="0.4",
                )

        plt.xlim([0, final_df3["odds_ratio"].max() + 1])

        figure = plt.gcf()
        figure.set_size_inches(12, 7)

        plt.savefig(expanduser(file_path),
                    bbox_inches="tight",
                    format="PNG",
                    dpi=300)
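
A minimal usage sketch for the `Bayes_Net` workflow above; the target column name and all file paths are hypothetical placeholders, not values from the original project:

bn = Bayes_Net(target_variable="outcome")        # "outcome" is a placeholder column name
bn.read_data("~/Desktop/input_data.csv")         # hypothetical input CSV with a header row
bn.learn_structure("~/Desktop/BN_structure.csv", algorithm="hc")
bn.plot_network("~/Desktop/BN_plot.png")
bn.plot_causal_influence("~/Desktop/forest_plot.png")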
    def test_estimate(self):
        data = pd.DataFrame(np.random.randint(0, 3, size=(1000, 3)), columns=list('XYZ'))
        data['sum'] = data.sum(axis=1)
        model = ConstraintBasedEstimator(data).estimate()
        self.assertSetEqual(set(model.edges()),
                            set([('Z', 'sum'), ('X', 'sum'), ('Y', 'sum')]))
Example #32
def constraintStructureLearn(data, significance_level=0.01):
    # Build a Bayesian network from conditional-independence constraints
    est = ConstraintBasedEstimator(data)
    best_model = est.estimate(significance_level)
    return best_model
# #### Conditional Independence Tests
# Independencies in the data can be identified using $\chi$-squared conditional independence hypothesis tests. Constraint-based estimators in pgmpy have a `test_conditional_independence(X, Y, Z)` method that performs a hypothesis test on the data sample to check if $X$ is independent from $Y$ given a set of variables $Z$s.
#
# **Example 1:** Linear Relationships Data
# %% codecell
from pgmpy.estimators import ConstraintBasedEstimator

data: DataFrame = DataFrame(data=np.random.randint(low=0,
                                                   high=3,
                                                   size=(2500, 8)),
                            columns=list('ABCDEFGH'))
data['A'] += data['B'] + data['C']
data['H'] = data['G'] - data['A']
data['E'] *= data['F']

est: ConstraintBasedEstimator = ConstraintBasedEstimator(data=data)

assert not est.test_conditional_independence('B', 'H')
assert est.test_conditional_independence('B', 'E')
assert not est.test_conditional_independence('A', 'B')

assert est.test_conditional_independence(X='B', Y='H', Zs=['A'])

assert est.test_conditional_independence('A', 'G')

assert not est.test_conditional_independence('A', 'G', Zs=['H'])
assert not est.test_conditional_independence('A', 'H', Zs=['G'])

# %% markdown [markdown]
# `test_conditional_independence()` returns a triple `(chi2, pValue, sufficientData)` consisting of the computed $\chi$-squared test statistic, the `pValue` of the test, and a heuristic flag that indicates if the sample size was sufficient. The `pValue` is the probability of observing the computed $\chi$-squared statistic (or an even higher $\chi$-squared value) given the null hypothesis that $X$ and $Y$ are independent given $Z$s.
#
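# Below is a minimal sketch of turning that triple into an explicit independence decision. It assumes the triple-returning form described here; some pgmpy releases return a plain boolean instead (as the asserts in the cell above suggest), so the sketch guards for both.
# %% codecell
result = est.test_conditional_independence(X='B', Y='H', Zs=['A'])
if isinstance(result, tuple):
    chi2Stat, pValue, sufficientData = result
    # B and H are judged dependent given A when pValue falls below the significance level.
    print(f"chi2 = {chi2Stat:.2f}, p = {pValue:.4f}, sufficient data: {sufficientData}, "
          f"independent at the 0.05 level: {pValue >= 0.05}")
else:
    # Boolean-returning variant of the API.
    print(f"B independent of H given A: {result}")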