Example #1
import random

from pgmpy.base import DAG


def random_dag(number_of_nodes: int = 5,
               edge_density: float = 0.4,
               max_in_degree: int = 4) -> DAG:
    """Create a connected, random directed acyclic graph (DAG) with the given number of
    nodes and edge density, while ensuring no node exceeds the given maximum in-degree."""
    node_names = [f"X{i}" for i in range(number_of_nodes)]
    dag = DAG()

    # First make sure the dag is connected
    visited = list()
    unvisited = list(node_names)
    node = random.choice(unvisited)
    unvisited.remove(node)
    visited.append(node)
    dag.add_node(node)

    while unvisited:
        node = random.choice(unvisited)
        neighbor = random.choice(visited)
        if node_names.index(node) < node_names.index(
                neighbor) and dag.in_degree(neighbor) < max_in_degree:
            dag.add_edge(node, neighbor)
        elif node_names.index(neighbor) < node_names.index(node):
            dag.add_edge(neighbor, node)
        else:
            continue
        unvisited.remove(node)
        visited.append(node)

    # Then add edges until desired density is reached
    maximum_number_of_edges = number_of_nodes * (number_of_nodes - 1) / 2
    while dag.number_of_edges() < int(edge_density * maximum_number_of_edges):
        add_random_edge(dag, node_names)

    return dag
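
A minimal usage sketch for the generator above; the parameter values are just illustrative, and random_dag is assumed to sit in the same module as the add_random_edge helper from Example #8.

import networkx as nx

# Quick check of the invariants random_dag promises (assumes random_dag and
# add_random_edge from Example #8 are defined in the current module).
dag = random_dag(number_of_nodes=6, edge_density=0.3, max_in_degree=3)
print(sorted(dag.nodes()))                          # ['X0', 'X1', ..., 'X5']
print(dag.number_of_edges())                        # grows until the target density is met
assert all(deg <= 3 for _, deg in dag.in_degree())  # in-degree cap respected
assert nx.is_weakly_connected(dag)                  # construction keeps the graph connected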
Example #2
    def estimate(self,
                 tabu_length=100,
                 max_indegree=2,
                 black_list=None,
                 epsilon=1e-4,
                 max_iter=1e6,
                 show_progress=True):

        # We will be using K2Score for this model
        score = K2Score(data=self.data)
        # local_score returns the score of a node given its parents.
        # It is evaluated on every iteration for all candidate changes;
        # the search is greedy and picks the best available option.
        score_fn = score.local_score
        # Initialize a starting DAG.
        # pgmpy's DAG class adds some functionality on top of nx.DiGraph.
        start_dag = DAG()
        start_dag.add_nodes_from(self.variables)
        # Set the edges we do not want to have in the graph
        if black_list is None:
            black_list = set()
        else:
            black_list = set(black_list)

        # max_indegree caps the number of parents any node may acquire.

        # The tabu list keeps track of recent operations so they are not immediately undone.
        tabu_list = deque(maxlen=tabu_length)
        # Initialize a current model
        current_model = start_dag
        if show_progress:
            iteration = trange(int(max_iter))
        else:
            iteration = range(int(max_iter))
        for _ in iteration:
            # Get the best operations based on K2 score with self._legal_operations
            best_operation, best_score_change = max(self._legal_operations(
                model=current_model,
                score=score_fn,
                tabu_list=tabu_list,
                max_indegree=max_indegree,
                black_list=black_list,
            ),
                                                    key=lambda t: t[1])

            if best_score_change < epsilon:
                break
            elif best_operation[0] == '+':
                current_model.add_edge(*best_operation[1])
                tabu_list.append(("-", best_operation[1]))
            elif best_operation[0] == '-':
                current_model.remove_edge(*best_operation[1])
                tabu_list.append(("+", best_operation[1]))
            elif best_operation[0] == 'flip':
                X, Y = best_operation[1]
                current_model.remove_edge(X, Y)
                current_model.add_edge(Y, X)
                tabu_list.append(best_operation)

        return current_model
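
The method above mirrors pgmpy's built-in hill climb search; a minimal sketch of the equivalent call through the public API, following the HillClimbSearch docstring shown later in this listing (the data frame is made up):

import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch, K2Score

# Made-up data: three variables where C is a copy of B.
data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
data['C'] = data['B']

est = HillClimbSearch(data, scoring_method=K2Score(data))
best_model = est.estimate(max_indegree=2)
print(best_model.edges())    # expected to recover a single edge between B and C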
Example #3
class TestDoOperator(unittest.TestCase):
    def setUp(self):
        self.graph = DAG()
        self.graph.add_edges_from([("X", "A"), ("A", "Y"), ("A", "B")])

    def test_do(self):
        dag_do_x = self.graph.do("A")
        self.assertEqual(set(dag_do_x.nodes()), set(self.graph.nodes()))
        self.assertEqual(sorted(list(dag_do_x.edges())), [("A", "B"), ("A", "Y")])
    def test_markov_blanket(self):
        G = DAG([
            ("x", "y"),
            ("z", "y"),
            ("y", "w"),
            ("y", "v"),
            ("u", "w"),
            ("s", "v"),
            ("w", "t"),
            ("w", "m"),
            ("v", "n"),
            ("v", "q"),
        ])
        self.assertEqual(set(G.get_markov_blanket("y")),
                         set(["s", "w", "x", "u", "z", "v"]))
Example #5
    def pdag2dag(self, edge_dict):
        pdag_edges = [(pi, n) for n, p in edge_dict.items() for pi in p]
        pdag = DAG(pdag_edges)
        dag_edges = ConstraintBasedEstimator.pdag_to_dag(pdag).edges()
        dag = dict([(n, set()) for n in range(len(edge_dict))])
        for e in dag_edges:
            dag[e[1]].add(e[0])

        return dag
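
A small illustration of the parent-set dict format this wrapper converts; the dict below is made up, and pdag_to_dag is called as a static method, as its docstring in Example #16 notes:

from pgmpy.base import DAG
from pgmpy.estimators import ConstraintBasedEstimator

# Hypothetical input: node index -> set of parent indices.  An undirected edge
# would show up as two mutual entries (each node listed as the other's parent).
edge_dict = {0: set(), 1: {0}, 2: {0, 1}}

pdag_edges = [(pi, n) for n, p in edge_dict.items() for pi in p]
dag_edges = ConstraintBasedEstimator.pdag_to_dag(DAG(pdag_edges)).edges()
print(sorted(dag_edges))    # [(0, 1), (0, 2), (1, 2)] -- already fully directed here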
Example #6
class TestDAGMoralization(unittest.TestCase):
    def setUp(self):
        self.graph = DAG()
        self.graph.add_edges_from([("diff", "grade"), ("intel", "grade")])

    def test_get_parents(self):
        self.assertListEqual(sorted(self.graph.get_parents("grade")), ["diff", "intel"])

    def test_moralize(self):
        moral_graph = self.graph.moralize()
        self.assertListEqual(
            hf.recursive_sorted(moral_graph.edges()),
            [["diff", "grade"], ["diff", "intel"], ["grade", "intel"]],
        )

    def test_get_children(self):
        self.assertListEqual(sorted(self.graph.get_children("diff")), ["grade"])

    def tearDown(self):
        del self.graph
    def estimate(self):
        """
        Estimates the `DAG` structure that fits best to the given data set,
        according to the scoring method supplied in the constructor.
        Exhaustively searches through all models. Only estimates network structure, no parametrization.

        Returns
        -------
        model: `DAG` instance
            A `DAG` with maximal score.

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.estimators import ExhaustiveSearch
        >>> # create random data sample with 3 variables, where B and C are identical:
        >>> data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        >>> data['C'] = data['B']
        >>> est = ExhaustiveSearch(data)
        >>> best_model = est.estimate()
        >>> best_model
        <pgmpy.base.DAG.DAG object at 0x7f695c535470>
        >>> best_model.edges()
        [('B', 'C')]
        """

        best_dag = max(self.all_dags(), key=self.scoring_method.score)

        best_model = DAG()
        best_model.add_nodes_from(sorted(best_dag.nodes()))
        best_model.add_edges_from(sorted(best_dag.edges()))
        return best_model
Example #8
import random
from typing import List

from pgmpy.base import DAG


def add_random_edge(dag: DAG,
                    node_order: List[str],
                    max_in_degree: int = 4) -> None:
    """Add a random edge to the graph that respects the given node_order, and does
    not add a link if the sampled target node already has the maximal in-degree.

    Note that a call may not add any edge at all.
    """
    n1, n2 = random.sample(node_order, 2)
    if node_order.index(n1) < node_order.index(n2) and dag.in_degree(
            n2) < max_in_degree:
        dag.add_edge(n1, n2)
    elif node_order.index(n2) < node_order.index(n1) and dag.in_degree(
            n1) < max_in_degree:
        dag.add_edge(n2, n1)
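
Used on its own, the helper can densify an existing graph step by step (a small sketch; the node order below is made up and the imports above are assumed):

node_order = ["X0", "X1", "X2", "X3"]
dag = DAG()
dag.add_nodes_from(node_order)

for _ in range(10):    # each call may or may not add an edge
    add_random_edge(dag, node_order, max_in_degree=2)

assert all(deg <= 2 for _, deg in dag.in_degree())
print(dag.edges())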
    def setUp(self):
        self.G = BayesianModel([("d", "g"), ("i", "g"), ("g", "l"),
                                ("i", "s")])
        self.G2 = DAG([("d", "g"), ("i", "g"), ("g", "l"), ("i", "s")])
class TestBayesianModelCPD(unittest.TestCase):
    def setUp(self):
        self.G = BayesianModel([("d", "g"), ("i", "g"), ("g", "l"),
                                ("i", "s")])
        self.G2 = DAG([("d", "g"), ("i", "g"), ("g", "l"), ("i", "s")])

    def test_active_trail_nodes(self):
        self.assertEqual(sorted(self.G2.active_trail_nodes("d")["d"]),
                         ["d", "g", "l"])
        self.assertEqual(sorted(self.G2.active_trail_nodes("i")["i"]),
                         ["g", "i", "l", "s"])
        self.assertEqual(sorted(self.G2.active_trail_nodes(["d", "i"])["d"]),
                         ["d", "g", "l"])

    def test_active_trail_nodes_args(self):
        self.assertEqual(
            sorted(self.G2.active_trail_nodes(["d", "l"], observed="g")["d"]),
            ["d", "i", "s"],
        )
        self.assertEqual(
            sorted(self.G2.active_trail_nodes(["d", "l"], observed="g")["l"]),
            ["l"])
        self.assertEqual(
            sorted(self.G2.active_trail_nodes("s", observed=["i", "l"])["s"]),
            ["s"])
        self.assertEqual(
            sorted(self.G2.active_trail_nodes("s", observed=["d", "l"])["s"]),
            ["g", "i", "s"],
        )

    def test_is_active_trail_triplets(self):
        self.assertTrue(self.G.is_active_trail("d", "l"))
        self.assertTrue(self.G.is_active_trail("g", "s"))
        self.assertFalse(self.G.is_active_trail("d", "i"))
        self.assertTrue(self.G.is_active_trail("d", "i", observed="g"))
        self.assertFalse(self.G.is_active_trail("d", "l", observed="g"))
        self.assertFalse(self.G.is_active_trail("i", "l", observed="g"))
        self.assertTrue(self.G.is_active_trail("d", "i", observed="l"))
        self.assertFalse(self.G.is_active_trail("g", "s", observed="i"))

    def test_is_active_trail(self):
        self.assertFalse(self.G.is_active_trail("d", "s"))
        self.assertTrue(self.G.is_active_trail("s", "l"))
        self.assertTrue(self.G.is_active_trail("d", "s", observed="g"))
        self.assertFalse(self.G.is_active_trail("s", "l", observed="g"))

    def test_is_active_trail_args(self):
        self.assertFalse(self.G.is_active_trail("s", "l", "i"))
        self.assertFalse(self.G.is_active_trail("s", "l", "g"))
        self.assertTrue(self.G.is_active_trail("d", "s", "l"))
        self.assertFalse(self.G.is_active_trail("d", "s", ["i", "l"]))

    def test_get_cpds(self):
        cpd_d = TabularCPD("d", 2, values=np.random.rand(2, 1))
        cpd_i = TabularCPD("i", 2, values=np.random.rand(2, 1))
        cpd_g = TabularCPD(
            "g",
            2,
            values=np.random.rand(2, 4),
            evidence=["d", "i"],
            evidence_card=[2, 2],
        )
        cpd_l = TabularCPD("l",
                           2,
                           values=np.random.rand(2, 2),
                           evidence=["g"],
                           evidence_card=[2])
        cpd_s = TabularCPD("s",
                           2,
                           values=np.random.rand(2, 2),
                           evidence=["i"],
                           evidence_card=[2])
        self.G.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)

        self.assertEqual(self.G.get_cpds("d").variable, "d")

    def test_get_cpds1(self):
        self.model = BayesianModel([("A", "AB")])
        cpd_a = TabularCPD("A", 2, values=np.random.rand(2, 1))
        cpd_ab = TabularCPD("AB",
                            2,
                            values=np.random.rand(2, 2),
                            evidence=["A"],
                            evidence_card=[2])

        self.model.add_cpds(cpd_a, cpd_ab)
        self.assertEqual(self.model.get_cpds("A").variable, "A")
        self.assertEqual(self.model.get_cpds("AB").variable, "AB")
        self.assertRaises(ValueError, self.model.get_cpds, "B")

        self.model.add_node("B")
        self.assertIsNone(self.model.get_cpds("B"))

    def test_add_single_cpd(self):
        cpd_s = TabularCPD("s", 2, np.random.rand(2, 2), ["i"], [2])
        self.G.add_cpds(cpd_s)
        self.assertListEqual(self.G.get_cpds(), [cpd_s])

    def test_add_multiple_cpds(self):
        cpd_d = TabularCPD("d", 2, values=np.random.rand(2, 1))
        cpd_i = TabularCPD("i", 2, values=np.random.rand(2, 1))
        cpd_g = TabularCPD(
            "g",
            2,
            values=np.random.rand(2, 4),
            evidence=["d", "i"],
            evidence_card=[2, 2],
        )
        cpd_l = TabularCPD("l",
                           2,
                           values=np.random.rand(2, 2),
                           evidence=["g"],
                           evidence_card=[2])
        cpd_s = TabularCPD("s",
                           2,
                           values=np.random.rand(2, 2),
                           evidence=["i"],
                           evidence_card=[2])

        self.G.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)
        self.assertEqual(self.G.get_cpds("d"), cpd_d)
        self.assertEqual(self.G.get_cpds("i"), cpd_i)
        self.assertEqual(self.G.get_cpds("g"), cpd_g)
        self.assertEqual(self.G.get_cpds("l"), cpd_l)
        self.assertEqual(self.G.get_cpds("s"), cpd_s)

    def test_check_model(self):
        cpd_g = TabularCPD(
            "g",
            2,
            values=np.array([[0.2, 0.3, 0.4, 0.6], [0.8, 0.7, 0.6, 0.4]]),
            evidence=["d", "i"],
            evidence_card=[2, 2],
        )

        cpd_s = TabularCPD(
            "s",
            2,
            values=np.array([[0.2, 0.3], [0.8, 0.7]]),
            evidence=["i"],
            evidence_card=[2],
        )

        cpd_l = TabularCPD(
            "l",
            2,
            values=np.array([[0.2, 0.3], [0.8, 0.7]]),
            evidence=["g"],
            evidence_card=[2],
        )

        self.G.add_cpds(cpd_g, cpd_s, cpd_l)
        self.assertRaises(ValueError, self.G.check_model)

        cpd_d = TabularCPD("d", 2, values=[[0.8, 0.2]])
        cpd_i = TabularCPD("i", 2, values=[[0.7, 0.3]])
        self.G.add_cpds(cpd_d, cpd_i)

        self.assertTrue(self.G.check_model())

    def test_check_model1(self):
        cpd_g = TabularCPD(
            "g",
            2,
            values=np.array([[0.2, 0.3], [0.8, 0.7]]),
            evidence=["i"],
            evidence_card=[2],
        )
        self.G.add_cpds(cpd_g)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_g)

        cpd_g = TabularCPD(
            "g",
            2,
            values=np.array([[0.2, 0.3, 0.4, 0.6], [0.8, 0.7, 0.6, 0.4]]),
            evidence=["d", "s"],
            evidence_card=[2, 2],
        )
        self.G.add_cpds(cpd_g)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_g)

        cpd_g = TabularCPD(
            "g",
            2,
            values=np.array([[0.2, 0.3], [0.8, 0.7]]),
            evidence=["l"],
            evidence_card=[2],
        )
        self.G.add_cpds(cpd_g)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_g)

        cpd_l = TabularCPD(
            "l",
            2,
            values=np.array([[0.2, 0.3], [0.8, 0.7]]),
            evidence=["d"],
            evidence_card=[2],
        )
        self.G.add_cpds(cpd_l)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_l)

        cpd_l = TabularCPD(
            "l",
            2,
            values=np.array([[0.2, 0.3, 0.4, 0.6], [0.8, 0.7, 0.6, 0.4]]),
            evidence=["d", "i"],
            evidence_card=[2, 2],
        )
        self.G.add_cpds(cpd_l)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_l)

        cpd_l = TabularCPD(
            "l",
            2,
            values=np.array([
                [0.2, 0.3, 0.4, 0.6, 0.2, 0.3, 0.4, 0.6],
                [0.8, 0.7, 0.6, 0.4, 0.8, 0.7, 0.6, 0.4],
            ]),
            evidence=["g", "d", "i"],
            evidence_card=[2, 2, 2],
        )
        self.G.add_cpds(cpd_l)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_l)

    def test_check_model2(self):
        cpd_s = TabularCPD(
            "s",
            2,
            values=np.array([[0.5, 0.3], [0.8, 0.7]]),
            evidence=["i"],
            evidence_card=[2],
        )
        self.G.add_cpds(cpd_s)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_s)

        cpd_g = TabularCPD(
            "g",
            2,
            values=np.array([[0.2, 0.3, 0.4, 0.6], [0.3, 0.7, 0.6, 0.4]]),
            evidence=["d", "i"],
            evidence_card=[2, 2],
        )
        self.G.add_cpds(cpd_g)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_g)

        cpd_l = TabularCPD(
            "l",
            2,
            values=np.array([[0.2, 0.3], [0.1, 0.7]]),
            evidence=["g"],
            evidence_card=[2],
        )
        self.G.add_cpds(cpd_l)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_l)

    def tearDown(self):
        del self.G
Example #11
class TestDAGCreation(unittest.TestCase):
    def setUp(self):
        self.graph = DAG()

    def test_class_init_without_data(self):
        self.assertIsInstance(self.graph, DAG)

    def test_class_init_with_data_string(self):
        self.graph = DAG([("a", "b"), ("b", "c")])
        self.assertListEqual(sorted(self.graph.nodes()), ["a", "b", "c"])
        self.assertListEqual(
            hf.recursive_sorted(self.graph.edges()), [["a", "b"], ["b", "c"]]
        )

    def test_add_node_string(self):
        self.graph.add_node("a")
        self.assertListEqual(list(self.graph.nodes()), ["a"])

    def test_add_node_nonstring(self):
        self.graph.add_node(1)

    def test_add_nodes_from_string(self):
        self.graph.add_nodes_from(["a", "b", "c", "d"])
        self.assertListEqual(sorted(self.graph.nodes()), ["a", "b", "c", "d"])

    def test_add_nodes_from_non_string(self):
        self.graph.add_nodes_from([1, 2, 3, 4])

    def test_add_node_weight(self):
        self.graph.add_node("weighted_a", 0.3)
        self.assertEqual(self.graph.nodes["weighted_a"]["weight"], 0.3)

    def test_add_nodes_from_weight(self):
        self.graph.add_nodes_from(["weighted_b", "weighted_c"], [0.5, 0.6])
        self.assertEqual(self.graph.nodes["weighted_b"]["weight"], 0.5)
        self.assertEqual(self.graph.nodes["weighted_c"]["weight"], 0.6)

        self.graph.add_nodes_from(["e", "f"])
        self.assertEqual(self.graph.nodes["e"]["weight"], None)
        self.assertEqual(self.graph.nodes["f"]["weight"], None)

    def test_add_edge_string(self):
        self.graph.add_edge("d", "e")
        self.assertListEqual(sorted(self.graph.nodes()), ["d", "e"])
        self.assertListEqual(list(self.graph.edges()), [("d", "e")])
        self.graph.add_nodes_from(["a", "b", "c"])
        self.graph.add_edge("a", "b")
        self.assertListEqual(
            hf.recursive_sorted(self.graph.edges()), [["a", "b"], ["d", "e"]]
        )

    def test_add_edge_nonstring(self):
        self.graph.add_edge(1, 2)

    def test_add_edges_from_string(self):
        self.graph.add_edges_from([("a", "b"), ("b", "c")])
        self.assertListEqual(sorted(self.graph.nodes()), ["a", "b", "c"])
        self.assertListEqual(
            hf.recursive_sorted(self.graph.edges()), [["a", "b"], ["b", "c"]]
        )
        self.graph.add_nodes_from(["d", "e", "f"])
        self.graph.add_edges_from([("d", "e"), ("e", "f")])
        self.assertListEqual(sorted(self.graph.nodes()), ["a", "b", "c", "d", "e", "f"])
        self.assertListEqual(
            hf.recursive_sorted(self.graph.edges()),
            hf.recursive_sorted([("a", "b"), ("b", "c"), ("d", "e"), ("e", "f")]),
        )

    def test_add_edges_from_nonstring(self):
        self.graph.add_edges_from([(1, 2), (2, 3)])

    def test_add_edge_weight(self):
        self.graph.add_edge("a", "b", weight=0.3)
        if nx.__version__.startswith("1"):
            self.assertEqual(self.graph.edge["a"]["b"]["weight"], 0.3)
        else:
            self.assertEqual(self.graph.adj["a"]["b"]["weight"], 0.3)

    def test_add_edges_from_weight(self):
        self.graph.add_edges_from([("b", "c"), ("c", "d")], weights=[0.5, 0.6])
        if nx.__version__.startswith("1"):
            self.assertEqual(self.graph.edge["b"]["c"]["weight"], 0.5)
            self.assertEqual(self.graph.edge["c"]["d"]["weight"], 0.6)

            self.graph.add_edges_from([("e", "f")])
            self.assertEqual(self.graph.edge["e"]["f"]["weight"], None)
        else:
            self.assertEqual(self.graph.adj["b"]["c"]["weight"], 0.5)
            self.assertEqual(self.graph.adj["c"]["d"]["weight"], 0.6)

            self.graph.add_edges_from([("e", "f")])
            self.assertEqual(self.graph.adj["e"]["f"]["weight"], None)

    def test_update_node_parents_bm_constructor(self):
        self.graph = DAG([("a", "b"), ("b", "c")])
        self.assertListEqual(list(self.graph.predecessors("a")), [])
        self.assertListEqual(list(self.graph.predecessors("b")), ["a"])
        self.assertListEqual(list(self.graph.predecessors("c")), ["b"])

    def test_update_node_parents(self):
        self.graph.add_nodes_from(["a", "b", "c"])
        self.graph.add_edges_from([("a", "b"), ("b", "c")])
        self.assertListEqual(list(self.graph.predecessors("a")), [])
        self.assertListEqual(list(self.graph.predecessors("b")), ["a"])
        self.assertListEqual(list(self.graph.predecessors("c")), ["b"])

    def test_get_leaves(self):
        self.graph.add_edges_from(
            [("A", "B"), ("B", "C"), ("B", "D"), ("D", "E"), ("D", "F"), ("A", "G")]
        )
        self.assertEqual(sorted(self.graph.get_leaves()), sorted(["C", "G", "E", "F"]))

    def test_get_roots(self):
        self.graph.add_edges_from(
            [("A", "B"), ("B", "C"), ("B", "D"), ("D", "E"), ("D", "F"), ("A", "G")]
        )
        self.assertEqual(["A"], self.graph.get_roots())
        self.graph.add_edge("H", "G")
        self.assertEqual(sorted(["A", "H"]), sorted(self.graph.get_roots()))

    def test_init_with_cycle(self):
        self.assertRaises(ValueError, DAG, [("a", "a")])
        self.assertRaises(ValueError, DAG, [("a", "b"), ("b", "a")])
        self.assertRaises(ValueError, DAG, [("a", "b"), ("b", "c"), ("c", "a")])

    def tearDown(self):
        del self.graph
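
The weight handling exercised above, condensed into a short standalone sketch (networkx 2.x attribute access):

from pgmpy.base import DAG

graph = DAG()
graph.add_node("weighted_a", 0.3)                   # node with a weight
graph.add_edge("a", "b", weight=0.3)                # single weighted edge
graph.add_edges_from([("b", "c"), ("c", "d")], weights=[0.5, 0.6])

print(graph.nodes["weighted_a"]["weight"])          # 0.3
print(graph.adj["a"]["b"]["weight"])                # 0.3
print(graph.adj["c"]["d"]["weight"])                # 0.6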
Example #12
    def test_class_init_with_data_string(self):
        self.graph = DAG([("a", "b"), ("b", "c")])
        self.assertListEqual(sorted(self.graph.nodes()), ["a", "b", "c"])
        self.assertListEqual(
            hf.recursive_sorted(self.graph.edges()), [["a", "b"], ["b", "c"]]
        )
Example #13
    def setUp(self):
        self.graph = DAG()
        self.graph.add_edges_from([("diff", "grade"), ("intel", "grade")])
Example #14
    def test_update_node_parents_bm_constructor(self):
        self.graph = DAG([("a", "b"), ("b", "c")])
        self.assertListEqual(list(self.graph.predecessors("a")), [])
        self.assertListEqual(list(self.graph.predecessors("b")), ["a"])
        self.assertListEqual(list(self.graph.predecessors("c")), ["b"])
Example #15
    def setUp(self):
        self.graph = DAG()
Example #16
    def pdag_to_dag(pdag):
        """Completes a PDAG to a DAG, without adding v-structures, if such a
        completion exists. If no faithful extension is possible, some fully
        oriented DAG that corresponds to the PDAG is returned and a warning is
        generated. This is a static method.

        Parameters
        ----------
        pdag: DAG
            A directed acyclic graph pattern, consisting of (acyclic) directed edges
            as well as "undirected" edges, represented as both-way edges between
            nodes.

        Returns
        -------
        dag: DAG
            A faithful orientation of pdag, if one exists. Otherwise any
            fully oriented DAG/BayesianModel with the structure of pdag.

        References
        ----------
        [1] Chickering, Learning Equivalence Classes of Bayesian-Network Structures,
            2002; See page 454 (last paragraph) for the algorithm pdag_to_dag
            http://www.jmlr.org/papers/volume2/chickering02a/chickering02a.pdf
        [2] Dor & Tarsi, A simple algorithm to construct a consistent extension
            of a partially oriented graph, 1992,
            http://ftp.cs.ucla.edu/pub/stat_ser/r185-dor-tarsi.pdf

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.base import DAG
        >>> from pgmpy.estimators import ConstraintBasedEstimator
        >>> data = pd.DataFrame(np.random.randint(0, 4, size=(5000, 3)), columns=list('ABD'))
        >>> data['C'] = data['A'] - data['B']
        >>> data['D'] += data['A']
        >>> c = ConstraintBasedEstimator(data)
        >>> pdag = c.skeleton_to_pdag(*c.estimate_skeleton())
        >>> pdag.edges()
        [('B', 'C'), ('D', 'A'), ('A', 'D'), ('A', 'C')]
        >>> c.pdag_to_dag(pdag).edges()
        [('B', 'C'), ('A', 'D'), ('A', 'C')]

        >>> # pdag_to_dag is static:
        ... pdag1 = DAG([('A', 'B'), ('C', 'B'), ('C', 'D'), ('D', 'C'), ('D', 'A'), ('A', 'D')])
        >>> ConstraintBasedEstimator.pdag_to_dag(pdag1).edges()
        [('D', 'C'), ('C', 'B'), ('A', 'B'), ('A', 'D')]

        >>> # example of a pdag with no faithful extension:
        ... pdag2 = DAG([('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')])
        >>> ConstraintBasedEstimator.pdag_to_dag(pdag2).edges()
        UserWarning: PDAG has no faithful extension (= no oriented DAG with the same v-structures as PDAG).
        Remaining undirected PDAG edges oriented arbitrarily.
        [('B', 'C'), ('A', 'B'), ('A', 'C')]
        """

        pdag = pdag.copy()
        dag = DAG()
        dag.add_nodes_from(pdag.nodes())

        # add already directed edges of pdag to dag
        for X, Y in pdag.edges():
            if not pdag.has_edge(Y, X):
                dag.add_edge(X, Y)

        while pdag.number_of_nodes() > 0:
            # find node with (1) no directed outgoing edges and
            #                (2) the set of undirected neighbors is either empty or
            #                    undirected neighbors + parents of X are a clique
            found = False
            for X in pdag.nodes():
                directed_outgoing_edges = set(pdag.successors(X)) - set(
                    pdag.predecessors(X))
                undirected_neighbors = set(pdag.successors(X)) & set(
                    pdag.predecessors(X))
                neighbors_are_clique = all((pdag.has_edge(Y, Z)
                                            for Z in pdag.predecessors(X)
                                            for Y in undirected_neighbors
                                            if not Y == Z))

                if not directed_outgoing_edges and (not undirected_neighbors
                                                    or neighbors_are_clique):
                    found = True
                    # add all edges of X as outgoing edges to dag
                    for Y in pdag.predecessors(X):
                        dag.add_edge(Y, X)

                    pdag.remove_node(X)
                    break

            if not found:
                warn(
                    "PDAG has no faithful extension (= no oriented DAG with the "
                    +
                    "same v-structures as PDAG). Remaining undirected PDAG edges "
                    + "oriented arbitrarily.")
                for X, Y in pdag.edges():
                    if not dag.has_edge(Y, X):
                        try:
                            dag.add_edge(X, Y)
                        except ValueError:
                            pass
                break

        return dag
    def estimate(
        self, start=None, tabu_length=0, max_indegree=None, epsilon=1e-4, max_iter=1e6
    ):
        """
        Performs local hill climb search to estimate the `DAG` structure
        that has optimal score, according to the scoring method supplied in the constructor.
        Starts at model `start` and proceeds by step-by-step network modifications
        until a local maximum is reached. Only estimates network structure, no parametrization.

        Parameters
        ----------
        start: DAG instance
            The starting point for the local search. By default a completely disconnected network is used.

        tabu_length: int
            If provided, the last `tabu_length` graph modifications cannot be reversed
            during the search procedure. This serves to enforce a wider exploration
            of the search space. Default value: 0.

        max_indegree: int or None
            If provided and not None, the procedure only searches among models
            where all nodes have at most `max_indegree` parents. Defaults to None.

        epsilon: float (default: 1e-4)
            Defines the exit condition. If the improvement in score is less than `epsilon`,
            the learned model is returned.

        max_iter: int (default: 1e6)
            The maximum number of iterations allowed. Returns the learned model when the
            number of iterations is greater than `max_iter`.

        Returns
        -------
        model: `DAG` instance
            A `DAG` at a (local) score maximum.

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.estimators import HillClimbSearch, BicScore
        >>> # create data sample with 9 random variables:
        ... data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 9)), columns=list('ABCDEFGHI'))
        >>> # add 10th dependent variable
        ... data['J'] = data['A'] * data['B']
        >>> est = HillClimbSearch(data, scoring_method=BicScore(data))
        >>> best_model = est.estimate()
        >>> sorted(best_model.nodes())
        ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
        >>> best_model.edges()
        [('B', 'J'), ('A', 'J')]
        >>> # search a model with restriction on the number of parents:
        >>> est.estimate(max_indegree=1).edges()
        [('J', 'A'), ('B', 'J')]
        """
        nodes = self.state_names.keys()
        if start is None:
            start = DAG()
            start.add_nodes_from(nodes)
        elif not isinstance(start, DAG) or not set(start.nodes()) == set(nodes):
            raise ValueError(
                "'start' should be a DAG with the same variables as the data set, or 'None'."
            )

        tabu_list = []
        current_model = start

        iter_no = 0
        while iter_no <= max_iter:
            iter_no += 1

            best_score_delta = 0
            best_operation = None

            for operation, score_delta in self._legal_operations(
                current_model, tabu_list, max_indegree
            ):
                if score_delta > best_score_delta:
                    best_operation = operation
                    best_score_delta = score_delta

            if best_operation is None or best_score_delta < epsilon:
                break
            elif best_operation[0] == "+":
                current_model.add_edge(*best_operation[1])
                tabu_list = ([("-", best_operation[1])] + tabu_list)[:tabu_length]
            elif best_operation[0] == "-":
                current_model.remove_edge(*best_operation[1])
                tabu_list = ([("+", best_operation[1])] + tabu_list)[:tabu_length]
            elif best_operation[0] == "flip":
                X, Y = best_operation[1]
                current_model.remove_edge(X, Y)
                current_model.add_edge(Y, X)
                tabu_list = ([best_operation] + tabu_list)[:tabu_length]

        return current_model
Example #18
    def setUp(self):
        self.graph = DAG()
        self.graph.add_edges_from([("X", "A"), ("A", "Y"), ("A", "B")])