Example no. 1
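All of the snippets below are excerpts from a test module exercising from_pandas_dynamic; the data_dynotears_p1/p2/p3 fixtures (dictionaries holding simulated "X", "Y", "W" and "A" arrays) are assumed to be provided by the suite's conftest. A minimal sketch of the imports they rely on, assuming the implementation under test is causalnex's:

import re

import networkx as nx
import numpy as np
import pandas as pd
import pytest

from causalnex.structure.dynotears import from_pandas_dynamic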
    def test_list_of_dfs_as_input(self, data_dynotears_p2):
        """
        the result when given a list of dataframes should be the same as a single dataframe.
        Also, stacking two dataframes should give the same result as well
        """
        df = pd.DataFrame(data_dynotears_p2["X"],
                          columns=["a", "b", "c", "d", "e"])
        df.loc[-1, :] = data_dynotears_p2["Y"][0, :5]
        df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10]

        df = df.sort_index()
        df_ = df.copy()
        df_.index = range(100, 152)
        df = pd.concat([df, df_])
        sm = from_pandas_dynamic(df, p=2, w_threshold=0.05)
        sm_1 = from_pandas_dynamic([df], p=2, w_threshold=0.05)
        sm_2 = from_pandas_dynamic([df, df], p=2, w_threshold=0.05)

        assert list(sm_2.edges) == list(sm_1.edges)
        assert list(sm.edges) == list(sm_1.edges)

        weights = np.array([w for _, _, w in sm.edges(data="weight")])
        weights_1 = np.array([w for _, _, w in sm_1.edges(data="weight")])
        weights_2 = np.array([w for _, _, w in sm_2.edges(data="weight")])
        assert np.max(np.abs(weights - weights_1)) < 0.001
        assert np.max(np.abs(weights - weights_2)) < 0.001
Example no. 2
    def test_single_iter_gets_converged_fail_warnings(self, data_dynotears_p1):
        """
        With a single iteration on this dataset, from_pandas_dynamic fails to converge and should raise a warning.
        """

        with pytest.warns(
                UserWarning,
                match=r"Failed to converge\. Consider increasing max_iter."):
            from_pandas_dynamic(pd.DataFrame(data_dynotears_p1["X"]),
                                p=1,
                                max_iter=1)
Example no. 3
    def test_expected_structure_learned_p1(self, data_dynotears_p1):
        """
        Given a small data set with p=1, the learned structure should contain all the intra-slice edges and the majority of the inter-slice ones.
        """
        df = pd.DataFrame(data_dynotears_p1["X"],
                          columns=["a", "b", "c", "d", "e"])
        df.loc[-1, :] = data_dynotears_p1["Y"][0, :]
        df = df.sort_index()

        sm = from_pandas_dynamic(
            df,
            p=1,
            w_threshold=0.2,
        )
        map_ = dict(zip(range(5), ["a", "b", "c", "d", "e"]))
        w_edges = [("{i}_lag0".format(i=map_[i]), "{j}_lag0".format(j=map_[j]))
                   for i in range(5) for j in range(5)
                   if data_dynotears_p1["W"][i, j] != 0]
        a_edges = [(
            "{i_1}_lag{i_2}".format(i_1=map_[i % 5], i_2=1 + i // 5),
            "{j}_lag0".format(j=map_[j]),
        ) for i in range(5) for j in range(5)
                   if data_dynotears_p1["A"][i, j] != 0]

        edges_in_sm_and_a = [el for el in sm.edges if el in a_edges]
        sm_inter_edges = [el for el in sm.edges if "lag0" not in el[0]]
        assert sorted(el for el in sm.edges
                      if "lag0" in el[0]) == sorted(w_edges)
        assert len(edges_in_sm_and_a) / len(a_edges) > 0.6
        assert len(edges_in_sm_and_a) / len(sm_inter_edges) > 0.9
Example no. 4
def dynotears(data, tau_max=5, alpha=0.0):
    """
    Run DYNOTEARS structure learning on `data` and return the result as a
    dictionary mapping each variable name to a list of (parent_name, -lag) tuples.
    """
    graph_dict = dict()
    for name in data.columns:
        graph_dict[name] = []

    sm = from_pandas_dynamic(data, p=tau_max, w_threshold=0.01, lambda_w=0.05, lambda_a=0.05)

    # Nodes are named "{column}_lag{l}"; recover the original column name for each
    # node, assuming sm.nodes lists the tau_max + 1 lags of a column consecutively.
    tname_to_name_dict = dict()
    count_lag = 0
    idx_name = 0
    for tname in sm.nodes:
        tname_to_name_dict[tname] = data.columns[idx_name]
        if count_lag == tau_max:
            idx_name = idx_name + 1
            count_lag = -1
        count_lag = count_lag + 1

    # Convert each learned edge (cause -> effect) into a (parent, -lag) entry
    # keyed by the effect variable, skipping duplicates.
    for ce in sm.edges:
        c = ce[0]
        e = ce[1]
        tc = int(c.partition("lag")[2])
        te = int(e.partition("lag")[2])
        t = tc - te
        if (tname_to_name_dict[c], -t) not in graph_dict[tname_to_name_dict[e]]:
            graph_dict[tname_to_name_dict[e]].append((tname_to_name_dict[c], -t))

    return graph_dict
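
A minimal usage sketch of the wrapper above (the toy two-column data, seed, and tau_max value are illustrative assumptions, not part of the original code):

# Hypothetical toy series in which "b" follows "a" with a one-step lag.
rng = np.random.default_rng(0)
a = rng.normal(size=200)
data = pd.DataFrame({"a": a, "b": np.roll(a, 1)})

graph = dynotears(data, tau_max=1)
# graph maps each column to its lagged parents, e.g. possibly
# {"a": [], "b": [("a", -1)]} if the a -> b link survives the threshold.
print(graph)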
Example no. 5
    def test_expected_structure_learned_p2(self, data_dynotears_p2):
        """
        Given a small data set with p=2, all the intra-slice edges found must be correct and 90%+ of the true ones must be found.
        The majority of the inter-slice edges must be found too.
        """
        df = pd.DataFrame(data_dynotears_p2["X"],
                          columns=["a", "b", "c", "d", "e"])
        df.loc[-1, :] = data_dynotears_p2["Y"][0, :5]
        df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10]

        df = df.sort_index()

        sm = from_pandas_dynamic(
            df,
            p=2,
            w_threshold=0.25,
        )
        map_ = dict(zip(range(5), ["a", "b", "c", "d", "e"]))
        w_edges = [(f"{map_[i]}_lag0", f"{map_[j]}_lag0") for i in range(5)
                   for j in range(5) if data_dynotears_p2["W"][i, j] != 0]
        a_edges = [(
            f"{map_[i % 5]}_lag{1 + i // 5}",
            f"{map_[j]}_lag0",
        ) for i in range(5) for j in range(5)
                   if data_dynotears_p2["A"][i, j] != 0]

        edges_in_sm_and_a = [el for el in sm.edges if el in a_edges]
        sm_inter_edges = [el for el in sm.edges if "lag0" not in el[0]]
        sm_intra_edges = [el for el in sm.edges if "lag0" in el[0]]

        assert len([el for el in sm_intra_edges if el not in w_edges]) == 0
        assert (len([el for el in w_edges if el not in sm_intra_edges]) /
                len(w_edges) <= 1.0)
        assert len(edges_in_sm_and_a) / len(a_edges) > 0.5
        assert len(edges_in_sm_and_a) / len(sm_inter_edges) > 0.5
Example no. 6
    def test_isolated_nodes_exist(self, data_dynotears_p2):
        """Isolated nodes should still be in the learned structure"""
        df = pd.DataFrame(data_dynotears_p2["X"],
                          columns=["a", "b", "c", "d", "e"])
        df.loc[-1, :] = data_dynotears_p2["Y"][0, :5]
        df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10]
        df = df.sort_index()

        sm = from_pandas_dynamic(df, p=2, w_threshold=1)
        assert len(sm.edges) == 2
        assert len(sm.nodes) == 15
Example no. 7
 def test_edges_contain_weight(self, data_dynotears_p3):
     """Edges must contain the 'weight' from the adjacent table """
     sm = from_pandas_dynamic(
         pd.DataFrame(data_dynotears_p3["X"],
                      columns=["a", "b", "c", "d", "e"]),
         p=3,
     )
     assert np.all([
         isinstance(w, (float, int, np.number))
         for u, v, w in sm.edges(data="weight")
     ])
Example no. 8
 def test_tabu_children(self, data_dynotears_p3):
     """
     If tabu relationships are set, the corresponding edges must not exist
     """
     sm = from_pandas_dynamic(
         pd.DataFrame(data_dynotears_p3["X"],
                      columns=["a", "b", "c", "d", "e"]),
         p=3,
         tabu_child_nodes=["c", "d"],
     )
     assert not ([el for el in sm.edges if "c_lag" in el[1]])
     assert not ([el for el in sm.edges if "d_lag" in el[1]])
     sm = from_pandas_dynamic(
         pd.DataFrame(data_dynotears_p3["X"],
                      columns=["a", "b", "c", "d", "e"]),
         p=3,
         tabu_child_nodes=["a", "b"],
     )
     assert not ([el for el in sm.edges if "a_lag" in el[1]])
     assert not ([el for el in sm.edges if "b_lag" in el[1]])
Example no. 9
    def test_inverse_relationships_get_negative_weight(self):
        """If a == -b always, there should be an edge a->b or b->a with coefficient close to minus one """

        np.random.seed(17)
        data = pd.DataFrame([[el, -el]
                             for el in np.random.choice(100, size=500)],
                            columns=["a", "b"])
        sm = from_pandas_dynamic(data, p=1, w_threshold=0.1)
        edge = (sm.get_edge_data("b_lag0", "a_lag0")
                or sm.get_edge_data("a_lag0", "b_lag0"))["weight"]
        assert -1.01 < edge <= -0.99
Example no. 10
 def test_no_cycles(self, data_dynotears_p2):
     """
     The learned structure should be acyclic
     """
     sm = from_pandas_dynamic(
         pd.DataFrame(data_dynotears_p2["X"],
                      columns=["a", "b", "c", "d", "e"]),
         p=2,
         w_threshold=0.05,
     )
     assert nx.algorithms.is_directed_acyclic_graph(sm)
Example no. 11
 def test_all_columns_in_structure(self, data_dynotears_p2):
     """Every columns that is in the data should become a node in the learned structure"""
     sm = from_pandas_dynamic(
         pd.DataFrame(data_dynotears_p2["X"],
                      columns=["a", "b", "c", "d", "e"]),
         p=2,
         w_threshold=0.4,
     )
     assert sorted(sm.nodes) == [
         "{var}_lag{l_val}".format(var=var, l_val=l_val)
         for var in ["a", "b", "c", "d", "e"] for l_val in range(3)
     ]
Example no. 12
    def test_tabu_edges_on_non_existing_edges_do_nothing(
            self, data_dynotears_p2):
        """If tabu edges do not exist in the original unconstrained network then nothing changes"""
        df = pd.DataFrame(data_dynotears_p2["X"],
                          columns=["a", "b", "c", "d", "e"])
        df.loc[-1, :] = data_dynotears_p2["Y"][0, :5]
        df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10]
        df = df.sort_index()

        sm = from_pandas_dynamic(
            df,
            p=2,
            w_threshold=0.2,
        )
        sm_2 = from_pandas_dynamic(
            df,
            p=2,
            w_threshold=0.2,
            tabu_edges=[(0, "a", "a"), (0, "a", "b"), (0, "a", "c"),
                        (0, "a", "d")],
        )
        assert set(sm_2.edges) == set(sm.edges)
Example no. 13
    def test_inter_edges(self, data_dynotears_p3):
        """
        Inter-slice edges must be of the form {var}_lag{l} -> {var'}_lag0, with l > 0.
        """

        sm = from_pandas_dynamic(
            pd.DataFrame(data_dynotears_p3["X"],
                         columns=["a", "b", "c", "d", "e"]),
            p=3,
        )
        for start, end in sm.edges:
            if int(start[-1]) > 0:
                assert int(end[-1]) == 0
Example no. 14
    def test_certain_relationships_get_near_certain_weight(self):
        """If a == b always, ther should be an edge a->b or b->a with coefficient close to one """

        np.random.seed(17)
        data = pd.DataFrame(
            [[np.sqrt(el), np.sqrt(el)]
             for el in np.random.choice(100, size=500)],
            columns=["a", "b"],
        )
        sm = from_pandas_dynamic(data, p=1, w_threshold=0.1)
        edge = (sm.get_edge_data("b_lag0", "a_lag0")
                or sm.get_edge_data("a_lag0", "b_lag0"))["weight"]

        assert 0.99 < edge <= 1.01
Example no. 15
    def test_discontinuity(self):
        """
        The result when the data has a point of discontinuity must be the same as if we cut the DataFrame in two
        (at the discontinuity) and provide the two pieces as separate inputs.

        This is because, internally, the algorithm splits the DataFrames into continuous chunks.
        """
        np.random.seed(12)
        df = pd.DataFrame(np.random.random([100, 5]),
                          columns=["a", "b", "c", "d", "e"])
        df_2 = pd.DataFrame(
            np.random.random([100, 5]),
            columns=["a", "b", "c", "d", "e"],
            index=np.arange(200, 300),
        )

        sm = from_pandas_dynamic(pd.concat([df, df_2], axis=0),
                                 p=2,
                                 w_threshold=0.05)
        sm_1 = from_pandas_dynamic([df, df_2], p=2, w_threshold=0.05)

        assert [
            (u, v, round(w, 3)) for u, v, w in sm_1.edges(data="weight")
        ] == [(u, v, round(w, 3)) for u, v, w in sm.edges(data="weight")]
Example no. 16
    def test_naming_nodes(self, data_dynotears_p3):
        """
        Nodes should have the format {var}_lag{l}
        """
        sm = from_pandas_dynamic(
            pd.DataFrame(data_dynotears_p3["X"],
                         columns=["a", "b", "c", "d", "e"]),
            p=3,
        )
        pattern = re.compile(r"[abcde]_lag[0-3]")

        for node in sm.nodes:
            match = pattern.match(node)
            assert match
            assert match.group() == node
Example no. 17
    def test_tabu_edges(self, data_dynotears_p3):
        """
        Tabu edges must not be in the edges learnt
        """
        sm = from_pandas_dynamic(
            pd.DataFrame(data_dynotears_p3["X"],
                         columns=["a", "b", "c", "d", "e"]),
            p=3,
            tabu_edges=[(0, "c", "e"), (0, "a", "d"), (1, "b", "e"),
                        (1, "d", "e")],
        )

        assert ("c_lag0", "e_lag0") not in sm.edges
        assert ("a_lag0", "d_lag0") not in sm.edges
        assert ("b_lag1", "e_lag0") not in sm.edges
        assert ("d_lag1", "e_lag0") not in sm.edges
Example no. 18
    def test_multiple_tabu(self, data_dynotears_p3):
        """
        If tabu relationships are set, the corresponding edges must not exist
        """
        sm = from_pandas_dynamic(
            pd.DataFrame(data_dynotears_p3["X"],
                         columns=["a", "b", "c", "d", "e"]),
            p=3,
            tabu_edges=[(0, "a", "e"), (0, "a", "d"), (1, "b", "e"),
                        (1, "d", "e")],
            tabu_child_nodes=["a", "b"],
            tabu_parent_nodes=["d"],
        )

        assert ("a_lag0", "e_lag0") not in sm.edges
        assert ("a_lag0", "d_lag0") not in sm.edges
        assert ("b_lag1", "e_lag0") not in sm.edges
        assert ("d_lag1", "e_lag0") not in sm.edges
        assert not ([el for el in sm.edges if "a_lag" in el[1]])
        assert not ([el for el in sm.edges if "b_lag" in el[1]])
        assert not ([el for el in sm.edges if "d_lag" in el[0]])
Example no. 19
    def test_empty_data_raises_error(self):
        """
        Providing an empty data set should result in a ValueError explaining that the data must not be empty.
        This error is useful to catch and handle gracefully, because otherwise the user would experience
        misleading division-by-zero or unpacking errors.
        """
        with pytest.raises(
                ValueError,
                match="Input data X is empty, cannot learn any structure"):
            from_pandas_dynamic(pd.DataFrame(np.empty([2, 5])), p=2)

        with pytest.raises(
                ValueError,
                match="Input data X is empty, cannot learn any structure"):
            from_pandas_dynamic(pd.DataFrame(np.empty([1, 5])), p=1)

        with pytest.raises(
                ValueError,
                match="Input data X is empty, cannot learn any structure"):
            from_pandas_dynamic(pd.DataFrame(np.empty([0, 5])), p=1)
Example no. 20
    def test_incorrect_input_format(self):
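        """Malformed or inconsistent inputs should raise informative ValueError/TypeError messages."""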
        with pytest.raises(
                ValueError,
                match="Provided empty list of time_series."
                " At least one DataFrame must be provided",
        ):
            from_pandas_dynamic([], 1)

        with pytest.raises(
                ValueError,
                match=r"All columns must have numeric data\. "
                r"Consider mapping the following columns to int: \['a'\]",
        ):
            from_pandas_dynamic(pd.DataFrame([["1"]], columns=["a"]), 1)

        with pytest.raises(
                TypeError,
                match="Time series entries must be instances of `pd.DataFrame`",
        ):
            from_pandas_dynamic([np.array([1, 2])], 1)

        with pytest.raises(
                ValueError,
                match=
                "Index for dataframe must be provided in increasing order",
        ):
            df = pd.DataFrame(np.random.random([5, 5]), index=[3, 1, 2, 5, 0])
            from_pandas_dynamic(df, 1)

        with pytest.raises(
                ValueError,
                match="All inputs must have the same columns and same types",
        ):
            df = pd.DataFrame(
                np.random.random([5, 5]),
                columns=["a", "b", "c", "d", "e"],
            )
            df_2 = pd.DataFrame(
                np.random.random([5, 5]),
                columns=["a", "b", "c", "d", "f"],
            )
            from_pandas_dynamic([df, df_2], 1)

        with pytest.raises(
                ValueError,
                match="All inputs must have the same columns and same types",
        ):
            df = pd.DataFrame(
                np.random.random([5, 5]),
                columns=["a", "b", "c", "d", "e"],
            )
            df_2 = pd.DataFrame(
                np.random.random([5, 5]),
                columns=["a", "b", "c", "d", "e"],
            )
            df_2["a"] = df_2["a"].astype(int)
            from_pandas_dynamic([df, df_2], 1)

        with pytest.raises(
                TypeError,
                match="Index must be integers",
        ):
            df = pd.DataFrame(np.random.random([5, 5]),
                              index=[0, 1, 2, 3.0, 4])
            from_pandas_dynamic(df, 1)