    def test_add_weighted_edges_from_other(self):
        """edges added with an unrecognised origin should raise an error"""

        sm = StructureModel()

        with pytest.raises(ValueError, match="^Unknown origin: must be one of.*$"):
            sm.add_weighted_edges_from([(1, 2, 0.5)], origin="other")
    def test_all_states_included(self):
        """All states in a node should be included"""
        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "b", 1)])
        bn = BayesianNetwork(cg).fit_node_states(
            pd.DataFrame([[i, i] for i in range(10)], columns=["a", "b"]))
        assert all(v in bn.node_states["a"] for v in range(10))
    def test_fit_with_null_states_raises_error(self):
        """An error should be raised if fit is called with null data"""
        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "b", 1)])
        with pytest.raises(ValueError, match="node '.*' contains None state"):
            BayesianNetwork(cg).fit_node_states(
                pd.DataFrame([[None, 1]], columns=["a", "b"]))
Example #4
    def test_number_of_nodes(self, num_nodes):
        """Length of each row in generated data equals num_nodes"""
        graph = StructureModel()
        edges = [(n, n + 1, 1) for n in range(num_nodes - 1)]
        graph.add_weighted_edges_from(edges)

        data = generate_binary_data(graph, 100, seed=10)
        assert all(len(sample) == num_nodes for sample in data)
    def test_add_weighted_edges_from_default(self):
        """edges added with default origin should be identified as unknown origin"""

        sm = StructureModel()
        edges = [(1, 2, 0.5), (2, 3, 0.5)]
        sm.add_weighted_edges_from(edges)

        assert all((u, v, w) in sm.edges.data("weight") for u, v, w in edges)
        assert all((u, v, "unknown") in sm.edges.data("origin") for u, v, w in edges)
    def test_add_weighted_edges_from_custom_attr(self):
        """it should be possible to add edges with custom attributes"""

        sm = StructureModel()
        edges = [(1, 2, 0.5), (2, 3, 0.5)]
        sm.add_weighted_edges_from(edges, x="Y")

        assert all((u, v, w) in sm.edges.data("weight") for u, v, w in edges)
        assert all((u, v, "Y") in sm.edges.data("x") for u, v, _ in edges)
    def test_add_weighted_edges_from_multiple_times(self):
        """adding edges again should update the edges origin attr"""

        sm = StructureModel()
        edges = [(1, 2, 0.5), (2, 3, 0.5)]
        sm.add_weighted_edges_from(edges, origin="unknown")
        assert all((u, v, "unknown") in sm.edges.data("origin") for u, v, _ in edges)
        sm.add_weighted_edges_from(edges, origin="learned")
        assert all((u, v, "learned") in sm.edges.data("origin") for u, v, _ in edges)
    def test_add_weighted_edges_from_expert(self):
        """edges added with expert origin should be labelled as expert origin"""

        sm = StructureModel()
        edges = [(1, 2, 0.5), (2, 3, 0.5)]
        sm.add_weighted_edges_from(edges, origin="expert")

        assert all((u, v, w) in sm.edges.data("weight") for u, v, w in edges)
        assert all((u, v, "expert") in sm.edges.data("origin") for u, v, w in edges)
    def test_add_multiple_weighted_edges(self):
        """it should be possible to add multiple edges with different origins"""

        sm = StructureModel()
        sm.add_weighted_edges_from([(1, 2, 0.5)], origin="unknown")
        sm.add_weighted_edges_from([(1, 3, 0.5)], origin="learned")
        sm.add_weighted_edges_from([(1, 4, 0.5)], origin="expert")

        assert (1, 2, "unknown") in sm.edges.data("origin")
        assert (1, 3, "learned") in sm.edges.data("origin")
        assert (1, 4, "expert") in sm.edges.data("origin")
Example #10
    def test_number_of_columns(self, num_nodes, n_categories):
        """Length of dataframe is in the correct shape"""
        graph = StructureModel()
        edges = [(n, n + 1, 1) for n in range(num_nodes - 1)]
        graph.add_weighted_edges_from(edges)

        data = generate_categorical_dataframe(graph,
                                              100,
                                              seed=10,
                                              n_categories=n_categories)
        assert data.shape[1] == (num_nodes * n_categories)
    def test_remove_edges_below_threshold(self):
        """Edges whose weight is less than a defined threshold should be removed"""

        sm = StructureModel()
        strong_edges = [(1, 2, 1.0), (1, 3, 0.8), (1, 5, 2.0)]
        weak_edges = [(1, 4, 0.4), (2, 3, 0.6), (3, 5, 0.5)]
        sm.add_weighted_edges_from(strong_edges)
        sm.add_weighted_edges_from(weak_edges)

        sm.remove_edges_below_threshold(0.7)

        assert set(sm.edges(data="weight")) == set(strong_edges)
Example #12
    def generator(num_nodes, seed, weight=None):
        np.random.seed(seed)

        sm = StructureModel()
        nodes = list("".join(x) for x in product(
            string.ascii_lowercase, string.ascii_lowercase))[:num_nodes]
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)

        # one edge:
        sm.add_weighted_edges_from([("aa", "ab", weight)])
        return sm
    def test_negative_weights(self):
        """Negative edges whose absolute value is greater than the defined threshold should not be removed"""

        sm = StructureModel()
        strong_edges = [(1, 2, -3.0), (3, 1, 0.7), (1, 5, -2.0)]
        weak_edges = [(1, 4, 0.4), (2, 3, -0.6), (3, 5, -0.5)]
        sm.add_weighted_edges_from(strong_edges)
        sm.add_weighted_edges_from(weak_edges)

        sm.remove_edges_below_threshold(0.7)

        assert set(sm.edges(data="weight")) == set(strong_edges)
Example #14
    def test_fit_with_missing_feature_in_data(self):
        """An error should be raised if fit is called with missing feature in data"""
        cg = StructureModel()

        cg.add_weighted_edges_from([("a", "e", 1)])
        with pytest.raises(
            KeyError,
            match="The data does not cover all the features found in the Bayesian Network. "
            "Please check the following features: {'e'}",
        ):
            BayesianNetwork(cg).fit_node_states(
                pd.DataFrame([[1, 1, 1, 1]], columns=["a", "b", "c", "d"])
            )
    def test_different_origins_and_weights(self):
        """The subgraph returned should still have the edge data preserved from the original graph"""

        sm = StructureModel()
        sm.add_weighted_edges_from([(1, 2, 2.0)], origin="unknown")
        sm.add_weighted_edges_from([(1, 3, 1.0)], origin="learned")
        sm.add_weighted_edges_from([(5, 6, 0.7)], origin="expert")

        subgraph = sm.get_target_subgraph(2)

        assert set(subgraph.edges.data("origin")) == {
            (1, 2, "unknown"),
            (1, 3, "learned"),
        }
        assert set(subgraph.edges.data("weight")) == {(1, 2, 2.0), (1, 3, 1.0)}
Example #16
    def test_get_structure(self):
        """The structure retrieved should be the same"""

        sm = StructureModel()

        sm.add_weighted_edges_from([(1, 2, 2.0)], origin="unknown")
        sm.add_weighted_edges_from([(1, 3, 1.0)], origin="learned")
        sm.add_weighted_edges_from([(3, 5, 0.7)], origin="expert")

        bn = BayesianNetwork(sm)

        sm_from_bn = bn.structure

        assert set(sm.edges.data("origin")) == set(sm_from_bn.edges.data("origin"))
        assert set(sm.edges.data("weight")) == set(sm_from_bn.edges.data("weight"))

        assert set(sm.nodes) == set(sm_from_bn.nodes)
Example #17
    def test_set_structure(self):
        """An error should be raised if setting the structure"""

        sm = StructureModel()
        sm.add_weighted_edges_from([(1, 2, 2.0)], origin="unknown")
        sm.add_weighted_edges_from([(1, 3, 1.0)], origin="learned")
        sm.add_weighted_edges_from([(3, 5, 0.7)], origin="expert")

        bn = BayesianNetwork(sm)

        new_sm = StructureModel()
        new_sm.add_weighted_edges_from([(2, 5, 3.0)], origin="unknown")
        new_sm.add_weighted_edges_from([(2, 3, 2.0)], origin="learned")
        new_sm.add_weighted_edges_from([(3, 4, 1.7)], origin="expert")

        with pytest.raises(AttributeError, match=r"can't set attribute"):
            bn.structure = new_sm
    def test_equal_weights(self):
        """Edges whose absolute value is equal to the defined threshold should not be removed"""

        sm = StructureModel()
        strong_edges = [(1, 2, 1.0), (1, 5, 2.0)]
        equal_edges = [(1, 3, 0.6), (2, 3, 0.6)]
        weak_edges = [(1, 4, 0.4), (3, 5, 0.5)]
        sm.add_weighted_edges_from(strong_edges)
        sm.add_weighted_edges_from(equal_edges)
        sm.add_weighted_edges_from(weak_edges)

        sm.remove_edges_below_threshold(0.6)

        assert set(sm.edges(data="weight")) == set.union(
            set(strong_edges), set(equal_edges))
Example #19
def graph():
    graph = StructureModel()
    edges = [(n, n + 1, 1) for n in range(5)]
    graph.add_weighted_edges_from(edges)
    return graph
Example #20
def from_pandas_dynamic(  # pylint: disable=too-many-arguments
    time_series: Union[pd.DataFrame, List[pd.DataFrame]],
    p: int,
    lambda_w: float = 0.1,
    lambda_a: float = 0.1,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[int, int, int]] = None,
    tabu_parent_nodes: List[int] = None,
    tabu_child_nodes: List[int] = None,
) -> StructureModel:
    """
    Learn the graph structure of a Dynamic Bayesian Network describing conditional dependencies between variables in
    data. The input data is a time series or a list of realisations of the same time series.
    The optimisation minimises a score function F(W, A) over the graph's contemporaneous (intra-slice) weighted
    adjacency matrix, W, and lagged (inter-slice) weighted adjacency matrix, A, subject to a constraint function
    h(W), where h(W) == 0 characterises an acyclic graph. h(W) is a continuous, differentiable function that
    encapsulates how far the graph is from being acyclic (smaller values mean closer to acyclic).

    Based on "DYNOTEARS: Structure Learning from Time-Series Data".
    https://arxiv.org/abs/2002.00498
    @inproceedings{pamfil2020dynotears,
        title={DYNOTEARS: Structure Learning from Time-Series Data},
        author={Pamfil, Roxana and Sriwattanaworachai, Nisara and Desai, Shaan and Pilgerstorfer,
        Philip and Georgatzis, Konstantinos and Beaumont, Paul and Aragam, Bryon},
        booktitle={International Conference on Artificial Intelligence and Statistics},
        pages={1595--1605},
        year={2020},
    }
    Args:
        time_series: pd.DataFrame or List of pd.DataFrame instances.
        If a list is provided, each element of the list is a realisation of the same time series (i.e. time series
        governed by the same processes).
        The columns of the data frame represent the variables in the model, and the *index represents the time index*.
        Successive events must therefore be indexed with consecutive integers.
        p: number of past time lags the model is allowed to use. The state of a variable at time `t` is affected by
        past variables up to time `t-p`, as well as by other variables at `t`.
        lambda_w: parameter for l1 regularisation of intra-slice edges
        lambda_a: parameter for l1 regularisation of inter-slice edges
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges (lag, from, to) not to be included in the graph. `lag == 0` implies that the edge is
        forbidden in the INTRA graph (W), while `lag > 0` implies that the corresponding INTER-slice weight is zero.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
        StructureModel representing the model learnt. Node names take the form `{var}_lag{l}`, where `var` is the
        original variable name as given in the input data frames and `l`, in 0, 1, ..., p, is the corresponding
        time lag.
    """
    time_series = [time_series] if not isinstance(time_series, list) else time_series

    X, Xlags = DynamicDataTransformer(p=p).fit_transform(time_series,
                                                         return_df=False)

    col_idx = {c: i for i, c in enumerate(time_series[0].columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(lag, col_idx[u], col_idx[v])
                      for lag, u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy_dynamic(
        X,
        Xlags,
        lambda_w,
        lambda_a,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()
    sm.add_nodes_from([
        "{var}_lag{l_val}".format(var=var, l_val=l_val)
        for var in col_idx.keys() for l_val in range(p + 1)
    ])
    sm.add_weighted_edges_from(
        [(
            _format_name_from_pandas(idx_col, u),
            _format_name_from_pandas(idx_col, v),
            w,
        ) for u, v, w in g.edges.data("weight")],
        origin="learned",
    )

    return sm
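
For reference, a minimal usage sketch of `from_pandas_dynamic` as defined above. The two-column toy data, the seed, and the choices of `p=1` and `w_threshold=0.05` are illustrative assumptions only, not taken from the examples on this page.

import numpy as np
import pandas as pd

# Hypothetical time series: two variables over 50 consecutive, integer-indexed time steps.
rng = np.random.default_rng(seed=0)
ts = pd.DataFrame(rng.normal(size=(50, 2)), columns=["a", "b"])

# Learn intra- and inter-slice structure with a single time lag; prune weak edges.
sm = from_pandas_dynamic(ts, p=1, w_threshold=0.05)

# Nodes are named "{var}_lag{l}", e.g. "a_lag0", "b_lag1".
print(sorted(sm.nodes))
print(list(sm.edges.data("weight")))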
Example #21
    def test_all_nodes_included(self, weighted_edges, data):
        """No errors if all the nodes can be found in the columns of training data"""
        cg = StructureModel()
        cg.add_weighted_edges_from(weighted_edges)
        bn = BayesianNetwork(cg).fit_node_states(data)
        assert all(node in data.columns for node in bn.node_states.keys())