def test_add_weighted_edges_from_other(self):
    """An unrecognised origin value must be rejected with a ValueError"""

    model = StructureModel()
    expected_message = "^Unknown origin: must be one of.*$"

    with pytest.raises(ValueError, match=expected_message):
        model.add_weighted_edges_from([(1, 2, 0.5)], origin="other")
def test_all_states_included(self):
    """Every value observed for a node in the data should appear in its states"""

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "b", 1)])

    # Ten rows, each carrying the same value in both columns.
    rows = [[i, i] for i in range(10)]
    frame = pd.DataFrame(rows, columns=["a", "b"])

    network = BayesianNetwork(structure).fit_node_states(frame)

    missing = [v for v in range(10) if v not in network.node_states["a"]]
    assert not missing
def test_fit_with_null_states_raises_error(self):
    """Fitting node states on data containing None should raise a ValueError"""

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "b", 1)])

    # Column "a" holds a None value.
    frame_with_null = pd.DataFrame([[None, 1]], columns=["a", "b"])

    with pytest.raises(ValueError, match="node '.*' contains None state"):
        BayesianNetwork(structure).fit_node_states(frame_with_null)
def test_number_of_nodes(self, num_nodes):
    """Each generated sample should have exactly num_nodes entries"""

    # Chain graph: 0 -> 1 -> ... -> num_nodes - 1, every edge with weight 1.
    chain = StructureModel()
    chain.add_weighted_edges_from(
        [(node, node + 1, 1) for node in range(num_nodes - 1)]
    )

    samples = generate_binary_data(chain, 100, seed=10)

    wrong_length = [row for row in samples if len(row) != num_nodes]
    assert not wrong_length
def test_add_weighted_edges_from_default(self):
    """Edges added without an explicit origin should be labelled "unknown" """

    edges = [(1, 2, 0.5), (2, 3, 0.5)]
    model = StructureModel()
    model.add_weighted_edges_from(edges)

    for source, target, weight in edges:
        assert (source, target, weight) in model.edges.data("weight")

    for source, target, _ in edges:
        assert (source, target, "unknown") in model.edges.data("origin")
def test_add_weighted_edges_from_custom_attr(self):
    """Extra keyword attributes should be stored on every added edge"""

    edges = [(1, 2, 0.5), (2, 3, 0.5)]
    model = StructureModel()
    model.add_weighted_edges_from(edges, x="Y")

    for source, target, weight in edges:
        assert (source, target, weight) in model.edges.data("weight")

    for source, target, _ in edges:
        assert (source, target, "Y") in model.edges.data("x")
def test_add_weighted_edges_from_multiple_times(self):
    """Re-adding the same edges should overwrite their origin attribute"""

    edges = [(1, 2, 0.5), (2, 3, 0.5)]
    model = StructureModel()

    # Add the same edges twice; after each pass the latest origin must win.
    for origin in ("unknown", "learned"):
        model.add_weighted_edges_from(edges, origin=origin)
        assert all(
            (source, target, origin) in model.edges.data("origin")
            for source, target, _ in edges
        )
def test_add_weighted_edges_from_expert(self):
    """Edges added with origin="expert" should carry the expert label"""

    edges = [(1, 2, 0.5), (2, 3, 0.5)]
    model = StructureModel()
    model.add_weighted_edges_from(edges, origin="expert")

    for source, target, weight in edges:
        assert (source, target, weight) in model.edges.data("weight")

    for source, target, _ in edges:
        assert (source, target, "expert") in model.edges.data("origin")
def test_add_multiple_weighted_edges(self):
    """Edges with different origins should coexist in the same model"""

    # (target node, origin) for each edge leaving node 1.
    cases = [(2, "unknown"), (3, "learned"), (4, "expert")]

    model = StructureModel()
    for target, origin in cases:
        model.add_weighted_edges_from([(1, target, 0.5)], origin=origin)

    # Check only after all edges are in, so later additions cannot have
    # disturbed earlier labels.
    for target, origin in cases:
        assert (1, target, origin) in model.edges.data("origin")
def test_number_of_columns(self, num_nodes, n_categories):
    """The generated frame should have num_nodes * n_categories columns"""

    # Chain graph: 0 -> 1 -> ... -> num_nodes - 1, every edge with weight 1.
    chain = StructureModel()
    chain.add_weighted_edges_from(
        [(node, node + 1, 1) for node in range(num_nodes - 1)]
    )

    frame = generate_categorical_dataframe(
        chain, 100, seed=10, n_categories=n_categories
    )

    _, n_columns = frame.shape[0], frame.shape[1]
    assert n_columns == (num_nodes * n_categories)
def test_remove_edges_below_threshold(self):
    """Only edges with weight at or above the threshold should survive"""

    strong_edges = [(1, 2, 1.0), (1, 3, 0.8), (1, 5, 2.0)]
    weak_edges = [(1, 4, 0.4), (2, 3, 0.6), (3, 5, 0.5)]

    model = StructureModel()
    for batch in (strong_edges, weak_edges):
        model.add_weighted_edges_from(batch)

    model.remove_edges_below_threshold(0.7)

    remaining = set(model.edges(data="weight"))
    assert remaining == set(strong_edges)
def generator(num_nodes, seed, weight=None):
    """
    Build a StructureModel with shuffled two-letter node names and one edge.

    Args:
        num_nodes: how many of the two-letter lowercase names ("aa", "ab", ...)
            to keep, taken from the start of the sequence.
        seed: value used to seed numpy's global random state before shuffling.
        weight: weight given to the single edge "aa" -> "ab".

    Returns:
        The StructureModel holding the shuffled nodes and the single edge.
    """

    np.random.seed(seed)

    letters = string.ascii_lowercase
    names = ["".join(pair) for pair in product(letters, letters)][:num_nodes]
    # Shuffle in place so the insertion order depends on the seed.
    np.random.shuffle(names)

    model = StructureModel()
    model.add_nodes_from(names)
    # one edge:
    model.add_weighted_edges_from([("aa", "ab", weight)])
    return model
def test_negative_weights(self):
    """Thresholding uses absolute weight, so strongly negative edges are kept"""

    strong_edges = [(1, 2, -3.0), (3, 1, 0.7), (1, 5, -2.0)]
    weak_edges = [(1, 4, 0.4), (2, 3, -0.6), (3, 5, -0.5)]

    model = StructureModel()
    for batch in (strong_edges, weak_edges):
        model.add_weighted_edges_from(batch)

    model.remove_edges_below_threshold(0.7)

    remaining = set(model.edges(data="weight"))
    assert remaining == set(strong_edges)
def test_fit_with_missing_feature_in_data(self):
    """Fitting on data that lacks a node's column should raise a KeyError"""

    expected_message = (
        "The data does not cover all the features found in the Bayesian Network. "
        "Please check the following features: {'e'}"
    )

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "e", 1)])

    # The frame has no column "e", although the structure has a node "e".
    frame = pd.DataFrame([[1, 1, 1, 1]], columns=["a", "b", "c", "d"])

    with pytest.raises(KeyError, match=expected_message):
        BayesianNetwork(structure).fit_node_states(frame)
def test_different_origins_and_weights(self):
    """Origin and weight data should be carried over into the target subgraph"""

    model = StructureModel()
    for edge, origin in [
        ((1, 2, 2.0), "unknown"),
        ((1, 3, 1.0), "learned"),
        ((5, 6, 0.7), "expert"),
    ]:
        model.add_weighted_edges_from([edge], origin=origin)

    subgraph = model.get_target_subgraph(2)

    # The edge (5, 6) is expected to be absent from the subgraph of node 2.
    expected_origins = {(1, 2, "unknown"), (1, 3, "learned")}
    expected_weights = {(1, 2, 2.0), (1, 3, 1.0)}

    assert set(subgraph.edges.data("origin")) == expected_origins
    assert set(subgraph.edges.data("weight")) == expected_weights
def test_get_structure(self):
    """The structure exposed by the network should match the one it was built from"""

    model = StructureModel()
    for edge, origin in [
        ((1, 2, 2.0), "unknown"),
        ((1, 3, 1.0), "learned"),
        ((3, 5, 0.7), "expert"),
    ]:
        model.add_weighted_edges_from([edge], origin=origin)

    retrieved = BayesianNetwork(model).structure

    # Edge attributes and node set must both match.
    for attribute in ("origin", "weight"):
        assert set(model.edges.data(attribute)) == set(
            retrieved.edges.data(attribute)
        )
    assert set(model.nodes) == set(retrieved.nodes)
def test_set_structure(self):
    """An error should be raised if setting the structure"""

    sm = StructureModel()
    sm.add_weighted_edges_from([(1, 2, 2.0)], origin="unknown")
    sm.add_weighted_edges_from([(1, 3, 1.0)], origin="learned")
    sm.add_weighted_edges_from([(3, 5, 0.7)], origin="expert")
    bn = BayesianNetwork(sm)

    # Build the replacement as its own graph. Previously these edges were
    # added to `sm`, which left `new_sm` empty and mutated the structure the
    # network had been created from.
    new_sm = StructureModel()
    new_sm.add_weighted_edges_from([(2, 5, 3.0)], origin="unknown")
    new_sm.add_weighted_edges_from([(2, 3, 2.0)], origin="learned")
    new_sm.add_weighted_edges_from([(3, 4, 1.7)], origin="expert")

    # CPython words this error differently across versions: "can't set
    # attribute" before 3.11 and "... has no setter" from 3.11 onwards.
    # NOTE(review): assumes `structure` is a property without a setter.
    with pytest.raises(
        AttributeError, match=r"can't set attribute|has no setter"
    ):
        bn.structure = new_sm
def test_equal_weights(self):
    """Edges whose absolute weight equals the threshold should be kept"""

    strong_edges = [(1, 2, 1.0), (1, 5, 2.0)]
    equal_edges = [(1, 3, 0.6), (2, 3, 0.6)]
    weak_edges = [(1, 4, 0.4), (3, 5, 0.5)]

    model = StructureModel()
    for batch in (strong_edges, equal_edges, weak_edges):
        model.add_weighted_edges_from(batch)

    model.remove_edges_below_threshold(0.6)

    expected = set(strong_edges) | set(equal_edges)
    assert set(model.edges(data="weight")) == expected
def graph():
    """Return a chain-shaped StructureModel 0 -> 1 -> ... -> 5 with unit weights"""

    chain = StructureModel()
    chain.add_weighted_edges_from([(node, node + 1, 1) for node in range(5)])
    return chain
def from_pandas_dynamic(  # pylint: disable=too-many-arguments
    time_series: Union[pd.DataFrame, List[pd.DataFrame]],
    p: int,
    lambda_w: float = 0.1,
    lambda_a: float = 0.1,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[int, int, int]] = None,
    tabu_parent_nodes: List[int] = None,
    tabu_child_nodes: List[int] = None,
) -> StructureModel:
    """
    Learn the graph structure of a Dynamic Bayesian Network describing conditional dependencies between variables in
    data. The input data is a time series or a list of realisations of the same time series.
    The optimisation is to minimise a score function F(W, A) over the graph's contemporaneous (intra-slice) weighted
    adjacency matrix, W, and lagged (inter-slice) weighted adjacency matrix, A, subject to a constraint function
    h(W), where h(W) == 0 characterises an acyclic graph. h(W) > 0 is a continuous, differentiable function that
    encapsulates how acyclic the graph is (less = more acyclic).

    Based on "DYNOTEARS: Structure Learning from Time-Series Data".
    https://arxiv.org/abs/2002.00498
    @inproceedings{pamfil2020dynotears,
        title={DYNOTEARS: Structure Learning from Time-Series Data},
        author={Pamfil, Roxana and Sriwattanaworachai, Nisara and Desai, Shaan and Pilgerstorfer,
        Philip and Georgatzis, Konstantinos and Beaumont, Paul and Aragam, Bryon},
        booktitle={International Conference on Artificial Intelligence and Statistics},
        pages={1595--1605},
        year={2020},
    }

    Args:
        time_series: pd.DataFrame or List of pd.DataFrame instances.
            If a list is provided, each element of the list is a realisation of a time series (i.e. time series
            governed by the same processes).
            The columns of the data frame represent the variables in the model, and the *index represents the
            time index*. Successive events, therefore, must be indexed with one integer of difference between them.
        p: Number of past interactions we allow the model to create. The state of a variable at time `t` is
            affected by past variables up to `t-p`, as well as by other variables at `t`.
        lambda_w: parameter for l1 regularisation of intra-slice edges
        lambda_a: parameter for l1 regularisation of inter-slice edges
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges (lag, from, to) not to be included in the graph. `lag == 0` implies that the
            edge is forbidden in the INTRA graph (W), while lag > 0 implies an INTER-slice weight equal to zero.
            `from` and `to` are looked up among the columns of the first data frame.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
        StructureModel representing the model learnt. The node names are noted as `{var}_lag{l}`, where `var` is
        the original variable name as given in the input data frames and `l`, in 0,1,2..p, is the corresponding
        time lag.
    """
    # Normalise the input so that it is always a list of realisations.
    if not isinstance(time_series, list):
        time_series = [time_series]

    X, Xlags = DynamicDataTransformer(p=p).fit_transform(time_series, return_df=False)

    # Two-way mapping between column names and their positional index,
    # taken from the first realisation.
    col_idx = {}
    for position, column in enumerate(time_series[0].columns):
        col_idx[column] = position
    idx_col = {position: column for column, position in col_idx.items()}

    # Translate the tabu constraints from column names to positional indices.
    if tabu_edges:
        tabu_edges = [
            (lag, col_idx[source], col_idx[target])
            for lag, source, target in tabu_edges
        ]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[node] for node in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[node] for node in tabu_child_nodes]

    learnt = from_numpy_dynamic(
        X,
        Xlags,
        lambda_w,
        lambda_a,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()

    # Register one node per (variable, lag) pair, for lags 0..p.
    node_names = []
    for var in col_idx:
        for l_val in range(p + 1):
            node_names.append("{var}_lag{l_val}".format(var=var, l_val=l_val))
    sm.add_nodes_from(node_names)

    # Rename the endpoints of every learnt edge via _format_name_from_pandas.
    renamed_edges = [
        (
            _format_name_from_pandas(idx_col, u),
            _format_name_from_pandas(idx_col, v),
            w,
        )
        for u, v, w in learnt.edges.data("weight")
    ]
    sm.add_weighted_edges_from(renamed_edges, origin="learned")

    return sm
def test_all_nodes_included(self, weighted_edges, data):
    """Fitting should succeed when every node has a matching column in the data"""

    structure = StructureModel()
    structure.add_weighted_edges_from(weighted_edges)

    network = BayesianNetwork(structure).fit_node_states(data)

    unmatched = [
        node for node in network.node_states.keys() if node not in data.columns
    ]
    assert not unmatched