def _matrices_to_structure_model(w_est: np.ndarray, a_est: np.ndarray) -> StructureModel:
    """
    Build a StructureModel from the W and A matrices produced by dynotears.

    Nodes are named ``{var}_lag{l}``, where ``var`` is the integer column
    index of the variable and ``l`` is the lag, i.e. how many timestamps back
    the node sits. Intra-slice nodes have ``l == 0``.

    Args:
        w_est: Intra-slice weight matrix
        a_est: Inter-slice matrix

    Returns:
        StructureModel representing the structure learnt
    """
    n_vars = a_est.shape[1]
    n_lags = a_est.shape[0] // n_vars

    # Names are ordered lag-major: all variables at lag 0, then lag 1, ...
    node_names = []
    for lag in range(1 + n_lags):
        for var in range(n_vars):
            node_names.append("{var}_lag{l_val}".format(var=var, l_val=lag))

    model = StructureModel()
    model.add_nodes_from(node_names)

    # Intra-slice edges: every non-zero entry of W links two lag-0 nodes.
    intra_edges = []
    for row in range(w_est.shape[0]):
        for col in range(w_est.shape[1]):
            if w_est[row, col] != 0:
                intra_edges.append(
                    (node_names[row], node_names[col], {"weight": w_est[row, col]})
                )
    model.add_edges_from(intra_edges)

    # Inter-slice edges: rows of A index the lagged nodes, which come after
    # the first `w_est.shape[0]` names in `node_names`.
    offset = w_est.shape[0]
    inter_edges = []
    for row in range(a_est.shape[0]):
        for col in range(a_est.shape[1]):
            if a_est[row, col] != 0:
                inter_edges.append(
                    (
                        node_names[row + offset],
                        node_names[col],
                        {"weight": a_est[row, col]},
                    )
                )
    model.add_edges_from(inter_edges)

    return model
def test_isolates(self):
    """The largest subgraph is None when the model holds only isolated nodes"""
    isolated_model = StructureModel()
    isolated_model.add_nodes_from([1, 3, 5, 2, 7])

    largest = isolated_model.get_largest_subgraph()

    assert largest is None
def test_zero_lambda(self):
    """
    No column of count data generated from an edgeless graph may have a mean
    of zero. Guards against an initialisation that would leave parentless
    nodes with counts that are always zero.
    """
    edgeless_graph = StructureModel()
    edgeless_graph.add_nodes_from(list(range(20)))

    counts = generate_count_dataframe(edgeless_graph, 10000)
    has_zero_mean_column = np.any(counts.mean() == 0)

    assert not has_zero_mean_column
def test_graph_with_no_edges(self):
    """Thresholding an edgeless graph keeps its nodes and leaves it edgeless"""
    expected_nodes = [1, 2, 3]
    model = StructureModel()
    model.add_nodes_from(expected_nodes)

    model.remove_edges_below_threshold(0.6)

    remaining_nodes = set(model.nodes)
    remaining_edges = set(model.edges)
    assert remaining_nodes == set(expected_nodes)
    assert remaining_edges == set()
def test_isolates(self):
    """The target subgraph of an isolated node is that node alone, with no edges"""
    # Expected result: a graph holding only the target node.
    expected = StructureModel()
    expected.add_node(1)

    model = StructureModel()
    model.add_nodes_from([1, 3, 5, 2, 7])
    target_subgraph = model.get_target_subgraph(1)

    assert set(target_subgraph.nodes) == set(expected.nodes)
    assert set(target_subgraph.edges) == set(expected.edges)
def generator(num_nodes, seed, weight=None):
    """
    Build a StructureModel with shuffled two-letter node names and one edge.

    Args:
        num_nodes: how many names to keep from the sequence "aa", "ab", ...
        seed: value passed to ``np.random.seed`` before the names are shuffled
        weight: weight of the single edge from "aa" to "ab"

    Returns:
        StructureModel holding the shuffled nodes and the edge ("aa", "ab")
    """
    # Seed first so the shuffle below is reproducible.
    np.random.seed(seed)

    letters = string.ascii_lowercase
    names = ["".join(pair) for pair in product(letters, letters)][:num_nodes]
    np.random.shuffle(names)

    model = StructureModel()
    model.add_nodes_from(names)
    # one edge:
    model.add_weighted_edges_from([("aa", "ab", weight)])
    return model
def test_isolates(self):
    """The Markov blanket of an isolated node is that node alone, with no edges"""
    # Expected result: a graph holding only the queried node.
    expected = StructureModel()
    expected.add_node(1)

    model = StructureModel()
    model.add_nodes_from([1, 3, 5, 2, 7])
    markov_blanket = model.get_markov_blanket(1)

    assert set(markov_blanket.nodes) == set(expected.nodes)
    assert set(markov_blanket.edges) == set(expected.edges)
def test_baseline_probability_probit(self, graph, distribution):
    """Without an intercept the mean of a single binary node stays near 50%"""
    # The injected `graph` argument is not used: the check runs on a fresh
    # single-node graph instead.
    single_node_graph = StructureModel()
    single_node_graph.add_nodes_from(["A"])

    samples = generate_binary_data(
        single_node_graph,
        1000000,
        distribution=distribution,
        noise_scale=0.1,
        seed=10,
        intercept=False,
    )
    mean_prob = samples[:, 0].mean()

    assert 0.45 < mean_prob < 0.55
def test_intercept_probability_logit(self, graph, distribution):
    """With an intercept the mean of a single binary node moves away from 50%"""
    # The injected `graph` argument is not used: the check runs on a fresh
    # single-node graph instead.
    single_node_graph = StructureModel()
    single_node_graph.add_nodes_from(["A"])

    samples = generate_binary_data(
        single_node_graph,
        1000000,
        distribution=distribution,
        noise_scale=0.1,
        seed=10,
        intercept=True,
    )
    first_column_mean = samples[:, 0].mean()
    centred = np.isclose(first_column_mean, 0.5, atol=0.05)

    assert not centred
def test_isolates_nodes_and_edges(self):
    """The target subgraph holds only the component that contains the node"""
    # Expected result: the component around node 5.
    expected = StructureModel()
    expected.add_edges_from([(5, 6), (4, 5)])

    # Two connected components plus three isolated nodes.
    model = StructureModel()
    model.add_edges_from([(0, 1), (1, 2), (1, 3), (5, 6), (4, 5)])
    model.add_nodes_from([7, 8, 9])

    target_subgraph = model.get_target_subgraph(5)

    assert set(target_subgraph.nodes) == set(expected.nodes)
    assert set(target_subgraph.edges) == set(expected.edges)
def test_intercept_probability(self, graph, distribution, n_categories):
    """With an intercept the column means are not all close to 1 / n_categories"""
    # The injected `graph` argument is not used: the check runs on a fresh
    # single-node graph instead.
    single_node_graph = StructureModel()
    single_node_graph.add_nodes_from(["A"])

    frame = generate_categorical_dataframe(
        single_node_graph,
        1000000,
        distribution=distribution,
        n_categories=n_categories,
        noise_scale=0.1,
        seed=10,
        intercept=True,
    )
    column_means = frame.mean(axis=0)
    looks_uniform = np.allclose(column_means, 1 / n_categories, atol=0.01, rtol=0)

    assert not looks_uniform
def test_isolates_nodes_and_edges(self):
    """The largest subgraph is the biggest component, without the isolated nodes"""
    # Expected result: the four-node component around node 1.
    expected = StructureModel()
    expected.add_edges_from([(0, 1), (1, 2), (1, 3)])

    # A four-node component, a two-node component and three isolated nodes.
    model = StructureModel()
    model.add_edges_from([(0, 1), (1, 2), (1, 3), (5, 6)])
    model.add_nodes_from([7, 8, 9])

    largest = model.get_largest_subgraph()

    assert set(largest.nodes) == set(expected.nodes)
    assert set(largest.edges) == set(expected.edges)
def test_baseline_probability(self, graph, distribution, n_categories):
    """Without an intercept the column means are all close to 1 / n_categories"""
    # The injected `graph` argument is not used: the check runs on a fresh
    # single-node graph instead.
    single_node_graph = StructureModel()
    single_node_graph.add_nodes_from(["A"])

    frame = generate_categorical_dataframe(
        single_node_graph,
        10000,
        distribution=distribution,
        n_categories=n_categories,
        noise_scale=1.0,
        seed=10,
        intercept=False,
    )
    column_means = frame.mean(axis=0)

    # without intercept, the probabilities should be fairly uniform
    assert np.allclose(column_means, 1 / n_categories, atol=0.01, rtol=0)
def from_pandas_dynamic(  # pylint: disable=too-many-arguments
    time_series: Union[pd.DataFrame, List[pd.DataFrame]],
    p: int,
    lambda_w: float = 0.1,
    lambda_a: float = 0.1,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[int, int, int]] = None,
    tabu_parent_nodes: List[int] = None,
    tabu_child_nodes: List[int] = None,
) -> StructureModel:
    """
    Learn the graph structure of a Dynamic Bayesian Network describing conditional dependencies between variables in
    data. The input data is a time series or a list of realisations of the same time series.
    The optimisation minimises a score function F(W, A) over the graph's contemporaneous (intra-slice) weighted
    adjacency matrix, W, and lagged (inter-slice) weighted adjacency matrix, A, subject to a constraint function
    h(W), where h(W) == 0 characterises an acyclic graph. h(W) > 0 is a continuous, differentiable function that
    encapsulates how acyclic the graph is (less = more acyclic).

    Based on "DYNOTEARS: Structure Learning from Time-Series Data".
    https://arxiv.org/abs/2002.00498
    @inproceedings{pamfil2020dynotears,
        title={DYNOTEARS: Structure Learning from Time-Series Data},
        author={Pamfil, Roxana and Sriwattanaworachai, Nisara and Desai, Shaan and Pilgerstorfer,
        Philip and Georgatzis, Konstantinos and Beaumont, Paul and Aragam, Bryon},
        booktitle={International Conference on Artificial Intelligence and Statistics},
        pages={1595--1605},
        year={2020},
    }

    Args:
        time_series: pd.DataFrame or List of pd.DataFrame instances.
        If a list is provided, each element of the list is a realisation of a time series (i.e. time series
        governed by the same processes).
        The columns of the data frame represent the variables in the model, and the *index represents the time
        index*. Successive events, therefore, must be indexed with one integer of difference between them too.

        p: Number of past interactions we allow the model to create. The state of a variable at time `t` is
        affected by past variables up to `t-p`, as well as by other variables at `t`.

        lambda_w: parameter for l1 regularisation of intra-slice edges

        lambda_a: parameter for l1 regularisation of inter-slice edges

        max_iter: max number of dual ascent steps during optimisation.

        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).

        w_threshold: fixed threshold for absolute edge weights.

        tabu_edges: list of edges (lag, from, to) not to be included in the graph. `lag == 0` implies that the
        edge is forbidden in the INTRA graph (W), while lag > 0 implies an INTER-slice weight equal to zero.

        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.

        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
        StructureModel representing the model learnt. The node names are noted as `{var}_lag{l}`, where `var` is
        the original variable name as given in the input data frames and `l`, in 0, 1, 2, ..., p, is the
        corresponding time lag.
    """
    # Normalise the input: a single data frame is one realisation.
    if not isinstance(time_series, list):
        time_series = [time_series]

    x_now, x_lagged = DynamicDataTransformer(p=p).fit_transform(
        time_series, return_df=False
    )

    # Two-way mapping between column names and their positional index,
    # taken from the first realisation.
    col_idx = {name: pos for pos, name in enumerate(time_series[0].columns)}
    idx_col = {pos: name for name, pos in col_idx.items()}

    # Tabu constraints arrive keyed by column name; translate them to
    # positional indices before calling the numpy-based learner.
    if tabu_edges:
        tabu_edges = [
            (lag, col_idx[parent], col_idx[child])
            for lag, parent, child in tabu_edges
        ]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[node] for node in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[node] for node in tabu_child_nodes]

    learned = from_numpy_dynamic(
        x_now,
        x_lagged,
        lambda_w,
        lambda_a,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    # Rebuild the graph with name-based node labels: one node per
    # (variable, lag) pair.
    node_names = []
    for var in col_idx:
        for l_val in range(p + 1):
            node_names.append("{var}_lag{l_val}".format(var=var, l_val=l_val))

    sm = StructureModel()
    sm.add_nodes_from(node_names)

    renamed_edges = []
    for source, target, weight in learned.edges.data("weight"):
        renamed_edges.append(
            (
                _format_name_from_pandas(idx_col, source),
                _format_name_from_pandas(idx_col, target),
                weight,
            )
        )
    sm.add_weighted_edges_from(renamed_edges, origin="learned")

    return sm