コード例 #1
0
def _matrices_to_structure_model(w_est: np.ndarray,
                                 a_est: np.ndarray) -> StructureModel:
    """
    Build a StructureModel from the two weight matrices produced by dynotears.

    Nodes are named ``{var}_lag{l}``, where ``var`` is the positional index of
    the variable and ``l`` is the lag the node belongs to (``l == 0`` for the
    intra-slice nodes). Names are generated lag by lag, so the node for
    variable ``v`` at lag ``l`` sits at position ``l * n_vars + v``.

    Args:
        w_est: Intra-slice weight matrix. Entry ``[i, j]`` that is non-zero
            becomes an edge from node ``i`` to node ``j``.
        a_est: Inter-slice weight matrix. Entry ``[i, j]`` that is non-zero
            becomes an edge from node ``i + w_est.shape[0]`` to node ``j``.

    Returns:
        StructureModel holding every generated node and one weighted edge per
        non-zero matrix entry.
    """
    sm = StructureModel()

    n_vars = a_est.shape[1]
    # One block of rows in a_est per lag, plus the lag-0 (intra-slice) block.
    n_slices = 1 + (a_est.shape[0] // n_vars)
    node_names = []
    for lag in range(n_slices):
        for var in range(n_vars):
            node_names.append("{var}_lag{l_val}".format(var=var, l_val=lag))
    sm.add_nodes_from(node_names)

    def _nonzero_edges(matrix, source_offset):
        # Row-major scan of `matrix`; rows index the source node (shifted by
        # `source_offset`), columns index the destination node.
        return [(node_names[row + source_offset], node_names[col],
                 dict(weight=matrix[row, col]))
                for row in range(matrix.shape[0])
                for col in range(matrix.shape[1])
                if matrix[row, col] != 0]

    # Intra-slice edges first, then inter-slice edges whose sources start
    # right after the rows covered by w_est.
    sm.add_edges_from(_nonzero_edges(w_est, 0))
    sm.add_edges_from(_nonzero_edges(a_est, w_est.shape[0]))
    return sm
コード例 #2
0
    def test_isolates(self):
        """A model made only of isolated nodes has no largest subgraph (None)."""

        model = StructureModel()
        model.add_nodes_from([1, 3, 5, 2, 7])

        largest = model.get_largest_subgraph()
        assert largest is None
コード例 #3
0
 def test_zero_lambda(self):
     """
     Generated counts for nodes without parents must not all be zero.

     Guards against a wrong initialisation that would leave every count of a
     parentless node at zero: no column of the generated frame may have a
     mean of exactly zero.
     """
     node_ids = list(range(20))
     edgeless_graph = StructureModel()
     edgeless_graph.add_nodes_from(node_ids)

     counts = generate_count_dataframe(edgeless_graph, 10000)
     column_means = counts.mean()
     assert not np.any(column_means == 0)
コード例 #4
0
    def test_graph_with_no_edges(self):
        """Thresholding an edgeless graph runs and leaves nodes and edges as they were"""

        expected_nodes = [1, 2, 3]
        model = StructureModel()
        model.add_nodes_from(expected_nodes)

        # Nothing to prune; the call itself must simply not fail.
        model.remove_edges_below_threshold(0.6)

        remaining_nodes = set(model.nodes)
        remaining_edges = set(model.edges)
        assert remaining_nodes == set(expected_nodes)
        assert remaining_edges == set()
コード例 #5
0
    def test_isolates(self):
        """The target subgraph of an isolated node is that single node, with no edges"""

        # Reference graph: just the target node.
        expected = StructureModel()
        expected.add_node(1)

        model = StructureModel()
        model.add_nodes_from([1, 3, 5, 2, 7])
        target_subgraph = model.get_target_subgraph(1)

        assert set(target_subgraph.nodes) == set(expected.nodes)
        assert set(target_subgraph.edges) == set(expected.edges)
コード例 #6
0
    def generator(num_nodes, seed, weight=None):
        """
        Build a StructureModel with shuffled two-letter node names and one edge.

        Args:
            num_nodes: how many names to keep from the sequence "aa", "ab", ...
            seed: value passed to ``np.random.seed`` before shuffling.
            weight: weight assigned to the single edge "aa" -> "ab".

        Returns:
            StructureModel containing the shuffled nodes and the one edge.
        """
        np.random.seed(seed)

        letters = string.ascii_lowercase
        all_names = ["".join(pair) for pair in product(letters, letters)]
        node_names = all_names[:num_nodes]
        # Shuffle in place so node insertion order depends on the seed.
        np.random.shuffle(node_names)

        model = StructureModel()
        model.add_nodes_from(node_names)

        # The graph carries exactly one edge.
        single_edge = ("aa", "ab", weight)
        model.add_weighted_edges_from([single_edge])
        return model
コード例 #7
0
    def test_isolates(self):
        """The Markov blanket of an isolated node is that single node, with no edges"""

        # Reference graph: just the queried node.
        expected = StructureModel()
        expected.add_node(1)

        model = StructureModel()
        model.add_nodes_from([1, 3, 5, 2, 7])
        markov_blanket = model.get_markov_blanket(1)

        assert set(markov_blanket.nodes) == set(expected.nodes)
        assert set(markov_blanket.edges) == set(expected.edges)
コード例 #8
0
 def test_baseline_probability_probit(self, graph, distribution):
     """
     Without an intercept, the mean of the generated binary column should sit
     close to 50% (strictly between 0.45 and 0.55).
     """
     # The incoming `graph` argument is replaced by a fresh single-node model.
     graph = StructureModel()
     graph.add_nodes_from(["A"])

     samples = generate_binary_data(
         graph,
         1000000,
         distribution=distribution,
         noise_scale=0.1,
         seed=10,
         intercept=False,
     )

     positive_rate = samples[:, 0].mean()
     assert 0.45 < positive_rate < 0.55
コード例 #9
0
 def test_intercept_probability_logit(self, graph, distribution):
     """
     With an intercept, the mean of the generated binary column should move
     away from 50% (by more than the 0.05 absolute tolerance).
     """
     # The incoming `graph` argument is replaced by a fresh single-node model.
     graph = StructureModel()
     graph.add_nodes_from(["A"])

     samples = generate_binary_data(
         graph,
         1000000,
         distribution=distribution,
         noise_scale=0.1,
         seed=10,
         intercept=True,
     )

     positive_rate = samples[:, 0].mean()
     centred = np.isclose(positive_rate, 0.5, atol=0.05)
     assert not centred
コード例 #10
0
    def test_isolates_nodes_and_edges(self):
        """The target subgraph holds only the component containing the requested node"""

        # Reference graph: the component that contains node 5.
        expected = StructureModel()
        expected.add_edges_from([(5, 6), (4, 5)])

        # Two connected components plus three isolated nodes.
        model = StructureModel()
        model.add_edges_from([(0, 1), (1, 2), (1, 3), (5, 6), (4, 5)])
        model.add_nodes_from([7, 8, 9])

        target_subgraph = model.get_target_subgraph(5)

        assert set(target_subgraph.nodes) == set(expected.nodes)
        assert set(target_subgraph.edges) == set(expected.edges)
コード例 #11
0
 def test_intercept_probability(self, graph, distribution, n_categories):
     """
     With an intercept, the per-column means of the generated categorical data
     should not all sit at the uniform share 1 / n_categories.
     """
     # The incoming `graph` argument is replaced by a fresh single-node model.
     graph = StructureModel()
     graph.add_nodes_from(["A"])

     frame = generate_categorical_dataframe(
         graph,
         1000000,
         distribution=distribution,
         n_categories=n_categories,
         noise_scale=0.1,
         seed=10,
         intercept=True,
     )

     column_means = frame.mean(axis=0)
     uniform_share = 1 / n_categories
     # Absolute tolerance only: rtol=0 switches the relative term off.
     is_uniform = np.allclose(column_means, uniform_share, atol=0.01, rtol=0)
     assert not is_uniform
コード例 #12
0
    def test_isolates_nodes_and_edges(self):
        """The largest subgraph is found even when smaller components and isolates exist"""

        # Reference graph: the three-edge component around node 1.
        expected = StructureModel()
        expected.add_edges_from([(0, 1), (1, 2), (1, 3)])

        # A three-edge component, a one-edge component and three isolates.
        model = StructureModel()
        model.add_edges_from([(0, 1), (1, 2), (1, 3), (5, 6)])
        model.add_nodes_from([7, 8, 9])

        found = model.get_largest_subgraph()

        assert set(found.nodes) == set(expected.nodes)
        assert set(found.edges) == set(expected.edges)
コード例 #13
0
 def test_baseline_probability(self, graph, distribution, n_categories):
     """
     Without an intercept, the per-column means of the generated categorical
     data should be close to the uniform share 1 / n_categories.
     """
     # The incoming `graph` argument is replaced by a fresh single-node model.
     graph = StructureModel()
     graph.add_nodes_from(["A"])

     frame = generate_categorical_dataframe(
         graph,
         10000,
         distribution=distribution,
         n_categories=n_categories,
         noise_scale=1.0,
         seed=10,
         intercept=False,
     )

     column_means = frame.mean(axis=0)
     uniform_share = 1 / n_categories
     # Absolute tolerance only: rtol=0 switches the relative term off.
     assert np.allclose(column_means, uniform_share, atol=0.01, rtol=0)
コード例 #14
0
def from_pandas_dynamic(  # pylint: disable=too-many-arguments
    time_series: Union[pd.DataFrame, List[pd.DataFrame]],
    p: int,
    lambda_w: float = 0.1,
    lambda_a: float = 0.1,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[int, int, int]] = None,
    tabu_parent_nodes: List[int] = None,
    tabu_child_nodes: List[int] = None,
) -> StructureModel:
    """
    Learn the graph structure of a Dynamic Bayesian Network from pandas time series.

    The input is a single time series or a list of realisations of the same time series. The data is turned into
    numpy arrays with ``DynamicDataTransformer`` and handed to ``from_numpy_dynamic``; the learnt edges are then
    renamed from positional indices back to the data frame's column names.

    The optimisation minimises a score function F(W, A) over the contemporaneous (intra-slice) weighted adjacency
    matrix W and the lagged (inter-slice) weighted adjacency matrix A, subject to a constraint function h(W), where
    h(W) == 0 characterises an acyclic graph. h(W) > 0 is a continuous, differentiable function that encapsulates
    how acyclic the graph is (less = more acyclic).

    Based on "DYNOTEARS: Structure Learning from Time-Series Data".
    https://arxiv.org/abs/2002.00498
    @inproceedings{pamfil2020dynotears,
        title={DYNOTEARS: Structure Learning from Time-Series Data},
        author={Pamfil, Roxana and Sriwattanaworachai, Nisara and Desai, Shaan and Pilgerstorfer,
        Philip and Georgatzis, Konstantinos and Beaumont, Paul and Aragam, Bryon},
        booktitle={International Conference on Artificial Intelligence and Statistics},
        pages={1595--1605},
        year={2020},
    }

    Args:
        time_series: pd.DataFrame or list of pd.DataFrame instances. When a list is given, each element is one
            realisation of a time series (i.e. time series governed by the same processes). Columns are the
            variables of the model and the *index is the time index*, so successive events must be indexed one
            integer apart. Column names are taken from the first data frame.
        p: Number of past interactions the model may create. The state of a variable at time `t` is affected by
            past variables up to `t-p`, as well as by other variables at `t`.
        lambda_w: parameter for l1 regularisation of intra-slice edges.
        lambda_a: parameter for l1 regularisation of inter-slice edges.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to the strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges (lag, from, to) not to be included in the graph. `lag == 0` forbids the edge in
            the INTRA graph (W), while `lag > 0` forces the INTER-slice weight to zero. `from` and `to` are looked
            up among the column names of the first data frame.
        tabu_parent_nodes: list of nodes banned from being a parent of any other node; looked up among the column
            names of the first data frame.
        tabu_child_nodes: list of nodes banned from being a child of any other node; looked up among the column
            names of the first data frame.

    Returns:
        StructureModel representing the model learnt. Nodes are named `{var}_lag{l}`, where `var` is the original
        variable name as given in the input data frames and `l`, in 0,1,2..p, is the corresponding time lag.
    """
    # Normalise the input so the rest of the function always sees a list.
    if not isinstance(time_series, list):
        time_series = [time_series]

    transformer = DynamicDataTransformer(p=p)
    X, Xlags = transformer.fit_transform(time_series, return_df=False)

    # Column name <-> positional index lookups, based on the first frame.
    col_idx = {
        name: position
        for position, name in enumerate(time_series[0].columns)
    }
    idx_col = {position: name for name, position in col_idx.items()}

    # The numpy learner works on positional indices, so translate every
    # tabu entry; empty or missing lists are passed through untouched.
    if tabu_edges:
        tabu_edges = [(lag, col_idx[source], col_idx[target])
                      for lag, source, target in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[node] for node in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[node] for node in tabu_child_nodes]

    learnt_graph = from_numpy_dynamic(
        X,
        Xlags,
        lambda_w,
        lambda_a,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()

    # Register every (variable, lag) node up front so that nodes without
    # any learnt edge are still part of the returned model.
    node_names = []
    for var in col_idx:
        for l_val in range(p + 1):
            node_names.append("{var}_lag{l_val}".format(var=var,
                                                        l_val=l_val))
    sm.add_nodes_from(node_names)

    # Rename both endpoints of each learnt edge via the index -> column map.
    renamed_edges = [(
        _format_name_from_pandas(idx_col, source),
        _format_name_from_pandas(idx_col, target),
        weight,
    ) for source, target, weight in learnt_graph.edges.data("weight")]
    sm.add_weighted_edges_from(renamed_edges, origin="learned")

    return sm