Ejemplo n.º 1
0
def test_categorized_data():
    """Check that continuous observations are binned into categorical codes."""
    continuous = np.array([[1.0, 1.4, 3.0], [2.0, 2.2, 5.0]]).T
    binned = categorized_data(continuous, 2)

    # With two bins per column, the first two rows fall in bin 0 and the
    # last row in bin 1 for both columns.
    expected = np.array([[0, 0, 1], [0, 0, 1]]).T
    assert np.array_equal(binned, expected)
Ejemplo n.º 2
0
    def fit(self, TS, n_bins=40, atol=1e-6, **kwargs):
        """Reconstruct causal parents of nodes by optimizing causation entropy.

        The optimal causation entropy method reconstructs the parents of
        nodes in a causal diagram for systems resting on three Markov
        assumptions: let $X_t$ be the system state at time $t$, denote node
        $i$'s causal parents as $N_i$ and its state as $X_t^{(i)}$. The
        following three statements hold for every node $i$:
        1. $P(X_t | X_{t-1}, X_{t-2}, ...) = P(X_t | X_{t-1}) = P(X_{t'} | X_{t'-1})$
        2. $P(X_t^{(i)} | X_{t-1}) = P(X_t^{(i)} | X_{t-1}^{(N_i)})$
        3. $P(X_t^{(i)} | X_{t-1}^{(J)}) \neq P(X_t^{(i)} | X_{t-1}^{(K)})$
           whenever $J, K$ are sets of nodes such that $J \cap N_i \neq K \cap N_i$

        Sun et al. proved that for any set of nodes $I$ in systems satisfying
        the three conditions above, its set of causal parents $N_I$ is the
        minimal set of nodes $K$ maximizing the causation entropy
        $C_{K \rightarrow I}$. The more general form of causation entropy is
        $C_{J \rightarrow I | K} = H(X_{t+1}^{(I)} | X_t^{(K)}) - H(X_{t+1}^{(I)} | X_t^{(K)}, X_t^{(J)})$
        where $H(X|Y)$ is the conditional entropy of $X$ conditioned on $Y$.
        Sun et al. also showed that $N_I$ can be found efficiently by first
        building a superset $S \supset N_I$ via a heuristic and then removing
        noncausal nodes from $S$. Applying this per node to time series data
        reconstructs the whole causal diagram.

        Params
        ------
        TS (np.ndarray): $N \times L$ array consisting of $L$ observations
                         from $N$ sensors.

        n_bins (int): Number of bins when transforming continuous data into
                      its binned categorical version (shared by all nodes).

        atol (float): Absolute tolerance used to decide whether a causation
                      entropy is close to zero.

        Returns
        -------
        G (nx.DiGraph): A reconstructed directed graph with $N$ nodes.

        Notes
        -----
        1. Nodes' causal parents can be found in results['parents'].

        2. The current implementation naively thresholds the causation
           entropy to decide whether it is close to zero, which can make the
           result sensitive to the tolerance hyperparameter. Sun et al.
           suggested performing a permutation test for every causation
           entropy computed to determine its significance, which is more
           computationally costly.

        """
        # Observations as rows, nodes as columns; bin the continuous values
        # as a pre-processing step before any entropy computation.
        data = categorized_data(TS.T, n_bins)
        N = data.shape[1]

        # For each node: build a superset of its causal parents, then prune
        # away the noncausal members.
        parents_of = dict()
        for node in range(N):
            superset = causal_superset({node}, data, atol)
            remove_noncausal(superset, {node}, data, atol)
            parents_of[node] = superset

        # The adjacency dict maps node -> parents, so the induced digraph
        # must be reversed to point parent -> child.
        adjacency = nx.to_numpy_array(nx.DiGraph(parents_of).reverse())
        graph = create_graph(adjacency, create_using=nx.DiGraph(), remove_self_loops=False)
        self.results['graph'] = graph
        self.results['parents'] = parents_of

        return graph