Esempio n. 1
0
 def test_transform_test(self):
     """``transform_ts`` with ``p = 3`` returns the expected labels and lags.

     The input frame has two columns, ``a`` (0..99) and ``b`` (100..199).
     The expected matrix lists ``a`` and ``b`` at lag 0, then at lags 1, 2
     and 3; every column has 97 rows.
     """
     p = 3
     df = pd.DataFrame({'a': np.arange(100), 'b': np.arange(100, 200)})
     expected_mapping = {
         0: 'a_t',
         1: 'b_t',
         2: 'a_t-1',
         3: 'b_t-1',
         4: 'a_t-2',
         5: 'b_t-2',
         6: 'a_t-3',
         7: 'b_t-3',
     }
     expected_matrix = np.column_stack([
         np.arange(3, 100),
         np.arange(103, 200),
         np.arange(2, 99),
         np.arange(102, 199),
         np.arange(1, 98),
         np.arange(101, 198),
         np.arange(0, 97),
         np.arange(100, 197),
     ])
     result_mapping, result_matrix = transform_ts(df, p)
     self.assertDictEqual(expected_mapping, result_mapping)
     # ``np.all(a == b)`` can pass through broadcasting when the shapes
     # differ, so compare the shape explicitly and let numpy report the
     # mismatching elements on failure.
     self.assertEqual(expected_matrix.shape, np.shape(result_matrix))
     np.testing.assert_array_equal(expected_matrix, result_matrix)
Esempio n. 2
0
def pc_chen_modified(indep_test_func, ts_data, p, alpha):
    """Estimate a lagged graph whose edges all point towards lower indices.

    The series is expanded with ``transform_ts(ts_data, p)``.  The initial
    undirected graph only links columns with index ``>= dim`` to columns with
    index ``< dim`` (``dim`` being ``ts_data.shape[1]``); that graph is then
    thinned by ``_estimate_skeleton`` and finally oriented.

    Args:
        indep_test_func: Accepted for signature compatibility.
            NOTE(review): this argument is never used; ``partial_corr_test``
            is always passed to ``_estimate_skeleton``.  Confirm whether the
            caller's test should be honoured instead.
        ts_data: Two-dimensional data accepted by ``transform_ts``; only
            ``shape[1]`` is read here.
        p: Lag passed to ``transform_ts``.
        alpha: Threshold passed to ``_estimate_skeleton``.

    Returns:
        A directed graph relabelled with the mapping from ``transform_ts``,
        keeping only edges ``(u, v)`` with ``v < u`` in column-index terms.
    """
    dim = ts_data.shape[1]
    node_mapping, data_matrix = transform_ts(ts_data, p)
    corr_matrix = np.corrcoef(data_matrix, rowvar=False)

    n_nodes = data_matrix.shape[1]
    adj_matrix = np.zeros((n_nodes, n_nodes))
    adj_matrix[dim:, :dim] = 1
    # Symmetrise so the matrix describes an undirected graph.
    adj_matrix = np.maximum(adj_matrix, adj_matrix.T)

    # ``from_numpy_matrix`` no longer exists in networkx 3.x; prefer
    # ``from_numpy_array`` and fall back only on releases that lack it.
    graph_from_array = getattr(nx, 'from_numpy_array', None)
    if graph_from_array is None:
        graph_from_array = nx.from_numpy_matrix
    G = graph_from_array(adj_matrix)

    G, _ = _estimate_skeleton(G,
                              partial_corr_test,
                              data_matrix,
                              alpha,
                              corr_matrix=corr_matrix)

    DAG = G.to_directed()
    # Materialise the list first: edges must not be removed while the edge
    # view is being iterated.
    forward_edges = [(u, v) for (u, v) in DAG.edges() if v >= u]
    DAG.remove_edges_from(forward_edges)
    return nx.relabel_nodes(DAG, node_mapping)
Esempio n. 3
0
def pc_chen(indep_test_func, ts_data, p, alpha):
    """Estimate a graph over the lag-expanded series from a complete graph.

    The series is expanded with ``transform_ts(ts_data, p)``.  A complete
    undirected graph over all resulting columns is thinned by
    ``_estimate_skeleton``; every directed edge that ends in a column with
    index ``>= dim`` (``dim`` being ``ts_data.shape[1]``) is then dropped and
    the remainder is handed to ``estimate_cpdag`` together with the
    separation sets.

    Args:
        indep_test_func: Accepted for signature compatibility.
            NOTE(review): this argument is never used; ``partial_corr_test``
            is always passed to ``_estimate_skeleton``.  Confirm whether the
            caller's test should be honoured instead.
        ts_data: Two-dimensional data accepted by ``transform_ts``; only
            ``shape[1]`` is read here.
        p: Lag passed to ``transform_ts``.
        alpha: Threshold passed to ``_estimate_skeleton``.

    Returns:
        The graph produced by ``estimate_cpdag``, relabelled with the mapping
        from ``transform_ts``.
    """
    dim = ts_data.shape[1]
    node_mapping, data_matrix = transform_ts(ts_data, p)
    corr_matrix = np.corrcoef(data_matrix, rowvar=False)

    # Complete graph without self-loops.
    n_nodes = data_matrix.shape[1]
    adj_matrix = np.ones((n_nodes, n_nodes))
    np.fill_diagonal(adj_matrix, 0)

    # ``from_numpy_matrix`` no longer exists in networkx 3.x; prefer
    # ``from_numpy_array`` and fall back only on releases that lack it.
    graph_from_array = getattr(nx, 'from_numpy_array', None)
    if graph_from_array is None:
        graph_from_array = nx.from_numpy_matrix
    G = graph_from_array(adj_matrix)

    G, sep_sets = _estimate_skeleton(G,
                                     partial_corr_test,
                                     data_matrix,
                                     alpha,
                                     corr_matrix=corr_matrix)

    DG = G.to_directed()
    # Materialise the list first: edges must not be removed while the edge
    # view is being iterated.
    into_lagged = [(u, v) for (u, v) in DG.edges() if v >= dim]
    DG.remove_edges_from(into_lagged)
    DAG = estimate_cpdag(DG, sep_sets)
    return nx.relabel_nodes(DAG, node_mapping)
Esempio n. 4
0
def pc_incremental_subsets(indep_test,
                           ts,
                           alpha=0.05,
                           max_p=20,
                           start=0,
                           steps=1,
                           ic='bic',
                           patiency=1,
                           verbose=False):
    """Grow a lagged graph lag by lag, pruning parents with subset tests.

    The maximum lag ``p`` runs over ``range(start + steps, max_p + 1,
    steps)``.  At each ``p`` the columns of the newly added lags become
    nodes, each is connected to a lag-0 node when the unconditional test
    rejects independence, and the parents of every lag-0 node are then
    re-tested conditioned on subsets of the other parents.  Each graph is
    scored with ``_graph_ic`` and the search stops early after ``patiency``
    consecutive lags without a lower score.

    Args:
        indep_test: Callable invoked as ``indep_test(data_matrix, x_t, x,
            cond, corr_matrix=corr_matrix)`` and returning
            ``(p_value, statistic)``.
        ts: Two-dimensional data accepted by ``transform_ts``; ``shape[1]``
            is the number of series.
        alpha: Edges are added when ``p_value <= alpha`` and removed when
            ``p_value > alpha``.
        max_p: Largest lag considered.
        start: If positive, the initial graph comes from
            ``pc_chen_modified`` at this lag; otherwise the search starts
            from the lag-0 nodes without edges.
        steps: Lag increment between iterations.
        ic: Forwarded to ``_graph_ic``.
        patiency: Number of consecutive non-improving lags tolerated.
        verbose: When true, also return the per-lag graphs, timings and
            scores.

    Returns:
        The labelled graph of the best-scoring lag, or the tuple
        ``(graph, graphs, times, bics)`` when ``verbose`` is true.

    NOTE(review): when no lag is ever evaluated (``start == 0`` and an empty
    lag range) or no score ever beats the initial ``inf``, ``graphs`` has no
    entry for ``best_p`` and the final lookup raises ``KeyError``.
    """
    dim = ts.shape[1]

    # Per-lag bookkeeping, returned to the caller when ``verbose`` is set.
    graphs = {}
    times = {}
    bics = {}

    present_nodes = range(dim)
    if start > 0:
        node_mapping, data_matrix = transform_ts(ts, start)
        start_time = time()
        labelled = pc_chen_modified(indep_test, ts, start, alpha)
        times[start] = time() - start_time
        # pc_chen_modified returns its graph already relabelled through the
        # same transform_ts(ts, start) mapping, whereas everything below
        # addresses nodes by column index.  Map the labels back so the
        # initial edges are actually seen by the incremental steps.
        index_of = {label: idx for idx, label in node_mapping.items()}
        G = nx.relabel_nodes(labelled, index_of)
        graphs[start] = nx.relabel_nodes(G.copy(), node_mapping)
        bics[start] = _graph_ic(start, dim, data_matrix, G, ic)
        best_bic = bics[start]
        best_p = start
    else:
        G = nx.DiGraph()
        G.add_nodes_from(present_nodes)
        best_bic = np.inf
        best_p = 0

    no_imp = 0

    for p in range(start + steps, max_p + 1, steps):
        start_time = time()
        node_mapping, data_matrix = transform_ts(ts, p)
        corr_matrix = np.corrcoef(data_matrix, rowvar=False)
        new_nodes = list(
            range((p - steps + 1) * dim,
                  min(p + 1, max_p + 1) * dim))

        # Step 1: add the columns of the newly included lags.
        G.add_nodes_from(new_nodes)

        # Step 2: link a new node to a lag-0 node unless the unconditional
        # test keeps independence.
        for x_t, x in product(present_nodes, new_nodes):
            p_value, statistic = indep_test(data_matrix,
                                            x_t,
                                            x,
                                            set(),
                                            corr_matrix=corr_matrix)
            if p_value <= alpha:
                G.add_edge(x, x_t)

        # Step 3: for growing conditioning-set sizes, drop a parent as soon
        # as one subset of the other parents renders it independent.
        for subset_size in range(1, len(G)):
            for x_t in present_nodes:
                # Snapshot: removals below do not shrink the candidate
                # conditioning sets of the remaining parents in this pass.
                in_set = set(G.predecessors(x_t))
                if len(in_set) <= subset_size:
                    continue
                for x in in_set:
                    cond_max = in_set - {x}
                    # Iterate lazily; the subsets are already unique and the
                    # loop usually stops at the first separating one.
                    for cond in combinations(cond_max, subset_size):
                        p_value, statistic = indep_test(
                            data_matrix, x_t, x, cond, corr_matrix=corr_matrix)
                        if p_value > alpha:
                            G.remove_edge(x, x_t)
                            break

        graphs[p] = nx.relabel_nodes(G.copy(), node_mapping)
        times[p] = time() - start_time
        bics[p] = _graph_ic(p, dim, data_matrix, G, ic)

        # Early stopping on the score (lower is better).
        if bics[p] < best_bic:
            best_bic = bics[p]
            best_p = p
            no_imp = 0
        else:
            no_imp += 1
            if no_imp >= patiency:
                break

    # The stored graphs already carry their labels; hand out a copy so the
    # caller's result stays independent of the ``graphs`` dictionary.
    best_graph = graphs[best_p].copy()
    if verbose:
        return best_graph, graphs, times, bics
    return best_graph
Esempio n. 5
0
def pc_incremental_pc1(indep_test,
                       ts,
                       alpha=0.05,
                       max_p=20,
                       start=0,
                       steps=1,
                       ic='bic',
                       patiency=1,
                       verbose=False,
                       **kwargs):
    """Grow a lagged graph lag by lag, pruning parents by test strength.

    The maximum lag ``p`` runs over ``range(start + steps, max_p + 1,
    steps)``.  At each ``p`` the columns of the newly added lags become
    nodes and are connected to a lag-0 node when the unconditional test
    rejects independence.  The parents of every lag-0 node are then
    re-tested with conditioning sets of growing size, each set being taken
    from the front of the other parents as ranked by the absolute test
    statistic of the previous round.  Each graph is scored with
    ``_graph_ic`` and the search stops early after ``patiency`` consecutive
    lags without a lower score.

    Args:
        indep_test: Callable invoked as ``indep_test(data_matrix, x_t, x,
            cond, corr_matrix=corr_matrix)`` and returning
            ``(p_value, statistic)``.
        ts: Two-dimensional data accepted by ``transform_ts``; ``shape[1]``
            is the number of series.
        alpha: Edges are added when ``p_value <= alpha`` and removed when
            ``p_value > alpha``.
        max_p: Largest lag considered.
        start: If positive, the initial graph comes from
            ``pc_chen_modified`` at this lag; otherwise the search starts
            from the lag-0 nodes without edges.
        steps: Lag increment between iterations.
        ic: Forwarded to ``_graph_ic``.
        patiency: Number of consecutive non-improving lags tolerated.
        verbose: When true, also return the per-lag graphs, timings and
            scores.
        **kwargs: Accepted and ignored.

    Returns:
        The labelled graph of the best-scoring lag, or the tuple
        ``(graph, graphs, times, bics)`` when ``verbose`` is true.

    NOTE(review): when no lag is ever evaluated (``start == 0`` and an empty
    lag range) or no score ever beats the initial ``inf``, ``graphs`` has no
    entry for ``best_p`` and the final lookup raises ``KeyError``.
    """
    dim = ts.shape[1]

    # Per-lag bookkeeping, returned to the caller when ``verbose`` is set.
    graphs = {}
    times = {}
    bics = {}

    present_nodes = range(dim)
    if start > 0:
        node_mapping, data_matrix = transform_ts(ts, start)
        start_time = time()
        labelled = pc_chen_modified(indep_test, ts, start, alpha)
        times[start] = time() - start_time
        # pc_chen_modified returns its graph already relabelled through the
        # same transform_ts(ts, start) mapping, whereas everything below
        # addresses nodes by column index.  Map the labels back so the
        # initial edges are actually seen by the incremental steps.
        index_of = {label: idx for idx, label in node_mapping.items()}
        G = nx.relabel_nodes(labelled, index_of)
        graphs[start] = nx.relabel_nodes(G.copy(), node_mapping)
        bics[start] = _graph_ic(start, dim, data_matrix, G, ic)
        best_bic = bics[start]
        best_p = start
    else:
        G = nx.DiGraph()
        G.add_nodes_from(present_nodes)
        best_bic = np.inf
        best_p = 0

    no_imp = 0

    for p in range(start + steps, max_p + 1, steps):
        start_time = time()
        node_mapping, data_matrix = transform_ts(ts, p)
        corr_matrix = np.corrcoef(data_matrix, rowvar=False)
        new_nodes = list(
            range((p - steps + 1) * dim,
                  min(p + 1, max_p + 1) * dim))

        # Step 1: add the columns of the newly included lags.
        G.add_nodes_from(new_nodes)

        # Step 2: link a new node to a lag-0 node unless the unconditional
        # test keeps independence.
        for x_t, x in product(present_nodes, new_nodes):
            p_value, statistic = indep_test(data_matrix,
                                            x_t,
                                            x,
                                            set(),
                                            corr_matrix=corr_matrix)
            if p_value <= alpha:
                G.add_edge(x, x_t)

        # Step 3: re-test every parent of each lag-0 node.
        for x_t in present_nodes:
            parents = list(set(G.predecessors(x_t)))
            # Goes up to the full neighbourhood; lower this to cap the
            # conditioning-set size.
            max_cond_dim = float('inf')
            condition_size = 0
            while (condition_size < max_cond_dim
                   and condition_size < len(parents) - 1):
                # Absolute statistic of every parent that survives this
                # round, in the order the parents were tested.
                strength = {}
                for x in parents:
                    # ``parents`` is ranked, so the slice picks the
                    # ``condition_size`` strongest other parents.
                    others = [e for e in parents if e != x]
                    condition = others[:condition_size]

                    p_value, statistic = indep_test(data_matrix,
                                                    x_t,
                                                    x,
                                                    condition,
                                                    corr_matrix=corr_matrix)
                    magnitude = np.abs(statistic)

                    if p_value > alpha:
                        G.remove_edge(x, x_t)
                    else:
                        strength[x] = magnitude

                # Strongest first; ties keep their previous relative order.
                parents = sorted(strength, key=strength.get, reverse=True)
                condition_size += 1

        graphs[p] = nx.relabel_nodes(G.copy(), node_mapping)
        times[p] = time() - start_time
        bics[p] = _graph_ic(p, dim, data_matrix, G, ic)

        # Early stopping on the score (lower is better).
        if bics[p] < best_bic:
            best_bic = bics[p]
            best_p = p
            no_imp = 0
        else:
            no_imp += 1
            if no_imp >= patiency:
                break

    # The stored graphs already carry their labels; hand out a copy so the
    # caller's result stays independent of the ``graphs`` dictionary.
    best_graph = graphs[best_p].copy()
    if verbose:
        return best_graph, graphs, times, bics
    return best_graph