Exemple #1
0
    def _run_igsp(dag_num):
        """Return the IGSP estimate for dataset ``dag_num``, computing it only if needed.

        The adjacency matrix of the estimate is cached as a ``.npy`` file under
        ``<sample folder>/estimates/igsp``; the file name encodes ``nruns``,
        ``depth``, ``alpha`` and ``alpha_invariant``.  When that file exists and
        ``overwrite`` is falsy, the DAG is rebuilt from the cached matrix;
        otherwise IGSP is run and the cache is (re)written.
        """
        # === GENERATE FILENAME
        alg_folder = os.path.join(sample_folders[dag_num], 'estimates', 'igsp')
        os.makedirs(alg_folder, exist_ok=True)
        run_params = (nruns, depth, alpha, alpha_invariant)
        filename = os.path.join(
            alg_folder,
            'nruns=%d,depth=%d,alpha=%.2e,alpha_invariant=%.2e.npy' % run_params)

        # === REUSE THE CACHED ESTIMATE WHEN ALLOWED
        if os.path.exists(filename) and not overwrite:
            return cd.DAG.from_amat(np.load(filename))

        # === RUN ALGORITHM
        obs_samples, setting_list, _ = get_dag_samples(
            ndags, nnodes, nneighbors, nsamples, nsettings,
            num_known, num_unknown, intervention, dag_num,
            nonlinear=nonlinear)

        # NOTE(review): the Gaussian CI / invariance tests are used whether or
        # not `nonlinear` is set; the flag only affects how samples are fetched.
        ci_suffstat = gauss_ci_suffstat(obs_samples)
        iv_samples = [setting['samples'] for setting in setting_list]
        inv_suffstat = gauss_invariance_suffstat(obs_samples, iv_samples)
        ci_tester = MemoizedCI_Tester(gauss_ci_test, ci_suffstat, alpha=alpha)
        inv_tester = MemoizedInvarianceTester(
            gauss_invariance_test, inv_suffstat, alpha=alpha_invariant)

        # Only the known intervention targets of each setting are handed to IGSP.
        known_targets = [{'interventions': setting['known_interventions']}
                         for setting in setting_list]
        est_dag = igsp(known_targets, nodes, ci_tester, inv_tester,
                       depth=depth, nruns=nruns)

        # to_amat() returns a tuple; its first element is the matrix that is cached.
        np.save(filename, est_dag.to_amat()[0])
        return est_dag
Exemple #2
0
    def run_fci(graph_num):
        """Estimate (or load the cached estimate of) the ancestral graph for ``graph_num``.

        NOTE(review): despite its name, this function runs ``gspo`` and stores its
        output under ``GSPO_NAME``.

        The estimate and its wall-clock running time are cached in two files.  The
        algorithm is re-run when ``OVERWRITE`` is truthy or when either cache file
        is missing.

        Returns
        -------
        (est_mag, time_used)
            The estimated graph and the running time in seconds as a ``float``.
        """
        name_args = (ngraphs, nnodes, nlatent, exp_nbrs, graph_num, nsamples, GSPO_NAME)
        name_kwargs = dict(alpha=alpha, initial=initial, depth=depth, max_iters=max_iters)
        results_filename = get_alg_estimate_filename(*name_args, **name_kwargs)
        time_filename = get_alg_time_filename(*name_args, **name_kwargs)

        # Both files are read on a cache hit, so both must be present.
        cached = os.path.exists(results_filename) and os.path.exists(time_filename)
        if OVERWRITE or not cached:
            samples = get_mag_samples(ngraphs, nnodes, nlatent, exp_nbrs,
                                      graph_num, nsamples)
            start = time.time()
            suffstat = gauss_ci_suffstat(samples)
            ci_tester = MemoizedCI_Tester(gauss_ci_test, suffstat, alpha=alpha)
            est_mag = gspo(set(range(nnodes)),
                           ci_tester,
                           initial_imap=initial,
                           depth=depth,
                           nruns=nruns,
                           max_iters=max_iters,
                           make_minimal=lmc_update)
            time_used = time.time() - start

            # Make sure the target folder of *each* output file exists.
            for path in (results_filename, time_filename):
                folder = os.path.dirname(path)
                if folder:
                    os.makedirs(folder, exist_ok=True)
            np.save(results_filename, est_mag.to_amat())
            np.save(time_filename, time_used)
            return est_mag, time_used

        # np.load gives back a 0-d array for the saved scalar; convert it so both
        # branches return the same type.
        time_used = float(np.load(time_filename))
        return cd.AncestralGraph.from_amat(np.load(results_filename)), time_used
Exemple #3
0
def run_gsp(X,
            alpha,
            nodes: set,
            depth: Optional[int] = 4,
            nruns: int = 5,
            verbose: bool = False,
            initial_undirected: Optional[Union[str,
                                               UndirectedGraph]] = 'threshold',
            initial_permutations: Optional[List] = None,
            fixed_orders=None,
            fixed_adjacencies=None,
            fixed_gaps=None,
            use_lowest=True,
            max_iters=float('inf'),
            factor=2,
            progress_bar=False,
            summarize=False):
    """Run GSP with a Gaussian CI test on ``X`` and return the CPDAG adjacency matrix.

    Parameters
    ----------
    X
        Sample data from which the Gaussian sufficient statistics are computed.
    alpha
        Significance level given to the CI tester.
    nodes
        Node set passed to ``gsp``.
    fixed_orders, fixed_adjacencies, fixed_gaps
        Optional constraints forwarded to ``gsp``.  ``None`` (the default) stands
        for an empty set; a fresh set is created per call so that no mutable
        default is shared between calls.
    depth, nruns, verbose, initial_undirected, initial_permutations, use_lowest,
    max_iters, factor, progress_bar, summarize
        Forwarded unchanged to ``gsp``.

    Returns
    -------
    est_cpdag
        First element of ``est_dag.cpdag().to_amat(source_axis=1)``.
    """
    fixed_orders = set() if fixed_orders is None else fixed_orders
    fixed_adjacencies = set() if fixed_adjacencies is None else fixed_adjacencies
    fixed_gaps = set() if fixed_gaps is None else fixed_gaps

    # obtain sufficient statistics (causaldag.utils.ci_tests)
    obs_suffstat = gauss_ci_suffstat(X, invert=False)

    # define CI tester
    ci_tester = MemoizedCI_Tester(gauss_ci_test, obs_suffstat, alpha=alpha)

    # run GSP
    est_dag = gsp(nodes=nodes,
                  ci_tester=ci_tester,
                  depth=depth,
                  nruns=nruns,
                  verbose=verbose,
                  initial_undirected=initial_undirected,
                  initial_permutations=initial_permutations,
                  fixed_orders=fixed_orders,
                  fixed_adjacencies=fixed_adjacencies,
                  fixed_gaps=fixed_gaps,
                  use_lowest=use_lowest,
                  max_iters=max_iters,
                  factor=factor,
                  progress_bar=progress_bar,
                  summarize=summarize)

    # convert the CPDAG to an adjacency matrix, specifying that the columns are
    # the "source" axis, so an entry describes an edge from j->i
    est_cpdag, _ = est_dag.cpdag().to_amat(source_axis=1)

    return est_cpdag
Exemple #4
0
def prepare_igsp(obs_samples,
                 iv_samples_list,
                 targets_list,
                 alpha=1e-3,
                 alpha_inv=1e-3,
                 ci_test="gaussian"):
    """Build the CI tester and the invariance tester used by IGSP.

    Parameters
    ----------
    obs_samples
        Observational samples.
    iv_samples_list
        One sample collection per interventional context.
    targets_list
        Accepted for interface compatibility; not read by this function.
    alpha, alpha_inv
        Significance levels of the CI tester and the invariance tester.
    ci_test
        One of ``"gaussian"``, ``"hsic"`` or ``"kci"``.

    Returns
    -------
    (ci_tester, invariance_tester)

    Raises
    ------
    ValueError
        If ``ci_test`` is not one of the supported names.
    """
    if ci_test == "gaussian":
        # Gaussian tests work on precomputed sufficient statistics.
        ci_suffstat = gauss_ci_suffstat(obs_samples)
        inv_suffstat = gauss_invariance_suffstat(obs_samples, iv_samples_list)
        gauss_ci = MemoizedCI_Tester(gauss_ci_test, ci_suffstat, alpha=alpha)
        gauss_inv = MemoizedInvarianceTester(gauss_invariance_test,
                                             inv_suffstat,
                                             alpha=alpha_inv)
        return gauss_ci, gauss_inv

    if ci_test != "hsic" and ci_test != "kci":
        raise ValueError(
            f"CI test '{ci_test}' does not exist. Choose between: [gaussian, hsic, kci]"
        )

    # Kernel tests take the raw samples: observational data under the key
    # "obs_samples", followed by each interventional context under its index.
    inv_suffstat = {"obs_samples": obs_samples}
    inv_suffstat.update(enumerate(iv_samples_list))

    if ci_test == "hsic":
        ci_fn, inv_fn = hsic_test, hsic_invariance_test
    else:
        ci_fn, inv_fn = kci_test, kci_invariance_test

    kernel_ci = MemoizedCI_Tester(ci_fn, obs_samples, alpha=alpha)
    kernel_inv = MemoizedInvarianceTester(inv_fn, inv_suffstat, alpha=alpha_inv)
    return kernel_ci, kernel_inv
Exemple #5
0
from tqdm import tqdm
# Make sure the output folder for estimates exists before anything is written.
os.makedirs(ESTIMATED_FOLDER, exist_ok=True)
import json

OVERWRITE = True

# === LOAD SAMPLES
# Maps a frozenset of intervened nodes -> sample array; the empty frozenset
# is the key of the observational data.
sample_dict = dict()
for file in os.listdir(SACHS_DATA_FOLDER):
    samples = pd.read_csv(os.path.join(SACHS_DATA_FOLDER, file), sep=',')
    # The intervention label is the piece of the file name after the first '='
    # with its last four characters dropped (presumably the '.csv' extension --
    # TODO confirm). A file name without '=' raises IndexError here.
    iv_str = file.split('=')[1][:-4]
    # Empty label -> no intervention; otherwise a single integer node index.
    ivs = frozenset({int(iv_str)}) if iv_str != '' else frozenset()
    sample_dict[ivs] = samples.values
obs_samples = sample_dict[frozenset()]
# Stack every dataset (observational and interventional) along the row axis.
all_samples = np.concatenate(tuple(sample_dict.values()), axis=0)
suffstat = gauss_ci_suffstat(obs_samples)
# Hand-built statistics for the pooled data: correlation matrix over columns
# (rowvar=False) and the total number of rows.
suffstat_all = dict(C=np.corrcoef(all_samples, rowvar=False), n=all_samples.shape[0])

# One setting per interventional dataset (the observational key is skipped).
setting_list = [
    {'known_interventions': iv_nodes}
    for iv_nodes, samples in sample_dict.items()
    if iv_nodes != frozenset()
]
# Interventional samples, in the same order as setting_list.
iv_samples_list = [sample_dict[setting['known_interventions']] for setting in setting_list]
invariance_suffstat = gauss_invariance_suffstat(obs_samples, iv_samples_list)
# For the HSIC invariance test: raw samples keyed by context index, plus the
# observational samples under 'obs_samples'.
hsic_invariance_suffstat = {iv: samples for iv, samples in enumerate(iv_samples_list)}
hsic_invariance_suffstat['obs_samples'] = obs_samples

# === RUN UNKNOWN TARGET IGSP WITH GAUSS CI
# NOTE(review): only the assignment below is visible in this loop body; alpha_i
# is presumably the invariance-test significance level -- confirm against the
# full script.
for alpha in tqdm([1e-1, 1e-2, 1e-3, 2e-1, 3e-1, 4e-1, 5e-1, 5e-2]):
    alpha_i = 1e-20
Exemple #6
0
from line_profiler import LineProfiler
import causaldag as cd
from causaldag.inference.structural import pcalg, skeleton
import numpy as np
from causaldag.utils.ci_tests import MemoizedCI_Tester, gauss_ci_suffstat, gauss_ci_test
import random
# Seed both NumPy's and the stdlib's random generators so runs are repeatable.
np.random.seed(1729)
random.seed(1729)

nnodes = 20
nodes = set(range(nnodes))
# Random directed graph over nnodes nodes with edge parameter 3/(nnodes-1),
# passed through rand_weights (presumably to attach random edge weights; the
# meaning of the trailing 1 is not visible here -- TODO confirm).
g = cd.rand.rand_weights(cd.rand.directed_erdos(nnodes, 3/(nnodes-1), 1))
# NOTE(review): this binds the `random` module itself, not a node; it looks like
# an unfinished expression. iv_node is not used in the visible code.
iv_node = random
nsamples = 1000
samples = g.sample(nsamples)
# Sufficient statistics are computed once and shared by every profiled run.
suffstat = gauss_ci_suffstat(samples)
profiler = LineProfiler()


def run_pc():
    """Run the PC algorithm 100 times, each time with a fresh memoized CI tester."""
    repetitions = 100
    for _ in range(repetitions):
        # A new tester per run, so no cached test results carry over between runs.
        fresh_tester = MemoizedCI_Tester(gauss_ci_test, suffstat)
        pcalg(nodes, fresh_tester, max_cond_set=None, verbose=True)


# Collect line-level timings for pcalg only, while executing run_pc under the
# profiler, then print the collected statistics.
profiler.add_function(pcalg)
profiler.runcall(run_pc)
profiler.print_stats()
def dci_orient(
        X1,
        X2,
        skeleton: set,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
        If either rh1 or rh2 is None, both are recomputed from X1 and X2.
    alpha: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_skeleton

    Returns
    -------
    adjacency_matrix: array
        Sum of the adjacency matrix of the edges whose direction could not be determined (built with
        ``undirected=True``) and the adjacency matrix of the oriented edges (built with ``undirected=False``),
        both over ``X1.shape[1]`` nodes.
    """

    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    oriented_edges = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']
    for i, j in skeleton:
        # Conditioning sets for i and for j are paired positionally by zip.
        for cond_i, cond_j in zip(powerset(nodes_cond_set - {i}, r_max=max_set_size),
                                  powerset(nodes_cond_set - {j}, r_max=max_set_size)):
            # compute residual variances for i
            _, var1_i, _ = rh1.regression(i, list(cond_i))
            _, var2_i, _ = rh2.regression(i, list(cond_i))
            # compute two-sided p-value for invariance of residual variances for i
            pvalue_i = ncfdtr(n1 - len(cond_i), n2 - len(cond_i), 0, var1_i / var2_i)
            pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)

            # compute residual variances for j
            _, var1_j, _ = rh1.regression(j, list(cond_j))
            _, var2_j, _ = rh2.regression(j, list(cond_j))
            # compute two-sided p-value for invariance of residual variances for j
            pvalue_j = ncfdtr(n1 - len(cond_j), n2 - len(cond_j), 0, var1_j / var2_j)
            pvalue_j = 2 * min(pvalue_j, 1 - pvalue_j)

            if pvalue_i > alpha or pvalue_j > alpha:
                # orient the edge according to highest p-value: the other endpoint is a
                # parent exactly when it belongs to the invariant conditioning set
                if pvalue_i > pvalue_j:
                    edge = (j, i) if j in cond_i else (i, j)
                    pvalue_used = pvalue_i
                else:
                    edge = (i, j) if i in cond_j else (j, i)
                    pvalue_used = pvalue_j
                oriented_edges.add(edge)

                if verbose > 0:
                    print("Oriented (%d, %d) as %s since p-value=%.5f > alpha=%.5f" % (i, j, edge, pvalue_used, alpha))
                break

    # orient edges via graph traversal
    unoriented_edges_before_traversal = skeleton - oriented_edges - {(j, i) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    # g holds only the edges oriented by the invariance tests above; it is not
    # extended with edges oriented during the traversal itself.
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        # Only the existence of a directed path matters, so a reachability query
        # replaces the enumeration of every simple path.
        if nx.has_path(g, i, j):
            edge = (i, j)
        elif nx.has_path(g, j, i):
            edge = (j, i)
        else:
            continue
        oriented_edges.add(edge)
        unoriented_edges.remove((i, j))
        if verbose > 0:
            print("Oriented (%d, %d) as %s with graph traversal" % (i, j, edge))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + \
        edges2adjacency(num_nodes, oriented_edges, undirected=False)
    return adjacency_matrix
def dci_orient_order_independent(
        X1,
        X2,
        skeletons: Union[Dict[float, set], set],
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges of the difference-DAG skeleton by choosing parent sets in order of decreasing p-value.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    skeletons: set or dict
        Either one skeleton (a set of edges), or a dictionary mapping a value to a skeleton. For a dictionary,
        the function is applied to each skeleton using the dictionary KEY as `alpha` (the `alpha` argument
        is then only range-checked), and a dictionary with the same keys is returned.
    nodes_cond_set: set
        Nodes to be considered as candidate parents.
    rh1, rh2: RegressionHelper, default = None
        Sufficient statistics of the two datasets. If either is None, both are recomputed from X1 and X2.
    alpha: float, default = 0.1
        Significance level; a parent set is only considered if its p-value exceeds alpha.
    max_set_size: int, default = 3
        Parent sets of size 0 through max_set_size + 1 are tried. If None, every size up to
        len(nodes_cond_set) is tried.
    verbose: int, default = 0
        The verbosity level of logging messages.

    Returns
    -------
    adjacency_matrix: array, or dict of arrays when `skeletons` is a dict
        Sum of the adjacency matrices of the undirected (unoriented) and directed (oriented) edges.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    if isinstance(skeletons, dict):
        return {
            skeleton_alpha: dci_orient_order_independent(
                X1,
                X2,
                skeleton,
                nodes_cond_set,
                rh1,
                rh2,
                alpha=skeleton_alpha,
                max_set_size=max_set_size,
                verbose=verbose
            )
            for skeleton_alpha, skeleton in skeletons.items()
        }

    skeleton = {frozenset({i, j}) for i, j in skeletons}
    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    d_nx = nx.DiGraph()
    d_nx.add_nodes_from(nodes)
    nodes_with_decided_parents = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']
    # max_set_size=None means "no limit": no parent set can be larger than nodes_cond_set.
    num_sizes = len(nodes_cond_set) + 1 if max_set_size is None else max_set_size + 2
    for parent_set_size in range(num_sizes):
        if verbose > 0: print(f"Trying parent sets of size {parent_set_size}")
        pvalue_dict = dict()
        for i in nodes - nodes_with_decided_parents:
            for cond_i in itertools.combinations(nodes_cond_set - {i}, parent_set_size):
                _, var1_i, _ = rh1.regression(i, list(cond_i))
                _, var2_i, _ = rh2.regression(i, list(cond_i))
                # two-sided p-value for invariance of the residual variance of i
                pvalue_i = ncfdtr(n1 - parent_set_size, n2 - parent_set_size, 0, var1_i / var2_i)
                pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)
                pvalue_dict[(i, frozenset(cond_i))] = pvalue_i

        # candidate (node, parent set) pairs above the threshold, highest p-value first
        candidates = [
            (i, cond_i)
            for (i, cond_i), pvalue in sorted(pvalue_dict.items(), key=op.itemgetter(1), reverse=True)
            if pvalue > alpha
        ]
        for i, cond_i in candidates:
            i_children = {j for j in nodes - cond_i - {i} if frozenset({i, j}) in skeleton}

            # don't use this parent set if it contradicts the existing edges
            if not set(d_nx.successors(i)).isdisjoint(cond_i):
                continue
            if not set(d_nx.predecessors(i)).isdisjoint(i_children):
                continue

            # don't use this parent set if it creates a cycle
            if not nx.descendants(d_nx, i).isdisjoint(cond_i):
                continue
            if not nx.ancestors(d_nx, i).isdisjoint(i_children):
                continue

            # skeleton neighbours inside the parent set point into i, all others out of i
            edges = {(j, i) for j in cond_i if frozenset({i, j}) in skeleton} | \
                    {(i, j) for j in i_children}
            nodes_with_decided_parents.add(i)
            if verbose > 0: print(f"Adding {edges}")
            d_nx.add_edges_from(edges)

    # orient edges via graph traversal
    oriented_edges = set(d_nx.edges)
    unoriented_edges_before_traversal = skeleton - {frozenset({j, i}) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    # g holds only the edges decided above; it is not extended during the traversal.
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        # Only the existence of a directed path matters, so a reachability query
        # replaces the enumeration of every simple path.
        if nx.has_path(g, i, j):
            edge = (i, j)
        elif nx.has_path(g, j, i):
            edge = (j, i)
        else:
            continue
        oriented_edges.add(edge)
        unoriented_edges.remove(frozenset({i, j}))
        if verbose > 0:
            print("Oriented (%d, %d) as %s with graph traversal" % (i, j, edge))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + \
        edges2adjacency(num_nodes, oriented_edges, undirected=False)
    return adjacency_matrix
def dci_skeleton(
        X1,
        X2,
        difference_ug: list,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False
):
    """
    Estimates the skeleton of the difference-DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
        If either rh1 or rh2 is None, both are recomputed from X1 and X2.
    alpha: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha results in sparser difference graph.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Amount of regularization for regression (becomes ridge regression if nonzero).
        Applied to the regressions of both endpoints of every edge.
    progress: bool, default = False
        If True and difference_ug is non-empty, the edges are iterated through a tqdm progress bar.

    See Also
    --------
    dci, dci_undirected_graph, dci_orient

    Returns
    -------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    """

    if verbose > 0:
        print("DCI skeleton estimation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    # start from every candidate edge and delete those found to be invariant
    skeleton = {(i, j) for i, j in difference_ug}

    difference_ug = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug
    for i, j in difference_ug:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i, lam=lam)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i, lam=lam)

            # compute statistic and p-value for the coefficient of j
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                     inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)

            #  remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > alpha
            if i_invariant:
                if verbose > 1:
                    print(
                        f"Removing edge {j}->{i} since p-value={pval_i:.5f} > alpha={alpha:.5f} with cond set {cond_set_i}")
                skeleton.remove((i, j))
                break
            elif verbose > 1:
                print(
                    f"Keeping edge {i}-{j} for now, since p-value={pval_i:.5f} < alpha={alpha:.5f} with cond set {cond_set_i}")

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets,
            # with the same regularization as used for i
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j, lam=lam)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j, lam=lam)

            # compute statistic and p-value for the coefficient of i
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                     inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)

            #  remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > alpha
            if j_invariant:
                if verbose > 1:
                    print(
                        f"Removing edge {i}->{j} since p-value={pval_j:.5f} > alpha={alpha:.5f} with cond set {cond_set_j}")
                skeleton.remove((i, j))
                break
            elif verbose > 1:
                print(
                    f"Keeping edge {i}-{j} for now, since p-value={pval_j:.5f} < alpha={alpha:.5f} with cond set {cond_set_j}")

    return skeleton
def dci(
        X1,
        X2,
        alpha_ug: float = 1.0,
        alpha_skeleton: float = 0.1,
        alpha_orient: float = 0.1,
        max_set_size: Optional[int] = 3,
        difference_ug: list = None,
        nodes_cond_set: set = None,
        max_iter: int = 1000,
        edge_threshold: float = 0,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False,
        order_independent: bool = True
):
    """
    Estimates the difference-DAG between two settings with the Difference Causal Inference (DCI) algorithm.

    The procedure has three stages: (1) obtain the difference undirected graph, (2) prune it to the skeleton
    of the difference-DAG, (3) orient the edges of that skeleton.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    alpha_ug: float, default = 1.0
        L1 regularization parameter for estimating the difference undirected graph via KLIEP algorithm.
    alpha_skeleton: float, default = 0.1
        Significance level for keeping edges in the skeleton of the difference graph.
        Lower alpha_skeleton results in sparser difference graph.
    alpha_orient: float, default = 0.1
        Significance level for determining orientation of an edge.
        Lower alpha_orient results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance; smaller values are faster.
        For large datasets recommended max_set_size is 3. If None, conditioning sets of all sizes will be used.
    difference_ug: list, default = None
        List of tuples that represents edges in the difference undirected graph.
        If the number of nodes is small, this could be taken to be the complete graph between all the nodes.
    nodes_cond_set: set, default = None
        Nodes to be considered as conditioning sets.
        If difference_ug or nodes_cond_set is None, BOTH are (re)computed by dci_undirected_graph.
    max_iter: int, default = 1000
        Maximum number of iterations for gradient descent in KLIEP algorithm.
    edge_threshold: float, default = 0
        Edge weight cutoff for keeping an edge for KLIEP algorithm (all edges above or equal to this threshold are kept).
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Amount of regularization for regression (becomes ridge regression if nonzero).
    progress: bool, default = False
        Forwarded to dci_skeleton.
    order_independent: bool, default = True
        If True, edges are oriented with dci_orient_order_independent, otherwise with dci_orient.

    See Also
    --------
    dci_undirected_graph, dci_skeleton, dci_orient

    Returns
    -------
    adjacency_matrix: array, shape  = [n_features, n_features]
        Estimated difference-DAG. Edges that were found to be different between two settings but the orientation
        could not be determined, are represented by assigning 1 in both directions, i.e. adjacency_matrix[i,j] = 1
        and adjacency_matrix[j,i] = 1. Otherwise for oriented edges, only adjacency_matrix[i,j] = 1 is assigned.
        Assignment of 0 in the adjacency matrix represents no edge.

    References
    ----------
        [1] Wang, Y., Squires, C., Belyaeva, A., & Uhler, C. (2018). Direct estimation of differences in causal graphs.
        In Advances in Neural Information Processing Systems (pp. 3770-3781).
    """
    assert 0 <= alpha_skeleton <= 1, "alpha_skeleton must be in [0,1] range."
    assert 0 <= alpha_orient <= 1, "alpha_orient must be in [0,1] range."

    # The value is not used below; reading the shape makes input without a
    # second axis fail here, before any computation starts.
    num_nodes = X1.shape[1]

    # sufficient statistics of each dataset, wrapped for the regression tests
    suffstat1, suffstat2 = gauss_ci_suffstat(X1), gauss_ci_suffstat(X2)
    rh1, rh2 = RegressionHelper(suffstat1), RegressionHelper(suffstat2)
    # keyword arguments common to the skeleton and the orientation stage
    shared = dict(rh1=rh1, rh2=rh2, max_set_size=max_set_size, verbose=verbose)

    # stage 1: difference undirected graph via KLIEP, unless fully provided by the caller
    ug_missing = difference_ug is None or nodes_cond_set is None
    if ug_missing:
        difference_ug, nodes_cond_set = dci_undirected_graph(
            X1,
            X2,
            alpha=alpha_ug,
            max_iter=max_iter,
            edge_threshold=edge_threshold,
            verbose=verbose
        )
        if verbose > 0:
            print(f"{len(difference_ug)} edges in the difference UG, over {len(nodes_cond_set)} nodes")

    # stage 2: skeleton of the difference-DAG
    skeleton = dci_skeleton(
        X1,
        X2,
        difference_ug,
        nodes_cond_set,
        alpha=alpha_skeleton,
        lam=lam,
        progress=progress,
        **shared
    )
    if verbose > 0:
        print(f"{len(skeleton)} edges in the difference skeleton")

    # stage 3: orientation of the skeleton edges
    orient_algorithm = dci_orient_order_independent if order_independent else dci_orient
    return orient_algorithm(
        X1,
        X2,
        skeleton,
        nodes_cond_set,
        alpha=alpha_orient,
        **shared
    )
def _dci_coefficient_invariance_pvalue(rh1, rh2, n1, n2, target, regressors, regressor, lam, dof):
    """
    P-value for the hypothesis that one regression coefficient is equal in both datasets.

    ``target`` is regressed on ``regressors`` in each dataset (through ``rh1`` and ``rh2``), and the
    squared difference between the two coefficients of ``regressor`` is compared against a central
    F distribution with ``(1, dof)`` degrees of freedom.
    """
    beta1, var1, precision1 = rh1.regression(target, regressors, lam=lam)
    beta2, var2, precision2 = rh2.regression(target, regressors, lam=lam)

    ix = regressors.index(regressor)
    stat = (beta1[ix] - beta2[ix]) ** 2 * \
        inv(var1 * precision1 / (n1 - 1) + var2 * precision2 / (n2 - 1))[ix, ix]
    return 1 - ncfdtr(1, dof, 0, stat)


def dci_skeleton_multiple(
        X1,
        X2,
        alpha_skeleton_grid: list = [0.1, 0.5],
        max_set_size: int = 3,
        difference_ug: list = None,
        nodes_cond_set: set = None,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False,
        true_diff: Optional[Set] = None
):
    """
    Estimate the skeleton of the difference-DAG for several significance levels in a single pass.

    Every edge (i, j) of ``difference_ug`` starts out in the skeleton of every alpha. For each
    conditioning set drawn from ``nodes_cond_set - {i, j}`` (size limited by ``max_set_size``), the
    coefficient of j in the regression of i, and the coefficient of i in the regression of j, are
    tested for equality across the two datasets. Whenever a test yields a p-value above some alpha,
    the edge is dropped from that alpha's skeleton. The search for an edge stops as soon as one test
    removes it for every alpha of the grid.

    Parameters
    ----------
    X1, X2:
        The two datasets. Only used to build sufficient statistics when ``rh1`` or ``rh2`` is missing.
    alpha_skeleton_grid:
        Significance levels, each in [0, 1]. One skeleton is returned per level.
    max_set_size:
        Passed as ``r_max`` to ``powerset`` to bound the size of the conditioning sets.
    difference_ug:
        Collection of (i, j) candidate edges. Required in practice: ``None`` is not handled here.
    nodes_cond_set:
        Set of nodes from which conditioning sets are drawn. Required in practice as well.
    rh1, rh2:
        Regression helpers for the two datasets. Both are rebuilt from ``X1`` and ``X2`` if either is None.
    verbose:
        Progress messages are printed when > 0, per-test decisions when > 1.
    lam:
        Forwarded as ``lam`` to every regression, in both directions.
    progress:
        If True and ``difference_ug`` is non-empty, wrap the edge loop in a tqdm progress bar.
    true_diff:
        Optional set of true difference edges. When given, removing an edge that belongs to it
        (in either orientation) is reported, regardless of ``verbose``.

    Returns
    -------
    dict
        Maps each alpha of ``alpha_skeleton_grid`` to the set of (i, j) edges kept for that alpha.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    for alpha in alpha_skeleton_grid:
        assert 0 <= alpha <= 1, "alpha must be in [0,1] range."
    min_alpha = min(alpha_skeleton_grid)

    skeletons = {alpha: {(i, j) for i, j in difference_ug} for alpha in alpha_skeleton_grid}
    edges = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug

    for i, j in edges:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]
            dof = n1 + n2 - len(cond_set_i) - len(cond_set_j)

            # First i regressed on (cond_set, j), then j regressed on (cond_set, i). The second
            # direction is only evaluated if the first one did not already remove the edge for all alphas.
            removed_everywhere = False
            for target, source, regressors in ((i, j, cond_set_i), (j, i, cond_set_j)):
                pval = _dci_coefficient_invariance_pvalue(
                    rh1, rh2, n1, n2, target, regressors, source, lam, dof)

                # the coefficient counts as invariant for every alpha below the p-value
                if pval > min_alpha:
                    removed_alphas = [alpha for alpha in alpha_skeleton_grid if pval > alpha]
                    if verbose > 1:
                        print(
                            f"Removing edge {source}->{target} for alpha={removed_alphas} since p-value={pval:.5f} with cond set {regressors}")
                    for alpha in removed_alphas:
                        skeletons[alpha].discard((i, j))
                    # Parenthesised on purpose: `A and B or C` would evaluate `C` even when
                    # true_diff is None and fail on the membership test.
                    if true_diff is not None and ((i, j) in true_diff or (j, i) in true_diff):
                        print(
                            f"Incorrectly removing edge {source}->{target} for alpha={removed_alphas} since p-value={pval:.6f} with cond set {regressors}")
                    if len(removed_alphas) == len(alpha_skeleton_grid):
                        removed_everywhere = True
                        break
                elif verbose > 1:
                    print(f"Keeping edge {i}-{j} for now, since p-value={pval:.5f} with cond set {regressors}")

            if removed_everywhere:
                # nothing left to remove for this edge: skip the remaining conditioning sets
                break

    return skeletons
from causaldag.utils.ci_tests import MemoizedCI_Tester, gauss_ci_test, gauss_ci_suffstat
import numpy as np
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# --- experiment configuration: five graphs for each expected neighbourhood size in {2, 3, 4, 5}
nnodes = 50
exp_nbrs_list = [nbrs for nbrs in (2, 3, 4, 5) for _ in range(5)]
ngraphs = len(exp_nbrs_list)
nsamples = 2 * nnodes

# --- random graphs, weighted versions of them, samples and one memoized CI tester per graph
dags = [
    cd.rand.directed_erdos(nnodes, expected_nbrs / (nnodes - 1))
    for expected_nbrs in exp_nbrs_list
]
gdags = [cd.rand.rand_weights(graph) for graph in dags]
samples = [weighted_graph.sample(nsamples) for weighted_graph in gdags]
suffstats = [gauss_ci_suffstat(sample_matrix) for sample_matrix in samples]
ci_testers1 = [MemoizedCI_Tester(gauss_ci_test, stat) for stat in suffstats]

# --- one random node ordering per graph; building the DAG of each ordering is what
# --- fills the `ci_dict` of the corresponding tester, which is read below
perms = [random.sample(list(range(nnodes)), nnodes) for _ in range(ngraphs)]
imaps1 = list(tqdm(
    (cd.permutation2dag(ordering, tester, verbose=False)
     for ordering, tester in zip(perms, ci_testers1)),
    total=ngraphs,
))

# --- compare each graph's maximum in-degree with its largest recorded CI test
true_max_degrees = [graph.max_in_degree for graph in dags]
# the last component of every `ci_dict` key is taken as the test whose size is measured
ci_tests_per_dag = [list(zip(*tester.ci_dict.keys()))[-1] for tester in ci_testers1]
ci_tests_sizes = [np.array([len(test) for test in tests]) for tests in ci_tests_per_dag]
max_ci_test_sizes = [test_sizes.max() for test_sizes in ci_tests_sizes]

plt.clf()
plt.scatter(true_max_degrees, max_ci_test_sizes)
plt.xlabel('Max degree')
plt.ylabel('Max test size')
# plt.ion()
# plt.show()
Exemple #13
0
def get_bs_dags(num_bs,
                obs_samples,
                nsamples_obs,
                nnodes,
                cheat_cpdag=None,
                bic=True):
    """
    Build a weighted list of DAGs by bootstrapping the observational data.

    Each round resamples ``nsamples_obs`` rows of ``obs_samples`` with replacement, learns a DAG
    from the resample, converts it to a CPDAG and collects members of that equivalence class:
    all of them if the class has at most ``num_bs`` members, a sample of them otherwise. Rounds
    are repeated until at least ``num_bs`` DAGs (counted with multiplicity) have been collected.

    ``cheat_cpdag`` is meant for debugging and for experiments that are allowed to know the MEC:
    if it is a numpy array, it is used as the CPDAG of the first round instead of a learned one.

    Every distinct DAG is stored once as a dict with keys ``'dag'``, ``'A'``, ``'b'`` (the result
    of ``finite.get_weights_MLE`` on the full ``obs_samples``), ``'w'`` and ``'count'`` (its
    multiplicity). The weights are then replaced by normalised exp(log-likelihood + log prior
    weight), with ``sum(dag) * log(nsamples_obs) / 2`` subtracted in log space when ``bic`` is
    true. DAGs whose weight falls below 0.001 are dropped and the rest renormalised.

    Returns the list of the remaining DAG dicts.
    """
    # significance levels of the CI and invariance tests used when learning a DAG
    ci_alpha = 1e-3
    invariance_alpha = 1e-3
    node_set = set(range(nnodes))

    bs_dags = []  # one entry per distinct dag
    position_of = {}  # raw bytes of a dag -> its position in bs_dags
    n_collected = 0  # dags collected so far, counted with multiplicity

    while n_collected < num_bs:
        if n_collected == 0 and isinstance(cheat_cpdag, np.ndarray):
            est_cpdag = cheat_cpdag
        else:
            # resample the observations and learn a DAG from the resample
            resample_ix = np.random.choice(nsamples_obs, nsamples_obs, replace=True)
            resampled = obs_samples[resample_ix]
            ci_stats = gauss_ci_suffstat(resampled)
            invariance_stats = gauss_invariance_suffstat(obs_samples, [])
            ci_tester = MemoizedCI_Tester(gauss_ci_test, ci_stats, alpha=ci_alpha)
            invariance_tester = MemoizedInvarianceTester(gauss_invariance_test,
                                                         invariance_stats,
                                                         alpha=invariance_alpha)
            # no interventional settings are passed: the settings list is empty
            learned_dag, _ = unknown_target_igsp([],
                                                 node_set,
                                                 ci_tester,
                                                 invariance_tester,
                                                 nruns=5,
                                                 depth=None)
            est_cpdag = main.cpdag_from_dag_observational(learned_dag.to_amat()[0])

        if mec_size.mec_size(est_cpdag) <= num_bs:
            # small enough: take every member of the equivalence class
            mec_dags = mec_size.enumerate_dags(est_cpdag)
        else:
            # too large: only sample as many members as there are distinct slots left
            mec_dags = mec_size.uniform_sample_dag_plural(est_cpdag,
                                                          num_bs - len(bs_dags),
                                                          exact=False)

        for member in mec_dags:
            key = member.tobytes()
            if key in position_of:
                # seen before: raise its weight and multiplicity
                known = bs_dags[position_of[key]]
                known['w'] += (1 / num_bs)
                known['count'] += 1
            else:
                A, b = finite.get_weights_MLE(member, obs_samples)
                position_of[key] = len(bs_dags)
                bs_dags.append({
                    'dag': member,
                    'A': A,
                    'b': b,
                    'w': (1 / num_bs),
                    'count': 1
                })
            n_collected += 1

    # log-likelihood of the observations under every collected dag
    logPy = finite.llhood(
        [obs_samples], [[]], bs_dags,
        (np.zeros(nnodes), 0))

    # unnormalised log posterior: log-likelihood + log bootstrap weight (- BIC penalty)
    weighted_logPy = np.zeros(len(bs_dags))
    for k, entry in enumerate(bs_dags):
        weighted_logPy[k] = logPy[k] + np.log(entry['w'])
        if bic:
            weighted_logPy[k] -= np.sum(entry['dag']) * np.log(nsamples_obs) / 2
    denom = logsumexp(weighted_logPy)

    # the weight of each dag becomes its normalised posterior
    for k, entry in enumerate(bs_dags):
        entry['w'] = np.exp(weighted_logPy[k] - denom)

    # drop the dags with a tiny weight and renormalise the others
    bs_dags_pruned = [entry for entry in bs_dags if entry['w'] >= 0.001]
    w_sum = 0
    for entry in bs_dags_pruned:
        w_sum += entry['w']
    for entry in bs_dags_pruned:
        entry['w'] = entry['w'] / w_sum
    return bs_dags_pruned
Exemple #14
0
    def _run_utigsp(dag_num):
        """
        Run UT-IGSP on the samples of DAG number ``dag_num``, caching the result on disk.

        The estimated DAG is saved as an adjacency matrix in a ``.npy`` file whose name encodes
        the run parameters, and the learned intervention targets in a JSON file next to it. When
        both files already exist and ``overwrite`` is false, the result is loaded from them
        instead of being recomputed.

        Returns the tuple ``(est_dag, learned_intervention_targets)``.

        NOTE(review): on the cached path the targets are the JSON-decoded lists, while on the
        fresh path they are returned exactly as produced by ``unknown_target_igsp`` -- confirm
        that callers accept both.
        """
        # === GENERATE FILENAME
        sample_folder = sample_folders[dag_num]
        alg_folder = os.path.join(sample_folder, 'estimates', 'utigsp')
        os.makedirs(alg_folder, exist_ok=True)
        no_target_str = '_no_targets' if no_targets else ''
        filename = os.path.join(
            alg_folder,
            f'nruns=%d,depth=%d,alpha=%.2e,alpha_invariant=%.2e{no_target_str}.npy'
            % (nruns, depth, alpha, alpha_invariant))
        targets_filename = filename + '_learned_intervention_targets.json'

        # === LOAD CACHED RESULT
        # Both files are required: if only the .npy file is there (e.g. a previous run was
        # interrupted between the two writes), the result is recomputed rather than crashing.
        if not overwrite and os.path.exists(filename) and os.path.exists(targets_filename):
            with open(targets_filename) as targets_file:
                learned_intervention_targets = json.load(targets_file)
            return cd.DAG.from_amat(
                np.load(filename)), learned_intervention_targets

        # === RUN ALGORITHM
        obs_samples, setting_list, _ = get_dag_samples(ndags,
                                                       nnodes,
                                                       nneighbors,
                                                       nsamples,
                                                       nsettings,
                                                       num_known,
                                                       num_unknown,
                                                       intervention,
                                                       dag_num,
                                                       nonlinear=nonlinear)

        # NOTE: the `nonlinear` and linear cases used to be two byte-identical branches, so
        # `nonlinear` only affects how the samples are generated; the Gaussian tests are
        # used in both cases.
        suffstat = gauss_ci_suffstat(obs_samples)
        suffstat_inv = gauss_invariance_suffstat(
            obs_samples,
            [setting['samples'] for setting in setting_list])
        ci_tester = MemoizedCI_Tester(gauss_ci_test,
                                      suffstat,
                                      alpha=alpha)
        inv_tester = MemoizedInvarianceTester(gauss_invariance_test,
                                              suffstat_inv,
                                              alpha=alpha_invariant)

        # only the known interventions of each setting are handed to the algorithm
        est_dag, learned_intervention_targets = unknown_target_igsp(
            [{
                'known_interventions': setting['known_interventions']
            } for setting in setting_list],
            nodes,
            ci_tester,
            inv_tester,
            depth=depth,
            nruns=nruns,
            no_targets=no_targets)

        # === SAVE RESULT
        np.save(filename, est_dag.to_amat()[0])
        serializable_targets = list(map(list, learned_intervention_targets))
        # context manager so the file is flushed and closed deterministically
        with open(targets_filename, 'w') as targets_file:
            json.dump(serializable_targets, targets_file)
        return est_dag, learned_intervention_targets