def _run_igsp(dag_num):
    """
    Run IGSP for one sampled DAG, caching the estimate on disk.

    The estimate is stored as an ``.npy`` adjacency matrix under
    ``<sample_folder>/estimates/igsp``. If that file already exists and
    ``overwrite`` is falsy, the cached matrix is loaded instead of re-running
    the algorithm. All settings (``nruns``, ``depth``, ``alpha``, ...) are read
    from the enclosing scope.

    Parameters
    ----------
    dag_num:
        Index into ``sample_folders`` selecting which DAG's samples to use.

    Returns
    -------
    The estimated DAG (freshly computed, or rebuilt with ``cd.DAG.from_amat``
    from the cached matrix).
    """
    # === GENERATE FILENAME
    estimate_dir = os.path.join(sample_folders[dag_num], 'estimates', 'igsp')
    os.makedirs(estimate_dir, exist_ok=True)
    estimate_name = 'nruns=%d,depth=%d,alpha=%.2e,alpha_invariant=%.2e.npy' % (
        nruns, depth, alpha, alpha_invariant)
    filename = os.path.join(estimate_dir, estimate_name)

    # === REUSE CACHED RESULT
    if os.path.exists(filename) and not overwrite:
        return cd.DAG.from_amat(np.load(filename))

    # === RUN ALGORITHM
    obs_samples, setting_list, _ = get_dag_samples(
        ndags, nnodes, nneighbors, nsamples, nsettings, num_known, num_unknown,
        intervention, dag_num, nonlinear=nonlinear)

    # The original code built exactly the same Gaussian testers in both the
    # `nonlinear` and the linear branch, so a single code path is enough.
    interventional_samples = [setting['samples'] for setting in setting_list]
    suffstat = gauss_ci_suffstat(obs_samples)
    suffstat_inv = gauss_invariance_suffstat(obs_samples, interventional_samples)
    ci_tester = MemoizedCI_Tester(gauss_ci_test, suffstat, alpha=alpha)
    inv_tester = MemoizedInvarianceTester(
        gauss_invariance_test, suffstat_inv, alpha=alpha_invariant)

    igsp_settings = [
        {'interventions': setting['known_interventions']}
        for setting in setting_list
    ]
    est_dag = igsp(igsp_settings, nodes, ci_tester, inv_tester,
                   depth=depth, nruns=nruns)

    np.save(filename, est_dag.to_amat()[0])
    return est_dag
def run_fci(graph_num):
    """
    Estimate a MAG for one graph with ``gspo`` and cache estimate and runtime.

    File names are obtained from ``get_alg_estimate_filename`` and
    ``get_alg_time_filename`` using ``GSPO_NAME``. When the estimate file
    exists and ``OVERWRITE`` is falsy, both cached values are loaded instead.

    Parameters
    ----------
    graph_num:
        Index of the graph whose samples are used.

    Returns
    -------
    (est_mag, time_used): the estimated ancestral graph and the elapsed time
    in seconds (as loaded by ``np.load`` when read from the cache).
    """
    name_args = (ngraphs, nnodes, nlatent, exp_nbrs, graph_num, nsamples, GSPO_NAME)
    name_kwargs = dict(alpha=alpha, initial=initial, depth=depth, max_iters=max_iters)
    results_filename = get_alg_estimate_filename(*name_args, **name_kwargs)
    time_filename = get_alg_time_filename(*name_args, **name_kwargs)

    # Cached result available and no overwrite requested: load and return it.
    if not OVERWRITE and os.path.exists(results_filename):
        time_used = np.load(time_filename)
        cached_amat = np.load(results_filename)
        return cd.AncestralGraph.from_amat(cached_amat), time_used

    samples = get_mag_samples(ngraphs, nnodes, nlatent, exp_nbrs, graph_num, nsamples)

    # The timed region covers sufficient statistics, tester setup and the search.
    start = time.time()
    suffstat = gauss_ci_suffstat(samples)
    ci_tester = MemoizedCI_Tester(gauss_ci_test, suffstat, alpha=alpha)
    est_mag = gspo(
        set(range(nnodes)),
        ci_tester,
        initial_imap=initial,
        depth=depth,
        nruns=nruns,
        max_iters=max_iters,
        make_minimal=lmc_update,
    )
    time_used = time.time() - start

    os.makedirs(os.path.dirname(results_filename), exist_ok=True)
    np.save(results_filename, est_mag.to_amat())
    np.save(time_filename, time_used)
    return est_mag, time_used
def run_gsp(X,
            alpha,
            nodes: set,
            depth: Optional[int] = 4,
            nruns: int = 5,
            verbose: bool = False,
            initial_undirected: Optional[Union[str, UndirectedGraph]] = 'threshold',
            initial_permutations: Optional[List] = None,
            fixed_orders=set(),
            fixed_adjacencies=set(),
            fixed_gaps=set(),
            use_lowest=True,
            max_iters=float('inf'),
            factor=2,
            progress_bar=False,
            summarize=False):
    """
    Run GSP on samples ``X`` with a Gaussian CI test and return the CPDAG matrix.

    Parameters
    ----------
    X:
        Samples passed to ``gauss_ci_suffstat``.
    alpha:
        Significance level handed to the memoized CI tester.
    nodes:
        Node set handed to ``gsp``.
    depth, nruns, verbose, initial_undirected, initial_permutations,
    fixed_orders, fixed_adjacencies, fixed_gaps, use_lowest, max_iters,
    factor, progress_bar, summarize:
        Forwarded unchanged to ``gsp`` as keyword arguments.

    Returns
    -------
    The first element returned by ``cpdag().to_amat(source_axis=1)`` on the
    estimated DAG, i.e. the CPDAG adjacency matrix with columns as the
    "source" axis (edge j->i).
    """
    # Sufficient statistics (causaldag.utils.ci_tests) and the CI tester.
    ci_tester = MemoizedCI_Tester(
        gauss_ci_test,
        gauss_ci_suffstat(X, invert=False),
        alpha=alpha,
    )

    # Every search option is passed straight through to GSP.
    search_options = dict(
        depth=depth,
        nruns=nruns,
        verbose=verbose,
        initial_undirected=initial_undirected,
        initial_permutations=initial_permutations,
        fixed_orders=fixed_orders,
        fixed_adjacencies=fixed_adjacencies,
        fixed_gaps=fixed_gaps,
        use_lowest=use_lowest,
        max_iters=max_iters,
        factor=factor,
        progress_bar=progress_bar,
        summarize=summarize,
    )
    estimated = gsp(nodes=nodes, ci_tester=ci_tester, **search_options)

    # Columns are the "source" axis, so an entry encodes an edge from j->i.
    cpdag_amat, _node_order = estimated.cpdag().to_amat(source_axis=1)
    return cpdag_amat
def prepare_igsp(obs_samples,
                 iv_samples_list,
                 targets_list,
                 alpha=1e-3,
                 alpha_inv=1e-3,
                 ci_test="gaussian"):
    """
    Build the memoized CI tester and invariance tester used by IGSP.

    Parameters
    ----------
    obs_samples:
        Observational samples.
    iv_samples_list:
        List of interventional sample sets, one per context.
    targets_list:
        Not used by this function.
    alpha:
        Significance level of the CI tester.
    alpha_inv:
        Significance level of the invariance tester.
    ci_test:
        One of ``"gaussian"``, ``"hsic"`` or ``"kci"``.

    Returns
    -------
    (ci_tester, invariance_tester)

    Raises
    ------
    ValueError
        If ``ci_test`` is not one of the supported names.
    """
    if ci_test == "gaussian":
        # Gaussian tests work on precomputed sufficient statistics.
        ci_fn, inv_fn = gauss_ci_test, gauss_invariance_test
        ci_data = gauss_ci_suffstat(obs_samples)
        inv_data = gauss_invariance_suffstat(obs_samples, iv_samples_list)
    else:
        # Only the test functions of the requested kernel test are looked up.
        if ci_test == "hsic":
            ci_fn, inv_fn = hsic_test, hsic_invariance_test
        elif ci_test == "kci":
            ci_fn, inv_fn = kci_test, kci_invariance_test
        else:
            raise ValueError(
                f"CI test '{ci_test}' does not exist. Choose between: [gaussian, hsic, kci]"
            )
        # Kernel tests receive the raw samples; contexts are keyed by position.
        ci_data = obs_samples
        inv_data = {"obs_samples": obs_samples, **dict(enumerate(iv_samples_list))}

    ci_tester = MemoizedCI_Tester(ci_fn, ci_data, alpha=alpha)
    invariance_tester = MemoizedInvarianceTester(inv_fn, inv_data, alpha=alpha_inv)
    return ci_tester, invariance_tester
# Script fragment: loads the Sachs samples from SACHS_DATA_FOLDER, builds the
# sufficient statistics for the Gaussian / HSIC tests and starts a sweep over
# significance levels.
from tqdm import tqdm
os.makedirs(ESTIMATED_FOLDER, exist_ok=True)
import json
OVERWRITE = True

# === LOAD SAMPLES
# Maps a frozenset of intervened nodes to the sample matrix of that setting;
# the empty frozenset is the observational setting.
sample_dict = dict()
for file in os.listdir(SACHS_DATA_FOLDER):
    samples = pd.read_csv(os.path.join(SACHS_DATA_FOLDER, file), sep=',')
    # Text after the first '=' with the last 4 characters dropped
    # (presumably the file extension -- TODO confirm the naming scheme).
    iv_str = file.split('=')[1][:-4]
    ivs = frozenset({int(iv_str)}) if iv_str != '' else frozenset()
    sample_dict[ivs] = samples.values
obs_samples = sample_dict[frozenset()]
# All settings stacked row-wise.
all_samples = np.concatenate(tuple(sample_dict.values()), axis=0)
suffstat = gauss_ci_suffstat(obs_samples)
# Correlation matrix and sample count of the pooled data.
suffstat_all = dict(C=np.corrcoef(all_samples, rowvar=False), n=all_samples.shape[0])
# One setting per non-observational entry of sample_dict.
setting_list = [
    {'known_interventions': iv_nodes}
    for iv_nodes, samples in sample_dict.items()
    if iv_nodes != frozenset()
]
# Interventional samples in the same order as setting_list.
iv_samples_list = [sample_dict[setting['known_interventions']] for setting in setting_list]
invariance_suffstat = gauss_invariance_suffstat(obs_samples, iv_samples_list)
# Raw samples keyed by position in iv_samples_list, plus the observational data.
hsic_invariance_suffstat = {iv: samples for iv, samples in enumerate(iv_samples_list)}
hsic_invariance_suffstat['obs_samples'] = obs_samples

# === RUN UNKNOWN TARGET IGSP WITH GAUSS CI
# NOTE(review): only the first statement of the loop body is visible here; the
# rest of the sweep presumably follows outside this excerpt.
for alpha in tqdm([1e-1, 1e-2, 1e-3, 2e-1, 3e-1, 4e-1, 5e-1, 5e-2]):
    alpha_i = 1e-20
# Profiling script: line-profiles `pcalg` over 100 runs on samples drawn from a
# random weighted DAG with 20 nodes.
from line_profiler import LineProfiler
import causaldag as cd
from causaldag.inference.structural import pcalg, skeleton
import numpy as np
from causaldag.utils.ci_tests import MemoizedCI_Tester, gauss_ci_suffstat, gauss_ci_test
import random
# Fixed seeds for both random number generators.
np.random.seed(1729)
random.seed(1729)

nnodes = 20
nodes = set(range(nnodes))
g = cd.rand.rand_weights(cd.rand.directed_erdos(nnodes, 3/(nnodes-1), 1))
# NOTE(review): this binds the `random` module itself and the name is not used
# below -- looks like an unfinished statement; confirm before relying on it.
iv_node = random
nsamples = 1000
samples = g.sample(nsamples)
suffstat = gauss_ci_suffstat(samples)

profiler = LineProfiler()


def run_pc():
    """Run `pcalg` 100 times, each time with a fresh memoized CI tester."""
    for i in range(100):
        # A new tester per iteration, so memoized results are not shared across runs.
        ci_tester = MemoizedCI_Tester(gauss_ci_test, suffstat)
        pcalg(nodes, ci_tester, max_cond_set=None, verbose=True)


# Collect line-level timings for pcalg while run_pc executes, then print them.
profiler.add_function(pcalg)
profiler.runcall(run_pc)
profiler.print_stats()
def dci_orient(
        X1,
        X2,
        skeleton: set,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
        If either rh1 or rh2 is None, both are recomputed from X1 and X2.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_skeleton

    Returns
    -------
    adjacency_matrix: array
        Sum of ``edges2adjacency(num_nodes, unoriented_edges, undirected=True)`` and
        ``edges2adjacency(num_nodes, oriented_edges, undirected=False)`` with
        ``num_nodes = X1.shape[1]``: edges whose direction could not be determined are
        encoded as undirected, oriented edges as directed.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    oriented_edges = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    for i, j in skeleton:
        # The candidate conditioning sets for i and for j are walked in lockstep.
        cond_set_pairs = zip(
            powerset(nodes_cond_set - {i}, r_max=max_set_size),
            powerset(nodes_cond_set - {j}, r_max=max_set_size),
        )
        for cond_i, cond_j in cond_set_pairs:
            # compute residual variances for i
            _, var1_i, _ = rh1.regression(i, list(cond_i))
            _, var2_i, _ = rh2.regression(i, list(cond_i))
            # compute (two-sided) p-value for invariance of residual variances for i
            pvalue_i = ncfdtr(n1 - len(cond_i), n2 - len(cond_i), 0, var1_i / var2_i)
            pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)

            # compute residual variances for j
            _, var1_j, _ = rh1.regression(j, list(cond_j))
            _, var2_j, _ = rh2.regression(j, list(cond_j))
            # compute (two-sided) p-value for invariance of residual variances for j
            pvalue_j = ncfdtr(n1 - len(cond_j), n2 - len(cond_j), 0, var1_j / var2_j)
            pvalue_j = 2 * min(pvalue_j, 1 - pvalue_j)

            if pvalue_i > alpha or pvalue_j > alpha:
                # orient the edge according to highest p-value
                if pvalue_i > pvalue_j:
                    edge = (j, i) if j in cond_i else (i, j)
                    pvalue_used = pvalue_i
                else:
                    edge = (i, j) if i in cond_j else (j, i)
                    pvalue_used = pvalue_j
                oriented_edges.add(edge)

                if verbose > 0:
                    print("Oriented (%d, %d) as %s since p-value=%.5f > alpha=%.5f" % (i, j, edge, pvalue_used, alpha))
                break

    # orient edges via graph traversal
    unoriented_edges_before_traversal = skeleton - oriented_edges - {(j, i) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    # The traversal graph holds only the edges oriented by the tests above; it is
    # deliberately not extended with edges oriented during the traversal itself.
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        # Only the existence of a directed path matters, so use a reachability
        # query instead of enumerating every simple path.
        if nx.has_path(g, i, j):
            oriented_edges.add((i, j))
            unoriented_edges.remove((i, j))
            if verbose > 0:
                print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (i, j)))
        elif nx.has_path(g, j, i):
            oriented_edges.add((j, i))
            unoriented_edges.remove((i, j))
            if verbose > 0:
                print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (j, i)))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + \
        edges2adjacency(num_nodes, oriented_edges, undirected=False)
    return adjacency_matrix
def dci_orient_order_independent(
        X1,
        X2,
        skeletons: Union[Dict[float, set], set],
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges of the difference-DAG skeleton, ranking candidate parent sets by p-value.

    For each parent-set size, every candidate (node, parent set) pair is scored by the
    two-sided p-value for invariance of the residual variance. Candidates with
    p-value > alpha are then applied from highest to lowest p-value, skipping any
    candidate that contradicts already-oriented edges or would create a cycle.
    Remaining edges are oriented by reachability where possible.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    skeletons: set or dict
        Either a set of edges, or a dict mapping a significance level to a set of edges.
        For a dict, the function is applied to each skeleton, using the dict key as
        ``alpha``, and a dict with the same keys is returned.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics for the first dataset. If either rh1 or rh2 is None,
        both are recomputed from X1 and X2.
    rh2: RegressionHelper, default = None
        Sufficient statistics for the second dataset.
    alpha: float, default = 0.1
        Significance level; candidate parent sets need a p-value above it.
    max_set_size: int, default = 3
        Parent sets of sizes 0 .. max_set_size + 1 are tried. If None, every size up
        to ``len(nodes_cond_set)`` is tried.
    verbose: int, default = 0
        The verbosity level of logging messages.

    Returns
    -------
    adjacency_matrix: array, or dict of arrays when ``skeletons`` is a dict
        Sum of the undirected adjacency of the edges left unoriented and the directed
        adjacency of the oriented edges, as built by ``edges2adjacency``.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    if isinstance(skeletons, dict):
        # One orientation per skeleton; the dict key is used as the significance level.
        return {
            skeleton_alpha: dci_orient_order_independent(
                X1,
                X2,
                skeleton,
                nodes_cond_set,
                rh1,
                rh2,
                alpha=skeleton_alpha,
                max_set_size=max_set_size,
                verbose=verbose
            )
            for skeleton_alpha, skeleton in skeletons.items()
        }

    skeleton = {frozenset({i, j}) for i, j in skeletons}
    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    d_nx = nx.DiGraph()
    d_nx.add_nodes_from(nodes)
    nodes_with_decided_parents = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    # `max_set_size=None` means "no limit"; a parent set can never be larger than
    # nodes_cond_set, so that is the largest size worth trying.
    largest_parent_set = len(nodes_cond_set) if max_set_size is None else max_set_size + 1

    for parent_set_size in range(largest_parent_set + 1):
        if verbose > 0:
            print(f"Trying parent sets of size {parent_set_size}")

        pvalue_dict = dict()
        for i in nodes - nodes_with_decided_parents:
            for cond_i in itertools.combinations(nodes_cond_set - {i}, parent_set_size):
                _, var1_i, _ = rh1.regression(i, list(cond_i))
                _, var2_i, _ = rh2.regression(i, list(cond_i))
                # two-sided p-value for invariance of the residual variance of i
                pvalue_i = ncfdtr(n1 - parent_set_size, n2 - parent_set_size, 0, var1_i / var2_i)
                pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)
                pvalue_dict[(i, frozenset(cond_i))] = pvalue_i

        # candidates above the significance level, highest p-value first
        ranked_candidates = [
            (i, cond_i)
            for (i, cond_i), pvalue in sorted(pvalue_dict.items(), key=op.itemgetter(1), reverse=True)
            if pvalue > alpha
        ]

        for i, cond_i in ranked_candidates:
            i_children = {j for j in nodes - cond_i - {i} if frozenset({i, j}) in skeleton}

            # don't use this parent set if it contradicts the existing edges
            if any(d_nx.has_edge(i, j) for j in cond_i):
                continue
            if any(d_nx.has_edge(j, i) for j in i_children):
                continue

            # don't use this parent set if it creates a cycle
            if not nx.descendants(d_nx, i).isdisjoint(cond_i):
                continue
            if not nx.ancestors(d_nx, i).isdisjoint(i_children):
                continue

            # skeleton neighbours in the parent set point into i, all others out of i
            edges = {(j, i) for j in cond_i if frozenset({i, j}) in skeleton} | \
                    {(i, j) for j in i_children}
            nodes_with_decided_parents.add(i)

            if verbose > 0:
                print(f"Adding {edges}")
            d_nx.add_edges_from(edges)

    # orient edges via graph traversal
    oriented_edges = set(d_nx.edges)
    unoriented_edges_before_traversal = skeleton - {frozenset({j, i}) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    # The traversal graph holds only the edges oriented above; it is deliberately
    # not extended with edges oriented during the traversal itself.
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        # Only the existence of a directed path matters, so use a reachability
        # query instead of enumerating every simple path.
        if nx.has_path(g, i, j):
            oriented_edges.add((i, j))
            unoriented_edges.remove(frozenset({i, j}))
            if verbose > 0:
                print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (i, j)))
        elif nx.has_path(g, j, i):
            oriented_edges.add((j, i))
            unoriented_edges.remove(frozenset({i, j}))
            if verbose > 0:
                print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (j, i)))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + \
        edges2adjacency(num_nodes, oriented_edges, undirected=False)
    return adjacency_matrix
def dci_skeleton(
        X1,
        X2,
        difference_ug: list,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False
):
    """
    Estimates the skeleton of the difference-DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
        If either rh1 or rh2 is None, both are recomputed from X1 and X2.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha results in sparser difference graph.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Amount of regularization for regression (becomes ridge regression if nonzero).
        Applied to the regressions in both directions.
    progress: bool, default = False
        If True and difference_ug is non-empty, iterate over the edges through ``tqdm``.

    See Also
    --------
    dci, dci_undirected_graph, dci_orient

    Returns
    -------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    skeleton = {(i, j) for i, j in difference_ug}

    difference_ug = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug
    for i, j in difference_ug:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i, lam=lam)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i, lam=lam)

            # compute statistic and p-value
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)

            # remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > alpha
            if i_invariant:
                if verbose > 1:
                    print(
                        f"Removing edge {j}->{i} since p-value={pval_i:.5f} > alpha={alpha:.5f} with cond set {cond_set_i}")
                # discard: a duplicated edge in difference_ug must not raise KeyError
                skeleton.discard((i, j))
                break
            elif verbose > 1:
                print(
                    f"Keeping edge {i}-{j} for now, since p-value={pval_i:.5f} < alpha={alpha:.5f} with cond set {cond_set_i}")

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets;
            # use the same regularization as for the i-regressions above
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j, lam=lam)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j, lam=lam)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)

            # remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > alpha
            if j_invariant:
                if verbose > 1:
                    print(
                        f"Removing edge {i}->{j} since p-value={pval_j:.5f} > alpha={alpha:.5f} with cond set {cond_set_j}")
                skeleton.discard((i, j))
                break
            elif verbose > 1:
                print(
                    f"Keeping edge {i}-{j} for now, since p-value={pval_j:.5f} < alpha={alpha:.5f} with cond set {cond_set_j}")

    return skeleton
def dci(
        X1,
        X2,
        alpha_ug: float = 1.0,
        alpha_skeleton: float = 0.1,
        alpha_orient: float = 0.1,
        max_set_size: Optional[int] = 3,
        difference_ug: list = None,
        nodes_cond_set: set = None,
        max_iter: int = 1000,
        edge_threshold: float = 0,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False,
        order_independent: bool = True
):
    """
    Uses the Difference Causal Inference (DCI) algorithm to estimate the difference-DAG between two settings.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    alpha_ug: float, default = 1.0
        L1 regularization parameter for estimating the difference undirected graph via KLIEP algorithm.
    alpha_skeleton: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha_skeleton results in sparser difference graph.
    alpha_orient: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha_orient results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended max_set_size is 3.
        If None, conditioning sets of all sizes will be used.
    difference_ug: list, default = None
        List of tuples that represents edges in the difference undirected graph. If difference_ug is None,
        KLIEP algorithm for estimating the difference undirected graph will be run.
        If the number of nodes is small, difference_ug could be taken to be the complete graph between all the nodes.
        Note that a supplied difference_ug is only used when nodes_cond_set is supplied as well;
        if either one is None, both are re-estimated with KLIEP.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    max_iter: int, default = 1000
        Maximum number of iterations for gradient descent in KLIEP algorithm.
    edge_threshold: float, default = 0
        Edge weight cutoff for keeping an edge for KLIEP algorithm (all edges above or equal to this threshold are kept).
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Amount of regularization for regression (becomes ridge regression if nonzero).
    progress: bool, default = False
        Forwarded to dci_skeleton, which shows a progress bar over the edges when True.
    order_independent: bool, default = True
        If True, edges are oriented with dci_orient_order_independent, otherwise with dci_orient.

    See Also
    --------
    dci_undirected_graph, dci_skeleton, dci_orient

    Returns
    -------
    adjacency_matrix: array, shape  = [n_features, n_features]
        Estimated difference-DAG. Edges that were found to be different between two settings but the orientation
        could not be determined, are represented by assigning 1 in both directions, i.e. adjacency_matrix[i,j] = 1
        and adjacency_matrix[j,i] = 1. Otherwise for oriented edges, only adjacency_matrix[i,j] = 1 is assigned.
        Assignment of 0 in the adjacency matrix represents no edge.

    References
    ----------
        [1] Wang, Y., Squires, C., Belyaeva, A., & Uhler, C. (2018). Direct estimation of differences in causal graphs.
        In Advances in Neural Information Processing Systems (pp. 3770-3781).
    """
    assert 0 <= alpha_skeleton <= 1, "alpha_skeleton must be in [0,1] range."
    assert 0 <= alpha_orient <= 1, "alpha_orient must be in [0,1] range."

    # obtain sufficient statistics once and share them between the skeleton and orientation phases
    suffstat1 = gauss_ci_suffstat(X1)
    suffstat2 = gauss_ci_suffstat(X2)
    rh1 = RegressionHelper(suffstat1)
    rh2 = RegressionHelper(suffstat2)

    # compute the difference undirected graph via KLIEP if the difference_ug
    # (or the set of conditioning nodes) is not provided
    if difference_ug is None or nodes_cond_set is None:
        difference_ug, nodes_cond_set = dci_undirected_graph(
            X1,
            X2,
            alpha=alpha_ug,
            max_iter=max_iter,
            edge_threshold=edge_threshold,
            verbose=verbose
        )
        if verbose > 0:
            print(f"{len(difference_ug)} edges in the difference UG, over {len(nodes_cond_set)} nodes")

    # estimate the skeleton of the difference-DAG
    skeleton = dci_skeleton(
        X1,
        X2,
        difference_ug,
        nodes_cond_set,
        rh1=rh1,
        rh2=rh2,
        alpha=alpha_skeleton,
        max_set_size=max_set_size,
        verbose=verbose,
        lam=lam,
        progress=progress
    )
    if verbose > 0:
        print(f"{len(skeleton)} edges in the difference skeleton")

    # orient edges of the skeleton of the difference-DAG
    orient_algorithm = dci_orient_order_independent if order_independent else dci_orient
    adjacency_matrix = orient_algorithm(
        X1,
        X2,
        skeleton,
        nodes_cond_set,
        rh1=rh1,
        rh2=rh2,
        alpha=alpha_orient,
        max_set_size=max_set_size,
        verbose=verbose
    )

    return adjacency_matrix
def dci_skeleton_multiple(
        X1,
        X2,
        alpha_skeleton_grid: list = [0.1, 0.5],
        max_set_size: int = 3,
        difference_ug: list = None,
        nodes_cond_set: set = None,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False,
        true_diff: Optional[Set] = None
):
    """
    Estimates the skeleton of the difference-DAG for several significance levels at once.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    alpha_skeleton_grid: list, default = [0.1, 0.5]
        Significance levels, each in [0, 1]. One skeleton is produced per level.
        The list is only read, never modified.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics for the first dataset. If either rh1 or rh2 is None,
        both are recomputed from X1 and X2.
    rh2: RegressionHelper, default = None
        Sufficient statistics for the second dataset.
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Amount of regularization for regression, applied to the regressions in both directions.
    progress: bool, default = False
        If True and difference_ug is non-empty, iterate over the edges through ``tqdm``.
    true_diff: set, default = None
        Optional set of reference edges. When given, a message is printed (regardless of
        ``verbose``) whenever an edge contained in it, in either direction, is removed.

    Returns
    -------
    skeletons: dict
        Maps each significance level of alpha_skeleton_grid to the set of edges kept at that level.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    for alpha in alpha_skeleton_grid:
        assert 0 <= alpha <= 1, "alpha must be in [0,1] range."
    min_alpha = min(alpha_skeleton_grid)

    # every skeleton starts out with all edges of the difference UG
    skeletons = {alpha: {(i, j) for i, j in difference_ug} for alpha in alpha_skeleton_grid}

    difference_ug = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug
    for i, j in difference_ug:
        # Evaluated once per edge. The parentheses matter: without them the
        # membership test would also run (and fail) when true_diff is None.
        in_true_diff = true_diff is not None and ((i, j) in true_diff or (j, i) in true_diff)

        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i, lam=lam)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i, lam=lam)

            # compute statistic and p-value
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)

            # remove i-j from the skeletons for which i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > min_alpha
            if i_invariant:
                removed_alphas = [alpha for alpha in alpha_skeleton_grid if pval_i > alpha]
                if verbose > 1:
                    print(
                        f"Removing edge {j}->{i} for alpha={removed_alphas} since p-value={pval_i:.5f} with cond set {cond_set_i}")
                for alpha in removed_alphas:
                    skeletons[alpha].discard((i, j))
                if in_true_diff:
                    print(
                        f"Incorrectly removing edge {j}->{i} for alpha={removed_alphas} since p-value={pval_i:.6f} with cond set {cond_set_i}")
                # stop testing this edge once it is gone from every skeleton
                if len(removed_alphas) == len(alpha_skeleton_grid):
                    break
            elif verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval_i:.5f} with cond set {cond_set_i}")

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets;
            # use the same regularization as for the i-regressions above
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j, lam=lam)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j, lam=lam)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)

            # remove i-j from the skeletons for which j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > min_alpha
            if j_invariant:
                removed_alphas = [alpha for alpha in alpha_skeleton_grid if pval_j > alpha]
                if verbose > 1:
                    print(
                        f"Removing edge {i}->{j} for alpha={removed_alphas} since p-value={pval_j:.5f} with cond set {cond_set_j}")
                for alpha in removed_alphas:
                    skeletons[alpha].discard((i, j))
                if in_true_diff:
                    print(
                        f"Incorrectly removing edge {i}->{j} for alpha={removed_alphas} since p-value={pval_j:.6f} with cond set {cond_set_j}")
                if len(removed_alphas) == len(alpha_skeleton_grid):
                    break
            elif verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval_j:.5f} with cond set {cond_set_j}")

    return skeletons
# Script: for random DAGs of increasing expected neighbourhood size, build an
# I-MAP from a random permutation and plot, per DAG, the true maximum in-degree
# against the largest conditioning set the memoized CI tester was asked about.
from causaldag.utils.ci_tests import MemoizedCI_Tester, gauss_ci_test, gauss_ci_suffstat
import numpy as np
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

nnodes = 50
# Five graphs for each expected number of neighbours in {2, 3, 4, 5}.
exp_nbrs_list = [2]*5 + [3]*5 + [4]*5 + [5]*5
ngraphs = len(exp_nbrs_list)
nsamples = 2*nnodes
# NOTE(review): `cd` is not imported in the visible part of this script --
# presumably `import causaldag as cd` exists elsewhere; confirm.
dags = [cd.rand.directed_erdos(nnodes, exp_nbrs/(nnodes-1)) for exp_nbrs in exp_nbrs_list]
gdags = [cd.rand.rand_weights(dag) for dag in dags]
samples = [gdag.sample(nsamples) for gdag in gdags]
# The loop variable reuses the name `samples`; the outer list is evaluated
# before the comprehension starts iterating.
suffstats = [gauss_ci_suffstat(samples) for samples in samples]
ci_testers1 = [MemoizedCI_Tester(gauss_ci_test, suffstat) for suffstat in suffstats]
# One random permutation of the nodes per graph.
perms = [random.sample(list(range(nnodes)), nnodes) for _ in range(ngraphs)]
imaps1 = list(tqdm((cd.permutation2dag(perm, ci_tester, verbose=False) for perm, ci_tester in zip(perms, ci_testers1)), total=ngraphs))

true_max_degrees = [dag.max_in_degree for dag in dags]
# Last component of every key in each tester's `ci_dict`
# (presumably the conditioning set of the memoized test -- TODO confirm key layout).
ci_tests_per_dag = [list(zip(*ci_tester.ci_dict.keys()))[-1] for ci_tester in ci_testers1]
ci_tests_sizes = [np.array([len(ci_test) for ci_test in ci_tests]) for ci_tests in ci_tests_per_dag]
max_ci_test_sizes = [sizes.max() for sizes in ci_tests_sizes]

plt.clf()
plt.scatter(true_max_degrees, max_ci_test_sizes)
# The x values are maximum in-degrees (see true_max_degrees above).
plt.xlabel('Max degree')
plt.ylabel('Max test size')
# plt.ion()
# plt.show()
def get_bs_dags(num_bs,
                obs_samples,
                nsamples_obs,
                nnodes,
                cheat_cpdag=None,
                bic=True):
    """
    Build a weighted list of DAGs from bootstrap resamples of observational data.

    Each round learns a DAG from a bootstrap resample (or, on the very first
    round, takes `cheat_cpdag` if it is a numpy array), expands its CPDAG into
    member DAGs and adds them to a multiset. Afterwards the weights are replaced
    by normalized posterior weights, entries with weight below 0.001 are dropped
    and the remaining weights are renormalized.

    Parameters
    ----------
    num_bs:
        Number of bootstrap DAGs to collect.
    obs_samples:
        Observational samples; indexed with an array of row indices.
    nsamples_obs:
        Number of observational samples; also the size of each bootstrap resample.
    nnodes:
        Number of nodes.
    cheat_cpdag:
        For debugging and for experiments that allow access to the MEC: if this
        is a numpy array, it is used as the CPDAG of the first round.
    bic:
        If True, subtract a BIC-style penalty (number of edges times
        log(nsamples_obs) / 2) from each log-weight.

    Returns
    -------
    List of dicts with keys 'dag', 'A', 'b', 'w' and 'count'.
    """
    #subsample data in DAG bootstrap, and learn the DAG + MLE estimates of parameters
    bs_dags = []  # a list of the dags we get from the bootstrap
    bs_index = {}  #a mapping from dag string to index in the list
    count_dags = 0  #number unique dags
    total_dags = 0  #number of dags
    samples_per_bs = nsamples_obs
    nodes = set(range(nnodes))
    while total_dags < num_bs:
        if total_dags == 0 and isinstance(cheat_cpdag, np.ndarray):
            est_cpdag = cheat_cpdag
        else:
            # draw row indices with replacement
            bs_i = np.random.choice(nsamples_obs, samples_per_bs, replace=True)
            bs_data = obs_samples[bs_i]
            #from this sample learn the DAG and an MLE of the parameters
            obs_suffstat = gauss_ci_suffstat(bs_data)
            # no interventional data: the invariance statistics get an empty list
            invariance_suffstat = gauss_invariance_suffstat(obs_samples, [])
            alpha = 1e-3
            alpha_inv = 1e-3
            ci_tester = MemoizedCI_Tester(gauss_ci_test,
                                          obs_suffstat,
                                          alpha=alpha)
            invariance_tester = MemoizedInvarianceTester(gauss_invariance_test,
                                                         invariance_suffstat,
                                                         alpha=alpha_inv)
            setting_list = []
            est_dag, est_targets_list = unknown_target_igsp(setting_list,
                                                            nodes,
                                                            ci_tester,
                                                            invariance_tester,
                                                            nruns=5,
                                                            depth=None)
            est_dag = est_dag.to_amat()[0]
            est_cpdag = main.cpdag_from_dag_observational(est_dag)
        if mec_size.mec_size(est_cpdag) <= num_bs:
            #now compute the mec and add all mec members
            mec_dags = mec_size.enumerate_dags(est_cpdag)
        else:
            #get just enough dags if mec too big
            mec_dags = mec_size.uniform_sample_dag_plural(est_cpdag,
                                                          num_bs - len(bs_dags),
                                                          exact=False)
        # NOTE(review): the nesting of the counter update and the early exit below
        # was reconstructed from a flattened source -- confirm against the original.
        for est_dag in mec_dags:
            # DAGs are identified by the raw bytes of their matrix
            if est_dag.tobytes() in bs_index:
                #increase weight by one if we double count
                bs_dags[bs_index[est_dag.tobytes()]]['w'] += (1 / num_bs)
                bs_dags[bs_index[est_dag.tobytes()]]['count'] += 1  #count is the number of times the dag appears in the multiset
            else:
                A, b = finite.get_weights_MLE(est_dag, obs_samples)
                bs_dags.append({
                    'dag': est_dag,
                    'A': A,
                    'b': b,
                    'w': (1 / num_bs),
                    'count': 1
                })
                bs_index[est_dag.tobytes()] = count_dags
                count_dags += 1
            total_dags += 1
            if total_dags > num_bs:
                break
    #for now we've just made all the weights 1/num_dags
    #now correct weights by computing the posterior of each DAG
    T = len(bs_dags)
    logPy = finite.llhood(
        [obs_samples], [[]], bs_dags,
        (np.zeros(nnodes), 0))  #getting the likelihood of the observations
    #print(logPy)
    weighted_logPy = np.zeros(T)
    for i in range(T):
        #use the BIC
        weighted_logPy[i] = logPy[i] + np.log(
            bs_dags[i]
            ['w'])  #- np.sum(bs_dags[i]['dag']) * np.log(nsamples_obs) / 2
        if bic:
            weighted_logPy[i] = weighted_logPy[i] - np.sum(
                bs_dags[i]['dag']) * np.log(nsamples_obs) / 2
    # log of the normalizing constant
    denom = logsumexp(weighted_logPy)
    #now set w for each DAG to be the posterior
    for i in range(T):
        bs_dags[i]['w'] = np.exp(weighted_logPy[i] - denom)
    #remove all the tiny weights and renormalize
    w_sum = 0
    bs_dags_pruned = []
    for i in range(T):
        if bs_dags[i]['w'] >= 0.001:
            bs_dags_pruned.append(bs_dags[i])
            w_sum += bs_dags[i]['w']
    T = len(bs_dags_pruned)
    for i in range(T):
        bs_dags_pruned[i]['w'] = bs_dags_pruned[i]['w'] / w_sum
    return bs_dags_pruned
def _run_utigsp(dag_num):
    """
    Run unknown-target IGSP for one sampled DAG, caching the results on disk.

    The estimated adjacency matrix is stored as ``.npy`` under
    ``<sample_folder>/estimates/utigsp`` and the learned intervention targets as
    JSON next to it (``<filename>_learned_intervention_targets.json``). If the
    ``.npy`` file already exists and ``overwrite`` is falsy, both cached files
    are loaded instead of re-running the algorithm. All settings (``nruns``,
    ``depth``, ``alpha``, ``no_targets``, ...) are read from the enclosing scope.

    Parameters
    ----------
    dag_num:
        Index into ``sample_folders`` selecting which DAG's samples to use.

    Returns
    -------
    (est_dag, learned_intervention_targets). When loaded from the cache, the
    targets are the lists stored in the JSON file.
    """
    # === GENERATE FILENAME
    sample_folder = sample_folders[dag_num]
    alg_folder = os.path.join(sample_folder, 'estimates', 'utigsp')
    os.makedirs(alg_folder, exist_ok=True)
    no_target_str = '_no_targets' if no_targets else ''
    filename = os.path.join(
        alg_folder,
        f'nruns=%d,depth=%d,alpha=%.2e,alpha_invariant=%.2e{no_target_str}.npy' %
        (nruns, depth, alpha, alpha_invariant))
    targets_filename = filename + '_learned_intervention_targets.json'

    # === RUN ALGORITHM
    if not os.path.exists(filename) or overwrite:
        obs_samples, setting_list, _ = get_dag_samples(
            ndags, nnodes, nneighbors, nsamples, nsettings, num_known, num_unknown,
            intervention, dag_num, nonlinear=nonlinear)

        # The original code built exactly the same Gaussian testers in both the
        # `nonlinear` and the linear branch, so a single code path is enough.
        suffstat = gauss_ci_suffstat(obs_samples)
        suffstat_inv = gauss_invariance_suffstat(
            obs_samples, [setting['samples'] for setting in setting_list])
        ci_tester = MemoizedCI_Tester(gauss_ci_test, suffstat, alpha=alpha)
        inv_tester = MemoizedInvarianceTester(
            gauss_invariance_test, suffstat_inv, alpha=alpha_invariant)

        est_dag, learned_intervention_targets = unknown_target_igsp(
            [{'known_interventions': setting['known_interventions']}
             for setting in setting_list],
            nodes,
            ci_tester,
            inv_tester,
            depth=depth,
            nruns=nruns,
            no_targets=no_targets)

        np.save(filename, est_dag.to_amat()[0])
        # Build the JSON payload before opening the file, then write it through a
        # context manager so the handle is always closed.
        targets_payload = list(map(list, learned_intervention_targets))
        with open(targets_filename, 'w') as targets_file:
            json.dump(targets_payload, targets_file)
        return est_dag, learned_intervention_targets

    with open(targets_filename) as targets_file:
        learned_intervention_targets = json.load(targets_file)
    return cd.DAG.from_amat(np.load(filename)), learned_intervention_targets