def to_maximal(self):
    """Make this graph maximal.

    For every unordered pair of nodes with no edge of any kind between
    them, test m-separation given each subset of the remaining nodes;
    if no subset m-separates the pair, join them with a bidirected edge.
    """
    for u, v in itr.combinations(self._nodes, r=2):
        if self.has_any_edge(u, v):
            continue
        rest = self._nodes - {u, v}
        # u <-> v is required exactly when no conditioning set separates them
        if not any(self.msep(u, v, S) for S in core_utils.powerset(rest)):
            self.add_bidirected(u, v)
def _is_i_contradicting(i, j, dag):
    """
    i -> j is I-contradicting if either:
    1) there exists S, a subset of the neighbors of j besides i, s.t.
    f^I(j|S) = f(j|S) for all I containing i but not j
    2) there exists I with j \in I but i \not\in I, s.t.
    f^I(i|S) \not\eq f(i|S) for all subsets S of the neighbors of i besides j

    If there are only single node interventions, this condition becomes:
    1) {i} \in I and f^{i}(j) = f(j)
    or
    2) {j} \in I and f^{j}(i) \neq f(i)
    """
    # NOTE: relies on closure variables from the enclosing scope:
    # only_single_node, interventions2setting_nums, invariance_tester,
    # setting_list, powerset.
    if only_single_node:
        # Condition 1 (single-node form): i was intervened on and j's
        # conditional stayed invariant.
        setting_num_i = interventions2setting_nums.get(frozenset({i}))
        if setting_num_i is not None and invariance_tester.is_invariant(
                j, context=setting_num_i):
            return True
        # Condition 2 (single-node form): j was intervened on and i's
        # conditional changed.
        setting_num_j = interventions2setting_nums.get(frozenset({j}))
        if setting_num_j is not None and not invariance_tester.is_invariant(
                i, context=setting_num_j):
            return True
        return False
    else:
        # === TEST CONDITION 1
        # NOTE(review): the docstring's condition 1 asks for *invariance*
        # (f^I(j|S) = f(j|S)) for all qualifying I, but this returns True on
        # the first NON-invariant result — opposite of the single-node branch
        # above. Suspected inverted check; confirm intended semantics.
        neighbors_j = dag.neighbors_of(j) - {i}
        for s in powerset(neighbors_j):
            for setting_num, setting in enumerate(setting_list):
                if i in setting['interventions'] and j not in setting[
                        'interventions']:
                    if not invariance_tester.is_invariant(
                            j, context=setting_num, cond_set=s):
                        return True

        # === TEST CONDITION 2
        neighbors_i = dag.neighbors_of(i) - {j}
        for setting_num, setting in enumerate(setting_list):
            if j in setting['interventions'] and i not in setting[
                    'interventions']:
                # NOTE(review): the name says "varies", but this computes
                # all(is_invariant(...)) — i.e. "i is always invariant".
                # The docstring (and the single-node branch) suggest the
                # intent is all(NOT is_invariant(...)); confirm before fixing.
                i_always_varies = all(
                    invariance_tester.is_invariant(
                        i, context=setting_num, cond_set=s)
                    for s in powerset(neighbors_i))
                if i_always_varies:
                    return True

        return False
def perm2dag_subsets(perm, ci_tester, max_subset_size=None):
    """
    Recover a DAG consistent with the node ordering ``perm`` by searching
    over candidate parent subsets.

    Not recommended unless max_subset_size set very small. Not thoroughly tested.

    Parameters
    ----------
    perm:
        A (presumed topological) ordering of the nodes.
    ci_tester:
        Conditional independence tester exposing ``is_ci(i, j, cond_set)``.
    max_subset_size:
        Maximum size of a candidate parent set (None = unbounded).

    Returns
    -------
    DAG with an arc from each chosen parent to each node.
    """
    arcs = set()
    nodes = set(perm)
    for i, pi_i in enumerate(perm):
        # Candidate parents of the node at position i are drawn from its
        # predecessors in the ordering; accept the first (smallest-first)
        # set that renders pi_i independent of every non-parent.
        for candidate_parent_set in powerset(perm[:i], r_max=max_subset_size):
            # BUGFIX: use the node pi_i = perm[i], not the position index i,
            # in CI tests and arcs (pi_i was unpacked but never used).
            if all(
                    ci_tester.is_ci(pi_i, j, candidate_parent_set)
                    for j in nodes - {pi_i} - set(candidate_parent_set)
            ):
                arcs.update({(parent, pi_i) for parent in candidate_parent_set})
                break
    return DAG(nodes=nodes, arcs=arcs)
def dci_orient(
        X1, X2,
        skeleton: set,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_skeleton

    Returns
    -------
    adjacency_matrix: array, shape = [n_features, n_features]
        Adjacency matrix of the difference-DAG combining the edges whose
        direction could be determined (directed entries) with those whose
        direction could not (undirected, i.e. symmetric, entries).
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    oriented_edges = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']
    for i, j in skeleton:
        # NOTE(review): the two powersets are paired positionally by zip;
        # this assumes matching iteration order over two different sets —
        # confirm this pairing is intentional.
        for cond_i, cond_j in zip(powerset(nodes_cond_set - {i}, r_max=max_set_size),
                                  powerset(nodes_cond_set - {j}, r_max=max_set_size)):
            # compute residual variances for i
            beta1_i, var1_i, _ = rh1.regression(i, list(cond_i))
            beta2_i, var2_i, _ = rh2.regression(i, list(cond_i))
            # compute p-value for invariance of residual variances for i
            # (two-sided F-test via the noncentral-F CDF with zero noncentrality)
            pvalue_i = ncfdtr(n1 - len(cond_i), n2 - len(cond_i), 0, var1_i / var2_i)
            pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)
            # compute residual variances for j
            beta1_j, var1_j, _ = rh1.regression(j, list(cond_j))
            beta2_j, var2_j, _ = rh2.regression(j, list(cond_j))
            # compute p-value for invariance of residual variances for j
            pvalue_j = ncfdtr(n1 - len(cond_j), n2 - len(cond_j), 0, var1_j / var2_j)
            pvalue_j = 2 * min(pvalue_j, 1 - pvalue_j)

            if ((pvalue_i > alpha) | (pvalue_j > alpha)):
                # orient the edge according to highest p-value
                if pvalue_i > pvalue_j:
                    edge = (j, i) if j in cond_i else (i, j)
                    pvalue_used = pvalue_i
                else:
                    edge = (i, j) if i in cond_j else (j, i)
                    pvalue_used = pvalue_j
                oriented_edges.add(edge)

                if verbose > 0:
                    print("Oriented (%d, %d) as %s since p-value=%.5f > alpha=%.5f" % (i, j, edge, pvalue_used, alpha))
                break

    # orient edges via graph traversal: an undirected edge i-j is oriented
    # i->j if a directed path i ~> j already exists among the oriented edges
    unoriented_edges_before_traversal = skeleton - oriented_edges - {(j, i) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        chain_path = list(nx.all_simple_paths(g, source=i, target=j))
        if len(chain_path) > 0:
            oriented_edges.add((i, j))
            unoriented_edges.remove((i, j))
            if verbose > 0:
                print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (i, j)))
        else:
            chain_path = list(nx.all_simple_paths(g, source=j, target=i))
            if len(chain_path) > 0:
                oriented_edges.add((j, i))
                unoriented_edges.remove((i, j))
                if verbose > 0:
                    print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (j, i)))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + edges2adjacency(num_nodes, oriented_edges, undirected=False)
    return adjacency_matrix
def dci_skeleton(
        X1, X2,
        difference_ug: list,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False
):
    """
    Estimates the skeleton of the difference-DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha results in sparser difference graph.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Amount of regularization for regression (becomes ridge regression if nonzero).
    progress: bool, default = False
        Whether to show a progress bar over the edges of the difference UG.

    See Also
    --------
    dci, dci_undirected_graph, dci_orient

    Returns
    -------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    skeleton = {(i, j) for i, j in difference_ug}

    difference_ug = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug
    for i, j in difference_ug:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i, lam=lam)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i, lam=lam)

            # compute statistic and p-value (Wald-type test on the difference
            # of the regression coefficients of j)
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                     inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)

            # remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > alpha
            if i_invariant:
                if verbose > 1:
                    print(
                        f"Removing edge {j}->{i} since p-value={pval_i:.5f} > alpha={alpha:.5f} with cond set {cond_set_i}")
                skeleton.remove((i, j))
                break
            elif verbose > 1:
                print(
                    f"Keeping edge {i}-{j} for now, since p-value={pval_i:.5f} < alpha={alpha:.5f} with cond set {cond_set_i}")

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets
            # BUGFIX: pass lam=lam here too; the original omitted it, so the
            # ridge parameter was silently ignored for the j-direction tests.
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j, lam=lam)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j, lam=lam)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                     inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)

            # remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > alpha
            if j_invariant:
                if verbose > 1:
                    print(
                        f"Removing edge {i}->{j} since p-value={pval_j:.5f} > alpha={alpha:.5f} with cond set {cond_set_j}")
                skeleton.remove((i, j))
                break
            elif verbose > 1:
                print(
                    f"Keeping edge {i}-{j} for now, since p-value={pval_j:.5f} < alpha={alpha:.5f} with cond set {cond_set_j}")

    return skeleton
def dci_skeleton_multiple(
        X1, X2,
        alpha_skeleton_grid: list = [0.1, 0.5],
        max_set_size: int = 3,
        difference_ug: list = None,
        nodes_cond_set: set = None,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False,
        true_diff: Optional[Set] = None
):
    """
    Estimates the skeleton of the difference-DAG for each significance level
    in ``alpha_skeleton_grid``, sharing the regression computations across
    all levels.

    Parameters are as in ``dci_skeleton``, except:

    alpha_skeleton_grid: list, default = [0.1, 0.5]
        Significance levels at which to estimate skeletons.
    true_diff: Optional[Set], default = None
        If given, the true difference edges; incorrect removals are logged.

    Returns
    -------
    skeletons: dict
        Maps each alpha in ``alpha_skeleton_grid`` to its estimated skeleton
        (a set of edge tuples).
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    for alpha in alpha_skeleton_grid:
        assert 0 <= alpha <= 1, "alpha must be in [0,1] range."
    min_alpha = min(alpha_skeleton_grid)

    skeletons = {alpha: {(i, j) for i, j in difference_ug} for alpha in alpha_skeleton_grid}

    difference_ug = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug
    for i, j in difference_ug:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i, lam=lam)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i, lam=lam)

            # compute statistic and p-value
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                     inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)

            # remove i-j from the skeletons of all alphas below pval_i
            i_invariant = pval_i > min_alpha
            if i_invariant:
                removed_alphas = [alpha for alpha in alpha_skeleton_grid if pval_i > alpha]
                if verbose > 1:
                    print(
                        f"Removing edge {j}->{i} for alpha={removed_alphas} since p-value={pval_i:.5f} with cond set {cond_set_i}")
                for alpha in removed_alphas:
                    skeletons[alpha].discard((i, j))
                # BUGFIX: parenthesize the `or`; previously `and` bound
                # tighter, so a None true_diff crashed on `(j, i) in true_diff`.
                if true_diff is not None and ((i, j) in true_diff or (j, i) in true_diff):
                    print(
                        f"Incorrectly removing edge {j}->{i} for alpha={removed_alphas} since p-value={pval_i:.6f} with cond set {cond_set_i}")
                if len(removed_alphas) == len(alpha_skeleton_grid):
                    break
            elif verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval_i:.5f} with cond set {cond_set_i}")

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets
            # BUGFIX: pass lam=lam, matching the i-branch above.
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j, lam=lam)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j, lam=lam)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                     inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)

            # remove i-j from the skeletons of all alphas below pval_j
            j_invariant = pval_j > min_alpha
            if j_invariant:
                removed_alphas = [alpha for alpha in alpha_skeleton_grid if pval_j > alpha]
                if verbose > 1:
                    print(
                        f"Removing edge {i}->{j} for alpha={removed_alphas} since p-value={pval_j:.5f} with cond set {cond_set_j}")
                for alpha in removed_alphas:
                    skeletons[alpha].discard((i, j))
                # BUGFIX: same precedence fix as above; also report the
                # j-direction edge/p-value/cond-set (was copied from i-branch).
                if true_diff is not None and ((i, j) in true_diff or (j, i) in true_diff):
                    print(
                        f"Incorrectly removing edge {i}->{j} for alpha={removed_alphas} since p-value={pval_j:.6f} with cond set {cond_set_j}")
                if len(removed_alphas) == len(alpha_skeleton_grid):
                    break
            elif verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval_j:.5f} with cond set {cond_set_j}")

    return skeletons
def dci_orient(
        skeleton: set,
        rh1: RegressionHelper,
        rh2: RegressionHelper,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG.

    Parameters
    ----------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    rh1: RegressionHelper
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_skeleton

    Returns
    -------
    oriented_edges: set
        Set of edges in the skeleton of the difference-DAG for which directionality could be determined.
    unoriented_edges: set
        Set of edges in the skeleton of the difference-DAG for which directionality could not be determined.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    oriented_edges = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']
    for i, j in skeleton:
        # NOTE(review): powersets of two different sets are paired
        # positionally by zip — confirm this pairing is intentional.
        for cond_i, cond_j in zip(powerset(nodes - {i}, r_max=max_set_size), powerset(nodes - {j}, r_max=max_set_size)):
            # compute residual variances for i
            # NOTE(review): cond_i/cond_j are passed as-is (not list()-wrapped
            # as in the array-based dci_orient) — assumes regression() accepts
            # whatever powerset yields; verify.
            beta1_i, var1_i, _ = rh1.regression(i, cond_i)
            beta2_i, var2_i, _ = rh2.regression(i, cond_i)
            # compute p-value for invariance of residual variances for i
            # (two-sided F-test via the noncentral-F CDF with zero noncentrality)
            pvalue_i = ncfdtr(n1 - len(cond_i), n2 - len(cond_i), 0, var1_i / var2_i)
            pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)
            # compute residual variances for j
            beta1_j, var1_j, _ = rh1.regression(j, cond_j)
            beta2_j, var2_j, _ = rh2.regression(j, cond_j)
            # compute p-value for invariance of residual variances for j
            pvalue_j = ncfdtr(n1 - len(cond_j), n2 - len(cond_j), 0, var1_j / var2_j)
            pvalue_j = 2 * min(pvalue_j, 1 - pvalue_j)

            if ((pvalue_i > alpha) | (pvalue_j > alpha)):
                # orient the edge according to highest p-value
                if pvalue_i > pvalue_j:
                    edge = (j, i) if j in cond_i else (i, j)
                else:
                    edge = (i, j) if i in cond_j else (j, i)
                oriented_edges.add(edge)

                if verbose > 0:
                    print("Oriented (%d, %d) as %s" % (i, j, edge))
                break

    # edges remaining in the (frozenset-keyed) skeleton are left undirected
    unoriented_edges = skeleton - {frozenset({i, j}) for i, j in oriented_edges}
    return oriented_edges, unoriented_edges
def dci_skeleton(
        difference_ug: list,
        rh1: RegressionHelper,
        rh2: RegressionHelper,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Estimates the skeleton of the difference-DAG.

    Parameters
    ----------
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    rh1: RegressionHelper
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha results in sparser difference graph.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_orient

    Returns
    -------
    skeleton: set
        Set of edges (as frozensets) in the skeleton of the difference-DAG.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']
    nodes = get_nodes_in_graph(difference_ug)
    skeleton = {frozenset({i, j}) for i, j in difference_ug}

    for i, j in difference_ug:
        for cond_set in powerset(nodes - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i)

            # compute statistic and p-value
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                     inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)
            # two-sided fold; note min(p, 1-p) is symmetric in p and 1-p, so
            # the missing `1 -` (present in the pval_j branch) has no effect
            pval_i = 2 * min(pval_i, 1 - pval_i)

            # remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > alpha
            if i_invariant:
                if verbose > 0:
                    # BUGFIX: the removal condition is p-value > alpha, so the
                    # message must say ">" (was "<").
                    print("Removing edge %d-%d since p-value=%.5f > alpha=%.5f" % (i, j, pval_i, alpha))
                skeleton.remove(frozenset({i, j}))
                break

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                     inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)
            pval_j = 2 * min(pval_j, 1 - pval_j)

            # remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > alpha
            if j_invariant:
                if verbose > 0:
                    # BUGFIX: ">" (was "<"), as above.
                    print("Removing edge %d-%d since p-value=%.5f > alpha=%.5f" % (i, j, pval_j, alpha))
                skeleton.remove(frozenset({i, j}))
                break

    return skeleton
if __name__ == '__main__':
    # Ad-hoc sanity check: compare RegressionHelper's regression
    # (computed from Gaussian sufficient statistics) against sklearn's
    # LinearRegression fit on the raw samples, over every conditioning set.
    from causaldag.rand.graphs import directed_erdos, rand_weights
    from causaldag.utils.ci_tests import gauss_ci_suffstat
    from causaldag.utils.core_utils import powerset
    from sklearn.linear_model import LinearRegression
    import numpy as np
    from tqdm import trange

    nnodes = 10
    nodes = set(range(nnodes))
    exp_nbrs = 2  # expected number of neighbors per node
    nsamples = 100

    # sample data from a random weighted Erdos-Renyi DAG
    dag = directed_erdos(nnodes, exp_nbrs/(nnodes-1))
    gdag = rand_weights(dag)
    samples = gdag.sample(nsamples)
    suff = gauss_ci_suffstat(samples)
    reg_helper = RegressionHelper(suff)

    lr = LinearRegression()
    for i in trange(nnodes):
        for c in powerset(nodes - {i}, r_min=1):
            c = list(c)
            # suffstat-based regression of node i on conditioning set c
            coefs, var, _ = reg_helper.regression(i, c)
            # reference fit on the raw samples
            lr.fit(samples[:, c], samples[:, i])
            # residual variance from the suffstat-based coefficients
            var2 = np.var(samples[:, c] @ coefs - samples[:, i], ddof=len(c))
            # comparison checks kept for manual debugging:
            # if not np.isclose(coefs, lr.coef_).all():
            #     print(coefs, lr.coef_)
            # if not np.isclose(var, var2):
            #     print(var, var2)