# Standard-library and third-party dependencies used by the functions in this section.
# NOTE: the package-internal helpers (RegressionHelper, gauss_ci_suffstat, powerset,
# edges2adjacency, dci_undirected_graph, get_nodes_in_graph) are assumed to be imported
# elsewhere in this package; their module paths are not guessed here.
import itertools
import operator as op
from typing import Dict, Optional, Set, Union

import networkx as nx
from numpy.linalg import inv
from scipy.special import ncfdtr
from tqdm import tqdm


def dci_orient_order_independent(
        X1,
        X2,
        skeletons: Union[Dict[float, set], set],
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG in an order-independent manner.

    Accepts either a single skeleton (set of edges) or a dictionary mapping significance
    levels to skeletons, as returned by dci_skeleton_multiple; in the latter case a
    dictionary mapping each significance level to an adjacency matrix is returned.
    See dci_orient for a description of the remaining parameters.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    if isinstance(skeletons, dict):
        return {
            alpha: dci_orient_order_independent(
                X1, X2, skeleton, nodes_cond_set, rh1, rh2,
                alpha=alpha, max_set_size=max_set_size
            )
            for alpha, skeleton in skeletons.items()
        }

    skeleton = {frozenset({i, j}) for i, j in skeletons}
    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    d_nx = nx.DiGraph()
    d_nx.add_nodes_from(nodes)
    nodes_with_decided_parents = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    for parent_set_size in range(max_set_size + 2):
        if verbose > 0:
            print(f"Trying parent sets of size {parent_set_size}")

        # test invariance of the residual variance of each undecided node
        # given every candidate parent set of the current size
        pvalue_dict = dict()
        for i in nodes - nodes_with_decided_parents:
            for cond_i in itertools.combinations(nodes_cond_set - {i}, parent_set_size):
                beta1_i, var1_i, _ = rh1.regression(i, list(cond_i))
                beta2_i, var2_i, _ = rh2.regression(i, list(cond_i))
                pvalue_i = ncfdtr(n1 - parent_set_size, n2 - parent_set_size, 0, var1_i / var2_i)
                pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)
                pvalue_dict[(i, frozenset(cond_i))] = pvalue_i

        # sort p-value dict, keeping only the invariant (p-value > alpha) candidates
        sorted_pvalue_dict = [
            (pvalue, i, cond_i)
            for (i, cond_i), pvalue in sorted(pvalue_dict.items(), key=op.itemgetter(1), reverse=True)
            if pvalue > alpha
        ]

        while sorted_pvalue_dict:
            _, i, cond_i = sorted_pvalue_dict.pop(0)
            i_children = {j for j in nodes - cond_i - {i} if frozenset({i, j}) in skeleton}

            # don't use this parent set if it contradicts the existing edges
            if any(j in d_nx.successors(i) for j in cond_i):
                continue
            if any(j in d_nx.predecessors(i) for j in i_children):
                continue
            # don't use this parent set if it creates a cycle
            if any(j in nx.descendants(d_nx, i) for j in cond_i):
                continue
            if any(j in nx.ancestors(d_nx, i) for j in i_children):
                continue

            edges = {(j, i) for j in cond_i if frozenset({i, j}) in skeleton} | \
                    {(i, j) for j in nodes - cond_i - {i} if frozenset({i, j}) in skeleton}
            nodes_with_decided_parents.add(i)
            if verbose > 0:
                print(f"Adding {edges}")
            d_nx.add_edges_from(edges)

    # orient remaining edges via graph traversal
    oriented_edges = set(d_nx.edges)
    unoriented_edges_before_traversal = skeleton - {frozenset({j, i}) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        chain_path = list(nx.all_simple_paths(g, source=i, target=j))
        if len(chain_path) > 0:
            oriented_edges.add((i, j))
            unoriented_edges.remove(frozenset({i, j}))
            if verbose > 0:
                print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (i, j)))
        else:
            chain_path = list(nx.all_simple_paths(g, source=j, target=i))
            if len(chain_path) > 0:
                oriented_edges.add((j, i))
                unoriented_edges.remove(frozenset({i, j}))
                if verbose > 0:
                    print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (j, i)))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + \
        edges2adjacency(num_nodes, oriented_edges, undirected=False)
    return adjacency_matrix
def dci_orient(
        X1,
        X2,
        skeleton: set,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        A smaller maximum conditioning set size results in faster computation time. For large datasets, the recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_skeleton

    Returns
    -------
    adjacency_matrix: array, shape = [n_features, n_features]
        Adjacency matrix of the difference-DAG. Edges whose orientation could be determined
        are assigned 1 in a single direction; edges whose orientation could not be determined
        are assigned 1 in both directions.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    oriented_edges = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    for i, j in skeleton:
        for cond_i, cond_j in zip(powerset(nodes_cond_set - {i}, r_max=max_set_size),
                                  powerset(nodes_cond_set - {j}, r_max=max_set_size)):
            # compute residual variances for i
            beta1_i, var1_i, _ = rh1.regression(i, list(cond_i))
            beta2_i, var2_i, _ = rh2.regression(i, list(cond_i))
            # compute p-value for invariance of residual variances for i
            pvalue_i = ncfdtr(n1 - len(cond_i), n2 - len(cond_i), 0, var1_i / var2_i)
            pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)

            # compute residual variances for j
            beta1_j, var1_j, _ = rh1.regression(j, list(cond_j))
            beta2_j, var2_j, _ = rh2.regression(j, list(cond_j))
            # compute p-value for invariance of residual variances for j
            pvalue_j = ncfdtr(n1 - len(cond_j), n2 - len(cond_j), 0, var1_j / var2_j)
            pvalue_j = 2 * min(pvalue_j, 1 - pvalue_j)

            if (pvalue_i > alpha) or (pvalue_j > alpha):
                # orient the edge according to the highest p-value
                if pvalue_i > pvalue_j:
                    edge = (j, i) if j in cond_i else (i, j)
                    pvalue_used = pvalue_i
                else:
                    edge = (i, j) if i in cond_j else (j, i)
                    pvalue_used = pvalue_j
                oriented_edges.add(edge)

                if verbose > 0:
                    print("Oriented (%d, %d) as %s since p-value=%.5f > alpha=%.5f" % (i, j, edge, pvalue_used, alpha))
                break

    # orient remaining edges via graph traversal
    unoriented_edges_before_traversal = skeleton - oriented_edges - {(j, i) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        chain_path = list(nx.all_simple_paths(g, source=i, target=j))
        if len(chain_path) > 0:
            oriented_edges.add((i, j))
            unoriented_edges.remove((i, j))
            if verbose > 0:
                print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (i, j)))
        else:
            chain_path = list(nx.all_simple_paths(g, source=j, target=i))
            if len(chain_path) > 0:
                oriented_edges.add((j, i))
                unoriented_edges.remove((i, j))
                if verbose > 0:
                    print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (j, i)))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + \
        edges2adjacency(num_nodes, oriented_edges, undirected=False)
    return adjacency_matrix
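
# ---------------------------------------------------------------------------
# Illustration (not part of the estimator): the orientation step above decides
# edge directions by testing whether the residual variance of a node, given a
# candidate conditioning set, is invariant across the two datasets. The helper
# below is a minimal, hypothetical sketch of that two-sided F-test using plain
# numpy least squares instead of RegressionHelper; the simulated data and all
# names are assumptions for illustration only.
# ---------------------------------------------------------------------------
def _residual_variance_invariance_pvalue_example():
    import numpy as np
    from scipy.special import ncfdtr

    rng = np.random.default_rng(0)
    n1, n2 = 500, 500
    # two linear-Gaussian settings that differ only in the noise variance of y
    x1 = rng.normal(size=(n1, 2))
    y1 = x1 @ np.array([1.0, -0.5]) + rng.normal(scale=1.0, size=n1)
    x2 = rng.normal(size=(n2, 2))
    y2 = x2 @ np.array([1.0, -0.5]) + rng.normal(scale=2.0, size=n2)

    def residual_variance(x, y):
        beta, *_ = np.linalg.lstsq(x, y, rcond=None)
        resid = y - x @ beta
        return resid @ resid / (len(y) - x.shape[1])

    var1, var2 = residual_variance(x1, y1), residual_variance(x2, y2)

    # same form as in dci_orient: ncfdtr with zero noncentrality is the central
    # F CDF of the variance ratio, folded into a two-sided p-value
    k = x1.shape[1]
    pvalue = ncfdtr(n1 - k, n2 - k, 0, var1 / var2)
    pvalue = 2 * min(pvalue, 1 - pvalue)
    return pvalue  # small p-value here, since the residual variances differ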
def dci_skeleton(
        X1,
        X2,
        difference_ug: list,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False
):
    """
    Estimates the skeleton of the difference-DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha results in a sparser difference graph.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        A smaller maximum conditioning set size results in faster computation time. For large datasets, the recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Amount of regularization for regression (becomes ridge regression if nonzero).
    progress: bool, default = False
        Whether to show a progress bar over the edges of the difference undirected graph.

    See Also
    --------
    dci, dci_undirected_graph, dci_orient

    Returns
    -------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    skeleton = {(i, j) for i, j in difference_ug}

    difference_ug = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug
    for i, j in difference_ug:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i, lam=lam)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i, lam=lam)

            # compute statistic and p-value
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                     inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)

            # remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > alpha
            if i_invariant:
                if verbose > 1:
                    print(f"Removing edge {j}->{i} since p-value={pval_i:.5f} > alpha={alpha:.5f} with cond set {cond_set_i}")
                skeleton.remove((i, j))
                break
            elif verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval_i:.5f} < alpha={alpha:.5f} with cond set {cond_set_i}")

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j, lam=lam)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j, lam=lam)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                     inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)

            # remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > alpha
            if j_invariant:
                if verbose > 1:
                    print(f"Removing edge {i}->{j} since p-value={pval_j:.5f} > alpha={alpha:.5f} with cond set {cond_set_j}")
                skeleton.remove((i, j))
                break
            elif verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval_j:.5f} < alpha={alpha:.5f} with cond set {cond_set_j}")

    return skeleton
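
# ---------------------------------------------------------------------------
# Illustration (not part of the estimator): the skeleton step above keeps the
# edge i-j only if the regression coefficient of j in the regression of i on
# (cond_set, j) differs between the two datasets. The helper below is a
# minimal, hypothetical sketch of that Wald-type test with plain numpy least
# squares in place of RegressionHelper; the simulated data and all names are
# assumptions for illustration only.
# ---------------------------------------------------------------------------
def _coefficient_invariance_pvalue_example():
    import numpy as np
    from scipy.special import ncfdtr

    rng = np.random.default_rng(1)
    n1, n2 = 500, 500
    # the coefficient of x on y changes from 1.0 to 2.0 across the two settings
    x1 = rng.normal(size=n1)
    y1 = 1.0 * x1 + rng.normal(size=n1)
    x2 = rng.normal(size=n2)
    y2 = 2.0 * x2 + rng.normal(size=n2)

    def ols(x, y):
        X = x[:, None]
        beta, *_ = np.linalg.lstsq(X, y, rcond=None)
        resid = y - X @ beta
        var = resid @ resid / (len(y) - X.shape[1])
        # sampling variance of beta-hat: var * (X'X)^{-1}
        beta_var = var * np.linalg.inv(X.T @ X)
        return beta[0], beta_var[0, 0]

    b1, v1 = ols(x1, y1)
    b2, v2 = ols(x2, y2)
    stat = (b1 - b2) ** 2 / (v1 + v2)

    # one-sided upper-tail p-value from the central F distribution, as in
    # dci_skeleton's use of 1 - ncfdtr(1, df, 0, stat)
    pval = 1 - ncfdtr(1, n1 + n2 - 2, 0, stat)
    return pval  # small p-value here, so the edge would be kept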
def dci(
        X1,
        X2,
        alpha_ug: float = 1.0,
        alpha_skeleton: float = 0.1,
        alpha_orient: float = 0.1,
        max_set_size: Optional[int] = 3,
        difference_ug: list = None,
        nodes_cond_set: set = None,
        max_iter: int = 1000,
        edge_threshold: float = 0,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False,
        order_independent: bool = True
):
    """
    Uses the Difference Causal Inference (DCI) algorithm to estimate the difference-DAG between two settings.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    alpha_ug: float, default = 1.0
        L1 regularization parameter for estimating the difference undirected graph via the KLIEP algorithm.
    alpha_skeleton: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha_skeleton results in a sparser difference graph.
    alpha_orient: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha_orient results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        A smaller maximum conditioning set size results in faster computation time. For large datasets, the recommended max_set_size is 3.
        If None, conditioning sets of all sizes will be used.
    difference_ug: list, default = None
        List of tuples that represents edges in the difference undirected graph. If difference_ug is None,
        the KLIEP algorithm for estimating the difference undirected graph will be run.
        If the number of nodes is small, difference_ug could be taken to be the complete graph between all the nodes.
    nodes_cond_set: set, default = None
        Nodes to be considered as conditioning sets. If None, the nodes returned by the KLIEP algorithm are used.
    max_iter: int, default = 1000
        Maximum number of iterations for gradient descent in the KLIEP algorithm.
    edge_threshold: float, default = 0
        Edge weight cutoff for keeping an edge for the KLIEP algorithm (all edges above or equal to this threshold are kept).
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Amount of regularization for regression (becomes ridge regression if nonzero).
    progress: bool, default = False
        Whether to show a progress bar over the edges of the difference undirected graph during skeleton estimation.
    order_independent: bool, default = True
        Whether to orient edges with the order-independent procedure (dci_orient_order_independent)
        instead of dci_orient.

    See Also
    --------
    dci_undirected_graph, dci_skeleton, dci_orient

    Returns
    -------
    adjacency_matrix: array, shape = [n_features, n_features]
        Estimated difference-DAG. Edges that were found to be different between the two settings
        but whose orientation could not be determined are represented by assigning 1 in both directions,
        i.e. adjacency_matrix[i,j] = 1 and adjacency_matrix[j,i] = 1. For oriented edges, only
        adjacency_matrix[i,j] = 1 is assigned. An assignment of 0 in the adjacency matrix represents no edge.

    References
    ----------
    [1] Wang, Y., Squires, C., Belyaeva, A., & Uhler, C. (2018). Direct estimation of differences in causal graphs.
        In Advances in Neural Information Processing Systems (pp. 3770-3781).
    """
    assert 0 <= alpha_skeleton <= 1, "alpha_skeleton must be in [0,1] range."
    assert 0 <= alpha_orient <= 1, "alpha_orient must be in [0,1] range."

    num_nodes = X1.shape[1]

    # obtain sufficient statistics
    suffstat1 = gauss_ci_suffstat(X1)
    suffstat2 = gauss_ci_suffstat(X2)
    rh1 = RegressionHelper(suffstat1)
    rh2 = RegressionHelper(suffstat2)

    # compute the difference undirected graph via KLIEP if the difference_ug is not provided
    if difference_ug is None or nodes_cond_set is None:
        difference_ug, nodes_cond_set = dci_undirected_graph(
            X1,
            X2,
            alpha=alpha_ug,
            max_iter=max_iter,
            edge_threshold=edge_threshold,
            verbose=verbose
        )
    if verbose > 0:
        print(f"{len(difference_ug)} edges in the difference UG, over {len(nodes_cond_set)} nodes")

    # estimate the skeleton of the difference-DAG
    skeleton = dci_skeleton(
        X1,
        X2,
        difference_ug,
        nodes_cond_set,
        rh1=rh1,
        rh2=rh2,
        alpha=alpha_skeleton,
        max_set_size=max_set_size,
        verbose=verbose,
        lam=lam,
        progress=progress
    )
    if verbose > 0:
        print(f"{len(skeleton)} edges in the difference skeleton")

    # orient edges of the skeleton of the difference-DAG
    orient_algorithm = dci_orient if not order_independent else dci_orient_order_independent
    adjacency_matrix = orient_algorithm(
        X1,
        X2,
        skeleton,
        nodes_cond_set,
        rh1=rh1,
        rh2=rh2,
        alpha=alpha_orient,
        max_set_size=max_set_size,
        verbose=verbose
    )

    return adjacency_matrix
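
# ---------------------------------------------------------------------------
# Minimal usage sketch for dci (illustrative only): generate two small
# linear-Gaussian datasets that differ in one edge and run the full pipeline.
# The data-generating parameters below are assumptions for illustration, not
# part of the library.
# ---------------------------------------------------------------------------
def _dci_usage_example():
    import numpy as np

    rng = np.random.default_rng(0)
    n = 1000
    # setting 1: x0 -> x1 -> x2 with edge weight +2 on x1 -> x2
    a0 = rng.normal(size=n)
    a1 = a0 + rng.normal(size=n)
    a2 = 2.0 * a1 + rng.normal(size=n)
    X1 = np.column_stack([a0, a1, a2])
    # setting 2: the x1 -> x2 edge weight changes sign
    b0 = rng.normal(size=n)
    b1 = b0 + rng.normal(size=n)
    b2 = -2.0 * b1 + rng.normal(size=n)
    X2 = np.column_stack([b0, b1, b2])

    # adjacency_matrix[i, j] = adjacency_matrix[j, i] = 1 marks an edge whose
    # direction could not be determined; a single 1 marks an oriented edge
    adjacency_matrix = dci(X1, X2, alpha_skeleton=0.1, alpha_orient=0.1, max_set_size=3)
    return adjacency_matrix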
def dci_skeleton_multiple(
        X1,
        X2,
        alpha_skeleton_grid: list = [0.1, 0.5],
        max_set_size: int = 3,
        difference_ug: list = None,
        nodes_cond_set: set = None,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False,
        true_diff: Optional[Set] = None
):
    """
    Estimates the skeleton of the difference-DAG for each significance level in alpha_skeleton_grid,
    sharing the regression computations across all levels. If true_diff is provided, edges of the
    true difference-DAG that are removed are reported.

    Returns
    -------
    skeletons: dict
        Dictionary mapping each alpha in alpha_skeleton_grid to the skeleton estimated at that level.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    for alpha in alpha_skeleton_grid:
        assert 0 <= alpha <= 1, "alpha must be in [0,1] range."
    min_alpha = min(alpha_skeleton_grid)

    skeletons = {alpha: {(i, j) for i, j in difference_ug} for alpha in alpha_skeleton_grid}

    difference_ug = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug
    for i, j in difference_ug:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i, lam=lam)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i, lam=lam)

            # compute statistic and p-value
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                     inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)

            # remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > min_alpha
            if i_invariant:
                removed_alphas = [alpha for alpha in alpha_skeleton_grid if pval_i > alpha]
                if verbose > 1:
                    print(f"Removing edge {j}->{i} for alpha={removed_alphas} since p-value={pval_i:.5f} with cond set {cond_set_i}")
                for alpha in removed_alphas:
                    skeletons[alpha].discard((i, j))
                if true_diff is not None and ((i, j) in true_diff or (j, i) in true_diff):
                    print(f"Incorrectly removing edge {j}->{i} for alpha={removed_alphas} since p-value={pval_i:.6f} with cond set {cond_set_i}")
                if len(removed_alphas) == len(alpha_skeleton_grid):
                    break
            elif verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval_i:.5f} with cond set {cond_set_i}")

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j, lam=lam)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j, lam=lam)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                     inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)

            # remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > min_alpha
            if j_invariant:
                removed_alphas = [alpha for alpha in alpha_skeleton_grid if pval_j > alpha]
                if verbose > 1:
                    print(f"Removing edge {i}->{j} for alpha={removed_alphas} since p-value={pval_j:.5f} with cond set {cond_set_j}")
                for alpha in removed_alphas:
                    skeletons[alpha].discard((i, j))
                if true_diff is not None and ((i, j) in true_diff or (j, i) in true_diff):
                    print(f"Incorrectly removing edge {i}->{j} for alpha={removed_alphas} since p-value={pval_j:.6f} with cond set {cond_set_j}")
                if len(removed_alphas) == len(alpha_skeleton_grid):
                    break
            elif verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval_j:.5f} with cond set {cond_set_j}")

    return skeletons
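
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): dci_skeleton_multiple pairs naturally with
# dci_orient_order_independent, which accepts the returned {alpha: skeleton}
# dictionary and orients each skeleton separately. X1, X2, difference_ug and
# nodes_cond_set are assumed to be prepared as in dci() above.
# ---------------------------------------------------------------------------
def _dci_multiple_alphas_example(X1, X2, difference_ug, nodes_cond_set):
    suffstat1 = gauss_ci_suffstat(X1)
    suffstat2 = gauss_ci_suffstat(X2)
    rh1, rh2 = RegressionHelper(suffstat1), RegressionHelper(suffstat2)

    # one skeleton per significance level, sharing regression computations
    skeletons = dci_skeleton_multiple(
        X1, X2,
        alpha_skeleton_grid=[0.05, 0.1, 0.5],
        difference_ug=difference_ug,
        nodes_cond_set=nodes_cond_set,
        rh1=rh1, rh2=rh2
    )

    # one adjacency matrix per skeleton-level alpha
    adjacency_matrices = dci_orient_order_independent(
        X1, X2, skeletons, nodes_cond_set, rh1=rh1, rh2=rh2, alpha=0.1
    )
    return adjacency_matrices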
def dci_orient(
        skeleton: set,
        rh1: RegressionHelper,
        rh2: RegressionHelper,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG.

    Parameters
    ----------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    rh1: RegressionHelper
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        A smaller maximum conditioning set size results in faster computation time. For large datasets, the recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_skeleton

    Returns
    -------
    oriented_edges: set
        Set of edges in the skeleton of the difference-DAG for which directionality could be determined.
    unoriented_edges: set
        Set of edges in the skeleton of the difference-DAG for which directionality could not be determined.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    oriented_edges = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    for i, j in skeleton:
        for cond_i, cond_j in zip(powerset(nodes - {i}, r_max=max_set_size),
                                  powerset(nodes - {j}, r_max=max_set_size)):
            # compute residual variances for i
            beta1_i, var1_i, _ = rh1.regression(i, cond_i)
            beta2_i, var2_i, _ = rh2.regression(i, cond_i)
            # compute p-value for invariance of residual variances for i
            pvalue_i = ncfdtr(n1 - len(cond_i), n2 - len(cond_i), 0, var1_i / var2_i)
            pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)

            # compute residual variances for j
            beta1_j, var1_j, _ = rh1.regression(j, cond_j)
            beta2_j, var2_j, _ = rh2.regression(j, cond_j)
            # compute p-value for invariance of residual variances for j
            pvalue_j = ncfdtr(n1 - len(cond_j), n2 - len(cond_j), 0, var1_j / var2_j)
            pvalue_j = 2 * min(pvalue_j, 1 - pvalue_j)

            if (pvalue_i > alpha) or (pvalue_j > alpha):
                # orient the edge according to the highest p-value
                if pvalue_i > pvalue_j:
                    edge = (j, i) if j in cond_i else (i, j)
                else:
                    edge = (i, j) if i in cond_j else (j, i)
                oriented_edges.add(edge)
                if verbose > 0:
                    print("Oriented (%d, %d) as %s" % (i, j, edge))
                break

    unoriented_edges = skeleton - {frozenset({i, j}) for i, j in oriented_edges}
    return oriented_edges, unoriented_edges
def dci_skeleton(
        difference_ug: list,
        rh1: RegressionHelper,
        rh2: RegressionHelper,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Estimates the skeleton of the difference-DAG.

    Parameters
    ----------
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    rh1: RegressionHelper
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha results in a sparser difference graph.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        A smaller maximum conditioning set size results in faster computation time. For large datasets, the recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_orient

    Returns
    -------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    nodes = get_nodes_in_graph(difference_ug)
    skeleton = {frozenset({i, j}) for i, j in difference_ug}

    for i, j in difference_ug:
        for cond_set in powerset(nodes - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i)

            # compute statistic and p-value
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                     inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)
            pval_i = 2 * min(pval_i, 1 - pval_i)

            # remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > alpha
            if i_invariant:
                if verbose > 0:
                    print("Removing edge %d-%d since p-value=%.5f > alpha=%.5f" % (i, j, pval_i, alpha))
                skeleton.remove(frozenset({i, j}))
                break

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                     inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)
            pval_j = 2 * min(pval_j, 1 - pval_j)

            # remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > alpha
            if j_invariant:
                if verbose > 0:
                    print("Removing edge %d-%d since p-value=%.5f > alpha=%.5f" % (i, j, pval_j, alpha))
                skeleton.remove(frozenset({i, j}))
                break

    return skeleton