def test_dog_example():
    """
    Run PCSkeletonFinder on the dog example, starting from a complete graph,
    and check both the recorded conditioning sets and the adjacencies that
    are still present in the graph afterwards.
    """
    data = dog_example(size=100000)
    graph = Graph(variables=list(data.columns), complete=True)

    cond_sets_by_pair = PCSkeletonFinder(data=data, graph=graph).find()

    # {exercise_levels} has to be one of the conditioning sets recorded for
    # the (activity, dog_tired) pair.
    wanted = {frozenset({'exercise_levels'})}
    activity_vs_tired = cond_sets_by_pair[
        key_for_pair(('activity', 'dog_tired'))
    ]
    assert activity_vs_tired.intersection(wanted) == wanted

    # NOTE(review): the left operand is a *set containing* the frozenset, not
    # the frozenset itself. If the stored value is a plain set of frozensets,
    # this membership check can never be true, so the assertion looks
    # vacuous -- confirm whether the bare frozenset was intended.
    weekend_vs_exhausted = cond_sets_by_pair[
        key_for_pair(('weekend', 'mentally_exhausted_before_bed'))
    ]
    assert {frozenset({'best_friends_visit', 'activity'})} \
        not in weekend_vs_exhausted

    # Every one of these pairs must still be adjacent in the graph.
    expected_adjacencies = (
        ('rain', 'best_friends_visit'),
        ('weekend', 'best_friends_visit'),
        ('rain', 'activity'),
        ('exercise_levels', 'best_friends_visit'),
        ('exercise_levels', 'activity'),
        ('mentally_exhausted_before_bed', 'activity'),
        ('exercise_levels', 'dog_tired'),
        ('best_friends_visit', 'mentally_exhausted_before_bed'),
        ('mentally_exhausted_before_bed', 'dog_teeth_brushed'),
        ('dog_tired', 'dog_teeth_brushed'),
    )
    for pair in expected_adjacencies:
        assert graph.has_adjacency(pair)
def test_long_chains_and_collider_with_MI(df_long_chains_and_collider_with_MI):
    """
    Among the three candidate edges b-d, c-d and b-c, RemovableEdgesFinder
    must report only b-d as removable and must record a non-empty
    conditioning-set entry for that pair.
    """
    data = df_long_chains_and_collider_with_MI(size=1000, proba_noise=0.6)

    edge_ab = frozenset({'a', 'b'})
    edge_de = frozenset({'d', 'e'})
    edge_cd = frozenset({'c', 'd'})
    edge_bc = frozenset({'b', 'c'})
    edge_bd = frozenset({'b', 'd'})

    pattern = MarkedPatternGraph(
        # The missingness indicator is added as a node on top of the
        # columns present in the data.
        nodes=list(set(data.columns) | {'MI_b'}),
        undirected_edges={edge_ab, edge_de, edge_cd, edge_bc, edge_bd},
        marked_arrows=[('c', 'MI_b')],
    )
    recorded_cond_sets = ConditioningSets()

    removable_edges = RemovableEdgesFinder(
        data=data,
        cond_sets=recorded_cond_sets,
        graph=pattern,
        potentially_extraneous_edges={edge_bd, edge_cd, edge_bc},
        data_correction=DensityRatioWeightedCorrection,
    ).find()

    assert recorded_cond_sets[key_for_pair(('b', 'd'))] != set()
    assert set(removable_edges) == {edge_bd}
def find(self):
    """
    Build the skeleton by testing every unordered pair of original columns.

    For each pair, look for conditioning sets (drawn from all the other
    original columns) that make the two variables conditionally
    independent. A pair with no such set keeps an undirected edge; a pair
    with at least one has those sets recorded instead.

    Returns:
        marked_pattern: MarkedPatternGraph
            Holds the skeleton (the undirected edges that were kept). It
            can be used for later steps, such as finding immoralities.

        cond_sets_satisfying_cond_indep: dict
            key: the value of key_for_pair for the two variables. Per the
                original documentation this is a str with the two names
                delimited by " _||_ ", e.g. "X _||_ Y".
            value: the conditioning sets that make the two variables
                conditionally independent.
    """
    kept_edges = []
    cond_sets_by_pair = {}

    for first, second in combinations(self.orig_cols, 2):
        candidates = set(self.orig_cols) - {first, second}

        found = conditioning_sets_satisfying_conditional_independence(
            data=self.data,
            var_name_1=first,
            var_name_2=second,
            cond_indep_test=self.cond_indep_test,
            possible_conditioning_set_vars=candidates,
            only_find_one=self.only_find_one,
            max_depth=self.max_depth)

        if len(found) == 0:
            # Nothing separates the pair, so the edge stays in the skeleton.
            kept_edges.append(frozenset((first, second)))
            continue

        cond_sets_by_pair[key_for_pair([first, second])] = found

    skeleton = MarkedPatternGraph(
        nodes=list(self.data.columns),
        undirected_edges=kept_edges)

    return skeleton, cond_sets_by_pair
def test_chain_and_collider_with_MI(
    df_chain_and_collider_with_MI
):
    """
    The a-c edge is spurious (per the original comment, it is due to
    collider bias): RemovableEdgesFinder must report it as removable and
    record a non-empty conditioning-set entry for the (a, c) pair.
    """
    size = 10000
    df = df_chain_and_collider_with_MI(size=size)
    cond_sets = ConditioningSets()

    graph = MarkedPatternGraph(
        # The extra node has to be the missingness indicator that the marked
        # arrow below points at. This used to add 'MI_y', which matches no
        # variable in this test and left 'MI_a' out of the node list.
        nodes=list(set(df.columns).union(set({'MI_a'}))),
        undirected_edges=set({
            frozenset({'a', 'c'}),  # extraneous edge
            frozenset({'a', 'b'}),
            frozenset({'b', 'c'}),
            frozenset({'a', 'd'}),
            frozenset({'c', 'd'}),
        }),
        marked_arrows=[
            ('d', 'MI_a')
        ]
    )

    # a-c is expected to still be present at this intermediate stage, so it
    # is the only edge handed over as potentially extraneous.
    finder = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges=set({
            frozenset({'a', 'c'}),
        }),
        data_correction=DensityRatioWeightedCorrection,
    )

    removables = finder.find()

    assert cond_sets[key_for_pair(('a', 'c'))] != set({})
    assert set(removables) == set({
        frozenset({'a', 'c'})
    })
def test_3_multinom_RVs_MAR(
    df_Z_causes_X_Y_and_X_Z_causes_MI_Y
):
    """
    With marked arrows from both x and z into MI_y, the extraneous x-y edge
    must be reported as removable, and a non-empty conditioning-set entry
    must be recorded for the (x, y) pair.
    """
    data = df_Z_causes_X_Y_and_X_Z_causes_MI_Y(size=1000)

    edge_xy = frozenset({'x', 'y'})  # extraneous edge
    edge_xz = frozenset({'x', 'z'})
    edge_zy = frozenset({'z', 'y'})

    pattern = MarkedPatternGraph(
        nodes=list(set(data.columns) | {'MI_y'}),
        undirected_edges={edge_xy, edge_xz, edge_zy},
        marked_arrows=[('x', 'MI_y'), ('z', 'MI_y')],
    )
    recorded_cond_sets = ConditioningSets()

    removable_edges = RemovableEdgesFinder(
        data=data,
        cond_sets=recorded_cond_sets,
        graph=pattern,
        potentially_extraneous_edges={edge_xy},
        data_correction=DensityRatioWeightedCorrection,
    ).find()

    assert set(removable_edges) == {edge_xy}
    assert recorded_cond_sets[key_for_pair(('x', 'y'))] != set()
def test_cond_on_collider(df_X_and_Y_cause_Z_and_Z_cause_MI_X):
    """
    Starting from a triangle over x, y and z with a marked arrow z -> MI_x,
    the extraneous x-y edge must come back as the only removable edge, and a
    non-empty conditioning-set entry must be recorded for the (x, y) pair.
    """
    data = df_X_and_Y_cause_Z_and_Z_cause_MI_X(size=2000)
    recorded_cond_sets = ConditioningSets()

    # x-y is the extraneous edge.
    pattern = MarkedPatternGraph(
        nodes=['x', 'y', 'z', 'MI_x'],
        undirected_edges=[{'x', 'y'}, {'x', 'z'}, {'y', 'z'}],
        marked_arrows=[('z', 'MI_x')],
    )

    removable_edges = RemovableEdgesFinder(
        data=data,
        cond_sets=recorded_cond_sets,
        graph=pattern,
        potentially_extraneous_edges=[{'x', 'y'}],
        data_correction=DensityRatioWeightedCorrection,
    ).find()

    assert removable_edges == [{'x', 'y'}]
    assert recorded_cond_sets[key_for_pair(('x', 'y'))] != set()
def find(self):
    """
    Go through each pair of observable nodes. For each pair, look for
    conditioning sets that render the two variables independent.

    Only pairs that are not each other's neighbours but are connected by a
    path in the graph are tested. Conditioning sets are drawn from the
    union of both nodes' neighbours, with their size (depth) growing by one
    per outer iteration for as long as
    _depth_not_greater_than_num_adj_nodes_per_var(depth) holds.

    Every conditioning set that passes self.cond_indep_test is stored via
    self.cond_sets.add(node_1, node_2, conditioning_set). Nothing is
    returned.
    """
    depth = 0

    # The return value was never used, and binding it to `logging` shadowed
    # the stdlib module name. The call itself is kept in case it configures
    # logging as a side effect.
    setup_logging()

    nodes = self.graph.get_observable_nodes()
    unmarked_arrows = self.graph.get_unmarked_arrows()
    has_missing_data = self.data.isnull().sum().sum() > 0

    while self._depth_not_greater_than_num_adj_nodes_per_var(depth):
        visited = {}

        for node_1, node_2 in combinations(nodes, 2):
            node_1_neighbors = self.graph.get_neighbors(node_1)
            node_2_neighbors = self.graph.get_neighbors(node_2)
            neighbors = node_1_neighbors.union(node_2_neighbors)
            pair_key = key_for_pair((node_1, node_2))

            if pair_key in visited:
                continue

            # Skip the pair when either node shows up among the neighbours.
            # Past this guard, `neighbors` contains neither node, so there
            # is nothing left to subtract from it.
            if node_1 in neighbors or node_2 in neighbors:
                continue

            if node_1 == node_2:
                continue

            if not self.graph.has_path((node_1, node_2)):
                continue

            # combinations() yields nothing once depth exceeds the number of
            # candidates, so the (possibly costly) correction is only
            # prepared when at least one test will actually run.
            if depth <= len(neighbors):
                # The data to test on depends on the pair only, not on the
                # conditioning set, so it is chosen once per pair instead of
                # once per conditioning set.
                # TODO: it's not just about having missing data; we care
                # about having missing data that is directly associated to
                # one of the variables
                if has_missing_data and \
                        self._has_common_neighbor_not_immoral(
                            node_1,
                            node_2,
                            node_1_neighbors,
                            node_2_neighbors,
                            unmarked_arrows
                        ):
                    var_names = neighbors.union(set({node_1, node_2}))

                    _data = DensityRatioWeightedCorrection(
                        data=self.data,
                        var_names=var_names,
                        graph=self.graph,
                        missingness_indicator_prefix='MI_'
                    ).correct()
                else:
                    _data = self.data

                for conditionable in combinations(neighbors, depth):
                    if self.cond_indep_test(
                        _data,
                        vars_1=[node_1],
                        vars_2=[node_2],
                        conditioning_set=list(conditionable)
                    ):
                        self.cond_sets.add(node_1, node_2, conditionable)

            visited[pair_key] = True

        depth += 1