def test_when_some_vars_are_in_the_same_time_window():
    """An undirected edge between two ``t=2`` nodes yields no unmarked arrows."""
    same_window_pair = ('X_t=2', 'Y_t=2')
    graph = MarkedPatternGraph(
        nodes=list(same_window_pair),
        undirected_edges=[same_window_pair],
    )

    orienter = TimeEdgeOrienter(graph)
    orienter.orient()

    assert graph.get_unmarked_arrows() == set()
def test_equals_same():
    """Two graphs constructed from identical arguments compare equal."""
    var_names = ['a', 'b', 'c', 'd', 'e']

    def build_graph():
        # Every call creates fresh edge containers; only ``var_names`` is
        # shared between the two graphs.
        return MarkedPatternGraph(
            nodes=var_names,
            marked_arrows=[('c', 'MI_b')],
            undirected_edges=[
                ('a', 'b'),
                ('b', 'c'),
                ('e', 'd'),
                ('d', 'c'),
                ('b', 'd'),  # extraneous edge
            ],
            unmarked_arrows=[('a', 'e')],
            bidirectional_edges=[('a', 'b')],
        )

    graph_1 = build_graph()
    graph_2 = build_graph()

    assert graph_1 == graph_2
def test_equals_undirected_edges_diff():
    """Graphs that differ only by one undirected edge compare unequal."""
    var_names = ['a', 'b', 'c', 'd', 'e']
    shared_edges = [('a', 'b'), ('b', 'c'), ('e', 'd'), ('d', 'c')]

    def build_graph(undirected_edges):
        return MarkedPatternGraph(
            nodes=var_names,
            marked_arrows=[('c', 'MI_b')],
            undirected_edges=undirected_edges,
            unmarked_arrows=[],
            bidirectional_edges=[],
        )

    # The first graph carries one additional edge, b - d.
    with_extra_edge = build_graph(shared_edges + [('b', 'd')])
    without_extra_edge = build_graph(list(shared_edges))

    assert with_extra_edge != without_extra_edge
def test_has_path_when_there_is_a_longer_one():
    """``has_path`` reports x to y when the only route goes through z."""
    graph = MarkedPatternGraph(
        nodes=['x', 'y', 'z'],
        marked_arrows=[],
        undirected_edges=[('x', 'z')],
        unmarked_arrows=[('z', 'y')],
        bidirectional_edges=[],
    )

    path_found = graph.has_path(('x', 'y'))

    # Compared against the literal on purpose; the exact check is kept as is.
    assert path_found == True  # noqa: E712
def test_has_path_when_there_are_none():
    """``has_path`` reports no path in a graph that has no edges at all."""
    graph = MarkedPatternGraph(
        nodes=['x', 'y'],
        marked_arrows=[],
        undirected_edges=[],
        unmarked_arrows=[],
        bidirectional_edges=[],
    )

    path_found = graph.has_path(('x', 'y'))

    # Compared against the literal on purpose; the exact check is kept as is.
    assert path_found == False  # noqa: E712
def test_bidirectional_edges():
    """Arrowheads at both ends of a - b leave only a bidirectional edge."""
    graph = MarkedPatternGraph(nodes=['a', 'b'])
    graph.add_undirected_edge(('a', 'b'))
    for arrowhead in (('a', 'b'), ('b', 'a')):
        graph.add_arrowhead(arrowhead)

    nothing = set()
    assert set(graph.get_undirected_edges()) == nothing
    assert set(graph.get_unmarked_arrows()) == nothing
    assert set(graph.get_marked_arrows()) == nothing
    assert set(graph.get_bidirectional_edges()) == {frozenset({'a', 'b'})}
def test_has_arrowhead_with_marked_arrowhead():
    """A marked arrowhead is reported by both arrowhead predicates."""
    edge = ('a', 'b')
    graph = MarkedPatternGraph(nodes=['a', 'b'])
    graph.add_undirected_edge(edge)

    # A plain undirected edge has no arrowhead yet.
    assert graph.has_arrowhead(edge) == False  # noqa: E712

    graph.add_marked_arrowhead(edge)

    assert graph.has_arrowhead(edge) == True  # noqa: E712
    assert graph.has_marked_arrowhead(edge) == True  # noqa: E712
def test_remove_undirected_edge_when_not_exist():
    """Removing an edge that was never added leaves every edge set empty."""
    graph = MarkedPatternGraph(nodes=['a', 'b'])

    graph.remove_undirected_edge(('a', 'b'))

    edge_getters = (
        graph.get_undirected_edges,
        graph.get_unmarked_arrows,
        graph.get_marked_arrows,
        graph.get_bidirectional_edges,
        graph.get_edges,
    )
    for getter in edge_getters:
        assert set(getter()) == set()
def test_immorality_across_time():
    """Orienting by time turns the arrow from ``X_t=3`` into a bidirectional edge."""
    arrow_from_t1 = ('X_t=1', 'Y_t=2')
    arrow_from_t3 = ('X_t=3', 'Y_t=2')

    # Before: X_t=1 --> Y_t=2 <-- X_t=3
    graph = MarkedPatternGraph(
        nodes=['X_t=1', 'Y_t=2', 'X_t=3'],
        unmarked_arrows=[arrow_from_t1, arrow_from_t3],
    )

    TimeEdgeOrienter(graph).orient()

    # After: X_t=1 --> Y_t=2 <--> X_t=3
    assert graph.get_unmarked_arrows() == {arrow_from_t1}
    assert graph.get_bidirectional_edges() == {frozenset(arrow_from_t3)}
def test_long_chains_collider_bias_with_MI(
        df_long_chains_and_collider_with_MI,
        df_long_chains_and_collider_without_MI):
    """Corrected data shows the same b-given-d proportions as complete data.

    The proportions of ``b`` within each ``d`` group are checked first on the
    data set without missingness, then on the data set with missingness after
    it has gone through ``DensityRatioWeightedCorrection``.
    """
    size = 10000
    proba_b = 0.175
    var_names = ['a', 'b', 'c', 'd', 'e']

    graph = MarkedPatternGraph(
        nodes=var_names,
        marked_arrows=[('c', 'MI_b')],
        undirected_edges=[
            ('a', 'b'),
            ('b', 'c'),
            ('e', 'd'),
            ('d', 'c'),
            ('b', 'd'),  # extraneous edge
        ])

    def proportions_within_d(frame):
        # Rows per (b, d) cell divided by rows per d group.
        per_cell = frame.groupby(['b', 'd']).count()
        per_d = frame.groupby('d').count()
        return (per_cell / per_d)['count']

    def assert_proportions(proportions, expectations):
        for key, expected in expectations:
            observed = proportions.xs(key, level=['b', 'd']).values[0]
            assert observed == approx(expected, abs=0.02)

    # --- data without missingness ------------------------------------------
    df_no_missing = df_long_chains_and_collider_without_MI(size=size)
    df_no_missing['count'] = 0

    assert df_no_missing['b'].mean() == approx(proba_b, abs=0.01)

    no_missing_counts = proportions_within_d(df_no_missing)

    # B & D are marginally independent
    assert_proportions(no_missing_counts, [
        ([False, False], 1 - proba_b),
        ([False, True], 1 - proba_b),
        ([True, False], proba_b),
        ([True, True], proba_b),
    ])

    # --- data with missingness, after correction ---------------------------
    correction = DensityRatioWeightedCorrection(
        data=df_long_chains_and_collider_with_MI(size=size),
        var_names=['b', 'd'],
        graph=graph)
    corrected_df = correction.correct()
    corrected_df['count'] = 0

    corrected_df_counts = proportions_within_d(corrected_df)

    # B & D are marginally independent
    assert_proportions(corrected_df_counts, [
        ([0, False], 1 - proba_b),
        ([0, True], 1 - proba_b),
        ([1, False], proba_b),
        ([1, True], proba_b),
    ])
def test_firing_squad_example():
    """No edge is flagged as potentially extraneous in the firing-squad graph."""
    skeleton = [
        frozenset(pair) for pair in (
            ('captain', 'rifle_person_1'),
            ('captain', 'rifle_person_2'),
            ('rifle_person_1', 'death'),
            ('rifle_person_2', 'death'),
        )
    ]
    graph = MarkedPatternGraph(
        nodes=['captain', 'rifle_person_1', 'rifle_person_2', 'death',
               'MI_captain'],
        marked_arrows=[('death', 'MI_captain')],
        undirected_edges=skeleton,
    )

    finder = PotentiallyExtraneousEdgesFinder(marked_pattern_graph=graph)
    flagged_edges = finder.find()

    assert flagged_edges == set()
def test_deterministic_cause_of_missingness():
    """Smoke test: the correction runs when ``x`` fully determines ``MI_z``.

    ``z`` is blanked out on exactly the rows where ``x == 1``. The test passes
    as long as ``correct()`` does not raise.
    """
    size = 1000
    x = np.random.binomial(n=1, p=0.6, size=size)
    y = np.random.binomial(n=1, p=0.3, size=size)
    z = np.random.binomial(n=1, p=0.3, size=size)

    df = pd.DataFrame({
        'x': x,
        'y': y,
        # Stored as float so the column can hold NaN without a dtype change.
        'z': z.astype(float),
    })

    # ``.loc`` rather than ``.at``: ``missing`` is an array of row labels,
    # whereas ``.at`` addresses a single scalar cell.
    missing = np.where(x == 1)[0]
    df.loc[missing, 'z'] = np.nan

    graph = MarkedPatternGraph(nodes=['x', 'y', 'z', 'MI_z'],
                               marked_arrows=[('x', 'MI_z')])

    # No assertion on the result: reaching the end without an exception is
    # the success criterion.
    DensityRatioWeightedCorrection(data=df,
                                   var_names=['x', 'y', 'z'],
                                   graph=graph).correct()
def test_long_chains_and_collider_with_MI(df_long_chains_and_collider_with_MI):
    """Of the three candidate edges, only b - d is reported as removable."""
    df = df_long_chains_and_collider_with_MI(size=1000, proba_noise=0.6)

    edge_a_b = frozenset({'b', 'a'})
    edge_d_e = frozenset({'d', 'e'})
    edge_c_d = frozenset({'d', 'c'})
    edge_b_c = frozenset({'b', 'c'})
    edge_b_d = frozenset({'d', 'b'})

    node_names = list(set(df.columns).union({'MI_b'}))
    graph = MarkedPatternGraph(
        nodes=node_names,
        undirected_edges={edge_a_b, edge_d_e, edge_c_d, edge_b_c, edge_b_d},
        marked_arrows=[('c', 'MI_b')]
    )

    cond_sets = ConditioningSets()
    finder = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges={edge_b_d, edge_c_d, edge_b_c},
        data_correction=DensityRatioWeightedCorrection,
    )

    removables = finder.find()

    # A conditioning set must have been recorded for the removed pair.
    assert cond_sets[key_for_pair(('b', 'd'))] != set()
    assert set(removables) == {frozenset({'b', 'd'})}
def test_3_multinom_RVs_MAR(df_Z_causes_X_Y_and_X_Z_causes_MI_Y):
    """Both ``z`` and ``x`` are found as direct causes of ``MI_y``."""
    df = df_Z_causes_X_Y_and_X_Z_causes_MI_Y(size=1000)
    graph = MarkedPatternGraph(nodes=['x', 'y', 'z'])

    finder = DirectCausesOfMissingnessFinder(data=df, graph=graph)
    found_arrows = finder.find()

    # Order of the returned arrows is not part of the check.
    assert set(found_arrows) == {('z', 'MI_y'), ('x', 'MI_y')}
def test_when_marked_path_exists():
    """With marked arrows a -*> b -*> c, the a - c edge becomes an arrow a -> c."""
    # Triangle on a, b, c:
    #   a -*> b -*> c   (marked arrowheads)
    #   a ------- c     (still undirected before orienting)
    graph = MarkedPatternGraph(nodes=['a', 'b', 'c'])
    for edge in (('a', 'b'), ('b', 'c'), ('a', 'c')):
        graph.add_undirected_edge(edge)
    for marked in (('a', 'b'), ('b', 'c')):
        graph.add_marked_arrowhead(marked)

    orienter = RecursiveEdgeOrienter(marked_pattern_graph=graph)
    orienter.orient()

    assert graph.get_unmarked_arrows() == {('a', 'c')}
def test_simple():
    """After orienting a -> b <- c with b - d, the only marked arrow is b -> d."""
    # a -> b <- c, with an undirected edge b - d hanging off b.
    graph = MarkedPatternGraph(nodes=['a', 'b', 'c', 'd'])
    for edge in (('a', 'b'), ('c', 'b'), ('b', 'd')):
        graph.add_undirected_edge(edge)
    for arrowhead in (('a', 'b'), ('c', 'b')):
        graph.add_arrowhead(arrowhead)

    orienter = RecursiveEdgeOrienter(marked_pattern_graph=graph)
    orienter.orient()

    assert graph.get_marked_arrows() == {('b', 'd')}
def test_mcar():
    """A graph with no edges has no potentially extraneous edges."""
    graph = MarkedPatternGraph(nodes=['X', 'Y', 'MI_x'])

    finder = PotentiallyExtraneousEdgesFinder(marked_pattern_graph=graph)
    flagged_edges = finder.find()

    assert flagged_edges == set()
def test_add_marked_arrows():
    """A marked arrow added afterwards shows up in both arrow and edge sets."""
    graph = MarkedPatternGraph(nodes=['a', 'b'],
                               undirected_edges=[('a', 'b')])
    new_arrow = ('c', 'd')

    graph.add_marked_arrow(new_arrow)

    assert graph.get_marked_arrows() == {new_arrow}

    expected_edges = {frozenset({'a', 'b'}), frozenset({'c', 'd'})}
    assert set(graph.get_edges()) == expected_edges
def test_simple():
    """``get_edges`` returns all three edges, whether or not they carry arrowheads."""
    # a -> b <- c, with an undirected edge b - d hanging off b.
    graph = MarkedPatternGraph(nodes=['a', 'b', 'c', 'd'])
    for edge in (('a', 'b'), ('c', 'b'), ('b', 'd')):
        graph.add_undirected_edge(edge)
    for arrowhead in (('a', 'b'), ('c', 'b')):
        graph.add_arrowhead(arrowhead)

    expected_edges = {
        frozenset({'a', 'b'}),
        frozenset({'b', 'c'}),
        frozenset({'b', 'd'}),
    }
    assert graph.get_edges() == expected_edges
def find(self):
    """
        Build the skeleton by testing every pair of ``self.orig_cols``.

        For each unordered pair, the remaining original columns are offered
        as candidate conditioning variables. A pair for which no conditioning
        set is found keeps an undirected edge; otherwise the conditioning
        sets found are recorded for that pair.

        Returns:
            marked_pattern: MarkedPatternGraph
                Holds the skeleton (a set of undirected edges) over all
                columns of ``self.data``. It can be used for later steps,
                such as finding immoralities.

            cond_sets_satisfying_cond_indep: dict
                key: str, as produced by ``key_for_pair``. The pair of
                    variables that are conditionally independent, delimited
                    by " _||_ ". E.g. "X _||_ Y" means X and Y are
                    conditionally independent.
                value: list(sets(str)). The conditioning sets that make the
                    pair conditionally independent.
    """
    columns = self.orig_cols
    skeleton_edges = []
    separating_sets_by_pair = {}

    for first, second in combinations(columns, 2):
        # Everything except the pair itself may be conditioned on.
        candidates = set(columns) - {first, second}

        found_sets = conditioning_sets_satisfying_conditional_independence(
            data=self.data,
            var_name_1=first,
            var_name_2=second,
            cond_indep_test=self.cond_indep_test,
            possible_conditioning_set_vars=candidates,
            only_find_one=self.only_find_one,
            max_depth=self.max_depth)

        if len(found_sets) == 0:
            # Nothing separates the pair: keep the edge in the skeleton.
            skeleton_edges.append(frozenset((first, second)))
            continue

        pair_key = key_for_pair([first, second])
        separating_sets_by_pair[pair_key] = found_sets

    marked_pattern = MarkedPatternGraph(
        nodes=list(self.data.columns),
        undirected_edges=skeleton_edges)

    return marked_pattern, separating_sets_by_pair
def test_2_multinom_RVs_MAR(df_X_Y_indep_Y_causes_MI_X):
    """``y`` is the single direct cause found for ``MI_x``."""
    df = df_X_Y_indep_Y_causes_MI_X(size=2000)
    graph = MarkedPatternGraph(nodes=['x', 'y'])

    finder = DirectCausesOfMissingnessFinder(data=df, graph=graph)
    found_arrows = finder.find()

    assert found_arrows == [('y', 'MI_x')]
def test_2_multinom_RVs_MCAR(df_2_multinomial_indep_RVs):
    """No direct cause of missingness is found when ``x`` is blanked at random.

    Rows of ``x`` are set to NaN using an independent Bernoulli(0.3) draw, so
    the finder is expected to return no marked arrows.
    """
    size = 2000
    df = df_2_multinomial_indep_RVs(size=size)

    missingness_of_x = np.random.binomial(n=1, p=0.3, size=size)
    missing_rows = np.where(missingness_of_x == 1)[0]

    # ``.loc`` rather than ``.at``: ``missing_rows`` is an array of row
    # labels, whereas ``.at`` addresses a single scalar cell.
    df.loc[missing_rows, 'x'] = np.nan

    graph = MarkedPatternGraph(nodes=['x', 'y'])
    direct_causes_of_missingness_finder = DirectCausesOfMissingnessFinder(
        data=df, graph=graph)

    marked_arrows = direct_causes_of_missingness_finder.find()

    assert marked_arrows == []
def test_chain_and_collider_with_MI(
    df_chain_and_collider_with_MI
):
    """The a - c edge is reported as removable.

    The graph marks ``d`` as a cause of ``MI_a``, and only a - c is offered to
    the finder as potentially extraneous. The test also checks that a
    conditioning set was recorded for the (a, c) pair.
    """
    size = 10000
    df = df_chain_and_collider_with_MI(size=size)
    cond_sets = ConditioningSets()

    graph = MarkedPatternGraph(
        # The missingness-indicator node matches the marked arrow below.
        nodes=list(set(df.columns).union(set({'MI_a'}))),
        undirected_edges=set({
            frozenset({'a', 'c'}),  # extraneous edge
            frozenset({'a', 'b'}),
            frozenset({'b', 'c'}),
            frozenset({'a', 'd'}),
            frozenset({'c', 'd'}),
        }),
        marked_arrows=[
            ('d', 'MI_a')
        ]
    )

    # a - c is present at this intermediate stage; it is spurious, due to
    # collider bias, so it is the only candidate handed to the finder.
    finder = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges=set({
            frozenset({'a', 'c'}),
        }),
        data_correction=DensityRatioWeightedCorrection,
    )

    removables = finder.find()

    assert cond_sets[key_for_pair(('a', 'c'))] != set({})
    assert set(removables) == set({
        frozenset({'a', 'c'})
    })
def test_two_causes_MI_collider():
    """With ``y`` and ``z`` both pointing into ``MI_x``, the z - y edge is flagged."""
    skeleton = [frozenset(('z', 'y'))]
    graph = MarkedPatternGraph(
        nodes=['x', 'y', 'z', 'MI_x'],
        marked_arrows=[('y', 'MI_x'), ('z', 'MI_x')],
        undirected_edges=skeleton,
    )

    finder = PotentiallyExtraneousEdgesFinder(marked_pattern_graph=graph)
    flagged_edges = finder.find()

    assert flagged_edges == set(skeleton)
def test_marked_arrow_exists_with_no_MI():
    """A marked arrow that does not point at an ``MI_`` node flags no edges."""
    skeleton = [frozenset(('z', 'y'))]
    graph = MarkedPatternGraph(
        nodes=['x', 'y', 'z', 'MI_x'],
        marked_arrows=[('x', 'y')],
        undirected_edges=skeleton,
    )

    finder = PotentiallyExtraneousEdgesFinder(marked_pattern_graph=graph)
    flagged_edges = finder.find()

    assert flagged_edges == set()
def test_missing_data_because_of_ses():
    """Reweighting recovers the mean of ``b`` when missingness depends on ``ses``.

    ``b`` is more likely to be 1, and less likely to be missing, when ``ses``
    is 1. The naive mean over observed rows is checked to be too high, and
    the mean after ``DensityRatioWeightedCorrection`` is checked to match the
    mean of the complete data.
    """
    size = 10000

    def bernoulli(p):
        return np.random.binomial(n=1, p=p, size=size)

    # Draw order matches the order in which the variables are declared.
    ses = bernoulli(0.3)
    b_1_given_ses_low = bernoulli(0.4)
    b_1_given_ses_high = bernoulli(0.9)
    missing_b_1_given_ses_low = bernoulli(0.5)
    missing_b_1_given_ses_high = bernoulli(0.1)

    ses_is_low = (ses == 0)
    b = ses * b_1_given_ses_high + ses_is_low * b_1_given_ses_low
    missing = ses * missing_b_1_given_ses_high \
        + ses_is_low * missing_b_1_given_ses_low

    # true mean
    true_mean = b.mean()
    assert true_mean == approx(0.55, abs=0.015)

    # Those with lower SES are more likely to be missing.
    df_with_missing_data = pd.DataFrame({'ses': ses, 'b': b})
    missing_index = np.where(missing == 1)[0]
    df_with_missing_data.loc[missing_index, 'b'] = np.nan

    # A naive analysis leads to an overestimate.
    naive_mean = df_with_missing_data['b'].mean()
    assert naive_mean == approx(0.62, abs=0.015)

    graph = MarkedPatternGraph(
        nodes=['ses', 'b', 'MI_b'],
        marked_arrows=[('ses', 'MI_b')],
        undirected_edges=[('ses', 'b')],
    )
    corrector = DensityRatioWeightedCorrection(
        data=df_with_missing_data,
        var_names=['ses', 'b', 'MI_b'],
        graph=graph,
    )

    # reweight data before running statistics on it
    reweighted_df = corrector.correct()

    # we're able to recover the true mean
    assert reweighted_df['b'].mean() == approx(0.55, abs=0.015)
def test_chain_and_collider_with_MI(df_chain_and_collider_with_MI):
    """``d`` is the only direct cause of missingness found, pointing at ``MI_a``."""
    df = df_chain_and_collider_with_MI(size=10000)

    skeleton = [
        set({'a', 'b'}),
        set({'b', 'c'}),
        set({'c', 'd'}),
        set({'a', 'd'}),
    ]
    graph = MarkedPatternGraph(nodes=df.columns, undirected_edges=skeleton)

    finder = DirectCausesOfMissingnessFinder(data=df, graph=graph)
    found_arrows = finder.find()

    # Compared as frozensets, so ordering and duplicates are ignored.
    assert frozenset(found_arrows) == frozenset({('d', 'MI_a')})
def test_3_multinom_RVs_MAR(
    df_Z_causes_X_Y_and_X_Z_causes_MI_Y
):
    """The x - y edge is reported as removable, with a conditioning set recorded."""
    df = df_Z_causes_X_Y_and_X_Z_causes_MI_Y(size=1000)

    edge_x_y = frozenset({'x', 'y'})  # extraneous edge
    edge_x_z = frozenset({'x', 'z'})
    edge_y_z = frozenset({'z', 'y'})

    node_names = list(set(df.columns).union({'MI_y'}))
    graph = MarkedPatternGraph(
        nodes=node_names,
        undirected_edges={edge_x_y, edge_x_z, edge_y_z},
        marked_arrows=[('x', 'MI_y'), ('z', 'MI_y')],
    )

    cond_sets = ConditioningSets()
    finder = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges={edge_x_y},
        data_correction=DensityRatioWeightedCorrection,
    )

    removables = finder.find()

    assert set(removables) == {frozenset({'x', 'y'})}
    assert cond_sets[key_for_pair(('x', 'y'))] != set()
def test_cond_on_collider(df_X_and_Y_cause_Z_and_Z_cause_MI_X):
    """The x - y edge is the single removable edge when ``z`` causes ``MI_x``."""
    df = df_X_and_Y_cause_Z_and_Z_cause_MI_X(size=2000)
    cond_sets = ConditioningSets()

    # The skeleton contains an extraneous edge x - y.
    skeleton = [
        set({'x', 'y'}),
        set({'x', 'z'}),
        set({'y', 'z'}),
    ]
    graph = MarkedPatternGraph(
        nodes=['x', 'y', 'z', 'MI_x'],
        undirected_edges=skeleton,
        marked_arrows=[('z', 'MI_x')],
    )

    finder = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges=[set({'x', 'y'})],
        data_correction=DensityRatioWeightedCorrection,
    )

    removables = finder.find()

    assert removables == [{'x', 'y'}]
    assert cond_sets[key_for_pair(('x', 'y'))] != set()
def test_firing_squad():
    """Orienting the firing-squad graph marks 'prisoner shot' -> 'prisoner death'.

    Both rifle persons get an arrowhead into 'prisoner shot' before
    orientation; afterwards the only marked arrow is expected to be the one
    from 'prisoner shot' to 'prisoner death'.
    """
    graph = MarkedPatternGraph(nodes=[
        'captain',
        'rifle_person_1',
        'rifle_person_2',
        'prisoner shot',
        'prisoner death'
    ])

    # Skeleton of the graph; edges are added one by one through the public API.
    for edge in (
        ('captain', 'rifle_person_1'),
        ('captain', 'rifle_person_2'),
        ('rifle_person_1', 'prisoner shot'),
        ('rifle_person_2', 'prisoner shot'),
        ('prisoner shot', 'prisoner death'),
    ):
        graph.add_undirected_edge(edge)

    # rifle_person_1 -> 'prisoner shot' <- rifle_person_2
    graph.add_arrowhead(('rifle_person_1', 'prisoner shot'))
    graph.add_arrowhead(('rifle_person_2', 'prisoner shot'))

    RecursiveEdgeOrienter(marked_pattern_graph=graph).orient()

    assert graph.get_marked_arrows() == set(
        {('prisoner shot', 'prisoner death')})