def test_dog_example():
    """Check the skeleton and separating sets found for the ``dog_example`` data.

    Builds a graph over the data's columns with ``complete=True``, runs
    ``PCSkeletonFinder.find()`` against it and then asserts on:

    * the conditioning sets returned for two variable pairs, and
    * the adjacencies still present in ``graph`` afterwards (the finder is
      presumably expected to prune the graph in place -- the test only ever
      inspects the object it passed in).
    """
    df = dog_example(size=100000)

    graph = Graph(variables=list(df.columns), complete=True)

    skeleton_finder = PCSkeletonFinder(data=df, graph=graph)

    cond_sets_satisfying_cond_indep = skeleton_finder.find()

    # {exercise_levels} must be recorded as a conditioning set for the pair
    # (activity, dog_tired).
    assert frozenset({'exercise_levels'}) in cond_sets_satisfying_cond_indep[
        key_for_pair(('activity', 'dog_tired'))]

    # The elements of each entry are frozensets of variable names, so the
    # membership test has to use the frozenset itself.  Wrapping it in an
    # outer set (as this check previously did) looks for a set-of-frozensets,
    # which can never be an element, and makes ``not in`` pass vacuously.
    assert frozenset({'best_friends_visit', 'activity'}) not in \
        cond_sets_satisfying_cond_indep[
            key_for_pair(('weekend', 'mentally_exhausted_before_bed'))
        ]

    expected_adjacencies = (
        ('rain', 'best_friends_visit'),
        ('weekend', 'best_friends_visit'),
        ('rain', 'activity'),
        ('exercise_levels', 'best_friends_visit'),
        ('exercise_levels', 'activity'),
        ('mentally_exhausted_before_bed', 'activity'),
        ('exercise_levels', 'dog_tired'),
        ('best_friends_visit', 'mentally_exhausted_before_bed'),
        ('mentally_exhausted_before_bed', 'dog_teeth_brushed'),
        ('dog_tired', 'dog_teeth_brushed'),
    )
    for pair in expected_adjacencies:
        assert graph.has_adjacency(pair), pair
def test_2_multinom_RVs(df_2_multinomial_indep_RVs):
    """Expect no edges and an empty conditioning set for the two-variable data.

    Runs ``PCSkeletonFinder.find()`` on a graph built with ``complete=True``
    over the fixture's columns, then checks that ``graph.get_edges()`` is
    empty and that the entry under ``'x _||_ y'`` holds exactly the empty
    frozenset.
    """
    data = df_2_multinomial_indep_RVs(size=10000)
    graph = Graph(variables=list(data.columns), complete=True)

    finder = PCSkeletonFinder(data=data, graph=graph)
    cond_sets = finder.find()

    no_edges = set()
    only_empty_cond_set = {frozenset()}

    assert graph.get_edges() == no_edges
    assert cond_sets['x _||_ y'] == only_empty_cond_set
def test_skeleton_finder_X_causes_Y(df_X_causes_Y):
    """Expect the x-y adjacency to remain and no conditioning sets to be found.

    Runs ``PCSkeletonFinder.find()`` on a graph built with ``complete=True``
    over the fixture's columns and asserts that ``('x', 'y')`` is still
    adjacent and that the returned mapping equals ``{}``.
    """
    data = df_X_causes_Y(size=1000)
    graph = Graph(variables=list(data.columns), complete=True)

    finder = PCSkeletonFinder(data=data, graph=graph)
    cond_sets = finder.find()

    assert graph.has_adjacency(('x', 'y'))
    assert cond_sets == {}
def test_2_deterministic_and_3rd_var_caused_by_one_of_them(
        df_2_deterministic_and_3rd_var_caused_by_one_of_them):
    """Expect x-y to stay adjacent and the node set to be exactly {x, y, z}.

    The value returned by ``PCSkeletonFinder.find()`` is not inspected here;
    only the graph passed to the finder is checked afterwards.
    """
    make_df = df_2_deterministic_and_3rd_var_caused_by_one_of_them
    data = make_df(size=1000)
    graph = Graph(variables=list(data.columns), complete=True)

    PCSkeletonFinder(data=data, graph=graph).find()

    expected_nodes = {'x', 'y', 'z'}

    assert graph.has_adjacency(('x', 'y'))
    assert graph.get_nodes() == expected_nodes  # pylint: disable='no-member'
def test_skeleton_finder_Z_causes_X_and_Y(df_Z_causes_X_and_Y):
    """Expect x-z and y-z adjacencies, with {z} recorded for the pair x, y.

    Runs ``PCSkeletonFinder.find()`` on a graph built with ``complete=True``
    over the fixture's columns.  The returned mapping must contain exactly
    one entry, ``'x _||_ y'``, whose only conditioning set is ``{'z'}``.
    """
    data = df_Z_causes_X_and_Y(size=1000)
    graph = Graph(variables=list(data.columns), complete=True)

    finder = PCSkeletonFinder(data=data, graph=graph)
    cond_sets = finder.find()

    for pair in (('x', 'z'), ('y', 'z')):
        assert graph.has_adjacency(pair)

    expected_cond_sets = {'x _||_ y': {frozenset({'z'})}}
    assert cond_sets == expected_cond_sets
def test_3_multinom_RVs_MAR(df_Z_causes_X_Y_and_X_Z_causes_MI_Y):
    """Expect x-z and y-z adjacencies after skeleton finding on the fixture data.

    Also asserts that the graph's nodes overlap with ``{'x', 'y', 'MI_x'}``.
    The value returned by ``PCSkeletonFinder.find()`` is not inspected.
    """
    n_rows = 70000
    data = df_Z_causes_X_Y_and_X_Z_causes_MI_Y(size=n_rows)

    graph = Graph(variables=list(data.columns), complete=True)
    PCSkeletonFinder(data=data, graph=graph).find()

    # NOTE(review): this only requires a non-empty overlap, i.e. at least one
    # of the three names is a node.  If all of them are meant to be present
    # this should be a subset check -- and the fixture name suggests the
    # indicator column may be 'MI_y' rather than 'MI_x'; confirm.
    nodes_of_interest = {'x', 'y', 'MI_x'}
    assert set(graph.get_nodes()) & nodes_of_interest  # pylint: disable='no-member'

    for pair in (('x', 'z'), ('y', 'z')):
        assert graph.has_adjacency(pair)
def test_chain_and_collider_without_MI(df_chain_and_collider_without_MI):
    """Expect adjacencies a-b, b-c, a-d and c-d after skeleton finding.

    The value returned by ``PCSkeletonFinder.find()`` is not inspected; only
    the graph passed to the finder is checked.
    """
    n_rows = 10000
    data = df_chain_and_collider_without_MI(size=n_rows)

    graph = Graph(variables=list(data.columns), complete=True)
    PCSkeletonFinder(data=data, graph=graph).find()

    expected_adjacencies = (
        ('a', 'b'),
        ('b', 'c'),
        ('a', 'd'),
        ('c', 'd'),
    )
    for pair in expected_adjacencies:
        assert graph.has_adjacency(pair)
def test_chain_and_collider_with_MI(df_chain_and_collider_with_MI):
    """Expect a-c, a-b, b-c, a-d and c-d to be adjacent after skeleton finding.

    The value returned by ``PCSkeletonFinder.find()`` is not inspected; only
    the graph passed to the finder is checked.
    """
    n_rows = 20000
    data = df_chain_and_collider_with_MI(size=n_rows)

    graph = Graph(variables=list(data.columns), complete=True)
    PCSkeletonFinder(data=data, graph=graph).find()

    # The a-c edge is expected at this intermediate stage even though it is
    # spurious: it is an artefact of collider bias.
    expected_adjacencies = (
        ('a', 'c'),
        ('a', 'b'),
        ('b', 'c'),
        ('a', 'd'),
        ('c', 'd'),
    )
    for pair in expected_adjacencies:
        assert graph.has_adjacency(pair)
def test_long_chains_collider_bias_with_MI(
        df_long_chains_and_collider_with_MI):
    """Expect a-b, b-d, d-e, b-c and c-d to be adjacent after skeleton finding.

    The fixture is called with ``proba_noise=0.7``.  The value returned by
    ``PCSkeletonFinder.find()`` is not inspected; only the graph passed to
    the finder is checked.
    """
    n_rows = 10000
    data = df_long_chains_and_collider_with_MI(size=n_rows, proba_noise=0.7)

    graph = Graph(variables=list(data.columns), complete=True)
    PCSkeletonFinder(data=data, graph=graph).find()

    # The b-d edge is expected at this intermediate stage even though it is
    # spurious: it is an artefact of collider bias.
    expected_adjacencies = (
        ('a', 'b'),
        ('b', 'd'),
        ('d', 'e'),
        ('b', 'c'),
        ('c', 'd'),
    )
    for pair in expected_adjacencies:
        assert graph.has_adjacency(pair)
def test_dog_pee():
    """Simulate five binary variables and check the skeleton found for them.

    The data is drawn with ``np.random.binomial`` (no seed is set here), put
    into a DataFrame and handed to ``PCSkeletonFinder`` together with a graph
    built with ``complete=True``.  Five adjacencies are then asserted on the
    graph.
    """
    n_rows = 100000

    def bernoulli(p):
        # One 0/1 draw per row with success probability ``p``.
        return np.random.binomial(n=1, p=p, size=n_rows)

    # It is cloudy about half of the time.
    cloudy = bernoulli(0.5)

    # Cloudiness makes rain likely, but it occasionally rains without clouds.
    rain = cloudy * bernoulli(0.7) + (1 - cloudy) * bernoulli(0.1)

    # The sprinkler mostly runs when it is not cloudy.
    sprinkler = (cloudy == 0) * bernoulli(0.8) + cloudy * bernoulli(0.1)

    # The grass is usually wet if it rained or the sprinkler was on.
    rained_or_sprinkled = rain | sprinkler
    wet_grass = rained_or_sprinkled * bernoulli(0.90)

    # The dog dislikes getting rained on, so it goes out far more often when
    # it is dry.
    dog_goes_out_to_pee = rain * bernoulli(0.2) + (1 - rain) * bernoulli(0.9)

    columns = {
        'cloudy': cloudy,
        'sprinkler': sprinkler,
        'rain': rain,
        'wet_grass': wet_grass,
        'dog_goes_out_to_pee': dog_goes_out_to_pee,
    }
    data = pd.DataFrame(columns)

    graph = Graph(variables=list(data.columns), complete=True)
    PCSkeletonFinder(data=data, graph=graph).find()

    expected_adjacencies = (
        ('cloudy', 'rain'),
        ('cloudy', 'sprinkler'),
        ('rain', 'dog_goes_out_to_pee'),
        ('rain', 'wet_grass'),
        ('sprinkler', 'wet_grass'),
    )
    for pair in expected_adjacencies:
        assert graph.has_adjacency(pair)
Ejemplo n.º 11
0
def test_dog_example():
    """Check conditioning sets and undirected edges for the ``dog_example`` data.

    ``PCSkeletonFinder.find()`` returns a graph and a ``cond_sets`` object;
    both are then passed to ``FindMoreCondIndeps(...).find()``.  Afterwards
    the test asserts that two specific conditioning sets are present for
    their variable pairs and that the graph's undirected edges equal the
    expected ten edges exactly.
    """
    data = dog_example(size=100000)

    finder = PCSkeletonFinder(data=data)
    graph, cond_sets = finder.find()

    more_indeps = FindMoreCondIndeps(
        data=data, graph=graph, cond_sets=cond_sets)
    more_indeps.find()

    exercise_only = {frozenset({'exercise_levels'})}
    found_for_activity = cond_sets.get('activity', 'dog_tired')
    assert len(exercise_only.intersection(found_for_activity)) == 1

    visit_and_activity = {frozenset({'best_friends_visit', 'activity'})}
    found_for_weekend = cond_sets.get(
        'weekend', 'mentally_exhausted_before_bed')
    assert len(visit_and_activity.intersection(found_for_weekend)) == 1

    expected_pairs = (
        ('rain', 'best_friends_visit'),
        ('weekend', 'best_friends_visit'),
        ('rain', 'activity'),
        ('exercise_levels', 'best_friends_visit'),
        ('exercise_levels', 'activity'),
        ('mentally_exhausted_before_bed', 'activity'),
        ('exercise_levels', 'dog_tired'),
        ('best_friends_visit', 'mentally_exhausted_before_bed'),
        ('mentally_exhausted_before_bed', 'dog_teeth_brushed'),
        ('dog_tired', 'dog_teeth_brushed'),
    )
    expected_edges = frozenset(frozenset(pair) for pair in expected_pairs)

    assert graph.get_undirected_edges() == expected_edges
Ejemplo n.º 12
0
    def predict(self, debug=False):
        """Run the full pipeline on ``self.data`` and return the resulting graph.

        Stages, in order:

        1. ``PCSkeletonFinder`` produces the graph and the conditioning sets.
        2. ``DirectCausesOfMissingnessFinder`` yields marked arrows, which
           are added to the graph.
        3. ``PotentiallyExtraneousEdgesFinder`` and ``RemovableEdgesFinder``
           decide which undirected edges to remove from the graph.
        4. ``ImmoralitiesFinder`` yields arrowheads, which are added.
        5. ``RecursiveEdgeOrienter`` orients the remaining edges.

        ``self.debug_info`` is reset to an empty list and receives a snapshot
        (a copy of the graph, plus a copy of the conditioning-set dict after
        stages 1 and 3) once each of the five stages has finished.

        Args:
            debug: Not read anywhere in this method; snapshots are recorded
                regardless of its value.

        Returns:
            The graph object returned by the skeleton finder, after the later
            stages have modified it.
        """
        log = setup_logging()

        self.debug_info = []

        def record(stage_name, with_cond_sets=False):
            # Append a snapshot of the current state to ``self.debug_info``.
            entry = {'name': stage_name, 'graph': graph.copy()}
            if with_cond_sets:
                entry['cond_sets'] = dict(cond_sets.dict)
            self.debug_info.append(entry)

        log.info('Finding skeleton...')

        graph, cond_sets = PCSkeletonFinder(
            data=self.data,
            cond_indep_test=self.cond_indep_test,
        ).find()

        record('after skeleton finding', with_cond_sets=True)

        log.info(
            'Done finding skeleton. Now Finding direct causes of missingness...'
        )

        missingness_finder = DirectCausesOfMissingnessFinder(
            data=self.data,
            graph=graph,
            missingness_indicator_prefix=self.missingness_indicator_prefix,
            cond_indep_test=self.cond_indep_test,
        )
        marked_arrows = missingness_finder.find()
        graph.add_marked_arrows(marked_arrows)

        record('after adding direct causes of missingness')

        log.info(
            'Done finding direct causes of missingness. Now finding potential edges to remove...'
        )

        extraneous_finder = PotentiallyExtraneousEdgesFinder(
            marked_pattern_graph=graph)
        potentially_extraneous_edges = extraneous_finder.find()

        log.info(
            'Done finding potentially extraneous edges to remove. Now attempting to remove edges...'
        )

        removable_finder = RemovableEdgesFinder(
            data=self.data,
            data_correction=DensityRatioWeightedCorrection,
            potentially_extraneous_edges=potentially_extraneous_edges,
            cond_sets=cond_sets,
            graph=graph,
            missingness_indicator_prefix=self.missingness_indicator_prefix,
            cond_indep_test=self.cond_indep_test,
        )
        edges_to_remove = removable_finder.find()
        graph.remove_undirected_edges(edges_to_remove)

        record('after removing undirected edges', with_cond_sets=True)

        log.info(
            'Done removing extraneous edges. Now finding immoralities...')

        immoralities_finder = ImmoralitiesFinder(
            marked_pattern_graph=graph, cond_sets=cond_sets)
        immoralities = immoralities_finder.find()
        graph.add_arrowheads(immoralities)

        record('after adding immoralities')

        log.info(
            'Done finding immoralities. Now recursively orienting edges...')

        orienter = RecursiveEdgeOrienter(marked_pattern_graph=graph)
        orienter.orient()

        record('after recursively orienting edges')

        log.info('Done recursively orienting edges!')

        return graph