def test_dog_example():
    """Run the PC skeleton finder on the dog example and check its output.

    Starts from a complete graph over the data's columns, runs
    ``PCSkeletonFinder.find()``, then checks two entries of the returned
    conditioning-set mapping and a fixed list of adjacencies on the graph.
    """
    data = dog_example(size=100000)
    graph = Graph(variables=list(data.columns), complete=True)

    separating_sets = PCSkeletonFinder(data=data, graph=graph).find()

    # {exercise_levels} must be among the sets recorded for
    # activity / dog_tired.
    exercise_only = {frozenset({'exercise_levels'})}
    activity_vs_tired = separating_sets[
        key_for_pair(('activity', 'dog_tired'))
    ]
    assert activity_vs_tired.intersection(exercise_only) == exercise_only

    # NOTE(review): the left operand is a *set containing* a frozenset, so
    # this checks that the wrapped set itself is not an element of the
    # collection. If the intent was to check the frozenset directly, this
    # assertion may be vacuous -- confirm before changing it.
    visit_and_activity = {frozenset({'best_friends_visit', 'activity'})}
    assert visit_and_activity not in separating_sets[
        key_for_pair(('weekend', 'mentally_exhausted_before_bed'))
    ]

    # Adjacencies that must be present on the graph after the search.
    expected_adjacencies = [
        ('rain', 'best_friends_visit'),
        ('weekend', 'best_friends_visit'),
        ('rain', 'activity'),
        ('exercise_levels', 'best_friends_visit'),
        ('exercise_levels', 'activity'),
        ('mentally_exhausted_before_bed', 'activity'),
        ('exercise_levels', 'dog_tired'),
        ('best_friends_visit', 'mentally_exhausted_before_bed'),
        ('mentally_exhausted_before_bed', 'dog_teeth_brushed'),
        ('dog_tired', 'dog_teeth_brushed'),
    ]
    for pair in expected_adjacencies:
        assert graph.has_adjacency(pair)
Example #2
0
def test_long_chains_and_collider_with_MI(df_long_chains_and_collider_with_MI):
    """Check that only the b-d edge is reported as removable.

    The graph has five undirected edges plus a marked arrow c -> MI_b.
    Three of the edges (d-b, d-c, b-c) are offered to the finder as
    potentially extraneous.
    """
    df = df_long_chains_and_collider_with_MI(size=1000, proba_noise=0.6)

    all_edges = {
        frozenset(pair)
        for pair in (
            ('b', 'a'),
            ('d', 'e'),
            ('d', 'c'),
            ('b', 'c'),
            ('d', 'b'),
        )
    }
    candidate_edges = {
        frozenset(pair)
        for pair in (('d', 'b'), ('d', 'c'), ('b', 'c'))
    }

    # The missingness indicator MI_b is added to the node list explicitly.
    graph = MarkedPatternGraph(
        nodes=list(set(df.columns) | {'MI_b'}),
        undirected_edges=all_edges,
        marked_arrows=[('c', 'MI_b')],
    )

    cond_sets = ConditioningSets()

    removables = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges=candidate_edges,
        data_correction=DensityRatioWeightedCorrection,
    ).find()

    # The finder must have recorded at least one conditioning set for b/d.
    assert cond_sets[key_for_pair(('b', 'd'))] != set()

    assert set(removables) == {frozenset({'b', 'd'})}
Example #3
0
    def find(self):
        """
            Build the skeleton by testing every unordered pair of columns.

            For each pair drawn from ``self.orig_cols``, every other column
            is offered as a candidate conditioning variable to
            ``conditioning_sets_satisfying_conditional_independence``. If
            that search comes back empty, the pair is kept as an undirected
            edge; otherwise the sets found are recorded for the pair.

            Returns:
                marked_pattern: MarkedPatternGraph
                    Built over ``self.data.columns`` with the undirected
                    edges that were kept. It can be used for later steps,
                    such as finding immoralities.

                cond_sets_satisfying_cond_indep: dict
                    key: the result of ``key_for_pair`` for the two variable
                        names (documented upstream as the two names
                        delimited by " _||_ ").
                    value: whatever the conditioning-set search returned for
                        that pair (documented upstream as a collection of
                        sets of variable names).
        """
        kept_edges = []
        separating_sets_by_pair = {}

        for first, second in combinations(self.orig_cols, 2):
            # Every column except the pair itself may be conditioned on.
            candidates = set(self.orig_cols).difference((first, second))

            found = conditioning_sets_satisfying_conditional_independence(
                data=self.data,
                var_name_1=first,
                var_name_2=second,
                cond_indep_test=self.cond_indep_test,
                possible_conditioning_set_vars=candidates,
                only_find_one=self.only_find_one,
                max_depth=self.max_depth)

            if len(found) > 0:
                # Some set separates the pair, so no edge is kept.
                separating_sets_by_pair[key_for_pair([first, second])] = found
                continue

            kept_edges.append(frozenset((first, second)))

        marked_pattern = MarkedPatternGraph(
            nodes=list(self.data.columns),
            undirected_edges=kept_edges,
        )

        return marked_pattern, separating_sets_by_pair
Example #4
0
def test_chain_and_collider_with_MI(
    df_chain_and_collider_with_MI
):
    """Check that the spurious a-c edge is reported as removable.

    The graph contains the edges a-c, a-b, b-c, a-d and c-d plus a marked
    arrow d -> MI_a. Only a-c is offered to the finder as potentially
    extraneous, and it is expected to be the single removable edge.
    """
    size = 10000

    df = df_chain_and_collider_with_MI(size=size)

    cond_sets = ConditioningSets()

    graph = MarkedPatternGraph(
        # Register the missingness indicator that the marked arrow below
        # points to. This used to add 'MI_y', which matches no variable in
        # this test and left 'MI_a' unregistered.
        nodes=list(set(df.columns).union(set({'MI_a'}))),
        undirected_edges=set({
            frozenset({'a', 'c'}), # extraneous edge
            frozenset({'a', 'b'}),
            frozenset({'b', 'c'}),
            frozenset({'a', 'd'}),
            frozenset({'c', 'd'}),
        }),
        marked_arrows=[
            ('d', 'MI_a')
        ]
    )

    # a-c is present at this intermediate stage but is spurious, due to
    # collider bias; the finder should flag it as removable.
    finder = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges=set({
            frozenset({'a', 'c'}),
        }),
        data_correction=DensityRatioWeightedCorrection,
    )

    removables = finder.find()

    # The finder must have recorded at least one conditioning set for a/c.
    assert cond_sets[key_for_pair(('a', 'c'))] != set({})

    assert set(removables) == set({ frozenset({'a', 'c'}) })
Example #5
0
def test_3_multinom_RVs_MAR(
    df_Z_causes_X_Y_and_X_Z_causes_MI_Y
):
    """Check that the x-y edge is reported as removable.

    The graph has undirected edges x-y, x-z and z-y, and marked arrows from
    both x and z into MI_y. Only x-y is offered as potentially extraneous.
    """
    df = df_Z_causes_X_Y_and_X_Z_causes_MI_Y(size=1000)

    xy_edge = frozenset({'x', 'y'})  # the extraneous edge under test

    # The missingness indicator MI_y is added to the node list explicitly.
    graph = MarkedPatternGraph(
        nodes=list(set(df.columns) | {'MI_y'}),
        undirected_edges={
            xy_edge,
            frozenset({'x', 'z'}),
            frozenset({'z', 'y'}),
        },
        marked_arrows=[('x', 'MI_y'), ('z', 'MI_y')],
    )

    cond_sets = ConditioningSets()

    removables = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges={xy_edge},
        data_correction=DensityRatioWeightedCorrection,
    ).find()

    assert set(removables) == {xy_edge}

    # The finder must have recorded at least one conditioning set for x/y.
    assert cond_sets[key_for_pair(('x', 'y'))] != set()
Example #6
0
def test_cond_on_collider(df_X_and_Y_cause_Z_and_Z_cause_MI_X):
    """Check that the extraneous x-y edge is reported as removable.

    The graph is a triangle over x, y and z with a marked arrow z -> MI_x.
    Only x-y is offered to the finder as potentially extraneous.
    """
    df = df_X_and_Y_cause_Z_and_Z_cause_MI_X(size=2000)

    cond_sets = ConditioningSets()

    triangle_edges = [{'x', 'y'}, {'x', 'z'}, {'y', 'z'}]
    graph = MarkedPatternGraph(
        nodes=['x', 'y', 'z', 'MI_x'],
        undirected_edges=triangle_edges,
        marked_arrows=[('z', 'MI_x')],
    )

    removables = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges=[{'x', 'y'}],
        data_correction=DensityRatioWeightedCorrection,
    ).find()

    assert removables == [{'x', 'y'}]
    # The finder must have recorded at least one conditioning set for x/y.
    assert cond_sets[key_for_pair(('x', 'y'))] != set()
Example #7
0
    def find(self):
        """
            Go through each pair of variables.
            For each pair, find a conditioning set that renders the two variables
            independent.

            The search runs in rounds of increasing ``depth`` (the size of
            the conditioning set tried), starting at 0, for as long as
            ``self._depth_not_greater_than_num_adj_nodes_per_var(depth)``
            holds. Every conditioning set that passes
            ``self.cond_indep_test`` is recorded through
            ``self.cond_sets.add``.

            When the data contains missing values and
            ``self._has_common_neighbor_not_immoral`` reports True for the
            pair, the test is run on data produced by
            ``DensityRatioWeightedCorrection(...).correct()`` instead of on
            ``self.data``.

            NOTE(review): no ``return`` statement is visible in this part of
            the method; confirm what callers receive.
        """
        depth = 0

        # NOTE(review): the value returned here is not used in the visible
        # part of this method; the call may be kept for its side effects.
        logging = setup_logging()

        nodes = self.graph.get_observable_nodes()

        unmarked_arrows = self.graph.get_unmarked_arrows()
        # True when at least one cell anywhere in the data is null.
        has_missing_data = self.data.isnull().sum().sum() > 0

        while self._depth_not_greater_than_num_adj_nodes_per_var(depth):
            # Pairs already handled in this round; reset for every depth.
            visited = {}
            for node_1, node_2 in combinations(nodes, 2):
                node_1_neighbors = self.graph.get_neighbors(node_1)
                node_2_neighbors = self.graph.get_neighbors(node_2)

                _neighbors = node_1_neighbors.union(node_2_neighbors)
                if key_for_pair((node_1, node_2)) in visited:
                    continue

                # Skip the pair when either node appears in the combined
                # neighbor set (presumably: the two nodes are adjacent to
                # each other -- confirm against get_neighbors).
                if len(_neighbors.intersection(set({node_1, node_2}))) > 0:
                    continue

                if node_1 == node_2:
                    continue

                # Skip pairs the graph reports as not connected by a path.
                if not self.graph.has_path((node_1, node_2)):
                    continue

                # Candidate conditioning variables: neighbors of either node,
                # excluding the pair itself.
                neighbors = _neighbors - set({node_1, node_2})

                # Try every conditioning set of exactly `depth` candidates.
                for conditionable in combinations(neighbors, depth):
                    # TODO: it's not just about having missing data; we care
                    # about having missing data that is directly associated to
                    # one of the variables
                    if has_missing_data and self._has_common_neighbor_not_immoral(
                        node_1,
                        node_2,
                        node_1_neighbors,
                        node_2_neighbors,
                        unmarked_arrows
                    ):
                        var_names = neighbors.union(set({node_1, node_2}))

                        # NOTE(review): none of the inputs to this correction
                        # depend on `conditionable`, yet it is rebuilt for
                        # every conditioning set -- confirm whether it could
                        # be computed once per pair.
                        _data = DensityRatioWeightedCorrection(
                            data=self.data,
                            var_names=var_names,
                            graph=self.graph,
                            missingness_indicator_prefix='MI_'
                        ).correct()
                    else:
                        _data = self.data

                    # A truthy test result means this set is recorded for
                    # the pair.
                    if self.cond_indep_test(
                        _data,
                        vars_1=[node_1],
                        vars_2=[node_2],
                        conditioning_set=list(conditionable)
                    ):
                        self.cond_sets.add(node_1, node_2, conditionable)

                visited[key_for_pair((node_1, node_2))] = True

            depth += 1