Example #1
0
    def __init__(self, graph, estimand_type,
            method_name = "default",
            proceed_when_unidentifiable=None):
        '''
        Class to perform identification using the ID algorithm.

        :param self: instance of the IDIdentifier class.
        :param estimand_type: Type of estimand ("nonparametric-ate", "nonparametric-nde" or "nonparametric-nie").
        :param method_name: Identification method ("id-algorithm" in this case).
        :param proceed_when_unidentifiable: If True, proceed with identification even in the presence of unobserved/missing variables.
        '''

        super().__init__(graph, estimand_type, method_name, proceed_when_unidentifiable)

        # The ID algorithm is only implemented for the average treatment effect.
        if self.estimand_type != CausalIdentifier.NONPARAMETRIC_ATE:
            raise Exception("The estimand type should be 'non-parametric ate' for the ID method type.")

        self._treatment_names = OrderedSet(parse_state(graph.treatment_name))
        self._outcome_names = OrderedSet(parse_state(graph.outcome_name))
        self._adjacency_matrix = graph.get_adjacency_matrix()

        try:
            self._tsort_node_names = OrderedSet(list(nx.topological_sort(graph._graph))) # topological sorting of graph nodes
        except nx.NetworkXError as e:
            # topological_sort raises NetworkXUnfeasible (a NetworkXError
            # subclass) for cyclic graphs and NetworkXError for undirected
            # ones; a bare `except:` here would also swallow KeyboardInterrupt.
            raise Exception("The graph must be a directed acyclic graph (DAG).") from e
        self._node_names = OrderedSet(graph._graph.nodes)
Example #2
0
    def identify_mediation(self):
        """ Find a valid mediator if it exists.

        Currently only supports a single variable mediator set.

        :returns: the mediator wrapped by parse_state; parse_state(None) when
            no valid mediator is found.
        """
        mediation_var = None
        # All directed treatment -> outcome paths; a mediator must lie on them.
        mediation_paths = self._graph.get_all_directed_paths(
            self.treatment_name, self.outcome_name)
        # Candidates: descendants of the treatment, excluding the outcome itself.
        eligible_variables = self._graph.get_descendants(self.treatment_name) \
            - set(self.outcome_name)
        # For simplicity, assuming a one-variable mediation set
        for candidate_var in eligible_variables:
            is_valid_mediation = self._graph.check_valid_mediation_set(
                self.treatment_name,
                self.outcome_name,
                parse_state(candidate_var),
                mediation_paths=mediation_paths)
            self.logger.debug(
                "Candidate mediation set: {0}, on_mediating_path: {1}".format(
                    candidate_var, is_valid_mediation))
            if is_valid_mediation:
                mediation_var = candidate_var
                break
        # NOTE: removed an unreachable `return None` that followed this return.
        return parse_state(mediation_var)
Example #3
0
 def __init__(self,
              identifier,
              treatment_variable,
              outcome_variable,
              estimand_type=None,
              estimands=None,
              backdoor_variables=None,
              instrumental_variables=None,
              frontdoor_variables=None,
              mediator_variables=None,
              mediation_first_stage_confounders=None,
              mediation_second_stage_confounders=None,
              default_backdoor_id=None,
              identifier_method=None):
     """Container for the variable sets produced by an identification run."""
     # Identification bookkeeping
     self.identifier = identifier
     self.estimand_type = estimand_type
     self.estimands = estimands
     self.default_backdoor_id = default_backdoor_id
     self.identifier_method = identifier_method
     # Treatment/outcome names are normalized to list form
     self.treatment_variable = parse_state(treatment_variable)
     self.outcome_variable = parse_state(outcome_variable)
     # Backdoor variables are stored as given (may hold multiple named sets,
     # selected via default_backdoor_id); the others are normalized lists.
     self.backdoor_variables = backdoor_variables
     self.instrumental_variables = parse_state(instrumental_variables)
     self.frontdoor_variables = parse_state(frontdoor_variables)
     self.mediator_variables = parse_state(mediator_variables)
     # Confounder sets for the two stages of mediation analysis
     self.mediation_first_stage_confounders = mediation_first_stage_confounders
     self.mediation_second_stage_confounders = mediation_second_stage_confounders
    def test_2(self):
        """All T->Y paths leave T via outgoing edges, so no backdoor set exists."""
        treatment = "T"
        outcome = "Y"
        covariates = ["X1", "X2"]
        dot_graph = "digraph{T->X1;T->X2;X1->X2;X2->Y;T->Y}"

        # An empty dataframe with one column per variable suffices here.
        all_columns = list(treatment) + list(outcome) + list(covariates)
        data = pd.DataFrame(columns=all_columns)

        treatment_name = parse_state(treatment)
        outcome_name = parse_state(outcome)

        # Build the causal model and an identifier over its graph
        model = CausalModel(data, treatment, outcome, graph=dot_graph)
        identifier = CausalIdentifier(model._graph,
                                      estimand_type=None,
                                      method_name="default",
                                      proceed_when_unidentifiable=None)

        # Enumerate backdoor sets; the graph has no backdoor path from T to Y
        path = Backdoor(identifier._graph._graph, treatment_name, outcome_name)
        backdoor_sets = path.get_backdoor_vars()

        assert len(backdoor_sets) == 0
Example #5
0
    def get_instruments(self, treatment_nodes, outcome_nodes):
        """Return candidate instrumental variables for the given treatments.

        A candidate is a parent of some treatment that (a) does not reach the
        outcome except through the treatment (exclusion) and (b) is not a
        descendant of any remaining cause of the outcome (as-if-random).
        """
        treatment_nodes = parse_state(treatment_nodes)
        outcome_nodes = parse_state(outcome_nodes)

        # Union of parents over all treatment nodes
        parents_treatment = set()
        for t_node in treatment_nodes:
            parents_treatment.update(self.get_parents(t_node))

        # Remove incoming edges to treatments: any ancestor of the outcome in
        # this surgered graph influences it through a non-treatment path.
        g_no_parents_treatment = self.do_surgery(treatment_nodes,
                                                 remove_incoming_edges=True)
        ancestors_outcome = set()
        for o_node in outcome_nodes:
            ancestors_outcome.update(
                nx.ancestors(g_no_parents_treatment, o_node))

        # [TODO: double check these work with multivariate implementation:]
        # Exclusion
        candidate_instruments = parents_treatment - ancestors_outcome
        self.logger.debug("Candidate instruments after exclusion %s",
                          candidate_instruments)

        # As-if-random setup: everything descended from a cause of the outcome
        children_causes_outcome = {
            descendant
            for cause in ancestors_outcome
            for descendant in nx.descendants(g_no_parents_treatment, cause)
        }

        # As-if-random
        return list(candidate_instruments - children_causes_outcome)
Example #6
0
    def __init__(self,
                 treatment_name, outcome_name,
                 graph=None,
                 common_cause_names=None,
                 instrument_names=None,
                 effect_modifier_names=None,
                 observed_node_names=None,
                 missing_nodes_as_confounders=False):
        """Build the internal causal graph as a networkx DiGraph.

        The graph is either constructed from the provided variable lists
        (when ``graph`` is None), loaded from a ".dot"/".gml" file, or parsed
        from an inline DOT ("digraph{...}") or GML ("graph [...]") string.

        :param treatment_name: name(s) of the treatment variable(s).
        :param outcome_name: name(s) of the outcome variable(s).
        :param graph: None, a path to a .dot/.gml file, or an inline DOT/GML string.
        :param common_cause_names: common cause names; used only when graph is None.
        :param instrument_names: instrument names; used only when graph is None.
        :param effect_modifier_names: names of effect modifier variables.
        :param observed_node_names: nodes observed in the data; used to tag node
            attributes and to add unobserved common causes.
        :param missing_nodes_as_confounders: if True, observed nodes missing from
            the graph are added as common causes.
        :raises ValueError: if the graph string is in an unsupported format.
        """
        self.treatment_name = parse_state(treatment_name)
        self.outcome_name = parse_state(outcome_name)
        instrument_names = parse_state(instrument_names)
        common_cause_names = parse_state(common_cause_names)
        self.logger = logging.getLogger(__name__)

        if graph is None:
            # No graph supplied: construct one from the variable lists.
            self._graph = nx.DiGraph()
            self._graph = self.build_graph(common_cause_names,
                                           instrument_names, effect_modifier_names)
        elif re.match(r".*\.dot", graph):
            # load dot file
            try:
                import pygraphviz as pgv
                self._graph = nx.DiGraph(nx.drawing.nx_agraph.read_dot(graph))
            except Exception as e:
                self.logger.error("Pygraphviz cannot be loaded. " + str(e) + "\nTrying pydot...")
                try:
                    import pydot
                    self._graph = nx.DiGraph(nx.drawing.nx_pydot.read_dot(graph))
                except Exception as e:
                    self.logger.error("Error: Pydot cannot be loaded. " + str(e))
                    raise e
        elif re.match(r".*\.gml", graph):
            # load gml file
            self._graph = nx.DiGraph(nx.read_gml(graph))
        elif re.match(r".*graph\s*\{.*\}\s*", graph):
            # inline DOT string
            try:
                import pygraphviz as pgv
                self._graph = pgv.AGraph(graph, strict=True, directed=True)
                self._graph = nx.drawing.nx_agraph.from_agraph(self._graph)
            except Exception as e:
                self.logger.error("Error: Pygraphviz cannot be loaded. " + str(e) + "\nTrying pydot ...")
                try:
                    import pydot
                    P_list = pydot.graph_from_dot_data(graph)
                    self._graph = nx.drawing.nx_pydot.from_pydot(P_list[0])
                except Exception as e:
                    self.logger.error("Error: Pydot cannot be loaded. " + str(e))
                    raise e
        elif re.match(r".*graph\s*\[.*\]\s*", graph):
            # inline GML string; the raw-string prefix fixes the invalid "\s"
            # escape that non-raw regex literals warn about on modern Python.
            self._graph = nx.DiGraph(nx.parse_gml(graph))
        else:
            self.logger.error("Error: Please provide graph (as string or text file) in dot or gml format.")
            self.logger.error("Error: Incorrect graph format")
            raise ValueError("Incorrect graph format: expected dot or gml")
        if missing_nodes_as_confounders:
            self._graph = self.add_missing_nodes_as_common_causes(observed_node_names)
        # Adding node attributes
        self._graph = self.add_node_attributes(observed_node_names)
        #TODO do not add it here. CausalIdentifier should call causal_graph to add an unobserved common cause if needed. This also ensures that we do not need get_common_causes in this class.
        self._graph = self.add_unobserved_common_cause(observed_node_names)
 def __init__(self, treatment_variable, outcome_variable,
              estimand_type=None, estimands=None,
              backdoor_variables=None, instrumental_variables=None):
     """Container for an identified estimand and its candidate variable sets."""
     # Estimand bookkeeping; the identification method is filled in later.
     self.estimand_type = estimand_type
     self.estimands = estimands
     self.identifier_method = None
     # Variable arguments are normalized to lists of names.
     self.treatment_variable = parse_state(treatment_variable)
     self.outcome_variable = parse_state(outcome_variable)
     self.backdoor_variables = parse_state(backdoor_variables)
     self.instrumental_variables = parse_state(instrumental_variables)
Example #8
0
 def get_common_causes(self, nodes1, nodes2):
     """Return the nodes that are ancestors of both node sets."""
     nodes1 = parse_state(nodes1)
     nodes2 = parse_state(nodes2)
     ancestors_of_first = set()
     for node in nodes1:
         ancestors_of_first.update(self.get_ancestors(node))
     ancestors_of_second = set()
     for node in nodes2:
         ancestors_of_second.update(self.get_ancestors(node))
     # Common causes are exactly the shared ancestors.
     return list(ancestors_of_first & ancestors_of_second)
Example #9
0
 def get_causes(self, nodes, remove_edges = None):
     """Return the set of all ancestors (causes) of the given nodes.

     :param remove_edges: optional dict with "sources" and "targets" keys;
         every (source, target) edge is removed from a copy of the graph
         before the ancestors are computed.
     """
     nodes = parse_state(nodes)
     working_graph = None
     if remove_edges is not None:
         working_graph = self._graph.copy()  # caution: shallow copy of the attributes
         for src in parse_state(remove_edges["sources"]):
             for tgt in parse_state(remove_edges["targets"]):
                 working_graph.remove_edge(src, tgt)
     all_causes = set()
     for node in nodes:
         all_causes.update(self.get_ancestors(node, new_graph=working_graph))
     return all_causes
Example #10
0
    def __init__(self, *args, **kwargs):
        """Set up the IV estimator and validate the chosen instruments."""
        super().__init__(*args, **kwargs)
        # choosing the instrumental variable to use
        chosen_instrument = getattr(self, 'iv_instrument_name', None)
        if chosen_instrument is None:
            # Default: use all instruments found during identification.
            self.estimating_instrument_names = self._target_estimand.instrumental_variables
        else:
            self.estimating_instrument_names = parse_state(chosen_instrument)
        self.logger.debug("Instrumental Variables used:" +
                          ",".join(self.estimating_instrument_names))
        if not self.estimating_instrument_names:
            raise ValueError(
                "No valid instruments found. IV Method not applicable")
        # 2SLS requires at least one instrument per treatment.
        if len(self.estimating_instrument_names) < len(self._treatment_name):
            # TODO move this to the identification step
            raise ValueError(
                "Number of instruments fewer than number of treatments. 2SLS requires at least as many instruments as treatments."
            )
        self._estimating_instruments = self._data[
            self.estimating_instrument_names]
        self.logger.info("INFO: Using Instrumental Variable Estimator")

        self.symbolic_estimator = self.construct_symbolic_estimator(
            self._target_estimand)
        self.logger.info(self.symbolic_estimator)
Example #11
0
    def identify_frontdoor(self):
        """ Find a valid frontdoor variable if it exists.

        Currently only supports a single variable frontdoor set.
        """
        frontdoor_var = None
        # All directed treatment -> outcome paths; a frontdoor variable must intercept them.
        frontdoor_paths = self._graph.get_all_directed_paths(self.treatment_name, self.outcome_name)
        # Candidates: descendants of the treatment, excluding the outcome itself.
        eligible_variables = self._graph.get_descendants(self.treatment_name) \
            - set(self.outcome_name)
        # For simplicity, assuming a one-variable frontdoor set
        for candidate in eligible_variables:
            candidate_is_valid = self._graph.check_valid_frontdoor_set(
                self.treatment_name,
                self.outcome_name,
                parse_state(candidate),
                frontdoor_paths=frontdoor_paths)
            self.logger.debug("Candidate frontdoor set: {0}, is_dseparated: {1}".format(candidate, candidate_is_valid))
            if candidate_is_valid:
                frontdoor_var = candidate
                break
        return parse_state(frontdoor_var)
Example #12
0
 def get_common_causes(self, nodes1, nodes2):
     """
     Assume that nodes1 causes nodes2 (e.g., nodes1 are the treatments and nodes2 are the outcomes)
     """
     # TODO Refactor to remove this from here and only implement this logic in causalIdentifier. Unnecessary assumption of nodes1 to be causing nodes2.
     nodes1 = parse_state(nodes1)
     nodes2 = parse_state(nodes2)
     causes_1 = set()
     for node in nodes1:
         causes_1.update(self.get_ancestors(node))
     causes_2 = set()
     for node in nodes2:
         # Cannot simply compute ancestors, since that will also include nodes1 and its parents (e.g. instruments)
         for parent in self.get_parents(node):
             if parent not in nodes1:
                 causes_2.add(parent)
                 causes_2.update(self.get_ancestors(parent))
     return list(causes_1 & causes_2)
Example #13
0
 def get_common_causes(self, nodes1, nodes2):
     """
     Assume that nodes1 causes nodes2 (e.g., nodes1 are the treatments and nodes2 are the outcomes)
     """
     nodes1 = parse_state(nodes1)
     nodes2 = parse_state(nodes2)
     # Ancestors of every node in the first (treatment) set
     treatment_causes = set()
     for first_node in nodes1:
         treatment_causes.update(self.get_ancestors(first_node))
     # Cannot simply compute ancestors, since that will also include nodes1 and its parents (e.g. instruments)
     outcome_causes = set()
     for second_node in nodes2:
         eligible_parents = (p for p in self.get_parents(second_node)
                             if p not in nodes1)
         for parent in eligible_parents:
             outcome_causes.add(parent)
             outcome_causes.update(self.get_ancestors(parent))
     return list(treatment_causes.intersection(outcome_causes))
Example #14
0
    def _estimate_conditional_effects(self,
                                      estimate_effect_fn,
                                      effect_modifier_names=None,
                                      num_quantiles=None):
        """Estimate conditional treatment effects. Common method for all estimators that utilizes a specific estimate_effect_fn implemented by each child estimator.

        If a numeric effect modifier is provided, it is discretized into quantile bins. If you would like a custom discretization, you can do so yourself: create a new column containing the discretized effect modifier and then include that column's name in the effect_modifier_names argument.

        :param estimate_effect_fn: Function that has a single parameter (a data frame) and returns the treatment effect estimate on that data.
        :param effect_modifier_names: Names of effect modifier variables over which the conditional effects will be estimated. If not provided, defaults to the effect modifiers specified during creation of the CausalEstimator object.
        :param num_quantiles: The number of quantiles into which a numeric effect modifier variable is discretized. Does not affect any categorical effect modifiers.

        :returns: A (multi-index) dataframe that provides separate effects for each value of the (discretized) effect modifiers.
        """
        # Defaulting to class default values if parameters are not provided
        if effect_modifier_names is None:
            effect_modifier_names = self._effect_modifier_names
        if num_quantiles is None:
            num_quantiles = self.num_quantiles_to_discretize_cont_cols
        # Checking that there is at least one effect modifier
        if not effect_modifier_names:
            raise ValueError(
                "At least one effect modifier should be specified to compute conditional effects."
            )
        # Making sure that effect_modifier_names is a list
        effect_modifier_names = parse_state(effect_modifier_names)
        if not all(em in self._effect_modifier_names
                   for em in effect_modifier_names):
            # logger.warning: "warn" is a deprecated alias of "warning"
            self.logger.warning(
                "At least one of the provided effect modifiers was not included while fitting the estimator. You may get incorrect results. To resolve, fit the estimator again by providing the updated effect modifiers in estimate_effect()."
            )
        # Making a copy since we are going to be changing effect modifier names
        effect_modifier_names = effect_modifier_names.copy()
        prefix = CausalEstimator.TEMP_CAT_COLUMN_PREFIX
        # For every numeric effect modifier, adding a temp categorical column
        for i in range(len(effect_modifier_names)):
            em = effect_modifier_names[i]
            if pd.api.types.is_numeric_dtype(self._data[em].dtypes):
                self._data[prefix + str(em)] = pd.qcut(self._data[em],
                                                       num_quantiles,
                                                       duplicates="drop")
                effect_modifier_names[i] = prefix + str(em)
        # Grouping by effect modifiers and computing effect separately
        # (removed an unused `cond_est_fn` lambda that was never called)
        by_effect_mods = self._data.groupby(effect_modifier_names)
        conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
        # Deleting the temporary categorical columns
        for em in effect_modifier_names:
            if em.startswith(prefix):
                self._data.pop(em)
        return conditional_estimates
Example #15
0
    def test_1(self):
        """The backdoor set found must d-separate treatment and outcome."""
        treatment = "T"
        outcome = "Y"
        covariates = ["X1", "X2"]
        dot_graph = "digraph{X1->T;X2->T;X1->X2;X2->Y;T->Y}"

        # An empty dataframe with one column per variable suffices here.
        all_columns = list(treatment) + list(outcome) + list(covariates)
        data = pd.DataFrame(columns=all_columns)

        treatment_name = parse_state(treatment)
        outcome_name = parse_state(outcome)

        # Build the causal model and an identifier over its graph
        model = CausalModel(data, treatment, outcome, graph=dot_graph)
        identifier = CausalIdentifier(model._graph,
                                      estimand_type=None,
                                      method_name="default",
                                      proceed_when_unidentifiable=None)

        # Enumerate candidate backdoor sets
        path = Backdoor(identifier._graph._graph, treatment_name, outcome_name)
        backdoor_sets = path.get_backdoor_vars()
        print(backdoor_sets)
        # A valid backdoor set blocks every backdoor path between the
        # treatment and the outcome (checked with the naive d-separation algo).
        backdoor_paths = identifier._graph.get_backdoor_paths(
            treatment_name, outcome_name)
        candidate_set = set(backdoor_sets[0]['backdoor_set'])
        check = identifier._graph.check_valid_backdoor_set(
            treatment_name,
            outcome_name,
            candidate_set,
            backdoor_paths=backdoor_paths,
            dseparation_algo="naive")
        print(check)
        assert check["is_dseparated"]
Example #16
0
    def interpret(self, method_name=None, **kwargs):
        """Interpret the refutation results.

        :param method_name: Method used (string) or a list of methods. If None, then the default for the specific refuter is used.

        :returns: None

        """
        # Fall back to the refuter's own default interpretation method.
        if method_name is None:
            method_name = self.refuter.interpret_method
        import dowhy.interpreters as interpreters
        for method in parse_state(method_name):
            interpreter_class = interpreters.get_class_object(method)
            interpreter_class(self, **kwargs).interpret()
Example #17
0
 def do_surgery(self,
                node_names,
                remove_outgoing_edges=False,
                remove_incoming_edges=False):
     """Return a copy of the graph with edges into/out of the given nodes removed."""
     node_names = parse_state(node_names)
     surgered_graph = self._graph.copy()
     for name in node_names:
         if remove_outgoing_edges:
             # Materialize the edge list before removal to avoid mutating
             # the graph while iterating its successors.
             outgoing = [(name, child) for child in surgered_graph.successors(name)]
             surgered_graph.remove_edges_from(outgoing)
         if remove_incoming_edges:
             incoming = [(parent, name) for parent in surgered_graph.predecessors(name)]
             surgered_graph.remove_edges_from(incoming)
     return surgered_graph
Example #18
0
    def interpret(self, method_name=None, **kwargs):
        """Interpret the causal estimate.

        :param method_name: Method used (string) or a list of methods. If None, then the default for the specific estimator is used.
        :param kwargs:: Optional parameters that are directly passed to the interpreter method.

        :returns: None

        """
        # Fall back to the estimator's own default interpretation method.
        chosen = method_name if method_name is not None else self.estimator.interpret_method

        for method in parse_state(chosen):
            interpreter_class = interpreters.get_class_object(method)
            interpreter_class(self, **kwargs).interpret()
Example #19
0
 def __init__(self, *args, **kwargs):
     """Wrap an EconML estimator: prepare common causes, effect modifiers and instruments.

     For EconML metalearners, common causes that are not already effect
     modifiers are appended to the effect modifier list, since metalearners
     accept a single X argument.
     """
     super().__init__(*args, **kwargs)
     self.logger.info("INFO: Using EconML Estimator")
     self.identifier_method = self._target_estimand.identifier_method
     self._observed_common_causes_names = self._target_estimand.get_backdoor_variables().copy()
     # For metalearners only--issue a warning if w contains variables not in x
     (module_name, _, class_name) = self._econml_methodname.rpartition(".")
     if module_name.endswith("metalearners"):
         effect_modifier_names = []
         if self._effect_modifier_names is not None:
             effect_modifier_names = self._effect_modifier_names.copy()
         w_diff_x = [w for w in self._observed_common_causes_names if w not in effect_modifier_names]
         if len(w_diff_x) > 0:
             # logger.warning: "warn" is a deprecated alias of "warning"
             self.logger.warning("Concatenating common_causes and effect_modifiers and providing a single list of variables to metalearner estimator method, " + class_name + ". EconML metalearners accept a single X argument.")
             effect_modifier_names.extend(w_diff_x)
             # Override the effect_modifiers set in CausalEstimator.__init__()
             # Also only update self._effect_modifiers, and create a copy of self._effect_modifier_names
             # the latter can be used by other estimator methods later
             self._effect_modifiers = self._data[effect_modifier_names]
             self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
             self._effect_modifier_names = effect_modifier_names
         self.logger.debug("Effect modifiers: " +
                       ",".join(effect_modifier_names))
     # One-hot encode the common causes, if any
     if self._observed_common_causes_names:
         self._observed_common_causes = self._data[self._observed_common_causes_names]
         self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
     else:
         self._observed_common_causes = None
     self.logger.debug("Back-door variables used:" +
                       ",".join(self._observed_common_causes_names))
     # Instrumental variables names, if present
     # choosing the instrumental variable to use
     if getattr(self, 'iv_instrument_name', None) is None:
         self.estimating_instrument_names = self._target_estimand.instrumental_variables
     else:
         self.estimating_instrument_names = parse_state(self.iv_instrument_name)
     if self.estimating_instrument_names:
         self._estimating_instruments = self._data[self.estimating_instrument_names]
         self._estimating_instruments = pd.get_dummies(self._estimating_instruments, drop_first=True)
     else:
         self._estimating_instruments = None
     self.estimator = None
     self.symbolic_estimator = self.construct_symbolic_estimator(self._target_estimand)
     self.logger.info(self.symbolic_estimator)
Example #20
0
    def interpret(self, method_name=None, **kwargs):
        """Interpret the causal model.

        :param method_name: method used for interpreting the model. If None, 
                            then default interpreter is chosen that describes the model summary and shows the associated causal graph.
        :param kwargs:: Optional parameters that are directly passed to the interpreter method.

        :returns: None

        """
        # Default behavior: print the model summary and render the graph.
        if method_name is None:
            self.summary(print_to_stdout=True)
            self.view_model()
            return

        for method in parse_state(method_name):
            interpreter_class = interpreters.get_class_object(method)
            interpreter_class(self, **kwargs).interpret()
Example #21
0
    def __init__(self, *args, **kwargs):
        """Set up the instrumental-variable estimator and select the instruments."""
        super().__init__(*args, **kwargs)
        self.logger.debug("Instrumental Variables used:" +
                          ",".join(self._target_estimand.instrumental_variables))


        # choosing the instrumental variable to use
        if getattr(self, 'iv_instrument_name', None) is None:
            self.estimating_instrument_names = self._target_estimand.instrumental_variables
        else:
            self.estimating_instrument_names = parse_state(self.iv_instrument_name)

        if not self.estimating_instrument_names:
            # ValueError matches the other IV estimators in this codebase and is
            # backward compatible (ValueError is a subclass of Exception).
            raise ValueError("No valid instruments found. IV Method not applicable")
        self._estimating_instruments = self._data[self.estimating_instrument_names]
        self.logger.info("INFO: Using Instrumental Variable Estimator")

        self.symbolic_estimator = self.construct_symbolic_estimator(self._target_estimand)
        self.logger.info(self.symbolic_estimator)
Example #22
0
    def __init__(self, *args, iv_instrument_name=None, **kwargs):
        """
        :param iv_instrument_name: Name of the specific instrumental variable
            to be used. Needs to be one of the IVs identified in the
            identification step. Default is to use all the IV variables
            from the identification step.
        """
        # Required to ensure that self.method_params contains all the information
        # to create an object of this class
        # NOTE(review): locals() captures exactly the locals defined so far
        # (args, iv_instrument_name, kwargs); do not introduce a new local
        # variable above this comprehension or it would leak into args_dict.
        # Presumably _STD_INIT_ARGS filters out "args"/"kwargs"-style names --
        # confirm against the base class.
        args_dict = {
            k: v
            for k, v in locals().items() if k not in type(self)._STD_INIT_ARGS
        }
        args_dict.update(kwargs)
        super().__init__(*args, **args_dict)
        # choosing the instrumental variable to use:
        # default to all identified IVs, overridden by iv_instrument_name
        self.estimating_instrument_names = self._target_estimand.instrumental_variables
        if iv_instrument_name is not None:
            self.estimating_instrument_names = parse_state(iv_instrument_name)
        self.logger.debug("Instrumental Variables used:" +
                          ",".join(self.estimating_instrument_names))
        if not self.estimating_instrument_names:
            raise ValueError(
                "No valid instruments found. IV Method not applicable")
        # 2SLS needs at least one instrument per treatment variable.
        if len(self.estimating_instrument_names) < len(self._treatment_name):
            # TODO move this to the identification step
            raise ValueError(
                "Number of instruments fewer than number of treatments. 2SLS requires at least as many instruments as treatments."
            )
        # Subset of the data holding only the instrument columns
        self._estimating_instruments = self._data[
            self.estimating_instrument_names]
        self.logger.info("INFO: Using Instrumental Variable Estimator")

        self.symbolic_estimator = self.construct_symbolic_estimator(
            self._target_estimand)
        self.logger.info(self.symbolic_estimator)
Example #23
0
    def __init__(self,
                 data,
                 treatment,
                 outcome,
                 graph=None,
                 common_causes=None,
                 instruments=None,
                 effect_modifiers=None,
                 estimand_type="nonparametric-ate",
                 proceed_when_unidentifiable=False,
                 missing_nodes_as_confounders=False,
                 **kwargs):
        """Initialize data and create a causal graph instance.

        Assigns treatment and outcome variables.
        Also checks and finds the common causes and instruments for treatment
        and outcome.

        At least one of graph, common_causes or instruments must be provided.

        :param data: a pandas dataframe containing treatment, outcome and other
        variables.
        :param treatment: name of the treatment variable
        :param outcome: name of the outcome variable
        :param graph: path to DOT file containing a DAG or a string containing
        a DAG specification in DOT format
        :param common_causes: names of common causes of treatment and _outcome. Only used when graph is None.
        :param instruments: names of instrumental variables for the effect of
        treatment on outcome. Only used when graph is None.
        :param effect_modifiers: names of variables that can modify the treatment effect. If not provided, then the causal graph is used to find the effect modifiers. Estimators will return multiple different estimates based on each value of effect_modifiers.
        :param estimand_type: the type of estimand requested (currently only "nonparametric-ate" is supported). In the future, may support other specific parametric forms of identification.
        :param proceed_when_unidentifiable: does the identification proceed by ignoring potential unobserved confounders. Binary flag.
        :param missing_nodes_as_confounders: Binary flag indicating whether variables in the dataframe that are not included in the causal graph, should be  automatically included as confounder nodes.
        :returns: an instance of CausalModel class

        """
        self._data = data
        # parse_state normalizes a single name / list of names to list form
        self._treatment = parse_state(treatment)
        self._outcome = parse_state(outcome)
        self._effect_modifiers = parse_state(effect_modifiers)
        self._estimand_type = estimand_type
        self._proceed_when_unidentifiable = proceed_when_unidentifiable
        self._missing_nodes_as_confounders = missing_nodes_as_confounders
        self.logger = logging.getLogger(__name__)

        if graph is None:
            # No graph supplied: build one from whichever of common_causes /
            # instruments the user provided.
            self.logger.warning(
                "Causal Graph not provided. DoWhy will construct a graph based on data inputs."
            )
            self._common_causes = parse_state(common_causes)
            self._instruments = parse_state(instruments)
            if common_causes is not None and instruments is not None:
                self._graph = CausalGraph(
                    self._treatment,
                    self._outcome,
                    common_cause_names=self._common_causes,
                    instrument_names=self._instruments,
                    effect_modifier_names=self._effect_modifiers,
                    observed_node_names=self._data.columns.tolist())
            elif common_causes is not None:
                self._graph = CausalGraph(
                    self._treatment,
                    self._outcome,
                    common_cause_names=self._common_causes,
                    effect_modifier_names=self._effect_modifiers,
                    observed_node_names=self._data.columns.tolist())
            elif instruments is not None:
                self._graph = CausalGraph(
                    self._treatment,
                    self._outcome,
                    instrument_names=self._instruments,
                    effect_modifier_names=self._effect_modifiers,
                    observed_node_names=self._data.columns.tolist())
            else:
                # Neither graph nor variable lists: ask the user to confirm the
                # (strong) no-confounding assumption interactively.
                # NOTE(review): self._graph is left unset on this path -- confirm
                # downstream callers handle that.
                cli.query_yes_no(
                    "WARN: Are you sure that there are no common causes of treatment and outcome?",
                    default=None)

        else:
            # A graph was supplied: build it, then derive common causes,
            # instruments and (if not user-provided) effect modifiers from it.
            self._graph = CausalGraph(
                self._treatment,
                self._outcome,
                graph,
                effect_modifier_names=self._effect_modifiers,
                observed_node_names=self._data.columns.tolist(),
                missing_nodes_as_confounders=self._missing_nodes_as_confounders
            )
            self._common_causes = self._graph.get_common_causes(
                self._treatment, self._outcome)
            self._instruments = self._graph.get_instruments(
                self._treatment, self._outcome)
            # Sometimes, effect modifiers from the graph may not match those provided by the user.
            # (Because some effect modifiers may also be common causes)
            # In such cases, the user-provided modifiers are used.
            # If no effect modifiers are provided,  then the ones from the graph are used.
            if self._effect_modifiers is None or not self._effect_modifiers:
                self._effect_modifiers = self._graph.get_effect_modifiers(
                    self._treatment, self._outcome)

        # Any extra keyword arguments are kept for downstream use
        self._other_variables = kwargs
        self.summary()
Exemple #24
0
    def __init__(self,
                 data,
                 treatment,
                 outcome,
                 graph=None,
                 common_causes=None,
                 instruments=None,
                 effect_modifiers=None,
                 mediator=None,
                 estimand_type="nonparametric-ate",
                 proceed_when_unidentifiable=False,
                 missing_nodes_as_confounders=False,
                 **kwargs):
        """Initialize data and create a causal graph instance.

        Assigns treatment and outcome variables.
        Also checks and finds the common causes, instruments, effect
        modifiers and mediators for treatment and outcome.

        At least one of graph, common_causes or instruments must be provided.

        :param data: a pandas dataframe containing treatment, outcome and other
        variables.
        :param treatment: name of the treatment variable
        :param outcome: name of the outcome variable
        :param graph: path to DOT file containing a DAG or a string containing
        a DAG specification in DOT format
        :param common_causes: names of common causes of treatment and outcome.
        Only used when graph is None.
        :param instruments: names of instrumental variables for the effect of
        treatment on outcome. Only used when graph is None.
        :param effect_modifiers: names of variables that can modify the treatment effect (useful for heterogeneous treatment effect estimation)
        :param mediator: names of mediators between treatment and outcome
        :param estimand_type: the type of estimand requested (currently only "nonparametric-ate" is supported). In the future, may support other specific parametric forms of identification.
        :param proceed_when_unidentifiable: does the identification proceed by ignoring potential unobserved confounders. Binary flag.
        :param missing_nodes_as_confounders: Binary flag indicating whether variables in the dataframe that are not included in the causal graph should be automatically included as confounder nodes.
        :returns: an instance of CausalModel class

        """
        self._data = data
        # parse_state normalizes single-name / list-of-names specifications
        # (presumably to a list — confirm against parse_state's contract).
        self._treatment = parse_state(treatment)
        self._outcome = parse_state(outcome)
        self._estimand_type = estimand_type
        self._proceed_when_unidentifiable = proceed_when_unidentifiable
        self._missing_nodes_as_confounders = missing_nodes_as_confounders
        # NOTE(review): basicConfig in a constructor configures the ROOT
        # logger — a process-wide side effect. Overridable via the
        # 'logging_level' keyword argument.
        if 'logging_level' in kwargs:
            logging.basicConfig(level=kwargs['logging_level'])
        else:
            logging.basicConfig(level=logging.INFO)

        # TODO: move the logging level argument to a json file. Tue 20 Feb 2018 06:56:27 PM DST
        self.logger = logging.getLogger(__name__)

        if graph is None:
            # No DAG supplied: build one from the explicitly named variable sets.
            self.logger.warning(
                "Causal Graph not provided. DoWhy will construct a graph based on data inputs."
            )
            self._common_causes = parse_state(common_causes)
            self._instruments = parse_state(instruments)
            self._effect_modifiers = parse_state(effect_modifiers)
            # NOTE(review): the raw `mediator` argument (not the parsed
            # self._mediator) is passed to CausalGraph below — confirm this
            # is what CausalGraph expects.
            self._mediator = parse_state(mediator)
            if common_causes is not None and instruments is not None:
                self._graph = CausalGraph(
                    self._treatment,
                    self._outcome,
                    common_cause_names=self._common_causes,
                    instrument_names=self._instruments,
                    effect_modifier_names=self._effect_modifiers,
                    mediator_name=mediator,
                    observed_node_names=self._data.columns.tolist())
            elif common_causes is not None:
                self._graph = CausalGraph(
                    self._treatment,
                    self._outcome,
                    common_cause_names=self._common_causes,
                    effect_modifier_names=self._effect_modifiers,
                    mediator_name=mediator,
                    observed_node_names=self._data.columns.tolist())
            elif instruments is not None:
                self._graph = CausalGraph(
                    self._treatment,
                    self._outcome,
                    instrument_names=self._instruments,
                    effect_modifier_names=self._effect_modifiers,
                    mediator_name=mediator,
                    observed_node_names=self._data.columns.tolist())
            else:
                # Nothing to build a graph from: make the user confirm the
                # strong no-unobserved-confounding assumption interactively.
                # NOTE(review): self._graph is left unset on this path.
                cli.query_yes_no(
                    "WARN: Are you sure that there are no common causes of treatment and outcome?",
                    default=None)

        else:
            # DAG supplied: derive common causes, instruments, effect
            # modifiers and mediators from the graph structure.
            self._graph = CausalGraph(
                self._treatment,
                self._outcome,
                graph,
                observed_node_names=self._data.columns.tolist(),
                missing_nodes_as_confounders=self._missing_nodes_as_confounders
            )
            self._common_causes = self._graph.get_common_causes(
                self._treatment, self._outcome)
            self._instruments = self._graph.get_instruments(
                self._treatment, self._outcome)
            self._effect_modifiers = self._graph.get_effect_modifiers(
                self._treatment, self._outcome)
            self._mediator = self._graph.get_mediators(self._treatment,
                                                       self._outcome)

        # Remaining keyword arguments are kept for downstream consumers.
        self._other_variables = kwargs
        self.summary()
Exemple #25
0
    def __init__(self, data, treatment, outcome, graph=None,
                 common_causes=None, instruments=None, estimand_type="ate",
                 proceed_when_unidentifiable=False,
                 **kwargs):
        """Set up the causal model: store the data and build a causal graph.

        Treatment and outcome specifications are normalized via parse_state.
        When a DOT graph is supplied, common causes and instruments are
        derived from it; otherwise they must be named explicitly.

        At least one of graph, common_causes or instruments must be provided.

        :param data: a pandas dataframe containing treatment, outcome and other
        variables.
        :param treatment: name of the treatment variable
        :param outcome: name of the outcome variable
        :param graph: path to DOT file containing a DAG or a string containing
        a DAG specification in DOT format
        :param common_causes: names of common causes of treatment and outcome
        :param instruments: names of instrumental variables for the effect of
        treatment on outcome
        :returns: an instance of CausalModel class

        """
        self._data = data
        self._treatment = parse_state(treatment)
        self._outcome = parse_state(outcome)
        self._estimand_type = estimand_type
        self._proceed_when_unidentifiable = proceed_when_unidentifiable
        # The 'logging_level' kwarg overrides the default INFO level.
        logging.basicConfig(level=kwargs.get('logging_level', logging.INFO))

        # TODO: move the logging level argument to a json file. Tue 20 Feb 2018 06:56:27 PM DST
        self.logger = logging.getLogger(__name__)

        if graph is not None:
            # A DAG was supplied: read common causes and instruments off it.
            self._graph = CausalGraph(
                self._treatment,
                self._outcome,
                graph,
                observed_node_names=self._data.columns.tolist()
            )
            self._common_causes = self._graph.get_common_causes(self._treatment, self._outcome)
            self._instruments = self._graph.get_instruments(self._treatment,
                                                            self._outcome)
        else:
            self.logger.warning("Causal Graph not provided. DoWhy will construct a graph based on data inputs.")
            self._common_causes = parse_state(common_causes)
            self._instruments = parse_state(instruments)
            # Collect only the variable sets that were actually provided.
            graph_spec = {"observed_node_names": self._data.columns.tolist()}
            if common_causes is not None:
                graph_spec["common_cause_names"] = self._common_causes
            if instruments is not None:
                graph_spec["instrument_names"] = self._instruments
            if common_causes is None and instruments is None:
                # Nothing to build from: ask the user to confirm the strong
                # no-confounding assumption before proceeding.
                cli.query_yes_no(
                    "WARN: Are you sure that there are no common causes of treatment and outcome?",
                    default=None
                )
            else:
                self._graph = CausalGraph(self._treatment, self._outcome, **graph_spec)

        self._other_variables = kwargs
        self.summary()
Exemple #26
0
    def __init__(self, data, treatment, outcome, graph=None,
                 common_causes=None, instruments=None,
                 effect_modifiers=None,
                 estimand_type="nonparametric-ate",
                 proceed_when_unidentifiable=False,
                 missing_nodes_as_confounders=False,
                 identify_vars=False,
                 **kwargs):
        """Store the input data and construct the causal graph.

        Normalizes the treatment/outcome specification, then either builds
        the graph from an explicit DAG (delegating to init_graph) or from
        the named common causes / instruments. If neither a graph nor any
        relevant variables are given, the graph is left as None and can be
        learned later with learn_graph().

        :param data: a pandas dataframe containing treatment, outcome and other
        variables.
        :param treatment: name of the treatment variable
        :param outcome: name of the outcome variable
        :param graph: path to DOT file containing a DAG or a string containing
        a DAG specification in DOT format
        :param common_causes: names of common causes of treatment and outcome. Only used when graph is None.
        :param instruments: names of instrumental variables for the effect of
        treatment on outcome. Only used when graph is None.
        :param effect_modifiers: names of variables that can modify the treatment effect. If not provided, then the causal graph is used to find the effect modifiers. Estimators will return multiple different estimates based on each value of effect_modifiers.
        :param estimand_type: the type of estimand requested (currently only "nonparametric-ate" is supported). In the future, may support other specific parametric forms of identification.
        :param proceed_when_unidentifiable: does the identification proceed by ignoring potential unobserved confounders. Binary flag.
        :param missing_nodes_as_confounders: Binary flag indicating whether variables in the dataframe that are not included in the causal graph should be automatically included as confounder nodes.
        :param identify_vars: whether to compute common causes, instruments and effect modifiers while initializing the class. Should be False when the user supplies these on their own (otherwise the computed values can override the user-provided ones). Only meaningful when a graph is given.
        :returns: an instance of CausalModel class

        """
        self._data = data
        self._treatment = parse_state(treatment)
        self._outcome = parse_state(outcome)
        self._effect_modifiers = parse_state(effect_modifiers)
        self._estimand_type = estimand_type
        self._proceed_when_unidentifiable = proceed_when_unidentifiable
        self._missing_nodes_as_confounders = missing_nodes_as_confounders
        self.logger = logging.getLogger(__name__)

        if graph is not None:
            # DAG provided: graph construction (and optional variable
            # identification) is handled by init_graph.
            self.init_graph(graph=graph, identify_vars=identify_vars)
        else:
            self.logger.warning("Causal Graph not provided. DoWhy will construct a graph based on data inputs.")
            self._common_causes = parse_state(common_causes)
            self._instruments = parse_state(instruments)
            # Assemble CausalGraph keyword arguments from whatever was given.
            graph_spec = {
                "effect_modifier_names": self._effect_modifiers,
                "observed_node_names": self._data.columns.tolist(),
            }
            if common_causes is not None:
                graph_spec["common_cause_names"] = self._common_causes
            if instruments is not None:
                graph_spec["instrument_names"] = self._instruments
            if common_causes is None and instruments is None:
                # Nothing to build a graph from; defer to learn_graph().
                self.logger.warning("Relevant variables to build causal graph not provided. You may want to use the learn_graph() function to construct the causal graph.")
                self._graph = None
            else:
                self._graph = CausalGraph(self._treatment, self._outcome, **graph_spec)

        self._other_variables = kwargs
        self.summary()
Exemple #27
0
    def __init__(self,
                 data,
                 params=None,
                 variable_types=None,
                 num_cores=1,
                 causal_model=None,
                 keep_original_treatment=False):
        """
        Initializes a do sampler with data and names of relevant variables.

        Do sampling implements the do() operation from Pearl (2000). This is an operation is defined on a causal
        bayesian network, an explicit implementation of which is the basis for the MCMC sampling method.

        We abstract the idea behind the three-step process to allow other methods, as well. The `disrupt_causes`
        method is the means to make treatment assignment ignorable. In the Pearlian framework, this is where we cut the
        edges pointing into the causal state. With other methods, this will typically be by using some approach which
        assumes conditional ignorability (e.g. weighting, or explicit conditioning with Robins G-formula.)

        Next, the `make_treatment_effective` method reflects the assumption that the intervention we impose is
        "effective". Most simply, we fix the causal state to some specific value. We skip this step there is no value
        specified for the causal state, and the original values are used instead.

        Finally, we sample from the resulting distribution. This can be either from a `point_sample` method, in the case
        that the inference method doesn't support batch sampling, or the `sample` method in the case that it does. For
        convenience, the `point_sample` method parallelizes with `multiprocessing` using the `num_cores` kwargs to set
        the number of cores to use for parallelization.

        While different methods will have their own class attributes, the `_df` method should be common to all methods.
        This is them temporary dataset which starts as a copy of the original data, and is modified to reflect the steps
        of the do operation. Read through the existing methods (weighting is likely the most minimal) to get an idea of
        how this works to implement one yourself.

        :param data: pandas.DataFrame containing the data
        :param params: (optional) dict of additional method parameters; each key/value pair is set as an
        attribute on this sampler instance.
        :param variable_types: dict: A dictionary containing the variable's names and types. 'c' for continuous, 'o'
        for ordered, 'd' for discrete, and 'u' for unordered discrete. If omitted, types are inferred
        via `_infer_variable_types`.
        :param num_cores: int: number of cores used to parallelize `point_sample` when the inference
        method only supports sampling one point at a time.
        :param causal_model: CausalModel instance supplying the treatment/outcome names and used to
        identify the (backdoor) estimand.
        :param keep_original_treatment: bool: Whether to use `make_treatment_effective`, or to keep the original
        treatment assignments.

        """
        self._data = data.copy()
        self._causal_model = causal_model
        # Identification is run here and forced to the backdoor criterion;
        # do-sampling relies on backdoor adjustment variables below.
        self._target_estimand = self._causal_model.identify_effect()
        self._target_estimand.set_identifier_method("backdoor")
        self._treatment_names = parse_state(self._causal_model._treatment)
        self._outcome_names = parse_state(self._causal_model._outcome)
        self._estimate = None
        self._variable_types = variable_types
        self.num_cores = num_cores
        self.point_sampler = True
        self.sampler = None
        self.keep_original_treatment = keep_original_treatment

        # Expose extra method parameters directly as instance attributes.
        if params is not None:
            for key, value in params.items():
                setattr(self, key, value)

        # Scratch copy of the data; the do-operation steps mutate this frame.
        self._df = self._data.copy()

        if not self._variable_types:
            self._infer_variable_types()
        # Type codes for the outcome (dependent) variables.
        self.dep_type = [
            self._variable_types[var] for var in self._outcome_names
        ]
        # Type codes for treatments plus backdoor adjustment variables.
        self.indep_type = [
            self._variable_types[var] for var in self._treatment_names +
            self._target_estimand.get_backdoor_variables()
        ]
        self.density_types = [
            self._variable_types[var]
            for var in self._target_estimand.get_backdoor_variables()
        ]

        # Per-column min/max of the outcomes, used as sampling support bounds.
        self.outcome_lower_support = self._data[
            self._outcome_names].min().values
        self.outcome_upper_support = self._data[
            self._outcome_names].max().values

        self.logger = logging.getLogger(__name__)
Exemple #28
0
    def do(self, x, method='weighting', num_cores=1, variable_types={}, outcome=None, params=None, dot_graph=None,
           common_causes=None, estimand_type='ate', proceed_when_unidentifiable=False, stateful=False):
        """
        The do-operation implemented with sampling. This will return a pandas.DataFrame with the outcome
        variable(s) replaced with samples from P(Y|do(X=x)).

        If the value of `x` is left unspecified (e.g. as a string or list), then the original values of `x` are left in
        the DataFrame, and Y is sampled from its respective P(Y|do(x)). If the value of `x` is specified (passed with a
        `dict`, where variable names are keys, and values are specified) then the new `DataFrame` will contain the
        specified values of `x`.

        For some methods, the `variable_types` field must be specified. It should be a `dict`, where the keys are
        variable names, and values are 'o' for ordered discrete, 'u' for un-ordered discrete, 'd' for discrete, or 'c'
        for continuous.

        Inference requires a set of control variables. These can be provided explicitly using `common_causes`, which
        contains a list of variable names to control for. These can be provided implicitly by specifying a causal graph
        with `dot_graph`, from which they will be chosen using the default identification method.

        When the set of control variables can't be identified with the provided assumptions, a prompt will raise to the
        user asking whether to proceed. To automatically over-ride the prompt, you can set the flag
        `proceed_when_unidentifiable` to `True`.

        Some methods build components during inference which are expensive. To retain those components for later
        inference (e.g. successive calls to `do` with different values of `x`), you can set the `stateful` flag to `True`.
        Be cautious about using the `do` operation statefully. State is set on the namespace, rather than the method, so
        can behave unpredictably. To reset the namespace and run statelessly again, you can call the `reset` method.

        :param x: str, list, dict: The causal state on which to intervene, and (optional) its interventional value(s).
        :param method: The inference method to use with the sampler. Currently, `'mcmc'`, `'weighting'`, and
        `'kernel_density'` are supported. The `mcmc` sampler requires `pymc3>=3.7`.
        :param num_cores: int: if the inference method only supports sampling a point at a time, this will parallelize
        sampling.
        :param variable_types: dict: The dictionary containing the variable types. Must contain the union of the causal
        state, control variables, and the outcome.
        :param outcome: str: The outcome variable.
        :param params: dict: extra parameters to set as attributes on the sampler object
        :param dot_graph: str: A string specifying the causal graph.
        :param common_causes: list: A list of strings containing the variable names to control for.
        :param estimand_type: str: 'ate' is the only one currently supported. Others may be added later, to allow for
        CATE estimation.
        :param proceed_when_unidentifiable: bool: A flag to over-ride user prompts to proceed when effects aren't
        identifiable with the assumptions provided.
        :param stateful: bool: Whether to retain state. By default, the do operation is stateless.
        :return: pandas.DataFrame: A DataFrame containing the sampled outcome
        """
        # Normalize x into a {variable: value} mapping and learn whether the
        # original treatment values should be kept (unspecified values).
        x, keep_original_treatment = self.parse_x(x)
        outcome = parse_state(outcome)
        # Discard cached model/sampler unless the caller asked for stateful
        # reuse with the SAME inference method.
        if not stateful or method != self._method:
            self.reset()
        if not self._causal_model:
            # Lazily build the causal model from the accessor's DataFrame
            # (self._obj) and the supplied graph/common-cause assumptions.
            self._causal_model = CausalModel(self._obj,
                                             [xi for xi in x.keys()],
                                             outcome,
                                             graph=dot_graph,
                                             common_causes=common_causes,
                                             instruments=None,
                                             estimand_type=estimand_type,
                                             proceed_when_unidentifiable=proceed_when_unidentifiable)
        #self._identified_estimand = self._causal_model.identify_effect()
        if not self._sampler:
            self._method = method
            # Resolve the sampler class by naming convention, e.g.
            # 'weighting' -> 'weighting_sampler'.
            do_sampler_class = do_samplers.get_class_object(method + "_sampler")
            self._sampler = do_sampler_class(self._obj,
                                             #self._identified_estimand,
                                             #self._causal_model._treatment,
                                             #self._causal_model._outcome,
                                             params=params,
                                             variable_types=variable_types,
                                             num_cores=num_cores,
                                             causal_model=self._causal_model,
                                             keep_original_treatment=keep_original_treatment)
        result = self._sampler.do_sample(x)
        # Stateless by default: tear down the cached model and sampler.
        if not stateful:
            self.reset()
        return result
Exemple #29
0
    def _estimate_effect(self):
        """Estimate a two-stage (frontdoor or mediation) causal effect.

        Stage 1 estimates the effect of the treatment on the frontdoor
        variables / mediators; stage 2 estimates the effect of those
        variables on the outcome. Their product is the natural indirect
        effect (NIE), which also serves as the frontdoor estimate. For an
        NDE estimand, the total effect is additionally estimated and the
        indirect part subtracted from it.

        :returns: CausalEstimate holding the combined estimate and the
        symbolic estimand expression.
        """
        estimate_value = None
        # First stage
        # Treat the first stage as a backdoor problem: treatment -> mediator
        # (or frontdoor variable), adjusting for the first-stage confounders.
        modified_target_estimand = copy.deepcopy(self._target_estimand)
        modified_target_estimand.identifier_method = "backdoor"
        modified_target_estimand.backdoor_variables = self._target_estimand.mediation_first_stage_confounders
        if self._target_estimand.identifier_method == "frontdoor":
            modified_target_estimand.outcome_variable = parse_state(
                self._frontdoor_variables_names)
        elif self._target_estimand.identifier_method == "mediation":
            modified_target_estimand.outcome_variable = parse_state(
                self._mediators_names)

        first_stage_estimate = self.first_stage_model(
            self._data,
            modified_target_estimand,
            self._treatment_name,
            parse_state(modified_target_estimand.outcome_variable),
            control_value=self._control_value,
            treatment_value=self._treatment_value,
            test_significance=self._significance_test,
            evaluate_effect_strength=self._effect_strength_eval,
            confidence_intervals=self._confidence_intervals,
            target_units=self._target_units,
            effect_modifiers=self._effect_modifier_names,
            **self.method_params)._estimate_effect()

        # Second Stage
        # Now mediator (or frontdoor variable) -> outcome, adjusting for the
        # second-stage confounders.
        modified_target_estimand = copy.deepcopy(self._target_estimand)
        modified_target_estimand.identifier_method = "backdoor"
        modified_target_estimand.backdoor_variables = self._target_estimand.mediation_second_stage_confounders
        if self._target_estimand.identifier_method == "frontdoor":
            modified_target_estimand.treatment_variable = parse_state(
                self._frontdoor_variables_names)
        elif self._target_estimand.identifier_method == "mediation":
            modified_target_estimand.treatment_variable = parse_state(
                self._mediators_names)

        second_stage_estimate = self.second_stage_model(
            self._data,
            modified_target_estimand,
            parse_state(modified_target_estimand.treatment_variable),
            parse_state(
                self._outcome_name
            ),  # to convert it to array before passing to causal estimator
            control_value=self._control_value,
            treatment_value=self._treatment_value,
            test_significance=self._significance_test,
            evaluate_effect_strength=self._effect_strength_eval,
            confidence_intervals=self._confidence_intervals,
            target_units=self._target_units,
            effect_modifiers=self._effect_modifier_names,
            **self.method_params)._estimate_effect()
        # Combining the two estimates
        natural_indirect_effect = first_stage_estimate.value * second_stage_estimate.value
        # This same estimate is valid for frontdoor as well as mediation (NIE)
        estimate_value = natural_indirect_effect
        self.symbolic_estimator = self.construct_symbolic_estimator(
            first_stage_estimate.realized_estimand_expr,
            second_stage_estimate.realized_estimand_expr,
            estimand_type=CausalIdentifier.NONPARAMETRIC_NIE)
        if self._target_estimand.estimand_type == CausalIdentifier.NONPARAMETRIC_NDE:
            # Total  effect of treatment
            modified_target_estimand = copy.deepcopy(self._target_estimand)
            modified_target_estimand.identifier_method = "backdoor"

            total_effect_estimate = self.second_stage_model(
                self._data,
                modified_target_estimand,
                self._treatment_name,
                parse_state(self._outcome_name),
                control_value=self._control_value,
                treatment_value=self._treatment_value,
                test_significance=self._significance_test,
                evaluate_effect_strength=self._effect_strength_eval,
                confidence_intervals=self._confidence_intervals,
                target_units=self._target_units,
                effect_modifiers=self._effect_modifier_names,
                **self.method_params)._estimate_effect()
            # NDE = total effect - indirect effect.
            natural_direct_effect = total_effect_estimate.value - natural_indirect_effect
            estimate_value = natural_direct_effect
            self.symbolic_estimator = self.construct_symbolic_estimator(
                first_stage_estimate.realized_estimand_expr,
                second_stage_estimate.realized_estimand_expr,
                total_effect_estimate.realized_estimand_expr,
                estimand_type=self._target_estimand.estimand_type)
        return CausalEstimate(estimate=estimate_value,
                              control_value=self._control_value,
                              treatment_value=self._treatment_value,
                              target_estimand=self._target_estimand,
                              realized_estimand_expr=self.symbolic_estimator)
    def refute_estimate(self):
        """Refute the causal estimate by replacing the treatment with a placebo.

        Replaces the true treatment with a placebo — either a permutation of
        the observed treatment values (``placebo_type == "permute"``) or random
        draws matched to the treatment column's dtype — re-estimates the effect
        ``self._num_simulations`` times, and tests whether an effect of ZERO
        falls inside the resulting distribution of placebo estimates. It should,
        because the placebo severs any causal link between treatment and
        outcome.

        :returns: CausalRefutation carrying the original estimate, the mean of
            the placebo estimates, and the significance-test results.
        :raises ValueError: if an IV estimator is refuted with a placebo_type
            other than "permute".
        :raises TypeError: if the treatment column has a dtype for which no
            placebo-generation scheme is implemented.
        """
        # Only permutation preserves the joint treatment/instrument structure,
        # so it is the only placebo supported for IV estimation methods.
        if self._target_estimand.identifier_method.startswith("iv"):
            if self._placebo_type != "permute":
                error_msg = ("Only placebo_type='permute' is supported for creating "
                             "placebo for instrumental variable estimation methods.")
                self.logger.error(error_msg)
                raise ValueError(error_msg)

        # Work on a deep copy so the caller's identified estimand is untouched.
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.treatment_variable = ["placebo"]
        if self._target_estimand.identifier_method.startswith("iv"):
            identified_estimand.instrumental_variables = [
                "placebo_" + s
                for s in identified_estimand.instrumental_variables
            ]
            # For IV methods the estimating_instrument_names must be renamed
            # too. We mutate the shared estimate params here and restore them
            # in the ``finally`` block below, so the caller's estimate is left
            # intact even if a simulation raises part-way through.
            method_params = self._estimate.params["method_params"]
            if method_params is not None and "iv_instrument_name" in method_params:
                method_params["iv_instrument_name"] = [
                    "placebo_" + s
                    for s in parse_state(method_params["iv_instrument_name"])
                ]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of {} treatment".format(
                self._num_simulations, self._placebo_type))

        num_rows = self._data.shape[0]
        treatment_name = self._treatment_name[0]  # name of the treatment variable
        type_dict = dict(self._data.dtypes)

        try:
            for index in range(self._num_simulations):
                if self._placebo_type == "permute":
                    # Shuffle the treatment rows; for IV also shuffle the
                    # instruments with the SAME permutation so their joint
                    # distribution with the placebo treatment is preserved.
                    if self._random_state is None:
                        permuted_idx = np.random.choice(num_rows,
                                                        size=num_rows,
                                                        replace=False)
                    else:
                        permuted_idx = self._random_state.choice(num_rows,
                                                                 size=num_rows,
                                                                 replace=False)
                    new_treatment = self._data[
                        self._treatment_name].iloc[permuted_idx].values
                    if self._target_estimand.identifier_method.startswith("iv"):
                        instrument_names = self._estimate.estimator.estimating_instrument_names
                        new_instruments_values = self._data[
                            instrument_names].iloc[permuted_idx].values
                        new_instruments_df = pd.DataFrame(
                            new_instruments_values,
                            columns=[
                                "placebo_" + s
                                for s in self._data[instrument_names].columns
                            ])
                else:
                    # Draw a synthetic placebo whose distribution is matched to
                    # the dtype of the treatment column.
                    dtype_name = type_dict[treatment_name].name
                    if 'float' in dtype_name:
                        self.logger.info(
                            "Using a Normal Distribution with Mean:{} and Variance:{}"
                            .format(
                                PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL,
                                PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL))
                        new_treatment = (
                            np.random.randn(num_rows)
                            * PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL
                            + PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL)
                    elif 'bool' in dtype_name:
                        self.logger.info(
                            "Using a Binomial Distribution with {} trials and {} probability of success"
                            .format(
                                PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                                PlaceboTreatmentRefuter.
                                DEFAULT_PROBABILITY_OF_BINOMIAL))
                        new_treatment = np.random.binomial(
                            PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                            PlaceboTreatmentRefuter.
                            DEFAULT_PROBABILITY_OF_BINOMIAL,
                            num_rows).astype(bool)
                    elif 'int' in dtype_name:
                        self.logger.info(
                            "Using a Discrete Uniform Distribution lying between {} and {}"
                            .format(self._data[treatment_name].min(),
                                    self._data[treatment_name].max()))
                        new_treatment = np.random.randint(
                            low=self._data[treatment_name].min(),
                            high=self._data[treatment_name].max(),
                            size=num_rows)
                    elif 'category' in dtype_name:
                        categories = self._data[treatment_name].unique()
                        self.logger.info(
                            "Using a Discrete Uniform Distribution with the following categories:{}"
                            .format(categories))
                        sample = np.random.choice(categories, size=num_rows)
                        new_treatment = pd.Series(sample).astype('category')
                    else:
                        # Previously this fell through with ``new_treatment``
                        # unbound, crashing later with an UnboundLocalError.
                        # Fail fast with a clear message instead.
                        raise TypeError(
                            "Placebo treatment generation is not supported for treatment dtype {}"
                            .format(dtype_name))

                # Build the simulated dataset: add the placebo column (and, for
                # IV methods, the renamed placebo instruments).
                new_data = self._data.assign(placebo=new_treatment)
                if self._target_estimand.identifier_method.startswith("iv"):
                    new_data = pd.concat((new_data, new_instruments_df), axis=1)
                self.logger.debug(new_data[0:10])  # sanity check the data
                new_estimator = CausalEstimator.get_estimator_object(
                    new_data, identified_estimand, self._estimate)
                new_effect = new_estimator.estimate_effect()
                sample_estimates[index] = new_effect.value
        finally:
            # Restore iv_instrument_name so the caller's estimate params are
            # unchanged, even when a simulation raised above.
            if self._target_estimand.identifier_method.startswith("iv"):
                method_params = self._estimate.params["method_params"]
                if method_params is not None and "iv_instrument_name" in method_params:
                    method_params["iv_instrument_name"] = [
                        s.replace("placebo_", "", 1)
                        for s in parse_state(method_params["iv_instrument_name"])
                    ]

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a Placebo Treatment")

        # Note: We hardcode the estimate value to ZERO as we want to check if
        # it falls in the distribution of the refuter. Ideally ZERO should fall
        # in the distribution of the effect estimates, as we have severed any
        # causal relationship between the treatment and the outcome.
        dummy_estimator = CausalEstimate(
            estimate=0,
            control_value=self._estimate.control_value,
            treatment_value=self._estimate.treatment_value,
            target_estimand=self._estimate.target_estimand,
            realized_estimand_expr=self._estimate.realized_estimand_expr)

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates))
        refute.add_refuter(self)
        return refute