    def refute_estimate(self):

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of size {} each".format(
                self._num_simulations,
                self._subset_fraction * len(self._data.index)))

        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = self._data.sample(frac=self._subset_fraction)
            else:
                new_data = self._data.sample(frac=self._subset_fraction,
                                             random_state=self._random_state)

            new_estimator = self.get_estimator_object(new_data,
                                                      self._target_estimand,
                                                      self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a subset of data")

        # We want to see if the estimate falls in the same distribution as the one generated by the refuter
        # Ideally that should be the case as choosing a subset should not have a significant effect on the ability
        # of the treatment to affect the outcome
        refute.add_significance_test_results(
            self.test_significance(self._estimate.value, sample_estimates))

        return refute
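
In practice this refuter is not instantiated directly; DoWhy exposes it through CausalModel.refute_estimate. A minimal usage sketch, assuming a fitted model, an identified_estimand and an estimate already exist, and that the keyword names below (subset_fraction, num_simulations) match the installed DoWhy version:

# Illustrative only: "data_subset_refuter" selects the refuter shown above;
# subset_fraction and num_simulations are assumed keyword names.
refutation = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="data_subset_refuter",
    subset_fraction=0.8,      # fraction of rows sampled for each simulated dataset
    num_simulations=100)
print(refutation)
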
Example #2
    def refute_estimate(self, *args, **kwargs):
        if self._sample_size > len(self._data):
                self.logger.warning("The sample size is larger than the population size")

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info("Refutation over {} simulated datasets of size {} each"
                         .format(self._num_simulations
                         ,self._sample_size )
                        ) 
        
        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = resample(self._data,
                                    n_samples=self._sample_size)
            else:
                new_data = resample(self._data,
                                    n_samples=self._sample_size,
                                    random_state=self._random_state )

            if self._chosen_variables is not None:
                for variable in self._chosen_variables:

                    if 'float' in new_data[variable].dtype.name or 'int' in new_data[variable].dtype.name:
                        # Add zero-mean Gaussian noise scaled by the column's standard deviation
                        scaling_factor = new_data[variable].std()
                        new_data[variable] += np.random.normal(loc=0.0, scale=self._noise * scaling_factor, size=self._sample_size)

                    elif 'bool' in new_data[variable].dtype.name:
                        # Flip each boolean value with probability self._probability_of_change
                        probs = np.random.uniform(0, 1, self._sample_size)
                        new_data[variable] = np.where(probs < self._probability_of_change,
                                                      np.logical_not(new_data[variable]),
                                                      new_data[variable])

                    elif 'category' in new_data[variable].dtype.name:
                        categories = new_data[variable].unique()
                        # Find the set difference for each row
                        changed_data = new_data[variable].apply(lambda row: list(set(categories) - set([row])))
                        # Choose one of the remaining categories at random
                        changed_data = changed_data.apply(lambda row: random.choice(row))
                        # Replace each value with probability self._probability_of_change
                        probs = np.random.uniform(0, 1, self._sample_size)
                        new_data[variable] = np.where(probs < self._probability_of_change,
                                                      changed_data,
                                                      new_data[variable])
                        new_data[variable] = new_data[variable].astype('category')

            new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Bootstrap Sample Dataset"
        )

        # We want to see if the estimate falls in the same distribution as the one generated by the refuter
        # Ideally that should be the case as running bootstrap should not have a significant effect on the ability
        # of the treatment to affect the outcome
        refute.add_significance_test_results(
            self.test_significance(self._estimate, sample_estimates)
        )

        return refute
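
A matching usage sketch for the bootstrap refuter above. The keyword names are inferred from the attributes this method reads (sample_size, required_variables, noise, probability_of_change) and should be treated as assumptions to verify against the installed DoWhy version:

# Illustrative only; keyword names are assumptions inferred from the attributes used above.
refutation = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="bootstrap_refuter",
    num_simulations=100,
    sample_size=len(df),          # bootstrap samples of the original size
    required_variables=["w0"],    # hypothetical confounder column to perturb
    noise=0.05,                   # std of the added Gaussian noise, relative to the column std
    probability_of_change=0.05)   # flip probability for boolean/categorical variables
print(refutation)
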
Example #3
    def refute_estimate(self):

        # We need to change the identified estimand.
        # We therefore make a copy, so that we don't modify
        # the original estimand.
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.outcome_variable = ["dummy_outcome"]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info("Refutation over {} simulated datasets of {} treatment"
                        .format(self._num_simulations
                        ,self._dummy_outcome_type)
                        )
        num_rows =  self._data.shape[0]

        for index in range(self._num_simulations):

            if self._dummy_outcome_type == "permute":
                if self._random_state is None:
                    new_outcome = self._data[self._outcome_name].sample(frac=1).values
                else:
                    new_outcome = self._data[self._outcome_name].sample(frac=1,
                                                                random_state=self._random_state).values
            else:
                new_outcome = np.random.randn(num_rows)

            # Create a new column in the data by the name of dummy_outcome
            new_data = self._data.assign(dummy_outcome=new_outcome)

            # Sanity check the data
            self.logger.debug(new_data[0:10])

            new_estimator = self.get_estimator_object(new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(self._estimate.value,
                                  np.mean(sample_estimates),
                                  refutation_type="Refute: Use a Dummy Outcome")
        
        # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
        # Ideally we should expect that ZERO should fall in the distribution of the effect estimates as we have severed any causal 
        # relationship between the treatment and the outcome.

        dummy_estimator = copy.deepcopy(self._estimate)
        dummy_estimator.value = 0

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates)
        )

        return refute
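
The significance test referenced above comes from the refuter base class and its implementation is not shown here. As a rough, non-authoritative illustration of the idea (checking whether a value falls inside the simulated distribution), a percentile-based version could look like this:

import numpy as np

def crude_significance_test(value, simulated_estimates, significance_level=0.05):
    # Two-sided check: is `value` inside the central (1 - alpha) mass of the
    # distribution produced by the refuter's simulations?
    lower = np.quantile(simulated_estimates, significance_level / 2)
    upper = np.quantile(simulated_estimates, 1 - significance_level / 2)
    return {"is_statistically_significant": not (lower <= value <= upper),
            "interval": (lower, upper)}
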
Example #4
    def refute_estimate(self, *args, **kwargs):
        if self._sample_size > len(self._data):
                self.logger.warning("The sample size is larger than the population size")

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info("Refutation over {} simulated datasets of size {} each"
                         .format(self._num_simulations
                         ,self._sample_size )
                        ) 
        
        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = resample(self._data,
                                    n_samples=self._sample_size)
            else:
                new_data = resample(self._data,
                                    n_samples=self._sample_size,
                                    random_state=self._random_state )

            new_estimator = self.get_estimator_object(new_data, self._target_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Bootstrap Sample Dataset"
        )

        # We want to see if the estimate falls in the same distribution as the one generated by the refuter
        # Ideally that should be the case as bootstrapping should not have a significant effect on the ability
        # of the treatment to affect the outcome
        refute.add_significance_test_results(
            self.test_significance(self._estimate.value, sample_estimates)
        )

        return refute
Example #5
    def refute_estimate(self):
        num_rows = self._data.shape[0]
        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info("Refutation over {} simulated datasets, each with a random common cause added"
                         .format(self._num_simulations))

        new_backdoor_variables = self._target_estimand.get_backdoor_variables() + ['w_random']
        identified_estimand = copy.deepcopy(self._target_estimand)
        # Adding a new backdoor variable to the identified estimand
        identified_estimand.set_backdoor_variables(new_backdoor_variables)
        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = self._data.assign(w_random=np.random.randn(num_rows))
            else:
                new_data = self._data.assign(w_random=self._random_state.normal(size=num_rows))

            new_estimator = CausalEstimator.get_estimator_object(new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()

            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Add a random common cause"
        )

        # We want to see if the estimate falls in the same distribution as the one generated by the refuter
        # Ideally that should be the case as adding a random common cause should not have a significant effect on the ability
        # of the treatment to affect the outcome
        refute.add_significance_test_results(
            self.test_significance(self._estimate, sample_estimates)
        )

        refute.add_refuter(self)
        return refute
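
Usage sketch for the refuter above; "random_common_cause" is the DoWhy method name that selects it, and num_simulations is assumed to be accepted as a keyword:

# Illustrative only.
refutation = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="random_common_cause",   # adds the w_random column shown above
    num_simulations=100)
print(refutation)
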
Example #6
    def refute_estimate(self):

        # We need to change the identified estimand.
        # We therefore make a copy, so that we don't modify
        # the original estimand.
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.outcome_variable = ["dummy_outcome"]

        self.logger.info("Refutation over {} simulated datasets".format(
            self._num_simulations))
        self.logger.info("The transformation passed: {}".format(
            self._transformation_list))

        simulation_results = []
        refute_list = []

        # We use collections.OrderedDict to maintain the order in which the data is stored
        causal_effect_map = OrderedDict()

        # Check if we are using an estimator in the transformation list
        estimator_present = self._has_estimator()

        # The rationale behind the ordering of the loops is that we induce randomness every time we create the
        # train and validation datasets. We therefore run the simulation loop first, followed by the training and
        # validation loops, so that we can get different values each time we fit the estimator.

        for _ in range(self._num_simulations):
            estimates = []

            if estimator_present == False:

                # Warn the user that the specified parameter is not applicable when no estimator is present in the transformation
                if self._test_fraction != DummyOutcomeRefuter.DEFAULT_TEST_FRACTION:
                    self.logger.warning(
                        "'test_fraction' is not applicable as there is no base treatment value."
                    )

                # We set X_train and outcome_train to None
                if self._unobserved_confounder_values is not None:
                    self._data['simulated'] = self._unobserved_confounder_values
                    self._chosen_variables.append('simulated')

                validation_df = self._data
                X_train = None
                outcome_train = None
                X_validation_df = validation_df[self._chosen_variables]

                X_validation = X_validation_df.values
                outcome_validation = validation_df['y'].values

                # Get the final outcome, after running through all the values in the transformation list
                outcome_validation = self.process_data(
                    X_train, outcome_train, X_validation, outcome_validation,
                    self._transformation_list)

                # Check if the value of true effect has been already stored
                # We use None as the key as we have no base category for this refutation
                if None not in causal_effect_map:
                    # As we currently support only one treatment
                    causal_effect_map[None] = self._true_causal_effect(
                        validation_df[self._treatment_name[0]])

                outcome_validation += causal_effect_map[None]

                new_data = validation_df.assign(
                    dummy_outcome=outcome_validation)

                new_estimator = CausalEstimator.get_estimator_object(
                    new_data, identified_estimand, self._estimate)
                new_effect = new_estimator.estimate_effect()
                estimates.append(new_effect.value)

            else:

                groups = self.preprocess_data_by_treatment()
                group_count = 0

                if len(self._test_fraction) == 1:
                    self._test_fraction = len(groups) * self._test_fraction

                for key_train, _ in groups:
                    base_train = groups.get_group(key_train).sample(
                        frac=self._test_fraction[group_count].base)
                    train_set = set(
                        [tuple(line) for line in base_train.values])
                    total_set = set([
                        tuple(line)
                        for line in groups.get_group(key_train).values
                    ])
                    base_validation = pd.DataFrame(list(
                        total_set.difference(train_set)),
                                                   columns=base_train.columns)
                    X_train_df = base_train[self._chosen_variables]

                    X_train = X_train_df.values

                    outcome_train = base_train['y'].values

                    validation_df = []
                    transformation_list = self._transformation_list
                    validation_df.append(base_validation)

                    for key_validation, _ in groups:
                        if key_validation != key_train:
                            validation_df.append(
                                groups.get_group(key_validation).sample(
                                    frac=self._test_fraction[group_count].other
                                ))

                    validation_df = pd.concat(validation_df)
                    X_validation_df = validation_df[self._chosen_variables]

                    X_validation = X_validation_df.values
                    outcome_validation = validation_df['y'].values

                    # If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
                    if X_train.shape[0] <= self._min_data_point_threshold:
                        transformation_list = DummyOutcomeRefuter.DEFAULT_TRANSFORMATION
                        self.logger.warning(
                            "The number of data points in X_train:{} for category:{} is less than threshold:{}"
                            .format(X_train.shape[0], key_train,
                                    self._min_data_point_threshold))
                        self.logger.warning(
                            "Therefore, defaulting to the minimal set of transformations:{}"
                            .format(transformation_list))

                    outcome_validation = self.process_data(
                        X_train, outcome_train, X_validation,
                        outcome_validation, transformation_list)

                    # Check if the value of true effect has been already stored
                    # This ensures that we calculate the causal effect only once.
                    # We use key_train as we map data with respect to the base category of the data

                    if key_train not in causal_effect_map:
                        # As we currently support only one treatment
                        causal_effect_map[key_train] = self._true_causal_effect(
                            validation_df[self._treatment_name[0]])

                    # Add h(t) to f(W) to get the dummy outcome
                    outcome_validation += causal_effect_map[key_train]

                    new_data = validation_df.assign(
                        dummy_outcome=outcome_validation)
                    new_estimator = CausalEstimator.get_estimator_object(
                        new_data, identified_estimand, self._estimate)
                    new_effect = new_estimator.estimate_effect()

                    estimates.append(new_effect.value)
                    group_count += 1

            simulation_results.append(estimates)

        # We convert to ndarray for ease in indexing
        # The data is of the form
        # sim1: cat1 cat2 ... catn
        # sim2: cat1 cat2 ... catn
        simulation_results = np.array(simulation_results)

        # Note: We would like the causal_estimator to find the true causal estimate that we have specified through this
        # refuter. Let the value of the true causal effect be h(t). In the following section of code, we wish to find out if h(t) falls in the
        # distribution of the refuter.

        if estimator_present == False:

            dummy_estimate = CausalEstimate(
                estimate=causal_effect_map[None],
                target_estimand=self._estimate.target_estimand,
                realized_estimand_expr=self._estimate.realized_estimand_expr)

            refute = CausalRefutation(
                dummy_estimate.value,
                np.mean(simulation_results),
                refutation_type="Refute: Use a Dummy Outcome")

            refute.add_significance_test_results(
                self.test_significance(dummy_estimate,
                                       np.ravel(simulation_results)))

            refute.add_refuter(self)

            refute_list.append(refute)

        else:
            # True Causal Effect list
            causal_effect_list = list(causal_effect_map.values())
            # Iterating through the refutation for each category
            for train_category in range(simulation_results.shape[1]):
                dummy_estimate = CausalEstimate(
                    estimate=causal_effect_list[train_category],
                    target_estimand=self._estimate.target_estimand,
                    realized_estimand_expr=self._estimate.realized_estimand_expr)

                refute = CausalRefutation(
                    dummy_estimate.value,
                    np.mean(simulation_results[:, train_category]),
                    refutation_type="Refute: Use a Dummy Outcome")

                refute.add_significance_test_results(
                    self.test_significance(
                        dummy_estimate, simulation_results[:, train_category]))

                refute.add_refuter(self)
                refute_list.append(refute)

        return refute_list
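
Because this variant adds a user-specified true effect h(t) to the simulated outcome and returns a list of refutations (one per base treatment category when an estimator is used), a call might look like the sketch below. The keyword names transformation_list and true_causal_effect, and the estimator name "linear_regression", are assumptions to verify against the installed version:

# Illustrative only.
refutation_list = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="dummy_outcome_refuter",
    transformation_list=[("linear_regression", {}), ("noise", {"std_dev": 1})],
    true_causal_effect=lambda t: 10 * t)   # h(t), added to f(W) to form the dummy outcome
print(refutation_list[0])
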
Example #7
    def refute_estimate(self):

        # We need to change the identified estimand.
        # We therefore make a copy, so that we don't modify
        # the original estimand.
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.outcome_variable = ["dummy_outcome"]

        self.logger.info("Refutation over {} simulated datasets".format(
            self._num_simulations))
        self.logger.info("The transformation passed: {}".format(
            self._transformation_list))

        simulation_results = []
        refute_list = []
        no_estimator = self.check_for_estimator()

        for _ in range(self._num_simulations):
            estimates = []
            if no_estimator:
                # We set X_train and outcome_train to None
                validation_df = self._data
                X_train = None
                outcome_train = None
                X_validation = validation_df[self._chosen_variables].values
                outcome_validation = validation_df['y'].values

                # Get the final outcome, after running through all the values in the transformation list
                outcome_validation = self.process_data(
                    X_train, outcome_train, X_validation, outcome_validation,
                    self._transformation_list)

            else:
                groups = self.preprocess_data_by_treatment()
                for key_train, _ in groups:
                    X_train = groups.get_group(key_train)[
                        self._chosen_variables].values
                    outcome_train = groups.get_group(key_train)['y'].values
                    validation_df = []
                    transformation_list = self._transformation_list

                    for key_validation, _ in groups:
                        if key_validation != key_train:
                            validation_df.append(
                                groups.get_group(key_validation))

                    validation_df = pd.concat(validation_df)
                    X_validation = validation_df[self._chosen_variables].values
                    outcome_validation = validation_df['y'].values

                    # If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
                    if X_train.shape[0] <= self._min_data_point_threshold:
                        transformation_list = DummyOutcomeRefuter.DEFAULT_TRANSFORMATION

                    outcome_validation = self.process_data(
                        X_train, outcome_train, X_validation,
                        outcome_validation, transformation_list)

            new_data = validation_df.assign(dummy_outcome=outcome_validation)
            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            estimates.append(new_effect.value)

            simulation_results.append(estimates)

        # We convert to ndarray for ease in indexing
        # The data is of the form
        # sim1: cat1 cat2 ... catn
        # sim2: cat1 cat2 ... catn
        simulation_results = np.array(simulation_results)

        # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
        # Ideally we should expect that ZERO should fall in the distribution of the effect estimates as we have severed any causal
        # relationship between the treatment and the outcome.
        dummy_estimator = CausalEstimate(
            estimate=0,
            target_estimand=self._estimate.target_estimand,
            realized_estimand_expr=self._estimate.realized_estimand_expr)

        if no_estimator:
            refute = CausalRefutation(
                self._estimate.value,
                np.mean(simulation_results),
                refutation_type="Refute: Use a Dummy Outcome")

            refute.add_significance_test_results(
                self.test_significance(dummy_estimator, simulation_results))

            refute_list.append(refute)

        else:
            for category in range(simulation_results.shape[1]):
                refute = CausalRefutation(
                    self._estimate.value,
                    np.mean(simulation_results[:, category]),
                    refutation_type="Refute: Use a Dummy Outcome")

                refute.add_significance_test_results(
                    self.test_significance(dummy_estimator,
                                           simulation_results[:, category]))

                refute_list.append(refute)

        return refute_list
Example #8
    def refute_estimate(self):

        # We need to change the identified estimand.
        # We therefore make a copy, so that we don't modify
        # the original estimand.
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.outcome_variable = ["dummy_outcome"]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info("Refutation over {} simulated datasets".format(
            self._num_simulations))
        self.logger.info("The transformation passed: {}",
                         self._transformations)

        # This flag is to make sure we store the estimators whose input is deterministic
        save_estimators = True
        # We store the value of the estimators in the format "estimator_name" +  "pos_in_transform" : estimator_object
        saved_estimator_dict = {}

        X = self._data[self._chosen_variables]
        new_outcome = self._data['y']

        for index in range(self._num_simulations):
            transform_num = 0
            for action, func_args in self._transformations:

                if callable(action):
                    new_outcome = action(X, **func_args)

                elif action in DummyOutcomeRefuter.SUPPORTED_ESTIMATORS:
                    if action + str(transform_num) in saved_estimator_dict:
                        estimator = saved_estimator_dict[action +
                                                         str(transform_num)]
                        new_outcome = estimator(X)
                    else:
                        estimator = self._estimate_dummy_outcome(
                            func_args, action, new_outcome)
                        new_outcome = estimator(X)
                        if save_estimators:
                            saved_estimator_dict[
                                action + str(transform_num)] = estimator

                elif action == 'noise':
                    save_estimators = False
                    new_outcome = self._noise(new_outcome, func_args)

                elif action == 'permute':
                    save_estimators = False
                    new_outcome = self._permute(new_outcome, func_args)

                elif action == 'zero':
                    save_estimators = False
                    new_outcome = np.zeros(new_outcome.shape)

                transform_num += 1

            save_estimators = False

            # Create a new column in the data by the name of dummy_outcome
            new_data = self._data.assign(dummy_outcome=new_outcome)

            # Sanity check the data
            self.logger.debug(new_data[0:10])

            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a Dummy Outcome")

        # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
        # Ideally we should expect that ZERO should fall in the distribution of the effect estimates as we have severed any causal
        # relationship between the treatment and the outcome.

        dummy_estimator = copy.deepcopy(self._estimate)
        dummy_estimator.value = 0

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates))

        return refute
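
The loop above dispatches on each (action, func_args) pair: a callable applied to X, a supported estimator name, or one of 'noise', 'permute' and 'zero'. A small illustrative transformation list, modelled on the default [("zero",""), ("noise", {'std_dev':1})] mentioned in the later versions:

# Illustrative only; the callable entry is hypothetical.
transformations = [
    ("zero", ""),                                  # sever any dependence of the outcome on X
    ("noise", {"std_dev": 1}),                     # then add unit-variance noise
    (lambda X: 0.1 * X.iloc[:, 0].values, {}),     # a custom callable applied to X is also accepted
]
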
Example #9
    def refute_estimate(self):

        # We need to change the identified estimand.
        # We make a copy as a safety measure; we don't want to change
        # the original estimand.
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.treatment_variable = ["placebo"]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of {} treatment".format(
                self._num_simulations, self._placebo_type))

        num_rows = self._data.shape[0]
        treatment_name = self._treatment_name[0]  # Extract the name of the treatment variable
        type_dict = dict(self._data.dtypes)

        for index in range(self._num_simulations):

            if self._placebo_type == "permute":
                if self._random_state is None:
                    new_treatment = self._data[self._treatment_name].sample(
                        frac=1).values
                else:
                    new_treatment = self._data[self._treatment_name].sample(
                        frac=1, random_state=self._random_state).values
            else:
                if 'float' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Normal Distribution with Mean:{} and Variance:{}"
                        .format(
                            PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL,
                            PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL))
                    new_treatment = np.random.randn(num_rows)*PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL + \
                                    PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL

                elif 'bool' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Binomial Distribution with {} trials and {} probability of success"
                        .format(
                            PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                            PlaceboTreatmentRefuter.
                            DEFAULT_PROBABILITY_OF_BINOMIAL))
                    new_treatment = np.random.binomial(
                        PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                        PlaceboTreatmentRefuter.
                        DEFAULT_PROBABILITY_OF_BINOMIAL, num_rows).astype(bool)

                elif 'int' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Discrete Uniform Distribution lying between {} and {}"
                        .format(self._data[treatment_name].min(),
                                self._data[treatment_name].max()))
                    new_treatment = np.random.randint(
                        low=self._data[treatment_name].min(),
                        high=self._data[treatment_name].max(),
                        size=num_rows)

                elif 'category' in type_dict[treatment_name].name:
                    categories = self._data[treatment_name].unique()
                    self.logger.info(
                        "Using a Discrete Uniform Distribution with the following categories:{}"
                        .format(categories))
                    sample = np.random.choice(categories, size=num_rows)
                    new_treatment = pd.Series(sample).astype('category')

            # Create a new column in the data by the name of placebo
            new_data = self._data.assign(placebo=new_treatment)

            # Sanity check the data
            self.logger.debug(new_data[0:10])

            new_estimator = self.get_estimator_object(new_data,
                                                      identified_estimand,
                                                      self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a Placebo Treatment")

        # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
        # Ideally we should expect that ZERO should fall in the distribution of the effect estimates as we have severed any causal
        # relationship between the treatment and the outcome.

        dummy_estimator = copy.deepcopy(self._estimate)
        dummy_estimator.value = 0

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates))

        return refute
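
Usage sketch for the placebo refuter above, assuming the method name and keywords below match the installed DoWhy version:

# Illustrative only.
refutation = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="placebo_treatment_refuter",
    placebo_type="permute",     # shuffle the observed treatment instead of drawing a new one
    num_simulations=100)
print(refutation)
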
Example #10
    def refute_estimate(self):

        # We need to change the identified estimand.
        # We therefore make a copy, so that we don't modify
        # the original estimand.
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.outcome_variable = ["dummy_outcome"]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of {} treatment".format(
                self._num_simulations, self._dummy_outcome_type))
        num_rows = self._data.shape[0]

        for index in range(self._num_simulations):

            if self._dummy_outcome_type == "permute":
                if self._random_state is None:
                    new_outcome = self._data[self._outcome_name].sample(
                        frac=1).values
                else:
                    new_outcome = self._data[self._outcome_name].sample(
                        frac=1, random_state=self._random_state).values
            elif self._outcome_function is not None:
                new_outcome = self._outcome_function(self._data)

                if type(new_outcome) is pd.Series or \
                   type(new_outcome) is pd.DataFrame:
                    new_outcome = new_outcome.values

                # Check if data types match
                assert type(new_outcome) is np.ndarray, (
                    "Only numpy.ndarray is supported as the output")
                assert 'float' in new_outcome.dtype.name, (
                    "Only float outcomes are currently supported")

                if len(new_outcome.shape) == 2 and \
                        (new_outcome.shape[0] == 1 or new_outcome.shape[1] == 1):
                    self.logger.warning(
                        "Converting the row or column vector to 1D array")
                    new_outcome = new_outcome.ravel()
                    assert len(new_outcome) == num_rows, (
                        "The number of outputs does not match the number of outcomes"
                    )
                elif len(new_outcome.shape) == 1:
                    assert len(new_outcome) == num_rows, (
                        "The number of outputs does not match the number of outcomes"
                    )
                else:
                    raise Exception(
                        "Type Mismatch: The outcome is one dimensional, but the output has the shape:{}"
                        .format(new_outcome.shape))
            else:
                new_outcome = np.random.randn(num_rows)

            # Create a new column in the data by the name of dummy_outcome
            new_data = self._data.assign(dummy_outcome=new_outcome)

            # Sanity check the data
            self.logger.debug(new_data[0:10])

            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a Dummy Outcome")

        # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
        # Ideally we should expect that ZERO should fall in the distribution of the effect estimates as we have severed any causal
        # relationship between the treatment and the outcome.

        dummy_estimator = copy.deepcopy(self._estimate)
        dummy_estimator.value = 0

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates))

        return refute
Example #11
    def refute_estimate(self):
        # only permute is supported for iv methods
        if self._target_estimand.identifier_method.startswith("iv"):
            if self._placebo_type != "permute":
                self.logger.error(
                    "Only placebo_type='permute' is supported for creating a placebo for instrumental variable estimation methods"
                )
                raise ValueError(
                    "Only placebo_type='permute' is supported for creating a placebo for instrumental variable estimation methods."
                )

        # We need to change the identified estimand.
        # We make a copy as a safety measure; we don't want to change
        # the original estimand.
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.treatment_variable = ["placebo"]
        if self._target_estimand.identifier_method.startswith("iv"):
            identified_estimand.instrumental_variables = [
                "placebo_" + s
                for s in identified_estimand.instrumental_variables
            ]
            # For IV methods, the estimating_instrument_names should also be
            # changed. So we change it inside the estimate and then restore it
            # back at the end of this method.
            if self._estimate.params["method_params"] is not None and \
                    "iv_instrument_name" in self._estimate.params["method_params"]:
                self._estimate.params["method_params"]["iv_instrument_name"] = \
                    ["placebo_" + s for s in parse_state(self._estimate.params["method_params"]["iv_instrument_name"])]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of {} treatment".format(
                self._num_simulations, self._placebo_type))

        num_rows = self._data.shape[0]
        treatment_name = self._treatment_name[0]  # Extract the name of the treatment variable
        type_dict = dict(self._data.dtypes)

        for index in range(self._num_simulations):

            if self._placebo_type == "permute":
                permuted_idx = None
                if self._random_state is None:
                    permuted_idx = np.random.choice(self._data.shape[0],
                                                    size=self._data.shape[0],
                                                    replace=False)

                else:
                    permuted_idx = self._random_state.choice(
                        self._data.shape[0],
                        size=self._data.shape[0],
                        replace=False)
                new_treatment = self._data[
                    self._treatment_name].iloc[permuted_idx].values
                if self._target_estimand.identifier_method.startswith("iv"):
                    new_instruments_values = self._data[
                        self._estimate.estimator.
                        estimating_instrument_names].iloc[permuted_idx].values
                    new_instruments_df = pd.DataFrame(
                        new_instruments_values,
                        columns=[
                            "placebo_" + s for s in
                            self._data[self._estimate.estimator.
                                       estimating_instrument_names].columns
                        ])
            else:
                if 'float' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Normal Distribution with Mean:{} and Variance:{}"
                        .format(
                            PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL,
                            PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL))
                    new_treatment = np.random.randn(num_rows)*PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL + \
                                    PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL

                elif 'bool' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Binomial Distribution with {} trials and {} probability of success"
                        .format(
                            PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                            PlaceboTreatmentRefuter.
                            DEFAULT_PROBABILITY_OF_BINOMIAL))
                    new_treatment = np.random.binomial(
                        PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                        PlaceboTreatmentRefuter.
                        DEFAULT_PROBABILITY_OF_BINOMIAL, num_rows).astype(bool)

                elif 'int' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Discrete Uniform Distribution lying between {} and {}"
                        .format(self._data[treatment_name].min(),
                                self._data[treatment_name].max()))
                    new_treatment = np.random.randint(
                        low=self._data[treatment_name].min(),
                        high=self._data[treatment_name].max(),
                        size=num_rows)

                elif 'category' in type_dict[treatment_name].name:
                    categories = self._data[treatment_name].unique()
                    self.logger.info(
                        "Using a Discrete Uniform Distribution with the following categories:{}"
                        .format(categories))
                    sample = np.random.choice(categories, size=num_rows)
                    new_treatment = pd.Series(sample).astype('category')

            # Create a new column in the data by the name of placebo
            new_data = self._data.assign(placebo=new_treatment)
            if self._target_estimand.identifier_method.startswith("iv"):
                new_data = pd.concat((new_data, new_instruments_df), axis=1)
            # Sanity check the data
            self.logger.debug(new_data[0:10])
            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        # Restoring the value of iv_instrument_name
        if self._target_estimand.identifier_method.startswith("iv"):
            if self._estimate.params["method_params"] is not None and \
                    "iv_instrument_name" in self._estimate.params["method_params"]:
                self._estimate.params["method_params"]["iv_instrument_name"] = \
                    [s.replace("placebo_", "", 1) for s in parse_state(self._estimate.params["method_params"]["iv_instrument_name"])]
        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a Placebo Treatment")

        # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
        # Ideally we should expect that ZERO should fall in the distribution of the effect estimates as we have severed any causal
        # relationship between the treatment and the outcome.
        dummy_estimator = CausalEstimate(
            estimate=0,
            control_value=self._estimate.control_value,
            treatment_value=self._estimate.treatment_value,
            target_estimand=self._estimate.target_estimand,
            realized_estimand_expr=self._estimate.realized_estimand_expr)

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates))
        refute.add_refuter(self)
        return refute
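
For estimands identified through an instrumental-variable method, the code above only allows permutation placebos (any other placebo_type raises ValueError) and renames the instruments with a "placebo_" prefix internally. A call for that case would therefore look like the sketch below; passing a RandomState for reproducibility mirrors the self._random_state branches above but is an assumption about the accepted keyword:

# Illustrative only; identified_estimand here comes from an "iv" identifier method.
refutation = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="placebo_treatment_refuter",
    placebo_type="permute",     # the only type supported for IV estimands
    num_simulations=100,
    random_state=np.random.RandomState(42))
print(refutation)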