def refute_estimate(self):
    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info(
        "Refutation over {} simulated datasets of size {} each".format(
            self._num_simulations,
            self._subset_fraction * len(self._data.index)))

    for index in range(self._num_simulations):
        if self._random_state is None:
            new_data = self._data.sample(frac=self._subset_fraction)
        else:
            new_data = self._data.sample(frac=self._subset_fraction,
                                         random_state=self._random_state)

        new_estimator = self.get_estimator_object(new_data,
                                                  self._target_estimand,
                                                  self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a subset of data")

    # We want to see if the estimate falls in the same distribution as the one generated by the refuter
    # Ideally that should be the case, as choosing a subset should not have a significant effect on the ability
    # of the treatment to affect the outcome
    refute.add_significance_test_results(
        self.test_significance(self._estimate, sample_estimates))

    return refute
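# The method above depends on class internals (self._data, self._estimate, and so on).
# Below is a minimal, self-contained sketch of the same idea -- re-estimating an effect on
# random subsets and checking that the full-data estimate stays within the resulting
# distribution -- using a toy difference-in-means "estimator". The data, column names,
# and estimator are illustrative assumptions, not part of the class above.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 1000
treatment = rng.integers(0, 2, n)
outcome = 2.0 * treatment + rng.normal(size=n)  # true effect of 2.0
data = pd.DataFrame({"t": treatment, "y": outcome})

def naive_effect(df):
    # Difference in mean outcomes between treated and untreated rows
    return df.loc[df.t == 1, "y"].mean() - df.loc[df.t == 0, "y"].mean()

full_estimate = naive_effect(data)
subset_estimates = np.array([
    naive_effect(data.sample(frac=0.8)) for _ in range(200)
])
# If subsetting does not disturb the estimate, full_estimate should sit well
# inside the distribution of subset estimates.
print(full_estimate, subset_estimates.mean(), subset_estimates.std())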
def refute_estimate(self, *args, **kwargs):
    if self._sample_size > len(self._data):
        self.logger.warning("The sample size is larger than the population size")

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info("Refutation over {} simulated datasets of size {} each"
                     .format(self._num_simulations, self._sample_size))

    for index in range(self._num_simulations):
        if self._random_state is None:
            new_data = resample(self._data, n_samples=self._sample_size)
        else:
            new_data = resample(self._data,
                                n_samples=self._sample_size,
                                random_state=self._random_state)

        if self._chosen_variables is not None:
            for variable in self._chosen_variables:

                if 'float' in new_data[variable].dtype.name or \
                        'int' in new_data[variable].dtype.name:
                    scaling_factor = new_data[variable].std()
                    new_data[variable] += np.random.normal(
                        loc=0.0,
                        scale=self._noise * scaling_factor,
                        size=self._sample_size)

                elif 'bool' in new_data[variable].dtype.name:
                    probs = np.random.uniform(0, 1, self._sample_size)
                    new_data[variable] = np.where(
                        probs < self._probability_of_change,
                        np.logical_not(new_data[variable]),
                        new_data[variable])

                elif 'category' in new_data[variable].dtype.name:
                    probs = np.random.uniform(0, 1, self._sample_size)
                    categories = new_data[variable].unique()
                    # Find the set difference for each row
                    changed_data = new_data[variable].apply(
                        lambda row: list(set(categories) - set([row])))
                    # Choose one out of the remaining categories
                    changed_data = changed_data.apply(
                        lambda row: random.choice(row))
                    new_data[variable] = np.where(
                        probs < self._probability_of_change,
                        changed_data,
                        new_data[variable])
                    new_data[variable] = new_data[variable].astype('category')

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, self._target_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Bootstrap Sample Dataset")

    # We want to see if the estimate falls in the same distribution as the one generated by the refuter
    # Ideally that should be the case, as running bootstrap should not have a significant effect on the ability
    # of the treatment to affect the outcome
    refute.add_significance_test_results(
        self.test_significance(self._estimate, sample_estimates))

    return refute
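# A self-contained sketch of the perturbation step above: bootstrap-resample the data,
# then add scaled Gaussian noise to a numeric covariate and randomly flip a boolean one.
# The column names, noise scale, and flip probability are illustrative assumptions.
import numpy as np
import pandas as pd
from sklearn.utils import resample

rng = np.random.default_rng(0)
n = 500
data = pd.DataFrame({
    "w_num": rng.normal(size=n),                    # numeric confounder
    "w_bool": rng.integers(0, 2, n).astype(bool),   # boolean confounder
})

sample_size = 400
noise = 0.05                 # fraction of the column's std dev to add as noise
probability_of_change = 0.1  # chance of flipping each boolean entry

new_data = resample(data, n_samples=sample_size)

# Numeric column: jitter by a fraction of its standard deviation
scaling_factor = new_data["w_num"].std()
new_data["w_num"] += rng.normal(0.0, noise * scaling_factor, size=sample_size)

# Boolean column: flip each value with a small probability
probs = rng.uniform(0, 1, sample_size)
new_data["w_bool"] = np.where(probs < probability_of_change,
                              ~new_data["w_bool"].values,
                              new_data["w_bool"].values)
print(new_data.head())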
def refute_estimate(self):
    # We need to change the identified estimand
    # We thus make a copy. This is done as we don't want
    # to change the original DataFrame
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.outcome_variable = ["dummy_outcome"]

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info("Refutation over {} simulated datasets with a {} dummy outcome"
                     .format(self._num_simulations, self._dummy_outcome_type))

    num_rows = self._data.shape[0]

    for index in range(self._num_simulations):
        if self._dummy_outcome_type == "permute":
            if self._random_state is None:
                new_outcome = self._data[self._outcome_name].sample(frac=1).values
            else:
                new_outcome = self._data[self._outcome_name].sample(
                    frac=1, random_state=self._random_state).values
        else:
            new_outcome = np.random.randn(num_rows)

        # Create a new column in the data by the name of dummy_outcome
        new_data = self._data.assign(dummy_outcome=new_outcome)

        # Sanity check the data
        self.logger.debug(new_data[0:10])

        new_estimator = self.get_estimator_object(new_data,
                                                  identified_estimand,
                                                  self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a Dummy Outcome")

    # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
    # Ideally we should expect that ZERO falls in the distribution of the effect estimates, as we have severed any causal
    # relationship between the treatment and the outcome.
    dummy_estimator = copy.deepcopy(self._estimate)
    dummy_estimator.value = 0

    refute.add_significance_test_results(
        self.test_significance(dummy_estimator, sample_estimates))

    return refute
def refute_estimate(self, *args, **kwargs):
    if self._sample_size > len(self._data):
        self.logger.warning("The sample size is larger than the population size")

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info("Refutation over {} simulated datasets of size {} each"
                     .format(self._num_simulations, self._sample_size))

    for index in range(self._num_simulations):
        if self._random_state is None:
            new_data = resample(self._data, n_samples=self._sample_size)
        else:
            new_data = resample(self._data,
                                n_samples=self._sample_size,
                                random_state=self._random_state)

        new_estimator = self.get_estimator_object(new_data,
                                                  self._target_estimand,
                                                  self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Bootstrap Sample Dataset")

    # We want to see if the estimate falls in the same distribution as the one generated by the refuter
    # Ideally that should be the case, as bootstrapping should not have a significant effect on the ability
    # of the treatment to affect the outcome
    refute.add_significance_test_results(
        self.test_significance(self._estimate, sample_estimates))

    return refute
def refute_estimate(self):
    num_rows = self._data.shape[0]
    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info("Refutation over {} simulated datasets, each with a random common cause added"
                     .format(self._num_simulations))

    new_backdoor_variables = self._target_estimand.get_backdoor_variables() + ['w_random']
    identified_estimand = copy.deepcopy(self._target_estimand)
    # Adding a new backdoor variable to the identified estimand
    identified_estimand.set_backdoor_variables(new_backdoor_variables)

    for index in range(self._num_simulations):
        if self._random_state is None:
            new_data = self._data.assign(w_random=np.random.randn(num_rows))
        else:
            new_data = self._data.assign(
                w_random=self._random_state.normal(size=num_rows))

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, identified_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Add a random common cause")

    # We want to see if the estimate falls in the same distribution as the one generated by the refuter
    # Ideally that should be the case, as adding an independent common cause should not have a significant
    # effect on the ability of the treatment to affect the outcome
    refute.add_significance_test_results(
        self.test_significance(self._estimate, sample_estimates))
    refute.add_refuter(self)

    return refute
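# A self-contained sketch of the idea above: append an independent standard-normal
# column as an extra "common cause" and re-fit a simple adjusted estimator. Since the
# new column is pure noise, the re-estimated effect should barely move. The OLS
# estimator and column names here are illustrative assumptions, not the class's API.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 1000
w = rng.normal(size=n)
t = (w + rng.normal(size=n) > 0).astype(float)
y = 2.0 * t + w + rng.normal(size=n)
data = pd.DataFrame({"t": t, "w": w, "y": y})

def ols_effect(df, covariates):
    # Coefficient on the treatment in a linear regression adjusting for covariates
    X = sm.add_constant(df[["t"] + covariates])
    return sm.OLS(df["y"], X).fit().params["t"]

original = ols_effect(data, ["w"])
refuted = []
for _ in range(100):
    new_data = data.assign(w_random=rng.normal(size=n))
    refuted.append(ols_effect(new_data, ["w", "w_random"]))
# original should lie comfortably within the refuted distribution
print(original, np.mean(refuted), np.std(refuted))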
def refute_estimate(self):
    # We need to change the identified estimand
    # We thus make a copy. This is done as we don't want
    # to change the original DataFrame
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.outcome_variable = ["dummy_outcome"]

    self.logger.info("Refutation over {} simulated datasets".format(
        self._num_simulations))
    self.logger.info("The transformation passed: {}".format(
        self._transformation_list))

    simulation_results = []
    refute_list = []

    # We use collections.OrderedDict to maintain the order in which the data is stored
    causal_effect_map = OrderedDict()

    # Check if we are using an estimator in the transformation list
    estimator_present = self._has_estimator()

    # The rationale behind the ordering of the loops is that we induce randomness every time we create the
    # train and validation datasets. Thus, we run the simulation loop followed by the training and validation
    # loops, so that we can get different values every time we get the estimator.

    for _ in range(self._num_simulations):
        estimates = []

        if not estimator_present:
            # Warn the user that the specified parameter is not applicable when no estimator is present in the transformation
            if self._test_fraction != DummyOutcomeRefuter.DEFAULT_TEST_FRACTION:
                self.logger.warning(
                    "'test_fraction' is not applicable as there is no base treatment value.")

            # We set X_train = 0 and outcome_train to be 0
            if self._unobserved_confounder_values is not None:
                self._data['simulated'] = self._unobserved_confounder_values
                self._chosen_variables.append('simulated')
            validation_df = self._data
            X_train = None
            outcome_train = None
            X_validation_df = validation_df[self._chosen_variables]

            X_validation = X_validation_df.values
            outcome_validation = validation_df['y'].values

            # Get the final outcome, after running through all the values in the transformation list
            outcome_validation = self.process_data(
                X_train, outcome_train, X_validation, outcome_validation,
                self._transformation_list)

            # Check if the value of true effect has been already stored
            # We use None as the key as we have no base category for this refutation
            if None not in causal_effect_map:
                # As we currently support only one treatment
                causal_effect_map[None] = self._true_causal_effect(
                    validation_df[self._treatment_name[0]])

            outcome_validation += causal_effect_map[None]

            new_data = validation_df.assign(dummy_outcome=outcome_validation)

            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            estimates.append(new_effect.value)

        else:
            groups = self.preprocess_data_by_treatment()
            group_count = 0

            if len(self._test_fraction) == 1:
                self._test_fraction = len(groups) * self._test_fraction

            for key_train, _ in groups:
                base_train = groups.get_group(key_train).sample(
                    frac=self._test_fraction[group_count].base)
                train_set = set([tuple(line) for line in base_train.values])
                total_set = set([
                    tuple(line) for line in groups.get_group(key_train).values
                ])
                base_validation = pd.DataFrame(
                    list(total_set.difference(train_set)),
                    columns=base_train.columns)

                X_train_df = base_train[self._chosen_variables]
                X_train = X_train_df.values
                outcome_train = base_train['y'].values

                validation_df = []
                transformation_list = self._transformation_list
                validation_df.append(base_validation)

                for key_validation, _ in groups:
                    if key_validation != key_train:
                        validation_df.append(
                            groups.get_group(key_validation).sample(
                                frac=self._test_fraction[group_count].other))

                validation_df = pd.concat(validation_df)
                X_validation_df = validation_df[self._chosen_variables]
                X_validation = X_validation_df.values
                outcome_validation = validation_df['y'].values

                # If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
                if X_train.shape[0] <= self._min_data_point_threshold:
                    transformation_list = DummyOutcomeRefuter.DEFAULT_TRANSFORMATION
                    self.logger.warning(
                        "The number of data points in X_train:{} for category:{} is less than threshold:{}"
                        .format(X_train.shape[0], key_train,
                                self._min_data_point_threshold))
                    self.logger.warning(
                        "Therefore, defaulting to the minimal set of transformations:{}"
                        .format(transformation_list))

                outcome_validation = self.process_data(
                    X_train, outcome_train, X_validation, outcome_validation,
                    transformation_list)

                # Check if the value of true effect has been already stored
                # This ensures that we calculate the causal effect only once.
                # We use key_train as we map data with respect to the base category of the data
                if key_train not in causal_effect_map:
                    # As we currently support only one treatment
                    causal_effect_map[key_train] = self._true_causal_effect(
                        validation_df[self._treatment_name[0]])

                # Add h(t) to f(W) to get the dummy outcome
                outcome_validation += causal_effect_map[key_train]

                new_data = validation_df.assign(dummy_outcome=outcome_validation)

                new_estimator = CausalEstimator.get_estimator_object(
                    new_data, identified_estimand, self._estimate)
                new_effect = new_estimator.estimate_effect()

                estimates.append(new_effect.value)
                group_count += 1

        simulation_results.append(estimates)

    # We convert to ndarray for ease in indexing
    # The data is of the form
    # sim1: cat1 cat2 ... catn
    # sim2: cat1 cat2 ... catn
    simulation_results = np.array(simulation_results)

    # Note: We would like the causal_estimator to find the true causal estimate that we have specified through this
    # refuter. Let the value of the true causal effect be h(t). In the following section of code, we wish to find
    # out if h(t) falls in the distribution of the refuter.

    if not estimator_present:
        dummy_estimate = CausalEstimate(
            estimate=causal_effect_map[None],
            target_estimand=self._estimate.target_estimand,
            realized_estimand_expr=self._estimate.realized_estimand_expr)

        refute = CausalRefutation(
            dummy_estimate.value,
            np.mean(simulation_results),
            refutation_type="Refute: Use a Dummy Outcome")

        refute.add_significance_test_results(
            self.test_significance(dummy_estimate,
                                   np.ravel(simulation_results)))
        refute.add_refuter(self)
        refute_list.append(refute)

    else:
        # True Causal Effect list
        causal_effect_list = list(causal_effect_map.values())

        # Iterating through the refutation for each category
        for train_category in range(simulation_results.shape[1]):
            dummy_estimate = CausalEstimate(
                estimate=causal_effect_list[train_category],
                target_estimand=self._estimate.target_estimand,
                realized_estimand_expr=self._estimate.realized_estimand_expr)

            refute = CausalRefutation(
                dummy_estimate.value,
                np.mean(simulation_results[:, train_category]),
                refutation_type="Refute: Use a Dummy Outcome")

            refute.add_significance_test_results(
                self.test_significance(dummy_estimate,
                                       simulation_results[:, train_category]))
            refute.add_refuter(self)
            refute_list.append(refute)

    return refute_list
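# The train/validation bookkeeping above can be hard to follow in isolation. This is a
# self-contained sketch of just that step: group rows by treatment value, sample a
# fraction of one group for training, and validate on the remainder plus the other
# groups. For simplicity it uses drop() on the sampled index rather than the row-tuple
# set difference above; the column names and split fractions are illustrative assumptions.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 300
data = pd.DataFrame({
    "t": rng.integers(0, 3, n),   # categorical treatment with 3 levels
    "w": rng.normal(size=n),
    "y": rng.normal(size=n),
})

groups = data.groupby("t")
base_fraction = 0.5   # fraction of the base group used for training
other_fraction = 1.0  # fraction of the other groups used for validation

for key_train, group in groups:
    base_train = group.sample(frac=base_fraction, random_state=0)
    # Validation set: the rest of the base group, plus samples from every other group
    base_validation = group.drop(base_train.index)
    validation_parts = [base_validation]
    for key_validation, other in groups:
        if key_validation != key_train:
            validation_parts.append(other.sample(frac=other_fraction, random_state=0))
    validation_df = pd.concat(validation_parts)
    print(key_train, len(base_train), len(validation_df))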
def refute_estimate(self):
    # We need to change the identified estimand
    # We thus make a copy. This is done as we don't want
    # to change the original DataFrame
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.outcome_variable = ["dummy_outcome"]

    self.logger.info("Refutation over {} simulated datasets".format(
        self._num_simulations))
    self.logger.info("The transformation passed: {}".format(
        self._transformation_list))

    simulation_results = []
    refute_list = []

    no_estimator = self.check_for_estimator()

    for _ in range(self._num_simulations):
        estimates = []

        if no_estimator:
            # We set X_train = 0 and outcome_train to be 0
            validation_df = self._data
            X_train = None
            outcome_train = None
            X_validation = validation_df[self._chosen_variables].values
            outcome_validation = validation_df['y'].values

            # Get the final outcome, after running through all the values in the transformation list
            outcome_validation = self.process_data(
                X_train, outcome_train, X_validation, outcome_validation,
                self._transformation_list)

            new_data = validation_df.assign(dummy_outcome=outcome_validation)

            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            estimates.append(new_effect.value)

        else:
            groups = self.preprocess_data_by_treatment()
            for key_train, _ in groups:
                X_train = groups.get_group(key_train)[
                    self._chosen_variables].values
                outcome_train = groups.get_group(key_train)['y'].values

                validation_df = []
                transformation_list = self._transformation_list
                for key_validation, _ in groups:
                    if key_validation != key_train:
                        validation_df.append(groups.get_group(key_validation))
                validation_df = pd.concat(validation_df)
                X_validation = validation_df[self._chosen_variables].values
                outcome_validation = validation_df['y'].values

                # If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
                if X_train.shape[0] <= self._min_data_point_threshold:
                    transformation_list = DummyOutcomeRefuter.DEFAULT_TRANSFORMATION

                outcome_validation = self.process_data(
                    X_train, outcome_train, X_validation, outcome_validation,
                    transformation_list)

                new_data = validation_df.assign(dummy_outcome=outcome_validation)

                new_estimator = CausalEstimator.get_estimator_object(
                    new_data, identified_estimand, self._estimate)
                new_effect = new_estimator.estimate_effect()
                estimates.append(new_effect.value)

        simulation_results.append(estimates)

    # We convert to ndarray for ease in indexing
    # The data is of the form
    # sim1: cat1 cat2 ... catn
    # sim2: cat1 cat2 ... catn
    simulation_results = np.array(simulation_results)

    # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
    # Ideally we should expect that ZERO falls in the distribution of the effect estimates, as we have severed any causal
    # relationship between the treatment and the outcome.
    dummy_estimator = CausalEstimate(
        estimate=0,
        target_estimand=self._estimate.target_estimand,
        realized_estimand_expr=self._estimate.realized_estimand_expr)

    if no_estimator:
        refute = CausalRefutation(
            self._estimate.value,
            np.mean(simulation_results),
            refutation_type="Refute: Use a Dummy Outcome")

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, simulation_results))
        refute_list.append(refute)
    else:
        for category in range(simulation_results.shape[1]):
            refute = CausalRefutation(
                self._estimate.value,
                np.mean(simulation_results[:, category]),
                refutation_type="Refute: Use a Dummy Outcome")

            refute.add_significance_test_results(
                self.test_significance(dummy_estimator,
                                       simulation_results[:, category]))
            refute_list.append(refute)

    return refute_list
def refute_estimate(self):
    # We need to change the identified estimand
    # We thus make a copy. This is done as we don't want
    # to change the original DataFrame
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.outcome_variable = ["dummy_outcome"]

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info("Refutation over {} simulated datasets".format(
        self._num_simulations))
    self.logger.info("The transformation passed: {}".format(
        self._transformations))

    # This flag is to make sure we only store estimators whose input is deterministic
    save_estimators = True
    # We store the value of the estimators in the format "estimator_name" + "pos_in_transform" : estimator_object
    saved_estimator_dict = {}

    X = self._data[self._chosen_variables]
    new_outcome = self._data['y']

    for index in range(self._num_simulations):
        transform_num = 0
        for action, func_args in self._transformations:

            if callable(action):
                new_outcome = action(X, **func_args)

            elif action in DummyOutcomeRefuter.SUPPORTED_ESTIMATORS:
                if action + str(transform_num) in saved_estimator_dict:
                    estimator = saved_estimator_dict[action + str(transform_num)]
                    new_outcome = estimator(X)
                else:
                    estimator = self._estimate_dummy_outcome(
                        func_args, action, new_outcome)
                    new_outcome = estimator(X)
                    if save_estimators:
                        saved_estimator_dict[action + str(transform_num)] = estimator

            elif action == 'noise':
                save_estimators = False
                new_outcome = self._noise(new_outcome, func_args)

            elif action == 'permute':
                save_estimators = False
                new_outcome = self._permute(new_outcome, func_args)

            elif action == 'zero':
                save_estimators = False
                new_outcome = np.zeros(new_outcome.shape)

            transform_num += 1

        # After the first simulation, all deterministic estimators have been cached
        save_estimators = False

        # Create a new column in the data by the name of dummy_outcome
        new_data = self._data.assign(dummy_outcome=new_outcome)

        # Sanity check the data
        self.logger.debug(new_data[0:10])

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, identified_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a Dummy Outcome")

    # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
    # Ideally we should expect that ZERO falls in the distribution of the effect estimates, as we have severed any causal
    # relationship between the treatment and the outcome.
    dummy_estimator = copy.deepcopy(self._estimate)
    dummy_estimator.value = 0

    refute.add_significance_test_results(
        self.test_significance(dummy_estimator, sample_estimates))

    return refute
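# A self-contained sketch of the transformation pipeline above: each (action, args)
# pair rewrites the working outcome in turn. Only the three stateless actions are
# shown; the estimator-caching branch is omitted. The helper logic mirrors the
# _noise/_permute semantics suggested by the code but is an illustrative assumption.
import numpy as np

rng = np.random.default_rng(0)
outcome = rng.normal(size=100)

def apply_transformations(outcome, transformations):
    new_outcome = outcome.copy()
    for action, func_args in transformations:
        if action == "permute":
            new_outcome = rng.permutation(new_outcome)
        elif action == "noise":
            new_outcome = new_outcome + rng.normal(0, func_args["std_dev"],
                                                   new_outcome.shape)
        elif action == "zero":
            new_outcome = np.zeros(new_outcome.shape)
    return new_outcome

# Analogue of the refuter's default transformation: zero out the outcome, then add
# unit noise, leaving a dummy outcome independent of the treatment by construction.
dummy = apply_transformations(outcome, [("zero", {}), ("noise", {"std_dev": 1})])
print(dummy.mean(), dummy.std())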
def refute_estimate(self):
    # We need to change the identified estimand
    # We make a copy as a safety measure, we don't want to change the
    # original DataFrame
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.treatment_variable = ["placebo"]

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info(
        "Refutation over {} simulated datasets of {} treatment".format(
            self._num_simulations, self._placebo_type))

    num_rows = self._data.shape[0]
    # Extract the name of the treatment variable
    treatment_name = self._treatment_name[0]
    type_dict = dict(self._data.dtypes)

    for index in range(self._num_simulations):
        if self._placebo_type == "permute":
            if self._random_state is None:
                new_treatment = self._data[self._treatment_name].sample(
                    frac=1).values
            else:
                new_treatment = self._data[self._treatment_name].sample(
                    frac=1, random_state=self._random_state).values
        else:
            if 'float' in type_dict[treatment_name].name:
                self.logger.info(
                    "Using a Normal Distribution with Mean:{} and Variance:{}"
                    .format(PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL,
                            PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL))
                new_treatment = np.random.randn(num_rows) * \
                    PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL + \
                    PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL

            elif 'bool' in type_dict[treatment_name].name:
                self.logger.info(
                    "Using a Binomial Distribution with {} trials and {} probability of success"
                    .format(PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                            PlaceboTreatmentRefuter.DEFAULT_PROBABILITY_OF_BINOMIAL))
                new_treatment = np.random.binomial(
                    PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                    PlaceboTreatmentRefuter.DEFAULT_PROBABILITY_OF_BINOMIAL,
                    num_rows).astype(bool)

            elif 'int' in type_dict[treatment_name].name:
                self.logger.info(
                    "Using a Discrete Uniform Distribution lying between {} and {}"
                    .format(self._data[treatment_name].min(),
                            self._data[treatment_name].max()))
                # np.random.randint's upper bound is exclusive, so add 1 to include the maximum observed value
                new_treatment = np.random.randint(
                    low=self._data[treatment_name].min(),
                    high=self._data[treatment_name].max() + 1,
                    size=num_rows)

            elif 'category' in type_dict[treatment_name].name:
                categories = self._data[treatment_name].unique()
                self.logger.info(
                    "Using a Discrete Uniform Distribution with the following categories:{}"
                    .format(categories))
                sample = np.random.choice(categories, size=num_rows)
                new_treatment = pd.Series(sample).astype('category')

        # Create a new column in the data by the name of placebo
        new_data = self._data.assign(placebo=new_treatment)

        # Sanity check the data
        self.logger.debug(new_data[0:10])

        new_estimator = self.get_estimator_object(new_data,
                                                  identified_estimand,
                                                  self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a Placebo Treatment")

    # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
    # Ideally we should expect that ZERO falls in the distribution of the effect estimates, as we have severed any causal
    # relationship between the treatment and the outcome.
    dummy_estimator = copy.deepcopy(self._estimate)
    dummy_estimator.value = 0

    refute.add_significance_test_results(
        self.test_significance(dummy_estimator, sample_estimates))

    return refute
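# A self-contained sketch of the placebo check above: permute the treatment column so it
# can no longer cause the outcome, re-estimate, and verify the new estimates cluster
# around zero. The naive difference-in-means estimator and data are illustrative assumptions.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 1000
t = rng.integers(0, 2, n)
y = 2.0 * t + rng.normal(size=n)
data = pd.DataFrame({"t": t, "y": y})

def naive_effect(df, treatment_col):
    mask = df[treatment_col] == 1
    return df.loc[mask, "y"].mean() - df.loc[~mask, "y"].mean()

placebo_estimates = []
for _ in range(200):
    new_data = data.assign(placebo=rng.permutation(data["t"].values))
    placebo_estimates.append(naive_effect(new_data, "placebo"))
# Zero should fall inside this distribution; the original estimate (about 2.0) should not.
print(np.mean(placebo_estimates), np.std(placebo_estimates))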
def refute_estimate(self):
    # We need to change the identified estimand
    # We thus make a copy. This is done as we don't want
    # to change the original DataFrame
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.outcome_variable = ["dummy_outcome"]

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info(
        "Refutation over {} simulated datasets with a {} dummy outcome".format(
            self._num_simulations, self._dummy_outcome_type))

    num_rows = self._data.shape[0]

    for index in range(self._num_simulations):
        if self._dummy_outcome_type == "permute":
            if self._random_state is None:
                new_outcome = self._data[self._outcome_name].sample(
                    frac=1).values
            else:
                new_outcome = self._data[self._outcome_name].sample(
                    frac=1, random_state=self._random_state).values

        elif self._outcome_function is not None:
            new_outcome = self._outcome_function(self._data)

            if type(new_outcome) is pd.Series or \
                    type(new_outcome) is pd.DataFrame:
                new_outcome = new_outcome.values

            # Check if data types match
            assert type(new_outcome) is np.ndarray, (
                "Only supports numpy.ndarray as the output")
            assert 'float' in new_outcome.dtype.name, (
                "Only float outcomes are currently supported")

            if len(new_outcome.shape) == 2 and \
                    (new_outcome.shape[0] == 1 or new_outcome.shape[1] == 1):
                self.logger.warning(
                    "Converting the row or column vector to 1D array")
                new_outcome = new_outcome.ravel()
                assert len(new_outcome) == num_rows, (
                    "The number of outputs does not match the number of rows in the data")
            elif len(new_outcome.shape) == 1:
                assert len(new_outcome) == num_rows, (
                    "The number of outputs does not match the number of rows in the data")
            else:
                raise Exception(
                    "Type Mismatch: The outcome is one dimensional, but the output has the shape:{}"
                    .format(new_outcome.shape))

        else:
            new_outcome = np.random.randn(num_rows)

        # Create a new column in the data by the name of dummy_outcome
        new_data = self._data.assign(dummy_outcome=new_outcome)

        # Sanity check the data
        self.logger.debug(new_data[0:10])

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, identified_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a Dummy Outcome")

    # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
    # Ideally we should expect that ZERO falls in the distribution of the effect estimates, as we have severed any causal
    # relationship between the treatment and the outcome.
    dummy_estimator = copy.deepcopy(self._estimate)
    dummy_estimator.value = 0

    refute.add_significance_test_results(
        self.test_significance(dummy_estimator, sample_estimates))

    return refute
def refute_estimate(self):
    # Only permute is supported for IV methods
    if self._target_estimand.identifier_method.startswith("iv"):
        if self._placebo_type != "permute":
            self.logger.error(
                "Only placebo_type='permute' is supported for creating placebos for instrumental variable estimation methods"
            )
            raise ValueError(
                "Only placebo_type='permute' is supported for creating placebos for instrumental variable estimation methods."
            )

    # We need to change the identified estimand
    # We make a copy as a safety measure, we don't want to change the
    # original DataFrame
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.treatment_variable = ["placebo"]
    if self._target_estimand.identifier_method.startswith("iv"):
        identified_estimand.instrumental_variables = [
            "placebo_" + s for s in identified_estimand.instrumental_variables
        ]
        # For IV methods, the estimating_instrument_names should also be
        # changed. So we change it inside the estimate and then restore it
        # back at the end of this method.
        if self._estimate.params["method_params"] is not None and \
                "iv_instrument_name" in self._estimate.params["method_params"]:
            self._estimate.params["method_params"]["iv_instrument_name"] = \
                ["placebo_" + s for s in parse_state(
                    self._estimate.params["method_params"]["iv_instrument_name"])]

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info(
        "Refutation over {} simulated datasets of {} treatment".format(
            self._num_simulations, self._placebo_type))

    num_rows = self._data.shape[0]
    # Extract the name of the treatment variable
    treatment_name = self._treatment_name[0]
    type_dict = dict(self._data.dtypes)

    for index in range(self._num_simulations):
        if self._placebo_type == "permute":
            permuted_idx = None
            if self._random_state is None:
                permuted_idx = np.random.choice(self._data.shape[0],
                                                size=self._data.shape[0],
                                                replace=False)
            else:
                permuted_idx = self._random_state.choice(
                    self._data.shape[0],
                    size=self._data.shape[0],
                    replace=False)
            new_treatment = self._data[
                self._treatment_name].iloc[permuted_idx].values
            if self._target_estimand.identifier_method.startswith("iv"):
                new_instruments_values = self._data[
                    self._estimate.estimator.
                    estimating_instrument_names].iloc[permuted_idx].values
                new_instruments_df = pd.DataFrame(
                    new_instruments_values,
                    columns=[
                        "placebo_" + s
                        for s in self._data[self._estimate.estimator.
                                            estimating_instrument_names].columns
                    ])
        else:
            if 'float' in type_dict[treatment_name].name:
                self.logger.info(
                    "Using a Normal Distribution with Mean:{} and Variance:{}"
                    .format(PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL,
                            PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL))
                new_treatment = np.random.randn(num_rows) * \
                    PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL + \
                    PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL

            elif 'bool' in type_dict[treatment_name].name:
                self.logger.info(
                    "Using a Binomial Distribution with {} trials and {} probability of success"
                    .format(PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                            PlaceboTreatmentRefuter.DEFAULT_PROBABILITY_OF_BINOMIAL))
                new_treatment = np.random.binomial(
                    PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                    PlaceboTreatmentRefuter.DEFAULT_PROBABILITY_OF_BINOMIAL,
                    num_rows).astype(bool)

            elif 'int' in type_dict[treatment_name].name:
                self.logger.info(
                    "Using a Discrete Uniform Distribution lying between {} and {}"
                    .format(self._data[treatment_name].min(),
                            self._data[treatment_name].max()))
                new_treatment = np.random.randint(
                    low=self._data[treatment_name].min(),
                    high=self._data[treatment_name].max(),
                    size=num_rows)

            elif 'category' in type_dict[treatment_name].name:
                categories = self._data[treatment_name].unique()
                self.logger.info(
                    "Using a Discrete Uniform Distribution with the following categories:{}"
                    .format(categories))
                sample = np.random.choice(categories, size=num_rows)
                new_treatment = pd.Series(sample).astype('category')

        # Create a new column in the data by the name of placebo
        new_data = self._data.assign(placebo=new_treatment)
        if self._target_estimand.identifier_method.startswith("iv"):
            new_data = pd.concat((new_data, new_instruments_df), axis=1)

        # Sanity check the data
        self.logger.debug(new_data[0:10])

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, identified_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    # Restoring the value of iv_instrument_name
    if self._target_estimand.identifier_method.startswith("iv"):
        if self._estimate.params["method_params"] is not None and \
                "iv_instrument_name" in self._estimate.params["method_params"]:
            self._estimate.params["method_params"]["iv_instrument_name"] = \
                [s.replace("placebo_", "", 1) for s in parse_state(
                    self._estimate.params["method_params"]["iv_instrument_name"])]

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a Placebo Treatment")

    # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
    # Ideally we should expect that ZERO falls in the distribution of the effect estimates, as we have severed any causal
    # relationship between the treatment and the outcome.
    dummy_estimator = CausalEstimate(
        estimate=0,
        control_value=self._estimate.control_value,
        treatment_value=self._estimate.treatment_value,
        target_estimand=self._estimate.target_estimand,
        realized_estimand_expr=self._estimate.realized_estimand_expr)

    refute.add_significance_test_results(
        self.test_significance(dummy_estimator, sample_estimates))
    refute.add_refuter(self)

    return refute
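# For the IV branch above, the treatment and the instrument must be permuted with the
# *same* index so their mutual dependence survives while both are severed from the
# outcome. A self-contained sketch of that joint permutation; the column names and
# data-generating process are illustrative assumptions.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 1000
z = rng.integers(0, 2, n)                        # instrument
t = (z + rng.normal(size=n) > 0.5).astype(int)   # treatment driven by the instrument
y = 2.0 * t + rng.normal(size=n)
data = pd.DataFrame({"z": z, "t": t, "y": y})

permuted_idx = rng.choice(n, size=n, replace=False)
new_data = data.assign(
    placebo=data["t"].iloc[permuted_idx].values,
    placebo_z=data["z"].iloc[permuted_idx].values,
)
# placebo and placebo_z keep their instrument-treatment relationship with each other
# but are now independent of y, so an IV estimate on them should be near zero.
print(new_data[["placebo", "placebo_z", "y"]].corr())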