def refute_estimate(self):
    """Re-estimate the effect on random subsets of the data.

    For each of ``self._num_simulations`` rounds a fraction
    ``self._subset_fraction`` of ``self._data`` is sampled, a new estimator
    is built for that sample and its effect value is recorded.

    :return: A ``CausalRefutation`` built from the original estimate value
        and the mean of the simulated effects, with the result of
        ``self.test_significance`` attached.
    """
    sample_estimates = np.zeros(self._num_simulations)
    # The first placeholder is the number of simulations (not the subset
    # fraction); the second is the approximate size of each subset.
    self.logger.info(
        "Refutation over {} simulated datasets of size {} each".format(
            self._num_simulations,
            self._subset_fraction * len(self._data.index)))

    for index in range(self._num_simulations):
        if self._random_state is None:
            new_data = self._data.sample(frac=self._subset_fraction)
        else:
            new_data = self._data.sample(frac=self._subset_fraction,
                                         random_state=self._random_state)

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, self._target_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a subset of data")

    # We want to see if the estimate falls in the same distribution as the
    # one generated by the refuter. Ideally that should be the case, as
    # choosing a subset should not have a significant effect on the ability
    # of the treatment to affect the outcome.
    refute.add_significance_test_results(
        self.test_significance(self._estimate, sample_estimates))

    return refute
def refute_estimate(self, *args, **kwargs):
    """Re-estimate the effect on bootstrap samples, optionally perturbed.

    Each simulation draws ``self._sample_size`` rows with ``resample`` and,
    when ``self._chosen_variables`` is set, perturbs those columns:

    * float / int columns receive Gaussian noise scaled by
      ``self._noise`` times the column's standard deviation;
    * bool columns are flipped with probability
      ``self._probability_of_change``;
    * category columns are replaced by a different category with the same
      probability.

    :return: A ``CausalRefutation`` built from the original estimate value
        and the mean of the simulated effects, with the result of
        ``self.test_significance`` attached.
    """
    if self._sample_size > len(self._data):
        self.logger.warning("The sample size is larger than the population size")

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info(
        "Refutation over {} simulated datasets of size {} each".format(
            self._num_simulations, self._sample_size))

    for index in range(self._num_simulations):
        if self._random_state is None:
            new_data = resample(self._data, n_samples=self._sample_size)
        else:
            new_data = resample(self._data,
                                n_samples=self._sample_size,
                                random_state=self._random_state)

        if self._chosen_variables is not None:
            for variable in self._chosen_variables:
                dtype_name = new_data[variable].dtype.name

                # NOTE: ``('float' or 'int') in name`` only ever tested
                # 'float'; both numeric kinds are checked explicitly here.
                if 'float' in dtype_name or 'int' in dtype_name:
                    scaling_factor = new_data[variable].std()
                    noise = np.random.normal(
                        loc=0.0,
                        scale=self._noise * scaling_factor,
                        size=self._sample_size)
                    # Plain addition (not ``+=``) so integer columns are
                    # upcast to float instead of being modified in place.
                    new_data[variable] = new_data[variable] + noise

                elif 'bool' in dtype_name:
                    probs = np.random.uniform(0, 1, self._sample_size)
                    new_data[variable] = np.where(
                        probs < self._probability_of_change,
                        np.logical_not(new_data[variable]),
                        new_data[variable])

                elif 'category' in dtype_name:
                    categories = list(new_data[variable].unique())
                    # With fewer than two categories there is nothing to
                    # switch to, so the column is left untouched.
                    if len(categories) > 1:
                        # Draw the change mask here; it must not depend on
                        # a previously processed bool column.
                        probs = np.random.uniform(0, 1, self._sample_size)
                        # For every row choose one of the *other* categories.
                        changed_data = [
                            random.choice(
                                [c for c in categories if c != value])
                            for value in new_data[variable]
                        ]
                        new_data[variable] = np.where(
                            probs < self._probability_of_change,
                            changed_data,
                            new_data[variable])
                        # ``astype`` returns a new Series; assign it back.
                        new_data[variable] = new_data[variable].astype('category')

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, self._target_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Bootstrap Sample Dataset")

    # We want to see if the estimate falls in the same distribution as the
    # one generated by the refuter. Ideally that should be the case, as
    # running bootstrap should not have a significant effect on the ability
    # of the treatment to affect the outcome.
    refute.add_significance_test_results(
        self.test_significance(self._estimate, sample_estimates))

    return refute
def refute_estimate(self):
    """Add one random column as an extra backdoor variable and re-estimate.

    A standard-normal column named ``w_random`` is appended to a copy of
    the data and listed among the backdoor variables of a copied estimand.

    :return: A ``CausalRefutation`` holding the original estimate value and
        the effect value obtained with the random column included.
    """
    random_cause = np.random.randn(self._data.shape[0])
    augmented_data = self._data.assign(w_random=random_cause)

    # Work on a copy so the caller's estimand keeps its backdoor set.
    augmented_estimand = copy.deepcopy(self._target_estimand)
    augmented_estimand.backdoor_variables = (
        self._target_estimand.backdoor_variables + ['w_random'])

    effect_with_random_cause = CausalEstimator.get_estimator_object(
        augmented_data, augmented_estimand, self._estimate).estimate_effect()

    return CausalRefutation(
        self._estimate.value,
        effect_with_random_cause.value,
        refutation_type="Refute: Add a Random Common Cause")
def refute_estimate(self):
    """Repeatedly add a random common cause and re-estimate the effect.

    In every simulation a freshly drawn normal column ``w_random`` is
    attached to the data; the copied estimand lists it as an additional
    backdoor variable.

    :return: A ``CausalRefutation`` built from the original estimate value
        and the mean simulated effect, with significance results and this
        refuter attached.
    """
    row_count = self._data.shape[0]
    simulation_count = self._num_simulations
    effects = np.zeros(simulation_count)
    self.logger.info(
        "Refutation over {} simulated datasets, each with a random common cause added"
        .format(simulation_count))

    # The estimand is copied once; only the data changes between rounds.
    extended_backdoor = self._target_estimand.get_backdoor_variables() + ['w_random']
    estimand_with_random_cause = copy.deepcopy(self._target_estimand)
    estimand_with_random_cause.set_backdoor_variables(extended_backdoor)

    def draw_random_cause():
        # Use the refuter's own generator when one was supplied.
        if self._random_state is not None:
            return self._random_state.normal(size=row_count)
        return np.random.randn(row_count)

    for round_number in range(simulation_count):
        augmented_data = self._data.assign(w_random=draw_random_cause())
        round_estimator = CausalEstimator.get_estimator_object(
            augmented_data, estimand_with_random_cause, self._estimate)
        effects[round_number] = round_estimator.estimate_effect().value

    refutation = CausalRefutation(
        self._estimate.value,
        np.mean(effects),
        refutation_type="Refute: Add a random common cause")

    # An independent random covariate should leave the estimate within the
    # distribution of the simulated effects.
    refutation.add_significance_test_results(
        self.test_significance(self._estimate, effects))
    refutation.add_refuter(self)
    return refutation
def refute_estimate(self):
    """Replace the outcome with a simulated one and re-estimate the effect.

    The dummy outcome is the result of ``self.process_data`` over the
    transformation list plus the value returned by
    ``self._true_causal_effect`` for the treatment column. Each simulation
    produces either one estimate (no estimator in the transformation
    list) or one estimate per treatment group used as training set.

    :return: A list of ``CausalRefutation`` objects: a single entry when no
        estimator is present, otherwise one entry per training category.
    """
    # We need to change the identified estimand, so we work on a copy to
    # leave the caller's estimand untouched.
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.outcome_variable = ["dummy_outcome"]

    self.logger.info("Refutation over {} simulated datasets".format(
        self._num_simulations))
    self.logger.info("The transformation passed: {}".format(
        self._transformation_list))

    simulation_results = []
    refute_list = []

    # We use collections.OrderedDict to maintain the order in which the
    # true effects are stored (one entry per training category).
    causal_effect_map = OrderedDict()

    # Check if we are using an estimator in the transformation list
    estimator_present = self._has_estimator()

    # The rationale behind the ordering of the loops is that randomness is
    # induced every time the train and validation sets are created. Thus
    # the simulation loop is the outer loop, followed by the training and
    # validation loops, so the estimator can see different data each time.
    for _ in range(self._num_simulations):
        estimates = []

        if estimator_present == False:
            # Warn the user that the specified parameter is not applicable
            # when no estimator is present in the transformation
            if self._test_fraction != DummyOutcomeRefuter.DEFAULT_TEST_FRACTION:
                self.logger.warning(
                    "'test_fraction' is not applicable as there is no base treatment value."
                )

            # Add the user-supplied unobserved confounder, if any.
            if self._unobserved_confounder_values is not None:
                self._data['simulated'] = self._unobserved_confounder_values
                # Register the column only once; appending on every
                # simulation would duplicate it in the selected features.
                if 'simulated' not in self._chosen_variables:
                    self._chosen_variables.append('simulated')

            # Without an estimator there is no training data.
            validation_df = self._data
            X_train = None
            outcome_train = None
            X_validation_df = validation_df[self._chosen_variables]
            X_validation = X_validation_df.values
            outcome_validation = validation_df['y'].values

            # Get the final outcome, after running through all the values
            # in the transformation list
            outcome_validation = self.process_data(
                X_train, outcome_train, X_validation, outcome_validation,
                self._transformation_list)

            # Check if the value of true effect has been already stored.
            # None is the key as there is no base category here.
            if None not in causal_effect_map:
                # As we currently support only one treatment
                causal_effect_map[None] = self._true_causal_effect(
                    validation_df[self._treatment_name[0]])

            outcome_validation += causal_effect_map[None]

            new_data = validation_df.assign(
                dummy_outcome=outcome_validation)

            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            estimates.append(new_effect.value)

        else:
            groups = self.preprocess_data_by_treatment()
            group_count = 0

            # A single test-fraction entry is repeated for every group.
            if len(self._test_fraction) == 1:
                self._test_fraction = len(groups) * self._test_fraction

            for key_train, _ in groups:
                base_train = groups.get_group(key_train).sample(
                    frac=self._test_fraction[group_count].base)
                # Rows of the group that were not sampled for training
                # become part of the validation set.
                train_set = set(
                    [tuple(line) for line in base_train.values])
                total_set = set([
                    tuple(line)
                    for line in groups.get_group(key_train).values
                ])
                base_validation = pd.DataFrame(
                    list(total_set.difference(train_set)),
                    columns=base_train.columns)
                X_train_df = base_train[self._chosen_variables]
                X_train = X_train_df.values

                outcome_train = base_train['y'].values

                validation_df = []
                transformation_list = self._transformation_list
                validation_df.append(base_validation)

                for key_validation, _ in groups:
                    if key_validation != key_train:
                        validation_df.append(
                            groups.get_group(key_validation).sample(
                                frac=self._test_fraction[group_count].other))

                validation_df = pd.concat(validation_df)
                X_validation_df = validation_df[self._chosen_variables]

                X_validation = X_validation_df.values
                outcome_validation = validation_df['y'].values

                # If the number of data points is too few, run the default
                # transformation: [("zero",""),("noise", {'std_dev':1} )]
                if X_train.shape[0] <= self._min_data_point_threshold:
                    transformation_list = DummyOutcomeRefuter.DEFAULT_TRANSFORMATION
                    self.logger.warning(
                        "The number of data points in X_train:{} for category:{} is less than threshold:{}"
                        .format(X_train.shape[0], key_train,
                                self._min_data_point_threshold))
                    self.logger.warning(
                        "Therefore, defaulting to the minimal set of transformations:{}"
                        .format(transformation_list))

                outcome_validation = self.process_data(
                    X_train, outcome_train, X_validation,
                    outcome_validation, transformation_list)

                # Check if the value of true effect has been already
                # stored, so the causal effect is calculated only once.
                # key_train is the base category of this split.
                if key_train not in causal_effect_map:
                    # As we currently support only one treatment
                    causal_effect_map[key_train] = self._true_causal_effect(
                        validation_df[self._treatment_name[0]])

                # Add h(t) to f(W) to get the dummy outcome
                outcome_validation += causal_effect_map[key_train]

                new_data = validation_df.assign(
                    dummy_outcome=outcome_validation)
                new_estimator = CausalEstimator.get_estimator_object(
                    new_data, identified_estimand, self._estimate)
                new_effect = new_estimator.estimate_effect()

                estimates.append(new_effect.value)
                group_count += 1

        simulation_results.append(estimates)

    # We convert to ndarray for ease in indexing.
    # The data is of the form
    # sim1: cat1 cat2 ... catn
    # sim2: cat1 cat2 ... catn
    simulation_results = np.array(simulation_results)

    # Note: We would like the causal estimator to find the true causal
    # estimate specified through this refuter. Let the true causal effect
    # be h(t); below we check whether h(t) falls in the distribution of
    # the refuter.
    if estimator_present == False:
        dummy_estimate = CausalEstimate(
            estimate=causal_effect_map[None],
            target_estimand=self._estimate.target_estimand,
            realized_estimand_expr=self._estimate.realized_estimand_expr)

        refute = CausalRefutation(
            dummy_estimate.value,
            np.mean(simulation_results),
            refutation_type="Refute: Use a Dummy Outcome")

        refute.add_significance_test_results(
            self.test_significance(dummy_estimate,
                                   np.ravel(simulation_results)))

        refute.add_refuter(self)
        refute_list.append(refute)

    else:
        # True Causal Effect list
        causal_effect_list = list(causal_effect_map.values())
        # Iterating through the refutation for each category
        for train_category in range(simulation_results.shape[1]):
            dummy_estimate = CausalEstimate(
                estimate=causal_effect_list[train_category],
                target_estimand=self._estimate.target_estimand,
                realized_estimand_expr=self._estimate.realized_estimand_expr)

            refute = CausalRefutation(
                dummy_estimate.value,
                np.mean(simulation_results[:, train_category]),
                refutation_type="Refute: Use a Dummy Outcome")

            refute.add_significance_test_results(
                self.test_significance(
                    dummy_estimate, simulation_results[:, train_category]))

            refute.add_refuter(self)
            refute_list.append(refute)

    return refute_list
def refute_estimate(self):
    """Replace the outcome with a simulated one and re-estimate the effect.

    The dummy outcome is produced by ``self.process_data`` from the
    transformation list. Each simulation yields one estimate when no
    estimator is used, otherwise one estimate per treatment group used as
    the training set.

    :return: A list of ``CausalRefutation`` objects, one per column of the
        simulation results.
    """
    # We need to change the identified estimand, so we work on a copy to
    # leave the caller's estimand untouched.
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.outcome_variable = ["dummy_outcome"]

    self.logger.info("Refutation over {} simulated datasets".format(
        self._num_simulations))
    self.logger.info("The transformation passed: {}".format(
        self._transformation_list))

    simulation_results = []
    refute_list = []

    # NOTE(review): the flag is used as "no estimator in the transformation
    # list"; confirm that check_for_estimator() returns True in that case.
    no_estimator = self.check_for_estimator()

    def estimate_with_dummy_outcome(frame, dummy_outcome):
        # Attach the dummy outcome and return the re-estimated effect value.
        new_data = frame.assign(dummy_outcome=dummy_outcome)
        new_estimator = CausalEstimator.get_estimator_object(
            new_data, identified_estimand, self._estimate)
        return new_estimator.estimate_effect().value

    for _ in range(self._num_simulations):
        estimates = []

        if no_estimator:
            # Without an estimator there is no training data.
            validation_df = self._data
            X_train = None
            outcome_train = None
            X_validation = validation_df[self._chosen_variables].values
            outcome_validation = validation_df['y'].values

            # Get the final outcome, after running through all the values
            # in the transformation list
            outcome_validation = self.process_data(
                X_train, outcome_train, X_validation, outcome_validation,
                self._transformation_list)

            estimates.append(
                estimate_with_dummy_outcome(validation_df, outcome_validation))

        else:
            groups = self.preprocess_data_by_treatment()

            for key_train, _ in groups:
                X_train = groups.get_group(key_train)[
                    self._chosen_variables].values
                outcome_train = groups.get_group(key_train)['y'].values

                # Every other treatment group is used for validation.
                validation_df = []
                transformation_list = self._transformation_list

                for key_validation, _ in groups:
                    if key_validation != key_train:
                        validation_df.append(
                            groups.get_group(key_validation))

                validation_df = pd.concat(validation_df)
                X_validation = validation_df[self._chosen_variables].values
                outcome_validation = validation_df['y'].values

                # If the number of data points is too few, run the default
                # transformation: [("zero",""),("noise", {'std_dev':1} )]
                if X_train.shape[0] <= self._min_data_point_threshold:
                    transformation_list = DummyOutcomeRefuter.DEFAULT_TRANSFORMATION

                outcome_validation = self.process_data(
                    X_train, outcome_train, X_validation,
                    outcome_validation, transformation_list)

                # One estimate per training category, so that the results
                # have the documented "sim x category" layout.
                estimates.append(
                    estimate_with_dummy_outcome(validation_df,
                                                outcome_validation))

        simulation_results.append(estimates)

    # We convert to ndarray for ease in indexing.
    # The data is of the form
    # sim1: cat1 cat2 ... catn
    # sim2: cat1 cat2 ... catn
    simulation_results = np.array(simulation_results)

    # Note: We hardcode the estimate value to ZERO as we want to check if
    # it falls in the distribution of the refuter. Ideally ZERO should
    # fall in the distribution of the effect estimates, as any causal
    # relationship between the treatment and the outcome has been severed.
    dummy_estimator = CausalEstimate(
        estimate=0,
        target_estimand=self._estimate.target_estimand,
        realized_estimand_expr=self._estimate.realized_estimand_expr)

    if no_estimator:
        refute = CausalRefutation(
            self._estimate.value,
            np.mean(simulation_results),
            refutation_type="Refute: Use a Dummy Outcome")

        # Flatten the (simulations x 1) results for the significance test.
        refute.add_significance_test_results(
            self.test_significance(dummy_estimator,
                                   np.ravel(simulation_results)))

        refute_list.append(refute)

    else:
        # ``shape[1]`` is an int, so iterate over ``range`` of it.
        for category in range(simulation_results.shape[1]):
            refute = CausalRefutation(
                self._estimate.value,
                np.mean(simulation_results[:, category]),
                refutation_type="Refute: Use a Dummy Outcome")

            refute.add_significance_test_results(
                self.test_significance(dummy_estimator,
                                       simulation_results[:, category]))

            refute_list.append(refute)

    return refute_list
def refute_estimate(self):
    """Replace the outcome by a transformed dummy outcome and re-estimate.

    In every simulation the original outcome column ``'y'`` is passed
    through ``self._transformations`` (callables, supported estimators,
    ``'noise'``, ``'permute'`` or ``'zero'``) and the effect on the
    resulting ``dummy_outcome`` column is estimated.

    :return: A ``CausalRefutation`` built from the original estimate value
        and the mean simulated effect, with a significance test against an
        estimate whose value is set to zero.
    """
    # We need to change the identified estimand, so we work on a copy to
    # leave the caller's estimand untouched.
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.outcome_variable = ["dummy_outcome"]

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info("Refutation over {} simulated datasets".format(
        self._num_simulations))
    # The message uses ``{}``, so it must be formatted with str.format;
    # passing the value as a logging argument expects %-style placeholders.
    self.logger.info("The transformation passed: {}".format(
        self._transformations))

    # This flag makes sure we only store estimators whose input is
    # deterministic.
    save_estimators = True
    # Stored as "estimator_name" + "pos_in_transform" : estimator_object
    saved_estimator_dict = {}

    X = self._data[self._chosen_variables]

    for index in range(self._num_simulations):
        # Start every simulation from the original outcome. Cached
        # estimators assume identical input in each simulation, which does
        # not hold if the previous simulation's result is carried over.
        new_outcome = self._data['y']
        transform_num = 0

        for action, func_args in self._transformations:
            if callable(action):
                new_outcome = action(X, **func_args)

            elif action in DummyOutcomeRefuter.SUPPORTED_ESTIMATORS:
                estimator_key = action + str(transform_num)
                if estimator_key in saved_estimator_dict:
                    estimator = saved_estimator_dict[estimator_key]
                    new_outcome = estimator(X)
                else:
                    estimator = self._estimate_dummy_outcome(
                        func_args, action, new_outcome)
                    new_outcome = estimator(X)
                    if save_estimators:
                        saved_estimator_dict[estimator_key] = estimator

            elif action == 'noise':
                # Anything after a random step is no longer deterministic.
                save_estimators = False
                new_outcome = self._noise(new_outcome, func_args)

            elif action == 'permute':
                save_estimators = False
                new_outcome = self._permute(new_outcome, func_args)

            elif action == 'zero':
                save_estimators = False
                new_outcome = np.zeros(new_outcome.shape)

            transform_num += 1

        # Estimators are only collected during the first simulation.
        save_estimators = False

        # Create a new column in the data by the name of dummy_outcome
        new_data = self._data.assign(dummy_outcome=new_outcome)

        # Sanity check the data
        self.logger.debug(new_data[0:10])

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, identified_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a Dummy Outcome")

    # Note: We hardcode the estimate value to ZERO as we want to check if
    # it falls in the distribution of the refuter. Ideally ZERO should
    # fall in the distribution of the effect estimates, as any causal
    # relationship between the treatment and the outcome has been severed.
    dummy_estimator = copy.deepcopy(self._estimate)
    dummy_estimator.value = 0

    refute.add_significance_test_results(
        self.test_significance(dummy_estimator, sample_estimates))

    return refute
def refute_estimate(self):
    """Replace the treatment with a placebo and re-estimate the effect.

    With ``self._placebo_type == "permute"`` the treatment column is
    shuffled; otherwise a random placebo is generated according to the
    dtype of the treatment column (float, bool, int or category).

    :return: A ``CausalRefutation`` built from the original estimate value
        and the mean simulated effect, with a significance test against an
        estimate whose value is set to zero.
    :raises ValueError: If a random placebo is requested for a treatment
        dtype that is not float, bool, int or category.
    """
    # We need to change the identified estimand, so we work on a copy to
    # leave the caller's estimand untouched.
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.treatment_variable = ["placebo"]

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info(
        "Refutation over {} simulated datasets of {} treatment".format(
            self._num_simulations, self._placebo_type))

    num_rows = self._data.shape[0]
    # Extract the name of the treatment variable
    treatment_name = self._treatment_name[0]
    type_dict = dict(self._data.dtypes)

    for index in range(self._num_simulations):

        if self._placebo_type == "permute":
            if self._random_state is None:
                new_treatment = self._data[self._treatment_name].sample(
                    frac=1).values
            else:
                new_treatment = self._data[self._treatment_name].sample(
                    frac=1, random_state=self._random_state).values
        else:
            dtype_name = type_dict[treatment_name].name

            if 'float' in dtype_name:
                self.logger.info(
                    "Using a Normal Distribution with Mean:{} and Variance:{}"
                    .format(
                        PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL,
                        PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL))
                new_treatment = np.random.randn(num_rows) * PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL + \
                    PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL

            elif 'bool' in dtype_name:
                self.logger.info(
                    "Using a Binomial Distribution with {} trials and {} probability of success"
                    .format(
                        PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                        PlaceboTreatmentRefuter.DEFAULT_PROBABILITY_OF_BINOMIAL))
                new_treatment = np.random.binomial(
                    PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                    PlaceboTreatmentRefuter.DEFAULT_PROBABILITY_OF_BINOMIAL,
                    num_rows).astype(bool)

            elif 'int' in dtype_name:
                lowest = self._data[treatment_name].min()
                highest = self._data[treatment_name].max()
                self.logger.info(
                    "Using a Discrete Uniform Distribution lying between {} and {}"
                    .format(lowest, highest))
                # ``high`` is exclusive in numpy's randint; add one so the
                # observed maximum can be drawn and a constant column
                # (min == max) does not raise.
                new_treatment = np.random.randint(
                    low=lowest, high=highest + 1, size=num_rows)

            elif 'category' in dtype_name:
                categories = self._data[treatment_name].unique()
                self.logger.info(
                    "Using a Discrete Uniform Distribution with the following categories:{}"
                    .format(categories))
                sample = np.random.choice(categories, size=num_rows)
                # Share the data's index so ``assign`` aligns row by row
                # even when the index is not 0..n-1.
                new_treatment = pd.Series(
                    sample, index=self._data.index).astype('category')

            else:
                raise ValueError(
                    "Cannot generate a random placebo for treatment dtype "
                    "'{}'".format(dtype_name))

        # Create a new column in the data by the name of placebo
        new_data = self._data.assign(placebo=new_treatment)

        # Sanity check the data
        self.logger.debug(new_data[0:10])

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, identified_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a Placebo Treatment")

    # Note: We hardcode the estimate value to ZERO as we want to check if
    # it falls in the distribution of the refuter. Ideally ZERO should
    # fall in the distribution of the effect estimates, as any causal
    # relationship between the treatment and the outcome has been severed.
    dummy_estimator = copy.deepcopy(self._estimate)
    dummy_estimator.value = 0

    refute.add_significance_test_results(
        self.test_significance(dummy_estimator, sample_estimates))

    return refute
def refute_estimate(self):
    """Replace the outcome with a dummy outcome and re-estimate the effect.

    The dummy outcome is a permutation of the outcome column
    (``self._dummy_outcome_type == "permute"``), the output of
    ``self._outcome_function`` applied to the data, or standard-normal
    noise when neither applies.

    :return: A ``CausalRefutation`` built from the original estimate value
        and the mean simulated effect, with a significance test against an
        estimate whose value is set to zero.
    :raises AssertionError: If the outcome function does not return a
        float ``numpy.ndarray`` with one value per row.
    :raises Exception: If the outcome function returns an array that is
        neither one dimensional nor a single row / column vector.
    """
    # We need to change the identified estimand, so we work on a copy to
    # leave the caller's estimand untouched.
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.outcome_variable = ["dummy_outcome"]

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info(
        "Refutation over {} simulated datasets of {} treatment".format(
            self._num_simulations, self._dummy_outcome_type))

    num_rows = self._data.shape[0]

    for index in range(self._num_simulations):

        if self._dummy_outcome_type == "permute":
            if self._random_state is None:
                new_outcome = self._data[self._outcome_name].sample(
                    frac=1).values
            else:
                new_outcome = self._data[self._outcome_name].sample(
                    frac=1, random_state=self._random_state).values

        elif self._outcome_function is not None:
            new_outcome = self._outcome_function(self._data)

            if isinstance(new_outcome, (pd.Series, pd.DataFrame)):
                new_outcome = new_outcome.values

            # Check if data types match
            assert type(new_outcome) is np.ndarray, (
                "Only supports numpy.ndarray as the output")
            assert 'float' in new_outcome.dtype.name, (
                "Only float outcomes are currently supported")

            # Only a genuine row or column vector may be flattened; the
            # second dimension must be compared with 1, not just truthy.
            if len(new_outcome.shape) == 2 and \
                    (new_outcome.shape[0] == 1 or new_outcome.shape[1] == 1):
                self.logger.warning(
                    "Converting the row or column vector to 1D array")
                new_outcome = new_outcome.ravel()
                assert len(new_outcome) == num_rows, (
                    "The number of outputs do not match that of the number of outcomes"
                )
            elif len(new_outcome.shape) == 1:
                assert len(new_outcome) == num_rows, (
                    "The number of outputs do not match that of the number of outcomes"
                )
            else:
                raise Exception(
                    "Type Mismatch: The outcome is one dimensional, but the output has the shape:{}"
                    .format(new_outcome.shape))
        else:
            new_outcome = np.random.randn(num_rows)

        # Create a new column in the data by the name of dummy_outcome
        new_data = self._data.assign(dummy_outcome=new_outcome)

        # Sanity check the data
        self.logger.debug(new_data[0:10])

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, identified_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a Dummy Outcome")

    # Note: We hardcode the estimate value to ZERO as we want to check if
    # it falls in the distribution of the refuter. Ideally ZERO should
    # fall in the distribution of the effect estimates, as any causal
    # relationship between the treatment and the outcome has been severed.
    dummy_estimator = copy.deepcopy(self._estimate)
    dummy_estimator.value = 0

    refute.add_significance_test_results(
        self.test_significance(dummy_estimator, sample_estimates))

    return refute
def refute_estimate(self):
    """Replace the treatment with a placebo and re-estimate the effect.

    With ``self._placebo_type == "permute"`` the treatment column (and,
    for instrumental-variable identification, the instrument columns) are
    permuted with the same row order. Otherwise a random placebo is drawn
    according to the dtype of the treatment column.

    For IV methods ``iv_instrument_name`` inside
    ``self._estimate.params["method_params"]`` is temporarily pointed at
    the ``placebo_``-prefixed columns and restored before returning, even
    when estimation raises.

    :return: A ``CausalRefutation`` built from the original estimate value
        and the mean simulated effect, with a significance test against a
        zero-valued ``CausalEstimate`` and this refuter attached.
    :raises ValueError: If an IV method is combined with a placebo type
        other than ``"permute"``, or a random placebo is requested for a
        treatment dtype that is not float, bool, int or category.
    """
    is_iv = self._target_estimand.identifier_method.startswith("iv")

    # only permute is supported for iv methods
    if is_iv and self._placebo_type != "permute":
        self.logger.error(
            "Only placebo_type=''permute'' is supported for creating placebo for instrumental variable estimation methods"
        )
        raise ValueError(
            "Only placebo_type=''permute'' is supported for creating placebo for instrumental variable estimation methods."
        )

    # We need to change the identified estimand, so we work on a copy to
    # leave the caller's estimand untouched.
    identified_estimand = copy.deepcopy(self._target_estimand)
    identified_estimand.treatment_variable = ["placebo"]

    # For IV methods, the estimating_instrument_names should also be
    # changed. It is changed inside the estimate and restored at the end.
    override_instrument_name = False
    original_instrument_name = None
    if is_iv:
        identified_estimand.instrumental_variables = [
            "placebo_" + s
            for s in identified_estimand.instrumental_variables
        ]
        method_params = self._estimate.params["method_params"]
        if method_params is not None and "iv_instrument_name" in method_params:
            override_instrument_name = True
            original_instrument_name = method_params["iv_instrument_name"]
            method_params["iv_instrument_name"] = [
                "placebo_" + s for s in parse_state(original_instrument_name)
            ]

    try:
        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of {} treatment".format(
                self._num_simulations, self._placebo_type))

        num_rows = self._data.shape[0]
        # Extract the name of the treatment variable
        treatment_name = self._treatment_name[0]
        type_dict = dict(self._data.dtypes)

        for index in range(self._num_simulations):

            if self._placebo_type == "permute":
                if self._random_state is None:
                    permuted_idx = np.random.choice(
                        self._data.shape[0],
                        size=self._data.shape[0],
                        replace=False)
                else:
                    permuted_idx = self._random_state.choice(
                        self._data.shape[0],
                        size=self._data.shape[0],
                        replace=False)
                new_treatment = self._data[
                    self._treatment_name].iloc[permuted_idx].values

                if is_iv:
                    instrument_names = \
                        self._estimate.estimator.estimating_instrument_names
                    new_instruments_values = self._data[
                        instrument_names].iloc[permuted_idx].values
                    # Reuse the data's index so the later column-wise
                    # concat aligns row by row for any index.
                    new_instruments_df = pd.DataFrame(
                        new_instruments_values,
                        index=self._data.index,
                        columns=[
                            "placebo_" + s
                            for s in self._data[instrument_names].columns
                        ])
            else:
                dtype_name = type_dict[treatment_name].name

                if 'float' in dtype_name:
                    self.logger.info(
                        "Using a Normal Distribution with Mean:{} and Variance:{}"
                        .format(
                            PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL,
                            PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL))
                    new_treatment = np.random.randn(num_rows) * PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL + \
                        PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL

                elif 'bool' in dtype_name:
                    self.logger.info(
                        "Using a Binomial Distribution with {} trials and {} probability of success"
                        .format(
                            PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                            PlaceboTreatmentRefuter.DEFAULT_PROBABILITY_OF_BINOMIAL))
                    new_treatment = np.random.binomial(
                        PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                        PlaceboTreatmentRefuter.DEFAULT_PROBABILITY_OF_BINOMIAL,
                        num_rows).astype(bool)

                elif 'int' in dtype_name:
                    lowest = self._data[treatment_name].min()
                    highest = self._data[treatment_name].max()
                    self.logger.info(
                        "Using a Discrete Uniform Distribution lying between {} and {}"
                        .format(lowest, highest))
                    # ``high`` is exclusive in numpy's randint; add one so
                    # the observed maximum can be drawn and a constant
                    # column (min == max) does not raise.
                    new_treatment = np.random.randint(
                        low=lowest, high=highest + 1, size=num_rows)

                elif 'category' in dtype_name:
                    categories = self._data[treatment_name].unique()
                    self.logger.info(
                        "Using a Discrete Uniform Distribution with the following categories:{}"
                        .format(categories))
                    sample = np.random.choice(categories, size=num_rows)
                    # Share the data's index so ``assign`` aligns rows.
                    new_treatment = pd.Series(
                        sample, index=self._data.index).astype('category')

                else:
                    raise ValueError(
                        "Cannot generate a random placebo for treatment "
                        "dtype '{}'".format(dtype_name))

            # Create a new column in the data by the name of placebo
            new_data = self._data.assign(placebo=new_treatment)
            if is_iv:
                new_data = pd.concat((new_data, new_instruments_df), axis=1)

            # Sanity check the data
            self.logger.debug(new_data[0:10])

            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value
    finally:
        # Restoring the value of iv_instrument_name, also on failure, so
        # the caller's estimate is never left pointing at placebo columns.
        if override_instrument_name:
            self._estimate.params["method_params"]["iv_instrument_name"] = \
                original_instrument_name

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Use a Placebo Treatment")

    # Note: We hardcode the estimate value to ZERO as we want to check if
    # it falls in the distribution of the refuter. Ideally ZERO should
    # fall in the distribution of the effect estimates, as any causal
    # relationship between the treatment and the outcome has been severed.
    dummy_estimator = CausalEstimate(
        estimate=0,
        control_value=self._estimate.control_value,
        treatment_value=self._estimate.treatment_value,
        target_estimand=self._estimate.target_estimand,
        realized_estimand_expr=self._estimate.realized_estimand_expr)

    refute.add_significance_test_results(
        self.test_significance(dummy_estimator, sample_estimates))
    refute.add_refuter(self)
    return refute
def refute_estimate(self):
    """Add a simulated unobserved common cause and re-estimate the effect.

    ``self.kappa_t`` and ``self.kappa_y`` are passed to
    ``self.include_confounders_effect`` together with the data. Each may be
    a single value or a ``numpy.ndarray`` of values:

    * both single values: one re-estimation, no plot;
    * one array: one re-estimation per value and a line plot;
    * two arrays: one re-estimation per (kappa_t, kappa_y) pair and a
      filled contour plot.

    :return: A ``CausalRefutation`` whose ``new_effect`` holds the new
        effect(s): a 0-d array, a 1-d array, or a matrix indexed as
        ``[kappa_t index][kappa_y index]``.
    """
    refutation_type = "Refute: Add an Unobserved Common Cause"

    def simulate(data, kappa_t, kappa_y):
        # Re-estimate with the confounder effect included; return the
        # refutation object together with the new effect value.
        new_data = self.include_confounders_effect(data, kappa_t, kappa_y)
        new_estimator = CausalEstimator.get_estimator_object(
            new_data, self._target_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        refute = CausalRefutation(self._estimate.value, new_effect.value,
                                  refutation_type=refutation_type)
        return refute, new_effect.value

    def open_axes():
        # Imported lazily so single-value runs never need matplotlib.
        import matplotlib.pyplot as plt
        fig = plt.figure(figsize=(6, 5))
        left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
        ax = fig.add_axes([left, bottom, width, height])
        return plt, ax

    t_is_range = isinstance(self.kappa_t, np.ndarray)
    y_is_range = isinstance(self.kappa_y, np.ndarray)

    if not t_is_range and not y_is_range:
        # Deal with single value inputs
        refute, effect_value = simulate(
            copy.deepcopy(self._data), self.kappa_t, self.kappa_y)
        refute.new_effect = np.array(effect_value)
        refute.add_refuter(self)
        return refute

    # Deal with multiple value inputs
    orig_data = copy.deepcopy(self._data)

    if t_is_range and y_is_range:
        # Deal with range inputs: x and y are both of shape
        # (len(kappa_y), len(kappa_t)).
        x, y = np.meshgrid(self.kappa_t, self.kappa_y)
        t_values = x[0]
        y_values = y[:, 0]
        # One row per kappa_t value and one column per kappa_y value, so
        # that indexing by [i][j] is valid for non-square grids as well.
        results_matrix = np.zeros((len(t_values), len(y_values)))
        self.logger.debug(results_matrix.shape)

        for i in range(len(t_values)):
            for j in range(len(y_values)):
                refute, effect_value = simulate(
                    orig_data, t_values[i], y_values[j])
                self.logger.debug(refute)
                # Store the *new* effect, not the original estimate.
                results_matrix[i][j] = effect_value

        plt, ax = open_axes()
        # contourf expects Z with shape (len(y), len(x)), hence transpose.
        cp = plt.contourf(x, y, results_matrix.T)
        plt.colorbar(cp)
        ax.set_title('Effect of Unobserved Common Cause')
        ax.set_xlabel('Value of Linear Constant on Treatment')
        ax.set_ylabel('Value of Linear Constant on Outcome')
        plt.show()

        # Store the values into the refute object
        refute.new_effect = results_matrix
        refute.add_refuter(self)
        return refute

    if t_is_range:
        swept_values = self.kappa_t
        xlabel = 'Value of Linear Constant on Treatment'
    else:
        swept_values = self.kappa_y
        xlabel = 'Value of Linear Constant on Outcome'

    outcomes = np.zeros(len(swept_values))
    for i in range(len(swept_values)):
        if t_is_range:
            refute, effect_value = simulate(
                orig_data, self.kappa_t[i], self.kappa_y)
        else:
            refute, effect_value = simulate(
                orig_data, self.kappa_t, self.kappa_y[i])
        self.logger.debug(refute)
        # Store the *new* effect, not the original estimate.
        outcomes[i] = effect_value

    plt, ax = open_axes()
    plt.plot(swept_values, outcomes)
    ax.set_title('Effect of Unobserved Common Cause')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('New Effect')
    plt.show()

    refute.new_effect = outcomes
    refute.add_refuter(self)
    return refute
def refute_estimate(self):
    """
    Re-estimate the effect after adding a simulated unobserved common cause.

    ``self.kappa_t`` and ``self.kappa_y`` are the strengths passed to
    ``self.include_confounders_effect`` for the treatment and the outcome
    respectively. When either is ``None`` it is first replaced (on ``self``)
    by the value returned from ``self.infer_default_kappa_t()`` /
    ``self.infer_default_kappa_y()``.

    Each of the two values may be a single value or a range (``list`` or
    ``np.ndarray``):

    * both single values: one simulation is run; ``new_effect`` is the new
      estimate and ``new_effect_array`` is that value wrapped by
      ``np.array``.
    * exactly one range: one simulation per value of that range;
      ``new_effect_array`` is the 1-D array of new estimates and
      ``new_effect`` is the tuple ``(min, max)`` of that array.
    * both ranges: one simulation per (kappa_t, kappa_y) pair;
      ``new_effect_array`` has shape ``(len(kappa_t), len(kappa_y))`` and
      ``new_effect`` is the tuple ``(min, max)`` of that matrix.

    For range inputs a matplotlib figure is shown unless ``self.plotmethod``
    is ``None``. In the two-range case ``"contour"`` draws a filled contour
    plot and ``"colormesh"`` draws a colour mesh.

    :return: CausalRefutation: the refutation object built for the last
        simulation that was run, with ``new_effect`` and ``new_effect_array``
        overwritten as described above and ``self`` registered through
        ``add_refuter``.
    :raises ValueError: if a range given for ``kappa_t`` or ``kappa_y`` is
        empty.
    """
    refutation_type = "Refute: Add an Unobserved Common Cause"
    range_types = (list, np.ndarray)

    if self.kappa_t is None:
        self.kappa_t = self.infer_default_kappa_t()
    if self.kappa_y is None:
        self.kappa_y = self.infer_default_kappa_y()

    # Classified only after the defaults are filled in, because an inferred
    # default may itself be a range.
    t_is_range = isinstance(self.kappa_t, range_types)
    y_is_range = isinstance(self.kappa_y, range_types)

    def simulate(data, kappa_t, kappa_y):
        """Add the confounder to `data`, re-estimate, and wrap the result."""
        new_data = self.include_confounders_effect(data, kappa_t, kappa_y)
        new_estimator = CausalEstimator.get_estimator_object(
            new_data, self._target_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        refutation = CausalRefutation(
            self._estimate.value,
            new_effect.value,
            refutation_type=refutation_type)
        return refutation, new_effect.value

    def require_non_empty(values, name):
        """Fail early with a clear message instead of an unbound-name error."""
        # len() rather than truthiness: `not array` is ambiguous for ndarrays.
        if len(values) == 0:
            raise ValueError(
                "{} must contain at least one value when given as a range"
                .format(name))

    def sweep_single_range(vary_treatment):
        """Simulate over the one parameter that is a range; optionally plot."""
        if vary_treatment:
            values, name = self.kappa_t, "kappa_t"
            x_label = 'Value of Linear Constant on Treatment'
        else:
            values, name = self.kappa_y, "kappa_y"
            x_label = 'Value of Linear Constant on Outcome'
        require_non_empty(values, name)

        # NOTE(review): np.random.rand is used purely as an allocator (every
        # cell is overwritten below). It is kept instead of np.empty so the
        # amount of the global NumPy RNG stream consumed before the
        # simulations is unchanged - confirm before replacing.
        outcomes = np.random.rand(len(values))
        orig_data = copy.deepcopy(self._data)
        for i, value in enumerate(values):
            if vary_treatment:
                refute, _ = simulate(orig_data, value, self.kappa_y)
            else:
                refute, _ = simulate(orig_data, self.kappa_t, value)
            self.logger.debug(refute)
            outcomes[i] = refute.new_effect

        # The last refutation object is reused as the carrier of the summary.
        refute.new_effect_array = outcomes
        refute.new_effect = (np.min(outcomes), np.max(outcomes))
        refute.add_refuter(self)
        if self.plotmethod is None:
            return refute

        import matplotlib.pyplot as plt
        fig = plt.figure(figsize=(6, 5))
        left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
        ax = fig.add_axes([left, bottom, width, height])
        plt.plot(values, outcomes)
        # Dashed reference line at the original estimate.
        plt.axhline(self._estimate.value, linestyle='--', color="gray")
        ax.set_title('Effect of Unobserved Common Cause')
        ax.set_xlabel(x_label)
        ax.set_ylabel('Estimated Effect after adding the common cause')
        plt.show()
        return refute

    def sweep_both_ranges():
        """Simulate over every (kappa_t, kappa_y) pair; optionally plot."""
        require_non_empty(self.kappa_t, "kappa_t")
        require_non_empty(self.kappa_y, "kappa_y")

        # Rows follow kappa_t, columns follow kappa_y. See the allocator
        # note in sweep_single_range for why np.random.rand is retained.
        results_matrix = np.random.rand(len(self.kappa_t), len(self.kappa_y))
        orig_data = copy.deepcopy(self._data)
        for i, kappa_t in enumerate(self.kappa_t):
            for j, kappa_y in enumerate(self.kappa_y):
                refute, _ = simulate(orig_data, kappa_t, kappa_y)
                results_matrix[i][j] = refute.new_effect

        refute.new_effect_array = results_matrix
        refute.new_effect = (np.min(results_matrix), np.max(results_matrix))
        refute.add_refuter(self)
        if self.plotmethod is None:
            return refute

        import matplotlib.pyplot as plt
        fig = plt.figure(figsize=(6, 5))
        left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
        ax = fig.add_axes([left, bottom, width, height])

        # Reference levels: fractions of the original estimate, zero, and
        # the extremes of the simulated effects.
        oe = self._estimate.value
        contour_levels = [oe / 4.0, oe / 2.0, (3.0 / 4) * oe, oe]
        contour_levels.extend(
            [0, np.min(results_matrix), np.max(results_matrix)])

        if self.plotmethod == "contour":
            # contourf needs increasing levels, so duplicates (e.g. an
            # original estimate of 0, or min == max) are removed first.
            cp = plt.contourf(self.kappa_y, self.kappa_t, results_matrix,
                              levels=sorted(set(contour_levels)))
            # Label only the level that equals the original estimate.
            fmt = {}
            trueeffect_index = np.where(cp.levels == oe)[0][0]
            fmt[cp.levels[trueeffect_index]] = "Estimated Effect"
            plt.clabel(cp, [cp.levels[trueeffect_index]],
                       inline=True, fmt=fmt)
            plt.colorbar(cp)
        elif self.plotmethod == "colormesh":
            cp = plt.pcolormesh(self.kappa_y, self.kappa_t, results_matrix,
                                shading="nearest")
            plt.colorbar(cp, ticks=contour_levels)
            ax.yaxis.set_ticks(self.kappa_t)
            ax.xaxis.set_ticks(self.kappa_y)
            plt.xticks(rotation=45)

        ax.set_title('Effect of Unobserved Common Cause')
        ax.set_ylabel('Value of Linear Constant on Treatment')
        ax.set_xlabel('Value of Linear Constant on Outcome')
        plt.show()
        return refute

    if not t_is_range and not y_is_range:
        # Single value inputs: one simulation on a private copy of the data.
        refute, new_value = simulate(
            copy.deepcopy(self._data), self.kappa_t, self.kappa_y)
        refute.new_effect_array = np.array(new_value)
        refute.new_effect = new_value
        refute.add_refuter(self)
        return refute

    if t_is_range and y_is_range:
        return sweep_both_ranges()
    return sweep_single_range(vary_treatment=t_is_range)