def test_5(self):
    """Check the ID-algorithm estimand for the graph T->Y, X1->T, X1->Y, X2->T.

    The expected result sums over ``X1`` and is a product of two factors
    whose outcome/condition variable sets are compared with the ground truth.
    """
    treatment = "T"
    outcome = "Y"
    variables = ["X1", "X2"]
    causal_graph = "digraph{T->Y;X1->T;X1->Y;X2->T;}"

    # Use the names as whole items: list("T") splits a string into its
    # characters and therefore only works for one-letter variable names.
    columns = [treatment, outcome, *variables]
    df = pd.DataFrame(columns=columns)

    causal_model = CausalModel(df, treatment, outcome, graph=causal_graph)
    identified_estimand = causal_model.identify_effect(
        method_name="id-algorithm")

    # Compare with ground truth
    summed_term = identified_estimand._product[0]
    first_factor = summed_term._product[0]._product[0]
    second_factor = summed_term._product[1]._product[0]
    set_a = set(first_factor['outcome_vars']._set)
    set_b = set(first_factor['condition_vars']._set)
    set_c = set(second_factor['outcome_vars']._set)
    set_d = set(second_factor['condition_vars']._set)

    assert summed_term._sum == ['X1']
    # The difference-based checks verify that no unexpected variable appears.
    assert len(set_a.difference({'Y'})) == 0
    assert len(set_b.difference({'X1', 'X2', 'T'})) == 0
    assert len(set_c.difference({'X1'})) == 0
    assert len(set_d) == 0
def predict(self, dataset: DatasetInterface):
    """Estimate and refute a causal effect for an always-true treatment column.

    A constant ``treatment`` column is written into the data returned by
    ``dataset.get_data()``, a linear-regression backdoor estimate is computed
    and then refuted with a random common cause.

    :param dataset: object providing ``get_data()``, ``get_outcome()`` and
        ``get_causes()``
    :returns: tuple ``(estimated_effect, new_effect)`` taken from the
        refutation result
    """
    frame = dataset.get_data()

    # Temporarily add treatment.
    # NOTE(review): the column is never removed again, so the object returned
    # by get_data() stays modified - confirm this is intended.
    frame['treatment'] = True

    causal_model = CausalModel(
        frame,
        'treatment',
        dataset.get_outcome(),
        common_causes=dataset.get_causes(),
        proceed_when_unidentifiable=True,
    )

    # Identify the causal effect
    estimand = causal_model.identify_effect()

    # Estimate the causal effect
    effect = causal_model.estimate_effect(
        estimand,
        method_name="backdoor.linear_regression",
        test_significance=True,
    )

    # Refute the obtained estimate
    refutation = causal_model.refute_estimate(
        estimand,
        effect,
        method_name="random_common_cause",
    )
    return refutation.estimated_effect, refutation.new_effect
def test_external_estimator(self, beta, num_samples, num_treatments):
    """Estimate with an estimator named by dotted path and check its params.

    The test passes a ``LogisticRegression(max_iter=1000)`` as the propensity
    score model and asserts that the resulting estimator exposes it.
    """
    dataset_options = {
        "beta": beta,
        "num_common_causes": 5,
        "num_samples": num_samples,
        "num_treatments": num_treatments,
        "treatment_is_binary": True,
    }
    data = dowhy.datasets.linear_dataset(**dataset_options)

    model = CausalModel(
        data=data["df"],
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )
    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)

    propensity_model = linear_model.LogisticRegression(max_iter=1000)
    estimate = model.estimate_effect(
        identified_estimand,
        method_name="backdoor.tests.causal_estimators.mock_external_estimator.PropensityScoreWeightingEstimator",
        control_value=0,
        treatment_value=1,
        target_units="ate",  # condition used for CATE
        confidence_intervals=True,
        method_params={"propensity_score_model": propensity_model},
    )

    assert estimate.estimator.propensity_score_model.max_iter == 1000
def test_causalml_XGBTRegressor(self, init_data):
    """Smoke-test the CausalML ``XGBTRegressor`` estimator on ``init_data``.

    The estimate is only printed; the test passes if no exception is raised.
    """
    # The fixture supplies a dataset with the dataframe, names and graph.
    dataset = init_data

    # Build the causal model from the dataset's own description.
    causal_model = CausalModel(
        data=dataset['df'],
        treatment=dataset['treatment_name'],
        outcome=dataset['outcome_name'],
        effect_modifiers=dataset['effect_modifier_names'],
        graph=dataset['gml_graph'],
    )

    # Identify the effects within the model
    estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)

    estimator_path = "backdoor.causalml.inference.meta.XGBTRegressor"
    xgbt_estimate = causal_model.estimate_effect(
        estimand,
        method_name=estimator_path,
        method_params={"init_params": {}},
    )

    print("The XGBT estimate obtained:")
    print(xgbt_estimate)
def dowhy_quick_backdoor_estimator(dataframe, outcome, treatment, cofounders_list, method_name, populaton_of_interest='ate', view_model=False):
    """
    Identify and estimate a causal effect with DoWhy in a single call

    :param dataframe: data passed to ``CausalModel``
    :param outcome: outcome passed to ``CausalModel``
    :param treatment: treatment passed to ``CausalModel``
    :param cofounders_list: passed to ``CausalModel`` as ``common_causes``
    :param method_name: estimation method forwarded to ``estimate_effect``
    :param populaton_of_interest: forwarded to ``estimate_effect`` as ``target_units``
    :param view_model: when truthy, render the model with the "dot" layout first

    :returns: the ``value`` attribute of the resulting estimate
    """
    model = CausalModel(
        data=dataframe,
        treatment=treatment,
        outcome=outcome,
        common_causes=cofounders_list,
    )

    if view_model:
        model.view_model(layout="dot")

    estimand = model.identify_effect(proceed_when_unidentifiable=True)

    # confidence_intervals=True is not passed: not in this release
    estimate = model.estimate_effect(
        estimand,
        method_name=method_name,
        target_units=populaton_of_interest,
    )
    return estimate.value
def test_causalml_RLearner(self, init_data):
    """Smoke-test the CausalML ``BaseRRegressor`` estimator on ``init_data``.

    A ``ValueError`` from estimation is reported and tolerated; in that case
    ``None`` is printed as the estimate.
    """
    # The fixture supplies a dataset with the dataframe, names and graph.
    dataset = init_data

    # Build the causal model from the dataset's own description.
    causal_model = CausalModel(
        data=dataset['df'],
        treatment=dataset['treatment_name'],
        outcome=dataset['outcome_name'],
        effect_modifiers=dataset['effect_modifier_names'],
        graph=dataset['gml_graph'],
    )

    # Identify the effects within the model
    estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)

    rl_estimate = None
    learner_params = {"init_params": {'learner': XGBRegressor()}}
    try:
        rl_estimate = causal_model.estimate_effect(
            estimand,
            method_name="backdoor.causalml.inference.meta.BaseRRegressor",
            method_params=learner_params,
        )
    except ValueError:
        print("Error with respect to the number of samples")

    print("The R Learner estimate obtained:")
    print(rl_estimate)
def average_treatment_effect_test_continuous(self, dataset="linear", beta=1, num_common_causes=3, num_instruments=2, num_samples=100000, treatment_is_binary=False):
    """Assert that the estimated ATE on a linear dataset is within tolerance.

    The estimator class is ``self._Estimator`` and the relative tolerance is
    ``self._error_tolerance``. The ``dataset`` argument is accepted but not
    read by this method.
    """
    generated = dowhy.datasets.linear_dataset(
        beta=beta,
        num_common_causes=num_common_causes,
        num_instruments=num_instruments,
        num_samples=num_samples,
        treatment_is_binary=treatment_is_binary,
    )
    frame = generated['df']

    causal_model = CausalModel(
        data=frame,
        treatment=generated["treatment_name"],
        outcome=generated["outcome_name"],
        graph=generated["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )
    estimand = causal_model.identify_effect()

    estimator = self._Estimator(
        frame,
        identified_estimand=estimand,
        treatment=generated["treatment_name"],
        outcome=generated["outcome_name"],
        test_significance=None,
    )

    expected = generated["ate"]
    estimate = estimator.estimate_effect()
    error = abs(estimate.value - expected)

    message = "Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}"
    print(message.format(error, self._error_tolerance * 100, estimate.value, expected))

    assert error < abs(expected) * self._error_tolerance
def test_causalml_MLPTRegressor(self, init_data):
    """Smoke-test the CausalML ``MLPTRegressor`` estimator on ``init_data``.

    The estimate is only printed; the test passes if no exception is raised.
    """
    # The fixture supplies a dataset with the dataframe, names and graph.
    dataset = init_data

    # Build the causal model from the dataset's own description.
    causal_model = CausalModel(
        data=dataset['df'],
        treatment=dataset['treatment_name'],
        outcome=dataset['outcome_name'],
        effect_modifiers=dataset['effect_modifier_names'],
        graph=dataset['gml_graph'],
    )

    # Identify the effects within the model
    estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)

    mlp_init_params = {
        'hidden_layer_sizes': (10, 10),
        'learning_rate_init': 0.1,
        'early_stopping': True,
    }
    mlpt_estimate = causal_model.estimate_effect(
        estimand,
        method_name="backdoor.causalml.inference.meta.MLPTRegressor",
        method_params={"init_params": mlp_init_params},
    )

    print("The MLPT estimate obtained:")
    print(mlpt_estimate)
def custom_data_average_treatment_effect_test(self, data):
    """Assert that the ATE estimated on a user-supplied dataset is within tolerance.

    :param data: mapping with the keys ``df``, ``treatment_name``,
        ``outcome_name``, ``gml_graph`` and ``ate`` (the true effect)
    :raises AssertionError: if the absolute error is not smaller than
        ``abs(true_ate) * self._error_tolerance``
    """
    model = CausalModel(
        data=data['df'],
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None
    )
    target_estimand = model.identify_effect()
    estimator_ate = self._Estimator(
        data['df'],
        identified_estimand=target_estimand,
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        test_significance=None
    )
    true_ate = data["ate"]
    ate_estimate = estimator_ate.estimate_effect()

    # Compare magnitudes: a signed error would let any under-estimate pass,
    # and a signed tolerance would reject everything when the true ATE is
    # negative.
    error = abs(ate_estimate.value - true_ate)
    print("Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}".format(
        error, self._error_tolerance * 100, ate_estimate.value, true_ate)
    )
    assert error < abs(true_ate) * self._error_tolerance
def simulate_dag_violations(
    methods,
    beta,
    num_w_affected,
    effect_on_w,
    num_z_affected,
    effect_on_z,
    num_t_affected,
    effect_on_t,
    effect_on_y,
    times,
):
    """Repeatedly simulate a dataset with a variable U and estimate the effect.

    :param methods: estimators to use; each item is indexed as
        ``item[0]`` (method name) and ``item[1]`` (method params)
    :param beta: true treatment effect
    :param num_w_affected: number of common causes affected
    :param effect_on_w: effect of U on common causes
    :param num_z_affected: number of instruments affected
    :param effect_on_z: effect of U on instruments
    :param num_t_affected: number of treatments affected
    :param effect_on_t: effect of U on treatment
    :param effect_on_y: effect of U on outcomes
    :param times: number of simulations
    :returns: list of ``(estimate value, method name)`` tuples, one per
        method and simulation
    """
    results = []
    method_names = [entry[0] for entry in methods]

    for _ in range(times):
        # beta, num_common_causes, num_instruments, num_samples, etc. are as in the tutorial
        data = modified_linear_dataset(
            beta=beta,
            # u -> common causes
            num_w_affected=num_w_affected,
            effect_on_w=effect_on_w,
            # u -> instruments
            num_z_affected=num_z_affected,
            effect_on_z=effect_on_z,
            # u -> treatment
            num_t_affected=num_t_affected,
            effect_on_t=effect_on_t,
            # u -> outcome
            effect_on_y=effect_on_y,
            num_common_causes=5,
            num_instruments=2,
            num_samples=10000,
            treatment_is_binary=True,
        )

        model = CausalModel(
            data=data["df"],
            treatment=data["treatment_name"],
            outcome=data["outcome_name"],
            graph=data["gml_graph"],
            instruments=data["instrument_names"],
            proceed_when_unidentifiable=True,
        )
        estimand = model.identify_effect()

        values = []
        for entry in methods:
            estimate = model.estimate_effect(
                estimand, method_name=entry[0], method_params=entry[1]
            )
            values.append(estimate.value)

        results.extend(zip(values, method_names))

    return results
def null_refutation_test(self, data=None, dataset="linear", beta=10, num_common_causes=1, num_instruments=1, num_samples=100000, treatment_is_binary=True):
    """Check that a refuter with zero effect strength leaves the estimate unchanged.

    The refuter is first run with the configured effect strengths (only to
    see whether it raises), then with both strengths set to 0; the second
    result must stay within ``self._error_tolerance`` of the original
    estimate. The ``dataset`` argument is accepted but not read here.
    """
    # Supports user-provided dataset object
    if data is None:
        data = dowhy.datasets.linear_dataset(
            beta=beta,
            num_common_causes=num_common_causes,
            num_instruments=num_instruments,
            num_samples=num_samples,
            treatment_is_binary=treatment_is_binary,
        )

    model = CausalModel(
        data=data['df'],
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )
    target_estimand = model.identify_effect()
    ate_estimate = model.estimate_effect(
        identified_estimand=target_estimand,
        method_name=self.estimator_method,
        test_significance=None,
    )
    true_ate = data["ate"]
    self.logger.debug(true_ate)

    def run_refuter(strength_on_treatment, strength_on_outcome):
        # Both runs differ only in the two effect-strength arguments.
        return model.refute_estimate(
            target_estimand,
            ate_estimate,
            method_name=self.refuter_method,
            confounders_effect_on_treatment=self.confounders_effect_on_t,
            confounders_effect_on_outcome=self.confounders_effect_on_y,
            effect_strength_on_treatment=strength_on_treatment,
            effect_strength_on_outcome=strength_on_outcome,
        )

    # To test if there are any exceptions
    ref = run_refuter(self.effect_strength_on_t, self.effect_strength_on_y)
    self.logger.debug(ref.new_effect)

    # To test if the estimate is identical if refutation parameters are zero
    refute = run_refuter(0, 0)
    error = abs(refute.new_effect - ate_estimate.value)

    message = "Error in refuted estimate = {0} with tolerance {1}%. Estimated={2},After Refutation={3}"
    print(message.format(error, self._error_tolerance * 100, ate_estimate.value, refute.new_effect))

    assert error < abs(ate_estimate.value) * self._error_tolerance
def test_1(self):
    """Check the ID-algorithm estimand for the single-edge graph T->Y."""
    treatment = "T"
    outcome = "Y"
    causal_graph = "digraph{T->Y;}"

    # Use the names as whole items: list("T") splits a string into its
    # characters and therefore only works for one-letter variable names.
    columns = [treatment, outcome]
    df = pd.DataFrame(columns=columns)

    causal_model = CausalModel(df, treatment, outcome, graph=causal_graph)
    identified_estimand = causal_model.identify_effect(
        method_name="id-algorithm")

    # Only P(Y|T) should be present for test to succeed.
    identified_str = str(identified_estimand)
    gt_str = "Predictor: P(Y|T)"
    assert identified_str == gt_str
def predict_tutorial(self, data: pd.DataFrame):
    """Run the IHDP tutorial: identify, estimate and refute a causal effect.

    The ``data`` argument is replaced by the IHDP CSV downloaded below, so
    the value passed by the caller is not used. Results are printed; nothing
    is returned.

    :param data: ignored (kept for interface compatibility)
    """
    # https://towardsdatascience.com/implementing-causal-inference-a-key-step-towards-agi-de2cde8ea599
    data = pd.read_csv(
        'https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv',
        header=None)

    # Covariate names x1..x25, used both as column names and common causes.
    covariates = ['x' + str(i) for i in range(1, 26)]
    data.columns = [
        'treatment',
        'y_factual',
        'y_cfactual',
        'mu0',
        'mu1',
    ] + covariates
    data = data.astype({'treatment': 'bool'}, copy=False)

    # Create a causal model from the data and given common causes.
    # The list is passed directly: building "x1+x2+...+x25+" and splitting on
    # '+' leaves a trailing empty string that becomes a bogus common cause.
    model = CausalModel(data=data,
                        treatment='treatment',
                        outcome='y_factual',
                        common_causes=covariates)

    # Identify the causal effect
    identified_estimand = model.identify_effect()
    print(identified_estimand)

    # Estimate the causal effect and compare it with Average Treatment Effect
    estimate = model.estimate_effect(
        identified_estimand,
        method_name="backdoor.linear_regression",
        test_significance=True)
    print(estimate)
    print("Causal Estimate is " + str(estimate.value))

    refute_results = model.refute_estimate(
        identified_estimand, estimate, method_name="random_common_cause")
    print(refute_results)
def test_4(self):
    """Check the ID-algorithm estimand for the graph T->Y, T->X1, X1->Y."""
    treatment = "T"
    outcome = "Y"
    variables = ["X1"]
    causal_graph = "digraph{T->Y;T->X1;X1->Y;}"

    # Use the names as whole items: list("T") splits a string into its
    # characters and therefore only works for one-letter variable names.
    columns = [treatment, outcome, *variables]
    df = pd.DataFrame(columns=columns)

    causal_model = CausalModel(df, treatment, outcome, graph=causal_graph)
    identified_estimand = causal_model.identify_effect(
        method_name="id-algorithm")

    # Compare with ground truth
    identified_str = str(identified_estimand)
    gt_str = "Sum over {X1}:\n\tPredictor: P(Y|T,X1)\n\tPredictor: P(X1|T)"
    assert identified_str == gt_str
def test_2(self):
    '''
    Test undirected edge between treatment and outcome.
    '''
    treatment = "T"
    outcome = "Y"
    causal_graph = "digraph{T->Y; Y->T;}"

    # Use the names as whole items: list("T") splits a string into its
    # characters and therefore only works for one-letter variable names.
    columns = [treatment, outcome]
    df = pd.DataFrame(columns=columns)

    causal_model = CausalModel(df, treatment, outcome, graph=causal_graph)

    # Since undirected graph, identify effect must throw an error.
    # The return value is not bound: the call is expected to raise.
    with pytest.raises(Exception):
        causal_model.identify_effect(method_name="id-algorithm")
def att_causal_estimator(df, outcome, treatment, cofounders_list, method_name, view_model=False):
    """
    Identify and estimate a causal effect with ``target_units='att'``

    :param df: data passed to ``CausalModel``
    :param outcome: outcome passed to ``CausalModel``
    :param treatment: treatment passed to ``CausalModel``
    :param cofounders_list: passed to ``CausalModel`` as ``common_causes``
    :param method_name: estimation method forwarded to ``estimate_effect``
    :param view_model: when truthy, render the model with the "dot" layout first

    :returns: the ``value`` attribute of the resulting estimate
    """
    model = CausalModel(
        data=df,
        treatment=treatment,
        outcome=outcome,
        common_causes=cofounders_list,
    )

    if view_model:
        model.view_model(layout="dot")

    estimand = model.identify_effect(proceed_when_unidentifiable=True)

    # confidence_intervals=True is intentionally not passed here.
    estimate = model.estimate_effect(
        estimand,
        method_name=method_name,
        target_units='att',
    )
    return estimate.value
def predict_example(self, data: pd.DataFrame):
    """Identify, estimate and refute an effect of E1 and E2 on E3.

    NOTE(review): the results are computed but neither returned nor printed -
    confirm whether a return value was intended.

    :param data: dataframe passed to ``CausalModel``
    """
    # https://github.com/Microsoft/dowhy
    # https://ntanmayee.github.io/articles/2018/11/16/tools-for-causality.html
    treatments = ['E1', 'E2']
    outcome_name = 'E3'

    causal_model = CausalModel(
        data=data,
        treatment=treatments,
        outcome=outcome_name,
        proceed_when_unidentifiable=True,
    )

    # Identify causal effect and return target estimands
    estimand = causal_model.identify_effect()

    # Estimate the target estimand using a statistical method.
    effect = causal_model.estimate_effect(
        estimand,
        method_name="backdoor.propensity_score_matching",
    )

    # Refute the obtained estimate using multiple robustness checks.
    causal_model.refute_estimate(
        estimand,
        effect,
        method_name="random_common_cause",
    )
def average_treatment_effect_test(self, dataset="linear", beta=10, num_common_causes=1, num_instruments=1, num_effect_modifiers=0, num_treatments=1, num_frontdoor_variables = 0, num_samples=100000, treatment_is_binary=True, outcome_is_binary=False, confidence_intervals=False, test_significance=False, method_params=None):
    """Assert that ``self._Estimator`` recovers the true ATE within tolerance.

    A "linear" or "simple-iv" dataset is generated, the effect is identified
    with ``self._identifier_method`` and estimated. Optionally the confidence
    interval, standard error and significance routines are exercised too.

    :raises ValueError: if ``dataset`` is neither "linear" nor "simple-iv"
    :raises AssertionError: if the absolute error is not smaller than
        ``abs(true_ate) * self._error_tolerance``
    """
    if dataset not in ("linear", "simple-iv"):
        raise ValueError("Dataset type not supported.")

    # Options understood by both dataset generators.
    shared_options = {
        "beta": beta,
        "num_treatments": num_treatments,
        "num_samples": num_samples,
        "treatment_is_binary": treatment_is_binary,
        "outcome_is_binary": outcome_is_binary,
    }
    if dataset == "linear":
        data = dowhy.datasets.linear_dataset(
            num_common_causes=num_common_causes,
            num_instruments=num_instruments,
            num_effect_modifiers=num_effect_modifiers,
            num_frontdoor_variables=num_frontdoor_variables,
            **shared_options
        )
    else:
        data = dowhy.datasets.simple_iv_dataset(**shared_options)

    frame = data['df']
    model = CausalModel(
        data=frame,
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=test_significance,
    )
    target_estimand = model.identify_effect()
    target_estimand.set_identifier_method(self._identifier_method)

    estimator_ate = self._Estimator(
        frame,
        identified_estimand=target_estimand,
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        control_value=0,
        treatment_value=1,
        test_significance=test_significance,
        evaluate_effect_strength=False,
        confidence_intervals=confidence_intervals,
        target_units="ate",
        effect_modifiers=data["effect_modifier_names"],
        params=method_params,
    )

    true_ate = data["ate"]
    ate_estimate = estimator_ate.estimate_effect()
    str(ate_estimate)  # checking if str output is correctly created

    error = abs(ate_estimate.value - true_ate)
    message = "Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}"
    print(message.format(error, self._error_tolerance * 100, ate_estimate.value, true_ate))
    assert error < abs(true_ate) * self._error_tolerance

    # Compute confidence intervals, standard error and significance tests
    if confidence_intervals:
        ate_estimate.get_confidence_intervals()
        ate_estimate.get_confidence_intervals(confidence_level=0.99)
        ate_estimate.get_confidence_intervals(method="bootstrap")
        ate_estimate.get_standard_error()
        ate_estimate.get_standard_error(method="bootstrap")
    if test_significance:
        ate_estimate.test_stat_significance()
        ate_estimate.test_stat_significance(method="bootstrap")
def null_refutation_test(self, data=None, dataset="linear", beta=10, num_common_causes=1, num_instruments=1, num_samples=100000, treatment_is_binary=True):
    """Run the configured refuter and check its result against the expectation.

    The check depends on ``self.refuter_method``:

    - "add_unobserved_common_cause": with zero effect strength the refuted
      effect must stay within tolerance of the original estimate.
    - "placebo_treatment_refuter": the refuted effect must be within
      ``self._error_tolerance`` of 0.
    - "data_subset_refuter" / "bootstrap_refuter": the refuted effect must
      stay within tolerance of the original estimate.
    - "dummy_outcome_refuter": the error is printed and the refutation
      result itself is asserted.

    Any other refuter name performs no check. The ``dataset`` argument is
    accepted but not read here.
    """
    # Supports user-provided dataset object
    if data is None:
        data = dowhy.datasets.linear_dataset(
            beta=beta,
            num_common_causes=num_common_causes,
            num_instruments=num_instruments,
            num_samples=num_samples,
            treatment_is_binary=treatment_is_binary,
        )

    print(data['df'])
    print("")

    model = CausalModel(
        data=data['df'],
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )
    target_estimand = model.identify_effect()
    ate_estimate = model.estimate_effect(
        identified_estimand=target_estimand,
        method_name=self.estimator_method,
        test_significance=None,
    )
    true_ate = data["ate"]
    self.logger.debug(true_ate)

    refuter = self.refuter_method
    tolerance_pct = self._error_tolerance * 100

    if refuter == "add_unobserved_common_cause":
        # To test if there are any exceptions
        ref = model.refute_estimate(
            target_estimand,
            ate_estimate,
            method_name=refuter,
            confounders_effect_on_treatment=self.confounders_effect_on_t,
            confounders_effect_on_outcome=self.confounders_effect_on_y,
            effect_strength_on_treatment=self.effect_strength_on_t,
            effect_strength_on_outcome=self.effect_strength_on_y,
        )
        self.logger.debug(ref.new_effect)

        # To test if the estimate is identical if refutation parameters are zero
        refute = model.refute_estimate(
            target_estimand,
            ate_estimate,
            method_name=refuter,
            confounders_effect_on_treatment=self.confounders_effect_on_t,
            confounders_effect_on_outcome=self.confounders_effect_on_y,
            effect_strength_on_treatment=0,
            effect_strength_on_outcome=0,
        )
        error = abs(refute.new_effect - ate_estimate.value)
        print(
            "Error in refuted estimate = {0} with tolerance {1}%. Estimated={2},After Refutation={3}"
            .format(error, tolerance_pct, ate_estimate.value, refute.new_effect))
        assert error < abs(ate_estimate.value) * self._error_tolerance

    elif refuter == "placebo_treatment_refuter":
        # Binary treatments use a reduced number of simulations.
        extra = {"num_simulations": 10} if treatment_is_binary is True else {}
        ref = model.refute_estimate(
            target_estimand, ate_estimate, method_name=refuter, **extra)

        # This value is hardcoded to be zero as we are running this on a linear dataset.
        # Ordinarily, we should expect this value to be zero.
        EXPECTED_PLACEBO_VALUE = 0
        error = abs(ref.new_effect - EXPECTED_PLACEBO_VALUE)
        print(
            "Error in the refuted estimate = {0} with tolerence {1}%. Expected Value={2}, After Refutation={3}"
            .format(error, tolerance_pct, EXPECTED_PLACEBO_VALUE, ref.new_effect))
        print(ref)
        assert error < self._error_tolerance

    elif refuter in ("data_subset_refuter", "bootstrap_refuter"):
        # Both refuters are checked the same way: the refuted effect must
        # stay close to the original estimate.
        extra = {"num_simulations": 5} if treatment_is_binary is True else {}
        ref = model.refute_estimate(
            target_estimand, ate_estimate, method_name=refuter, **extra)

        error = abs(ref.new_effect - ate_estimate.value)
        print(
            "Error in the refuted estimate = {0} with tolerence {1}%. Estimated={2}, After Refutation={3}"
            .format(error, tolerance_pct, ate_estimate.value, ref.new_effect))
        print(ref)
        assert error < abs(ate_estimate.value) * self._error_tolerance

    elif refuter == "dummy_outcome_refuter":
        extra = {"num_simulations": 2}
        if self.transformations is not None:
            extra = {
                "transformations": self.transformations,
                "params": self.params,
                "num_simulations": 2,
            }
        ref = model.refute_estimate(
            target_estimand, ate_estimate, method_name=refuter, **extra)

        # This value is hardcoded to be zero as we are running this on a linear dataset.
        # Ordinarily, we should expect this value to be zero.
        EXPECTED_DUMMY_OUTCOME_VALUE = 0
        error = abs(ref.new_effect - EXPECTED_DUMMY_OUTCOME_VALUE)
        print(
            "Error in the refuted estimate = {0} with tolerence {1}%. Expected Value={2}, After Refutation={3}"
            .format(error, tolerance_pct, EXPECTED_DUMMY_OUTCOME_VALUE, ref.new_effect))
        print(ref)
        # NOTE(review): this asserts the refutation result, not the error
        # tolerance as the other branches do - confirm this is intended.
        assert ref
# Covariate names x1..x25 used as common causes. The list is built directly:
# joining the names with a trailing '+' and splitting on '+' leaves an empty
# string at the end, which would be passed on as a bogus common cause.
covariate_names = ["x" + str(i) for i in range(1, 26)]

model = CausalModel(
    data=data,
    treatment='treatment',
    outcome='y_factual',
    common_causes=covariate_names,
)

# save the model as a png
model.view_model()
display(Image(filename="causal_model.png"))

# Identify the causal effect
identified_estimand = model.identify_effect()
print(identified_estimand)

# Estimate the causal effect and compare it with Average Treatment Effect
estimate = model.estimate_effect(identified_estimand,
                                 method_name="backdoor.linear_regression",
                                 test_significance=True)
print(estimate)
print("Causal Estimate is " + str(estimate.value))

# Difference of the mean factual outcome between treated and untreated rows.
data_1 = data[data["treatment"] == 1]
data_0 = data[data["treatment"] == 0]
print("ATE", np.mean(data_1["y_factual"]) - np.mean(data_0["y_factual"]))
def average_treatment_effect_test(self, dataset="linear", beta=10, num_common_causes=1, num_instruments=1, num_effect_modifiers=0, num_treatments=1, num_samples=100000, treatment_is_binary=True, outcome_is_binary=False, method_params=None):
    """Assert that ``self._Estimator`` recovers the true ATE within tolerance.

    :raises ValueError: if ``dataset`` is neither "linear" nor "simple-iv"
    :raises AssertionError: if the absolute error is not smaller than
        ``abs(true_ate) * self._error_tolerance``
    """
    if dataset not in ("linear", "simple-iv"):
        raise ValueError("Dataset type not supported.")

    # Options understood by both dataset generators.
    shared_options = {
        "beta": beta,
        "num_treatments": num_treatments,
        "num_samples": num_samples,
        "treatment_is_binary": treatment_is_binary,
        "outcome_is_binary": outcome_is_binary,
    }
    if dataset == "linear":
        data = dowhy.datasets.linear_dataset(
            num_common_causes=num_common_causes,
            num_instruments=num_instruments,
            num_effect_modifiers=num_effect_modifiers,
            **shared_options
        )
    else:
        data = dowhy.datasets.simple_iv_dataset(**shared_options)

    frame = data['df']
    model = CausalModel(
        data=frame,
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None,
    )
    target_estimand = model.identify_effect()

    estimator_ate = self._Estimator(
        frame,
        identified_estimand=target_estimand,
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        control_value=0,
        treatment_value=1,
        test_significance=None,
        evaluate_effect_strength=False,
        confidence_intervals=False,
        target_units="ate",
        effect_modifiers=data["effect_modifier_names"],
        params=method_params,
    )

    true_ate = data["ate"]
    ate_estimate = estimator_ate.estimate_effect()
    error = abs(ate_estimate.value - true_ate)

    message = "Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}"
    print(message.format(error, self._error_tolerance * 100, ate_estimate.value, true_ate))

    assert error < abs(true_ate) * self._error_tolerance
class DoWhyWrapper: """ A wrapper class to allow user call other methods from dowhy package through EconML. (e.g. causal graph, refutation test, etc.) Parameters ---------- cate_estimator: instance An instance of any CATE estimator we currently support """ def __init__(self, cate_estimator): self._cate_estimator = cate_estimator def _get_params(self): init = self._cate_estimator.__init__ # introspect the constructor arguments to find the model parameters # to represent init_signature = inspect.signature(init) parameters = init_signature.parameters.values() for p in parameters: if p.kind == p.VAR_POSITIONAL or p.kind == p.VAR_KEYWORD: raise RuntimeError("cate estimators should always specify their parameters in the signature " "of their __init__ (no varargs, no varkwargs). " f"{self._cate_estimator} with constructor {init_signature} doesn't " "follow this convention.") # Extract and sort argument names excluding 'self' return sorted([p.name for p in parameters]) def fit(self, Y, T, X=None, W=None, Z=None, *, outcome_names=None, treatment_names=None, feature_names=None, confounder_names=None, instrument_names=None, graph=None, estimand_type="nonparametric-ate", proceed_when_unidentifiable=True, missing_nodes_as_confounders=False, control_value=0, treatment_value=1, target_units="ate", **kwargs): """ Estimate the counterfactual model from data through dowhy package. 
Parameters ---------- Y: vector of length n Outcomes for each sample T: vector of length n Treatments for each sample X: optional (n, d_x) matrix (Default=None) Features for each sample W: optional (n, d_w) matrix (Default=None) Controls for each sample Z: optional (n, d_z) matrix (Default=None) Instruments for each sample outcome_names: optional list (Default=None) Name of the outcome treatment_names: optional list (Default=None) Name of the treatment feature_names: optional list (Default=None) Name of the features confounder_names: optional list (Default=None) Name of the confounders instrument_names: optional list (Default=None) Name of the instruments graph: optional Path to DOT file containing a DAG or a string containing a DAG specification in DOT format estimand_type: optional string Type of estimand requested (currently only "nonparametric-ate" is supported). In the future, may support other specific parametric forms of identification proceed_when_unidentifiable: optional bool (Default=True) Whether the identification should proceed by ignoring potential unobserved confounders missing_nodes_as_confounders: optional bool (Default=False) Whether variables in the dataframe that are not included in the causal graph should be automatically included as confounder nodes control_value: optional scalar (Default=0) Value of the treatment in the control group, for effect estimation treatment_value: optional scalar (Default=1) Value of the treatment in the treated group, for effect estimation target_units: optional (Default="ate") The units for which the treatment effect should be estimated. This can be of three types: 1. A string for common specifications of target units (namely, "ate", "att" and "atc"), 2. A lambda function that can be used as an index for the data (pandas DataFrame), 3. 
A new DataFrame that contains values of the effect_modifiers and effect will be estimated only for this new data kwargs: optional Other keyword arguments from fit method for CATE estimator Returns ------- self """ Y, T, X, W, Z = check_input_arrays(Y, T, X, W, Z) # create dataframe n_obs = Y.shape[0] Y, T, X, W, Z = reshape_arrays_2dim(n_obs, Y, T, X, W, Z) # currently dowhy only support single outcome and single treatment assert Y.shape[1] == 1, "Can only accept single dimensional outcome." assert T.shape[1] == 1, "Can only accept single dimensional treatment." # column names if outcome_names is None: outcome_names = [f"Y{i}" for i in range(Y.shape[1])] if treatment_names is None: treatment_names = [f"T{i}" for i in range(T.shape[1])] if feature_names is None: feature_names = [f"X{i}" for i in range(X.shape[1])] if confounder_names is None: confounder_names = [f"W{i}" for i in range(W.shape[1])] if instrument_names is None: instrument_names = [f"Z{i}" for i in range(Z.shape[1])] column_names = outcome_names + treatment_names + feature_names + confounder_names + instrument_names df = pd.DataFrame(np.hstack((Y, T, X, W, Z)), columns=column_names) self.dowhy_ = CausalModel( data=df, treatment=treatment_names, outcome=outcome_names, graph=graph, common_causes=feature_names + confounder_names if X.shape[1] > 0 or W.shape[1] > 0 else None, instruments=instrument_names if Z.shape[1] > 0 else None, effect_modifiers=feature_names if X.shape[1] > 0 else None, estimand_type=estimand_type, proceed_when_unidetifiable=proceed_when_unidentifiable, missing_nodes_as_confounders=missing_nodes_as_confounders ) self.identified_estimand_ = self.dowhy_.identify_effect(proceed_when_unidentifiable=True) method_name = "backdoor." + self._cate_estimator.__module__ + "." 
+ self._cate_estimator.__class__.__name__ init_params = {} for p in self._get_params(): init_params[p] = getattr(self._cate_estimator, p) self.estimate_ = self.dowhy_.estimate_effect(self.identified_estimand_, method_name=method_name, control_value=control_value, treatment_value=treatment_value, target_units=target_units, method_params={ "init_params": init_params, "fit_params": kwargs, }, ) return self def refute_estimate(self, *, method_name, **kwargs): """ Refute an estimated causal effect. If method_name is provided, uses the provided method. In the future, we may support automatic selection of suitable refutation tests. Following refutation methods are supported: - Adding a randomly-generated confounder: "random_common_cause" - Adding a confounder that is associated with both treatment and outcome: "add_unobserved_common_cause" - Replacing the treatment with a placebo (random) variable): "placebo_treatment_refuter" - Removing a random subset of the data: "data_subset_refuter" For more details, see docs :mod:`dowhy.causal_refuters` Parameters ---------- method_name: string Name of the refutation method kwargs: optional Additional arguments that are passed directly to the refutation method. Can specify a random seed here to ensure reproducible results ('random_seed' parameter). For method-specific parameters, consult the documentation for the specific method. All refutation methods are in the causal_refuters subpackage. Returns ------- RefuteResult: an instance of the RefuteResult class """ return self.dowhy_.refute_estimate( self.identified_estimand_, self.estimate_, method_name=method_name, **kwargs ) # We don't allow user to call refit_final from this class, since internally dowhy effect estimate will only update # cate estimator but not the effect. 
def refit_final(self, inference=None):
    """
    Reject refitting of the final stage through this wrapper.

    Refitting here would only update the wrapped CATE estimator and not the dowhy effect
    estimate, so the call is refused outright.

    Parameters
    ----------
    inference: optional
        Ignored; the method raises unconditionally.

    Raises
    ------
    AttributeError
        Always.
    """
    raise AttributeError(
        "Method refit_final is not allowed through a dowhy object; please perform a full fit instead.")

def __getattr__(self, attr):
    """
    Resolve an attribute that normal lookup did not find.

    Resolution order:

    1. names starting with ``__`` are never proxied;
    2. the wrapper's own attributes (``_cate_estimator``, ``dowhy_``, ``identified_estimand_``,
       ``estimate_``) are reported as missing, unless a parent class provides ``__getattr__``;
    3. ``dowhy__<name>`` is read as ``<name>`` from ``self.dowhy_``;
    4. a name found on ``self.estimate_._estimator_object`` is read from it, with a
       ``UserWarning`` when ``self.dowhy_`` has the same name;
    5. anything else is read from ``self.dowhy_``.

    Parameters
    ----------
    attr: string
        Name of the attribute being looked up

    Returns
    -------
    The attribute value taken from whichever object resolved the name

    Raises
    ------
    AttributeError
        If the name is special, is an unset wrapper attribute, or is not found on the
        object it is delegated to.
    """
    # don't proxy special methods
    if attr.startswith('__'):
        raise AttributeError(attr)

    if attr in ('_cate_estimator', 'dowhy_', 'identified_estimand_', 'estimate_'):
        # __getattr__ only runs after normal lookup failed, so the wrapper attribute is unset.
        # `object` defines no __getattr__; blindly calling super().__getattr__ produced the
        # misleading "'super' object has no attribute '__getattr__'". Defer to a parent hook
        # only when one really exists, otherwise name the attribute that is missing.
        try:
            parent_getattr = super().__getattr__
        except AttributeError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}") from None
        return parent_getattr(attr)

    if attr.startswith('dowhy__'):
        return getattr(self.dowhy_, attr[len('dowhy__'):])

    estimator = self.estimate_._estimator_object
    if hasattr(estimator, attr):
        if hasattr(self.dowhy_, attr):
            warnings.warn("This call is ambiguous, "
                          "we're defaulting to CATE estimator's attribute. "
                          "Please add 'dowhy__' as prefix if you want to get dowhy attribute.", UserWarning)
        return getattr(estimator, attr)

    return getattr(self.dowhy_, attr)

def __setattr__(self, attr, value):
    """
    Route an attribute assignment to the wrapper, the dowhy object or the estimator object.

    The wrapper's own attributes (``_cate_estimator``, ``dowhy_``, ``identified_estimand_``,
    ``estimate_``) are stored on the wrapper itself. ``dowhy__<name>`` sets ``<name>`` on
    ``self.dowhy_``. Any other name is set on ``self.estimate_._estimator_object`` when that
    object already has it (with a ``UserWarning`` if ``self.dowhy_`` has it too), and on
    ``self.dowhy_`` otherwise.

    Parameters
    ----------
    attr: string
        Name of the attribute being assigned
    value:
        Value to assign
    """
    if attr in ('_cate_estimator', 'dowhy_', 'identified_estimand_', 'estimate_'):
        # Stored directly so that these assignments never depend on the proxied objects.
        super().__setattr__(attr, value)
    elif attr.startswith('dowhy__'):
        setattr(self.dowhy_, attr[len('dowhy__'):], value)
    elif hasattr(self.estimate_._estimator_object, attr):
        if hasattr(self.dowhy_, attr):
            warnings.warn("This call is ambiguous, "
                          "we're defaulting to CATE estimator's attribute. "
                          "Please add 'dowhy__' as prefix if you want to set dowhy attribute.", UserWarning)
        setattr(self.estimate_._estimator_object, attr, value)
    else:
        setattr(self.dowhy_, attr, value)
def test_iv_estimators(self):
    """
    Smoke-test the EconML DeepIV estimator through ``CausalModel.estimate_effect``.

    No assertion is made on the estimate: the test passes when identification and
    estimation finish without raising. It is skipped when keras cannot be imported.
    """
    # Imported locally so the skip guard does not depend on the module's import block.
    import pytest
    keras = pytest.importorskip("keras")

    # Setup data
    data = datasets.linear_dataset(10,
                                   num_common_causes=4,
                                   num_samples=10000,
                                   num_instruments=2,
                                   num_effect_modifiers=2,
                                   num_treatments=1,
                                   treatment_is_binary=False)
    model = CausalModel(data=data["df"],
                        treatment=data["treatment_name"],
                        outcome=data["outcome_name"],
                        effect_modifiers=data["effect_modifier_names"],
                        graph=data["gml_graph"])
    identified_estimand = model.identify_effect(
        proceed_when_unidentifiable=True)

    # Test DeepIV
    dims_zx = len(model._instruments) + len(model._effect_modifiers)
    dims_tx = len(model._treatment) + len(model._effect_modifiers)
    treatment_model = keras.Sequential([
        keras.layers.Dense(128, activation='relu',
                           input_shape=(dims_zx, )),  # sum of dims of Z and X
        keras.layers.Dropout(0.17),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.17)
    ])
    response_model = keras.Sequential([
        keras.layers.Dense(128, activation='relu',
                           input_shape=(dims_tx, )),  # sum of dims of T and X
        keras.layers.Dropout(0.17),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(1)
    ])

    # The returned estimate is not inspected; only successful completion is checked.
    model.estimate_effect(
        identified_estimand,
        method_name="iv.econml.deepiv.DeepIVEstimator",
        target_units=lambda df: df["X0"] > -1,
        confidence_intervals=False,
        method_params={
            "init_params": {
                # Number of gaussians in the mixture density networks
                'n_components': 10,
                # Treatment model
                'm': lambda z, x: treatment_model(
                    keras.layers.concatenate([z, x])),
                # Response model
                "h": lambda t, x: response_model(
                    keras.layers.concatenate([t, x])),
                # Number of samples used to estimate the response
                'n_samples': 1,
                'first_stage_options': {
                    'epochs': 25
                },
                'second_stage_options': {
                    'epochs': 25
                }
            },
            "fit_params": {}
        })
from dowhy import CausalModel
from sklearn.linear_model import LogisticRegressionCV

# Build a causal model of YearsEmployed -> Approved from the credit dataset and its DOT graph.
credit_data = get_credit()
model = CausalModel(
    credit_data["df"],
    ["YearsEmployed"],
    ["Approved"],
    graph=credit_data["dot_graph"],
)

# Saves the model as "causal_model.png"
model.view_model(layout="dot")

identified_estimand_binary = model.identify_effect(proceed_when_unidentifiable=True)


def _male_units(frame):
    """Select the rows whose "Male" column equals 1."""
    return frame["Male"] == 1


# Only the orthogonal-forest estimator is run; a LinearDRLearner estimate
# ("backdoor.econml.drlearner.LinearDRLearner") is not computed here.
_orthoforest_params = {
    "init_params": {
        'n_trees': 2,  # not ideal, just as an example to speed up computation
    },
    "fit_params": {},
}
orthoforest_estimate = model.estimate_effect(
    identified_estimand_binary,
    method_name="backdoor.econml.ortho_forest.ContinuousTreatmentOrthoForest",
    target_units=_male_units,
    confidence_intervals=False,
    method_params=_orthoforest_params,
)
print(orthoforest_estimate)
def test_backdoor_estimators(self):
    """
    Smoke-test three EconML backdoor estimators through ``CausalModel.estimate_effect``.

    LinearDML and ContinuousTreatmentOrthoForest run on a continuous-treatment dataset;
    LinearDRLearner runs on a separate dataset with binary treatment and binary outcome.
    No assertion is made on the estimates: the test passes when every call finishes
    without raising.
    """
    # Setup data
    data = datasets.linear_dataset(10,
                                   num_common_causes=4,
                                   num_samples=10000,
                                   num_instruments=2,
                                   num_effect_modifiers=2,
                                   num_treatments=1,
                                   treatment_is_binary=False)
    model = CausalModel(data=data["df"],
                        treatment=data["treatment_name"],
                        outcome=data["outcome_name"],
                        effect_modifiers=data["effect_modifier_names"],
                        graph=data["gml_graph"])
    identified_estimand = model.identify_effect(
        proceed_when_unidentifiable=True)

    # Test LinearDML
    model.estimate_effect(
        identified_estimand,
        method_name="backdoor.econml.dml.LinearDML",
        control_value=0,
        treatment_value=1,
        target_units=lambda df: df["X0"] > 1,  # condition used for CATE
        method_params={
            "init_params": {
                'model_y': GradientBoostingRegressor(),
                'model_t': GradientBoostingRegressor(),
                'featurizer': PolynomialFeatures(degree=1,
                                                 include_bias=True)
            },
            "fit_params": {}
        })

    # Test ContinuousTreatmentOrthoForest
    model.estimate_effect(
        identified_estimand,
        method_name="backdoor.econml.ortho_forest.ContinuousTreatmentOrthoForest",
        target_units=lambda df: df["X0"] > 2,
        method_params={
            "init_params": {
                'n_trees': 10
            },
            "fit_params": {}
        })

    # Test LinearDRLearner
    data_binary = datasets.linear_dataset(10,
                                          num_common_causes=4,
                                          num_samples=10000,
                                          num_instruments=2,
                                          num_effect_modifiers=2,
                                          treatment_is_binary=True,
                                          outcome_is_binary=True)
    model_binary = CausalModel(
        data=data_binary["df"],
        treatment=data_binary["treatment_name"],
        outcome=data_binary["outcome_name"],
        # Take the effect modifiers from the same dataset as the rest of the model, not
        # from the continuous-treatment dataset built earlier in this test.
        effect_modifiers=data_binary["effect_modifier_names"],
        graph=data_binary["gml_graph"])
    identified_estimand_binary = model_binary.identify_effect(
        proceed_when_unidentifiable=True)
    model_binary.estimate_effect(
        identified_estimand_binary,
        method_name="backdoor.econml.drlearner.LinearDRLearner",
        target_units=lambda df: df["X0"] > 1,
        confidence_intervals=False,
        method_params={
            "init_params": {
                'model_propensity':
                LogisticRegressionCV(cv=3,
                                     solver='lbfgs',
                                     multi_class='auto')
            },
            "fit_params": {}
        })
def test_iv_estimators(self):
    """
    Smoke-test two EconML instrumental-variable estimators through dowhy.

    DeepIV runs on a continuous-treatment dataset with two instruments;
    LinearIntentToTreatDRIV runs on a binary-treatment dataset with one discrete
    instrument. No assertion is made on the estimates: the test passes when both calls
    finish without raising. The whole test is skipped when keras cannot be imported.
    """
    keras = pytest.importorskip("keras")

    # Setup data
    data = datasets.linear_dataset(10,
                                   num_common_causes=4,
                                   num_samples=10000,
                                   num_instruments=2,
                                   num_effect_modifiers=2,
                                   num_treatments=1,
                                   treatment_is_binary=False)
    model = CausalModel(data=data["df"],
                        treatment=data["treatment_name"],
                        outcome=data["outcome_name"],
                        effect_modifiers=data["effect_modifier_names"],
                        graph=data["gml_graph"])
    identified_estimand = model.identify_effect(
        proceed_when_unidentifiable=True)

    # Test DeepIV
    dims_zx = len(model._instruments) + len(model._effect_modifiers)
    dims_tx = len(model._treatment) + len(model._effect_modifiers)
    treatment_model = keras.Sequential([
        keras.layers.Dense(128, activation='relu',
                           input_shape=(dims_zx, )),  # sum of dims of Z and X
        keras.layers.Dropout(0.17),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.17)
    ])
    response_model = keras.Sequential([
        keras.layers.Dense(128, activation='relu',
                           input_shape=(dims_tx, )),  # sum of dims of T and X
        keras.layers.Dropout(0.17),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.17),
        keras.layers.Dense(1)
    ])

    # The returned estimate is not inspected; only successful completion is checked.
    model.estimate_effect(
        identified_estimand,
        method_name="iv.econml.deepiv.DeepIVEstimator",
        target_units=lambda df: df["X0"] > -1,
        confidence_intervals=False,
        method_params={
            "init_params": {
                # Number of gaussians in the mixture density networks
                'n_components': 10,
                # Treatment model
                'm': lambda z, x: treatment_model(
                    keras.layers.concatenate([z, x])),
                # Response model
                "h": lambda t, x: response_model(
                    keras.layers.concatenate([t, x])),
                # Number of samples used to estimate the response
                'n_samples': 1,
                'first_stage_options': {
                    'epochs': 25
                },
                'second_stage_options': {
                    'epochs': 25
                }
            },
            "fit_params": {}
        })

    # Test IntentToTreatDRIV
    # Separate names keep this scenario from silently rebinding the DeepIV objects above.
    data_binary = datasets.linear_dataset(10,
                                          num_common_causes=4,
                                          num_samples=10000,
                                          num_instruments=1,
                                          num_effect_modifiers=2,
                                          num_treatments=1,
                                          treatment_is_binary=True,
                                          num_discrete_instruments=1)
    model_binary = CausalModel(
        data=data_binary["df"],
        treatment=data_binary["treatment_name"],
        outcome=data_binary["outcome_name"],
        effect_modifiers=data_binary["effect_modifier_names"],
        graph=data_binary["gml_graph"])
    identified_estimand_binary = model_binary.identify_effect(
        proceed_when_unidentifiable=True)
    model_binary.estimate_effect(
        identified_estimand_binary,
        method_name="iv.econml.ortho_iv.LinearIntentToTreatDRIV",
        target_units=lambda df: df["X0"] > 1,
        confidence_intervals=False,
        method_params={
            "init_params": {
                'model_T_XZ': GradientBoostingClassifier(),
                'model_Y_X': GradientBoostingRegressor(),
                'flexible_model_effect': GradientBoostingRegressor(),
                'featurizer': PolynomialFeatures(degree=1,
                                                 include_bias=False)
            },
            "fit_params": {}
        })