def average_treatment_effect_test(self, dataset="linear", beta=10,
                                  num_common_causes=1, num_instruments=1,
                                  num_samples=10000, treatment_is_binary=True):
    """Assert that the estimator recovers the true ATE of a synthetic dataset.

    A linear dataset is generated with ``dowhy.datasets.linear_dataset``, the
    effect is identified from the dataset's GML graph, and ``self._Estimator``
    is used to estimate it. The test passes when the absolute estimation
    error is below ``self._error_tolerance`` times the magnitude of the true
    effect.

    :param dataset: accepted for interface compatibility; not used here, a
        linear dataset is always generated.
    :param beta: forwarded to ``linear_dataset``.
    :param num_common_causes: forwarded to ``linear_dataset``.
    :param num_instruments: forwarded to ``linear_dataset``.
    :param num_samples: forwarded to ``linear_dataset``.
    :param treatment_is_binary: forwarded to ``linear_dataset``.
    :raises AssertionError: if the estimate is outside the tolerance.
    """
    data = dowhy.datasets.linear_dataset(
        beta=beta,
        num_common_causes=num_common_causes,
        num_instruments=num_instruments,
        num_samples=num_samples,
        treatment_is_binary=treatment_is_binary)

    model = CausalModel(data=data['df'],
                        treatment=data["treatment_name"],
                        outcome=data["outcome_name"],
                        graph=data["gml_graph"],
                        proceed_when_unidentifiable=True,
                        test_significance=None)
    target_estimand = model.identify_effect()
    estimator_ate = self._Estimator(data['df'],
                                    identified_estimand=target_estimand,
                                    treatment=data["treatment_name"],
                                    outcome=data["outcome_name"],
                                    test_significance=None)

    true_ate = data["ate"]
    ate_estimate = estimator_ate.estimate_effect()
    error = ate_estimate.value - true_ate
    print(
        "Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}"
        .format(error, self._error_tolerance * 100, ate_estimate.value, true_ate))

    # Compare magnitudes: a signed comparison would accept any under-estimate
    # (negative error) and would invert the check for a negative true effect.
    assert abs(error) < abs(true_ate) * self._error_tolerance
def do(self, x, method='weighting', num_cores=1, variable_types=None, outcome=None,
       params=None, dot_graph=None, common_causes=None, instruments=None,
       estimand_type='ate', proceed_when_unidentifiable=False, stateful=False):
    """Run the sampling-based do-operation and return the sampler's result.

    ``x`` is normalised by ``self.parse_x``. A ``CausalModel`` and a sampler
    (class looked up as ``method + "_sampler"``) are built when none is
    cached. With ``stateful=False`` (the default), or when ``method`` differs
    from the cached one, the cached state is cleared first; non-stateful
    calls also clear it again before returning.

    :param x: the intervention specification accepted by ``self.parse_x``.
    :param method: sampler name prefix.
    :param num_cores: forwarded to the sampler.
    :param variable_types: dict forwarded to the sampler; ``None`` (default)
        means an empty dict, created fresh for every call.
    :param outcome: outcome passed to ``CausalModel``.
    :param params: forwarded to the sampler.
    :param dot_graph: passed to ``CausalModel`` as ``graph``.
    :param common_causes: passed to ``CausalModel``.
    :param instruments: passed to ``CausalModel``.
    :param estimand_type: passed to ``CausalModel``.
    :param proceed_when_unidentifiable: passed to ``CausalModel``.
    :param stateful: keep the model and sampler for subsequent calls.
    :return: whatever ``sampler.do_sample(x)`` returns.
    """
    # A literal ``{}`` default would be one dict shared by every call.
    if variable_types is None:
        variable_types = {}

    x, keep_original_treatment = self.parse_x(x)

    if not stateful or method != self._method:
        self.reset()

    if not self._causal_model:
        # The first key of the parsed intervention is used as the treatment.
        treatment = list(x.keys())[0]
        self._causal_model = CausalModel(
            self._obj,
            treatment,
            outcome,
            graph=dot_graph,
            common_causes=common_causes,
            instruments=instruments,
            estimand_type=estimand_type,
            proceed_when_unidentifiable=proceed_when_unidentifiable)
    self._identified_estimand = self._causal_model.identify_effect()

    if not self._sampler:
        self._method = method
        do_sampler_class = do_samplers.get_class_object(method + "_sampler")
        self._sampler = do_sampler_class(
            self._obj,
            self._identified_estimand,
            self._causal_model._treatment,
            self._causal_model._outcome,
            params=params,
            variable_types=variable_types,
            num_cores=num_cores,
            causal_model=self._causal_model,
            keep_original_treatment=keep_original_treatment)

    result = self._sampler.do_sample(x)
    if not stateful:
        self.reset()
    return result
def model(self, force_again=False):
    """Return the cached ``CausalModel``, building it when needed.

    The model is (re)built from ``self.data`` and ``self.gml_graph`` with
    treatment ``'TOJ'`` and outcome ``'IntCur'`` when nothing is cached yet
    or when ``force_again`` is true.

    :param force_again: rebuild even if a model is already cached.
    :return: the model stored in ``self.t_model``.
    """
    needs_build = self.t_model is None or force_again
    if needs_build:
        self.t_model = CausalModel(
            data=self.data,
            treatment='TOJ',
            outcome='IntCur',
            graph=self.gml_graph,
        )
    return self.t_model
def plot(self, *args, **kwargs):
    """Run a causal estimate for the wrapped frame, then plot it.

    Causal options are read from ``kwargs``:

    * ``treatment_name`` / ``outcome_name``: column names handed to
      ``CausalModel``.
    * ``common_causes`` or ``dot_graph``: how the backdoor set is determined;
      one of them is required (``common_causes`` wins if both are given).
    * ``method_name``: estimation method, default
      ``"backdoor.propensity_score_matching"``.
    * ``kind``: ``'bar'``, ``'line'`` or absent; anything else is rejected.

    The causal-only options are removed before the remaining arguments are
    forwarded to ``self._obj.plot``.

    :raises Exception: if neither ``common_causes`` nor ``dot_graph`` is
        given, or if ``kind`` is not supported.
    """
    method_name = kwargs.get('method_name') or "backdoor.propensity_score_matching"
    logging.info("Using {} for estimation.".format(method_name))

    if kwargs.get('common_causes'):
        self.use_graph = False
    elif kwargs.get('dot_graph'):
        self.use_graph = True
    else:
        raise Exception("You must specify a method for determining a backdoor set.")

    # Validate the plot kind before doing any modelling work. A missing kind
    # is treated like 'line'.
    kind = kwargs.get('kind')
    if kind and kind not in ('bar', 'line'):
        raise Exception("Plot type {} not supported for causal plots!".format(kind))

    # The options live in kwargs (``args`` is a tuple and cannot be indexed by
    # name), and CausalModel is given the column names rather than the columns.
    if self.use_graph:
        model = CausalModel(data=self._obj,
                            treatment=kwargs["treatment_name"],
                            outcome=kwargs["outcome_name"],
                            graph=kwargs["dot_graph"])
    else:
        model = CausalModel(data=self._obj,
                            treatment=kwargs["treatment_name"],
                            outcome=kwargs["outcome_name"],
                            common_causes=kwargs["common_causes"])

    identified_estimand = model.identify_effect()
    # NOTE(review): the estimate is computed but not yet used by the plot.
    model.estimate_effect(identified_estimand, method_name=method_name)

    # Do not leak causal-only options into the underlying plot call.
    causal_only = ('method_name', 'common_causes', 'dot_graph',
                   'treatment_name', 'outcome_name')
    plot_kwargs = {key: value for key, value in kwargs.items()
                   if key not in causal_only}
    self._obj.plot(*args, **plot_kwargs)
def CalDoWhy(dat):
    """Estimate the causal effect for ``dat`` with backdoor linear regression.

    :param dat: mapping providing ``"df"``, ``"treatment_name"``,
        ``"outcome_name"`` and ``"gml_graph"``.
    :return: the object returned by ``CausalModel.estimate_effect``.
    """
    frame = dat["df"]
    treatment_name = dat["treatment_name"]
    outcome_name = dat["outcome_name"]
    gml_graph = dat["gml_graph"]

    causal_model = CausalModel(
        data=frame,
        treatment=treatment_name,
        outcome=outcome_name,
        graph=gml_graph,
    )

    # Identification first, estimation second.
    estimand = causal_model.identify_effect()
    return causal_model.estimate_effect(
        estimand, method_name="backdoor.linear_regression")
def CalPSR(dat):
    """Return the treatment coefficient of a regression on treatment and score.

    A ``CausalModel`` is built only to read the treatment, outcome and common
    cause names from its graph. A linear regression of the treatment on the
    common causes (with ``"U"`` dropped when present) gives a score ``PS``;
    the outcome is then regressed on ``Treatment`` and ``PS``.

    :param dat: mapping providing ``"df"``, ``"treatment_name"``,
        ``"outcome_name"`` and ``"gml_graph"``.
    :return: first coefficient of the second regression (``psr.coef_[0]``).
    """
    causal_model = CausalModel(
        data=dat["df"],
        treatment=dat["treatment_name"],
        outcome=dat["outcome_name"],
        graph=dat["gml_graph"],
    )
    treatment_name = causal_model._treatment
    outcome_name = causal_model._outcome
    confounder_names = causal_model._graph.get_common_causes(
        treatment_name, outcome_name)

    frame = dat["df"]
    treatment_values = frame[treatment_name]
    outcome_values = frame[outcome_name]

    # "U" is dropped before selecting columns from the data.
    if "U" in confounder_names:
        confounder_names.remove("U")
    confounders = frame[confounder_names]

    score_model = LinearRegression().fit(confounders, treatment_values)
    scores = score_model.predict(confounders)

    design = pd.DataFrame({"Treatment": treatment_values, "PS": scores})
    outcome_model = LinearRegression().fit(design, outcome_values)
    return outcome_model.coef_[0]
from collections import OrderedDict


class CausalAccessor(object):
    """Helper that runs the sampling-based ``do`` operation on a wrapped object.

    The wrapped object is handed to ``CausalModel`` and to the sampler as the
    data argument. The causal model, identified estimand, sampler and method
    name are cached on the accessor so that stateful calls can reuse them.
    """

    def __init__(self, pandas_obj):
        """Store the wrapped object and start with an empty cache."""
        self._obj = pandas_obj
        self._causal_model = None
        self._sampler = None
        self._identified_estimand = None
        self._method = None

    def reset(self):
        """Drop the cached model, estimand, sampler and method name."""
        self._causal_model = None
        self._identified_estimand = None
        self._sampler = None
        self._method = None

    def do(self, x, method='weighting', num_cores=1, variable_types=None, outcome=None,
           params=None, dot_graph=None, common_causes=None, instruments=None,
           estimand_type='ate', proceed_when_unidentifiable=False, stateful=False):
        """Run the do-operation by sampling and return the sampler's result.

        With ``stateful=False`` (the default), or when ``method`` differs from
        the cached one, cached state is cleared before running; non-stateful
        calls clear it again before returning.

        :param x: str, list or dict describing the intervention (see
            ``parse_x``).
        :param method: sampler name; the class is looked up as
            ``method + "_sampler"``.
        :param num_cores: forwarded to the sampler.
        :param variable_types: dict forwarded to the sampler; ``None``
            (default) means an empty dict, created fresh for every call.
        :param outcome: outcome passed to ``CausalModel``.
        :param params: forwarded to the sampler.
        :param dot_graph: passed to ``CausalModel`` as ``graph``.
        :param common_causes: passed to ``CausalModel``.
        :param instruments: passed to ``CausalModel``.
        :param estimand_type: passed to ``CausalModel``.
        :param proceed_when_unidentifiable: passed to ``CausalModel``.
        :param stateful: keep the model and sampler for subsequent calls.
        :return: whatever ``sampler.do_sample(x)`` returns.
        """
        # A literal ``{}`` default would be one dict shared by every call.
        if variable_types is None:
            variable_types = {}

        x, keep_original_treatment = self.parse_x(x)

        if not stateful or method != self._method:
            self.reset()

        if not self._causal_model:
            # The first key of the parsed intervention is used as the treatment.
            treatment = list(x.keys())[0]
            self._causal_model = CausalModel(
                self._obj,
                treatment,
                outcome,
                graph=dot_graph,
                common_causes=common_causes,
                instruments=instruments,
                estimand_type=estimand_type,
                proceed_when_unidentifiable=proceed_when_unidentifiable)
        self._identified_estimand = self._causal_model.identify_effect()

        if not self._sampler:
            self._method = method
            do_sampler_class = do_samplers.get_class_object(method + "_sampler")
            self._sampler = do_sampler_class(
                self._obj,
                self._identified_estimand,
                self._causal_model._treatment,
                self._causal_model._outcome,
                params=params,
                variable_types=variable_types,
                num_cores=num_cores,
                causal_model=self._causal_model,
                keep_original_treatment=keep_original_treatment)

        result = self._sampler.do_sample(x)
        if not stateful:
            self.reset()
        return result

    def parse_x(self, x):
        """Normalise ``x`` to ``(mapping, keep_original_treatment)``.

        A string or list names variables without values: each name maps to
        ``None`` and the flag is ``True``. A dict is returned unchanged with
        the flag ``False``. Subclasses of ``str``, ``list`` and ``dict``
        (e.g. ``OrderedDict``) are accepted.

        :raises Exception: if ``x`` is none of the supported types.
        """
        if isinstance(x, str):
            return {x: None}, True
        if isinstance(x, list):
            return {xi: None for xi in x}, True
        if isinstance(x, dict):
            return x, False
        raise Exception('x format not recognized: {}'.format(type(x)))
def custom_data_average_treatment_effect_test(self, data):
    """Assert that the estimator recovers the true ATE of a supplied dataset.

    The effect is identified from the dataset's GML graph and estimated with
    ``self._Estimator``. The test passes when the absolute estimation error
    is below ``self._error_tolerance`` times the magnitude of the true effect.

    :param data: mapping providing ``'df'``, ``"treatment_name"``,
        ``"outcome_name"``, ``"gml_graph"`` and the true effect ``"ate"``.
    :raises AssertionError: if the estimate is outside the tolerance.
    """
    model = CausalModel(data=data['df'],
                        treatment=data["treatment_name"],
                        outcome=data["outcome_name"],
                        graph=data["gml_graph"],
                        proceed_when_unidentifiable=True,
                        test_significance=None)
    target_estimand = model.identify_effect()
    estimator_ate = self._Estimator(data['df'],
                                    identified_estimand=target_estimand,
                                    treatment=data["treatment_name"],
                                    outcome=data["outcome_name"],
                                    test_significance=None)

    true_ate = data["ate"]
    ate_estimate = estimator_ate.estimate_effect()
    error = ate_estimate.value - true_ate
    print(
        "Error in ATE estimate = {0} with tolerance {1}%. Estimated={2},True={3}"
        .format(error, self._error_tolerance * 100, ate_estimate.value, true_ate))

    # Compare magnitudes: a signed comparison would accept any under-estimate
    # (negative error) and would invert the check for a negative true effect.
    assert abs(error) < abs(true_ate) * self._error_tolerance
def register_graph():
    """Estimate a causal effect from request arguments and return it as JSON.

    Reads ``digraph``, ``dataset``, ``treatment`` and ``outcome`` from
    ``request.args``, loads the named dataset as a dataframe, and runs a
    backdoor linear-regression estimate with significance testing.

    :return: JSON string ``{"results": <str of the estimate>}``.
    """
    query = request.args
    digraph = query.get('digraph')
    dataset = query.get('dataset')
    treatment_name = query.get('treatment')
    outcome_name = query.get('outcome')

    frame = dataiku.Dataset(dataset).get_dataframe()
    causal_model = CausalModel(
        data=frame,
        treatment=treatment_name,
        outcome=outcome_name,
        graph=digraph,
    )

    estimand = causal_model.identify_effect()
    regression_estimate = causal_model.estimate_effect(
        estimand,
        method_name="backdoor.linear_regression",
        test_significance=True)

    return json.dumps({'results': str(regression_estimate)})
def null_refutation_test(self, data=None, dataset="linear", beta=10,
                         num_common_causes=1, num_instruments=1,
                         num_samples=100000, treatment_is_binary=True):
    """Check that a refuter with zero effect strength leaves the estimate unchanged.

    The refuter (``self.refuter_method``) is run twice on an estimate from
    ``self.estimator_method``: once with the configured effect strengths, only
    to confirm it runs without raising, and once with both strengths set to
    zero. The second result must stay within ``self._error_tolerance``
    (relative to the original estimate) of the original estimate.

    :param data: optional user-provided dataset object; when ``None`` a
        linear dataset is generated from the remaining parameters.
    :param dataset: accepted but not used here.
    :raises AssertionError: if the zero-strength refutation moves the
        estimate by more than the tolerance.
    """
    # Supports user-provided dataset object
    if data is None:
        data = dowhy.datasets.linear_dataset(
            beta=beta,
            num_common_causes=num_common_causes,
            num_instruments=num_instruments,
            num_samples=num_samples,
            treatment_is_binary=treatment_is_binary)

    model = CausalModel(
        data=data['df'],
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
        proceed_when_unidentifiable=True,
        test_significance=None)
    target_estimand = model.identify_effect()
    ate_estimate = model.estimate_effect(
        identified_estimand=target_estimand,
        method_name=self.estimator_method,
        test_significance=None)
    true_ate = data["ate"]

    # First run: only verifies that the refuter executes without exceptions.
    ref = model.refute_estimate(
        target_estimand, ate_estimate,
        method_name=self.refuter_method,
        confounders_effect_on_treatment=self.confounders_effect_on_t,
        confounders_effect_on_outcome=self.confounders_effect_on_y,
        effect_strength_on_treatment=self.effect_strength_on_t,
        effect_strength_on_outcome=self.effect_strength_on_y)

    # Second run: zero effect strengths should reproduce the estimate.
    refute = model.refute_estimate(
        target_estimand, ate_estimate,
        method_name=self.refuter_method,
        confounders_effect_on_treatment=self.confounders_effect_on_t,
        confounders_effect_on_outcome=self.confounders_effect_on_y,
        effect_strength_on_treatment=0,
        effect_strength_on_outcome=0)

    error = abs(refute.new_effect - ate_estimate.value)
    print(
        "Error in refuted estimate = {0} with tolerance {1}%. Estimated={2},After Refutation={3}"
        .format(error, self._error_tolerance * 100, ate_estimate.value, refute.new_effect))

    allowed = abs(ate_estimate.value) * self._error_tolerance
    if error < allowed:
        res = True
    else:
        res = False
    assert res
def do(self, x, method=None, num_cores=1, variable_types=None, outcome=None,
       params=None, dot_graph=None, common_causes=None, instruments=None,
       estimand_type='ate', proceed_when_unidentifiable=False,
       keep_original_treatment=False, use_previous_sampler=False):
    """Run the sampling-based do-operation, caching state on ``self._obj``.

    The causal model, identified estimand and sampler are stored as
    attributes of the wrapped object. They are rebuilt unless
    ``use_previous_sampler`` is true and a previous one exists.

    :param x: dict of interventions; its first key is used as the treatment.
    :param method: sampler name (required); the class is looked up as
        ``method + "_sampler"``.
    :param num_cores: forwarded to the sampler.
    :param variable_types: dict forwarded to the sampler; ``None`` (default)
        means an empty dict, created fresh for every call.
    :param outcome: outcome passed to ``CausalModel``.
    :param params: forwarded to the sampler.
    :param dot_graph: passed to ``CausalModel`` as ``graph``.
    :param common_causes: passed to ``CausalModel``.
    :param instruments: passed to ``CausalModel``.
    :param estimand_type: passed to ``CausalModel``.
    :param proceed_when_unidentifiable: passed to ``CausalModel``.
    :param keep_original_treatment: forwarded to the sampler.
    :param use_previous_sampler: reuse the cached model and sampler.
    :return: whatever ``sampler.do_sample(x)`` returns.
    :raises Exception: if ``method`` is not given.
    """
    if not method:
        raise Exception("You must specify a do sampling method.")

    # A literal ``{}`` default would be one dict shared by every call.
    if variable_types is None:
        variable_types = {}

    if not self._obj._causal_model or not use_previous_sampler:
        self._obj._causal_model = CausalModel(
            self._obj,
            list(x.keys())[0],
            outcome,
            graph=dot_graph,
            common_causes=common_causes,
            instruments=instruments,
            estimand_type=estimand_type,
            proceed_when_unidentifiable=proceed_when_unidentifiable)
    self._obj._identified_estimand = self._obj._causal_model.identify_effect()

    do_sampler_class = do_samplers.get_class_object(method + "_sampler")
    if not self._obj._sampler or not use_previous_sampler:
        self._obj._sampler = do_sampler_class(
            self._obj,
            self._obj._identified_estimand,
            self._obj._causal_model._treatment,
            self._obj._causal_model._outcome,
            params=params,
            variable_types=variable_types,
            num_cores=num_cores,
            causal_model=self._obj._causal_model,
            keep_original_treatment=keep_original_treatment)
    return self._obj._sampler.do_sample(x)
import dowhy.datasets

# Toy data: Z is a shuffled permutation of 0..9, X counts 0..9, Y is 10 * X.
z = list(range(10))
random.shuffle(z)
df = pd.DataFrame(data={'Z': z, 'X': range(10), 'Y': range(0, 100, 10)})
df  # bare expression: shows the frame when run as a notebook cell

# NOTE(review): machine-specific absolute path; the name also shadows the
# built-in ``dir``.
dir = "C:\\Users\\T149900\\source\\repos\\PythonApplication2\\PythonApplication2\\"

# With GML file
model = CausalModel(
    data=df,
    treatment='X',
    outcome='Y',
    graph=dir + "test.gml",
)
model  # bare expression: shows the model when run as a notebook cell
import dowhy
from IPython.display import Image, display

# I. Generating dummy data
# Dummy data for three variables: X, Y and Z.
from dowhy.do_why import CausalModel

z = list(range(10))
random.shuffle(z)
df = pd.DataFrame(data={'Z': z, 'X': range(10), 'Y': range(0, 100, 10)})
print(df)

# II. Loading GML or DOT graphs
# GML format, given inline as a string: Z -> X, Z -> Y, X -> Y.
model = CausalModel(
    data=df,
    treatment='X',
    outcome='Y',
    graph="""graph[directed 1 node[id "Z" label "Z"] node[id "X" label "X"] node[id "Y" label "Y"] edge[source "Z" target "X"] edge[source "Z" target "Y"] edge[source "X" target "Y"]]""",
)
model.view_model()

# Show the image file that is expected to exist after view_model().
display(Image(filename="causal_model_simple_example.png"))
# Name the 25 covariate columns x1..x25.
covariate_names = ["x" + str(i) for i in range(1, 26)]
col.extend(covariate_names)
data.columns = col
data.head()
print(data)

# Model
# Create a causal model from the data and given common causes.
# ``xs`` keeps its original '+'-terminated form in case later code reads it;
# the model is given the clean name list instead, because splitting ``xs`` on
# '+' leaves a trailing empty string that would be passed as a common cause.
xs = "".join(name + "+" for name in covariate_names)
model = CausalModel(
    data=data,
    treatment='treatment',
    outcome='y_factual',
    common_causes=covariate_names,
)

# Identify
# Identify the causal effect
identified_estimand = model.identify_effect()

# Estimate (using different methods)
# 3.1 Using Linear Regression
# Estimate the causal effect and compare it with Average Treatment Effect
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.linear_regression",
    test_significance=True,
)
import dowhy
from dowhy.do_why import CausalModel
import dowhy.datasets

if __name__ == "__main__":
    # Synthetic linear dataset with a known effect (beta=10).
    data = dowhy.datasets.linear_dataset(
        beta=10,
        num_common_causes=5,
        num_instruments=2,
        num_samples=10000,
        treatment_is_binary=True,
    )

    # Create a causal model from the data and given graph.
    model = CausalModel(
        data=data["df"],
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["dot_graph"],
    )

    # Identify, then estimate with backdoor linear regression.
    identified_estimand = model.identify_effect()
    estimate = model.estimate_effect(
        identified_estimand,
        method_name="backdoor.linear_regression",
    )
    print("Causal Estimate is " + str(estimate.value))

    # Adding a random common cause variable
    res_random = model.refute_estimate(
        identified_estimand,
        estimate,
        method_name="random_common_cause",
    )
    print(res_random)

    # Replacing treatment with a random (placebo) variable
from dowhy.do_why import CausalModel

data = dowhy.datasets.linear_dataset(
    beta=10,
    num_common_causes=5,
    num_instruments=2,
    num_samples=10000,
    treatment_is_binary=True,
)
df = data["df"]
print(df.head())

# Print both graph encodings shipped with the dataset.
print(data["dot_graph"])
print("\n")
print(data["gml_graph"])

# With graph
model = CausalModel(
    data=df,
    treatment=data["treatment_name"],
    outcome=data["outcome_name"],
    graph=data["gml_graph"],
)
model.view_model()

from IPython.display import Image, display
display(Image(filename="causal_model_simple_example.png"))

# DoWhy philosophy: keep identification and estimation separate.
# Identification can be achieved without access to the data, accessing only
# the graph. This results in an expression to be computed, which can then be
# evaluated using the available data in the estimation step.
# It is important to understand that these are orthogonal steps.
identified_estimand = model.identify_effect()
class DoWhyExample:
    """Small DoWhy walkthrough with treatment ``TOJ`` and outcome ``IntCur``.

    The graph and a two-row data table are class attributes. ``model``,
    ``identify``, ``estimate`` and ``refute`` run the pipeline steps lazily
    and cache their results in ``t_model``, ``t_identify`` and ``t_estimate``.
    """

    # Synthetic dataset; not referenced by the methods below.
    data_old = ds.linear_dataset(beta=10,
                                 num_common_causes=5,
                                 num_instruments=5,
                                 num_samples=10000,
                                 treatment_is_binary=True)

    gml_graph = ('graph[directed 1'
                 'node[ id "TOJ" label "TOJ"]'
                 'node[ id "IntCur" label "IntCur"]'
                 'node[ id "U" label "Unobserved Confounders"]'
                 'edge[source "TOJ" target "IntCur"]'
                 'edge[source "U" target "TOJ"]'
                 'edge[source "U" target "IntCur"]')
    # NOTE(review): add_node / connect_node are defined elsewhere; presumably
    # they append a node and/or an edge to the GML text. Confirm there.
    gml_graph = add_node(gml_graph, "YeshivaAdults", "IntCur")
    gml_graph = add_node(gml_graph, "Sex", "IntCur")
    gml_graph = add_node(gml_graph, "Age", "IntCur")
    gml_graph = connect_node(gml_graph, "Age", "TOJ")
    gml_graph = connect_node(gml_graph, "Age", "YeshivaAdults")
    gml_graph = connect_node(gml_graph, "Sex", "YeshivaAdults")
    gml_graph = connect_node(gml_graph, "TOJ", "YeshivaAdults")
    gml_graph = gml_graph + ']'  # close the outer graph[ ... ] bracket

    # table
    # ID Age Sex TOJ (Orthodox)? (Treatment?) Yeshiva? Intell. Curios? (Outcome)
    data = pd.DataFrame(
        np.array([[30.0, 1.0, 1.0, 1.0, 0.0],
                  [40.0, 1.0, 0.0, 0.0, 1.0]]),
        columns=['Age', 'Sex', 'TOJ', 'YeshivaAdults', 'IntCur'])

    # Caches for the pipeline steps; None means "not computed yet".
    t_model = None
    t_identify = None
    t_estimate = None

    def model(self, force_again=False):
        """Return the cached ``CausalModel``, building it when needed.

        :param force_again: rebuild even if a model is already cached.
        """
        if self.t_model is None or force_again:
            self.t_model = CausalModel(data=self.data,
                                       treatment='TOJ',
                                       outcome='IntCur',
                                       graph=self.gml_graph)
        return self.t_model

    def identify(self, force_again=False):
        """Return the cached identified estimand, computing it when needed.

        :param force_again: rebuild the model and re-run identification.
        """
        if self.t_identify is None or force_again:
            if self.t_model is None or force_again:
                self.model(force_again=force_again)
            self.t_identify = self.t_model.identify_effect()
        return self.t_identify

    def estimate(self, method_name="backdoor.propensity_score_matching",
                 force_again=False):
        """Return the cached estimate, computing it when needed.

        A cached estimate is returned regardless of ``method_name``; pass
        ``force_again=True`` to recompute with a different method.

        :param method_name: estimation method passed to ``estimate_effect``.
        :param force_again: rebuild the model, re-identify and re-estimate.
        """
        if self.t_estimate is None or force_again:
            # Identify first: this guarantees a model exists (a fresh instance
            # has t_model None) and, when forced, rebuilds it exactly once.
            estimand = self.identify(force_again)
            self.t_estimate = self.model().estimate_effect(estimand, method_name)
        return self.t_estimate

    def refute(self, method_name="random_common_cause", force_again=False):
        """Refute the estimate and return the refutation result.

        The estimate is obtained with ``estimate``'s default method.

        :param method_name: refutation method passed to ``refute_estimate``.
        :param force_again: recompute model, estimand and estimate first.
        """
        # estimate() refreshes model and estimand as needed, so the three
        # objects used below always come from the same model instance.
        estimate = self.estimate(force_again=force_again)
        estimand = self.identify()
        return self.model().refute_estimate(estimand, estimate,
                                            method_name=method_name)
class CausalAccessor(object):
    def __init__(self, pandas_obj):
        """
        An accessor for the pandas.DataFrame under the `causal` namespace.

        :param pandas_obj: the object this accessor wraps; it is passed as the data to the causal model and sampler.
        """
        self._obj = pandas_obj
        self._causal_model = None
        self._sampler = None
        self._identified_estimand = None
        self._method = None

    def reset(self):
        """
        If a `causal` namespace method (especially `do`) was run statefully, this resets the namespace.

        :return: None
        """
        self._causal_model = None
        self._identified_estimand = None
        self._sampler = None
        self._method = None

    def do(self, x, method='weighting', num_cores=1, variable_types=None, outcome=None, params=None, dot_graph=None,
           common_causes=None, estimand_type='ate', proceed_when_unidentifiable=False, stateful=False):
        """
        The do-operation implemented with sampling. This will return a pandas.DataFrame with the outcome
        variable(s) replaced with samples from P(Y|do(X=x)).

        If the value of `x` is left unspecified (e.g. as a string or list), then the original values of `x` are left in
        the DataFrame, and Y is sampled from its respective P(Y|do(x)). If the value of `x` is specified (passed with a
        `dict`, where variable names are keys, and values are specified) then the new `DataFrame` will contain the
        specified values of `x`.

        For some methods, the `variable_types` field must be specified. It should be a `dict`, where the keys are
        variable names, and values are 'o' for ordered discrete, 'u' for un-ordered discrete, 'd' for discrete, or 'c'
        for continuous.

        Inference requires a set of control variables. These can be provided explicitly using `common_causes`, which
        contains a list of variable names to control for. These can be provided implicitly by specifying a causal graph
        with `dot_graph`, from which they will be chosen using the default identification method.

        When the set of control variables can't be identified with the provided assumptions, a prompt will raise to the
        user asking whether to proceed. To automatically over-ride the prompt, you can set the flag
        `proceed_when_unidentifiable` to `True`.

        Some methods build components during inference which are expensive. To retain those components for later
        inference (e.g. successive calls to `do` with different values of `x`), you can set the `stateful` flag to
        `True`. Be cautious about using the `do` operation statefully. State is set on the namespace, rather than the
        method, so can behave unpredictably. To reset the namespace and run statelessly again, you can call the `reset`
        method.

        :param x: str, list, dict: The causal state on which to intervene, and (optional) its interventional value(s).
        :param method: The inference method to use with the sampler. Currently, `'mcmc'`, `'weighting'`, and
            `'kernel_density'` are supported.
        :param num_cores: int: if the inference method only supports sampling a point at a time, this will parallelize
            sampling.
        :param variable_types: dict: The dictionary containing the variable types. Must contain the union of the causal
            state, control variables, and the outcome. `None` (the default) is treated as an empty dict, created fresh
            for every call.
        :param outcome: str: The outcome variable.
        :param params: dict: extra parameters to set as attributes on the sampler object
        :param dot_graph: str: A string specifying the causal graph.
        :param common_causes: list: A list of strings containing the variable names to control for.
        :param estimand_type: str: 'ate' is the only one currently supported. Others may be added later, to allow for
            CATE estimation.
        :param proceed_when_unidentifiable: bool: A flag to over-ride user prompts to proceed when effects aren't
            identifiable with the assumptions provided.
        :param stateful: bool: Whether to retain state. By default, the do operation is stateless.
        :return: pandas.DataFrame: A DataFrame containing the sampled outcome
        """
        # A literal `{}` default would be one dict shared by every call.
        if variable_types is None:
            variable_types = {}

        x, keep_original_treatment = self.parse_x(x)

        if not stateful or method != self._method:
            self.reset()

        if not self._causal_model:
            # The first key of the parsed intervention is used as the treatment.
            self._causal_model = CausalModel(self._obj,
                                             list(x.keys())[0],
                                             outcome,
                                             graph=dot_graph,
                                             common_causes=common_causes,
                                             instruments=None,
                                             estimand_type=estimand_type,
                                             proceed_when_unidentifiable=proceed_when_unidentifiable)
        self._identified_estimand = self._causal_model.identify_effect()

        if not self._sampler:
            self._method = method
            do_sampler_class = do_samplers.get_class_object(method + "_sampler")
            self._sampler = do_sampler_class(self._obj,
                                             self._identified_estimand,
                                             self._causal_model._treatment,
                                             self._causal_model._outcome,
                                             params=params,
                                             variable_types=variable_types,
                                             num_cores=num_cores,
                                             causal_model=self._causal_model,
                                             keep_original_treatment=keep_original_treatment)
        result = self._sampler.do_sample(x)
        if not stateful:
            self.reset()
        return result

    def parse_x(self, x):
        """
        Normalise `x` to a `(mapping, keep_original_treatment)` pair.

        A string or list names variables without values: each name maps to `None` and the flag is `True`. A dict is
        returned unchanged with the flag `False`. Subclasses of `str`, `list` and `dict` are accepted.

        :param x: str, list, dict: the causal state passed to `do`.
        :return: tuple of (dict, bool)
        :raises Exception: if `x` is none of the supported types.
        """
        if isinstance(x, str):
            return {x: None}, True
        if isinstance(x, list):
            return {xi: None for xi in x}, True
        if isinstance(x, dict):
            return x, False
        raise Exception('x format not recognized: {}'.format(type(x)))
import dowhy
from dowhy.do_why import CausalModel
import dowhy.datasets
# The module name is all lowercase; ``import Pygraphviz`` fails on import.
import pygraphviz

# Load some sample data
data = dowhy.datasets.linear_dataset(
    beta=10,
    num_common_causes=5,
    num_instruments=2,
    num_samples=10000,
    treatment_is_binary=True)

# Create a causal model from the data and given graph.
model = CausalModel(
    data=data["df"],
    treatment=data["treatment_name"],
    outcome=data["outcome_name"],
    graph=data["dot_graph"],
)

# Identify causal effect and return target estimands
identified_estimand = model.identify_effect()

# Estimate the target estimand using a statistical method.
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_matching")

# Refute the obtained estimate using multiple robustness checks.
refute_results = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="random_common_cause")
method_name="iv.regression_discontinuity", method_params={ 'rd_variable_name': 'Z1', 'rd_threshold_value': 0.5, 'rd_bandwidth': 0.1 }) print(causal_estimate_regdist) print("Causal Estimate is " + str(causal_estimate_regdist.value)) if __name__ == "__main__": data = dowhy.datasets.linear_dataset(beta=10, num_common_causes=5, num_instruments=2, num_samples=10000, treatment_is_binary=True) # With graph model = CausalModel(data=data['df'], treatment=data["treatment_name"], outcome=data["outcome_name"], graph=data["dot_graph"], instruments=data["instrument_names"], logging_level=logging.INFO) model.view_model() identified_estimand = model.identify_effect() print(identified_estimand) regression(model, identified_estimand)
import numpy as np
import pandas as pd

import dowhy
from dowhy.do_why import CausalModel
import dowhy.datasets

# Synthetic linear dataset with a known effect (beta=10).
data = dowhy.datasets.linear_dataset(
    beta=10,
    num_common_causes=5,
    num_instruments=2,
    num_samples=10000,
    treatment_is_binary=True,
)
df = data["df"]
print(df.head())

# Print both graph encodings shipped with the dataset.
print(data["dot_graph"])
print("\n")
print(data["gml_graph"])

# With graph
model = CausalModel(
    data=df,
    treatment=data["treatment_name"],
    outcome=data["outcome_name"],
    graph=data["gml_graph"],
)

# Identification, then estimation with backdoor linear regression.
identified_estimand = model.identify_effect()
print(identified_estimand)

causal_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.linear_regression",
)
print(causal_estimate)
print("Causal Estimate is " + str(causal_estimate.value))