Esempio n. 1
0
def test_elasticnet_propensity_model(generate_regression_data):
    """Propensity scores from ElasticNetPropensityModel must reach an AUC above 0.5.

    Args:
        generate_regression_data: fixture returning a callable that yields the
            6-tuple ``(y, X, treatment, tau, b, e)``; only ``X`` and
            ``treatment`` are used here.
    """
    _, X, treatment, _, _, _ = generate_regression_data()

    propensity_model = ElasticNetPropensityModel(random_state=RANDOM_SEED)
    scores = propensity_model.fit_predict(X, treatment)

    # Score the fitted propensities against the actual treatment assignment.
    auc = roc_auc_score(treatment, scores)
    assert auc > .5
Esempio n. 2
0
def get_synthetic_preds(synthetic_data_func, n=1000, estimators=None):
    """Generate predictions for synthetic data using specified function (single simulation)

    Args:
        synthetic_data_func (function): synthetic data generation function. It is called as
            ``synthetic_data_func(n=n)`` and must return the 6-tuple ``(y, X, w, tau, b, e)``.
        n (int, optional): number of samples
        estimators (dict of object, optional): dict of names and objects of treatment effect
            estimators. When omitted or empty, a default set of S/T/X/R learners (each with
            ``LinearRegression`` and ``XGBRegressor``) plus a causal tree is used instead.

    Returns:
        (dict): dict of the actual and estimates of treatment effects. ``KEY_ACTUAL`` maps to
            ``tau``, ``KEY_GENERATED_DATA`` maps to a dict of the raw generated arrays, and
            every estimator name maps to its flattened prediction array.
    """
    y, X, w, tau, b, e = synthetic_data_func(n=n)

    preds_dict = {
        KEY_ACTUAL: tau,
        KEY_GENERATED_DATA: {
            "y": y,
            "X": X,
            "w": w,
            "tau": tau,
            "b": b,
            "e": e,
        },
    }

    # Predict p_hat because e would not be directly observed in real-life
    p_model = ElasticNetPropensityModel()
    p_hat = p_model.fit_predict(X, w)

    def _fit_predict(learner):
        """Fit `learner` and return flattened predictions, passing `p_hat` when accepted."""
        # A learner whose fit_predict() has no `p` keyword raises TypeError; retry without it.
        # NOTE(review): any other TypeError raised inside fit_predict() also triggers the retry.
        try:
            return learner.fit_predict(X=X, treatment=w, y=y, p=p_hat).flatten()
        except TypeError:
            return learner.fit_predict(X=X, treatment=w, y=y).flatten()

    if estimators:
        for name, learner in estimators.items():
            preds_dict[name] = _fit_predict(learner)
    else:
        meta_learners = [
            ("S", BaseSRegressor),
            ("T", BaseTRegressor),
            ("X", BaseXRegressor),
            ("R", BaseRRegressor),
        ]
        base_models = [("LR", LinearRegression), ("XGB", XGBRegressor)]

        for label_l, base_learner in meta_learners:
            for label_m, model in base_models:
                learner = base_learner(model())
                model_name = "{} Learner ({})".format(label_l, label_m)
                preds_dict[model_name] = _fit_predict(learner)

        # The causal tree is always fitted without a propensity score.
        learner = CausalTreeRegressor(random_state=RANDOM_SEED)
        preds_dict["Causal Tree"] = learner.fit_predict(X=X, treatment=w,
                                                        y=y).flatten()

    return preds_dict
Esempio n. 3
0
def get_synthetic_preds(synthetic_data_func, n=1000, estimators=None):
    """Generate predictions for synthetic data using specified function (single simulation)

    Args:
        synthetic_data_func (function): synthetic data generation function. It is called as
            ``synthetic_data_func(n=n)`` and must return the 6-tuple ``(y, X, w, tau, b, e)``.
        n (int, optional): number of samples
        estimators (dict of object, optional): dict of names and objects of treatment effect
            estimators. When omitted or empty, a default set of S/T/X/R learners (each with
            ``LinearRegression`` and ``XGBRegressor``) plus a causal tree is used instead.

    Returns:
        (dict): dict of the actual and estimates of treatment effects. ``KEY_ACTUAL`` maps to
            ``tau``, ``KEY_GENERATED_DATA`` maps to a dict of the raw generated arrays, and
            every estimator name maps to its flattened prediction array.
    """
    y, X, w, tau, b, e = synthetic_data_func(n=n)

    preds_dict = {
        KEY_ACTUAL: tau,
        KEY_GENERATED_DATA: {
            'y': y,
            'X': X,
            'w': w,
            'tau': tau,
            'b': b,
            'e': e
        },
    }

    # Predict p_hat because e would not be directly observed in real-life
    p_model = ElasticNetPropensityModel()
    p_hat = p_model.fit_predict(X, w)

    def _predict_effects(learner):
        """Fit `learner` and return flattened predictions, passing `p_hat` when accepted."""
        # A learner whose fit_predict() has no `p` keyword raises TypeError; retry without it.
        # NOTE(review): any other TypeError raised inside fit_predict() also triggers the retry.
        try:
            return learner.fit_predict(X=X, p=p_hat, treatment=w, y=y).flatten()
        except TypeError:
            return learner.fit_predict(X=X, treatment=w, y=y).flatten()

    if estimators:
        for name, learner in estimators.items():
            preds_dict[name] = _predict_effects(learner)
    else:
        for base_learner, label_l in zip(
            [BaseSLearner, BaseTLearner, BaseXLearner, BaseRLearner],
            ['S', 'T', 'X', 'R']):
            for model, label_m in zip([LinearRegression, XGBRegressor],
                                      ['LR', 'XGB']):
                learner = base_learner(model())
                model_name = '{} Learner ({})'.format(label_l, label_m)
                preds_dict[model_name] = _predict_effects(learner)

        # The causal tree is always fitted without a propensity score.
        learner = CausalTreeRegressor(random_state=RANDOM_SEED)
        preds_dict['Causal Tree'] = learner.fit_predict(X=X, treatment=w,
                                                        y=y).flatten()

    return preds_dict
Esempio n. 4
0
    def __init__(self,
                 outcome_learner=None,
                 effect_learner=None,
                 propensity_learner=ElasticNetPropensityModel(),
                 ate_alpha=.05,
                 control_name=0,
                 n_fold=5,
                 random_state=None):
        """Initialize an R-learner classifier.

        The parent initializer is always invoked with ``learner=None``; the
        outcome and effect models are supplied through their own arguments.

        Args:
            outcome_learner: a model to estimate outcomes. Should be a classifier.
            effect_learner: a model to estimate treatment effects. It needs to take `sample_weight` as an
                input argument for `fit()`. Should be a regressor.
            propensity_learner (optional): a model to estimate propensity scores. `ElasticNetPropensityModel()` will
                be used by default.
            ate_alpha (float, optional): the confidence level alpha of the ATE estimate
            control_name (str or int, optional): name of control group
            n_fold (int, optional): the number of cross validation folds for outcome_learner
            random_state (int or RandomState, optional): a seed (int) or random number generator (RandomState)

        Raises:
            ValueError: if both `outcome_learner` and `effect_learner` are None
                (checked after the parent initializer has run).
        """
        parent_kwargs = dict(
            learner=None,
            outcome_learner=outcome_learner,
            effect_learner=effect_learner,
            propensity_learner=propensity_learner,
            ate_alpha=ate_alpha,
            control_name=control_name,
            n_fold=n_fold,
            random_state=random_state,
        )
        super().__init__(**parent_kwargs)

        # Reject the configuration where neither model was provided.
        neither_given = outcome_learner is None and effect_learner is None
        if neither_given:
            message = "Either the outcome learner or the effect learner must be specified."
            raise ValueError(message)
Esempio n. 5
0
    def __init__(
        self,
        learner=None,
        outcome_learner=None,
        effect_learner=None,
        propensity_learner=ElasticNetPropensityModel(),
        ate_alpha=0.05,
        control_name=0,
        n_fold=5,
        random_state=None,
    ):
        """Initialize an R-learner regressor.

        Every argument is forwarded unchanged, by keyword, to the parent
        initializer; this constructor adds no logic of its own.

        Args:
            learner (optional): a model to estimate outcomes and treatment effects
            outcome_learner (optional): a model to estimate outcomes
            effect_learner (optional): a model to estimate treatment effects. It needs to take `sample_weight` as an
                input argument for `fit()`
            propensity_learner (optional): a model to estimate propensity scores. `ElasticNetPropensityModel()` will
                be used by default. Note that this default object is created once, when the method is
                defined, so every instance relying on the default receives the same object.
            ate_alpha (float, optional): the confidence level alpha of the ATE estimate
            control_name (str or int, optional): name of control group
            n_fold (int, optional): the number of cross validation folds for outcome_learner
            random_state (int or RandomState, optional): a seed (int) or random number generator (RandomState)
        """
        # Pure pass-through: argument validation and attribute setup are left to the parent class.
        super().__init__(
            learner=learner,
            outcome_learner=outcome_learner,
            effect_learner=effect_learner,
            propensity_learner=propensity_learner,
            ate_alpha=ate_alpha,
            control_name=control_name,
            n_fold=n_fold,
            random_state=random_state,
        )
Esempio n. 6
0
    def _generate_data():
        """Build a treatment/control frame with propensity scores and a random group column.

        Control rows are appended twice after the treated rows, then a propensity
        model is fitted on the combined frame and its scores stored in SCORE_COL.

        Returns:
            (tuple): the assembled DataFrame and the list of feature column names.
        """
        # NOTE(review): `generated` comes from the enclosing scope, which is not visible here.
        # If it were ever truthy the return below would hit unbound locals - confirm it stays falsy.
        if not generated:
            _, X, treatment, _, _, _ = generate_regression_data()

            features = list(map('x{}'.format, range(X.shape[1])))
            frame = pd.DataFrame(X, columns=features)
            frame[TREATMENT_COL] = treatment

            controls = frame.loc[treatment == 0]
            treated = frame.loc[treatment == 1]

            # Treated rows first, followed by the control rows repeated twice.
            frame = pd.concat([treated, controls, controls],
                              axis=0,
                              ignore_index=True)

            propensity_model = ElasticNetPropensityModel(random_state=RANDOM_SEED)
            frame[SCORE_COL] = propensity_model.fit_predict(frame[features],
                                                            frame[TREATMENT_COL])
            frame[GROUP_COL] = np.random.randint(0, 2, size=frame.shape[0])

        return frame, features
Esempio n. 7
0
    def __init__(
        self,
        learner=None,
        outcome_learner=None,
        effect_learner=None,
        propensity_learner=ElasticNetPropensityModel(),
        ate_alpha=0.05,
        control_name=0,
        n_fold=5,
        random_state=None,
    ):
        """Initialize an R-learner.

        Either `learner` or both `outcome_learner` and `effect_learner` must be
        given. A dedicated model, when supplied, is stored as-is; otherwise a
        deep copy of `learner` fills the missing role.

        Args:
            learner (optional): a model to estimate outcomes and treatment effects
            outcome_learner (optional): a model to estimate outcomes
            effect_learner (optional): a model to estimate treatment effects. It needs to take `sample_weight` as an
                input argument for `fit()`
            propensity_learner (optional): a model to estimate propensity scores. `ElasticNetPropensityModel()` will
                be used by default.
            ate_alpha (float, optional): the confidence level alpha of the ATE estimate
            control_name (str or int, optional): name of control group
            n_fold (int, optional): the number of cross validation folds for outcome_learner
            random_state (int or RandomState, optional): a seed (int) or random number generator (RandomState)
        """
        has_dedicated_models = outcome_learner is not None and effect_learner is not None
        assert learner is not None or has_dedicated_models
        assert propensity_learner is not None

        # Outcome model: prefer the dedicated one, else an independent copy of `learner`.
        if outcome_learner is None:
            self.model_mu = deepcopy(learner)
        else:
            self.model_mu = outcome_learner

        # Effect model: same rule as the outcome model.
        if effect_learner is None:
            self.model_tau = deepcopy(learner)
        else:
            self.model_tau = effect_learner

        self.model_p = propensity_learner

        self.ate_alpha = ate_alpha
        self.control_name = control_name

        self.random_state = random_state
        # Shuffled K-fold splitter seeded with the same random_state.
        self.cv = KFold(n_splits=n_fold,
                        shuffle=True,
                        random_state=random_state)

        # Both start unset; nothing in this initializer populates them.
        self.propensity = None
        self.propensity_model = None
Esempio n. 8
0
def get_synthetic_preds_holdout(synthetic_data_func,
                                n=1000,
                                valid_size=0.2,
                                estimators=None):
    """Generate predictions for synthetic data using specified function (single simulation) for train and holdout

    Args:
        synthetic_data_func (function): synthetic data generation function. It is called as
            ``synthetic_data_func(n=n)`` and must return the 6-tuple ``(y, X, w, tau, b, e)``.
        n (int, optional): number of samples
        valid_size (float, optional): validation/hold out data size
        estimators (dict of object, optional): dict of names and objects of treatment effect estimators.
            Accepted for signature compatibility; this function does not currently read it and always
            uses the built-in S/T/X/R learners.

    Returns:
        (tuple): synthetic training and validation data dictionaries:

          - preds_dict_train (dict): synthetic training data dictionary
          - preds_dict_valid (dict): synthetic validation data dictionary
    """
    y, X, w, tau, b, e = synthetic_data_func(n=n)

    (X_train, X_val, y_train, y_val, w_train, w_val,
     tau_train, tau_val, b_train, b_val, e_train, e_val) = train_test_split(
        X, y, w, tau, b, e,
        test_size=valid_size, random_state=RANDOM_SEED, shuffle=True)

    preds_dict_train = {
        KEY_ACTUAL: tau_train,
        'generated_data': {
            'y': y_train,
            'X': X_train,
            'w': w_train,
            'tau': tau_train,
            'b': b_train,
            'e': e_train
        },
    }
    preds_dict_valid = {
        KEY_ACTUAL: tau_val,
        'generated_data': {
            'y': y_val,
            'X': X_val,
            'w': w_val,
            'tau': tau_val,
            'b': b_val,
            'e': e_val
        },
    }

    # Predict p_hat because e would not be directly observed in real-life
    p_model = ElasticNetPropensityModel()
    p_hat_train = p_model.fit_predict(X_train, w_train)
    # NOTE(review): fit_predict is called again on the validation split (presumably re-fitting
    # the model on it), so p_hat_val may not be an out-of-sample estimate - confirm this is intended.
    p_hat_val = p_model.fit_predict(X_val, w_val)

    for base_learner, label_l in zip(
        [BaseSRegressor, BaseTRegressor, BaseXRegressor, BaseRRegressor],
        ['S', 'T', 'X', 'R']):
        for model, label_m in zip([LinearRegression, XGBRegressor],
                                  ['LR', 'XGB']):
            # Same result key in both dictionaries; build it once per learner/model pair.
            name = '{} Learner ({})'.format(label_l, label_m)
            learner = base_learner(model())

            # RLearner will need to fit on the p_hat
            if label_l == 'R':
                learner.fit(X=X_train,
                            p=p_hat_train,
                            treatment=w_train,
                            y=y_train)
                preds_dict_train[name] = learner.predict(X=X_train).flatten()
                preds_dict_valid[name] = learner.predict(X=X_val).flatten()
                continue

            # fit the model on training data only
            learner.fit(X=X_train, treatment=w_train, y=y_train)
            # A predict() without a `p` keyword raises TypeError; retry with treatment/y instead.
            try:
                preds_dict_train[name] = learner.predict(
                    X=X_train, p=p_hat_train).flatten()
                preds_dict_valid[name] = learner.predict(
                    X=X_val, p=p_hat_val).flatten()
            except TypeError:
                preds_dict_train[name] = learner.predict(
                    X=X_train, treatment=w_train, y=y_train).flatten()
                preds_dict_valid[name] = learner.predict(
                    X=X_val, treatment=w_val, y=y_val).flatten()

    return preds_dict_train, preds_dict_valid