Example #1
from econml.metalearners import SLearner, TLearner, XLearner
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegressionCV, RidgeCV


def metalearner(outcome, treatment, data, est='T', method='linear'):
    """Fit a T-, S-, or X-learner and return its estimated ATE."""
    if method == 'linear':
        models = RidgeCV()
        propensity_model = LogisticRegressionCV()
    elif method == 'GBR':
        models = GradientBoostingRegressor()
        propensity_model = GradientBoostingClassifier()
    else:
        raise ValueError(f"unknown method: {method!r}")

    # everything that is neither outcome nor treatment is a feature
    X = data.drop(columns=[outcome, treatment])
    if est == 'T':
        learner = TLearner(models=models)
    elif est == 'S':
        learner = SLearner(overall_model=models)
    elif est == 'X':
        learner = XLearner(models=models, propensity_model=propensity_model)
    else:
        raise ValueError(f"unknown estimator: {est!r}")

    learner.fit(data[outcome], data[treatment], X=X)
    return learner.ate(X=X)
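For context, a minimal usage sketch of the helper above. The column names (`x1`, `x2`, `treat`, `y`), the synthetic data, and the true effect of 2 are illustrative assumptions, not from the source:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 500
df = pd.DataFrame({'x1': rng.normal(size=n), 'x2': rng.normal(size=n)})
df['treat'] = rng.binomial(1, 0.5, size=n)
df['y'] = 2.0 * df['treat'] + df['x1'] + rng.normal(size=n)  # true ATE = 2

# T-learner with ridge outcome models; the point estimate should be near 2
print(metalearner('y', 'treat', df, est='T', method='linear'))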
Example #2
    def test_metalearners(self):
        X = TestPandasIntegration.df[TestPandasIntegration.features]
        W = TestPandasIntegration.df[TestPandasIntegration.controls]
        Y = TestPandasIntegration.df[TestPandasIntegration.outcome]
        T = TestPandasIntegration.df[TestPandasIntegration.bin_treat]
        # Test XLearner
        # Skipping population summary names test because bootstrap inference is too slow
        est = XLearner(models=GradientBoostingRegressor(),
                       propensity_model=GradientBoostingClassifier(),
                       cate_models=GradientBoostingRegressor())
        est.fit(Y, T, X=np.hstack([X, W]))
        treatment_effects = est.effect(np.hstack([X, W]))
        # Test SLearner
        est = SLearner(overall_model=GradientBoostingRegressor())
        est.fit(Y, T, X=np.hstack([X, W]))
        treatment_effects = est.effect(np.hstack([X, W]))
        # Test TLearner
        est = TLearner(models=GradientBoostingRegressor())
        est.fit(Y, T, X=np.hstack([X, W]))
        treatment_effects = est.effect(np.hstack([X, W]))
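The test relies on class-level fixtures (`TestPandasIntegration.df`, `.features`, `.controls`, `.outcome`, `.bin_treat`) defined elsewhere in the test module. A hypothetical minimal fixture consistent with those references (the column names and synthetic data are assumptions):

import unittest
import numpy as np
import pandas as pd

class TestPandasIntegration(unittest.TestCase):
    # attribute names mirror the test above; the data is illustrative
    features = ['x0', 'x1']
    controls = ['w0']
    outcome = 'y'
    bin_treat = 't'
    _rng = np.random.RandomState(0)
    df = pd.DataFrame(_rng.normal(size=(200, 3)), columns=features + controls)
    df[bin_treat] = _rng.binomial(1, 0.5, size=200)
    df[outcome] = df['x0'] + 2 * df[bin_treat] + _rng.normal(size=200)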
Example #3
    def test_comparison(self):
        def reg():
            return LinearRegression()

        def clf():
            return LogisticRegression()

        y, T, X, true_eff = self._get_data()
        (X_train, X_val, T_train, T_val, Y_train, Y_val, _,
         true_eff_val) = train_test_split(X, T, y, true_eff, test_size=.4)

        models = [
            ('ldml',
             LinearDML(model_y=reg(),
                       model_t=clf(),
                       discrete_treatment=True,
                       linear_first_stages=False,
                       cv=3)),
            ('sldml',
             SparseLinearDML(model_y=reg(),
                             model_t=clf(),
                             discrete_treatment=True,
                             featurizer=PolynomialFeatures(degree=2,
                                                           include_bias=False),
                             linear_first_stages=False,
                             cv=3)),
            ('xlearner',
             XLearner(models=reg(), cate_models=reg(),
                      propensity_model=clf())),
            ('dalearner',
             DomainAdaptationLearner(models=reg(),
                                     final_models=reg(),
                                     propensity_model=clf())),
            ('slearner', SLearner(overall_model=reg())),
            ('tlearner', TLearner(models=reg())),
            ('drlearner',
             DRLearner(model_propensity=clf(),
                       model_regression=reg(),
                       model_final=reg(),
                       cv=3)),
            ('rlearner',
             NonParamDML(model_y=reg(),
                         model_t=clf(),
                         model_final=reg(),
                         discrete_treatment=True,
                         cv=3)),
            ('dml3dlasso',
             DML(model_y=reg(),
                 model_t=clf(),
                 model_final=reg(),
                 discrete_treatment=True,
                 featurizer=PolynomialFeatures(degree=3),
                 linear_first_stages=False,
                 cv=3))
        ]

        # fit all candidate estimators in parallel
        models = Parallel(n_jobs=-1, verbose=1)(
            delayed(_fit_model)(name, mdl, Y_train, T_train, X_train)
            for name, mdl in models)

        scorer = RScorer(model_y=reg(),
                         model_t=clf(),
                         discrete_treatment=True,
                         cv=3,
                         mc_iters=2,
                         mc_agg='median')
        scorer.fit(Y_val, T_val, X=X_val)
        # R-score of each fitted model on the validation set (higher is better)
        rscore = [scorer.score(mdl) for _, mdl in models]
        # root-PEHE: RMSE between the true and estimated treatment effects
        rootpehe_score = [
            np.sqrt(
                np.mean(
                    (true_eff_val.flatten() - mdl.effect(X_val).flatten())**2))
            for _, mdl in models
        ]
        # a higher R-score should track a lower root-PEHE (negative slope)
        assert LinearRegression().fit(
            np.array(rscore).reshape(-1, 1),
            np.array(rootpehe_score)).coef_ < 0.5
        # the best single model by R-score should be near-optimal in root-PEHE
        mdl, _ = scorer.best_model([mdl for _, mdl in models])
        rootpehe_best = np.sqrt(
            np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2))
        assert rootpehe_best < 1.2 * np.min(rootpehe_score)
        # the R-score-weighted ensemble should also be near-optimal
        mdl, _ = scorer.ensemble([mdl for _, mdl in models])
        rootpehe_ensemble = np.sqrt(
            np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2))
        assert rootpehe_ensemble < 1.2 * np.min(rootpehe_score)
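`_fit_model` is a module-level helper not shown in the snippet (joblib needs a picklable top-level function). A minimal sketch consistent with the call site, assuming EconML's keyword-only `X`:

def _fit_model(name, model, Y, T, X):
    # fit one candidate estimator and return it alongside its label
    return name, model.fit(Y, T, X=X)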
Example #4
class TRF:
    def __init__(self,
                 model,
                 df,
                 completed_start,
                 completed_end,
                 updating_start,
                 updating_end,
                 datetime_name,
                 target_name,
                 treatment_name,
                 treatment_set,
                 features_list=None,
                 drop_list=None,
                 n_estimator=1):
        self.completed_start = completed_start
        self.completed_end = completed_end
        self.updating_start = updating_start
        self.updating_end = updating_end
        self.datetime_name = datetime_name
        self.target_name = target_name
        self.treatment_name = treatment_name
        self.treatment_set = treatment_set
        self.n_estimator = n_estimator
        self._model = model
        self.target_effect = target_prep_list(self.target_name, 'effect')

        if features_list is not None:
            X, y, t = X_y_t_split(self._complete_treatment(
                df[(df[self.datetime_name] >= self.completed_start)
                   & (df[self.datetime_name] <= self.completed_end)]),
                                  self.target_name,
                                  features_list=features_list,
                                  treatment_name=self.treatment_name)
        elif drop_list is not None:
            X, y, t = X_y_t_split(self._complete_treatment(
                df[(df[self.datetime_name] >= self.completed_start)
                   & (df[self.datetime_name] <= self.completed_end)]),
                                  self.target_name,
                                  drop_list=drop_list,
                                  treatment_name=self.treatment_name)
        else:
            raise ValueError('either features_list or drop_list is required')
        self.features = X.columns

        if self._model == 'classifier':
            self.model = TLearner(
                models=RandomForestClassifier(n_estimators=(
                    (self.completed_end - self.completed_start).days + 1) *
                                              self.n_estimator,
                                              random_state=0,
                                              n_jobs=-1,
                                              warm_start=True,
                                              oob_score=True)).fit(
                                                  y.values, t.values, X=X.values)
        elif self._model == 'regressor':
            self.model = TLearner(
                models=RandomForestRegressor(n_estimators=(
                    (self.completed_end - self.completed_start).days + 1) *
                                             self.n_estimator,
                                             random_state=0,
                                             n_jobs=-1,
                                             warm_start=True,
                                             oob_score=True)).fit(
                                                 y.values, t.values, X=X.values)

        # With warm_start=True, raising n_estimators before the next fit grows
        # extra trees for the updating window instead of retraining from scratch.
        for model in self.model.models:
            model.n_estimators += (
                (self.updating_end - self.updating_start).days +
                1) * self.n_estimator

        X, y, t = X_y_t_split(self._complete_treatment(
            df[(df[self.datetime_name] >= self.updating_start)
               & (df[self.datetime_name] <= self.updating_end)]),
                              self.target_name,
                              features_list=self.features,
                              treatment_name=self.treatment_name)
        self.model.fit(y.values, t.values, X=X.values)

    def update(self, df, completed_end, updating_start, updating_end):
        self.completed_end, self.updating_start, self.updating_end = (
            completed_end, updating_start, updating_end)
        updating_len = (self.updating_end - self.updating_start).days + 1

        # Roll the window: discard the trees grown on the previous updating
        # window, then budget one day's worth of new trees for the day that
        # has just completed.
        for model in self.model.models:
            model.estimators_ = model.estimators_[:-updating_len *
                                                  self.n_estimator]
            model.n_estimators -= updating_len * self.n_estimator
            model.n_estimators += self.n_estimator

        X, y, t = X_y_t_split(self._complete_treatment(
            df[df[self.datetime_name] == self.completed_end]),
                              self.target_name,
                              features_list=self.features,
                              treatment_name=self.treatment_name)
        self.model.fit(y.values, t.values, X=X.values)

        # then budget trees for the new updating window and refit on it
        for model in self.model.models:
            model.n_estimators += updating_len * self.n_estimator

        X, y, t = X_y_t_split(self._complete_treatment(
            df[(df[self.datetime_name] >= self.updating_start)
               & (df[self.datetime_name] <= self.updating_end)]),
                              self.target_name,
                              features_list=self.features,
                              treatment_name=self.treatment_name)
        self.model.fit(y.values, t.values, X=X.values)

    def effect(self, df):
        if self._model == 'classifier':
            yeffect = self.model.effect(df[self.features].values)[:, 1]
        elif self._model == 'regressor':
            yeffect = self.model.effect(df[self.features].values)

        target_len = len(self.target_name)
        if target_len == 1:
            df[self.target_effect] = yeffect
        elif target_len > 1:
            for (i, target_effect) in enumerate(self.target_effect):
                df[target_effect] = yeffect[:, i]

        return df.sort_values(self.target_effect, ascending=False)

    def _complete_treatment(self, df):
        treatment_diff = self.treatment_set.difference(
            set(df[self.treatment_name]))
        if len(treatment_diff) > 0:
            for treatment in treatment_diff:
                # add a zero-filled placeholder row so every treatment level
                # is present (DataFrame.append was removed in pandas 2.0)
                blank_row = pd.DataFrame(np.zeros((1, len(df.columns))),
                                         columns=df.columns)
                blank_row[self.treatment_name] = treatment
                df = pd.concat([df, blank_row], ignore_index=True)

        return df
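The TRF class depends on two helpers, `target_prep_list` and `X_y_t_split`, that are not shown. The sketches below are inferred from the call sites and should be treated as assumptions, not the original implementations:

def target_prep_list(target_name, suffix):
    # Derive '<target>_<suffix>' column names for one or more target columns.
    names = [target_name] if isinstance(target_name, str) else list(target_name)
    return [f'{name}_{suffix}' for name in names]

def X_y_t_split(df, target_name, treatment_name=None,
                features_list=None, drop_list=None):
    # Split a frame into features X, outcome(s) y, and treatment t.
    targets = [target_name] if isinstance(target_name, str) else list(target_name)
    y = df[target_name]
    t = df[treatment_name]
    if features_list is not None:
        X = df[list(features_list)]
    else:
        X = df.drop(columns=list(drop_list) + targets + [treatment_name],
                    errors='ignore')
    return X, y, t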