def test_elasticnet_propensity_model(generate_regression_data):
    """Fit the elastic-net propensity model and check it beats a 0.5 ROC AUC.

    Args:
        generate_regression_data: callable that returns the six-element tuple
            ``(y, X, treatment, tau, b, e)`` when invoked with no arguments.
    """
    # Only the features and the treatment indicator are needed here; the
    # remaining elements are unpacked so the six-element shape is still enforced.
    _y, X, treatment, _tau, _b, _e = generate_regression_data()

    propensity_model = ElasticNetPropensityModel(random_state=RANDOM_SEED)
    propensity_scores = propensity_model.fit_predict(X, treatment)

    auc = roc_auc_score(treatment, propensity_scores)
    assert auc > 0.5
def get_synthetic_preds(synthetic_data_func, n=1000, estimators=None):
    """Generate predictions for synthetic data using specified function (single simulation)

    Args:
        synthetic_data_func (function): synthetic data generation function. It is
            called as ``synthetic_data_func(n=n)`` and must return the tuple
            ``(y, X, w, tau, b, e)``.
        n (int, optional): number of samples
        estimators (dict of object, optional): dict of names and objects of
            treatment effect estimators. When omitted or empty, the S, T, X and R
            learners (each with ``LinearRegression`` and ``XGBRegressor``) plus a
            causal tree are used instead.

    Returns:
        (dict): dict of the actual and estimates of treatment effects. It holds
            ``tau`` under ``KEY_ACTUAL``, the generated data under
            ``KEY_GENERATED_DATA``, and one flattened prediction per estimator name.
    """
    y, X, w, tau, b, e = synthetic_data_func(n=n)

    preds_dict = {
        KEY_ACTUAL: tau,
        KEY_GENERATED_DATA: {
            "y": y,
            "X": X,
            "w": w,
            "tau": tau,
            "b": b,
            "e": e,
        },
    }

    # Predict p_hat because e would not be directly observed in real-life
    p_model = ElasticNetPropensityModel()
    p_hat = p_model.fit_predict(X, w)

    def _fit_predict(learner):
        # First try passing the estimated propensity score; if that call raises
        # TypeError (presumably because the learner does not take `p`), retry
        # without it.
        try:
            return learner.fit_predict(X=X, treatment=w, y=y, p=p_hat).flatten()
        except TypeError:
            return learner.fit_predict(X=X, treatment=w, y=y).flatten()

    if estimators:
        for name, learner in estimators.items():
            preds_dict[name] = _fit_predict(learner)
        return preds_dict

    # No estimators supplied: fall back to the built-in grid of meta-learners.
    meta_learners = (
        (BaseSRegressor, "S"),
        (BaseTRegressor, "T"),
        (BaseXRegressor, "X"),
        (BaseRRegressor, "R"),
    )
    base_models = ((LinearRegression, "LR"), (XGBRegressor, "XGB"))

    for base_learner, label_l in meta_learners:
        for model, label_m in base_models:
            model_name = "{} Learner ({})".format(label_l, label_m)
            preds_dict[model_name] = _fit_predict(base_learner(model()))

    # The causal tree is always fitted without a propensity score.
    causal_tree = CausalTreeRegressor(random_state=RANDOM_SEED)
    preds_dict["Causal Tree"] = causal_tree.fit_predict(X=X, treatment=w, y=y).flatten()

    return preds_dict
def get_synthetic_preds(synthetic_data_func, n=1000, estimators=None):
    """Generate predictions for synthetic data using specified function (single simulation)

    Args:
        synthetic_data_func (function): synthetic data generation function. It is
            called as ``synthetic_data_func(n=n)`` and must return the tuple
            ``(y, X, w, tau, b, e)``.
        n (int, optional): number of samples
        estimators (dict of object, optional): dict of names and objects of
            treatment effect estimators. When omitted or empty, the S, T, X and R
            learners (each with ``LinearRegression`` and ``XGBRegressor``) plus a
            causal tree are used instead.

    Returns:
        (dict): dict of the actual and estimates of treatment effects. It holds
            ``tau`` under ``KEY_ACTUAL``, the generated data under
            ``KEY_GENERATED_DATA``, and one flattened prediction per estimator name.
    """
    y, X, w, tau, b, e = synthetic_data_func(n=n)

    preds_dict = {
        KEY_ACTUAL: tau,
        KEY_GENERATED_DATA: {
            'y': y,
            'X': X,
            'w': w,
            'tau': tau,
            'b': b,
            'e': e,
        },
    }

    # Predict p_hat because e would not be directly observed in real-life
    p_model = ElasticNetPropensityModel()
    p_hat = p_model.fit_predict(X, w)

    def _fit_predict(learner):
        # First try passing the estimated propensity score; if that call raises
        # TypeError (presumably because the learner does not take `p`), retry
        # without it.
        try:
            return learner.fit_predict(X=X, p=p_hat, treatment=w, y=y).flatten()
        except TypeError:
            return learner.fit_predict(X=X, treatment=w, y=y).flatten()

    if estimators:
        for name, learner in estimators.items():
            preds_dict[name] = _fit_predict(learner)
        return preds_dict

    # No estimators supplied: fall back to the built-in grid of meta-learners.
    meta_learners = (
        (BaseSLearner, 'S'),
        (BaseTLearner, 'T'),
        (BaseXLearner, 'X'),
        (BaseRLearner, 'R'),
    )
    base_models = ((LinearRegression, 'LR'), (XGBRegressor, 'XGB'))

    for base_learner, label_l in meta_learners:
        for model, label_m in base_models:
            model_name = '{} Learner ({})'.format(label_l, label_m)
            preds_dict[model_name] = _fit_predict(base_learner(model()))

    # The causal tree is always fitted without a propensity score.
    causal_tree = CausalTreeRegressor(random_state=RANDOM_SEED)
    preds_dict['Causal Tree'] = causal_tree.fit_predict(X=X, treatment=w, y=y).flatten()

    return preds_dict
def _generate_data():
    """Assemble a DataFrame of features, treatment, propensity score and group.

    Returns:
        (tuple): ``(df, features)`` - the assembled DataFrame and the list of
            feature column names (``x0``, ``x1``, ...).
    """
    # NOTE(review): `generated` is not defined in this block; presumably it is a
    # flag owned by the enclosing scope - confirm against the surrounding code.
    if not generated:
        y, X, treatment, tau, b, e = generate_regression_data()

        # One column name per column of X.
        features = ['x{}'.format(i) for i in range(X.shape[1])]
        df = pd.DataFrame(X, columns=features)
        df[TREATMENT_COL] = treatment

        # Split rows by treatment value (0 vs 1).
        df_c = df.loc[treatment == 0]
        df_t = df.loc[treatment == 1]

        # Stack the treatment == 1 rows once and the treatment == 0 rows twice,
        # renumbering the index from zero.
        df = pd.concat([df_t, df_c, df_c], axis=0, ignore_index=True)

        # Fit the propensity model on the stacked frame and store its scores.
        pm = ElasticNetPropensityModel(random_state=RANDOM_SEED)
        ps = pm.fit_predict(df[features], df[TREATMENT_COL])
        df[SCORE_COL] = ps
        # Random 0/1 group label per row (np.random is not seeded in this block).
        df[GROUP_COL] = np.random.randint(0, 2, size=df.shape[0])

    # NOTE(review): if `generated` is truthy nothing above runs and `df` /
    # `features` would be unbound here - confirm the intended behaviour.
    return df, features
def get_synthetic_preds_holdout(synthetic_data_func, n=1000, valid_size=0.2, estimators=None):
    """Generate predictions for synthetic data using specified function (single simulation) for train and
    holdout

    Args:
        synthetic_data_func (function): synthetic data generation function. It is
            called as ``synthetic_data_func(n=n)`` and must return the tuple
            ``(y, X, w, tau, b, e)``.
        n (int, optional): number of samples
        valid_size (float, optional): validation/hold out data size
        estimators (dict of object, optional): dict of names and objects of treatment effect estimators.
            This argument is accepted but not read by this function; the S, T, X
            and R learners are always used.

    Returns:
        (tuple): synthetic training and validation data dictionaries:

          - preds_dict_train (dict): synthetic training data dictionary
          - preds_dict_valid (dict): synthetic validation data dictionary
    """
    y, X, w, tau, b, e = synthetic_data_func(n=n)

    (X_train, X_val, y_train, y_val, w_train, w_val,
     tau_train, tau_val, b_train, b_val, e_train, e_val) = train_test_split(
        X, y, w, tau, b, e, test_size=valid_size, random_state=RANDOM_SEED, shuffle=True)

    preds_dict_train = {
        KEY_ACTUAL: tau_train,
        'generated_data': {
            'y': y_train,
            'X': X_train,
            'w': w_train,
            'tau': tau_train,
            'b': b_train,
            'e': e_train,
        },
    }
    preds_dict_valid = {
        KEY_ACTUAL: tau_val,
        'generated_data': {
            'y': y_val,
            'X': X_val,
            'w': w_val,
            'tau': tau_val,
            'b': b_val,
            'e': e_val,
        },
    }

    # Predict p_hat because e would not be directly observed in real-life
    # NOTE(review): the propensity model is re-fit on the validation split rather
    # than reusing the fit from the training split - confirm this is intended.
    p_model = ElasticNetPropensityModel()
    p_hat_train = p_model.fit_predict(X_train, w_train)
    p_hat_val = p_model.fit_predict(X_val, w_val)

    meta_learners = (
        (BaseSRegressor, 'S'),
        (BaseTRegressor, 'T'),
        (BaseXRegressor, 'X'),
        (BaseRRegressor, 'R'),
    )
    base_models = ((LinearRegression, 'LR'), (XGBRegressor, 'XGB'))

    for base_learner, label_l in meta_learners:
        for model, label_m in base_models:
            model_name = '{} Learner ({})'.format(label_l, label_m)
            learner = base_learner(model())

            if label_l == 'R':
                # RLearner will need to fit on the p_hat
                learner.fit(X=X_train, p=p_hat_train, treatment=w_train, y=y_train)
                preds_dict_train[model_name] = learner.predict(X=X_train).flatten()
                preds_dict_valid[model_name] = learner.predict(X=X_val).flatten()
                continue

            # fit the model on training data only
            learner.fit(X=X_train, treatment=w_train, y=y_train)
            # Both splits sit in one try so that a TypeError on either call
            # re-predicts both splits with the fallback signature.
            try:
                preds_dict_train[model_name] = learner.predict(
                    X=X_train, p=p_hat_train).flatten()
                preds_dict_valid[model_name] = learner.predict(
                    X=X_val, p=p_hat_val).flatten()
            except TypeError:
                preds_dict_train[model_name] = learner.predict(
                    X=X_train, treatment=w_train, y=y_train).flatten()
                preds_dict_valid[model_name] = learner.predict(
                    X=X_val, treatment=w_val, y=y_val).flatten()

    return preds_dict_train, preds_dict_valid