def test_weighted_variance():
    """Check ``get_weighted_variance`` on equal and on integer sample weights.

    Floating-point results are compared with ``np.isclose`` instead of ``==``:
    the compared variances are computed along different code paths (and, in the
    second case, over a different number of terms), so they are only expected
    to agree up to rounding error.
    """
    x = np.array([1, 2, 3, 4, 5])
    sample_weight_equal = np.ones(len(x))

    var_x = get_weighted_variance(x, sample_weight_equal)
    # with equal sample weights the result should match the plain variance
    assert np.isclose(var_x, x.var())

    # x1 is x with the last two observations duplicated, which corresponds to
    # giving those observations a weight of 2 in the original array
    x1 = np.array([1, 2, 3, 4, 4, 5, 5])
    sample_weight_equal = np.ones(len(x1))
    sample_weight = [1, 1, 1, 2, 2]

    var_x2 = get_weighted_variance(x, sample_weight)
    var_x1 = get_weighted_variance(x1, sample_weight_equal)

    # duplicating observations according to their sample weight should give
    # the same variance as weighting them
    assert np.isclose(var_x1, var_x2)
def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
    """Train the outcome model and one R-loss effect model per treatment group.

    Outcome predictions are obtained with ``cross_val_predict`` on
    ``self.model_mu``. For each non-control group a deep copy of
    ``self.model_tau`` is fitted on ``(y - yhat) / (w - p)`` with weights
    ``(w - p) ** 2`` (multiplied by ``sample_weight`` when it is given). The
    variances of the control and treated residuals are stored per group in
    ``self.vars_c`` and ``self.vars_t``.

    Args:
        X (np.matrix or np.array or pd.Dataframe): a feature matrix
        treatment (np.array or pd.Series): a treatment vector
        y (np.array or pd.Series): an outcome vector
        p (np.ndarray or pd.Series or dict, optional): an array of propensity
            scores of float (0,1) in the single-treatment case; or, a
            dictionary of treatment groups that map to propensity vectors of
            float (0,1); if None will run ElasticNetPropensityModel() to
            generate the propensity scores.
        sample_weight (np.array or pd.Series, optional): an array of sample
            weights indicating the weight of each observation for
            `effect_learner`. If None, it assumes equal weight.
        verbose (bool, optional): whether to output progress logs
    """
    X, treatment, y = convert_pd_to_np(X, treatment, y)
    check_treatment_vector(treatment, self.control_name)

    has_sample_weight = sample_weight is not None
    if has_sample_weight:
        assert len(sample_weight) == len(
            y
        ), "Data length must be equal for sample_weight and the input data"
        sample_weight = convert_pd_to_np(sample_weight)

    # every treatment label other than the control one is a group to model
    self.t_groups = np.unique(treatment[treatment != self.control_name])
    self.t_groups.sort()

    if p is not None:
        p = self._format_p(p, self.t_groups)
    else:
        self._set_propensity_models(X=X, treatment=treatment, y=y)
        p = self.propensity

    self._classes = {name: index for index, name in enumerate(self.t_groups)}
    self.models_tau = {name: deepcopy(self.model_tau) for name in self.t_groups}
    self.vars_c = {}
    self.vars_t = {}

    if verbose:
        logger.info("generating out-of-fold CV outcome estimates")
    yhat = cross_val_predict(self.model_mu, X, y, cv=self.cv, n_jobs=-1)

    for group in self.t_groups:
        # keep only the rows belonging to this group or to the control group
        rows = (treatment == group) | (treatment == self.control_name)
        w = (treatment[rows] == group).astype(int)
        X_rows = X[rows]
        y_rows = y[rows]
        yhat_rows = yhat[rows]
        w_minus_p = w - p[group][rows]
        r_weight = w_minus_p ** 2

        is_control = w == 0
        is_treated = w == 1
        resid_control = y_rows[is_control] - yhat_rows[is_control]
        resid_treated = y_rows[is_treated] - yhat_rows[is_treated]

        if not has_sample_weight:
            self.vars_c[group] = resid_control.var()
            self.vars_t[group] = resid_treated.var()
        else:
            sw_rows = sample_weight[rows]
            self.vars_c[group] = get_weighted_variance(
                resid_control, sw_rows[is_control]
            )
            self.vars_t[group] = get_weighted_variance(
                resid_treated, sw_rows[is_treated]
            )
            # fold the observation weights into the R-loss weights
            r_weight *= sw_rows

        if verbose:
            logger.info(
                "training the treatment effect model for {} with R-loss".format(
                    group
                )
            )
        self.models_tau[group].fit(
            X_rows, (y_rows - yhat_rows) / w_minus_p, sample_weight=r_weight
        )
def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
    """Fit the treatment effect and outcome models of the R learner.

    Outcome predictions are obtained with ``cross_val_predict`` on
    ``self.model_mu``. For each non-control treatment group the corresponding
    model in ``self.models_tau`` is fitted on ``(y - yhat) / (w - p)`` with
    weights ``sample_weight * (w - p) ** 2``. When ``self.early_stopping`` is
    set, the group's data is first split with ``train_test_split`` and the
    held-out part is passed to the effect model as its evaluation set.
    Weighted variances of the control and treated residuals are stored per
    group in ``self.vars_c`` and ``self.vars_t``.

    Args:
        X (np.matrix or np.array or pd.Dataframe): a feature matrix
        treatment (np.array or pd.Series): a treatment vector
        y (np.array or pd.Series): an outcome vector
        p (np.ndarray or pd.Series or dict, optional): an array of propensity
            scores of float (0,1) in the single-treatment case; or, a
            dictionary of treatment groups that map to propensity vectors of
            float (0,1); if None will run ElasticNetPropensityModel() to
            generate the propensity scores.
        sample_weight (np.array or pd.Series, optional): an array of sample
            weights indicating the weight of each observation for
            `effect_learner`. If None, it assumes equal weight.
        verbose (bool, optional): whether to output progress logs

    Raises:
        AssertionError: if ``sample_weight`` and ``y`` differ in length.
    """
    X, treatment, y = convert_pd_to_np(X, treatment, y)
    check_treatment_vector(treatment, self.control_name)
    # initialize equal sample weight if it's not provided, so that the rest of
    # the method can treat the weighted and unweighted cases uniformly
    sample_weight = (
        convert_pd_to_np(sample_weight)
        if sample_weight is not None
        else convert_pd_to_np(np.ones(len(y)))
    )
    assert len(sample_weight) == len(
        y
    ), "Data length must be equal for sample_weight and the input data"

    # every treatment label other than the control one is a group to model
    self.t_groups = np.unique(treatment[treatment != self.control_name])
    self.t_groups.sort()

    if p is None:
        self._set_propensity_models(X=X, treatment=treatment, y=y)
        p = self.propensity
    else:
        p = self._format_p(p, self.t_groups)

    self._classes = {group: i for i, group in enumerate(self.t_groups)}
    self.models_tau = {group: deepcopy(self.model_tau) for group in self.t_groups}
    self.vars_c = {}
    self.vars_t = {}

    if verbose:
        logger.info("generating out-of-fold CV outcome estimates")
    yhat = cross_val_predict(self.model_mu, X, y, cv=self.cv, n_jobs=-1)

    for group in self.t_groups:
        # keep only the rows belonging to this group or to the control group
        treatment_mask = (treatment == group) | (treatment == self.control_name)
        treatment_filt = treatment[treatment_mask]
        w = (treatment_filt == group).astype(int)

        X_filt = X[treatment_mask]
        y_filt = y[treatment_mask]
        yhat_filt = yhat[treatment_mask]
        p_filt = p[group][treatment_mask]
        sample_weight_filt = sample_weight[treatment_mask]

        if verbose:
            logger.info(
                "training the treatment effect model for {} with R-loss".format(
                    group
                )
            )

        if self.early_stopping:
            # split every per-row array with the same shuffle so that the
            # train and evaluation parts stay aligned
            (
                X_train_filt,
                X_test_filt,
                y_train_filt,
                y_test_filt,
                yhat_train_filt,
                yhat_test_filt,
                w_train,
                w_test,
                p_train_filt,
                p_test_filt,
                sample_weight_train_filt,
                sample_weight_test_filt,
            ) = train_test_split(
                X_filt,
                y_filt,
                yhat_filt,
                w,
                p_filt,
                sample_weight_filt,
                test_size=self.test_size,
                random_state=self.random_state,
            )

            # NOTE(review): these keyword arguments assume the effect learner
            # exposes an xgboost-style ``fit`` signature — confirm against the
            # installed learner version.
            self.models_tau[group].fit(
                X=X_train_filt,
                y=(y_train_filt - yhat_train_filt) / (w_train - p_train_filt),
                sample_weight=sample_weight_train_filt
                * ((w_train - p_train_filt) ** 2),
                eval_set=[
                    (
                        X_test_filt,
                        (y_test_filt - yhat_test_filt) / (w_test - p_test_filt),
                    )
                ],
                sample_weight_eval_set=[
                    sample_weight_test_filt * ((w_test - p_test_filt) ** 2)
                ],
                eval_metric=self.effect_learner_eval_metric,
                early_stopping_rounds=self.early_stopping_rounds,
                verbose=verbose,
            )
        else:
            self.models_tau[group].fit(
                X_filt,
                (y_filt - yhat_filt) / (w - p_filt),
                sample_weight=sample_weight_filt * ((w - p_filt) ** 2),
                eval_metric=self.effect_learner_eval_metric,
            )

        # residual variances are computed on all rows of the group,
        # regardless of whether early stopping split them for training
        diff_c = y_filt[w == 0] - yhat_filt[w == 0]
        diff_t = y_filt[w == 1] - yhat_filt[w == 1]
        sample_weight_filt_c = sample_weight_filt[w == 0]
        sample_weight_filt_t = sample_weight_filt[w == 1]
        self.vars_c[group] = get_weighted_variance(diff_c, sample_weight_filt_c)
        self.vars_t[group] = get_weighted_variance(diff_t, sample_weight_filt_t)