Ejemplo n.º 1
0
    def fit(self, X, treatment, y):
        """Train the per-group outcome and treatment-effect models.

        Every value of ``treatment`` other than ``self.control_name`` is
        handled as its own treatment group. For each group, only the rows of
        that group and of the control are used to:

        1. fit one outcome model on the control rows and one on the treated rows,
        2. store the variance of each outcome model's in-sample residuals in
           ``self.vars_c`` / ``self.vars_t``,
        3. fit one effect model on the control rows and one on the treated
           rows, using imputed effects built from the opposite arm's outcome
           model as targets.

        Args:
            X (np.matrix): a feature matrix
            treatment (np.array): a treatment vector
            y (np.array): an outcome vector
        """
        check_control_in_treatment(treatment, self.control_name)

        groups = np.unique(treatment[treatment != self.control_name])
        groups.sort()
        self.t_groups = groups
        self._classes = {name: idx for idx, name in enumerate(groups)}

        def _clone_for_groups(template):
            # Each group gets an independent copy of the template model.
            return {name: deepcopy(template) for name in groups}

        self.models_mu_c = _clone_for_groups(self.model_mu_c)
        self.models_mu_t = _clone_for_groups(self.model_mu_t)
        self.models_tau_c = _clone_for_groups(self.model_tau_c)
        self.models_tau_t = _clone_for_groups(self.model_tau_t)
        self.vars_c = {}
        self.vars_t = {}

        for name in groups:
            # Restrict to the current treatment group plus the control rows.
            in_scope = (treatment == name) | (treatment == self.control_name)
            X_sub = X[in_scope]
            y_sub = y[in_scope]
            w = (treatment[in_scope] == name).astype(int)

            is_control = w == 0
            is_treated = w == 1
            X_control, y_control = X_sub[is_control], y_sub[is_control]
            X_treated, y_treated = X_sub[is_treated], y_sub[is_treated]

            outcome_c = self.models_mu_c[name]
            outcome_t = self.models_mu_t[name]

            # Outcome models: one per arm.
            outcome_c.fit(X_control, y_control)
            outcome_t.fit(X_treated, y_treated)

            # Variance of the in-sample residuals of each outcome model.
            self.vars_c[name] = (y_control - outcome_c.predict(X_control)).var()
            self.vars_t[name] = (y_treated - outcome_t.predict(X_treated)).var()

            # Imputed effects: each arm is compared against the prediction of
            # the outcome model trained on the other arm.
            effect_control = outcome_t.predict(X_control) - y_control
            effect_treated = y_treated - outcome_c.predict(X_treated)
            self.models_tau_c[name].fit(X_control, effect_control)
            self.models_tau_t[name].fit(X_treated, effect_treated)
Ejemplo n.º 2
0
    def fit(self, X, p, treatment, y, verbose=True):
        """Fit the treatment effect and outcome models of the R learner.

        Out-of-fold outcome estimates are obtained once for all rows with
        ``cross_val_predict(..., method='predict_proba')``; the second column
        of that output is used as the outcome estimate. Then, for each
        treatment group, an effect model is fitted on the group's rows plus the
        control rows with target ``(y - yhat) / (w - p)`` and sample weight
        ``(w - p) ** 2``. The variances of ``y - yhat`` over the control rows
        and over the treated rows are stored in ``self.vars_c`` and
        ``self.vars_t``.

        Args:
            X (np.matrix): a feature matrix
            p (np.ndarray or dict): an array of propensity scores of float (0,1) in the single-treatment case
                                    or, a dictionary of treatment groups that map to propensity vectors of float (0,1)
            treatment (np.array): a treatment vector
            y (np.array): an outcome vector
            verbose (bool, optional): whether to log progress messages
        """
        check_control_in_treatment(treatment, self.control_name)
        self.t_groups = np.unique(treatment[treatment != self.control_name])
        self.t_groups.sort()
        check_p_conditions(p, self.t_groups)
        if isinstance(p, np.ndarray):
            # A bare array is keyed by the first (sorted) treatment group.
            p = {self.t_groups[0]: p}

        self._classes = {name: idx for idx, name in enumerate(self.t_groups)}
        self.models_tau = {name: deepcopy(self.model_tau) for name in self.t_groups}
        self.vars_c = {}
        self.vars_t = {}

        if verbose:
            logger.info('generating out-of-fold CV outcome estimates')
        oof_proba = cross_val_predict(self.model_mu, X, y,
                                      cv=self.cv,
                                      method='predict_proba',
                                      n_jobs=-1)
        yhat = oof_proba[:, 1]

        for name in self.t_groups:
            # Keep only the current treatment group and the control rows.
            keep = (treatment == name) | (treatment == self.control_name)
            w = (treatment[keep] == name).astype(int)
            X_sub = X[keep]
            p_sub = p[name][keep]

            outcome_resid = y[keep] - yhat[keep]
            treatment_resid = w - p_sub

            if verbose:
                logger.info(
                    'training the treatment effect model for {} with R-loss'.
                    format(name))
            self.models_tau[name].fit(X_sub,
                                      outcome_resid / treatment_resid,
                                      sample_weight=treatment_resid**2)

            self.vars_c[name] = outcome_resid[w == 0].var()
            self.vars_t[name] = outcome_resid[w == 1].var()
Ejemplo n.º 3
0
    def fit(self, X, p, treatment, y, verbose=True):
        """Fit the treatment effect and outcome models of the R learner.

        Out-of-fold outcome estimates ``yhat`` are obtained once for all rows
        with ``cross_val_predict``. For each treatment group an effect model is
        then fitted on that group's rows plus the control rows, with target
        ``(y - yhat) / (w - p)`` and sample weight ``(w - p) ** 2``.

        When ``self.early_stopping`` is truthy, the filtered rows are split
        with ``train_test_split`` and the held-out part is passed to the effect
        model as an evaluation set; otherwise all filtered rows are used for
        fitting. In both cases the effect model's ``fit`` must accept the
        keyword arguments passed below (``eval_metric`` and, with early
        stopping, ``eval_set``, ``sample_weight_eval_set`` and
        ``early_stopping_rounds``).

        Args:
            X (np.matrix): a feature matrix
            p (np.ndarray or dict): an array of propensity scores of float (0,1) in the single-treatment case
                                    or, a dictionary of treatment groups that map to propensity vectors of float (0,1)
            treatment (np.array): a treatment vector
            y (np.array): an outcome vector
            verbose (bool, optional): whether to log progress messages; also forwarded to the effect model's
                                      ``fit`` when early stopping is used
        """
        check_control_in_treatment(treatment, self.control_name)
        self.t_groups = np.unique(treatment[treatment != self.control_name])
        self.t_groups.sort()
        check_p_conditions(p, self.t_groups)
        if isinstance(p, np.ndarray):
            # A bare array is keyed by the first (sorted) treatment group.
            p = {self.t_groups[0]: p}

        self._classes = {name: idx for idx, name in enumerate(self.t_groups)}
        self.models_tau = {name: deepcopy(self.model_tau) for name in self.t_groups}
        self.vars_c = {}
        self.vars_t = {}

        if verbose:
            logger.info('generating out-of-fold CV outcome estimates')
        yhat = cross_val_predict(self.model_mu, X, y, cv=self.cv, n_jobs=-1)

        for name in self.t_groups:
            # Keep only the current treatment group and the control rows.
            keep = (treatment == name) | (treatment == self.control_name)
            w = (treatment[keep] == name).astype(int)

            X_sub = X[keep]
            y_sub = y[keep]
            yhat_sub = yhat[keep]
            p_sub = p[name][keep]

            if verbose:
                logger.info(
                    'training the treatment effect model for {} with R-loss'.
                    format(name))

            effect_model = self.models_tau[name]

            if self.early_stopping:
                (X_train, X_test,
                 y_train, y_test,
                 yhat_train, yhat_test,
                 w_train, w_test,
                 p_train, p_test) = train_test_split(
                    X_sub, y_sub, yhat_sub, w, p_sub,
                    test_size=self.test_size, random_state=self.random_state
                )

                resid_w_train = w_train - p_train
                resid_w_test = w_test - p_test
                target_train = (y_train - yhat_train) / resid_w_train
                target_test = (y_test - yhat_test) / resid_w_test

                effect_model.fit(
                    X=X_train,
                    y=target_train,
                    sample_weight=resid_w_train**2,
                    eval_set=[(X_test, target_test)],
                    sample_weight_eval_set=[resid_w_test**2],
                    eval_metric=self.effect_learner_eval_metric,
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=verbose)
            else:
                resid_w = w - p_sub
                effect_model.fit(
                    X_sub, (y_sub - yhat_sub) / resid_w,
                    sample_weight=resid_w**2,
                    eval_metric=self.effect_learner_eval_metric)

            is_control = w == 0
            is_treated = w == 1
            self.vars_c[name] = (y_sub[is_control] - yhat_sub[is_control]).var()
            self.vars_t[name] = (y_sub[is_treated] - yhat_sub[is_treated]).var()
Ejemplo n.º 4
0
    def estimate_ate(self, X, p, treatment, y, segment=None, return_ci=False):
        """Estimate the Average Treatment Effect (ATE).

        For each treatment group, ``self.model_tau`` is fitted on the features
        with the group's treatment indicator appended as an extra column, and
        is then used to predict the outcome with that column set to 0 and to 1.
        With ``self.cv`` set, these predictions are made fold by fold on the
        validation indices produced by ``self.cv.split``; otherwise the model
        is fitted and evaluated on all rows. The predictions, indicator and
        propensity scores are passed to ``simple_tmle``, which returns the ATE
        and its standard error.

        Notes:
            * The indicator for a group is ``treatment == group``; rows of any
              other treatment group are therefore treated like control rows.
            * When ``segment`` is given, rows whose predicted control outcome is
              at or above the 99th percentile (computed over all rows) are
              excluded from every segment's estimate.

        Args:
            X (np.matrix): A feature matrix
            p (np.ndarray or dict): An array of propensity scores of float (0,1) in the single-treatment case
                                    or, a dictionary of treatment groups that map to propensity vectors of float (0,1)
            treatment (np.array): A treatment vector of int
            y (np.array): an outcome vector
            segment (np.array, optional): An optional segment vector of int. If given, the ATE and its CI will be
                                          estimated for each segment.
            return_ci (bool, optional): Accepted for interface compatibility but not used by this method; the
                                        confidence bounds are always returned.

        Returns:
            (tuple): ``(ate, ate_lb, ate_ub)`` as np.ndarray, with one entry per treatment group in the order of
                     ``self.t_groups``. When ``segment`` is given, each entry holds one value per segment in
                     sorted segment order. The bounds are ``ate -/+ se * norm.ppf(1 - self.ate_alpha / 2)``.

        Raises:
            AssertionError: If ``segment`` is given but is not 1-d with one entry per row of ``X``.
        """
        check_control_in_treatment(treatment, self.control_name)
        self.t_groups = np.unique(treatment[treatment != self.control_name])
        self.t_groups.sort()

        check_p_conditions(p, self.t_groups)
        if isinstance(p, np.ndarray):
            # A bare array is keyed by the first (sorted) treatment group.
            p = {self.t_groups[0]: p}

        # Validate and enumerate the segments once, before any model training,
        # instead of once per treatment group after the models were fitted.
        sorted_segments = None
        if segment is not None:
            assert segment.shape[0] == X.shape[0] and segment.ndim == 1, 'Segment must be the 1-d np.array of int.'
            sorted_segments = sorted(np.unique(segment))

        # The critical value depends only on ate_alpha, so compute it once.
        z_crit = norm.ppf(1 - self.ate_alpha / 2)

        ate = []
        ate_lb = []
        ate_ub = []

        for group in self.t_groups:
            logger.info('Estimating ATE for group {}.'.format(group))
            w_group = (treatment == group).astype(int)
            p_group = p[group]

            if self.calibrate_propensity:
                logger.info('Calibrating propensity scores.')
                p_group = calibrate(p_group, w_group)

            if self.cv:
                # Out-of-fold predictions: each row is predicted by a model
                # that was fitted on the other folds.
                yhat_c = np.zeros_like(y, dtype=float)
                yhat_t = np.zeros_like(y, dtype=float)
                for i_fold, (i_trn, i_val) in enumerate(self.cv.split(X, y), 1):
                    logger.info('Training an outcome model for CV #{}'.format(i_fold))
                    self.model_tau.fit(np.hstack((X[i_trn], w_group[i_trn].reshape(-1, 1))), y[i_trn])

                    yhat_c[i_val] = self.model_tau.predict(np.hstack((X[i_val], np.zeros((len(i_val), 1)))))
                    yhat_t[i_val] = self.model_tau.predict(np.hstack((X[i_val], np.ones((len(i_val), 1)))))
            else:
                self.model_tau.fit(np.hstack((X, w_group.reshape(-1, 1))), y)

                yhat_c = self.model_tau.predict(np.hstack((X, np.zeros((len(y), 1)))))
                yhat_t = self.model_tau.predict(np.hstack((X, np.ones((len(y), 1)))))

            if sorted_segments is None:
                logger.info('Training the TMLE learner.')
                _ate, se = simple_tmle(y, w_group, yhat_c, yhat_t, p_group)
                _ate_lb = _ate - se * z_crit
                _ate_ub = _ate + se * z_crit
            else:
                # The trimming threshold is the same for every segment of this
                # group, so the mask is built once rather than per segment.
                below_cutoff = yhat_c < np.quantile(yhat_c, q=.99)

                _ate = []
                _ate_lb = []
                _ate_ub = []
                for s in sorted_segments:
                    logger.info('Training the TMLE learner for segment {}.'.format(s))
                    filt = (segment == s) & below_cutoff
                    _ate_s, se = simple_tmle(y[filt], w_group[filt], yhat_c[filt], yhat_t[filt], p_group[filt])

                    _ate.append(_ate_s)
                    _ate_lb.append(_ate_s - se * z_crit)
                    _ate_ub.append(_ate_s + se * z_crit)

            ate.append(_ate)
            ate_lb.append(_ate_lb)
            ate_ub.append(_ate_ub)

        return np.array(ate), np.array(ate_lb), np.array(ate_ub)