Example #1
0
    def predict(self, F, datainfo, timeinfo):
        '''
        Return prediction scores for the (test) data in F.

        Steps performed on every call:
          1. Build the test feature matrix from F['numerical'] (NaNs replaced with 0s)
             and, when categorical columns are present, the encoded F['CAT'].
          2. Adversarial validation: train 5-fold LightGBM classifiers that try to
             tell the stored training rows (self.X, label 0) from the test rows
             (label 1) and keep the out-of-fold probabilities as propensity scores.
          3. Match rows on the calibrated propensity score and keep the training
             rows that end up in the matched set.
          4. Refit self.clf on those training rows (75/25 train/validation split
             with early stopping) and score the test rows.

        Args:
            F (dict): feature blocks; the keys 'numerical' and 'CAT' are read.
            datainfo (dict): datainfo['loaded_feat_types'][2] is read as the number
                of categorical columns.
            timeinfo (sequence): timeinfo[0] and timeinfo[1] are start timestamps
                (compared against time.time()) used only for the time report.

        Returns:
            np.ndarray: the second column of self.clf.predict_proba(X), i.e. one
                continuous score per test row rather than a class label.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        print("[***] Overall time spent %5.2f sec" % overall_spenttime)
        print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        # Only the categorical column count is needed below.
        categorical_cols = datainfo['loaded_feat_types'][2]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Encode categorical variables with the stored encoder and concatenate
        # them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        # Adversarial validation
        print('AV: starting adversarial validation...')

        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

        n_trn = self.X.shape[0]
        n_tst = X.shape[0]

        # Training rows come first (label 0), test rows second (label 1).
        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn, ), np.ones(n_tst, )))
        print('AV: ', X_all.shape, y_all.shape)

        # Out-of-fold probability of "is test" for every row.
        ps_all = np.zeros_like(y_all, dtype=float)
        for i_trn, i_val in cv.split(X_all, y_all):

            model_av = LGBMClassifier(n_estimators=1000,
                                      subsample=.8,
                                      subsample_freq=1,
                                      colsample_bytree=.8,
                                      importance_type='gain')
            model_av.fit(X_all[i_trn],
                         y_all[i_trn],
                         eval_set=(X_all[i_val], y_all[i_val]),
                         early_stopping_rounds=10,
                         eval_metric='auc')

            ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]

        av_score = roc_auc_score(y_all, ps_all)
        print(f'AV: AUC={av_score * 100: 3.2f}')

        ps_all = calibrate(ps_all, y_all)
        # np.percentile expects q on a 0-100 scale, so deciles are 0, 10, ..., 100.
        ps_deciles = np.percentile(ps_all, np.linspace(0, 100, 11))
        print(f'AV: propensity scores deciles: {ps_deciles}')

        psm = NearestNeighborMatch(replace=True,
                                   ratio=10,
                                   caliper=1,
                                   random_state=SEED)

        df_all = pd.DataFrame(X_all,
                              columns=[f'col_{x}' for x in range(X.shape[1])])
        df_all['is_test'] = y_all
        df_all['ps'] = ps_all

        print('AV: propensity score matching...')
        df_matched = psm.match(df_all,
                               treatment_col='is_test',
                               score_cols=['ps'])
        df_matched.drop_duplicates(inplace=True)
        print(f'AV: original data:\n{df_all.is_test.value_counts()}')
        print(f'AV: matched data:\n{df_matched.is_test.value_counts()}')

        # NOTE(review): assumes psm.match() keeps df_all's row index, so that the
        # index of a matched training row is its position in self.X -- confirm.
        trn_matched_idx = df_matched.index[df_matched.is_test == 0]

        # Training
        X_trn, X_val, y_trn, y_val = train_test_split(self.X[trn_matched_idx],
                                                      self.y[trn_matched_idx],
                                                      test_size=.25,
                                                      random_state=SEED)
        self.clf.fit(X_trn,
                     y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     eval_metric='auc')

        num_test_samples = X.shape[0]
        # X is necessarily 2-d here: X.shape[1] was already used for the column names.
        num_feat = X.shape[1]
        print(("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                       num_feat))
        if (self.num_feat != num_feat):
            print(
                "ARRGH: number of features in X does not match training data!")
        print(("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                       self.num_labels))
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y
Example #2
0
    def predict(self, F, datainfo, timeinfo):
        '''
        Return prediction scores for the (test) data in F.

        Steps performed on every call:
          1. Build the test feature matrix from F['numerical'] (NaNs replaced with 0s).
          2. Adversarial validation: train 5-fold LightGBM classifiers that try to
             tell the stored training rows (self.X, label 0) from the test rows
             (label 1) and keep the out-of-fold probabilities as propensity scores.
          3. Calibrate the scores, clip them to [.1, .9] and turn them into
             weights ps / (1 - ps).
          4. Refit self.clf on the training data (75/25 train/validation split with
             early stopping), weighting each training row by its weight, and score
             the test rows.

        Args:
            F (dict): feature blocks; only the key 'numerical' is read.
            datainfo (dict): datainfo['loaded_feat_types'][0] is read as the number
                of date columns.
            timeinfo (sequence): timeinfo[0] and timeinfo[1] are start timestamps
                (compared against time.time()) used only for the time report.

        Returns:
            np.ndarray: the second column of self.clf.predict_proba(X), i.e. one
                continuous score per row of X rather than a class label.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec", overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec", dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]

        # Get numerical variables and replace NaNs with 0s
        # NOTE(review): `[date_cols:]` slices along the first axis, i.e. it drops
        # the first `date_cols` ROWS if F['numerical'] is a 2-d array. If the
        # intent is to skip the leading date COLUMNS this should be
        # `[:, date_cols:]` -- confirm against how self.X is built in fit().
        X = np.nan_to_num(F['numerical'][date_cols:])

        # Adversarial validation
        logging.info('AV: starting adversarial validation...')

        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

        n_trn = self.X.shape[0]
        n_tst = X.shape[0]

        # Training rows come first (label 0), test rows second (label 1).
        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn, ), np.ones(n_tst, )))
        logging.info(f'AV: {X_all.shape}, {y_all.shape}')

        # Train an adversarial validation classifier; keep out-of-fold
        # probabilities of "is test" for every row.
        ps_all = np.zeros_like(y_all, dtype=float)
        for i_trn, i_val in cv.split(X_all, y_all):

            model_av = LGBMClassifier(**params)
            model_av.fit(X_all[i_trn],
                         y_all[i_trn],
                         eval_set=(X_all[i_val], y_all[i_val]),
                         early_stopping_rounds=10,
                         verbose=10)

            ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]

        av_score = roc_auc_score(y_all, ps_all)
        logging.info(f'AV: AUC={av_score * 100: 3.2f}')

        # Clipping keeps the odds ps / (1 - ps) within [1/9, 9].
        ps_all = np.clip(calibrate(ps_all, y_all), .1, .9)
        w_all = ps_all / (1 - ps_all)
        # np.percentile expects q on a 0-100 scale, so deciles are 0, 10, ..., 100.
        ps_deciles = np.percentile(ps_all, np.linspace(0, 100, 11))
        logging.info(f'AV: propensity scores deciles: {ps_deciles}')

        # Training: the first n_trn weights belong to the training rows.
        # The validation weights are not used.
        X_trn, X_val, y_trn, y_val, w_trn, _ = train_test_split(
            self.X, self.y, w_all[:n_trn], test_size=.25, random_state=SEED)
        self.clf.fit(X_trn,
                     y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     verbose=10,
                     sample_weight=w_trn)

        num_test_samples = X.shape[0]
        num_feat = X.shape[1]
        logging.info(
            ("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                     num_feat))
        logging.info(
            ("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                     self.num_labels))
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y
Example #3
0
    def estimate_ate(self, X, treatment, y, p, segment=None, return_ci=False):
        """Estimate the Average Treatment Effect (ATE).

        For each treatment group, ``self.model_tau`` is fit on the features plus a
        binary treatment indicator and used to predict the outcome of every row
        under control (indicator 0) and under treatment (indicator 1). When
        ``self.cv`` is set the predictions are made out-of-fold; otherwise the
        model is fit and applied on the full data. The predictions, the
        treatment indicator and the propensity scores are passed to
        ``simple_tmle`` to obtain the ATE and its standard error.

        As a side effect, ``self.t_groups`` is set to the sorted unique
        non-control values found in ``treatment``.

        Args:
            X (np.matrix or np.array or pd.Dataframe): a feature matrix
            treatment (np.array or pd.Series): a treatment vector
            y (np.array or pd.Series): an outcome vector
            p (np.ndarray or pd.Series or dict): an array of propensity scores of float (0,1) in the single-treatment
                case; or, a dictionary of treatment groups that map to propensity vectors of float (0,1)
            segment (np.array, optional): An optional segment vector of int. If given, the ATE and its CI will be
                                          estimated for each segment. Within each segment, rows whose control
                                          prediction is at or above the 0.99 quantile of all control predictions
                                          are excluded.
            return_ci (bool, optional): Currently not used by this method; the confidence bounds are always
                                        returned. Kept for interface compatibility.

        Returns:
            (tuple): Three np.arrays ``(ate, ate_lb, ate_ub)``: the ATE and the lower and upper bounds of its
                confidence interval for each treatment, t and (when ``segment`` is given) segment, s
        """
        X, treatment, y = convert_pd_to_np(X, treatment, y)
        check_treatment_vector(treatment, self.control_name)
        self.t_groups = np.unique(treatment[treatment != self.control_name])
        self.t_groups.sort()

        check_p_conditions(p, self.t_groups)
        # Normalize p into a {treatment group: np.array} mapping.
        if isinstance(p, (np.ndarray, pd.Series)):
            treatment_name = self.t_groups[0]
            p = {treatment_name: convert_pd_to_np(p)}
        elif isinstance(p, dict):
            p = {
                treatment_name: convert_pd_to_np(_p)
                for treatment_name, _p in p.items()
            }

        # Two-sided normal critical value; identical for every group and segment.
        z_crit = norm.ppf(1 - self.ate_alpha / 2)

        ate = []
        ate_lb = []
        ate_ub = []

        for group in self.t_groups:
            logger.info("Estimating ATE for group {}.".format(group))
            # Binary indicator: 1 for rows in this treatment group, 0 otherwise.
            w_group = (treatment == group).astype(int)
            p_group = p[group]

            if self.calibrate_propensity:
                logger.info("Calibrating propensity scores.")
                p_group = calibrate(p_group, w_group)

            yhat_c = np.zeros_like(y, dtype=float)
            yhat_t = np.zeros_like(y, dtype=float)
            if self.cv:
                for i_fold, (i_trn,
                             i_val) in enumerate(self.cv.split(X, y), 1):
                    logger.info(
                        "Training an outcome model for CV #{}".format(i_fold))
                    self.model_tau.fit(
                        np.hstack((X[i_trn], w_group[i_trn].reshape(-1, 1))),
                        y[i_trn])

                    # Predict the held-out rows with the indicator forced to 0 / 1.
                    yhat_c[i_val] = self.model_tau.predict(
                        np.hstack((X[i_val], np.zeros((len(i_val), 1)))))
                    yhat_t[i_val] = self.model_tau.predict(
                        np.hstack((X[i_val], np.ones((len(i_val), 1)))))

            else:
                self.model_tau.fit(np.hstack((X, w_group.reshape(-1, 1))), y)

                yhat_c = self.model_tau.predict(
                    np.hstack((X, np.zeros((len(y), 1)))))
                yhat_t = self.model_tau.predict(
                    np.hstack((X, np.ones((len(y), 1)))))

            if segment is None:
                logger.info("Training the TMLE learner.")
                _ate, se = simple_tmle(y, w_group, yhat_c, yhat_t, p_group)
                _ate_lb = _ate - se * z_crit
                _ate_ub = _ate + se * z_crit
            else:
                assert (segment.shape[0] == X.shape[0] and segment.ndim
                        == 1), "Segment must be the 1-d np.array of int."
                segments = np.unique(segment)

                # The 0.99-quantile filter depends only on yhat_c, so build it
                # once per group instead of once per segment.
                below_cap = yhat_c < np.quantile(yhat_c, q=0.99)

                _ate = []
                _ate_lb = []
                _ate_ub = []
                for s in sorted(segments):
                    logger.info(
                        "Training the TMLE learner for segment {}.".format(s))
                    filt = (segment == s) & below_cap
                    _ate_s, se = simple_tmle(
                        y[filt],
                        w_group[filt],
                        yhat_c[filt],
                        yhat_t[filt],
                        p_group[filt],
                    )

                    _ate.append(_ate_s)
                    _ate_lb.append(_ate_s - se * z_crit)
                    _ate_ub.append(_ate_s + se * z_crit)

            ate.append(_ate)
            ate_lb.append(_ate_lb)
            ate_ub.append(_ate_ub)

        return np.array(ate), np.array(ate_lb), np.array(ate_ub)