Python calibrate Examples

Programming Language: Python

Namespace/Package Name: causalml.propensity

Method/Function: calibrate

Examples at hotexamples.com: 3

Python calibrate - 3 examples found. These are the top-rated real-world Python examples of causalml.propensity.calibrate, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: model.py Project: ppstacy/automl3_starting_kit

    def predict(self, F, datainfo, timeinfo):
        '''
        Refit the classifier on training rows that resemble the test data and
        return positive-class probabilities for the test rows.

        Steps performed:
          1. Build the test feature matrix from ``F`` (numerical columns with
             NaNs replaced by 0, plus encoded categorical columns if any).
          2. Adversarial validation: 5-fold LightGBM classifiers are trained
             to tell training rows (label 0) from test rows (label 1); the
             out-of-fold probabilities are used as propensity scores.
          3. Calibrate the scores and run nearest-neighbor propensity score
             matching between training and test rows.
          4. Refit ``self.clf`` on the matched training rows, holding out 25%
             of them for early stopping.
          5. Predict on the test rows.

        Args:
            F: mapping with a 'numerical' array and, when categorical columns
                are present, a 'CAT' entry accepted by
                ``self.cat_encs.transform``.
            datainfo: mapping whose 'loaded_feat_types'[2] entry is the number
                of categorical columns.
            timeinfo: sequence whose items 0 and 1 are the start timestamps
                used to report overall and per-dataset elapsed time.

        Returns:
            1-D array holding column 1 of ``self.clf.predict_proba`` for the
            test rows (a discriminant value, not class labels).
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        print("[***] Overall time spent %5.2f sec" % overall_spenttime)
        print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        # Only the categorical column count is needed in this method.
        categorical_cols = datainfo['loaded_feat_types'][2]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Encode categorical variables with the fitted encoder and append
        # them to the numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        # Adversarial validation
        print('AV: starting adversarial validation...')

        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

        n_trn = self.X.shape[0]
        n_tst = X.shape[0]

        # Training rows come first (label 0), test rows second (label 1).
        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn, ), np.ones(n_tst, )))
        print('AV: ', X_all.shape, y_all.shape)

        # Out-of-fold probability of "is a test row" for every row.
        ps_all = np.zeros_like(y_all, dtype=float)
        for i_trn, i_val in cv.split(X_all, y_all):

            model_av = LGBMClassifier(n_estimators=1000,
                                      subsample=.8,
                                      subsample_freq=1,
                                      colsample_bytree=.8,
                                      importance_type='gain')
            model_av.fit(X_all[i_trn],
                         y_all[i_trn],
                         eval_set=(X_all[i_val], y_all[i_val]),
                         early_stopping_rounds=10,
                         eval_metric='auc')

            ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]

        av_score = roc_auc_score(y_all, ps_all)
        print(f'AV: AUC={av_score * 100: 3.2f}')

        ps_all = calibrate(ps_all, y_all)
        # np.percentile expects percentages in [0, 100]; 0, 10, ..., 100
        # yields the deciles announced by the message.
        deciles = np.percentile(ps_all, np.linspace(0, 100, 11))
        print(
            f'AV: propensity scores deciles: {deciles}'
        )

        psm = NearestNeighborMatch(replace=True,
                                   ratio=10,
                                   caliper=1,
                                   random_state=SEED)

        df_all = pd.DataFrame(X_all,
                              columns=[f'col_{x}' for x in range(X.shape[1])])
        df_all['is_test'] = y_all
        df_all['ps'] = ps_all

        print('AV: propensity score matching...')
        df_matched = psm.match(df_all,
                               treatment_col='is_test',
                               score_cols=['ps'])
        df_matched.drop_duplicates(inplace=True)
        print(f'AV: original data:\n{df_all.is_test.value_counts()}')
        print(f'AV: matched data:\n{df_matched.is_test.value_counts()}')

        # Index labels of the matched training rows; used below to index
        # self.X / self.y.
        # NOTE(review): assumes psm.match keeps df_all's positional index
        # labels -- confirm against NearestNeighborMatch.
        trn_matched_idx = df_matched.index[df_matched.is_test == 0]

        # Training
        X_trn, X_val, y_trn, y_val = train_test_split(self.X[trn_matched_idx],
                                                      self.y[trn_matched_idx],
                                                      test_size=.25,
                                                      random_state=SEED)
        self.clf.fit(X_trn,
                     y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     eval_metric='auc')

        num_test_samples = X.shape[0]
        # X is necessarily 2-D here (X.shape[1] was already used above), so
        # num_feat is always defined.
        num_feat = X.shape[1]
        print(("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                       num_feat))
        if (self.num_feat != num_feat):
            print(
                "ARRGH: number of features in X does not match training data!")
        print(("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                       self.num_labels))
        # Column 1 of predict_proba is already a 1-D vector; no transpose is
        # needed.
        return self.clf.predict_proba(X)[:, 1]

Example #2

Show file

    def predict(self, F, datainfo, timeinfo):
        '''
        Refit the classifier with adversarial-validation sample weights and
        return positive-class probabilities for the test rows.

        Steps performed:
          1. Build the test feature matrix from ``F['numerical']`` with NaNs
             replaced by 0.
          2. Adversarial validation: 5-fold LightGBM classifiers are trained
             to tell training rows (label 0) from test rows (label 1); the
             out-of-fold probabilities are used as propensity scores.
          3. Calibrate the scores, clip them to [0.1, 0.9] and turn them into
             odds ``ps / (1 - ps)``, so training rows that look more like test
             rows receive larger weights.
          4. Refit ``self.clf`` on 75% of the training rows with those
             weights, using the remaining 25% for early stopping.
          5. Predict on the test rows.

        Args:
            F: mapping with a 'numerical' array.
            datainfo: mapping whose 'loaded_feat_types'[0] entry is the number
                of date columns.
            timeinfo: sequence whose items 0 and 1 are the start timestamps
                used to report overall and per-dataset elapsed time.

        Returns:
            1-D array holding column 1 of ``self.clf.predict_proba`` for the
            test rows (a discriminant value, not class labels).
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        # Only the date column count is needed in this method.
        date_cols = datainfo['loaded_feat_types'][0]

        # Get numerical variables and replace NaNs with 0s
        # NOTE(review): [date_cols:] slices along the first axis (rows). If
        # the intent is to drop leading date *columns*, this would need to be
        # [:, date_cols:] -- confirm against how self.X is built in fit().
        X = np.nan_to_num(F['numerical'][date_cols:])

        # Adversarial validation
        logging.info('AV: starting adversarial validation...')

        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

        n_trn = self.X.shape[0]
        n_tst = X.shape[0]

        # Training rows come first (label 0), test rows second (label 1).
        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn, ), np.ones(n_tst, )))
        logging.info(f'AV: {X_all.shape}, {y_all.shape}')

        # Train an adversarial validation classifier; ps_all collects the
        # out-of-fold probability of "is a test row" for every row.
        ps_all = np.zeros_like(y_all, dtype=float)
        for i_trn, i_val in cv.split(X_all, y_all):

            model_av = LGBMClassifier(**params)
            model_av.fit(X_all[i_trn],
                         y_all[i_trn],
                         eval_set=(X_all[i_val], y_all[i_val]),
                         early_stopping_rounds=10,
                         verbose=10)

            ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]

        av_score = roc_auc_score(y_all, ps_all)
        logging.info(f'AV: AUC={av_score * 100: 3.2f}')

        # Clipping keeps the odds below within [1/9, 9].
        ps_all = np.clip(calibrate(ps_all, y_all), .1, .9)
        w_all = ps_all / (1 - ps_all)
        # np.percentile expects percentages in [0, 100]; 0, 10, ..., 100
        # yields the deciles announced by the message.
        deciles = np.percentile(ps_all, np.linspace(0, 100, 11))
        logging.info(
            f'AV: propensity scores deciles: {deciles}'
        )

        # Training: the first n_trn weights belong to the training rows. The
        # validation weights are produced by the split but not used.
        X_trn, X_val, y_trn, y_val, w_trn, _ = train_test_split(
            self.X, self.y, w_all[:n_trn], test_size=.25, random_state=SEED)
        self.clf.fit(X_trn,
                     y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     verbose=10,
                     sample_weight=w_trn)

        num_test_samples = X.shape[0]
        num_feat = X.shape[1]
        logging.info(
            ("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                     num_feat))
        logging.info(
            ("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                     self.num_labels))
        # Column 1 of predict_proba is already a 1-D vector; no transpose is
        # needed.
        return self.clf.predict_proba(X)[:, 1]

Example #3

Show file

File: tmle.py Project: uber/causalml

    def estimate_ate(self, X, treatment, y, p, segment=None, return_ci=False):
        """Estimate the Average Treatment Effect (ATE) with TMLE.

        For every non-control treatment group, an outcome model is fitted on
        the features plus a binary treatment indicator, counterfactual
        outcomes are predicted with the indicator forced to 0 and to 1, and
        ``simple_tmle`` turns those predictions and the propensity scores into
        an ATE and its standard error.

        Args:
            X (np.matrix or np.array or pd.Dataframe): a feature matrix
            treatment (np.array or pd.Series): a treatment vector
            y (np.array or pd.Series): an outcome vector
            p (np.ndarray or pd.Series or dict): propensity scores in (0,1)
                for the single-treatment case, or a dictionary mapping each
                treatment group to its propensity vector
            segment (np.array, optional): a 1-d segment vector of int. When
                given, the ATE and its bounds are estimated per segment,
                using only rows whose control prediction is below the 99th
                percentile of all control predictions.
            return_ci (bool, optional): accepted for interface compatibility;
                this body does not consult it and always returns the bounds

        Returns:
            (tuple): three arrays -- the ATE, its lower bound and its upper
                bound -- with one entry per treatment group (each entry being
                a per-segment list when ``segment`` is given)
        """
        X, treatment, y = convert_pd_to_np(X, treatment, y)
        check_treatment_vector(treatment, self.control_name)
        self.t_groups = np.unique(treatment[treatment != self.control_name])
        self.t_groups.sort()

        # Normalize p into a {treatment group: np.ndarray} mapping.
        check_p_conditions(p, self.t_groups)
        if isinstance(p, (np.ndarray, pd.Series)):
            p = {self.t_groups[0]: convert_pd_to_np(p)}
        elif isinstance(p, dict):
            p = {name: convert_pd_to_np(scores) for name, scores in p.items()}

        point_estimates, lower_bounds, upper_bounds = [], [], []

        for group in self.t_groups:
            logger.info("Estimating ATE for group {}.".format(group))
            w_group = (treatment == group).astype(int)
            p_group = p[group]

            if self.calibrate_propensity:
                logger.info("Calibrating propensity scores.")
                p_group = calibrate(p_group, w_group)

            if self.cv:
                # Out-of-fold predictions: each row is scored by a model that
                # did not see it during fitting.
                yhat_c = np.zeros_like(y, dtype=float)
                yhat_t = np.zeros_like(y, dtype=float)
                folds = enumerate(self.cv.split(X, y), 1)
                for fold_no, (trn_idx, val_idx) in folds:
                    logger.info(
                        "Training an outcome model for CV #{}".format(fold_no))
                    trn_design = np.hstack(
                        (X[trn_idx], w_group[trn_idx].reshape(-1, 1)))
                    self.model_tau.fit(trn_design, y[trn_idx])

                    n_val = len(val_idx)
                    as_control = np.hstack((X[val_idx], np.zeros((n_val, 1))))
                    as_treated = np.hstack((X[val_idx], np.ones((n_val, 1))))
                    yhat_c[val_idx] = self.model_tau.predict(as_control)
                    yhat_t[val_idx] = self.model_tau.predict(as_treated)
            else:
                # Single fit on all rows; predictions are in-sample.
                self.model_tau.fit(np.hstack((X, w_group.reshape(-1, 1))), y)

                n_rows = len(y)
                yhat_c = self.model_tau.predict(
                    np.hstack((X, np.zeros((n_rows, 1)))))
                yhat_t = self.model_tau.predict(
                    np.hstack((X, np.ones((n_rows, 1)))))

            if segment is None:
                logger.info("Training the TMLE learner.")
                _ate, se = simple_tmle(y, w_group, yhat_c, yhat_t, p_group)
                # Two-sided normal critical value for the configured alpha.
                z = norm.ppf(1 - self.ate_alpha / 2)
                _ate_lb = _ate - se * z
                _ate_ub = _ate + se * z
            else:
                assert (segment.shape[0] == X.shape[0] and segment.ndim
                        == 1), "Segment must be the 1-d np.array of int."
                z = norm.ppf(1 - self.ate_alpha / 2)

                _ate, _ate_lb, _ate_ub = [], [], []
                for s in sorted(np.unique(segment)):
                    logger.info(
                        "Training the TMLE learner for segment {}.".format(s))
                    # Keep this segment's rows, dropping the top 1% of
                    # control predictions.
                    below_cutoff = yhat_c < np.quantile(yhat_c, q=0.99)
                    filt = (segment == s) & below_cutoff
                    _ate_s, se = simple_tmle(
                        y[filt],
                        w_group[filt],
                        yhat_c[filt],
                        yhat_t[filt],
                        p_group[filt],
                    )

                    _ate.append(_ate_s)
                    _ate_lb.append(_ate_s - se * z)
                    _ate_ub.append(_ate_s + se * z)

            point_estimates.append(_ate)
            lower_bounds.append(_ate_lb)
            upper_bounds.append(_ate_ub)

        return (np.array(point_estimates), np.array(lower_bounds),
                np.array(upper_bounds))