def predict(self, F, datainfo, timeinfo):
    """Refit the classifier on test-like training rows, then score the test set.

    The method runs four stages:

    1. Build the test matrix from ``F['numerical']`` (NaNs replaced by 0) and,
       when the categorical column count is positive, append the output of
       ``self.cat_encs.transform(F['CAT'])``.
    2. Adversarial validation: a 5-fold ``LGBMClassifier`` is trained to tell
       the stored training rows ``self.X`` (label 0) from the test rows
       (label 1). The out-of-fold probabilities are used as propensity scores
       after being passed through ``calibrate``.
    3. Propensity score matching with ``NearestNeighborMatch``; the training
       rows that survive matching are kept.
    4. ``self.clf`` is refit on the kept training rows (25% held out for early
       stopping) and used to predict the test rows.

    Args:
        F (dict): feature blocks. ``F['numerical']`` is always read;
            ``F['CAT']`` is read only when categorical columns are present.
        datainfo (dict): ``datainfo['loaded_feat_types'][2]`` is read as the
            number of categorical columns.
        timeinfo (sequence): ``timeinfo[0]`` and ``timeinfo[1]`` are subtracted
            from ``time.time()`` to report the overall and per-dataset elapsed
            time (presumably start timestamps set by the caller).

    Returns:
        np.ndarray: column 1 of ``self.clf.predict_proba`` for every test row,
        i.e. a continuous score rather than a class label.
    """
    now = time.time()
    print("[***] Overall time spent %5.2f sec" % (now - timeinfo[0]))
    print("[***] Dataset time spent %5.2f sec" % (now - timeinfo[1]))

    categorical_cols = datainfo['loaded_feat_types'][2]

    # Get numerical variables and replace NaNs with 0s
    X = np.nan_to_num(F['numerical'])

    # Encode categorical variables and concatenate them with numerical ones
    if categorical_cols > 0:
        X = np.concatenate((X, self.cat_encs.transform(F['CAT']).values),
                           axis=1)

    # Adversarial validation: label 0 = training row, label 1 = test row
    print('AV: starting adversarial validation...')
    np.random.seed(SEED)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

    n_trn = self.X.shape[0]
    n_tst, num_feat = X.shape[0], X.shape[1]
    X_all = np.vstack((self.X, X))
    y_all = np.concatenate((np.zeros(n_trn), np.ones(n_tst)))
    print('AV: ', X_all.shape, y_all.shape)

    # Out-of-fold probability of "is a test row" for every row
    ps_all = np.zeros_like(y_all, dtype=float)
    for i_trn, i_val in cv.split(X_all, y_all):
        model_av = LGBMClassifier(n_estimators=1000,
                                  subsample=.8,
                                  subsample_freq=1,
                                  colsample_bytree=.8,
                                  importance_type='gain')
        model_av.fit(X_all[i_trn], y_all[i_trn],
                     eval_set=(X_all[i_val], y_all[i_val]),
                     early_stopping_rounds=10,
                     eval_metric='auc')
        ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]

    av_score = roc_auc_score(y_all, ps_all)
    print(f'AV: AUC={av_score * 100: 3.2f}')

    ps_all = calibrate(ps_all, y_all)
    # np.percentile expects percentiles on the 0-100 scale, so deciles are
    # 0, 10, ..., 100 (a 0-1 grid would only report the 0%-1% tail).
    deciles = np.percentile(ps_all, np.linspace(0, 100, 11))
    print(f'AV: propensity scores deciles: {deciles}')

    # Propensity score matching between test rows ("treated") and training rows
    psm = NearestNeighborMatch(replace=True, ratio=10, caliper=1,
                               random_state=SEED)
    df_all = pd.DataFrame(X_all,
                          columns=[f'col_{x}' for x in range(num_feat)])
    df_all['is_test'] = y_all
    df_all['ps'] = ps_all

    print('AV: propensity score matching...')
    df_matched = psm.match(df_all, treatment_col='is_test', score_cols=['ps'])
    df_matched.drop_duplicates(inplace=True)
    print(f'AV: original data:\n{df_all.is_test.value_counts()}')
    print(f'AV: matched data:\n{df_matched.is_test.value_counts()}')

    # NOTE(review): assumes psm.match keeps df_all's row labels, so labels of
    # is_test == 0 rows are valid positions into self.X / self.y -- confirm.
    trn_matched_idx = df_matched.index[df_matched.is_test == 0]

    # Training on the matched training rows only
    X_trn, X_val, y_trn, y_val = train_test_split(self.X[trn_matched_idx],
                                                  self.y[trn_matched_idx],
                                                  test_size=.25,
                                                  random_state=SEED)
    self.clf.fit(X_trn, y_trn,
                 eval_set=(X_val, y_val),
                 early_stopping_rounds=10,
                 eval_metric='auc')

    print(("PREDICT: dim(X)= [{:d}, {:d}]").format(n_tst, num_feat))
    if self.num_feat != num_feat:
        print("ARRGH: number of features in X does not match training data!")
    print(("PREDICT: dim(y)= [{:d}, {:d}]").format(n_tst, self.num_labels))

    y = self.clf.predict_proba(X)[:, 1]
    # Transposing the 1-d score vector leaves it unchanged; kept so the
    # returned object is produced exactly as before.
    return np.transpose(y)
def predict(self, F, datainfo, timeinfo):
    """Refit the classifier with adversarial-validation weights, then score the test set.

    The method runs three stages:

    1. Build the test matrix from ``F['numerical'][date_cols:]`` with NaNs
       replaced by 0.
    2. Adversarial validation: a 5-fold ``LGBMClassifier(**params)`` is trained
       to tell the stored training rows ``self.X`` (label 0) from the test
       rows (label 1). The out-of-fold probabilities are passed through
       ``calibrate``, clipped to [0.1, 0.9] and converted to odds
       ``ps / (1 - ps)``, which serve as sample weights.
    3. ``self.clf`` is refit on ``self.X`` / ``self.y`` (25% held out for early
       stopping) using the training rows' weights, then used to predict the
       test rows.

    Args:
        F (dict): feature blocks; only ``F['numerical']`` is read.
        datainfo (dict): ``datainfo['loaded_feat_types'][0]`` is read as the
            number of date columns.
        timeinfo (sequence): ``timeinfo[0]`` and ``timeinfo[1]`` are subtracted
            from ``time.time()`` to report the overall and per-dataset elapsed
            time (presumably start timestamps set by the caller).

    Returns:
        np.ndarray: column 1 of ``self.clf.predict_proba`` for every test row,
        i.e. a continuous score rather than a class label.
    """
    now = time.time()
    logging.info("[***] Overall time spent %5.2f sec", now - timeinfo[0])
    logging.info("[***] Dataset time spent %5.2f sec", now - timeinfo[1])

    date_cols = datainfo['loaded_feat_types'][0]

    # Get numerical variables and replace NaNs with 0s.
    # NOTE(review): `[date_cols:]` slices along the first axis (rows). If the
    # intent is to drop the leading date columns this should probably be
    # `[:, date_cols:]` -- confirm against how fit() builds self.X.
    X = np.nan_to_num(F['numerical'][date_cols:])

    # Adversarial validation: label 0 = training row, label 1 = test row
    logging.info('AV: starting adversarial validation...')
    np.random.seed(SEED)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

    n_trn = self.X.shape[0]
    n_tst, num_feat = X.shape[0], X.shape[1]
    X_all = np.vstack((self.X, X))
    y_all = np.concatenate((np.zeros(n_trn), np.ones(n_tst)))
    logging.info('AV: %s, %s', X_all.shape, y_all.shape)

    # Train an adversarial validation classifier and keep the out-of-fold
    # probability of "is a test row" for every row.
    ps_all = np.zeros_like(y_all, dtype=float)
    for i_trn, i_val in cv.split(X_all, y_all):
        model_av = LGBMClassifier(**params)
        model_av.fit(X_all[i_trn], y_all[i_trn],
                     eval_set=(X_all[i_val], y_all[i_val]),
                     early_stopping_rounds=10,
                     verbose=10)
        ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]

    av_score = roc_auc_score(y_all, ps_all)
    logging.info(f'AV: AUC={av_score * 100: 3.2f}')

    # Clipping keeps the odds below bounded to [1/9, 9].
    ps_all = np.clip(calibrate(ps_all, y_all), .1, .9)
    w_all = ps_all / (1 - ps_all)
    # np.percentile expects percentiles on the 0-100 scale, so deciles are
    # 0, 10, ..., 100 (a 0-1 grid would only report the 0%-1% tail).
    deciles = np.percentile(ps_all, np.linspace(0, 100, 11))
    logging.info(f'AV: propensity scores deciles: {deciles}')

    # Training: the first n_trn weights belong to the training rows
    X_trn, X_val, y_trn, y_val, w_trn, w_val = train_test_split(
        self.X, self.y, w_all[:n_trn], test_size=.25, random_state=SEED)

    self.clf.fit(X_trn, y_trn,
                 eval_set=(X_val, y_val),
                 early_stopping_rounds=10,
                 verbose=10,
                 sample_weight=w_trn)

    logging.info(("PREDICT: dim(X)= [{:d}, {:d}]").format(n_tst, num_feat))
    logging.info(
        ("PREDICT: dim(y)= [{:d}, {:d}]").format(n_tst, self.num_labels))

    y = self.clf.predict_proba(X)[:, 1]
    # Transposing the 1-d score vector leaves it unchanged; kept so the
    # returned object is produced exactly as before.
    return np.transpose(y)
def estimate_ate(self, X, treatment, y, p, segment=None, return_ci=False):
    """Estimate the Average Treatment Effect (ATE).

    For every treatment group an outcome model (``self.model_tau``) is fit on
    the features plus a 0/1 treatment indicator column, and is then used to
    predict the outcome with the indicator forced to 0 and to 1. These
    predictions, the observed outcome, the indicator and the propensity scores
    are handed to ``simple_tmle``, which returns the ATE and its standard
    error. The interval is ``ATE +/- se * norm.ppf(1 - self.ate_alpha / 2)``.

    Args:
        X (np.matrix or np.array or pd.Dataframe): a feature matrix
        treatment (np.array or pd.Series): a treatment vector
        y (np.array or pd.Series): an outcome vector
        p (np.ndarray or pd.Series or dict): an array of propensity scores of
            float (0,1) in the single-treatment case; or, a dictionary of
            treatment groups that map to propensity vectors of float (0,1)
        segment (np.array, optional): An optional segment vector of int. If
            given, the ATE and its CI will be estimated for each segment.
            Within each segment, rows whose control prediction is at or above
            the 99th percentile of all control predictions are excluded.
        return_ci (bool, optional): Whether to return confidence intervals.
            Not read by this implementation: the bounds are always returned.

    Returns:
        (tuple): The ATE and its confidence interval (LB, UB) for each
        treatment, t and segment, s

    Raises:
        AssertionError: if ``segment`` is given but is not 1-d or its length
            differs from the number of rows in ``X``.
    """
    X, treatment, y = convert_pd_to_np(X, treatment, y)
    check_treatment_vector(treatment, self.control_name)
    # np.unique already returns its values sorted, so no extra sort is needed.
    self.t_groups = np.unique(treatment[treatment != self.control_name])

    check_p_conditions(p, self.t_groups)
    if isinstance(p, (np.ndarray, pd.Series)):
        # Single-treatment case: attach the scores to the only group.
        treatment_name = self.t_groups[0]
        p = {treatment_name: convert_pd_to_np(p)}
    elif isinstance(p, dict):
        p = {
            treatment_name: convert_pd_to_np(_p)
            for treatment_name, _p in p.items()
        }

    # The critical value depends only on ate_alpha; compute it once.
    z_crit = norm.ppf(1 - self.ate_alpha / 2)

    ate = []
    ate_lb = []
    ate_ub = []

    for group in self.t_groups:
        logger.info("Estimating ATE for group %s.", group)
        w_group = (treatment == group).astype(int)
        p_group = p[group]

        if self.calibrate_propensity:
            logger.info("Calibrating propensity scores.")
            p_group = calibrate(p_group, w_group)

        if self.cv:
            # Out-of-fold predictions: each row is predicted by a model that
            # was not trained on it.
            yhat_c = np.zeros_like(y, dtype=float)
            yhat_t = np.zeros_like(y, dtype=float)
            for i_fold, (i_trn, i_val) in enumerate(self.cv.split(X, y), 1):
                logger.info("Training an outcome model for CV #%s", i_fold)
                self.model_tau.fit(
                    np.hstack((X[i_trn], w_group[i_trn].reshape(-1, 1))),
                    y[i_trn])
                n_val = len(i_val)
                yhat_c[i_val] = self.model_tau.predict(
                    np.hstack((X[i_val], np.zeros((n_val, 1)))))
                yhat_t[i_val] = self.model_tau.predict(
                    np.hstack((X[i_val], np.ones((n_val, 1)))))
        else:
            self.model_tau.fit(np.hstack((X, w_group.reshape(-1, 1))), y)
            n_rows = len(y)
            yhat_c = self.model_tau.predict(
                np.hstack((X, np.zeros((n_rows, 1)))))
            yhat_t = self.model_tau.predict(
                np.hstack((X, np.ones((n_rows, 1)))))

        if segment is None:
            logger.info("Training the TMLE learner.")
            _ate, se = simple_tmle(y, w_group, yhat_c, yhat_t, p_group)
            _ate_lb = _ate - se * z_crit
            _ate_ub = _ate + se * z_crit
        else:
            assert (segment.shape[0] == X.shape[0] and segment.ndim == 1), \
                "Segment must be the 1-d np.array of int."

            # The 99th-percentile cut-off is the same for every segment, so
            # build the mask once per group instead of once per segment.
            below_cutoff = yhat_c < np.quantile(yhat_c, q=0.99)

            _ate = []
            _ate_lb = []
            _ate_ub = []
            # np.unique yields the segment ids in ascending order.
            for s in np.unique(segment):
                logger.info("Training the TMLE learner for segment %s.", s)
                filt = (segment == s) & below_cutoff
                _ate_s, se = simple_tmle(
                    y[filt],
                    w_group[filt],
                    yhat_c[filt],
                    yhat_t[filt],
                    p_group[filt],
                )
                _ate.append(_ate_s)
                _ate_lb.append(_ate_s - se * z_crit)
                _ate_ub.append(_ate_s + se * z_crit)

        ate.append(_ate)
        ate_lb.append(_ate_lb)
        ate_ub.append(_ate_ub)

    return np.array(ate), np.array(ate_lb), np.array(ate_ub)