def cross_validate(self, x_train, y_train, x_test, folds, verbose_eval=False):
    """Run K-fold cross validation with the sklearn-style classifier in self.clf.

    Fills ``self.y_trn`` with out-of-fold predictions on the train set,
    ``self.y_tst_tmp[:, i]`` with fold *i*'s predictions on the test set,
    and stores their fold-average in ``self.y_tst``.  The normalized Gini
    over the full train set ends up in ``self.trn_gini``; the last fold's
    holdout Gini in ``self.holdout_gini``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features (indexed positionally via ``.iloc``).
    y_train : array-like
        Training labels, indexable by the fold index arrays.
    x_test : pandas.DataFrame
        Test features scored by every fold's model.
    folds : iterable of (train_idx, test_idx)
        Positional index pairs, e.g. from ``KFold(...).split(...)``.
    verbose_eval : bool or int, optional
        Print progress when truthy (default False).
    """
    verbose = int(verbose_eval) > 0
    if verbose:
        print("Validation starts")
        print(x_train.shape)
    for i, (train_idx, test_idx) in enumerate(folds):
        # Gate the fold banner on verbose_eval, consistent with
        # cross_validate_xgb (it was unconditionally printed before).
        if verbose:
            print("Fold %i" % (i + 1))
        x_trn = x_train.iloc[train_idx, ]
        y_trn = y_train[train_idx]
        x_holdout = x_train.iloc[test_idx, ]
        y_holdout = y_train[test_idx]
        # NOTE(review): the original built an unused eval_set =
        # [(x_holdout, y_holdout)] here, apparently intended for an
        # early-stopping fit; self.fit_func must take it itself if needed.
        # match to a proper sklearn .fit function
        self.fit_func(x_trn, y_trn)
        self.y_trn[test_idx] = self.clf.predict_proba(x_holdout)[:, 1]
        self.holdout_gini = eval_gini_normalized(y_holdout, self.y_trn[test_idx])
        self.y_tst_tmp[:, i] = self.clf.predict_proba(x_test)[:, 1]
        gc.collect()
    self.trn_gini = eval_gini_normalized(y_train, self.y_trn)
    self.y_tst = np.mean(self.y_tst_tmp, axis=1)
    if verbose:
        print("CV score for train cv set: %f" % self.trn_gini)
def cross_validate_xgb(self, x_train, y_train, x_test, folds, verbose_eval=False):
    '''
    Cross validation using the native xgboost API (xgb.train), not the
    sklearn wrapper.

    For each fold: optionally upsample the training part (when
    self.upsam is set), build DMatrix objects, train with early stopping
    (50 rounds) against the holdout set using the gini_xgb feval, then
    record out-of-fold predictions in self.y_trn, test predictions in
    self.y_tst_tmp[:, i], and feature importances in self.fscore.
    Finally self.trn_gini holds the normalized Gini over the whole train
    set and self.y_tst the fold-averaged test prediction.

    Parameters: same contract as cross_validate — x_train/x_test are
    DataFrames, y_train is indexable by fold indices, folds yields
    (train_idx, test_idx) pairs, verbose_eval controls printing and is
    also forwarded to xgb.train.
    '''
    if int(verbose_eval) > 0:
        print("Validation starts")
        print(x_train.shape)
    for i, (train_idx, test_idx) in enumerate(folds):
        if int(verbose_eval) > 0:
            print("Fold %i" % (i + 1))
        x_trn = x_train.iloc[train_idx, ]
        y_trn = y_train[train_idx]
        x_holdout = x_train.iloc[test_idx, ]
        y_holdout = y_train[test_idx]
        if self.upsam:
            # Rebalance the positive class before building the DMatrix.
            x_trn, y_trn = self.upsampling(x_trn, y_trn)
        d_trn = xgb.DMatrix(x_trn, label=y_trn)
        d_tst = xgb.DMatrix(x_test)
        d_holdout = xgb.DMatrix(x_holdout, label=y_holdout)
        watchlist = [(d_holdout, 'holdout')]
        # Free the pandas copies early — the DMatrix objects own the data
        # needed from here on.  y_holdout is kept for the Gini score below.
        del x_trn, y_trn, x_holdout
        gc.collect()
        self.clf = xgb.train(params=self.params,
                             dtrain=d_trn,
                             num_boost_round=self.max_round,
                             evals=watchlist,
                             feval=gini_xgb,
                             maximize=True,
                             verbose_eval=verbose_eval,
                             early_stopping_rounds=50)
        self.y_trn[test_idx] = self.clf.predict(d_holdout)
        self.holdout_gini = eval_gini_normalized(y_holdout, self.y_trn[test_idx])
        self.y_tst_tmp[:, i] = self.clf.predict(d_tst)
        # Accumulate per-fold feature importances as one column per fold.
        self.fscore = pd.concat(
            [self.fscore, pd.Series(self.clf.get_fscore(), name=i)], axis=1)
        del d_trn, d_holdout, y_holdout
        gc.collect()
    self.trn_gini = eval_gini_normalized(y_train, self.y_trn)
    self.y_tst = np.mean(self.y_tst_tmp, axis=1)
    if int(verbose_eval) > 0:
        print("CV score for train cv set: %f" % self.trn_gini)
def bagging(self, x_train, y_train, x_test, idx, verbose_eval=False):
    """Average several CV runs ("bags") over different KFold seeds.

    Runs ``self.methods[idx]`` (one of the cross_validate* methods)
    ``self.bags`` times, each with a fresh shuffled KFold, then averages
    the out-of-fold and test predictions across bags into ``self.y_trn``
    and ``self.y_tst`` and recomputes ``self.trn_gini``.  Per-bag scores
    are kept in ``self.trn_gini_bags`` and per-fold test predictions in
    ``self.y_tst_tmp2``.

    Parameters
    ----------
    x_train, y_train, x_test : as for the cross_validate* methods.
    idx : int
        Index into ``self.methods`` selecting the CV routine to run.
    verbose_eval : bool or int, optional
        Forwarded to the CV routine.
    """
    random_seeds = [177, 47, 8243, 5210, 1]  # one seed per bag (max 5 bags)
    print("Bagging of %i" % self.bags)
    trn_series = np.zeros((self.len_trn, self.bags))
    tst_series = np.zeros((self.len_tst, self.bags))
    for i in range(self.bags):
        print("Bag %i" % (i + 1))
        folds = list(KFold(
            n_splits=self.n_splits, shuffle=True,
            random_state=random_seeds[i]
        ).split(x_train, y_train))
        self.methods[idx](x_train, y_train, x_test, folds, verbose_eval)
        trn_series[:, i] = self.y_trn
        tst_series[:, i] = self.y_tst
        # Fixed: the slice width was hard-coded to 5; use self.n_splits so
        # the layout stays correct for any fold count.
        self.y_tst_tmp2[:, (i * self.n_splits):((i + 1) * self.n_splits)] = self.y_tst_tmp
        self.trn_gini_bags[i] = self.trn_gini
        # Reset all values (not necessary but keeps state clean between bags)
        self.y_trn = np.zeros(self.len_trn)
        self.y_tst = np.zeros(self.len_tst)
        self.y_tst_tmp = np.zeros((self.len_tst, self.n_splits))
    self.y_trn = np.mean(trn_series, axis=1)
    self.y_tst = np.mean(tst_series, axis=1)
    self.trn_gini = eval_gini_normalized(y_train, self.y_trn)
    # Fixed: the banner was printed twice and the per-bag reporting used
    # Python 2 print statements (SyntaxError under Python 3).
    print("CV score among different bags:\n")
    print(self.trn_gini_bags)
    print("CV score var")
    print(np.var(self.trn_gini_bags))
    print("\nCV score for %i aver cv set: %f" % (self.bags, self.trn_gini))
def log_reg(trn_series, tst_series, y_train):
    """Stack fold predictions with a logistic-regression meta-model.

    Fits LogisticRegression on the out-of-fold prediction columns,
    scores the fit with the normalized Gini on the train set, and
    returns that score together with the meta-model's positive-class
    probabilities for the test rows.

    Returns
    -------
    (score, y_tst) : float Gini score and the test prediction vector.
    """
    meta_model = LogisticRegression()
    meta_model.fit(trn_series, y_train)
    trn_pred = meta_model.predict_proba(trn_series)[:, 1]
    score = eval_gini_normalized(y_train, trn_pred)
    y_tst = meta_model.predict_proba(tst_series)[:, 1]
    return score, y_tst
def mean(trn_series, tst_series, y_train):
    """Blend stacked predictions by simple column-wise averaging.

    Averages the out-of-fold prediction columns, scores the blend with
    the normalized Gini against y_train, and returns that score together
    with the same column-wise average of the test-set predictions.

    Returns
    -------
    (score, y_tst) : float Gini score and the averaged test prediction.
    """
    trn_blend = np.mean(trn_series, axis=1)
    score = eval_gini_normalized(y_train, trn_blend)
    tst_blend = np.mean(tst_series, axis=1)
    return score, tst_blend