コード例 #1
0
    def cross_validate(self, x_train, y_train, x_test, folds, verbose_eval=False):
        """Run cross validation through ``self.fit_func`` and ``self.clf``.

        For every ``(train_idx, test_idx)`` pair in ``folds`` the model is
        fitted on the training rows, the held-out rows are scored into
        ``self.y_trn`` (out-of-fold predictions) and ``x_test`` is scored into
        column ``i`` of ``self.y_tst_tmp``.

        Args:
            x_train: training features; indexed positionally with ``.iloc``.
            y_train: training labels; indexed with ``y_train[idx]``
                (presumably a NumPy array -- confirm against the caller).
            x_test: test features passed to ``self.clf.predict_proba``.
            folds: iterable of ``(train_idx, test_idx)`` index pairs.
            verbose_eval: progress messages are printed when
                ``int(verbose_eval) > 0``.

        Side effects:
            Sets ``self.y_trn``, ``self.y_tst_tmp``, ``self.y_tst``,
            ``self.trn_gini`` and ``self.holdout_gini``. The latter is
            overwritten on every fold, so it ends up holding the score of the
            last fold only.
        """
        verbose = int(verbose_eval) > 0
        if verbose:
            print("Validation starts")
            print(x_train.shape)

        for i, (train_idx, test_idx) in enumerate(folds):
            # Gated on verbosity like every other message in this method and
            # like the per-fold message in cross_validate_xgb.
            if verbose:
                print("Fold %i" % (i + 1))
            x_trn = x_train.iloc[train_idx,]
            y_trn = y_train[train_idx]
            x_holdout = x_train.iloc[test_idx,]
            y_holdout = y_train[test_idx]

            # match to a proper sklearn .fit function
            # NOTE(review): fit_func is presumably what trains self.clf --
            # confirm where it is bound.
            self.fit_func(x_trn, y_trn)

            # Column 1 of predict_proba is used as the score for each row.
            self.y_trn[test_idx] = self.clf.predict_proba(x_holdout)[:, 1]
            self.holdout_gini = eval_gini_normalized(y_holdout, self.y_trn[test_idx])
            self.y_tst_tmp[:, i] = self.clf.predict_proba(x_test)[:, 1]
            gc.collect()

        # Overall score from the out-of-fold predictions; the test prediction
        # is the plain average over the per-fold columns.
        self.trn_gini = eval_gini_normalized(y_train, self.y_trn)
        self.y_tst = np.mean(self.y_tst_tmp, axis=1)
        if verbose:
            print("CV score for train cv set: %f" % self.trn_gini)
コード例 #2
0
ファイル: cv.py プロジェクト: Oreki47/kaggle_competitions
    def cross_validate_xgb(self,
                           x_train,
                           y_train,
                           x_test,
                           folds,
                           verbose_eval=False):
        """Run cross validation with the native xgboost API (not sklearn).

        For every ``(train_idx, test_idx)`` pair in ``folds`` a booster is
        trained with ``xgb.train`` on the training rows, using the held-out
        rows as the early-stopping watchlist. Held-out predictions go into
        ``self.y_trn`` and test predictions into column ``i`` of
        ``self.y_tst_tmp``.

        Args:
            x_train: training features; indexed positionally with ``.iloc``.
            y_train: training labels; indexed with ``y_train[idx]``
                (presumably a NumPy array -- confirm against the caller).
            x_test: test features, wrapped in an ``xgb.DMatrix``.
            folds: iterable of ``(train_idx, test_idx)`` index pairs.
            verbose_eval: progress messages are printed when
                ``int(verbose_eval) > 0``; the value is also forwarded
                unchanged to ``xgb.train``.

        Side effects:
            Sets ``self.clf`` (booster of the last fold), ``self.y_trn``,
            ``self.y_tst_tmp``, ``self.y_tst``, ``self.trn_gini`` and
            ``self.holdout_gini`` (last fold only), and appends one column of
            ``get_fscore()`` values per fold to ``self.fscore``.
        """
        verbose = int(verbose_eval) > 0
        if verbose:
            print("Validation starts")
            print(x_train.shape)

        # The test matrix does not depend on the fold, so build it once
        # instead of re-creating it on every iteration.
        d_tst = xgb.DMatrix(x_test)

        for i, (train_idx, test_idx) in enumerate(folds):
            if verbose:
                print("Fold %i" % (i + 1))
            x_trn = x_train.iloc[train_idx, ]
            y_trn = y_train[train_idx]
            x_holdout = x_train.iloc[test_idx, ]
            y_holdout = y_train[test_idx]

            # Optional resampling is applied to the training part only; the
            # holdout rows are left untouched.
            if self.upsam:
                x_trn, y_trn = self.upsampling(x_trn, y_trn)

            d_trn = xgb.DMatrix(x_trn, label=y_trn)
            d_holdout = xgb.DMatrix(x_holdout, label=y_holdout)
            watchlist = [(d_holdout, 'holdout')]

            # The DMatrix objects hold the data now; drop the slices before
            # training to keep peak memory down.
            del x_trn, y_trn, x_holdout
            gc.collect()

            self.clf = xgb.train(params=self.params,
                                 dtrain=d_trn,
                                 num_boost_round=self.max_round,
                                 evals=watchlist,
                                 feval=gini_xgb,
                                 maximize=True,
                                 verbose_eval=verbose_eval,
                                 early_stopping_rounds=50)

            self.y_trn[test_idx] = self.clf.predict(d_holdout)
            self.holdout_gini = eval_gini_normalized(y_holdout,
                                                     self.y_trn[test_idx])
            self.y_tst_tmp[:, i] = self.clf.predict(d_tst)
            # One feature-score column per fold, named after the fold index.
            self.fscore = pd.concat(
                [self.fscore,
                 pd.Series(self.clf.get_fscore(), name=i)],
                axis=1)

            del d_trn, d_holdout, y_holdout
            gc.collect()

        # Overall score from the out-of-fold predictions; the test prediction
        # is the plain average over the per-fold columns.
        self.trn_gini = eval_gini_normalized(y_train, self.y_trn)
        self.y_tst = np.mean(self.y_tst_tmp, axis=1)
        if verbose:
            print("CV score for train cv set: %f" % self.trn_gini)
コード例 #3
0
    def bagging(self, x_train, y_train, x_test, idx, verbose_eval=False):
        """Repeat a cross-validation routine over several seeds and average.

        For each of ``self.bags`` bags a shuffled ``KFold`` split is drawn with
        a fixed seed, ``self.methods[idx]`` is run on it, and the resulting
        ``self.y_trn`` / ``self.y_tst`` are collected. The final predictions
        are the per-row means over all bags.

        Args:
            x_train: training features, forwarded to the CV routine.
            y_train: training labels, forwarded to the CV routine and used
                for the final score.
            x_test: test features, forwarded to the CV routine.
            idx: key/index into ``self.methods`` selecting the CV routine.
            verbose_eval: forwarded unchanged to the CV routine.

        Side effects:
            Sets ``self.y_trn``, ``self.y_tst``, ``self.trn_gini``, fills
            ``self.trn_gini_bags[i]`` and ``self.y_tst_tmp2`` and resets
            ``self.y_tst_tmp``.

        Note:
            Only ``len(random_seeds)`` (5) seeds are defined, so
            ``self.bags`` greater than 5 raises ``IndexError``.
        """
        # Fixed seeds, one per bag, keep the fold splits reproducible.
        random_seeds = [177, 47, 8243, 5210, 1]
        print("Bagging of %i" % self.bags)
        trn_series = np.zeros((self.len_trn, self.bags))
        tst_series = np.zeros((self.len_tst, self.bags))

        for i in range(self.bags):
            print("Bag %i" % (i + 1))
            folds = list(KFold(
                n_splits=self.n_splits,
                shuffle=True,
                random_state=random_seeds[i]
            ).split(x_train, y_train))
            self.methods[idx](x_train, y_train, x_test, folds, verbose_eval)
            trn_series[:, i] = self.y_trn
            tst_series[:, i] = self.y_tst

            # y_tst_tmp has one column per fold, so each bag owns a block of
            # n_splits columns (this used to be hard-coded to 5).
            # NOTE(review): assumes y_tst_tmp2 has at least
            # bags * n_splits columns -- confirm where it is allocated.
            first_col = i * self.n_splits
            last_col = first_col + self.n_splits
            self.y_tst_tmp2[:, first_col:last_col] = self.y_tst_tmp
            self.trn_gini_bags[i] = self.trn_gini
            # Reset all values (not necessary but)
            self.y_trn = np.zeros(self.len_trn)
            self.y_tst = np.zeros(self.len_tst)
            self.y_tst_tmp = np.zeros((self.len_tst, self.n_splits))

        self.y_trn = np.mean(trn_series, axis=1)
        self.y_tst = np.mean(tst_series, axis=1)
        self.trn_gini = eval_gini_normalized(y_train, self.y_trn)
        # Single-argument print(...) calls behave identically on Python 2
        # and Python 3.
        print("CV score among different bags:\n")
        print(self.trn_gini_bags)
        print("CV score var %s" % np.var(self.trn_gini_bags))
        print("\nCV score for %i aver cv set: %f" % (self.bags, self.trn_gini))
コード例 #4
0
def log_reg(trn_series, tst_series, y_train):
    """Blend prediction columns with a default ``LogisticRegression``.

    The model is fitted on ``trn_series`` against ``y_train``. The returned
    score is computed on those same training rows (i.e. it is in-sample), and
    the returned predictions are the column-1 probabilities for
    ``tst_series``.

    Returns:
        A ``(score, y_tst)`` tuple.
    """
    blender = LogisticRegression()
    blender.fit(trn_series, y_train)

    # Column 1 of predict_proba is used as the score for each row.
    train_proba = blender.predict_proba(trn_series)[:, 1]
    in_sample_score = eval_gini_normalized(y_train, train_proba)

    test_proba = blender.predict_proba(tst_series)[:, 1]
    return in_sample_score, test_proba
コード例 #5
0
def mean(trn_series, tst_series, y_train):
    """Blend prediction columns by taking their row-wise average.

    Returns:
        A ``(score, y_tst)`` tuple: the normalized gini of the averaged
        ``trn_series`` against ``y_train``, and the averaged ``tst_series``.
    """
    train_average = np.mean(trn_series, axis=1)
    blend_score = eval_gini_normalized(y_train, train_average)

    test_average = np.mean(tst_series, axis=1)
    return blend_score, test_average