def ret(series, n_folds, clfparams, featureparams, aggregateparams, refineparams,
        include, exclude, save_test_predictions, save_oob_predictions,
        skip_cross_validation, _run):
    """Cross-validated training of an ExtraTrees model refined by iterative pruning.

    For each of ``n_folds`` stratified folds, fits an ``ET`` classifier, wraps it
    in ``RRF`` (presumably a refined/pruned random forest — confirm against the
    RRF implementation), and prunes one step at a time, keeping the
    out-of-fold and test predictions from the best-scoring pruning.  Out-of-fold
    and averaged test predictions are written to CSV files named from *series*
    and a timestamp.

    Parameters mirror the sacred experiment config; ``_run`` is the sacred Run
    object whose ``info`` dict accumulates per-fold metrics.

    Returns:
        float: multiclass log loss of the out-of-fold predictions.

    NOTE(review): ``save_test_predictions``, ``save_oob_predictions`` and
    ``skip_cross_validation`` are accepted but never consulted here — both CSVs
    are always written; kept as-is for config compatibility.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    X, y, Xtest, ytest = data.get_train_test_features(**aggregateparams)
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    # RRF is driven one pruning per fit() call; the configured total number of
    # prunings is applied by the inner loop below.
    rrfparams = dict(refineparams)
    rrfparams['n_prunings'] = 1
    kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
    pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
    testpreds = []  # list for storing test predictions
    i = 1
    _run.info['loss'] = []
    _run.info['trainloss'] = []
    _run.info['best_pruning'] = []
    for itrain, itest in kf:
        Xtr, ytr, Xte, yte = data.get_train_test_features(
            itrain, itest, **aggregateparams)
        clf = ET(**clfparams)
        clf.fit(Xtr, ytr)
        rrf = RRF(clf, **rrfparams)
        best_loss = 1000.
        train_loss = 1000.
        no_improvement_in = 0
        # FIX: reset per fold.  Previously ``testpred`` was only bound inside
        # the improvement branch, so breaking out before any improvement raised
        # NameError on the first fold and silently reused the previous fold's
        # stale prediction on later folds.
        testpred = None
        best_pruning = 0
        for k in range(refineparams['n_prunings']):
            try:
                rrf.fit(Xtr, ytr)  # fit and do 1 pruning
            except IndexError:
                # sometimes I get an index error when an unfortunate tree gets
                # cut down to the root; we'll just stop and use the
                # best-so-far prediction in this case
                print('IndexError')
                break
            loss2tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
            print("Fold {} Pruning {} mlogloss train: {:.4f}, test: {:.4f}".
                  format(i, k, loss2tr, loss2te))
            if loss2te < best_loss:
                # performance is better
                no_improvement_in = 0
                best_loss = loss2te + 0.  # save new best loss
                # predict oof samples with new model
                pred.iloc[itest, :] = rrf.predict_proba(Xte)
                # predict test with new model
                testpred = rrf.predict_proba(Xtest)
                # record current train loss
                train_loss = loss2tr + 0.
                best_pruning = k + 1
            else:
                no_improvement_in += 1
                # early stopping: give up after 5 prunings without improvement
                if no_improvement_in >= 5:
                    break
        # Append current testpred to testpreds list (only if at least one
        # pruning produced a prediction for this fold).
        if testpred is not None:
            testpreds.append(
                pd.DataFrame(testpred, index=ytest.index, columns=pred_cols))
        # Save loss and train loss from current fold
        _run.info['loss'].append(best_loss)
        _run.info['trainloss'].append(train_loss)
        _run.info['best_pruning'].append(best_pruning)
        print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
            i, train_loss, best_loss))
        i += 1
    loss = multiclass_log_loss(y.values, pred.values)
    _run.info['features'] = list(Xtr.columns)
    # Save out-of-fold predictions
    filename = '{}_{}_cv_{:.4f}.csv'.format(series, time, loss)
    pred.to_csv(filename, index_label='id')
    # Save test predictions averaged over folds
    testpred = sum(testpreds) / len(testpreds)
    filename = '{}_test_{}_cv_{:.4f}.csv'.format(series, time, loss)
    testpred.to_csv(filename, index_label='id')
    return loss
def xgbrun(series, n_folds, clfparams, featureparams, num_trees,
           early_stopping_rounds, verbose_eval, _seed, _run, aggregateparams,
           include, exclude, save_test_predictions, save_oob_predictions,
           skip_cross_validation):
    """Cross-validated xgboost training with optional OOB/test prediction dumps.

    Runs stratified K-fold CV unless ``skip_cross_validation`` is set.  On the
    first fold, early stopping (when enabled) determines the number of boosting
    rounds used for all remaining folds and for the final model.  Per-fold
    losses are recorded in ``_run.info`` (sacred Run object).

    Returns:
        float: out-of-fold multiclass log loss, or the sentinel ``999.`` when
        cross-validation was skipped.
    """
    clfparams['seed'] = _seed
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    num_rounds = num_trees + 0  # copy so early stopping can overwrite it
    params = {"verbose_eval": verbose_eval}
    if skip_cross_validation:
        loss = 999.  # sentinel: no CV loss available
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            dtrain = xgb.DMatrix(Xtr, ytr)
            dvalid = xgb.DMatrix(Xte, yte)
            # fresh kwargs per fold so the early-stopping watchlist is only
            # attached on fold 1
            params = {"verbose_eval": verbose_eval}
            if (i == 1) and (early_stopping_rounds > 0):
                watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
                params["evals"] = watchlist
                params["early_stopping_rounds"] = early_stopping_rounds
            gbm = xgb.train(clfparams, dtrain, num_rounds, **params)
            if i == 1:
                # lock in the round count found on the first fold
                num_rounds = gbm.best_ntree_limit
                _run.info['num_rounds'] = num_rounds
            pred.iloc[itest, :] = gbm.predict(
                dvalid, ntree_limit=num_rounds).reshape(
                    yte.shape[0], len(pred_cols))
            predtrain = gbm.predict(
                dtrain, ntree_limit=num_rounds).reshape(
                    ytr.shape[0], len(pred_cols))
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            trainloss = multiclass_log_loss(ytr, predtrain)
            _run.info['loss'].append(loss)
            _run.info['trainloss'].append(trainloss)
            i += 1
        loss = multiclass_log_loss(y, pred.values)
        _run.info['features'] = list(Xtr.columns)
        # Optionally save oob predictions.
        # FIX: this dump is only valid when CV actually ran — with
        # skip_cross_validation=True ``pred`` does not exist and the original
        # flow would raise NameError.
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, time)
            pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions by retraining on the full
    # training set with the (possibly early-stopped) round count.
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        dtrain = xgb.DMatrix(Xtr, ytr)
        dtest = xgb.DMatrix(Xte)
        gbm = xgb.train(clfparams, dtrain, num_rounds, **params)
        predtest = pd.DataFrame(
            gbm.predict(dtest).reshape(yte.shape[0], len(pred_cols)),
            index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss