import datetime

import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold  # pre-0.18 sklearn API
from sklearn.ensemble import ExtraTreesClassifier as ET

# Assumed project-local imports for this excerpt: the Telstra feature
# builder, the log-loss metric, and the refined-random-forest pruner.
from telstra_data import TelstraData, multiclass_log_loss
from refined_rf import RRF


def ret(series, n_folds, clfparams, featureparams, aggregateparams,
        refineparams, include, exclude, save_test_predictions,
        save_oob_predictions, skip_cross_validation, _run):
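    """Cross-validated refined extra-trees run (sacred-style captured function).

    Fits an ExtraTreesClassifier per fold, then iteratively prunes it with
    RRF, keeping the out-of-fold and test predictions from the pruning step
    with the best validation multiclass log loss. Per-fold metrics are
    stored on the run via ``_run.info``.
    """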
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    X, y, Xtest, ytest = data.get_train_test_features(**aggregateparams)
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    rrfparams = dict(refineparams)
    # Force one pruning per fit() call so we can evaluate after every step;
    # refineparams['n_prunings'] controls the total number of steps below.
    rrfparams['n_prunings'] = 1
    kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
    pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
    testpreds = []  # per-fold test-set predictions
    i = 1
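    # per-fold metrics, accumulated below and stored on the sacred run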
    _run.info['loss'] = []
    _run.info['trainloss'] = []
    _run.info['best_pruning'] = []
    for itrain, itest in kf:
        Xtr, ytr, Xte, yte = data.get_train_test_features(
            itrain, itest, **aggregateparams)
        clf = ET(**clfparams)
        clf.fit(Xtr, ytr)
        rrf = RRF(clf, **rrfparams)
        # best-so-far starts from the unpruned forest, so predictions exist
        # even if the very first pruning step fails
        probs_te = clf.predict_proba(Xte)
        pred.iloc[itest, :] = probs_te
        testpred = clf.predict_proba(Xtest)
        best_loss = multiclass_log_loss(yte.values, probs_te)
        train_loss = multiclass_log_loss(ytr.values, clf.predict_proba(Xtr))
        no_improvement_in = 0
        for k in range(refineparams['n_prunings']):
            try:
                rrf.fit(Xtr, ytr)  # fit and do 1 pruning
            except IndexError:
                # RRF occasionally raises an IndexError when an unfortunate
                # tree gets pruned all the way down to its root; stop pruning
                # and keep the best-so-far predictions for this fold.
                print('IndexError during pruning, stopping early')
                break
            loss2tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
            print("Fold {} Pruning {} mlogloss train: {:.4f}, test: {:.4f}".
                  format(i, k, loss2tr, loss2te))
            if loss2te < best_loss:  # pruning improved the validation loss
                no_improvement_in = 0
                best_loss = loss2te
                # refresh out-of-fold and test predictions with the new model
                pred.iloc[itest, :] = rrf.predict_proba(Xte)
                testpred = rrf.predict_proba(Xtest)
                train_loss = loss2tr
            else:
                no_improvement_in += 1
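            # stop after five consecutive prunings without improvement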
            if no_improvement_in >= 5:
                break
        # collect this fold's best test-set predictions
        testpreds.append(
            pd.DataFrame(testpred, index=ytest.index, columns=pred_cols))
        # Save loss and train loss from current fold
        _run.info['loss'].append(best_loss)
        _run.info['trainloss'].append(train_loss)
        _run.info['best_pruning'].append(k + 1)  # pruning steps attempted
        print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
            i, train_loss, best_loss))
        i += 1
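    # overall out-of-fold loss across all folds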
    loss = multiclass_log_loss(y.values, pred.values)
    _run.info['features'] = list(Xtr.columns)

    # Optionally save out-of-fold predictions
    if save_oob_predictions:
        filename = '{}_{}_cv_{:.4f}.csv'.format(series, time, loss)
        pred.to_csv(filename, index_label='id')
    # Optionally save test predictions, averaged over the folds
    if save_test_predictions:
        testpred = sum(testpreds) / len(testpreds)
        filename = '{}_test_{}_cv_{:.4f}.csv'.format(series, time, loss)
        testpred.to_csv(filename, index_label='id')
    return loss


def xgbrun(series, n_folds, clfparams, featureparams, num_trees,
           early_stopping_rounds, verbose_eval, _seed, _run, aggregateparams,
           include, exclude, save_test_predictions, save_oob_predictions,
           skip_cross_validation):
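    """Cross-validated xgboost run (sacred-style captured function).

    The first fold can run early stopping to pick the number of boosting
    rounds; later folds and the final test-set model reuse that count.
    Per-fold losses are stored on the run via ``_run.info``.
    """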
    clfparams['seed'] = _seed  # seed xgboost from sacred's run seed
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    num_rounds = num_trees
    # kwargs for xgb.train; this copy covers the skip_cross_validation path,
    # the CV loop below rebuilds it per fold
    params = {"verbose_eval": verbose_eval}
    if skip_cross_validation:
        loss = 999.  # sentinel, no cross-validation loss available
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            dtrain = xgb.DMatrix(Xtr, ytr)
            dvalid = xgb.DMatrix(Xte, yte)
            params = {"verbose_eval": verbose_eval}
            if (i == 1) and (early_stopping_rounds > 0):
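                # Only the first fold runs early stopping; the resulting
                # round count is reused for later folds and the final model.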
                watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
                params["evals"] = watchlist
                params["early_stopping_rounds"] = early_stopping_rounds

            gbm = xgb.train(clfparams, dtrain, num_rounds, **params)

            if (i == 1) and (early_stopping_rounds > 0):
                # best_ntree_limit is only set when early stopping ran
                num_rounds = gbm.best_ntree_limit
                _run.info['num_rounds'] = num_rounds

            pred.iloc[itest, :] = gbm.predict(
                dvalid, ntree_limit=num_rounds).reshape(-1, len(pred_cols))
            predtrain = gbm.predict(
                dtrain, ntree_limit=num_rounds).reshape(-1, len(pred_cols))
            loss = multiclass_log_loss(yte.values, pred.iloc[itest].values)
            trainloss = multiclass_log_loss(ytr.values, predtrain)
            print("Fold {:02d}: trainloss = {:.4f}, testloss = {:.4f}".format(
                i, trainloss, loss))
            _run.info['loss'].append(loss)
            _run.info['trainloss'].append(trainloss)
            i += 1
        loss = multiclass_log_loss(y.values, pred.values)

        _run.info['features'] = list(Xtr.columns)
        # Optionally save oob predictions
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, time)
            pred.to_csv(filename, index_label='id')

    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        Xtr, ytr, Xtest, ytest = data.get_train_test_features(
            **aggregateparams)
        dtrain = xgb.DMatrix(Xtr, ytr)
        dtest = xgb.DMatrix(Xtest)
        gbm = xgb.train(clfparams, dtrain, num_rounds, **params)
        predtest = pd.DataFrame(
            gbm.predict(dtest).reshape(-1, len(pred_cols)),
            index=ytest.index,
            columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
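
# Minimal wiring sketch (an assumption, not part of the original excerpt):
# with sacred, ret/xgbrun would typically be registered on an Experiment
# whose config supplies the captured parameters, e.g.
#
#     from sacred import Experiment
#     ex = Experiment('telstra')
#
#     @ex.config
#     def cfg():
#         series = 'xgb'
#         n_folds = 10
#         # ... remaining captured parameters ...
#
#     ex.main(xgbrun)
#     ex.run()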