Exemple #1
0
def rrf(series, n_folds, clfparams, featureparams, aggregateparams,
        refineparams, include, exclude, save_test_predictions,
        save_oob_predictions, skip_cross_validation, _run):
    """Train an ``RF`` classifier refined by ``RRF`` and report its CV loss.

    Unless ``skip_cross_validation`` is true, a shuffled stratified K-fold
    is run over the training labels. For every fold a fresh ``RF`` is
    fitted, wrapped in ``RRF`` and refitted, and the train/test multiclass
    log losses are appended to ``_run.info['trainloss']`` and
    ``_run.info['loss']``.

    Args:
        series: Prefix used for the names of the CSV files written.
        n_folds: Number of folds passed to ``StratifiedKFold``.
        clfparams: Keyword arguments for ``RF``.
        featureparams: Extra keyword arguments for ``TelstraData``.
        aggregateparams: Keyword arguments forwarded to
            ``data.get_train_test_features``.
        refineparams: Keyword arguments for ``RRF``.
        include: Forwarded to ``TelstraData`` as ``include``.
        exclude: Forwarded to ``TelstraData`` as ``exclude``.
        save_test_predictions: When true, fit on the split returned by
            ``get_train_test_features`` without fold indices and write the
            predictions to ``'<series>_test_<timestamp>.csv'``.
        save_oob_predictions: When true (and cross-validation ran), write
            the out-of-fold predictions to ``'<series>_<timestamp>.csv'``.
        skip_cross_validation: When true, no folds are evaluated.
        _run: Object exposing a dict-like ``info`` attribute used to record
            metrics (presumably a Sacred run -- confirm against the caller).

    Returns:
        The multiclass log loss of the out-of-fold predictions, or the
        placeholder ``999.`` when cross-validation is skipped.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    if skip_cross_validation:
        loss = 999.  # placeholder: no CV loss was computed
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        # Out-of-fold predictions, filled in one test fold at a time
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for fold, (itrain, itest) in enumerate(kf, start=1):
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)
            refined = RRF(clf, **refineparams)
            refined.fit(Xtr, ytr)
            loss2tr = multiclass_log_loss(ytr.values,
                                          refined.predict_proba(Xtr))
            # Predict the held-out fold once; reused for loss and storage
            proba_te = refined.predict_proba(Xte)
            loss2te = multiclass_log_loss(yte.values, proba_te)
            _run.info['loss'].append(loss2te)
            _run.info['trainloss'].append(loss2tr)
            print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
                fold, loss2tr, loss2te))
            pred.iloc[itest, :] = proba_te
        loss = multiclass_log_loss(y.values, pred.values)
        # Feature names are taken from the last fold's training frame
        _run.info['features'] = list(Xtr.columns)
        # Optionally save oob predictions
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, timestamp)
            pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, timestamp)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)
        refined = RRF(clf, **refineparams)
        refined.fit(Xtr, ytr)
        predtest = pd.DataFrame(refined.predict_proba(Xte),
                                index=yte.index,
                                columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
Exemple #2
0
def rrf(series, n_folds, clfparams, featureparams, aggregateparams, refineparams, include, exclude,
        save_test_predictions, save_oob_predictions, skip_cross_validation, _run):
    """Cross-validate and/or predict with an ``RF`` model refined by ``RRF``.

    The cross-validation branch (taken when ``skip_cross_validation`` is
    false) evaluates ``n_folds`` shuffled stratified folds, logs the
    per-fold train and test multiclass log loss into ``_run.info`` under
    ``'trainloss'`` and ``'loss'``, stores the training column names under
    ``'features'`` and, if ``save_oob_predictions`` is true, writes the
    out-of-fold predictions to ``'<series>_<timestamp>.csv'``.

    Independently, when ``save_test_predictions`` is true, a model is
    fitted on the split returned by ``get_train_test_features`` called
    without fold indices and its predictions are written to
    ``'<series>_test_<timestamp>.csv'``.

    Args:
        series: File-name prefix for the CSV outputs.
        n_folds: Fold count for ``StratifiedKFold``.
        clfparams: Keyword arguments for ``RF``.
        featureparams: Extra keyword arguments for ``TelstraData``.
        aggregateparams: Keyword arguments for
            ``data.get_train_test_features``.
        refineparams: Keyword arguments for ``RRF``.
        include: Passed to ``TelstraData`` as ``include``.
        exclude: Passed to ``TelstraData`` as ``exclude``.
        save_test_predictions: Whether to write test-set predictions.
        save_oob_predictions: Whether to write out-of-fold predictions.
        skip_cross_validation: Whether to bypass the K-fold evaluation.
        _run: Object with a dict-like ``info`` attribute for metrics
            (looks like a Sacred run object -- TODO confirm).

    Returns:
        Overall out-of-fold multiclass log loss, or ``999.`` when the
        cross-validation was skipped.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    if skip_cross_validation:
        # Sentinel value: there is no CV loss to report
        loss = 999.
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for fold_no, (itrain, itest) in enumerate(kf, start=1):
            Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)
            model = RRF(clf, **refineparams)
            model.fit(Xtr, ytr)
            train_proba = model.predict_proba(Xtr)
            # Computed once and shared by the loss and the OOF frame
            test_proba = model.predict_proba(Xte)
            loss2tr = multiclass_log_loss(ytr.values, train_proba)
            loss2te = multiclass_log_loss(yte.values, test_proba)
            _run.info['loss'].append(loss2te)
            _run.info['trainloss'].append(loss2tr)
            print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(fold_no, loss2tr, loss2te))
            pred.iloc[itest, :] = test_proba
        loss = multiclass_log_loss(y.values, pred.values)
        # Column names come from the training frame of the final fold
        _run.info['features'] = list(Xtr.columns)
        # Optionally save oob predictions
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, stamp)
            pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, stamp)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)
        model = RRF(clf, **refineparams)
        model.fit(Xtr, ytr)
        predtest = pd.DataFrame(model.predict_proba(Xte),
                                index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
def ret(series, n_folds, clfparams, featureparams, aggregateparams,
        refineparams, include, exclude, save_test_predictions,
        save_oob_predictions, skip_cross_validation, _run):
    """Cross-validate an ``ET`` model refined by ``RRF`` with early stopping.

    For each shuffled stratified fold an ``ET`` classifier is fitted and
    wrapped in an ``RRF`` configured with ``n_prunings=1``, so that every
    ``fit`` call performs a single pruning step. Up to
    ``refineparams['n_prunings']`` steps are run; the out-of-fold and test
    predictions of the step with the lowest held-out loss are kept, and the
    loop stops after 5 consecutive steps without improvement or when
    ``fit`` raises ``IndexError``.

    Per-fold results are appended to ``_run.info`` under ``'loss'``,
    ``'trainloss'`` and ``'best_pruning'`` (the 1-based number of pruning
    steps that produced the best held-out loss, ``0`` if no step improved
    on the initial value). The out-of-fold predictions and the average of
    the per-fold test predictions are written to CSV files whose names
    contain ``series``, a timestamp and the overall CV loss.

    Args:
        series: File-name prefix for the CSV outputs.
        n_folds: Number of folds for ``StratifiedKFold``.
        clfparams: Keyword arguments for ``ET``.
        featureparams: Extra keyword arguments for ``TelstraData``.
        aggregateparams: Keyword arguments for
            ``data.get_train_test_features``.
        refineparams: Keyword arguments for ``RRF``; ``'n_prunings'`` is
            read here as the maximum number of pruning steps.
        include: Passed to ``TelstraData`` as ``include``.
        exclude: Passed to ``TelstraData`` as ``exclude``.
        save_test_predictions: Accepted but not read by this function.
        save_oob_predictions: Accepted but not read by this function.
        skip_cross_validation: Accepted but not read by this function.
        _run: Object with a dict-like ``info`` attribute for metrics
            (presumably a Sacred run object -- confirm against the caller).

    Returns:
        The multiclass log loss of the out-of-fold predictions.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # The full-train features are not needed here, only labels and test set
    _, y, Xtest, ytest = data.get_train_test_features(**aggregateparams)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    max_prunings = refineparams['n_prunings']
    # One pruning per fit() call so the loss can be checked after each step
    rrfparams = dict(refineparams)
    rrfparams['n_prunings'] = 1
    kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
    pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
    testpreds = []  # list for storing test predictions
    _run.info['loss'] = []
    _run.info['trainloss'] = []
    _run.info['best_pruning'] = []
    for fold, (itrain, itest) in enumerate(kf, start=1):
        Xtr, ytr, Xte, yte = data.get_train_test_features(
            itrain, itest, **aggregateparams)
        clf = ET(**clfparams)
        clf.fit(Xtr, ytr)
        refined = RRF(clf, **rrfparams)
        best_loss = 1000.
        train_loss = 1000.
        best_pruning = 0
        # Reset per fold so a failed fold never reuses an earlier prediction
        testpred = None
        no_improvement_in = 0
        for k in range(max_prunings):
            try:
                refined.fit(Xtr, ytr)  # fit and do 1 pruning
            except IndexError:
                # Sometimes raised when an unfortunate tree gets cut down
                # to the root; stop and keep the best-so-far prediction.
                print('IndexError')
                break
            loss2tr = multiclass_log_loss(ytr.values,
                                          refined.predict_proba(Xtr))
            proba_te = refined.predict_proba(Xte)
            loss2te = multiclass_log_loss(yte.values, proba_te)
            print("Fold {} Pruning {} mlogloss train: {:.4f}, test: {:.4f}".
                  format(fold, k, loss2tr, loss2te))
            if loss2te < best_loss:  # performance is better
                no_improvement_in = 0
                best_loss = loss2te
                best_pruning = k + 1
                # predict oof samples with new model
                pred.iloc[itest, :] = proba_te
                # predict test with new model
                testpred = refined.predict_proba(Xtest)
                # record current train loss
                train_loss = loss2tr
            else:
                no_improvement_in += 1
            if no_improvement_in >= 5:
                break
        if testpred is None:
            print("Fold {}: no usable pruning step, test prediction "
                  "skipped".format(fold))
        else:
            testpreds.append(
                pd.DataFrame(testpred, index=ytest.index, columns=pred_cols))
        # Save loss and train loss from current fold
        _run.info['loss'].append(best_loss)
        _run.info['trainloss'].append(train_loss)
        _run.info['best_pruning'].append(best_pruning)
        print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
            fold, train_loss, best_loss))
    loss = multiclass_log_loss(y.values, pred.values)
    _run.info['features'] = list(Xtr.columns)

    filename = '{}_{}_cv_{:.4f}.csv'.format(series, timestamp, loss)
    pred.to_csv(filename, index_label='id')
    # Average the per-fold test predictions and save them
    if testpreds:
        testpred = sum(testpreds) / len(testpreds)
        filename = '{}_test_{}_cv_{:.4f}.csv'.format(series, timestamp, loss)
        testpred.to_csv(filename, index_label='id')
    else:
        print("No fold produced a test prediction, test file not written")
    return loss
def rf(series, n_folds, clfparams, featureparams, aggregateparams, refineparams, include, exclude,
        save_test_predictions, save_oob_predictions, skip_cross_validation, _run):
    """Cross-validate an ``RF`` model refined by ``RRF`` with early stopping.

    Every shuffled stratified fold fits an ``RF`` classifier and wraps it in
    an ``RRF`` whose ``n_prunings`` is forced to 1, so each ``fit`` call
    does one pruning step. At most ``refineparams['n_prunings']`` steps are
    attempted per fold. The predictions of the step with the lowest
    held-out loss are retained; the loop ends early after 5 consecutive
    non-improving steps or when ``fit`` raises ``IndexError``.

    ``_run.info`` receives per-fold lists under ``'loss'``, ``'trainloss'``
    and ``'best_pruning'`` (1-based count of pruning steps at the best
    held-out loss, ``0`` when no step improved on the initial value), plus
    the training column names under ``'features'``. Out-of-fold predictions
    and the mean of the per-fold test predictions are saved as CSV files
    named from ``series``, a timestamp and the overall CV loss.

    Args:
        series: Prefix for the output file names.
        n_folds: Fold count for ``StratifiedKFold``.
        clfparams: Keyword arguments for ``RF``.
        featureparams: Extra keyword arguments for ``TelstraData``.
        aggregateparams: Keyword arguments for
            ``data.get_train_test_features``.
        refineparams: Keyword arguments for ``RRF``; its ``'n_prunings'``
            entry is used as the per-fold cap on pruning steps.
        include: Forwarded to ``TelstraData`` as ``include``.
        exclude: Forwarded to ``TelstraData`` as ``exclude``.
        save_test_predictions: Not read by this function.
        save_oob_predictions: Not read by this function.
        skip_cross_validation: Not read by this function.
        _run: Object with a dict-like ``info`` attribute for metrics
            (looks like a Sacred run object -- TODO confirm).

    Returns:
        Multiclass log loss computed over all out-of-fold predictions.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    # Only the labels and the test features/index are used from this split
    _, y, Xtest, ytest = data.get_train_test_features(**aggregateparams)
    stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    pruning_budget = refineparams['n_prunings']
    # Single pruning per fit() so the held-out loss is checked at each step
    rrfparams = dict(refineparams)
    rrfparams['n_prunings'] = 1
    kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
    pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
    testpreds = []  # list for storing test predictions
    _run.info['loss'] = []
    _run.info['trainloss'] = []
    _run.info['best_pruning'] = []
    for fold_no, (itrain, itest) in enumerate(kf, start=1):
        Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)
        model = RRF(clf, **rrfparams)
        best_loss = 1000.
        train_loss = 1000.
        best_step = 0
        # Cleared each fold: a stale value must not leak from the last fold
        fold_testpred = None
        stale_rounds = 0
        for k in range(pruning_budget):
            try:
                model.fit(Xtr, ytr)  # fit and do 1 pruning
            except IndexError:
                # Stop pruning and fall back on the best model seen so far
                print('IndexError')
                break
            loss2tr = multiclass_log_loss(ytr.values, model.predict_proba(Xtr))
            test_proba = model.predict_proba(Xte)
            loss2te = multiclass_log_loss(yte.values, test_proba)
            print("Fold {} Pruning {} mlogloss train: {:.4f}, test: {:.4f}".format(fold_no, k, loss2tr, loss2te))
            if loss2te < best_loss:  # performance is better
                stale_rounds = 0
                best_loss = loss2te
                best_step = k + 1
                # predict oof samples with new model
                pred.iloc[itest, :] = test_proba
                # predict test with new model
                fold_testpred = model.predict_proba(Xtest)
                # record current train loss
                train_loss = loss2tr
            else:
                stale_rounds += 1
            if stale_rounds >= 5:
                break
        if fold_testpred is not None:
            testpreds.append(pd.DataFrame(fold_testpred,
                                          index=ytest.index, columns=pred_cols))
        else:
            print("Fold {}: no usable pruning step, test prediction skipped".format(fold_no))
        # Save loss and train loss from current fold
        _run.info['loss'].append(best_loss)
        _run.info['trainloss'].append(train_loss)
        _run.info['best_pruning'].append(best_step)
        print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(fold_no, train_loss, best_loss))
    loss = multiclass_log_loss(y.values, pred.values)
    _run.info['features'] = list(Xtr.columns)

    filename = '{}_{}_cv_{:.4f}.csv'.format(series, stamp, loss)
    pred.to_csv(filename, index_label='id')
    # Average the per-fold test predictions and save them
    if testpreds:
        testpred = sum(testpreds) / len(testpreds)
        filename = '{}_test_{}_cv_{:.4f}.csv'.format(series, stamp, loss)
        testpred.to_csv(filename, index_label='id')
    else:
        print("No fold produced a test prediction, test file not written")
    return loss