Example #1
0
def predictions(data):
    """Fit a logistic regression on the train region and sanity-check its
    predictions on every region.

    Checks, per region:
      - train/validation: logloss and per-era consistency within bounds
      - test/live: predictions fall inside a slightly widened train range

    Parameters
    ----------
    data : object
        Exposes array-like attributes ``region``, ``x``, ``y``, ``era``
        (assumed aligned row-wise — confirm against the Data class).

    Returns
    -------
    int
        Number of failed checks.
    """

    logging.info('PREDICTIONS')

    err_count = 0

    # fit logistic regression model on train data
    idx = data.region == 'train'
    xtrain = data.x[idx]
    ytrain = data.y[idx]
    eratrain = data.era[idx]
    clf = LogisticRegression()
    clf.fit(xtrain, ytrain)

    # predict using train data; keep probability of the positive class
    yhat_train = clf.predict_proba(xtrain)[:, 1]

    # check train logloss and consistency
    logloss = log_loss(ytrain, yhat_train)
    err_count += interval('train logloss', logloss, [0.691, 0.693])
    loglosses = logloss_by_era(eratrain, ytrain, yhat_train)
    # an era is "consistent" when it beats random guessing (logloss < ln 2)
    consistency = (loglosses < np.log(2)).mean()
    err_count += interval('train consistency', consistency, [0.57, 0.84])

    # predict using validation data
    yvalid, yhat = calc_yhat('validation', clf, data)

    # check validation logloss and consistency
    logloss = log_loss(yvalid, yhat)
    err_count += interval('validation logloss', logloss, [0.691, 0.693])
    idx = data.region == 'validation'
    loglosses = logloss_by_era(data.era[idx], yvalid, yhat)
    consistency = (loglosses < np.log(2)).mean()
    err_count += interval('validation consistency', consistency, [0.5, 0.84])

    # check test and live predictions stay within a 1% widened train range;
    # target and msg are loop-invariant, so hoist them out of the loop
    target = [0.99 * yhat_train.min(), 1.01 * yhat_train.max()]
    msg = 'predictions in %s region'
    for region in ('test', 'live'):
        _, yhat = calc_yhat(region, clf, data)  # targets unused here
        err_count += array_interval(msg % region, yhat, target)

    return err_count
Example #2
0
def features(data):
    """Run sanity checks on the feature matrix.

    Verifies that all feature values are finite, that pairwise feature
    correlations fall in the expected band, and that each feature's
    distribution (range, mean, std, skewness, kurtosis) is within bounds
    in every era.

    Parameters
    ----------
    data : object
        Exposes ``x`` (feature matrix) and an ``era_feature_iter()``
        yielding ``(era, feature_num, x)`` per era/feature pair.

    Returns
    -------
    int
        Number of failed checks.
    """

    logging.info('FEATURES')

    failures = 0

    # every feature value must be finite (no NaN / inf)
    bad = (~np.isfinite(data.x)).sum()
    failures += _assert('nonfinite feature values', bad, '==', 0)

    # absolute pairwise correlation between features (upper triangle only,
    # so each pair is counted once)
    abs_corr = np.abs(upper_triangle(np.corrcoef(data.x.T)))
    failures += interval('mean abs corr of features', abs_corr.mean(),
                         [0.18, 0.22])
    failures += interval('max  abs corr of features', abs_corr.max(),
                         [0.72, 0.76])

    # distribution checks for each feature within each era
    for era, fnum, x in data.era_feature_iter():

        label = era.ljust(6)
        mean = x.mean()
        std = x.std()

        failures += array_interval(
            'range of feature %2d in %s' % (fnum, label), x, [0, 1])

        failures += interval(
            'mean  of feature %2d in %s' % (fnum, label), mean,
            [0.45, 0.551])

        failures += interval(
            'std   of feature %2d in %s' % (fnum, label), std,
            [0.09, 0.15])

        # standardized third central moment
        skew = ((x - mean)**3).mean() / std**3
        failures += interval(
            'skewn of feature %2d in %s' % (fnum, label), skew,
            [-0.44, 0.44])

        # standardized fourth central moment (normal distribution ~ 3)
        kurt = ((x - mean)**4).mean() / std**4
        failures += interval(
            'kurto of feature %2d in %s' % (fnum, label), kurt,
            [2.45, 3.58])

    return failures