def test_gradient_boosting(n_samples=1000):
    """
    Check that gradient boosting trains acceptably with every loss function.

    Each classification loss is fitted with UGradientBoostingClassifier and must
    reach a score of at least 0.7 on a separately generated sample; each
    regression/ranking loss is fitted with UGradientBoostingRegressor and must
    reach a ROC AUC of at least 0.7 on that same sample.

    :param n_samples: number of events requested from generate_sample for each
        of the two samples (also used to slice the training data before fitting)
    """
    # Samples correlated with the first variable; the evaluation sample is
    # generated first, then the training sample, with identical settings.
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # The uniformity-aware losses are asked to be uniform along this variable
    uniform_features = ['column0']

    classification_losses = [
        LogLossFunction(),
        AdaLossFunction(),
        losses.CompositeLossFunction(),
        losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1),
        losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1]),
        losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0),
        losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]),
        losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1),
        losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]),
    ]

    # Hyper-parameters shared by every classifier; only the loss varies
    classifier_settings = dict(min_samples_split=20, max_depth=5, learning_rate=0.2,
                               subsample=0.7, n_estimators=25, train_features=None)
    for loss in classification_losses:
        booster = UGradientBoostingClassifier(loss=loss, **classifier_settings)
        clf = booster.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

    # Random integer column (values 0..3) that RankBoostLossFunction is pointed
    # at through request_column
    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    regression_losses = [
        losses.MSELossFunction(),
        losses.MAELossFunction(),
        losses.RankBoostLossFunction(request_column='fake_request'),
    ]
    for loss in regression_losses:
        print(loss)
        # Every column except the first one is used for training
        selected_columns = list(trainX.columns[1:])
        regressor = UGradientBoostingRegressor(loss=loss, max_depth=3, n_estimators=50,
                                               learning_rate=0.01, subsample=0.5,
                                               train_features=selected_columns)
        regressor.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, regressor.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)
def test_gradient_boosting(n_samples=1000):
    """
    Verify that GradientBoosting is usable with each of the loss functions.

    Classification losses are checked through ``score`` of a fitted
    UGradientBoostingClassifier, regression/ranking losses through the ROC AUC
    of a fitted UGradientBoostingRegressor; both must be at least 0.7 on a
    sample generated independently of the training one.
    """
    # Generate samples correlated with the first variable (evaluation sample
    # first, training sample second)
    distance = 0.6
    eval_data, eval_labels = generate_sample(n_samples, 10, distance)
    fit_data, fit_labels = generate_sample(n_samples, 10, distance)
    # Variable along which a uniform distribution is requested
    uniform_features = ['column0']

    flatness_options = {'fl_coefficient': 2.}
    classifier_losses = (
        LogLossFunction(),
        AdaLossFunction(),
        losses.CompositeLossFunction(),
        losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1),
        losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1]),
        losses.BinFlatnessLossFunction(uniform_features, uniform_label=0, **flatness_options),
        losses.BinFlatnessLossFunction(uniform_features, uniform_label=[0, 1], **flatness_options),
        losses.KnnFlatnessLossFunction(uniform_features, uniform_label=1, **flatness_options),
        losses.KnnFlatnessLossFunction(uniform_features, uniform_label=[0, 1], **flatness_options),
    )

    for loss in classifier_losses:
        classifier = UGradientBoostingClassifier(
            loss=loss,
            min_samples_split=20,
            max_depth=5,
            learning_rate=0.2,
            subsample=0.7,
            n_estimators=25,
            train_features=None,
        )
        # fit() result is what gets scored, exactly as in a chained call
        fitted = classifier.fit(fit_data[:n_samples], fit_labels[:n_samples])
        result = fitted.score(eval_data, eval_labels)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

    # Extra column of random integers 0..3, named as RankBoost's request_column
    fit_data['fake_request'] = numpy.random.randint(0, 4, size=len(fit_data))
    regressor_losses = (
        losses.MSELossFunction(),
        losses.MAELossFunction(),
        losses.RankBoostLossFunction(request_column='fake_request'),
    )
    for loss in regressor_losses:
        print(loss)
        regressor = UGradientBoostingRegressor(
            loss=loss,
            max_depth=3,
            n_estimators=50,
            learning_rate=0.01,
            subsample=0.5,
            # all columns but the first
            train_features=list(fit_data.columns[1:]),
        )
        regressor.fit(fit_data, fit_labels)
        predictions = regressor.predict(eval_data)
        roc_auc = roc_auc_score(eval_labels, predictions)
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)
# Example no. 3
# 0
# Feature matrices and 'signal' targets for the training and validation frames
X_train, y_train = train[features], train['signal']
X_val, y_val = test[features], test['signal']

# Flatness loss configured on the 'mass' column for label 0
loss = BinFlatnessLossFunction(
    ['mass'], n_bins=15, uniform_label=0, fl_coefficient=15, power=2
)
ugbc = UGradientBoostingClassifier(
    loss=loss,
    n_estimators=550,
    max_depth=6,
    learning_rate=0.15,
    train_features=features,
    subsample=0.7,
    random_state=123,
)

# The frame passed to fit carries 'mass' in addition to `features`: the loss
# was constructed with ['mass'], while train_features is just `features`.
ugbc.fit(train[features + ['mass']], y_train)

pred_raw = ugbc.predict(X_val)
pred = pd.DataFrame(data={'signal': pred_raw})

# Rows where the prediction equals the label AND the label is set
# (presumably true positives -- assumes 'signal' holds 0/1 labels; confirm),
# followed by the total of the validation labels.
print(((pred_raw == y_val) & y_val).sum())
print(y_val.sum())