Example #1
def test_gradient_boosting(n_samples=1000):
    # Generate samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get a uniform distribution along this variable
    uniform_variables = ['column0']
    n_estimators = 20

    loss1 = SimpleKnnLossFunction(uniform_variables)
    # loss2 = PairwiseKnnLossFunction(uniform_variables, knn=10)
    loss3 = BinomialDevianceLossFunction()
    # loss4 = RandomKnnLossFunction(uniform_variables, samples * 2, knn=5, knn_factor=3)
    # loss5 = DistanceBasedKnnFunction(uniform_variables, knn=10, distance_dependence=lambda r: numpy.exp(-0.1 * r))
    loss6bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])
    # loss8 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=1)
    # loss9 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])

    for loss in [loss1, loss3, loss6bin, loss7bin, loss6knn, loss7knn]:
        result = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                             subsample=0.7, n_estimators=n_estimators, train_variables=None) \
            .fit(trainX[:n_samples], trainY[:n_samples]).score(testX, testY)
        assert result >= 0.7, "The quality is too poor: %.3f" % result

    for loss in [loss1, loss3]:
        check_gradient(loss)

    print('uniform gradient boosting is ok')
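For readers who want to try one of these flatness-aware losses outside the test harness, here is a minimal sketch. It reuses only calls already shown above (generate_sample, BinFlatnessLossFunction, uGradientBoostingClassifier) and assumes those names are importable from the same package as the tests; it is an illustration, not part of the original example.

# minimal usage sketch (assumes the same names as in the test above are importable)
trainX, trainY = generate_sample(2000, 10, 0.6)
loss = BinFlatnessLossFunction(['column0'], ada_coefficient=0.5)
clf = uGradientBoostingClassifier(loss=loss, n_estimators=50, max_depth=5,
                                  learning_rate=0.2, subsample=0.7,
                                  min_samples_split=20, train_variables=None)
clf.fit(trainX, trainY)
proba = clf.predict_proba(trainX)  # class probabilities, shape (n_samples, 2), as checked in the later examples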
Example #2
def test_gradient_boosting(n_samples=1000):
    # Generate samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get a uniform distribution along this variable
    uniform_variables = ['column0']
    n_estimators = 20

    loss1 = SimpleKnnLossFunction(uniform_variables)
    # loss2 = PairwiseKnnLossFunction(uniform_variables, knn=10)
    loss3 = BinomialDevianceLossFunction()
    # loss4 = RandomKnnLossFunction(uniform_variables, samples * 2, knn=5, knn_factor=3)
    # loss5 = DistanceBasedKnnFunction(uniform_variables, knn=10, distance_dependence=lambda r: numpy.exp(-0.1 * r))
    loss6bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7bin = BinFlatnessLossFunction(uniform_variables,
                                       ada_coefficient=0.5,
                                       uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7knn = KnnFlatnessLossFunction(uniform_variables,
                                       ada_coefficient=0.5,
                                       uniform_label=[0, 1])
    # loss8 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=1)
    # loss9 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])

    for loss in [loss1, loss3, loss6bin, loss7bin, loss6knn, loss7knn]:
        result = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                             subsample=0.7, n_estimators=n_estimators, train_variables=None) \
            .fit(trainX[:n_samples], trainY[:n_samples]).score(testX, testY)
        assert result >= 0.7, "The quality is too poor: %.3f" % result
Example #3
def test_gb_with_ada(n_samples=1000, n_features=10, distance=0.6):
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    loss = BinomialDevianceLossFunction()
    clf = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                      subsample=0.7, n_estimators=10, train_variables=None)
    clf.fit(trainX, trainY)
    assert clf.n_features == n_features
    assert len(clf.feature_importances_) == n_features
    # check that staged_predict_proba yields per-stage probabilities of the right shape
    for p in clf.staged_predict_proba(testX):
        assert p.shape == (n_samples, 2)
    # the last staged prediction must coincide with the final predict_proba
    assert numpy.all(p == clf.predict_proba(testX))
Example #4
def test_gb_with_ada(n_samples=1000, n_features=10, distance=0.6):
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    loss = BinomialDevianceLossFunction()
    clf = uGradientBoostingClassifier(loss=loss,
                                      min_samples_split=20,
                                      max_depth=5,
                                      learning_rate=.2,
                                      subsample=0.7,
                                      n_estimators=10,
                                      train_variables=None)
    clf.fit(trainX, trainY)
    assert clf.n_features == n_features
    assert len(clf.feature_importances_) == n_features
    # check that staged_predict_proba yields per-stage probabilities of the right shape
    for p in clf.staged_predict_proba(testX):
        assert p.shape == (n_samples, 2)
    # the last staged prediction must coincide with the final predict_proba
    assert numpy.all(p == clf.predict_proba(testX))
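staged_predict_proba, used above only to check shapes, returns the class probabilities after each boosting stage, so it can also monitor quality as estimators are added. A small sketch of such a loop, assuming a fitted clf as above and roc_auc_score from sklearn.metrics (the same metric used in the next example):

from sklearn.metrics import roc_auc_score

# hypothetical monitoring loop: test AUC after every boosting stage
for stage, p in enumerate(clf.staged_predict_proba(testX)):
    print(stage, roc_auc_score(testY, p[:, 1]))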
Example #5
def test_gb_quality(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    # Multiply the features by a random matrix and add a random shift
    multiplier = numpy.random.normal(size=[n_features, n_features])
    shift = numpy.random.normal(size=[1, n_features]) * 5
    trainX = numpy.dot(trainX.values, multiplier) + shift
    testX = numpy.dot(testX.values, multiplier) + shift

    boosters = {
        'old_boost': GradientBoostingClassifier(n_estimators=100, min_samples_split=50, max_depth=5, subsample=0.3),
        'fast+old_tree': CommonGradientBoosting(n_estimators=100,
            base_estimator=DecisionTreeRegressor(min_samples_split=50, max_depth=5)),
        'fast+neuro': TreeGradientBoostingClassifier(n_estimators=100, update_tree=True,
                                                     base_estimator=FastNeuroTreeRegressor()),
        'fold+tree': FoldingGBClassifier(loss=BinomialDeviance(), n_estimators=10, update_tree=True,
                                         base_estimator=FastNeuroTreeRegressor()),
        'ugb': uGradientBoostingClassifier(loss=AdaLossFunction(),
            n_estimators=100, min_samples_split=50, max_depth=5, update_tree=True, subsample=0.3)
    }

    for criterion in ['mse', # 'fmse', # 'pvalue',
                      # 'significance',
                      'significance2',
                      # 'gini',
                      'entropy',
                      'poisson'
    ]:
        boosters['fast-' + criterion[:4]] = TreeGradientBoostingClassifier(n_estimators=100, update_tree=True,
            base_estimator=FastTreeRegressor(criterion=criterion))

    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
Example #6
def test_gb_quality(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples,
                                     n_features=n_features,
                                     distance=distance)
    testX, testY = generate_sample(n_samples=n_samples,
                                   n_features=n_features,
                                   distance=distance)

    # Multiply the features by a random matrix and add a random shift
    multiplier = numpy.random.normal(size=[n_features, n_features])
    shift = numpy.random.normal(size=[1, n_features]) * 5
    trainX = numpy.dot(trainX.values, multiplier) + shift
    testX = numpy.dot(testX.values, multiplier) + shift

    boosters = {
        'old_boost':
        GradientBoostingClassifier(n_estimators=100,
                                   min_samples_split=50,
                                   max_depth=5,
                                   subsample=0.3),
        'fast+old_tree':
        CommonGradientBoosting(n_estimators=100,
                               base_estimator=DecisionTreeRegressor(
                                   min_samples_split=50, max_depth=5)),
        'fast+neuro':
        TreeGradientBoostingClassifier(
            n_estimators=100,
            update_tree=True,
            base_estimator=FastNeuroTreeRegressor()),
        'fold+tree':
        FoldingGBClassifier(loss=BinomialDeviance(),
                            n_estimators=10,
                            update_tree=True,
                            base_estimator=FastNeuroTreeRegressor()),
        'ugb':
        uGradientBoostingClassifier(loss=AdaLossFunction(),
                                    n_estimators=100,
                                    min_samples_split=50,
                                    max_depth=5,
                                    update_tree=True,
                                    subsample=0.3)
    }

    for criterion in [
            'mse',  # 'fmse', # 'pvalue',
            # 'significance',
            'significance2',
            # 'gini',
            'entropy',
            'poisson'
    ]:
        boosters['fast-' + criterion[:4]] = TreeGradientBoostingClassifier(
            n_estimators=100,
            update_tree=True,
            base_estimator=FastTreeRegressor(criterion=criterion))

    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
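The benchmark above only prints each booster's timing and AUC as it goes. A small variant, shown here as a sketch and reusing only the calls already present in the example, collects the results first so the boosters can be compared side by side:

results = {}
for name, booster in boosters.items():
    start = time.time()
    booster.fit(trainX, trainY)
    auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
    results[name] = (time.time() - start, auc)

# hypothetical reporting step: best AUC first
for name, (spent, auc) in sorted(results.items(), key=lambda kv: -kv[1][1]):
    print("{:<15} spent: {:6.2f}s auc: {:.4f}".format(name, spent, auc))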