Example no. 1
def test_quality(n_samples=3000):
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)

    params = {
        'n_neighbors': 10,
        'n_estimators': 10,
        'uniform_features': ['column0'],
        'uniform_label': 1,
        'base_estimator': DecisionTreeClassifier(min_samples_leaf=20, max_depth=5)
    }

    for algorithm in ['SAMME', 'SAMME.R']:
        uboost_classifier = uBoostClassifier(
            algorithm=algorithm, efficiency_steps=5, **params)

        bdt_classifier = uBoostBDT(algorithm=algorithm, **params)

        for classifier in [bdt_classifier, uboost_classifier]:
            classifier.fit(trainX, trainY)
            predict_proba = classifier.predict_proba(testX)
            predict = classifier.predict(testX)
            assert roc_auc_score(testY, predict_proba[:, 1]) > 0.7, \
                "quality is awful"
            print("Accuracy = %.3f" % accuracy_score(testY, predict))
def setUp(self, n_samples=1000, n_features=5):
    self.trainX, self.trainY = generate_sample(n_samples=n_samples, n_features=n_features)
    self.testX, self.testY = generate_sample(n_samples=n_samples, n_features=n_features)
    self.trainW = numpy.ones(n_samples)
    self.testW = numpy.ones(n_samples)
    self.uniform_variables = self.trainX.columns[:1]
    self.train_variables = self.trainX.columns[1:]
Example no. 3
def test_gradient_boosting(n_samples=1000):
    """
    Testing that GradientBoosting works with different loss functions
    """
    # Generating some samples correlated with first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = CompositeLossFunction()
    loss4 = KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1)
    loss5 = KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1])
    loss6bin = BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0)
    loss7bin = BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1)
    loss7knn = KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])

    for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)
def test_gradient_boosting(n_samples=1000):
    """
    Testing that GradientBoosting works with different loss functions
    """
    # Generating some samples correlated with first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features,
                                      uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features,
                                      uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=[0, 1])

    for loss in [
            loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn,
            loss7knn
    ]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(
            result, loss)

    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [
            losses.MSELossFunction(),
            losses.MAELossFunction(),
            losses.RankBoostLossFunction(request_column='fake_request')
    ]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss,
                                         max_depth=3,
                                         n_estimators=50,
                                         learning_rate=0.01,
                                         subsample=0.5,
                                         train_features=list(
                                             trainX.columns[1:]))
        clf.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(
            roc_auc, loss)
Example no. 5
def test_gradient_boosting(n_samples=1000):
    # Generating some samples correlated with first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get uniform distribution along this variable
    uniform_variables = ['column0']
    n_estimators = 20

    loss1 = SimpleKnnLossFunction(uniform_variables)
    # loss2 = PairwiseKnnLossFunction(uniform_variables, knn=10)
    loss3 = BinomialDevianceLossFunction()
    # loss4 = RandomKnnLossFunction(uniform_variables, samples * 2, knn=5, knn_factor=3)
    # loss5 = DistanceBasedKnnFunction(uniform_variables, knn=10, distance_dependence=lambda r: numpy.exp(-0.1 * r))
    loss6bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])
    # loss8 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=1)
    # loss9 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])

    for loss in [loss1, loss3, loss6bin, loss7bin, loss6knn, loss7knn]:
        result = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                             subsample=0.7, n_estimators=n_estimators, train_variables=None) \
            .fit(trainX[:n_samples], trainY[:n_samples]).score(testX, testY)
        assert result >= 0.7, "The quality is too poor: %.3f" % result

    for loss in [loss1, loss3, ]:
        check_gradient(loss)

    print('uniform gradient boosting is ok')
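
The check_gradient helper called just above is not defined anywhere in this listing. A minimal sketch (an assumption about its behaviour, not the original helper) that fits the loss on a fresh sample and compares loss.negative_gradient with a central finite difference of the loss:

import numpy

def check_gradient(loss, n_samples=100, epsilon=1e-3):
    # assumed behaviour: fit the loss, then compare analytic and numerical gradients
    X, y = generate_sample(n_samples, 10, 0.6)
    loss.fit(X, y, sample_weight=numpy.ones(n_samples))
    pred = numpy.random.normal(size=n_samples)
    gradient = loss.negative_gradient(pred)
    numeric = numpy.zeros(n_samples)
    for i in range(n_samples):
        pred_plus, pred_minus = pred.copy(), pred.copy()
        pred_plus[i] += epsilon
        pred_minus[i] -= epsilon
        numeric[i] = -(loss(pred_plus) - loss(pred_minus)) / (2. * epsilon)
    assert numpy.allclose(gradient, numeric, atol=1e-4), 'gradient mismatch'
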
def test_gb_ranking(n_samples=1000):
    """
    Testing RankingLossFunction
    """
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)

    rank_variable = 'column1'
    trainX[rank_variable] = numpy.random.randint(0, 3, size=len(trainX))
    testX[rank_variable] = numpy.random.randint(0, 3, size=len(testX))

    rank_loss1 = losses.RankBoostLossFunction(request_column=rank_variable,
                                              update_iterations=1)
    rank_loss2 = losses.RankBoostLossFunction(request_column=rank_variable,
                                              update_iterations=2)
    rank_loss3 = losses.RankBoostLossFunction(request_column=rank_variable,
                                              update_iterations=10)

    for loss in [rank_loss1, rank_loss2, rank_loss3]:
        clf = UGradientBoostingRegressor(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                         subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = roc_auc_score(testY, clf.predict(testX))
        assert result >= 0.8, "The quality is too poor: {} with loss: {}".format(
            result, loss)
Example no. 7
def test_workability(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples,
                                     n_features=n_features,
                                     distance=distance)
    testX, testY = generate_sample(n_samples=n_samples,
                                   n_features=n_features,
                                   distance=distance)
    for booster in [FoldingGBClassifier, TreeGradientBoostingClassifier]:
        for loss in [BinomialDeviance(), AdaLossFunction()]:
            for update in [True, False]:
                for base in [
                        FastTreeRegressor(max_depth=3),
                        FastNeuroTreeRegressor(max_depth=3)
                ]:
                    if numpy.random.random() > 0.7:
                        clf = booster(loss=loss,
                                      n_estimators=100,
                                      base_estimator=base,
                                      update_tree=update)
                        clf.fit(trainX, trainY)
                        auc = roc_auc_score(testY,
                                            clf.predict_proba(testX)[:, 1])
                        print('booster', booster, loss, 'update=', update,
                              ' base=', base.__class__, ' quality=', auc)
                        assert auc > 0.8
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing with two main classification losses.
    Also testing copying
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    for loss in [LogLossFunction(), AdaLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss,
                                          min_samples_split=20,
                                          max_depth=5,
                                          learning_rate=.2,
                                          subsample=0.7,
                                          n_estimators=10,
                                          train_features=None)
        clf.fit(trainX, trainY)
        assert clf.n_features == n_features
        assert len(clf.feature_importances_) == n_features
        # checking that predict proba works
        for p in clf.staged_predict_proba(testX):
            assert p.shape == (n_samples, 2)
        assert numpy.all(p == clf.predict_proba(testX))
        assert roc_auc_score(testY, p[:, 1]) > 0.8, 'quality is too low'
        # checking clonability
        _ = clone(clf)
        clf_copy = copy.deepcopy(clf)
        assert numpy.all(
            clf.predict_proba(trainX) == clf_copy.predict_proba(
                trainX)), 'copied classifier is different'
Example no. 9
def test_gradient_boosting(n_samples=1000):
    # Generating some samples correlated with first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get uniform distribution along this variable
    uniform_variables = ['column0']
    n_estimators = 20

    loss1 = SimpleKnnLossFunction(uniform_variables)
    # loss2 = PairwiseKnnLossFunction(uniform_variables, knn=10)
    loss3 = BinomialDevianceLossFunction()
    # loss4 = RandomKnnLossFunction(uniform_variables, samples * 2, knn=5, knn_factor=3)
    # loss5 = DistanceBasedKnnFunction(uniform_variables, knn=10, distance_dependence=lambda r: numpy.exp(-0.1 * r))
    loss6bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7bin = BinFlatnessLossFunction(uniform_variables,
                                       ada_coefficient=0.5,
                                       uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7knn = KnnFlatnessLossFunction(uniform_variables,
                                       ada_coefficient=0.5,
                                       uniform_label=[0, 1])
    # loss8 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=1)
    # loss9 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])

    for loss in [loss1, loss3, loss6bin, loss7bin, loss6knn, loss7knn]:
        result = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                             subsample=0.7, n_estimators=n_estimators, train_variables=None) \
            .fit(trainX[:n_samples], trainY[:n_samples]).score(testX, testY)
        assert result >= 0.7, "The quality is too poor: %.3f" % result
Example no. 10
def test_probas(n_samples=1000):
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    testX, testY = generate_sample(n_samples, 10, 0.6)

    params = {
        'n_neighbors': 10,
        'n_estimators': 10,
        'uniform_variables': ['column0'],
        'base_estimator': DecisionTreeClassifier(max_depth=5)
    }

    for algorithm in ['SAMME', 'SAMME.R']:
        uboost_classifier = uBoostClassifier(
            algorithm=algorithm,
            efficiency_steps=3, **params)

        bdt_classifier = uBoostBDT(algorithm=algorithm, **params)

        for classifier in [bdt_classifier, uboost_classifier]:
            classifier.fit(trainX, trainY)
            proba1 = classifier.predict_proba(testX)
            proba2 = list(classifier.staged_predict_proba(testX))[-1]
            assert np.allclose(proba1, proba2, atol=0.001),\
                "staged_predict doesn't coincide with the predict for proba."

        score1 = bdt_classifier.predict_score(testX)
        score2 = list(bdt_classifier.staged_predict_score(testX))[-1]
        assert np.allclose(score1, score2),\
            "staged_score doesn't coincide with the score."

        assert len(bdt_classifier.feature_importances_) == trainX.shape[1]
Example no. 11
def test_gradient_boosting(n_samples=1000):
    """
    Testing that GradientBoosting works with different loss functions
    """
    # Generating some samples correlated with first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get uniform distribution along this variable
    uniform_variables = ['column0']
    rank_variable = 'column1'
    trainX[rank_variable] = numpy.random.randint(0, 3, size=len(trainX))
    testX[rank_variable] = numpy.random.randint(0, 3, size=len(testX))

    loss1 = BinomialDevianceLossFunction()
    loss2 = AdaLossFunction()
    loss3 = CompositeLossFunction()
    loss4 = SimpleKnnLossFunction(uniform_variables=uniform_variables)
    loss5 = RankBoostLossFunction(request_column=rank_variable)
    loss51 = RankBoostLossFunction(request_column=rank_variable, update_iterations=2)
    loss52 = RankBoostLossFunction(request_column=rank_variable, update_iterations=10)
    loss6bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])

    for loss in [loss5, loss51, loss52, loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_variables=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)
Example no. 12
def test_probas(n_samples=1000):
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    testX, testY = generate_sample(n_samples, 10, 0.6)

    params = {
        'n_neighbors': 10,
        'n_estimators': 10,
        'uniform_features': ['column0'],
        'uniform_label': 1,
        'base_estimator': DecisionTreeClassifier(max_depth=5)
    }

    for algorithm in ['SAMME', 'SAMME.R']:
        uboost_classifier = uBoostClassifier(
            algorithm=algorithm,
            efficiency_steps=3, **params)

        bdt_classifier = uBoostBDT(algorithm=algorithm, **params)

        for classifier in [bdt_classifier, uboost_classifier]:
            classifier.fit(trainX, trainY)
            proba1 = classifier.predict_proba(testX)
            proba2 = list(classifier.staged_predict_proba(testX))[-1]
            assert np.allclose(proba1, proba2, atol=0.001), \
                "staged_predict doesn't coincide with the predict for proba."

        score1 = bdt_classifier.decision_function(testX)
        score2 = list(bdt_classifier.staged_decision_function(testX))[-1]
        assert np.allclose(score1, score2), \
            "staged_score doesn't coincide with the score."

        assert len(bdt_classifier.feature_importances_) == trainX.shape[1]
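
staged_predict_proba, used above to check consistency with predict_proba, also makes it easy to track quality stage by stage. A short usage sketch (names as in the example above; staged_roc_auc is a hypothetical helper):

from sklearn.metrics import roc_auc_score

def staged_roc_auc(classifier, X, y):
    # ROC AUC of the signal probability after each boosting stage
    return [roc_auc_score(y, p[:, 1]) for p in classifier.staged_predict_proba(X)]
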
Example no. 13
def test_bin_transformer_limits(n_features=10, n_bins=123):
    X, y = generate_sample(n_samples=1999, n_features=n_features)
    X = BinTransformer(max_bins=n_bins).fit_transform(X)
    assert numpy.allclose(X.max(axis=0), n_bins - 1)

    X_orig, y = generate_sample(n_samples=20, n_features=n_features)
    X = BinTransformer(max_bins=n_bins).fit_transform(X_orig)
    assert numpy.allclose(X.min(axis=0), 0)
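
The assertions above only hold if BinTransformer maps every feature to integer bin indices in [0, max_bins - 1]. A hedged sketch of that behaviour for a single feature (an illustration, not the actual hep_ml implementation):

import numpy

def quantile_bin(column, max_bins):
    # inner quantile edges split the column into at most max_bins integer bins
    edges = numpy.percentile(column, numpy.linspace(0, 100, max_bins + 1)[1:-1])
    return numpy.searchsorted(edges, column)
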
def test_gradient_boosting(size=100, n_features=10):
    trainX, trainY = generate_sample(size, n_features)
    testX, testY = generate_sample(size, n_features)
    for loss in [AdaLossFunction()]:
        for update in ['all', 'same', 'other', 'random']:
            gb = GradientBoosting(loss=loss, update_on=update, smearing=[0.1, -0.1])
            score = gb.fit(trainX, trainY).score(testX, testY)

            print(update, score)
Example no. 15
def setUp(self, n_samples=1000, n_features=5):
    self.trainX, self.trainY = generate_sample(n_samples=n_samples, n_features=n_features)
    self.testX, self.testY = generate_sample(n_samples=n_samples, n_features=n_features)
    self.trainW = numpy.ones(n_samples)
    self.testW = numpy.ones(n_samples)
    self.uniform_variables = self.trainX.columns[:1]
    self.train_variables = self.trainX.columns[1:]
Example no. 16
def check_classifiers(n_samples=10000):
    """
    This function is not run by default; it should be called manually
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_features = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    ideal_bayes = GaussianNB()

    uBoost_SAMME = uBoostClassifier(
        uniform_features=uniform_features,
        uniform_label=1,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        algorithm="SAMME")

    uBoost_SAMME_R = uBoostClassifier(
        uniform_features=uniform_features,
        uniform_label=1,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        algorithm="SAMME.R")

    uBoost_SAMME_R_threaded = uBoostClassifier(
        uniform_features=uniform_features,
        uniform_label=1,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        n_threads=3,
        subsample=0.9,
        algorithm="SAMME.R")

    clf_dict = OrderedDict({
        "Ada": ada,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R,
        "uBOOST.R2": uBoost_SAMME_R_threaded
    })

    cvms = {}
    for clf_name, clf in clf_dict.items():
        clf.fit(trainX, trainY)
        p = clf.predict_proba(testX)
        metric = KnnBasedCvM(uniform_features=uniform_features)
        metric.fit(testX, testY)
        cvms[clf_name] = metric(testY, p, sample_weight=np.ones(len(testY)))

    assert cvms['uBOOST'] < cvms['Ada']
    print(cvms)
def test_gradient_boosting(size=100, n_features=10):
    trainX, trainY = generate_sample(size, n_features)
    testX, testY = generate_sample(size, n_features)
    for loss in [AdaLossFunction()]:
        for update in ['all', 'same', 'other', 'random']:
            gb = GradientBoosting(loss=loss,
                                  update_on=update,
                                  smearing=[0.1, -0.1])
            score = gb.fit(trainX, trainY).score(testX, testY)

            print(update, score)
Example no. 18
def test_gb_with_ada(n_samples=1000, n_features=10, distance=0.6):
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    loss = BinomialDevianceLossFunction()
    clf = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                      subsample=0.7, n_estimators=10, train_variables=None)
    clf.fit(trainX, trainY)
    assert clf.n_features == n_features
    assert len(clf.feature_importances_) == n_features
    # checking that predict proba works
    for p in clf.staged_predict_proba(testX):
        assert p.shape == (n_samples, 2)
    assert numpy.all(p == clf.predict_proba(testX))
Example no. 19
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing how classifiers work with highly imbalanced (in terms of weights) datasets.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
Example no. 20
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing how classifiers work with highly imbalanced (in terms of weights) datasets.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
Example no. 21
def test_workability(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    for booster in [FoldingGBClassifier, TreeGradientBoostingClassifier]:
        for loss in [BinomialDeviance(), AdaLossFunction()]:
            for update in [True, False]:
                for base in [FastTreeRegressor(max_depth=3), FastNeuroTreeRegressor(max_depth=3)]:
                    if numpy.random.random() > 0.7:
                        clf = booster(loss=loss, n_estimators=100,
                                      base_estimator=base, update_tree=update)
                        clf.fit(trainX, trainY)
                        auc = roc_auc_score(testY, clf.predict_proba(testX)[:, 1])
                        print('booster', booster, loss, 'update=', update, ' base=', base.__class__,
                              ' quality=', auc)
                        assert auc > 0.8
Example no. 22
def check_classifiers(n_samples=10000, output_name_pattern=None):
    """
    This function is not run by default; it should be called manually
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_variables = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    ideal_bayes = HidingClassifier(train_variables=trainX.columns[1:],
                                   base_estimator=GaussianNB())

    uBoost_SAMME = uBoostClassifier(
        uniform_variables=uniform_variables,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        algorithm="SAMME")
    uBoost_SAMME_R = uBoostClassifier(
        uniform_variables=uniform_variables,
        n_neighbors=50,
        efficiency_steps=5,
        n_estimators=50,
        algorithm="SAMME.R")

    clf_dict = ClassifiersDict({
        "Ada": ada,
        "Ideal": ideal_bayes,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R
        })

    clf_dict.fit(trainX, trainY)

    predictions = Predictions(clf_dict, testX, testY)
    # predictions.print_mse(uniform_variables, in_html=False)
    print(predictions.compute_metrics())

    predictions.sde_curves(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "mse_curves", bbox="tight")
    _ = pl.figure()
    predictions.learning_curves()
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "learning_curves", bbox="tight")
    predictions.efficiency(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "efficiency_curves", bbox="tight")
Example no. 23
def check_single_classification_network(neural_network,
                                        n_samples=200,
                                        n_features=7,
                                        distance=0.8,
                                        retry_attempts=3):
    X, y = generate_sample(n_samples=n_samples,
                           n_features=n_features,
                           distance=distance)
    # each combination is tried retry_attempts times before raising an exception

    for retry_attempt in range(retry_attempts):
        # to initial state
        neural_network = clone(neural_network)
        neural_network.set_params(random_state=42 + retry_attempt)
        print(neural_network)
        neural_network.fit(X, y)
        quality = roc_auc_score(y, neural_network.predict_proba(X)[:, 1])
        # checking that computations don't fail
        computed_loss = neural_network.compute_loss(X,
                                                    y,
                                                    sample_weight=y * 0 + 1)
        if quality > 0.8:
            break
        else:
            print('attempt {} : {}'.format(retry_attempt, quality))
            if retry_attempt == retry_attempts - 1:
                raise RuntimeError('quality of model is too low: {} {}'.format(
                    quality, neural_network))
Example no. 24
def test_lookup(n_samples=10000, n_features=7, n_bins=8):
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=0.6)

    base_estimator = GradientBoostingClassifier()
    clf = LookupClassifier(base_estimator=base_estimator, n_bins=n_bins, keep_trained_estimator=True).fit(X, y)
    p = clf.predict_proba(X)
    assert roc_auc_score(y, p[:, 1]) > 0.8, 'quality of classification is too low'
    assert p.shape == (n_samples, 2)
    assert numpy.allclose(p.sum(axis=1), 1), 'probabilities are not summed up to 1'

    # checking conversions
    lookup_size = n_bins ** n_features
    lookup_indices = numpy.arange(lookup_size, dtype=int)
    bins_indices = clf.convert_lookup_index_to_bins(lookup_indices=lookup_indices)
    lookup_indices2 = clf.convert_bins_to_lookup_index(bins_indices=bins_indices)
    assert numpy.allclose(lookup_indices, lookup_indices2), 'something wrong with conversions'
    assert len(clf._lookup_table) == n_bins ** n_features, 'wrong size of lookup table'

    # checking speed
    X = pandas.concat([X] * 10)
    start = time.time()
    p1 = clf.trained_estimator.predict_proba(clf.transform(X))
    time_old = time.time() - start
    start = time.time()
    p2 = clf.predict_proba(X)
    time_new = time.time() - start
    print(time_old, ' now takes ', time_new)
    assert numpy.allclose(p1, p2), "pipeline doesn't work as expected"
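
The round-trip checked above (bin indices to lookup index and back) is essentially a mixed-radix encoding with the same base for every feature. A hedged sketch of one direction (an illustration, not LookupClassifier's actual code):

import numpy

def bins_to_lookup_index(bins_indices, n_bins):
    # bins_indices has shape [n_events, n_features]; treat each row as a base-n_bins number
    result = numpy.zeros(len(bins_indices), dtype=int)
    for column in numpy.asarray(bins_indices).T:
        result = result * n_bins + column
    return result
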
Example no. 25
def test_grid_search():
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

    grid = {
        'base_estimator': [
            DecisionTreeClassifier(max_depth=3),
            DecisionTreeClassifier(max_depth=4),
            ExtraTreeClassifier(max_depth=4)
        ],
        'learning_rate': [0.01, 0.1, 0.5, 1.],
        'n_estimators': [5, 10, 15, 20, 30, 40, 50, 75, 100, 125],
        'algorithm': ['SAMME', 'SAMME.R']
    }
    grid = OrderedDict(grid)

    trainX, trainY = generate_sample(2000, 10, distance=0.5)
    grid_cv = GridOptimalSearchCV(AdaBoostClassifier(),
                                  grid,
                                  n_evaluations=10,
                                  refit=True,
                                  log_name='test')
    grid_cv.fit(trainX, trainY)
    grid_cv.predict_proba(trainX)
    grid_cv.predict(trainX)
    grid_cv.print_param_stats([0.1, 0.3, 0.5, 0.7])
Example no. 26
def test_tree_speed(n_samples=100000, n_features=10):
    X, y = generate_sample(n_samples=n_samples, n_features=n_features)
    X = numpy.array(X)
    w = numpy.ones(n_samples)

    regressors = OrderedDict()
    regressors['old'] = DecisionTreeRegressor(max_depth=10,
                                              min_samples_split=50)
    regressors['new'] = FastTreeRegressor(max_depth=10, min_samples_split=50)

    for name, regressor in regressors.items():
        start = time.time()
        for _ in range(3):
            regressor.fit(X, y, sample_weight=w)
        print(name, 'trains in ', time.time() - start)

    # Testing speed of prediction:
    methods = OrderedDict()
    methods['old'] = lambda: regressors['old'].predict(X)
    methods['new'] = lambda: regressors['new'].apply(X)
    methods['new-fast'] = lambda: regressors['new'].fast_apply(X)
    for name, method in methods.items():
        start = time.time()
        for _ in range(5):
            method()
        print(name, 'requires ', time.time() - start)
Example no. 27
def test_tree_speed(n_samples=100000, n_features=10):
    X, y = generate_sample(n_samples=n_samples, n_features=n_features)
    X = numpy.array(X)
    w = numpy.ones(n_samples)

    regressors = OrderedDict()
    regressors['old'] = DecisionTreeRegressor(max_depth=10, min_samples_split=50)
    regressors['new'] = FastTreeRegressor(max_depth=10, min_samples_split=50)

    for name, regressor in regressors.items():
        start = time.time()
        for _ in range(3):
            regressor.fit(X, y, sample_weight=w)
        print(name, 'trains in ', time.time() - start)

    # Testing speed of prediction:
    methods = OrderedDict()
    methods['old'] = lambda: regressors['old'].predict(X)
    methods['new'] = lambda: regressors['new'].apply(X)
    methods['new-fast'] = lambda: regressors['new'].fast_apply(X)
    for name, method in methods.items():
        start = time.time()
        for _ in range(5):
            method()
        print(name, 'requires ', time.time() - start)
Example no. 28
def test_metrics_clear(n_samples=2000, knn=50, uniform_class=0):
    """
    Testing that after deleting all inappropriate events (events of the other class),
    the metrics stay the same
    """
    X, y = generate_sample(n_samples=n_samples, n_features=10)
    sample_weight = numpy.random.exponential(size=n_samples)
    predictions = numpy.random.random(size=[n_samples, 2])
    predictions /= predictions.sum(axis=1, keepdims=True)
    features = X.columns[:1]

    mask = (y == uniform_class)
    X_clear = X.loc[mask, :]
    y_clear = y[mask]
    sample_weight_clear = sample_weight[mask]
    predictions_clear = predictions[mask]

    for function in [sde, theil_flatness, cvm_flatness]:
        flatness_val = function(y, predictions, X, uniform_variables=features, sample_weight=sample_weight, label=0,
                                knn=knn)
        flatness_val_clear = function(y_clear, predictions_clear, X_clear, uniform_variables=features,
                                      sample_weight=sample_weight_clear, label=0, knn=knn)
        assert flatness_val == flatness_val_clear, 'after deleting other class, the metrics changed'

    for class_ in [KnnBasedSDE, KnnBasedTheil, KnnBasedCvM]:
        metric1 = class_(n_neighbours=knn, uniform_features=features, uniform_label=0, )
        metric1.fit(X, y, sample_weight=sample_weight)
        flatness_val1 = metric1(y, predictions, sample_weight)

        metric2 = class_(n_neighbours=knn, uniform_features=features, uniform_label=0, )
        metric2.fit(X_clear, y_clear, sample_weight=sample_weight_clear)
        flatness_val2 = metric2(y_clear, predictions_clear, sample_weight_clear)
        assert flatness_val1 == flatness_val2, 'after deleting other class, the metrics changed'
Example no. 29
def test_new_metrics(n_samples=2000, knn=50):
    X, y = generate_sample(n_samples=n_samples, n_features=10)
    sample_weight = numpy.random.exponential(size=n_samples) ** 0.  # ** 0. turns every weight into 1
    predictions = numpy.random.random(size=[n_samples, 2])
    predictions /= predictions.sum(axis=1, keepdims=True)
    predictions *= 1000.

    # Checking SDE
    features = X.columns[:1]
    sde_val1 = sde(y, predictions, X, uniform_variables=features, sample_weight=sample_weight, label=0, knn=knn)
    sde2 = KnnBasedSDE(n_neighbours=knn, uniform_features=features, uniform_label=0, )
    sde2.fit(X, y, sample_weight=sample_weight)
    sde_val2 = sde2(y, predictions, sample_weight=sample_weight)

    assert sde_val1 == sde_val2, 'SDE values are different'

    # Checking CVM
    features = X.columns[:1]
    cvm_val1 = cvm_flatness(y, predictions, X, uniform_variables=features, sample_weight=sample_weight, label=0,
                            knn=knn)
    cvm2 = KnnBasedCvM(n_neighbours=knn, uniform_features=features, uniform_label=0, )
    cvm2.fit(X, y, sample_weight=sample_weight)
    cvm_val2 = cvm2(y, predictions, sample_weight=sample_weight)

    assert cvm_val1 == cvm_val2, 'CvM values are different'
Example no. 30
def test_lookup(n_samples=10000, n_features=7, n_bins=8):
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=0.6)

    base_estimator = GradientBoostingClassifier()
    clf = LookupClassifier(base_estimator=base_estimator, n_bins=n_bins, keep_trained_estimator=True).fit(X, y)
    p = clf.predict_proba(X)
    assert roc_auc_score(y, p[:, 1]) > 0.8, 'quality of classification is too low'
    assert p.shape == (n_samples, 2)
    assert numpy.allclose(p.sum(axis=1), 1), 'probabilities are not summed up to 1'

    # checking conversions
    lookup_size = n_bins ** n_features
    lookup_indices = numpy.arange(lookup_size, dtype=int)
    bins_indices = clf.convert_lookup_index_to_bins(lookup_indices=lookup_indices)
    lookup_indices2 = clf.convert_bins_to_lookup_index(bins_indices=bins_indices)
    assert numpy.allclose(lookup_indices, lookup_indices2), 'something wrong with conversions'
    assert len(clf._lookup_table) == n_bins ** n_features, 'wrong size of lookup table'

    # checking speed
    X = pandas.concat([X] * 10)
    start = time.time()
    p1 = clf.trained_estimator.predict_proba(clf.transform(X))
    time_old = time.time() - start
    start = time.time()
    p2 = clf.predict_proba(X)
    time_new = time.time() - start
    print(time_old, ' now takes ', time_new)
    assert numpy.allclose(p1, p2), "pipeline doesn't work as expected"
Example no. 31
def test_cuts(n_samples=1000):
    base_classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_variables = ['column0']

    for algorithm in ['SAMME', 'SAMME.R']:
        for target_efficiency in [0.1, 0.3, 0.5, 0.7, 0.9]:
            uBDT = uBoostBDT(
                uniform_variables=uniform_variables,
                target_efficiency=target_efficiency,
                n_neighbors=20, n_estimators=20,
                algorithm=algorithm,
                base_estimator=base_classifier)
            uBDT.fit(trainX, trainY)

            passed = sum(trainY) * target_efficiency

            assert uBDT.score_cut == uBDT.score_cuts_[-1],\
                'something wrong with computed cuts'

            for score, cut in zip(uBDT.staged_predict_score(trainX[trainY > 0.5]),
                                   uBDT.score_cuts_):
                passed_upper = np.sum(score > cut - 1e-7)
                passed_lower = np.sum(score > cut + 1e-7)
                assert passed_lower <= passed <= passed_upper, "wrong stage cuts"
Example no. 32
def test_cuts(n_samples=1000):
    base_classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_features = ['column0']

    for algorithm in ['SAMME', 'SAMME.R']:
        for target_efficiency in [0.1, 0.3, 0.5, 0.7, 0.9]:
            uBDT = uBoostBDT(
                uniform_features=uniform_features,
                uniform_label=1,
                target_efficiency=target_efficiency,
                n_neighbors=20, n_estimators=20,
                algorithm=algorithm,
                base_estimator=base_classifier)
            uBDT.fit(trainX, trainY)

            passed = sum(trainY) * target_efficiency

            assert uBDT.score_cut == uBDT.score_cuts_[-1], \
                'something wrong with computed cuts'

            for score, cut in zip(uBDT.staged_decision_function(trainX[trainY > 0.5]),
                                  uBDT.score_cuts_):
                passed_upper = np.sum(score > cut - 1e-7)
                passed_lower = np.sum(score > cut + 1e-7)
                assert passed_lower <= passed <= passed_upper, "wrong stage cuts"
Example no. 33
def test_workability(n_samples=2000, knn=50, uniform_label=0, n_bins=10):
    """Simply checks that metrics are working """
    X, y = generate_sample(n_samples=n_samples, n_features=10)
    sample_weight = numpy.random.exponential(size=n_samples)
    predictions = numpy.random.random(size=[n_samples, 2])
    predictions /= predictions.sum(axis=1, keepdims=True)
    features = X.columns[:1]

    for class_ in [KnnBasedSDE, KnnBasedTheil, KnnBasedCvM]:
        metric = class_(
            n_neighbours=knn,
            uniform_features=features,
            uniform_label=uniform_label,
        )
        metric.fit(X, y, sample_weight=sample_weight)
        flatness_val_ = metric(y, predictions, sample_weight)

    for class_ in [BinBasedSDE, BinBasedTheil, BinBasedCvM]:
        metric = class_(
            n_bins=n_bins,
            uniform_features=features,
            uniform_label=uniform_label,
        )
        metric.fit(X, y, sample_weight=sample_weight)
        flatness_val_ = metric(y, predictions, sample_weight)
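
The metrics above are exercised on random predictions; in practice they are applied to a trained classifier's output. A short hedged usage sketch with the same KnnBasedCvM API (classifier_flatness is a hypothetical helper; lower values mean a flatter signal efficiency):

import numpy

def classifier_flatness(clf, X, y, uniform_features, uniform_label=0, knn=50):
    # fit the metric on the evaluation sample, then score the classifier's probabilities
    weights = numpy.ones(len(y))
    metric = KnnBasedCvM(n_neighbours=knn, uniform_features=uniform_features,
                         uniform_label=uniform_label)
    metric.fit(X, y, sample_weight=weights)
    return metric(y, clf.predict_proba(X), weights)
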
Example no. 34
def test_metrics_clear(n_samples=2000, knn=50, uniform_class=0):
    """
    Testing that after deleting all inappropriate events (events of the other class),
    the metrics stay the same
    """
    X, y = generate_sample(n_samples=n_samples, n_features=10)
    sample_weight = numpy.random.exponential(size=n_samples)
    predictions = numpy.random.random(size=[n_samples, 2])
    predictions /= predictions.sum(axis=1, keepdims=True)
    features = X.columns[:1]

    mask = (y == uniform_class)
    X_clear = X.loc[mask, :]
    y_clear = y[mask]
    sample_weight_clear = sample_weight[mask]
    predictions_clear = predictions[mask]

    for function in [sde, theil_flatness, cvm_flatness]:
        flatness_val = function(y, predictions, X, uniform_features=features, sample_weight=sample_weight, label=0,
                                knn=knn)
        flatness_val_clear = function(y_clear, predictions_clear, X_clear, uniform_features=features,
                                      sample_weight=sample_weight_clear, label=0, knn=knn)
        assert flatness_val == flatness_val_clear, 'after deleting other class, the metrics changed'

    for class_ in [KnnBasedSDE, KnnBasedTheil, KnnBasedCvM]:
        metric1 = class_(n_neighbours=knn, uniform_features=features, uniform_label=0, )
        metric1.fit(X, y, sample_weight=sample_weight)
        flatness_val1 = metric1(y, predictions, sample_weight)

        metric2 = class_(n_neighbours=knn, uniform_features=features, uniform_label=0, )
        metric2.fit(X_clear, y_clear, sample_weight=sample_weight_clear)
        flatness_val2 = metric2(y_clear, predictions_clear, sample_weight_clear)
        assert flatness_val1 == flatness_val2, 'after deleting other class, the metrics changed'
Example no. 35
def test_step_optimality(n_samples=100):
    """
    Testing that, for a single leaf, the loss function returns the optimal value
    """
    X, y = generate_sample(n_samples, n_features=10)
    sample_weight = numpy.random.exponential(size=n_samples)

    rank_column = X.columns[2]
    X[rank_column] = numpy.random.randint(0, 3, size=n_samples)

    tested_losses = [
        losses.LogLossFunction(),
        losses.AdaLossFunction(),
        losses.KnnAdaLossFunction(X.columns[:1], uniform_label=0, knn=5),
        losses.CompositeLossFunction(),
        losses.RankBoostLossFunction(rank_column),
        losses.MSELossFunction(),
    ]

    pred = numpy.random.normal(size=n_samples)

    for loss in tested_losses:
        loss.fit(X, y, sample_weight=sample_weight)

        # Test simple way to get optimal step
        leaf_value = numpy.random.normal()
        step = 0.
        for _ in range(4):
            ministep, = loss.prepare_new_leaves_values(
                terminal_regions=numpy.zeros(n_samples, dtype=int),
                leaf_values=[leaf_value],
                y_pred=pred + step)
            step += ministep

        if isinstance(loss, losses.MAELossFunction):
            # checking that MAE is minimized with long process
            for iteration in range(1, 30):
                ministep, = loss.prepare_new_leaves_values(
                    terminal_regions=numpy.zeros(n_samples, dtype=int),
                    leaf_values=[leaf_value],
                    y_pred=pred + step)
                step += ministep * 1. / iteration

        loss_values = []
        coeffs = [0.9, 1.0, 1.1]
        for coeff in coeffs:
            loss_values.append(loss(pred + coeff * step))
        print(loss, step, 'losses: ', loss_values)
        assert loss_values[1] <= loss_values[0] + 1e-7
        assert loss_values[1] <= loss_values[2] + 1e-7

        # Test standard function
        opt_value = loss.compute_optimal_step(y_pred=pred)
        loss_values2 = []
        for coeff in coeffs:
            loss_values2.append(loss(pred + coeff * opt_value))
        print(loss, opt_value, 'losses: ', loss_values2)
        assert loss_values2[1] <= loss_values2[0] + 1e-7
        assert loss_values2[1] <= loss_values2[2] + 1e-7
Example no. 36
def test_gb_regression(n_samples=1000):
    X, _ = generate_sample(n_samples, 10, distance=0.6)
    y = numpy.tanh(X.sum(axis=1))
    clf = UGradientBoostingRegressor(loss=MSELossFunction())
    clf.fit(X, y)
    y_pred = clf.predict(X)
    zeromse = 0.5 * mean_squared_error(y, y * 0.)
    assert mean_squared_error(y, y_pred) < zeromse, 'something wrong with regression quality'
Example no. 37
def test_reproducibility(n_samples=200, n_features=15, distance=0.5):
    X, y = generate_sample(n_samples=n_samples,
                           n_features=n_features,
                           distance=distance)
    for trainer in nnet.trainers.keys():
        clf1 = nnet.MLPClassifier(trainer=trainer, random_state=42).fit(X, y)
        clf2 = nnet.MLPClassifier(trainer=trainer, random_state=42).fit(X, y)
        assert numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X))
Example no. 38
def check_classifiers(n_samples=10000, output_name_pattern=None):
    """
    This function is not run by default; it should be called manually
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_variables = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    ideal_bayes = HidingClassifier(train_variables=trainX.columns[1:],
                                   base_estimator=GaussianNB())

    uBoost_SAMME = uBoostClassifier(uniform_variables=uniform_variables,
                                    n_neighbors=50,
                                    efficiency_steps=5,
                                    n_estimators=50,
                                    algorithm="SAMME")
    uBoost_SAMME_R = uBoostClassifier(uniform_variables=uniform_variables,
                                      n_neighbors=50,
                                      efficiency_steps=5,
                                      n_estimators=50,
                                      algorithm="SAMME.R")

    clf_dict = ClassifiersDict({
        "Ada": ada,
        "Ideal": ideal_bayes,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R
    })

    clf_dict.fit(trainX, trainY)

    predictions = Predictions(clf_dict, testX, testY)
    # predictions.print_mse(uniform_variables, in_html=False)
    print(predictions.compute_metrics())

    predictions.sde_curves(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "mse_curves", bbox="tight")
    _ = pl.figure()
    predictions.learning_curves()
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "learning_curves", bbox="tight")
    predictions.efficiency(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "efficiency_curves", bbox="tight")
Example no. 39
def test_gb_regression(n_samples=1000):
    X, _ = generate_sample(n_samples, 10, distance=0.6)
    y = numpy.tanh(X.sum(axis=1))
    clf = UGradientBoostingRegressor(loss=MSELossFunction())
    clf.fit(X, y)
    y_pred = clf.predict(X)
    zeromse = 0.5 * mean_squared_error(y, y * 0.)
    assert mean_squared_error(y, y_pred) < zeromse, 'something wrong with regression quality'
Example no. 40
def test_gb_simple():
    X, y = generate_sample(n_samples=10000, n_features=10)
    X = BinTransformer().fit_transform(X)

    reg = ResearchGradientBoostingBase(loss=MSELoss())
    reg.fit(X, y)

    assert roc_auc_score(y, reg.decision_function(X)) > 0.6
Example no. 41
def test_network_with_scaler(n_samples=200, n_features=15, distance=0.5):
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    for scaler in [BinTransformer(max_bins=16), IronTransformer()]:
        clf = nnet.SimpleNeuralNetwork(scaler=scaler, epochs=300)
        clf.fit(X, y)

        p = clf.predict_proba(X)
        assert roc_auc_score(y, p[:, 1]) > 0.8, 'quality is too low for model: {}'.format(clf)
Example no. 42
def test_with_scaler(n_samples=200, n_features=15, distance=0.5):
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    for scaler in [BinTransformer(max_bins=16), IronTransformer()]:
        clf = nnet.SimpleNeuralNetwork(scaler=scaler, epochs=300)
        clf.fit(X, y)

        p = clf.predict_proba(X)
        assert roc_auc_score(y, p[:, 1]) > 0.8, 'quality is too low for model: {}'.format(clf)
Example no. 43
def test_bin_transformer_extend_to(n_features=10, n_bins=123):
    extended_length = 19
    X, y = generate_sample(n_samples=20, n_features=n_features)
    X1 = BinTransformer(max_bins=n_bins).fit(X).transform(X)
    X2 = BinTransformer(max_bins=n_bins).fit(X).transform(
        X, extend_to=extended_length)
    assert len(X2) % extended_length == 0, 'wrong shape!'
    assert numpy.allclose(X2[:len(X1)],
                          X1), 'extending does not work as expected!'
Example no. 44
def test_refitting(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    booster = TreeGradientBoostingClassifier(n_estimators=100, update_tree=True,
                                             base_estimator=FastTreeRegressor())
    booster.fit(trainX, trainY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))

    booster.refit_trees(trainX, trainY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))


    booster.refit_trees(testX, testY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))
Example no. 45
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    for loss in [LogLossFunction(), AdaLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY)
        assert clf.n_features == n_features
        assert len(clf.feature_importances_) == n_features
        # checking that predict proba works
        for p in clf.staged_predict_proba(testX):
            assert p.shape == (n_samples, 2)
        assert numpy.all(p == clf.predict_proba(testX))
        assert roc_auc_score(testY, p[:, 1]) > 0.8, 'quality is too low'
        # checking clonability
        _ = clone(clf)
        clf_copy = copy.deepcopy(clf)
        assert (clf.predict_proba(trainX) == clf_copy.predict_proba(trainX)).all(), 'copied classifier is different'
Example no. 46
def test_gb_with_ada(n_samples=1000, n_features=10, distance=0.6):
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    loss = BinomialDevianceLossFunction()
    clf = uGradientBoostingClassifier(loss=loss,
                                      min_samples_split=20,
                                      max_depth=5,
                                      learning_rate=.2,
                                      subsample=0.7,
                                      n_estimators=10,
                                      train_variables=None)
    clf.fit(trainX, trainY)
    assert clf.n_features == n_features
    assert len(clf.feature_importances_) == n_features
    # checking that predict proba works
    for p in clf.staged_predict_proba(testX):
        assert p.shape == (n_samples, 2)
    assert numpy.all(p == clf.predict_proba(testX))
Example no. 47
def tree_quality_comparison(n_samples=200000, n_features=10):
    """
    This function is NOT a test, but it is helpful for comparing the performance of the standard DT and the new one.
    """
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features)

    # Multiplying by random matrix
    multiplier = numpy.random.normal(size=[n_features, n_features])
    trainX = numpy.dot(trainX.values, multiplier)
    testX = numpy.dot(testX.values, multiplier)
    regressors = OrderedDict()
    regressors['old'] = DecisionTreeRegressor(max_depth=10, min_samples_split=50)
    regressors['new'] = FastTreeRegressor(max_depth=10, min_samples_split=50, criterion='pvalue')
    w = numpy.ones(n_samples)

    for name, regressor in regressors.items():
        regressor.fit(trainX, trainY, sample_weight=w)
        print(name, roc_auc_score(testY, regressor.predict(testX)))
Example no. 48
def test_classifier_with_dataframe():
    try:
        from rep.estimators import SklearnClassifier
        clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=1))
        X, y = generate_sample(n_samples=100, n_features=4)
        for X_ in [X, pandas.DataFrame(X)]:
            lookup = LookupClassifier(clf, n_bins=16).fit(X_, y)
            lookup.predict_proba(X)
    except ImportError:
        print('expected fail: yandex/rep not installed')
Example no. 49
def test_gb_ranking(n_samples=1000):
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)

    rank_variable = 'column1'
    trainX[rank_variable] = numpy.random.randint(0, 3, size=len(trainX))
    testX[rank_variable] = numpy.random.randint(0, 3, size=len(testX))

    rank_loss1 = RankBoostLossFunction(request_column=rank_variable, update_iterations=1)
    rank_loss2 = RankBoostLossFunction(request_column=rank_variable, update_iterations=2)
    rank_loss3 = RankBoostLossFunction(request_column=rank_variable, update_iterations=10)

    for loss in [rank_loss1, rank_loss2, rank_loss3]:
        clf = UGradientBoostingRegressor(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                         subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = roc_auc_score(testY, clf.predict(testX))
        assert result >= 0.8, "The quality is too poor: {} with loss: {}".format(result, loss)
Example no. 50
def test_loss_functions(size=50, epsilon=1e-3):
    """
    Testing that Hessians and gradients of loss functions coincide with numerical approximations
    """
    X, y = generate_sample(size, n_features=10)
    rank_column = X.columns[2]
    X[rank_column] = numpy.random.randint(0, 3, size=size)
    sample_weight = numpy.random.exponential(size=size)
    tested_losses = [
        losses.MSELossFunction(),
        losses.MAELossFunction(),
        losses.LogLossFunction(),
        losses.AdaLossFunction(),
        losses.KnnAdaLossFunction(X.columns[:1], uniform_label=1, knn=5),
        losses.CompositeLossFunction(),
        losses.RankBoostLossFunction(rank_column),
    ]
    pred = numpy.random.normal(size=size)
    # y == pred is a special point for some losses, e.g. MAELossFunction
    pred[numpy.abs(y - pred) < epsilon] = -0.1
    print(sum(numpy.abs(y - pred) < epsilon))

    for loss in tested_losses:
        loss.fit(X, y, sample_weight=sample_weight)
        # testing sign of gradient
        val = loss(pred)
        gradient = loss.negative_gradient(pred)

        numer_gradient = numpy.zeros(len(pred))
        numer_hessian = numpy.zeros(len(pred))
        for i in range(size):
            pred_plus = pred.copy()
            pred_plus[i] += epsilon
            val_plus = loss(pred_plus)

            pred_minus = pred.copy()
            pred_minus[i] -= epsilon
            val_minus = loss(pred_minus)

            numer_gradient[i] = -(val_plus - val_minus) / 2. / epsilon
            numer_hessian[i] = (val_plus + val_minus - 2 * val) / epsilon**2

        assert numpy.allclose(gradient, numer_gradient), \
            'wrong computation of gradient for {}'.format(loss)
        if not isinstance(loss, (losses.MSELossFunction, losses.MAELossFunction)):
            assert (gradient * (2 * y - 1) >= 0).all(), 'wrong signs of gradients'
        if isinstance(loss, losses.HessianLossFunction):
            hessian = loss.hessian(pred)
            assert numpy.allclose(hessian, numer_hessian, atol=1e-5), \
                'wrong computation of hessian for {}'.format(loss)
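
# The central-difference check above generalizes to any scalar loss; a small reusable
# helper (hypothetical, mirroring the loop in test_loss_functions) could look like this.
import numpy

def numerical_gradient(loss_func, pred, epsilon=1e-3):
    # d loss / d pred_i  ~  (loss(pred + eps*e_i) - loss(pred - eps*e_i)) / (2*eps)
    grad = numpy.zeros_like(pred)
    for i in range(len(pred)):
        plus, minus = pred.copy(), pred.copy()
        plus[i] += epsilon
        minus[i] -= epsilon
        grad[i] = (loss_func(plus) - loss_func(minus)) / (2. * epsilon)
    return grad

# sanity check against a loss with a known gradient: 0.5 * ||pred - target||^2
target = numpy.array([0., 1., 1., 0.])
point = numpy.array([0.2, 0.7, 0.1, -0.3])
quadratic = lambda p: 0.5 * numpy.sum((p - target) ** 2)
assert numpy.allclose(numerical_gradient(quadratic, point), point - target, atol=1e-6)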
Example no. 51
def test_constant_fitting(n_samples=1000, n_features=5):
    """
    Testing if the initial constant is fitted properly
    """
    X, y = generate_sample(n_samples=n_samples, n_features=n_features)
    y = y.astype(numpy.float64) + 1000.
    for loss in [MSELossFunction(), losses.MAELossFunction()]:
        gb = UGradientBoostingRegressor(loss=loss, n_estimators=10)
        gb.fit(X, y)
        p = gb.predict(X)
        assert mean_squared_error(p, y) < 0.5
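
# Why the shift by 1000 is harmless: the constant minimizing MSE over a sample is its
# mean, and for MAE it is the median, so the first fitted constant absorbs the offset.
# A quick numerical check of that fact (illustrative only):
import numpy

targets = numpy.random.normal(size=1000) + 1000.
grid = numpy.linspace(995, 1005, 2001)
mse_best = grid[numpy.argmin([numpy.mean((targets - c) ** 2) for c in grid])]
mae_best = grid[numpy.argmin([numpy.mean(numpy.abs(targets - c)) for c in grid])]
print(mse_best, numpy.mean(targets))    # nearly identical
print(mae_best, numpy.median(targets))  # nearly identical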
Example no. 52
def test_step_optimality(n_samples=100):
    """
    Testing that, for a single leaf, the loss function returns the optimal value
    """
    X, y = generate_sample(n_samples, n_features=10)
    rank_column = X.columns[2]
    X[rank_column] = numpy.random.randint(0, 3, size=n_samples)

    tested_losses = [
        losses.MAELossFunction(),
        losses.LogLossFunction(),
        losses.AdaLossFunction(),
        losses.KnnAdaLossFunction(X.columns[:1], uniform_label=0, knn=5),
        losses.CompositeLossFunction(),
        losses.RankBoostLossFunction(rank_column),
        losses.MSELossFunction(),
    ]

    pred = numpy.random.normal(size=n_samples)

    for loss in tested_losses:
        if isinstance(loss, losses.MAELossFunction):
            sample_weight = numpy.ones(n_samples)
        else:
            sample_weight = numpy.random.exponential(size=n_samples)

        loss.fit(X, y, sample_weight=sample_weight)

        # Test simple way to get optimal step
        leaf_value = numpy.random.normal()
        # Some basic optimization goes here:
        step = 0.
        for _ in range(4):
            ministep, = loss.prepare_new_leaves_values(terminal_regions=numpy.zeros(n_samples, dtype=int),
                                                       leaf_values=[leaf_value], y_pred=pred + step)
            step += ministep

        print(step)
        loss_values = []
        coeffs = [0.9, 1.0, 1.1]
        for coeff in coeffs:
            loss_values.append(loss(pred + coeff * step))
        print(loss, step, 'losses: ', loss_values)
        assert loss_values[1] <= loss_values[0] + 1e-7
        assert loss_values[1] <= loss_values[2] + 1e-7

        # Test standard function
        opt_value = loss.compute_optimal_step(y_pred=pred)
        loss_values2 = []
        for coeff in coeffs:
            loss_values2.append(loss(pred + coeff * opt_value))
        print(loss, opt_value, 'losses: ', loss_values2)
        assert loss_values2[1] <= loss_values2[0] + 1e-7
        assert loss_values2[1] <= loss_values2[2] + 1e-7
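
# For smooth losses the single-leaf value behaves like a one-dimensional Newton step,
# which is why a handful of iterations above converges to the optimum. A hedged sketch
# for a log-loss-like objective (not the library internals):
import numpy

def newton_leaf_step(labels, pred):
    # one Newton step for a common shift of all predictions under log-loss, labels in {0, 1}
    p = 1. / (1. + numpy.exp(-pred))
    negative_gradient = labels - p
    hessian = p * (1. - p)
    return numpy.sum(negative_gradient) / numpy.sum(hessian)

labels = numpy.random.randint(0, 2, size=100)
pred = numpy.random.normal(size=100)
shift = 0.
for _ in range(4):
    shift += newton_leaf_step(labels, pred + shift)
print(shift)  # adding this constant to pred should be near-optimal for log-loss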
Example no. 54
def test_tree(n_samples=1000):
    X, y = generate_sample(n_samples=n_samples, n_features=5)
    X = numpy.array(X)
    w = numpy.ones(n_samples)
    tree = FastTreeRegressor()
    tree = tree.fit(X, y, sample_weight=w)
    prediction = tree.predict(X)
    tree.print_tree_stats()
    auc = roc_auc_score(y, prediction)
    print("AUC", auc)
    assert auc > 0.7, auc
Example no. 55
def test_nnet(n_samples=200, n_features=7, distance=0.8, complete=False):
    """
    :param complete: if True, all possible combinations will be checked, and quality is printed
    """
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MLPClassifier,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if complete:
        # checking all possible combinations
        for loss in nnet.losses:
            for NNType in nn_types:
                for trainer in nnet.trainers:
                    nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42, epochs=100)
                    nn.fit(X, y)
                    print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

        lr = LogisticRegression().fit(X, y)
        print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

        assert 0 == 1, "Let's see and compare results"
    else:
        # checking combinations of losses, nn_types and trainers; most of them are used only once during the tests
        attempts = max(len(nnet.losses), len(nnet.trainers), len(nn_types))
        losses_shift = numpy.random.randint(10)
        trainers_shift = numpy.random.randint(10)
        for attempt in range(attempts):
            # each combination is tried 3 times before raising an exception
            retry_attempts = 3
            for retry_attempt in range(retry_attempts):
                loss = list(nnet.losses.keys())[(attempt + losses_shift) % len(nnet.losses)]
                trainer = list(nnet.trainers.keys())[(attempt + trainers_shift) % len(nnet.trainers)]

                nn_type = nn_types[attempt % len(nn_types)]

                nn = nn_type(layers=[5], loss=loss, trainer=trainer, random_state=42 + retry_attempt, epochs=200)
                print(nn)
                nn.fit(X, y)
                quality = roc_auc_score(y, nn.predict_proba(X)[:, 1])
                computed_loss = nn.compute_loss(X, y)
                if quality > 0.8:
                    break
                else:
                    print('attempt {} : {}'.format(retry_attempt, quality))
                    if retry_attempt == retry_attempts - 1:
                        raise RuntimeError('quality of model is too low: {} {}'.format(quality, nn))
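
# The non-complete branch pairs losses, trainers and network types by walking the lists
# with random offsets, so each entry is used at least once without the full Cartesian
# product. The pairing is plain modular indexing; names below are stand-ins, not the
# actual keys of nnet.losses / nnet.trainers.
import numpy

losses_list = ['loss_a', 'loss_b', 'loss_c']
trainers_list = ['trainer_a', 'trainer_b', 'trainer_c', 'trainer_d']
attempts = max(len(losses_list), len(trainers_list))
losses_shift = numpy.random.randint(10)
trainers_shift = numpy.random.randint(10)
for attempt in range(attempts):
    loss_name = losses_list[(attempt + losses_shift) % len(losses_list)]
    trainer_name = trainers_list[(attempt + trainers_shift) % len(trainers_list)]
    print(attempt, loss_name, trainer_name)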
Example no. 56
def tree_quality_comparison(n_samples=200000, n_features=10):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features)

    # Multiplying by random matrix
    multiplier = numpy.random.normal(size=[n_features, n_features])
    trainX = numpy.dot(trainX.values, multiplier)
    testX = numpy.dot(testX.values, multiplier)
    regressors = OrderedDict()
    regressors['old'] = DecisionTreeRegressor(max_depth=10, min_samples_split=50)
    regressors['new'] = FastTreeRegressor(max_depth=10, min_samples_split=50, criterion='pvalue')
    w = numpy.ones(n_samples)

    for name, regressor in regressors.items():
        regressor.fit(trainX, trainY, sample_weight=w)
        print(name, roc_auc_score(testY, regressor.predict(testX)))

    # Testing apply method
    indices1, values1 = regressors['new'].apply(testX)
    indices2, values2 = regressors['new'].fast_apply(testX)
    assert numpy.all(values1 == values2), 'two apply methods give different results'
def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss functions
    """
    # Generating some samples correlated with first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])

    for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [losses.MSELossFunction(),
                 losses.MAELossFunction(),
                 losses.RankBoostLossFunction(request_column='fake_request')]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss, max_depth=3, n_estimators=50, learning_rate=0.01, subsample=0.5,
                                         train_features=list(trainX.columns[1:]))
        clf.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)
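
# Rough illustration of what the flatness losses above penalize (assumed simplification,
# not the exact FlatnessLossFunction formula): inside each bin of the uniform feature the
# distribution of predictions is compared with the global one; zero means a flat response.
import numpy

def binned_flatness_penalty(uniform_feature, predictions, n_bins=10):
    edges = numpy.percentile(uniform_feature, numpy.linspace(0, 100, n_bins + 1)[1:-1])
    bin_ids = numpy.digitize(uniform_feature, edges)
    thresholds = numpy.percentile(predictions, numpy.linspace(5, 95, 19))
    global_cdf = numpy.array([numpy.mean(predictions <= t) for t in thresholds])
    penalty = 0.
    for b in numpy.unique(bin_ids):
        local = predictions[bin_ids == b]
        local_cdf = numpy.array([numpy.mean(local <= t) for t in thresholds])
        penalty += numpy.mean((local_cdf - global_cdf) ** 2) * len(local) / float(len(predictions))
    return penalty

rng = numpy.random.RandomState(0)
mass = rng.uniform(size=5000)
flat_preds = rng.normal(size=5000)        # uncorrelated with mass -> small penalty
sculpted_preds = flat_preds + 2 * mass    # correlated with mass -> large penalty
print(binned_flatness_penalty(mass, flat_preds), binned_flatness_penalty(mass, sculpted_preds))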
Example no. 58
def test_gb_quality(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    # Multiplying by random matrix
    multiplier = numpy.random.normal(size=[n_features, n_features])
    shift = numpy.random.normal(size=[1, n_features]) * 5
    trainX = numpy.dot(trainX.values, multiplier) + shift
    testX = numpy.dot(testX.values, multiplier) + shift

    boosters = {
        'old_boost': GradientBoostingClassifier(n_estimators=100, min_samples_split=50, max_depth=5, subsample=0.3),
        'fast+old_tree': CommonGradientBoosting(n_estimators=100,
            base_estimator=DecisionTreeRegressor(min_samples_split=50, max_depth=5)),
        'fast+neuro': TreeGradientBoostingClassifier(n_estimators=100, update_tree=True,
                                                     base_estimator=FastNeuroTreeRegressor()),
        'fold+tree': FoldingGBClassifier(loss=BinomialDeviance(), n_estimators=10, update_tree=True,
                                         base_estimator=FastNeuroTreeRegressor()),
        'ugb': uGradientBoostingClassifier(loss=AdaLossFunction(),
            n_estimators=100, min_samples_split=50, max_depth=5, update_tree=True, subsample=0.3)
    }

    for criterion in ['mse', # 'fmse', # 'pvalue',
                      # 'significance',
                      'significance2',
                      # 'gini',
                      'entropy',
                      'poisson'
    ]:
        boosters['fast-' + criterion[:4]] = TreeGradientBoostingClassifier(n_estimators=100, update_tree=True,
            base_estimator=FastTreeRegressor(criterion=criterion))

    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))