def test_quality(n_samples=3000):
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    params = {
        'n_neighbors': 10,
        'n_estimators': 10,
        'uniform_features': ['column0'],
        'uniform_label': 1,
        'base_estimator': DecisionTreeClassifier(min_samples_leaf=20, max_depth=5),
    }

    for algorithm in ['SAMME', 'SAMME.R']:
        uboost_classifier = uBoostClassifier(algorithm=algorithm, efficiency_steps=5, **params)
        bdt_classifier = uBoostBDT(algorithm=algorithm, **params)

        for classifier in [bdt_classifier, uboost_classifier]:
            classifier.fit(trainX, trainY)
            predict_proba = classifier.predict_proba(testX)
            predict = classifier.predict(testX)
            assert roc_auc_score(testY, predict_proba[:, 1]) > 0.7, "quality is awful"
            print("Accuracy = %.3f" % accuracy_score(testY, predict))
def setUp(self, n_samples=1000, n_features=5):
    self.trainX, self.trainY = generate_sample(n_samples=n_samples, n_features=n_features)
    self.testX, self.testY = generate_sample(n_samples=n_samples, n_features=n_features)
    self.trainW = numpy.ones(n_samples)
    self.testW = numpy.ones(n_samples)
    self.uniform_variables = self.trainX.columns[:1]
    self.train_variables = self.trainX.columns[1:]
def test_gradient_boosting(n_samples=1000): """ Testing workability of GradientBoosting with different loss function """ # Generating some samples correlated with first variable distance = 0.6 testX, testY = generate_sample(n_samples, 10, distance) trainX, trainY = generate_sample(n_samples, 10, distance) # We will try to get uniform distribution along this variable uniform_features = ['column0'] loss1 = LogLossFunction() loss2 = AdaLossFunction() loss3 = CompositeLossFunction() loss4 = KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1) loss5 = KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1]) loss6bin = BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0) loss7bin = BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]) loss6knn = KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1) loss7knn = KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]) for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]: clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2, subsample=0.7, n_estimators=25, train_features=None) \ .fit(trainX[:n_samples], trainY[:n_samples]) result = clf.score(testX, testY) assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)
def test_gradient_boosting(n_samples=1000): # Generating some samples correlated with first variable distance = 0.6 testX, testY = generate_sample(n_samples, 10, distance) trainX, trainY = generate_sample(n_samples, 10, distance) # We will try to get uniform distribution along this variable uniform_variables = ['column0'] n_estimators = 20 loss1 = SimpleKnnLossFunction(uniform_variables) # loss2 = PairwiseKnnLossFunction(uniform_variables, knn=10) loss3 = BinomialDevianceLossFunction() # loss4 = RandomKnnLossFunction(uniform_variables, samples * 2, knn=5, knn_factor=3) # loss5 = DistanceBasedKnnFunction(uniform_variables, knn=10, distance_dependence=lambda r: numpy.exp(-0.1 * r)) loss6bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5) loss7bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1]) loss6knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5) loss7knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1]) # loss8 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=1) # loss9 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1]) for loss in [loss1, loss3, loss6bin, loss7bin, loss6knn, loss7knn]: result = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2, subsample=0.7, n_estimators=n_estimators, train_variables=None) \ .fit(trainX[:n_samples], trainY[:n_samples]).score(testX, testY) assert result >= 0.7, "The quality is too poor: %.3f" % result for loss in [loss1, loss3, ]: check_gradient(loss) print('uniform gradient boosting is ok')
def test_gb_ranking(n_samples=1000): """ Testing RankingLossFunction """ distance = 0.6 testX, testY = generate_sample(n_samples, 10, distance) trainX, trainY = generate_sample(n_samples, 10, distance) rank_variable = 'column1' trainX[rank_variable] = numpy.random.randint(0, 3, size=len(trainX)) testX[rank_variable] = numpy.random.randint(0, 3, size=len(testX)) rank_loss1 = losses.RankBoostLossFunction(request_column=rank_variable, update_iterations=1) rank_loss2 = losses.RankBoostLossFunction(request_column=rank_variable, update_iterations=2) rank_loss3 = losses.RankBoostLossFunction(request_column=rank_variable, update_iterations=10) for loss in [rank_loss1, rank_loss2, rank_loss3]: clf = UGradientBoostingRegressor(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2, subsample=0.7, n_estimators=25, train_features=None) \ .fit(trainX[:n_samples], trainY[:n_samples]) result = roc_auc_score(testY, clf.predict(testX)) assert result >= 0.8, "The quality is too poor: {} with loss: {}".format( result, loss)
def test_workability(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    for booster in [FoldingGBClassifier, TreeGradientBoostingClassifier]:
        for loss in [BinomialDeviance(), AdaLossFunction()]:
            for update in [True, False]:
                for base in [FastTreeRegressor(max_depth=3), FastNeuroTreeRegressor(max_depth=3)]:
                    if numpy.random.random() > 0.7:
                        clf = booster(loss=loss, n_estimators=100, base_estimator=base, update_tree=update)
                        clf.fit(trainX, trainY)
                        auc = roc_auc_score(testY, clf.predict_proba(testX)[:, 1])
                        print('booster', booster, loss, 'update=', update,
                              ' base=', base.__class__, ' quality=', auc)
                        assert auc > 0.8
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6): """ Testing with two main classification losses. Also testing copying """ testX, testY = generate_sample(n_samples, n_features, distance=distance) trainX, trainY = generate_sample(n_samples, n_features, distance=distance) for loss in [LogLossFunction(), AdaLossFunction()]: clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2, subsample=0.7, n_estimators=10, train_features=None) clf.fit(trainX, trainY) assert clf.n_features == n_features assert len(clf.feature_importances_) == n_features # checking that predict proba works for p in clf.staged_predict_proba(testX): assert p.shape == (n_samples, 2) assert numpy.all(p == clf.predict_proba(testX)) assert roc_auc_score(testY, p[:, 1]) > 0.8, 'quality is too low' # checking clonability _ = clone(clf) clf_copy = copy.deepcopy(clf) assert numpy.all( clf.predict_proba(trainX) == clf_copy.predict_proba( trainX)), 'copied classifier is different'
def test_gradient_boosting(n_samples=1000): # Generating some samples correlated with first variable distance = 0.6 testX, testY = generate_sample(n_samples, 10, distance) trainX, trainY = generate_sample(n_samples, 10, distance) # We will try to get uniform distribution along this variable uniform_variables = ['column0'] n_estimators = 20 loss1 = SimpleKnnLossFunction(uniform_variables) # loss2 = PairwiseKnnLossFunction(uniform_variables, knn=10) loss3 = BinomialDevianceLossFunction() # loss4 = RandomKnnLossFunction(uniform_variables, samples * 2, knn=5, knn_factor=3) # loss5 = DistanceBasedKnnFunction(uniform_variables, knn=10, distance_dependence=lambda r: numpy.exp(-0.1 * r)) loss6bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5) loss7bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1]) loss6knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5) loss7knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1]) # loss8 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=1) # loss9 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1]) for loss in [loss1, loss3, loss6bin, loss7bin, loss6knn, loss7knn]: result = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2, subsample=0.7, n_estimators=n_estimators, train_variables=None) \ .fit(trainX[:n_samples], trainY[:n_samples]).score(testX, testY) assert result >= 0.7, "The quality is too poor: %.3f" % result
def test_probas(n_samples=1000): trainX, trainY = generate_sample(n_samples, 10, 0.6) testX, testY = generate_sample(n_samples, 10, 0.6) params = { 'n_neighbors': 10, 'n_estimators': 10, 'uniform_variables': ['column0'], 'base_estimator': DecisionTreeClassifier(max_depth=5) } for algorithm in ['SAMME', 'SAMME.R']: uboost_classifier = uBoostClassifier( algorithm=algorithm, efficiency_steps=3, **params) bdt_classifier = uBoostBDT(algorithm=algorithm, **params) for classifier in [bdt_classifier, uboost_classifier]: classifier.fit(trainX, trainY) proba1 = classifier.predict_proba(testX) proba2 = list(classifier.staged_predict_proba(testX))[-1] assert np.allclose(proba1, proba2, atol=0.001),\ "staged_predict doesn't coincide with the predict for proba." score1 = bdt_classifier.predict_score(testX) score2 = list(bdt_classifier.staged_predict_score(testX))[-1] assert np.allclose(score1, score2),\ "staged_score doesn't coincide with the score." assert len(bdt_classifier.feature_importances_) == trainX.shape[1]
def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss functions
    """
    # Generating some samples correlated with first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get uniform distribution along this variable
    uniform_variables = ['column0']
    rank_variable = 'column1'
    trainX[rank_variable] = numpy.random.randint(0, 3, size=len(trainX))
    testX[rank_variable] = numpy.random.randint(0, 3, size=len(testX))

    loss1 = BinomialDevianceLossFunction()
    loss2 = AdaLossFunction()
    loss3 = CompositeLossFunction()
    loss4 = SimpleKnnLossFunction(uniform_variables=uniform_variables)
    loss5 = RankBoostLossFunction(request_column=rank_variable)
    loss51 = RankBoostLossFunction(request_column=rank_variable, update_iterations=2)
    loss52 = RankBoostLossFunction(request_column=rank_variable, update_iterations=10)
    loss6bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])

    for loss in [loss5, loss51, loss52, loss1, loss2, loss3, loss4, loss5,
                 loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_variables=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)
def test_probas(n_samples=1000): trainX, trainY = generate_sample(n_samples, 10, 0.6) testX, testY = generate_sample(n_samples, 10, 0.6) params = { 'n_neighbors': 10, 'n_estimators': 10, 'uniform_features': ['column0'], 'uniform_label': 1, 'base_estimator': DecisionTreeClassifier(max_depth=5) } for algorithm in ['SAMME', 'SAMME.R']: uboost_classifier = uBoostClassifier( algorithm=algorithm, efficiency_steps=3, **params) bdt_classifier = uBoostBDT(algorithm=algorithm, **params) for classifier in [bdt_classifier, uboost_classifier]: classifier.fit(trainX, trainY) proba1 = classifier.predict_proba(testX) proba2 = list(classifier.staged_predict_proba(testX))[-1] assert np.allclose(proba1, proba2, atol=0.001), \ "staged_predict doesn't coincide with the predict for proba." score1 = bdt_classifier.decision_function(testX) score2 = list(bdt_classifier.staged_decision_function(testX))[-1] assert np.allclose(score1, score2), \ "staged_score doesn't coincide with the score." assert len(bdt_classifier.feature_importances_) == trainX.shape[1]
def test_bin_transformer_limits(n_features=10, n_bins=123):
    X, y = generate_sample(n_samples=1999, n_features=n_features)
    X = BinTransformer(max_bins=n_bins).fit_transform(X)
    assert numpy.allclose(X.max(axis=0), n_bins - 1)

    X_orig, y = generate_sample(n_samples=20, n_features=n_features)
    X = BinTransformer(max_bins=n_bins).fit_transform(X_orig)
    assert numpy.allclose(X.min(axis=0), 0)
def test_gradient_boosting(size=100, n_features=10):
    trainX, trainY = generate_sample(size, n_features)
    testX, testY = generate_sample(size, n_features)
    for loss in [AdaLossFunction()]:
        for update in ['all', 'same', 'other', 'random']:
            gb = GradientBoosting(loss=loss, update_on=update, smearing=[0.1, -0.1])
            score = gb.fit(trainX, trainY).score(testX, testY)
            print(update, score)
def check_classifiers(n_samples=10000):
    """
    This function is not tested by default, it should be called manually
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_features = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    ideal_bayes = GaussianNB()

    uBoost_SAMME = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1, n_neighbors=50,
        efficiency_steps=5, n_estimators=50, algorithm="SAMME")
    uBoost_SAMME_R = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1, n_neighbors=50,
        efficiency_steps=5, n_estimators=50, algorithm="SAMME.R")
    uBoost_SAMME_R_threaded = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1, n_neighbors=50,
        efficiency_steps=5, n_estimators=50, n_threads=3, subsample=0.9, algorithm="SAMME.R")

    clf_dict = OrderedDict({
        "Ada": ada,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R,
        "uBOOST.R2": uBoost_SAMME_R_threaded
    })

    cvms = {}
    for clf_name, clf in clf_dict.items():
        clf.fit(trainX, trainY)
        p = clf.predict_proba(testX)
        metric = KnnBasedCvM(uniform_features=uniform_features)
        metric.fit(testX, testY)
        cvms[clf_name] = metric(testY, p, sample_weight=np.ones(len(testY)))

    # uBoost is expected to be flatter than plain AdaBoost (the dict key is "Ada")
    assert cvms['uBOOST'] < cvms['Ada']
    print(cvms)
def test_gb_with_ada(n_samples=1000, n_features=10, distance=0.6):
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    loss = BinomialDevianceLossFunction()
    clf = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                      subsample=0.7, n_estimators=10, train_variables=None)
    clf.fit(trainX, trainY)
    assert clf.n_features == n_features
    assert len(clf.feature_importances_) == n_features
    # checking that predict proba works
    for p in clf.staged_predict_proba(testX):
        assert p.shape == (n_samples, 2)
    assert numpy.all(p == clf.predict_proba(testX))
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing how classifiers work with highly misbalanced (in terms of weights) datasets.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
def check_classifiers(n_samples=10000, output_name_pattern=None): """ This function is not tested by default, it should be called manually """ testX, testY = generate_sample(n_samples, 10, 0.6) trainX, trainY = generate_sample(n_samples, 10, 0.6) uniform_variables = ['column0'] ada = AdaBoostClassifier(n_estimators=50) ideal_bayes = HidingClassifier(train_variables=trainX.columns[1:], base_estimator=GaussianNB()) uBoost_SAMME = uBoostClassifier( uniform_variables=uniform_variables, n_neighbors=50, efficiency_steps=5, n_estimators=50, algorithm="SAMME") uBoost_SAMME_R = uBoostClassifier( uniform_variables=uniform_variables, n_neighbors=50, efficiency_steps=5, n_estimators=50, algorithm="SAMME.R") clf_dict = ClassifiersDict({ "Ada": ada, "Ideal": ideal_bayes, "uBOOST": uBoost_SAMME, "uBOOST.R": uBoost_SAMME_R }) clf_dict.fit(trainX, trainY) predictions = Predictions(clf_dict, testX, testY) # predictions.print_mse(uniform_variables, in_html=False) print(predictions.compute_metrics()) predictions.sde_curves(uniform_variables) if output_name_pattern is not None: pl.savefig(output_name_pattern % "mse_curves", bbox="tight") _ = pl.figure() predictions.learning_curves() if output_name_pattern is not None: pl.savefig(output_name_pattern % "learning_curves", bbox="tight") predictions.efficiency(uniform_variables) if output_name_pattern is not None: pl.savefig(output_name_pattern % "efficiency_curves", bbox="tight")
def check_single_classification_network(neural_network, n_samples=200, n_features=7, distance=0.8, retry_attempts=3): X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance) # each combination is tried 3 times. before raising exception for retry_attempt in range(retry_attempts): # to initial state neural_network = clone(neural_network) neural_network.set_params(random_state=42 + retry_attempt) print(neural_network) neural_network.fit(X, y) quality = roc_auc_score(y, neural_network.predict_proba(X)[:, 1]) # checking that computations don't fail computed_loss = neural_network.compute_loss(X, y, sample_weight=y * 0 + 1) if quality > 0.8: break else: print('attempt {} : {}'.format(retry_attempt, quality)) if retry_attempt == retry_attempts - 1: raise RuntimeError('quality of model is too low: {} {}'.format( quality, neural_network))
def test_lookup(n_samples=10000, n_features=7, n_bins=8): X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=0.6) base_estimator = GradientBoostingClassifier() clf = LookupClassifier(base_estimator=base_estimator, n_bins=n_bins, keep_trained_estimator=True).fit(X, y) p = clf.predict_proba(X) assert roc_auc_score(y, p[:, 1]) > 0.8, 'quality of classification is too low' assert p.shape == (n_samples, 2) assert numpy.allclose(p.sum(axis=1), 1), 'probabilities are not summed up to 1' # checking conversions lookup_size = n_bins ** n_features lookup_indices = numpy.arange(lookup_size, dtype=int) bins_indices = clf.convert_lookup_index_to_bins(lookup_indices=lookup_indices) lookup_indices2 = clf.convert_bins_to_lookup_index(bins_indices=bins_indices) assert numpy.allclose(lookup_indices, lookup_indices2), 'something wrong with conversions' assert len(clf._lookup_table) == n_bins ** n_features, 'wrong size of lookup table' # checking speed X = pandas.concat([X] * 10) start = time.time() p1 = clf.trained_estimator.predict_proba(clf.transform(X)) time_old = time.time() - start start = time.time() p2 = clf.predict_proba(X) time_new = time.time() - start print(time_old, ' now takes ', time_new) assert numpy.allclose(p1, p2), "pipeline doesn't work as expected"
def test_grid_search(): from sklearn.ensemble import AdaBoostClassifier from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier grid = { 'base_estimator': [ DecisionTreeClassifier(max_depth=3), DecisionTreeClassifier(max_depth=4), ExtraTreeClassifier(max_depth=4) ], 'learning_rate': [0.01, 0.1, 0.5, 1.], 'n_estimators': [5, 10, 15, 20, 30, 40, 50, 75, 100, 125], 'algorithm': ['SAMME', 'SAMME.R'] } grid = OrderedDict(grid) trainX, trainY = generate_sample(2000, 10, distance=0.5) grid_cv = GridOptimalSearchCV(AdaBoostClassifier(), grid, n_evaluations=10, refit=True, log_name='test') grid_cv.fit(trainX, trainY) grid_cv.predict_proba(trainX) grid_cv.predict(trainX) grid_cv.print_param_stats([0.1, 0.3, 0.5, 0.7])
def test_tree_speed(n_samples=100000, n_features=10): X, y = generate_sample(n_samples=n_samples, n_features=n_features) X = numpy.array(X) w = numpy.ones(n_samples) regressors = OrderedDict() regressors['old'] = DecisionTreeRegressor(max_depth=10, min_samples_split=50) regressors['new'] = FastTreeRegressor(max_depth=10, min_samples_split=50) for name, regressor in regressors.items(): start = time.time() for _ in range(3): regressor.fit(X, y, sample_weight=w) print(name, 'trains in ', time.time() - start) # Testing speed of prediction: methods = OrderedDict() methods['old'] = lambda: regressors['old'].predict(X) methods['new'] = lambda: regressors['new'].apply(X) methods['new-fast'] = lambda: regressors['new'].fast_apply(X) for name, method in methods.items(): start = time.time() for _ in range(5): method() print(name, 'requires ', time.time() - start)
def test_metrics_clear(n_samples=2000, knn=50, uniform_class=0): """ Testing that after deleting all inappropriate events (events of other class), metrics stays the same """ X, y = generate_sample(n_samples=n_samples, n_features=10) sample_weight = numpy.random.exponential(size=n_samples) predictions = numpy.random.random(size=[n_samples, 2]) predictions /= predictions.sum(axis=1, keepdims=True) features = X.columns[:1] mask = (y == uniform_class) X_clear = X.ix[mask, :] y_clear = y[mask] sample_weight_clear = sample_weight[mask] predictions_clear = predictions[mask] for function in [sde, theil_flatness, cvm_flatness]: flatness_val = function(y, predictions, X, uniform_variables=features, sample_weight=sample_weight, label=0, knn=knn) flatness_val_clear = function(y_clear, predictions_clear, X_clear, uniform_variables=features, sample_weight=sample_weight_clear, label=0, knn=knn) assert flatness_val == flatness_val_clear, 'after deleting other class, the metrics changed' for class_ in [KnnBasedSDE, KnnBasedTheil, KnnBasedCvM]: metric1 = class_(n_neighbours=knn, uniform_features=features, uniform_label=0, ) metric1.fit(X, y, sample_weight=sample_weight) flatness_val1 = metric1(y, predictions, sample_weight) metric2 = class_(n_neighbours=knn, uniform_features=features, uniform_label=0, ) metric2.fit(X_clear, y_clear, sample_weight=sample_weight_clear) flatness_val2 = metric2(y_clear, predictions_clear, sample_weight_clear) assert flatness_val1 == flatness_val2, 'after deleting other class, the metrics changed'
def test_new_metrics(n_samples=2000, knn=50): X, y = generate_sample(n_samples=n_samples, n_features=10) sample_weight = numpy.random.exponential(size=n_samples) ** 0. predictions = numpy.random.random(size=[n_samples, 2]) predictions /= predictions.sum(axis=1, keepdims=True) predictions *= 1000. # Checking SDE features = X.columns[:1] sde_val1 = sde(y, predictions, X, uniform_variables=features, sample_weight=sample_weight, label=0, knn=knn) sde2 = KnnBasedSDE(n_neighbours=knn, uniform_features=features, uniform_label=0, ) sde2.fit(X, y, sample_weight=sample_weight) sde_val2 = sde2(y, predictions, sample_weight=sample_weight) assert sde_val1 == sde_val2, 'SDE values are different' # Checking CVM features = X.columns[:1] cvm_val1 = cvm_flatness(y, predictions, X, uniform_variables=features, sample_weight=sample_weight, label=0, knn=knn) cvm2 = KnnBasedCvM(n_neighbours=knn, uniform_features=features, uniform_label=0, ) cvm2.fit(X, y, sample_weight=sample_weight) cvm_val2 = cvm2(y, predictions, sample_weight=sample_weight) assert cvm_val1 == cvm_val2, 'CvM values are different'
def test_cuts(n_samples=1000): base_classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=6) trainX, trainY = generate_sample(n_samples, 10, 0.6) uniform_variables = ['column0'] for algorithm in ['SAMME', 'SAMME.R']: for target_efficiency in [0.1, 0.3, 0.5, 0.7, 0.9]: uBDT = uBoostBDT( uniform_variables=uniform_variables, target_efficiency=target_efficiency, n_neighbors=20, n_estimators=20, algorithm=algorithm, base_estimator=base_classifier) uBDT.fit(trainX, trainY) passed = sum(trainY) * target_efficiency assert uBDT.score_cut == uBDT.score_cuts_[-1],\ 'something wrong with computed cuts' for score, cut in zip(uBDT.staged_predict_score(trainX[trainY > 0.5]), uBDT.score_cuts_): passed_upper = np.sum(score > cut - 1e-7) passed_lower = np.sum(score > cut + 1e-7) assert passed_lower <= passed <= passed_upper, "wrong stage cuts"
def test_cuts(n_samples=1000): base_classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=6) trainX, trainY = generate_sample(n_samples, 10, 0.6) uniform_features = ['column0'] for algorithm in ['SAMME', 'SAMME.R']: for target_efficiency in [0.1, 0.3, 0.5, 0.7, 0.9]: uBDT = uBoostBDT( uniform_features=uniform_features, uniform_label=1, target_efficiency=target_efficiency, n_neighbors=20, n_estimators=20, algorithm=algorithm, base_estimator=base_classifier) uBDT.fit(trainX, trainY) passed = sum(trainY) * target_efficiency assert uBDT.score_cut == uBDT.score_cuts_[-1], \ 'something wrong with computed cuts' for score, cut in zip(uBDT.staged_decision_function(trainX[trainY > 0.5]), uBDT.score_cuts_): passed_upper = np.sum(score > cut - 1e-7) passed_lower = np.sum(score > cut + 1e-7) assert passed_lower <= passed <= passed_upper, "wrong stage cuts"
def test_workability(n_samples=2000, knn=50, uniform_label=0, n_bins=10): """Simply checks that metrics are working """ X, y = generate_sample(n_samples=n_samples, n_features=10) sample_weight = numpy.random.exponential(size=n_samples) predictions = numpy.random.random(size=[n_samples, 2]) predictions /= predictions.sum(axis=1, keepdims=True) features = X.columns[:1] for class_ in [KnnBasedSDE, KnnBasedTheil, KnnBasedCvM]: metric = class_( n_neighbours=knn, uniform_features=features, uniform_label=uniform_label, ) metric.fit(X, y, sample_weight=sample_weight) flatness_val_ = metric(y, predictions, sample_weight) for class_ in [BinBasedSDE, BinBasedTheil, BinBasedCvM]: metric = class_( n_bins=n_bins, uniform_features=features, uniform_label=uniform_label, ) metric.fit(X, y, sample_weight=sample_weight) flatness_val_ = metric(y, predictions, sample_weight)
def test_metrics_clear(n_samples=2000, knn=50, uniform_class=0): """ Testing that after deleting all inappropriate events (events of other class), metrics stays the same """ X, y = generate_sample(n_samples=n_samples, n_features=10) sample_weight = numpy.random.exponential(size=n_samples) predictions = numpy.random.random(size=[n_samples, 2]) predictions /= predictions.sum(axis=1, keepdims=True) features = X.columns[:1] mask = (y == uniform_class) X_clear = X.ix[mask, :] y_clear = y[mask] sample_weight_clear = sample_weight[mask] predictions_clear = predictions[mask] for function in [sde, theil_flatness, cvm_flatness]: flatness_val = function(y, predictions, X, uniform_features=features, sample_weight=sample_weight, label=0, knn=knn) flatness_val_clear = function(y_clear, predictions_clear, X_clear, uniform_features=features, sample_weight=sample_weight_clear, label=0, knn=knn) assert flatness_val == flatness_val_clear, 'after deleting other class, the metrics changed' for class_ in [KnnBasedSDE, KnnBasedTheil, KnnBasedCvM]: metric1 = class_(n_neighbours=knn, uniform_features=features, uniform_label=0, ) metric1.fit(X, y, sample_weight=sample_weight) flatness_val1 = metric1(y, predictions, sample_weight) metric2 = class_(n_neighbours=knn, uniform_features=features, uniform_label=0, ) metric2.fit(X_clear, y_clear, sample_weight=sample_weight_clear) flatness_val2 = metric2(y_clear, predictions_clear, sample_weight_clear) assert flatness_val1 == flatness_val2, 'after deleting other class, the metrics changed'
def test_step_optimality(n_samples=100): """ testing that for single leaf function returns the optimal value """ X, y = generate_sample(n_samples, n_features=10) sample_weight = numpy.random.exponential(size=n_samples) rank_column = X.columns[2] X[rank_column] = numpy.random.randint(0, 3, size=n_samples) tested_losses = [ losses.LogLossFunction(), losses.AdaLossFunction(), losses.KnnAdaLossFunction(X.columns[:1], uniform_label=0, knn=5), losses.CompositeLossFunction(), losses.RankBoostLossFunction(rank_column), losses.MSELossFunction(), ] pred = numpy.random.normal(size=n_samples) for loss in tested_losses: loss.fit(X, y, sample_weight=sample_weight) # Test simple way to get optimal step leaf_value = numpy.random.normal() step = 0. for _ in range(4): ministep, = loss.prepare_new_leaves_values( terminal_regions=numpy.zeros(n_samples, dtype=int), leaf_values=[leaf_value], y_pred=pred + step) step += ministep if isinstance(loss, losses.MAELossFunction): # checking that MAE is minimized with long process for iteration in range(1, 30): ministep, = loss.prepare_new_leaves_values( terminal_regions=numpy.zeros(n_samples, dtype=int), leaf_values=[leaf_value], y_pred=pred + step) step += ministep * 1. / iteration loss_values = [] coeffs = [0.9, 1.0, 1.1] for coeff in coeffs: loss_values.append(loss(pred + coeff * step)) print(loss, step, 'losses: ', loss_values) assert loss_values[1] <= loss_values[0] + 1e-7 assert loss_values[1] <= loss_values[2] + 1e-7 # Test standard function opt_value = loss.compute_optimal_step(y_pred=pred) loss_values2 = [] for coeff in coeffs: loss_values2.append(loss(pred + coeff * opt_value)) print(loss, step, 'losses: ', loss_values) assert loss_values2[1] <= loss_values2[0] + 1e-7 assert loss_values2[1] <= loss_values2[2] + 1e-7
def test_gb_regression(n_samples=1000):
    X, _ = generate_sample(n_samples, 10, distance=0.6)
    y = numpy.tanh(X.sum(axis=1))
    clf = UGradientBoostingRegressor(loss=MSELossFunction())
    clf.fit(X, y)
    y_pred = clf.predict(X)
    zeromse = 0.5 * mean_squared_error(y, y * 0.)
    assert mean_squared_error(y, y_pred) < zeromse, 'something wrong with regression quality'
def test_reproducibility(n_samples=200, n_features=15, distance=0.5):
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    for trainer in nnet.trainers.keys():
        clf1 = nnet.MLPClassifier(trainer=trainer, random_state=42).fit(X, y)
        clf2 = nnet.MLPClassifier(trainer=trainer, random_state=42).fit(X, y)
        assert numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X))
def test_gb_simple():
    X, y = generate_sample(n_samples=10000, n_features=10)
    X = BinTransformer().fit_transform(X)
    reg = ResearchGradientBoostingBase(loss=MSELoss())
    reg.fit(X, y)
    assert roc_auc_score(y, reg.decision_function(X)) > 0.6
def test_network_with_scaler(n_samples=200, n_features=15, distance=0.5):
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    for scaler in [BinTransformer(max_bins=16), IronTransformer()]:
        clf = nnet.SimpleNeuralNetwork(scaler=scaler, epochs=300)
        clf.fit(X, y)
        p = clf.predict_proba(X)
        assert roc_auc_score(y, p[:, 1]) > 0.8, 'quality is too low for model: {}'.format(clf)
def test_with_scaler(n_samples=200, n_features=15, distance=0.5):
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    for scaler in [BinTransformer(max_bins=16), IronTransformer()]:
        clf = nnet.SimpleNeuralNetwork(scaler=scaler, epochs=300)
        clf.fit(X, y)
        p = clf.predict_proba(X)
        assert roc_auc_score(y, p[:, 1]) > 0.8, 'quality is too low for model: {}'.format(clf)
def test_bin_transformer_extend_to(n_features=10, n_bins=123):
    extended_length = 19
    X, y = generate_sample(n_samples=20, n_features=n_features)
    X1 = BinTransformer(max_bins=n_bins).fit(X).transform(X)
    X2 = BinTransformer(max_bins=n_bins).fit(X).transform(X, extend_to=extended_length)
    assert len(X2) % extended_length == 0, 'wrong shape!'
    assert numpy.allclose(X2[:len(X1)], X1), 'extending does not work as expected!'
def test_refitting(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    booster = TreeGradientBoostingClassifier(n_estimators=100, update_tree=True,
                                             base_estimator=FastTreeRegressor())
    booster.fit(trainX, trainY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))
    booster.refit_trees(trainX, trainY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))
    booster.refit_trees(testX, testY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6): testX, testY = generate_sample(n_samples, n_features, distance=distance) trainX, trainY = generate_sample(n_samples, n_features, distance=distance) for loss in [LogLossFunction(), AdaLossFunction()]: clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2, subsample=0.7, n_estimators=10, train_features=None) clf.fit(trainX, trainY) assert clf.n_features == n_features assert len(clf.feature_importances_) == n_features # checking that predict proba works for p in clf.staged_predict_proba(testX): assert p.shape == (n_samples, 2) assert numpy.all(p == clf.predict_proba(testX)) assert roc_auc_score(testY, p[:, 1]) > 0.8, 'quality is too low' # checking clonability _ = clone(clf) clf_copy = copy.deepcopy(clf) assert (clf.predict_proba(trainX) == clf_copy.predict_proba(trainX)).all(), 'copied classifier is different'
def tree_quality_comparison(n_samples=200000, n_features=10):
    """
    Function is NOT a test, but helpful to compare performance of standard DT and new one.
    """
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features)
    # Multiplying by random matrix
    multiplier = numpy.random.normal(size=[n_features, n_features])
    trainX = numpy.dot(trainX.values, multiplier)
    testX = numpy.dot(testX.values, multiplier)

    regressors = OrderedDict()
    regressors['old'] = DecisionTreeRegressor(max_depth=10, min_samples_split=50)
    regressors['new'] = FastTreeRegressor(max_depth=10, min_samples_split=50, criterion='pvalue')
    w = numpy.ones(n_samples)
    for name, regressor in regressors.items():
        regressor.fit(trainX, trainY, sample_weight=w)
        print(name, roc_auc_score(testY, regressor.predict(testX)))
def test_classifier_with_dataframe():
    try:
        from rep.estimators import SklearnClassifier
        clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=1))
        X, y = generate_sample(n_samples=100, n_features=4)
        for X_ in [X, pandas.DataFrame(X)]:
            lookup = LookupClassifier(clf, n_bins=16).fit(X_, y)
            lookup.predict_proba(X)
    except ImportError:
        print('expected fail: yandex/rep not installed')
def test_gb_ranking(n_samples=1000): distance = 0.6 testX, testY = generate_sample(n_samples, 10, distance) trainX, trainY = generate_sample(n_samples, 10, distance) rank_variable = 'column1' trainX[rank_variable] = numpy.random.randint(0, 3, size=len(trainX)) testX[rank_variable] = numpy.random.randint(0, 3, size=len(testX)) rank_loss1 = RankBoostLossFunction(request_column=rank_variable, update_iterations=1) rank_loss2 = RankBoostLossFunction(request_column=rank_variable, update_iterations=2) rank_loss3 = RankBoostLossFunction(request_column=rank_variable, update_iterations=10) for loss in [rank_loss1, rank_loss2, rank_loss3]: clf = UGradientBoostingRegressor(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2, subsample=0.7, n_estimators=25, train_features=None) \ .fit(trainX[:n_samples], trainY[:n_samples]) result = roc_auc_score(testY, clf.predict(testX)) assert result >= 0.8, "The quality is too poor: {} with loss: {}".format(result, loss)
def test_loss_functions(size=50, epsilon=1e-3): """ Testing that Hessians and gradients of loss functions coincide with numerical approximations """ X, y = generate_sample(size, n_features=10) rank_column = X.columns[2] X[rank_column] = numpy.random.randint(0, 3, size=size) sample_weight = numpy.random.exponential(size=size) tested_losses = [ losses.MSELossFunction(), losses.MAELossFunction(), losses.LogLossFunction(), losses.AdaLossFunction(), losses.KnnAdaLossFunction(X.columns[:1], uniform_label=1, knn=5), losses.CompositeLossFunction(), losses.RankBoostLossFunction(rank_column), ] pred = numpy.random.normal(size=size) # y = pred is a special point in i.e. MAELossFunction pred[numpy.abs(y - pred) < epsilon] = -0.1 print(sum(numpy.abs(y - pred) < epsilon)) for loss in tested_losses: loss.fit(X, y, sample_weight=sample_weight) # testing sign of gradient val = loss(pred) gradient = loss.negative_gradient(pred) numer_gradient = numpy.zeros(len(pred)) numer_hessian = numpy.zeros(len(pred)) for i in range(size): pred_plus = pred.copy() pred_plus[i] += epsilon val_plus = loss(pred_plus) pred_minus = pred.copy() pred_minus[i] -= epsilon val_minus = loss(pred_minus) numer_gradient[i] = -(val_plus - val_minus) / 2. / epsilon numer_hessian[i] = (val_plus + val_minus - 2 * val) / epsilon**2 assert numpy.allclose( gradient, numer_gradient), 'wrong computation of gradient for {}'.format( loss) if not isinstance(loss, losses.MSELossFunction) and not isinstance( loss, losses.MAELossFunction): assert (gradient * (2 * y - 1) >= 0).all(), 'wrong signs of gradients' if isinstance(loss, losses.HessianLossFunction): hessian = loss.hessian(pred) assert numpy.allclose( hessian, numer_hessian, atol=1e-5), 'wrong computation of hessian for {}'.format(loss)
def test_constant_fitting(n_samples=1000, n_features=5):
    """
    Testing if initial constant fitted properly
    """
    X, y = generate_sample(n_samples=n_samples, n_features=n_features)
    y = y.astype(numpy.float) + 1000.
    for loss in [MSELossFunction(), losses.MAELossFunction()]:
        gb = UGradientBoostingRegressor(loss=loss, n_estimators=10)
        gb.fit(X, y)
        p = gb.predict(X)
        assert mean_squared_error(p, y) < 0.5
def test_step_optimality(n_samples=100): """ testing that for single leaf function returns the optimal value """ X, y = generate_sample(n_samples, n_features=10) rank_column = X.columns[2] X[rank_column] = numpy.random.randint(0, 3, size=n_samples) tested_losses = [ losses.MAELossFunction(), losses.LogLossFunction(), losses.AdaLossFunction(), losses.KnnAdaLossFunction(X.columns[:1], uniform_label=0, knn=5), losses.CompositeLossFunction(), losses.RankBoostLossFunction(rank_column), losses.MSELossFunction(), ] pred = numpy.random.normal(size=n_samples) for loss in tested_losses: if isinstance(loss, losses.MAELossFunction): sample_weight = numpy.ones(n_samples) else: sample_weight = numpy.random.exponential(size=n_samples) loss.fit(X, y, sample_weight=sample_weight) # Test simple way to get optimal step leaf_value = numpy.random.normal() # Some basic optimization goes here: step = 0. for _ in range(4): ministep, = loss.prepare_new_leaves_values(terminal_regions=numpy.zeros(n_samples, dtype=int), leaf_values=[leaf_value], y_pred=pred + step) step += ministep print(step) loss_values = [] coeffs = [0.9, 1.0, 1.1] for coeff in coeffs: loss_values.append(loss(pred + coeff * step)) print(loss, step, 'losses: ', loss_values) assert loss_values[1] <= loss_values[0] + 1e-7 assert loss_values[1] <= loss_values[2] + 1e-7 # Test standard function opt_value = loss.compute_optimal_step(y_pred=pred) loss_values2 = [] for coeff in coeffs: loss_values2.append(loss(pred + coeff * opt_value)) print(loss, step, 'losses: ', loss_values) assert loss_values2[1] <= loss_values2[0] + 1e-7 assert loss_values2[1] <= loss_values2[2] + 1e-7
def test_tree(n_samples=1000):
    X, y = generate_sample(n_samples=n_samples, n_features=5)
    X = numpy.array(X)
    w = numpy.ones(n_samples)

    tree = FastTreeRegressor()
    tree = tree.fit(X, y, sample_weight=w)
    prediction = tree.predict(X)
    tree.print_tree_stats()
    auc = roc_auc_score(y, prediction)
    print("AUC", auc)
    assert auc > 0.7, auc
def test_nnet(n_samples=200, n_features=7, distance=0.8, complete=False): """ :param complete: if True, all possible combinations will be checked, and quality is printed """ X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance) nn_types = [ nnet.SimpleNeuralNetwork, nnet.MLPClassifier, nnet.SoftmaxNeuralNetwork, nnet.RBFNeuralNetwork, nnet.PairwiseNeuralNetwork, nnet.PairwiseSoftplusNeuralNetwork, ] if complete: # checking all possible combinations for loss in nnet.losses: for NNType in nn_types: for trainer in nnet.trainers: nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42, epochs=100) nn.fit(X, y ) print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn) lr = LogisticRegression().fit(X, y) print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1])) assert 0 == 1, "Let's see and compare results" else: # checking combinations of losses, nn_types, trainers, most of them are used once during tests. attempts = max(len(nnet.losses), len(nnet.trainers), len(nn_types)) losses_shift = numpy.random.randint(10) trainers_shift = numpy.random.randint(10) for attempt in range(attempts): # each combination is tried 3 times. before raising exception retry_attempts = 3 for retry_attempt in range(retry_attempts): loss = list(nnet.losses.keys())[(attempt + losses_shift) % len(nnet.losses)] trainer = list(nnet.trainers.keys())[(attempt + trainers_shift) % len(nnet.trainers)] nn_type = nn_types[attempt % len(nn_types)] nn = nn_type(layers=[5], loss=loss, trainer=trainer, random_state=42 + retry_attempt, epochs=200) print(nn) nn.fit(X, y) quality = roc_auc_score(y, nn.predict_proba(X)[:, 1]) computed_loss = nn.compute_loss(X, y) if quality > 0.8: break else: print('attempt {} : {}'.format(retry_attempt, quality)) if retry_attempt == retry_attempts - 1: raise RuntimeError('quality of model is too low: {} {}'.format(quality, nn))
def tree_quality_comparison(n_samples=200000, n_features=10): trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features) testX, testY = generate_sample(n_samples=n_samples, n_features=n_features) # Multiplying by random matrix multiplier = numpy.random.normal(size=[n_features, n_features]) trainX = numpy.dot(trainX.values, multiplier) testX = numpy.dot(testX.values, multiplier) regressors = OrderedDict() regressors['old'] = DecisionTreeRegressor(max_depth=10, min_samples_split=50) regressors['new'] = FastTreeRegressor(max_depth=10, min_samples_split=50, criterion='pvalue') w = numpy.ones(n_samples) for name, regressor in regressors.items(): regressor.fit(trainX, trainY, sample_weight=w) print(name, roc_auc_score(testY, regressor.predict(testX))) # Testing apply method indices1, values1 = regressors['new'].apply(testX) indices2, values2 = regressors['new'].fast_apply(testX) assert numpy.all(values1 == values2), 'two apply methods give different results'
def test_gradient_boosting(n_samples=1000): """ Testing workability of GradientBoosting with different loss function """ # Generating some samples correlated with first variable distance = 0.6 testX, testY = generate_sample(n_samples, 10, distance) trainX, trainY = generate_sample(n_samples, 10, distance) # We will try to get uniform distribution along this variable uniform_features = ['column0'] loss1 = LogLossFunction() loss2 = AdaLossFunction() loss3 = losses.CompositeLossFunction() loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1) loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1]) loss6bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0) loss7bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]) loss6knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1) loss7knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]) for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]: clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2, subsample=0.7, n_estimators=25, train_features=None) \ .fit(trainX[:n_samples], trainY[:n_samples]) result = clf.score(testX, testY) assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss) trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX)) for loss in [losses.MSELossFunction(), losses.MAELossFunction(), losses.RankBoostLossFunction(request_column='fake_request')]: print(loss) clf = UGradientBoostingRegressor(loss=loss, max_depth=3, n_estimators=50, learning_rate=0.01, subsample=0.5, train_features=list(trainX.columns[1:])) clf.fit(trainX, trainY) roc_auc = roc_auc_score(testY, clf.predict(testX)) assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)
def test_gb_quality(n_samples=10000, n_features=10, distance=0.5): trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance) testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance) # Multiplying by random matrix multiplier = numpy.random.normal(size=[n_features, n_features]) shift = numpy.random.normal(size=[1, n_features]) * 5 trainX = numpy.dot(trainX.values, multiplier) + shift testX = numpy.dot(testX.values, multiplier) + shift boosters = { 'old_boost': GradientBoostingClassifier(n_estimators=100, min_samples_split=50, max_depth=5, subsample=0.3), 'fast+old_tree': CommonGradientBoosting(n_estimators=100, base_estimator=DecisionTreeRegressor(min_samples_split=50, max_depth=5)), 'fast+neuro': TreeGradientBoostingClassifier(n_estimators=100, update_tree=True, base_estimator=FastNeuroTreeRegressor()), 'fold+tree': FoldingGBClassifier(loss=BinomialDeviance(), n_estimators=10, update_tree=True, base_estimator=FastNeuroTreeRegressor()), 'ugb': uGradientBoostingClassifier(loss=AdaLossFunction(), n_estimators=100, min_samples_split=50, max_depth=5, update_tree=True, subsample=0.3) } for criterion in ['mse', # 'fmse', # 'pvalue', # 'significance', 'significance2', # 'gini', 'entropy', 'poisson' ]: boosters['fast-' + criterion[:4]] = TreeGradientBoostingClassifier(n_estimators=100, update_tree=True, base_estimator=FastTreeRegressor(criterion=criterion)) for name, booster in boosters.items(): start = time.time() booster.fit(trainX, trainY) auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1]) print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))