import time

import numpy
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeRegressor

# NOTE: the project-internal import paths below are assumptions based on hep_ml's
# layout and may need adjusting to the local package structure.
from hep_ml.commonutils import check_gradient, generate_sample
from hep_ml.experiments.fastgb import (BinomialDeviance, CommonGradientBoosting,
                                       FoldingGBClassifier, TreeGradientBoostingClassifier)
from hep_ml.experiments.fasttree import FastNeuroTreeRegressor, FastTreeRegressor
from hep_ml.gradientboosting import uGradientBoostingClassifier
from hep_ml.losses import (AdaLossFunction, BinFlatnessLossFunction, BinomialDevianceLossFunction,
                           KnnFlatnessLossFunction, SimpleKnnLossFunction)


def test_gradient_boosting(n_samples=1000):
    # Generate samples whose labels are correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to obtain a uniform prediction distribution along this variable
    uniform_variables = ['column0']
    n_estimators = 20

    loss1 = SimpleKnnLossFunction(uniform_variables)
    # loss2 = PairwiseKnnLossFunction(uniform_variables, knn=10)
    loss3 = BinomialDevianceLossFunction()
    # loss4 = RandomKnnLossFunction(uniform_variables, samples * 2, knn=5, knn_factor=3)
    # loss5 = DistanceBasedKnnFunction(uniform_variables, knn=10, distance_dependence=lambda r: numpy.exp(-0.1 * r))
    loss6bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7bin = BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5)
    loss7knn = KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])
    # loss8 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=1)
    # loss9 = NewFlatnessLossFunction(uniform_variables, ada_coefficient=0.5, uniform_label=[0, 1])

    # Each loss should still yield a reasonably accurate classifier
    for loss in [loss1, loss3, loss6bin, loss7bin, loss6knn, loss7knn]:
        result = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                             subsample=0.7, n_estimators=n_estimators, train_variables=None) \
            .fit(trainX[:n_samples], trainY[:n_samples]).score(testX, testY)
        assert result >= 0.7, "The quality is too poor: %.3f" % result

    # Verify the analytic gradients of these losses numerically
    for loss in [loss1, loss3]:
        check_gradient(loss)

    print('uniform gradient boosting is ok')
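# Below is a minimal sketch of the kind of numerical check `check_gradient` performs,
# assuming a hep_ml-style loss interface with fit(X, y, sample_weight), __call__(y_pred)
# and negative_gradient(y_pred). These method names are assumptions for illustration,
# not the confirmed signature of check_gradient.
def numerical_gradient_check(loss, X, y, eps=1e-6, tol=1e-4):
    # Fit the loss on data, then compare analytic and finite-difference gradients
    loss.fit(X, y, sample_weight=numpy.ones(len(y)))
    y_pred = numpy.random.normal(size=len(y))
    analytic = -loss.negative_gradient(y_pred)  # gradient of the loss w.r.t. predictions
    for i in numpy.random.choice(len(y), size=10, replace=False):
        up, down = y_pred.copy(), y_pred.copy()
        up[i] += eps
        down[i] -= eps
        numeric = (loss(up) - loss(down)) / (2 * eps)  # central difference
        assert abs(numeric - analytic[i]) < tol, 'gradient mismatch at index %d' % i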
def test_gb_with_ada(n_samples=1000, n_features=10, distance=0.6):
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    loss = BinomialDevianceLossFunction()
    clf = uGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                      subsample=0.7, n_estimators=10, train_variables=None)
    clf.fit(trainX, trainY)
    assert clf.n_features == n_features
    assert len(clf.feature_importances_) == n_features
    # Checking that staged_predict_proba works and its last stage agrees with predict_proba
    for p in clf.staged_predict_proba(testX):
        assert p.shape == (n_samples, 2)
    assert numpy.all(p == clf.predict_proba(testX))
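# Usage sketch: staged_predict_proba yields one probability array per boosting stage
# (the last one coincides with predict_proba, which the assert above relies on), so it
# is cheap to monitor test quality stage by stage. `best_stage` is a hypothetical
# helper, not part of the tests.
def best_stage(clf, X, y):
    # AUC of each intermediate ensemble; the best index + 1 is a sensible n_estimators choice
    aucs = [roc_auc_score(y, p[:, 1]) for p in clf.staged_predict_proba(X)]
    return int(numpy.argmax(aucs)) + 1, max(aucs)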
def test_gb_quality(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    # Multiply by a random matrix (and shift) to mix the features, so the problem is not axis-aligned
    multiplier = numpy.random.normal(size=[n_features, n_features])
    shift = numpy.random.normal(size=[1, n_features]) * 5
    trainX = numpy.dot(trainX.values, multiplier) + shift
    testX = numpy.dot(testX.values, multiplier) + shift

    boosters = {
        'old_boost': GradientBoostingClassifier(n_estimators=100, min_samples_split=50, max_depth=5, subsample=0.3),
        'fast+old_tree': CommonGradientBoosting(
            n_estimators=100, base_estimator=DecisionTreeRegressor(min_samples_split=50, max_depth=5)),
        'fast+neuro': TreeGradientBoostingClassifier(
            n_estimators=100, update_tree=True, base_estimator=FastNeuroTreeRegressor()),
        'fold+tree': FoldingGBClassifier(
            loss=BinomialDeviance(), n_estimators=10, update_tree=True, base_estimator=FastNeuroTreeRegressor()),
        'ugb': uGradientBoostingClassifier(
            loss=AdaLossFunction(), n_estimators=100, min_samples_split=50, max_depth=5,
            update_tree=True, subsample=0.3),
    }

    # Try several splitting criteria for the fast tree; the commented-out ones are disabled
    for criterion in [
        'mse',
        # 'fmse',
        # 'pvalue',
        # 'significance',
        'significance2',
        # 'gini',
        'entropy',
        'poisson',
    ]:
        boosters['fast-' + criterion[:4]] = TreeGradientBoostingClassifier(
            n_estimators=100, update_tree=True, base_estimator=FastTreeRegressor(criterion=criterion))

    # Fit every booster, then report its training time and test ROC AUC
    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
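# Convenience entry point so the checks can be run directly; normally these functions
# are collected by the test runner.
if __name__ == '__main__':
    test_gradient_boosting()
    test_gb_with_ada()
    test_gb_quality()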