def check_classifiers(n_samples=10000):
    """Compare uniformity of uBoost variants against plain AdaBoost.

    This function is not tested by default, it should be called manually.
    Trains AdaBoost plus three uBoost configurations (SAMME, SAMME.R, and a
    threaded/subsampled SAMME.R) on a synthetic sample, then checks that
    uBoost gives a smaller KNN-based Cramer-von Mises uniformity value than
    plain AdaBoost.

    :param n_samples: size of each generated train/test sample.
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_features = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    uBoost_SAMME = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1, n_neighbors=50,
        efficiency_steps=5, n_estimators=50, algorithm="SAMME")
    uBoost_SAMME_R = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1, n_neighbors=50,
        efficiency_steps=5, n_estimators=50, algorithm="SAMME.R")
    uBoost_SAMME_R_threaded = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1, n_neighbors=50,
        efficiency_steps=5, n_estimators=50, n_threads=3, subsample=0.9,
        algorithm="SAMME.R")

    # Build from a pair list: OrderedDict(dict-literal) does not preserve
    # insertion order on interpreters older than CPython 3.7.
    clf_dict = OrderedDict([
        ("Ada", ada),
        ("uBOOST", uBoost_SAMME),
        ("uBOOST.R", uBoost_SAMME_R),
        ("uBOOST.R2", uBoost_SAMME_R_threaded),
    ])

    cvms = {}
    for clf_name, clf in clf_dict.items():
        clf.fit(trainX, trainY)
        p = clf.predict_proba(testX)
        # CvM metric is fit on the test sample's uniform feature neighborhoods.
        metric = KnnBasedCvM(uniform_features=uniform_features)
        metric.fit(testX, testY)
        cvms[clf_name] = metric(testY, p, sample_weight=np.ones(len(testY)))

    # BUG FIX: the dictionary key is 'Ada', not 'ada' -- the previous
    # assertion always raised KeyError before ever comparing values.
    assert cvms['uBOOST'] < cvms['Ada']
    print(cvms)
def check_classifiers(n_samples=10000, output_name_pattern=None):
    """Train several classifiers and draw uniformity/learning plots.

    This function is not tested by default, it should be called manually.

    :param n_samples: size of each generated train/test sample.
    :param output_name_pattern: optional '%s'-style pattern; when given, each
        produced figure is saved under a name derived from this pattern.
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_variables = ['column0']

    # 'Ideal' hides the uniform variable from a naive-Bayes baseline.
    classifiers = ClassifiersDict({
        "Ada": AdaBoostClassifier(n_estimators=50),
        "Ideal": HidingClassifier(train_variables=trainX.columns[1:],
                                  base_estimator=GaussianNB()),
        "uBOOST": uBoostClassifier(uniform_variables=uniform_variables,
                                   n_neighbors=50, efficiency_steps=5,
                                   n_estimators=50, algorithm="SAMME"),
        "uBOOST.R": uBoostClassifier(uniform_variables=uniform_variables,
                                     n_neighbors=50, efficiency_steps=5,
                                     n_estimators=50, algorithm="SAMME.R"),
    })
    classifiers.fit(trainX, trainY)

    predictions = Predictions(classifiers, testX, testY)
    print(predictions.compute_metrics())

    def _save_figure(tag):
        # Persist the current figure only when a pattern was supplied.
        if output_name_pattern is not None:
            pl.savefig(output_name_pattern % tag, bbox="tight")

    predictions.sde_curves(uniform_variables)
    _save_figure("mse_curves")
    _ = pl.figure()
    predictions.learning_curves()
    _save_figure("learning_curves")
    predictions.efficiency(uniform_variables)
    _save_figure("efficiency_curves")
def test_probas(n_samples=1000):
    """Staged predictions must agree with the final (non-staged) ones."""
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    testX, testY = generate_sample(n_samples, 10, 0.6)
    shared_params = dict(
        n_neighbors=10,
        n_estimators=10,
        uniform_variables=['column0'],
        base_estimator=DecisionTreeClassifier(max_depth=5),
    )
    for algo in ('SAMME', 'SAMME.R'):
        full_clf = uBoostClassifier(algorithm=algo, efficiency_steps=3,
                                    **shared_params)
        bdt = uBoostBDT(algorithm=algo, **shared_params)
        for clf in (bdt, full_clf):
            clf.fit(trainX, trainY)
            final_proba = clf.predict_proba(testX)
            staged = list(clf.staged_predict_proba(testX))
            # Last staged probability must match the one-shot prediction.
            assert np.allclose(final_proba, staged[-1], atol=0.001), \
                "staged_predict doesn't coincide with the predict for proba."
        final_score = bdt.predict_score(testX)
        staged_scores = list(bdt.staged_predict_score(testX))
        assert np.allclose(final_score, staged_scores[-1]), \
            "staged_score doesn't coincide with the score."
    # One importance value per training feature.
    assert len(bdt.feature_importances_) == trainX.shape[1]
def test_quality(n_samples=3000):
    """Both uBoostBDT and uBoostClassifier should reach a reasonable AUC."""
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    common = dict(
        n_neighbors=10,
        n_estimators=10,
        uniform_features=['column0'],
        uniform_label=1,
        base_estimator=DecisionTreeClassifier(min_samples_leaf=20,
                                              max_depth=5),
    )
    for algo in ('SAMME', 'SAMME.R'):
        full_clf = uBoostClassifier(algorithm=algo, efficiency_steps=5,
                                    **common)
        bdt = uBoostBDT(algorithm=algo, **common)
        for model in (bdt, full_clf):
            model.fit(trainX, trainY)
            probabilities = model.predict_proba(testX)
            labels = model.predict(testX)
            # Signal-class probability column drives the ROC computation.
            auc = roc_auc_score(testY, probabilities[:, 1])
            assert auc > 0.7, "quality is awful"
            print("Accuracy = %.3f" % accuracy_score(testY, labels))
def test_probas(n_samples=1000):
    """Staged outputs must converge to the final predictions."""
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    testX, testY = generate_sample(n_samples, 10, 0.6)
    common = dict(
        n_neighbors=10,
        n_estimators=10,
        uniform_features=['column0'],
        uniform_label=1,
        base_estimator=DecisionTreeClassifier(max_depth=5),
    )
    for algo in ('SAMME', 'SAMME.R'):
        full_clf = uBoostClassifier(algorithm=algo, efficiency_steps=3,
                                    **common)
        bdt = uBoostBDT(algorithm=algo, **common)
        for clf in (bdt, full_clf):
            clf.fit(trainX, trainY)
            final_proba = clf.predict_proba(testX)
            staged = list(clf.staged_predict_proba(testX))
            # Last staged probability must match the one-shot prediction.
            assert np.allclose(final_proba, staged[-1], atol=0.001), \
                "staged_predict doesn't coincide with the predict for proba."
        final_score = bdt.decision_function(testX)
        staged_scores = list(bdt.staged_decision_function(testX))
        assert np.allclose(final_score, staged_scores[-1]), \
            "staged_score doesn't coincide with the score."
    # One importance value per training feature.
    assert len(bdt.feature_importances_) == trainX.shape[1]
def check_classifiers(n_samples=10000, output_name_pattern=None):
    """ This function is not tested by default, it should be called manually

    Trains AdaBoost, a GaussianNB that never sees the uniform variable
    ('Ideal'), and two uBoost variants on a synthetic sample, prints their
    metrics and draws SDE, learning-curve and efficiency plots.

    :param n_samples: size of each generated train/test sample.
    :param output_name_pattern: optional '%s'-style pattern; when given,
        each produced figure is saved under a name derived from it.
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_variables = ['column0']
    ada = AdaBoostClassifier(n_estimators=50)
    # 'Ideal' baseline: the uniform variable is hidden from the estimator.
    ideal_bayes = HidingClassifier(train_variables=trainX.columns[1:],
                                   base_estimator=GaussianNB())
    uBoost_SAMME = uBoostClassifier(uniform_variables=uniform_variables,
                                    n_neighbors=50,
                                    efficiency_steps=5,
                                    n_estimators=50,
                                    algorithm="SAMME")
    uBoost_SAMME_R = uBoostClassifier(uniform_variables=uniform_variables,
                                      n_neighbors=50,
                                      efficiency_steps=5,
                                      n_estimators=50,
                                      algorithm="SAMME.R")
    clf_dict = ClassifiersDict({
        "Ada": ada,
        "Ideal": ideal_bayes,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R
    })
    clf_dict.fit(trainX, trainY)
    predictions = Predictions(clf_dict, testX, testY)
    # predictions.print_mse(uniform_variables, in_html=False)
    print(predictions.compute_metrics())
    # Each plot is optionally saved when an output pattern was supplied.
    predictions.sde_curves(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "mse_curves", bbox="tight")
    _ = pl.figure()
    predictions.learning_curves()
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "learning_curves", bbox="tight")
    predictions.efficiency(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "efficiency_curves", bbox="tight")
# ---- Build and train the classifiers being compared -----------------------
uniform_features = ["mass"]
n_estimators = 150
base_estimator = DecisionTreeClassifier(max_depth=4)

# Plain gradient boosting: the non-uniform baseline.
base_ada = GradientBoostingClassifier(max_depth=4, n_estimators=100,
                                      learning_rate=0.1)
AdaBoost = SklearnClassifier(base_ada, features=train_features)

# Uniform gradient boosting with the knn-Ada loss.
knnloss = ugb.KnnAdaLossFunction(uniform_features, knn=10, uniform_label=1)
ugbKnn = ugb.UGradientBoostingClassifier(loss=knnloss, max_depth=4,
                                         n_estimators=n_estimators,
                                         learning_rate=0.4,
                                         train_features=train_features)
# BUG FIX: `uGB+knnAda = ...` was a SyntaxError (cannot assign to the
# expression `uGB+knnAda`); renamed to a valid identifier.
uGB_knnAda = SklearnClassifier(ugbKnn)

# Classic uBoost.
uboost_clf = uboost.uBoostClassifier(uniform_features=uniform_features,
                                     uniform_label=1,
                                     base_estimator=base_estimator,
                                     n_estimators=n_estimators,
                                     train_features=train_features,
                                     efficiency_steps=12, n_threads=4)
uBoost = SklearnClassifier(uboost_clf)

# Uniform gradient boosting with the knn flatness loss.
flatnessloss = ugb.KnnFlatnessLossFunction(uniform_features,
                                           fl_coefficient=3., power=1.3,
                                           uniform_label=1)
ugbFL = ugb.UGradientBoostingClassifier(loss=flatnessloss, max_depth=4,
                                        n_estimators=n_estimators,
                                        learning_rate=0.1,
                                        train_features=train_features)
# BUG FIX: `uGB+FL = ...` was likewise invalid; renamed.
uGB_FL = SklearnClassifier(ugbFL)

# Train every classifier on the same sample.
AdaBoost.fit(train_i)
uGB_knnAda.fit(train_i)
uBoost.fit(train_i)
uGB_FL.fit(train_i)