def test_probas(n_samples=1000):
    """Check that staged outputs end at the same values as direct outputs.

    For each boosting algorithm ('SAMME' and 'SAMME.R') one uBoostClassifier
    and one uBoostBDT are fitted on a generated training sample, then:

    * ``predict_proba`` must match the last item yielded by
      ``staged_predict_proba`` (``atol=0.001``) for both classifiers;
    * ``predict_score`` must match the last item yielded by
      ``staged_predict_score`` for the uBoostBDT;
    * ``feature_importances_`` of the uBoostBDT must have one entry per
      column of the training data.

    NOTE(review): SOURCE contains a second ``test_probas`` further down; if
    both live in the same module the later one shadows this one - confirm.

    :param n_samples: first argument forwarded to ``generate_sample`` for
        both the training and the testing sample.
    """
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    testX, testY = generate_sample(n_samples, 10, 0.6)

    # The tree is instantiated once; the same object is passed to every
    # classifier constructed below.
    shared_kwargs = dict(
        n_neighbors=10,
        n_estimators=10,
        uniform_variables=['column0'],
        base_estimator=DecisionTreeClassifier(max_depth=5),
    )

    for boosting in ('SAMME', 'SAMME.R'):
        uboost = uBoostClassifier(
            algorithm=boosting, efficiency_steps=3, **shared_kwargs)
        bdt = uBoostBDT(algorithm=boosting, **shared_kwargs)

        for model in (bdt, uboost):
            model.fit(trainX, trainY)
            direct_proba = model.predict_proba(testX)
            proba_stages = list(model.staged_predict_proba(testX))
            assert np.allclose(direct_proba, proba_stages[-1], atol=0.001), \
                "staged_predict doesn't coincide with the predict for proba."

        # Score-level comparison is done on the single BDT only.
        direct_score = bdt.predict_score(testX)
        score_stages = list(bdt.staged_predict_score(testX))
        assert np.allclose(direct_score, score_stages[-1]), \
            "staged_score doesn't coincide with the score."

        n_columns = trainX.shape[1]
        assert len(bdt.feature_importances_) == n_columns
def test_cuts(n_samples=1000):
    """Check the score cuts stored by uBoostBDT against the training sample.

    For every combination of boosting algorithm and target efficiency a
    uBoostBDT is fitted, after which the test asserts that

    * ``score_cut`` equals the last element of ``score_cuts_``;
    * for every stage, the number of events with ``trainY > 0.5`` whose
      staged score exceeds the stage cut brackets
      ``sum(trainY) * target_efficiency`` (cut shifted by -/+ 1e-7 for the
      upper/lower count).

    NOTE(review): SOURCE contains a second ``test_cuts`` further down; if
    both live in the same module the later one shadows this one - confirm.

    :param n_samples: first argument forwarded to ``generate_sample``.
    """
    # A single tree object is reused as base estimator for every uBoostBDT.
    base_classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_variables = ['column0']

    tolerance = 1e-7
    for boosting in ('SAMME', 'SAMME.R'):
        for efficiency in (0.1, 0.3, 0.5, 0.7, 0.9):
            bdt_kwargs = dict(
                uniform_variables=uniform_variables,
                target_efficiency=efficiency,
                n_neighbors=20,
                n_estimators=20,
                algorithm=boosting,
                base_estimator=base_classifier,
            )
            uBDT = uBoostBDT(**bdt_kwargs)
            uBDT.fit(trainX, trainY)

            expected_passed = sum(trainY) * efficiency

            assert uBDT.score_cut == uBDT.score_cuts_[-1], \
                'something wrong with computed cuts'

            # Only rows labelled above 0.5 are scored stage by stage.
            selected_rows = trainX[trainY > 0.5]
            staged_scores = uBDT.staged_predict_score(selected_rows)
            for stage_score, stage_cut in zip(staged_scores, uBDT.score_cuts_):
                n_above_loose = np.sum(stage_score > stage_cut - tolerance)
                n_above_tight = np.sum(stage_score > stage_cut + tolerance)
                assert n_above_tight <= expected_passed <= n_above_loose, "wrong stage cuts"
def test_quality(n_samples=3000):
    """Check that both uBoost classifiers reach a ROC AUC above 0.7.

    For each boosting algorithm ('SAMME' and 'SAMME.R') a uBoostBDT and a
    uBoostClassifier are fitted on a generated training sample. On the test
    sample the ROC AUC computed from column 1 of ``predict_proba`` must
    exceed 0.7; the accuracy of ``predict`` is printed.

    :param n_samples: first argument forwarded to ``generate_sample`` for
        both the testing and the training sample.
    """
    # The test sample is drawn before the training sample.
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)

    # The tree is instantiated once and shared by every classifier below.
    shared_kwargs = dict(
        n_neighbors=10,
        n_estimators=10,
        uniform_features=['column0'],
        uniform_label=1,
        base_estimator=DecisionTreeClassifier(min_samples_leaf=20, max_depth=5),
    )

    for boosting in ('SAMME', 'SAMME.R'):
        uboost = uBoostClassifier(
            algorithm=boosting, efficiency_steps=5, **shared_kwargs)
        bdt = uBoostBDT(algorithm=boosting, **shared_kwargs)

        for model in (bdt, uboost):
            model.fit(trainX, trainY)
            probabilities = model.predict_proba(testX)
            labels = model.predict(testX)
            auc = roc_auc_score(testY, probabilities[:, 1])
            assert auc > 0.7, \
                "quality is awful"
            print("Accuracy = %.3f" % accuracy_score(testY, labels))
def test_probas(n_samples=1000):
    """Check that staged outputs end at the same values as direct outputs.

    For each boosting algorithm ('SAMME' and 'SAMME.R') one uBoostClassifier
    and one uBoostBDT are fitted on a generated training sample, then:

    * ``predict_proba`` must match the last item yielded by
      ``staged_predict_proba`` (``atol=0.001``) for both classifiers;
    * ``decision_function`` must match the last item yielded by
      ``staged_decision_function`` for the uBoostBDT;
    * ``feature_importances_`` of the uBoostBDT must have one entry per
      column of the training data.

    :param n_samples: first argument forwarded to ``generate_sample`` for
        both the training and the testing sample.
    """
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    testX, testY = generate_sample(n_samples, 10, 0.6)

    # The tree is instantiated once; the same object is passed to every
    # classifier constructed below.
    shared_kwargs = dict(
        n_neighbors=10,
        n_estimators=10,
        uniform_features=['column0'],
        uniform_label=1,
        base_estimator=DecisionTreeClassifier(max_depth=5),
    )

    for boosting in ('SAMME', 'SAMME.R'):
        uboost = uBoostClassifier(
            algorithm=boosting, efficiency_steps=3, **shared_kwargs)
        bdt = uBoostBDT(algorithm=boosting, **shared_kwargs)

        for model in (bdt, uboost):
            model.fit(trainX, trainY)
            direct_proba = model.predict_proba(testX)
            proba_stages = list(model.staged_predict_proba(testX))
            assert np.allclose(direct_proba, proba_stages[-1], atol=0.001), \
                "staged_predict doesn't coincide with the predict for proba."

        # Score-level comparison is done on the single BDT only.
        direct_score = bdt.decision_function(testX)
        score_stages = list(bdt.staged_decision_function(testX))
        assert np.allclose(direct_score, score_stages[-1]), \
            "staged_score doesn't coincide with the score."

        n_columns = trainX.shape[1]
        assert len(bdt.feature_importances_) == n_columns
def test_cuts(n_samples=1000):
    """Check the score cuts stored by uBoostBDT against the training sample.

    For every combination of boosting algorithm and target efficiency a
    uBoostBDT is fitted, after which the test asserts that

    * ``score_cut`` equals the last element of ``score_cuts_``;
    * for every stage, the number of events with ``trainY > 0.5`` whose
      staged decision function exceeds the stage cut brackets
      ``sum(trainY) * target_efficiency`` (cut shifted by -/+ 1e-7 for the
      upper/lower count).

    :param n_samples: first argument forwarded to ``generate_sample``.
    """
    # A single tree object is reused as base estimator for every uBoostBDT.
    base_classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_features = ['column0']

    tolerance = 1e-7
    for boosting in ('SAMME', 'SAMME.R'):
        for efficiency in (0.1, 0.3, 0.5, 0.7, 0.9):
            bdt_kwargs = dict(
                uniform_features=uniform_features,
                uniform_label=1,
                target_efficiency=efficiency,
                n_neighbors=20,
                n_estimators=20,
                algorithm=boosting,
                base_estimator=base_classifier,
            )
            uBDT = uBoostBDT(**bdt_kwargs)
            uBDT.fit(trainX, trainY)

            expected_passed = sum(trainY) * efficiency

            assert uBDT.score_cut == uBDT.score_cuts_[-1], \
                'something wrong with computed cuts'

            # Only rows labelled above 0.5 are scored stage by stage.
            selected_rows = trainX[trainY > 0.5]
            staged_scores = uBDT.staged_decision_function(selected_rows)
            for stage_score, stage_cut in zip(staged_scores, uBDT.score_cuts_):
                n_above_loose = np.sum(stage_score > stage_cut - tolerance)
                n_above_tight = np.sum(stage_score > stage_cut + tolerance)
                assert n_above_tight <= expected_passed <= n_above_loose, "wrong stage cuts"
def train_uBoost(X, y, w, cfg, uniforming_rate):
    """
    Build a uBoostBDT on top of a decision tree and fit it.

    Arguments:
        X: Training data, passed as first argument to `fit`.
        y: Training targets, passed as second argument to `fit`.
        w: Passed to `fit` as `sample_weight`.
        cfg: Mapping with the entries 'DecisionTreeClassifier' and 'uBoost',
            each holding the keyword arguments for the respective
            constructor. The mapping itself is not modified.
        uniforming_rate: Value passed to `uBoostBDT` as `uniforming_rate`;
            it replaces any value of that key found in `cfg['uBoost']`.

    Returns:
        The `uBoostBDT` instance on which `fit` was called.
    """
    # Base estimator configured from the tree section of the config
    tree = DecisionTreeClassifier(**cfg['DecisionTreeClassifier'])

    # Work on a copy of the uBoost section so the caller's config stays
    # untouched when the uniforming rate is overridden
    boost_kwargs = {**cfg['uBoost'], 'uniforming_rate': uniforming_rate}

    # Construct and train the classifier
    classifier = uBoostBDT(base_estimator=tree, **boost_kwargs)
    classifier.fit(X, y, sample_weight=w)

    return classifier