def test_new_metrics(n_samples=2000, knn=50):
    """Check that function-style metrics (sde, cvm_flatness) agree with the
    class-based implementations (KnnBasedSDE, KnnBasedCvM).

    NOTE(review): this name is redefined later in the file; only the last
    definition is collected by the test runner — consider removing duplicates.

    :param n_samples: number of generated events.
    :param knn: number of nearest neighbours used by the knn-based metrics.
    """
    X, y = generate_sample(n_samples=n_samples, n_features=10)
    # exponential ** 0. yields unit weights — exercises the sample_weight
    # code path without actually changing the weighting
    sample_weight = numpy.random.exponential(size=n_samples) ** 0.
    predictions = numpy.random.random(size=[n_samples, 2])
    predictions /= predictions.sum(axis=1, keepdims=True)
    # scaling predictions checks that the metrics are scale-insensitive
    predictions *= 1000.

    # Checking SDE
    features = X.columns[:1]
    sde_val1 = sde(y, predictions, X, uniform_variables=features,
                   sample_weight=sample_weight, label=0, knn=knn)
    sde2 = KnnBasedSDE(n_neighbours=knn, uniform_features=features, uniform_label=0)
    sde2.fit(X, y, sample_weight=sample_weight)
    sde_val2 = sde2(y, predictions, sample_weight=sample_weight)
    # float pipelines: compare with tolerance rather than exact equality
    assert numpy.allclose(sde_val1, sde_val2), 'SDE values are different'

    # Checking CvM
    features = X.columns[:1]
    cvm_val1 = cvm_flatness(y, predictions, X, uniform_variables=features,
                            sample_weight=sample_weight, label=0, knn=knn)
    cvm2 = KnnBasedCvM(n_neighbours=knn, uniform_features=features, uniform_label=0)
    cvm2.fit(X, y, sample_weight=sample_weight)
    cvm_val2 = cvm2(y, predictions, sample_weight=sample_weight)
    assert numpy.allclose(cvm_val1, cvm_val2), 'CvM values are different'
def check_classifiers(n_samples=10000):
    """
    This function is not tested by default, it should be called manually.

    Trains AdaBoost and several uBoost variants on a synthetic sample and
    checks that uBoost achieves better flatness (lower CvM) than plain AdaBoost.

    :param n_samples: size of both the train and the test samples.
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_features = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    uBoost_SAMME = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1,
        n_neighbors=50, efficiency_steps=5, n_estimators=50,
        algorithm="SAMME")
    uBoost_SAMME_R = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1,
        n_neighbors=50, efficiency_steps=5, n_estimators=50,
        algorithm="SAMME.R")
    uBoost_SAMME_R_threaded = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1,
        n_neighbors=50, efficiency_steps=5, n_estimators=50,
        n_threads=3, subsample=0.9, algorithm="SAMME.R")

    # build from pairs: a dict-literal argument does not guarantee insertion
    # order on older Python versions, defeating the point of OrderedDict
    clf_dict = OrderedDict([
        ("Ada", ada),
        ("uBOOST", uBoost_SAMME),
        ("uBOOST.R", uBoost_SAMME_R),
        ("uBOOST.R2", uBoost_SAMME_R_threaded),
    ])

    cvms = {}
    for clf_name, clf in clf_dict.items():
        clf.fit(trainX, trainY)
        p = clf.predict_proba(testX)
        metric = KnnBasedCvM(uniform_features=uniform_features)
        metric.fit(testX, testY)
        cvms[clf_name] = metric(testY, p, sample_weight=numpy.ones(len(testY)))

    # fixed key: classifiers dict uses 'Ada', not 'ada' (was a KeyError)
    assert cvms['uBOOST'] < cvms['Ada']
    print(cvms)
def test_new_metrics(n_samples=2000, knn=50):
    """Verify that functional metrics (sde, cvm_flatness) and their class-based
    counterparts (KnnBasedSDE, KnnBasedCvM) produce the same values.

    NOTE(review): duplicate definition — this shadows an earlier function of
    the same name and is itself shadowed by a later one; deduplicate.

    :param n_samples: number of generated events.
    :param knn: number of nearest neighbours for the knn-based metrics.
    """
    X, y = generate_sample(n_samples=n_samples, n_features=10)
    # ** 0. turns the exponential draws into unit weights, so the weight
    # argument is exercised without altering results
    sample_weight = numpy.random.exponential(size=n_samples) ** 0.
    predictions = numpy.random.random(size=[n_samples, 2])
    predictions /= predictions.sum(axis=1, keepdims=True)
    # multiply by a constant to make sure metrics ignore prediction scale
    predictions *= 1000.

    # Checking SDE
    features = X.columns[:1]
    sde_val1 = sde(y, predictions, X, uniform_variables=features,
                   sample_weight=sample_weight, label=0, knn=knn)
    sde2 = KnnBasedSDE(n_neighbours=knn, uniform_features=features, uniform_label=0)
    sde2.fit(X, y, sample_weight=sample_weight)
    sde_val2 = sde2(y, predictions, sample_weight=sample_weight)
    # tolerance-based comparison instead of exact float equality
    assert numpy.allclose(sde_val1, sde_val2), 'SDE values are different'

    # Checking CvM
    features = X.columns[:1]
    cvm_val1 = cvm_flatness(y, predictions, X, uniform_variables=features,
                            sample_weight=sample_weight, label=0, knn=knn)
    cvm2 = KnnBasedCvM(n_neighbours=knn, uniform_features=features, uniform_label=0)
    cvm2.fit(X, y, sample_weight=sample_weight)
    cvm_val2 = cvm2(y, predictions, sample_weight=sample_weight)
    assert numpy.allclose(cvm_val1, cvm_val2), 'CvM values are different'
def test_new_metrics(n_samples=2000, knn=50):
    """Check agreement between functional metrics (sde, theil_flatness,
    cvm_flatness) and the class-based ones (KnnBasedSDE/Theil/CvM) for
    predictions shifted by several amounts.

    :param n_samples: number of generated events.
    :param knn: number of nearest neighbours for the knn-based metrics.
    """
    X, y = generate_sample(n_samples=n_samples, n_features=10)
    # ** 0. produces unit weights — exercises the sample_weight path only
    sample_weight = numpy.random.exponential(size=n_samples) ** 0.
    predictions_orig = numpy.random.random(size=[n_samples, 2])

    for shift in [0.1, 0.2]:
        # bias the signal-class score so the metrics see a non-trivial signal
        predictions = predictions_orig.copy()
        predictions[:, 1] += shift * y
        predictions /= predictions.sum(axis=1, keepdims=True)

        # Checking SDE
        features = X.columns[:1]
        sde_val1 = sde(y, predictions, X, uniform_features=features,
                       sample_weight=sample_weight, label=0, knn=knn)
        sde_metric = KnnBasedSDE(n_neighbours=knn, uniform_features=features, uniform_label=0)
        sde_metric.fit(X, y, sample_weight=sample_weight)
        sde_val2 = sde_metric(y, predictions, sample_weight=sample_weight)
        assert numpy.allclose(sde_val1, sde_val2), 'SDE values are different'

        # Checking Theil
        theil_val1 = theil_flatness(y, predictions, X, uniform_features=features,
                                    sample_weight=sample_weight, label=0, knn=knn)
        theil_metric = KnnBasedTheil(n_neighbours=knn, uniform_features=features, uniform_label=0)
        theil_metric.fit(X, y, sample_weight=sample_weight)
        theil_val2 = theil_metric(y, predictions, sample_weight=sample_weight)
        # debug print removed — the assertion message is enough on failure
        assert numpy.allclose(theil_val1, theil_val2), 'Theil values are different'

        # Checking CvM
        features = X.columns[:1]
        cvm_val1 = cvm_flatness(y, predictions, X, uniform_features=features,
                                sample_weight=sample_weight, label=0, knn=knn)
        cvm_metric = KnnBasedCvM(n_neighbours=knn, uniform_features=features, uniform_label=0)
        cvm_metric.fit(X, y, sample_weight=sample_weight)
        cvm_val2 = cvm_metric(y, predictions, sample_weight=sample_weight)
        assert numpy.allclose(cvm_val1, cvm_val2), 'CvM values are different'