def _test_classification_report(n_classes=2):
    """Build a classification report for three classifiers and check it under several masks.

    A ``ClassifiersFactory`` holding gradient boosting, random forest and
    AdaBoost classifiers is fitted on one generated sample; the report is
    then built from a second, independently generated sample.

    :param int n_classes: number of classes requested from
        ``generate_classification_sample``. When it is greater than 2, a
        ``{class_index: str(class_index)}`` mapping is passed to the mask
        checks as ``labels_dict``; otherwise ``None`` is passed.
    """
    classifiers = ClassifiersFactory()
    classifiers.add_classifier('gb', GradientBoostingClassifier(n_estimators=10))
    classifiers.add_classifier('rf', RandomForestClassifier())
    classifiers.add_classifier('ada', AdaBoostClassifier(n_estimators=10))

    # Train on one sample ...
    train_X, train_y = generate_classification_sample(1000, 5, n_classes=n_classes)
    classifiers.fit(train_X, train_y)

    # ... and evaluate on a freshly generated one.
    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    test_lds = LabeledDataStorage(X, y, sample_weight=None)
    report = classifiers.test_on_lds(test_lds)

    val = numpy.mean(X['column0'])
    labels_dict = None
    if n_classes > 2:
        labels_dict = {i: str(i) for i in range(n_classes)}

    # The same report is checked with a string-expression mask, a callable mask and no mask.
    _classification_mask_report(report, "column0 > %f" % val, X, labels_dict)
    _classification_mask_report(report, lambda x: numpy.array(x['column0']) < val, X, labels_dict)
    _classification_mask_report(report, None, X, labels_dict)
def test_factory():
    """Generator test covering the main ``ClassifiersFactory`` workflow.

    Steps, in order: fitting with explicit features and a parallel profile,
    label and probability prediction, staged probability prediction, a pickle
    round trip, and report construction. Finally three
    ``check_report_with_mask`` checks are yielded (string mask, callable mask,
    no mask).
    """
    factory = ClassifiersFactory()
    try:
        # TMVA is optional: if the import fails the factory simply goes without it.
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()

    # fit() is expected to return something equal to the factory itself.
    assert factory == factory.fit(
        X, y,
        sample_weight=sample_weight,
        features=list(X.columns),
        parallel_profile='threads-4',
    )
    for estimator in factory.values():
        assert list(estimator.features) == list(X.columns)

    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')

    for name, predicted in labels.items():
        score = accuracy_score(y, predicted)
        print(name, score)
        assert score > 0.7, name

    for name, probabilities in proba.items():
        assert numpy.allclose(probabilities.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(probabilities >= 0.), 'negative probabilities'
        auc_score = roc_auc_score(y, probabilities[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for name, stages in factory.staged_predict_proba(X).items():
        assert name != 'tmva', 'tmva does not support staged pp'
        for stage_proba in stages:
            assert stage_proba.shape == (len(X), 2)

        # the final stage has to match the ordinary predict_proba output
        assert numpy.all(stage_proba == proba[name])

    # pickle round trip: the restored factory must predict exactly the same
    restored = cPickle.loads(cPickle.dumps(factory))
    assert type(factory) == type(restored)

    original_probs = factory.predict_proba(X)
    restored_probs = restored.predict_proba(X)
    for name, probabilities in original_probs.items():
        assert numpy.all(probabilities == restored_probs[name]), 'something strange was loaded'

    rf_only = {'rf': factory['rf']}
    report = ClassificationReport(rf_only, LabeledDataStorage(X, y, sample_weight))
    importance = report.feature_importance_shuffling(roc_auc_score_mod)
    importance.plot(new_plot=True, figsize=(18, 3))

    # Both report entry points are called; only the test_on() result is used below.
    factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)

    column_mean = numpy.mean(X['column0'])
    masks = (
        "column0 > %f" % (column_mean / 2.),
        lambda x: numpy.array(x['column0']) < column_mean * 2.,
        None,
    )
    for mask in masks:
        yield check_report_with_mask, report, mask, X
def _check_classification_report(n_classes=2):
    """Fit three classifiers, build a report on fresh data and run the report checks.

    The report is passed to ``_classification_mask_report`` with a string
    mask, a callable mask and no mask, and then to
    ``check_classification_learning_curve_masks``.

    :param int n_classes: number of classes in the generated samples; for more
        than two classes a ``{index: str(index)}`` labels mapping is supplied
        to the mask checks, otherwise ``None``.
    """
    factory = ClassifiersFactory()
    factory.add_classifier('gb', GradientBoostingClassifier(n_estimators=10))
    factory.add_classifier('rf', RandomForestClassifier())
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=10))

    # training sample
    train_X, train_y = generate_classification_sample(1000, 5, n_classes=n_classes)
    factory.fit(train_X, train_y)

    # separately generated sample the report is built from
    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight=None))

    threshold = numpy.mean(X['column0'])
    labels_dict = {i: str(i) for i in range(n_classes)} if n_classes > 2 else None

    masks = (
        "column0 > %f" % threshold,
        lambda x: numpy.array(x['column0']) < threshold,
        None,
    )
    for mask in masks:
        _classification_mask_report(report, mask, X, labels_dict)

    check_classification_learning_curve_masks(report, n_classes=n_classes)