Example #1
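This example builds a ClassifiersFactory with three scikit-learn classifiers, trains it on one generated sample, evaluates it on an independent one, and then drives the masked classification report with a string cut, a callable cut, and no cut at all. The helpers generate_classification_sample and _classification_mask_report are assumed to be defined in the surrounding REP test module.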
def _test_classification_report(n_classes=2):
    # collect several scikit-learn classifiers in a single factory
    classifiers = ClassifiersFactory()
    classifiers.add_classifier('gb',
                               GradientBoostingClassifier(n_estimators=10))
    classifiers.add_classifier('rf', RandomForestClassifier())
    classifiers.add_classifier('ada', AdaBoostClassifier(n_estimators=10))

    # train on one generated sample
    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    classifiers.fit(X, y)

    # evaluate on an independent sample of the same size
    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    test_lds = LabeledDataStorage(X, y, sample_weight=None)
    report = classifiers.test_on_lds(test_lds)

    # cut value used by the mask variants below; a label dictionary is only
    # needed in the multiclass case
    val = numpy.mean(X['column0'])
    labels_dict = None
    if n_classes > 2:
        labels_dict = {}
        for i in range(n_classes):
            labels_dict[i] = str(i)
    # masked report with a string cut, a callable cut, and no cut at all
    _classification_mask_report(report, "column0 > %f" % val, X, labels_dict)
    _classification_mask_report(report,
                                lambda x: numpy.array(x['column0']) < val, X,
                                labels_dict)
    _classification_mask_report(report, None, X, labels_dict)
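The snippets on this page are excerpts from a larger test module, so their imports are not shown. A minimal sketch of what they appear to rely on, assuming the usual scikit-learn and REP layout (the REP module paths are assumptions, not taken from the original source); helpers such as generate_classification_sample, generate_classification_data, check_report_with_mask, check_classification_learning_curve_masks, roc_auc_score_mod and _classification_mask_report are taken to be local utilities of that test module and are not reproduced here:

import numpy
from six.moves import cPickle  # plain `import cPickle` on Python 2 would work as well
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from rep.data import LabeledDataStorage        # assumed module path
from rep.metaml import ClassifiersFactory      # assumed module path
from rep.report import ClassificationReport    # assumed module path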
Example #2
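This example covers the factory end to end: it optionally adds a TMVAClassifier when that backend is importable, fits in parallel with parallel_profile='threads-4', checks predict and predict_proba output (accuracy above 0.7, probabilities non-negative and summing to one, ROC AUC above 0.8), walks staged_predict_proba, round-trips the factory through pickle, and finishes with classification reports and masked-report checks. The yield statements make it a generator-style test as used by nose-style runners; generate_classification_data, check_report_with_mask and roc_auc_score_mod are assumed to come from the surrounding test utilities.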
def test_factory():
    factory = ClassifiersFactory()
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns),
                                  parallel_profile='threads-4')
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)
    # predictions and probabilities are also computed in parallel;
    # every classifier must reach a reasonable accuracy
    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'

        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), 2)

        # the last staged prediction must coincide with the plain predict_proba output
        assert numpy.all(p == proba[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    # a report can be built from an explicit dict of classifiers, from a
    # LabeledDataStorage, or directly from arrays (test_on_lds / test_on)
    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    val = numpy.mean(X['column0'])
    # generator-style checks with a string cut, a callable cut, and no cut
    yield check_report_with_mask, report, "column0 > %f" % (val / 2.), X
    yield check_report_with_mask, report, lambda x: numpy.array(x['column0']) < val * 2., X
    yield check_report_with_mask, report, None, X
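A note on the parallel_profile argument used above: in REP it controls how the factory distributes fitting and prediction over its classifiers. A value of the form 'threads-4' appears to request a local pool of four threads, other strings are treated as the name of an IPython cluster profile, and None keeps everything sequential.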
Example #3
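This example is a close variant of Example #1: the same three classifiers and the same masked-report checks, with the label dictionary written as a dict comprehension and an extra pass over the classification learning curves via check_classification_learning_curve_masks (again assumed to be a helper of the surrounding test module).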
def _check_classification_report(n_classes=2):
    classifiers = ClassifiersFactory()
    classifiers.add_classifier('gb', GradientBoostingClassifier(n_estimators=10))
    classifiers.add_classifier('rf', RandomForestClassifier())
    classifiers.add_classifier('ada', AdaBoostClassifier(n_estimators=10))

    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    classifiers.fit(X, y)

    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    test_lds = LabeledDataStorage(X, y, sample_weight=None)
    report = classifiers.test_on_lds(test_lds)

    val = numpy.mean(X['column0'])
    labels_dict = None
    if n_classes > 2:
        labels_dict = {i: str(i) for i in range(n_classes)}
    _classification_mask_report(report, "column0 > %f" % val, X, labels_dict)
    _classification_mask_report(report, lambda x: numpy.array(x['column0']) < val, X, labels_dict)
    _classification_mask_report(report, None, X, labels_dict)
    check_classification_learning_curve_masks(report, n_classes=n_classes)
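Neither report helper is a test on its own; a runner has to call it with a concrete n_classes. A minimal sketch of what such wrappers might look like (the test names and the choice of four classes are illustrative, not taken from the original source):

def test_classification_report():
    # binary case
    _check_classification_report(n_classes=2)

def test_classification_report_multiclass():
    # multiclass case exercises labels_dict and the per-class curves
    _check_classification_report(n_classes=4)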