def test_factory():
    factory = RegressorsFactory()
    try:
        from rep.estimators.tmva import TMVARegressor
        factory.add_regressor('tmva', TMVARegressor())
    except ImportError:
        pass
    factory.add_regressor('rf', RandomForestRegressor(n_estimators=10))
    factory.add_regressor('ada', AdaBoostRegressor(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X,
                                  y,
                                  sample_weight=sample_weight,
                                  features=list(X.columns))
    values = factory.predict(X)

    for cl in factory.values():
        assert list(cl.features) == list(X.columns)

    for key, val in values.items():
        score = mean_squared_error(y, val)
        print(score)
        assert score < 0.2

    for key, iterator in factory.staged_predict(X).items():
        assert key != 'tmva', 'tmva does not support staged predict'
        for p in iterator:
            assert p.shape == (len(X), )

        # the last staged prediction should coincide with the plain predict output
        assert numpy.all(p == values[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict(X)
    probs2 = clf_loaded.predict(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = RegressionReport({'rf': factory['rf']},
                              LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(mean_squared_mod).plot(new_plot=True,
                                                               figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    report.feature_importance()
    report.features_correlation_matrix()
    report.predictions_scatter()

    val = numpy.mean(X['column0'])
    report_mask(report, "column0 > %f" % val, X)
    report_mask(report, lambda x: numpy.array(x['column0']) < val, X)
    report_mask(report, None, X)
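
The report_mask helper called above is not defined in this snippet; a plausible minimal version (an assumption, not the original test helper) resolves the mask and re-runs a couple of the report plots on the selected events:

import numpy

def report_mask(report, mask, X):
    """Sketch: exercise the regression report on the events selected by `mask`,
    which may be a numexpr-style string, a callable on X, or None."""
    if mask is None:
        selection = numpy.ones(len(X), dtype=bool)
    elif isinstance(mask, str):
        selection = numpy.array(X.eval(mask), dtype=bool)
    else:
        selection = numpy.array(mask(X), dtype=bool)
    assert len(selection) == len(X)
    # assumption: REP report plotting methods accept a `mask` argument
    report.features_correlation_matrix(mask=mask)
    report.predictions_scatter(mask=mask)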
Example #2
def test_factory():
    factory = ClassifiersFactory()
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns),
                                  parallel_profile='threads-4')
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)
    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'

        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged predict_proba'
        for p in iterator:
            assert p.shape == (len(X), 2)

        # the last staged probabilities should coincide with the plain predict_proba output
        assert numpy.all(p == proba[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    val = numpy.mean(X['column0'])
    yield check_report_with_mask, report, "column0 > %f" % (val / 2.), X
    yield check_report_with_mask, report, lambda x: numpy.array(x['column0']) < val * 2., X
    yield check_report_with_mask, report, None, X
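
The metric adapters roc_auc_score_mod and mean_squared_mod passed to feature_importance_shuffling in the two examples above are not shown either; presumably they are thin wrappers giving the sklearn metrics the (y_true, prediction, sample_weight) signature the shuffling importance expects. A sketch under that assumption:

from sklearn.metrics import roc_auc_score, mean_squared_error

def roc_auc_score_mod(y_true, prediction, sample_weight=None):
    # classification reports pass predict_proba output, so score the signal column
    return roc_auc_score(y_true, prediction[:, 1], sample_weight=sample_weight)

def mean_squared_mod(y_true, prediction, sample_weight=None):
    # regression predictions are already one-dimensional
    return mean_squared_error(y_true, prediction, sample_weight=sample_weight)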
Example #3
def test_own_classification_reports():
    """
    testing clf.test_on
    """
    X, y, sample_weight = generate_classification_data()
    clf = SklearnClassifier(RandomForestClassifier())
    clf.fit(X, y, sample_weight=sample_weight)
    report = clf.test_on(X, y, sample_weight=sample_weight)
    roc1 = report.compute_metric(RocAuc())

    lds = LabeledDataStorage(X, y, sample_weight=sample_weight)
    roc2 = clf.test_on_lds(lds=lds).compute_metric(RocAuc())
    assert roc1 == roc2, 'Something wrong with test_on'
Example #4
def test_own_regression_reports():
    """
    testing regressor.test_on
    """
    X, y, sample_weight = generate_regression_data()
    regressor = SklearnRegressor(RandomForestRegressor())
    regressor.fit(X, y, sample_weight=sample_weight)
    report = regressor.test_on(X, y, sample_weight=sample_weight)
    mse1 = report.compute_metric(mean_squared_error)

    lds = LabeledDataStorage(X, y, sample_weight=sample_weight)
    mse2 = regressor.test_on_lds(lds=lds).compute_metric(mean_squared_error)
    assert mse1 == mse2, 'Something wrong with test_on'
Example #5
    def stacker_test_on(self, X, y, sample_weight=None):
        """Return report for the stacker only"""
        lds = LabeledDataStorage(X, y, sample_weight)
        return self.stacker_test_on_lds(lds)
Example #6
    def test_on(self, X, y, sample_weight=None):
        """Return a report for this estimator on the given dataset"""
        lds = LabeledDataStorage(X, y, sample_weight)
        return self.test_on_lds(lds)
Example #7
                            np.zeros(backgr.shape[0])))
        w = np.ones(len(X))

    if primitiv:
        X = pd.DataFrame({'odin': np.array([2., 2., 2., 2., 3., 3., 2., 3., 8.,
                                            7., 8., 7., 8., 8., 7., 8.]),
                          'dwa': np.array([2.2, 2.1, 2.2, 2.3, 3.1, 3.1, 2.1, 3.2, 8.1,
                                           7.5, 8.2, 7.1, 8.5, 8.2, 7.6, 8.1])
                          })
        y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])
        w = np.ones(16)
        branch_names = ['odin', 'dwa']
    print(branch_names)
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.33)

    lds = LabeledDataStorage(X_test, y_test, w_test)
    # CLASSIFIER
    clf_stacking = SklearnClassifier(RandomForestClassifier(n_estimators=5000, bootstrap=False,
                                                            n_jobs=7))
    # clf_stacking = XGBoostClassifier(n_estimators=700, eta=0.1, nthreads=8,
    #                                 subsample=0.5
    #                                 )
    # clf_stacking='nn'
    clf = Mayou(base_estimators={'xgb': None}, bagging_base=None, bagging_stack=8,
                stacking=clf_stacking, features_stack=branch_names,
                transform=False, transform_pred=False)
    # clf = SklearnClassifier(GaussianNB())
    # clf = SklearnClassifier(BaggingClassifier(n_jobs=1, max_features=1.,
    # bootstrap=False, base_estimator=clf, n_estimators=20, max_samples=0.1))
    # clf = XGBoostClassifier(n_estimators=400, eta=0.1, nthreads=6)
    # clf = SklearnClassifier(BaggingClassifier(clf, max_samples=0.8))
Example #8
def train_one_vs_one(base_estimators,
                     data_b,
                     data_c,
                     data_light,
                     prefix='bdt',
                     n_folds=2,
                     folding=True,
                     features=None,
                     profile=None):

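    # Build the three pairwise training sets; labels are 1 for the first sample
    # of each pair and 0 for the second.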
    data_b_c_lds = LabeledDataStorage(pandas.concat([data_b, data_c]),
                                      [1] * len(data_b) + [0] * len(data_c))
    data_c_light_lds = LabeledDataStorage(pandas.concat(
        [data_c, data_light]), [1] * len(data_c) + [0] * len(data_light))
    data_b_light_lds = LabeledDataStorage(pandas.concat(
        [data_b, data_light]), [1] * len(data_b) + [0] * len(data_light))

    if folding:
        tt_folding_b_c = FoldingClassifier(base_estimators[0],
                                           n_folds=n_folds,
                                           random_state=11,
                                           parallel_profile=profile,
                                           features=features)
        tt_folding_c_light = FoldingClassifier(base_estimators[1],
                                               n_folds=n_folds,
                                               random_state=11,
                                               parallel_profile=profile,
                                               features=features)
        tt_folding_b_light = FoldingClassifier(base_estimators[2],
                                               n_folds=n_folds,
                                               random_state=11,
                                               parallel_profile=profile,
                                               features=features)
    else:
        tt_folding_b_c = base_estimators[0]
        tt_folding_b_c.features = features
        tt_folding_c_light = base_estimators[1]
        tt_folding_c_light.features = features
        tt_folding_b_light = base_estimators[2]
        tt_folding_b_light.features = features

    tt_folding_b_c.fit_lds(data_b_c_lds)

    tt_folding_c_light.fit_lds(data_c_light_lds)

    tt_folding_b_light.fit_lds(data_b_light_lds)

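    # Evaluate each pairwise classifier on all three samples, keeping the row
    # order [data_b, data_c, data_light] so the probability columns line up.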
    probs_b_c = numpy.concatenate([
        tt_folding_b_c.predict_proba(pandas.concat([data_b, data_c])),
        tt_folding_b_c.predict_proba(data_light)
    ])[:, 1]
    probs_c_light = numpy.concatenate([
        tt_folding_c_light.predict_proba(data_b),
        tt_folding_c_light.predict_proba(pandas.concat([data_c, data_light]))
    ])[:, 1]
    probs_b_light = tt_folding_b_light.predict_proba(
        pandas.concat([data_b, data_light]))[:, 1]
    probs_b_light = numpy.concatenate([
        probs_b_light[:len(data_b)],
        tt_folding_b_light.predict_proba(data_c)[:, 1],
        probs_b_light[len(data_b):]
    ])

    additional_columns = pandas.DataFrame({
        prefix + '_b_c': probs_b_c,
        prefix + '_b_light': probs_b_light,
        prefix + '_c_light': probs_c_light
    })
    return additional_columns
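
A hypothetical invocation of train_one_vs_one on toy data; the samples, feature names and base estimators below are placeholders, not taken from the original code:

import numpy
import pandas
from sklearn.ensemble import GradientBoostingClassifier
from rep.estimators import SklearnClassifier

rng = numpy.random.RandomState(0)
features = ['f0', 'f1']
data_b = pandas.DataFrame(rng.normal(1., 1., size=(100, 2)), columns=features)
data_c = pandas.DataFrame(rng.normal(0., 1., size=(100, 2)), columns=features)
data_light = pandas.DataFrame(rng.normal(-1., 1., size=(100, 2)), columns=features)

base = [SklearnClassifier(GradientBoostingClassifier(n_estimators=20)) for _ in range(3)]
extra_columns = train_one_vs_one(base, data_b, data_c, data_light,
                                 prefix='bdt', n_folds=2, folding=True,
                                 features=features)
print(extra_columns.columns.tolist())  # ['bdt_b_c', 'bdt_b_light', 'bdt_c_light']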