def test_factory():
    """Integration test for RegressorsFactory.

    Builds a factory with several regressors (TMVA only when available),
    fits it, then exercises prediction, staged prediction, pickling and
    the RegressionReport plotting/masking API end-to-end.
    """
    factory = RegressorsFactory()
    try:
        # TMVA is an optional dependency — include it only when importable.
        from rep.estimators.tmva import TMVARegressor
        factory.add_regressor('tmva', TMVARegressor())
    except ImportError:
        pass
    factory.add_regressor('rf', RandomForestRegressor(n_estimators=10))
    factory.add_regressor('ada', AdaBoostRegressor(n_estimators=20))

    # NOTE(review): classification data is used to fit regressors here
    # (targets are 0/1 labels), which is why MSE < 0.2 is a sensible bound.
    X, y, sample_weight = generate_classification_data()
    # fit() must return the factory itself (fluent interface).
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns))
    values = factory.predict(X)

    # Every stored estimator must have picked up the requested feature list.
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)

    for key, val in values.items():
        score = mean_squared_error(y, val)
        print(score)
        assert score < 0.2

    for key, iterator in factory.staged_predict(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), )

        # checking that last iteration coincides with previous
        assert numpy.all(p == values[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) == type(clf_loaded)

    # The unpickled factory must reproduce the original predictions exactly.
    probs1 = factory.predict(X)
    probs2 = clf_loaded.predict(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    # Smoke-test the report machinery (plots are produced, not checked).
    report = RegressionReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(mean_squared_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    report.feature_importance()
    report.features_correlation_matrix()
    report.predictions_scatter()

    # Exercise all three mask flavours: string expression, callable, None.
    val = numpy.mean(X['column0'])
    report_mask(report, "column0 > %f" % val, X)
    report_mask(report, lambda x: numpy.array(x['column0']) < val, X)
    report_mask(report, None, X)
def test_factory():
    """Integration test for ClassifiersFactory.

    Fits several classifiers in parallel ('threads-4' profile), checks
    predictions/probabilities, staged output, pickling, and yields
    nose-style sub-tests for the report masking API (this function is a
    generator because of the trailing ``yield`` statements).
    """
    factory = ClassifiersFactory()
    try:
        # TMVA is an optional dependency — include it only when importable.
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    # fit() must return the factory itself (fluent interface).
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns),
                                  parallel_profile='threads-4')

    # Every stored estimator must have picked up the requested feature list.
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)

    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')

    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        # Probabilities must be a valid distribution per sample.
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'
        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), 2)

        # checking that last iteration coincides with previous
        assert numpy.all(p == proba[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) == type(clf_loaded)

    # The unpickled factory must reproduce the original probabilities exactly.
    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    # Smoke-test the report machinery (plots are produced, not checked).
    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)

    # Yield all three mask flavours as separate sub-tests:
    # string expression, callable, and None (no mask).
    val = numpy.mean(X['column0'])
    yield check_report_with_mask, report, "column0 > %f" % (val / 2.), X
    yield check_report_with_mask, report, lambda x: numpy.array(x['column0']) < val * 2., X
    yield check_report_with_mask, report, None, X
def test_own_classification_reports():
    """Verify that clf.test_on and clf.test_on_lds yield identical metrics."""
    data, labels, weights = generate_classification_data()
    classifier = SklearnClassifier(RandomForestClassifier())
    classifier.fit(data, labels, sample_weight=weights)

    # ROC AUC computed through the plain array-based interface.
    direct_auc = classifier.test_on(data, labels, sample_weight=weights).compute_metric(RocAuc())

    # The same metric computed via a LabeledDataStorage must agree exactly.
    storage = LabeledDataStorage(data, labels, sample_weight=weights)
    lds_auc = classifier.test_on_lds(lds=storage).compute_metric(RocAuc())

    assert direct_auc == lds_auc, 'Something wrong with test_on'
def test_own_regression_reports():
    """Verify that regressor.test_on and regressor.test_on_lds yield identical metrics."""
    data, targets, weights = generate_regression_data()
    estimator = SklearnRegressor(RandomForestRegressor())
    estimator.fit(data, targets, sample_weight=weights)

    # MSE computed through the plain array-based interface.
    direct_mse = estimator.test_on(data, targets, sample_weight=weights).compute_metric(mean_squared_error)

    # The same metric computed via a LabeledDataStorage must agree exactly.
    storage = LabeledDataStorage(data, targets, sample_weight=weights)
    lds_mse = estimator.test_on_lds(lds=storage).compute_metric(mean_squared_error)

    assert direct_mse == lds_mse, 'Something wrong with test_on'
def stacker_test_on(self, X, y, sample_weight=None):
    """Build a report for the stacking layer only.

    Wraps the arrays into a LabeledDataStorage and delegates to
    stacker_test_on_lds.
    """
    return self.stacker_test_on_lds(LabeledDataStorage(X, y, sample_weight))
def test_on(self, X, y, sample_weight=None):
    """Build a report by wrapping the arrays into a LabeledDataStorage
    and delegating to test_on_lds."""
    return self.test_on_lds(LabeledDataStorage(X, y, sample_weight))
np.zeros(backgr.shape[0]))) w = np.ones(len(X)) if primitiv: X = pd.DataFrame({'odin': np.array([2., 2., 2., 2., 3., 3., 2., 3., 8., 7., 8., 7., 8., 8., 7., 8.]), 'dwa': np.array([2.2, 2.1, 2.2, 2.3, 3.1, 3.1, 2.1, 3.2, 8.1, 7.5, 8.2, 7.1, 8.5, 8.2, 7.6, 8.1]) }) y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]) w = np.ones(16) branch_names = ['odin', 'dwa'] print branch_names X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.33) lds = LabeledDataStorage(X_test, y_test, w_test) # CLASSIFIER clf_stacking = SklearnClassifier(RandomForestClassifier(n_estimators=5000, bootstrap=False, n_jobs=7)) # clf_stacking = XGBoostClassifier(n_estimators=700, eta=0.1, nthreads=8, # subsample=0.5 # ) # clf_stacking='nn' clf = Mayou(base_estimators={'xgb': None}, bagging_base=None, bagging_stack=8, stacking=clf_stacking, features_stack=branch_names, transform=False, transform_pred=False) # clf = SklearnClassifier(GaussianNB()) # clf = SklearnClassifier(BaggingClassifier(n_jobs=1, max_features=1., # bootstrap=False, base_estimator=clf, n_estimators=20, max_samples=0.1)) # clf = XGBoostClassifier(n_estimators=400, eta=0.1, nthreads=6) # clf = SklearnClassifier(BaggingClassifier(clf, max_samples=0.8))
def train_one_vs_one(base_estimators, data_b, data_c, data_light, prefix='bdt',
                     n_folds=2, folding=True, features=None, profile=None):
    """Train three pairwise (one-vs-one) classifiers on b / c / light jets.

    Trains base_estimators[0] on b-vs-c, base_estimators[1] on c-vs-light
    and base_estimators[2] on b-vs-light (optionally wrapped in
    FoldingClassifier for out-of-fold predictions), then returns a
    DataFrame with one probability column per pair.

    All three returned columns are row-aligned with the concatenation
    order [data_b, data_c, data_light] — see the splicing comments below.

    :param base_estimators: sequence of three classifiers, one per pair
    :param data_b, data_c, data_light: pandas DataFrames with one class each
    :param prefix: column-name prefix for the returned DataFrame
    :param n_folds: folds for FoldingClassifier (used when folding=True)
    :param folding: wrap estimators in FoldingClassifier when True
    :param features: feature list passed to the estimators
    :param profile: parallel profile for FoldingClassifier
    :return: DataFrame with columns <prefix>_b_c, <prefix>_b_light, <prefix>_c_light
    """
    # Pairwise training sets: label 1 for the first class of each pair.
    data_b_c_lds = LabeledDataStorage(pandas.concat([data_b, data_c]),
                                      [1] * len(data_b) + [0] * len(data_c))
    data_c_light_lds = LabeledDataStorage(pandas.concat([data_c, data_light]),
                                          [1] * len(data_c) + [0] * len(data_light))
    data_b_light_lds = LabeledDataStorage(pandas.concat([data_b, data_light]),
                                          [1] * len(data_b) + [0] * len(data_light))

    if folding:
        # Same random_state everywhere so fold splits are reproducible.
        tt_folding_b_c = FoldingClassifier(base_estimators[0], n_folds=n_folds, random_state=11,
                                           parallel_profile=profile, features=features)
        tt_folding_c_light = FoldingClassifier(base_estimators[1], n_folds=n_folds, random_state=11,
                                               parallel_profile=profile, features=features)
        tt_folding_b_light = FoldingClassifier(base_estimators[2], n_folds=n_folds, random_state=11,
                                               parallel_profile=profile, features=features)
    else:
        # Use the bare estimators; only attach the feature list.
        tt_folding_b_c = base_estimators[0]
        tt_folding_b_c.features = features
        tt_folding_c_light = base_estimators[1]
        tt_folding_c_light.features = features
        tt_folding_b_light = base_estimators[2]
        tt_folding_b_light.features = features

    tt_folding_b_c.fit_lds(data_b_c_lds)
    tt_folding_c_light.fit_lds(data_c_light_lds)
    tt_folding_b_light.fit_lds(data_b_light_lds)

    # b-vs-c: its training data is [b, c]; append predictions for light
    # so the column covers [b, c, light] in that order.
    probs_b_c = numpy.concatenate([
        tt_folding_b_c.predict_proba(pandas.concat([data_b, data_c])),
        tt_folding_b_c.predict_proba(data_light)
    ])[:, 1]

    # c-vs-light: prepend predictions for b → order [b, c, light].
    probs_c_light = numpy.concatenate([
        tt_folding_c_light.predict_proba(data_b),
        tt_folding_c_light.predict_proba(pandas.concat([data_c, data_light]))
    ])[:, 1]

    # b-vs-light: predict on its own training order [b, light], then
    # splice the c predictions into the middle → order [b, c, light].
    probs_b_light = tt_folding_b_light.predict_proba(
        pandas.concat([data_b, data_light]))[:, 1]
    probs_b_light = numpy.concatenate([
        probs_b_light[:len(data_b)],
        tt_folding_b_light.predict_proba(data_c)[:, 1],
        probs_b_light[len(data_b):]
    ])

    additional_columns = pandas.DataFrame({
        prefix + '_b_c': probs_b_c,
        prefix + '_b_light': probs_b_light,
        prefix + '_c_light': probs_c_light
    })
    return additional_columns