def test_factory():
    """Run a classifiers factory through fit, prediction, pickling and reporting.

    This is a generator test: after the inline assertions it yields three
    ``(check_report_with_mask, report, mask, X)`` tuples, one per mask kind
    (string expression, callable, ``None``).
    NOTE(review): presumably consumed by a nose-style generator-test runner --
    confirm against the project's test configuration.
    """
    factory = ClassifiersFactory()
    # TMVA is optional: when the import fails the factory is tested without it.
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()

    # fit() is expected to hand back the factory itself.
    assert factory == factory.fit(
        X, y,
        sample_weight=sample_weight,
        features=list(X.columns),
        parallel_profile='threads-4',
    )
    for classifier in factory.values():
        assert list(classifier.features) == list(X.columns)

    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')

    # Hard predictions: every classifier must reach a minimal accuracy.
    for name, predicted in labels.items():
        accuracy = accuracy_score(y, predicted)
        print(name, accuracy)
        assert accuracy > 0.7, name

    # Probabilities: rows are valid distributions and rank the classes well.
    for class_proba in proba.values():
        assert numpy.allclose(class_proba.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(class_proba >= 0.), 'negative probabilities'
        auc_score = roc_auc_score(y, class_proba[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for name, stages in factory.staged_predict_proba(X).items():
        assert name != 'tmva', 'tmva does not support staged pp'
        for stage_proba in stages:
            assert stage_proba.shape == (len(X), 2)
        # The final stage has to coincide with the plain predict_proba output.
        assert numpy.all(stage_proba == proba[name])

    # Pickle round-trip: the restored factory must predict exactly the same.
    restored = cPickle.loads(cPickle.dumps(factory))
    assert type(factory) == type(restored)

    original_probs = factory.predict_proba(X)
    restored_probs = restored.predict_proba(X)
    for name, expected in original_probs.items():
        assert numpy.all(expected == restored_probs[name]), 'something strange was loaded'

    # Reports: build one by hand for 'rf' and plot shuffled feature importance.
    rf_report = ClassificationReport({'rf': factory['rf']},
                                     LabeledDataStorage(X, y, sample_weight))
    importance = rf_report.feature_importance_shuffling(roc_auc_score_mod)
    importance.plot(new_plot=True, figsize=(18, 3))

    # Called only to check that it runs; its result is not used further.
    factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)

    column_mean = numpy.mean(X['column0'])
    masks = (
        "column0 > %f" % (column_mean / 2.),
        lambda x: numpy.array(x['column0']) < column_mean * 2.,
        None,
    )
    for mask in masks:
        yield check_report_with_mask, report, mask, X
# Persist the trained classifiers to a pickle file.
with open(cname, 'wb') as clf_file:
    pickle.dump(classifiers, clf_file)

# Export in TMVA format; each training feature is paired with the type code 'F'.
tmva_vars = [(feature, 'F') for feature in uconfig.features.train]
if "bdt" in uconfig.training.algorithms:
    skTMVA.convert_bdt__Grad(classifiers["bdt"], tmva_vars, wname_bdt)

# The UGradientBoostingClassifier has to be made compatible with sklearn's
# GradientBoostingClassifier before it can go through the same conversion.
# This happens after the pickle above has been written.
if "ubdt" in uconfig.training.algorithms:
    from mods import uGB_to_GB
    uGB_to_GB(classifiers["ubdt"])
    skTMVA.convert_bdt__Grad(classifiers["ubdt"], tmva_vars, wname_ubdt)

# Build and save the reports. Because of the report structure, the classifiers
# have to be evaluated once per set of weights, on both train and test samples.
reports = {}
for weight in sorted(W):
    for split_name, split_X, split_Y, split_W in (
        ("train", trainX, trainY, trainW),
        ("test", testX, testY, testW),
    ):
        reports[split_name + weight] = classifiers.test_on(
            split_X, split_Y, sample_weight=split_W[weight])

with open(rname, 'wb') as report_file:
    pickle.dump(reports, report_file)

if args.verbose:
    fprint("Finish saving")