def _classification_mask_report(report, mask, X, labels_dict):
    """Exercise every report plot that accepts a mask (labels_dict is None for the binary case)."""
    report.features_correlation_matrix(mask=mask).plot()
    report.features_correlation_matrix_by_class(mask=mask, labels_dict=labels_dict).plot()
    report.efficiencies(features=X.columns[1:3], mask=mask, labels_dict=labels_dict).plot()
    report.features_pdf(mask=mask, labels_dict=labels_dict).plot()
    report.learning_curve(RocAuc(), mask=mask, metric_label='roc').plot()
    significance = lambda s, b: s / (numpy.sqrt(b) + 0.01)
    report.metrics_vs_cut(significance, mask=mask, metric_label='sign').plot()
    report.prediction_pdf(mask=mask, labels_dict=labels_dict).plot()
    report.scatter([X.columns[:2], X.columns[1:3]], mask=mask, labels_dict=labels_dict).plot()
    report.feature_importance().plot()
    if labels_dict is None:
        # binary case: ROC-based plots are only defined for two classes
        report.feature_importance_shuffling(mask=mask).plot()
        report.roc(mask=mask).plot()
        report.roc(mask=mask, physics_notion=False).plot()
    report.efficiencies_2d(['column0', 'column1'], 0.3, mask=mask, labels_dict=labels_dict)
    print(report.compute_metric(RocAuc()))
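
# A minimal usage sketch for the helper above. This function is hypothetical
# (not part of the original suite); it builds a binary classifier the same way
# test_own_classification_reports below does, then runs all masked report plots
# on the half of the data where column0 is positive.
def _example_mask_report_usage():
    X, y, sample_weight = generate_classification_data()
    clf = SklearnClassifier(RandomForestClassifier())
    clf.fit(X, y, sample_weight=sample_weight)
    report = clf.test_on(X, y, sample_weight=sample_weight)
    # labels_dict=None exercises the binary-only branch (ROC plots etc.)
    _classification_mask_report(report, "column0 > 0", X, labels_dict=None)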
def test_own_classification_reports():
    """
    testing clf.test_on
    """
    X, y, sample_weight = generate_classification_data()
    clf = SklearnClassifier(RandomForestClassifier())
    clf.fit(X, y, sample_weight=sample_weight)
    report = clf.test_on(X, y, sample_weight=sample_weight)
    roc1 = report.compute_metric(RocAuc())

    lds = LabeledDataStorage(X, y, sample_weight=sample_weight)
    roc2 = clf.test_on_lds(lds=lds).compute_metric(RocAuc())
    assert roc1 == roc2, 'Something wrong with test_on'
def test_gridsearch_sklearn():
    metric = numpy.random.choice([OptimalAMS(), RocAuc(), LogLoss()])
    scorer = ClassificationFoldingScorer(metric)
    maximization = True
    if isinstance(metric, LogLoss):
        maximization = False
    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4, maximize=maximization)

    grid = GridOptimalSearchCV(SklearnClassifier(clf=AdaBoostClassifier()), generator, scorer,
                               parallel_profile='threads-3')
    _ = check_grid(grid, False, False, False, use_weights=True)
    classifier = check_grid(grid, False, False, False, use_weights=False)

    # Check parameters of best fitted classifier
    assert 2 <= len(classifier.features) <= 3, 'Features were not set'
    params = classifier.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
def test_gridsearch_metrics_threads(n_threads=3):
    X, y, sample_weight = generate_classification_data(n_classes=2, distance=0.7)
    param_grid = OrderedDict({'reg_param': numpy.linspace(0, 1, 20)})

    from itertools import cycle
    optimizers = cycle([
        RegressionParameterOptimizer(param_grid=param_grid, n_evaluations=4, start_evaluations=2),
        SubgridParameterOptimizer(param_grid=param_grid, n_evaluations=4),
        RandomParameterOptimizer(param_grid=param_grid, n_evaluations=4),
    ])

    for metric in [RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss]:
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QDA())
        grid = GridOptimalSearchCV(estimator=clf,
                                   params_generator=next(optimizers),
                                   scorer=scorer,
                                   parallel_profile='threads-{}'.format(n_threads))
        grid.fit(X, y)
        print(grid.params_generator.best_score_)
        print(grid.params_generator.best_params_)
        grid.params_generator.print_results()
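
# check_report_with_mask below relies on a module-level `significance` metric
# that is not defined in this excerpt. A minimal sketch (assumed, mirroring the
# lambda used in _classification_mask_report above; `numpy` is imported at
# module level as in the other helpers):
def significance(s, b):
    # simple signal significance as a function of signal (s) and background (b)
    return s / (numpy.sqrt(b) + 0.01)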
def check_report_with_mask(report, mask, X):
    report.roc(mask=mask).plot()
    report.prediction_pdf(mask=mask).plot()
    report.features_pdf(mask=mask).plot()
    report.efficiencies(list(X.columns), mask=mask).plot()
    report.features_correlation_matrix(mask=mask).plot()
    report.feature_importance().plot()
    report.scatter([(X.columns[0], X.columns[2])], mask=mask).plot()
    report.learning_curve(RocAuc(), mask=mask).plot()
    report.metrics_vs_cut(significance, mask=mask).plot()
def check_classification_learning_curve_masks(report, n_classes):
    """testing predict_only_mask option"""
    for mask in [None, "column0 > 0", "column0 > -0.5"]:
        if n_classes == 2:
            loss = RocAuc()
        else:
            loss = log_loss
        lc1 = report.learning_curve(loss, mask=mask, predict_only_masked=True, steps=3)
        lc2 = report.learning_curve(loss, mask=mask, predict_only_masked=False, steps=3)
        assert lc1.functions == lc2.functions
def test_gridsearch_threads(n_threads=3):
    scorer = FoldingScorer(numpy.random.choice([OptimalAMS(), RocAuc()]))

    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)

    base = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base, generator, scorer,
                               parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
def test_gridsearch_on_tmva_simple():
    metric = numpy.random.choice([OptimalAMS(), RocAuc()])
    scorer = FoldingScorer(metric)

    grid_param = OrderedDict({"MaxDepth": [4, 5], "NTrees": [10, 20]})
    generator = SubgridParameterOptimizer(grid_param)

    try:
        from rep.estimators import TMVAClassifier

        grid = GridOptimalSearchCV(TMVAClassifier(features=['column0', 'column1']),
                                   generator, scorer)
        classifier = check_grid(grid, False, False, False)
        # checking parameters
        assert len(classifier.features) == 2
        params = classifier.get_params()
        for key in grid_param:
            assert params[key] == grid.generator.best_params_[key]
    except ImportError:
        pass
def test_gridsearch_on_tmva():
    metric = numpy.random.choice([OptimalAMS(), RocAuc()])
    scorer = FoldingScorer(metric)

    grid_param = OrderedDict({"MaxDepth": [4, 5], "NTrees": [10, 20]})
    generator = SubgridParameterOptimizer(n_evaluations=5, param_grid=grid_param)

    try:
        from rep.estimators import TMVAClassifier

        base_tmva = TMVAClassifier(factory_options="Silent=True:V=False:DrawProgressBar=False",
                                   features=['column0', 'column1'], method='kBDT')
        grid = GridOptimalSearchCV(base_tmva, generator, scorer)
        classifier = check_grid(grid, False, False, False)
        # checking parameters
        assert len(classifier.features) == 2
        params = classifier.get_params()
        for key in grid_param:
            assert params[key] == grid.generator.best_params_[key]
    except ImportError:
        pass
def clf_mayou(data1, data2, n_folds=3, n_base_clf=5):
    """DEVELOPMENT, WIP. Test a setup of clf involving bagging and stacking."""
    # import raredecay.analysis.ml_analysis as ml_ana
    # import pandas as pd
    import copy

    from rep.estimators import SklearnClassifier, XGBoostClassifier
    from rep.metaml.folding import FoldingClassifier
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.ensemble import BaggingClassifier  # , VotingClassifier, AdaBoostClassifier
    from rep.estimators.theanets import TheanetsClassifier
    from sklearn.linear_model import LogisticRegression
    from rep.metaml.cache import CacheClassifier
    from rep.report.metrics import RocAuc
    import rep.metaml.cache
    from rep.metaml._cache import CacheHelper

    rep.metaml.cache.cache_helper = CacheHelper('/home/mayou/cache', 100000)

    # data1.make_folds(n_folds)
    # data2.make_folds(n_folds)
    output = {}

    # for i in range(n_folds):
    xgb_clf = XGBoostClassifier(n_estimators=350, eta=0.1, max_depth=4, nthreads=3)
    xgb_folded = FoldingClassifier(base_estimator=xgb_clf, stratified=True,
                                   parallel_profile='threads-2')
    xgb_bagged = BaggingClassifier(base_estimator=xgb_folded,
                                   n_estimators=n_base_clf, bootstrap=False)
    xgb_bagged = SklearnClassifier(xgb_bagged)
    xgb_big_stacker = copy.deepcopy(xgb_bagged)
    xgb_bagged = CacheClassifier(name='xgb_bagged1', clf=xgb_bagged)

    xgb_single = XGBoostClassifier(n_estimators=350, eta=0.1, max_depth=4, nthreads=3)
    xgb_single = FoldingClassifier(base_estimator=xgb_single, stratified=True,
                                   n_folds=10, parallel_profile='threads-2')
    xgb_single = CacheClassifier(name='xgb_singled1', clf=xgb_single)

    rdf_clf = SklearnClassifier(RandomForestClassifier(n_estimators=300, n_jobs=3))
    rdf_folded = FoldingClassifier(base_estimator=rdf_clf, stratified=True,
                                   parallel_profile='threads-2')
    rdf_bagged = BaggingClassifier(base_estimator=rdf_folded,
                                   n_estimators=n_base_clf, bootstrap=False)
    rdf_bagged = SklearnClassifier(rdf_bagged)
    rdf_bagged = CacheClassifier(name='rdf_bagged1', clf=rdf_bagged)

    gb_clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=50))
    gb_folded = FoldingClassifier(base_estimator=gb_clf, stratified=True,
                                  parallel_profile='threads-6')
    gb_bagged = BaggingClassifier(base_estimator=gb_folded,
                                  n_estimators=n_base_clf, bootstrap=False, n_jobs=5)
    gb_bagged = SklearnClassifier(gb_bagged)
    gb_bagged = CacheClassifier(name='gb_bagged1', clf=gb_bagged)

    nn_clf = TheanetsClassifier(layers=[300, 300], hidden_dropout=0.03,
                                trainers=[{'optimize': 'adagrad', 'patience': 5,
                                           'learning_rate': 0.2, 'min_improvement': 0.1,
                                           'momentum': 0.4, 'nesterov': True, 'loss': 'xe'}])
    nn_folded = FoldingClassifier(base_estimator=nn_clf, stratified=True,
                                  parallel_profile=None)  # 'threads-6')
    nn_bagged = BaggingClassifier(base_estimator=nn_folded,
                                  n_estimators=n_base_clf, bootstrap=False, n_jobs=1)
    nn_bagged = CacheClassifier(name='nn_bagged1', clf=nn_bagged)

    nn_single_clf = TheanetsClassifier(layers=[300, 300, 300], hidden_dropout=0.03,
                                       trainers=[{'optimize': 'adagrad', 'patience': 5,
                                                  'learning_rate': 0.2, 'min_improvement': 0.1,
                                                  'momentum': 0.4, 'nesterov': True,
                                                  'loss': 'xe'}])
    nn_single = FoldingClassifier(base_estimator=nn_single_clf, n_folds=3, stratified=True)
    nn_single = CacheClassifier(name='nn_single1', clf=nn_single)

    logit_stacker = SklearnClassifier(LogisticRegression(penalty='l2', solver='sag'))
    logit_stacker = FoldingClassifier(base_estimator=logit_stacker, n_folds=n_folds,
                                      stratified=True, parallel_profile='threads-6')
    logit_stacker = CacheClassifier(name='logit_stacker1', clf=logit_stacker)
    xgb_stacker = XGBoostClassifier(n_estimators=400, eta=0.1, max_depth=4, nthreads=8)
    # HACK
    xgb_stacker = xgb_big_stacker
    xgb_stacker = FoldingClassifier(base_estimator=xgb_stacker, n_folds=n_folds,
                                    random_state=42, stratified=True,
                                    parallel_profile='threads-6')
    xgb_stacker = CacheClassifier(name='xgb_stacker1', clf=xgb_stacker)

    # train1, test1 = data1.get_fold(i)
    # train2, test2 = data1.get_fold(i)
    #
    # t_data, t_targets, t_weights =
    data, targets, weights = data1.make_dataset(data2, weights_ratio=1)

    # xgb_bagged.fit(data, targets, weights)
    # xgb_report = xgb_bagged.test_on(data, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_base classifier")
    # output['xgb_base'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # xgb_proba = xgb_report.prediction['clf'][:, 1]
    # del xgb_bagged, xgb_folded, xgb_clf, xgb_report
    #
    # xgb_single.fit(data, targets, weights)
    # xgb_report = xgb_single.test_on(data, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_single classifier")
    # output['xgb_single'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # xgb_proba = xgb_report.prediction['clf'][:, 1]
    # del xgb_single, xgb_report

    nn_single.fit(data, targets, weights)
    nn_report = nn_single.test_on(data, targets, weights)
    nn_report.roc(physics_notion=True).plot(new_plot=True,
                                            title="ROC AUC nn_single classifier")
    output['nn_single'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    del nn_single, nn_report

    # rdf_bagged.fit(data, targets, weights)
    # rdf_report = rdf_bagged.test_on(data, targets, weights)
    # rdf_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC rdf_base classifier")
    # output['rdf_base'] = "roc auc:" + str(rdf_report.compute_metric(metric=RocAuc()))
    # rdf_proba = rdf_report.prediction['clf'][:, 1]
    # del rdf_bagged, rdf_clf, rdf_folded, rdf_report

    # gb_bagged.fit(data, targets, weights)
    # gb_report = gb_bagged.test_on(data, targets, weights)
    # gb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC gb_base classifier")
    # output['gb_base'] = "roc auc:" + str(gb_report.compute_metric(metric=RocAuc()))
    # gb_proba = gb_report.prediction['clf'][:, 1]
    # del gb_bagged, gb_clf, gb_folded, gb_report

    # nn_bagged.fit(data, targets, weights)
    # nn_report = nn_bagged.test_on(data, targets, weights)
    # nn_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC nn_base classifier")
    # output['nn_base'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    # del nn_bagged, nn_clf, nn_folded, nn_report
    #
    # base_predict = pd.DataFrame({'xgb': xgb_proba,
    #                              # 'rdf': rdf_proba,
    #                              # 'gb': gb_proba,
    #                              'nn': nn_proba
    #                              })
    #
    # xgb_stacker.fit(base_predict, targets, weights)
    # xgb_report = xgb_stacker.test_on(base_predict, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #                                          title="ROC AUC xgb_stacked classifier")
    # output['stacker_xgb'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # del xgb_stacker, xgb_report
    #
    # logit_stacker.fit(base_predict, targets, weights)
    # logit_report = logit_stacker.test_on(base_predict, targets, weights)
    # logit_report.roc(physics_notion=True).plot(new_plot=True,
    #                                            title="ROC AUC logit_stacked classifier")
    # output['stacker_logit'] = "roc auc:" + str(logit_report.compute_metric(metric=RocAuc()))
    # del logit_stacker, logit_report

    print(output)
data = data.drop('g', axis=1)

import numpy
import numexpr
import pandas
from rep import utils
from sklearn.ensemble import GradientBoostingClassifier
from rep.report.metrics import RocAuc
from rep.metaml import GridOptimalSearchCV, FoldingScorer, RandomParameterOptimizer
from rep.estimators import SklearnClassifier, TMVAClassifier, XGBoostRegressor

# define grid parameters
grid_param = {}
grid_param['learning_rate'] = [0.2, 0.1, 0.05, 0.02, 0.01]
grid_param['max_depth'] = [2, 3, 4, 5]

# use random hyperparameter optimization algorithm
generator = RandomParameterOptimizer(grid_param)

# define folding scorer
scorer = FoldingScorer(RocAuc(), folds=3, fold_checks=3)

estimator = SklearnClassifier(GradientBoostingClassifier(n_estimators=30))

# grid_finder = GridOptimalSearchCV(estimator, generator, scorer)
# %time grid_finder.fit(data, labels)
grid_finder = GridOptimalSearchCV(estimator, generator, scorer, parallel_profile="default")
print("start grid search")
grid_finder.fit(data, labels)
grid_finder.params_generator.print_results()
assert 10 == grid_finder.params_generator.n_evaluations, "oops"
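
# A hedged follow-up sketch: after the search finishes, read out the best
# parameters and retrain the winning configuration on the full dataset.
# `fit_best_estimator` is REP's helper for this; if your version lacks it,
# apply the parameters manually via estimator.set_params(**best_params).
best_params = grid_finder.params_generator.best_params_
print("best parameters:", best_params)
best_estimator = grid_finder.fit_best_estimator(data, labels)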