def test_folding_classifier():
    base_ada = SklearnClassifier(AdaBoostClassifier())
    folding_str = FoldingClassifier(base_ada, n_folds=2)
    check_folding(folding_str, True, True, True)

    base_svm = SklearnClassifier(SVC())
    folding_str = FoldingClassifier(base_svm, n_folds=4)
    check_folding(folding_str, True, False, False)
def test_folding_classifier():
    base_ada = SklearnClassifier(AdaBoostClassifier())
    folding_str = FoldingClassifier(base_ada, n_folds=2)
    check_folding(folding_str, True, True, True)

    base_log_reg = SklearnClassifier(LogisticRegression())
    folding_str = FoldingClassifier(base_log_reg, n_folds=4)
    check_folding(folding_str, True, False, False, False)
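# Not part of the original tests: a minimal, self-contained sketch of the pattern exercised
# above, assuming the REP API (rep.metaml.FoldingClassifier, rep.estimators.SklearnClassifier).
# FoldingClassifier splits the training set into n_folds parts, trains one copy of the base
# estimator per fold and, when asked to predict the same training sample, returns
# out-of-fold predictions.
import numpy
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from rep.estimators import SklearnClassifier
from rep.metaml import FoldingClassifier

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
folding = FoldingClassifier(SklearnClassifier(AdaBoostClassifier()), n_folds=2, random_state=11)
folding.fit(X, y)
proba = folding.predict_proba(X)  # out-of-fold probabilities for the training sample
print(proba.shape, numpy.allclose(proba.sum(axis=1), 1))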
def test_complex_stacking_xgboost():
    # Ada over kFold over xgboost
    base_kfold = FoldingClassifier(base_estimator=XGBoostClassifier())
    check_classifier(SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_kfold, n_estimators=3)),
                     has_staged_pp=False, has_importances=False)
def test_complex_stacking_tmva():
    # Ada over kFold over TMVA
    base_kfold = FoldingClassifier(base_estimator=TMVAClassifier(), random_state=13)
    check_classifier(SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_kfold, n_estimators=3)),
                     has_staged_pp=False, has_importances=False)
def test_complex_stacking_mn():
    # Ada over kFold over MatrixNet
    base_kfold = FoldingClassifier(base_estimator=MatrixNetClassifier(iterations=30))
    check_classifier(SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_kfold, n_estimators=3)),
                     has_staged_pp=False, has_importances=False)
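# Note (not from the original tests): the "AdaBoost over FoldingClassifier" stacking in the three
# tests above works because REP classifiers are sklearn-compatible estimators, so sklearn's
# AdaBoostClassifier can clone the FoldingClassifier for each boosting round and pass sample
# weights to its fit(); the outer SklearnClassifier wrapper then exposes the whole stack through
# the common REP interface again. This reading is an assumption based on the REP API, not a
# statement taken from the test suite itself.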
def fit_one(self, data, model_y, model_stereo):
    event_ids = numpy.unique(data.EventID.values)

    if self.train_size is not None:
        event_ids_train, event_ids_test = train_test_split(
            event_ids, train_size=self.train_size, random_state=42)
    else:
        event_ids_test = event_ids

    # fit train tracks
    if self.train_size is not None:
        tracks_train = {}
        p = Pool(self.processes)
        results_train = p.map(tracks_reconstruction,
                              zip(event_ids_train,
                                  [data] * len(event_ids_train),
                                  [model_y] * len(event_ids_train),
                                  [model_stereo] * len(event_ids_train)))
        tracks_train = merge_dicts(results_train)

    # train clf
    if self.train_size is not None:
        sc = SuperCombinator()
        combination_data = sc.data_collection(tracks_train, data)

        X_data = combination_data[combination_data.columns[:-1]].values
        y_data = combination_data.label.values

        xgb_base = XGBoostClassifier(n_estimators=1000, colsample=0.7, eta=0.01, nthreads=1,
                                     subsample=0.7, max_depth=8)
        folding = FoldingClassifier(xgb_base, n_folds=10, random_state=11)
        folding.fit(X_data, y_data)

        clf = folding.estimators[0]
    else:
        clf = None

    # fit test tracks
    tracks_test = {}
    p = Pool(self.processes)
    results_test = p.map(tracks_reconstruction,
                         zip(event_ids_test,
                             [data] * len(event_ids_test),
                             [model_y] * len(event_ids_test),
                             [model_stereo] * len(event_ids_test)))
    tracks_test = merge_dicts(results_test)

    # quality
    p = Pool(self.processes)
    effs = p.map(get_eff_value,
                 zip(event_ids_test,
                     [data] * len(event_ids_test),
                     [tracks_test] * len(event_ids_test),
                     [clf] * len(event_ids_test)))

    eff = 100. * numpy.array(effs).sum() / len(effs)

    return eff
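# Aside (not from the original source): FoldingClassifier keeps the per-fold trained copies of
# its base estimator in the `estimators` attribute, so `folding.estimators[0]` above takes the
# single XGBoost model trained on the first training fold rather than an ensemble of all ten
# folds. This description of the attribute is based on the REP API, not on the original code.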
lds = LabeledDataStorage(X_test, y_test, w_test)

# CLASSIFIER
clf_stacking = SklearnClassifier(RandomForestClassifier(n_estimators=5000, bootstrap=False, n_jobs=7))
# clf_stacking = XGBoostClassifier(n_estimators=700, eta=0.1, nthreads=8,
#                                  subsample=0.5
#                                  )
# clf_stacking = 'nn'

clf = Mayou(base_estimators={'xgb': None}, bagging_base=None, bagging_stack=8,
            stacking=clf_stacking, features_stack=branch_names,
            transform=False, transform_pred=False)
# clf = SklearnClassifier(GaussianNB())
# clf = SklearnClassifier(BaggingClassifier(n_jobs=1, max_features=1.,
#                                           bootstrap=False, base_estimator=clf,
#                                           n_estimators=20, max_samples=0.1))
# clf = XGBoostClassifier(n_estimators=400, eta=0.1, nthreads=6)
# clf = SklearnClassifier(BaggingClassifier(clf, max_samples=0.8))
# clf = SklearnClassifier(NuSVC(cache_size=1000000))
# clf = SklearnClassifier(clf)

if folding:
    X_train = X_test = X
    y_train = y_test = y
    w_train = w_test = w
    clf = FoldingClassifier(clf, n_folds=5)

clf.fit(X_train, y_train, w_train)

# report.features_correlation_matrix().plot(new_plot=True)

plt.show()
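# Aside (not from the original source): the third positional argument to clf.fit() above is the
# per-event sample weight (REP classifiers, FoldingClassifier included, accept
# fit(X, y, sample_weight=...)). Setting X_train = X_test = X together with the FoldingClassifier
# wrapper presumably lets the later report evaluate out-of-fold predictions on the same sample the
# model was trained on; that interpretation is an assumption, not stated in the original script.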
def train_one_vs_one(base_estimators, data_b, data_c, data_light,
                     prefix='bdt', n_folds=2, folding=True,
                     features=None, profile=None):
    data_b_c_lds = LabeledDataStorage(pandas.concat([data_b, data_c]),
                                      [1] * len(data_b) + [0] * len(data_c))
    data_c_light_lds = LabeledDataStorage(pandas.concat([data_c, data_light]),
                                          [1] * len(data_c) + [0] * len(data_light))
    data_b_light_lds = LabeledDataStorage(pandas.concat([data_b, data_light]),
                                          [1] * len(data_b) + [0] * len(data_light))

    if folding:
        tt_folding_b_c = FoldingClassifier(base_estimators[0], n_folds=n_folds, random_state=11,
                                           parallel_profile=profile, features=features)
        tt_folding_c_light = FoldingClassifier(base_estimators[1], n_folds=n_folds, random_state=11,
                                               parallel_profile=profile, features=features)
        tt_folding_b_light = FoldingClassifier(base_estimators[2], n_folds=n_folds, random_state=11,
                                               parallel_profile=profile, features=features)
    else:
        tt_folding_b_c = base_estimators[0]
        tt_folding_b_c.features = features
        tt_folding_c_light = base_estimators[1]
        tt_folding_c_light.features = features
        tt_folding_b_light = base_estimators[2]
        tt_folding_b_light.features = features

    tt_folding_b_c.fit_lds(data_b_c_lds)
    tt_folding_c_light.fit_lds(data_c_light_lds)
    tt_folding_b_light.fit_lds(data_b_light_lds)

    probs_b_c = numpy.concatenate([
        tt_folding_b_c.predict_proba(pandas.concat([data_b, data_c])),
        tt_folding_b_c.predict_proba(data_light)
    ])[:, 1]
    probs_c_light = numpy.concatenate([
        tt_folding_c_light.predict_proba(data_b),
        tt_folding_c_light.predict_proba(pandas.concat([data_c, data_light]))
    ])[:, 1]
    probs_b_light = tt_folding_b_light.predict_proba(pandas.concat([data_b, data_light]))[:, 1]
    probs_b_light = numpy.concatenate([
        probs_b_light[:len(data_b)],
        tt_folding_b_light.predict_proba(data_c)[:, 1],
        probs_b_light[len(data_b):]
    ])

    additional_columns = pandas.DataFrame({
        prefix + '_b_c': probs_b_c,
        prefix + '_b_light': probs_b_light,
        prefix + '_c_light': probs_c_light
    })
    return additional_columns
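# Not from the original source: a hedged usage sketch for train_one_vs_one above. The toy
# DataFrames stand in for the b / c / light samples, and the feature names and sizes are made
# up for illustration; the three base estimators follow the SklearnClassifier pattern used
# elsewhere in this collection.
import numpy
import pandas
from sklearn.ensemble import AdaBoostClassifier
from rep.estimators import SklearnClassifier

rng = numpy.random.RandomState(0)
features = ['feature_1', 'feature_2', 'feature_3']
data_b = pandas.DataFrame(rng.normal(0.5, 1, size=(100, 3)), columns=features)
data_c = pandas.DataFrame(rng.normal(0.0, 1, size=(100, 3)), columns=features)
data_light = pandas.DataFrame(rng.normal(-0.5, 1, size=(100, 3)), columns=features)
data_c.index = range(100, 200)      # keep indices distinct before concatenation
data_light.index = range(200, 300)

base_estimators = [SklearnClassifier(AdaBoostClassifier(n_estimators=20)) for _ in range(3)]
bdt_columns = train_one_vs_one(base_estimators, data_b, data_c, data_light,
                               prefix='bdt', n_folds=2, features=features)
print(bdt_columns.columns.tolist())  # ['bdt_b_c', 'bdt_b_light', 'bdt_c_light']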
def test_complex_stacking_tmva():
    # Ada over kFold over TMVA
    base_kfold = FoldingClassifier(base_estimator=TMVAClassifier(factory_options="Silent=True:V=False:DrawProgressBar=False",
                                                                 method='kBDT', NTrees=10),
                                   random_state=13)
    check_classifier(SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_kfold, n_estimators=3)),
                     has_staged_pp=False, has_importances=False)