def test_xgboost_random_states():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    for random_state in [145, None, check_random_state(None), check_random_state(145)]:
        clf1 = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=random_state)
        clf1.fit(X, y)
        clf2 = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=random_state)
        clf2.fit(X, y)
        if isinstance(random_state, numpy.random.RandomState):
            assert not numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'seed: {}'.format(random_state)
        else:
            assert numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'seed: {}'.format(random_state)
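# The tests in this section rely on helpers that are not defined in the
# snippets themselves. A plausible import block (an assumption, based on REP's
# public test utilities in rep.test.test_estimators and on sklearn) would be:
import numpy
from sklearn.ensemble import AdaBoostClassifier
from sklearn.utils import check_random_state
from rep.estimators import SklearnClassifier, XGBoostClassifier, XGBoostRegressor
from rep.metaml import FoldingClassifier
from rep.test.test_estimators import (check_classifier, check_regression,
                                      generate_classification_data)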
def test_xgboost_feature_importance():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    clf = XGBoostClassifier(n_estimators=1, max_depth=1)
    clf.fit(X, y)
    importances = clf.get_feature_importances()
    original_features = set(X.columns)
    importances_features = set(importances.index)
    print(original_features, importances_features)
    assert original_features == importances_features, 'feature_importances_ returned something wrong'
    assert len(original_features) == len(clf.feature_importances_)
def test_feature_importances():
    clf = XGBoostClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y, sample_weight=sample_weight)
    # checking feature importance (three ways)
    res_default = clf.xgboost_classifier.get_fscore()
    res2 = clf._get_fscore()
    res3 = clf.feature_importances_
    assert res_default == res2, res_default
    for i, val in enumerate(res3):
        if val > 0.0:
            assert val == res_default['f' + str(i)]
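# xgboost's get_fscore() keys features as 'f0', 'f1', ... in column order when
# no explicit feature names are given; the loop above relies on exactly that
# convention. A minimal sketch (fscore_to_array is a hypothetical helper, not
# part of REP) of mapping such a dict onto an array aligned with the columns:
def fscore_to_array(fscore, n_features):
    """Turn {'f0': score, ...} into a dense array of length n_features."""
    scores = numpy.zeros(n_features)
    for key, value in fscore.items():
        scores[int(key[1:])] = value  # strip the leading 'f' to recover the index
    return scores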
def test_complex_stacking_xgboost():
    # Ada over kFold over xgboost
    base_kfold = FoldingClassifier(base_estimator=XGBoostClassifier())
    check_classifier(SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_kfold, n_estimators=3)),
                     has_staged_pp=False, has_importances=False)
def test_basic_xgboost():
    X, y, w = generate_classification_data(n_classes=2)
    clf = XGBoostClassifier(n_estimators=10).fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
    # testing that returned features in importances are correct and in the same order
    assert numpy.all(clf.features == clf.get_feature_importances().index)
def test_xgboost_works_with_different_dtypes():
    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']
    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        clf.fit(X.astype(dtype=dtype), y.astype(dtype=dtype), sample_weight=weights.astype(dtype))
        probabilities = clf.predict_proba(X.astype(dtype))

    # testing single pandas.DataFrame with different dtypes
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    import pandas
    X = pandas.DataFrame()
    for dtype in dtypes:
        X[dtype] = numpy.random.normal(0, 10, size=len(y)).astype(dtype)
    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(X, y, sample_weight=weights)
    probabilities = clf.predict_proba(X)
def very_basic_xgboost_test():
    X, y, w = generate_classification_data(n_classes=2)
    clf = XGBoostClassifier(n_estimators=10).fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
    # testing that returned features in importances are correct and in the same order
    assert numpy.all(clf.features == clf.get_feature_importances().index)
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter
    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters
    X.loc[splitters == 1, :] += 4  # .ix was removed from pandas; .loc selects the same rows
    X.loc[splitters == 2, :] -= 4
    fs = FeatureSplitter(base_estimator=XGBoostClassifier(features=list(X.columns[1:]),
                                                          n_estimators=10, max_depth=3),
                         split_feature=split_column)
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9
def _make_clf(self, clf, bagging=None):
    """Creates a classifier from a dict or returns the clf"""
    if isinstance(clf, dict):
        key, val = clf.popitem()
        try:
            # plain indexing (instead of .get) so that an unknown key
            # actually raises the KeyError handled below
            val = self.__DEFAULT_CLF_CFG[key] if val is None else val
        except KeyError:
            logger.error(str(key) + " not an implemented classifier.")
            raise
        temp_bagging = val.pop('bagging', bagging)
        bagging = temp_bagging if bagging is None else bagging

        if key == 'rdf':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = SklearnClassifier(RandomForestClassifier(**config_clf))
        elif key == 'erf':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = SklearnClassifier(ExtraTreesClassifier(**config_clf))
        elif key == 'nn':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = TheanetsClassifier(**config_clf)
        elif key == 'ada':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = SklearnClassifier(AdaBoostClassifier(**config_clf))
        elif key == 'gb':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = SklearnClassifier(GradientBoostingClassifier(**config_clf))
        elif key == 'xgb':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = XGBoostClassifier(**config_clf)
    elif hasattr(clf, 'fit'):
        bagging = False  # return the classifier as-is
        key = 'clf'  # generic label, since no config key was given
    else:
        raise ValueError(str(clf) + " not valid as a classifier.")

    # bagging over the instantiated estimators
    if isinstance(bagging, int) and bagging >= 1:
        bagging = dict(self.__DEFAULT_BAG_CFG, n_estimators=bagging)
    if isinstance(bagging, dict):
        # TODO: implement multi-thread:
        bagging.update({'base_estimator': clf})
        clf = SklearnClassifier(BaggingClassifier(**bagging))

    clf = {key: clf}
    return clf
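# A sketch of how _make_clf might be invoked (assuming the surrounding class
# defines __DEFAULT_CLF_CFG with an 'xgb' entry; the parameter values below
# are illustrative, not taken from the source):
#
#   clf_dict = self._make_clf({'xgb': dict(n_estimators=150)}, bagging=10)
#   clf = clf_dict['xgb']  # a SklearnClassifier wrapping a BaggingClassifier
#   clf.fit(X, y)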
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter
    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters
    X.loc[splitters == 1, :] += 4  # .ix was removed from pandas; .loc selects the same rows
    X.loc[splitters == 2, :] -= 4

    fs = FeatureSplitter(base_estimator=XGBoostClassifier(n_estimators=10, max_depth=3),
                         split_feature=split_column,
                         train_features=list(X.columns[1:]))
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9

    p_final = fs.predict_proba(X)
    for p in fs.staged_predict_proba(X):
        pass
    assert numpy.allclose(p_final, p), 'end of iterations differs from expected'
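# FeatureSplitter trains one clone of the base estimator per distinct value of
# split_feature and routes rows accordingly. A minimal hand-rolled sketch of
# the same idea (illustration only; REP's implementation additionally handles
# sample weights, staged predictions and train_features):
from sklearn.base import clone

def fit_per_split(base_estimator, X, y, split_column):
    """Fit an independent clone of base_estimator for every split value."""
    models = {}
    for value in numpy.unique(X[split_column]):
        mask = (X[split_column] == value).values
        models[value] = clone(base_estimator).fit(X.loc[mask], y[mask])
    return models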
def main(job_id, params):
    print("Anything printed here will end up in the output directory for job", job_id)
    print(params)
    if job_id > 50:
        # touch a flag file to signal that the optimisation is done
        open("optimisation_done_flag", "a").close()
    comp_file_list = [(
        "/Users/weisser/MIT_Dropbox/MIT/Research/learningml/learningml/GoF/data/accept_reject/sin1diff_data/data_sin1diff_5_and_5_periods4D_sample_optimisation_0.txt",
        "/Users/weisser/MIT_Dropbox/MIT/Research/learningml/learningml/GoF/data/accept_reject/sin1diff_data/data_sin1diff_5_and_6_periods4D_sample_optimisation_0.txt"
    )]
    clf = XGBoostClassifier(base_score=0.5, colsample=1.0, eta=params['eta'],
                            features=None, gamma=None, max_depth=6,
                            min_child_weight=1.0, missing=-999.0,
                            n_estimators=params['n_estimators'], nthreads=16,
                            num_feature=None, random_state=0,
                            scale_pos_weight=1.0, subsample=1.0, verbose=0)
    result = classifier_eval.classifier_eval(
        name="xgb_4Dsin_5_6_CPV_syst_0_01_",
        title="xgb Sin 5 6 periods syst0.01",
        comp_file_list=comp_file_list,
        clf=clf,
        mode="spearmint_optimisation",
        scoring="chi2",
        no_bins=5,
        systematics_fraction=0.01)
    with open("xgb_4Dsin_5_6_CPV_syst_0_01__chi2scoring_5_optimisation_values.txt", "a") as myfile:
        myfile.write(str(params["n_estimators"][0]) + "\t" + str(params["eta"][0]) +
                     "\t" + str(result) + "\n")
    return result
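# Spearmint passes `params` as a dict of one-element arrays, which is why the
# logging above indexes params["n_estimators"][0]. A hypothetical direct call
# for testing (values illustrative only):
#
#   result = main(job_id=1, params={'n_estimators': [500], 'eta': [0.1]})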
def name_to_nclf(name):
    """Return a standard nclf configuration for a common machine learning classifier."""
    if name == "dt":
        anclf = nclf('dt', tree.DecisionTreeClassifier(),
                     ['max_depth', 'min_samples_split'], [[1, 60], [2, 100]])
    elif name == "bdt":
        anclf = nclf('bdt',
                     AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=2)),
                     ['learning_rate', 'n_estimators'], [[0.01, 2.0], [100, 1000]])
    elif name == "xgb":
        anclf = nclf('xgb', XGBoostClassifier(), ['n_estimators', 'eta'],
                     [[10, 1000], [0.01, 1.0]])
    elif name == "svm":
        anclf = nclf('svm', SVC(probability=True, cache_size=7000),
                     ['C', 'gamma'], [[1.0, 1000.0], [1E-6, 0.1]])
    elif name == "nn":
        anclf = nclf('nn', "no classifier needed for nn",
                     ['n_hidden_layers', 'dimof_middle'], [[0, 1], [100, 500]])
    else:
        raise ValueError("unknown classifier name: " + name)
    return anclf
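# Example usage: build the standard XGBoost configuration and read back its
# hyperparameter search ranges (the attributes of nclf are assumptions based
# on the constructor arguments above):
#
#   anclf = name_to_nclf("xgb")  # XGBoostClassifier(), searching
#                                # 'n_estimators' in [10, 1000] and 'eta' in [0.01, 1.0]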
def fit_one(self, data, model_y, model_stereo):
    event_ids = numpy.unique(data.EventID.values)

    if self.train_size is not None:
        event_ids_train, event_ids_test = train_test_split(
            event_ids, train_size=self.train_size, random_state=42)
    else:
        event_ids_test = event_ids

    # fit train tracks
    if self.train_size is not None:
        tracks_train = {}
        p = Pool(self.processes)
        results_train = p.map(tracks_reconstruction,
                              zip(event_ids_train,
                                  [data] * len(event_ids_train),
                                  [model_y] * len(event_ids_train),
                                  [model_stereo] * len(event_ids_train)))
        tracks_train = merge_dicts(results_train)

    # train clf
    if self.train_size is not None:
        sc = SuperCombinator()
        combination_data = sc.data_collection(tracks_train, data)
        X_data = combination_data[combination_data.columns[:-1]].values
        y_data = combination_data.label.values

        xgb_base = XGBoostClassifier(n_estimators=1000, colsample=0.7, eta=0.01,
                                     nthreads=1, subsample=0.7, max_depth=8)
        folding = FoldingClassifier(xgb_base, n_folds=10, random_state=11)
        folding.fit(X_data, y_data)
        clf = folding.estimators[0]
    else:
        clf = None

    # fit test tracks
    p = Pool(self.processes)
    results_test = p.map(tracks_reconstruction,
                         zip(event_ids_test,
                             [data] * len(event_ids_test),
                             [model_y] * len(event_ids_test),
                             [model_stereo] * len(event_ids_test)))
    tracks_test = merge_dicts(results_test)

    # quality
    p = Pool(self.processes)
    effs = p.map(get_eff_value,
                 zip(event_ids_test,
                     [data] * len(event_ids_test),
                     [tracks_test] * len(event_ids_test),
                     [clf] * len(event_ids_test)))
    eff = 100. * numpy.array(effs).sum() / len(effs)

    return eff
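# merge_dicts is called above but not defined here. A minimal sketch consistent
# with its usage (merging the per-event dicts returned by the worker processes
# into one dict; this is an assumption about its behaviour, not the project's
# actual implementation):
def merge_dicts(dicts):
    """Merge an iterable of dicts into a single dict (later keys win)."""
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged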
if typedata=="Data": arr = dataset20.to_records() array2root(arr, outputCentral+"_AppliedTo20pOfPlain"+typedata+".root" , 'tree', 'recreate') # for ii in range(0,3): if ii==0 : train= trainFeaturesplot Var='All' if ii==1 : train= trainFeaturesObvious Var='Mass' if ii==2 : train= trainFeaturesHH Var='HH' xgb = XGBoostClassifier(train) #, original = xgboriginal.XGBClassifier(train) """ n_estimators = 200, eta = 0.1, max_depth = 7, subsample = 0.9, colsample = 0.6) """ xgb.fit(traindatasetmix[train].astype(np.float64), traindatasetmix.target.astype(np.bool), sample_weight= (traindatasetmix[weights].astype(np.float64))) prob = xgb.predict_proba(valdatasetmix[train].astype(np.float64) ) if ii==0 : reportAll = xgb.test_on(traindatasetmix[trainFeaturesplot].astype(np.float64), traindatasetmix.target.astype(np.bool)) if ii==1 : reportObvious = xgb.test_on(traindatasetmix[trainFeaturesObvious].astype(np.float64), traindatasetmix.target.astype(np.bool)) if ii==2 : reportHH = xgb.test_on(traindatasetmix[trainFeaturesHH].astype(np.float64), traindatasetmix.target.astype(np.bool)) # compatible with lustr/lxplus #features = ['costhst_DiJets[0]_HH', 'costhst_Jets[0]_DiJets[0]', 'costhst_Jets[2]_DiJets[1]', 'CSV3', 'CSV4', 'Jets[0].eta()', 'Jets[1].eta()', 'Jets[2].eta()', 'Jets[3].eta()', 'HT_other_jets']
def test_xgboost():
    check_classifier(XGBoostClassifier(), n_classes=2)
    check_classifier(XGBoostClassifier(), n_classes=4)
    check_regression(XGBoostRegressor())
# nclf_list = [nclf('xgb', XGBoostClassifier(), ['n_estimators', 'eta'], [[10, 1000], [0.01, 1.0]], param_opt=[1000., 0.9738])]
# nclf_list = [nclf('nn', "no classifier needed for nn", ['n_hidden_layers', 'dimof_middle'], [[0, 1], [100, 500]], param_opt=[1, 210])]
# nclf_list = [name_to_nclf("nn")]
# nclf_list = [name_to_nclf("bdt"), name_to_nclf("xgb"), name_to_nclf("svm"), name_to_nclf("nn")]
# nclf_list = [name_to_nclf("bdt"), name_to_nclf("xgb"), name_to_nclf("nn")]
# nclf_list = [name_to_nclf("svm")]
nclf_list = [
    nclf('bdt',
         AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=2)),
         ['learning_rate', 'n_estimators'], [[0.01, 2.0], [1, 1000]],
         param_opt=[0.01, 992]),
    nclf('xgb', XGBoostClassifier(), ['n_estimators', 'eta'],
         [[10, 1000], [0.01, 1.0]], param_opt=[423, 0.0104]),
    nclf('nn', "no classifier needed for nn",
         ['n_hidden_layers', 'dimof_middle'], [[0, 1], [100, 500]],
         param_opt=[1, 210]),
    nclf('svm', SVC(probability=True, cache_size=7000), ['C', 'gamma'],
         [[1.0, 1000.0], [1E-6, 0.1]], param_opt=[583.3, 0.0012])
]
# nclf_list = [nclf('bdt', AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=2)), ['learning_rate', 'n_estimators'], [[0.01, 2.0], [1, 1000]], param_opt=[0.432, 18]), nclf('xgb', XGBoostClassifier(), ['n_estimators', 'eta'], [[10, 1000], [0.01, 1.0]], param_opt=[619, 0.1489]), nclf('nn', "no classifier needed for nn", ['n_hidden_layers', 'dimof_middle'], [[0, 1], [100, 500]], param_opt=[0, 174]), nclf('svm', SVC(probability=True, cache_size=7000), ['C', 'gamma'], [[1.0, 1000.0], [1E-6, 0.1]], param_opt=[5.269, 0.00453])]

systematics_fraction = 0.01
def test_simple_stacking_xgboost():
    base_xgboost = XGBoostClassifier()
    classifier = SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_xgboost, n_estimators=3))
    check_classifier(classifier, has_staged_pp=False)
def clf_mayou(data1, data2, n_folds=3, n_base_clf=5):
    """DEVELOPMENT, WIP. Test a setup of clf involving bagging and stacking."""
    # import raredecay.analysis.ml_analysis as ml_ana
    # import pandas as pd
    import copy

    from rep.estimators import SklearnClassifier, XGBoostClassifier
    from rep.metaml.folding import FoldingClassifier
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.ensemble import BaggingClassifier  # , VotingClassifier, AdaBoostClassifier
    from rep.estimators.theanets import TheanetsClassifier
    from sklearn.linear_model import LogisticRegression
    from rep.metaml.cache import CacheClassifier
    from rep.report.metrics import RocAuc
    import rep.metaml.cache
    from rep.metaml._cache import CacheHelper

    rep.metaml.cache.cache_helper = CacheHelper('/home/mayou/cache', 100000)

    # data1.make_folds(n_folds)
    # data2.make_folds(n_folds)
    output = {}

    # for i in range(n_folds):
    xgb_clf = XGBoostClassifier(n_estimators=350, eta=0.1, max_depth=4, nthreads=3)
    xgb_folded = FoldingClassifier(base_estimator=xgb_clf, stratified=True,
                                   parallel_profile='threads-2')
    xgb_bagged = BaggingClassifier(base_estimator=xgb_folded,
                                   n_estimators=n_base_clf, bootstrap=False)
    xgb_bagged = SklearnClassifier(xgb_bagged)
    xgb_big_stacker = copy.deepcopy(xgb_bagged)
    xgb_bagged = CacheClassifier(name='xgb_bagged1', clf=xgb_bagged)

    xgb_single = XGBoostClassifier(n_estimators=350, eta=0.1, max_depth=4, nthreads=3)
    xgb_single = FoldingClassifier(base_estimator=xgb_single, stratified=True,
                                   n_folds=10, parallel_profile='threads-2')
    xgb_single = CacheClassifier(name='xgb_singled1', clf=xgb_single)

    rdf_clf = SklearnClassifier(RandomForestClassifier(n_estimators=300, n_jobs=3))
    rdf_folded = FoldingClassifier(base_estimator=rdf_clf, stratified=True,
                                   parallel_profile='threads-2')
    rdf_bagged = BaggingClassifier(base_estimator=rdf_folded,
                                   n_estimators=n_base_clf, bootstrap=False)
    rdf_bagged = SklearnClassifier(rdf_bagged)
    rdf_bagged = CacheClassifier(name='rdf_bagged1', clf=rdf_bagged)

    gb_clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=50))
    gb_folded = FoldingClassifier(base_estimator=gb_clf, stratified=True,
                                  parallel_profile='threads-6')
    gb_bagged = BaggingClassifier(base_estimator=gb_folded,
                                  n_estimators=n_base_clf, bootstrap=False, n_jobs=5)
    gb_bagged = SklearnClassifier(gb_bagged)
    gb_bagged = CacheClassifier(name='gb_bagged1', clf=gb_bagged)

    nn_clf = TheanetsClassifier(layers=[300, 300], hidden_dropout=0.03,
                                trainers=[{'optimize': 'adagrad', 'patience': 5,
                                           'learning_rate': 0.2, 'min_improvement': 0.1,
                                           'momentum': 0.4, 'nesterov': True, 'loss': 'xe'}])
    nn_folded = FoldingClassifier(base_estimator=nn_clf, stratified=True,
                                  parallel_profile=None)  # 'threads-6')
    nn_bagged = BaggingClassifier(base_estimator=nn_folded,
                                  n_estimators=n_base_clf, bootstrap=False, n_jobs=1)
    nn_bagged = CacheClassifier(name='nn_bagged1', clf=nn_bagged)

    nn_single_clf = TheanetsClassifier(layers=[300, 300, 300], hidden_dropout=0.03,
                                       trainers=[{'optimize': 'adagrad', 'patience': 5,
                                                  'learning_rate': 0.2, 'min_improvement': 0.1,
                                                  'momentum': 0.4, 'nesterov': True, 'loss': 'xe'}])
    nn_single = FoldingClassifier(base_estimator=nn_single_clf, n_folds=3, stratified=True)
    nn_single = CacheClassifier(name='nn_single1', clf=nn_single)

    logit_stacker = SklearnClassifier(LogisticRegression(penalty='l2', solver='sag'))
    logit_stacker = FoldingClassifier(base_estimator=logit_stacker, n_folds=n_folds,
                                      stratified=True, parallel_profile='threads-6')
    logit_stacker = CacheClassifier(name='logit_stacker1', clf=logit_stacker)
    xgb_stacker = XGBoostClassifier(n_estimators=400, eta=0.1, max_depth=4, nthreads=8)
    # HACK
    xgb_stacker = xgb_big_stacker
    xgb_stacker = FoldingClassifier(base_estimator=xgb_stacker, n_folds=n_folds,
                                    random_state=42, stratified=True,
                                    parallel_profile='threads-6')
    xgb_stacker = CacheClassifier(name='xgb_stacker1', clf=xgb_stacker)

    # train1, test1 = data1.get_fold(i)
    # train2, test2 = data1.get_fold(i)
    #
    # t_data, t_targets, t_weights =
    data, targets, weights = data1.make_dataset(data2, weights_ratio=1)

    # xgb_bagged.fit(data, targets, weights)
    # xgb_report = xgb_bagged.test_on(data, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_base classifier")
    # output['xgb_base'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # xgb_proba = xgb_report.prediction['clf'][:, 1]
    # del xgb_bagged, xgb_folded, xgb_clf, xgb_report
    #
    # xgb_single.fit(data, targets, weights)
    # xgb_report = xgb_single.test_on(data, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_single classifier")
    # output['xgb_single'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # xgb_proba = xgb_report.prediction['clf'][:, 1]
    # del xgb_single, xgb_report

    nn_single.fit(data, targets, weights)
    nn_report = nn_single.test_on(data, targets, weights)
    nn_report.roc(physics_notion=True).plot(new_plot=True,
                                            title="ROC AUC nn_single classifier")
    output['nn_single'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    del nn_single, nn_report

    # rdf_bagged.fit(data, targets, weights)
    # rdf_report = rdf_bagged.test_on(data, targets, weights)
    # rdf_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC rdf_base classifier")
    # output['rdf_base'] = "roc auc:" + str(rdf_report.compute_metric(metric=RocAuc()))
    # rdf_proba = rdf_report.prediction['clf'][:, 1]
    # del rdf_bagged, rdf_clf, rdf_folded, rdf_report

    # gb_bagged.fit(data, targets, weights)
    # gb_report = gb_bagged.test_on(data, targets, weights)
    # gb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC gb_base classifier")
    # output['gb_base'] = "roc auc:" + str(gb_report.compute_metric(metric=RocAuc()))
    # gb_proba = gb_report.prediction['clf'][:, 1]
    # del gb_bagged, gb_clf, gb_folded, gb_report

    # nn_bagged.fit(data, targets, weights)
    # nn_report = nn_bagged.test_on(data, targets, weights)
    # nn_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC nn_base classifier")
    # output['nn_base'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    # del nn_bagged, nn_clf, nn_folded, nn_report
    #
    # base_predict = pd.DataFrame({'xgb': xgb_proba,
    #                              # 'rdf': rdf_proba,
    #                              # 'gb': gb_proba,
    #                              'nn': nn_proba
    #                              })
    #
    # xgb_stacker.fit(base_predict, targets, weights)
    # xgb_report = xgb_stacker.test_on(base_predict, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #                                          title="ROC AUC xgb_stacked classifier")
    # output['stacker_xgb'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # del xgb_stacker, xgb_report
    #
    # logit_stacker.fit(base_predict, targets, weights)
    # logit_report = logit_stacker.test_on(base_predict, targets, weights)
    # logit_report.roc(physics_notion=True).plot(new_plot=True,
    #                                            title="ROC AUC logit_stacked classifier")
    # output['stacker_logit'] = "roc auc:" + str(logit_report.compute_metric(metric=RocAuc()))
    # del logit_stacker, logit_report

    print(output)
def test_xgboost():
    check_classifier(XGBoostClassifier(n_estimators=20), n_classes=2)
    check_classifier(XGBoostClassifier(n_estimators=20), n_classes=4)
    check_regression(XGBoostRegressor(n_estimators=20))