def create_test_data(X, y, directory, n_dump_samples=100, objective="binary:logitraw"):
    if not os.path.exists(directory):
        os.makedirs(directory)

    model = XGBClassifier(n_estimators=100, max_depth=7, objective=objective).fit(X, y)
    model._Booster.dump_model(os.path.join(directory, "model.txt"))
    model._Booster.save_model(os.path.join(directory, "model.bin"))

    # One (name, type) pair per feature; the original iterated over len(y),
    # which counts samples, not features.
    feature_names = [("f" + str(i), "F") for i in range(X.shape[1])]
    xgboost2tmva.convert_model(model._Booster.get_dump(), feature_names,
                               os.path.join(directory, "model.xml"))

    X_dump = X[:n_dump_samples]
    preds_dump = model.predict_proba(X_dump)
    if preds_dump.shape[1] == 2:
        preds_dump = preds_dump[:, 1]  # keep only the positive-class column

    # csv_args is assumed to be a module-level dict of to_csv keyword arguments
    pd.DataFrame(X_dump).to_csv(os.path.join(directory, "X.csv"), **csv_args)
    pd.DataFrame(preds_dump).to_csv(os.path.join(directory, "preds.csv"), **csv_args)
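# A minimal usage sketch (not from the original source): assumes the module's
# imports (os, pandas as pd, XGBClassifier, xgboost2tmva) and a csv_args dict
# defined elsewhere; the dataset below is illustrative.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=5, random_state=42)
create_test_data(X, y, "testdata/binary_logitraw", objective="binary:logitraw")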
""" cls = xgb.XGBClassifier() cls.fit( traindataset[trainVars()].values, traindataset.target.astype(np.bool), #sample_weight= (traindataset[weights].astype(np.float64)), #eval_set=[(traindataset[trainVars()].values, traindataset.target.astype(np.bool),traindataset[weights].astype(np.float64)), #(valdataset[trainVars()].values, valdataset.target.astype(np.bool), valdataset[weights].astype(np.float64))] , #eval_metric='logloss' ) #print (cls.evals_result()) #print (cls.evals_result()['validation_0']['logloss']) #""" model = cls.booster().get_dump( fmap='', with_stats=False) #.get_dump() #pickle.dumps(cls) xgboost2tmva.convert_model(model, trainVars(), "TMVABDT_2lss_1tau_XGB_wMEMallVars.xml") # xmllint --format TMVABDT_2lss_1tau_XGB_wMEMallVars.xml #skTMVA.convert_bdt_sklearn_tmva(cls, trainVars(), "TMVABDT_2lss_1tau_XGB_wMEMallVars.xml") #sklearn_to_tmva.xgbr_to_tmva(cls,evals_result,data[trainVars()],trainVars(),"TMVABDT_2lss_1tau_XGB_wMEMallVars.xml",coef=2) # run cross validation print("XGBoost trained") proba = cls.predict_proba(traindataset[trainVars()].values) fpr, tpr, thresholds = roc_curve(traindataset["target"], proba[:, 1]) train_auc = auc(fpr, tpr, reorder=True) print("XGBoost train set auc - {}".format(train_auc)) proba = cls.predict_proba(valdataset[trainVars()].values) fprt, tprt, thresholds = roc_curve(valdataset["target"], proba[:, 1]) test_auct = auc(fprt, tprt, reorder=True) print("XGBoost test set auc - {}".format(test_auct)) """ sklearn_to_tmva.gbr_to_tmva(
plt.xlim([-1, len(features)])
plt.savefig('vriable_importance_15032019_nTree260_endcap.png')

variable_importance(model, input_vars)

##################################################################################################################################
# convert xgboost to TMVA weights
import tempfile

# text mode so print() can write "index name type" lines to the fmap file
feature_map = tempfile.NamedTemporaryFile(suffix=".txt", mode="w")
for index, varname in enumerate(input_vars):
    print(index, varname, "q", file=feature_map)
feature_map.flush()

import re
tmva_output_fname = re.sub("\\.pkl$", ".xml", model_fname)

model_dump = model.get_booster().get_dump(fmap=feature_map.name)
xgboost2tmva.convert_model(model_dump,
                           input_variables=[(input_var, 'F') for input_var in input_vars],
                           output_xml=tmva_output_fname,
                           pretty=True)
print("Wrote", tmva_output_fname)
##################################################################################################################################
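# A hedged cross-check sketch (not in the original): score the exported XML with
# ROOT's TMVA reader and compare it by eye against the booster's output. Assumes
# a ROOT build with TMVA; X_test is a hypothetical array with one row of values
# per input variable.
import ROOT
from array import array

reader = ROOT.TMVA.Reader("Silent")
holders = [array("f", [0.0]) for _ in input_vars]
for varname, holder in zip(input_vars, holders):
    reader.AddVariable(varname, holder)
reader.BookMVA("BDT", tmva_output_fname)

for holder, value in zip(holders, X_test[0]):
    holder[0] = value
print("TMVA score:", reader.EvaluateMVA("BDT"))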
sys.setrecursionlimit(1000000)

X, y = make_classification(n_samples=10000, n_features=5, random_state=42,
                           n_classes=2, weights=[0.5])
model = XGBClassifier(n_estimators=1000).fit(X, y)
model._Booster.dump_model("model.txt")
model._Booster.save_model("model.bin")

# export to TMVA-style XML file
input_variables = [("f" + str(i), "F") for i in range(5)]
xgboost2tmva.convert_model(model._Booster.get_dump(), input_variables, "model.xml")

# export to hardcoded C
code = m2c.export_to_c(model)
with open("model.c", "w") as c_file:
    c_file.write(code)

X_test = np.random.uniform(-5, 5, size=(100000, 5))

start_time = time.time()
preds = model.predict_proba(X_test)[:, 1]
print(np.mean(preds))
elapsed_secs = time.time() - start_time
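# A hedged sanity check (not in the original): reload the saved binary model and
# confirm it reproduces the in-memory probabilities. With XGBClassifier's default
# binary:logistic objective, Booster.predict returns the class-1 probability.
import xgboost as xgb

booster = xgb.Booster()
booster.load_model("model.bin")
reloaded_preds = booster.predict(xgb.DMatrix(X_test))
assert np.allclose(reloaded_preds, preds, atol=1e-6)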
fpr, tpr, thresholds = roc_curve(traindataset["target"], proba[:, 1])
train_auc = auc(fpr, tpr)
print("XGBoost train set auc - {}".format(train_auc))

proba = cls.predict_proba(valdataset[trainVars(False)].values)
fprt, tprt, thresholds = roc_curve(valdataset["target"], proba[:, 1])
test_auct = auc(fprt, tprt)
print("XGBoost test set auc - {}".format(test_auct))

if options.doXML:
    print("Date: ", time.asctime(time.localtime(time.time())))
    pklpath = channel + "/" + channel + "_XGB_" + trainvar + "_" + bdtType + ".pkl"
    with open(pklpath, 'wb') as fpkl:
        pickle.dump(cls, fpkl)
    # save the model in file 'xgb.model.dump'
    model = cls.booster().get_dump(fmap='', with_stats=False)
    xmlfile = channel + "/" + channel + "_XGB_" + trainvar + "_" + bdtType + ".xml"
    xgboost2tmva.convert_model(model, trainVars(False), xmlfile)
    print(xmlfile + " written")
    print("Date: ", time.asctime(time.localtime(time.time())))
    """
    model2 = cls.booster().get_score(fmap='', importance_type='weight')
    #print json.dump(model2, ensure_ascii=False, sort_keys=True, indent=4, default=lambda x: None)
    with open(pklpath, 'rb') as fpkl, open('%s.json' % pklpath, 'w') as fjson:
        pkldata = pickle.load(fpkl)
        #model.save_model('0001.model')
        json.dump(pkldata, fjson, ensure_ascii=False, sort_keys=True, indent=4,
                  default=lambda x: None)
    """
    #print json.dumps(model, sort_keys=True)
    # format on the command line: xmllint --format TMVABDT_2lss_1tau_XGB_wMEMallVars.xml
##################################################
"""
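##################################################
# A hedged round-trip check (not in the original): reload the pickled classifier
# and confirm it scores the validation set identically.
import numpy as np

with open(pklpath, 'rb') as fpkl:
    cls_reloaded = pickle.load(fpkl)
proba_reloaded = cls_reloaded.predict_proba(valdataset[trainVars(False)].values)
assert np.allclose(proba_reloaded, proba)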
df = df.query(cfg["selection_base"]) df = df.query(cfg["trainings"][idname][training_bin]["cut"]) df.eval("y = ({0}) + 2 * ({1}) - 1".format(cfg["selection_bkg"], cfg["selection_sig"]), inplace=True) print("Running bayesian optimized training...") xgb_bo_trainer = XgbBoTrainer(data=df, X_cols=feature_cols, y_col="y") xgb_bo_trainer.run() print("Saving weight files...") tmvafile = join(out_dir, "weights.xml") xgboost2tmva.convert_model( xgb_bo_trainer.models["bo"]._Booster.get_dump(), input_variables=list(zip(feature_cols, len(feature_cols) * ["F"])), output_xml=tmvafile, ) os.system("xmllint --format {0} > {0}.tmp".format(tmvafile)) os.system("mv {0} {0}.bak".format(tmvafile)) os.system("mv {0}.tmp {0}".format(tmvafile)) os.system("cd " + out_dir + " && gzip -f weights.xml") print("Saving bayesian optimization results...") xgb_bo_trainer.get_results_df().to_csv( join(out_dir, "xgb_bo_results.csv")) print("Saving individual cv results...") if not os.path.exists(join(out_dir, "cv_results")): os.makedirs(join(out_dir, "cv_results")) for i, cvr in enumerate(xgb_bo_trainer.cv_results):