#data["mva1"] = 0.0
#data["mva2"] = 0.0

# Gradient-boosted classifier: 100 trees of depth 3, learning rate 0.01,
# needing at least 10 samples both to split a node and in every leaf.
cls = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3,
    min_samples_split=10,
    min_samples_leaf=10,
    verbose=True,
)

# Hold out 20% of the rows. Features, targets and weights are split together
# with a fixed random_state.
d_train, d_test, t_train, t_test, w_train, w_test = train_test_split(
    data_train,
    targets,
    data_weights,
    random_state=7,
    test_size=0.2,
)

# Fit on the training part only; w_train is handed to fit() as its third
# positional argument (the sample weights).
cls.fit(d_train, t_train, w_train)

# Export the fitted model through sklearn_to_tmva under the given .xml name.
sklearn_to_tmva.gbr_to_tmva(
    cls,
    data[trainVars()],
    "TMVABDT_3l_1tau_maxDepth3_8Var_frWt.xml",
    coef=2,
)

import ROOT, array
from ROOT import TFile, TH1F, TGraph, TCanvas, TLegend

# Output ROOT file, opened with the "RECREATE" mode string.
fout = TFile("3l_1tau_performance_maxDepth3_8Var_frWt.root", "RECREATE")
# Canvas styling: fill colour 10, border size 2, explicit left/bottom/right
# margins, then SetLogy() for the y axis.
c1 = TCanvas()
c1.SetFillColor(10)
c1.SetBorderSize(2)
c1.SetLeftMargin(0.12)
c1.SetBottomMargin(0.12)
c1.SetRightMargin(0.05)
c1.SetLogy()

# Untitled histogram named "histogram_base": 100 bins spanning 0 to 1.
histogram_base = TH1F("histogram_base", "", 100, 0., 1.)
# --- Example no. 2 ---
0
                                 learning_rate=0.01,
                                 n_estimators=100,
                                 verbose=True,
                                 min_samples_leaf=10,
                                 min_samples_split=10)
#d_train, d_test, t_train, t_test, w_train, w_test = train_test_split(data_train, targets, data_weights, test_size=0.2, random_state=7)
# Unweighted variant: only features and targets are split (80/20, fixed
# random_state) and no weights are passed to fit().
d_train, d_test, t_train, t_test = train_test_split(
    data_train,
    targets,
    random_state=12345,
    test_size=0.2,
)
cls.fit(d_train, t_train)

# Export the fitted model through sklearn_to_tmva under the given .xml name.
sklearn_to_tmva.gbr_to_tmva(
    cls,
    data[trainVars()],
    "TMVABDT_hadTopTagger_maxDepth3_9Var_ps75.xml",
    coef=2,
)

import ROOT, array
from ROOT import TFile, TH1F, TGraph, TCanvas, TLegend

# Output ROOT file, opened with the "RECREATE" mode string.
fout = TFile("hadTopTagger_performance_maxDepth3_9Var_ps75.root", "RECREATE")
# Canvas styling: fill colour 10, border size 2, explicit left/bottom/right
# margins, then SetLogy() for the y axis.
c1 = TCanvas()
c1.SetFillColor(10)
c1.SetBorderSize(2)
c1.SetLeftMargin(0.12)
c1.SetBottomMargin(0.12)
c1.SetRightMargin(0.05)
c1.SetLogy()

# Untitled histogram named "histogram_base": 100 bins spanning 0 to 1.
histogram_base = TH1F("histogram_base", "", 100, 0., 1.)
# Convert `model` with xgboost2tmva, using the training-variable list, and
# write it to the XML file named below.
xgboost2tmva.convert_model(
    model,
    trainVars(),
    "TMVABDT_2lss_1tau_XGB_wMEMallVars.xml",
)
# To pretty-print the result:
#   xmllint --format TMVABDT_2lss_1tau_XGB_wMEMallVars.xml
print("XGBoost trained")

# ROC AUC of `cls` on the training sample, using the second predict_proba
# column as the score.
proba = cls.predict_proba(traindataset[trainVars()].values)
fpr, tpr, thresholds = roc_curve(traindataset["target"], proba[:, 1])
# roc_curve() returns fpr/tpr already in non-decreasing order, so no
# reordering is needed. The `reorder` keyword no longer exists in
# sklearn.metrics.auc and passing it raises TypeError on current releases.
train_auc = auc(fpr, tpr)
print("XGBoost train set auc - {}".format(train_auc))

# Same measurement on the validation sample.
proba = cls.predict_proba(valdataset[trainVars()].values)
fprt, tprt, thresholds = roc_curve(valdataset["target"], proba[:, 1])
test_auct = auc(fprt, tprt)
print("XGBoost test set auc - {}".format(test_auct))

# Disabled sklearn_to_tmva export: the call below sits inside a bare
# triple-quoted string (closed by the `#"""` line), so it is never executed.
""" 
sklearn_to_tmva.gbr_to_tmva(
    cls,
    data[trainVars()],
	trainVars(),
    "TMVABDT_2lss_1tau_XGB_wMEMallVars.xml",
    coef=2
)
#"""
##################################################
clc = catboost.CatBoostClassifier()
clc.fit(
    traindataset[trainVars()].values,
    traindataset.target.astype(np.bool),
    #sample_weight= np.absolute((traindataset[weights].astype(np.float64))),
    #eval_set=[(traindataset[trainVars()].values,  traindataset.target.astype(np.bool),traindataset[weights].astype(np.float64)),
# joblib.dump(clf, training_file,protocol = HIGHEST_PROTOCOL)


#######################################
## Converting to TMVA readable xml file
#######################################

import sklearn_to_tmva as convert

# Name of the TMVA-style (.xml) weights file written from the trained `clf`.
# (The pickle alternative would be a joblib.dump of `clf` instead.)
trainingWeights_TMVA = "TMVAClassification_BDTG.weights.xml"
log.info("Dumping training file in: " + trainingWeights_TMVA)

# Text after the last dot of the file name ("xml" for the name above).
out_ext = trainingWeights_TMVA.rsplit(".", 1)[-1]

convert.gbr_to_tmva(
    clf,
    X,
    trainingWeights_TMVA,
    mva_name="BDTG",
    coef=10,
    var_names=variables,
)

#################################
# 				#
# 	Validation		#
# 				#
#################################

# input_files = [i.strip() for i in open('data_trees/inputs/ttjets.list')]
# Read one input path per line. The file is opened in a `with` block so the
# handle is closed as soon as the list is built; the bare open() used before
# was never closed.
# Lines are only stripped, not filtered: a blank line in the .list would
# become an empty-string entry, so the list file must not contain any.
with open("../data_trees/inputs/ttjets.list") as list_file:
    input_files = [line.strip() for line in list_file]

# Bin edges used by the validation.
pt_bins = [15, 40, 60, 90, 150, 400, 600]
eta_bins = [1.2, 2.1]
# flavors = ['C', 'B', 'DUSG']
# sv_categories = ["NoVertex", "PseudoVertex", "RecoVertex"]
# --- Example no. 5 ---
0
# Time the fit with time.time() stamps taken right before and after it;
# `weights` is passed as the third positional argument of fit().
start = time.time()
clf.fit(X, y,weights)
end = time.time()
# (end-start) is in seconds, so dividing by 60 reports minutes.
log.info('training completed --> Elapsed time: %.1f minutes' % ((end-start)/60))

## num_nodes = [nnodes(i.tree_) for i in clf.estimators_]
## 
## tot_nodes = sum(num_nodes)
## mean = float(tot_nodes)/len(num_nodes)
## print tot_nodes, mean

# Persist the trained classifier only when an output path was given.
if args.out:
    log.info('Dumping training file in: ' + args.out)
    # Text after the last dot of the output path selects the format.
    out_ext = args.out.rsplit('.', 1)[-1]
    if out_ext != 'xml':
        # Any other extension: compressed joblib dump of the classifier.
        joblib.dump(clf, args.out, compress=True)
    else:
        # "xml": export through the gbr_to_tmva converter.
        convert.gbr_to_tmva(
            clf,
            X,
            args.out,
            mva_name="BDTG",
            coef=10,
            var_names=variables,
        )

#################################
#				#
# 	Validation		#
#				#
#################################

# you can reload the training if needed (or if you only want to do a validation on an existing training)
# but it is much faster to use the still existing classifier from the training
'''
print 'Loading training file from: ' + training_file
clf_val = joblib.load(training_file)
'''
# --- Example no. 6 ---
0
    max_depth=4,
    learning_rate=0.01,
    n_estimators=100,
    verbose=True,
    min_samples_leaf=10,
    min_samples_split=10
)

# Train on three jet variables; the label is True where |Jet_flavour| == 5.
cls.fit(
    np.array(data[["Jet_CSV", "Jet_CSVIVF", "Jet_pt"]]),
    # Flatten the single-column comparison result into a 1-D label vector.
    # ravel() takes only an optional `order` string, so the former
    # ravel(len(data), ) call raised TypeError on current NumPy.
    np.array(np.abs(data[["Jet_flavour"]]) == 5).ravel(),
)

# Export the fitted model through sklearn_to_tmva as "test.xml", passing the
# same three input columns.
sklearn_to_tmva.gbr_to_tmva(
    cls,
    data[["Jet_CSV", "Jet_CSVIVF", "Jet_pt"]],
    "test.xml",
    coef=2,
)

import ROOT, array
from ROOT import TMVA

# TMVA reader, created with the "!V" option string.
reader = TMVA.Reader("!V")

# One single-element "f"-typed array per input variable, keyed by name.
vardict = {
    fn: array.array("f", [0])
    for fn in ["Jet_CSV", "Jet_CSVIVF", "Jet_pt"]
}
# Register the variables with the reader in the same order as before.
for fn in ["Jet_CSV", "Jet_CSVIVF", "Jet_pt"]:
    reader.AddVariable(fn, vardict[fn])

# Book the method stored in test.xml under the tag "testmva".
reader.BookMVA("testmva", "test.xml")

# NOTE(review): only the first statement of this function is visible here and
# the rest of the body appears to be cut off. As written it assigns ret = 0,
# ignores x, y and z, and implicitly returns None.
def mva1(x,y,z):
    ret = 0
# Gradient-boosted classifier: 100 trees of depth 3, learning rate 0.01,
# needing at least 10 samples both to split a node and in every leaf.
cls = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.01, max_depth=3,
    min_samples_split=10, min_samples_leaf=10, verbose=True,
)

# Hold out 20% of the rows. Features, targets and weights are split together
# with a fixed random_state.
d_train, d_test, t_train, t_test, w_train, w_test = train_test_split(
    data_train,
    targets,
    data_weights,
    random_state=7,
    test_size=0.2,
)

# Fit on the training part only; w_train is handed to fit() as its third
# positional argument (the sample weights).
cls.fit(d_train, t_train, w_train)

# Export the fitted model through sklearn_to_tmva under the given .xml name.
sklearn_to_tmva.gbr_to_tmva(
    cls, data[trainVars()],
    "TMVABDT_2lss_1tau_ttV_maxDepth3_10Var_frWt_wMEMall.xml",
    coef=2,
)

import ROOT, array
from ROOT import TFile, TH1F, TGraph, TCanvas, TLegend

# Output ROOT file, opened with the "RECREATE" mode string.
fout = TFile("2lss_1tau_ttV_performance_maxDepth3_10Var_frWt_wMEMall.root", "RECREATE")
# Canvas styling: fill colour 10, border size 2, explicit left/bottom/right
# margins, then SetLogy() for the y axis.
c1 = TCanvas()
c1.SetFillColor(10)
c1.SetBorderSize(2)
c1.SetLeftMargin(0.12)
c1.SetBottomMargin(0.12)
c1.SetRightMargin(0.05)
c1.SetLogy()