def histVariables2D(self, vX, vY, plot_name, sample, cat):
    """Draw and save the weighted 2D histogram of two variables of one sample.

    The number of bins and the bin ranges are looked up via ``binning``.
    If no range is configured for a variable, it is derived from the data
    of the requested sample/category and a suggested config line is
    appended to ``new_variable_configs.txt``.

    Args:
        vX: name of the variable on the x axis.
        vY: name of the variable on the y axis.
        plot_name: output location handed to ``setup.saveCanvas``.
        sample: key into ``self.samples``.
        cat: key into the sample's ``cut_data``.
    """
    # look up the configured binning first
    binsX = binning.getNbins(vX)
    binsY = binning.getNbins(vY)
    rangeX = binning.getBinrange(vX)
    rangeY = binning.getBinrange(vY)

    def range_or_fallback(var_name, configured_range):
        # a configured (truthy) range is used unchanged
        if configured_range:
            return configured_range

        # otherwise span the full value range found in the data ...
        column = self.samples[sample].cut_data[cat][var_name].values
        upper = max(column)
        lower = min(column)

        # ... and remember a suggestion for the config file
        suggestion = "variables[\"" + var_name + "\"]\t\t\t= Variable(bin_range = [{},{}])\n".format(lower, upper)
        with open("new_variable_configs.txt", "a") as config_file:
            config_file.write(suggestion)
        return [lower, upper]

    rangeX = range_or_fallback(vX, rangeX)
    rangeY = range_or_fallback(vY, rangeY)

    # collect the inputs of the histogram
    data = self.samples[sample].cut_data[cat]
    weights = data["weight"].values
    valuesX = data[vX].values
    valuesY = data[vY].values

    hist = setup.setupHistogram2D(
        valuesX=valuesX,
        valuesY=valuesY,
        weights=weights,
        binsX=binsX,
        binsY=binsY,
        rangeX=rangeX,
        rangeY=rangeY,
        titleX=vX,
        titleY=vY)

    canvas = setup.drawHistOnCanvas2D(
        hist=hist,
        canvasName=vX + "_vs_" + vY,
        catLabel=JTcut.getJTlabel(cat),
        sampleName=sample)

    # luminosity label, then write the plot
    setup.printLumi(canvas, lumi=self.options["lumiScale"], twoDim=True)
    setup.saveCanvas(canvas, plot_name)
def __init__(self, save_path, input_samples, event_category, train_variables,
             batch_size=5000, train_epochs=500, early_stopping=10,
             optimizer=None, loss_function="categorical_crossentropy",
             test_percentage=0.2, eval_metrics=None, additional_cut=None,
             use_pca=False):
    """Store the training configuration, load the data and prepare output dirs.

    Args:
        save_path: output directory for results; created if missing.
        input_samples: samples to load into the dataframe.
        event_category: name of the event category (usually nJet/nTag);
            also used to look up the network architecture.
        train_variables: list of input variables.
        batch_size: batch size for training.
        train_epochs: number of training epochs.
        early_stopping: number of early stopping epochs.
        optimizer: optimizer for training; if falsy, the one from the
            architecture dictionary is used.
        loss_function: loss function for training.
        test_percentage: fraction of events saved for testing.
        eval_metrics: additional metrics for evaluating the training.
        additional_cut: additional cuts applied after variable norm.
        use_pca: option for principal component analysis.
    """
    def ensure_dir(path):
        # create the directory only if it is not there yet
        if not os.path.exists(path):
            os.makedirs(path)

    # inputs and output location
    self.input_samples = input_samples
    self.save_path = save_path
    ensure_dir(self.save_path)

    # category information derived via JTcut
    self.JTstring = event_category
    self.event_category = JTcut.getJTstring(event_category)
    self.categoryLabel = JTcut.getJTlabel(event_category)

    # training settings
    self.train_variables = train_variables
    self.batch_size = batch_size
    self.train_epochs = train_epochs
    self.early_stopping = early_stopping
    self.test_percentage = test_percentage
    self.loss_function = loss_function
    self.eval_metrics = eval_metrics
    self.additional_cut = additional_cut
    self.PCA = use_pca

    # load the data set (all settings above are in place at this point)
    self.data = self._load_datasets()
    self.event_classes = self.data.output_classes

    # store the variable norm next to the checkpoints
    self.cp_path = self.save_path + "/checkpoints/"
    ensure_dir(self.cp_path)
    out_file = self.cp_path + "/variable_norm.csv"
    self.data.norm_csv.to_csv(out_file)
    print("saved variabe norms at " + str(out_file))

    # directory for plots
    self.plot_path = self.save_path + "/plots/"
    ensure_dir(self.plot_path)

    # architecture dictionary and layer names
    self.architecture = arch.getArchitecture(self.JTstring)
    self.inputName = "inputLayer"
    self.outputName = "outputLayer"

    # an explicitly given optimizer wins over the architecture default
    self.optimizer = optimizer or self.architecture["optimizer"]
def __init__(self, in_path, save_path, event_classes, event_category,
             train_variables, batch_size=5000, train_epochs=10,
             early_stopping=10, optimizer=None,
             loss_function="categorical_crossentropy", test_percentage=0.2,
             eval_metrics=None, additional_cut=None, phi_padding=0):
    """Store the training configuration, load the data and prepare output dirs.

    Args:
        in_path: path to the input files.
        save_path: output directory for results; created if missing.
        event_classes: list of classes.
        event_category: name of the event category (usually nJet/nTag).
        train_variables: list of input features.
        batch_size: batch size for training.
        train_epochs: maximum number of training epochs.
        early_stopping: number of early stopping epochs.
        optimizer: optimizer for training; ``"adam"`` is used if falsy.
        loss_function: loss function for training.
        test_percentage: fraction of events saved for testing.
        eval_metrics: additional metrics for evaluating the training.
        additional_cut: additional cut applied after variable norm.
        phi_padding: stored as ``self.phi_padding``; not used in this method.
    """
    def ensure_dir(path):
        # create the directory only if it is not there yet
        if not os.path.exists(path):
            os.makedirs(path)

    # input and output locations
    self.in_path = in_path
    self.save_path = save_path
    ensure_dir(self.save_path)

    # classes and category information
    self.event_classes = event_classes
    self.JTstring = event_category
    self.event_category = JTcut.getJTstring(event_category)
    self.categoryLabel = JTcut.getJTlabel(event_category)

    # training settings
    self.train_variables = train_variables
    self.batch_size = batch_size
    self.train_epochs = train_epochs
    self.early_stopping = early_stopping
    self.test_percentage = test_percentage
    self.loss_function = loss_function
    self.eval_metrics = eval_metrics
    self.additional_cut = additional_cut
    # raw value is stored before loading; the default is applied at the end
    self.optimizer = optimizer
    self.phi_padding = phi_padding

    # load dataset
    self.data = self._load_datasets()
    # NOTE(review): bare attribute access without a call or assignment;
    # it only has an effect if get_train_data_cnn is a property - confirm.
    self.data.get_train_data_cnn

    checkpoint_dir = self.save_path + "/checkpoints"
    ensure_dir(checkpoint_dir)
    out_file = checkpoint_dir + "/variable_norm.csv"
    # NOTE(review): the csv is currently NOT written (call disabled below),
    # yet the message is still printed.
    #self.data.norm_csv.to_csv(out_file)
    print("saved variable norms at " + str(out_file))

    # directory for plots
    self.plot_path = self.save_path + "/plots/"
    ensure_dir(self.plot_path)

    self.inputName = "inputLayer"
    self.outputName = "outputLayer"

    # fall back to adam if no optimizer was given
    self.optimizer = optimizer or "adam"
def perform1Danalysis(self, metric="KS"):
    """Compute and plot pairwise sample distances for every variable.

    For each category in ``self.categories`` the samples are cut to that
    category, ``self.calculateAllDistances`` is evaluated per variable, the
    resulting matrix is plotted, and two files are written to
    ``self.output_dir``: a csv with all distances and a text file listing
    the variables whose largest distance value is below 0.05.

    Args:
        metric: passed on to ``calculateAllDistances`` and used in the
            name of the csv file.
    """
    for cat in self.categories:
        print("starting with category {}".format(cat))

        # per-category output locations
        cat_dir = self.output_dir + "/" + cat + "/"
        if not os.path.exists(cat_dir):
            os.makedirs(cat_dir)
        output_csv = self.output_dir + "/" + cat + "_1Ddistances_" + metric + ".csv"
        good_variables_file = self.output_dir + "/" + cat + "_good_vars_1D.txt"

        # category specific variables if available, all variables otherwise
        known = self.variable_set.variables
        base_variables = known[cat] if cat in known else self.variable_set.all_variables
        variables = base_variables + self.add_vars

        # filter events according to the category
        for key in self.sampleNames:
            self.samples[key].cutData(cat, variables)

        variable_info = {}
        good_variables = []
        for variable in variables:
            print("analyzing variable: {}".format(variable))

            # output name of the plot; brackets are stripped from the path
            plot_name = cat_dir + "/{}.pdf".format(variable)
            plot_name = plot_name.replace("[", "_").replace("]", "")

            distances = self.calculateAllDistances(
                variable=variable, cat=cat, metric=metric)
            variable_info[variable] = distances

            # a variable is kept if even its largest value is below 0.05
            if max(distances.values()) < 0.05:
                good_variables.append(variable)

            matrix_hist = setup.setup2DHistogram(
                matrix=self.generateMatrix(distances),
                ncls=len(self.sampleNames),
                xtitle=setup.generateLatexLabel(variable),
                ytitle="",
                binlabel=self.sampleNames)
            # NOTE(review): canvas name uses "KSpvalues" for every metric
            canvas = setup.draw2DHistOnCanvas(
                matrix_hist, "KSpvalues" + cat + variable, JTcut.getJTlabel(cat))
            setup.saveCanvas(canvas, plot_name)

        # distances of all variables as csv
        pandas.DataFrame(variable_info).to_csv(output_csv)

        # list of good variables as a config-style snippet
        snippet = ["variables[\"{}\"] = [\n".format(cat)]
        snippet.extend(" \"{}\",\n".format(v) for v in good_variables)
        snippet.append(" ]\n\n")
        with open(good_variables_file, "w") as f:
            f.write("".join(snippet))

        print("saving distances in csv file {}".format(output_csv))
def __init__(self, save_path, input_samples, category_name, train_variables,
             category_cutString=None, category_label=None,
             norm_variables=True, train_epochs=500, test_percentage=0.2,
             eval_metrics=None, shuffle_seed=None, balanceSamples=False,
             evenSel=None, addSampleSuffix=""):
    """Store the training configuration, load the data and prepare output dirs.

    Args:
        save_path: output directory for results; created if missing.
        input_samples: samples to load into the dataframe.
        category_name: name of the event category (usually nJet/nTag).
        train_variables: list of input variables.
        category_cutString: event selection string; deduced via ``JTcut``
            from ``category_name`` if None.
        category_label: category label; deduced via ``JTcut`` from
            ``category_name`` if None.
        norm_variables: normalize variables in the DataFrame; the norm csv
            is only written if this is truthy.
        train_epochs: number of train epochs.
        test_percentage: fraction of events saved for testing.
        eval_metrics: additional metrics for evaluating the training.
        shuffle_seed: forwarded to ``self._load_datasets``.
        balanceSamples: forwarded to ``self._load_datasets``.
        evenSel: True/False selects ``Evt_Odd==0``/``Evt_Odd==1`` as the
            "even" selection (and the opposite as "odd"); None leaves the
            defaults ``""`` and ``"1."``.
        addSampleSuffix: suffix of additional (ttbb) sample.
    """
    def ensure_dir(path):
        # create the directory only if it is not there yet
        if not os.path.exists(path):
            os.makedirs(path)

    # inputs and output location
    self.input_samples = input_samples
    self.addSampleSuffix = addSampleSuffix
    self.save_path = save_path
    ensure_dir(self.save_path)

    # category name, selection string and label
    self.category_name = category_name
    if category_cutString is None:
        category_cutString = JTcut.getJTstring(category_name)
    self.category_cutString = category_cutString
    if category_label is None:
        category_label = JTcut.getJTlabel(category_name)
    self.category_label = category_label

    # even/odd event selection; equality (not identity) is used on
    # purpose so that 1/0 keep working like True/False
    self.evenSel = ""
    self.oddSel = "1."
    if evenSel is not None:
        if evenSel == True:
            self.evenSel, self.oddSel = "(Evt_Odd==0)", "(Evt_Odd==1)"
        elif evenSel == False:
            self.evenSel, self.oddSel = "(Evt_Odd==1)", "(Evt_Odd==0)"

    # training settings
    self.train_variables = train_variables
    self.test_percentage = test_percentage
    self.train_epochs = train_epochs
    self.eval_metrics = eval_metrics
    self.norm_variables = norm_variables

    # load the data set
    self.data = self._load_datasets(shuffle_seed, balanceSamples)
    self.event_classes = self.data.output_classes

    # store the variable norm next to the checkpoints (if normalized)
    self.cp_path = self.save_path + "/checkpoints/"
    ensure_dir(self.cp_path)
    if self.norm_variables:
        out_file = self.cp_path + "/variable_norm.csv"
        self.data.norm_csv.to_csv(out_file)
        print("saved variabe norms at " + str(out_file))

    # directory for plots
    self.plot_path = self.save_path + "/plots/"
    ensure_dir(self.plot_path)

    # layer names for in and output (needed for c++ implementation)
    self.inputName = "inputLayer"
    self.outputName = "outputLayer"
def histVariable(self, variable, plot_name, cat):
    """Plot stacked backgrounds and overlaid, scaled signals for one variable.

    The number of bins and the bin range are looked up via ``binning``. If
    no range is configured, it is derived from the data of all samples in
    the given category and a suggested config line is appended to
    ``new_variable_configs.txt``.

    Args:
        variable: name of the variable to plot.
        plot_name: output location handed to ``setup.saveCanvas``.
        cat: key into each sample's ``cut_data``.
    """
    # get number of bins and bin range from config file
    bins = binning.getNbins(variable)
    bin_range = binning.getBinrange(variable)

    # derive the range from data if none was configured
    if not bin_range:
        # start from +/- infinity so the result is the true data range
        # even if all values lie above 999 or below -999
        minValue = float("inf")
        maxValue = -float("inf")
        for key in self.samples:
            values = self.samples[key].cut_data[cat][variable].values
            maxValue = max(maxValue, max(values))
            minValue = min(minValue, min(values))

        config_string = "variables[\"" + variable + "\"]\t\t\t= Variable(bin_range = [{},{}])\n".format(
            minValue, maxValue)
        with open("new_variable_configs.txt", "a") as f:
            f.write(config_string)
        bin_range = [minValue, maxValue]

    bkgHists = []
    bkgLabels = []
    weightIntegral = 0

    # loop over backgrounds (in stacking order) and fill hists
    for sampleName in self.ordered_stack:
        sample = self.samples[sampleName]
        weights = sample.cut_data[cat]["weight"].values
        weightIntegral += sum(weights)

        hist = setup.setupHistogram(
            values=sample.cut_data[cat][variable].values,
            weights=weights,
            nbins=bins,
            bin_range=bin_range,
            color=sample.plotColor,
            xtitle=cat + "_" + sample.sampleName + "_" + variable,
            ytitle=setup.GetyTitle(),
            filled=True)
        bkgHists.append(hist)
        bkgLabels.append(sample.sampleName)

    sigHists = []
    sigLabels = []
    sigScales = []

    # without any background yield, scaling signals to it is meaningless
    # NOTE(review): this overwrites the shared option for all later calls
    if weightIntegral == 0:
        self.options["scaleSignal"] = 1

    # loop over signals and fill hists
    for key in self.samples:
        sample = self.samples[key]
        if not sample.isSignal:
            continue

        weights = sample.cut_data[cat]["weight"].values

        # -1 scales the signal to the background yield; the small offset
        # avoids a division by zero for empty signals
        if self.options["scaleSignal"] == -1:
            scaleFactor = weightIntegral / (sum(weights) + 1e-9)
        else:
            scaleFactor = float(self.options["scaleSignal"])

        hist = setup.setupHistogram(
            values=sample.cut_data[cat][variable].values,
            weights=weights,
            nbins=bins,
            bin_range=bin_range,
            color=sample.plotColor,
            xtitle=cat + "_" + sample.sampleName + "_" + variable,
            ytitle=setup.GetyTitle(),
            filled=False)
        hist.Scale(scaleFactor)

        sigHists.append(hist)
        sigLabels.append(sample.sampleName)
        sigScales.append(scaleFactor)

    # init canvas
    canvas = setup.drawHistsOnCanvas(sigHists, bkgHists, self.options,
                                     canvasName=cat + "_" + variable)

    # setup legend: signals (with their scale factor) first, then backgrounds
    legend = setup.getLegend()
    for hist, label, scale in zip(sigHists, sigLabels, sigScales):
        legend.AddEntry(hist, label + " x {:4.0f}".format(scale), "L")
    for hist, label in zip(bkgHists, bkgLabels):
        legend.AddEntry(hist, label, "F")
    legend.Draw("same")

    # add lumi and category to plot
    setup.printLumi(canvas, lumi=self.options["lumiScale"],
                    ratio=self.options["ratio"])
    setup.printCategoryLabel(canvas, JTcut.getJTlabel(cat),
                             ratio=self.options["ratio"])

    # save canvas
    setup.saveCanvas(canvas, plot_name)
def histVariable(self, variable, plot_name, cat):
    """Plot stacked backgrounds and overlaid signals for one variable.

    Binning, display name and log option are read from
    ``self.variableconfig`` if the variable is listed there. Otherwise 50
    bins spanning the data range of all samples are used and a suggested
    config line is appended to ``new_variable_configs.csv``.

    Args:
        variable: name of the variable to plot.
        plot_name: output location handed to ``setup.saveCanvas``.
        cat: key into each sample's ``cut_data``.

    Returns:
        dict with the keys ``"nbins"``, ``"range"`` and ``"bkgYield"``,
        plus ``"KSScore"`` (value of the last signal) if a KS score was
        calculated.
    """
    histInfo = {}

    if variable in self.variableconfig.index:
        # get variable info from config file
        bins = int(self.variableconfig.loc[variable, 'numberofbins'])
        minValue = float(self.variableconfig.loc[variable, 'minvalue'])
        maxValue = float(self.variableconfig.loc[variable, 'maxvalue'])
        displayname = self.variableconfig.loc[variable, 'displayname']
        logoption = self.variableconfig.loc[variable, 'logoption']
    else:
        # unknown variable: default binning over the full data range
        bins = 50
        maxValue = max([
            max(self.samples[sample].cut_data[cat][variable].values)
            for sample in self.samples
        ])
        minValue = min([
            min(self.samples[sample].cut_data[cat][variable].values)
            for sample in self.samples
        ])
        displayname = variable
        logoption = "-"

        config_string = "{},{},{},{},{},{}\n".format(
            variable, minValue, maxValue, bins, logoption, displayname)
        with open("new_variable_configs.csv", "a") as f:
            f.write(config_string)

    bin_range = [minValue, maxValue]
    # only "x"/"X" activates the log option
    logoption = logoption in ("x", "X")

    histInfo["nbins"] = bins
    histInfo["range"] = bin_range

    bkgHists = []
    bkgLabels = []
    weightIntegral = 0

    # loop over backgrounds (in stacking order) and fill hists
    for sampleName in self.ordered_stack:
        sample = self.samples[sampleName]
        weights = sample.cut_data[cat]["weight"].values
        values = sample.cut_data[cat][variable].values
        weightIntegral += sum(weights)

        hist = setup.setupHistogram(
            values=values,
            weights=weights,
            nbins=bins,
            bin_range=bin_range,
            color=sample.plotColor,
            xtitle=cat + "_" + sample.sampleName + "_" + variable,
            ytitle=setup.GetyTitle(self.options["privateWork"]),
            filled=sample.filled)
        bkgHists.append(hist)
        bkgLabels.append(sample.sampleName)

    sigHists = []
    sigLabels = []
    sigScales = []

    # without any background yield, signals are normalized to unit area
    # NOTE(review): this overwrites the shared option for all later calls
    if weightIntegral == 0:
        self.options["scaleSignal"] = 0
    histInfo["bkgYield"] = weightIntegral

    # scale stack to one if lumiScale is set to zero
    if self.options["lumiScale"] == 0:
        # an empty or zero-weight stack cannot be normalized; skipping it
        # avoids a division by zero
        if weightIntegral != 0:
            for hist in bkgHists:
                hist.Scale(1. / weightIntegral)
        weightIntegral = 1.

    # loop over signals and fill hists
    for key in self.samples:
        sample = self.samples[key]
        if not sample.isSignal:
            continue

        weights = sample.cut_data[cat]["weight"].values

        # -1: scale to the background yield, 0: scale to unit area,
        # anything else: use the value as scale factor
        if self.options["scaleSignal"] == -1:
            scaleFactor = weightIntegral / (sum(weights) + 1e-9)
        elif self.options["scaleSignal"] == 0:
            scaleFactor = (1. / (sum(weights) + 1e-9))
        else:
            scaleFactor = float(self.options["scaleSignal"])

        # NOTE(review): GetyTitle is called without argument here but with
        # options["privateWork"] for the backgrounds - confirm intent
        hist = setup.setupHistogram(
            values=sample.cut_data[cat][variable].values,
            weights=weights,
            nbins=bins,
            bin_range=bin_range,
            color=sample.plotColor,
            xtitle=cat + "_" + sample.sampleName + "_" + variable,
            ytitle=setup.GetyTitle(),
            filled=sample.filled)
        hist.Scale(scaleFactor)

        sigHists.append(hist)
        sigLabels.append(sample.sampleName)
        sigScales.append(scaleFactor)

    # init canvas
    canvas = setup.drawHistsOnCanvas(sigHists, bkgHists, self.options,
                                     canvasName=variable,
                                     displayname=displayname,
                                     logoption=logoption)

    # setup legend
    legend = setup.getLegend()

    # add signal entries
    for sigHist, sigLabel, sigScale in zip(sigHists, sigLabels, sigScales):
        labelstring = sigLabel
        if not self.options["lumiScale"] == 0.:
            labelstring = sigLabel + " x {:4.0f}".format(sigScale)

        # add KS score (w.r.t. the first background) to label if activated;
        # without any background there is nothing to compare to
        if self.options["KSscore"] and bkgHists:
            KSscore = setup.calculateKSscore(bkgHists[0], sigHist)
            labelstring = "#splitline{" + labelstring + "}{KSscore = %.3f}" % (
                KSscore)
            histInfo["KSScore"] = KSscore

        legend.AddEntry(sigHist, labelstring, "L")

    # add background entries
    for bkgHist, bkgLabel in zip(bkgHists, bkgLabels):
        legend.AddEntry(bkgHist, bkgLabel, "F")

    # draw legend
    legend.Draw("same")

    # add lumi and category to plot
    setup.printLumi(canvas, lumi=self.options["lumiScale"],
                    ratio=self.options["ratio"])
    setup.printCategoryLabel(canvas, JTcut.getJTlabel(cat),
                             ratio=self.options["ratio"])
    if self.options["privateWork"]:
        setup.printPrivateWork(canvas, ratio=self.options["ratio"])

    # save canvas
    setup.saveCanvas(canvas, plot_name)

    return histInfo
def histVariable2D(self, name, vX, vY, plot_name, cat):
    """Plot two variables against each other using the events of all samples.

    A weighted 2D histogram of ``vX`` vs. ``vY`` is drawn and saved. If
    ``self.options["plot1D"]`` is set, both variables are additionally
    drawn as 1D distributions with a common binning and saved with the
    suffix ``_1D.pdf``.

    Binning is looked up via ``binning``. If no range is configured for a
    variable, it is derived from the data of all samples and a suggested
    config line is appended to ``new_variable_configs.txt``.

    Args:
        name: used as canvas name and as sample name of the 2D plot.
        vX: name of the variable on the x axis.
        vY: name of the variable on the y axis.
        plot_name: output location handed to ``setup.saveCanvas``.
        cat: key into each sample's ``cut_data``.
    """
    # get number of bins and bin range from config file
    binsX = binning.getNbins(vX)
    binsY = binning.getNbins(vY)
    rangeX = binning.getBinrange(vX)
    rangeY = binning.getBinrange(vY)

    def range_or_fallback(var_name, configured_range):
        # a configured (truthy) range is used unchanged
        if configured_range:
            return configured_range

        # otherwise span the full value range over all samples ...
        maxValue = max([max(self.samples[sample].cut_data[cat][var_name].values)
                        for sample in self.samples])
        minValue = min([min(self.samples[sample].cut_data[cat][var_name].values)
                        for sample in self.samples])

        # ... and remember a suggestion for the config file
        config_string = "variables[\"" + var_name + "\"]\t\t\t= Variable(bin_range = [{},{}])\n".format(minValue, maxValue)
        with open("new_variable_configs.txt", "a") as f:
            f.write(config_string)
        return [minValue, maxValue]

    rangeX = range_or_fallback(vX, rangeX)
    rangeY = range_or_fallback(vY, rangeY)

    # merge the events of all samples
    weights = np.array([])
    valuesX = np.array([])
    valuesY = np.array([])
    for sample in self.samples:
        data = self.samples[sample].cut_data[cat]
        weights = np.append(weights, data["weight"].values)
        valuesX = np.append(valuesX, data[vX].values)
        valuesY = np.append(valuesY, data[vY].values)

    hist = setup.setupHistogram2D(
        valuesX=valuesX,
        valuesY=valuesY,
        weights=weights,
        binsX=binsX,
        binsY=binsY,
        rangeX=rangeX,
        rangeY=rangeY,
        titleX=vX,
        titleY=vY)

    canvas = setup.drawHistOnCanvas2D(
        hist=hist,
        canvasName=name,
        catLabel=JTcut.getJTlabel(cat),
        sampleName=name)

    # add lumi
    setup.printLumi(canvas, lumi=self.options["lumiScale"], twoDim=True)
    if self.options["privateWork"]:
        setup.printPrivateWork(canvas, ratio=self.options["ratio"], twoDim=True)
    if self.options["getCorr"]:
        correlation = hist.GetCorrelationFactor()
        setup.printCorrelation(canvas, correlation)

    # save canvas
    setup.saveCanvas(canvas, plot_name)

    # the 1D comparison is optional
    if not self.options["plot1D"]:
        return

    # common binning for both variables: average bin count, union of ranges
    bins = int((binsX + binsY) / 2.)
    bin_range = [min(rangeX[0], rangeY[0]), max(rangeX[1], rangeY[1])]
    normalize = self.options["lumiScale"] == 0.

    def normalized_hist(values, color, xtitle):
        h = setup.setupHistogram(
            values=values,
            weights=weights,
            nbins=bins,
            bin_range=bin_range,
            color=color,
            xtitle=xtitle,
            ytitle=setup.GetyTitle(self.options["lumiScale"]),
            filled=False)
        if normalize:
            integral = h.Integral()
            # an empty histogram cannot be normalized; skipping it avoids
            # a division by zero
            if integral != 0:
                h.Scale(1. / integral)
        return h

    hX = normalized_hist(valuesX, ROOT.kBlack, vX + "1D")
    hY = normalized_hist(valuesY, ROOT.kRed, vY + "1D")

    # init canvas
    canvas = setup.drawHistsOnCanvas(
        hX, hY, self.options,
        canvasName=name)

    # setup legend
    legend = setup.getLegend()
    legend.AddEntry(hX, self.options["xName"], "L")

    # add KS score to label if activated
    labelY = self.options["yName"]
    if self.options["KSscore"]:
        KSscore = setup.calculateKSscore(hX, hY)
        labelY = "#splitline{" + labelY + "}{KSscore = %.3f}" % (KSscore)
    legend.AddEntry(hY, labelY, "L")

    # draw legend
    legend.Draw("same")

    # add lumi and category to plot
    setup.printLumi(canvas, lumi=self.options["lumiScale"], ratio=self.options["ratio"])
    setup.printCategoryLabel(canvas, JTcut.getJTlabel(cat), ratio=self.options["ratio"])
    if self.options["privateWork"]:
        setup.printPrivateWork(canvas, ratio=self.options["ratio"])

    # save canvas
    setup.saveCanvas(canvas, plot_name.replace(".pdf", "_1D.pdf"))
def histVariable1D(self, sample, name, vX, vY, plot_name, cat):
    """Overlay the 1D distributions of two variables of a single sample.

    Both variables are histogrammed with a common binning (average number
    of bins, union of both ranges). Binning is looked up via ``binning``;
    if no range is configured for a variable, it is derived from the data
    of the sample and a suggested config line is appended to
    ``new_variable_configs.txt``.

    Args:
        sample: sample object providing ``cut_data`` and ``sampleName``.
        name: used in the canvas name.
        vX: name of the first variable (drawn in black).
        vY: name of the second variable (drawn in red).
        plot_name: output location handed to ``setup.saveCanvas``.
        cat: key into the sample's ``cut_data``.
    """
    # look up the configured binning first
    binsX = binning.getNbins(vX)
    binsY = binning.getNbins(vY)
    rangeX = binning.getBinrange(vX)
    rangeY = binning.getBinrange(vY)

    def range_or_fallback(var_name, configured_range):
        # a configured (truthy) range is used unchanged
        if configured_range:
            return configured_range

        # otherwise span the full value range found in the sample ...
        column = sample.cut_data[cat][var_name].values
        upper = max(column)
        lower = min(column)

        # ... and remember a suggestion for the config file
        suggestion = "variables[\"" + var_name + "\"]\t\t\t= Variable(bin_range = [{},{}])\n".format(lower, upper)
        with open("new_variable_configs.txt", "a") as config_file:
            config_file.write(suggestion)
        return [lower, upper]

    rangeX = range_or_fallback(vX, rangeX)
    rangeY = range_or_fallback(vY, rangeY)

    # common binning: average bin count, union of both ranges
    bins = int((binsX + binsY) / 2.)
    bin_range = [min(rangeX[0], rangeY[0]), max(rangeX[1], rangeY[1])]

    data = sample.cut_data[cat]
    weights = data["weight"].values
    valuesX = data[vX].values
    valuesY = data[vY].values

    title_prefix = cat + "_" + sample.sampleName + "_"
    hX = setup.setupHistogram(
        values=valuesX,
        weights=weights,
        nbins=bins,
        bin_range=bin_range,
        color=ROOT.kBlack,
        xtitle=title_prefix + vX,
        ytitle=setup.GetyTitle(self.options["lumiScale"]),
        filled=False)
    hY = setup.setupHistogram(
        values=valuesY,
        weights=weights,
        nbins=bins,
        bin_range=bin_range,
        color=ROOT.kRed,
        xtitle=title_prefix + vY,
        ytitle=setup.GetyTitle(self.options["lumiScale"]),
        filled=False)

    # canvas with both histograms
    canvas = setup.drawHistsOnCanvas(
        hX, hY, self.options,
        canvasName="[{}] {}".format(sample.sampleName, name))

    # legend; the second entry optionally carries the KS score
    legend = setup.getLegend()
    legend.AddEntry(hX, self.options["xName"], "L")

    labelY = self.options["yName"]
    if self.options["KSscore"]:
        KSscore = setup.calculateKSscore(hX, hY)
        labelY = "#splitline{" + labelY + "}{KSscore = %.3f}" % (KSscore)
    legend.AddEntry(hY, labelY, "L")
    legend.Draw("same")

    # labels on the plot
    ratio = self.options["ratio"]
    setup.printLumi(canvas, lumi=self.options["lumiScale"], ratio=ratio)
    setup.printCategoryLabel(canvas, JTcut.getJTlabel(cat), ratio=ratio)
    if self.options["privateWork"]:
        setup.printPrivateWork(canvas, ratio=ratio)

    # write the plot
    setup.saveCanvas(canvas, plot_name)