def separations(self):
    """Plot the 1D separation between the first two targets for each feature.

    For every entry of ``self.features`` the separation between
    ``self.targets[0]`` and ``self.targets[1]`` is computed with
    ``util.getSeparation`` and drawn as a horizontal bar graph, ordered from
    the largest to the smallest separation, with the x-axis fixed to [0, 1].
    Bar layout follows
    https://matplotlib.org/gallery/lines_bars_and_markers/barh.html

    The figure is saved as
    ``<output_dir>/separations1D_<target.name>_<date>.<image_format>``.

    The 2D version mentioned in the original notes (a matrix, a la
    correlation matrix, with scale (0,1)) is not implemented here.
    """
    # NOTE(review): the original body used an undefined name `target` for the
    # sample label and the output file name (NameError at runtime).  The first
    # target is used here so the method can run -- confirm this is the sample
    # that should be stamped on the plot.
    target = self.targets[0]

    # 1D
    fig, ax = plt.subplots()

    # calculate the separations
    data = {}
    for feature in self.features:
        data[feature] = util.getSeparation(self.targets[0].df[feature],
                                           self.targets[1].df[feature])

    # sort by largest -> smallest separation
    sorted_data_keys = sorted(data, key=data.get, reverse=True)
    ticks = list(range(len(sorted_data_keys)))

    ax.barh(ticks, [data[f] for f in sorted_data_keys], align='center')
    # set_yticks() needs the tick positions; the original call passed none
    ax.set_yticks(ticks)
    ax.set_yticklabels(sorted_data_keys)
    ax.invert_yaxis()   # first (largest) entry at the top

    ax.set_xlim([0, 1])
    ax.set_xlabel(r'Separation', fontsize=22, ha='right', va='top', position=(1, 0))
    ax.set_xticklabels(ax.get_xticks(), fontsize=22)
    ax.xaxis.set_major_formatter(FormatStrFormatter('%g'))

    ## CMS/COM Energy Label + Signal name
    cms_stamp = hpl.CMSStamp(self.CMSlabelStatus)
    cms_stamp.coords = [0.02, 1.00]
    cms_stamp.fontsize = 16
    cms_stamp.va = 'bottom'
    ax.text(0.02, 1.00, cms_stamp.text, fontsize=cms_stamp.fontsize,
            ha=cms_stamp.ha, va=cms_stamp.va, transform=ax.transAxes)

    energy_stamp = hpl.EnergyStamp()
    energy_stamp.ha = 'right'
    energy_stamp.coords = [0.99, 1.00]
    energy_stamp.fontsize = 16
    energy_stamp.va = 'bottom'
    ax.text(energy_stamp.coords[0], energy_stamp.coords[1], energy_stamp.text,
            fontsize=energy_stamp.fontsize, ha=energy_stamp.ha,
            va=energy_stamp.va, transform=ax.transAxes)

    ax.text(0.03, 0.93, target.label, fontsize=16, ha='left', va='top',
            transform=ax.transAxes)

    plt.savefig(self.output_dir + "/separations1D_{0}_{1}.{2}".format(
                    target.name, self.date, self.image_format),
                format=self.image_format, dpi=300, bbox_inches='tight')
    plt.close()

    return
def features(self):
    """
    Draw one normalized histogram per feature, overlaying every target.

    Each key of ``self.df`` other than 'target' gets its own plot with a
    'significance' ratio panel.  When ``self.classification`` is 'binary',
    the separation between the first two targets is printed on the plot.
    (The original notes suggest data/mc plots for the regression case.)
    """
    self.msg_svc.INFO("DL : Plotting features.")

    for column in self.df.keys():
        if column == 'target':
            continue   # skip the 'target' column

        plot = HepPlotter("histogram", 1)

        plot.normed = True
        plot.stacked = False
        plot.logplot = False
        plot.binning = self.variable_labels[column].binning
        plot.x_label = self.variable_labels[column].label
        plot.y_label = "Events"
        plot.format = self.image_format
        plot.saveAs = self.output_dir + "/hist_" + column + "_" + self.date
        plot.ratio_plot = True
        plot.ratio_type = 'significance'
        plot.CMSlabel = 'top left'
        plot.CMSlabelStatus = self.CMSlabelStatus
        plot.numLegendColumns = 1

        # Extra text: physics process that produces these features
        if self.processlabel:
            plot.extra_text.Add(self.processlabel, coords=[0.03, 0.80])

        plot.initialize()

        # one step histogram per target
        for sample in self.targets:
            plot.Add(sample.df[column],
                     name=sample.name,
                     draw='step',
                     linecolor=sample.color,
                     label=sample.label)

        if self.classification == 'binary':
            first = self.targets[0].df[column]
            second = self.targets[1].df[column]
            sep_text = "Separation = {0:.2f}".format(util.getSeparation(first, second))
            plot.extra_text.Add(sep_text, coords=[0.03, 0.73])

        plot.execute()
        plot.savefig()

    return
def prediction(self, train_data={}, test_data={}):
    """Plot train vs. test DNN predictions, one figure (and JSON file) per k-fold.

    ``train_data`` and ``test_data`` are indexed with the keys 'X' and 'Y';
    the entries are iterated in lockstep, one tuple per fold.
    """
    self.msg_svc.INFO("DL : Plotting DNN prediction. ")

    folds = zip(train_data['X'], train_data['Y'], test_data['X'], test_data['Y'])

    # Plot all k-fold cross-validation results
    for fold, (x_train, y_train, x_test, y_test) in enumerate(folds):
        plot = HepPlotter("histogram", 1)

        plot.ratio_plot = True
        plot.ratio_type = "ratio"
        plot.y_ratio_label = "Test/Train"
        plot.label_size = 14
        plot.normed = True  # compare shape differences (likely don't have the same event yield)
        plot.format = self.image_format
        plot.saveAs = "{0}/hist_DNN_prediction_kfold{1}_{2}".format(
            self.output_dir, fold, self.date)
        plot.binning = [bb / 10. for bb in range(11)]
        plot.stacked = False
        plot.logplot = {"y": False, "x": False, "data": False}
        plot.x_label = "Prediction"
        plot.y_label = "Arb. Units"
        plot.CMSlabel = 'top left'
        plot.CMSlabelStatus = self.CMSlabelStatus
        plot.numLegendColumns = 1

        if self.processlabel:
            plot.extra_text.Add(self.processlabel, coords=[0.03, 0.80], fontsize=14)

        plot.initialize()

        # binned contents per target, kept for the separation calculation
        test_counts = []
        train_counts = []
        json_data = {}

        for sample in self.targets:
            value = sample.target_value
            train_key = sample.name + '_train'
            test_key = sample.name + '_test'
            in_train = y_train == value
            in_test = y_test == value

            ## Training
            plot.Add(x_train[in_train],
                     name=train_key,
                     linecolor=sample.color,
                     linewidth=2,
                     draw='step',
                     label=sample.label + " Train",
                     ratio_den=True,
                     ratio_num=False,
                     ratio_partner=test_key)
            ## Testing
            plot.Add(x_test[in_test],
                     name=test_key,
                     linecolor=sample.color,
                     color=sample.color,
                     linewidth=0,
                     draw='stepfilled',
                     label=sample.label + " Test",
                     alpha=0.5,
                     ratio_den=False,
                     ratio_num=True,
                     ratio_partner=train_key)

            ## Save data to JSON file
            counts_tr, edges_tr = np.histogram(x_train[in_train], bins=plot.binning)
            counts_te, edges_te = np.histogram(x_test[in_test], bins=plot.binning)

            json_data[train_key] = {"binning": edges_tr.tolist(),
                                    "content": counts_tr.tolist()}
            json_data[test_key] = {"binning": edges_te.tolist(),
                                   "content": counts_te.tolist()}

            test_counts.append(counts_te.tolist())
            train_counts.append(counts_tr.tolist())

        # separation between the first two targets, test sample only
        sep = util.getSeparation(test_counts[0], test_counts[1])
        plot.extra_text.Add("Test Separation = {0:.4f}".format(sep), coords=[0.03, 0.72])

        plot.execute()
        plot.savefig()

        # save results to JSON file (just histogram values & bins) to re-make plots
        json_name = "{0}.json".format(plot.saveAs)
        with open(json_name, 'w') as outfile:
            json.dump(json_data, outfile)

    return
def features(self):
    """
    Plot the features for the first two targets, with a ratio panel.

    Each key of ``self.df`` other than 'target' is drawn as a normalized
    step histogram for ``self.targets[0]`` and ``self.targets[1]``, with the
    ratio target0/target1 underneath.  When ``self.classification`` is
    'binary', the separation between the two binned, density-normalized
    distributions is printed on the plot.

      For classification, compare different targets
      For regression, just plot the features <- should do data/mc plots instead!
    """
    self.msg_svc.INFO("DL : Plotting features.")

    target0 = self.targets[0]  # hard-coded for binary comparisons
    target1 = self.targets[1]

    for feature in self.df.keys():
        if feature == 'target':
            continue

        binning = self.variable_labels[feature].binning

        hist = HepPlotter("histogram", 1)

        hist.normed = True
        hist.stacked = False
        hist.logplot = {"y": False, "x": False, "data": False}
        hist.binning = binning
        hist.x_label = self.variable_labels[feature].label
        hist.y_label = "Events"
        hist.format = self.image_format
        hist.saveAs = self.output_dir + "/hist_" + feature + "_" + self.date
        hist.ratio_plot = True
        hist.ratio_type = 'ratio'
        hist.y_ratio_label = '{0}/{1}'.format(target0.label, target1.label)
        hist.CMSlabel = 'top left'
        hist.CMSlabelStatus = self.CMSlabelStatus
        hist.numLegendColumns = 1

        # Add some extra text to the plot
        if self.processlabel:
            # physics process that produces these features
            hist.extra_text.Add(self.processlabel, coords=[0.03, 0.80])

        hist.initialize()

        hist.Add(target0.df[feature],
                 name=target0.name,
                 draw='step',
                 linecolor=target0.color,
                 label=target0.label,
                 ratio_num=True,
                 ratio_den=False,
                 ratio_partner=target1.name)
        hist.Add(target1.df[feature],
                 name=target1.name,
                 draw='step',
                 linecolor=target1.color,
                 label=target1.label,
                 ratio_num=False,
                 ratio_den=True,
                 ratio_partner=target0.name)

        if self.classification == 'binary':
            # `density=True` replaces the `normed` keyword, which recent
            # NumPy releases no longer accept.
            t0, _ = np.histogram(target0.df[feature], bins=binning, density=True)
            t1, _ = np.histogram(target1.df[feature], bins=binning, density=True)
            separation = util.getSeparation(t0, t1)
            hist.extra_text.Add("Separation = {0:.4f}".format(separation),
                                coords=[0.03, 0.73])

        hist.execute()
        hist.savefig()

    return
def getSeparations(self):
    """Calculate separations between classes for each feature.

    Fills ``self.separations`` as
    ``{feature or 'featx-featy': {'targetA-targetB': separation}}`` and, for
    every pair in ``self.target_pairs``, writes two CSV files (one record
    per line, no header):

      ``<output_dir>/separations1D_<A>-<B>_<date>.csv``  ->  feature,separation
      ``<output_dir>/separations2D_<A>-<B>_<date>.csv``  ->  xfeature,yfeature,separation
    """
    self.separations = dict((k, {}) for k in self.listOfFeatures)
    for featurepairs in self.listOfFeaturePairs:
        self.separations['-'.join(featurepairs)] = {}

    for target in self.target_pairs:
        pair_key = '-'.join(target)
        # look up the target objects by name (IndexError if a name is unknown)
        target_a = [i for i in self.targets if i.name == target[0]][0]
        target_b = [i for i in self.targets if i.name == target[1]][0]

        # One dimensional separations
        saveAs = "{0}/separations1D_{1}-{2}_{3}".format(
            self.output_dir, target_a.name, target_b.name, self.date)
        with open("{0}.csv".format(saveAs), "w") as fcsv:
            for feature in self.listOfFeatures:
                if feature == 'target':
                    continue

                binning = self.variable_labels[feature].binning

                # bin the data to make separation calculation simple
                # (`density` replaces the `normed` keyword removed from NumPy)
                data_a, _ = np.histogram(target_a.df[feature], density=True, bins=binning)
                data_b, _ = np.histogram(target_b.df[feature], density=True, bins=binning)

                separation = util.getSeparation(data_a, data_b)
                self.separations[feature][pair_key] = separation

                # Save separation info to CSV file; the newline keeps one
                # record per line (the original ran all records together)
                fcsv.write("{0},{1}\n".format(feature, separation))

        # Two dimensional separations
        saveAs = "{0}/separations2D_{1}-{2}_{3}".format(
            self.output_dir, target_a.name, target_b.name, self.date)
        with open("{0}.csv".format(saveAs), "w") as fcsv:
            for featurepairs in self.listOfFeaturePairs:
                feature_x = featurepairs[0]
                feature_y = featurepairs[1]

                binning_x = self.variable_labels[feature_x].binning
                binning_y = self.variable_labels[feature_y].binning

                # bin the data to make separation calculation simple
                data_a, _, _ = np.histogram2d(target_a.df[feature_x],
                                              target_a.df[feature_y],
                                              bins=[binning_x, binning_y],
                                              density=True)
                data_b, _, _ = np.histogram2d(target_b.df[feature_x],
                                              target_b.df[feature_y],
                                              bins=[binning_x, binning_y],
                                              density=True)

                separation = util.getSeparation2D(data_a, data_b)
                self.separations['-'.join(featurepairs)][pair_key] = separation

                # Save separation info to CSV file
                fcsv.write("{0},{1},{2}\n".format(feature_x, feature_y, separation))

    return
def prediction(self, train_data={}, test_data={}, i=0):
    """
    Draw the train and test DNN predictions of every target on one plot.

    ``train_data`` / ``test_data`` are looked up by target name.  To save
    on memory, pass this TH1s directly, rather than raw values.
    The plotted contents and the pairwise test separations are also written
    to ``<saveAs>.json`` so the plot can be re-made later.
    """
    self.msg_svc.INFO("DL : Plotting DNN prediction. ")

    binning = [0.1 * bb for bb in range(11)]

    # Make a plot for each target value (e.g., QCD prediction to be QCD; Top prediction to be QCD, etc)
    plot = Histogram1D()

    plot.normed = True  # compare shape differences (likely don't have the same event yield)
    plot.format = self.image_format
    plot.saveAs = "{0}/hist_DNN_prediction_{1}".format(self.output_dir, self.date)
    plot.binning = binning
    plot.stacked = False
    plot.x_label = "Prediction"
    plot.y_label = "A.U."
    plot.CMSlabel = 'outer'
    plot.CMSlabelStatus = self.CMSlabelStatus
    plot.legend['fontsize'] = 18

    plot.ratio.value = "ratio"
    plot.ratio.ylabel = "Train/Test"

    plot.initialize()

    json_data = {}
    for sample in self.targets:
        target_value = sample.target_value  # arrays for multiclassification

        train_key = sample.name + '_train'
        test_key = sample.name + '_test'
        train_t = train_data[sample.name]
        test_t = test_data[sample.name]

        train_style = {"draw_type": "step",
                       "edgecolor": sample.color,
                       "label": sample.label + " Train"}
        test_style = {"draw_type": "stepfilled",
                      "edgecolor": sample.color,
                      "color": sample.color,
                      "linewidth": 0,
                      "alpha": 0.5,
                      "label": sample.label + " Test"}

        plot.Add(train_t, name=train_key, **train_style)  # Training
        plot.Add(test_t, name=test_key, **test_style)     # Testing
        plot.ratio.Add(numerator=train_key, denominator=test_key)

        ## Save data to JSON file
        if isinstance(train_t, ROOT.TH1):
            # ROOT histograms are converted via hpt.hist2list
            d_tr = hpt.hist2list(train_t)
            d_te = hpt.hist2list(test_t)
            train_entry = {"binning": d_tr.bins.tolist(),
                           "content": d_tr.content.tolist()}
            test_entry = {"binning": d_te.bins.tolist(),
                          "content": d_te.content.tolist()}
        else:
            train_entry = {"binning": plot.binning, "content": train_t}
            test_entry = {"binning": plot.binning, "content": test_t}

        json_data[train_key] = train_entry
        json_data[test_key] = test_entry

    # calculate separation between predictions
    for row, pair in enumerate(self.target_pairs):
        content_a = json_data[pair[0] + "_test"]["content"]
        content_b = json_data[pair[1] + "_test"]["content"]

        sep = util.getSeparation(content_a, content_b)
        text = "Test Sep({0},{1}) = {2:.2f}".format(pair[0], pair[1], sep)
        # each pair gets its own line of text, stepping down the plot
        plot.extra_text.Add(text, coords=[0.03, (0.97 - 0.08 * row)])

        json_data['-'.join(pair) + "_test"] = {"separation": sep}

    plot.execute()
    plot.savefig()

    # save results to JSON file (just histogram values & bins) to re-make plots
    json_name = "{0}.json".format(plot.saveAs)
    with open(json_name, 'w') as outfile:
        json.dump(json_data, outfile)

    return
def feature(self,dataframe,ndims=-1):
    """
    Plot the features and record the separations between classes.
      For classification, compare different targets
      For regression, just plot the features

    Steps, in order:
      1. For each entry of ``self.features``: one 1D histogram overlaying
         every class, with a 'significance' ratio per class pair, followed by
         the 1D separations for that feature.
      2. For each entry of ``self.featurePairs``: one 2D histogram per class
         (skipped when ``ndims==1``), followed by the 2D separations.
      3. For each class pair: the separations are written to
         ``<output_dir>/separations1D_<A>-<B>.csv`` and
         ``<output_dir>/separations2D_<A>-<B>.csv`` (header + one record per line).

    @param dataframe    Pandas dataframe with data to be plotted
    @param ndims        Number of dimensions to plot (-1=ALL; 1=1D features only)
                        [always plot 1D features, for now]
    """
    self.msg_svc.DEBUG("DL : Plotting features.")

    self.separations = dict( (k,{}) for k in self.features)
    for featurepairs in self.featurePairs:
        self.separations['-'.join(featurepairs)] = {}

    ## ++ Plot the features for each target
    for feature in self.features:
        vl = self.variable_labels[feature]

        hist = Histogram1D()

        hist.backend = self.backend
        hist.normed = True
        hist.stacked = False
        hist.binning = vl.binning
        hist.x_label = vl.label
        hist.y_label = "A.U." if hist.normed else "Events"
        hist.format = self.image_format
        hist.saveAs = self.output_dir+"/hist_"+feature
        hist.CMSlabel = 'outer'
        hist.CMSlabelStatus = self.CMSlabelStatus
        hist.legend['fontsize'] = 10

        hist.ratio.value = "significance"
        hist.ratio.ylabel = r"S(A/$\sqrt{\text{B}}$)"
        hist.ratio.update_legend = True

        hist.initialize()

        # Draw the distribution for this feature for each NN class
        histValues = {}
        for c in self.classCollection:
            kwargs = {"draw_type":"step","edgecolor":c.color,"label":c.label}

            # Put into histogram before passing to hepPlotter to reduce memory
            h,bx = np.histogram(dataframe[dataframe.target==c.value][feature],bins=vl.binning)
            bin_centers = 0.5*(bx[:-1]+bx[1:])

            hist.Add(bin_centers,weights=h,name=c.name,**kwargs)
            histValues[c.name] = h.copy()

        # Add ratio plot comparing the targets (in pairs) for this feature
        # e.g., feature (QCD) vs feature (QB), etc.
        numerators = {}          # how many ratios each numerator has appeared in so far
        markers = ['o','v','^']  # for ratios with the same numerator, change the marker style
        for pair in self.class_pairs:
            idx = numerators.get(pair[0],0)
            numerators[pair[0]] = idx+1

            num = self.classCollection.get(pair[0])
            den = self.classCollection.get(pair[1])
            # cycle through the markers so a numerator used in more than
            # len(markers) pairs does not raise IndexError
            hist.ratio.Add(numerator=pair[0],denominator=pair[1],draw_type='errorbar',
                           mec='k',mfc=num.color,ecolor=num.color,
                           fmt=markers[idx%len(markers)],
                           label=r"S({0},{1})".format(num.label,den.label))

        hist.execute()
        hist.savefig()

        ## Calculate 1D separations for this feature between classes
        for pair in self.class_pairs:
            data_a = histValues.get(pair[0])
            data_b = histValues.get(pair[1])

            separation = util.getSeparation(data_a,data_b)
            self.separations[feature]['-'.join(pair)] = separation

    ## ++ Plot two features against each other for each target (multi-jet,W,QB,tt_bckg)
    for featurepairs in self.featurePairs:
        xfeature = featurepairs[0]
        yfeature = featurepairs[1]

        xvar = self.variable_labels[xfeature]
        yvar = self.variable_labels[yfeature]
        xbins = xvar.binning
        ybins = yvar.binning

        histValues = {}
        for c in self.classCollection:
            # save memory by making the histogram here and passing the result to hepPlotter
            xdf = dataframe[dataframe.target==c.value][xfeature]
            ydf = dataframe[dataframe.target==c.value][yfeature]
            h,binsx,binsy = np.histogram2d(xdf,ydf,bins=[xbins,ybins])
            histValues[c.name] = h   # h[0] yields the y-axis array for the first bin in x

            if ndims==1:
                continue   # only plot 1D features; still calculate 2D features

            hist = Histogram2D()

            hist.backend = self.backend
            hist.colormap = 'default'
            hist.colorbar['title'] = "Events"

            # binning may be a numpy array (has .tolist) or a plain sequence
            try:
                hist.binning = [xbins.tolist(),ybins.tolist()]
            except AttributeError:
                hist.binning = [xbins,ybins]

            hist.x_label = xvar.label
            hist.y_label = yvar.label
            hist.format = self.image_format
            hist.saveAs = self.output_dir+"/hist2d_"+c.name+"_"+xfeature+"-"+yfeature
            hist.CMSlabel = 'outer'
            hist.CMSlabelStatus = self.CMSlabelStatus
            hist.logplot['data'] = True

            hist.extra_text.Add(c.label,coords=[0.03,0.97])

            hist.initialize()

            # create dummy binning: one (x,y) bin-center pair per cell of `h`,
            # in the same order as h.flatten()
            xcenters = 0.5*(binsx[:-1]+binsx[1:])
            ycenters = 0.5*(binsy[:-1]+binsy[1:])
            xdummy = xcenters.repeat(len(ycenters))
            ydummy = np.tile(ycenters, (1,len(xcenters)) )[0]

            hist.Add([xdummy,ydummy],weights=h.flatten(),name=c.name)

            hist.execute()
            hist.savefig(dpi=100)

        ## Calculate 2D separations for these features between classes
        # NOTE(review): the 2D arrays are passed to the same util.getSeparation
        # used for the 1D case -- confirm it supports 2D input.
        for pair in self.class_pairs:
            data_a = histValues[pair[0]]
            data_b = histValues[pair[1]]

            separation = util.getSeparation(data_a,data_b)
            self.separations['-'.join(featurepairs)]['-'.join(pair)] = separation

    ## ++ Save separation info to CSV file
    #     Storing raw values of separations to plot / analyze later
    for pair in self.class_pairs:
        pair_key = '-'.join(pair)
        saveAs1 = "{0}/separations1D_{1}-{2}".format(self.output_dir,pair[0],pair[1])
        saveAs2 = "{0}/separations2D_{1}-{2}".format(self.output_dir,pair[0],pair[1])

        with open("{0}.csv".format(saveAs1),"w") as fcsv1, \
             open("{0}.csv".format(saveAs2),"w") as fcsv2:
            # headers end with a newline so the first record starts on its own line
            fcsv1.write("feature,separation\n")
            fcsv2.write("xfeature,yfeature,separation\n")

            for f in self.separations.keys():
                separation = self.separations[f][pair_key]
                # keys containing '-' are the feature pairs built above
                if '-' in f:
                    feature_x,feature_y = f.split('-')
                    fcsv2.write("{0},{1},{2}\n".format(feature_x,feature_y,separation))
                else:
                    fcsv1.write("{0},{1}\n".format(f,separation))

    return
def _prediction(self,train_data={},test_data={},c=None):
    """Make the plot for DNN predictions.

    Produces three outputs named after ``c`` (when given):
      - the train/test prediction histogram with a Train/Test ratio,
      - ``<saveAs>.json`` with the binned contents and pairwise test separations,
      - a horizontal bar graph of the test separations per class pair.

    @param train_data   Binned training predictions, indexed either as
                        ``train_data[c.name][class name]`` or
                        ``train_data[class name]``; each entry is indexed
                        with [0] for the contents and [1] for the bin edges
    @param test_data    Same layout as `train_data`, for the test sample
    @param c            Class whose prediction is being plotted (None: no
                        class-specific label / file name suffix)
    """
    target_label = ''
    target_name = ''
    if c is not None:
        target_label = ": {0}".format(self.sample_labels[c.name].label)
        target_name = "_{0}".format(c.name)

    hist = Histogram1D()

    hist.backend = self.backend
    hist.normed = True  # compare shape differences (likely don't have the same event yield)
    hist.format = self.image_format
    hist.saveAs = "{0}/hist_DNN_prediction{1}".format(self.output_dir,target_name)
    hist.stacked = False
    hist.x_label = "Prediction{0}".format(target_label)
    hist.y_label = "A.U."
    hist.CMSlabel = 'outer'
    hist.CMSlabelStatus = self.CMSlabelStatus
    hist.legend['fontsize'] = 10

    hist.ratio.value = "ratio"
    hist.ratio.ylabel = "Train/Test"

    hist.initialize()

    json_data = {}
    for cc in self.classCollection:
        # Access histogram data (for binary or multi-classification).
        # Try the nested layout first and fall back to the flat one; only
        # lookup failures are caught (c is None, missing key, or an entry
        # that cannot be indexed by name) so that e.g. KeyboardInterrupt
        # is no longer swallowed.
        try:
            train_t = train_data[c.name][cc.name]
            test_t = test_data[c.name][cc.name]
        except (AttributeError, LookupError, TypeError):
            train_t = train_data[cc.name]
            test_t = test_data[cc.name]

        train_weights = train_t[0]
        train_bins = train_t[1]
        train_dummy = hpt.midpoints(train_bins)
        train_kwargs = {"draw_type":"step","edgecolor":cc.color,
                        "label":cc.label+" Train"}

        test_weights = test_t[0]
        test_bins = test_t[1]
        test_dummy = hpt.midpoints(test_bins)
        test_kwargs = {"draw_type":"stepfilled","edgecolor":cc.color,
                       "color":cc.color,"linewidth":0,"alpha":0.5,
                       "label":cc.label+" Test"}

        hist.binning = train_bins   # should be the same for train/test
        hist.Add(train_dummy,weights=train_weights,
                 name=cc.name+'_train',**train_kwargs)   # Training
        hist.Add(test_dummy,weights=test_weights,
                 name=cc.name+'_test',**test_kwargs)     # Testing

        hist.ratio.Add(numerator=cc.name+'_train',denominator=cc.name+'_test')

        ## Save data to JSON file
        json_data[cc.name+"_train"] = {"binning":train_bins.tolist(),
                                       "content":train_weights.tolist()}
        json_data[cc.name+"_test"] = {"binning":test_bins.tolist(),
                                      "content":test_weights.tolist()}

    hist.execute()
    hist.savefig()

    # calculate separation between predictions
    separations = OrderedDict()
    for target in self.class_pairs:
        pair_key = '-'.join(target)
        data_a = json_data[ target[0]+"_test" ]["content"]
        data_b = json_data[ target[1]+"_test" ]["content"]

        separation = util.getSeparation(np.asarray(data_a),np.asarray(data_b))
        json_data[ pair_key+"_test" ] = {"separation":separation}
        separations[pair_key] = separation

    # save results to JSON file (just histogram values & bins) to re-make plots
    with open("{0}.json".format(hist.saveAs), 'w') as outfile:
        json.dump(json_data, outfile)

    ## Plot separation between predictions for given target
    saveAs = "{0}/hist_DNN_prediction_sep_{1}".format(self.output_dir,target_name)

    sorted_sep = sorted(separations, key=separations.__getitem__)  # sort data by separation value
    ypos = np.arange(len(sorted_sep))

    # make the bar plot
    fig,ax = plt.subplots()
    ax.barh(ypos, [separations[i] for i in sorted_sep], align='center')
    ax.set_yticks(ypos)

    # label each bar with the sample labels of the pair instead of their names
    yticklabels = []
    for i in sorted_sep:
        split = i.split("-")
        first = self.sample_labels[ split[0] ].label
        second = self.sample_labels[ split[1] ].label
        yticklabels.append( '{0}-{1}'.format(first,second) )

    ax.set_yticklabels(yticklabels,fontsize=12)
    ax.set_xticklabels([self.formatter(i) for i in ax.get_xticks()])
    ax.set_xlabel("Separation",ha='right',va='top',position=(1,0))

    # CMS/COM Energy Label + Signal name
    self.stamp_cms(ax)
    self.stamp_energy(ax)
    ax.text(0.95,0.05,"DNN Prediction{0}".format(target_label),fontsize=16,
            ha='right',va='bottom',transform=ax.transAxes)

    plt.savefig("{0}.{1}".format(saveAs,self.image_format))
    plt.close()