def calculate(self, histo):
    """Return the efficiency of ``self.__name`` with respect to ``self.__norm``.

    The numerator and denominator are the objects returned by
    ``self.__getWeightOneHisto`` for *histo* and the two names; they are
    divided with ``TGraphAsymmErrors.BayesDivide``.

    Returns:
        A tuple ``(efficiency, (error_low, error_high))`` read from the first
        point of the resulting graph.

    Raises:
        StandardError: if the numerator has more entries than the
            denominator, or if the division produces no point.
    """
    from ROOT import TGraphAsymmErrors

    numerator = self.__getWeightOneHisto(histo, self.__name)
    denominator = self.__getWeightOneHisto(histo, self.__norm)

    # An efficiency above one is meaningless, so refuse to compute it.
    if numerator.GetEntries() > denominator.GetEntries():
        message = " comparing '%s' to '%s' in '%s' makes no sense eff > 1!" % (
            self.__name, self.__norm, histo.GetName())
        raise StandardError(message)

    graph = TGraphAsymmErrors(1)
    graph.BayesDivide(numerator, denominator)
    if graph.GetN() < 1:
        message = "Efficiency cannot be calculated '%s' in '%s'" % (
            self.__name, histo.GetName())
        raise StandardError(message)

    efficiency = graph.GetY()[0]
    errors = (graph.GetEYlow()[0], graph.GetEYhigh()[0])
    return (efficiency, errors)
h1.SetBinContent(1, 0) #h1.SetBinError(1,dfx.rc_w2_sq.iloc[i]) h2.SetBinContent(1, w_s) h2.SetBinError(1, 0.008) #np.sqrt(w2_s)) g_tpr = GAE() g_tpr.Divide(h1, h2, "cl=0.683 b(1,1) mode") x_s = Double() y_s = Double() g_tpr.GetPoint(0, x_s, y_s) xl.append(x_s) yl.append(y_s) buffer_l_s = g_tpr.GetEYlow() buffer_l_s.SetSize(1) arr_l_s = np.array(buffer_l_s, copy=True) buffer_h_s = g_tpr.GetEYhigh() buffer_h_s.SetSize(1) arr_h_s = np.array(buffer_h_s, copy=True) hl.append(np.array(arr_h_s)[0]) ll.append(np.array(arr_l_s)[0]) #print arr_h_s #print arr_l_s h1.Delete() h2.Delete()
#print arr_y # if g_size is always equal to g_size_s we can put these loops together for i in range( g_size_s ): g_tpr.GetPoint(i,x_s,y_s) arr_x_s[i] = x_s arr_y_s[i] = y_s # GetEYhigh() work as the following 3 ways(presumably the 'copy' version works most consistently): #----------------------------------------------V1 #buffer_l = g_efficiency.GetEYlow() #arr_l = np.ndarray(g_size, 'f', buffer_l) #----------------------------------------------V2 #buffer_h = g_efficiency.GetEYhigh() #arr_h = np.frombuffer(buffer_h, count=g_size) #----------------------------------------------V3 buffer_l = g_efficiency.GetEYlow() buffer_l.SetSize(g_size) arr_l = np.array(buffer_l, copy=True) buffer_h = g_efficiency.GetEYhigh() buffer_h.SetSize(g_size) arr_h = np.array(buffer_h, copy=True) #print arr_h #print arr_l buffer_l_s = g_tpr.GetEYlow() buffer_l_s.SetSize(g_size_s) arr_l_s = np.array(buffer_l_s, copy=True) buffer_h_s = g_tpr.GetEYhigh() buffer_h_s.SetSize(g_size_s)
def SetBin(load, n_bins, bin_list):
    """Compute, per bin, the weighted fraction of events at or above that bin.

    The events in *load* are grouped by ``pd.cut(load['signal'], bin_list)``.
    For bin ``i`` the numerator is the sum of weights in bins ``i`` and above
    (a reverse cumulative sum) and the denominator is the sum of weights over
    all bins. Each ratio is evaluated with ``GAE.Divide`` on a pair of
    one-bin histograms.

    Args:
        load: DataFrame with at least the columns ``'signal'`` and
            ``'weight'``. It is not modified.
        n_bins: number of bins to process; ``bin_list`` must hold at least
            ``n_bins + 1`` edges.
        bin_list: bin edges, passed to ``pd.cut`` and used as histogram limits.

    Returns:
        dict with the keys ``'x'``, ``'y'`` (graph point per bin), ``'eh'``
        and ``'el'`` (high / low y-error per bin); every value is a list of
        length ``n_bins``.
    """
    df = load.copy()
    df['w2'] = df['weight'].apply(np.square)

    # Per-bin sum of weights and sum of *squared* weights. The statistical
    # error of a weighted sum is sqrt(sum(w**2)); squaring the per-bin weight
    # sum instead would disagree with the denominator error computed below.
    per_bin = df.groupby(pd.cut(df['signal'], bin_list))[['weight', 'w2']].sum()
    per_bin = per_bin.fillna(0)

    # Reverse cumulative sums: entry i covers bins i .. last.
    per_bin['recum'] = per_bin['weight'].iloc[::-1].cumsum().iloc[::-1]
    per_bin['rc_w2_sq'] = np.sqrt(per_bin['w2'].iloc[::-1].cumsum().iloc[::-1])

    w_s = per_bin['weight'].sum()
    # NOTE(review): this error runs over every row of the input, while w_s
    # only counts rows that fell into a bin -- confirm this is intended.
    sqr_s_w2 = np.sqrt(df['w2'].sum())

    xl, yl, hl, ll = [], [], [], []
    for i in xrange(n_bins):
        h_post = TH1F('h_post', 'h_post', 1, bin_list[i], bin_list[i + 1])
        h_pre = TH1F('h_pre', 'h_pre', 1, bin_list[i], bin_list[i + 1])
        h_post.SetBinContent(1, per_bin['recum'].iloc[i])
        h_post.SetBinError(1, per_bin['rc_w2_sq'].iloc[i])
        h_pre.SetBinContent(1, w_s)
        h_pre.SetBinError(1, sqr_s_w2)

        g_eff = GAE()
        g_eff.Divide(h_post, h_pre, "cl=0.683 b(1,1) mode")

        # Fresh Double objects per iteration so each list entry is distinct.
        x = Double()
        y = Double()
        g_eff.GetPoint(0, x, y)
        xl.append(x)
        yl.append(y)

        # Copy the error buffers before the graph goes away.
        buffer_l = g_eff.GetEYlow()
        buffer_l.SetSize(1)
        arr_l = np.array(buffer_l, copy=True)
        buffer_h = g_eff.GetEYhigh()
        buffer_h.SetSize(1)
        arr_h = np.array(buffer_h, copy=True)
        hl.append(arr_h[0])
        ll.append(arr_l[0])

        # The histogram names are reused in the next iteration.
        h_post.Delete()
        h_pre.Delete()

    return {'x': xl, 'y': yl, 'eh': hl, 'el': ll}
def CutBaseROC(df_sg, df_bg, df_pos_sgn, df_pos_bkg): h_cut_pre_tpr = TH1F('h_cut_pre_tpr', 'hist_cut_pre_tpr', 1, bin_i, bin_f) h_cut_pos_tpr = TH1F('h_cut_pos_tpr', 'hist_cut_pos_tpr', 1, bin_i, bin_f) h_cut_pre_fpr = TH1F('h_cut_pre_fpr', 'hist_cut_pre_fpr', 1, bin_i, bin_f) h_cut_pos_fpr = TH1F('h_cut_pos_fpr', 'hist_cut_pos_fpr', 1, bin_i, bin_f) rnp.fill_hist(h_cut_pre_tpr, df_sg, df_sg) rnp.fill_hist(h_cut_pos_tpr, df_pos_sgn, df_pos_sgn) rnp.fill_hist(h_cut_pre_fpr, df_bg, df_bg) rnp.fill_hist(h_cut_pos_fpr, df_pos_bkg, df_pos_bkg) g_cut_tpr = GAE() g_cut_fpr = GAE() g_cut_tpr.Divide(h_cut_pos_tpr, h_cut_pre_tpr, "cl=0.683 b(1,1) mode") g_cut_fpr.Divide(h_cut_pos_fpr, h_cut_pre_fpr, "cl=0.683 b(1,1) mode") g_size_cut = 1 x = Double() y = Double() x_s = Double() y_s = Double() arr_x = np.zeros(g_size_cut) arr_y = np.zeros(g_size_cut) arr_x_s = np.zeros(g_size_cut) arr_y_s = np.zeros(g_size_cut) for i in xrange(g_size_cut): g_cut_fpr.GetPoint(i, x, y) arr_x[i] = x arr_y[i] = y g_cut_tpr.GetPoint(i, x_s, y_s) arr_x_s[i] = x_s arr_y_s[i] = y_s buffer_l = g_cut_fpr.GetEYlow() buffer_l.SetSize(g_size_cut) arr_l = np.array(buffer_l, copy=True) buffer_h = g_cut_fpr.GetEYhigh() buffer_h.SetSize(g_size_cut) arr_h = np.array(buffer_h, copy=True) buffer_l_s = g_cut_tpr.GetEYlow() buffer_l_s.SetSize(g_size_cut) arr_l_s = np.array(buffer_l_s, copy=True) buffer_h_s = g_cut_tpr.GetEYhigh() buffer_h_s.SetSize(g_size_cut) arr_h_s = np.array(buffer_h_s, copy=True) print len(arr_h) print len(arr_l) print 'TPR: ', arr_y_s print 'FPR: ', arr_y print arr_l_s print arr_l print arr_h_s print arr_h out_dict = {} out_dict['tpr'] = arr_y_s[0] out_dict['fpr'] = arr_y[0] out_dict['tpr_e_l'] = arr_l_s[0] out_dict['fpr_e_l'] = arr_l[0] out_dict['tpr_e_h'] = arr_h_s[0] out_dict['fpr_e_h'] = arr_h[0] return out_dict
def ROC_GEN(load_s, load_b): n_scan = 10000000 n_bins = 300 #100 bin_i = 0 bin_f = 1 draw = 1 param = {} param['n_scan'] = n_scan param['n_bins'] = n_bins param['bin_i'] = bin_i param['bin_f'] = bin_f print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Loading data...' #load_s = joblib.load(pth+'/dumps/s.pkl') #load_b = joblib.load(pth+'/dumps/b.pkl') print len(load_b) # Calculate the bin vector: bin_width = float(bin_f - bin_i) / n_bins bin_value_dict = {} for j in xrange(n_bins): bin_value_dict[j] = bin_i + (0.5 + j) * bin_width #print bin_value_dict # if you want to use the GetCumulative() of a histogram and set its error: do not call the Sumw2() of it h_after_selection = TH1F('h_after_selection', 'hist_after_selection', n_bins, bin_i, bin_f) #h_after_selection.Sumw2() h_before_selection = TH1F('h_before_selection', 'hist_before_selection', n_bins, bin_i, bin_f) #h_before_selection.Sumw2() h_true_positive = TH1F('h_true_positive', 'True_Positives', n_bins, bin_i, bin_f) h_true = TH1F('h_true', 'Trues', n_bins, bin_i, bin_f) # see if reversing the bin_min and bin_max will cause the histogram axis to reverse(no) #h_true = TH1F('h_true' ,'Trues' , n_bins, bin_f, bin_i) h_c_b = TH1F('h_c_b', 'hist_after_selection_cum_rev', n_bins, bin_i, bin_f) #h_c_b.Sumw2() h_c_s = TH1F('h_c_s', 'hist_true_positives_cum_rev', n_bins, bin_i, bin_f) #h_c_s.Sumw2() #################################### #################################### print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Filling histogram...' 
zeitA = time() df_b = load_b.copy()[:n_scan] df_b['bin'] = df_b['signal'] #df_s = load_s.copy() #df_s['bin'] = df_s['signal'] df_list_pre_sel = [] df_list_pos_sel = [] def Appnd(tupl_in): df_list_pre_sel.append(tupl_in[0]) df_list_pos_sel.append(tupl_in[1]) def DataFrameExpand(k): mask_k_b = (bin_value_dict[k] - 0.5 * bin_width <= df_b['bin']) & ( bin_value_dict[k] + 0.5 * bin_width > df_b['bin']) df_b['bin'][mask_k_b] = k #mask_k_s = ( bin_value_dict[k] - 0.5*bin_width <= df_s['bin'] ) & ( bin_value_dict[k] + 0.5*bin_width > df_s['bin'] ) #df_s['bin'][mask_k_s] = k mask_k = df_b.bin == k df_b_w_k = df_b.weight[mask_k] df_list_k_pre_sel = [] df_list_k_pos_sel = [] for kk in xrange(n_bins): df_tmp_kk_pre_sel = pd.DataFrame() df_tmp_kk_pre_sel['weight'] = df_b_w_k df_tmp_kk_pre_sel['bin'] = bin_value_dict[kk] df_list_k_pre_sel.append(df_tmp_kk_pre_sel) if kk > k: continue df_tmp_kk_pos_sel = pd.DataFrame() df_tmp_kk_pos_sel['weight'] = df_b_w_k df_tmp_kk_pos_sel['bin'] = bin_value_dict[kk] df_list_k_pos_sel.append(df_tmp_kk_pos_sel) df_tmp_k_pre_sel = pd.concat(df_list_k_pre_sel) df_tmp_k_pos_sel = pd.concat(df_list_k_pos_sel) return df_tmp_k_pre_sel, df_tmp_k_pos_sel # Parallelization: ''' pool_dfe = mp.Pool() for i in xrange(n_bins): pool_dfe.apply_async(DataFrameExpand, args=(i, ), callback=Appnd) pool_dfe.close() pool_dfe.join() ''' for ii in xrange(n_bins): callback = DataFrameExpand(ii) Appnd(callback) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~alternatives: #process = mp.Process(target=F, args=(k,)) #manager = mp.Manager() #pool.map(FT,L) df_before_selection = pd.concat(df_list_pre_sel) df_after_selection = pd.concat(df_list_pos_sel) rnp.fill_hist(h_before_selection, df_before_selection.bin, df_before_selection.weight) rnp.fill_hist(h_c_b, df_after_selection.bin, df_after_selection.weight) zeitB = time() print 'Time taken for filling histogram(for #events: ' + str( n_scan) + '): ', str(zeitB - zeitA) #################################### 
#################################### print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Filling histogram (for tpr)...' zeitA = time() for index, row in load_s.iterrows(): tmp_weight = row['weight'] tmp_signal = row['signal'] for k in xrange(n_bins): h_true.Fill(bin_value_dict[k], tmp_weight) if bin_value_dict[ k] - 0.5 * bin_width <= tmp_signal and bin_value_dict[ k] + 0.5 * bin_width > tmp_signal: for kk in xrange(k): h_c_s.Fill(bin_value_dict[kk], tmp_weight) g_fpr = GAE() g_tpr = GAE() g_fpr.Divide(h_c_b, h_before_selection, "cl=0.683 b(1,1) mode") g_tpr.Divide(h_c_s, h_true, "cl=0.683 b(1,1) mode") g_size = g_fpr.GetN() x = Double() y = Double() x_s = Double() y_s = Double() arr_x = np.zeros(g_size) arr_y = np.zeros(g_size) arr_x_s = np.zeros(g_size) arr_y_s = np.zeros(g_size) for i in xrange(g_size): g_fpr.GetPoint(i, x, y) arr_x[i] = x arr_y[i] = y g_tpr.GetPoint(i, x_s, y_s) arr_x_s[i] = x_s arr_y_s[i] = y_s buffer_l = g_fpr.GetEYlow() buffer_l.SetSize(g_size) arr_l = np.array(buffer_l, copy=True) buffer_h = g_fpr.GetEYhigh() buffer_h.SetSize(g_size) arr_h = np.array(buffer_h, copy=True) #print arr_h #print arr_l buffer_l_s = g_tpr.GetEYlow() buffer_l_s.SetSize(g_size) arr_l_s = np.array(buffer_l_s, copy=True) buffer_h_s = g_tpr.GetEYhigh() buffer_h_s.SetSize(g_size) arr_h_s = np.array(buffer_h_s, copy=True) print len(arr_h) print len(arr_l) ####################### # Calculate AOC # ####################### ''' x = np.array(arr_y_s) y = np.array(arr_y) exl = np.array(arr_l_s) eyl = np.array(arr_l) exh = np.array(arr_h_s) eyh = np.array(arr_h) ''' ####################### # Export ROC Position # ####################### roc_dict = {} roc_dict['param'] = param roc_dict['tpr'] = np.array(arr_y_s) roc_dict['fpr'] = np.array(arr_y) roc_dict['e_tpr_l'] = np.array(arr_l_s) roc_dict['e_fpr_l'] = np.array(arr_l) roc_dict['e_tpr_h'] = np.array(arr_h_s) roc_dict['e_fpr_h'] = np.array(arr_h) roc_dict['threshold'] = bin_value_dict roc_dict['cut_based'] = {} 
#roc_dict['cut_based']['lc'] = LC_dict #roc_dict['cut_based']['hc'] = HC_dict #roc_dict['aoc'] = aoc #roc_dict['aoc_l'] = aoc_l #roc_dict['aoc_h'] = aoc_h #raw_data = {} #raw_data['load_s'] = load_s #raw_data['load_b'] = load_b #roc_dict['raw'] = raw_data ''' path_dump = '/beegfs/desy/user/hezhiyua/2bBacked/roc_data/' name_dump = 'roc.pkl' joblib.dump(roc_dict, path_dump+name_dump) ''' return roc_dict
#print arr_y # if g_size is always equal to g_size_s we can put these loops together #for i in xrange( g_size_s ): # g_tpr.GetPoint(i,x_s,y_s) # arr_x_s[i] = x_s # arr_y_s[i] = y_s # GetEYhigh() work as the following 3 ways(presumably the 'copy' version works most consistently): #----------------------------------------------V1 #buffer_l = g_efficiency.GetEYlow() #arr_l = np.ndarray(g_size, 'f', buffer_l) #----------------------------------------------V2 #buffer_h = g_efficiency.GetEYhigh() #arr_h = np.frombuffer(buffer_h, count=g_size) #----------------------------------------------V3 buffer_l = g_efficiency.GetEYlow() buffer_l.SetSize(g_size) arr_l = np.array(buffer_l, copy=True) buffer_h = g_efficiency.GetEYhigh() buffer_h.SetSize(g_size) arr_h = np.array(buffer_h, copy=True) #print arr_h #print arr_l """ buffer_l_s = g_tpr.GetEYlow() buffer_l_s.SetSize(g_size_s) arr_l_s = np.array(buffer_l_s, copy=True) buffer_h_s = g_tpr.GetEYhigh() buffer_h_s.SetSize(g_size_s)
def CutBaseBenchmarkNew(df_test_orig, inDict, JetPrfx_bkg, refAttr='pt', isSigAttrStr='is_signal', weightAttrStr='weight'): refAttrLabel = JetPrfx_bkg + refAttr tt = df_test_orig.copy() sg = tt[isSigAttrStr] == 1 bg = tt[isSigAttrStr] == 0 BA_l = {} #pick out events that satisfiy the cut for iAttr, iList in inDict.iteritems(): iAttr = 'J1' + iAttr if iList[0] == '<': BA_l[iAttr] = tt[JetPrfx_bkg + iAttr] < iList[1] elif iList[0] == '>': BA_l[iAttr] = tt[JetPrfx_bkg + iAttr] > iList[1] pos = tt[refAttrLabel] pos_sgn = tt[weightAttrStr] pos_bkg = tt[weightAttrStr] n_pos = tt[weightAttrStr] for iAttr, iList in inDict.iteritems(): iAttr = 'J1' + iAttr pos = pos[BA_l[iAttr]] #events that pass the selection(all the cuts) pos_sgn = pos_sgn[ BA_l[iAttr]] #signal events that pass the selection(all the cuts) pos_bkg = pos_bkg[BA_l[ iAttr]] #background events that pass the selection(all the cuts) n_pos = n_pos[BA_l[iAttr]] #see below pos_sgn = pos_sgn[sg] pos_bkg = pos_bkg[bg] n_pos = float(n_pos.sum()) #sum up the weights n_sgn = float(tt[weightAttrStr][sg].sum()) #sum of weights from all signal n_bkg = float( tt[weightAttrStr][bg].sum()) #sum of weights from all background n_pos_sgn = float(pos_sgn.sum( )) #sum of weights of signal events that pass the selection n_pos_bkg = float(pos_bkg.sum( )) #sum of weights of background events that pass the selection sgn_eff = np.divide(n_pos_sgn, n_sgn) fls_eff = np.divide(n_pos_bkg, n_bkg) print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Benchmark:' print 'num of total test events: ', tt[refAttrLabel].count() print "num of signals : ", n_sgn print 'num of background : ', n_bkg print "num of pos events : ", n_pos print "num of pos bkg : ", n_pos_bkg print "num of pos sgn : ", n_pos_sgn print "true positive rate : ", sgn_eff print "false positive rate : ", fls_eff #return tt[weightAttrStr][sg], tt[weightAttrStr][bg], pos_sgn, pos_bkg#sgn_eff, fls_eff df_sg = tt[weightAttrStr][sg] df_bg = tt[weightAttrStr][bg] df_pos_sgn = pos_sgn 
df_pos_bkg = pos_bkg bin_i = 0 bin_f = 1 #def CutBaseROC(df_sg, df_bg, df_pos_sgn, df_pos_bkg): h_cut_pre_tpr = TH1F('h_cut_pre_tpr', 'hist_cut_pre_tpr', 1, bin_i, bin_f) h_cut_pos_tpr = TH1F('h_cut_pos_tpr', 'hist_cut_pos_tpr', 1, bin_i, bin_f) h_cut_pre_fpr = TH1F('h_cut_pre_fpr', 'hist_cut_pre_fpr', 1, bin_i, bin_f) h_cut_pos_fpr = TH1F('h_cut_pos_fpr', 'hist_cut_pos_fpr', 1, bin_i, bin_f) root_numpy.fill_hist(h_cut_pre_tpr, df_sg, df_sg) root_numpy.fill_hist(h_cut_pos_tpr, df_pos_sgn, df_pos_sgn) root_numpy.fill_hist(h_cut_pre_fpr, df_bg, df_bg) root_numpy.fill_hist(h_cut_pos_fpr, df_pos_bkg, df_pos_bkg) g_cut_tpr = GAE() g_cut_fpr = GAE() g_cut_tpr.Divide(h_cut_pos_tpr, h_cut_pre_tpr, "cl=0.683 b(1,1) mode") g_cut_fpr.Divide(h_cut_pos_fpr, h_cut_pre_fpr, "cl=0.683 b(1,1) mode") g_size_cut = 1 x = Double() y = Double() x_s = Double() y_s = Double() arr_x = np.zeros(g_size_cut) arr_y = np.zeros(g_size_cut) arr_x_s = np.zeros(g_size_cut) arr_y_s = np.zeros(g_size_cut) for i in xrange(g_size_cut): g_cut_fpr.GetPoint(i, x, y) arr_x[i] = x arr_y[i] = y g_cut_tpr.GetPoint(i, x_s, y_s) arr_x_s[i] = x_s arr_y_s[i] = y_s buffer_l = g_cut_fpr.GetEYlow() buffer_l.SetSize(g_size_cut) arr_l = np.array(buffer_l, copy=True) buffer_h = g_cut_fpr.GetEYhigh() buffer_h.SetSize(g_size_cut) arr_h = np.array(buffer_h, copy=True) buffer_l_s = g_cut_tpr.GetEYlow() buffer_l_s.SetSize(g_size_cut) arr_l_s = np.array(buffer_l_s, copy=True) buffer_h_s = g_cut_tpr.GetEYhigh() buffer_h_s.SetSize(g_size_cut) arr_h_s = np.array(buffer_h_s, copy=True) print len(arr_h) print len(arr_l) print 'TPR: ', arr_y_s print 'FPR: ', arr_y print arr_l_s print arr_l print arr_h_s print arr_h out_dict = {} out_dict['tpr'] = arr_y_s[0] out_dict['fpr'] = arr_y[0] out_dict['tpr_e_l'] = arr_l_s[0] out_dict['fpr_e_l'] = arr_l[0] out_dict['tpr_e_h'] = arr_h_s[0] out_dict['fpr_e_h'] = arr_h[0] return out_dict