Esempio n. 1
0
    def calculate(self, histo):
        """Compute the efficiency of ``self.__name`` relative to ``self.__norm``.

        Numerator and denominator are obtained from ``histo`` through
        ``self.__getWeightOneHisto`` and divided with
        ``TGraphAsymmErrors.BayesDivide``.

        Args:
            histo: Object handed to ``__getWeightOneHisto``; here only its
                ``GetName()`` is used directly (for the error messages).

        Returns:
            A tuple ``(efficiency, (error_low, error_high))`` taken from the
            first point of the resulting graph.

        Raises:
            StandardError: If the numerator has more entries than the
                denominator, or if the division produces no point.
        """
        # Kept as a function-scope import, as in the original.
        from ROOT import TGraphAsymmErrors

        frac = self.__getWeightOneHisto(histo, self.__name)
        total = self.__getWeightOneHisto(histo, self.__norm)

        # More entries in the numerator than in the denominator would mean an
        # efficiency above one, so this combination is rejected outright.
        if frac.GetEntries() > total.GetEntries():
            raise StandardError(
                " comparing '%s' to '%s' in '%s' makes no sense eff > 1!"
                % (self.__name, self.__norm, histo.GetName()))

        eff = TGraphAsymmErrors(1)
        eff.BayesDivide(frac, total)
        # The point is read by index below, so an empty graph must be caught.
        if eff.GetN() < 1:
            raise StandardError(
                "Efficiency cannot be calculated '%s' in '%s'"
                % (self.__name, histo.GetName()))
        return (eff.GetY()[0], (eff.GetEYlow()[0], eff.GetEYhigh()[0]))
Esempio n. 2
0
        g_tpr = GAE()
        g_tpr.Divide(h1, h2, "cl=0.683 b(1,1) mode")

        x_s = Double()
        y_s = Double()

        g_tpr.GetPoint(0, x_s, y_s)

        xl.append(x_s)
        yl.append(y_s)

        buffer_l_s = g_tpr.GetEYlow()
        buffer_l_s.SetSize(1)
        arr_l_s = np.array(buffer_l_s, copy=True)

        buffer_h_s = g_tpr.GetEYhigh()
        buffer_h_s.SetSize(1)
        arr_h_s = np.array(buffer_h_s, copy=True)

        hl.append(np.array(arr_h_s)[0])
        ll.append(np.array(arr_l_s)[0])
        #print arr_h_s
        #print arr_l_s

        h1.Delete()
        h2.Delete()

    print xl
    print yl
    print hl
    print ll
Esempio n. 3
0
    arr_x_s[i] = x_s 
    arr_y_s[i] = y_s

# The buffers returned by GetEYlow()/GetEYhigh() can be turned into numpy arrays
# in the following 3 ways (presumably the 'copy' version, V3, works most consistently):
#----------------------------------------------V1
#buffer_l = g_efficiency.GetEYlow()
#arr_l    = np.ndarray(g_size, 'f', buffer_l)
#----------------------------------------------V2
#buffer_h   = g_efficiency.GetEYhigh()
#arr_h      = np.frombuffer(buffer_h, count=g_size)
#----------------------------------------------V3
# Lower y errors of g_efficiency: size the buffer to g_size entries, then copy
# it so the numpy array owns its data.
buffer_l   = g_efficiency.GetEYlow()
buffer_l.SetSize(g_size)
arr_l      = np.array(buffer_l, copy=True)

# Upper y errors of g_efficiency, extracted the same way.
buffer_h   = g_efficiency.GetEYhigh()
buffer_h.SetSize(g_size)
arr_h      = np.array(buffer_h, copy=True)
#print arr_h
#print arr_l

# Lower y errors of g_tpr; note this graph is sized with g_size_s, not g_size.
buffer_l_s   = g_tpr.GetEYlow()
buffer_l_s.SetSize(g_size_s)
arr_l_s      = np.array(buffer_l_s, copy=True)

# Upper y errors of g_tpr.
buffer_h_s   = g_tpr.GetEYhigh()
buffer_h_s.SetSize(g_size_s)
arr_h_s      = np.array(buffer_h_s, copy=True)
# Print the number of extracted g_efficiency error entries (upper, then lower).
print len(arr_h)
print len(arr_l)
Esempio n. 4
0
def SetBin(load, n_bins, bin_list):
    """Compute per-threshold selection efficiencies with asymmetric errors.

    Events are grouped into the bins defined by ``bin_list``. For bin ``i``
    the numerator is the summed weight of all events in bin ``i`` or any later
    bin, and the denominator is the summed weight of all binned events. Each
    ratio is evaluated with ``GAE.Divide`` on a pair of single-bin histograms
    using the option string ``"cl=0.683 b(1,1) mode"``.

    Args:
        load: DataFrame with at least the columns ``'signal'`` (the quantity
            that is binned) and ``'weight'``. It is copied, not modified.
        n_bins: Number of bins to evaluate; ``bin_list`` must contain at
            least ``n_bins + 1`` edges.
        bin_list: Bin edges, passed to ``pandas.cut`` and used as the
            histogram limits.

    Returns:
        dict with the lists ``'x'`` and ``'y'`` (graph point coordinates),
        ``'eh'`` (upper y error) and ``'el'`` (lower y error), each holding
        one entry per evaluated bin.
    """
    df = load.copy()
    # The uncertainty of a weighted sum is sqrt(sum of w**2), so the weights
    # are squared per event *before* grouping (not squared after summing).
    df['w2'] = df['weight'].apply(np.square)

    per_bin = df.groupby(pd.cut(df['signal'], bin_list)).sum()
    df1 = per_bin[['weight', 'w2']].copy()
    df1.fillna(0, inplace=True)

    # Reverse cumulative sums: row i holds the total over bins i, i+1, ...
    # Reversing twice restores the original bin order.
    df1['recum'] = df1['weight'].iloc[::-1].cumsum().iloc[::-1]
    df1['rc_w2_sq'] = df1['w2'].iloc[::-1].cumsum().iloc[::-1].apply(np.sqrt)

    # Denominator content and error are taken from the same (binned) events.
    w_s = df1['weight'].sum()
    sqr_s_w2 = np.sqrt(df1['w2'].sum())

    xl, yl, hl, ll = [], [], [], []
    for i in xrange(n_bins):
        h_post = TH1F('h_post', 'h_post', 1, bin_list[i], bin_list[i + 1])
        h_pre = TH1F('h_pre', 'h_pre', 1, bin_list[i], bin_list[i + 1])
        h_post.SetBinContent(1, df1.recum.iloc[i])
        h_post.SetBinError(1, df1.rc_w2_sq.iloc[i])
        h_pre.SetBinContent(1, w_s)
        h_pre.SetBinError(1, sqr_s_w2)

        g_eff = GAE()
        g_eff.Divide(h_post, h_pre, "cl=0.683 b(1,1) mode")

        # Fresh objects per iteration, so the stored values are not aliased.
        x = Double()
        y = Double()
        g_eff.GetPoint(0, x, y)
        xl.append(x)
        yl.append(y)

        # Size each error buffer to the single point and copy it, so the
        # values stay valid after the graph is gone.
        buffer_l = g_eff.GetEYlow()
        buffer_l.SetSize(1)
        arr_l = np.array(buffer_l, copy=True)

        buffer_h = g_eff.GetEYhigh()
        buffer_h.SetSize(1)
        arr_h = np.array(buffer_h, copy=True)

        hl.append(arr_h[0])
        ll.append(arr_l[0])

        # The histogram names are reused on every iteration.
        h_post.Delete()
        h_pre.Delete()

    return {'x': xl, 'y': yl, 'eh': hl, 'el': ll}
Esempio n. 5
0
def CutBaseROC(df_sg, df_bg, df_pos_sgn, df_pos_bkg):

    h_cut_pre_tpr = TH1F('h_cut_pre_tpr', 'hist_cut_pre_tpr', 1, bin_i, bin_f)
    h_cut_pos_tpr = TH1F('h_cut_pos_tpr', 'hist_cut_pos_tpr', 1, bin_i, bin_f)

    h_cut_pre_fpr = TH1F('h_cut_pre_fpr', 'hist_cut_pre_fpr', 1, bin_i, bin_f)
    h_cut_pos_fpr = TH1F('h_cut_pos_fpr', 'hist_cut_pos_fpr', 1, bin_i, bin_f)

    rnp.fill_hist(h_cut_pre_tpr, df_sg, df_sg)
    rnp.fill_hist(h_cut_pos_tpr, df_pos_sgn, df_pos_sgn)

    rnp.fill_hist(h_cut_pre_fpr, df_bg, df_bg)
    rnp.fill_hist(h_cut_pos_fpr, df_pos_bkg, df_pos_bkg)

    g_cut_tpr = GAE()
    g_cut_fpr = GAE()
    g_cut_tpr.Divide(h_cut_pos_tpr, h_cut_pre_tpr, "cl=0.683 b(1,1) mode")
    g_cut_fpr.Divide(h_cut_pos_fpr, h_cut_pre_fpr, "cl=0.683 b(1,1) mode")

    g_size_cut = 1

    x = Double()
    y = Double()
    x_s = Double()
    y_s = Double()

    arr_x = np.zeros(g_size_cut)
    arr_y = np.zeros(g_size_cut)
    arr_x_s = np.zeros(g_size_cut)
    arr_y_s = np.zeros(g_size_cut)

    for i in xrange(g_size_cut):
        g_cut_fpr.GetPoint(i, x, y)
        arr_x[i] = x
        arr_y[i] = y

        g_cut_tpr.GetPoint(i, x_s, y_s)
        arr_x_s[i] = x_s
        arr_y_s[i] = y_s

    buffer_l = g_cut_fpr.GetEYlow()
    buffer_l.SetSize(g_size_cut)
    arr_l = np.array(buffer_l, copy=True)

    buffer_h = g_cut_fpr.GetEYhigh()
    buffer_h.SetSize(g_size_cut)
    arr_h = np.array(buffer_h, copy=True)

    buffer_l_s = g_cut_tpr.GetEYlow()
    buffer_l_s.SetSize(g_size_cut)
    arr_l_s = np.array(buffer_l_s, copy=True)

    buffer_h_s = g_cut_tpr.GetEYhigh()
    buffer_h_s.SetSize(g_size_cut)
    arr_h_s = np.array(buffer_h_s, copy=True)
    print len(arr_h)
    print len(arr_l)

    print 'TPR: ', arr_y_s
    print 'FPR: ', arr_y
    print arr_l_s
    print arr_l
    print arr_h_s
    print arr_h

    out_dict = {}
    out_dict['tpr'] = arr_y_s[0]
    out_dict['fpr'] = arr_y[0]

    out_dict['tpr_e_l'] = arr_l_s[0]
    out_dict['fpr_e_l'] = arr_l[0]
    out_dict['tpr_e_h'] = arr_h_s[0]
    out_dict['fpr_e_h'] = arr_h[0]

    return out_dict
Esempio n. 6
0
def ROC_GEN(load_s, load_b):
    n_scan = 10000000
    n_bins = 300  #100
    bin_i = 0
    bin_f = 1
    draw = 1

    param = {}
    param['n_scan'] = n_scan
    param['n_bins'] = n_bins
    param['bin_i'] = bin_i
    param['bin_f'] = bin_f
    print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Loading data...'
    #load_s = joblib.load(pth+'/dumps/s.pkl')
    #load_b = joblib.load(pth+'/dumps/b.pkl')

    print len(load_b)

    # Calculate the bin vector:
    bin_width = float(bin_f - bin_i) / n_bins
    bin_value_dict = {}
    for j in xrange(n_bins):
        bin_value_dict[j] = bin_i + (0.5 + j) * bin_width
    #print bin_value_dict

    # if you want to use the GetCumulative() of a histogram and set its error: do not call the Sumw2() of it
    h_after_selection = TH1F('h_after_selection', 'hist_after_selection',
                             n_bins, bin_i, bin_f)
    #h_after_selection.Sumw2()
    h_before_selection = TH1F('h_before_selection', 'hist_before_selection',
                              n_bins, bin_i, bin_f)
    #h_before_selection.Sumw2()

    h_true_positive = TH1F('h_true_positive', 'True_Positives', n_bins, bin_i,
                           bin_f)
    h_true = TH1F('h_true', 'Trues', n_bins, bin_i, bin_f)
    # see if reversing the bin_min and bin_max will cause the histogram axis to reverse(no)
    #h_true          = TH1F('h_true'          ,'Trues'          , n_bins, bin_f, bin_i)

    h_c_b = TH1F('h_c_b', 'hist_after_selection_cum_rev', n_bins, bin_i, bin_f)
    #h_c_b.Sumw2()
    h_c_s = TH1F('h_c_s', 'hist_true_positives_cum_rev', n_bins, bin_i, bin_f)
    #h_c_s.Sumw2()

    ####################################
    ####################################

    print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Filling histogram...'
    zeitA = time()

    df_b = load_b.copy()[:n_scan]
    df_b['bin'] = df_b['signal']
    #df_s        = load_s.copy()
    #df_s['bin'] = df_s['signal']

    df_list_pre_sel = []
    df_list_pos_sel = []

    def Appnd(tupl_in):
        df_list_pre_sel.append(tupl_in[0])
        df_list_pos_sel.append(tupl_in[1])

    def DataFrameExpand(k):
        mask_k_b = (bin_value_dict[k] - 0.5 * bin_width <= df_b['bin']) & (
            bin_value_dict[k] + 0.5 * bin_width > df_b['bin'])
        df_b['bin'][mask_k_b] = k
        #mask_k_s = ( bin_value_dict[k] - 0.5*bin_width <= df_s['bin'] ) & ( bin_value_dict[k] + 0.5*bin_width > df_s['bin'] )
        #df_s['bin'][mask_k_s] = k

        mask_k = df_b.bin == k
        df_b_w_k = df_b.weight[mask_k]

        df_list_k_pre_sel = []
        df_list_k_pos_sel = []
        for kk in xrange(n_bins):
            df_tmp_kk_pre_sel = pd.DataFrame()
            df_tmp_kk_pre_sel['weight'] = df_b_w_k
            df_tmp_kk_pre_sel['bin'] = bin_value_dict[kk]
            df_list_k_pre_sel.append(df_tmp_kk_pre_sel)
            if kk > k: continue
            df_tmp_kk_pos_sel = pd.DataFrame()
            df_tmp_kk_pos_sel['weight'] = df_b_w_k
            df_tmp_kk_pos_sel['bin'] = bin_value_dict[kk]
            df_list_k_pos_sel.append(df_tmp_kk_pos_sel)

        df_tmp_k_pre_sel = pd.concat(df_list_k_pre_sel)
        df_tmp_k_pos_sel = pd.concat(df_list_k_pos_sel)

        return df_tmp_k_pre_sel, df_tmp_k_pos_sel

    # Parallelization:
    ''' 
    pool_dfe = mp.Pool()
    for i in xrange(n_bins):
        pool_dfe.apply_async(DataFrameExpand, args=(i, ), callback=Appnd)
    pool_dfe.close()
    pool_dfe.join()
    '''
    for ii in xrange(n_bins):
        callback = DataFrameExpand(ii)
        Appnd(callback)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~alternatives:
    #process = mp.Process(target=F, args=(k,))
    #manager = mp.Manager()
    #pool.map(FT,L)

    df_before_selection = pd.concat(df_list_pre_sel)
    df_after_selection = pd.concat(df_list_pos_sel)

    rnp.fill_hist(h_before_selection, df_before_selection.bin,
                  df_before_selection.weight)
    rnp.fill_hist(h_c_b, df_after_selection.bin, df_after_selection.weight)

    zeitB = time()
    print 'Time taken for filling histogram(for #events: ' + str(
        n_scan) + '): ', str(zeitB - zeitA)
    ####################################
    ####################################

    print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Filling histogram (for tpr)...'
    zeitA = time()
    for index, row in load_s.iterrows():
        tmp_weight = row['weight']
        tmp_signal = row['signal']
        for k in xrange(n_bins):
            h_true.Fill(bin_value_dict[k], tmp_weight)
            if bin_value_dict[
                    k] - 0.5 * bin_width <= tmp_signal and bin_value_dict[
                        k] + 0.5 * bin_width > tmp_signal:
                for kk in xrange(k):
                    h_c_s.Fill(bin_value_dict[kk], tmp_weight)

    g_fpr = GAE()
    g_tpr = GAE()
    g_fpr.Divide(h_c_b, h_before_selection, "cl=0.683 b(1,1) mode")
    g_tpr.Divide(h_c_s, h_true, "cl=0.683 b(1,1) mode")

    g_size = g_fpr.GetN()

    x = Double()
    y = Double()
    x_s = Double()
    y_s = Double()

    arr_x = np.zeros(g_size)
    arr_y = np.zeros(g_size)
    arr_x_s = np.zeros(g_size)
    arr_y_s = np.zeros(g_size)

    for i in xrange(g_size):
        g_fpr.GetPoint(i, x, y)
        arr_x[i] = x
        arr_y[i] = y

        g_tpr.GetPoint(i, x_s, y_s)
        arr_x_s[i] = x_s
        arr_y_s[i] = y_s

    buffer_l = g_fpr.GetEYlow()
    buffer_l.SetSize(g_size)
    arr_l = np.array(buffer_l, copy=True)

    buffer_h = g_fpr.GetEYhigh()
    buffer_h.SetSize(g_size)
    arr_h = np.array(buffer_h, copy=True)
    #print arr_h
    #print arr_l

    buffer_l_s = g_tpr.GetEYlow()
    buffer_l_s.SetSize(g_size)
    arr_l_s = np.array(buffer_l_s, copy=True)

    buffer_h_s = g_tpr.GetEYhigh()
    buffer_h_s.SetSize(g_size)
    arr_h_s = np.array(buffer_h_s, copy=True)
    print len(arr_h)
    print len(arr_l)

    #######################
    # Calculate AOC       #
    #######################
    '''
    x   = np.array(arr_y_s)
    y   = np.array(arr_y)
    exl = np.array(arr_l_s)
    eyl = np.array(arr_l)
    exh = np.array(arr_h_s)
    eyh = np.array(arr_h)
    '''

    #######################
    # Export ROC Position #
    #######################
    roc_dict = {}
    roc_dict['param'] = param
    roc_dict['tpr'] = np.array(arr_y_s)
    roc_dict['fpr'] = np.array(arr_y)
    roc_dict['e_tpr_l'] = np.array(arr_l_s)
    roc_dict['e_fpr_l'] = np.array(arr_l)
    roc_dict['e_tpr_h'] = np.array(arr_h_s)
    roc_dict['e_fpr_h'] = np.array(arr_h)
    roc_dict['threshold'] = bin_value_dict

    roc_dict['cut_based'] = {}
    #roc_dict['cut_based']['lc'] = LC_dict
    #roc_dict['cut_based']['hc'] = HC_dict
    #roc_dict['aoc']       = aoc
    #roc_dict['aoc_l']       = aoc_l
    #roc_dict['aoc_h']       = aoc_h

    #raw_data           = {}
    #raw_data['load_s'] = load_s
    #raw_data['load_b'] = load_b
    #roc_dict['raw'] = raw_data
    '''
    path_dump = '/beegfs/desy/user/hezhiyua/2bBacked/roc_data/'
    name_dump = 'roc.pkl'
    joblib.dump(roc_dict, path_dump+name_dump)
    '''
    return roc_dict
Esempio n. 7
0
#    arr_x_s[i] = x_s 
#    arr_y_s[i] = y_s

# The buffers returned by GetEYlow()/GetEYhigh() can be turned into numpy arrays
# in the following 3 ways (presumably the 'copy' version, V3, works most consistently):
#----------------------------------------------V1
#buffer_l = g_efficiency.GetEYlow()
#arr_l    = np.ndarray(g_size, 'f', buffer_l)
#----------------------------------------------V2
#buffer_h   = g_efficiency.GetEYhigh()
#arr_h      = np.frombuffer(buffer_h, count=g_size)
#----------------------------------------------V3
# Lower y errors of g_efficiency: size the buffer to g_size entries, then copy
# it so the numpy array owns its data.
buffer_l   = g_efficiency.GetEYlow()
buffer_l.SetSize(g_size)
arr_l      = np.array(buffer_l, copy=True)

# Upper y errors of g_efficiency, extracted the same way.
buffer_h   = g_efficiency.GetEYhigh()
buffer_h.SetSize(g_size)
arr_h      = np.array(buffer_h, copy=True)
#print arr_h
#print arr_l
# The g_tpr extraction below is disabled: it sits inside a bare string literal,
# which is evaluated and discarded, so none of these statements run.
"""
buffer_l_s   = g_tpr.GetEYlow()
buffer_l_s.SetSize(g_size_s)
arr_l_s      = np.array(buffer_l_s, copy=True)

buffer_h_s   = g_tpr.GetEYhigh()
buffer_h_s.SetSize(g_size_s)
arr_h_s      = np.array(buffer_h_s, copy=True)
"""

Esempio n. 8
0
def CutBaseBenchmarkNew(df_test_orig,
                        inDict,
                        JetPrfx_bkg,
                        refAttr='pt',
                        isSigAttrStr='is_signal',
                        weightAttrStr='weight'):
    refAttrLabel = JetPrfx_bkg + refAttr
    tt = df_test_orig.copy()
    sg = tt[isSigAttrStr] == 1
    bg = tt[isSigAttrStr] == 0

    BA_l = {}
    #pick out events that satisfiy the cut
    for iAttr, iList in inDict.iteritems():
        iAttr = 'J1' + iAttr
        if iList[0] == '<': BA_l[iAttr] = tt[JetPrfx_bkg + iAttr] < iList[1]
        elif iList[0] == '>': BA_l[iAttr] = tt[JetPrfx_bkg + iAttr] > iList[1]

    pos = tt[refAttrLabel]
    pos_sgn = tt[weightAttrStr]
    pos_bkg = tt[weightAttrStr]
    n_pos = tt[weightAttrStr]
    for iAttr, iList in inDict.iteritems():
        iAttr = 'J1' + iAttr
        pos = pos[BA_l[iAttr]]  #events that pass the selection(all the cuts)
        pos_sgn = pos_sgn[
            BA_l[iAttr]]  #signal events that pass the selection(all the cuts)
        pos_bkg = pos_bkg[BA_l[
            iAttr]]  #background events that pass the selection(all the cuts)
        n_pos = n_pos[BA_l[iAttr]]  #see below
    pos_sgn = pos_sgn[sg]
    pos_bkg = pos_bkg[bg]
    n_pos = float(n_pos.sum())  #sum up the weights

    n_sgn = float(tt[weightAttrStr][sg].sum())  #sum of weights from all signal
    n_bkg = float(
        tt[weightAttrStr][bg].sum())  #sum of weights from all background
    n_pos_sgn = float(pos_sgn.sum(
    ))  #sum of weights of signal events that pass the selection
    n_pos_bkg = float(pos_bkg.sum(
    ))  #sum of weights of background events that pass the selection
    sgn_eff = np.divide(n_pos_sgn, n_sgn)
    fls_eff = np.divide(n_pos_bkg, n_bkg)
    print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Benchmark:'
    print 'num of total test events: ', tt[refAttrLabel].count()
    print "num of signals          : ", n_sgn
    print 'num of background       : ', n_bkg
    print "num of pos events       : ", n_pos
    print "num of pos bkg          : ", n_pos_bkg
    print "num of pos sgn          : ", n_pos_sgn
    print "true positive rate      : ", sgn_eff
    print "false positive rate     : ", fls_eff
    #return tt[weightAttrStr][sg], tt[weightAttrStr][bg], pos_sgn, pos_bkg#sgn_eff, fls_eff

    df_sg = tt[weightAttrStr][sg]
    df_bg = tt[weightAttrStr][bg]
    df_pos_sgn = pos_sgn
    df_pos_bkg = pos_bkg
    bin_i = 0
    bin_f = 1
    #def CutBaseROC(df_sg, df_bg, df_pos_sgn, df_pos_bkg):
    h_cut_pre_tpr = TH1F('h_cut_pre_tpr', 'hist_cut_pre_tpr', 1, bin_i, bin_f)
    h_cut_pos_tpr = TH1F('h_cut_pos_tpr', 'hist_cut_pos_tpr', 1, bin_i, bin_f)

    h_cut_pre_fpr = TH1F('h_cut_pre_fpr', 'hist_cut_pre_fpr', 1, bin_i, bin_f)
    h_cut_pos_fpr = TH1F('h_cut_pos_fpr', 'hist_cut_pos_fpr', 1, bin_i, bin_f)

    root_numpy.fill_hist(h_cut_pre_tpr, df_sg, df_sg)
    root_numpy.fill_hist(h_cut_pos_tpr, df_pos_sgn, df_pos_sgn)

    root_numpy.fill_hist(h_cut_pre_fpr, df_bg, df_bg)
    root_numpy.fill_hist(h_cut_pos_fpr, df_pos_bkg, df_pos_bkg)

    g_cut_tpr = GAE()
    g_cut_fpr = GAE()
    g_cut_tpr.Divide(h_cut_pos_tpr, h_cut_pre_tpr, "cl=0.683 b(1,1) mode")
    g_cut_fpr.Divide(h_cut_pos_fpr, h_cut_pre_fpr, "cl=0.683 b(1,1) mode")

    g_size_cut = 1

    x = Double()
    y = Double()
    x_s = Double()
    y_s = Double()

    arr_x = np.zeros(g_size_cut)
    arr_y = np.zeros(g_size_cut)
    arr_x_s = np.zeros(g_size_cut)
    arr_y_s = np.zeros(g_size_cut)

    for i in xrange(g_size_cut):
        g_cut_fpr.GetPoint(i, x, y)
        arr_x[i] = x
        arr_y[i] = y

        g_cut_tpr.GetPoint(i, x_s, y_s)
        arr_x_s[i] = x_s
        arr_y_s[i] = y_s

    buffer_l = g_cut_fpr.GetEYlow()
    buffer_l.SetSize(g_size_cut)
    arr_l = np.array(buffer_l, copy=True)

    buffer_h = g_cut_fpr.GetEYhigh()
    buffer_h.SetSize(g_size_cut)
    arr_h = np.array(buffer_h, copy=True)

    buffer_l_s = g_cut_tpr.GetEYlow()
    buffer_l_s.SetSize(g_size_cut)
    arr_l_s = np.array(buffer_l_s, copy=True)

    buffer_h_s = g_cut_tpr.GetEYhigh()
    buffer_h_s.SetSize(g_size_cut)
    arr_h_s = np.array(buffer_h_s, copy=True)
    print len(arr_h)
    print len(arr_l)

    print 'TPR: ', arr_y_s
    print 'FPR: ', arr_y
    print arr_l_s
    print arr_l
    print arr_h_s
    print arr_h
    out_dict = {}
    out_dict['tpr'] = arr_y_s[0]
    out_dict['fpr'] = arr_y[0]
    out_dict['tpr_e_l'] = arr_l_s[0]
    out_dict['fpr_e_l'] = arr_l[0]
    out_dict['tpr_e_h'] = arr_h_s[0]
    out_dict['fpr_e_h'] = arr_h[0]

    return out_dict