def main(): opts = parse_options() model = load_model(os.path.join(opts.modelDir, opts.name + '.h5')) scaler = joblib.load(os.path.join(opts.modelDir, opts.name + '_scaler.pkl')) db = (RESOLUTION[2] - RESOLUTION[1] ) / RESOLUTION[0] # bin width in discriminator distribution bins = np.arange(RESOLUTION[1], RESOLUTION[2] + db, db) # bin edges in discriminator distribution center = (bins[:-1] + bins[1:]) / 2 ########################### # Read and evaluate signals ########################### SIGNAL = [opts.signal] Signal = [] for s in SIGNAL: x, y = pickBenchmark(s) df, weight = loadDataFrame(os.path.join(inputDir, s), PRESELECTION, VAR, WEIGHTS, LUMI) y_hat = evaluate(model, df.values, scaler) bin_index = np.digitize( y_hat[:, 0], bins[1:]) # get the bin index of the output score for each event outputWeighted = [] outputWeightedVar = [] outputMC = [] outputMCVar = [] for i in range(len(bins[1:])): w = weight.values[np.where(bin_index == i)[0]] sigma = np.sum(w**2.) outputWeighted.append(w.sum()) outputWeightedVar.append(sigma) outputMC.append(len(w)) outputMCVar.append(np.sqrt(len(w))) Signal.append({ 'name': s, 'm_stop': x, 'm_X': y, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'y_pred': y_hat, 'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC), 'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar) }) del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar ########################### # Read and evaluate backgrounds ########################### totBkgEvents = 0. totBkgVar = 0. Background = [] for b in BACKGROUND: df, weight = loadDataFrame(os.path.join(inputDir, b), PRESELECTION, VAR, WEIGHTS, LUMI) y_hat = evaluate(model, df.values, scaler) bin_index = np.digitize(y_hat[:, 0], bins[1:]) outputWeighted = [] outputWeightedVar = [] outputMC = [] outputMCVar = [] totBkgEvents += weight.sum() totBkgVar += np.sum(weight.values**2.) for i in range(len(bins[1:])): w = weight.values[np.where(bin_index == i)[0]] sigma = np.sum(w**2.) 
            # per-bin sums of weights, squared weights (variance) and raw MC counts
            outputWeighted.append(w.sum())
            outputWeightedVar.append(sigma)
            outputMC.append(len(w))
            outputMCVar.append(len(w))

        Background.append({
            'name': b,
            'dataset': df,
            'weight': weight,
            'nEvents': weight.sum(),
            'y_pred': y_hat,
            'outputScore': np.array(outputWeighted),
            'outputMC': np.array(outputMC),
            'output_var': np.array(outputWeightedVar),
            'outputMC_var': np.array(outputMCVar)
        })
        del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar

    totalBkgOutput = np.array([b['outputScore'] for b in Background])
    totalBkgOutput = totalBkgOutput.sum(axis=0)
    totalBkgVar = np.array([b['output_var'] for b in Background])
    totalBkgVar = totalBkgVar.sum(axis=0)

    ###########################
    # Determine Significance  #
    ###########################
    for s in Signal:
        significance = []
        significance_err = []
        asimov = []
        tot_rel = np.sqrt(np.sum(s['output_var'])) / s['nEvents']
        for i in range(len(bins[1:])):
            # efficiencies for a cut keeping everything above the lower edge of bin i
            #eff_sig = s['outputScore'][:i+1].sum() / s['nEvents']
            #eff_bkg = totalBkgOutput[:i+1].sum() / totalBkgOutput.sum()
            eff_sig = s['outputScore'][i:].sum() / s['nEvents']
            eff_bkg = totalBkgOutput[i:].sum() / totalBkgOutput.sum()

            #err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['nEvents']
            #err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput.sum()
            err_sig = np.sqrt(np.sum(s['output_var'][i:])) / s['nEvents']
            err_bkg = np.sqrt(np.sum(totalBkgVar[i:])) / totalBkgOutput.sum()

            #if totalBkgOutput[:i+1].sum() > 0.:
            #    rel_err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput[:i+1].sum()
            if totalBkgOutput[i:].sum() > 0.:
                rel_err_bkg = np.sqrt(np.sum(totalBkgVar[i:])) / totalBkgOutput[i:].sum()
            else:
                rel_err_bkg = 0.
            #if s['outputScore'][:i+1].sum() > 0.:
            #    rel_err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['outputScore'][:i+1].sum()
            if s['outputScore'][i:].sum() > 0.:
                rel_err_sig = np.sqrt(np.sum(s['output_var'][i:])) / s['outputScore'][i:].sum()
            else:
                rel_err_sig = 0.

            # relative background uncertainty: statistics plus a flat 25% systematic
            #total_rel_err = np.sqrt(rel_err_sig**2. + rel_err_bkg**2. + 0.25**2.)
            total_rel_err = np.sqrt(rel_err_bkg**2. + 0.25**2.)

            if (eff_sig == 0) or (eff_bkg == 0):
                Z = 0.
                Z_err = 0.
                ams = 0.
            elif (err_sig / eff_sig > 0.75) or (err_bkg / eff_bkg > 0.75):
                Z = 0.
                Z_err = 0.
                ams = 0.
else: #Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(s['outputScore'][:i+1].sum(), totalBkgOutput[:i+1].sum(), total_rel_err) Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( s['outputScore'][i:].sum(), totalBkgOutput[i:].sum(), total_rel_err) ams = asimovZ(s['outputScore'][i:].sum(), totalBkgOutput[i:].sum(), np.sqrt(totalBkgVar[i:].sum())) Zplus_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( (eff_sig + err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err) Zmins_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( (eff_sig - err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err) Zplus_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( eff_sig * s['nEvents'], (eff_bkg + err_bkg) * totalBkgOutput.sum(), total_rel_err) Zmins_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( eff_sig * s['nEvents'], (eff_bkg - err_bkg) * totalBkgOutput.sum(), total_rel_err) Z_err_sig = abs(Zplus_sig - Zmins_sig) / 2 Z_err_bkg = abs(Zplus_bkg - Zmins_bkg) / 2 Z_err = np.sqrt(Z_err_sig**2 + Z_err_bkg**2) significance.append(Z) significance_err.append(Z_err) asimov.append(ams) s['sig'] = np.array(significance) s['sig_max'] = s['sig'].max() s['sig_err'] = np.array(significance_err) s['ams'] = np.array(asimov) #print s['sig'] #print s['ams'] sigMax_index = bins[np.where(s['sig'] == s['sig'].max())][0] Z = asimovZ( Signal[0]['outputScore'][np.where(bins[:-1] == sigMax_index)], totalBkgOutput[np.where(bins[:-1] == sigMax_index)], np.sqrt(totalBkgVar[np.where(bins[:-1] == sigMax_index)]), syst=False) Z_syst = asimovZ( Signal[0]['outputScore'][np.where(bins[:-1] == sigMax_index)], totalBkgOutput[np.where(bins[:-1] == sigMax_index)], np.sqrt(totalBkgVar[np.where(bins[:-1] == sigMax_index)]), syst=True) #print s['sig'].max(), sigMax_index, Z, Z_syst x = np.array([s['m_stop'] for s in Signal], dtype=float) y = np.array([s['m_X'] for s in Signal], dtype=float) z = np.array([s['sig_max'] for s in Signal], dtype=float) #print x, y, z #print Signal[0]['outputScore'][np.where(bins[:-1] >= sigMax_index)], Signal[0]['output_var'][np.where(bins[:-1] >= sigMax_index)] #print totalBkgOutput[np.where(bins[:-1] >= sigMax_index)], totalBkgVar[np.where(bins[:-1] >= sigMax_index)] #print Signal[0]['outputScore'], Signal[0]['output_var'] #print totalBkgOutput, totalBkgVar ################################### # Write single bin to .root files # ################################## sigFile = ROOT.TFile(opts.name + "_output_sig.root", "RECREATE") sig_sr = ROOT.TH1D("SR", "SR", 1, 0, 1) sig_sr.SetBinContent( 1, np.sum(Signal[0]['outputScore'][np.where(bins[:-1] >= sigMax_index)])) sig_sr.SetBinError( 1, np.sum( np.sqrt( Signal[0]['output_var'][np.where(bins[:-1] >= sigMax_index)]))) sigFile.Write() sigFile.Close() bkgFile = ROOT.TFile(opts.name + "_output_bkg.root", "RECREATE") bkg_sr = ROOT.TH1D("SR", "SR", 1, 0, 1) bkg_sr.SetBinContent( 1, np.sum(totalBkgOutput[np.where(bins[:-1] >= sigMax_index)])) bkg_sr.SetBinError( 1, np.sum(np.sqrt(totalBkgVar[np.where(bins[:-1] >= sigMax_index)]))) bkgFile.Write() bkgFile.Close() ################################### # Write multi bins to .root files # ################################### multibin_sigFile = ROOT.TFile(opts.name + "_output_sig_multibin.root", "RECREATE") multibin_sig_sr = ROOT.TH1D("SR", "SR", 5, 0, 5) for i in xrange(1, 6): index = -6 + i multibin_sig_sr.SetBinContent(i, Signal[0]['outputScore'][index]) multibin_sig_sr.SetBinError(i, np.sqrt(Signal[0]['output_var'][index])) multibin_sigFile.Write() multibin_sigFile.Close() 
    multibin_bkgFile = ROOT.TFile(opts.name + "_output_bkg_multibin.root",
                                  "RECREATE")
    multibin_bkg_sr = ROOT.TH1D("SR", "SR", 5, 0, 5)
    for i in xrange(1, 6):
        index = -6 + i
        multibin_bkg_sr.SetBinContent(i, totalBkgOutput[index])
        multibin_bkg_sr.SetBinError(i, np.sqrt(totalBkgVar[index]))
    multibin_bkgFile.Write()
    multibin_bkgFile.Close()
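
# ---------------------------------------------------------------------------
# Hedged sketch: asimovZ() is imported from the analysis utilities and is not
# defined in this file. The scans above call it as asimovZ(s, b, sigma_b) and
# with an optional syst flag, so it is assumed to return the median Asimov
# significance, optionally including an absolute background uncertainty
# (Cowan et al., EPJC 71 (2011) 1554). A minimal stand-in under that
# assumption:
# ---------------------------------------------------------------------------
def asimovZ_sketch(s, b, sigma_b, syst=True):
    import numpy as np
    s, b, sigma_b = float(s), float(b), float(sigma_b)
    if s <= 0. or b <= 0.:
        return 0.
    if not syst or sigma_b <= 0.:
        # no background systematic: Z = sqrt(2*((s+b)*ln(1+s/b) - s))
        return np.sqrt(2. * ((s + b) * np.log(1. + s / b) - s))
    var = sigma_b**2
    term1 = (s + b) * np.log((s + b) * (b + var) / (b**2 + (s + b) * var))
    term2 = (b**2 / var) * np.log(1. + var * s / (b * (b + var)))
    return np.sqrt(2. * (term1 - term2))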
def main(): model = load_model(modelDir) scaler = joblib.load(SCALING) infofile = open(modelDir.replace('.h5', '_infofile.txt')) infos = infofile.readlines() analysis = infos[0].replace('Used analysis method: ', '').replace('\n', '') dataset = DatasetDir + infos[3].replace('Used dataset: ', '').replace( '\n', '') recurrent = False if analysis.lower() == 'rnn': recurrent = True seq_scaler = dataset + '_scaling.json' db = (RESOLUTION[2] - RESOLUTION[1] ) / RESOLUTION[0] # bin width in discriminator distribution bins = np.arange(RESOLUTION[1], RESOLUTION[2] + db, db) # bin edges in discriminator distribution center = (bins[:-1] + bins[1:]) / 2 print '#----MODEL----#' print modelDir ########################### # Read and evaluate signals ########################### Signal = [] for s in SIGNAL: x, y = pickBenchmark(s) if not recurrent: df, weight = loadDataFrame(os.path.join(inputDir, s + '/'), PRESELECTION, VAR, WEIGHTS, LUMI) y_hat = evaluate(model, df.values, scaler) else: df, weight, collection = loadSequentialDataFrame( os.path.join(inputDir, s + '/'), PRESELECTION, COLLECTION, REMOVE_VAR, VAR, WEIGHTS, LUMI) y_hat = evaluate(model, df.values, scaler, seq_scaler, rnn=True, col=collection) bin_index = np.digitize( y_hat[:, 0], bins[1:]) # get the bin index of the output score for each event outputWeighted = [] outputWeightedVar = [] outputMC = [] outputMCVar = [] for i in range(len(bins[1:])): w = weight.values[np.where(bin_index == i)[0]] sigma = np.sum(w**2.) outputWeighted.append(w.sum()) outputWeightedVar.append(sigma) outputMC.append(len(w)) outputMCVar.append(np.sqrt(len(w))) Signal.append({ 'name': s, 'm_stop': x, 'm_X': y, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'y_pred': y_hat, 'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC), 'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar) }) del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar ########################### # Read and evaluate backgrounds ########################### totBkgEvents = 0. totBkgVar = 0. Background = [] for b in BACKGROUND: if not recurrent: df, weight = loadDataFrame(os.path.join(inputDir, b + '/'), PRESELECTION, VAR, WEIGHTS, LUMI) y_hat = evaluate(model, df.values, scaler) else: df, weight, collection = loadSequentialDataFrame( os.path.join(inputDir, b + '/'), PRESELECTION, COLLECTION, REMOVE_VAR, VAR, WEIGHTS, LUMI) y_hat = evaluate(model, df.values, scaler, seq_scaler, rnn=True, col=collection) bin_index = np.digitize(y_hat[:, 0], bins[1:]) outputWeighted = [] outputWeightedVar = [] outputMC = [] outputMCVar = [] totBkgEvents += weight.sum() totBkgVar += np.sum(weight.values**2.) for i in range(len(bins[1:])): w = weight.values[np.where(bin_index == i)[0]] sigma = np.sum(w**2.) 
outputWeighted.append(w.sum()) outputWeightedVar.append(sigma) outputMC.append(len(w)) outputMCVar.append(len(w)) Background.append({ 'name': b, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'y_pred': y_hat, 'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC), 'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar) }) del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar totalBkgOutput = np.array([b['outputScore'] for b in Background]) totalBkgOutput = totalBkgOutput.sum(axis=0) totalBkgVar = np.array([b['output_var'] for b in Background]) totalBkgVar = totalBkgVar.sum(axis=0) for s in Signal: significance = [] significance_err = [] asimov = [] tot_rel = np.sqrt(np.sum(s['output_var'])) / s['nEvents'] for i in range(len(bins[1:])): #eff_sig = s['outputScore'][:i+1].sum() / s['nEvents'] #eff_bkg = totalBkgOutput[:i+1].sum() / totalBkgOutput.sum() eff_sig = s['outputScore'][i:].sum() / s['nEvents'] eff_bkg = totalBkgOutput[i:].sum() / totalBkgOutput.sum() #err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['nEvents'] #err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput.sum() err_sig = np.sqrt(np.sum(s['output_var'][i:])) / s['nEvents'] err_bkg = np.sqrt(np.sum(totalBkgVar[i:])) / totalBkgOutput.sum() #if totalBkgOutput[:i+1].sum() > 0.: # rel_err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput[:i+1].sum() if totalBkgOutput[i:].sum() > 0.: rel_err_bkg = np.sqrt(np.sum( totalBkgVar[i:])) / totalBkgOutput[i:].sum() else: rel_err_bkg = 0. #if s['outputScore'][:i+1].sum() > 0.: # rel_err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['outputScore'][:i+1].sum() if s['outputScore'][i:].sum() > 0.: rel_err_sig = np.sqrt(np.sum( s['output_var'][i:])) / s['outputScore'][i:].sum() else: rel_err_sig = 0. #total_rel_err = np.sqrt(rel_err_sig**2. + rel_err_bkg**2. + 0.25**2.) total_rel_err = np.sqrt(rel_err_bkg**2. + 0.25**2.) if (eff_sig == 0) or (eff_bkg == 0): Z = 0. Z_err = 0. ams = 0. elif (err_sig / eff_sig > 0.75) or (err_bkg / eff_bkg > 0.75): Z = 0. Z_err = 0. ams = 0. 
else: #Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(s['outputScore'][:i+1].sum(), totalBkgOutput[:i+1].sum(), total_rel_err) Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( s['outputScore'][i:].sum(), totalBkgOutput[i:].sum(), total_rel_err) ams = asimovZ(s['outputScore'][i:].sum(), totalBkgOutput[i:].sum(), np.sqrt(totalBkgVar[i:].sum())) Zplus_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( (eff_sig + err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err) Zmins_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( (eff_sig - err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err) Zplus_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( eff_sig * s['nEvents'], (eff_bkg + err_bkg) * totalBkgOutput.sum(), total_rel_err) Zmins_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( eff_sig * s['nEvents'], (eff_bkg - err_bkg) * totalBkgOutput.sum(), total_rel_err) Z_err_sig = abs(Zplus_sig - Zmins_sig) / 2 Z_err_bkg = abs(Zplus_bkg - Zmins_bkg) / 2 Z_err = np.sqrt(Z_err_sig**2 + Z_err_bkg**2) significance.append(Z) significance_err.append(Z_err) asimov.append(ams) s['sig'] = np.array(significance) s['sig_max'] = s['sig'].max() s['sig_err'] = np.array(significance_err) s['ams'] = np.array(asimov) #print s['sig'] #print s['ams'] #sigMax_index = bins[np.where(s['sig'] == s['sig'].max())][0] #Z = asimovZ(Signal[0]['outputScore'][np.where(bins[:-1] == sigMax_index)], totalBkgOutput[np.where(bins[:-1] == sigMax_index)], np.sqrt(totalBkgVar[np.where(bins[:-1] == sigMax_index)]), syst=False) #Z_syst = asimovZ(Signal[0]['outputScore'][np.where(bins[:-1] == sigMax_index)], totalBkgOutput[np.where(bins[:-1] == sigMax_index)], np.sqrt(totalBkgVar[np.where(bins[:-1] == sigMax_index)]), syst=True) #print s['sig'].max(), sigMax_index, Z, Z_syst x = np.array([s['m_stop'] for s in Signal], dtype=float) y = np.array([s['m_X'] for s in Signal], dtype=float) z = np.array([s['sig_max'] for s in Signal], dtype=float) #print x, y, z #print Signal[0]['outputScore'][np.where(bins[:-1] >= sigMax_index)], Signal[0]['output_var'][np.where(bins[:-1] >= sigMax_index)] #print totalBkgOutput[np.where(bins[:-1] >= sigMax_index)], totalBkgVar[np.where(bins[:-1] >= sigMax_index)] #print Signal[0]['outputScore'], Signal[0]['output_var'] #print totalBkgOutput, totalBkgVar # Set up a regular grid of interpolation points print('Plotting the output score...') fig = plt.figure(figsize=(8, 6)) ax1 = plt.subplot2grid((4, 4), (0, 0), colspan=4, rowspan=3) ax1.set_xlim((bins[0], bins[-1])) ax1.set_ylabel("Events", horizontalalignment='right', y=1.0) sb_ratio = Signal[0]['outputScore'].sum() / totalBkgOutput.sum() #if sb_ratio < 0.2: # #ATTENTION! 
Simplified error propagation (treated as uncorrelated) # scaled = Signal[0]['outputScore'] / Signal[0]['outputScore'].sum() * totalBkgOutput.sum() # scaled_var = scaled*scaled * ( (Signal[0]['output_var']/Signal[0]['outputScore'])**2 + (totalBkgVar.sum()/totalBkgOutput.sum())**2 + (Signal[0]['output_var'].sum()/Signal[0]['outputScore'].sum())**2 ) # scaled_label = 'Signal scaled to Bkg' # #else: scaled = Signal[0]['outputScore'] scaled_var = Signal[0]['output_var'] scaled_label = 'Signal' plt.bar(center, totalBkgOutput / totalBkgOutput.sum(), width=db, yerr=np.sqrt(totalBkgVar) / totalBkgOutput.sum(), color='b', alpha=0.25, error_kw=dict(ecolor='b', lw=1.5), label=Background[0]['name']) plt.bar(center, Signal[0]['outputScore'] / Signal[0]['outputScore'].sum(), width=db, yerr=np.sqrt(Signal[0]['output_var']) / Signal[0]['outputScore'].sum(), label=Signal[0]['name'], color='r', alpha=0.25, error_kw=dict(ecolor='r', lw=1.5)) ax1.set_ylim( (0., np.max([ np.max(totalBkgOutput / totalBkgOutput.sum()), np.max(Signal[0]['outputScore'] / Signal[0]['outputScore'].sum()) ]) * 1.3)) #ax1.set_yscale('log') leg = plt.legend(loc="best", frameon=False) AtlasStyle_mpl.ATLASLabel(ax1, 0.02, 0.925, 'Work in progress') #AtlasStyle_mpl.LumiLabel(ax1, 0.02, 0.875, lumi=LUMI*0.001) ax2 = plt.subplot2grid((4, 4), (3, 0), colspan=4, rowspan=1) getRatio(Signal[0]['outputScore'] / Signal[0]['outputScore'].sum(), bins, np.sqrt(Signal[0]['output_var']) / Signal[0]['outputScore'].sum(), totalBkgOutput / totalBkgOutput.sum(), bins, np.sqrt(totalBkgVar) / totalBkgOutput.sum(), 'r') ax2.set_xlabel('Output score', horizontalalignment='right', x=1.0) ax2.set_ylabel('Reco/Truth') ax2.set_xlim((0., 1.)) ax2.set_ylim((0, 2)) ax2.grid() ax2.tick_params(direction='in') ax2.xaxis.set_ticks_position('both') ax2.yaxis.set_ticks_position('both') plt.savefig("plots/" + modelfile + "_shapeComparison_outputScore.pdf") plt.savefig("plots/" + modelfile + "_shapeComparison_outputScore.png") plt.close()
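
# ---------------------------------------------------------------------------
# Hedged sketch: getRatio() comes from the shared plotting helpers and is not
# shown here. Given how it is called above (numerator hist, bins, numerator
# errors, denominator hist, bins, denominator errors, colour), a minimal
# ratio-panel stand-in could look like this:
# ---------------------------------------------------------------------------
def getRatio_sketch(num, bins, num_err, den, bins_den, den_err, color='r'):
    # draw num/den per bin on the current axes, errors added in quadrature
    center = (bins[:-1] + bins[1:]) / 2.
    den_safe = np.where(den > 0., den, np.nan)
    num_safe = np.where(num != 0., num, np.nan)
    ratio = num / den_safe
    rel_err = np.sqrt((num_err / num_safe)**2 + (den_err / den_safe)**2)
    plt.errorbar(center, ratio, yerr=np.abs(ratio) * rel_err,
                 fmt='o', color=color, markersize=3)
    plt.axhline(1., color='grey', linestyle='--', linewidth=1)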
def plot_classification_datapoint(SignalList, model, preselection, nvar, weight, lumi, save=False, fileName='Test', multiclass=True): ''' Evaluate the classification on certain datapoints. sigList is supposed to be in a form like in config/samples.py ''' print '----- Plotting the Classification for different datapoints-----' print 'Using preselection', preselection #met_cut = False #for pre in preselection: #if pre['name'] == 'met': #met_cut = True #met_threshold = pre['threshold'] #met_cut_addStr = 'met' + str(int(pre['threshold']*0.001)) #if not met_cut: #print 'Using no met-preselection!' input = '/project/etp5/dhandl/samples/SUSY/Stop1L/FullRun2/hdf5/cut_mt30_met60_preselection/' bkgList = [{ 'name': 'powheg_ttbar', 'path': input + 'mc16d_ttbar/' }, { 'name': 'powheg_singletop', 'path': input + 'mc16d_singletop/' }, { 'name': 'sherpa22_Wjets', 'path': input + 'mc16d_Wjets/' }] #Loading background once print 'Loading background...' Background = [] for b in bkgList: print 'Loading background {} from {}...'.format(b['name'], b['path']) Background.append( Sample( b['name'], pT.loadDataFrame(b['path'], preselection, nvar, weight, lumi))) bkg = np.empty([0, Background[0].dataframe[0].shape[1]]) bkg_w = np.empty(0) bkg_y = np.empty(0) for i, b in enumerate(Background): i = i + 1 bkg = np.concatenate((bkg, b.dataframe[0])) bkg_w = np.concatenate((bkg_w, b.dataframe[1])) bkg_y = np.concatenate((bkg_y, np.full(b.dataframe[0].shape[0], i))) print 'Background shape', bkg.shape #Evaluating on signal for each set of points print 'Evaluating on signal sets...' for sigList in SignalList: Signal = [] addStr = '_stop_bWN_' name = False title = '' for s in sigList: if not name: addStr += s['name'].replace('stop_bWN_', '') name = True else: addStr += s['name'].replace(s['name'][:12], '') print 'Loading signal {} from {}...'.format(s['name'], s['path']) Signal.append( Sample( s['name'], pT.loadDataFrame(s['path'], preselection, nvar, weight, lumi))) title = addStr[1:17].replace('_', ' ') mstop = int(addStr[10:13]) mneutralino = int(addStr[14:17]) sample = [ r'$m_{\tilde{t}}$=%i GeV' % mstop, r'$m_{\chi}$=%i GeV' % mneutralino ] sig = np.empty([0, Signal[0].dataframe[0].shape[1]]) sig_w = np.empty(0) sig_y = np.empty(0) for s in Signal: sig = np.concatenate((sig, s.dataframe[0])) sig_w = np.concatenate((sig_w, s.dataframe[1])) sig_y = np.concatenate((sig_y, np.zeros(s.dataframe[0].shape[0]))) X = np.concatenate((sig, bkg)) w = np.concatenate((sig_w, bkg_w)) if multiclass: y = np.concatenate((sig_y, bkg_y)) else: y = [] for _df, ID in [(sig, 0), (bkg, 1)]: y.extend([ID] * _df.shape[0]) y = np.array(y) scaler = StandardScaler() X_scaled = scaler.fit_transform(X) y_predict = model.predict(X_scaled) #if not met_cut: #addStr += '_no_met_cut' #print 'True classes:', y.shape, 'Predicted classes:', y_predict.shape #sig_predicted = deepcopy(y_predict)[y==0] #bkg_predicted = deepcopy(y_predict)[y!=0] #bkg1_predicted= deepcopy(y_predict)[y==1] #bkg2_predicted= deepcopy(y_predict)[y==2] #bkg3_predicted= deepcopy(y_predict)[y==3] #bkg1_w = deepcopy(w)[y==1] #bkg2_w = deepcopy(w)[y==2] #bkg3_w = deepcopy(w)[y==3] variables = nvar plot_classification(y, y_predict, w, save=save, fileName=fileName, weighted=True, sample=sample, addStr=addStr) plot_classification(y[X[:, variables.index('met')] >= 250e3], y_predict[X[:, variables.index('met')] >= 250e3], w[X[:, variables.index('met')] >= 250e3], save=save, fileName=fileName, weighted=True, sample=sample, addStr=addStr + '_met250')
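
# ---------------------------------------------------------------------------
# Hedged sketch: the Sample class used above is imported from elsewhere. The
# code only relies on .name and on .dataframe being the (features, weights)
# tuple returned by pT.loadDataFrame, so a minimal stand-in is a namedtuple:
# ---------------------------------------------------------------------------
import collections

SampleSketch = collections.namedtuple('SampleSketch', ['name', 'dataframe'])
# usage, mirroring the loops above:
#   smp = SampleSketch('powheg_ttbar', pT.loadDataFrame(path, preselection, nvar, weight, lumi))
#   smp.dataframe[0]  -> feature table,  smp.dataframe[1] -> event weights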
def plot_confusion_matrix_datapoint(SignalList, model, preselection, nvar, weight, lumi, save=False, fileName='Test', multiclass=True): ''' Evaluate the confusion matrix on certain datapoints. sigList is supposed to be in a form like in config/samples.py ''' print '----- Plotting the confusion matrices for different datapoints-----' met_cut = False for pre in preselection: if pre['name'] == 'met': met_cut = True if not met_cut: print 'Using no met-preselection!' input = '/project/etp5/dhandl/samples/SUSY/Stop1L/hdf5/cut_mt30_met60_preselection/' bkgList = [{ 'name': 'powheg_ttbar', 'path': input + 'powheg_ttbar/' }, { 'name': 'powheg_singletop', 'path': input + 'powheg_singletop/' }, { 'name': 'sherpa22_Wjets', 'path': input + 'sherpa22_Wjets/' }] #Loading background once print 'Loading background...' Background = [] for b in bkgList: print 'Loading background {} from {}...'.format(b['name'], b['path']) Background.append( Sample( b['name'], pT.loadDataFrame(b['path'], preselection, nvar, weight, lumi))) bkg = np.empty([0, Background[0].dataframe[0].shape[1]]) bkg_w = np.empty(0) bkg_y = np.empty(0) for i, b in enumerate(Background): i = i + 1 bkg = np.concatenate((bkg, b.dataframe[0])) bkg_w = np.concatenate((bkg_w, b.dataframe[1])) bkg_y = np.concatenate((bkg_y, np.full(b.dataframe[0].shape[0], i))) #Evaluating on signal for each set of points print 'Evaluating on signal sets...' for sigList in SignalList: Signal = [] addStr = '_stop_bWN_' name = False for s in sigList: if not name: addStr += s['name'].replace('stop_bWN_', '') name = True else: addStr += s['name'].replace(s['name'][:12], '') print 'Loading signal {} from {}...'.format(s['name'], s['path']) Signal.append( Sample( s['name'], pT.loadDataFrame(s['path'], preselection, nvar, weight, lumi))) sig = np.empty([0, Signal[0].dataframe[0].shape[1]]) sig_w = np.empty(0) sig_y = np.empty(0) for s in Signal: sig = np.concatenate((sig, s.dataframe[0])) sig_w = np.concatenate((sig_w, s.dataframe[1])) sig_y = np.concatenate((sig_y, np.zeros(s.dataframe[0].shape[0]))) X = np.concatenate((sig, bkg)) w = np.concatenate((sig_w, bkg_w)) if multiclass: y = np.concatenate((sig_y, bkg_y)) else: y = [] for _df, ID in [(sig, 0), (bkg, 1)]: y.extend([ID] * _df.shape[0]) y = np.array(y) scaler = StandardScaler() X_scaled = scaler.fit_transform(X) y_predict = model.predict(X_scaled) y_true = y if not met_cut: addStr += '_no_met_cut' plot_confusion_matrix(y_true, y_predict, filename=fileName, save=save, addStr=addStr)
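
# ---------------------------------------------------------------------------
# Note (assumption): both datapoint helpers above fit a fresh StandardScaler
# on the evaluation sample before predicting, while the evaluation mains in
# this section load the scaler persisted at training time ('<model>_scaler.pkl').
# If the training-time scaling is wanted here instead, the pattern would be
# roughly the following (the path is illustrative):
# ---------------------------------------------------------------------------
def scale_with_training_scaler(X, scaler_path):
    # load the scaler saved during training and only *transform* the
    # evaluation data, instead of re-fitting on it
    scaler = joblib.load(scaler_path)
    return scaler.transform(X)

# X_scaled = scale_with_training_scaler(X, 'TrainedModels/models/myModel_scaler.pkl')  # illustrative path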
def main(): # Check number of arguments and act respectively thereof if len(sys.argv) == 2: modelfile = sys.argv[1:][0] else: print 'Usage: evaluate_signal.py <model> (omit directory and file suffix)' return print modelfile, type(modelfile) Dir = 'TrainedModels/models/' DatasetDir = 'TrainedModels/datasets/' modelDir = Dir + modelfile + '.h5' if os.path.exists(os.path.join(Dir, modelfile + '_scaler.pkl')): scaler = joblib.load(os.path.join(Dir, modelfile + '_scaler.pkl')) else: scaler = None infofile = open(modelDir.replace('.h5', '_infofile.txt')) infos = infofile.readlines() analysis = infos[0].replace('Used analysis method: ', '').replace('\n', '') dataset = DatasetDir + infos[3].replace('Used dataset: ', '').replace( '\n', '') VAR = infos[5].replace('Used variables for training: ', '').replace('\n', '').split() print VAR recurrent = False if analysis.lower() == 'rnn': recurrent = True seq_scaler = dataset + '_scaling.json' if 'nn' in analysis.lower(): model = load_model(os.path.join(Dir, modelfile + '.h5')) elif 'bdt' in analysis.lower(): model = joblib.load(os.path.join(Dir, modelfile + '.h5')) db = (RESOLUTION[2] - RESOLUTION[1] ) / RESOLUTION[0] # bin width in discriminator distribution bins = np.arange(RESOLUTION[1], RESOLUTION[2] + db, db) # bin edges in discriminator distribution center = (bins[:-1] + bins[1:]) / 2 print '#----MODEL----#' print '\t', modelDir ########################### # Read and evaluate signals ########################### Signal = [] for smp in SIGNAL: first = True for s in smp: print 'Sample:\t', s x, y = pickBenchmark(s) if not recurrent: _df, _weight = loadDataFrame(os.path.join(inputDir, s + '/'), PRESELECTION, VAR, WEIGHTS, LUMI) print _df.shape, _weight.shape if first: df = _df.copy() weight = _weight.copy() first = False else: df = pd.concat((df, _df), ignore_index=True) weight = pd.concat((weight, _weight), ignore_index=True) else: _df, _weight, collection = loadSequentialDataFrame( os.path.join(inputDir, s + '/'), PRESELECTION, COLLECTION, REMOVE_VAR, VAR, WEIGHTS, LUMI) print _df.shape, _weight.shape, collection[0]['df'].shape if first: df = _df.copy() weight = _weight.copy() seq = collection[0]['df'].copy() first = False else: df = pd.concat((df, _df), ignore_index=True) weight = pd.concat((weight, _weight), ignore_index=True) seq = pd.concat((seq, collection[0]['df']), ignore_index=True) if not recurrent: y_hat = evaluate(model, df.values, scaler, method=analysis) print df.shape, weight.shape else: collection[0]['df'] = seq print df.shape, weight.shape, collection[0]['df'].shape y_hat = evaluate(model, df.values, scaler, seq_scaler, method=analysis, col=collection) bin_index = np.digitize( y_hat[:, 0], bins[1:]) # get the bin index of the output score for each event outputWeighted = [] outputWeightedVar = [] outputMC = [] outputMCVar = [] for i in range(len(bins[1:])): w = weight.values[np.where(bin_index == i)[0]] sigma = np.sum(w**2.) outputWeighted.append(w.sum()) outputWeightedVar.append(sigma) outputMC.append(len(w)) outputMCVar.append(np.sqrt(len(w))) Signal.append({ 'name': s[6:], 'm_stop': x, 'm_X': y, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'y_pred': y_hat, 'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC), 'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar) }) del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar ########################### # Read and evaluate backgrounds ########################### totBkgEvents = 0. totBkgVar = 0. 
Background = [] for smp in BACKGROUND: first = True for b in smp: print 'Sample:\t', b if not recurrent: _df, _weight = loadDataFrame(os.path.join(inputDir, b + '/'), PRESELECTION, VAR, WEIGHTS, LUMI) print _df.shape, _weight.shape if first: df = _df.copy() weight = _weight.copy() first = False else: df = pd.concat((df, _df), ignore_index=True) weight = pd.concat((weight, _weight), ignore_index=True) else: _df, _weight, collection = loadSequentialDataFrame( os.path.join(inputDir, b + '/'), PRESELECTION, COLLECTION, REMOVE_VAR, VAR, WEIGHTS, LUMI) print _df.shape, _weight.shape, collection[0]['df'].shape if first: df = _df.copy() weight = _weight.copy() seq = collection[0]['df'].copy() first = False else: df = pd.concat((df, _df), ignore_index=True) weight = pd.concat((weight, _weight), ignore_index=True) seq = pd.concat((seq, collection[0]['df']), ignore_index=True) if not recurrent: print df.shape, weight.shape y_hat = evaluate(model, df.values, scaler, method=analysis) else: collection[0]['df'] = seq print df.shape, weight.shape, collection[0]['df'].shape y_hat = evaluate(model, df.values, scaler, seq_scaler, method=analysis, col=collection) bin_index = np.digitize(y_hat[:, 0], bins[1:]) outputWeighted = [] outputWeightedVar = [] outputMC = [] outputMCVar = [] totBkgEvents += weight.sum() totBkgVar += np.sum(weight.values**2.) for i in range(len(bins[1:])): w = weight.values[np.where(bin_index == i)[0]] sigma = np.sum(w**2.) outputWeighted.append(w.sum()) outputWeightedVar.append(sigma) outputMC.append(len(w)) outputMCVar.append(len(w)) Background.append({ 'name': b, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'y_pred': y_hat, 'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC), 'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar) }) del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar totalBkgOutput = np.array([b['outputScore'] for b in Background]) totalBkgOutput = totalBkgOutput.sum(axis=0) totalBkgVar = np.array([b['output_var'] for b in Background]) totalBkgVar = totalBkgVar.sum(axis=0) print len(Signal), len( Background), Signal[0]['outputScore'][:].sum(), totalBkgOutput for s in Signal: significance = [] significance_err = [] asimov = [] asimov_err = [] roc = [] roc_err = [] tot_rel = np.sqrt(np.sum(s['output_var'])) / s['nEvents'] for i in range(len(bins[1:])): #eff_sig = s['outputScore'][:i+1].sum() / s['nEvents'] #eff_bkg = totalBkgOutput[:i+1].sum() / totalBkgOutput.sum() eff_sig = s['outputScore'][i:].sum() / s['nEvents'] eff_bkg = totalBkgOutput[i:].sum() / totalBkgOutput.sum() #err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['nEvents'] #err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput.sum() err_sig = np.sqrt(np.sum(s['output_var'][i:])) / s['nEvents'] err_bkg = np.sqrt(np.sum(totalBkgVar[i:])) / totalBkgOutput.sum() #if totalBkgOutput[:i+1].sum() > 0.: # rel_err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput[:i+1].sum() if totalBkgOutput[i:].sum() > 0.: rel_err_bkg = np.sqrt(np.sum( totalBkgVar[i:])) / totalBkgOutput[i:].sum() else: rel_err_bkg = 0. #if s['outputScore'][:i+1].sum() > 0.: # rel_err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['outputScore'][:i+1].sum() if s['outputScore'][i:].sum() > 0.: rel_err_sig = np.sqrt(np.sum( s['output_var'][i:])) / s['outputScore'][i:].sum() else: rel_err_sig = 0. #total_rel_err = np.sqrt(rel_err_sig**2. + rel_err_bkg**2. + 0.25**2.) total_rel_err = np.sqrt(rel_err_bkg**2. + 0.25**2.) 
if float(eff_sig == 0) or float(eff_bkg == 0): Z = 0. Z_err = 0. ams = 0. ams_err = 0. elif (err_sig / eff_sig > 0.75) or (err_bkg / eff_bkg > 0.75): Z = 0. Z_err = 0. ams = 0. ams_err = 0. else: #Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(s['outputScore'][:i+1].sum(), totalBkgOutput[:i+1].sum(), total_rel_err) Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( s['outputScore'][i:].sum(), totalBkgOutput[i:].sum(), total_rel_err) ams = asimovZ(s['outputScore'][i:].sum(), totalBkgOutput[i:].sum(), np.sqrt(totalBkgVar[i:].sum())) roc.append((eff_sig, 1 - eff_bkg)) ams_plus_sig = asimovZ((s['outputScore'][i:].sum() + np.sqrt(np.sum(s['output_var'][i:]))), totalBkgOutput[i:].sum(), np.sqrt(totalBkgVar[i:].sum())) ams_mins_sig = asimovZ((s['outputScore'][i:].sum() - np.sqrt(np.sum(s['output_var'][i:]))), totalBkgOutput[i:].sum(), np.sqrt(totalBkgVar[i:].sum())) ams_plus_bkg = asimovZ(s['outputScore'][i:].sum(), (totalBkgOutput[i:].sum() + np.sqrt(np.sum(totalBkgVar[i:]))), np.sqrt(totalBkgVar[i:].sum())) ams_mins_bkg = asimovZ(s['outputScore'][i:].sum(), (totalBkgOutput[i:].sum() - np.sqrt(np.sum(totalBkgVar[i:]))), np.sqrt(totalBkgVar[i:].sum())) Zplus_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( (eff_sig + err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err) Zmins_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( (eff_sig - err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err) Zplus_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( eff_sig * s['nEvents'], (eff_bkg + err_bkg) * totalBkgOutput.sum(), total_rel_err) Zmins_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( eff_sig * s['nEvents'], (eff_bkg - err_bkg) * totalBkgOutput.sum(), total_rel_err) Z_err_sig = abs(Zplus_sig - Zmins_sig) / 2 Z_err_bkg = abs(Zplus_bkg - Zmins_bkg) / 2 Z_err = np.sqrt(Z_err_sig**2 + Z_err_bkg**2) ams_err_sig = abs(ams_plus_sig - ams_mins_sig) / 2. ams_err_bkg = abs(ams_plus_bkg - ams_mins_bkg) / 2. 
ams_err = np.sqrt(ams_err_sig**2 + ams_err_bkg**2) significance.append(Z) significance_err.append(Z_err) asimov.append(ams) asimov_err.append(ams_err) s['sig'] = np.array(significance) s['sig_max'] = s['sig'].max() s['sig_err'] = np.array(significance_err) s['ams'] = np.array(asimov) s['ams_err'] = np.array(asimov_err) s['roc'] = np.array(roc) print s['sig'] print s['ams'] #print s['roc'] sigMax_index = bins[np.where(s['sig'] == s['sig'].max())][0] amsMax_index = bins[np.where(s['ams'] == s['ams'].max())][0] Z = asimovZ( Signal[0]['outputScore'][np.where(bins[:-1] == sigMax_index)], totalBkgOutput[np.where(bins[:-1] == sigMax_index)], np.sqrt(totalBkgVar[np.where(bins[:-1] == sigMax_index)]), syst=False) Z_syst = asimovZ( Signal[0]['outputScore'][np.where(bins[:-1] == sigMax_index)], totalBkgOutput[np.where(bins[:-1] == sigMax_index)], np.sqrt(totalBkgVar[np.where(bins[:-1] == sigMax_index)]), syst=True) print 'RooStats: ', s['sig'].max(), sigMax_index, Z, Z_syst print 'asmiov : ', s['ams'].max(), amsMax_index x = np.array([s['m_stop'] for s in Signal], dtype=float) y = np.array([s['m_X'] for s in Signal], dtype=float) z = np.array([s['sig_max'] for s in Signal], dtype=float) #print x, y, z print Signal[0]['outputScore'][np.where( bins[:-1] >= sigMax_index)], Signal[0]['output_var'][np.where( bins[:-1] >= sigMax_index)] print totalBkgOutput[np.where( bins[:-1] >= sigMax_index)], totalBkgVar[np.where( bins[:-1] >= sigMax_index)] print np.sum(Signal[0]['outputScore'][np.where( bins[:-1] >= sigMax_index)]), np.sqrt( np.sum(Signal[0]['output_var'][np.where( bins[:-1] >= sigMax_index)]**2)) print np.sum(totalBkgOutput[np.where(bins[:-1] >= sigMax_index)]), np.sqrt( np.sum(totalBkgVar[np.where(bins[:-1] >= sigMax_index)]**2)) print Signal[0]['outputScore'], Signal[0]['output_var'] print totalBkgOutput, totalBkgVar # Set up a regular grid of interpolation points print('Plotting the output score...') fig = plt.figure(figsize=(8, 6)) ax1 = plt.subplot2grid((4, 4), (0, 0), colspan=4, rowspan=4) ax1.set_xlim((bins[0], bins[-1])) ax1.set_xlabel('Output score', horizontalalignment='right', x=1.0) ax1.set_ylabel("Events", horizontalalignment='right', y=1.0) sb_ratio = Signal[0]['outputScore'].sum() / totalBkgOutput.sum() #if sb_ratio < 0.2: # #ATTENTION! 
Simplified error propagation (treated as uncorrelated) # scaled = Signal[0]['outputScore'] / Signal[0]['outputScore'].sum() * totalBkgOutput.sum() # scaled_var = scaled*scaled * ( (Signal[0]['output_var']/Signal[0]['outputScore'])**2 + (totalBkgVar.sum()/totalBkgOutput.sum())**2 + (Signal[0]['output_var'].sum()/Signal[0]['outputScore'].sum())**2 ) # scaled_label = 'Signal scaled to Bkg' # #else: scaled = Signal[0]['outputScore'] scaled_var = Signal[0]['output_var'] scaled_label = 'Signal' multib = plt.bar(center, Background[4]['outputScore'], width=db, yerr=np.sqrt(Background[4]['output_var']), color='seagreen', alpha=0.5, error_kw=dict(ecolor='seagreen', lw=1.5), label='multiboson') ttV = plt.bar(center, Background[3]['outputScore'], width=db, yerr=np.sqrt(Background[4]['output_var']), color='lightcoral', alpha=0.5, error_kw=dict(ecolor='lightcoral', lw=1.5), label='ttV', bottom=Background[4]['outputScore']) w = plt.bar(center, Background[2]['outputScore'], width=db, yerr=np.sqrt(Background[2]['output_var']), color='gold', alpha=0.5, error_kw=dict(ecolor='gold', lw=1.5), label='W+jets', bottom=Background[4]['outputScore'] + Background[3]['outputScore']) st = plt.bar(center, Background[1]['outputScore'], width=db, yerr=np.sqrt(Background[1]['output_var']), color='limegreen', alpha=0.5, error_kw=dict(ecolor='limegreen', lw=1.5), label='singletop', bottom=Background[4]['outputScore'] + Background[3]['outputScore'] + Background[2]['outputScore']) tt = plt.bar(center, Background[0]['outputScore'], width=db, yerr=np.sqrt(Background[0]['output_var']), color='dodgerblue', alpha=0.5, error_kw=dict(ecolor='dodgerblue', lw=1.5), label='ttbar', bottom=Background[4]['outputScore'] + Background[3]['outputScore'] + Background[2]['outputScore'] + Background[1]['outputScore']) plt.bar(center, Signal[0]['outputScore'], width=db, yerr=np.sqrt(Signal[0]['output_var']), label=Signal[0]['name'], color='r', alpha=0.5, error_kw=dict(ecolor='r', lw=1.5)) #plt.step(center, Signal[0]['outputScore'], width=db, yerr= np.sqrt(Signal[0]['output_var']), label=Signal[0]['name'], color='r', error_kw=dict(ecolor='r', lw=1.5)) ax1.set_ylim((0.1, totalBkgOutput.max() * (15.))) ax1.set_yscale('log') leg = plt.legend(loc="best", frameon=False) AtlasStyle_mpl.ATLASLabel(ax1, 0.14, 0.84, 'Work in progress') AtlasStyle_mpl.LumiLabel(ax1, 0.14, 0.79, lumi=LUMI * 0.001) plt.savefig("plots/" + modelfile + "_eval-bWN-500-380_outputScore.pdf") plt.savefig("plots/" + modelfile + "_eval-bWN-500-380_outputScore.png") plt.close() print('Plotting significance...') fig = plt.figure(figsize=(8, 6)) ax1 = plt.subplot2grid((4, 4), (0, 0), colspan=4, rowspan=4) ax1.set_xlim((bins[0], bins[-1])) ax1.set_xlabel('Output score', horizontalalignment='right', x=1.0) ax1.set_ylabel("Z", horizontalalignment='right', y=1.0) plt.plot(center, Signal[0]['ams'], 'k-', color='cornflowerblue', label='Asimov Z (max = %0.3f at %0.2f)' % (s['ams'].max(), amsMax_index)) plt.fill_between(center, Signal[0]['ams'] - Signal[0]['ams_err'], Signal[0]['ams'] + Signal[0]['ams_err'], alpha=0.2, edgecolor='cornflowerblue', facecolor='cornflowerblue', linewidth=0) ax1.set_ylim((0., Signal[0]['ams'].max() * (1.5))) plt.plot(center, Signal[0]['sig'], 'k-', color='darkred', label='Binomial Z (max = %0.3f at %0.2f)' % (s['sig'].max(), sigMax_index)) plt.fill_between(center, Signal[0]['sig'] - Signal[0]['sig_err'], Signal[0]['sig'] + Signal[0]['sig_err'], alpha=0.2, edgecolor='darkred', facecolor='darkred', linewidth=0) plt.plot(center, len(center) * [3.], '--', color='grey', 
alpha=0.5) plt.plot(center, len(center) * [5.], '--', color='red', alpha=0.5) leg = plt.legend(loc="best", frameon=False) AtlasStyle_mpl.ATLASLabel(ax1, 0.14, 0.84, 'Work in progress') AtlasStyle_mpl.LumiLabel(ax1, 0.14, 0.79, lumi=LUMI * 0.001) plt.savefig("plots/" + modelfile + "_Significance_bWN-500-380.pdf") plt.savefig("plots/" + modelfile + "_Significance_bWN-500-380.png") plt.close() print('Plotting ROC...') fig = plt.figure(figsize=(8, 6)) ax1 = plt.subplot2grid((4, 4), (0, 0), colspan=4, rowspan=4) ax1.set_xlim((bins[0], bins[-1])) ax1.set_ylim((0, 1)) ax1.set_xlabel('$\epsilon_{Sig.}$', horizontalalignment='right', x=1.0) ax1.set_ylabel("$r_{Bkg.}$", horizontalalignment='right', y=1.0) auc = np.trapz(s['roc'][:, 0], s['roc'][:, 1], dx=db) print 'Area under ROC?!: ', auc plt.plot(s['roc'][:, 0], s['roc'][:, 1], 'k-', color='cornflowerblue', label='ROC (AUC = %0.4f)' % (auc)) #plt.fill_between(center, Signal[0]['ams']-Signal[0]['ams_err'], Signal[0]['ams']+Signal[0]['ams_err'], alpha=0.2, edgecolor='cornflowerblue', facecolor='cornflowerblue', linewidth=0) plt.plot([0, 1], [1, 0], '--', color=(0.6, 0.6, 0.6), label='Luck') leg = plt.legend(loc="lower left", frameon=False) AtlasStyle_mpl.ATLASLabel(ax1, 0.14, 0.28, 'Work in progress') AtlasStyle_mpl.LumiLabel(ax1, 0.14, 0.23, lumi=LUMI * 0.001) plt.savefig("plots/" + modelfile + "_ROC_bWN-500-380.pdf") plt.savefig("plots/" + modelfile + "_ROC_bWN-500-380.png") plt.close()
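
# ---------------------------------------------------------------------------
# Hedged note: the AUC above is taken as np.trapz(eff_sig, rejection), which
# depends on the ordering and orientation of the scan points (hence the
# 'Area under ROC?!' printout). One way to get the conventional TPR-vs-FPR
# area from the same (eff_sig, 1 - eff_bkg) pairs:
# ---------------------------------------------------------------------------
def roc_auc_from_points(roc_points):
    # roc_points[:, 0] = signal efficiency, roc_points[:, 1] = background rejection
    eff_sig = roc_points[:, 0]
    eff_bkg = 1. - roc_points[:, 1]
    order = np.argsort(eff_bkg)   # integrate along ascending background efficiency
    return np.trapz(eff_sig[order], eff_bkg[order])

# auc = roc_auc_from_points(s['roc'])   # same input as the np.trapz call above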
def main(): ########################### # Read and evaluate signals ########################### Signal = [] for smp in SIGNAL: first = True for s in smp: print 'Sample:\t', s x, y = pickBenchmark(s) _df, _weight = loadDataFrame(os.path.join(inputDir, s + '/'), PRESELECTION, VAR, WEIGHTS, LUMI) print _df.shape, _weight.shape if first: df = _df.copy() weight = _weight.copy() first = False else: df = pd.concat((df, _df), ignore_index=True) weight = pd.concat((weight, _weight), ignore_index=True) sigma = np.sum(weight.values**2.) Signal.append({ 'name': s[6:], 'm_stop': x, 'm_X': y, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'nSigma': np.sqrt(sigma) }) del df, weight ########################### # Read and evaluate backgrounds ########################### totBkgEvents = 0. totBkgVar = 0. Background = [] for smp in BACKGROUND: first = True for b in smp: print 'Sample:\t', b _df, _weight = loadDataFrame(os.path.join(inputDir, b + '/'), PRESELECTION, VAR, WEIGHTS, LUMI) print _df.shape, _weight.shape if first: df = _df.copy() weight = _weight.copy() first = False else: df = pd.concat((df, _df), ignore_index=True) weight = pd.concat((weight, _weight), ignore_index=True) totBkgEvents += weight.sum() totBkgVar += np.sum(weight.values**2.) Background.append({ 'name': b, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'nSigma': np.sqrt(np.sum(weight.values**2.)) }) del df, weight total_rel_err = np.sqrt(totBkgVar / totBkgEvents**2. + (totBkgEvents * 0.25)**2.) print 'Bkg:\t%.2f +/- %.2f' % (totBkgEvents, np.sqrt(totBkgVar)) for s in Signal: significance = [] significance_err = [] asimov = [] asimov_err = [] s['Z'] = ROOT.RooStats.NumberCountingUtils.BinomialExpZ( s['nEvents'], totBkgEvents, total_rel_err) s['ams'] = asimovZ(s['nEvents'], totBkgEvents, np.sqrt(totBkgVar)) print 'Z:\t%.2f' % s['Z'] print 'Asimov:\t%.2f' % s['ams'] print 'Sig %s:\t%.2f +/- %.2f' % (s['name'], s['nEvents'], s['nSigma']) print 'r_bkg:\t%.2f' % (1. - (totBkgEvents / BWN_PRESEL_BKG)) print 'e_sig:\t%.2f' % ((s['nEvents'] / BWN_PRESEL_SIG))
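
# ---------------------------------------------------------------------------
# Hedged reference: BinomialExpZ takes the *relative* background uncertainty
# as its third argument. The per-bin scans earlier in this section build it as
# the statistical part and a flat 25% systematic added in quadrature; a
# self-contained illustration (yields are made up):
# ---------------------------------------------------------------------------
def total_relative_uncertainty_sketch(n_bkg, bkg_var, flat_syst=0.25):
    # relative stat. uncertainty from the sum of squared weights, combined in
    # quadrature with a flat relative systematic
    rel_stat = np.sqrt(bkg_var) / n_bkg if n_bkg > 0. else 0.
    return np.sqrt(rel_stat**2 + flat_syst**2)

# example with illustrative yields:
#   total_relative_uncertainty_sketch(12.3, 2.1**2)  ->  ~0.30
#   Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(4.2, 12.3, 0.30)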
def main(): for m in MODELS: modelDir = DIR + m['mdir'] + '.h5' DatasetDir = 'TrainedModels/datasets/' if os.path.exists(os.path.join(DIR, m['mdir'] + '_scaler.pkl')): m['scaler'] = joblib.load( os.path.join(DIR, m['mdir'] + '_scaler.pkl')) else: m['scaler'] = None infofile = open(modelDir.replace('.h5', '_infofile.txt')) infos = infofile.readlines() m['analysis'] = infos[0].replace('Used analysis method: ', '').replace('\n', '') m['dataset'] = DatasetDir + infos[3].replace('Used dataset: ', '').replace('\n', '') m['VAR'] = infos[5].replace('Used variables for training: ', '').replace('\n', '').split() m['recurrent'] = False if m['analysis'].lower() == 'rnn': m['recurrent'] = True m['seq_scaler'] = m['dataset'] + '_scaling.json' if 'nn' in m['analysis'].lower(): m['model'] = load_model(os.path.join(DIR, m['mdir'] + '.h5')) elif 'bdt' in m['analysis'].lower(): m['model'] = joblib.load(os.path.join(DIR, m['mdir'] + '.h5')) print '#----MODEL----#' print '\t', m['mdir'] ########################### # Read and evaluate signals ########################### m['Signal'] = [] for smp in SIGNAL: first = True for s in smp: print 'Sample:\t', s x, y = pickBenchmark(s) if not m['recurrent']: _df, _weight = loadDataFrame( os.path.join(inputDir, s + '/'), PRESELECTION, m['VAR'], WEIGHTS, LUMI) print _df.shape, _weight.shape if first: df = _df.copy() weight = _weight.copy() first = False else: df = pd.concat((df, _df), ignore_index=True) weight = pd.concat((weight, _weight), ignore_index=True) else: _df, _weight, collection = loadSequentialDataFrame( os.path.join(inputDir, s + '/'), PRESELECTION, COLLECTION, REMOVE_VAR, m['VAR'], WEIGHTS, LUMI) print _df.shape, _weight.shape, collection[0]['df'].shape if first: df = _df.copy() weight = _weight.copy() seq = collection[0]['df'].copy() first = False else: df = pd.concat((df, _df), ignore_index=True) weight = pd.concat((weight, _weight), ignore_index=True) seq = pd.concat((seq, collection[0]['df']), ignore_index=True) if not m['recurrent']: m['y_pred_sig'] = evaluate(m['model'], df.values, m['scaler'], method=m['analysis']) m['y_sig'] = np.ones(m['y_pred_sig'].shape[0]) else: collection[0]['df'] = seq.copy() m['y_pred_sig'] = evaluate(m['model'], df.values, m['scaler'], m['seq_scaler'], method=m['analysis'], col=collection) m['y_sig'] = np.ones(m['y_pred_sig'].shape[0]) bin_index = np.digitize( m['y_pred_sig'][:, 0], bins[1:] ) # get the bin index of the output score for each event outputWeighted = [] outputWeightedVar = [] outputMC = [] outputMCVar = [] for i in range(len(bins[1:])): w = weight.values[np.where(bin_index == i)[0]] sigma = np.sum(w**2.) outputWeighted.append(w.sum()) outputWeightedVar.append(sigma) outputMC.append(len(w)) outputMCVar.append(np.sqrt(len(w))) m['Signal'].append({ 'name': s[6:], 'm_stop': x, 'm_X': y, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC), 'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar) }) del df, weight, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar ############################### # Read and evaluate backgrounds ############################### m['totBkgEvents'] = 0. m['totBkgVar'] = 0. 
m['Background'] = [] for smp in BACKGROUND: first = True for b in smp: print 'Sample:\t', b if not m['recurrent']: _df, _weight = loadDataFrame( os.path.join(inputDir, b + '/'), PRESELECTION, m['VAR'], WEIGHTS, LUMI) print _df.shape, _weight.shape if first: df = _df.copy() weight = _weight.copy() first = False else: df = pd.concat((df, _df), ignore_index=True) weight = pd.concat((weight, _weight), ignore_index=True) else: _df, _weight, collection = loadSequentialDataFrame( os.path.join(inputDir, b + '/'), PRESELECTION, COLLECTION, REMOVE_VAR, m['VAR'], WEIGHTS, LUMI) print _df.shape, _weight.shape, collection[0]['df'].shape if first: df = _df.copy() weight = _weight.copy() seq = collection[0]['df'].copy() first = False else: df = pd.concat((df, _df), ignore_index=True) weight = pd.concat((weight, _weight), ignore_index=True) seq = pd.concat((seq, collection[0]['df']), ignore_index=True) if not m['recurrent']: print df.shape, weight.shape m['_'.join(['y_pred', b])] = evaluate(m['model'], df.values, m['scaler'], method=m['analysis']) m['_'.join(['y', b])] = np.zeros(m['_'.join(['y_pred', b])].shape[0]) else: collection[0]['df'] = seq print df.shape, weight.shape, collection[0]['df'].shape m['_'.join(['y_pred', b])] = evaluate(m['model'], df.values, m['scaler'], m['seq_scaler'], method=m['analysis'], col=collection) m['_'.join(['y', b])] = np.zeros(m['_'.join(['y_pred', b])].shape[0]) bin_index = np.digitize(m['_'.join(['y_pred', b])][:, 0], bins[1:]) outputWeighted = [] outputWeightedVar = [] outputMC = [] outputMCVar = [] m['totBkgEvents'] += weight.sum() m['totBkgVar'] += np.sum(weight.values**2.) for i in range(len(bins[1:])): w = weight.values[np.where(bin_index == i)[0]] sigma = np.sum(w**2.) outputWeighted.append(w.sum()) outputWeightedVar.append(sigma) outputMC.append(len(w)) outputMCVar.append(len(w)) m['Background'].append({ 'name': b, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC), 'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar) }) del df, weight, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar m['totalBkgOutput'] = np.array( [b['outputScore'] for b in m['Background']]) m['totalBkgOutput'] = m['totalBkgOutput'].sum(axis=0) m['totalBkgVar'] = np.array([b['output_var'] for b in m['Background']]) m['totalBkgVar'] = m['totalBkgVar'].sum(axis=0) for s in m['Signal']: m['roc'] = [] m['roc_err'] = [] m['tot_rel'] = np.sqrt(np.sum(s['output_var'])) / s['nEvents'] for i in range(len(bins[1:])): eff_sig = s['outputScore'][i:].sum() / s['nEvents'] eff_bkg = m['totalBkgOutput'][i:].sum( ) / m['totalBkgOutput'].sum() err_sig = np.sqrt(np.sum(s['output_var'][i:])) / s['nEvents'] err_bkg = np.sqrt(np.sum( m['totalBkgVar'][i:])) / m['totalBkgOutput'].sum() if m['totalBkgOutput'][i:].sum() > 0.: rel_err_bkg = np.sqrt(np.sum( m['totalBkgVar'][i:])) / m['totalBkgOutput'][i:].sum() else: rel_err_bkg = 0. if s['outputScore'][i:].sum() > 0.: rel_err_sig = np.sqrt(np.sum( s['output_var'][i:])) / s['outputScore'][i:].sum() else: rel_err_sig = 0. m['total_rel_err'] = np.sqrt(rel_err_bkg**2. + 0.25**2.) m['roc'].append((eff_sig, 1 - eff_bkg)) roc_plus_sig = eff_sig + err_sig roc_mins_sig = eff_sig - err_sig roc_plus_bkg = 1 - (eff_bkg + err_bkg) roc_mins_bkg = 1 - (eff_bkg - err_bkg) #roc_err_sig = abs(roc_plus_sig - roc_mins_sig) / 2. roc_err_bkg = abs(roc_plus_bkg - roc_mins_bkg) / 2. 
m['roc_err'].append(roc_err_bkg) m['roc'] = np.array(m['roc']) m['roc_err'] = np.array(m['roc_err']) #m['y_bkg'] = np.empty(0) #m['y_pred_bkg'] = np.empty(0) #for b in BACKGROUND: # m['y_bkg'] = np.concatenate((m['y_bkg'], m['_'.join(['y',b])])) # m['y_pred_bkg'] = np.concatenate((m['y_pred_bkg'], m['_'.join(['y_pred',b])][:,0])) #m['y'] = np.concatenate((m['y_sig'], m['y_bkg'])) #m['y_pred'] = np.concatenate((m['y_pred_sig'][:,0], m['y_pred_bkg'])) #m['fpr'], m['tpr'], m['threshold'] = roc_curve(m['y'], m['y_pred']) #m['auc'] = roc_auc_score(m['y'], m['y_pred']) print('Plotting ROC curve ...') fig = plt.figure(figsize=(8, 6)) ax1 = plt.subplot2grid((4, 4), (0, 0), colspan=4, rowspan=4) #ax1.set_xlim((bins[0], bins[-1])) #ax1.set_ylim((0, 1)) ax1.set_xlabel('$\epsilon_{Sig.}$', horizontalalignment='right', x=1.0) ax1.set_ylabel('$r_{Bkg.}$', horizontalalignment='right', y=1.0) for m in MODELS: m['auc'] = np.trapz(m['roc'][:, 0], m['roc'][:, 1], dx=db) print 'Area under ROC:\t', m['auc'] if logScale: ax1.set_yscale('log') plt.plot(m['roc'][:, 0], 1. / (1. - m['roc'][:, 1]), 'k-', color=m['color'], label='%s (AUC = %0.4f)' % (m['name'], m['auc'])) plt.fill_between(m['roc'][:, 0], 1. / (1. - (m['roc'][:, 1] - m['roc_err'])), 1. / (1. - (m['roc'][:, 1] + m['roc_err'])), alpha=0.2, edgecolor=m['color'], facecolor=m['color'], linewidth=0) #plt.plot(m['tpr'], 1./m['fpr'], lw=2, label=m['name']+' (AUC = %0.3f)'%(m['auc'])) else: plt.plot(m['roc'][:, 0], m['roc'][:, 1], 'k-', color=m['color'], label='%s (AUC = %0.2f)' % (m['name'], m['auc'])) plt.fill_between(m['roc'][:, 0], (m['roc'][:, 1] - m['roc_err']), (m['roc'][:, 1] + m['roc_err']), alpha=0.2, edgecolor=m['color'], facecolor=m['color'], linewidth=0) #plt.plot(m['tpr'], 1.-m['fpr'], lw=2, label=m['name']+' (AUC = %0.3f)'%(m['auc'])) ax1.set_xlim((0, 0.16)) ax1.set_ylim((0.975, 1.0)) #plt.plot([0, 1], [1, 0], '--', color=(0.6, 0.6, 0.6), label='Luck') for p in WP: p['eff_sig'] = p['sig'] / BWN_PRESEL_SIG p['eff_bkg'] = p['bkg'] / BWN_PRESEL_BKG if p['legend']: plt.plot([p['eff_sig']], [1 - p['eff_bkg']], '.', color=p['color'], label=p['name']) else: plt.plot([p['eff_sig']], [1 - p['eff_bkg']], '.', color=p['color']) leg = plt.legend(loc="lower left", frameon=False) #AtlasStyle_mpl.ATLASLabel(ax1, 0.02, 0.25, 'Work in progress') AtlasStyle_mpl.Text(ax1, 0.14, 0.52, 'Simulation') AtlasStyle_mpl.LumiLabel(ax1, 0.14, 0.46, lumi=LUMI * 0.001) plt.savefig(SAVEDIR + FILENAME + '.pdf') plt.savefig(SAVEDIR + FILENAME + '.png') plt.close()
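
# ---------------------------------------------------------------------------
# Hedged sketch of the commented-out sklearn path above: a per-event ROC curve
# from the stored scores and labels, with the MC event weights passed as
# sample_weight.
# ---------------------------------------------------------------------------
def per_event_roc(scores_sig, scores_bkg, w_sig, w_bkg):
    from sklearn.metrics import roc_curve, roc_auc_score
    y_true = np.concatenate((np.ones(len(scores_sig)), np.zeros(len(scores_bkg))))
    y_score = np.concatenate((scores_sig, scores_bkg))
    weights = np.concatenate((w_sig, w_bkg))
    fpr, tpr, thresholds = roc_curve(y_true, y_score, sample_weight=weights)
    auc = roc_auc_score(y_true, y_score, sample_weight=weights)
    return fpr, tpr, auc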
def evaluate_signalGridCuts(modelDir, resolution=np.array([50,0,1], dtype=float), save=False, fileName='Test'): print('Evaluating singal grid...') if fileName=='Grid_test': fileName=modelDir.replace('TrainedModels/models/','').replace('.h5','') infofile = open(modelDir.replace('.h5','_infofile.txt')) infos = infofile.readlines() #Parse Strings for correct datatypes variables=infos[4].replace('Used variables for training: ','').replace('\n','').split() weights=infos[5].replace('Used weights: ', '').replace('\n','').split() lumi=float(infos[7].replace('Used Lumi: ','').replace('\n','')) background=infos[9].replace('Used background files: ','').replace('; \n','').replace(' ','').split(';') preselection = preselection_evaluate print 'Using the following preselection to evaluate:' , preselection signal = ['stop_bWN_250_100', 'stop_bWN_250_130', 'stop_bWN_250_160', 'stop_bWN_300_150', 'stop_bWN_300_180', 'stop_bWN_300_210', 'stop_bWN_350_185', 'stop_bWN_350_200', 'stop_bWN_350_230', 'stop_bWN_350_260', 'stop_bWN_400_235', 'stop_bWN_400_250', 'stop_bWN_400_280', 'stop_bWN_400_310', 'stop_bWN_450_285', 'stop_bWN_450_300', 'stop_bWN_450_330', 'stop_bWN_450_360', 'stop_bWN_500_335', 'stop_bWN_500_350', 'stop_bWN_500_380', 'stop_bWN_550_385', 'stop_bWN_550_400', 'stop_bWN_550_430', 'stop_bWN_550_460', 'stop_bWN_600_435', 'stop_bWN_600_450', 'stop_bWN_600_480', 'stop_bWN_600_510', 'stop_bWN_650_485', 'stop_bWN_650_500', 'stop_bWN_650_530', 'stop_bWN_650_560'] #Get Scaler and model from modelDir model = load_model(modelDir) scalerDir=modelDir.replace('.h5','_scaler.pkl') scaler=joblib.load(scalerDir) #Evaluate db = (resolution[2] - resolution[1]) / resolution[0] # bin width in discriminator distribution bins = np.arange(resolution[1], resolution[2]+db, db) # bin edges in discriminator distribution ########################### # Read and evaluate signals ########################### Signal = [] for s in signal: x, y = pickBenchmark(s) df, weight = loadDataFrame(os.path.join(inputDirSig, s+'/'), preselection, variables, weights, lumi) y_hat = evaluate(model, df.values, scaler) bin_index = np.digitize(y_hat[:,0], bins[1:]) # get the bin index of the output score for each event outputWeighted = [] outputWeightedVar = [] outputMC = [] outputMCVar = [] for i in range(len(bins[1:])): w = weight.values[np.where(bin_index==i)[0]] sigma = np.sum(w**2.) outputWeighted.append(w.sum()) outputWeightedVar.append(sigma) outputMC.append(len(w)) outputMCVar.append(np.sqrt(len(w))) Signal.append({'name':s, 'm_stop':x, 'm_X':y, 'dataset':df, 'weight':weight, 'nEvents':weight.sum(), 'y_pred':y_hat, 'outputScore':np.array(outputWeighted), 'outputMC':np.array(outputMC), 'output_var':np.array(outputWeightedVar), 'outputMC_var':np.array(outputMCVar)}) del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar ########################### # Read and evaluate backgrounds ########################### totBkgEvents = 0. totBkgVar = 0. Background = [] for b in background: df, weight = loadDataFrame(os.path.join(inputDirBkg, b+'/'), preselection, variables, weights, lumi) y_hat = evaluate(model, df.values, scaler) bin_index = np.digitize(y_hat[:,0], bins[1:]) outputWeighted = [] outputWeightedVar = [] outputMC = [] outputMCVar = [] totBkgEvents += weight.sum() totBkgVar += np.sum(weight.values**2.) for i in range(len(bins[1:])): w = weight.values[np.where(bin_index==i)[0]] sigma = np.sum(w**2.) 
outputWeighted.append(w.sum()) outputWeightedVar.append(sigma) outputMC.append(len(w)) outputMCVar.append(len(w)) Background.append({'name':b, 'dataset':df, 'weight':weight, 'nEvents':weight.sum(), 'y_pred':y_hat, 'outputScore':np.array(outputWeighted), 'outputMC':np.array(outputMC), 'output_var':np.array(outputWeightedVar), 'outputMC_var':np.array(outputMCVar)}) del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar totalBkgOutput = np.array([b['outputScore'] for b in Background]) totalBkgOutput = totalBkgOutput.sum(axis=0) totalBkgVar = np.array([b['output_var'] for b in Background]) totalBkgVar = totalBkgVar.sum(axis=0) for s in Signal: significance = [] significance_err = [] tot_rel = np.sqrt(np.sum(s['output_var'])) / s['nEvents'] for i in range(len(bins[1:])): #eff_sig = s['outputScore'][:i+1].sum() / s['nEvents'] #eff_bkg = totalBkgOutput[:i+1].sum() / totalBkgOutput.sum() eff_sig = s['outputScore'][i:-1].sum() / s['nEvents'] eff_bkg = totalBkgOutput[i:-1].sum() / totalBkgOutput.sum() #err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['nEvents'] #err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput.sum() err_sig = np.sqrt(np.sum(s['output_var'][i:-1])) / s['nEvents'] err_bkg = np.sqrt(np.sum(totalBkgVar[i:-1])) / totalBkgOutput.sum() #if totalBkgOutput[:i+1].sum() > 0.: # rel_err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput[:i+1].sum() if totalBkgOutput[i:-1].sum() > 0.: rel_err_bkg = np.sqrt(np.sum(totalBkgVar[i:-1])) / totalBkgOutput[i:-1].sum() else: rel_err_bkg = 0. #if s['outputScore'][:i+1].sum() > 0.: # rel_err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['outputScore'][:i+1].sum() if s['outputScore'][i:-1].sum() > 0.: rel_err_sig = np.sqrt(np.sum(s['output_var'][i:-1])) / s['outputScore'][i:-1].sum() else: rel_err_sig = 0. total_rel_err = np.sqrt(rel_err_sig**2. + rel_err_bkg**2. + 0.25**2.) if (eff_sig == 0) or (eff_bkg == 0): Z = 0. Z_err = 0. 
elif (err_sig / eff_sig > 0.75) or (err_bkg / eff_bkg > 0.75): Z = 0 Z_err = 0 else: #Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(s['outputScore'][:i+1].sum(), totalBkgOutput[:i+1].sum(), total_rel_err) Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(s['outputScore'][i:-1].sum(), totalBkgOutput[i:-1].sum(), total_rel_err) Zplus_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ((eff_sig + err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err) Zmins_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ((eff_sig - err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err) Zplus_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(eff_sig * s['nEvents'], (eff_bkg + err_bkg) * totalBkgOutput.sum(), total_rel_err) Zmins_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(eff_sig * s['nEvents'], (eff_bkg - err_bkg) * totalBkgOutput.sum(), total_rel_err) Z_err_sig = abs(Zplus_sig - Zmins_sig) / 2 Z_err_bkg = abs(Zplus_bkg - Zmins_bkg) / 2 Z_err = np.sqrt(Z_err_sig**2 + Z_err_bkg**2) significance.append(Z) significance_err.append(Z_err) s['sig'] = np.array(significance) s['sig_max'] = s['sig'].max() s['sig_err'] = np.array(significance_err) print s['sig'] print s['sig'].max(), bins[np.where(s['sig'] == s['sig'].max())] x = np.array([s['m_stop'] for s in Signal], dtype=float) y = np.array([s['m_X'] for s in Signal], dtype=float) z = np.array([s['sig_max'] for s in Signal],dtype=float) print x, y, z # Set up a regular grid of interpolation points fig, ax1 = plt.subplots(figsize=(8,6)) xi, yi = np.linspace(x.min(), x.max(), 100), np.linspace(y.min(), y.max(), 100) xi, yi = np.meshgrid(xi, yi) # Interpolate rbf = scipy.interpolate.LinearNDInterpolator(points=np.array((x, y)).T, values=z) zi = rbf(xi, yi) im = ax1.imshow(zi, vmin=0., vmax=5., origin='lower', extent=[x.min(), x.max(), y.min(), y.max()]) cbar = plt.colorbar(im) cbar.set_label('Significance') ax1.set_xlabel(r'$m_{\tilde{t}}$') ax1.set_xlim([x.min(), x.max()]) ax1.set_ylabel(r'$m_{\chi}$') ax1.set_ylim([y.min(), y.max()]) plt.scatter(x, y, c='black') plt.plot(x, x-84., color='black') plt.plot(x, x-175., color='black') AtlasStyle_mpl.ATLASLabel(ax1, 0.022, 0.925, 'Work in progress') AtlasStyle_mpl.LumiLabel(ax1, 0.022, 0.875, lumi=lumi*0.001) #plt.show() if save: if not os.path.exists('./plots/'): os.makedirs('./plots/') print('Creating folder plots') isFile = True n = 1 while isFile: filepath = './plots/' + fileName + '_evaluated_grid_cuts_' + str(n) + '_infofile.txt' if os.path.isfile(filepath) and filepath.endswith('.txt'): n += 1 isFile=True else: isFile=False infofile = open(filepath, 'w') print('Saving evaluation informations to ' , filepath) presels = '' for pre in preselection_evaluate: if pre['type'] == 'condition': presels += pre['name'] + '-threshold: ' + str(pre['threshold']) + ' type: ' + pre['type'] + ' variable: ' + pre['variable'] + ' lessthan: ' + str(pre['lessthan']) + ' and morethan: ' + str(pre['morethan']) + '; ' else: presels += pre['name'] + '-threshold: ' + str(pre['threshold']) + ' type: ' + pre['type'] + '; ' infofile.write('Used preselection for evaluation: ' + presels) infofile.close() plt.savefig('plots/'+fileName+'_evaluated_grid_cuts_' + str(n) + '.pdf') plt.savefig('plots/'+fileName+'_evaluated_grid_cuts_' + str(n) + '.png') plt.close()
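
# ---------------------------------------------------------------------------
# Minimal, self-contained version of the interpolation step used above, on toy
# benchmark points (values are illustrative). LinearNDInterpolator returns NaN
# outside the convex hull of the input points, which shows up as blank regions
# in the significance map.
# ---------------------------------------------------------------------------
def interpolation_sketch():
    import scipy.interpolate
    x = np.array([300., 400., 500., 600.])   # illustrative m_stop values
    y = np.array([150., 180., 350., 480.])   # illustrative m_X values
    z = np.array([1.2, 2.5, 0.8, 1.6])       # illustrative significances
    xi, yi = np.meshgrid(np.linspace(x.min(), x.max(), 100),
                         np.linspace(y.min(), y.max(), 100))
    interp = scipy.interpolate.LinearNDInterpolator(points=np.array((x, y)).T,
                                                    values=z)
    return interp(xi, yi)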
def evaluate_signalGrid(modelDir, resolution=np.array([50, 0, 1], dtype=float), save=False, fileName='Test'):
    print('Evaluating signal grid...')

    infofile = open(modelDir.replace('.h5', '_infofile.txt'))
    infos = infofile.readlines()
    infofile.close()

    # Parse strings from the infofile into the correct datatypes
    variables = infos[4].replace('Used variables for training: ', '').replace('\n', '').split()
    weights = infos[5].replace('Used weights: ', '').replace('\n', '').split()
    preselection_raw = infos[6].replace('Used preselection: ', '').replace('; \n', '').split(';')
    preselection = []
    for x in preselection_raw:
        xdict = {}
        xdict['name'] = x.split()[0].split('-')[0]
        xdict['threshold'] = float(x.split()[1])
        xdict['type'] = x.split()[3]
        if xdict['type'] == 'condition':
            xdict['variable'] = x.split()[5]
            xdict['lessthan'] = float(x.split()[7])
            xdict['morethan'] = float(x.split()[10])
        preselection.append(xdict)
    lumi = float(infos[7].replace('Used Lumi: ', '').replace('\n', ''))
    background = infos[9].replace('Used background files: ', '').replace('; \n', '').replace(' ', '').split(';')
    #signal = infos[8].replace('Used signal files: ', '').replace('; \n', '').replace(' ', '').split(';')
    signal = ['stop_bWN_250_100', 'stop_bWN_250_130', 'stop_bWN_250_160', 'stop_bWN_300_150', 'stop_bWN_300_180',
              'stop_bWN_300_210', 'stop_bWN_350_185', 'stop_bWN_350_200', 'stop_bWN_350_230', 'stop_bWN_350_260',
              'stop_bWN_400_235', 'stop_bWN_400_250', 'stop_bWN_400_280', 'stop_bWN_400_310', 'stop_bWN_450_285',
              'stop_bWN_450_300', 'stop_bWN_450_330', 'stop_bWN_450_360', 'stop_bWN_500_335', 'stop_bWN_500_350',
              'stop_bWN_500_380', 'stop_bWN_550_385', 'stop_bWN_550_400', 'stop_bWN_550_430', 'stop_bWN_550_460',
              'stop_bWN_600_435', 'stop_bWN_600_450', 'stop_bWN_600_480', 'stop_bWN_600_510', 'stop_bWN_650_485',
              'stop_bWN_650_500', 'stop_bWN_650_530', 'stop_bWN_650_560']

    # For debugging
    #print variables, type(variables)
    #print weights, type(weights)
    #print preselection, type(preselection[1])
    #print lumi, type(lumi)
    #print signal, type(signal)
    #print background, type(background)

    # Get scaler and model from modelDir
    model = load_model(modelDir)
    scalerDir = modelDir.replace('.h5', '_scaler.pkl')
    scaler = joblib.load(scalerDir)

    # Evaluate
    db = (resolution[2] - resolution[1]) / resolution[0]        # bin width in discriminator distribution
    bins = np.arange(resolution[1], resolution[2] + db, db)     # bin edges in discriminator distribution

    ###########################
    # Read and evaluate signals
    ###########################
    statInfoSig = {}  # raw MC statistics per signal sample
    Signal = []
    for s in signal:
        x, y = pickBenchmark(s)
        df, weight = loadDataFrame(os.path.join(inputDirSig, s + '/'), preselection, variables, weights, lumi)
        statInfoSig[s] = df.shape[0]
        y_hat = evaluate(model, df.values, scaler)
        bin_index = np.digitize(y_hat[:, 0], bins[1:])  # get the bin index of the output score for each event
        outputWeighted = []
        outputWeightedVar = []
        outputMC = []
        outputMCVar = []
        for i in range(len(bins[1:])):
            w = weight.values[np.where(bin_index == i)[0]]
            sigma = np.sum(w**2.)
            outputWeighted.append(w.sum())
            outputWeightedVar.append(sigma)
            outputMC.append(len(w))
            outputMCVar.append(np.sqrt(len(w)))

        Signal.append({'name': s, 'm_stop': x, 'm_X': y, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(),
                       'y_pred': y_hat, 'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC),
                       'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar)})

        del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar

    ###########################
    # Read and evaluate backgrounds
    ###########################
    statInfoBkg = {}  # raw MC statistics per background sample
    totBkgEvents = 0.
    totBkgVar = 0.
    Background = []
    for b in background:
        df, weight = loadDataFrame(os.path.join(inputDirBkg, b + '/'), preselection, variables, weights, lumi)
        statInfoBkg[b] = df.shape[0]
        y_hat = evaluate(model, df.values, scaler)
        bin_index = np.digitize(y_hat[:, 0], bins[1:])
        outputWeighted = []
        outputWeightedVar = []
        outputMC = []
        outputMCVar = []

        totBkgEvents += weight.sum()
        totBkgVar += np.sum(weight.values**2.)
        for i in range(len(bins[1:])):
            w = weight.values[np.where(bin_index == i)[0]]
            sigma = np.sum(w**2.)
            outputWeighted.append(w.sum())
            outputWeightedVar.append(sigma)
            outputMC.append(len(w))
            outputMCVar.append(np.sqrt(len(w)))  # Poisson uncertainty on the raw MC count, as in the signal loop

        Background.append({'name': b, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'y_pred': y_hat,
                           'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC),
                           'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar)})

        del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar

    totalBkgOutput = np.array([b['outputScore'] for b in Background])
    totalBkgOutput = totalBkgOutput.sum(axis=0)

    totalBkgVar = np.array([b['output_var'] for b in Background])
    totalBkgVar = totalBkgVar.sum(axis=0)

    ###########################
    # Determine Significance #
    ###########################
    for s in Signal:
        significance = []
        significance_err = []
        tot_rel = np.sqrt(np.sum(s['output_var'])) / s['nEvents']
        for i in range(len(bins[1:])):
            # cumulative sums over bins i..N-2, i.e. events above the cut (last bin excluded)
            #eff_sig = s['outputScore'][:i+1].sum() / s['nEvents']
            #eff_bkg = totalBkgOutput[:i+1].sum() / totalBkgOutput.sum()
            eff_sig = s['outputScore'][i:-1].sum() / s['nEvents']
            eff_bkg = totalBkgOutput[i:-1].sum() / totalBkgOutput.sum()

            #err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['nEvents']
            #err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput.sum()
            err_sig = np.sqrt(np.sum(s['output_var'][i:-1])) / s['nEvents']
            err_bkg = np.sqrt(np.sum(totalBkgVar[i:-1])) / totalBkgOutput.sum()

            #if totalBkgOutput[:i+1].sum() > 0.:
            #    rel_err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput[:i+1].sum()
            if totalBkgOutput[i:-1].sum() > 0.:
                rel_err_bkg = np.sqrt(np.sum(totalBkgVar[i:-1])) / totalBkgOutput[i:-1].sum()
            else:
                rel_err_bkg = 0.
            #if s['outputScore'][:i+1].sum() > 0.:
            #    rel_err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['outputScore'][:i+1].sum()
            if s['outputScore'][i:-1].sum() > 0.:
                rel_err_sig = np.sqrt(np.sum(s['output_var'][i:-1])) / s['outputScore'][i:-1].sum()
            else:
                rel_err_sig = 0.

            total_rel_err = np.sqrt(rel_err_sig**2. + rel_err_bkg**2. + 0.25**2.)

            if (eff_sig == 0) or (eff_bkg == 0):
                Z = 0.
                Z_err = 0.
            elif (err_sig / eff_sig > 0.75) or (err_bkg / eff_bkg > 0.75):
                Z = 0.
                Z_err = 0.
            else:
                #Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(s['outputScore'][:i+1].sum(), totalBkgOutput[:i+1].sum(), total_rel_err)
                Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(s['outputScore'][i:-1].sum(), totalBkgOutput[i:-1].sum(), total_rel_err)

                Zplus_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ((eff_sig + err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err)
                Zmins_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ((eff_sig - err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err)
                Zplus_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(eff_sig * s['nEvents'], (eff_bkg + err_bkg) * totalBkgOutput.sum(), total_rel_err)
                Zmins_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(eff_sig * s['nEvents'], (eff_bkg - err_bkg) * totalBkgOutput.sum(), total_rel_err)

                Z_err_sig = abs(Zplus_sig - Zmins_sig) / 2
                Z_err_bkg = abs(Zplus_bkg - Zmins_bkg) / 2
                Z_err = np.sqrt(Z_err_sig**2 + Z_err_bkg**2)

            significance.append(Z)
            significance_err.append(Z_err)

        s['sig'] = np.array(significance)
        s['sig_max'] = s['sig'].max()
        s['sig_err'] = np.array(significance_err)
        #print s['sig']
        print s['m_stop'], s['m_X'], s['sig'].max(), bins[np.where(s['sig'] == s['sig'].max())]

    x = np.array([s['m_stop'] for s in Signal], dtype=float)
    y = np.array([s['m_X'] for s in Signal], dtype=float)
    z = np.array([s['sig_max'] for s in Signal], dtype=float)
    #print x, y, z

    # Set up a regular grid of interpolation points
    fig, ax1 = plt.subplots(figsize=(8, 6))
    xi, yi = np.linspace(x.min(), x.max(), 100), np.linspace(y.min(), y.max(), 100)
    xi, yi = np.meshgrid(xi, yi)

    # Interpolate
    rbf = scipy.interpolate.LinearNDInterpolator(points=np.array((x, y)).T, values=z)
    zi = rbf(xi, yi)

    im = ax1.imshow(zi, vmin=0., vmax=5., origin='lower', extent=[x.min(), x.max(), y.min(), y.max()])
    cbar = plt.colorbar(im)
    cbar.set_label('Significance')
    ax1.set_xlabel(r'$m_{\tilde{t}}$')
    ax1.set_xlim([x.min(), x.max()])
    ax1.set_ylabel(r'$m_{\chi}$')
    ax1.set_ylim([y.min(), y.max()])
    plt.scatter(x, y, c='black')
    plt.plot(x, x - 84., color='black')
    plt.plot(x, x - 175., color='black')
    AtlasStyle_mpl.ATLASLabel(ax1, 0.022, 0.925, 'Work in progress')
    AtlasStyle_mpl.LumiLabel(ax1, 0.022, 0.875, lumi=lumi * 0.001)
    #plt.show()

    if save:
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
            print('Creating folder plots')
        plt.savefig('plots/' + fileName + '_evaluated_grid.pdf')
        plt.savefig('plots/' + fileName + '_evaluated_grid.png')
        plt.close()

    diag_165 = {}
    diag_150 = {}
    diag_120 = {}
    diag_90 = {}

    for key, value in statInfoSig.iteritems():
        x, y = pickBenchmark(key)
        deltaM = float(x) - float(y)
        if deltaM == 165.0:
            diag_165[x] = value
        elif deltaM == 150.0:
            diag_150[x] = value
        elif deltaM == 120.0:
            diag_120[x] = value
        elif deltaM == 90.0:
            diag_90[x] = value
        else:
            print 'Error: Unknown diagonal in evaluate_signalGrid'
            return 0

    sortedLabels165 = sorted(diag_165)
    sortedLabels150 = sorted(diag_150)
    sortedLabels120 = sorted(diag_120)
    sortedLabels90 = sorted(diag_90)

    values_165 = []
    values_150 = []
    values_120 = []
    values_90 = []

    for label in sortedLabels165:
        values_165.append(diag_165[label])
    for label in sortedLabels150:
        values_150.append(diag_150[label])
    for label in sortedLabels120:
        values_120.append(diag_120[label])
    for label in sortedLabels90:
        values_90.append(diag_90[label])

    csignal = sum(values_90) + sum(values_120) + sum(values_150) + sum(values_165)
    trainable_count = int(np.sum([K.count_params(p) for p in set(model.trainable_weights)]))

    signalP = mpatches.Patch(color='None', label='signal: ' + str(csignal))
    ttbar = mpatches.Patch(color='None', label=r'$t\overline{t}$: ' + str(statInfoBkg['mc16d_ttbar']))
    singletop = mpatches.Patch(color='None', label='single top: ' + str(statInfoBkg['mc16d_singletop']))
    Wjets = mpatches.Patch(color='None', label=r'$W$ + jets: ' + str(statInfoBkg['mc16d_Wjets']))
    tps = mpatches.Patch(color='None', label='params(t): ' + str(trainable_count))  # trainable parameters

    #print sortedLabels90, sortedLabels120, sortedLabels150
    #print values_90, values_120, values_150

    plt.figure('statistic')
    d165 = plt.plot(sortedLabels165, values_165, 'b-x', label=r'$\Delta M = 165$ GeV')
    d150 = plt.plot(sortedLabels150, values_150, 'c-x', label=r'$\Delta M = 150$ GeV')  # cyan, so the curve stays distinguishable from the 165 GeV diagonal
    d120 = plt.plot(sortedLabels120, values_120, 'r-x', label=r'$\Delta M = 120$ GeV')
    d90 = plt.plot(sortedLabels90, values_90, 'g-x', label=r'$\Delta M = 90$ GeV')
    plt.xlabel(r'$m_{\tilde{t}}$ [GeV]')
    plt.ylabel('Statistics')
    plt.title('Statistics of samples')
    plt.legend(loc='best', handles=[d165[0], d150[0], d120[0], d90[0], signalP, ttbar, singletop, Wjets, tps])

    if save:
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
            print('Creating folder plots')
        plt.savefig('plots/' + fileName + '_StatisticTraining.pdf')
        plt.savefig('plots/' + fileName + '_StatisticTraining.png')
        plt.close()
        filepath = 'plots/' + fileName + '_StatisticTrainingValues.txt'
        infofile = open(filepath, 'w')
        infofile.write('M165: ' + ';'.join(sortedLabels165) + ' ' + ';'.join([str(i) for i in values_165]) + '\n')
        infofile.write('M150: ' + ';'.join(sortedLabels150) + ' ' + ';'.join([str(i) for i in values_150]) + '\n')
        infofile.write('M120: ' + ';'.join(sortedLabels120) + ' ' + ';'.join([str(i) for i in values_120]) + '\n')
        infofile.write('M90: ' + ';'.join(sortedLabels90) + ' ' + ';'.join([str(i) for i in values_90]))
        infofile.close()
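# Example call of the function above (the model path is hypothetical):
#   evaluate_signalGrid('TrainedModels/models/myDNN.h5', save=True, fileName='myDNN')

# main() below calls asimovZ(s, b, sigma_b) as an Asimov-style cross-check of the RooStats number.
# If that helper is not already defined or imported elsewhere in this module (an assumption), a
# minimal sketch of the standard Asimov median-significance formula could look like this:
try:
    asimovZ
except NameError:
    def asimovZ(s, b, sigma):
        # Median expected discovery significance for a counting experiment with expected signal s,
        # expected background b and absolute background uncertainty sigma; this is the usual
        # profile-likelihood Asimov approximation, not necessarily the author's original helper.
        if s <= 0. or b <= 0.:
            return 0.
        n = s + b
        b_var = sigma * sigma
        if b_var > 0.:
            term1 = n * np.log(n * (b + b_var) / (b * b + n * b_var))
            term2 = (b * b / b_var) * np.log(1. + b_var * s / (b * (b + b_var)))
        else:
            # sigma -> 0 limit: Z = sqrt(2 * ((s + b) * ln(1 + s/b) - s))
            term1 = n * np.log(n / b)
            term2 = s
        return np.sqrt(2. * (term1 - term2))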
def main():
    model = load_model(modelDir)
    scaler = joblib.load(SCALING)

    infofile = open(modelDir.replace('.h5', '_infofile.txt'))
    infos = infofile.readlines()
    infofile.close()

    analysis = infos[0].replace('Used analysis method: ', '').replace('\n', '')
    dataset = DatasetDir + infos[3].replace('Used dataset: ', '').replace('\n', '')

    recurrent = False
    if analysis.lower() == 'rnn':
        recurrent = True
        seq_scaler = dataset + '_scaling.json'

    db = (RESOLUTION[2] - RESOLUTION[1]) / RESOLUTION[0]        # bin width in discriminator distribution
    bins = np.arange(RESOLUTION[1], RESOLUTION[2] + db, db)     # bin edges in discriminator distribution
    center = (bins[:-1] + bins[1:]) / 2

    print '#----MODEL----#'
    print modelDir

    ###########################
    # Read and evaluate signals
    ###########################
    Signal = []
    for s in SIGNAL:
        print s
        x, y = pickBenchmark(s)
        if not recurrent:
            df, weight = loadDataFrame(os.path.join(inputDirSig, s + '/'), PRESELECTION, VAR, WEIGHTS, LUMI)
            y_hat = evaluate(model, df.values, scaler)
        else:
            df, weight, collection = loadSequentialDataFrame(os.path.join(inputDirSig, s + '/'), PRESELECTION, COLLECTION, REMOVE_VAR, VAR, WEIGHTS, LUMI)
            y_hat = evaluate(model, df.values, scaler, seq_scaler, rnn=True, col=collection)
        bin_index = np.digitize(y_hat[:, 0], bins[1:])  # get the bin index of the output score for each event
        outputWeighted = []
        outputWeightedVar = []
        outputMC = []
        outputMCVar = []
        for i in range(len(bins[1:])):
            w = weight.values[np.where(bin_index == i)[0]]
            sigma = np.sum(w**2.)
            outputWeighted.append(w.sum())
            outputWeightedVar.append(sigma)
            outputMC.append(len(w))
            outputMCVar.append(np.sqrt(len(w)))

        Signal.append({'name': s, 'm_stop': x, 'm_X': y, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(),
                       'y_pred': y_hat, 'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC),
                       'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar)})

        del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar

    ###########################
    # Read and evaluate backgrounds
    ###########################
    totBkgEvents = 0.
    totBkgVar = 0.
    Background = []
    for b in BACKGROUND:
        if not recurrent:
            df, weight = loadDataFrame(os.path.join(inputDirBkg, b + '/'), PRESELECTION, VAR, WEIGHTS, LUMI)
            y_hat = evaluate(model, df.values, scaler)
        else:
            df, weight, collection = loadSequentialDataFrame(os.path.join(inputDirBkg, b + '/'), PRESELECTION, COLLECTION, REMOVE_VAR, VAR, WEIGHTS, LUMI)
            y_hat = evaluate(model, df.values, scaler, seq_scaler, rnn=True, col=collection)
        bin_index = np.digitize(y_hat[:, 0], bins[1:])
        outputWeighted = []
        outputWeightedVar = []
        outputMC = []
        outputMCVar = []

        totBkgEvents += weight.sum()
        totBkgVar += np.sum(weight.values**2.)
        for i in range(len(bins[1:])):
            w = weight.values[np.where(bin_index == i)[0]]
            sigma = np.sum(w**2.)
            outputWeighted.append(w.sum())
            outputWeightedVar.append(sigma)
            outputMC.append(len(w))
            outputMCVar.append(np.sqrt(len(w)))  # Poisson uncertainty on the raw MC count, as in the signal loop

        Background.append({'name': b, 'dataset': df, 'weight': weight, 'nEvents': weight.sum(), 'y_pred': y_hat,
                           'outputScore': np.array(outputWeighted), 'outputMC': np.array(outputMC),
                           'output_var': np.array(outputWeightedVar), 'outputMC_var': np.array(outputMCVar)})

        del df, weight, y_hat, bin_index, outputWeighted, outputWeightedVar, outputMC, outputMCVar

    totalBkgOutput = np.array([b['outputScore'] for b in Background])
    totalBkgOutput = totalBkgOutput.sum(axis=0)

    totalBkgVar = np.array([b['output_var'] for b in Background])
    totalBkgVar = totalBkgVar.sum(axis=0)

    ###########################
    # Determine Significance #
    ###########################
    for s in Signal:
        significance = []
        significance_err = []
        asimov = []
        tot_rel = np.sqrt(np.sum(s['output_var'])) / s['nEvents']
        for i in range(len(bins[1:])):
            #eff_sig = s['outputScore'][:i+1].sum() / s['nEvents']
            #eff_bkg = totalBkgOutput[:i+1].sum() / totalBkgOutput.sum()
            eff_sig = s['outputScore'][i:-1].sum() / s['nEvents']
            eff_bkg = totalBkgOutput[i:-1].sum() / totalBkgOutput.sum()

            #err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['nEvents']
            #err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput.sum()
            err_sig = np.sqrt(np.sum(s['output_var'][i:-1])) / s['nEvents']
            err_bkg = np.sqrt(np.sum(totalBkgVar[i:-1])) / totalBkgOutput.sum()

            #if totalBkgOutput[:i+1].sum() > 0.:
            #    rel_err_bkg = np.sqrt(np.sum(totalBkgVar[:i+1])) / totalBkgOutput[:i+1].sum()
            if totalBkgOutput[i:-1].sum() > 0.:
                rel_err_bkg = np.sqrt(np.sum(totalBkgVar[i:-1])) / totalBkgOutput[i:-1].sum()
            else:
                rel_err_bkg = 0.
            #if s['outputScore'][:i+1].sum() > 0.:
            #    rel_err_sig = np.sqrt(np.sum(s['output_var'][:i+1])) / s['outputScore'][:i+1].sum()
            if s['outputScore'][i:-1].sum() > 0.:
                rel_err_sig = np.sqrt(np.sum(s['output_var'][i:-1])) / s['outputScore'][i:-1].sum()
            else:
                rel_err_sig = 0.

            # flat 25% systematic combined with the background MC statistical uncertainty
            #total_rel_err = np.sqrt(rel_err_sig**2. + rel_err_bkg**2. + 0.25**2.)
            total_rel_err = np.sqrt(rel_err_bkg**2. + 0.25**2.)

            if (eff_sig == 0) or (eff_bkg == 0):
                Z = 0.
                Z_err = 0.
                ams = 0.
            elif (err_sig / eff_sig > 0.75) or (err_bkg / eff_bkg > 0.75):
                Z = 0.
                Z_err = 0.
                ams = 0.
            else:
                #Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(s['outputScore'][:i+1].sum(), totalBkgOutput[:i+1].sum(), total_rel_err)
                Z = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(s['outputScore'][i:-1].sum(), totalBkgOutput[i:-1].sum(), total_rel_err)
                ams = asimovZ(s['outputScore'][i:].sum(), totalBkgOutput[i:].sum(), np.sqrt(totalBkgVar[i:].sum()))

                Zplus_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ((eff_sig + err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err)
                Zmins_sig = ROOT.RooStats.NumberCountingUtils.BinomialExpZ((eff_sig - err_sig) * s['nEvents'], eff_bkg * totalBkgOutput.sum(), total_rel_err)
                Zplus_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(eff_sig * s['nEvents'], (eff_bkg + err_bkg) * totalBkgOutput.sum(), total_rel_err)
                Zmins_bkg = ROOT.RooStats.NumberCountingUtils.BinomialExpZ(eff_sig * s['nEvents'], (eff_bkg - err_bkg) * totalBkgOutput.sum(), total_rel_err)

                Z_err_sig = abs(Zplus_sig - Zmins_sig) / 2
                Z_err_bkg = abs(Zplus_bkg - Zmins_bkg) / 2
                Z_err = np.sqrt(Z_err_sig**2 + Z_err_bkg**2)

            significance.append(Z)
            significance_err.append(Z_err)
            asimov.append(ams)

        s['sig'] = np.array(significance)
        s['sig_max'] = s['sig'].max()
        s['sig_err'] = np.array(significance_err)
        s['ams'] = np.array(asimov)
        print s['sig']
        print s['ams']
        print s['m_stop'], s['m_X'], s['sig'].max(), bins[np.where(s['sig'] == s['sig'].max())]

    x = np.array([s['m_stop'] for s in Signal], dtype=float)
    y = np.array([s['m_X'] for s in Signal], dtype=float)
    z = np.array([s['sig_max'] for s in Signal], dtype=float)
    #print x, y, z

    # Set up a regular grid of interpolation points
    fig, ax1 = plt.subplots(figsize=(8, 6))
    xi, yi = np.linspace(x.min(), x.max(), 100), np.linspace(y.min(), y.max(), 100)
    xi, yi = np.meshgrid(xi, yi)

    # Interpolate
    rbf = scipy.interpolate.LinearNDInterpolator(points=np.array((x, y)).T, values=z)
    zi = rbf(xi, yi)

    im = ax1.imshow(zi, vmin=0., vmax=5., origin='lower', extent=[x.min(), x.max(), y.min(), y.max()])
    contours = plt.contour(xi, yi, zi, colors='black', levels=[3.])
    cbar = plt.colorbar(im)
    cbar.set_label('Significance')
    ax1.set_xlabel(r'$m_{\tilde{t}}$')
    ax1.set_xlim([x.min(), x.max()])
    ax1.set_ylabel(r'$m_{\chi}$')
    ax1.set_ylim([y.min(), y.max()])
    plt.scatter(x, y, c='black', s=[0.75] * len(x))
    plt.plot(x, x - 84., color='grey')
    plt.plot(x, x - 175., color='grey')
    AtlasStyle_mpl.ATLASLabel(ax1, 0.022, 0.925, 'Work in progress')
    AtlasStyle_mpl.LumiLabel(ax1, 0.022, 0.875, lumi=LUMI * 0.001)
    plt.savefig("plots/" + modelfile + "_eval-Grid.pdf")
    plt.savefig("plots/" + modelfile + "_eval-Grid.png")
    plt.close()
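# If this script is meant to be executed directly and main() is not already invoked elsewhere in the
# module (an assumption), the usual entry point would be:
#
#   if __name__ == '__main__':
#       main()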