def get_nn_data(sample):

    histo_data = []
    weight_data = []
    weight2_data = []

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        for dataset_name in sample_file:
            dataset = sample_file[dataset_name]
            for chunk in chunk_generator(dataset):
                if chunk.dtype.names[0] != 'eventweight':
                    print('ERROR first field of the chunk is not "eventweight"')
                    sys.exit()

                data = chunk['nn_score_0']
                # scale weights to 36.1/fb without modifying the chunk in place
                weights = chunk['eventweight'] * 36.1

                valid = valid_idx(data)
                data = data[valid]
                weights = weights[valid]

                histo_data.extend(data)
                weight_data.extend(weights)
                weight2_data.extend(weights**2)

    return histo_data, weight_data, weight2_data
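
# The helpers used throughout (chunk_generator, valid_idx) are assumed to be
# defined elsewhere in this package; minimal sketches consistent with their
# use above:
def chunk_generator(input_dataset, chunksize=100000):
    """Yield successive slices of an h5py dataset as numpy structured arrays."""
    for i in range(0, input_dataset.shape[0], chunksize):
        yield input_dataset[i:i + chunksize]

def valid_idx(data):
    """Boolean mask selecting finite (non-NaN, non-inf) entries."""
    return np.isfinite(data)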
def get_cut_data(sample):

    histo_data = []
    weight_data = []
    weight2_data = []

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:

        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input cut data')
            sys.exit()

        dataset = sample_file['superNt']
        for chunk in chunk_generator(dataset):
            sel_idx = (
                (chunk['mbb'] > 100) & (chunk['mbb'] < 140)
                & (chunk['mt2_llbb'] > 100) & (chunk['mt2_llbb'] < 140)
                & (chunk['HT2Ratio'] > 0.8) & (chunk['dRll'] < 0.9)
                & (chunk['nBJets'] == 2) & (chunk['l1_pt'] > 20.)
                & (chunk['mll'] > 20.)
            )  # & (chunk['mt2_bb'] > 150.)
            chunk = chunk[sel_idx]
            weights = chunk['eventweight'] * 36.1  # scale to 36.1/fb

            if 'ttbar' in sample.name:
                weights *= 0.92
            elif 'wt' in sample.name:
                weights *= 1.1037

            data = chunk['mt2_bb']
            histo_data.extend(data)
            weight_data.extend(weights)
            weight2_data.extend(weights**2)

    return histo_data, weight_data, weight2_data
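
# Hypothetical usage of the loader above: histogram the returned arrays,
# taking the per-bin statistical error as sqrt(sum of w^2). The binning here
# is an assumption, not taken from the analysis.
def cut_histo_with_errors(sample, bins=np.arange(0., 500., 10.)):
    histo_data, weight_data, weight2_data = get_cut_data(sample)
    counts, _ = np.histogram(histo_data, bins=bins, weights=weight_data)
    sumw2, _ = np.histogram(histo_data, bins=bins, weights=weight2_data)
    return counts, np.sqrt(sumw2)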
def get_single_nn_histo(sample, scaler, model):

    lcd = 0.0
    histo_data = []
    weight_data = []
    w2_data = []

    total_read = 0

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input file (={})'.format(
                sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name:
            # skip the leading events (presumably those used for NN training)
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset):
            total_read += chunk.size
            if total_read > 1e6: break
            weights = chunk['eventweight']
            lcd_idx = (chunk['nBJets'] >= 1)
            weights = weights[lcd_idx] * 36.1
            lcd += np.sum(weights)
            chunk = chunk[lcd_idx]

            more_idx = ((chunk['nBJets'] >= 2)
                        & (chunk['mbb'] > 110) & (chunk['mbb'] < 140))
            chunk = chunk[more_idx]
            weights = weights[more_idx]

            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            num_data = scores[:, 0]
            den_data = scores[:, 1]

            ok_idx = den_data != 0
            num_data = num_data[ok_idx]
            den_data = den_data[ok_idx]
            weights = weights[ok_idx]

            data = np.log(num_data / den_data)
            ok_idx = np.isfinite(data)  # drop non-finite log-ratios
            data = data[ok_idx]
            weights = weights[ok_idx]

            histo_data.extend(data)
            weight_data.extend(weights)
            w2_data.extend(np.power(weights, 2))

    h = Histo(sample.name)
    h.lcd = lcd
    h.weights = weight_data
    h.histo_data = histo_data
    h.sumw2_histo_data = w2_data

    return h
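
# "Histo" is assumed to be a simple container class defined elsewhere; a
# minimal sketch matching the attributes set above:
class Histo:
    def __init__(self, name):
        self.name = name
        self.lcd = 0.0              # weighted yield at the base (denominator) selection
        self.histo_data = []        # discriminant values
        self.weights = []           # per-event weights
        self.sumw2_histo_data = []  # squared weights, for sum-w^2 errors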
def get_data(sample, kind, scaler=None, model=None):

    lcd = 0.0
    histo_data = []
    weight_data = []
    weight2_data = []

    use_stored_model = scaler and model
    total_read = 0.0
    total_pass_raw = 0
    total_pass_w = 0.

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print("ERROR superNt dataset not found in input file (={})".format(
                sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name:
            # skip the leading events (presumably those used for NN training)
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset):
            total_read += chunk.size
            if total_read > 1e6: break

            weights = chunk['eventweight']

            # count the total weighted events at the base (denominator) selection
            lcd_idx = chunk['nBJets'] >= 1
            weights_lcd = weights[lcd_idx] * 36.1
            lcd += np.sum(weights_lcd)

            # now get the discriminants we want to scan over
            if kind == 'nn':
                chunk = chunk[lcd_idx]
                weights = weights[lcd_idx] * 36.1

                # tighten to the signal-region selection
                more_idx = ((chunk['nBJets'] >= 2)
                            & (chunk['mbb'] > 110) & (chunk['mbb'] < 140)
                            & (chunk['mt2_bb'] > 65))
                chunk = chunk[more_idx]
                weights = weights[more_idx]

                if use_stored_model:
                    input_features = chunk[scaler.feature_list()]
                    input_features = floatify(input_features,
                                              scaler.feature_list())
                    input_features = (input_features -
                                      scaler.mean()) / scaler.scale()
                    scores = model.predict(input_features)

                    num_data = scores[:, 0]  # the HH score from the NN output
                    den_data = scores[:, 1]  # only the first background score;
                    # scores[:,2] and scores[:,3] are deliberately left out here
                    print('WARNING only using one background score in the denominator')

                    ok_idx = den_data != 0
                    num_data = num_data[ok_idx]
                    den_data = den_data[ok_idx]

                    weights = weights[ok_idx]
                    data = np.log(num_data / den_data)
                    ok_idx = np.isfinite(data)
                    data = data[ok_idx]
                    weights = weights[ok_idx]
                    print("MIN MAX = {} {}".format(np.min(data), np.max(data)))
                    total_pass_raw += data.size
                    total_pass_w += np.sum(weights)

                else:
                    data = chunk['nn_p_hh']  # target HH score from NN

                histo_data.extend(data)
                weight_data.extend(weights)
                weight2_data.extend(weights**2)

            elif kind == 'cut':
                sel_idx = (
                    (chunk['mbb'] > 100) & (chunk['mbb'] < 140)
                    & (chunk['mt2_llbb'] > 100) & (chunk['mt2_llbb'] < 140)
                    & (chunk['dRll'] < 0.9) & (chunk['HT2Ratio'] > 0.8)
                    & (chunk['nBJets'] == 2) & (chunk['l1_pt'] > 20.)
                    & (chunk['mll'] > 20.)
                )
                weights = weights[sel_idx] * 36.1
                # scan over mt2_bb in the cut-based strategy
                data = chunk[sel_idx]['mt2_bb']

                histo_data.extend(data)
                weight_data.extend(weights)
                weight2_data.extend(weights**2)

                total_pass_raw += data.size
                total_pass_w += np.sum(weights)

    print("Total pass for {} : {} ({})".format(sample.name, total_pass_w,
                                               total_pass_raw))
    return lcd, histo_data, weight_data, weight2_data
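
# floatify is assumed from elsewhere; a plausible sketch that converts the
# selected structured-array columns into a plain 2D float array for the model.
# The scaler is assumed to expose feature_list(), mean(), and scale() so that
# (features - mean()) / scale() reproduces the training-time standardization.
def floatify(structured_array, feature_list):
    return np.vstack([structured_array[name].astype(np.float64)
                      for name in feature_list]).T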
def make_plots(args):

    sample = Sample("sample", args.input, "")
    data_scaler, model = load_stored_model(args.nn_dir)

    lwtnn_data = []
    otf_data = []
    weight_data = []
    weight2_data = []

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print(
                'ERROR "superNt" dataset not found in input file (={})'.format(
                    sample.filename))
            sys.exit(1)
        dataset = sample_file['superNt']
        for chunk in chunk_generator(dataset):
            weights = chunk['eventweight']

            # LWTNN var
            lwtnn_var = chunk['NN_p_hh']

            # OTF
            input_features = chunk[data_scaler.feature_list()]
            input_features = floatify(input_features,
                                      data_scaler.feature_list())
            input_features = (input_features -
                              data_scaler.mean()) / data_scaler.scale()
            scores = model.predict(input_features)

            nn_p_hh = scores[:, 0]
            nn_p_tt = scores[:, 1]
            nn_p_wt = scores[:, 2]
            nn_p_zjets = scores[:, 3]

            otf_var = nn_p_hh
            #otf_var = np.log( nn_p_tt / (nn_p_hh + nn_p_wt + nn_p_zjets) )
            #ok_idx = valid_idx(otf_var)
            #otf_var = otf_var[ok_idx]
            #weights = weights[ok_idx]
            #lwtnn_var = lwtnn_var[ok_idx]

            lwtnn_data.extend(lwtnn_var)
            otf_data.extend(otf_var)
            weight_data.extend(weights)
            weight2_data.extend(weights**2)

    ## histos
    bw = 0.05
    bins = np.arange(0, 1 + bw, bw)
    hist_lwtnn, _ = np.histogram(lwtnn_data, bins=bins, weights=weight_data)
    hist_otf, _ = np.histogram(otf_data, bins=bins, weights=weight_data)
    sumw2_hist, _ = np.histogram(lwtnn_data, bins=bins, weights=weight2_data)

    print('lwtnn: {}'.format(hist_lwtnn[:20]))
    print('otf  : {}'.format(hist_otf[:20]))

    bin_centers = bins[:-1] + 0.5 * bw
    ratio_hist = hist_lwtnn / hist_otf

    fig, ax = plt.subplots(2, 1)
    ax[0].hist([lwtnn_data, otf_data],
               bins=bins,
               weights=[weight_data, weight_data],
               label=['lwtnn', 'otf'],
               histtype='step',
               color=['r', 'b'])
    ax[1].plot(bin_centers, ratio_hist, label='lwtnn/otf')
    ax[0].set_ylabel('Entries')
    ax[1].set_xlabel('$hh$ NN score')
    ax[1].set_ylabel('lwtnn compute / keras compute')
    fig.savefig('test_otf_lwtnn_comp.pdf', bbox_inches='tight', dpi=200)
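
# load_stored_model is assumed to restore the trained Keras model together
# with the feature scaler saved at training time; a hedged sketch (the file
# names inside nn_dir are guesses):
def load_stored_model(nn_dir):
    import pickle
    from keras.models import load_model
    with open(os.path.join(nn_dir, 'scaler.pkl'), 'rb') as pfile:
        data_scaler = pickle.load(pfile)
    model = load_model(os.path.join(nn_dir, 'model.h5'))
    return data_scaler, model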
def get_total_bkg_disc(sample, scaler=None, model=None, add_mt2bb_cut=False):

    lcd = 0.0
    histo_data = []
    weight_data = []
    w2_data = []
    total_read = 0

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input file (={})'.format(
                sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name:
            # skip the leading events (presumably those used for NN training)
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset):
            total_read += chunk.size
            if total_read > 1e6: break
            weights = chunk['eventweight']
            lcd_idx = (chunk['nBJets'] >= 1)
            weights = weights[lcd_idx] * 36.1
            lcd += np.sum(weights)

            chunk = chunk[lcd_idx]
            if add_mt2bb_cut:
                print('get_total_bkg_disc    ADDING mt2_bb cut to selection')
                more_idx = ((chunk['nBJets'] >= 2)
                            & (chunk['mbb'] > 100) & (chunk['mbb'] < 140)
                            & (chunk['mt2_bb'] > 50))
            else:
                more_idx = ((chunk['nBJets'] >= 2)
                            & (chunk['mbb'] > 100) & (chunk['mbb'] < 140))
            chunk = chunk[more_idx]
            weights = weights[more_idx]

            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            num_data = scores[:, 0]
            # sum all background scores into the denominator; this matches the
            # original column-by-column addition and avoids indexing past the
            # last column when fewer than four scores are present
            den_data = scores[:, 1:].sum(axis=1)

            ok_idx = den_data != 0
            num_data = num_data[ok_idx]
            den_data = den_data[ok_idx]

            weights = weights[ok_idx]
            data = np.log(num_data / den_data)
            ok_idx = np.isfinite(data)
            data = data[ok_idx]
            weights = weights[ok_idx]

            histo_data.extend(data)
            weight_data.extend(weights)
            w2_data.extend(np.power(weights, 2))

    h = Histo(sample.name)
    h.lcd = lcd
    h.weights = weight_data
    h.histo_data = histo_data
    h.sumw2_histo_data = w2_data

    return h
def get_yields(args, kind = '') :

    if not kind :
        print('did not provide kind')
        sys.exit()

    filename = {'reco' : reco_sig,
                'truth' : truth_sig}[kind]
    treename = {'reco' : 'superNt',
                'truth' : 'truth'}[kind]

    sample = Sample(kind, filename, '')
    data_scaler, model = load_stored_model(args.nn_dir)

    total_counts_raw = 0
    total_counts_weighted = 0.0

    with h5py.File(sample.filename, 'r', libver = 'latest') as sample_file :
        if treename not in sample_file :
            print('ERROR treename (={}) is not found in input file (={})'.format(treename, sample.filename))
            sys.exit()
        dataset = sample_file[treename]
        for chunk in chunk_generator(dataset) :

            weights = chunk['eventweight'] * lumi_factor

            if not args.cut_based :
                # calculate OTF
                input_features = chunk[data_scaler.feature_list()]
                input_features = floatify(input_features, data_scaler.feature_list())
                input_features = (input_features - data_scaler.mean()) / data_scaler.scale()
                scores = model.predict(input_features)

                nn_p_hh = scores[:,0]
                nn_p_tt = scores[:,1]
                nn_p_wt = scores[:,2]
                nn_p_zjets = scores[:,3]

                nn_d_hh = np.log( nn_p_hh / (nn_p_tt + nn_p_wt + nn_p_zjets) )
                ok_idx = np.isfinite(nn_d_hh)
                nn_d_hh = nn_d_hh[ok_idx]  # keep the discriminant aligned with the filtered chunk
                weights = weights[ok_idx]
                chunk = chunk[ok_idx]

                selection_idx = (chunk['nBJets']>=2) & (chunk['mbb']>110) & (chunk['mbb']<140) & (chunk['mt2_bb']>65)
                nn_idx = nn_d_hh > 6.2
                #selection_idx = (chunk['nBJets']>=2) & (chunk['mbb']>100) & (chunk['mbb']<140) & (chunk['mt2_bb']>65)
                #nn_idx = nn_d_hh > 6.3

                print('nn_idx = {}'.format(nn_idx.any()))
                selection_idx = nn_idx & selection_idx

                weights = weights[selection_idx]
                data = chunk[selection_idx]

                total_counts_raw += data.size
                total_counts_weighted += np.sum(weights)
            else :
                selection_idx = ( (chunk['mll']>20.) & (chunk['l1_pt']>20.) & (chunk['nBJets']==2)
                                  & (chunk['dRll']<0.9) & (chunk['HT2Ratio']>0.8) & (chunk['mt2_bb']>150.)
                                  & (chunk['mbb']>100) & (chunk['mbb']<140)
                                  & (chunk['mt2_llbb']>100.) & (chunk['mt2_llbb']<140.) )
                weights = weights[selection_idx]
                data = chunk[selection_idx]

                total_counts_raw += data.size
                total_counts_weighted += np.sum(weights)

    print('yields for {}: {} ({})'.format(kind, total_counts_weighted, total_counts_raw))
    return total_counts_weighted, total_counts_raw
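
# get_yields relies on module-level settings; hypothetical examples of what
# they might look like (the paths are placeholders, the luminosity factor
# matches the 36.1/fb scaling used elsewhere in this file):
#   reco_sig = '/path/to/reco_signal.h5'
#   truth_sig = '/path/to/truth_signal.h5'
#   lumi_factor = 36.1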
def get_data(sample, scaler, model, to_do) :

    data = []
    w = []
    name = ""
    total_read = 0.0

    with h5py.File(sample.filename, 'r', libver = 'latest') as sample_file :
        if 'superNt' not in sample_file :
            print('ERROR superNt dataset not found in input file (={})'.format(sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name :
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset) :
            total_read += chunk.size
            if total_read > 1e6 : break
            print('{} > {}'.format(sample.name, total_read))
            chunk = chunk[ (chunk['nBJets'] >= 1 ) ]
            weights = chunk['eventweight'] * 36.1
            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            p_hh = scores[:,0]
            p_tt = scores[:,1]
            p_wt = scores[:,2]
            p_z  = scores[:,3]

            hist_data = None
            if 'd_' in to_do :
                num_data = None
                den_data = None
                if to_do == 'd_hh' :
                    num_data = p_hh
                    den_data = (p_tt + p_wt + p_z)
                elif to_do == 'd_tt' :
                    num_data = p_tt
                    den_data = (p_hh + p_wt + p_z)
                elif to_do == 'd_wt' :
                    num_data = p_wt
                    den_data = (p_hh + p_tt + p_z)
                elif to_do == 'd_z' :
                    num_data = p_z
                    den_data = (p_hh + p_tt + p_wt)

                d = np.log(num_data / den_data)
                idx = np.isfinite(d)
                d = d[idx]
                weights = weights[idx]
                hist_data = d[:]

            else :
                hist_data = { "hh" : p_hh,
                              "tt" : p_tt,
                              "wt" : p_wt,
                              "z"  : p_z }[to_do]

            name = { "hh" : "$p_{hh}$",
                     "tt" : "$p_{t\\bar{t}}$",
                     "wt" : "$p_{Wt}$",
                     "z"  : "$p_{Z}$" }[to_do.replace('d_','')]

            if 'd_' in to_do :
                name = name.replace('p_', 'd_')

            data.extend(hist_data)
            w.extend(weights)

    return data, w, name
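
# Example use of get_data above (a sketch; the binning and output file name
# are assumptions):
def plot_nn_output(sample, scaler, model, to_do='d_hh'):
    data, w, name = get_data(sample, scaler, model, to_do)
    if to_do.startswith('d_'):
        bins = np.linspace(-10, 10, 41)
    else:
        bins = np.linspace(0, 1, 41)
    fig, ax = plt.subplots(1, 1)
    ax.hist(data, bins=bins, weights=w, histtype='step', label=sample.name)
    ax.set_xlabel(name)
    ax.set_ylabel('Entries')
    ax.legend()
    fig.savefig('nn_output_{}_{}.pdf'.format(sample.name, to_do), bbox_inches='tight')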
def make_plot(var_dict, sample, scaler, model, args):

    x_data = []
    y_data = []
    w_data = []

    total_read = 0

    disc_for_x = args.varX

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input file (={})'.format(
                sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        # skip the first chunk (presumably the events used for NN training),
        # except for real data where no training overlap exists
        is_first = True
        if 'data' in sample.filename:
            is_first = False
        for chunk in chunk_generator(dataset, chunksize=19000):
            if is_first:
                is_first = False
                continue

            total_read += chunk.size
            if total_read > 1e6: break

            idx = (chunk['nBJets'] >= 2)

            weights = chunk['eventweight'][idx]
            chunk = chunk[idx]

            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            p_hh = scores[:, 0]
            p_tt = scores[:, 1]
            p_wt = scores[:, 2]
            p_z = scores[:, 3]

            i_x_data = p_hh
            if 'p_' in disc_for_x:
                i_x_data = {
                    'p_hh': p_hh,
                    'p_tt': p_tt,
                    'p_wt': p_wt,
                    'p_z': p_z
                }[disc_for_x]

            elif 'd_' in disc_for_x:
                num_data = {
                    'd_hh': p_hh,
                    'd_tt': p_tt,
                    'd_wt': p_wt,
                    'd_z': p_z
                }[disc_for_x]
                den_data = {
                    'd_hh': (p_tt + p_wt + p_z),
                    'd_tt': (p_wt + p_hh + p_z),
                    'd_wt': (p_hh + p_tt + p_z),
                    'd_z': (p_tt + p_wt + p_hh)
                }[disc_for_x]
                disc = np.log(num_data / den_data)
                idx = valid_idx(disc)

                p_hh = p_hh[idx]
                p_tt = p_tt[idx]
                p_wt = p_wt[idx]
                p_z = p_z[idx]

                disc = disc[idx]
                weights = weights[idx]
                chunk = chunk[idx]
                i_x_data = disc

            i_y_data = None
            if 'p_' in args.varY or 'd_' in args.varY:
                if 'p_' in args.varY:
                    i_y_data = {
                        'p_hh': p_hh,
                        'p_tt': p_tt,
                        'p_wt': p_wt,
                        'p_z': p_z
                    }[args.varY]
                elif 'd_' in args.varY:
                    num_data = {
                        'd_hh': p_hh,
                        'd_tt': p_tt,
                        'd_wt': p_wt,
                        'd_z': p_z
                    }[args.varY]
                    den_data = {
                        'd_hh': (p_tt + p_wt + p_z),
                        'd_tt': (p_wt + p_hh + p_z),
                        'd_wt': (p_hh + p_tt + p_z),
                        'd_z': (p_tt + p_wt + p_hh)
                    }[args.varY]
                    y_disc = np.log(num_data / den_data)
                    #idx = valid_idx(y_disc)
                    #y_disc = y_disc[idx]
                    #weights = weights[idx]
                    #chunk = chunk[idx]
                    i_y_data = y_disc
            else:
                i_y_data = chunk[args.varY]

            x_data.extend(list(i_x_data))
            y_data.extend(list(i_y_data))
            w_data.extend(list(weights))

    x_data = np.array(x_data)
    y_data = np.array(y_data)
    w_data = np.array(w_data)

    fig, ax = plt.subplots(1, 1)
    ax.grid(color='k', which='both', linestyle='-', lw=0.5, alpha=0.1)
    ax.tick_params(axis='both',
                   which='both',
                   direction='in',
                   labelleft=True,
                   bottom=True,
                   top=True,
                   right=True,
                   left=True)

    # note: this overrides the var_dict argument with the full variable map
    var_dict = all_vars()
    x_bounds = var_dict[disc_for_x]['bounds']
    y_bounds = var_dict[args.varY]['bounds']

    x_label = var_dict[disc_for_x]['name']
    y_label = var_dict[args.varY]['name']

    # bounds are assumed to be [bin width, low edge, high edge]
    x_edges = np.arange(x_bounds[1], x_bounds[2] + x_bounds[0], x_bounds[0])
    y_edges = np.arange(y_bounds[1], y_bounds[2] + y_bounds[0], y_bounds[0])

    bins = [x_edges, y_edges]

    ax.set_xlabel(x_label, horizontalalignment='right', x=1)
    ax.set_ylabel(y_label, horizontalalignment='right', y=1)

    print('x_data shape = {}'.format(x_data.shape))
    print('y_data shape = {}'.format(y_data.shape))
    h, x, y = np.histogram2d(x_data, y_data, bins=bins)  # 'normed' was removed from numpy; the default is un-normalized counts
    #integral = h.sum()
    #h = h / integral
    imextent = [min(x_edges), max(x_edges), min(y_edges), max(y_edges)]
    ax.set_facecolor('lightgrey')
    h = h.T  # transpose so imshow sees [row=y, col=x]
    im = ax.imshow(h,
                   origin='lower',
                   cmap='coolwarm',
                   aspect='auto',
                   interpolation='nearest',
                   extent=imextent,
                   norm=LogNorm())
    ax.contour(h, levels=[1, 3, 10], colors='black', extent=imextent)
    cb = fig.colorbar(im)

    process = ''
    if 'wt' in sample.filename:
        process = 'wt'
    elif '123456' in sample.filename:
        process = 'hh'
    elif 'ttbar' in sample.filename or '410009' in sample.filename:
        process = 'ttbar'
    elif 'zll' in sample.filename or 'zjets' in sample.filename:
        process = 'zjets'
    elif 'ztt' in sample.filename:
        process = 'zjets_tt'
    elif 'data' in sample.filename:
        process = 'data'

    ax.text(0.85, 0.93, process, weight='bold', transform=ax.transAxes)

    outname = './plots_input_output/input_output_2D_{}_{}_{}.pdf'.format(
        process, disc_for_x, args.varY)
    print(' >> saving plot to: {}'.format(os.path.abspath(outname)))
    fig.savefig(outname, bbox_inches='tight', dpi=200)