def load_input_file(args):
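    """
    Load the samples and the DataScaler from the pre-processed HDF5 file
    given by args.input. The DataScaler is built from the
    'scaling/scaling_data' dataset (ignoring 'eventweight'), and a Sample
    (with per-event weights) is built from the 'validation_features'
    dataset of every process group found under the 'samples' group.

    Returns :
        (samples, data_scaler) : list of Sample objects and the DataScaler
    """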

    if not os.path.isfile(args.input):
        print(
            "ERROR provided input file (={}) is not found or is not a regular file"
            .format(args.input))
        sys.exit()

    samples_group_name = "samples"
    scaling_group_name = "scaling"
    scaling_data_name = "scaling_data"

    samples = []
    data_scaler = None

    with h5py.File(args.input, 'r', libver='latest') as input_file:

        # look up the scaling first
        if scaling_group_name in input_file:
            scaling_group = input_file[scaling_group_name]
            scaling_dataset = scaling_group[scaling_data_name]
            data_scaler = DataScaler(scaling_dataset=scaling_dataset,
                                     ignore_features=['eventweight'])
            print(
                "DataScaler found {} features to use as inputs (there were {} total features in the input)"
                .format(len(data_scaler.feature_list()),
                        len(data_scaler.raw_feature_list())))
        else:
            print("scaling group (={}) not found in file".format(
                scaling_group_name))
            sys.exit()

        # build the samples
        if samples_group_name in input_file:
            sample_group = input_file[samples_group_name]
            for p in sample_group:
                process_group = sample_group[p]
                class_label = process_group.attrs['training_label']
                s = Sample(name=p,
                           class_label=int(class_label),
                           input_data=floatify(
                               process_group['validation_features'][tuple(data_scaler.feature_list())],
                               data_scaler.feature_list()))
                s.eventweights = floatify(
                    process_group['validation_features'][tuple(['eventweight'])],
                    ['eventweight'])
                samples.append(s)
        else:
            print("samples group (={}) not found in file".format(
                samples_group_name))
            sys.exit()

    return samples, data_scaler
def get_single_nn_histo(sample, scaler, model):
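    """
    Build a Histo of the NN log-ratio discriminant log(score_0 / score_1)
    for the sample's 'superNt' events. The weighted yield of the baseline
    nBJets >= 1 selection is stored as the histogram's 'lcd', and the
    discriminant is filled after requiring nBJets >= 2 and 110 < mbb < 140.
    Event weights are scaled by the hard-coded factor of 36.1.
    """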

    lcd = 0.0
    histo_data = []
    weight_data = []
    w2_data = []

    total_read = 0

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input file (={})'.format(
                sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name:
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset):
            total_read += chunk.size
            if total_read > 1e6: break
            weights = chunk['eventweight']
            lcd_idx = (chunk['nBJets'] >= 1)
            weights = weights[lcd_idx] * 36.1
            lcd += np.sum(weights)
            chunk = chunk[lcd_idx]

            more_idx = (chunk['nBJets'] >= 2) & (chunk['mbb'] > 110) & (chunk['mbb'] < 140)
            chunk = chunk[more_idx]
            weights = weights[more_idx]

            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
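            # standardize the inputs with the mean and scale stored in the DataScaler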
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            num_data = scores[:, 0]
            den_data = scores[:, 1]

            ok_idx = den_data != 0
            num_data = num_data[ok_idx]
            den_data = den_data[ok_idx]
            weights = weights[ok_idx]

            data = np.log(num_data / den_data)
            ok_idx = (data > -np.inf) & (data < np.inf)
            data = data[ok_idx]
            weights = weights[ok_idx]

            histo_data.extend(data)
            weight_data.extend(weights)
            w2_data.extend(np.power(weights, 2))

    h = Histo(sample.name)
    h.lcd = lcd
    h.weights = weight_data
    h.histo_data = histo_data
    h.sumw2_histo_data = w2_data

    return h
def get_data(sample, kind, scaler=None, model=None):
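    """
    Return (lcd, histo_data, weight_data, weight2_data) for the sample's
    'superNt' events. For kind == 'nn' the discriminant is either
    log(score_0 / score_1) evaluated from the provided scaler and model,
    or the stored 'nn_p_hh' branch; for kind == 'cut' the cut-based
    selection is applied and 'mt2_bb' is returned as the scan variable.
    """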

    lcd = 0.0
    histo_data = []
    weight_data = []
    weight2_data = []

    use_stored_model = scaler and model
    total_read = 0.0
    total_pass_raw = 0
    total_pass_w = 0.

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print("ERROR superNt dataset not found in input file (={})".format(
                sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name:
            #dataset = dataset[8800:]
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset):
            total_read += chunk.size
            if total_read > 1000000.: break
            #print("TOTAL READ = {}".format(total_read))

            weights = chunk['eventweight']

            # count the total number of weighted events at the base, denominator selection
            #lcd_idx = (chunk['nBJets']>=1) & (chunk['mbb']>100) & (chunk['mbb']<140) & (chunk['mt2_llbb']>100) & (chunk['mt2_llbb']<140) & (chunk['dRll']<0.9)
            lcd_idx = (chunk['nBJets'] >= 1)  #& (chunk['mt2_llbb']<140) #& (chunk['mbb']<140) #& (chunk['dRll']<0.9)
            #lcd_idx = chunk['nBJets'] >= 1
            weights_lcd = weights[lcd_idx] * 36.1
            lcd += np.sum(weights_lcd)

            # now get the discriminants we want to scan over
            if kind == 'nn':
                chunk = chunk[lcd_idx]
                weights = weights[lcd_idx] * 36.1

                # add more
                #more_idx = (chunk['nBMJets'] >= 2) & (chunk['nSJets']>0) & (chunk['mt2_bb_bm'] > 65) # & (chunk['mbb_bm'] > 100) & (chunk['mbb_bm'] < 140)
                #more_idx = (chunk['mbb']>100) & (chunk['mbb']<140) & (chunk['mt2_bb']>65)#(chunk['mt2_bb']>30)# & (chunk['met']>45) & (chunk['l1_pt']>15)
                #more_idx = (chunk['met']>50)# & (chunk['l1_pt']>20)
                more_idx = (chunk['nBJets'] >= 2) & (chunk['mbb'] > 110) & (chunk['mbb'] < 140) & (chunk['mt2_bb'] > 65)
                chunk = chunk[more_idx]
                weights = weights[more_idx]

                if use_stored_model:
                    input_features = chunk[scaler.feature_list()]
                    input_features = floatify(input_features,
                                              scaler.feature_list())
                    input_features = (input_features -
                                      scaler.mean()) / scaler.scale()
                    scores = model.predict(input_features)

                    num_data = scores[:, 0]  # get the HH score from NN output
                    den_data = scores[:, 1]
                    print('WARNING ONLY GRABBING ONE SCORE')
                    #   den_data += scores[:,2]
                    #   den_data += scores[:,3]# * 0.1

                    ok_idx = den_data != 0
                    num_data = num_data[ok_idx]
                    den_data = den_data[ok_idx]

                    weights = weights[ok_idx]
                    data = np.log(num_data / den_data)
                    ok_idx = (data > -np.inf) & (data < np.inf)
                    data = data[ok_idx]
                    weights = weights[ok_idx]
                    print("MIN MAX = {} {}".format(np.min(data), np.max(data)))
                    total_pass_raw += data.size
                    total_pass_w += np.sum(weights)

                else:
                    data = chunk['nn_p_hh']  # target HH score from NN

                histo_data.extend(data)
                weight_data.extend(weights)
                weight2_data.extend(weights**2)

            elif kind == 'cut':
                sel_idx = ((chunk['mbb'] > 100) & (chunk['mbb'] < 140)
                           & (chunk['mt2_llbb'] > 100) & (chunk['mt2_llbb'] < 140)
                           & (chunk['dRll'] < 0.9) & (chunk['HT2Ratio'] > 0.8)
                           & (chunk['nBJets'] == 2) & (chunk['l1_pt'] > 20.)
                           & (chunk['mll'] > 20.))
                data = chunk[sel_idx]
                weights = weights[sel_idx] * 36.1
                # we are going to scan over mt2_bb in the cut based strategy
                data = data['mt2_bb']

                histo_data.extend(data)
                weight_data.extend(weights)
                weight2_data.extend(weights**2)

                total_pass_raw += data.size
                total_pass_w += np.sum(weights)

    print("Total pass for {} : {} ({})".format(sample.name, total_pass_w,
                                               total_pass_raw))
    return lcd, histo_data, weight_data, weight2_data
Example #4
def make_plots(args):
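    """
    Compare the stored lwtnn score ('NN_p_hh' in the 'superNt' dataset)
    with the score recomputed on the fly from the loaded Keras model,
    plotting both distributions and their bin-by-bin ratio to
    'test_otf_lwtnn_comp.pdf'.
    """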

    sample = Sample("sample", args.input, "")
    data_scaler, model = load_stored_model(args.nn_dir)

    lwtnn_data = []
    otf_data = []
    weight_data = []
    weight2_data = []

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print(
                'ERROR "superNt" dataset not found in input file (={})'.format(
                    sample.filename))
            sys.exit(1)
        dataset = sample_file['superNt']
        for chunk in chunk_generator(dataset):
            weights = chunk['eventweight']

            # LWTNN var
            lwtnn_var = chunk['NN_p_hh']

            # OTF
            input_features = chunk[data_scaler.feature_list()]
            input_features = floatify(input_features,
                                      data_scaler.feature_list())
            input_features = (input_features -
                              data_scaler.mean()) / data_scaler.scale()
            scores = model.predict(input_features)

            nn_p_hh = scores[:, 0]
            nn_p_tt = scores[:, 1]
            nn_p_wt = scores[:, 2]
            nn_p_zjets = scores[:, 3]

            otf_var = nn_p_hh
            #otf_var = np.log( nn_p_tt / (nn_p_hh + nn_p_wt + nn_p_zjets) )
            #ok_idx = valid_idx(otf_var)
            #otf_var = otf_var[ok_idx]
            #weights = weights[ok_idx]
            #lwtnn_var = lwtnn_var[ok_idx]

            lwtnn_data.extend(lwtnn_var)
            otf_data.extend(otf_var)
            weight_data.extend(weights)
            weight2_data.extend(weights**2)

    ## histos
    bw = 0.05
    bins = np.arange(0, 1 + bw, bw)
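    # weighted histograms of the stored (lwtnn) score and the on-the-fly (keras) score, same binning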
    hist_lwtnn, _ = np.histogram(lwtnn_data, bins=bins, weights=weight_data)
    hist_otf, _ = np.histogram(otf_data, bins=bins, weights=weight_data)
    sumw2_hist, _ = np.histogram(lwtnn_data, bins=bins, weights=weight2_data)

    print('lwtnn: {}'.format(hist_lwtnn[:20]))
    print('otf  : {}'.format(hist_otf[:20]))

    bin_centers = bins[:-1] + 0.5 * bw
    ratio_hist = hist_lwtnn / hist_otf

    fig, ax = plt.subplots(2, 1)
    #ax[0].hist( [otf_data], bins = bins, weights = [weight_data], label = ['otf'], histtype = 'step', color = ['b'] )
    ax[0].hist([lwtnn_data, otf_data],
               bins=bins,
               weights=[weight_data, weight_data],
               label=['lwtnn', 'otf'],
               histtype='step',
               color=['r', 'b'])
    ax[1].plot(bin_centers, ratio_hist, label='lwtnn/otf')
    ax[0].set_ylabel('Entries')
    ax[1].set_xlabel('$hh$ NN score')
    ax[1].set_ylabel('lwtnn compute / keras compute')
    fig.savefig('test_otf_lwtnn_comp.pdf', bbox_inches='tight', dpi=200)
Example #5
def get_process_inputs(data_scaler):
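    """
    Load the 'wwbb_preprocessed.h5' file from the parent directory of the
    global training_dir and, for the 'ttbar' and 'hh' process groups,
    build the 'test', 'train' and 'val' Sample objects (20% of the
    shuffled training data is held out for validation).

    Returns :
        sample_dict : { process name : { 'test'/'train'/'val' : Sample } }
    """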

    idx = -1
    if training_dir.endswith('/'): idx = -2
    input_file_dir = '/'.join(training_dir.split('/')[:idx])
    input_file = '%s/wwbb_preprocessed.h5' % input_file_dir

    if not os.path.isdir(input_file_dir):
        print('ERROR could not locate file dir (={})'.format(input_file_dir))
        sys.exit()
    if not os.path.isfile(input_file):
        print('ERROR could not locate file (={})'.format(input_file))
        sys.exit()

    samples_group_name = 'samples'
    scaling_group_name = 'scaling'
    scaling_data_name = 'scaling_data'

    sample_dict = {}

    with h5py.File(input_file, 'r', libver='latest') as input_file:

        if samples_group_name in input_file:
            sample_group = input_file[samples_group_name]
            n_per_sample = -1
            for p in sample_group:

                if p == 'ttbar' or p == 'hh':

                    sample_dict[p] = {}

                    process_group = sample_group[p]
                    class_label = process_group.attrs['training_label']

                    # get the "test" sample
                    test_sample = Sample(
                        name='%s_test' % p,
                        class_label=int(class_label),
                        input_data=floatify(
                            process_group['validation_features'][tuple(data_scaler.feature_list())],
                            data_scaler.feature_list()))
                    test_sample.eventweights = floatify(
                        process_group['validation_features'][tuple(['eventweight'])],
                        ['eventweight'])

                    # get the "training" sample (which has our validation data in it)
                    training_data = floatify(
                        process_group['train_features'][tuple(data_scaler.feature_list())],
                        data_scaler.feature_list())
                    training_weights = floatify(
                        process_group['train_features'][tuple(['eventweight'])],
                        ['eventweight'])

                    # n_per_sample is taken from the signal ('hh') sample; complain if a
                    # background sample is reached before the signal has been processed
                    if p != 'hh':
                        if n_per_sample < 0:
                            print('ERROR Did not get number to split for train/validation from signal')
                            sys.exit()
                    elif p == 'hh':
                        n_per_sample = int(training_data.shape[0])

                    # randomize the order of the training events
                    randomize = np.arange(len(training_data))
                    np.random.shuffle(randomize)
                    shuffled_training_data = training_data[randomize]
                    shuffled_training_weights = training_weights[randomize]

                    fraction_for_validation = 0.2
                    total_n = len(shuffled_training_data)
                    n_for_validation = int(fraction_for_validation * total_n)

                    split_train_data = shuffled_training_data[n_for_validation:]
                    split_train_weights = shuffled_training_weights[n_for_validation:]
                    split_val_data = shuffled_training_data[:n_for_validation]
                    split_val_weights = shuffled_training_weights[:n_for_validation]

                    train_sample = Sample(name='%s_train' % p,
                                          class_label=int(class_label),
                                          input_data=split_train_data)
                    train_sample.eventweights = split_train_weights

                    val_sample = Sample(name='%s_val' % p,
                                        class_label=int(class_label),
                                        input_data=split_val_data)
                    val_sample.eventweights = split_val_weights

                    print('Loaded sample %s: n train = %d, n val = %d, n test = %d'
                          % (p, len(train_sample.data()), len(val_sample.data()),
                             len(test_sample.data())))

                    sample_dict[p]['test'] = test_sample
                    sample_dict[p]['train'] = train_sample
                    sample_dict[p]['val'] = val_sample

    return sample_dict
def get_total_bkg_disc(sample, scaler=None, model=None, add_mt2bb_cut=False):
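    """
    Build a Histo of the NN log-ratio discriminant for the sample's
    'superNt' events, with the denominator summed over all background
    scores when the model has more than two outputs. The weighted yield
    of the baseline nBJets >= 1 selection is stored as the 'lcd'; the
    discriminant is filled after nBJets >= 2 and 100 < mbb < 140,
    optionally tightened with mt2_bb > 50 when add_mt2bb_cut is True.
    """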

    lcd = 0.0
    histo_data = []
    weight_data = []
    w2_data = []
    total_read = 0

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input file (={})'.format(
                sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name:
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset):
            total_read += chunk.size
            if total_read > 1e6: break
            weights = chunk['eventweight']
            lcd_idx = (chunk['nBJets'] >= 1)
            weights = weights[lcd_idx] * 36.1
            lcd += np.sum(weights)

            chunk = chunk[lcd_idx]
            if add_mt2bb_cut:
                print('get_total_bkg_disc    ADDING mt2_bb cut to selection')
                more_idx = (chunk['nBJets'] >= 2) & (chunk['mbb'] > 100) & (chunk['mbb'] < 140) & (chunk['mt2_bb'] > 50)
            else:
                more_idx = (chunk['nBJets'] >= 2) & (chunk['mbb'] > 100) & (chunk['mbb'] < 140)
            chunk = chunk[more_idx]
            weights = weights[more_idx]

            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            num_data = scores[:, 0]
            den_data = scores[:, 1]
            if scores.shape[1] > 2:
                den_data += scores[:, 2]
                den_data += scores[:, 3]

            ok_idx = den_data != 0
            num_data = num_data[ok_idx]
            den_data = den_data[ok_idx]

            weights = weights[ok_idx]
            data = np.log(num_data / den_data)
            ok_idx = (data > -np.inf) & (data < np.inf)
            data = data[ok_idx]
            weights = weights[ok_idx]

            histo_data.extend(data)
            weight_data.extend(weights)
            w2_data.extend(np.power(weights, 2))

    h = Histo(sample.name)
    h.lcd = lcd
    h.weights = weight_data
    h.histo_data = histo_data
    h.sumw2_histo_data = w2_data

    return h
Example #7
def get_yields(args, kind = '') :
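    """
    Compute the weighted and raw yields for the 'reco' or 'truth' signal
    file after either the NN selection (nn_d_hh > 6.2 on top of the
    nBJets/mbb/mt2_bb cuts) or the cut-based selection (args.cut_based).

    Returns :
        (total_counts_weighted, total_counts_raw)
    """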

    if not kind :
        print('did not provide kind')
        sys.exit()

    filename = {'reco' : reco_sig,
                'truth' : truth_sig}[kind]
    treename = {'reco' : 'superNt',
                'truth' : 'truth'}[kind]

    sample = Sample(kind, filename, '')
    data_scaler, model = load_stored_model(args.nn_dir)

    total_counts_raw = 0
    total_counts_weighted = 0.0

    with h5py.File(sample.filename, 'r', libver = 'latest') as sample_file :
        if treename not in sample_file :
            print('ERROR treename (={}) is not found in input file (={})'.format(treename, sample.filename))
            sys.exit()
        dataset = sample_file[treename]
        for chunk in chunk_generator(dataset) :

            weights = chunk['eventweight'] * lumi_factor

            if not args.cut_based :
                # calculate OTF
                input_features = chunk[data_scaler.feature_list()]
                input_features = floatify(input_features, data_scaler.feature_list())
                input_features = (input_features - data_scaler.mean()) / data_scaler.scale()
                scores = model.predict(input_features)

                nn_p_hh = scores[:,0]
                nn_p_tt = scores[:,1]
                nn_p_wt = scores[:,2]
                nn_p_zjets = scores[:,3]

                nn_d_hh = np.log( nn_p_hh / (nn_p_tt + nn_p_wt + nn_p_zjets) )
                ok_idx = (nn_d_hh > -np.inf) & (nn_d_hh < np.inf)
                weights = weights[ok_idx]
                chunk = chunk[ok_idx]
                nn_d_hh = nn_d_hh[ok_idx]  # keep the discriminant aligned with the filtered chunk

                selection_idx = (chunk['nBJets']>=2) & (chunk['mbb']>110) & (chunk['mbb']<140) & (chunk['mt2_bb']>65)
                nn_idx = nn_d_hh > 6.2
                #selection_idx = (chunk['nBJets']>=2) & (chunk['mbb']>100) & (chunk['mbb']<140) & (chunk['mt2_bb']>65)
                #nn_idx = nn_d_hh > 6.3

                print('nn_idx = {}'.format(nn_idx.any()))
                selection_idx = nn_idx & selection_idx

                weights = weights[selection_idx]
                data = chunk[selection_idx]

                total_counts_raw += data.size
                total_counts_weighted += np.sum(weights)
            else :
                selection_idx = (chunk['mll']>20.) & (chunk['l1_pt']>20.) & (chunk['nBJets']==2) & (chunk['dRll']<0.9) & (chunk['HT2Ratio']>0.8) & (chunk['mt2_bb']>150.) & (chunk['mbb']>100) & (chunk['mbb']<140) & (chunk['mt2_llbb']>100.) & (chunk['mt2_llbb']<140.)
                weights = weights[selection_idx]
                data = chunk[selection_idx]

                total_counts_raw += data.size
                total_counts_weighted += np.sum(weights)

    print('yields for {}: {} ({})'.format(kind, total_counts_weighted, total_counts_raw))
    return total_counts_weighted, total_counts_raw
Example #8
def dump_scores(input_file, model, data_scaler, args):
    """
    From the input HDF5 file, go through it and get the NN output
    for the features, storing them to a single output file whose
    name is based on the input filename.

    Args :
        input_file : input filename for HDF5 file to be opened and processed
        model : loaded Keras model
        data_scaler : loaded DataScaler object used to scale the input features
            prior to network evaluation
        args : command line inputs
    """

    outname = input_file.split("/")[-1].replace(".h5", "").replace(".hdf5", "")
    outname += "_scores.h5"
    if args.outdir != "":
        mkdir_p(args.outdir)
    outname = "{}/{}".format(args.outdir, outname)

    #out_ds_created = False
    #out_ds = None

    #gen = chunk_generator(input_file, dataset_name = args.dataset)
    #chunk = next(gen)
    #chunk = chunk [ (chunk['nBJets']==2) ]
    #row_count = chunk.shape[0]

    #weights = chunk['eventweight']
    #input_features = chunk[data_scaler.feature_list()]
    #input_features = floatify(chunk[data_scaler.feature_list()], data_scaler.feature_list())
    #input_features = (input_features - data_scaler.mean()) / data_scaler.scale()
    #scores = model.predict(input_features)
    #n_outputs = scores.shape[1]

    #ds = np.array( list(weights), dtype = [('eventweight', float)])
    #for io in range(n_outputs) :
    #    ds = recfunctions.append_fields( ds , names = 'nn_score_{}'.format(io), data = scores[:,io], dtypes = float )
    #dtype = ds.dtype
    #row_count = ds.shape[0]

    dataset_id = 0
    with h5py.File(outname, 'w', libver='latest') as outfile:

        for chunk in chunk_generator(input_file, dataset_name=args.dataset):

            # apply the selection here
            chunk = chunk[(chunk['nBJets'] >= 1) & (chunk['HT2Ratio'] > 0.5)]  # & (chunk['mt2_bb'] > 65) & (chunk['l1_pt']>20.) & (chunk['mll']>20.) ]
            #chunk = chunk[ (chunk['nBJets'] >= 1) & (chunk['mt2_bb'] > 55) & (chunk['l1_pt']>20.) & (chunk['mll']>20.) ]
            #chunk = chunk[ (chunk['nBJets'] >= 1) & (chunk['mt2_bb'] > 65) & (chunk['HT2Ratio']>0.5) ]
            if chunk.size == 0: continue

            weights = chunk['eventweight']
            input_features = chunk[data_scaler.feature_list()]
            input_features = floatify(input_features,
                                      data_scaler.feature_list())
            input_features = (input_features -
                              data_scaler.mean()) / data_scaler.scale()
            scores = model.predict(input_features)
            n_outputs = scores.shape[1]
            discriminants = build_discriminant_array(scores, n_outputs)

            ds = np.array(list(weights), dtype=[('eventweight', float)])
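            # append one 'nn_score_<i>' and one 'nn_disc_<i>' column per NN output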
            for io in range(n_outputs):
                ds = recfunctions.append_fields(ds,
                                                names='nn_score_{}'.format(io),
                                                data=scores[:, io],
                                                dtypes=np.float64)
            for io in range(n_outputs):
                ds = recfunctions.append_fields(ds,
                                                names='nn_disc_{}'.format(io),
                                                data=discriminants[io],
                                                dtypes=np.float64)
            maxshape = (None, ) + ds.shape[1:]

            dsname = "nn_scores_{}".format(dataset_id)
            out_ds = outfile.create_dataset(dsname,
                                            shape=ds.shape,
                                            maxshape=maxshape,
                                            chunks=ds.shape,
                                            dtype=ds.dtype)
            out_ds[:] = ds
            dataset_id += 1

    print(" > output saved : {}".format(os.path.abspath(outname)))
def get_data(sample, scaler, model, to_do) :
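    """
    Evaluate the NN on the sample's 'superNt' events with nBJets >= 1 and
    return (data, weights, axis label) for the requested quantity: a raw
    score ('hh', 'tt', 'wt', 'z') or a log-ratio discriminant
    ('d_hh', 'd_tt', 'd_wt', 'd_z').
    """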

    data = []
    w = []
    name = ""
    total_read = 0.0

    with h5py.File(sample.filename, 'r', libver = 'latest') as sample_file :
        if 'superNt' not in sample_file :
            print('ERROR superNt dataset not found in input file (={})'.format(sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name :
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset) :
            total_read += chunk.size
            if total_read > 1e6 : break
            print('{} > {}'.format(sample.name, total_read))
            chunk = chunk[ (chunk['nBJets'] >= 1 ) ]
            weights = chunk['eventweight'] * 36.1
            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            #to_do = "hh"
            p_hh = scores[:,0]
            p_tt = scores[:,1]
            p_wt = scores[:,2]
            p_z  = scores[:,3]

            hist_data = None
            if 'd_' in to_do :
                num_data = None
                den_data = None
                if to_do == 'd_hh' :
                    num_data = p_hh
                    den_data = (p_tt + p_wt + p_z)
                elif to_do == 'd_tt' :
                    num_data = p_tt
                    den_data = (p_hh + p_wt + p_z)
                elif to_do == 'd_wt' :
                    num_data = p_wt
                    den_data = (p_hh + p_tt + p_z)
                elif to_do == 'd_z' :
                    num_data = p_z
                    den_data = (p_hh + p_tt + p_wt)

                d = np.log(num_data / den_data)
                idx = (d > -np.inf) & (d < np.inf)
                d = d[idx]
                weights = weights[idx]
                hist_data = d[:]

            else :
                hist_data = { "hh" : p_hh,
                                "tt" : p_tt,
                                "wt" : p_wt,
                                "z" : p_z } [ to_do ]

            name = { "hh" : "$p_{hh}$",
                        "tt" : "$p_{t\\bar{t}}$",
                        "wt" : "$p_{Wt}$",
                        "z" : "$p_{Z}$" } [ to_do.replace('d_','') ]

            if 'd_' in to_do :
                name = name.replace('p_', 'd_')

            data.extend(hist_data)
            w.extend(weights)

    return data, w, name
Example #10
def make_plot(var_dict, sample, scaler, model, args):
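    """
    Fill and save a 2D histogram of args.varX versus args.varY (NN scores,
    log-ratio discriminants, or input variables) for the sample's 'superNt'
    events with nBJets >= 2. The plot is written to
    ./plots_input_output/input_output_2D_<process>_<varX>_<varY>.pdf.
    """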

    x_data = []
    y_data = []
    w_data = []

    total_read = 0

    disc_for_x = args.varX

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input file (={})'.format(
                sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        is_first = True
        if 'data' in sample.filename:
            is_first = False
        for chunk in chunk_generator(dataset, chunksize=19000):
            if is_first:
                is_first = False
                continue

            total_read += chunk.size
            if total_read > 1e6: break

            idx = (chunk['nBJets'] >= 2)

            weights = chunk['eventweight'][idx]
            chunk = chunk[idx]

            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            p_hh = scores[:, 0]
            p_tt = scores[:, 1]
            p_wt = scores[:, 2]
            p_z = scores[:, 3]

            i_x_data = p_hh
            if 'p_' in disc_for_x:
                i_x_data = {
                    'p_hh': p_hh,
                    'p_tt': p_tt,
                    'p_wt': p_wt,
                    'p_z': p_z
                }[disc_for_x]

            elif 'd_' in disc_for_x:
                num_data = {
                    'd_hh': p_hh,
                    'd_tt': p_tt,
                    'd_wt': p_wt,
                    'd_z': p_z
                }[disc_for_x]
                den_data = {
                    'd_hh': (p_tt + p_wt + p_z),
                    'd_tt': (p_wt + p_hh + p_z),
                    'd_wt': (p_hh + p_tt + p_z),
                    'd_z': (p_tt + p_wt + p_hh)
                }[disc_for_x]
                disc = np.log(num_data / den_data)
                idx = valid_idx(disc)

                p_hh = p_hh[idx]
                p_tt = p_tt[idx]
                p_wt = p_wt[idx]
                p_z = p_z[idx]

                disc = disc[idx]
                weights = weights[idx]
                chunk = chunk[idx]
                i_x_data = disc

            i_y_data = None
            if 'p_' in args.varY or 'd_' in args.varY:
                if 'p_' in args.varY:
                    i_y_data = {
                        'p_hh': p_hh,
                        'p_tt': p_tt,
                        'p_wt': p_wt,
                        'p_z': p_z
                    }[args.varY]
                elif 'd_' in args.varY:
                    num_data = {
                        'd_hh': p_hh,
                        'd_tt': p_tt,
                        'd_wt': p_wt,
                        'd_z': p_z
                    }[args.varY]
                    den_data = {
                        'd_hh': (p_tt + p_wt + p_z),
                        'd_tt': (p_wt + p_hh + p_z),
                        'd_wt': (p_hh + p_tt + p_z),
                        'd_z': (p_tt + p_wt + p_hh)
                    }[args.varY]
                    y_disc = np.log(num_data / den_data)
                    #idx = valid_idx(y_disc)
                    #y_disc = y_disc[idx]
                    #weights = weights[idx]
                    #chunk = chunk[idx]
                    i_y_data = y_disc
            else:
                i_y_data = chunk[args.varY]

            x_data.extend(list(i_x_data))
            y_data.extend(list(i_y_data))
            w_data.extend(list(weights))

    x_data = np.array(x_data)
    y_data = np.array(y_data)
    w_data = np.array(w_data)

    fig, ax = plt.subplots(1, 1)
    ax.grid(color='k', which='both', linestyle='-', lw=0.5, alpha=0.1)
    ax.tick_params(axis='both',
                   which='both',
                   direction='in',
                   labelleft=True,
                   bottom=True,
                   top=True,
                   right=True,
                   left=True)

    var_dict = all_vars()
    x_bounds = var_dict[disc_for_x]['bounds']
    y_bounds = var_dict[args.varY]['bounds']

    x_label = var_dict[disc_for_x]['name']
    y_label = var_dict[args.varY]['name']

    x_edges = np.arange(x_bounds[1], x_bounds[2] + x_bounds[0], x_bounds[0])
    y_edges = np.arange(y_bounds[1], y_bounds[2] + y_bounds[0], y_bounds[0])

    bins = [x_edges, y_edges]

    ax.set_xlabel(x_label, horizontalalignment='right', x=1)
    ax.set_ylabel(y_label, horizontalalignment='right', y=1)

    print('x_data shape = {}'.format(x_data.shape))
    print('y_data shape = {}'.format(y_data.shape))
    h, x, y = np.histogram2d(x_data, y_data, bins=bins, density=False)  # 'normed' keyword was removed in recent numpy
    #integral = h.sum()
    #h = h / integral
    imextent = list((min(x_edges), max(x_edges))) + list(
        (min(y_edges), max(y_edges)))
    ax.set_facecolor('lightgrey')
    h = h.T
    im = ax.imshow(h,
                   origin='lower',
                   cmap='coolwarm',
                   aspect='auto',
                   interpolation='nearest',
                   extent=imextent,
                   norm=LogNorm())
    ax.contour(h, levels=[1, 3, 10], colors='black', extent=imextent)
    cb = fig.colorbar(im)

    process = ''
    if 'wt' in sample.filename:
        process = 'wt'
    elif '123456' in sample.filename:
        process = 'hh'
    elif 'ttbar' in sample.filename or '410009' in sample.filename:
        process = 'ttbar'
    elif 'zll' in sample.filename or 'zjets' in sample.filename:
        process = 'zjets'
    elif 'ztt' in sample.filename:
        process = 'zjets_tt'
    elif 'data' in sample.filename:
        process = 'data'

    ax.text(0.85, 0.93, process, weight='bold', transform=ax.transAxes)

    outname = './plots_input_output/input_output_2D_{}_{}_{}.pdf'.format(
        process, disc_for_x, args.varY)
    print(' >> saving plot to: {}'.format(os.path.abspath(outname)))
    fig.savefig(outname, bbox_inches='tight', dpi=200)