Example #1
def test_flatten():
    tf = ROOT.TFile('tmp.root', 'RECREATE')
    tt = ROOT.TTree("a", "a")

    length = np.array([3], dtype='int32')  # match the 32-bit 'length/I' branch descriptor
    x = np.array([0, 1, 2], dtype='float64')
    tt.Branch('length', length, 'length/I')
    tt.Branch('x', x, 'x[length]/D')

    tt.Fill()
    x[0] = 3
    x[1] = 4
    x[2] = 5
    tt.Fill()
    
    tf.Write()
    tf.Close()

    branches = list_branches('tmp.root')

    df_ = read_root('tmp.root', flatten=True)

    assert('__array_index' in df_.columns)
    assert(len(df_) == 6)
    assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))

    # Also flatten chunked data

    for df_ in read_root('tmp.root', flatten=True, chunksize=1):
        assert(len(df_) == 3)
        assert(np.all(df_['__array_index'] == np.array([0, 1, 2])))

    os.remove('tmp.root')
Example #2
def test_read_write():
    df = pd.DataFrame({'x': [1,2,3]})
    df.to_root('tmp.root')
    df_ = read_root('tmp.root')
    os.remove('tmp.root')

    df.to_root('tmp.root', key='mykey')
    df_ = read_root('tmp.root', key='mykey')
    assert_frame_equal(df, df_)
    os.remove('tmp.root')

    tf = ROOT.TFile('tmp.root', 'recreate')
    tt = ROOT.TTree("a", "a")

    x = np.array([1], dtype='float64')  # 'x/D' expects a double buffer
    x[0] = 42
    tt.Branch('x', x, 'x/D')

    tt.Fill()
    x[0] = 1
    tt.Fill()
    tt.Write()
    tf.Close()

    # Read when no index is present
    df = read_root('tmp.root', columns=['x'])
    os.remove('tmp.root')
Example #3
def get_list_branches(root_file, directory='', tree='DecayTree'):
    '''Return the list of branches of a TTree as pandas columns.
    Arguments:
        root_file : the name of the file, can be eos://...
        directory : the name of the directory, or even path/to/TTree
        tree      : name of the TTree
    '''
    if not directory:
        df = root_pandas.read_root(root_file, stop=1, key='{0}'.format(tree))
    else:
        df = root_pandas.read_root(root_file, stop=1, key='{0}/{1}'.format(directory, tree))
    return df.columns
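
A possible call of the helper above; the file name and directory are placeholders chosen for illustration:

branches = get_list_branches('ntuple.root', directory='Tuple', tree='DecayTree')
print(list(branches))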
Example #4
def test_ignore_columns():
    df = pd.DataFrame({'x': [1,2,3], 'y1': [2,3,4], 'y2': [3,4,5]})
    df.to_root('tmp.root')

    df = read_root('tmp.root', ignore=['y1'])
    assert(df.columns[0] == 'x' and df.columns[1] == 'y2')

    df = read_root('tmp.root', ignore=['y*'])
    assert(df.columns == ['x'])

    # Test interaction with columns kwarg
    df = read_root('tmp.root', columns=['y*'], ignore=['*1'])
    assert(df.columns == ['y2'])

    os.remove('tmp.root')
Example #5
def resample_branch(options):
    # maybe use pyroot here instead of root_pandas
    import pickle
    from root_pandas import read_root

    with open(options.configfile) as f:
        config = json.load(f)

    #load resamplers into config dictionary
    for task in config["tasks"]:
        with open(task["resampler_path"], 'rb') as f:
            resamplers = pickle.load(f)
            for pid in task["pids"]:
                try:
                    pid["resampler"] = resamplers[pid["kind"]]
                except KeyError:
                    logging.error("No resampler found for "+task["particle"]+" and "+pid["kind"]+".")
                    raise

    chunksize = 100000
    for i, chunk in enumerate(read_root(options.source_file, ignore=["*_COV_"], chunksize=chunksize)):
        for task in config["tasks"]:
            deps = chunk[task["features"]]
            for pid in task["pids"]:
                chunk[pid["name"]] = pid["resampler"].sample(deps.values.T)
        chunk.to_root(options.output_file, mode="a")
        logging.info('Processed {} entries'.format((i+1) * chunksize))
Example #6
def test_drop_nonscalar_columns():
    array = np.array([1, 2, 3])
    matrix = np.array([[1, 2, 3], [4, 5, 6]])
    bool_matrix = np.array([[True, False, True], [True, True, True]])

    dt = np.dtype([
        ('a', 'i4'),
        ('b', 'int64', array.shape),
        ('c', 'int64', matrix.shape),
        ('d', 'bool_'),
        ('e', 'bool_', matrix.shape)
        ])
    arr = np.array([
        (3, array, matrix, True, bool_matrix),
        (2, array, matrix, False, bool_matrix)],
        dtype=dt)

    path = 'tmp.root'
    array2root(arr, path, 'ntuple', mode='recreate')

    df = read_root(path, flatten=False)
    # the above line throws an error if flatten=True because nonscalar columns
    # are dropped only after the flattening is applied. However, the flattening
    # algorithm can not deal with arrays of more than one dimension.
    assert(len(df.columns) == 2)
    assert(np.all(df.index.values == np.array([0, 1])))
    assert(np.all(df.a.values == np.array([3, 2])))
    assert(np.all(df.d.values == np.array([True, False])))

    os.remove(path)
Example #7
def resample_branch(options):
    import pickle
    from root_pandas import read_root
    try:
        os.remove(options.output_file)
    except OSError:
        pass

    with open(options.configfile) as f:
        config = json.load(f)

    #load resamplers into config dictionary
    for task in config["tasks"]:
        with open(task["resampler_path"], 'rb') as f:
            resamplers = pickle.load(f)
            for pid in task["pids"]:
                try:
                    pid["resampler"] = resamplers[pid["kind"]]
                except KeyError:
                    print (resamplers)
                    logging.error("No resampler found for {kind} in {picklefile}.".format(kind=pid["kind"], picklefile=task["resampler_path"]))
                    raise

    chunksize = 100000
    for i, chunk in enumerate(read_root(options.source_file, tree_key=options.tree, ignore=["*_COV_"], chunksize=chunksize)):
        for task in config["tasks"]:
            deps = chunk[task["features"]]
            for pid in task["pids"]:
                chunk[pid["name"]] = pid["resampler"].sample(deps.values.T)
        chunk.to_root(options.output_file, mode="a", tree_key=options.tree)
        logging.info('Processed {} entries'.format((i+1) * chunksize))
Example #8
def test_persistent_index():
    df = pd.DataFrame({'index': [42, 0, 1], 'x': [1,2,3]})
    df = df.set_index('index')
    df.index.name = 'MyAwesomeName'
    df.to_root('tmp.root')
    assert('__index__MyAwesomeName' in list_branches('tmp.root'))
    df_ = read_root('tmp.root')
    assert_frame_equal(df, df_)
    os.remove('tmp.root')

    # See what happens if the index has no name
    df = pd.DataFrame({'x': [1,2,3]})
    df.to_root('tmp.root')
    df_ = read_root('tmp.root')
    assert_frame_equal(df, df_)
    os.remove('tmp.root')
Example #9
    def get_DataFrame(self, columns, query=None):
        """
        Returns the desired DataFrame.

        columns : list of strings
            List of columns that are needed. Keep this list minimal, but every
            column that is used in the query must be listed here.
        query : string
            A string to pass to the pandas.DataFrame.query method.
        """
        # decrease priority for existing columns
        for column in self.columns_needed:
            self.columns_priority[column] -= 1
        # join the new columns into the set
        self.columns_needed = list(set(list(self.columns_needed) + columns))
        # set the priority of the new columns to max
        for column in columns:
            self.columns_priority[column] = DataFrameManagerROOT.max_priority

        self._prune_columns_needed()

        columns_to_load = [ x for x in self.columns_needed if x not in self._raw_dataset.columns.values ]
        if columns_to_load:
            loaded = root_pandas.read_root(self.filename, self.treename, columns=columns_to_load)
            assert( len(loaded.index) == len(self._raw_dataset.index) or len(self._raw_dataset.index) == 0 )
            self._raw_dataset = pandas.concat([self._raw_dataset, loaded] , axis=1)


        if query is None:
            return self._raw_dataset
        else:
            return self._raw_dataset.query(query)
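
The lazy column loading used in get_DataFrame can be sketched in isolation; a minimal standalone version, where the cache, file name and tree name are illustrative stand-ins rather than parts of the class above:

import pandas
import root_pandas

_cache = pandas.DataFrame()

def get_columns(filename, treename, columns):
    """Return the requested branches, reading only the ones not cached yet."""
    global _cache
    missing = [c for c in columns if c not in _cache.columns]
    if missing:
        loaded = root_pandas.read_root(filename, treename, columns=missing)
        _cache = pandas.concat([_cache, loaded], axis=1)
    return _cache[columns]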
Example #10
def test_noexpand_prefix():
    xs = np.array([1, 2, 3])
    df = pd.DataFrame({'x': xs})
    df.to_root('tmp.root')

    # Not using the prefix should throw, as there's no matching branch name
    try:
        df = read_root('tmp.root', columns=['2*x'])
    except ValueError:
        pass
    else:
        assert False

    # Could also use TMath::Sqrt here
    df = read_root('tmp.root', columns=['noexpand:2*sqrt(x)'])
    # Note that the column name shouldn't have the noexpand prefix
    assert np.all(df['2*sqrt(x)'].values == 2*np.sqrt(xs))

    os.remove('tmp.root')
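
The noexpand prefix can also be mixed with plain branch names in the same columns list; a small additional check in the style of the test above (the file name is chosen only for illustration):

def test_noexpand_mixed_columns():
    xs = np.array([1, 2, 3])
    pd.DataFrame({'x': xs}).to_root('tmp_noexpand.root')
    # 'x' is read as a plain branch, '2*x' is evaluated as a formula
    df = read_root('tmp_noexpand.root', columns=['x', 'noexpand:2*x'])
    assert np.all(df['2*x'].values == 2 * df['x'].values)
    os.remove('tmp_noexpand.root')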
Example #11
def create_resamplers(options):
    import os.path
    import pickle
    from root_pandas import read_root
    from PIDPerfScripts.Binning import GetBinScheme

    pid_variables = ['{}_CombDLLK', '{}_CombDLLmu', '{}_CombDLLp', '{}_CombDLLe',
                    '{}_V3ProbNNK', '{}_V3ProbNNpi', '{}_V3ProbNNmu', '{}_V3ProbNNp', '{}_V3ProbNNe', '{}_V3ProbNNghost',
                    '{}_V3ProbNNK_Trafo', '{}_V3ProbNNpi_Trafo', '{}_V3ProbNNmu_Trafo', '{}_V3ProbNNp_Trafo', '{}_V3ProbNNe_Trafo', '{}_V3ProbNNghost_Trafo',        #transformed variables with log( var/(1-var) )
                    '{}_V2ProbNNK', '{}_V2ProbNNpi', '{}_V2ProbNNmu', '{}_V2ProbNNp', '{}_V2ProbNNe', '{}_V2ProbNNghost',
                    '{}_V2ProbNNK_Trafo', '{}_V2ProbNNpi_Trafo', '{}_V2ProbNNmu_Trafo', '{}_V2ProbNNp_Trafo', '{}_V2ProbNNe_Trafo', '{}_V2ProbNNghost_Trafo']        #transformed variables with log( var/(1-var) )
    kin_variables = ['{}_P', '{}_Eta','nTracks']

    with open('raw_data.json') as f:
        locations = json.load(f)
    if options.particles:
        locations = [sample for sample in locations if sample["particle"] in options.particles]
    if options.both_magnet_orientations:
        locations = [sample for sample in locations if sample["magnet"]=="Up"] # we use both magnet orientations on the first run
    for sample in locations:
        binning_P = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "P", None)) #last argument takes name of user-defined binning
        binning_ETA = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "ETA", None)) #last argument takes name of user-defined binning TODO: let user pass this argument
        binning_nTracks = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "nTracks", None)) #last argument takes name of user-defined binning TODO: let user pass this argument
        if options.both_magnet_orientations:
            if sample["magnet"]=="Up":
                data =  [options.location + '/{particle}_Stripping{stripping}_MagnetUp.root'  .format(**sample)]
                data += [options.location + '/{particle}_Stripping{stripping}_MagnetDown.root'.format(**sample)]
                resampler_location = options.saveto + '/{particle}_Stripping{stripping}_MagnetAny.pkl'.format(**sample)
        else:
            data = [options.location + '/{particle}_Stripping{stripping}_Magnet{magnet}.root'.format(**sample)]
            resampler_location = options.saveto + '/{particle}_Stripping{stripping}_Magnet{magnet}.pkl'.format(**sample)
        if os.path.exists(resampler_location):
            os.remove(resampler_location)
        resamplers = dict()
        deps = [x.format(sample['branch_particle']) for x in kin_variables]
        pids = [x.format(sample['branch_particle']) for x in pid_variables]
        for pid in pids:
            if "DLL" in pid:
                target_binning = np.linspace(-150, 150, 300) # binning for DLL
            elif "ProbNN" in pid and "Trafo" in pid:
                target_binning = np.linspace(-30, 30, 1000) # binning for transformed ProbNN
            elif "ProbNN" in pid:
                target_binning = np.linspace(0, 1, 100) # binning for (raw) ProbNN
            else:
                raise Exception
            resamplers[pid] = Resampler(binning_P, binning_ETA, binning_nTracks, target_binning)

        for dataSet in data:
            for i, chunk in enumerate(read_root(dataSet, columns=deps + pids + ['nsig_sw'], chunksize=100000, where=options.cutstring)): # where is None if option is not set
                for pid in pids:
                    resamplers[pid].learn(chunk[deps + [pid]].values.T, weights=chunk['nsig_sw'])
                logging.info('Finished chunk {}'.format(i))
        with open(resampler_location, 'wb') as f:
            pickle.dump(resamplers, f)
Example #12
def test_chunked_reading():
    df = pd.DataFrame({'x': [1,2,3,4,5,6]})
    df.to_root('tmp.root')

    count = 0
    for df_ in read_root('tmp.root', chunksize=2):
        assert(not df_.empty)
        count += 1

    assert count == 3
    os.remove('tmp.root')
Example #13
def test_multiple_files():
    df = pd.DataFrame({'x': [1,2,3,4,5,6]})
    df.to_root('tmp1.root')
    df.to_root('tmp2.root')
    df.to_root('tmp3.root')

    df_ = read_root(['tmp1.root', 'tmp2.root', 'tmp3.root'])

    assert(len(df_) == 3 * len(df))

    # Also test chunked read of multiple files

    counter = 0
    for df_ in read_root(['tmp1.root', 'tmp2.root', 'tmp3.root'], chunksize=3):
        assert(len(df_) == 3)
        counter += 1
    assert(counter == 6)

    os.remove('tmp1.root')
    os.remove('tmp2.root')
    os.remove('tmp3.root')
Example #14
def plot(data, plotfile, mcfile=None, cuts=None, variables=None, bins=30):
    import numpy as np
    from root_numpy import root2array
    import matplotlib.pyplot as plt
    from matplotlib.colors import LogNorm
    from matplotlib.backends.backend_pdf import PdfPages
    #sns.set_palette("deep", desat=.6)
    #sns.set_context('talk')

    if cuts is None:
        cuts = []

    if variables is None:
        variables = []

    arr = read_root(data, where=prepare_sel(cuts))

    if mcfile:
        arr_mc = read_root(mcfile, where=prepare_sel(cuts))

    logger.info('Saving plots to {}'.format(plotfile))
    with PdfPages(plotfile) as pdf:
        for col in arr.columns:
            logger.debug('Plotting ' + col)
            x = arr[col]
            n, bine, _ = plt.hist(x.values, histtype='stepfilled', bins=bins, color='blue')

            if mcfile:
                x_mc = arr_mc[col]
                if col in arr_mc.columns:
                    n_mc, edges = np.histogram(arr_mc[col], bine)
                    binned_hist(plt.gca(), n_mc, edges, histtype='stepfilled', alpha=0.7)

                    #plt.hist(x_mc, histtype='stepfilled', bins=bins, alpha=0.8, normed=True)
            #plt.yscale('log')
            plt.xlabel(col)
            plt.ylim(0, max(n) * 1.05)
            pdf.savefig()
            plt.clf()
Example #15
def get_event_number(config):
    """ Compute the total number of events contained in the base tuples of
    a given configuration.

    Parameters
    ----------
    config : dictionary
        expected to contain the keys
            - 'filepath'
            - 'files'
            - 'pandas_kwargs'
    """
    files = [config['filepath'] + f for f in config['files']]
    df = read_root(files, key=config['pandas_kwargs']['key'],
                   columns=['SigYield_sw', 'nCandidate'])
    return df[df.nCandidate == 0].SigYield_sw.sum()
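
A hypothetical configuration for get_event_number; the path, file names and key are placeholders, not taken from a real analysis:

config = {
    'filepath': '/path/to/tuples/',
    'files': ['data_2016.root', 'data_2017.root'],
    'pandas_kwargs': {'key': 'DecayTree'},
}
n_events = get_event_number(config)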
Example #16
def set_target(treeName, branch_names, target, cuts):
    for i in range(utils.IO.nTarget):
        tmp_data_frame = (rpd.read_root(utils.IO.targetName[i],
                                        treeName,
                                        columns=branch_names)).query(cuts)
        utils.IO.target_df.append(tmp_data_frame)

    for j in range(len(target)):
        if j == 0:
            X_target = tmp_data_frame[[target[j].replace('noexpand:', '')]]
        else:
            X_target = np.concatenate([
                X_target, tmp_data_frame[[target[j].replace('noexpand:', '')]]
            ],
                                      axis=1)

    return np.round(X_target, 5)
Example #17
def test_brace_pattern_in_columns():
    reference_df = pd.DataFrame()
    reference_df['var1'] = np.array([1, 2, 3])
    reference_df['var2'] = np.array([4, 5, 6])
    reference_df['var3'] = np.array([7, 8, 9])
    reference_df['var{03}'] = np.array([10, 11, 12])
    reference_df['var{04}'] = np.array([13, 14, 15])
    reference_df['var{5}'] = np.array([16, 17, 18])
    reference_df['var01'] = np.array([1.1, 2.1, 3.1])
    reference_df['var02'] = np.array([4.1, 5.1, 6.1])
    reference_df['var03'] = np.array([7.1, 8.1, 9.1])
    reference_df['var11'] = np.array([10.1, 11.1, 12.1])
    reference_df['var12'] = np.array([13.1, 14.1, 15.1])
    reference_df['var13'] = np.array([16.1, 17.1, 18.1])
    reference_df.to_root('tmp.root')

    # Try looking for a column that doesn't exist
    with assert_raises(ValueError):
        read_root('tmp.root', columns=['var{1,2,4}'])

    # Simple expansion
    df = read_root('tmp.root', columns=['var{1,2}'])
    assert set(df.columns) == {'var1', 'var2'}
    assert_frame_equal(df[['var1', 'var2']], reference_df[['var1', 'var2']])

    # Single expansion with braces in name
    df = read_root('tmp.root', columns=['var{5}'])
    assert set(df.columns) == {'var{5}'}
    assert_frame_equal(df[['var{5}']], reference_df[['var{5}']])

    # Single expansion with braces in name
    df = read_root('tmp.root', columns=['var{03}'])
    assert set(df.columns) == {'var{03}'}
    assert_frame_equal(df[['var{03}']], reference_df[['var{03}']])

    # Multiple expansions with braces in name
    df = read_root('tmp.root', columns=[r'var{{03},2,{04}}'])
    assert set(df.columns) == {'var{03}', 'var2', 'var{04}'}
    assert_frame_equal(df[['var{03}', 'var2', 'var{04}']],
                       reference_df[['var{03}', 'var2', 'var{04}']])

    # Recursive expansions
    df = read_root('tmp.root', columns=[r'var{0{2,3},1{1,3}}'])
    assert set(df.columns) == {'var02', 'var03', 'var11', 'var13'}
    assert_frame_equal(df[['var02', 'var03', 'var11', 'var13']],
                       reference_df[['var02', 'var03', 'var11', 'var13']])

    os.remove('tmp.root')
Example #18
def main():

    channel = "mt"

    filename = "{0}-NOMINAL_ntuple_Data.root".format(channel)
    dirpath = "/eos/user/m/msajatov/data/ntuples_scp/v10"
    path = os.path.join(dirpath, filename)

    Cut.cutfile = "./cuts_2017.json"
    cut = Cut(cutstring="-OS- && -ISO- && -VETO- && -MT- && -TRIG-",
              channel=channel)

    branches = [
        "pt_1", "pt_2", "jpt_1", "jpt_2", "bpt_1", "bpt_2", "njets", "nbtag",
        "m_sv", "mt_1", "mt_2", "pt_vis", "pt_tt", "mjj", "jdeta", "m_vis",
        "dijetpt", "met", "eta_1", "eta_2", "iso_1", "iso_2"
    ] + [
        "evt", "by*IsolationMVArun2017v2DBoldDMwLT2017*", "pt_1", "pt_2",
        "q_1", "q_2", "iso_1", "iso_2", "phi_1", "phi_2", "eta_1", "eta_2",
        "mt_1", "njets", "decayMode_1", "decayMode_2", "dilepton_veto",
        "extraelec_veto", "extramuon_veto", "againstMuon*", "againstElectron*",
        "flagMETFilter", "trg*", "*Weight*", "*weight*", "htxs*"
    ] + [
        "*weight*", "gen_match*", "topPtReweightWeight*", "zPtReweightWeight",
        "sf*", "njets", "jpt_1", "jdeta", "mjj"
    ]

    # chunksize is the number of events BEFORE the selection (where) is applied
    # for tt a chunksize of 5000 results in 37 events in the SR
    # for mt a chunksize of 2000 results in 36 events in the SR
    # for et a chunksize of 1000 results in 34 events in the SR
    df_iter = rp.read_root(paths=path,
                           where=cut.getForDF(),
                           columns=branches,
                           chunksize=2000)

    print(df_iter)

    # get first chunk only (there must be a better way to do this but next() is not implemented for the genchunk...)
    for df in df_iter:
        print(df)
        break

    outpath = "../testdata/{0}_test.root".format(channel)
    df.to_root(outpath, key="TauCheck", mode="w")
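
If the chunked reader behaves like a plain iterable (it is a generator in recent root_pandas versions), the first chunk can also be pulled without the for/break construction above; a small alternative sketch using the same df_iter:

df = next(iter(df_iter))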
Example #19
def skim():
    #open only the columns we need (for speed) over all the trees (files in folder)
    for i_file, file in enumerate(sorted(os.listdir(args.trees))):
        for i_key, key in enumerate(keys):
            print("Opening", key, "data in", args.trees + "/" + file)
            data_all = read_root(args.trees + "/" + file,
                                 key,
                                 columns=[
                                     "station", "trackT0", "trackMomentum",
                                     "trackMomentumY", "trackPValue"
                                 ])
            data_all['trackT0'] = data_all['trackT0'] * 1e-3  # ns -> us
            total_tv[i_key] += data_all.shape[0]  # add to total from each file
            print("Total of", data_all.shape[0], "entries")

            #define the time and energy cuts (otherwise the wiggle looks wobbly!)
            time_cut = (data_all['trackT0'] > args.t_cut
                        )  # us, define a time_cut with time > 30 us
            mom_cut = (data_all['trackMomentum'] > args.p_cut)  # MeV
            #Apply cuts in one go!
            data = data_all[time_cut & mom_cut]
            total_tv_skim[i_key] += data.shape[
                0]  # add to total from each file
            print("Total of", data.shape[0],
                  "entries after energy and momentum cuts")

            #save the skimmed dataframe in a compressed HDF5 format
            # here we are appending to the file over tracks and then vertices
            print("Saving compressed data...")
            data = data.reset_index()  # reset index from 0
            cols_to_keep = ["trackMomentum",
                            "trackPValue"]  # only write for time and station
            # cols_to_keep = ["station", "trackT0", "trackMomentum", "trackMomentumY"] # only write for time and station
            data[cols_to_keep].to_hdf(args.df + "_" + str(i_file) + ".h5",
                                      key=key,
                                      mode='a',
                                      complevel=9,
                                      complib="zlib",
                                      format="fixed")
            print("Skimmed dataframe saved to disk", args.df, "\n")

    print("Grand total of (M)", total_tv[0] / 1e6, "tracks,",
          total_tv[1] / 1e6, "vertices")
    print("After the cuts (M)", total_tv_skim[0] / 1e6, "tracks,",
          total_tv_skim[1] / 1e6, "vertices")
Example #20
    def ev_score_toROOT(self, savepredicOnly=False):
        '''evaluate the score for the loaded model'''
        L_varList = self.varlist()
        #get the model
        self.load_model()

        df = read_root(self.infile,
                       'sf/t',
                       columns=L_varList,
                       flatten=[
                           'DLMS_ST', 'DLMS_HT', 'DLMS_dPhiLepW',
                           'DLMS_nJets30Clean'
                       ])
        print('df loaded')
        print(" going to evalute the score from ", self.pathToModel)
        if '_SMS_' not in self.infile:
            df.loc[:, 'mGo'] = float(self.mGo)
            df.loc[:, 'mLSP'] = float(self.mLSP)
        print('prediction will be made to mGo = ', self.mGo, ' and mLSP =  ',
              self.mLSP)

        #print (df['mGo'].dtype)

        if self.do_multiClass:
            self.model.compile(loss='sparse_categorical_crossentropy',
                               metrics=['accuracy'],
                               optimizer='adam')
            prediction = self.model.predict_proba(df[self.var_list].values)
            if not savepredicOnly:
                for mm, mult in enumerate(self.ClassList):
                    df.loc[:, mult] = prediction[:, mm]
            else:
                df = pd.DataFrame(prediction, columns=self.ClassList)
        else:
            self.model.compile(loss='binary_crossentropy',
                               optimizer='adam',
                               metrics=['accuracy'])
            if not savepredicOnly:
                df.loc[:, 'DNN'] = self.model.predict(df[self.var_list])
            else:
                df = pd.DataFrame(self.model.predict(df[self.var_list]),
                                  columns=["DNN"])
        df.to_root(self.outdir + '/' + self.infile.split("/")[-1], key='sf/t')
        print("out put fle is wrote to ",
              self.outdir + '/' + self.infile.split("/")[-1])
Example #21
def LoadBDTdf(dtype, polarity):
    df_list_bdt = []
    ifname = filedir + 'Data/Lb_' + dtype + '_' + polarity + '.root'
    bdtfname = ifname[0:-5] + '_MVA.root'
    if os.path.isfile(bdtfname):
        print('BDT file already created')
    else:
        print()
        print('>>>   Creating file with BDT variable')
        print()
        AddBDTinfo(ifname,
                   'tupleout/DecayTree',
                   bdtfname,
                   'Data',
                   pickled_model_path='../PIDGen_PIDCalib_MVA/xgb_reg.pkl')
    for df_bdt in read_root(bdtfname, 'DecayTree', chunksize=100000):
        df_list_bdt.append(df_bdt)
    return df_list_bdt
Example #22
 def get_MVAdf(self):
     '''this method creates the df with the right columns/branches from the root df for the MVA procedure'''
     MVAdf = read_root(paths=self.path,
                       columns=self.ids + self.label + self.feats4MVA,
                       flatten=self.flatfeats4MVA)
     #always change index to be a TB index first, just in case we only want a select few of TBs, which we can't do if we have an ET index
     MVAdf.index = MVAdf.apply(lambda x: str(int(x.runNumber)) + str(
         int(x.eventNumber)) + '-' + str(int(x.nCandidate)),
                               axis=1)
     #if specific_TBs is not empty then we need to filter out the unwanted TBs by their id
     if self.specific_TBs.shape[0] != 0:
         MVAdf = MVAdf.loc[self.specific_TBs, :]
     #we then change the index according to whether we're dealing with TBs or ETs; if it's TBs then the index is essentially left unchanged
     MVAdf.index = MVAdf.apply(self.index_function, axis=1)
     #if specific_ETs is not empty that means we need to filter out and keep only the ETs asked for by using their ids
     if self.specific_ETs.shape[0] != 0:
         MVAdf = MVAdf.loc[self.specific_ETs, :]
     return MVAdf
Example #23
def ReadRootFile(dtype, polarity):
    df_list = []
    varsON = [
        'Lb_L0Global_TIS', 'Lb_L0HadronDecision_TOS',
        'Lc_Hlt1TrackMVADecision_TOS', 'Lc_Hlt1TwoTrackMVADecision_TOS',
        'Lb_Hlt2XcMuXForTauB2XcMuDecision_TOS',
        'Lb_Hlt2XcMuXForTauB2XcFakeMuDecision_TOS', 'Lc_M', 'p_ProbNNp',
        'p_ProbNNk', 'mu_PID*', '*_P', '*_PT', 'nTracks', 'runNumber',
        'eventNumber', 'Lb_ISOLATION_*', 'mu_PX', 'mu_PY', 'mu_PZ', 'mu_ID',
        'Lc_PX', 'Lc_PY', 'Lc_PZ'
    ]
    for df in read_root(filedir + 'Data/Lb_' + dtype + '_' + polarity +
                        '.root',
                        'tupleout/DecayTree',
                        chunksize=100000,
                        columns=varsON):
        df_list.append(df)
    return df_list
Example #24
def phsp_goofit_alt():
    import root_pandas
    path = 'root://eoslhcb.cern.ch//eos/lhcb/user/d/dmuller/K3Pi/RS_with_weight_dtime.root'
    df = root_pandas.read_root(path, 'events')
    df.rename(columns={
        'c12': vars.cos1(),
        'c34': vars.cos2(),
        'dtime': vars.ltime(mode_config.D0),
        'phi': vars.phi1(),
        'm12': vars.m12(),
        'm34': vars.m34()
    },
              inplace=True)
    df[vars.m12()] = df[vars.m12()] * 1000.
    df[vars.m34()] = df[vars.m34()] * 1000.
    df['D0_Loki_BPVLTIME'] = df['D0_Loki_BPVLTIME'] / 1000.

    return df
Example #25
def addbdtscore(infile, tree):
    ifile = open("discriminator_resolved.pickle")
    model = pickle.load(ifile)

    vars_to_load_ = [
        'MET', 'METSig', 'Jet1Pt', 'Jet1Eta', 'Jet1Phi', 'Jet2Pt', 'Jet2Eta',
        'Jet2Phi', 'DiJetMass', 'DiJetPt', 'DiJetEta', 'DiJetPhi', 'nJets',
        'met_Phi'
    ]

    if not ("SR" in tree or "SBand" in tree): vars_to_load_[0] = "RECOIL"
    df = read_root(infile, tree, columns=vars_to_load_)
    #df=df[vars_to_load_]
    print(df[:1])
    out = model.decision_function(df).ravel()

    print(out[:10])
    return out
Example #26
def get_gen_sample(sample='mu'):
    den_columns = []
    den_columns += ['Lambda_b0_TRUEP_E']
    den_columns += ['Lambda_b0_TRUEP_X']
    den_columns += ['Lambda_b0_TRUEP_Y']
    den_columns += ['Lambda_b0_TRUEP_Z']
    den_columns += ['Lambda_cplus_TRUEP_E']
    den_columns += ['Lambda_cplus_TRUEP_X']
    den_columns += ['Lambda_cplus_TRUEP_Y']
    den_columns += ['Lambda_cplus_TRUEP_Z']
    den_columns += [sample + 'minus_TRUEP_E']
    den_columns += [sample + 'minus_TRUEP_X']
    den_columns += [sample + 'minus_TRUEP_Y']
    den_columns += [sample + 'minus_TRUEP_Z']
    den_columns += ['nu_' + sample + '~_TRUEP_E']
    den_columns += ['nu_' + sample + '~_TRUEP_X']
    den_columns += ['nu_' + sample + '~_TRUEP_Y']
    den_columns += ['nu_' + sample + '~_TRUEP_Z']
    den_fname = '~/LbToLclnu_RunTwo/Selection/PID/FFs/GenMC/Lc' + sample.capitalize(
    ) + 'Nu_gen.root'

    df_den = rpd.read_root(den_fname,
                           columns=den_columns,
                           key='MCDecayTreeTuple/MCDecayTree')

    PLc_lab = atfk.lorentz_vector(
        atfk.vector(df_den['Lambda_cplus_TRUEP_X'],
                    df_den['Lambda_cplus_TRUEP_Y'],
                    df_den['Lambda_cplus_TRUEP_Z']),
        df_den['Lambda_cplus_TRUEP_E'])
    Pl_lab = atfk.lorentz_vector(
        atfk.vector(df_den[sample + 'minus_TRUEP_X'],
                    df_den[sample + 'minus_TRUEP_Y'],
                    df_den[sample + 'minus_TRUEP_Z']),
        df_den[sample + 'minus_TRUEP_E'])
    PNu_lab = atfk.lorentz_vector(
        atfk.vector(df_den["nu_" + sample + "~_TRUEP_X"],
                    df_den["nu_" + sample + "~_TRUEP_Y"],
                    df_den["nu_" + sample + "~_TRUEP_Z"]),
        df_den["nu_" + sample + "~_TRUEP_E"])
    PLb_lab = PLc_lab + Pl_lab + PNu_lab
    df_den['Lb_True_Q2'], df_den['Lb_True_Costhetal'] = get_phasespace_vars(
        PLb_lab, PLc_lab, Pl_lab)
    return df_den[['Lb_True_Q2', 'Lb_True_Costhetal']].to_numpy(),
Example #27
def resample_branch(options):
    import pickle
    from root_pandas import read_root
    try:
        os.remove(options.output_file)
    except OSError:
        pass

    if options.seed:
        np.random.seed(options.seed)

    with open(options.configfile) as f:
        config = json.load(f)

    #load resamplers into config dictionary
    for task in config["tasks"]:
        with open(task["resampler_path"], 'rb') as f:
            resamplers = pickle.load(f)
            for pid in task["pids"]:
                try:
                    pid["resampler"] = resamplers[pid["kind"]]
                except KeyError:
                    print(resamplers)
                    logging.error(
                        "No resampler found for {kind} in {picklefile}.".
                        format(kind=pid["kind"],
                               picklefile=task["resampler_path"]))
                    raise

    chunksize = 100000
    for i, chunk in enumerate(
            read_root(options.source_file,
                      tree_key=options.input_tree,
                      ignore=["*_COV_"],
                      chunksize=chunksize)):
        for task in config["tasks"]:
            deps = chunk[task["features"]]
            for pid in task["pids"]:
                chunk[pid["name"]] = pid["resampler"].sample(deps.values.T)
        chunk.to_root(options.output_file,
                      tree_key=options.output_tree,
                      mode="a")
        logging.info('Processed {} entries'.format((i + 1) * chunksize))
Example #28
def plot_time_series(df, channel):
    #if chn == 4 or chn == 5:
    #return
    df = df[df.error == 0]
    df = time_conversion(df[df.channel == channel])
    df_ana = read_root(mostRecentDir.split('mx_')[0] + "analysis/ANA_" +
                       mostRecentDir.split('/')[-2] + '.root',
                       columns=['rate', 'drate', 'time', 'channel', 'e'])
    df_ana = time_conversion(df_ana[(df_ana.channel == channel)
                                    & (df_ana.e < 665) & (df_ana.e > 655)])
    df = df.set_index('time').resample('10T').count().dropna().reset_index(
    ).rename(columns={'integral': 'Count'})
    df = df.iloc[1:-1]
    fig, ax = plt.subplots(nrows=2, ncols=1)
    df.plot(x='time', y='Count', ax=ax[0])
    df_ana.plot(x='time', y='rate', ax=ax[1])
    plt.savefig(plotoutDir + '/time_series_channel' + str(channel) + '.png')
    plt.close()
    return
Example #29
def set_data(treeName,branch_names):
    utils.IO.data_df.append(rpd.read_root(utils.IO.dataName[0],treeName, columns = branch_names))
    utils.IO.data_df[0]['proc'] =  ( np.ones_like(utils.IO.data_df[0].index)*utils.IO.dataProc[0] ).astype(np.int8)
    #input_df=rpd.read_root(utils.IO.dataName[0],treeName, columns = ['isSignal'])
    w = (np.ones_like(utils.IO.data_df[0].index)).astype(np.int8)
    #utils.IO.data_df[0]['weight'] = np.multiply(w,input_df['isSignal'])
    #utils.IO.data_df[0]['weight'] = np.multiply(w,1.)
    utils.IO.data_df[0]['weight'] = w

    y_data = utils.IO.data_df[0][['proc']]
    w_data = utils.IO.data_df[0][['weight']]

    for j in range(len(branch_names)):
        if j == 0:
            X_data = utils.IO.data_df[0][[branch_names[j].replace('noexpand:','')]]
        else:
            X_data = np.concatenate([X_data,utils.IO.data_df[0][[branch_names[j].replace('noexpand:','')]]],axis=1)
    
    return np.round(X_data,5),y_data,w_data
Example #30
def loadFile(ifile):
    from root_pandas import read_root

    if 'MUTAU' in ifile:
        channel = 'mt'
    elif 'ETAU' in ifile:
        channel = 'et'
    elif 'TAUTAU' in ifile:
        channel = 'tt'
    else:
        raise Exception(
            'Input files must have MUTAU, ETAU, or TAUTAU in the provided path. You gave {}, ya goober.'
            .format(ifile))

    filename = ifile.split('/')[-1]
    print('Loading input file...', filename)

    input_df = read_root(ifile, columns=scaled_vars +
                         selection_vars)  ## read from TTrees into DataFrame
    slim_df = input_df[(input_df['njets'] > 1) & (input_df['mjj'] > 300) &
                       (input_df['mt'] < 50)]  ## preselection
    selection_df = slim_df[
        selection_vars]  ## get variables needed for selection (so they aren't normalized)
    weights = slim_df[
        'evtwt']  ## get just the weights (they are scaled differently)
    slim_df = slim_df.drop(selection_vars + ['evtwt'], axis=1)

    ## add the event label
    if 'VBF' in ifile or 'ggH' in ifile:
        isSignal = np.ones(len(slim_df))
    else:
        isSignal = np.zeros(len(slim_df))

    ## save the name of the process
    somenames = np.full(len(slim_df), filename.split('.root')[0])

    ## scale event weights between 0 - 1
    weights = MinMaxScaler().fit_transform(weights.values.reshape(-1, 1))

    ## get lepton channel
    lepton = np.full(len(slim_df), channel)

    return slim_df, selection_df, somenames, lepton, isSignal, weights
Example #31
    def __init__(self, phase, tag, config,
                 delayed_eff_mode="rel",
                 delayed_eff_impl="add-then-calc",
                 delayed_eff_ref=None,
                 vtx_eff_nom_tagconf=None):
        self.phase = phase
        self.tag = tag
        self.config = config

        self.delayed_eff_mode = delayed_eff_mode
        if delayed_eff_impl == "new":
            delayed_eff_impl = "add-then-calc"
        self.delayed_eff_impl = delayed_eff_impl

        self.hardcoded = Hardcoded(phase)

        self.files, self.results = {}, {}
        for site in [1, 2, 3]:
            path = stage2_pbp_path(site, phase, tag, config)

            self.files[site] = R.TFile(path)

            results = read_root(path, 'results')
            for det in dets_for_phase(site, phase):
                self.results[(site, det)] = \
                    results.query(f'detector == {det}')

        cfg_path = configfile_path(tag, config)
        self.cfg = ConfigFile(cfg_path)
        if delayed_eff_impl == "old":
            ref_tag, ref_conf = delayed_eff_ref.split("@")
            self.delEffCalc = DelayedEffCalc(self.cfg['ibdDelayedEmin'],
                                             self.phase,
                                             ref_tag, ref_conf)
        self.promptEffCalc = PromptEffCalc(cfg_path)

        if vtx_eff_nom_tagconf:
            nom_tag, nom_conf = vtx_eff_nom_tagconf.split("@")
            self.vertexEffCalc = VertexEffCalc(self, phase,
                                               tag, config,
                                               nom_tag, nom_conf)
        else:
            self.vertexEffCalc = DummyVertexEffCalc()
Example #32
def get_luminosity(mode, polarity, year):
    mode = get_mode(polarity, year, mode)

    # For a yet to be determined reason, some files do not contain a LumiTuple
    # so sort those ones out
    infiles = []
    for f in mode.files:
        fl = ROOT.TFile.Open(f)
        if fl.Get('GetIntegratedLuminosity/LumiTuple'):
            infiles.append(f)
        fl.Close()

    # Get the files and stuff them into a dataframe
    df = root_pandas.read_root(
        infiles, key='GetIntegratedLuminosity/LumiTuple')

    log.info('Luminosity {} {}: {} +- {}'.format(
        year, polarity,
        df.sum().IntegratedLuminosity, df.sum().IntegratedLuminosityErr))
Example #33
def run(input_fns, output_fn, h1, h2, h3):
    keys = list_trees(input_fns[0])
    assert len(keys) == 1, keys
    df = read_root(input_fns, keys[0])

    df['H1_isMuon'] = df['H1_isMuon'].astype(bool)
    df['H2_isMuon'] = df['H2_isMuon'].astype(bool)
    df['H3_isMuon'] = df['H3_isMuon'].astype(bool)

    # Sort the columns so that the first is the most kaon-like
    assert sorted([h1, h2, h3
                   ]) == [h1, h2, h3
                          ], 'Children are ranked from kaon-like to pion-like'
    order = np.argsort(df[['H3_ProbK', 'H2_ProbK', 'H1_ProbK']], axis=1)
    for col in [c for c in df.columns if c.startswith('H1_')]:
        col = col[len('H1_'):]
        cols = [f'H1_{col}', f'H2_{col}', f'H3_{col}']
        df[cols] = df[cols].values[np.arange(order.shape[0])[:, None], order]

    # Compute the PE and mass of all particles
    for head, mass in [('H1', mass_dict[h1]), ('H2', mass_dict[h2]),
                       ('H3', mass_dict[h3])]:
        df.eval(f'{head}_P = sqrt({head}_PX**2 + {head}_PY**2 + {head}_PZ**2)',
                inplace=True)
        df.eval(f'{head}_PE = sqrt({mass}**2 + {head}_P**2)', inplace=True)
    for component in ['PE', 'PX', 'PY', 'PZ']:
        df.eval(
            f'B_{component} = H1_{component} + H2_{component} + H3_{component}',
            inplace=True)
    df.eval(f'B_M = sqrt(B_PE**2 - B_PX**2 - B_PY**2 - B_PZ**2)', inplace=True)

    # if [h1, h2, h3] == ['K', 'K', 'K']:
    # Apply ignore muons
    df.query('~(H1_isMuon | H2_isMuon | H3_isMuon)', inplace=True)
    # Apply an additional selection
    df.query(f'(H1_IPChi2 > 25) & (H2_IPChi2 > 25) & (H3_IPChi2 > 25)',
             inplace=True)
    # Apply a PID selection
    df.query(
        f'(H1_Prob{h1} > {pid_cut}) & (H2_Prob{h2} > {pid_cut}) & (H3_Prob{h3} > {pid_cut})',
        inplace=True)

    to_root(df, output_fn, key=f'B2{h1}{h2}{h3}', mode='w', store_index=False)
Example #34
def set_data_simple(treeName, branch_names):
    for i in range(utils.IO.nData):
        utils.IO.data_df.append(
            rpd.read_root(utils.IO.dataName[i], treeName,
                          columns=branch_names))

    for j in range(len(branch_names)):
        if j == 0:
            X_data = utils.IO.data_df[0][[
                branch_names[j].replace('noexpand:', '')
            ]]
        else:
            X_data = np.concatenate([
                X_data,
                utils.IO.data_df[0][[branch_names[j].replace('noexpand:', '')]]
            ],
                                    axis=1)

    return np.round(X_data, 5)
Example #35
def readDatasetsToDataframes(pathToFolder):
    listOfDatasets = []
    #    identifiers = ["ChargedHiggs_", "TT_", "DYJets", "QCD_", "ST_", "WJets", "WW", "WZ", "ZZ"]
    identifiers = ["ChargedHiggs_", "TT_", "ST_", "WJets"]
    for identifier in identifiers:
        filePaths = glob.glob(pathToFolder + identifier + "*.root")
        dataset = read_root(filePaths, columns=COLUMNS_)
        dataset["eventType"] = eventTypeDict[identifier]
        listOfDatasets.append(dataset)

    numberOfSignalEvents = listOfDatasets[0].shape[0]
    numberOfBackgroundEvents = np.sum([x.shape[0] for x in listOfDatasets[1:]])
    if (numberOfSignalEvents > numberOfBackgroundEvents):
        listOfDatasets[0] = listOfDatasets[0].sample(
            n=numberOfBackgroundEvents)

    dataframe = listOfDatasets[0].append(listOfDatasets[1:])

    return dataframe
Example #36
    def get_LOFdf(self):
        '''this method creates the df with the right columns/branches on which to perform the LOF calculation, and requires the COM variables for the TBs and ETs'''
        LOFdf = read_root(paths=self.path,
                          columns=self.ids + self.feats4LOF,
                          flatten=self.flatfeats4LOF)
        #always change index to be a TB index first, just in case we only want a select few of TBs, which we can't do if we have an ET index
        LOFdf.index = LOFdf.apply(lambda x: str(int(x.runNumber)) + str(
            int(x.eventNumber)) + '-' + str(int(x.nCandidate)),
                                  axis=1)
        # if specific_TBs is not empty then we need to filter out the unwanted TBs by their id
        if self.specific_TBs.shape[0] != 0:
            LOFdf = LOFdf.loc[self.specific_TBs, :]
        # we then change the index according to whether we're dealing with TBs or ETs; if it's TBs then the index is essentially left unchanged
        LOFdf.index = LOFdf.apply(self.index_function, axis=1)
        #if specific_ETs is not empty that means we need to filter out and keep only the ETs asked for by using their ids
        if self.specific_ETs.shape[0] != 0:
            LOFdf = LOFdf.loc[self.specific_ETs, :]

        return LOFdf
Example #37
def create_resamplers(options):
    import os.path
    import numpy as np
    import pickle
    from root_pandas import read_root
    from PIDPerfScripts.Binning import GetBinScheme

    pid_variables = ['{}_CombDLLK', '{}_CombDLLmu', '{}_CombDLLp', '{}_CombDLLe', '{}_V3ProbNNK', '{}_V3ProbNNpi', '{}_V3ProbNNmu', '{}_V3ProbNNp']
    kin_variables = ['{}_P', '{}_Eta','nTracks']


    with open('raw_data.json') as f:
        locations = json.load(f)
    if options.particles:
        locations =  [sample for sample in locations if sample["particle"] in options.particles]
    for sample in locations:
        binning_P = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "P", None)) #last argument takes name of user-defined binning
        binning_ETA = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "ETA", None)) #last argument takes name of user-defined binning TODO: let user pass this argument 
        binning_nTracks = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "nTracks", None)) #last argument takes name of user-defined binning TODO: let user pass this argument
    	
        data = options.location + '/{particle}_Stripping{stripping}_Magnet{magnet}.root'.format(**sample)
        resampler_location = '{particle}_Stripping{stripping}_Magnet{magnet}.pkl'.format(**sample)
        if os.path.exists(resampler_location):
            os.remove(resampler_location)
        resamplers = dict()
        deps = [x.format(sample['branch_particle']) for x in kin_variables]
        pids = [x.format(sample['branch_particle']) for x in pid_variables]
        for pid in pids:
            if "DLL" in pid:
                target_binning = np.linspace(-150, 150, 300) # binning for DLL
            elif "ProbNN" in pid:
                target_binning = np.linspace(0, 1, 100) # binning for ProbNN
            else:
                raise Exception
            resamplers[pid] = Resampler(binning_P, binning_ETA, binning_nTracks, target_binning)
        for i, chunk in enumerate(read_root(data, columns=deps + pids + ['nsig_sw'], chunksize=100000, where=options.cutstring)): # where is None if option is not set 
            for pid in pids:
                resamplers[pid].learn(chunk[deps + [pid]].values.T, weights=chunk['nsig_sw'])
            logging.info('Finished chunk {}'.format(i))
        with open(resampler_location, 'wb') as f:
            pickle.dump(resamplers, f)
Example #38
def tot_event_number(base_file,
                     base_tree,
                     weight_column='',
                     unique_events=['runNumber', 'eventNumber', 'nCandidate'],
                     preselection=None,
                     presel_column=[]):
    columns = []
    if weight_column != '':
        columns.append(weight_column)
    for pr in presel_column:
        columns.append(pr)
    for ue in unique_events:
        columns.append(ue)
    df = read_root(base_file, key=base_tree, columns=columns)
    if preselection is not None:
        df = df.query(preselection)
    if weight_column != '':
        nevt = df.groupby(unique_events)[weight_column].head(1).sum()
    else:
        nevt = df.groupby(unique_events).ngroups
    return nevt
Example #39
def weight_signal_with_resolution(w_s, y_s):
    proc = 999
    for i in range(utils.IO.nSig):
        w_sig = np.asarray(w_s[np.asarray(y_s) == utils.IO.sigProc[i]])
        proc = utils.IO.sigProc[i]
        input_df = rpd.read_root(utils.IO.signalName[i],
                                 "bbggSelectionTree",
                                 columns=['benchmark_reweight_SM'])
        #input_df=rpd.read_root(utils.IO.signalName[i],"bbggSelectionTree", columns = ['benchmark_reweight_2017fake'])
        utils.IO.signal_df[i][['weight']] = np.multiply(
            utils.IO.signal_df[i][['weight']],
            input_df[['benchmark_reweight_SM']])
        #utils.IO.signal_df[i][['weight']] = np.multiply(utils.IO.signal_df[i][['weight']],41.5/7.666672541)  #2017 SM
        #utils.IO.signal_df[i][['weight']] = np.multiply(utils.IO.signal_df[i][['weight']],35.9/57.32398753)   #2016 SM
        utils.IO.signal_df[i][['weight']] = np.multiply(
            utils.IO.signal_df[i][['weight']], 59.4 / 7.719390612)  #2018 SM
        utils.IO.signal_df[i][['weight']] = np.divide(
            utils.IO.signal_df[i][['weight']],
            utils.IO.signal_df[i][['sigmaMOverM']])

    return utils.IO.signal_df[i][['weight']]
Example #40
def test(raw_test_dataset, cut, output_dataset, path2tree, tree_name, year):
    test_dataset = os.getcwd() + '/dataset/' + 'test_tree.root'
    MjjRegLib.prepare_dataset(raw_test_dataset, test_dataset, path2tree,
                              tree_name, cut)
    input_arr = read_root(test_dataset,
                          columns=MjjRegConf.get_features(),
                          key=path2tree + tree_name)

    test_first_ev = 0
    test_last_ev = input_arr.shape[0] - 1

    test_arr = input_arr.loc[test_first_ev:test_last_ev, 'leadingJet_pt':]

    reg_model = loaded_model = pickle.load(
        open(os.getcwd() + '/dataset/XGB_Mjj_Reg_model_' + year + '.xgb',
             "rb"))
    reg_C_arr = reg_model.predict(data=test_arr)

    MjjRegLib.make_output_file(test_dataset, path2tree, tree_name, reg_C_arr,
                               output_dataset)
    print('Testing successfully completed!')
Example #41
def convertROOT_2_Parquet_2_TFRecord(fileNames):
    for fileName in fileNames: 
        print("Processing file:",fileName)
        label = fileName.split("/")[-1].split(".")[0]
        label = label.lstrip("omtfHits_omtfAlgo0x0006_v1")
        path = str(pathlib.Path(fileName).parent)
        path = path.rstrip("omtfHits_omtfAlgo0x0006_v1")
        path = path.replace("ROOT","Python/")
        for iChunk, dfChunk in enumerate(read_root(fileName, chunksize=int(15E6))):
            print("\tProcessing chunk: {}".format(iChunk))
            transformColumns(dfChunk, unionFormat="new")  
            parquetFile = path+'df.parquet_{}_chunk_{}.gzip'.format(label, iChunk)
            dfChunk.to_parquet(parquetFile, compression='gzip')
            dataset = loadDatasetFromParquet(parquetFile)
            dataset = dataset.map(tf.io.serialize_tensor)
            tfrecordFileName = path+'{}_chunk_{}.tfrecord.gzip'.format(label,iChunk)
            writer = tf.data.experimental.TFRecordWriter(tfrecordFileName, compression_type="GZIP")
            writer.write(dataset)
            print("Chunk done.")
            break
        print("File done.")
Example #42
    def fillHisto(self, path, cut, name, weight=True):

        tmp = rp.read_root(
            paths=path,
            where=cut.get(),
            columns=self.weights + self.var +
            ["gen_match_1", "gen_match_2", "decayMode_1", "decayMode_2"])

        if weight:
            tmp.eval("eweight = " + "*".join(self.weights), inplace=True)
            tmp["eweight"] *= float(self.lumi)
        else:
            tmp.eval("eweight = 1", inplace=True)

        tmpHist = R.TH2D(name, name, *(self.binning))
        tmpHist.SetOption("COLZ")
        rn.fill_hist(tmpHist,
                     array=tmp[self.var].values,
                     weights=tmp["eweight"].values)

        return tmpHist
Example #43
def main():

    parser.add_argument('-i', dest='inputfile', help='Sample to run over', type=str, metavar='INPUTFILE', default="")
    parser.add_argument('-c', dest='channel', help='Dataset channel', choices=['mt', 'et', 'tt'], default='mt')
    parser.add_argument('-o', dest='outputfile', help='File the changes get written to', type=str, metavar='OUTPUTFILE', default="output.root")
    parser.add_argument('-w', dest='weights', help='Weight that is to be recalculated', choices=["all", "antilep_tauscaling", "puweight", "zweight", "trk_sf", "reco_sf", "top_weight"], default="all")

    global args
    args = parser.parse_args()

    ignore_list = ['addlepton_p4']  # root_pandas can't handle vector<TLorentzVector>
    tmp = rp.read_root(paths=args.inputfile, ignore=ignore_list)

    if args.weights == "all" or args.weights == "puweight": tmp['puweight'] = tmp.apply(recalcPileupWeight, axis=1)
    #if args.weights == "all" or args.weights == "zweight":  tmp[''] = tmp.apply(recalcZWeight, axis=1)
    if args.weights == "all" or args.weights == "trk_sf": tmp['trk_sf'] = tmp.apply(recalcTrkSF, axis=1)
    if args.weights == "all" or args.weights == "reco_sf": tmp['reco_sf'] = tmp.apply(recalcRecoSF, axis=1)
    if args.weights == "all" or args.weights == "antilep_tauscaling":
        tmp['eleTauFakeRateWeight'] = tmp.apply(recalcEleTauFakeRateWeight, axis=1)
        tmp['muTauFakeRateWeight'] = tmp.apply(recalcMuTauFakeRateWeight, axis=1)
        tmp['antilep_tauscaling'] = tmp.apply(recalcAntilepTauscaling, axis=1)  # Always run eleTauFakeRateWeight and muTauFakeRateWeight first!
Example #44
def sim_skim():
    # key='trackerNTup/tracker'
    key = 'QualityVertices'
    for i_file, file in enumerate(sorted(os.listdir(args.trees))):
        print("Opening", key, "data in", args.trees + "/" + file)
        data_all = read_root(
            args.trees + "/" + file,
            key,
            columns=["station", "trackT0", "trackMomentum", "trackMomentumY"])
        data_all['trackT0'] = data_all['trackT0'] * 1e-3  # ns -> us
        print("Total of", data_all.shape[0], "entries")

        #Apply cuts in one go!
        data = data_all

        #save the skimmed dataframe in a compressed HDF5 format
        # here we are appending to the file over tracks and then vertices
        print("Saving compressed data...")
        data = data.reset_index()  # reset index from 0
        cols_to_keep = [
            "station", "trackT0", "trackMomentum", "trackMomentumY"
        ]  # only write for time and station
        data[cols_to_keep].to_hdf(args.df + "_" + str(i_file) + ".h5",
                                  key=key,
                                  mode='a',
                                  complevel=9,
                                  complib="zlib",
                                  format="fixed")
        print("Skimmed dataframe saved to disk", args.df, "\n")

        # print("Opening root file...", file)
        # data_all = read_root(args.trees+"/"+file, key)
        # data_all['trackT0']=data_all['trackT0']*1e-3   # ns -> us
        # print("Total of", data_all.shape[0], "entries")
        # # data_all.to_hdf(args.df+"_"+str(i_file)+".h5", key="sim", mode='a', complevel=9, complib="zlib", format="table", data_columns=True)
        # data_all.to_hdf(args.df+"_"+str(i_file)+".h5", key="sim", mode='a', complevel=9, complib="zlib", format="table", data_columns=True)
        # print("Skimmed dataframe saved to disk", args.df, "\n")

    print("Exiting...")
    sys.exit()
Example #45
def draw_result(testh5, h5key_test=None, gamma=True, Range=[-0.01, 1.0]):
    # ****** load the file

    if h5key_test:
        columns_CSE = []
        for i in range(25):
            if gamma:
                columns_CSE.append('CSE_' + str(i))
            else:
                columns_CSE.append('CSE0_' + str(i))
        Gamma = pd.read_hdf(testh5, h5key_test, columns=columns_CSE)

        Gamma.plot.hist(sort_columns=False,
                        subplots=True,
                        layout=(5, 5),
                        sharex=True,
                        sharey=True,
                        legend=False,
                        range=Range,
                        bins=20)

    else:
        if gamma:
            mykey = 'gamma'
            Col = 'gamma_CSE'
        else:
            mykey = 'Pi0'
            Col = 'pi0_gamma0_CSE'
        print("Col = ", Col)
        Gamma = root_pandas.read_root(testh5, mykey, columns=Col)

        pd.DataFrame(Gamma[Col].values.tolist()).replace(-9999.0, 0).plot.hist(
            sort_columns=False,
            subplots=True,
            layout=(5, 5),
            sharex=True,
            sharey=True,
            legend=False,
            range=Range,
            bins=20)
Example #46
def get_dataframe(infile, treename=None, **kwargs):
    """
    Get the dataframe from the input file.

    Args:
        infile (str): Name of the inputfile from which the dataframe should be
            read. Must either be a pkl or a root file. Which format will be read
            depends entirely on the ending of the filename.
        treename (str, optional): The TTree in the TFile that should be read.
            Since it is possible to store multiple trees in one file it can be
            necessary to specify which one to read. Option is only used for reads
            from .root files.
    Keyword Args:
         Forwarded to root_pandas.read_root

    See also: root_pandas.read_root

    Returns:
        pandas.DataFrame: The dataframe read from the file.
    """
    logging.debug('Getting DataFrame from {}'.format(infile))
    if not infile.endswith('.pkl') and not infile.endswith('.root'):
        logging.error('Infile does not have a parseable format: {}'
                      ' Valid formats are .root and .pkl'.format(infile))

    if infile.endswith('.pkl'):
        return pd.read_pickle(infile)
    if infile.endswith('.root'):
        try:
            from root_pandas import read_root
            if treename is None:
                # If there is more than one tree we still fail in read_root
                treename = get_treename(infile)
            return read_root(infile, key=treename, **kwargs)
        except ImportError:
            # log and bail out
            logging.error('Requested to read DataFrame from {}, but could not '
                          'import root_pandas'.format(infile))
    sys.exit(1)
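
Possible calls to the helper above; the file and tree names are placeholders, and any keyword arguments are simply forwarded to root_pandas.read_root:

df = get_dataframe('candidates.pkl')
df = get_dataframe('candidates.root', treename='DecayTree', columns=['B_M', 'B_PT'])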
Example #47
def write_weights(name, inputMC, inputMC_tree, outputMC, outputMC_tree, referenceBranches):
    logging.info("opening file "+inputMC)

    #Create cut-string to write the weights only in the ranges that were used for calculating the weights
    weightBranches = []
    i = 0
    for referenceBranch in referenceBranches:
        weightBranch = referenceBranch+"_weights"
        weightBranches.append(weightBranch)

        with open(name+referenceBranch+"_weights.pickle", "rb") as inputFile:
            loaded = pickle.load(inputFile)
        rangemin = loaded["rangemin"]
        rangemax = loaded["rangemax"]
        rangemin_str = str(rangemin)
        rangemax_str = str(rangemax)

        if i == 0:
            cutstring = referenceBranch + " > " + rangemin_str + " && " + referenceBranch + " < " + rangemax_str
        else:
            cutstring +=  " && " + referenceBranch + " > " + rangemin_str + " && " + referenceBranch + " < " + rangemax_str

        i += 1
    logging.debug("Cutstring: "+cutstring)

    #Get entries of inputfile (without cuts)
    ignoreBranches = list(weightBranches)
    ignoreBranches.append("total_weights")
    mc_frame_wo_cuts = root_pandas.read_root(inputMC, inputMC_tree, ignore=ignoreBranches)
    logging.info("Entries without cuts:"+ str(len(mc_frame_wo_cuts.index)))

    #Read inputfile with cuts
    mc_frame = root_pandas.read_root(inputMC, inputMC_tree, where=cutstring, ignore=ignoreBranches)
    logging.info("Entries with above cuts:"+ str(len(mc_frame.index)))


    #Write referenceBranch_weights (can be multiple branches) to output rootfile
    for referenceBranch in referenceBranches:
        weightBranch = referenceBranch+"_weights"

        logging.info("Applying weights for " + weightBranch)
        with open(name+referenceBranch+"_weights.pickle", "r") as inputFile:
            loaded = pickle.load(inputFile)
        bin_edges = loaded["bin_edges"]
        weights = loaded["weights"]

        mc_frame[weightBranch] = weights[pandas.cut(mc_frame[referenceBranch], bin_edges, labels=False)]



        # checking for MC events with nan event weights
        nanWeightEvents = mc_frame.loc[numpy.isnan(mc_frame[weightBranch])]
        InfWeightEvents = mc_frame.loc[numpy.isinf(mc_frame[weightBranch])]
        for nan_or_inf in (nanWeightEvents, InfWeightEvents):
            if len(nan_or_inf) > 0:
                logging.warning("Some events got a weight of Nan or inf in "+weightBranch)
                #logging.debug(nan_or_inf)
                #logging.warning("plotting them!")
                #plot = nan_or_inf.plot()
                #matplotlib.pyplot.savefig('./Plots/ReWeight/reweight' + weightBranch + inputMC[inputMC.rfind("/")+1:] + "_nanInfWeightEvents.png")
                logging.warning("REMOVING THE NAN/INF-WEIGHTED EVENTS")
                mc_frame = mc_frame.loc[numpy.isnan(mc_frame[weightBranch]) == False]
                mc_frame = mc_frame.loc[numpy.isinf(mc_frame[weightBranch]) == False]


    logging.info("Entries after removing NaN or Inf-Events: "+str(len(mc_frame.index)))


    #Normalizing the weights
    if len(weightBranches) > 1:
        totalWeight = numpy.ones(len(mc_frame.index))
    for weightBranch in weightBranches:
        if len(weightBranches) > 1:
            totalWeight *= mc_frame[weightBranch]
        #normalizing this one weight. If you later use the product of weights, you need to normalize that product too
        factor = len(mc_frame.index)/sum(mc_frame[weightBranch])
        mc_frame[weightBranch] = mc_frame[weightBranch] * factor

    #Normalizing the total weight (if applicable)
    if len(weightBranches) > 1:
        factor = len(mc_frame.index)/sum(totalWeight)
        totalWeight = totalWeight * factor
        mc_frame["total_weight"] = totalWeight



    logging.info("writing root file to " + outputMC)
    mc_frame.to_root(outputMC, outputMC_tree)
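
The weight lookup above works because pandas.cut with labels=False returns the bin index of every entry, which can then be used to index the per-bin weight array. A self-contained sketch of just that mechanism, with made-up numbers:

import numpy
import pandas

bin_edges = numpy.array([0., 1., 2., 3.])      # three bins
weights = numpy.array([0.5, 1.0, 2.0])         # one weight per bin
values = pandas.Series([0.2, 1.7, 2.9, 1.1])   # all inside the binning range

bin_index = pandas.cut(values, bin_edges, labels=False).astype(int)
per_event_weight = weights[bin_index]          # [0.5, 1.0, 2.0, 1.0]
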
             #'eminus_TRACKHITS_nHitsOT',
             #'eminus_TRACKHITS_nHitsTT',
             #'eminus_TRACKHITS_nHitsMuon',
             #'eminus_NEARESTCELL_NEIGHBORS_E',
             #'eminus_NEARESTCELL_E',
             #'eminus_WeightedSum_Photon_P',
             'eminus_DivisionWeightedSum_Photon_P',
             'eminus_AcceptedSum_Photon_P',
             'eminus_P_with_AcceptedSum',
             #'eminus_P_with_WeightedSum',
             'eminus_P_with_DivisionWeightedSum',

             'eminus_TRUEP']


data = read_root(filelocation, columns=variables)

#Take only subset of data
data = data.sample(50000)

TRUEP = np.asarray(data['eminus_TRUEP'])
BremAdder_P = np.asarray(data['eminus_P'])

#Remove target variable
data = data.drop('eminus_TRUEP', 1)

#Print training variables
print "Training variables:\n", '='*20, '\n', '\n'.join(list(data.columns))


#Create training/test-sample
            'eminus_ProbNN*',
            'eminus_TRACKHITS_nHitsTotal', 'eminus_TRACKHITS_nHitsVelo',
            'eminus_TRACKHITS_nHitsIT', 'eminus_TRACKHITS_nHitsOT',
            'eminus_TRACKHITS_nHitsTT', 'eminus_TRACKHITS_nHitsMuon', 'eminus_TRACKHITS_nHitsM1',
            #Vector features
            'eminus_RECOPHOTONS_P_VEC', #'eminus_RECOPHOTONS_PT_VEC',
            'eminus_TRUEP',
            'eminus_NEARESTCELL_NEIGHBORS_E', 'eminus_NEARESTCELL_E',
             'eminus_PIDe',
             'eminus_PIDmu',
             'eminus_PIDK',
             'eminus_PIDp',
             'eminus_TRACK_GhostProb'
            ]

data = read_root(filelocation, columns=variables)     #training data (excluding true)


#Load the classifier output
#==========================
classifieroutput = pd.read_pickle("/home/dberninghoff/Master-Make-Based/B2Kemu-Electrons/Pickles/Classifieroutput.pkl")


#Calculate the sum of all reconstructed photon momenta weighted by the classifier output
#=======================================================================================
weighted_sums = []
divisionweight_sums = []

threshold = 0.4989
accepted_sums = []
Beispiel #50
0
from itertools import chain
from root_pandas import read_root

def concat_df_chunks(filenames, chunksize, **kwargs):
    # chain the per-file chunk iterators into one flat iterator of DataFrames
    return chain(
        *(read_root(f, chunksize=chunksize, **kwargs) for f in filenames)
    )
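
A short usage sketch (the file names and the columns list are hypothetical): the chunks from all input files arrive as one flat iterator, so downstream code does not have to care about file boundaries.

chunks = concat_df_chunks(['mc_MagDown.root', 'mc_MagUp.root'],
                          chunksize=50000, columns=['B_M', 'B_PT'])
total_entries = sum(len(chunk) for chunk in chunks)
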
# no single electron cut (170418)
#s_path_to_input = './resources/ambe_no_s1_pulse_shape.txt'
#df_ambe_data = pd.read_table(s_path_to_input, sep='\t', )


#s_path_to_input = './resources/list_lax_NR_0.6.2.csv'
#s_path_to_input = './resources/list_lax_NR_0.8.4.csv'
#s_path_to_input = './resources/list_lax_NR_0.9.1.csv'
#s_path_to_input = './resources/list_lax_NR_0.9.2.csv'
#s_path_to_input = './resources/data_AmBe_lowenergy.csv'
#df_ambe_data = pd.read_table(s_path_to_input, sep=',')
#print df_ambe_data['x'], df_ambe_data['y'], df_ambe_data['z'], df_ambe_data['distance_to_source']

s_path_to_input = './resources/data_AmBe_cs1_lt_200.root'
df_ambe_data = root_pandas.read_root(s_path_to_input)

# AmBe optimized
df_ambe_data = df_ambe_data[((df_ambe_data['x']**2. + df_ambe_data['y']**2.) < config_xe1t.max_r**2.) & (df_ambe_data['z'] < config_xe1t.max_z) & (config_xe1t.min_z < df_ambe_data['z']) & (df_ambe_data['distance_to_source'] < 80.)]


# apply cuts
#df_ambe_data = df_ambe_data[df_ambe_data['CutLowEnergyAmBe']]
#df_ambe_data = df_ambe_data[df_ambe_data['CutAmBeFiducial']]
df_ambe_data = df_ambe_data[df_ambe_data['CutS1LowEnergyRange']]
df_ambe_data = df_ambe_data[df_ambe_data['CutS2Threshold']]
df_ambe_data = df_ambe_data[df_ambe_data['CutInteractionPeaksBiggest']]
df_ambe_data = df_ambe_data[df_ambe_data['CutS2AreaFractionTop']]
df_ambe_data = df_ambe_data[df_ambe_data['CutS2SingleScatterSimple']]
df_ambe_data = df_ambe_data[df_ambe_data['CutDAQVeto']]
#df_ambe_data = df_ambe_data[df_ambe_data['CutEndOfRunCheck']]
Beispiel #52
0
def main():
    args = parse_args()
    config = parse_config(args.config_file)
    if config is None:
        print('No configuration file is defined. '
              'Define one with `--config-file`.')
        sys.exit(1)

    # read dataset
    files = config['files']
    if 'filepath' in config:
        files = [config['filepath'] + f for f in files]
    kwargs = config['pandas_kwargs']

    print('Reading ', end='')
    entries = 0
    for f in files:
        rootfile = ROOT.TFile(f)
        tree = rootfile.Get(kwargs['key'])
        entries += tree.GetEntries()
    maxslices = args.max_slices
    chunksize = kwargs['chunksize']
    total = (maxslices
             if maxslices is not None and maxslices < (entries / chunksize)
             else (entries / chunksize))
    print(total * chunksize, 'events.')
    df = pd.concat([
        df for df in tqdm(
            islice(
                read_root(files, flatten=True, **kwargs), maxslices),
            total=total)])

    # rename the tagging particle branches
    df.rename(columns=dict(zip(df.columns,
        [c.replace(config['tagging_particle_prefix'], 'tp').replace('-', '_')
            for c in df.columns])),
        inplace=True)
    df['event_id'] = df.runNumber.apply(str) + '_' + df.eventNumber.apply(str)
    if 'invert_target' in config and config['invert_target']:
        df['target'] = np.sign(df.B_ID) != np.sign(df.tp_ID)
    else:
        df['target'] = np.sign(df.B_ID) == np.sign(df.tp_ID)

    # read features and selections
    try:
        if 'inclusive_mva_features' in config:
            mva_features = ['tp_' + f for f in config['inclusive_mva_features']]
        else:
            mva_features = ['tp_' + f.split(' ')[0] for f in config['selections']]
    except:
        raise ValueError('Tried to parse features for the BDT.'
                         ' Either provide well-formatted `selections` or'
                         ' define an `inclusive_mva_features` set.')

    # build BDT model and train the classifier n_cv x 3 times
    xgb_kwargs = config['xgb_kwargs']
    n_jobs = config['n_jobs']

    bootstrap_scores = []
    bootstrap_d2s = []
    nfold = (args.bootstrap_folds
             if args.bootstrap_folds is not None
             else config['n_cv'])
    print('Starting bootstrapping.')
    pbar = tqdm(total=nfold * 3)
    for _ in range(nfold):
        # yield 3-fold split for CV
        df_sets = [df.iloc[indices] for indices in NSplit(df)]

        cv_scores = []
        for i in range(3):
            df1, df2, df3 = (df_sets[i % 3].copy(),
                             df_sets[(i + 1) % 3].copy(),
                             df_sets[(i + 2) % 3].copy())
            model = XGBClassifier(nthread=n_jobs, **xgb_kwargs)
            sample_weight = (df1.target
                             if 'training_weights' in config
                                and config['training_weights']
                             else None)
            model.fit(df1[mva_features], df1.target,
                      sample_weight=df1.SigYield_sw)

            df2['probas'] = model.predict_proba(df2[mva_features])[:, 1]
            df2.reset_index(inplace=True, drop=True)
            df2_max = df2.iloc[df2.groupby('event_id')['probas'].idxmax()].copy()
            df3['probas'] = model.predict_proba(df3[mva_features])[:, 1]
            df3.reset_index(inplace=True, drop=True)
            df3_max = df3.iloc[df3.groupby('event_id')['probas'].idxmax()].copy()

            # calibrate
            calibrator = PolynomialLogisticRegression(power=4,
                                                      solver='lbfgs',
                                                      n_jobs=n_jobs)
            calibrator.fit(df2_max.probas.reshape(-1, 1), df2_max.target,
                           sample_weight=df2_max.SigYield_sw)

            df3_max['calib_probas'] = calibrator.predict_proba(df3_max.probas)[:, 1]

            score = tagging_power_score(df3_max.calib_probas,
                                        tot_event_number=get_event_number(df3_max),
                                        sample_weight=df3_max.SigYield_sw)
            bootstrap_scores.append(score)
            bootstrap_d2s.append(d2_score(df3_max.calib_probas,
                                          sample_weight=df3_max.SigYield_sw))
            pbar.update(1)

    pbar.close()
    print(dedent("""\
          Final {}-fold bootstrap performance
             D2 = {:<6}%
          ε_eff = {:<6}%""")
          .format(nfold,
                  100 * ufloat(np.mean(bootstrap_d2s),
                               np.std(bootstrap_d2s)),
                  100 * ufloat(np.mean(noms(bootstrap_scores)),
                               np.std(noms(bootstrap_scores)))))
#! /usr/bin/env python

import os
import cPickle as pkl
from root_pandas import read_root

current_dir = os.path.dirname(__file__)
bb_dir      = os.path.join(current_dir, '../..')
hgg_bg      = pkl.load(open(bb_dir+'/files/hgg_bg.p', "rb"))

df_data = read_root(bb_dir+'/files/BH/OutputFile_ForBrian.root','BH_Tree')
pkl.dump(df_data, open( bb_dir+"/files/BH/BH_paper_data.p", "wb" ), protocol = -1)

#! /usr/bin/env python

import cPickle as pkl
import pandas as pd
from root_pandas import read_root

#df_signal = read_root('../../files/HiggsToGG/Tree_LowPtSUSY_Tree_HGG_BB1.root','HGG_Tree')
df_bg = read_root('../../files/HiggsToGG/Tree_LowPtSUSY_Tree_PPGG_BB_All.root','HGG_Tree')
#pkl.dump(df_signal, open( "../../files/hgg_signal.p", "wb" ), protocol = -1)
pkl.dump(df_bg, open( "../../files/hgg_bg.p", "wb" ), protocol = -1)

Beispiel #55
0
def create_weights(name, referenceMC, referenceMC_tree, referenceData, referenceData_tree, referenceBranches, ranges, binning, weightBranch):
    logging.info("(Re-)Creating the weights from the control channel")
    mc_frame = root_pandas.read_root(referenceMC, referenceMC_tree, columns=referenceBranches)
    if weightBranch == None:
        data_frame = root_pandas.read_root(referenceData, referenceData_tree, columns=referenceBranches)
    else:
        referenceBranches_w_weight = list(referenceBranches)
        referenceBranches_w_weight.append(weightBranch)
        data_frame = root_pandas.read_root(referenceData, referenceData_tree, columns=referenceBranches_w_weight)

    #Go through the Branches
    counter = 0
    for referenceBranch in referenceBranches:
        branch_range = ranges[counter] #string that looks like this: [min,max]
        branch_range = branch_range.replace("[", "")
        branch_range = branch_range.replace("]", "")
        rangemin, rangemax = branch_range.split(",")
        rangemin = float(rangemin)
        rangemax = float(rangemax)
        if rangemin > rangemax:
            raise SystemExit("rangemin > rangemax in " + referenceBranch)


        # Create histograms and create control plots
        mc_counts, bin_edges = numpy.histogram(mc_frame[referenceBranch], range=(rangemin,rangemax), bins=binning, density=True)
        if weightBranch == None:
            data_counts, bin_edges = numpy.histogram(data_frame[referenceBranch], range=(rangemin,rangemax),bins=binning, density=True)
        else:
            data_counts, bin_edges = numpy.histogram(data_frame[referenceBranch], weights=data_frame[weightBranch], range=(rangemin,rangemax), bins=binning, density=True)

        weights = ( data_counts.astype(float)/float(sum(data_counts)) ) / ( mc_counts.astype(float)/float(sum(mc_counts)) )
        logging.info("Weights for "+ referenceBranch +":")
        logging.debug(weights)

        # Plot without weights
        mc_counts_plot = numpy.append(mc_counts, 0) #append any number so that the last regular bin is shown
        data_counts_plot = numpy.append(data_counts, 0)
        matplotlib.pyplot.step(bin_edges, mc_counts_plot, where='post', color='g')
        matplotlib.pyplot.step(bin_edges, data_counts_plot, where='post', color='r')
        matplotlib.pyplot.xlim(min(bin_edges), max(bin_edges))
        matplotlib.pyplot.title(referenceBranch+" normalised BEFORE weighting. Green=MC, Red=DATA")
        matplotlib.pyplot.savefig("./Plots/ReWeight/"+referenceBranch+"_before_reweight.png")
        matplotlib.pyplot.clf()
        matplotlib.pyplot.cla()

        # Plot with weights
        # replace NaN/inf weights with 0 for plotting; note that reassigning the
        # loop variable (as in "for w in plot_weights: w = 0") would not modify the list
        plot_weights = [0 if (numpy.isnan(w) or numpy.isinf(w)) else w
                        for w in weights]

        mc_counts_weighted = mc_counts * plot_weights
        mc_counts_weighted_plot = numpy.append(mc_counts_weighted, 0)
        matplotlib.pyplot.step(bin_edges, mc_counts_weighted_plot, where='post', color='g')
        matplotlib.pyplot.step(bin_edges, data_counts_plot, where='post', color='r', linestyle='--')
        matplotlib.pyplot.xlim(min(bin_edges),max(bin_edges))
        matplotlib.pyplot.title(referenceBranch+" AFTER weighting. Green=MC, Red=DATA")
        matplotlib.pyplot.savefig("./Plots/ReWeight/"+referenceBranch+"_after_reweight.png")
        matplotlib.pyplot.clf()
        matplotlib.pyplot.cla()


        # A NaN weight occurs when a bin in MC is empty. This happens mainly at
        # high PT, where the data bins should also be mostly empty, so it is
        # usually best to assign a zero there.

        # For every referenceBranch, save another pickle file
        with open(name+referenceBranch+"_weights.pickle", "w") as outputFile:
            pickle.dump({"bin_edges" : bin_edges, "weights" : weights, "rangemin" : rangemin, "rangemax" : rangemax}, outputFile)
            logging.info(name+referenceBranch+"_weights.pickle saved")


        counter += 1
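
The per-bin weights computed above are simply the ratio of two normalised histograms. A minimal, self-contained sketch of that step with toy numbers (none of it comes from the original analysis):

import numpy

mc_values = numpy.random.exponential(2.0, size=10000)    # toy MC distribution
data_values = numpy.random.exponential(2.5, size=10000)  # toy data distribution

mc_counts, bin_edges = numpy.histogram(mc_values, range=(0., 10.), bins=20, density=True)
data_counts, _ = numpy.histogram(data_values, range=(0., 10.), bins=20, density=True)

# per-bin weight = normalised data / normalised MC; an empty MC bin yields inf/NaN
weights = (data_counts / data_counts.sum()) / (mc_counts / mc_counts.sum())
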
#!/usr/bin/env python

from root_pandas import read_root

tree = read_root("test/genfaketau/out/tmva.root", "TestTree")
signal = tree[tree.classID < .5]
background = tree[tree.classID > .5]

for i in [.05 * x for x in range(20)]:
    cut = background.BDTG.quantile(i)
    sfraction = signal[signal.BDTG > cut].size / float(signal.size)
    bfraction = background[background.BDTG < cut].size / float(background.size)
    print "{:4.2f} {:6.4f} {:6.4f} {:6.4f}".format(i, cut, sfraction, bfraction)
vectorfeatures =  [ 'eminus_P', 'eminus_RECOPHOTONS_P_VEC',
                    'eminus_RECOPHOTONS_PT_VEC',
                    'eminus_RECOPHOTONS_Dist2Orig*_VEC',
                    'eminus_RECOPHOTONS_DOCA*_VEC', 'eminus_RECOPHOTONS_IP_BESTPV_VEC',
                    #'eminus_RECOPHOTONS_TRUE_FromThisParticle_VEC',
                    'eminus_RECOPHOTONS_TRUE_PhotonFromThisParticle_VEC',
                    #'eminus_RECOPHOTONS_BremAdded_VEC',
                    'eminus_RECOPHOTONS_Dist2TrExtrap_Velo_VEC', 'eminus_RECOPHOTONS_Dist2TrExtrapInError_Velo_VEC',
                    'eminus_RECOPHOTONS_Dist2TrExtrap_TT_VEC', 'eminus_RECOPHOTONS_Dist2TrExtrapInError_TT_VEC',
                    'eminus_RECOPHOTONS_BremAdded_VEC']

ignorefeatures = [ ]


print "Reading data..."
datascalar = read_root(filelocation, columns=scalarfeatures, ignore=ignorefeatures)     #dataframe
datavector = read_root(filelocation, columns=vectorfeatures, ignore=ignorefeatures)
datavector = datavector.drop('eminus_P', 1)
print "Reading data complete."

#Create the unpacked dataset
print "Creating unpacked dataset..."
data = dataframe_join_vectors_to_scalars(datascalar, datavector)
print "Creating unpacked dataset complete."


#Create new variables: maximum/minimum of Dist2TrExtrap(InError)
maxDist2TrExtrapInError = np.maximum(data['eminus_RECOPHOTONS_Dist2TrExtrapInError_Velo_VEC'], data['eminus_RECOPHOTONS_Dist2TrExtrapInError_TT_VEC'])
minDist2TrExtrapInError = np.minimum(data['eminus_RECOPHOTONS_Dist2TrExtrapInError_Velo_VEC'], data['eminus_RECOPHOTONS_Dist2TrExtrapInError_TT_VEC'])

maxDist2TrExtrap = np.maximum(data['eminus_RECOPHOTONS_Dist2TrExtrap_Velo_VEC'], data['eminus_RECOPHOTONS_Dist2TrExtrap_TT_VEC'])
    entries = float(tree.GetEntries(cutstring))
    rel_eff = entries/entries_before
    abs_eff = generator_eff * entries/before_stripping

    print "\n\n============After %s============" % stage
    if cutstring != "1":
        print "Additional cuts: %s" % cutstring
    print "Entries: %i" % entries
    print "Rel. Efficiency: %f" % rel_eff
    print "Abs. Efficiency: %f" % abs_eff

    #Get entries if weight-branch is found
    for branch in tree.GetListOfBranches():
        if weightbranch == branch.GetName():
            weighted = True
            dataframe = read_root(rootfilename, treename, columns=[weightbranch], where=cutstring)
            entries_weight = np.sum(dataframe[weightbranch])
            rel_eff_weight = entries_weight/entries_before_w_weight
            abs_eff_weight = generator_eff * entries_weight/before_stripping

            print "\n--With Weights--"
            if cutstring != "1":
                print "Additional cuts: %s" % cutstring
            print "Entries: %.2f" % entries_weight
            print "Rel. Efficiency: %f" % rel_eff_weight
            print "Abs. Efficiency: %f" % abs_eff_weight

            #Update entries before with weight
            entries_before_w_weight = entries_weight

    #Update entries before
Beispiel #59
0
def create_resamplers(options):
    import os.path
    import pickle
    from root_pandas import read_root
    from PIDPerfScripts.Binning import GetBinScheme

    if options.binningFile and options.binningName:
        import imp

        try:
            imp.load_source("userbinning", options.binningFile)
        except IOError:
            msg = "Failed to load binning scheme file '{0}'".format(options.binningFile)
            raise IOError(msg)
        print(
            "Using custom binning scheme defined in {0} with name {1}".format(options.binningFile, options.binningName)
        )
    else:
        print("Using default binning scheme")
        options.binningName = None

    pid_variables = [
        "{}_CombDLLK",
        "{}_CombDLLmu",
        "{}_CombDLLp",
        "{}_CombDLLe",
        "{}_V3ProbNNK",
        "{}_V3ProbNNpi",
        "{}_V3ProbNNmu",
        "{}_V3ProbNNp",
    ]
    kin_variables = ["{}_P", "{}_Eta", "nTracks"]

    with open("raw_data.json") as f:
        locations = json.load(f)
    if options.particles:
        locations = [sample for sample in locations if sample["particle"] in options.particles]
    if options.both_magnet_orientations:
        locations = [
            sample for sample in locations if sample["magnet"] == "Up"
        ]  # we use both magnet orientations on the first run
    for sample in locations:
        binning_P = rooBinning_to_list(
            GetBinScheme(sample["branch_particle"], "P", options.binningName)
        )  # last argument takes name of user-defined binning
        binning_ETA = rooBinning_to_list(
            GetBinScheme(sample["branch_particle"], "ETA", options.binningName)
        )  # last argument takes name of user-defined binning
        binning_nTracks = rooBinning_to_list(
            GetBinScheme(sample["branch_particle"], "nTracks", options.binningName)
        )  # last argument takes name of user-defined binning
        if options.both_magnet_orientations:
            if sample["magnet"] == "Up":
                data = [options.location + "/{particle}_Stripping{stripping}_MagnetUp.root".format(**sample)]
                data += [options.location + "/{particle}_Stripping{stripping}_MagnetDown.root".format(**sample)]
                resampler_location = "{particle}_Stripping{stripping}_MagnetAny.pkl".format(**sample)
        else:
            data = [options.location + "/{particle}_Stripping{stripping}_Magnet{magnet}.root".format(**sample)]
            resampler_location = "{particle}_Stripping{stripping}_Magnet{magnet}.pkl".format(**sample)
        if os.path.exists(resampler_location):
            os.remove(resampler_location)
        resamplers = dict()
        deps = [x.format(sample["branch_particle"]) for x in kin_variables]
        pids = [x.format(sample["branch_particle"]) for x in pid_variables]
        for pid in pids:
            if "DLL" in pid:
                target_binning = np.linspace(-150, 150, 300)  # binning for DLL
            elif "ProbNN" in pid:
                target_binning = np.linspace(0, 1, 100)  # binning for ProbNN
            else:
                raise Exception
            resamplers[pid] = Resampler(binning_P, binning_ETA, binning_nTracks, target_binning)
        for dataSet in data:
            for i, chunk in enumerate(
                read_root(dataSet, columns=deps + pids + ["nsig_sw"], chunksize=100000, where=options.cutstring)
            ):  # where is None if option is not set
                for pid in pids:
                    resamplers[pid].learn(chunk[deps + [pid]].values.T, weights=chunk["nsig_sw"])
                logging.info("Finished chunk {}".format(i))
        with open(resampler_location, "wb") as f:
            pickle.dump(resamplers, f)
Beispiel #60
0
#!/usr/bin/env python

import ROOT as r

from root_pandas import read_root

quants = 'eta pt chargedPt constituents chargedConstituents'.split()
quants += 'closestdr closestpt closestparticledr closestparticlept'.split()
quants += 'signalPt signalChargedPt signalConstituents signalChargedConstituents'.split()
quants += 'isoPt isoChargedPt isoConstituents isoChargedConstituents'.split()

taus_in = ['genjet_' + q.lower() for q in quants] + ['isoMVA03', 'antiElectron', 'antiMuon', 'match', 'pt']
taus_in = ['tau_' + v for v in taus_in]
taus = read_root("test/genfaketau/out/ntuple.root", "ttjets", columns=taus_in, flatten=True)

fakes = taus[(taus.tau_match == 6)]
selection = taus[
    (taus.tau_match == 6)
    & (taus.tau_isoMVA03 >= 3)
    & (taus.tau_pt >= 20.)
]

gen_in = [q.lower() for q in quants]
gen_in = ['genjet_' + v for v in gen_in]
alljets = read_root("test/genfaketau/out/ntuple.root", "ttjets", columns=gen_in, flatten=True)
jets = alljets[
    (alljets.genjet_pt > 18)
    & (alljets.genjet_eta > -2.5)
    & (alljets.genjet_eta < 2.5)
    & (alljets.genjet_closestparticledr > 0.1)
    & (alljets.genjet_constituents <= 22)