Example #1
def predict(data, features, baseName, opt):
    """ runs the prediction for the trained models and dumps a tree """

    print '[predict] running', baseName, 'with', len(data), 'events'

    #load models and standard scaler
    with open(opt.model, 'rb') as cache:
        best_models = pickle.load(cache)
        scaler = pickle.load(cache)

    #scale data and switch to pandas DataFrame
    df = pd.DataFrame(scaler.transform(data))
    df.columns = features

    #run all predictions
    pred = pd.DataFrame()
    for key in best_models:
        if key != 'rfc': continue
        for xangle in best_models[key]:
            tag = '%s_%d' % (key, xangle)
            print tag, 'for', baseName
            clf = best_models[key][xangle][0]
            features = best_models[key][xangle][-1]
            y_prob = clf.predict_proba(df[features])[:, 0]
            pred[tag] = y_prob

    #write to output
    rp.to_root(pred, baseName, key='pudiscr', store_index=False)
    if opt.output:
        os.system('xrdcp -f {0} root://eoscms//{1}/{0}'.format(
            baseName, opt.output.replace('/eos/cms/', '')))
        os.system('rm {0}'.format(baseName))
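# Illustrative sketch, not part of the original example: predict() above is typically
# driven from an option parser, where opt.model points at the pickled classifiers and
# scaler and opt.output (optionally) at an EOS directory used for xrdcp. All names
# below are assumptions for demonstration only.
#
#   opt = argparse.Namespace(model='pu_models.pck',
#                            output='/eos/cms/store/user/someone/pudiscr')
#   predict(event_matrix, feature_names, 'events_with_pudiscr.root', opt)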
Example #2
def store_dataframe(df, outfile, tname='chi2_values'):
    """
    Store the dataframe either into a pkl file or into a root file via
    root_pandas.
    """
    logging.debug('Storing DataFrame to {}'.format(outfile))
    if not outfile.endswith('.pkl') and not outfile.endswith('.root'):
        logging.warning('Output file does not have .root or .pkl format. '
                        'Creating a .pkl file instead')
        logging.debug('Output filename before substitution: {}'.format(outfile))
        import re
        outfile = re.sub(r'(.*\.)(\w*)$', r'\1pkl', outfile)
        logging.debug('Output filename after substitution: {}'.format(outfile))

    logging.info('Writing resulting DataFrame to: {}'.format(outfile))
    # if .root is requested check if root_pandas is here, otherwise go to .pkl
    if outfile.endswith('.root'):
        try:
            from root_pandas import to_root
            # current version of to_root does not support the store_index argument
            to_root(df, outfile, tname, mode='w')  # , store_index=False
        except ImportError:
            logging.warning('Output to .root file was requested, but root_pandas'
                            ' was not found. Creating a .pkl file instead')
            outfile = outfile.replace('.root', '.pkl')

    if outfile.endswith('.pkl'):
        df.to_pickle(outfile)
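# Illustrative sketch, not part of the original example: a minimal call of
# store_dataframe() above. An unsupported extension is rewritten to .pkl, as the
# warning branch describes; the DataFrame and filenames are toy stand-ins and the
# module-level imports are assumed by the snippet.
#
#   import logging
#   import pandas as pd
#   toy = pd.DataFrame({'chi2': [1.2, 3.4], 'ndf': [2, 3]})
#   store_dataframe(toy, 'fit_result.txt')   # ends up in fit_result.pkl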
Example #3
def write_to_EPM(output, dfCols = None, trueTag = None, fileName = 'tagsToEPM.root'):

    '''
        Takes classifier output (in [0, 1]) and the associated true tag assignments
        (as PDG MC IDs) in a Pandas Series or DataFrame, along with a DataFrame of
        other variables to be written to the resulting ROOT file.

        Writes a ROOT file that can be imported into the Espresso Performance Monitor.
    '''

    try:
        import numpy as np
        import pandas as pd
        from root_pandas import to_root
    except ImportError:
        print('ERROR: Cannot import from root_pandas - no ROOT files have been written.')
        return

    decisions, mistags = decision_and_mistag(output)

    if type(dfCols) == pd.Series:
        dfCols = dfCols.to_frame()
    elif dfCols is None or type(dfCols) != pd.DataFrame:
        dfCols = pd.DataFrame()

    dfCols['tag'] = decisions.flatten().astype(np.int32) # Short_t
    dfCols['eta'] = mistags.flatten().astype(np.double) # Float_t
    if trueTag is not None:
        dfCols['truth'] = trueTag.flatten().astype(np.int32) # Short_t

    to_root(dfCols, fileName, key = 'tree')
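# Illustrative sketch, not part of the original example: write_to_EPM() expects the raw
# classifier output in [0, 1] and, optionally, the true tags. decision_and_mistag() and
# the variable names below are assumptions from the surrounding package.
#
#   probs = clf.predict_proba(X)[:, 1]
#   write_to_EPM(probs, dfCols=df[['runNumber', 'eventNumber']],
#                trueTag=df['B_TRUEID'], fileName='tagsToEPM.root')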
Example #4
def test_issue_80():
    df = pd.DataFrame({'a': [1, 2], 'b': [4, 5]})
    df.columns = ['a', 'a']
    try:
        root_pandas.to_root(df, '/tmp/example.root')
    except ValueError as e:
        assert 'DataFrame contains duplicated column names' in e.args[0]
    else:
        raise Exception('ValueError is expected')
Example #5
def test_issue_60():
    df = pd.DataFrame({'a': list(range(10)), 'b': list(range(10))})
    root_pandas.to_root(df, 'tmp_1.root', 'my_tree_1')
    root_pandas.to_root(df, 'tmp_2.root', 'my_tree')
    result = root_pandas.read_root(['tmp_1.root', 'tmp_2.root'],
                                   'my_tree',
                                   warn_missing_tree=True)
    assert len(result) == 10
    os.remove('tmp_1.root')
    os.remove('tmp_2.root')
Example #6
def test_detect_branches_first_missing():
    df = pd.DataFrame({'a': list(range(10)), 'b': list(range(10))})
    to_root(df, 'tmp_1.root', 'my_tree_1')
    to_root(df, 'tmp_2.root', 'my_tree')
    read_df = read_root(['tmp_1.root', 'tmp_2.root'],
                        'my_tree',
                        warn_missing_tree=True)
    assert_frame_equal(df, read_df)
    os.remove('tmp_1.root')
    os.remove('tmp_2.root')
Example #7
    def run(self):
        df = root_pandas.read_root(*self.get_input_file_names('train.root'),
                                   key=self.tree_name)

        # resample
        resampled_df = resample(df, random_state=self.random_seed)

        # store to root
        root_pandas.to_root(resampled_df,
                            self.get_output_file_name('train.root'),
                            key=self.tree_name)
Example #8
    def run(self):
        train, test = split_sample(ntuple_file=self.ntuple_file,
                                   train_size=self.train_size,
                                   test_size=self.test_size)

        # Store as Rootfile
        root_pandas.to_root(train,
                            self.get_output_file_name('train.root'),
                            key=self.tree_name)
        root_pandas.to_root(test,
                            self.get_output_file_name('test.root'),
                            key=self.tree_name)
Example #9
def test_issue_63():
    df = pd.DataFrame({'a': [], 'b': []})
    root_pandas.to_root(df, 'tmp_1.root', 'my_tree')
    df = pd.DataFrame({'a': list(range(10)), 'b': list(range(10))})
    root_pandas.to_root(df, 'tmp_2.root', 'my_tree')
    result = list(
        root_pandas.read_root(['tmp_1.root', 'tmp_2.root'],
                              'my_tree',
                              where='a > 2',
                              chunksize=1))
    assert len(result) == 7
    assert all(len(df) == 1 for df in result)
    os.remove('tmp_1.root')
    os.remove('tmp_2.root')
Example #10
    def run(self):
        expert = root_pandas.read_root(
            self.get_input_file_names('validation_expert.root'))

        # normalize to len_data / len_mc (off-res.)
        key_EventType = expert.keys()[1]
        len_data = len(expert[expert[key_EventType] == 1])
        len_mc = len(expert) - len_data

        weights = get_weights(expert_df=expert, normalize_to=len_data / len_mc)

        root_pandas.to_root(
            weights,
            self.get_output_file_name('validation_weights.root'),
            key='weights')
Example #11
    def run(self):
        # calculate the normalization from the validation reweighting output
        validation_weights = root_pandas.read_root(
            self.get_input_file_names("validation_weights.root"))

        len_data = len(
            validation_weights[validation_weights["EventType"] == 1])
        len_mc = len(validation_weights) - len_data

        expert = root_pandas.read_root(
            self.get_input_file_names('expert.root'))
        weights = get_weights(expert_df=expert, normalize_to=len_data / len_mc)
        root_pandas.to_root(weights,
                            self.get_output_file_name('weights.root'),
                            key=self.tree_name)
Example #12
def store_dataframe(dfr, outfile, tname='chi2_values', **kwargs):
    """
    Store the dataframe either into a pkl file or into a root file via
    root_pandas.

    Args:
        dfr (pandas.DataFrame): The dataframe that should be stored
        outfile (str): The filename to which the DataFrame should be stored.
            If this ends with .pkl, a pkl file will be created; if it ends with
            .root, a root file will be created (if root_pandas is available).
            Otherwise a .pkl file will be created with the extension replaced by .pkl
        tname (str, optional): Name of the TTree to be used for storing the
            DataFrame if stored to a root file

    Keyword Args:
        Forwarded to root_pandas.to_root

    See Also: root_pandas.to_root
    """
    logging.debug('Storing DataFrame to {}'.format(outfile))
    if not outfile.endswith('.pkl') and not outfile.endswith('.root'):
        logging.warning('Output file does not have .root or .pkl format. '
                        'Creating a .pkl file instead')
        logging.debug(
            'Output filename before substitution: {}'.format(outfile))
        outfile = re.sub(r'(.*\.)(\w*)$', r'\1pkl', outfile)
        logging.debug('Output filename after substitution: {}'.format(outfile))

    logging.info('Writing resulting DataFrame to: {}'.format(outfile))
    # if .root is requested check if root_pandas is here, otherwise go to .pkl
    if outfile.endswith('.root'):
        try:
            from root_pandas import to_root
            # current version of to_root doesn't support the store_index argument
            to_root(
                dfr,
                outfile,
                tname,
                **kwargs  # , store_index=False
            )
        except ImportError:
            logging.warning(
                'Output to .root file was requested, but root_pandas'
                ' was not found. Creating a .pkl file instead')
            outfile = outfile.replace('.root', '.pkl')

    if outfile.endswith('.pkl'):
        dfr.to_pickle(outfile)
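# Illustrative sketch, not part of the original example: extra keyword arguments are
# forwarded to root_pandas.to_root, e.g. its mode flag (used as mode='w' elsewhere in
# these examples). The DataFrame and filenames below are toy stand-ins.
#
#   results = pd.DataFrame({'chi2': [0.9, 2.1], 'ndf': [1, 2]})
#   store_dataframe(results, 'fit_results.root', tname='chi2_values', mode='a')
#   store_dataframe(results, 'fit_results.pkl')   # plain pickle output, kwargs unused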
Example #13
def run(input_fns, output_fn, h1, h2, h3):
    keys = list_trees(input_fns[0])
    assert len(keys) == 1, keys
    df = read_root(input_fns, keys[0])

    df['H1_isMuon'] = df['H1_isMuon'].astype(bool)
    df['H2_isMuon'] = df['H2_isMuon'].astype(bool)
    df['H3_isMuon'] = df['H3_isMuon'].astype(bool)

    # Sort the columns so that the first is the most kaon-like
    assert sorted([h1, h2, h3]) == [h1, h2, h3], \
        'Children are ranked from kaon-like to pion-like'
    order = np.argsort(df[['H3_ProbK', 'H2_ProbK', 'H1_ProbK']], axis=1)
    for col in [c for c in df.columns if c.startswith('H1_')]:
        col = col[len('H1_'):]
        cols = [f'H1_{col}', f'H2_{col}', f'H3_{col}']
        df[cols] = df[cols].values[np.arange(order.shape[0])[:, None], order]

    # Compute the PE and mass of all particles
    for head, mass in [('H1', mass_dict[h1]), ('H2', mass_dict[h2]),
                       ('H3', mass_dict[h3])]:
        df.eval(f'{head}_P = sqrt({head}_PX**2 + {head}_PY**2 + {head}_PZ**2)',
                inplace=True)
        df.eval(f'{head}_PE = sqrt({mass}**2 + {head}_P**2)', inplace=True)
    for component in ['PE', 'PX', 'PY', 'PZ']:
        df.eval(
            f'B_{component} = H1_{component} + H2_{component} + H3_{component}',
            inplace=True)
    df.eval(f'B_M = sqrt(B_PE**2 - B_PX**2 - B_PY**2 - B_PZ**2)', inplace=True)

    # if [h1, h2, h3] == ['K', 'K', 'K']:
    # Reject candidates where any child is identified as a muon
    df.query('~(H1_isMuon | H2_isMuon | H3_isMuon)', inplace=True)
    # Apply an additional selection
    df.query(f'(H1_IPChi2 > 25) & (H2_IPChi2 > 25) & (H3_IPChi2 > 25)',
             inplace=True)
    # Apply a PID selection
    df.query(
        f'(H1_Prob{h1} > {pid_cut}) & (H2_Prob{h2} > {pid_cut}) & (H3_Prob{h3} > {pid_cut})',
        inplace=True)

    to_root(df, output_fn, key=f'B2{h1}{h2}{h3}', mode='w', store_index=False)
Example #14
    def fill_trees(self, branch_names, year, print_yields=False):

        #have to save individual trees as root files (file name = branch name), then hadd over a single proc on the command line, to get one proc file with all tag trees
        debug_cols = [
            'dielectronMass', 'leadElectronPtOvM', 'subleadElectronPtOvM',
            'dijetMass', 'leadJetPt', 'subleadJetPt', 'ggH_mva', 'VBF_mva',
            'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag', 'proc',
            'tree_name'
        ]

        if print_yields:
            print_str = '*** Yields ***'
            lumi_map = {
                '2016A': 35.9,
                '2016B': 35.9,
                '2017': 41.5,
                '2018': 59.7
            }

        for proc in self.true_procs:
            selected_df = self.combined_df[self.combined_df.proc == proc]
            if print_yields: print_str += '\n \n Process: {}'.format(proc)
            for bn in branch_names[proc]:
                print bn
                branch_selected_df = selected_df[selected_df.tree_name == bn]
                print branch_selected_df[debug_cols].head(10)
                root_pandas.to_root(branch_selected_df[self.tree_vars],
                                    'output_trees/{}/{}_{}.root'.format(
                                        year, bn, year),
                                    key=bn)
                if print_yields:
                    if proc != 'Data':
                        print_str += '\n Summed events in category {}: {}'.format(
                            bn,
                            np.sum(branch_selected_df['weight']) *
                            lumi_map[year] * 1000)
                    else:
                        print_str += '\n Summed events in category {}: {}'.format(
                            bn, np.sum(branch_selected_df['weight']))
                print

        if print_yields: print print_str
Example #15
    def fit(self, X_train, y_train):
        '''Fit routine.

        Parameters
        ----------
        X_train : numpy.array, shape=(n_samples, n_observables)
            Observable sample.
        
        y_train : numpy.array, shape=(n_samples,)
            Target variable sample.
        '''
        X_train, y_train = super(TRUEEUnfolding, self).fit(X_train, y_train)
        self.binning_X.fit(X_train, y_train)
        self.binning_y.fit(y_train)
        X_digit = self.binning_X.digitize(X_train)
        y_digit = self.binning_y.digitize(y_train)
        df_train = pd.DataFrame(np.column_stack((X_digit, y_digit)),
                                columns=['x', 'y'])

        file_mc = 'temp_truee_train.root'
        self.tempdir = '_truee_temp_dir'
        self.path_mc = os.path.join(self.tempdir, file_mc)
        if not os.path.exists(self.tempdir):
            os.mkdir(self.tempdir)
        rp.to_root(df_train, self.path_mc, 'data')

        self._config.update(source_file_moca=self.path_mc)
        self._config.update(roottree_moca='data')

        self._config.update(branch_y='x')
        self._config.update(number_y_bins=self.binning_X.n_bins)
        self._config.update(
            limits_y='{} {}'.format(-0.5, self.binning_X.n_bins - 0.5))

        self._config.update(branch_x='y')
        self._config.update(number_bins=self.binning_y.n_bins)
        self._config.update(max_number_bins=self.binning_y.n_bins)
        self._config.update(
            limits_x='{} {}'.format(-0.5, self.binning_y.n_bins - 0.5))
        self.is_fitted = True
Example #16
def generate_data(
        size_mc=500000,
        size_data=10000,
        size_mc_offres=150000,
        size_data_offres=8000,
        frac_a=0.8):
    """Generate root files to represent data and MC samples to demonstrate
    the re-weighting.

    Parameters:
        size_mc, size_data, size_mc_offres, size_data_offres: number of events
            in the corresponding sample.
        frac_a: fraction of events in componentA

    Return:
        data, componentA, componentB, data_offres, componentA_offres:
            pd.DataFrames of the generated samples.
    """

    frac_b = 1 - frac_a

    # GENERATE DATA
    print(
        "Generating the following dataframes:\n"
        "data, componentA, componentB, data_offres and componentA_offres ...")

    # Random state for random number generation
    rs = np.random.RandomState(seed=1)

    # on res
    data = pd.DataFrame()
    componentA = pd.DataFrame()
    componentB = pd.DataFrame()

    # variable1
    tmp_data = rs.triangular(0, 1, 1, size=int(size_data*frac_a*0.3))
    tmp_data = np.append(
        tmp_data, rs.normal(0.3, 0.1, int(size_data*frac_b)))
    tmp_data = np.append(
        tmp_data, rs.uniform(size=int(size_data*frac_a*0.7)))
    data["variable1"] = tmp_data
    data = data.loc[data["variable1"] >= 0]

    componentA["variable1"] = rs.uniform(size=int(size_mc * frac_a))
    componentB["variable1"] = rs.normal(
        0.3, 0.1, size=int(size_mc * frac_b))

    # variable2
    data["variable2"] = rs.uniform(size=len(data))
    componentA["variable2"] = rs.uniform(size=int(size_mc*frac_a))
    componentB["variable2"] = rs.uniform(size=int(size_mc*frac_b))

    # candidate and EventType
    data["__candidate__"] = [0]*len(data)
    componentA["__candidate__"] = [0]*len(componentA)
    componentB["__candidate__"] = [0]*len(componentB)

    data["EventType"] = [float(1)]*len(data)
    componentA["EventType"] = [float(0)]*len(componentA)
    componentB["EventType"] = [float(0)]*len(componentB)

    # off res
    data_offres = pd.DataFrame()
    componentA_offres = pd.DataFrame()

    # variable1
    tmp_data = rs.triangular(
        0, 1, 1, size=int(size_data_offres*frac_a*0.3))
    tmp_data = np.append(
        tmp_data, rs.uniform(size=int(size_data_offres*frac_a*0.7)))
    data_offres["variable1"] = tmp_data
    componentA_offres["variable1"] = rs.uniform(
        size=int(size_mc_offres*frac_a))

    # variable2
    data_offres["variable2"] = rs.uniform(size=len(data_offres))
    componentA_offres["variable2"] = rs.uniform(
        size=int(size_mc_offres*frac_a))

    # candidate and EventType
    data_offres["__candidate__"] = [0]*len(data_offres)
    componentA_offres["__candidate__"] = [0]*len(componentA_offres)

    data_offres["EventType"] = [float(1)]*len(data_offres)
    componentA_offres["EventType"] = [float(0)]*len(componentA_offres)

    # SAVE DATA
    print("Saving data to 'example_input/<file>.root' ...")

    if not os.path.exists("example_input"):
        os.makedirs("example_input")

    to_root(data, "example_input/data.root", key="variables")
    to_root(componentA, "example_input/componentA.root", key="variables")
    to_root(componentB, "example_input/componentB.root", key="variables")
    to_root(data_offres, "example_input/data_offres.root", key="variables")
    to_root(
        componentA_offres,
        "example_input/componentA_offres.root", key="variables")

    return data, componentA, componentB, data_offres, componentA_offres
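# Illustrative sketch, not part of the original example: a reduced run of generate_data()
# and a quick check of one of the files it writes. root_pandas must be importable for the
# to_root/read_root calls, and the sizes below are scaled down only to keep the run fast.
if __name__ == '__main__':
    from root_pandas import read_root
    samples = generate_data(size_mc=5000, size_data=1000,
                            size_mc_offres=1500, size_data_offres=800)
    check = read_root('example_input/data.root', key='variables')
    print('on-resonance data events written:', len(check))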
Example #17
    pid = ham.add_process(Bc2JpsiLNu)
    pids.append(pid)

    #    pl =  ev.mu3_pt
    #    q2 =  ev.q2
    
    #    ham.fill_event_histogram("pEllVsQ2:Bc", [pl, q2])

    #     print ("chekpoint C", i)

    ham.process_event()

    #     print ("chekpoint D", i, pid)
    #     import pdb ; pdb.set_trace()
    
    
    #print (pid, ham.get_weight('BGL', [pid]))
    #print (pid, ham.get_weight('Kiselev', [pid]))
    #     weights.append(ham.get_weight('BGL', [pid]))
    #weights.append(ham.get_weight('BGL'))
    weights.append(ham.get_weight('Kiselev'))

    #     print ("chekpoint E", i)

    if i>maxevents: break

reduced_tree = tree_df[:len(weights)]
reduced_tree['hammer'] = np.nan_to_num(np.array(weights)) # some NaNs, check the manual
to_root(reduced_tree, 'reweighed_bc_tree_tau.root', key='tree')

Example #18
def main(options):

    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        #data not needed yet, but still specify in the config for compatibility with constructor
        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        proc_to_train_vars = config['train_vars']
        all_train_vars = [
            item for sublist in proc_to_train_vars.values() for item in sublist
        ]

        vars_to_add = config['vars_to_add']

        #Data handling stuff#
        #apply loosest selection (ggh) first, else memory requirements are ridiculous. Fine to do this since these cuts are all looser than the VBF ones (not removing events with higher priority)
        loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

        #load the mc dataframe for all years. Do not apply any specific preselection
        root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                               data_fnames, proc_to_tree_name, all_train_vars,
                               vars_to_add, loosest_selection)
        root_obj.no_lumi_scale()
        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        if options.data_as_bkg:
            for data_obj in root_obj.data_objects:
                root_obj.load_data(data_obj,
                                   reload_samples=options.reload_samples)
        else:
            for bkg_obj in root_obj.bkg_objects:
                root_obj.load_mc(bkg_obj,
                                 bkg=True,
                                 reload_samples=options.reload_samples)
        root_obj.concat()

        #Tag sequence stuff#
    if options.data_as_bkg:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.data_df])
    else:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])
    del root_obj

    #decide sequence of tags and specify preselection for use with numpy.select:
    tag_sequence = ['VBF', 'ggH']
    proc_to_preselection = {
        'VBF': [
            combined_df['dielectronMass'].gt(110)
            & combined_df['dielectronMass'].lt(150)
            & combined_df['leadElectronPToM'].gt(0.333)
            & combined_df['subleadElectronPToM'].gt(0.25)
            & combined_df['dijetMass'].gt(350)
            & combined_df['leadJetPt'].gt(40)
            & combined_df['subleadJetPt'].gt(30)
        ],
        'ggH': [
            combined_df['dielectronMass'].gt(110)
            & combined_df['dielectronMass'].lt(150)
            & combined_df['leadElectronPToM'].gt(0.333)
            & combined_df['subleadElectronPToM'].gt(0.25)
        ]
    }

    with open(options.bdt_config, 'r') as bdt_config_file:
        config = yaml.load(bdt_config_file)
        proc_to_model = config['models']
        proc_to_tags = config['boundaries']

        #evaluate MVA scores used in categorisation
        for proc, model in proc_to_model.iteritems():
            print 'evaluating classifier: {}'.format(model)
            clf = pickle.load(open('models/{}'.format(model), "rb"))
            train_vars = proc_to_train_vars[proc]
            combined_df[proc + '_bdt'] = clf.predict_proba(
                combined_df[train_vars].values)[:, 1:].ravel()

        # TAG NUMBER #

        #decide on tag
        for proc in tag_sequence:
            presel = proc_to_preselection[proc]
            tag_bounds = proc_to_tags[proc].values()
            tag_masks = []
            for i_bound in range(
                    len(tag_bounds)):  #c++ type looping for index reasons
                if i_bound == 0:  #first bound, tag 0
                    tag_masks.append(presel[0] & combined_df['{}_bdt'.format(
                        proc)].gt(tag_bounds[i_bound]))
                else:  #intermed bound
                    tag_masks.append(presel[0] & combined_df['{}_bdt'.format(
                        proc)].lt(tag_bounds[i_bound - 1]) & combined_df[
                            '{}_bdt'.format(proc)].gt(tag_bounds[i_bound]))

            mask_key = [icat for icat in range(len(tag_bounds))]

            combined_df['{}_analysis_tag'.format(proc)] = np.select(
                tag_masks, mask_key, default=-999)

        # PROC PRIORITY #

        # deduce tag priority: if two or more tags are satisfied then set the final tag to the highest-priority tag. make this non-hardcoded i.e. compare proc in position 1 to all lower-priority positions, then compare proc in pos 2 ...
        tag_priority_filter = [
            combined_df['VBF_analysis_tag'].ne(-999)
            & combined_df['ggH_analysis_tag'].ne(-999),  # 1) if both filled...
            combined_df['VBF_analysis_tag'].ne(-999)
            & combined_df['ggH_analysis_tag'].eq(
                -999),  # 2) if VBF filled and ggH not, take VBF
            combined_df['VBF_analysis_tag'].eq(-999)
            & combined_df['ggH_analysis_tag'].ne(
                -999),  # 3) if ggH filled and VBF not, take ggH
        ]

        tag_priority_key = [
            'VBF',  #1) take VBF
            'VBF',  #2) take VBF
            'ggH',  #3) take ggH
        ]
        combined_df['priority_tag'] = np.select(
            tag_priority_filter, tag_priority_key,
            default='NOTAG')  # else keep -999 i.e. NOTAG

        #some debug checks:
        #print combined_df[['dipho_mass', 'dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 'dipho_sublead_ptoM', 'dijet_Mjj', 'dijet_LeadJPt', 'dijet_SubJPt', 'ggH_bdt', 'VBF_bdt', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag']]
        #print combined_df[combined_df.VBF_analysis_tag>-1][['dipho_mass', 'dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 'dipho_sublead_ptoM', 'dijet_Mjj', 'dijet_LeadJPt', 'dijet_SubJPt', 'ggH_bdt', 'VBF_bdt', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag']]
        #print combined_df[combined_df.ggH_analysis_tag>-1][['dipho_mass', 'dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 'dipho_sublead_ptoM', 'dijet_Mjj', 'dijet_LeadJPt', 'dijet_SubJPt', 'ggH_bdt', 'VBF_bdt', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag']]

        # FILL TREES BASED ON BOTH OF ABOVE
        tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
        combined_df['dZ'] = float(0.)
        combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

        # FIXME: dont loop through events eventually but for now I cba to use numpy to vectorise it again
        #for true_proc in tag_sequence+['Data']:
        #    #isolate true proc
        #    true_proc_df = combined_df[combined_df.proc==true_proc.lower()]
        #    #how much true proc landed in each of our analysis cats?
        #    for target_proc in tag_sequence:  #for all events that got the proc tag, which tag did they fall into?
        #        true_proc_target_proc_df = true_proc_df[true_proc_df.priority_tag==target_proc]
        #        for i_tag in range(len(proc_to_tags[target_proc].values())):#for each tag corresponding to the category we target, which events go in which tag
        #             true_procs_target_proc_tag_i  = true_proc_target_proc_df[true_proc_target_proc_df['{}_analysis_tag'.format(target_proc)].eq(i_tag)]
        #
        #             branch_name = '{}_125_13TeV_{}cat{} :'.format(true_proc.lower(), target_proc.lower(), i_tag )
        #             print true_procs_target_proc_tag_i[['dipho_mass', 'dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 'dipho_sublead_ptoM', 'dijet_Mjj', 'dijet_LeadJPt', 'dijet_SubJPt', 'ggH_bdt', 'VBF_bdt', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag']].head(10)
        #             print branch_name

        #get tree names
        branch_names = {}
        #print 'DEBUG: {}'.format(np.unique(combined_df['proc']))
        for true_proc in tag_sequence + ['Data']:
            branch_names[true_proc] = []
            for target_proc in tag_sequence:  #for all events that got the proc tag, which tag did they fall into?
                for i_tag in range(
                        len(proc_to_tags[target_proc].values())
                ):  #for each tag corresponding to the category we target, which events go in which tag
                    if true_proc != 'Data':
                        branch_names[true_proc].append(
                            '{}_125_13TeV_{}cat{}'.format(
                                true_proc.lower(), target_proc.lower(), i_tag))
                    else:
                        branch_names[true_proc].append(
                            '{}_13TeV_{}cat{}'.format(true_proc,
                                                      target_proc.lower(),
                                                      i_tag))

        #debug_procs = ['dipho_mass', 'dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 'dipho_sublead_ptoM', 'dijet_Mjj', 'dijet_LeadJPt', 'dijet_SubJPt', 'ggH_bdt', 'VBF_bdt', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag']
        debug_vars = [
            'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
        ]
        combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
        print combined_df[debug_vars + ['tree_name']]

        if not path.isdir('output_trees/'):
            print 'making directory: {}'.format('output_trees/')
            system('mkdir -p %s' % 'output_trees/')

        #have to save individual trees then hadd procs together on the command line.
        for proc in tag_sequence + ['Data']:
            selected_df = combined_df[combined_df.proc == proc]
            for bn in branch_names[proc]:
                print bn
                branch_selected_df = selected_df[selected_df.tree_name == bn]
                print branch_selected_df[debug_vars + ['tree_name']].head(20)
                root_pandas.to_root(branch_selected_df[tree_vars],
                                    'output_trees/{}.root'.format(bn),
                                    key=bn)
                print
Example #19
    pids.append(pid)

    #    pl =  ev.mu3_pt
    #    q2 =  ev.q2

    #    ham.fill_event_histogram("pEllVsQ2:Bc", [pl, q2])

    #     print ("chekpoint C", i)

    ham.process_event()

    #     print ("chekpoint D", i, pid)
    #     import pdb ; pdb.set_trace()

    #print (pid, ham.get_weight('BGL', [pid]))
    #print (pid, ham.get_weight('Kiselev', [pid]))
    #     weights.append(ham.get_weight('BGL', [pid]))
    #weights.append(ham.get_weight('BGL'))
    weights.append(ham.get_weight('Kiselev'))

    #     print ("chekpoint E", i)

    if i > maxevents: break

reduced_tree = tree_df[:len(weights)]
reduced_tree['hammer'] = np.nan_to_num(
    np.array(weights))  # some NaNs, check the manual
to_root(reduced_tree,
        'reweighed_bc_tree_tau_EFTtoKis_14Apr21_1vertx.root',
        key='tree')
Example #20
    pid = ham.add_process(Bc2JpsiLNu)
    pids.append(pid)

    pl =  ev.mu3_pt
    q2 =  ev.q2
    
    #    ham.fill_event_histogram("pEllVsQ2:Bc", [pl, q2])

    #     print ("chekpoint C", i)

    ham.process_event()

    #     print ("chekpoint D", i, pid)
    #     import pdb ; pdb.set_trace()
    
    
    #print (pid, ham.get_weight('BGL', [pid]))
    #print (pid, ham.get_weight('Kiselev', [pid]))
    #     weights.append(ham.get_weight('BGL', [pid]))
    #weights.append(ham.get_weight('BGL'))
    weights.append(ham.get_weight('Kiselev'))

    #     print ("chekpoint E", i)

    if i>maxevents: break

reduced_tree = tree_df[:len(weights)]
reduced_tree['hammer'] = np.nan_to_num(np.array(weights)) # some NaNs, check the manual
to_root(reduced_tree, 'reweighed_bc_tree_fromEfgtoKis.root', key='tree')

Example #21
#####   ETA BINS
##########################################################################################
@np.vectorize
def tauEta(eta):
    if abs(eta) > 2.1: return 7
    elif abs(eta) > 1.8: return 6
    elif abs(eta) > 1.5: return 5
    elif abs(eta) > 1.1: return 4
    elif abs(eta) > 0.8: return 3
    elif abs(eta) > 0.5: return 2
    elif abs(eta) > 0.2: return 1
    else: return 0


features.append('tauEta')
sigW['tauEta'] = tauEta(sigW['cand_refit_tau_eta'])
bkg['tauEta'] = tauEta(bkg['cand_refit_tau_eta'])

##########################################################################################
data = pd.concat([sigW, bkg], ignore_index=True, sort=True)
#data['id'] = np.arange(len(data))
train, test = train_test_split(data, test_size=0.4, random_state=1986)
## assign an id to the test and train sets separately to avoid mismatch when folding
train.insert(len(train.columns), 'id', np.arange(len(train)))
test.insert(len(test.columns), 'id', np.arange(len(test)))

if __name__ == '__main__':
    print "[INFO] Interactive mode: saving dataset to disk"
    import root_pandas
    root_pandas.to_root(data, 'dataframe.root', key='tree')
Example #22
    'mu1_softID',
    'mu2_softID',
    'k_tightID',
    'k_mediumID',
    'k_softID',
    #'mu1_isPF',
    #'mu2_isPF',
    #'k_isPF',
]

for k, v in samples.items():
#for k in ['psi2s_tau']:
    for new_column, new_definition in to_define:
        if samples[k].HasColumn(new_column): continue
        samples[k] = samples[k].Define(new_column, new_definition)
    # convert to pandas
    samples[k] = pd.DataFrame(samples[k].AsNumpy())

    for icolumn in to_cast:
        if not math.isnan(samples[k][icolumn][0]):
            samples[k][icolumn] = samples[k][icolumn].astype(int)
 
    print('enrich the data', k)
    for i, label in zip(range(3), ['mu', 'tau', 'bkg']):
        samples[k]['bdt_%s' %label] = model.predict_proba(samples[k][features])[:,i]
    
    to_root(samples[k], '%s/%s_bdtenriched.root' %(tree_dir, k), key='BTo3Mu', store_index=False)



Example #23
    def predict(self,
                X,
                n_knots,
                n_dof,
                data_luminosity=1.0,
                moca_luminosity=1.0,
                moca_weight=1.0,
                fx_positive=False,
                smooth_x=False,
                zero_left=False,
                zero_right=False,
                constraints='',
                weight_first=0,
                cleanup=True,
                **kwargs):
        '''Calculates an estimate for the unfolding by calling TRUEE.

        Parameters
        ----------
        X : numpy.array, shape=(n_samples, n_observables)
            Observable sample.
       
        n_knots : int
            Number of knots for the spline representation used in TRUEE.
            Rule of thumb: Should be about twice the number of bins in the
            target variable space.
       
        n_dof : int
            Number of degrees of freedom, the more, the less regularized the
            unfolding.
       
        data_luminosity : float
            I guess weights for X?
        
        moca_luminosity : float
            I guess weights for y?
        
        fx_positive : bool
            Whether to enforce positive results for the unfolded spectrum.
        
        smooth_x : bool
            Whether to smooth ... the observable vector? I don't know.
        
        zero_left, zero_right : bool
            I think supposedly, this is supposed to set the left/right-most bin
            to zero. However, I don't think it does anything at all
        
        constraints : str
            A string containing a C-style formula (without spaces!). No idea.
        
        weight_first : int
            Who knows
        
        cleanup : bool
            Whether or not to delete all temporary files after TRUEE was called.

        Returns
        -------
        result : ``pyunfolding.utils.UnfoldingResult`` object
            The result of the unfolding, see documentation for 
            `UnfoldingResult`.
        '''
        if not self.is_fitted:
            raise RuntimeError(
                'Unfolding not yet fitted. Use `fit` routine first.')

        X = super(TRUEEUnfolding, self).predict(X)

        # Storing parameters to config dictionary
        self._config.update(number_deg_free=n_dof)
        self._config.update(max_number_deg_free=n_dof)
        self._config.update(number_knots=n_knots)
        self._config.update(max_number_knots=n_knots)
        self._config.update(data_luminosity=data_luminosity)
        self._config.update(moca_luminosity=moca_luminosity)
        self._config.update(moca_weight=moca_weight)
        self._config.update(fx_positive=int(fx_positive))
        self._config.update(smooth_x=int(smooth_x))
        self._config.update(zero_left=int(zero_left))
        self._config.update(zero_right=int(zero_right))
        self._config.update(constraints=constraints)
        self._config.update(weight_first=weight_first)

        X_digit = self.binning_X.digitize(X)
        file_dt = 'temp_truee_test.root'
        self.path_dt = os.path.join(self.tempdir, file_dt)
        df_test = pd.DataFrame(np.column_stack(
            (X_digit, np.zeros(len(X_digit)))),
                               columns=['x', 'y'])
        rp.to_root(df_test, self.path_dt, 'data')

        self._config.update(roottree_data='data')
        self._config.update(source_file_data=self.path_dt)

        # Write config file and run TRUEE
        file_conf = 'parameters.config'
        self.path_conf = os.path.join(self.tempdir, file_conf)
        self._write_config_file(self.path_conf)
        os.system('{} {}'.format(self.TRUEE_CALL, self.path_conf))

        f = ROOT.TFile.Open(self.TRUEE_RESULT)
        g = f.GetDirectory('RealDataResults')
        string = 'bins_{}_knots_{}_degFree_{}'.format(self.binning_y.n_bins,
                                                      n_knots, n_dof)
        cov = np.array([[
            g.Get('Tcovar_matrix_{};1'.format(string))(i, j)
            for i in range(self.binning_y.n_bins)
        ] for j in range(self.binning_y.n_bins)])
        h = g.Get('events_result_{};1'.format(string))
        f_vals = np.array(
            [h.GetBinContent(i) for i in range(self.binning_y.n_bins + 1)])
        f_err = np.sqrt(cov.diagonal())

        # Cleanup temp files
        if cleanup:
            os.remove(self.path_mc)
            os.remove(self.path_dt)
            os.remove(self.path_conf)
            os.remove(self.TRUEE_RESULT)
            os.rmdir(self.tempdir)

        # I'm not sure why this is necessary, but it is. And it's not an elegant solution either.
        scaling = np.sum(f_vals) / len(X)

        return UnfoldingResult(f=f_vals[1:] / scaling,
                               f_err=np.vstack((f_err, f_err)) / scaling,
                               cov=cov,
                               binning_y=self.binning_y,
                               success=True)
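# Illustrative sketch, not part of the original class: the intended call pattern, assuming
# `unfolding` is a configured TRUEEUnfolding instance and the external TRUEE binary
# (self.TRUEE_CALL) is available; constructor arguments are omitted on purpose.
#
#   unfolding.fit(X_train, y_train)      # writes temp_truee_train.root via rp.to_root
#   result = unfolding.predict(X_test, n_knots=2 * unfolding.binning_y.n_bins, n_dof=6)
#   print(result.f, result.f_err)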
Example #24
# Target for the regression to predict the correction factor
data['target'] = data.genJetPt / data.jetPt

# Additional selections to limit phase space
#data = data[(np.abs(data.jetEta) < 1.3) & (data.genJetPt > 60.) & ((data.target > 0.9) & (data.target < 1.1))]
data = data[(np.abs(data.jetEta) < 1.3) & (data.genJetPt > 60.)]

# Split into a set used for training and validation, and a separate test set (0.9/0.1)
training, test = train_test_split(data, shuffle=True, test_size=0.1)
test.reset_index(drop=True, inplace=True)
training.reset_index(drop=True, inplace=True)

training = training[((training.target > 0.9) & (training.target < 1.1))]

# Save test data to a separate file for post training plotting
to_root(test, 'test_data.root', key='tree')

# Scale input variables for training and save scaler for future use in plotting
scaler = MinMaxScaler().fit(training[Training_variables].values)
dump(scaler, "scaler.pkl")
train_inp = pd.DataFrame(scaler.transform(training[Training_variables].values),
                         columns=Training_variables)
train_trg = training['target']

# Prepare test data for monitoring plots
test_true = test[[
    'isPhysUDS', 'isPhysG', 'genJetPt', 'jetPt', 'QG_ptD', 'QG_axis2',
    'QG_mult'
]]
test_inp = pd.DataFrame(scaler.transform(test[Training_variables].values),
                        columns=Training_variables)
Example #25
    pids.append(pid)

    pl = ev.mu3_pt
    q2 = ev.q2

    #    ham.fill_event_histogram("pEllVsQ2:Bc", [pl, q2])

    #     print ("chekpoint C", i)

    ham.process_event()

    #     print ("chekpoint D", i, pid)
    #     import pdb ; pdb.set_trace()

    #print (pid, ham.get_weight('BGL', [pid]))
    #print (pid, ham.get_weight('Kiselev', [pid]))
    #     weights.append(ham.get_weight('BGL', [pid]))
    #weights.append(ham.get_weight('BGL'))
    weights.append(ham.get_weight('Kiselev'))

    #     print ("chekpoint E", i)

    if i > maxevents: break

reduced_tree = tree_df[:len(weights)]
reduced_tree['hammer'] = np.nan_to_num(
    np.array(weights))  # some NaNs, check the manual
to_root(reduced_tree,
        'reweighed_bc_tree_mu_fromEfgtoKis_14Apr21.root',
        key='tree')
Example #26
def main(data_path, gamma_path, corsika_path, config_template, output_base,
         threshold, theta2_cut, gamma_fraction, title, start, end, zd_min,
         zd_max):

    with h5py.File(data_path, 'r') as f:
        source_dependent = 'gamma_prediction_off_1' in f['events'].keys()

    if source_dependent:
        other_columns.extend(bg_prediction_columns)
        theta_cut = np.inf
        theta2_cut = np.inf
        print('Source dependent separation, ignoring theta cut')

    theta_cut = np.sqrt(theta2_cut)

    data = read_h5py(data_path,
                     key='events',
                     columns=data_columns + output_columns + other_columns)

    gammas = read_h5py(
        gamma_path,
        key='events',
        columns=mc_columns + output_columns + other_columns,
    )
    gammas.rename(
        columns={'corsika_evt_header_total_energy': 'true_energy'},
        inplace=True,
    )

    runs = read_h5py(data_path, key='runs')

    data['timestamp'] = pd.to_datetime(
        data['unix_time_utc_0'] * 1e6 + data['unix_time_utc_1'],
        unit='us',
    )

    if start:
        data = data.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')
    if end:
        data = data.query('timestamp <= @end')
        runs = runs.query('run_start <= @end')

    min_zenith = runs.zenith.min()
    max_zenith = runs.zenith.max()

    if zd_min:
        min_zenith = max(min_zenith, zd_min)

    if zd_max:
        max_zenith = min(max_zenith, zd_max)

    print('Zenith range of the input data:', min_zenith, max_zenith)

    if source_dependent:
        on_data, off_data = split_on_off_source_dependent(data, threshold)
        on_gammas = gammas.query('gamma_prediction >= {}'.format(threshold))
    else:
        on_data, off_data = split_on_off_source_independent(
            data.query('gamma_prediction >= {}'.format(threshold)),
            theta2_cut=theta2_cut,
        )
        on_gammas = gammas.query(
            '(theta_deg <= {}) & (gamma_prediction >= {})'.format(
                theta_cut,
                threshold,
            ))

    query = '(zd_tracking >= {}) and (zd_tracking <= {})'.format(
        min_zenith, max_zenith)
    on_gammas = on_gammas.query(query).copy()

    output_columns.append('theta_deg')
    on_gammas = on_gammas.loc[:, output_columns + ['true_energy']]
    on_data = on_data.loc[:, output_columns + data_columns]
    off_data = off_data.loc[:, output_columns + data_columns]

    off_data['weight'] = 0.2
    on_data['weight'] = 1.0
    on_gammas['weight'] = 1.0

    rpd.to_root(on_data, output_base + '_on.root', key='events')
    rpd.to_root(off_data, output_base + '_off.root', key='events')
    rpd.to_root(on_gammas, output_base + '_mc.root', key='events')

    print('N_on: {}'.format(len(on_data)))
    print('N_off: {}'.format(len(off_data)))
    print('S(Li&Ma): {}'.format(
        li_ma_significance(len(on_data), len(off_data), 0.2)))
    print('N_mc: {}'.format(len(on_gammas)))

    n_excess = len(on_data) - 0.2 * len(off_data)
    fraction = n_excess / len(on_gammas)

    print('N_excess:', n_excess)
    print('Fraction: {:1.4f}'.format(fraction))

    with open(config_template) as f:
        template = f.read()

    t_obs = runs.ontime.sum()

    try:
        corsika = pd.read_hdf(corsika_path, key='table')
    except KeyError:
        f = h5py.File(corsika_path)
        print("given key not in file: possible keys are: {}".format(
            list(f.keys())))
        return

    corsika['zenith'] = np.rad2deg(corsika['zenith'])
    corsika = corsika.query('(zenith >= {}) and (zenith <= {})'.format(
        min_zenith, max_zenith))
    print('Simulated events after zenith cut: {}'.format(len(corsika)))

    config = template.format(
        t_obs=t_obs,
        selection_fraction=gamma_fraction,
        n_gamma=len(corsika),
        source_file_on=output_base + '_on.root',
        source_file_off=output_base + '_off.root',
        source_file_mc=output_base + '_mc.root',
        tree_name='events',
        output_file=output_base + '_result.root',
        fraction=fraction,
        min_zenith=min_zenith,
        max_zenith=max_zenith,
        title=title,
    )

    with open(output_base + '.config', 'w') as f:
        f.write(config)
Example #27
#print name_fields

#name_fields = np.append( name_fields, ['ggFVBF'] )
#name_fields = np.append( name_fields, ['NNScore'] )
#name_fields = np.append( name_fields, ['NNScore'] )

DF_test = pd.DataFrame(np.load(DF_path + 'ResultsTestPD.npy'),
                       columns=name_fields)
DF_train = pd.DataFrame(np.load(DF_path + 'ResultsTrainPD.npy'),
                        columns=name_fields)

#print DF_test
#print DF_test.shape
#print name_fields.shape

rp.to_root(DF_test, 'NNFlatTree_TestSample.root', key='NNFlatTree')

DF_test_VBF = DF_test[DF_test['ggFVBF'] == 1]
DF_test_ggF = DF_test[DF_test['ggFVBF'] == 0]

DF_train_VBF = DF_train[DF_train['ggFVBF'] == 1]
DF_train_ggF = DF_train[DF_train['ggFVBF'] == 0]

rp.to_root(DF_test_VBF, 'NNFlatTree_VBF1000.root', key='NNFlatTree')
rp.to_root(DF_test_ggF, 'NNFlatTree_ggF1000.root', key='NNFlatTree')

### Vectorial Tree from Reader ###
if runForVBFggF:
    VT_name = VT_path + 'VBF_H1000.root'
    DF_VT_VBF1000 = pd.DataFrame(
        root2array(VT_name, 'Nominal', branches=list_branches(VT_name)))
Example #28
def main():
    #get the observables from the MC root files
    if file_type == 'Signal_MU':
        fname = '/disk/lhcb_data/amathad/Lb2Lclnu_analysis/MC/Lb2Lcmunu_MagUp_2016_Combine.root'
        key = 'DecayTree'
        reco_truth_vars = [
            'Lb_True_Q2_mu', 'Lb_True_Costhetal_mu', 'q2_Pred', 'costhl_Pred'
        ]
        extra_sel_vars = [
            'isTruth', 'isFiducial', 'Event_LbProdcorr',
            'Event_TrackCalibcorr', 'Event_PIDCalibEffWeight',
            'Event_L0Muoncorr', 'isFullsel', 'runNumber', 'eventNumber'
        ]
        columns = reco_truth_vars + extra_sel_vars
    elif file_type == 'Signal_MD':
        fname = '/disk/lhcb_data/amathad/Lb2Lclnu_analysis/MC/Lb2Lcmunu_MagDown_2016_Combine.root'
        key = 'DecayTree'
        reco_truth_vars = [
            'Lb_True_Q2_mu', 'Lb_True_Costhetal_mu', 'q2_Pred', 'costhl_Pred'
        ]
        extra_sel_vars = [
            'isTruth', 'isFiducial', 'Event_LbProdcorr',
            'Event_TrackCalibcorr', 'Event_PIDCalibEffWeight',
            'Event_L0Muoncorr', 'isFullsel', 'runNumber', 'eventNumber'
        ]
        columns = reco_truth_vars + extra_sel_vars
    elif file_type == 'Gen':
        fname = '/home/hep/amathad/LbToLclnu_RunTwo/FittingScripts/qsq_cthl_spectra/Differential_density/responsematrix_eff/GeomEffFiles/LcMuNu_gen_new.root'
        key = 'DecayTree'
        columns = ['Lb_True_Costhetal_mu', 'Lb_True_Q2_mu', 'Event_LbProdcorr']

    #get phsp array using the model.import_unbinned_data function (using pathrootfile as pathname input)
    df_phsp_arr = read_root(fname, key=key, columns=columns)

    #import the fit results file for PDF_OLD and make a dictionary
    with open('./MC_fitres.txt') as txt:
        data = txt.readlines()
    print(len(data), data)
    dict_params_pdf_old = {}
    for i in range(len(data)):
        dataline = data[i].split()
        print(dataline)
        if 'loglh' in str(dataline[0]):
            break
        else:
            dict_params_pdf_old[str(dataline[0])] = float(dataline[1])

    print(dict_params_pdf_old)

    #fill with weights
    fill_weights(scenario, df_phsp_arr, dict_params_pdf_old, n_params=n_params)
    print(df_phsp_arr)

    #dump the file to root
    if conservative:
        f_new_name = './model_dependency_rootfiles_conservative/' + fname.split(
            '/')[-1]
    else:
        f_new_name = './model_dependency_rootfiles/' + fname.split('/')[-1]

    f_new_name = f_new_name.replace('.root',
                                    '_' + scenario + '_modeldependency.root')
    print(f_new_name)
    to_root(df_phsp_arr, f_new_name, key=key, store_index=False)
Example #29
    'mmm_p4',
    'jpsiK_p4',
    'pion_p4',
    'jpsipi_p4',
    'jpsi_p4',
    'Bdir_eta',
    'Bdir_phi',
]

for k, v in samples.items():

    for new_column, new_definition in to_define:
        if samples[k].HasColumn(new_column): continue
        samples[k] = samples[k].Define(new_column, new_definition)
    # convert to pandas
    samples[k] = pd.DataFrame(samples[k].AsNumpy(exclude=to_exclude))

    for icolumn in to_cast:
        samples[k][icolumn] = samples[k][icolumn].astype(bool, copy=False)

    print('enrich the data', k)

    for i, label in zip(range(3), ['mu', 'tau', 'bkg']):
        samples[k]['bdt_%s' % label] = model.predict_proba(
            samples[k][features])[:, i]

    to_root(samples[k],
            '%s/BcToXToJpsi_is_%s_enriched.root' % (tree_dir, k),
            key='BTommm',
            store_index=False)
Example #30
df['time'] = pd.to_datetime(df['created_at'])
df['tbench'] = df['field1'].astype(float)
df['hbench'] = df['field2'].astype(float)
df['tchiller'] = df['field3'].astype(float)
df['chillerstatus'] = df['field4'].astype(float)
df['tlab'] = df['field5'].astype(float)
df['hlab'] = df['field6'].astype(float)

df.set_index('time', inplace=True)  #set the index to the date column

#convert time to Rome timezone
#df.index=df.index.tz_localize('GMT')
#df.index=df.index.tz_convert('Europe/Rome')

#select only meaningful data
df = df[df.index >= '2019-05-10']

#convert date to epoch
df['timestamp'] = df.index.astype('int64') / 1000000000

#removes unnecessary columns
df = df.loc[:, 'tbench':'timestamp']

print df.head(5)
print "......."
print df.tail(5)

from root_pandas import to_root

to_root(df, options.output, key='LYBenchTemp')