Beispiel #1
0
def main(options):
    """Run the tag sequence over one year of samples and fill the output trees.

    The sample/variable configuration is read from ``options.config`` and the
    classifier models plus tag boundaries from ``options.mva_config``. Samples
    are loaded through ``ROOTHelpers``, every configured classifier is
    evaluated through ``taggerBase``, and the tagger then decides the tags,
    their priority, and fills the trees.

    Args:
        options: object exposing ``config``, ``mva_config``, ``syst_name``,
            ``dump_weight_systs``, ``data_only``, ``data_as_bkg`` and
            ``reload_samples``.

    Raises:
        IOError: if tree systematics and weight systematics are requested
            together, if systematics are requested together with
            ``data_only``, if more than one year was read in, or if a model
            entry in the MVA config is neither a dict nor a string.
    """
    with open(options.config, 'r') as config_file:
        # safe_load rather than a bare yaml.load: a load without an explicit
        # Loader is deprecated in PyYAML and can construct arbitrary objects.
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    # data not needed yet, but still specified in the config for compatibility with the constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    # Gather every training variable. For a BDT the entry is a flat list; for
    # a DNN (LSTM) it is a dict holding nested 'object_vars' and 'event_vars'.
    proc_to_train_vars = config['train_vars']
    all_train_vars = []
    for train_vars in proc_to_train_vars.values():
        if isinstance(train_vars, dict):
            flat_obj_vars = [
                var for i_object in train_vars['object_vars']
                for var in i_object
            ]
            all_train_vars += flat_obj_vars + train_vars['event_vars']
        else:
            all_train_vars += train_vars

    vars_to_add = config['vars_to_add']

    # a systematic name on the command line means tree systematics are read
    read_syst = options.syst_name is not None

    if read_syst and options.dump_weight_systs:
        raise IOError(
            'Cannot dump weight variations and tree systematics at the same time. Please run separately for each.'
        )
    if options.data_only and (read_syst or options.dump_weight_systs):
        raise IOError('Cannot read Data and apply sysetmatic shifts')

    #Data handling stuff#
    # Apply a loose selection first, else memory requirements are ridiculous.
    # Also note we norm the MC before applying this cut; in data it is applied
    # when reading in. The full ggH selection (mass window and pT/M cuts)
    # cannot be used here since those variables change with systematics.
    loosest_selection = 'dielectronMass > 100'

    # load the mc dataframe for all years. Do not apply any specific preselection to sim samples
    root_obj = ROOTHelpers(output_tag,
                           mc_dir,
                           mc_fnames,
                           data_dir,
                           data_fnames,
                           proc_to_tree_name,
                           all_train_vars,
                           vars_to_add,
                           loosest_selection,
                           read_systs=(read_syst
                                       or options.dump_weight_systs))
    root_obj.no_lumi_scale()
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if options.data_as_bkg:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj,
                               reload_samples=options.reload_samples)
        # overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    else:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
    root_obj.concat()

    # get year of samples for root obj and check we didn't accidentally read in more than 1 year
    if len(root_obj.years) != 1:
        raise IOError(
            'Reading in more than one year at a time! Tagging should be split by year'
        )
    year = list(root_obj.years)[0]
    if ("2016" in year) and (not options.data_only):
        root_obj.scale_sig_partial_2016()  #FIXME: check this is actually called

    # Signal and background are always combined, even when reading
    # systematics, since the DNN set up needs the bkg class in its constructor.
    combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])

    #Tag sequence stuff#
    tag_sequence = ['VBF', 'ggH']  #categories targetted
    true_procs = ['VBF', 'ggH', 'ttH']  #procs to run through cats
    if (not read_syst) and (not options.dump_weight_systs):
        # MC and data could be run together in a stat-only config
        true_procs.append('Data')
    if options.data_only:
        true_procs = ['Data']  #do data on its own (for memory really)

    # create tag object
    tag_obj = taggerBase(tag_sequence,
                         true_procs,
                         combined_df,
                         syst_name=options.syst_name)
    if read_syst:
        tag_obj.relabel_syst_vars()  #not run if reading weight systematics

    # get models and tag boundaries from config
    with open(options.mva_config, 'r') as mva_config_file:
        mva_config = yaml.safe_load(mva_config_file)
    proc_to_model = mva_config['models']
    tag_boundaries = mva_config['boundaries']

    # evaluate MVA scores used in categorisation
    for proc, model in proc_to_model.items():
        # a dict describes a DNN (LSTM) model, a plain string a BDT model
        if isinstance(model, dict):
            object_vars = proc_to_train_vars[proc]['object_vars']
            flat_obj_vars = [
                var for i_object in object_vars for var in i_object
            ]
            event_vars = proc_to_train_vars[proc]['event_vars']

            dnn_loaded = tag_obj.load_dnn(proc, model)
            train_tag = model['architecture'].split('_model')[0]
            tag_obj.eval_lstm(dnn_loaded, train_tag, root_obj, proc,
                              object_vars, flat_obj_vars, event_vars)
        elif isinstance(model, str):
            tag_obj.eval_bdt(proc, model, proc_to_train_vars[proc])
        else:
            raise IOError(
                'Did not get a classifier models in correct format in config'
            )

    del root_obj

    # Data can only be dropped after the MVAs are evaluated, since the LSTM
    # class used in eval_lstm needs some Data in the df for its constructor.
    if read_syst or options.dump_weight_systs:
        not_data = tag_obj.combined_df.proc != 'Data'
        tag_obj.combined_df = tag_obj.combined_df[not_data].copy()  #avoid copy warnings later
    tag_preselection = tag_obj.get_tag_preselection()

    # set up tag boundaries for each process being targeted
    tag_obj.decide_tag(tag_preselection, tag_boundaries)
    tag_obj.decide_priority()
    branch_names = tag_obj.get_tree_names(tag_boundaries, year)
    tag_obj.set_tree_names(tag_boundaries, options.dump_weight_systs, year)
    tag_obj.fill_trees(branch_names, year, print_yields=not read_syst)
    # disabled (struct error?): tag_obj.plot_matrix(branch_names, output_tag) when not read_syst
Beispiel #2
0
def main(options):
    """Tag events with the BDT classifiers and write one ROOT file per tree.

    The sample/variable configuration is read from ``options.config`` and the
    pickled classifier names plus tag boundaries from ``options.bdt_config``.
    Each classifier score is added to the combined dataframe, a tag number is
    assigned per targeted process, VBF is given priority over ggH, and the
    selected events are written to ``output_trees/<tree name>.root``.

    Args:
        options: object exposing ``config``, ``bdt_config``, ``data_as_bkg``
            and ``reload_samples``.
    """
    with open(options.config, 'r') as config_file:
        # safe_load rather than a bare yaml.load: a load without an explicit
        # Loader is deprecated in PyYAML and can construct arbitrary objects.
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    # data not needed yet, but still specified in the config for compatibility with the constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    proc_to_train_vars = config['train_vars']
    all_train_vars = [
        item for sublist in proc_to_train_vars.values() for item in sublist
    ]

    vars_to_add = config['vars_to_add']

    #Data handling stuff#
    # apply loosest selection (ggh) first, else memory requirements are ridiculous.
    # Fine to do this since all cuts are looser than VBF (not removing events with higher priority)
    loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

    # load the mc dataframe for all years. Do not apply any specific preselection
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_train_vars,
                           vars_to_add, loosest_selection)
    root_obj.no_lumi_scale()
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if options.data_as_bkg:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj,
                               reload_samples=options.reload_samples)
    else:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
    root_obj.concat()

    #Tag sequence stuff#
    if options.data_as_bkg:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.data_df])
    else:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])
    del root_obj

    # decide sequence of tags and specify preselection for use with numpy.select.
    # The VBF preselection is the ggH one plus the dijet requirements.
    tag_sequence = ['VBF', 'ggH']
    ggh_presel = (combined_df['dielectronMass'].gt(110)
                  & combined_df['dielectronMass'].lt(150)
                  & combined_df['leadElectronPToM'].gt(0.333)
                  & combined_df['subleadElectronPToM'].gt(0.25))
    proc_to_preselection = {
        'VBF': (ggh_presel
                & combined_df['dijetMass'].gt(350)
                & combined_df['leadJetPt'].gt(40)
                & combined_df['subleadJetPt'].gt(30)),
        'ggH': ggh_presel,
    }

    with open(options.bdt_config, 'r') as bdt_config_file:
        bdt_config = yaml.safe_load(bdt_config_file)
    proc_to_model = bdt_config['models']
    proc_to_tags = bdt_config['boundaries']

    # evaluate MVA scores used in categorisation
    for proc, model in proc_to_model.items():
        print('evaluating classifier: {}'.format(model))
        # NOTE: unpickling executes arbitrary code; only load trusted model files.
        # The context manager closes the handle the original left open.
        with open('models/{}'.format(model), "rb") as model_file:
            clf = pickle.load(model_file)
        train_vars = proc_to_train_vars[proc]
        combined_df[proc + '_bdt'] = clf.predict_proba(
            combined_df[train_vars].values)[:, 1:].ravel()

    # TAG NUMBER #

    # decide on tag
    for proc in tag_sequence:
        presel = proc_to_preselection[proc]
        score = combined_df['{}_bdt'.format(proc)]
        # list(): dict views are not indexable on Python 3.
        # NOTE(review): this relies on the mapping iterating its boundaries
        # from tightest to loosest - confirm against the config/loader.
        tag_bounds = list(proc_to_tags[proc].values())
        tag_masks = []
        for i_bound, bound in enumerate(tag_bounds):
            if i_bound == 0:  #first bound, tag 0
                tag_masks.append(presel & score.gt(bound))
            else:  #intermediate bound: between previous bound and this one
                tag_masks.append(presel
                                 & score.lt(tag_bounds[i_bound - 1])
                                 & score.gt(bound))

        mask_key = list(range(len(tag_bounds)))

        combined_df['{}_analysis_tag'.format(proc)] = np.select(
            tag_masks, mask_key, default=-999)

    # PROC PRIORITY #

    # If two or more tags are satisfied, the final tag is the highest
    # priority one. TODO: make this non hardcoded, i.e. compare the proc in
    # position 1 to all lower priority positions, then the proc in position 2 ...
    vbf_filled = combined_df['VBF_analysis_tag'].ne(-999)
    ggh_filled = combined_df['ggH_analysis_tag'].ne(-999)
    tag_priority_filter = [
        vbf_filled & ggh_filled,  # 1) if both filled...
        vbf_filled & combined_df['ggH_analysis_tag'].eq(-999),  # 2) only VBF filled
        combined_df['VBF_analysis_tag'].eq(-999) & ggh_filled,  # 3) only ggH filled
    ]
    tag_priority_key = [
        'VBF',  #1) take VBF
        'VBF',  #2) take VBF
        'ggH',  #3) take ggH
    ]
    combined_df['priority_tag'] = np.select(
        tag_priority_filter, tag_priority_key,
        default='NOTAG')  # neither tag filled

    # FILL TREES BASED ON BOTH OF ABOVE
    tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
    combined_df['dZ'] = float(0.)
    combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

    # get tree names: one per (true proc, targeted proc, tag number)
    branch_names = {}
    for true_proc in tag_sequence + ['Data']:
        branch_names[true_proc] = []
        for target_proc in tag_sequence:
            for i_tag in range(len(proc_to_tags[target_proc])):
                # compare by value: identity ('is not') on a str literal is
                # only correct by accident of interning
                if true_proc != 'Data':
                    branch_names[true_proc].append(
                        '{}_125_13TeV_{}cat{}'.format(
                            true_proc.lower(), target_proc.lower(), i_tag))
                else:
                    branch_names[true_proc].append(
                        '{}_13TeV_{}cat{}'.format(true_proc,
                                                  target_proc.lower(),
                                                  i_tag))

    debug_vars = [
        'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
    ]
    # FIXME: row-wise apply loops through events; vectorise eventually.
    # assign_tree is defined outside this function.
    combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
    print(combined_df[debug_vars + ['tree_name']])

    if not path.isdir('output_trees/'):
        print('making directory: {}'.format('output_trees/'))
        system('mkdir -p %s' % 'output_trees/')

    # have to save individual trees then hadd procs together on the command line.
    for proc in tag_sequence + ['Data']:
        selected_df = combined_df[combined_df.proc == proc]
        for bn in branch_names[proc]:
            print(bn)
            branch_selected_df = selected_df[selected_df.tree_name == bn]
            print(branch_selected_df[debug_vars + ['tree_name']].head(20))
            root_pandas.to_root(branch_selected_df[tree_vars],
                                'output_trees/{}.root'.format(bn),
                                key=bn)
            print('')
def main(options):
    """Tag events with a ggH BDT and a VBF LSTM and write one ROOT file per tree.

    The sample/variable configuration is read from ``options.config`` and the
    model names plus tag boundaries from ``options.mva_config``. The pickled
    ggH classifier and the keras VBF network are evaluated on the combined
    dataframe, a tag number is assigned per targeted process, VBF is given
    priority over ggH, and the selected events are written to
    ``output_trees/<tree name>.root``.

    Args:
        options: object exposing ``config``, ``mva_config``, ``data_as_bkg``
            and ``reload_samples``.
    """
    with open(options.config, 'r') as config_file:
        # safe_load rather than a bare yaml.load: a load without an explicit
        # Loader is deprecated in PyYAML and can construct arbitrary objects.
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    proc_to_train_vars = config['train_vars']
    object_vars = proc_to_train_vars['VBF']['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = proc_to_train_vars['VBF']['event_vars']

    # used to check all vars we need for categorisation are in our dfs
    all_train_vars = proc_to_train_vars['ggH'] + flat_obj_vars + event_vars

    vars_to_add = config['vars_to_add']

    #Data handling stuff#
    # apply loosest selection (ggh) first, else memory requirements are ridiculous.
    # Fine to do this since all cuts are looser than VBF (not removing events with higher priority)
    loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

    # load the mc dataframe for all years. Do not apply any specific preselection
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_train_vars,
                           vars_to_add, loosest_selection)
    root_obj.no_lumi_scale()
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if options.data_as_bkg:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj,
                               reload_samples=options.reload_samples)
        # overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    else:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
    root_obj.concat()

    #Tag sequence stuff#
    # NOTE: these must be concatted in the same way they are concatted in
    # LSTM.create_X_y(), else predictions are misaligned
    combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])

    # decide sequence of tags and specify preselection for use with numpy.select.
    # The VBF preselection is the ggH one plus the dijet requirements.
    tag_sequence = ['VBF', 'ggH']
    ggh_presel = (combined_df['dielectronMass'].gt(110)
                  & combined_df['dielectronMass'].lt(150)
                  & combined_df['leadElectronPToM'].gt(0.333)
                  & combined_df['subleadElectronPToM'].gt(0.25))
    proc_to_preselection = {
        'VBF': (ggh_presel
                & combined_df['dijetMass'].gt(350)
                & combined_df['leadJetPt'].gt(40)
                & combined_df['subleadJetPt'].gt(30)),
        'ggH': ggh_presel,
    }

    # GET MVA SCORES #

    with open(options.mva_config, 'r') as mva_config_file:
        mva_config = yaml.safe_load(mva_config_file)
    proc_to_model = mva_config['models']
    proc_to_tags = mva_config['boundaries']

    # evaluate ggH BDT scores
    print('evaluating ggH classifier: {}'.format(proc_to_model['ggH']))
    # NOTE: unpickling executes arbitrary code; only load trusted model files.
    # The context manager closes the handle the original left open.
    with open('models/{}'.format(proc_to_model['ggH']), "rb") as model_file:
        clf = pickle.load(model_file)
    train_vars = proc_to_train_vars['ggH']
    combined_df['ggH_mva'] = clf.predict_proba(
        combined_df[train_vars].values)[:, 1:].ravel()

    # Evaluate VBF LSTM: architecture comes from json, weights from a separate file
    print('loading VBF DNN:')
    with open('models/{}'.format(proc_to_model['VBF']['architecture']),
              'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights('models/{}'.format(proc_to_model['VBF']['model']))

    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

    # set up the X matrix (targets are not needed for evaluation).
    # Log variables that have GeV units
    LSTM.var_transform(do_data=False)
    X_tot, _ = LSTM.create_X_y()
    X_tot = X_tot[flat_obj_vars + event_vars]  #filter unused vars
    print(np.isnan(X_tot).any())

    # scale X_vars to mean=0 and std=1. Use scaler fit during previous dnn training
    LSTM.load_X_scaler(out_tag='VBF_DNN')
    X_tot = LSTM.X_scaler.transform(X_tot)

    # make 2D vars for LSTM layers
    X_tot = pd.DataFrame(X_tot, columns=flat_obj_vars + event_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level = LSTM.join_objects(X_tot[flat_obj_vars])

    # predict probs. Rows correspond to the same events only because the dfs
    # are concatenated in the same order inside the LSTM class (see NOTE above)
    combined_df['VBF_mva'] = model.predict(
        [X_tot_high_level, X_tot_low_level], batch_size=1).flatten()

    # TAG NUMBER #

    # decide on tag
    for proc in tag_sequence:
        presel = proc_to_preselection[proc]
        score = combined_df['{}_mva'.format(proc)]
        # list(): dict views are not indexable on Python 3.
        # NOTE(review): this relies on the mapping iterating its boundaries
        # from tightest to loosest - confirm against the config/loader.
        tag_bounds = list(proc_to_tags[proc].values())
        tag_masks = []
        for i_bound, bound in enumerate(tag_bounds):
            if i_bound == 0:  #first bound, tag 0
                tag_masks.append(presel & score.gt(bound))
            else:  #intermediate bound: between previous bound and this one
                tag_masks.append(presel
                                 & score.lt(tag_bounds[i_bound - 1])
                                 & score.gt(bound))

        mask_key = list(range(len(tag_bounds)))

        combined_df['{}_analysis_tag'.format(proc)] = np.select(
            tag_masks, mask_key, default=-999)

    # PROC PRIORITY #

    # If two or more tags are satisfied, the final tag is the highest
    # priority one. TODO: make this non hardcoded, i.e. compare the proc in
    # position 1 to all lower priority positions, then the proc in position 2 ...
    vbf_filled = combined_df['VBF_analysis_tag'].ne(-999)
    ggh_filled = combined_df['ggH_analysis_tag'].ne(-999)
    tag_priority_filter = [
        vbf_filled & ggh_filled,  # 1) if both filled...
        vbf_filled & combined_df['ggH_analysis_tag'].eq(-999),  # 2) only VBF filled
        combined_df['VBF_analysis_tag'].eq(-999) & ggh_filled,  # 3) only ggH filled
    ]
    tag_priority_key = [
        'VBF',  #1) take VBF
        'VBF',  #2) take VBF
        'ggH',  #3) take ggH
    ]
    combined_df['priority_tag'] = np.select(
        tag_priority_filter, tag_priority_key,
        default='NOTAG')  # neither tag filled

    # FILL TREES BASED ON BOTH OF ABOVE
    tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
    combined_df['dZ'] = float(0.)
    combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

    # get tree names: one per (true proc, targeted proc, tag number)
    branch_names = {}
    for true_proc in tag_sequence + ['Data']:
        branch_names[true_proc] = []
        for target_proc in tag_sequence:
            for i_tag in range(len(proc_to_tags[target_proc])):
                # compare by value: identity ('is not') on a str literal is
                # only correct by accident of interning
                if true_proc != 'Data':
                    branch_names[true_proc].append(
                        '{}_125_13TeV_{}cat{}'.format(
                            true_proc.lower(), target_proc.lower(), i_tag))
                else:
                    branch_names[true_proc].append(
                        '{}_13TeV_{}cat{}'.format(true_proc,
                                                  target_proc.lower(),
                                                  i_tag))

    debug_vars = [
        'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
    ]
    # FIXME: row-wise apply loops through events; vectorise eventually.
    # assign_tree is defined outside this function.
    combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
    print(combined_df[debug_vars + ['tree_name']])

    if not path.isdir('output_trees/'):
        print('making directory: {}'.format('output_trees/'))
        system('mkdir -p %s' % 'output_trees/')

    # have to save individual trees then hadd procs together on the command line.
    for proc in tag_sequence + ['Data']:
        selected_df = combined_df[combined_df.proc == proc]
        for bn in branch_names[proc]:
            print(bn)
            branch_selected_df = selected_df[selected_df.tree_name == bn]
            print(branch_selected_df[debug_vars + ['tree_name']].head(20))
            root_pandas.to_root(branch_selected_df[tree_vars],
                                'output_trees/{}.root'.format(bn),
                                key=bn)
            print('')