Example #1
def main(options):

    #take options from the yaml config
    with open(options.config, 'r') as config_file:
        config            = yaml.safe_load(config_file)
        output_tag        = config['output_tag']

        mc_dir            = config['mc_file_dir']
        mc_fnames         = config['mc_file_names']
  
        #data not needed yet; could be used for validation later. Kept for compatibility with the class constructor
        data_dir          = config['data_file_dir']
        data_fnames       = config['data_file_names']

        train_vars        = config['train_vars']
        vars_to_add       = config['vars_to_add']
        presel            = config['preselection']

        proc_to_tree_name = config['proc_to_tree_name']
        colours           = ['#d7191c', '#fdae61', '#f2f229', '#abdda4', '#2b83ba']

                                           #Data handling stuff#

        #load the mc dataframe for all years
        if options.pt_reweight: 
            cr_selection = config['reweight_cr']
            output_tag += '_pt_reweighted'
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames, proc_to_tree_name, train_vars, vars_to_add, cr_selection)
        else: root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames, proc_to_tree_name, train_vars, vars_to_add, presel)

        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
        #for data_obj in root_obj.data_objects:
        #    root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        root_obj.concat()

        if options.pt_reweight and options.reload_samples: 
            for year in root_obj.years:
                root_obj.pt_reweight('DYMC', year, presel)

                                            #Plotter stuff#

        #add model predictions to the signal and background dataframes
        print('loading classifier: {}'.format(options.model))
        clf = pickle.load(open("{}".format(options.model), "rb"))
        sig_df = root_obj.mc_df_sig
        sig_df['bdt_score'] = clf.predict_proba(sig_df[train_vars].values)[:,1:].ravel()
        bkg_df = root_obj.mc_df_bkg
        bkg_df['bdt_score'] = clf.predict_proba(bkg_df[train_vars].values)[:,1:].ravel()
 
        plotter  = Plotter(root_obj, train_vars)
        #for VBF, good set is: [0.30 0.50 0.70 0.80 0.90 1.0]
        #for ggH, good set is: [0.10 0.30 0.45 0.53 0.60 0.8]
        bdt_bins = np.array(options.boundaries)
        Utils.check_dir('{}/plotting/plots/{}_sig_bkg_evo'.format(os.getcwd(), output_tag))
        i_hist = 0

        for var in train_vars+['dielectronMass']:
            fig  = plt.figure(1)
            axes = fig.gca()
            var_bins = np.linspace(plotter.var_to_xrange[var][0], plotter.var_to_xrange[var][1], options.n_bins)
            for ibin in range(len(bdt_bins)-1):
                sig_cut = sig_df[np.logical_and( sig_df['bdt_score'] > bdt_bins[ibin], sig_df['bdt_score'] < bdt_bins[ibin+1])][var]
                weights_cut = sig_df[np.logical_and( sig_df['bdt_score'] > bdt_bins[ibin], sig_df['bdt_score'] < bdt_bins[ibin+1])]['weight']
                axes.hist(sig_cut, bins=var_bins, label='{:.2f} $<$ MVA $<$ {:.2f}'.format(bdt_bins[ibin], bdt_bins[ibin+1]), weights=weights_cut, histtype='step', color=colours[i_hist], normed=True)
                i_hist += 1
            i_hist=0
            annotate_and_save(axes, plotter, var)
            fig.savefig('{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}.pdf'.format(os.getcwd(), output_tag, var))
            print('saving: {0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}.pdf'.format(os.getcwd(), output_tag, var))
            plt.close()

        #plot background (check mass is not being sculpted)
        for var in ['dielectronMass']:
            fig  = plt.figure(1)
            axes = fig.gca()
            var_bins = np.linspace(plotter.var_to_xrange[var][0], plotter.var_to_xrange[var][1], options.n_bins)
            for ibin in range(len(bdt_bins)-1):
                bkg_cut = bkg_df[np.logical_and( bkg_df['bdt_score'] > bdt_bins[ibin], bkg_df['bdt_score'] < bdt_bins[ibin+1])][var]
                bkg_weights_cut = bkg_df[np.logical_and( bkg_df['bdt_score'] > bdt_bins[ibin], bkg_df['bdt_score'] < bdt_bins[ibin+1])]['weight']
                axes.hist(bkg_cut, bins=var_bins, label='{:.2f} $<$ MVA $<$ {:.2f}'.format(bdt_bins[ibin], bdt_bins[ibin+1]), weights=bkg_weights_cut, histtype='step', color=colours[i_hist], normed=True)
                i_hist+=1
            i_hist=0

            annotate_and_save(axes, plotter, var)
            fig.savefig('{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}_bkg.pdf'.format(os.getcwd(), output_tag, var))
            plt.close()
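
This and the following examples read the same style of yaml training config through options.config. A minimal, hypothetical config containing the keys accessed above might look like the sketch below; every path, process name, and variable is a placeholder, and the exact value structure expected by ROOTHelpers is an assumption.

import yaml

# hypothetical training config: the key names come from the config[...] accesses
# above, but every value (and the nesting of the file-name maps) is a guess
example_config_yaml = """
output_tag: 'VBF'
mc_file_dir: '/path/to/mc/'
mc_file_names:
  sig: {'VBF': 'vbf_trees.root'}
  bkg: {'DYMC': 'dy_trees.root'}
data_file_dir: '/path/to/data/'
data_file_names: {'Data': 'data_trees.root'}
proc_to_tree_name: {'VBF': 'vbf_125', 'DYMC': 'DYTree', 'Data': 'Data'}
train_vars: ['leadElectronPToM', 'subleadElectronPToM', 'dielectronCosPhi']
vars_to_add: {}
preselection: 'dielectronMass > 80 and dielectronMass < 150'
reweight_cr: 'dielectronMass > 80 and dielectronMass < 100'
"""
config = yaml.safe_load(example_config_yaml)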
Example #2
def main(options):

    #take options from the yaml config
    with open(options.config, 'r') as config_file:
        config            = yaml.safe_load(config_file)
        output_tag        = config['output_tag']

        mc_dir            = config['mc_file_dir']
        mc_fnames         = config['mc_file_names']
  
        #data not needed yet; could be used for validation later. Kept for compatibility with the class constructor
        data_dir          = config['data_file_dir']
        data_fnames       = config['data_file_names']

        train_vars        = config['train_vars']
        vars_to_add       = config['vars_to_add']
        presel            = config['preselection']

        proc_to_tree_name = config['proc_to_tree_name']

        sig_colour        = 'forestgreen'
        #sig_colour        = 'red'
        bkg_colour        = 'violet'
 
                                           #Data handling stuff#

        #load the mc dataframe for all years
        if options.pt_reweight: 
            cr_selection = config['reweight_cr']
            output_tag += '_pt_reweighted'
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames, proc_to_tree_name, train_vars, vars_to_add, cr_selection)
        else: root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames, proc_to_tree_name, train_vars, vars_to_add, presel)

        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
        root_obj.concat()

        if options.pt_reweight and options.reload_samples: 
            for year in root_obj.years:
                root_obj.pt_reweight('DYMC', year, presel)

                                            #Plotter stuff#
        with open('plotting/var_to_xrange.yaml', 'r') as plot_config_file:
            plot_config        = yaml.safe_load(plot_config_file)
            var_to_xrange      = plot_config['var_to_xrange']

        #get x string replacements from yaml config
        with open('plotting/var_to_xstring.yaml', 'r') as plot_config_file:
            plot_string_cfg    = yaml.safe_load(plot_config_file)
            var_to_xstring     = plot_string_cfg['var_to_xstring']
 
        #set up the plotter
        plotter = Plotter(root_obj, train_vars, sig_col=sig_colour, norm_to_data=True)
        for var in train_vars:
        #for var in ['dielectronCosPhi']:

            fig  = plt.figure(1)
            axes = fig.gca()

            var_sig     = root_obj.mc_df_sig[var].values
            sig_weights = root_obj.mc_df_sig['weight'].values
            var_bkg     = root_obj.mc_df_bkg[var].values
            bkg_weights = root_obj.mc_df_bkg['weight'].values

            bins = np.linspace(var_to_xrange[var][0], var_to_xrange[var][1], 56)

            #add sig mc
            axes.hist(var_sig, bins=bins, label=plotter.sig_labels[0]+r' ($\mathrm{H}\rightarrow\mathrm{ee}$)', weights=sig_weights, histtype='stepfilled', color='red', zorder=10, alpha=0.4, normed=True)
            axes.hist(var_bkg, bins=bins, label='Simulated background', weights=bkg_weights, histtype='stepfilled', color='blue', zorder=0, alpha=0.4, normed=True)

            axes.set_ylabel('Arbitrary Units', ha='right', y=1, size=13)
            current_bottom, current_top = axes.get_ylim()
            axes.set_ylim(bottom=0, top=1.2*current_top)
            axes.set_xlim(left=var_to_xrange[var][0], right=var_to_xrange[var][1])
            axes.legend(bbox_to_anchor=(0.97,0.97), ncol=1)
            plotter.plot_cms_labels(axes, lumi='')
               
            axes.set_xlabel('{}'.format(var_to_xstring[var]), ha='right', x=1, size=13)

            Utils.check_dir('{}/plotting/plots/{}/normed/'.format(os.getcwd(), output_tag))
            fig.savefig('{0}/plotting/plots/{1}/normed/{1}_{2}_normalised.pdf'.format(os.getcwd(), output_tag, var))
            plt.close()
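
Example #2 additionally assumes two small plotting configs, plotting/var_to_xrange.yaml and plotting/var_to_xstring.yaml, each holding one top-level dictionary keyed by variable name. The entries below are illustrative only and show the structure the code expects, not the repository's actual files.

import yaml

# illustrative contents of the two plotting configs; the keys must cover every
# variable that gets plotted (the ranges and labels here are made up)
example_xrange_yaml = """
var_to_xrange:
  dielectronMass: [80, 150]
  dielectronCosPhi: [-1, 1]
"""
example_xstring_yaml = """
var_to_xstring:
  dielectronMass: '$m_{ee}$ [GeV]'
  dielectronCosPhi: '$\\cos(\\Delta\\phi_{ee})$'
"""
var_to_xrange  = yaml.safe_load(example_xrange_yaml)['var_to_xrange']
var_to_xstring = yaml.safe_load(example_xstring_yaml)['var_to_xstring']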
Example #3
def main(options):

    #take options from the yaml config
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        #data not needed yet, but still specified in the config for compatibility with the constructor
        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        train_vars = config['train_vars']
        vars_to_add = config['vars_to_add']
        presel = config['preselection']

        #Data handling stuff#

        #load the mc dataframe for all years
        if options.pt_reweight:
            cr_selection = config['reweight_cr']
            output_tag += '_pt_reweighted'
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                                   data_fnames, proc_to_tree_name, train_vars,
                                   vars_to_add, cr_selection)
        else:
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                                   data_fnames, proc_to_tree_name, train_vars,
                                   vars_to_add, presel)

        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        root_obj.concat()

        #reweight samples in bins of pT (and maybe Njets), for each year separately. Note the targeted selection
        # is applied here and all dfs are re-saved to reduce memory
        if options.pt_reweight and options.reload_samples:
            for year in root_obj.years:
                root_obj.pt_reweight('DYMC', year, presel)
                #root_obj.pt_njet_reweight('DYMC', year, presel)

        #BDT stuff#

        #set up X, w and y, train-test
        bdt_hee = BDTHelpers(root_obj,
                             train_vars,
                             options.train_frac,
                             eq_train=options.eq_train)
        bdt_hee.create_X_and_y(mass_res_reweight=True)
        #bdt_hee.create_X_and_y(mass_res_reweight=False)

        #submit the HP search if option true
        if options.hp_perm is not None:
            if options.opt_hps and options.train_best:
                raise Exception(
                    'Cannot optimise HPs and train best model. Run optimal training after hyper-parameter optimisation'
                )
            elif options.opt_hps and options.hp_perm:
                raise Exception(
                    'opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!'
                )
            else:
                print('About to train + validate on dataset with {}-fold splitting'.format(
                    options.k_folds))
                bdt_hee.set_hyper_parameters(options.hp_perm)
                bdt_hee.set_k_folds(options.k_folds)
                for i_fold in range(options.k_folds):
                    bdt_hee.set_i_fold(i_fold)
                    bdt_hee.train_classifier(root_obj.mc_dir, save=False)
                    bdt_hee.validation_rocs.append(bdt_hee.compute_roc())
                with open('{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag),
                          'a+') as val_roc_file:
                    bdt_hee.compare_rocs(val_roc_file, options.hp_perm)
                    val_roc_file.close()

        elif options.opt_hps:
            #FIXME: add warning that many jobs are about to be submitted
            if options.k_folds < 2:
                raise ValueError('K-folds option must be at least 2')
            if path.isfile('{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag)):
                system('rm {}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag))
                print('deleting: {}/bdt_hp_opt_{}.txt'.format(
                    mc_dir, output_tag))
            bdt_hee.batch_gs_cv(k_folds=options.k_folds,
                                pt_rew=options.pt_reweight)

        elif options.train_best:
            output_tag += '_best'
            with open('{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag),
                      'r') as val_roc_file:
                hp_roc = val_roc_file.readlines()
                best_params = hp_roc[-1].split(';')[0]
                print('Best classifier params are: {}'.format(best_params))
                bdt_hee.set_hyper_parameters(best_params)
                bdt_hee.train_classifier(root_obj.mc_dir,
                                         save=True,
                                         model_name=output_tag)
                bdt_hee.compute_roc()
                bdt_hee.plot_roc(output_tag)
                bdt_hee.plot_output_score(
                    output_tag,
                    ratio_plot=True,
                    norm_to_data=(not options.pt_reweight))

        #else just train BDT with default HPs
        else:
            bdt_hee.train_classifier(root_obj.mc_dir,
                                     save=True,
                                     model_name=output_tag + '_clf')
            bdt_hee.compute_roc()
            bdt_hee.plot_roc(output_tag)
            #bdt_hee.plot_output_score(output_tag, ratio_plot=True, norm_to_data=(not options.pt_reweight))
            bdt_hee.plot_output_score(output_tag,
                                      ratio_plot=True,
                                      norm_to_data=(not options.pt_reweight))
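
The hyper-parameter scan in this example appends one line per tested configuration to {mc_dir}/bdt_hp_opt_{output_tag}.txt through compare_rocs, and the train_best branch then takes whatever precedes the ';' on the last line as the best parameter string. The exact file format is defined inside BDTHelpers and is not shown here; the line below is only a plausible example consistent with that split(';') parsing.

# assumed line format: "<hyper-parameter string>;<validation ROC AUC>"
example_line = 'learning_rate:0.05,max_depth:4,n_estimators:300;0.8123\n'
best_params = example_line.split(';')[0]
print(best_params)  # learning_rate:0.05,max_depth:4,n_estimators:300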
Example #4
def main(options):

    #take options from the yaml config
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        object_vars = config['object_vars']
        flat_obj_vars = [var for i_object in object_vars for var in i_object]
        event_vars = config['event_vars']
        vars_to_add = config['vars_to_add']
        presel = config['preselection']

        #Data handling stuff#

        if options.pt_reweight:
            cr_selection = config['reweight_cr']
            output_tag += '_pt_reweighted'
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                                   data_fnames, proc_to_tree_name,
                                   flat_obj_vars + event_vars, vars_to_add,
                                   cr_selection)
        else:
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                                   data_fnames, proc_to_tree_name,
                                   flat_obj_vars + event_vars, vars_to_add,
                                   presel)

        #load the dataframes for all years
        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
        for data_obj in root_obj.data_objects:  # for plotting
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        root_obj.concat()

        #reweight samples in bins of pT (and maybe Njets), for each year separately.
        if options.pt_reweight and options.reload_samples:
            for year in root_obj.years:
                root_obj.pt_reweight('DYMC', year, presel)

        #LSTM stuff#

        LSTM = LSTM_DNN(root_obj, object_vars, event_vars, options.train_frac,
                        options.eq_weights, options.batch_boost)
        if not options.opt_hps:
            LSTM.var_transform(do_data=True)
            X_tot, y_tot = LSTM.create_X_y(mass_res_reweight=True)
            LSTM.split_X_y(X_tot, y_tot, do_data=True)
            if options.hp_perm is not None:
                LSTM.get_X_scaler(LSTM.all_vars_X_train,
                                  out_tag=output_tag,
                                  save=False)
            else:
                LSTM.get_X_scaler(LSTM.all_vars_X_train, out_tag=output_tag)
            LSTM.X_scale_train_test(do_data=True)
            LSTM.set_low_level_2D_test_train(do_data=True,
                                             ignore_train=options.batch_boost)

        #functions called in subbed job, if options.opt_hps was true
        if options.hp_perm is not None:
            if options.opt_hps and options.train_best:
                raise Exception(
                    'Cannot optimise HPs and train best model. Run optimal training after hyper-parameter optimisation'
                )
            elif options.opt_hps and options.hp_perm:
                raise Exception(
                    'opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!'
                )
            else:
                LSTM.set_hyper_parameters(options.hp_perm)
                LSTM.model.summary()
                LSTM.train_w_batch_boost(out_tag=output_tag, save=False)
                with open('{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag),
                          'a+') as val_roc_file:
                    LSTM.compare_rocs(val_roc_file, options.hp_perm)
                    val_roc_file.close()

        elif options.opt_hps:
            #FIXME: add warning that many jobs are about to be submitted
            if path.isfile('{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag)):
                system('rm {}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag))
                print('deleting: {}/lstm_hp_opt_{}.txt'.format(
                    mc_dir, output_tag))
            LSTM.batch_gs_cv(pt_rew=options.pt_reweight)

        elif options.train_best:
            output_tag += '_best'
            with open('{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag),
                      'r') as val_roc_file:
                hp_roc = val_roc_file.readlines()
                best_params = hp_roc[-1].split(';')[0]
                print('Best classifier params are: {}'.format(best_params))
                LSTM.set_hyper_parameters(best_params)
                LSTM.model.summary()
                LSTM.train_w_batch_boost(out_tag=output_tag)
                #compute final roc on test set
                LSTM.compute_roc(batch_size=1024)
                LSTM.plot_roc(output_tag)
                LSTM.plot_output_score(output_tag,
                                       batch_size=1024,
                                       ratio_plot=True,
                                       norm_to_data=(not options.pt_reweight))

        #else train with basic parameters/architecture
        else:
            LSTM.model.summary()
            if options.batch_boost:  #type of model selection so need validation set
                LSTM.train_w_batch_boost(
                    out_tag=output_tag
                )  #handles creating validation set and 2D vars and sequential saving
            else:
                #LSTM.train_network(epochs=3, batch_size=1024)
                LSTM.train_network(epochs=7, batch_size=32)
                LSTM.save_model(out_tag=output_tag)
            LSTM.compute_roc(batch_size=32)
            #compute final roc on test set
            LSTM.plot_roc(output_tag)
            LSTM.plot_output_score(output_tag,
                                   batch_size=32,
                                   ratio_plot=True,
                                   norm_to_data=(not options.pt_reweight))
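
In this example object_vars is assumed to be a list of per-object variable lists (for instance one inner list per jet or electron fed to the LSTM branch), and the flat_obj_vars comprehension simply flattens it before it is handed to ROOTHelpers together with event_vars. A tiny illustration with made-up variable names:

# hypothetical per-object inputs, one inner list per reconstructed object
object_vars = [
    ['leadJetPt', 'leadJetEta', 'leadJetPhi'],
    ['subleadJetPt', 'subleadJetEta', 'subleadJetPhi'],
]

# same flattening as in main(): list of lists -> one flat list
flat_obj_vars = [var for i_object in object_vars for var in i_object]
print(flat_obj_vars)
# ['leadJetPt', 'leadJetEta', 'leadJetPhi', 'subleadJetPt', 'subleadJetEta', 'subleadJetPhi']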
Example #5
def main(options):

    #take options from the yaml config
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        #data not needed yet; could be used for validation later. Kept for compatibility with the class constructor
        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        train_vars = config['train_vars']
        vars_to_add = config['vars_to_add']
        presel = config['preselection']

        proc_to_tree_name = config['proc_to_tree_name']

        sig_colour = 'forestgreen'
        #sig_colour        = 'red'

        #Data handling stuff#

        #load the mc dataframe for all years
        if options.pt_reweight:
            cr_selection = config['reweight_cr']
            output_tag += '_pt_reweighted'
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                                   data_fnames, proc_to_tree_name, train_vars,
                                   vars_to_add, cr_selection)
        else:
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                                   data_fnames, proc_to_tree_name, train_vars,
                                   vars_to_add, presel)

        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        root_obj.concat()

        if options.pt_reweight and options.reload_samples:
            for year in root_obj.years:
                root_obj.pt_reweight('DYMC', year, presel)

        #Plotter stuff#

        #set up the plotter
        plotter = Plotter(root_obj,
                          train_vars,
                          sig_col=sig_colour,
                          norm_to_data=True)
        for var in train_vars + ['dielectronMass', 'dielectronPt']:
            plotter.plot_input(var,
                               options.n_bins,
                               output_tag,
                               options.ratio_plot,
                               norm_to_data=(not options.pt_reweight))
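
Each main(options) above is presumably driven by a small command-line entry point that is not part of these snippets. Below is a sketch for the plotting script in this last example, with assumed flag names that only mirror the attributes it reads (the other examples would add flags such as --model, --boundaries, --train_frac, --eq_train and so on).

from argparse import ArgumentParser

if __name__ == '__main__':
    # flag names and defaults are assumptions; only the attribute names
    # (config, n_bins, ratio_plot, reload_samples, pt_reweight) come from main()
    parser = ArgumentParser()
    parser.add_argument('-c', '--config', required=True, help='yaml training config')
    parser.add_argument('-n', '--n_bins', type=int, default=56)
    parser.add_argument('-R', '--ratio_plot', action='store_true')
    parser.add_argument('-r', '--reload_samples', action='store_true')
    parser.add_argument('-P', '--pt_reweight', action='store_true')
    main(parser.parse_args())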