# Example 1: LSTM-DNN score evaluation and category optimisation
def main(options):
    """Evaluate a pre-trained LSTM DNN on signal/background samples and
    optimise the number of DNN-score categories by maximising the AMS.

    Args:
        options: parsed command-line options. Must provide the attributes
            `config` (path to a yaml config), `reload_samples` (bool),
            `data_as_bkg` (bool: use data instead of MC as background),
            `cut_based_str` (selection string; non-empty switches to
            cut-based AMS evaluation), `model_architecture` (json path),
            `model` (weights path) and `n_iters` (optimiser iterations).

    Side effects: loads/caches samples via ROOTHelpers, prints results and
    saves an nCat-vs-AMS plot via Plotter.
    """

    #take options from the yaml config
    with open(options.config, 'r') as config_file:
        #safe_load: the config is plain data; avoid constructing arbitrary
        #python objects from yaml tags (yaml.load without a Loader is unsafe)
        config            = yaml.safe_load(config_file)
        output_tag        = config['output_tag']

        mc_dir            = config['mc_file_dir']
        mc_fnames         = config['mc_file_names']

        data_dir          = config['data_file_dir']
        data_fnames       = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        object_vars       = config['object_vars']
        #flatten the per-object variable lists into a single list of branches
        flat_obj_vars     = [var for i_object in object_vars for var in i_object]
        event_vars        = config['event_vars']
        vars_to_add       = config['vars_to_add']
        presel            = config['preselection']

                                           #Data handling stuff#

        #load the mc dataframe for all years
        root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames, proc_to_tree_name, flat_obj_vars+event_vars, vars_to_add, presel)

        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        if not options.data_as_bkg:
            for bkg_obj in root_obj.bkg_objects:
                root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
        else:
            for data_obj in root_obj.data_objects:
                root_obj.load_data(data_obj, reload_samples=options.reload_samples)
            #overwrite background attribute, for compat with DNN class
            root_obj.mc_df_bkg = root_obj.data_df
        root_obj.concat()

        #apply cut-based selection if not optimising BDT score (pred probs still evaluated for compatability w exisiting catOpt constructor).
        if len(options.cut_based_str)>0:
            root_obj.apply_more_cuts(options.cut_based_str)

                                           # DNN evaluation stuff #

        #load architecture and model weights
        print('loading DNN: {}'.format(options.model_architecture))
        with open('{}'.format(options.model_architecture), 'r') as model_json:
            model_architecture = model_json.read()
        model = keras.models.model_from_json(model_architecture)
        model.load_weights('{}'.format(options.model))

        LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

        # set up X and y Matrices. Log variables that have GeV units
        LSTM.var_transform(do_data=False) #bkg=data here. This option is for plotting purposes
        X_tot, y_tot     = LSTM.create_X_y()
        X_tot            = X_tot[flat_obj_vars+event_vars] #filter unused vars

        #scale X_vars to mean=0 and std=1. Use scaler fit during previous dnn training
        LSTM.load_X_scaler(out_tag=output_tag)
        X_tot            = LSTM.X_scaler.transform(X_tot)

        #make 2D vars for LSTM layers
        X_tot            = pd.DataFrame(X_tot, columns=flat_obj_vars+event_vars)
        X_tot_high_level = X_tot[event_vars].values
        X_tot_low_level  = LSTM.join_objects(X_tot[flat_obj_vars])

        #predict probs
        pred_prob_tot    = model.predict([X_tot_high_level, X_tot_low_level], batch_size=1024).flatten()

        sig_weights   = root_obj.mc_df_sig['weight'].values
        sig_m_ee      = root_obj.mc_df_sig['dielectronMass'].values
        pred_prob_sig = pred_prob_tot[y_tot==1]

        #FIX: read the background from mc_df_bkg, which is populated in both
        #branches above (it aliases data_df when options.data_as_bkg is set).
        #Reading data_df unconditionally raised AttributeError for MC bkg.
        bkg_weights   = root_obj.mc_df_bkg['weight'].values
        bkg_m_ee      = root_obj.mc_df_bkg['dielectronMass'].values
        pred_prob_bkg = pred_prob_tot[y_tot==0]

                                             #category optimisation stuff#

        #set up optimiser ranges and no. categories to test if non-cut based
        ranges    = [ [0.3,1.] ]
        names     = ['{} score'.format(output_tag)] #arbitrary
        print_str = ''
        cats = [1,2,3,4]
        AMS  = []

        #just to use class methods here
        if len(options.cut_based_str)>0:
            #zero categories: CatOptim only used for its cut-based AMS method
            optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig], bkg_weights, bkg_m_ee, [pred_prob_bkg], 0, ranges, names)
            AMS = optimiser.cutBasedAMS()
            print('String for cut based optimimastion: {}'.format(options.cut_based_str))
            print('Cut-based optimimsation gives AMS = {:1.8f}'.format(AMS))

        else:
            for n_cats in cats:
                optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig], bkg_weights, bkg_m_ee, [pred_prob_bkg], n_cats, ranges, names)
                optimiser.optimise(1, options.n_iters) #set lumi to 1 as already scaled when loading in
                print_str += 'Results for {} categories : \n'.format(n_cats)
                print_str += optimiser.getPrintableResult()
                AMS.append(optimiser.bests.totSignif)
            print('\n {}'.format(print_str))

        #make nCat vs AMS plots
        Plotter.cats_vs_ams(cats, AMS, output_tag)
# Example 2: BDT score evaluation and category optimisation
def main(options):
    """Evaluate a pickled classifier (BDT) on signal/background samples and
    optimise the number of score categories by maximising the AMS.

    Args:
        options: parsed command-line options. Must provide the attributes
            `config` (path to a yaml config), `reload_samples` (bool),
            `data_as_bkg` (bool: use data instead of MC as background),
            `cut_based_str` (selection string; non-empty switches to
            cut-based AMS evaluation), `model` (pickled classifier path)
            and `n_iters` (optimiser iterations).

    Side effects: loads/caches samples via ROOTHelpers, prints results and
    saves an nCat-vs-AMS plot via Plotter.
    """

    #take options from the yaml config
    with open(options.config, 'r') as config_file:
        #safe_load: the config is plain data; avoid constructing arbitrary
        #python objects from yaml tags (yaml.load without a Loader is unsafe)
        config = yaml.safe_load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        train_vars = config['train_vars']
        vars_to_add = config['vars_to_add']
        presel = config['preselection']

        #load the mc dataframe for all years
        root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                               data_fnames, proc_to_tree_name, train_vars,
                               vars_to_add, presel)

        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        if not options.data_as_bkg:
            for bkg_obj in root_obj.bkg_objects:
                root_obj.load_mc(bkg_obj,
                                 bkg=True,
                                 reload_samples=options.reload_samples)
        else:
            for data_obj in root_obj.data_objects:
                root_obj.load_data(data_obj,
                                   reload_samples=options.reload_samples)
        root_obj.concat()

        print('loading classifier: {}'.format(options.model))
        #SECURITY NOTE(review): pickle.load executes arbitrary code from the
        #model file -- only load model files from trusted sources
        with open('{}'.format(options.model), 'rb') as model_file:
            clf = pickle.load(model_file)

        #apply cut-based selection if not optimising BDT score (pred probs still evaluated for compatability w exisiting constructor).
        if len(options.cut_based_str) > 0:
            root_obj.apply_more_cuts(options.cut_based_str)

        #signal probabilities: column 1 of predict_proba, flattened
        sig_weights = root_obj.mc_df_sig['weight'].values
        sig_m_ee = root_obj.mc_df_sig['dielectronMass'].values
        pred_prob_sig = clf.predict_proba(
            root_obj.mc_df_sig[train_vars].values)[:, 1:].ravel()

        if options.data_as_bkg:
            bkg_weights = root_obj.data_df['weight'].values
            bkg_m_ee = root_obj.data_df['dielectronMass'].values
            pred_prob_bkg = clf.predict_proba(
                root_obj.data_df[train_vars].values)[:, 1:].ravel()

        else:
            bkg_weights = root_obj.mc_df_bkg['weight'].values
            bkg_m_ee = root_obj.mc_df_bkg['dielectronMass'].values
            pred_prob_bkg = clf.predict_proba(
                root_obj.mc_df_bkg[train_vars].values)[:, 1:].ravel()

        #set up optimiser ranges and no. categories to test if non-cut based
        ranges = [[0.15, 1.]]
        names = ['{} score'.format(output_tag)]  #arbitrary
        print_str = ''
        cats = [1, 2, 3, 4, 5]
        AMS = []

        #just to use class methods here
        if len(options.cut_based_str) > 0:
            #zero categories: CatOptim only used for its cut-based AMS method
            optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                                 bkg_weights, bkg_m_ee, [pred_prob_bkg], 0,
                                 ranges, names)
            AMS = optimiser.cutBasedAMS()
            print('String for cut based optimimastion: {}'.format(
                options.cut_based_str))
            print('Cut-based optimimsation gives AMS = {:1.8f}'.format(AMS))

        else:
            for n_cats in cats:
                optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                                     bkg_weights, bkg_m_ee, [pred_prob_bkg],
                                     n_cats, ranges, names)
                optimiser.optimise(
                    1, options.n_iters
                )  #set lumi to 1 as already scaled when loading in
                print_str += 'Results for {} categories : \n'.format(n_cats)
                print_str += optimiser.getPrintableResult()
                AMS.append(optimiser.bests.totSignif)
            print('\n {}'.format(print_str))

        #make nCat vs AMS plots
        Plotter.cats_vs_ams(cats, AMS, output_tag)