Example #1
def punzi_target(priors, relevant_classes, params, mode = "S"):
    zzroot = os.environ["CMSSW_BASE"]
    bin_dir = os.path.join(zzroot, "bin/slc6_amd64_gcc630/")
    cost_function_evaluator = "run_prior_evaluator"
    
    output = check_output([bin_dir + cost_function_evaluator, run_dir, out_dir, engine,
                           str(params["min_iterations"]), str(params["max_iterations"]),
                           str(priors["ggh_prior"]), str(priors["whhadr_prior"]),
                           str(priors["zhhadr_prior"]), str(priors["whlept_prior"]),
                           str(priors["zhlept_prior"]), str(priors["zhmet_prior"]),
                           str(priors["tthhadr_prior"]), str(priors["tthlept_prior"]),
                           str(priors["bkg_prior"]), str(priors["qq_prior"]),
                           mode, ref_dir])

    if mode == "S":
        punzi_file = "Mor18_punzi_S_comp.conf"
    elif mode == "SB":
        punzi_file = "Mor18_punzi_comp.conf"
    else:
        raise ValueError("unsupported mode: " + str(mode))

    # read directly the configuration file containing the relative Punzi improvements w.r.t. the reference 
    # (the one with flat priors)
    punzihandler = ConfigFileHandler()
    punzihandler.load_configuration(os.path.join(out_dir, punzi_file))
    
    costval = 0.0
         
    # use the weighted cost function
    delta_pi = []
    for relevant_class in relevant_classes:
        delta_pi.append(float(punzihandler.get_field('Punzi', relevant_class)) - 1.0)
    
    costval = cost_func(delta_pi, 8.0, 2)

    if math.isnan(costval):
        print "caught NaN!"
        costval = -7.0
 
    return costval
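
The weighted cost function cost_func(delta_pi, 8.0, 2) is not part of this excerpt. Purely as an illustration of what a weighted cost with that call signature could look like, here is a minimal sketch (an assumption, not the project's actual implementation):

def cost_func(delta_pi, kappa, order):
    # hypothetical sketch: reward positive relative Punzi improvements linearly and
    # penalize degradations with weight kappa and power 'order' (assumed semantics)
    cost = 0.0
    for delta in delta_pi:
        if delta >= 0.0:
            cost += delta
        else:
            cost -= kappa * abs(delta) ** order
    return cost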
Example #2
def save_params(out_path, params, evalcnt):
    confhandler = ConfigFileHandler()
    
    if os.path.exists(out_path):
        confhandler.load_configuration(out_path)
    
    section_name = 'evaluation_' + str(evalcnt)
    confhandler.new_section(section_name)
    
    for key, value in params.iteritems():
        confhandler.set_field(section_name, key, str(value))
     
    confhandler.save_configuration(out_path)
def main():
    if len(sys.argv) != 4:
        print "Error: exactly 3 arguments are required"
        sys.exit(1)

    config_file_in = sys.argv[1]
    config_file_out = sys.argv[2]
    masspoint = float(sys.argv[3])

    confhandler = ConfigFileHandler()
    confhandler.load_configuration(config_file_in)
    confhandler.set_field('global', 'mass_point', str(masspoint))
    confhandler.save_configuration(config_file_out)
def punzi_target(priors, relevant_classes, params):
    bin_dir = "/home/llr/cms/wind/cmssw/CMSSW_9_4_2/bin/slc6_amd64_gcc630/"
    cost_function_evaluator = "run_prior_evaluator"

    output = check_output([
        bin_dir + cost_function_evaluator, run_dir, out_dir, engine,
        str(params["min_iterations"]),
        str(params["max_iterations"]),
        str(priors["ggh_prior"]),
        str(priors["whhadr_prior"]),
        str(priors["zhhadr_prior"]),
        str(priors["whlept_prior"]),
        str(priors["zhlept_prior"]),
        str(priors["zhmet_prior"]),
        str(priors["tthhadr_prior"]),
        str(priors["tthlept_prior"])
    ])

    # read directly the configuration file containing the relative Punzi improvements w.r.t. the reference
    # (the one with flat priors)
    punzihandler = ConfigFileHandler()
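    # NOTE: punzi_file is not defined anywhere in this excerpt; presumably it is
    # selected from the requested mode, as done explicitly in Example #1 above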
    punzihandler.load_configuration(os.path.join(out_dir, punzi_file))

    costval = 0.0

    # use the weighted cost function
    delta_pi = []
    for relevant_class in relevant_classes:
        delta_pi.append(
            float(punzihandler.get_field('Punzi', relevant_class)) - 1.0)

    costval = cost_func(delta_pi, 8.0, 2)

    if math.isnan(costval):
        print "caught NaN!"
        costval = -7.0

    return costval
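
ConfigFileHandler itself is defined elsewhere in the project. Judging from the calls made throughout these examples, it behaves like a thin wrapper around Python's ConfigParser; the following stand-in reproduces that interface purely for illustration and is not the original class:

try:
    import configparser                        # Python 3
except ImportError:
    import ConfigParser as configparser        # Python 2

class ConfigFileHandler(object):
    # illustrative stand-in only; the method names mirror the usage in these
    # examples, the implementation is an assumption
    def __init__(self):
        self.config = configparser.ConfigParser()
        # callers occasionally set self.config.optionxform = str to preserve key case

    def load_configuration(self, path):
        self.config.read(path)

    def save_configuration(self, path):
        with open(path, "w") as outfile:
            self.config.write(outfile)

    def new_section(self, name):
        self.config.add_section(name)

    def set_field(self, section, key, value):
        self.config.set(section, key, value)

    def get_field(self, section, key):
        return self.config.get(section, key)

    def get_sections(self):
        return self.config.sections()

    def get_section(self, section):
        return dict(self.config.items(section))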
Example #5
def make_input_plot(input_file):
    confhandler = ConfigFileHandler()
    confhandler.load_configuration(input_file)
    models = confhandler.get_sections()

    df = pd.DataFrame()

    for model in models:
        cur_sect = confhandler.get_section(model)

        used_nonperiodic_vars = filter(
            None,
            ConfigFileUtils.parse_list(cur_sect["nonperiodic_columns"],
                                       lambda x: x))
        used_periodic_vars = filter(
            None,
            ConfigFileUtils.parse_list(cur_sect["periodic_columns"],
                                       lambda x: x))

        used_vars = used_nonperiodic_vars + used_periodic_vars
        var_dict = {col: [1.0] for col in used_vars}
        var_dict["model"] = model

        row_df = pd.DataFrame.from_dict(var_dict)

        df = pd.concat([df, row_df], axis=0)

    df = df.fillna(0.0)

    datacols = [col for col in df.columns if col != "model"]
    plot_data = df[datacols].as_matrix()

    y_label = [convert_variable_name(name) for name in np.array(datacols)]
    x_label = [convert_model_label(label) for label in df["model"].as_matrix()]

    fig = plt.figure(figsize=(12, 10))

    ax = fig.add_subplot(111)

    cax = ax.matshow(plot_data.transpose(), cmap='Blues', vmin=0, vmax=1)
    ax.set_xticklabels(np.concatenate([[''], x_label]),
                       rotation='vertical',
                       fontsize=11)
    ax.set_yticklabels(np.concatenate([[''], y_label]), fontsize=10)
    ax.xaxis.set_label_position("top")
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.tight_layout()

    return fig
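
ConfigFileUtils.parse_list (and the parse_dict / is_dict helpers used in later examples) is also project code that is not shown here. A hedged stand-in, assuming simple comma-separated and "key: value" string formats, could look like this:

class ConfigFileUtils(object):
    # illustrative stand-in; the exact string formats accepted by the real
    # helpers are assumptions

    @staticmethod
    def parse_list(raw, converter):
        # "a, b, c" -> [converter("a"), converter("b"), converter("c")]
        return [converter(entry.strip()) for entry in raw.split(",")]

    @staticmethod
    def is_dict(raw):
        # heuristic: dictionary-valued fields contain "key: value" pairs
        return ":" in raw

    @staticmethod
    def parse_dict(raw, converter):
        # "a: 1, b: 2" -> {"a": converter("1"), "b": converter("2")}
        retval = {}
        for entry in raw.split(","):
            key, _, value = entry.partition(":")
            retval[key.strip()] = converter(value.strip())
        return retval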
Example #6
def load_file(path, keys):
    confhandler = ConfigFileHandler()
    confhandler.load_configuration(path)
    
    retval = {}
    
    for section_name in confhandler.get_sections():
        cur_section = confhandler.get_section(section_name)
        
        for key in keys:
            if key not in retval:
                retval[key] = []
                
            retval[key].append(float(cur_section[key]))
            
    return retval
def get_loss(run, mcoll, model):
    confhandler = ConfigFileHandler()
    confhandler.load_configuration(
        os.path.join(run, "training", mcoll, "model_benchmark.txt"))
    return float(confhandler.get_field(model, 'val_loss'))
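
For illustration, load_file can be used to pull per-evaluation values out of a file written by the working-point evaluation loop shown in a later example; the file name and keys below are assumptions:

# hypothetical usage of load_file
history = load_file("evaluations.txt", ["cost", "WP_VBF2j"])
print(len(history["cost"]))   # one entry per section, in file order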
Example #8
def main():

    if len(sys.argv) != 2:
        print "Error: exactly 1 argument is required"
        sys.exit(1)

    campaign_dir = sys.argv[1]

    bin_dir = {"Untagged": 0, "VBF1j": 1, "VBF2j": 2, "VHhadr": 3}

    df = pd.DataFrame()

    for subdir in next(os.walk(campaign_dir))[1]:
        if "statistics" not in subdir:
            punzi_path = campaign_dir + subdir + "/comp/Mor18_punzi_comp.conf"
            settings_path = campaign_dir + subdir + "/settings.conf"

            # first, read back the configuration file for this run
            conf = ModelCollectionConfigFileHandler()
            conf.LoadConfiguration(settings_path)

            # now select a typical model and read its hyperparameters
            typical_model = conf._get_model_list(
                conf._get_model_collection_list()[0])[0]
            hyperparams = conf.GetHyperparameters(typical_model)
            hyperparam_dict = {
                key: [val]
                for key, val in hyperparams.iteritems()
            }

            # then read in the results in terms of relative Punzi improvement for each category
            conf = ConfigFileHandler()
            conf.LoadConfiguration(punzi_path)

            # load the Punzi values for each category
            values = {}
            for category, bin_number in bin_dir.iteritems():
                values[category] = float(conf._get_field("Punzi", category))

            # merge the two dictionaries
            values.update(hyperparam_dict)
            df = df.append(pd.DataFrame.from_dict(values))

    # all different values of the number of neurons that were used in the sweep
    number_neurons = set(df['number_neurons'])

    statistics_dir = campaign_dir + "statistics/"
    if not os.path.exists(statistics_dir):
        os.makedirs(statistics_dir)

    # sort the pandas dataframe ascending by number of hidden layers
    df = df.sort_values("number_layers")

    for num in number_neurons:
        number_layers = df.loc[df["number_neurons"] == num,
                               ["number_layers"]].as_matrix().flatten().astype(
                                   int)
        punzi_data = df.loc[df["number_neurons"] == num,
                            bin_dir.keys()].as_matrix()

        plt.figure()
        plt.imshow(punzi_data,
                   interpolation='none',
                   cmap='RdYlGn',
                   aspect=0.6,
                   vmin=0.8,
                   vmax=1.2)
        plt.xticks(range(len(bin_dir)), bin_dir.keys())
        plt.yticks(range(len(number_layers)), number_layers)
        plt.ylabel("number hidden layers")
        plt.colorbar()
        plt.title("Punzi purity ratio [" + str(int(num)) + " hidden neurons]")
        plt.savefig(statistics_dir + "punzi_" + str(int(num)) +
                    "_hidden_neurons.pdf")
Example #9
def save_priors(out_path, priors):
    # combine all the results into the final prior and save it again
    confhandler = ConfigFileHandler()
    confhandler.config.optionxform = str
    confhandler.new_section('Priors')
    confhandler.set_field('Priors', 'VBF_prior', str(1.0))
    confhandler.set_field('Priors', 'ggH_prior', str(priors["ggh_prior"]))
    confhandler.set_field('Priors', 'ttHlept_prior', str(priors["tthlept_prior"]))
    confhandler.set_field('Priors', 'ttHhadr_prior', str(priors["tthhadr_prior"]))
    confhandler.set_field('Priors', 'ZHlept_prior', str(priors["zhlept_prior"]))
    confhandler.set_field('Priors', 'WHlept_prior', str(priors["whlept_prior"]))
    confhandler.set_field('Priors', 'ZHhadr_prior', str(priors["zhhadr_prior"]))
    confhandler.set_field('Priors', 'WHhadr_prior', str(priors["whhadr_prior"]))
    confhandler.set_field('Priors', 'ZHMET_prior', str(priors["zhmet_prior"]))    
    confhandler.set_field('Priors', "ZX_prior", str(priors["bkg_prior"]))
    confhandler.set_field('Priors', "qq_prior", str(priors["qq_prior"]))
    confhandler.save_configuration(out_path)
def main():
    
    if len(sys.argv) < 3:
        print "Error: at least 2 arguments are required"
        sys.exit(1)

    campaign_dir = sys.argv[1]
    workdir = sys.argv[2]

    if len(sys.argv) >= 4:
        input_config_file = sys.argv[3]
    else:
        input_config_file = None

    # make sure that the given directory ends with a /
    if not campaign_dir.endswith('/'):
        campaign_dir += "/"
        
    confhandler = ConfigFileHandler()
    confhandler.load_configuration(campaign_dir + "campaign.conf")
    
    iterables = {}
    
    for section in confhandler.get_sections():
        if '!' in section:
            sweep_name = re.sub('!', '', section)
            sweep_sections = ConfigFileUtils.parse_list(confhandler.get_field(section, 'variables'), lambda x: x)

            # now look for the sweep variables that belong to this sweep
            for sweep_section in sweep_sections:
                # this is a section that determines a new sweep direction, possibly linked
                sweep_metadata = confhandler.get_field(sweep_section, 'variable').split(':')
                sweep_scope = sweep_metadata[0]
                sweep_parameter = sweep_metadata[1]

                # request more information
                sweep_behaviour = confhandler.get_field(sweep_section, 'behaviour')

                if ConfigFileUtils.is_dict(confhandler.get_field(sweep_section, 'start')):
                    # will need a dictionary iterable
                    start_dict = ConfigFileUtils.parse_dict(confhandler.get_field(sweep_section, 'start'), lambda x: float(x))
                    end_dict = ConfigFileUtils.parse_dict(confhandler.get_field(sweep_section, 'end'), lambda x: float(x))
                    step_dict = ConfigFileUtils.parse_dict(confhandler.get_field(sweep_section, 'step'), lambda x: float(x))

                    if sweep_name not in iterables:
                        it = SweepDimensionDict(sweep_scope, sweep_parameter, start_dict, end_dict, step_dict, sweep_behaviour)
                        iterables[sweep_name] = it
                    else:
                        iterables[sweep_name].add(sweep_scope, sweep_parameter, start_dict, end_dict, step_dict, sweep_behaviour)
                else:
                    # construct a list iterable instead
                    start_list = ConfigFileUtils.parse_list(confhandler.get_field(sweep_section, 'start'), lambda x: x)    
                    end_list = ConfigFileUtils.parse_list(confhandler.get_field(sweep_section, 'end'), lambda x: x)

                    if sweep_name not in iterables:
                        it = SweepDimensionList(sweep_scope, sweep_parameter, start_list, end_list, sweep_behaviour)
                        iterables[sweep_name] = it
                    else:
                        iterables[sweep_name].add(sweep_scope, sweep_parameter, start_list, end_list, sweep_behaviour)

    MC_path = os.path.join(workdir, "trainval/")
    model_type = confhandler.get_field('global', 'model_type')

    # get the mass point from the global config file in a way that ensures backward compatibility
    try:
        mass_point = float(confhandler.get_field('global', 'mass_point'))
    except KeyError:
        mass_point = 125.0

    if model_type == 'SimpleModel':
        # using the full mass range for training, not using the 118/130GeV cut
        mcoll = SimpleModelFactoryDynamic.GenerateSimpleModelCollections(MC_path, input_config_file = input_config_file, hyperparam_config_file = None, mass_point = mass_point)
    elif model_type == 'CombinedModel':
        mcoll = ModelFactoryFullCategorySetOptimizedInputs.GenerateCombinedModelCollections(MC_path)
        
    iterate(iterables, {}, lambda it: augment_config(mcoll, campaign_dir, it))
def main():

    if len(sys.argv) != 2:
        print "Error: exactly 1 argument is required"
        sys.exit(1)

    campaign_dir = sys.argv[1]

    bin_dir = {"Untagged": 0,
               "VBF1j": 1,
               "VBF2j": 2,
               "VHhadr": 3}

    def format_parameter_list(inlist):
        outstring = ""
        linewidth = 0
    
        for parameter in inlist:
            if "D_" in parameter:
                newstring = "MELA, "
            else:
                newstring = parameter + ", "
            
            outstring += newstring
            linewidth += len(newstring)
        
            if linewidth > 20:
                outstring += "\n"
                linewidth = 0
            
        return outstring[:-2]

    df = pd.DataFrame()

    for subdir in next(os.walk(campaign_dir))[1]:
        if "statistics" not in subdir:
            values = {}

            punzi_path = campaign_dir + subdir + "/comp/Mor18_punzi_comp.conf"
            settings_path = campaign_dir + subdir + "/settings.conf"
    
            # first, read back the configuration file for this run
            conf = ModelCollectionConfigFileHandler()
            conf.LoadConfiguration(settings_path)
    
            # now select a typical model and read its hyperparameters
            typical_model = conf._get_model_list(conf._get_model_collection_list()[0])[0]
            hyperparams = conf.GetHyperparameters(typical_model)
            hyperparam_dict = {key: [val] for key, val in hyperparams.iteritems()}
        
            # also read the list of input parameters that have been fed into the network
            param_list = conf.GetInputParameterList(typical_model)
            values['input_columns'] = [format_parameter_list(param_list)]
            values['number_inputs'] = len(param_list)
            
            # then read in the results in terms of relative Punzi improvement for each category
            conf = ConfigFileHandler()
            conf.LoadConfiguration(punzi_path)
    
            # load the Punzi values for each category
            for category, bin_number in bin_dir.iteritems():
                values[category] = float(conf._get_field("Punzi", category))
        
            # merge the two dictionaries
            values.update(hyperparam_dict)
            df = df.append(pd.DataFrame.from_dict(values))

    statistics_dir = campaign_dir + "statistics/"
    if not os.path.exists(statistics_dir):
        os.makedirs(statistics_dir)

    punzi_data = df[bin_dir.keys()].as_matrix()
    punzi_data = np.transpose(punzi_data)

    inparam_labels = df['input_columns'].as_matrix()

    plt.figure(figsize = (8, 9))
    plt.imshow(punzi_data, interpolation = 'none', cmap = 'RdYlGn', aspect = 0.6, vmin = 0.8, vmax = 1.2)
    plt.colorbar()
    plt.yticks(range(len(bin_dir)), bin_dir.keys())
    plt.xticks(range(len(df)), inparam_labels, rotation = 'vertical')
    plt.title("Punzi purity ratio")
    #plt.tight_layout()
    plt.savefig(statistics_dir + "punzi.pdf", bbox_inches = 'tight')
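
The sweep-parsing loop in the first main() of this example expects campaign.conf sections whose names contain '!' to list their sweep sections in a 'variables' field, and each sweep section to provide 'variable' (as scope:parameter), 'behaviour', 'start', 'end' and, for the dictionary form, 'step'. A purely illustrative excerpt of such a file (all concrete values and section names are assumptions, only the keys are taken from the code):

[global]
model_type = SimpleModel
mass_point = 125.0

[!hyperparameters]
variables = layers_sweep

[layers_sweep]
variable = global:number_layers
behaviour = linear
start = 1
end = 4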
Example #12
def main():
    global evalcnt

    if len(sys.argv) != 4:
        print "Error: exactly 3 arguments are required"
        sys.exit(1)

    ref_dir = sys.argv[1]
    out_dir = sys.argv[2]
    lumi = float(sys.argv[3])

    print ref_dir
    print out_dir
    print lumi

    def punzi_target(WP_VBF2j, WP_VBF1j, WP_WHh, WP_ZHh):
        global evalcnt

        bin_dir = "/home/llr/cms/wind/cmssw/CMSSW_9_4_2/bin/slc6_amd64_gcc630/"
        cost_function_evaluator = "run_WP_evaluator"

        output = check_output([
            bin_dir + cost_function_evaluator, ref_dir, out_dir,
            str(lumi),
            str(WP_VBF2j),
            str(WP_VBF1j),
            str(WP_WHh),
            str(WP_ZHh)
        ])

        costval = 0.0

        for line in output.split('\n'):
            if "cost = " in line:
                costval = float(line.replace("cost = ", ""))
                break

        if math.isnan(costval):
            costval = -8.75

        # save the sampled point so that it can later be used as an exploration point (if the need arises)
        confhandler = ConfigFileHandler()
        evaluations_path = out_dir + 'evaluations.txt'

        if os.path.exists(evaluations_path):
            confhandler.load_configuration(evaluations_path)

        print "saving evaluation for iteration " + str(evalcnt)

        section_name = 'evaluation_' + str(evalcnt)
        confhandler.new_section(section_name)
        confhandler.set_field(section_name, 'cost', str(costval))
        confhandler.set_field(section_name, 'WP_VBF2j', str(WP_VBF2j))
        confhandler.set_field(section_name, 'WP_VBF1j', str(WP_VBF1j))
        confhandler.set_field(section_name, 'WP_WHh', str(WP_WHh))
        confhandler.set_field(section_name, 'WP_ZHh', str(WP_ZHh))

        confhandler.save_configuration(evaluations_path)

        evalcnt += 1

        return costval

    eps = 1e-3
    delta = 0.2
    bo = BayesianOptimization(
        punzi_target, {
            'WP_VBF2j': (eps, 1.0 - eps),
            'WP_VBF1j': (eps, 1.0 - eps),
            'WP_WHh': (eps, 1.0 - eps),
            'WP_ZHh': (eps, 1.0 - eps)
        })

    # check if a file with previously evaluated points exists; if so, use them for initialization
    confhandler = ConfigFileHandler()
    evaluations_path = out_dir + 'evaluations.txt'

    if os.path.exists(evaluations_path):
        confhandler.load_configuration(evaluations_path)

        targets_init = []
        WP_VBF2j_init = []
        WP_VBF1j_init = []
        WP_WHh_init = []
        WP_ZHh_init = []

        for section_name in confhandler.get_sections():
            cur_section = confhandler.get_section(section_name)

            targets_init.append(float(cur_section['cost']))
            WP_VBF2j_init.append(float(cur_section['WP_VBF2j']))
            WP_VBF1j_init.append(float(cur_section['WP_VBF1j']))
            WP_WHh_init.append(float(cur_section['WP_WHh']))
            WP_ZHh_init.append(float(cur_section['WP_ZHh']))

        init_dict = {
            'target': targets_init,
            'WP_VBF2j': WP_VBF2j_init,
            'WP_VBF1j': WP_VBF1j_init,
            'WP_WHh': WP_WHh_init,
            'WP_ZHh': WP_ZHh_init
        }

        evalcnt = int(re.sub('evaluation_', '',
                             confhandler.get_sections()[-1])) + 1

        print "resuming at evaluation " + str(evalcnt)

        bo.initialize(init_dict)
        initialized = True
    else:
        initialized = False

    # change the kernel to have a length scale more appropriate to this function
    gp_params = {
        'kernel':
        1.0 *
        Matern(length_scale=0.05, length_scale_bounds=(1e-5, 1e5), nu=1.5),
        'alpha':
        1e-5
    }

    # perform the standard initialization and setup
    if initialized:
        bo.maximize(init_points=0,
                    n_iter=0,
                    acq='poi',
                    kappa=3,
                    xi=xi_scheduler(0.0),
                    **gp_params)
    else:
        bo.maximize(init_points=6,
                    n_iter=0,
                    acq='poi',
                    kappa=3,
                    xi=xi_scheduler(0.0),
                    **gp_params)

    cur_iteration = 1
    for it in range(1000):
        cur_xi = xi_scheduler(cur_iteration)
        cur_iteration += 1
        print "using xi = " + str(cur_xi)

        bo.maximize(init_points=6,
                    n_iter=1,
                    acq='poi',
                    kappa=3,
                    xi=cur_xi,
                    **gp_params)

        # evaluate the current maximum
        curval = bo.res['max']
        cost = curval['max_val']
        WPs = curval['max_params']

        confhandler = ConfigFileHandler()
        confhandler.config.optionxform = str
        confhandler.new_section('WPs')
        confhandler.set_field('WPs', 'cost', str(cost))

        for key, val in WPs.iteritems():
            confhandler.set_field('WPs', key, str(val))

        confhandler.save_configuration(out_dir + 'WPs.txt')
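
xi_scheduler, which steers the xi parameter of the 'poi' acquisition function, is not included in the excerpt. Below is a minimal sketch of a decaying exploration schedule that is compatible with both call signatures seen in these examples (one argument here, two in a later one); the actual schedule used by the author is unknown:

import math

def xi_scheduler(iteration, max_iterations=100):
    # hypothetical schedule: start exploratory (large xi) and become more
    # exploitative (small xi) as the optimization progresses
    return 0.01 * math.exp(-3.0 * float(iteration) / max_iterations)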
Example #13
def main():

    if len(sys.argv) != 3:
        print "Error: exactly 2 arguments are required"
        sys.exit(1)

    # global settings
    categories = [
        "UnTagged", "VBF1jTagged", "VBF2jTagged", "VHLeptTagged",
        "VHHadrTagged", "ttHLeptTagged", "ttHHadrTagged", "VHMETTagged"
    ]
    processes = {
        'ggH': 'ggH_hzz',
        'qqH': 'qqH_hzz',
        'WH_lep': 'WH_lep_hzz',
        'WH_had': 'WH_had_hzz',
        'ZH_lep': 'ZH_lep_hzz',
        'ZH_had': 'ZH_had_hzz',
        'ttH_lep': 'ttH_lep_hzz',
        'ttH_had': 'ttH_had_hzz',
        'tqH': 'tqH_hzz',
        'bbH': 'bbH_hzz',
        'qqZZ': 'qqZZ_hzz',
        'ggZZ': 'ggZZ_hzz'
    }

    input_file = sys.argv[1]
    output_path = sys.argv[2]
    # output_path = os.path.dirname(input_file)

    output_file_expt = os.path.join(output_path, "systematics_expt_13TeV.yaml")
    output_file_th = os.path.join(output_path, "systematics_theory_13TeV.yaml")

    confhandler = ConfigFileHandler()
    confhandler.load_configuration(input_file)

    # prepare the YAML file for the experimental uncertainties
    syst_expt = Systematics2YAML()

    # --------------------------------
    # PileUp
    # --------------------------------
    # the global, inclusive one
    source_syst_name = 'PileUp'
    YAML_syst_name = 'CMS_PileUp'
    syst_type = 'lnN'
    syst_expt.add_systematics(YAML_syst_name)

    for category in categories:
        syst_expt.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " +
                                           "inclusive")

        syst_expt.add_category_entry(YAML_syst_name, category,
                                     {'type': syst_type})

        for process in processes.keys():
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_expt.add_category_entry(YAML_syst_name, category,
                                             {processes[process]: val_string})

    # for each category
    source_syst_name = 'PileUp'
    YAML_syst_name = 'CMS_PileUp_cat'
    syst_type = 'lnN'
    syst_expt.add_systematics(YAML_syst_name)

    for category in categories:
        syst_expt.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_expt.add_category_entry(YAML_syst_name, category,
                                     {'type': syst_type})

        for process in processes.keys():
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_expt.add_category_entry(YAML_syst_name, category,
                                             {processes[process]: val_string})

    # --------------------------------
    # Jet Energy Scale
    # --------------------------------
    source_syst_name = 'JEC'
    YAML_syst_name = 'CMS_scale_j_13TeV'
    syst_type = 'lnN'
    syst_expt.add_systematics(YAML_syst_name)

    for category in categories:
        syst_expt.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_expt.add_category_entry(YAML_syst_name, category,
                                     {'type': syst_type})

        for process in processes.keys():
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_expt.add_category_entry(YAML_syst_name, category,
                                             {processes[process]: val_string})

    # --------------------------------
    # Lepton Energy Scale
    # --------------------------------
    source_syst_name = 'LEC'
    YAML_syst_name = 'CMS_scale_l_13TeV'
    syst_type = 'lnN'
    syst_expt.add_systematics(YAML_syst_name)

    for category in categories:
        syst_expt.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_expt.add_category_entry(YAML_syst_name, category,
                                     {'type': syst_type})

        for process in processes.keys():
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_expt.add_category_entry(YAML_syst_name, category,
                                             {processes[process]: val_string})

    # --------------------------------
    # BTagging
    # --------------------------------
    source_syst_name = 'BTag'
    YAML_syst_name = 'CMS_eff_b'
    syst_type = 'lnN'
    syst_expt.add_systematics(YAML_syst_name)

    for category in categories:
        syst_expt.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_expt.add_category_entry(YAML_syst_name, category,
                                     {'type': syst_type})

        for process in processes.keys():
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_expt.add_category_entry(YAML_syst_name, category,
                                             {processes[process]: val_string})

    syst_expt.save(output_file_expt)

    # ---------------------------------------------------------------------------------
    # ---------------------------------------------------------------------------------

    # prepare the YAML file for the theoretical uncertainties
    syst_th = Systematics2YAML()

    # --------------------------------
    # PDF scale
    # --------------------------------
    source_syst_name = 'PDF_scale'
    YAML_syst_name = 'pdf_Higgs_gg_cat'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        for process in ["ggH", "ggZZ"]:
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_th.add_category_entry(YAML_syst_name, category,
                                           {processes[process]: val_string})

    source_syst_name = 'PDF_scale'
    YAML_syst_name = 'pdf_Higgs_qqbar_cat'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        for process in ["qqH", "WH_lep", "WH_had", "ZH_lep", "ZH_had"]:
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_th.add_category_entry(YAML_syst_name, category,
                                           {processes[process]: val_string})

    source_syst_name = 'PDF_scale'
    YAML_syst_name = 'pdf_Higgs_ttH_cat'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        for process in ["ttH_lep", "ttH_had"]:
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_th.add_category_entry(YAML_syst_name, category,
                                           {processes[process]: val_string})

    source_syst_name = 'PDF_scale'
    YAML_syst_name = 'pdf_qqbar_cat'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        for process in ["qqZZ"]:
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_th.add_category_entry(YAML_syst_name, category,
                                           {processes[process]: val_string})

    # --------------------------------
    # QCD scale
    # --------------------------------

    source_syst_name = 'QCD_scale'
    YAML_syst_name = 'QCDscale_qqH_cat'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        for process in ["qqH"]:
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_th.add_category_entry(YAML_syst_name, category,
                                           {processes[process]: val_string})

    source_syst_name = 'QCD_scale'
    YAML_syst_name = 'QCDscale_VH_cat'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        for process in ["WH_lep", "WH_had"]:
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_th.add_category_entry(YAML_syst_name, category,
                                           {processes[process]: val_string})

                # the values for ZH_had and ZH_lep were not available here, so copy them from the corresponding WH processes
                if process == "WH_lep":
                    syst_th.add_category_entry(
                        YAML_syst_name, category,
                        {processes["ZH_lep"]: val_string})
                if process == "WH_had":
                    syst_th.add_category_entry(
                        YAML_syst_name, category,
                        {processes["ZH_had"]: val_string})

    source_syst_name = 'QCD_scale'
    YAML_syst_name = 'QCDscale_ttH_cat'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        for process in ["ttH_lep", "ttH_had", "tqH", "bbH"]:
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_th.add_category_entry(YAML_syst_name, category,
                                           {processes[process]: val_string})

    source_syst_name = 'QCD_scale'
    YAML_syst_name = 'QCDscale_VV_cat'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        for process in ["qqZZ"]:
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_th.add_category_entry(YAML_syst_name, category,
                                           {processes[process]: val_string})

    # --------------------------------
    # EW corrections
    # --------------------------------

    source_syst_name = 'EWCorr'
    YAML_syst_name = 'EWcorr_VV_cat'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        for process in ["qqZZ"]:
            val_list = ConfigFileUtils.parse_list(cur_sect[process],
                                                  lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_th.add_category_entry(YAML_syst_name, category,
                                           {processes[process]: val_string})

    # --------------------------------
    # ggH uncertainties
    # --------------------------------

    source_syst_name = 'THU_ggH_Mu'
    YAML_syst_name = 'THU_ggH_Mu'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        process = "ggH"
        val = float(cur_sect[process])

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes[process]: val})

        # can use the same value as well for the ggZZ background! (here and in the following)
        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes["ggZZ"]: val})

    source_syst_name = 'THU_ggH_Res'
    YAML_syst_name = 'THU_ggH_Res'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        process = "ggH"
        val = float(cur_sect[process])

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes[process]: val})

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes["ggZZ"]: val})

    source_syst_name = 'THU_ggH_Mig01'
    YAML_syst_name = 'THU_ggH_Mig01'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        process = "ggH"
        val = float(cur_sect[process])

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes[process]: val})

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes["ggZZ"]: val})

    source_syst_name = 'THU_ggH_Mig12'
    YAML_syst_name = 'THU_ggH_Mig12'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        process = "ggH"
        val = float(cur_sect[process])

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes[process]: val})

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes["ggZZ"]: val})

    source_syst_name = 'THU_ggH_VBF2j'
    YAML_syst_name = 'THU_ggH_VBF2j'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        process = "ggH"
        val = float(cur_sect[process])

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes[process]: val})

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes["ggZZ"]: val})

    source_syst_name = 'THU_ggH_VBF3j'
    YAML_syst_name = 'THU_ggH_VBF3j'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        process = "ggH"
        val = float(cur_sect[process])

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes[process]: val})

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes["ggZZ"]: val})

    source_syst_name = 'THU_ggH_PT60'
    YAML_syst_name = 'THU_ggH_PT60'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        process = "ggH"
        val = float(cur_sect[process])

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes[process]: val})

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes["ggZZ"]: val})

    source_syst_name = 'THU_ggH_PT120'
    YAML_syst_name = 'THU_ggH_PT120'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        process = "ggH"
        val = float(cur_sect[process])

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes[process]: val})

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes["ggZZ"]: val})

    source_syst_name = 'THU_ggH_qmtop'
    YAML_syst_name = 'THU_ggH_qmtop'
    syst_type = 'lnN'
    syst_th.add_systematics(YAML_syst_name)

    for category in categories:
        syst_th.add_category(YAML_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)

        syst_th.add_category_entry(YAML_syst_name, category,
                                   {'type': syst_type})

        process = "ggH"
        val = float(cur_sect[process])

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes[process]: val})

        if not (val == 1.0):
            syst_th.add_category_entry(YAML_syst_name, category,
                                       {processes["ggZZ"]: val})

    syst_th.save(output_file_th)
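
Most of the blocks above repeat the same pattern: read one section of the input configuration, join each pair of parsed values with a '/', and register the result for the YAML output unless both values are exactly 1.0. As an illustration only (not part of the original script), that pattern could be factored into a single helper:

def add_lnN_systematic(confhandler, syst_yaml, categories, processes,
                       source_syst_name, yaml_syst_name, process_subset):
    # hypothetical helper expressing the repeated per-category loop above once
    syst_yaml.add_systematics(yaml_syst_name)
    for category in categories:
        syst_yaml.add_category(yaml_syst_name, category)
        cur_sect = confhandler.get_section(source_syst_name + ": " + category)
        syst_yaml.add_category_entry(yaml_syst_name, category, {'type': 'lnN'})

        for process in process_subset:
            val_list = ConfigFileUtils.parse_list(cur_sect[process], lambda x: float(x))
            val_string = str(val_list[0]) + "/" + str(val_list[1])

            if not (val_list[0] == 1.0 and val_list[1] == 1.0):
                syst_yaml.add_category_entry(yaml_syst_name, category,
                                             {processes[process]: val_string})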
Example #14
    def punzi_target(WP_VBF2j, WP_VBF1j, WP_WHh, WP_ZHh):
        global evalcnt

        bin_dir = "/home/llr/cms/wind/cmssw/CMSSW_9_4_2/bin/slc6_amd64_gcc630/"
        cost_function_evaluator = "run_WP_evaluator"

        output = check_output([
            bin_dir + cost_function_evaluator, ref_dir, out_dir,
            str(lumi),
            str(WP_VBF2j),
            str(WP_VBF1j),
            str(WP_WHh),
            str(WP_ZHh)
        ])

        costval = 0.0

        for line in output.split('\n'):
            if "cost = " in line:
                costval = float(line.replace("cost = ", ""))
                break

        if math.isnan(costval):
            costval = -8.75

        # save the sampled point so that it can later be used as an exploration point (if the need arises)
        confhandler = ConfigFileHandler()
        evaluations_path = out_dir + 'evaluations.txt'

        if os.path.exists(evaluations_path):
            confhandler.load_configuration(evaluations_path)

        print "saving evaluation for iteration " + str(evalcnt)

        section_name = 'evaluation_' + str(evalcnt)
        confhandler.new_section(section_name)
        confhandler.set_field(section_name, 'cost', str(costval))
        confhandler.set_field(section_name, 'WP_VBF2j', str(WP_VBF2j))
        confhandler.set_field(section_name, 'WP_VBF1j', str(WP_VBF1j))
        confhandler.set_field(section_name, 'WP_WHh', str(WP_WHh))
        confhandler.set_field(section_name, 'WP_ZHh', str(WP_ZHh))

        confhandler.save_configuration(evaluations_path)

        evalcnt += 1

        return costval
Example #15
def run_bayesian_optimization(name, eval_file, target, var_ranges, init_points, max_iterations, patience, alpha):
    global evalcnt
    evalcnt = 0
    
    print "now optimizing the following variables: " + str(var_ranges)
    print "alpha = " + str(alpha)

    # change the kernel to have a length scale more appropriate to this function
    # alpha corresponds to the value added to the diagonal of the covariance matrix, i.e. the approximate noise level in the observations
    gp_params = {'kernel': ConstantKernel(1.0, (1e-8, 1e2)) * Matern(length_scale = 0.01, length_scale_bounds = (1e-5, 1e5), nu = 1.5),
                 'alpha': alpha}

    bo = BayesianOptimization(target, var_ranges)
    
    # check if a file with previous evaluations of this utility function already exists; if so, use it for initialization
    evaluations_path = os.path.join(out_dir, eval_file)
    
    if os.path.exists(evaluations_path):
        confhandler = ConfigFileHandler()
        confhandler.load_configuration(evaluations_path)
        
        init_dict = {}
        
        for section_name in confhandler.get_sections():
            cur_section = confhandler.get_section(section_name)
            
            for key, value in cur_section.iteritems():
                # only take those variables that are actually relevant
                if key in var_ranges or key == "target":
                    if key not in init_dict:
                        init_dict[key] = []
                    
                    init_dict[key].append(float(value))
                
        evalcnt = int(re.sub('evaluation_', '', confhandler.get_sections()[-1])) + 1
        print "resuming " + name + " at evaluation " + str(evalcnt)
        
        init_points_loaded = len(init_dict["target"])
        print "found " + str(init_points_loaded) + " initialization points: " + str(init_dict)
        
        bo.initialize(init_dict)
        bo.maximize(init_points = max(0, init_points - init_points_loaded), n_iter = 0, acq = 'poi', kappa = 3, xi = xi_scheduler(0.0, max_iterations), **gp_params)
        print "initialization done"
    else:
        bo.maximize(init_points = init_points, n_iter = 0, acq = 'poi', kappa = 3, xi = xi_scheduler(0.0, max_iterations), **gp_params)
    
    cur_iteration = 1
    patience_cnt = 0
    best_cost = -7.0
    
    for it in range(max_iterations): 
        cur_xi = xi_scheduler(cur_iteration, max_iterations)
        print "cur_iteration = " + str(cur_iteration) + ", using xi = " + str(cur_xi)

        cur_iteration += 1
        
        bo.maximize(init_points = 0, n_iter = 1, acq = 'poi', kappa = 3, xi = cur_xi, **gp_params)

        # evaluate the current maximum
        curval = bo.res['max']
        cost = curval['max_val']
        curparams = curval['max_params']
    
        confhandler = ConfigFileHandler()
        confhandler.config.optionxform = str
        confhandler.new_section(name)
        confhandler.set_field(name, 'target', str(cost))
        
        for key, val in curparams.iteritems():
            confhandler.set_field(name, key, str(val))
        
        confhandler.save_configuration(os.path.join(out_dir, name + '.txt'))
        
        # check if it is time to stop this optimization
        if cost > best_cost:
            best_cost = cost
            patience_cnt = 0
            
        patience_cnt += 1
        
        if patience_cnt > patience:
            break
            
    return curparams
def main():
    if len(sys.argv) != 3:
        print "Error: exactly 2 arguments are required!"
        sys.exit(1)

    source_path = sys.argv[1]
    #source_path = "/data_CMS/cms/wind/CJLST_NTuples_prepared_systematics/"
    dest_path = sys.argv[2]

    # global settings:
    zzroot = os.environ["CMSSW_BASE"]
    bin_dir = os.path.join(zzroot, "bin/slc6_amd64_gcc630/")

    scrambler = os.path.join(bin_dir, "run_scrambler")
    chunk_extractor = os.path.join(bin_dir, "run_chunk_extractor")

    settings_path = os.path.join(dest_path, "settings.conf")

    confhandler = ConfigFileHandler()
    confhandler.load_configuration(settings_path)

    # load global settings from the configuration file
    root_file_name = confhandler.get_field("Global", "root_file_name")
    source_dir = confhandler.get_field("Global", "source_dir")
    chunk_size = int(confhandler.get_field("Global", "chunk_size"))

    def submit_job(cmd_dir, command):
        job_submitter = os.environ["JOB_SUBMITTER"]

        filename = str(uuid.uuid4()) + ".sh"
        file_path = os.path.join(cmd_dir, filename)
        with open(file_path, "w") as cmd_file:
            cmd_file.write("#!/bin/bash\n")
            cmd_file.write(command)

        while True:
            try:
                output = sp.check_output([job_submitter, "-short", file_path])
                break
            except sp.CalledProcessError:
                print "-------------------------------------------------"
                print " error submitting job, retrying ... "
                print "-------------------------------------------------"

        print output

    def chunk_file(in_dir, out_root, base_name, number_chunks, cmd_dir):
        splits = np.linspace(0, 1, number_chunks)
        in_file = os.path.join(in_dir, root_file_name)

        if number_chunks == 1:
            out_folder = os.path.join(out_root, base_name + "_chunk_0/")

            if not os.path.exists(out_folder):
                os.makedirs(out_folder)

            out_file = os.path.join(out_folder, root_file_name)

            command = " ".join([chunk_extractor, in_file, out_file, str(0.0), str(1.0), str(0)])
            submit_job(cmd_dir, command)
            print command

        else:
            for i in range(len(splits) - 1):
                start_split = splits[i]
                end_split = splits[i + 1]
            
                out_folder = os.path.join(out_root, base_name + "_chunk_" + str(i) + "/")
                if not os.path.exists(out_folder):
                    os.makedirs(out_folder)

                out_file = os.path.join(out_folder, root_file_name)
                
                command = " ".join([chunk_extractor, in_file, out_file, str(start_split), str(end_split), str(0)])
                submit_job(cmd_dir, command)
                print command

    # create the needed folders:
    train_dir = os.path.join(dest_path, "training/")
    validation_dir = os.path.join(dest_path, "validation/")
    test_dir = os.path.join(dest_path, "test/")
    trainval_dir = os.path.join(dest_path, "trainval/")
    temp_dir = os.path.join(dest_path, "temp/")

    # create these directories
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    
    if not os.path.exists(validation_dir):
        os.makedirs(validation_dir)
    
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
    
    if not os.path.exists(trainval_dir):
        os.makedirs(trainval_dir)
    
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    training_files = [cur_file for cur_file in confhandler.get_sections() if "Global" not in cur_file]
    available_files = next(os.walk(source_path))[1]
    used_files = []
    
    for training_file in training_files:
        sect = confhandler.get_section(training_file)
    
        print "--------------------------------------------------"
        print "currently splitting: " + training_file
    
        source_files = ConfigFileUtils.parse_list(sect["source"], lambda x: x)
        train_val_splits = ConfigFileUtils.parse_list(sect["train_val_split"], lambda x: float(x))
        val_test_splits = ConfigFileUtils.parse_list(sect["val_test_split"], lambda x: float(x))
    
        # first split the needed files into 3 pieces, as dictated by the splits read from the config file
        for source_file, train_val_split, val_test_split in zip(source_files, train_val_splits, val_test_splits):
        
            print "extracting 0.0 - " + str(train_val_split) + " from " + source_file
        
            dest_dir = os.path.join(train_dir, source_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
    
            output = sp.check_output([chunk_extractor, os.path.join(source_path, source_file, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(0.0), str(train_val_split)])      
            print output
        
            print "-- -- -- -- -- -- -- -- -- -- -- --"
        
            print "extracting " + str(train_val_split) + " - " + str(val_test_split) + " from " + source_file
        
            dest_dir = os.path.join(validation_dir, source_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
    
            output = sp.check_output([chunk_extractor, os.path.join(source_path, source_file, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(train_val_split), str(val_test_split)])      
            print output
        
            print "-- -- -- -- -- -- -- -- -- -- -- --"
        
            print "extracting " + str(val_test_split) + " - 1.0 from " + source_file
        
            dest_dir = os.path.join(test_dir, source_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
    
            output = sp.check_output([chunk_extractor, os.path.join(source_path, source_file, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(val_test_split), str(1.0)])      
            print output
        
            used_files.append(source_file)
    
        print "--------------------------------------------------"

    unused_files = [cur_file for cur_file in available_files if cur_file not in used_files]

    # for all files that are not used for training, split them 50:50 into validation and test ...
    for unused_file in unused_files:
        source_dir = os.path.join(source_path, unused_file)

        # ... unless they are only needed to assess systematics, i.e. are not going to be used at all during the validation step
        if "ext" in unused_file or "tuneup" in unused_file or "tunedown" in unused_file:
            print "extracting 0.0 - 1.0 from " + unused_file

            dest_dir = os.path.join(test_dir, unused_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
                
            output = sp.check_output([chunk_extractor, os.path.join(source_dir, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(0.0), str(1.0)])      
            print output

        else:
            print "extracting 0.0 - 0.5 from " + unused_file
            
            dest_dir = os.path.join(validation_dir, unused_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
    
            output = sp.check_output([chunk_extractor, os.path.join(source_dir, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(0.0), str(0.5)])      
            print output

            print "-- -- -- -- -- -- -- -- -- -- -- --"

            print "extracting 0.5 - 1.0 from " + unused_file
            
            dest_dir = os.path.join(test_dir, unused_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
    
            output = sp.check_output([chunk_extractor, os.path.join(source_dir, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(0.5), str(1.0)])      
            print output
    
    # all the needed files are now split apart; proceed to combine them into the training
    # datasets that will end up in trainval
    for training_file in training_files:
        print "now building training dataset: " + training_file
        sect = confhandler.get_section(training_file)
        source_folders = ConfigFileUtils.parse_list(sect["source"], lambda x: x)
    
        for mode in ["training", "validation"]:

            temp_dest_folder = os.path.join(dest_path, temp_dir, training_file, mode)
            temp_dest_file = os.path.join(temp_dest_folder, root_file_name)

            if not os.path.exists(temp_dest_folder):
                os.makedirs(temp_dest_folder)

            source_files = [os.path.join(dest_path, mode, cur_file, root_file_name) for cur_file in source_folders]

            print "hadd " + temp_dest_file + " " + " ".join(source_files)
            output = sp.check_output(["hadd", temp_dest_file] + source_files)      
            print output
    
            temp_scrambled_folder = os.path.join(dest_path, temp_dir, "scrambled", training_file, mode)
            if not os.path.exists(temp_scrambled_folder):
                os.makedirs(temp_scrambled_folder)
            
            temp_scrambled_file = os.path.join(temp_scrambled_folder, root_file_name)
        
            print scrambler + " " + temp_dest_file + " " + temp_scrambled_file
            output = sp.check_output([scrambler, temp_dest_file, temp_scrambled_file])      
            print output
        
        trainval_dest_folder = os.path.join(trainval_dir, training_file)
        if not os.path.exists(trainval_dest_folder):
            os.makedirs(trainval_dest_folder)
        
        print "hadd " + os.path.join(trainval_dest_folder, root_file_name) + " " + os.path.join(dest_path, temp_dir, "scrambled", training_file, "training", root_file_name) + " " + os.path.join(dest_path, temp_dir, "scrambled", training_file, "validation", root_file_name)
        
        output = sp.check_output(["hadd", os.path.join(trainval_dest_folder, root_file_name),
                                 os.path.join(dest_path, temp_dir, "scrambled", training_file, "training", root_file_name),
                                 os.path.join(dest_path, temp_dir, "scrambled", training_file, "validation", root_file_name)])
        print output

    # at the end, chunk the ROOT files into many smaller ones, to keep the augmentation time short
    train_chunks_dir = os.path.join(dest_path, "training_chunks/")
    validation_chunks_dir = os.path.join(dest_path, "validation_chunks/")
    test_chunks_dir = os.path.join(dest_path, "test_chunks/")

    # create these directories
    if not os.path.exists(train_chunks_dir):
        os.makedirs(train_chunks_dir)
    
    if not os.path.exists(validation_chunks_dir):
        os.makedirs(validation_chunks_dir)
    
    if not os.path.exists(test_chunks_dir):
        os.makedirs(test_chunks_dir)

    for mode in ["training", "validation", "test"]:
        # look at each file individually and put it into chunks
        cur_dir = os.path.join(dest_path, mode)
        available_folders = next(os.walk(cur_dir))[1]

        for available_folder in available_folders:
            available_file = os.path.join(cur_dir, available_folder, root_file_name)

            number_chunks = max(1, os.path.getsize(available_file) / chunk_size)

            print "now splitting file " + available_file + " into " + str(number_chunks) + " chunks"

            out_root = os.path.join(dest_path, mode + "_chunks")
            
            chunk_file(os.path.join(dest_path, mode, available_folder), out_root, available_folder, number_chunks, temp_dir)
        
    print "done."        
Ejemplo n.º 17
0
    def punzi_target(ggH_prior, WHhadr_prior, ZHhadr_prior, WHlept_prior,
                     ZHlept_prior, ZHMET_prior, ttHhadr_prior, ttHlept_prior):
        global evalcnt

        bin_dir = "/home/llr/cms/wind/cmssw/CMSSW_9_4_2/bin/slc6_amd64_gcc630/"
        cost_function_evaluator = "run_prior_evaluator"

        output = check_output([
            bin_dir + cost_function_evaluator, run_dir, out_dir, engine,
            str(ggH_prior),
            str(WHhadr_prior),
            str(ZHhadr_prior),
            str(WHlept_prior),
            str(ZHlept_prior),
            str(ZHMET_prior),
            str(ttHhadr_prior),
            str(ttHlept_prior)
        ])

        costval = 0.0

        for line in output.split('\n'):
            if "cost = " in line:
                costval = float(line.replace("cost = ", ""))
                break

        if math.isnan(costval):
            costval = -8.75

        # add a regularization term that prefers default priors (i.e. close to 1.0)
        reg_term = 1.0 / 8.0 * (
            (ggH_prior - 1.0)**2.0 + (WHhadr_prior - 1.0)**2.0 +
            (ZHhadr_prior - 1.0)**2.0 + (WHlept_prior - 1.0)**2.0 +
            (ZHlept_prior - 1.0)**2.0 + (ZHMET_prior - 1.0)**2.0 +
            (ttHhadr_prior - 1.0)**2.0 + (ttHlept_prior - 1.0)**2.0)
        costval -= reg_term * lambda_reg

        # save the sampled point so that it can later be used as an exploration point (if the need occurs)
        confhandler = ConfigFileHandler()
        evaluations_path = out_dir + 'evaluations.txt'

        if os.path.exists(evaluations_path):
            confhandler.load_configuration(evaluations_path)

        print "saving evaluation for iteration " + str(evalcnt)

        section_name = 'evaluation_' + str(evalcnt)
        confhandler.new_section(section_name)
        confhandler.set_field(section_name, 'cost', str(costval))
        confhandler.set_field(section_name, 'ggH_prior', str(ggH_prior))
        confhandler.set_field(section_name, 'WHhadr_prior', str(WHhadr_prior))
        confhandler.set_field(section_name, 'ZHhadr_prior', str(ZHhadr_prior))
        confhandler.set_field(section_name, 'WHlept_prior', str(WHlept_prior))
        confhandler.set_field(section_name, 'ZHlept_prior', str(ZHlept_prior))
        confhandler.set_field(section_name, 'ZHMET_prior', str(ZHMET_prior))
        confhandler.set_field(section_name, 'ttHhadr_prior',
                              str(ttHhadr_prior))
        confhandler.set_field(section_name, 'ttHlept_prior',
                              str(ttHlept_prior))

        confhandler.save_configuration(evaluations_path)

        evalcnt += 1

        return costval
Ejemplo n.º 18
0
def main():
    global evalcnt

    if len(sys.argv) != 4:
        print "Error: exactly 3 arguments are required"
        return

    run_dir = sys.argv[1]
    out_dir = sys.argv[2]
    engine = sys.argv[3]

    print run_dir
    print out_dir
    print engine

    # punzi_target_2d = lambda WHlept_prior, ZHlept_prior: punzi_target(ggH_prior_default, WHhadr_prior_default, ZHhadr_prior_default,
    #                                                                       WHlept_prior, ZHlept_prior, ZHMET_prior_default,
    #                                                                       ttHhadr_prior_default, ttHlept_prior_default)

    def punzi_target(ggH_prior, WHhadr_prior, ZHhadr_prior, WHlept_prior,
                     ZHlept_prior, ZHMET_prior, ttHhadr_prior, ttHlept_prior):
        global evalcnt

        bin_dir = "/home/llr/cms/wind/cmssw/CMSSW_9_4_2/bin/slc6_amd64_gcc630/"
        cost_function_evaluator = "run_prior_evaluator"

        output = check_output([
            bin_dir + cost_function_evaluator, run_dir, out_dir, engine,
            str(ggH_prior),
            str(WHhadr_prior),
            str(ZHhadr_prior),
            str(WHlept_prior),
            str(ZHlept_prior),
            str(ZHMET_prior),
            str(ttHhadr_prior),
            str(ttHlept_prior)
        ])

        costval = 0.0

        for line in output.split('\n'):
            if "cost = " in line:
                costval = float(line.replace("cost = ", ""))
                break

        if math.isnan(costval):
            costval = -8.75

        # add a regularization term that prefers default priors (i.e. close to 1.0)
        reg_term = 1.0 / 8.0 * (
            (ggH_prior - 1.0)**2.0 + (WHhadr_prior - 1.0)**2.0 +
            (ZHhadr_prior - 1.0)**2.0 + (WHlept_prior - 1.0)**2.0 +
            (ZHlept_prior - 1.0)**2.0 + (ZHMET_prior - 1.0)**2.0 +
            (ttHhadr_prior - 1.0)**2.0 + (ttHlept_prior - 1.0)**2.0)
        costval -= reg_term * lambda_reg

        # save the sampled point so that it can later be used as an exploration point (if the need occurs)
        confhandler = ConfigFileHandler()
        evaluations_path = out_dir + 'evaluations.txt'

        if os.path.exists(evaluations_path):
            confhandler.load_configuration(evaluations_path)

        print "saving evaluation for iteration " + str(evalcnt)

        section_name = 'evaluation_' + str(evalcnt)
        confhandler.new_section(section_name)
        confhandler.set_field(section_name, 'cost', str(costval))
        confhandler.set_field(section_name, 'ggH_prior', str(ggH_prior))
        confhandler.set_field(section_name, 'WHhadr_prior', str(WHhadr_prior))
        confhandler.set_field(section_name, 'ZHhadr_prior', str(ZHhadr_prior))
        confhandler.set_field(section_name, 'WHlept_prior', str(WHlept_prior))
        confhandler.set_field(section_name, 'ZHlept_prior', str(ZHlept_prior))
        confhandler.set_field(section_name, 'ZHMET_prior', str(ZHMET_prior))
        confhandler.set_field(section_name, 'ttHhadr_prior',
                              str(ttHhadr_prior))
        confhandler.set_field(section_name, 'ttHlept_prior',
                              str(ttHlept_prior))

        confhandler.save_configuration(evaluations_path)

        evalcnt += 1

        return costval

    eps = 1e-1
    delta = 0.2
    bo = BayesianOptimization(
        punzi_target, {
            'ggH_prior': (1.0 - delta, 1.0 + delta),
            'WHhadr_prior': (eps, 1.0),
            'ZHhadr_prior': (eps, 1.0),
            'WHlept_prior': (eps, 1.0),
            'ZHlept_prior': (eps, 1.0),
            'ZHMET_prior': (eps, 1.0),
            'ttHhadr_prior': (eps, 1.0),
            'ttHlept_prior': (eps, 1.0)
        })

    # bo = BayesianOptimization(punzi_target_2d, {'WHlept_prior': (eps, WHlept_prior_default + delta),
    #                                                  'ZHlept_prior': (eps, ZHlept_prior_default + delta)})

    # check if a file with previously evaluated points exists; if so, use them for initialization
    confhandler = ConfigFileHandler()
    evaluations_path = out_dir + 'evaluations.txt'

    if os.path.exists(evaluations_path):
        confhandler.load_configuration(evaluations_path)

        ggH_priors_init = []
        WHhadr_priors_init = []
        ZHhadr_priors_init = []
        WHlept_priors_init = []
        ZHlept_priors_init = []
        ZHMET_priors_init = []
        ttHhadr_priors_init = []
        ttHlept_priors_init = []
        targets_init = []

        for section_name in confhandler.get_sections():
            cur_section = confhandler.get_section(section_name)

            targets_init.append(float(cur_section['cost']))
            ggH_priors_init.append(float(cur_section['ggH_prior']))
            WHhadr_priors_init.append(float(cur_section['WHhadr_prior']))
            ZHhadr_priors_init.append(float(cur_section['ZHhadr_prior']))
            WHlept_priors_init.append(float(cur_section['WHlept_prior']))
            ZHlept_priors_init.append(float(cur_section['ZHlept_prior']))
            ZHMET_priors_init.append(float(cur_section['ZHMET_prior']))
            ttHhadr_priors_init.append(float(cur_section['ttHhadr_prior']))
            ttHlept_priors_init.append(float(cur_section['ttHlept_prior']))

        init_dict = {
            'target': targets_init,
            'ggH_prior': ggH_priors_init,
            'WHhadr_prior': WHhadr_priors_init,
            'ZHhadr_prior': ZHhadr_priors_init,
            'WHlept_prior': WHlept_priors_init,
            'ZHlept_prior': ZHlept_priors_init,
            'ZHMET_prior': ZHMET_priors_init,
            'ttHhadr_prior': ttHhadr_priors_init,
            'ttHlept_prior': ttHlept_priors_init
        }

        evalcnt = int(re.sub('evaluation_', '',
                             confhandler.get_sections()[-1])) + 1

        print "resuming at evaluation " + str(evalcnt)

        bo.initialize(init_dict)
        initialized = True
    else:
        initialized = False

    # change the kernel to have a length scale more appropriate to this function
    # alpha corresponds to the value added to the diagonal elements of the covariance matrix, i.e. the approximate noise level in the observations
    gp_params = {
        'kernel':
        1.0 *
        Matern(length_scale=0.05, length_scale_bounds=(1e-5, 1e5), nu=1.5),
        'alpha':
        1e-1
    }

    # perform the standard initialization and setup
    if initialized:
        bo.maximize(init_points=0,
                    n_iter=0,
                    acq='poi',
                    kappa=3,
                    xi=xi_scheduler(0.0),
                    **gp_params)
    else:
        bo.maximize(init_points=6,
                    n_iter=0,
                    acq='poi',
                    kappa=3,
                    xi=xi_scheduler(0.0),
                    **gp_params)

    cur_iteration = 1
    for it in range(1000):
        cur_iteration += 1

        cur_xi = xi_scheduler(cur_iteration)
        print "using xi = " + str(cur_xi)

        bo.maximize(init_points=6,
                    n_iter=1,
                    acq='poi',
                    kappa=3,
                    xi=cur_xi,
                    **gp_params)

        # read out the current best point found so far
        curval = bo.res['max']
        cost = curval['max_val']
        priors = curval['max_params']

        confhandler = ConfigFileHandler()
        confhandler.config.optionxform = str
        confhandler.new_section('Priors')
        confhandler.set_field('Priors', 'cost', str(cost))
        confhandler.set_field('Priors', 'VBF_prior', str(1.0))

        for key, val in priors.iteritems():
            confhandler.set_field('Priors', key, str(val))

        confhandler.save_configuration(out_dir + 'priors.txt')
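
# NOTE: xi_scheduler() is used above but not defined in this snippet. A minimal
# sketch of one possible schedule (an assumption, not the original): start with
# a comparatively large xi (more exploration) and let it decay smoothly towards
# a small floor value as the optimization progresses.
def xi_scheduler(iteration):
    return 0.01 + 0.19 * math.exp(-0.004 * iteration)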
def main():
    # runs to check for (good) models (the first run passed is taken as the reference run from which the available models
    # are read - all other runs are expected to follow the same structure):
    input_runs = []

    print "==================================================================="
    print "looking for models in the following runs:"

    for campaign_dir in sys.argv[1:-2]:
        for run_dir in next(os.walk(campaign_dir))[1]:
            if not "bin" in run_dir:
                run_path = os.path.join(campaign_dir, run_dir)
                print run_path
                input_runs.append(run_path)

    print "==================================================================="

    # output training campaign: a combination of the models found in the campaigns listed above, chosen such that the overall performance is optimized
    output_run = os.path.join(sys.argv[-1], "optimized")

    # where the configuration file for the hyperparameter settings should be stored
    hyperparam_output = os.path.join(output_run, "../hyperparameters.conf")

    os.makedirs(output_run)

    # load the available model names
    reference_run = input_runs[0]
    available_mcolls = os.walk(os.path.join(reference_run,
                                            "training")).next()[1]

    mcolls_winning = []

    for mcoll in available_mcolls:
        models = os.walk(os.path.join(reference_run, "training",
                                      mcoll)).next()[1]

        # load a representative version of the current model collection...
        mconfhandler = ModelCollectionConfigFileHandler()
        mconfhandler.load_configuration(
            os.path.join(reference_run, "settings_training", mcoll,
                         "settings.conf"))
        mcoll_template = mconfhandler.GetModelCollection()[0]

        # ... but strip away all the actual model components
        mcoll_template.model_dict = {}
        mcoll_template.preprocessor_dict = {}
        mcoll_template.settings_dict = {}

        for model in models:
            # compare this model across the different runs
            losses = [get_loss(run, mcoll, model) for run in input_runs]

            winner = np.argmin(losses)

            winning_run = input_runs[winner]

            # copy the winning model into the output run
            shutil.copytree(
                os.path.join(winning_run, "training", mcoll, model),
                os.path.join(output_run, "training", mcoll, model))

            print "--------------------------------------------"
            print " take " + model + " from " + winning_run
            print "--------------------------------------------"

            # load the winning model to keep track of its settings
            mconfhandler = ModelCollectionConfigFileHandler()
            mconfhandler.load_configuration(
                os.path.join(winning_run, "settings_training", mcoll,
                             "settings.conf"))
            mcoll_winning = mconfhandler.GetModelCollection()[0]

            # then pull the winning model over into the template
            winning_model = mcoll_winning.model_dict[model]
            winning_preprocessor = mcoll_winning.preprocessor_dict[model]
            winning_settings = mcoll_winning.settings_dict[model]

            mcoll_template.add_model(winning_preprocessor, winning_model,
                                     winning_settings)

        mcolls_winning.append(mcoll_template)

    # save the assembled config file into the output run as well
    mconfhandler = ModelCollectionConfigFileHandler()
    mconfhandler.ToConfiguration(mcolls_winning)
    mconfhandler.save_configuration(os.path.join(output_run, "settings.conf"))

    # now distribute the training settings again, as usual:
    distribute_training_settings(output_run + '/')

    # now create the hyperparameter config file for each model, taken from the winners
    hp_confhandler = ConfigFileHandler()
    for mcoll in mcolls_winning:
        for model_name, model in mcoll.model_dict.iteritems():
            hp_confhandler.new_section(model_name)
            hp_confhandler.set_field(
                model_name, "hyperparameters",
                ConfigFileUtils.serialize_dict(model.hyperparameters,
                                               lambda x: str(x)))

    hp_confhandler.save_configuration(hyperparam_output)

    print "==================================================================="
    print "hyperparameter configuration file written to " + hyperparam_output
    print "==================================================================="
Ejemplo n.º 20
0
def main():
    def _compute_class_weights_lengths(gen, preprocessor, MC_weighting=False):
        # determine the actual size of the available dataset and adjust the sample weights correspondingly
        H1_data = gen.H1_collection.get_data(Config.branches, 0.0, 1.0)
        H0_data = gen.H0_collection.get_data(Config.branches, 0.0, 1.0)
        H1_length = len(preprocessor.process(H1_data).values()[0])
        H1_indices = preprocessor.get_last_indices()
        H0_length = len(preprocessor.process(H0_data).values()[0])
        H0_indices = preprocessor.get_last_indices()

        print "H1_length = " + str(H1_length)
        print "H0_length = " + str(H0_length)

        # if per-sample weighting is enabled, also set up the normalization of the event weights
        if MC_weighting:
            H1_weight_sum = np.sum(
                np.maximum(np.array(H1_data["training_weight"][H1_indices]),
                           0.0))
            H0_weight_sum = np.sum(
                np.maximum(np.array(H0_data["training_weight"][H0_indices]),
                           0.0))

            H1_class_weight = float(H0_length) / H1_weight_sum
            H0_class_weight = float(H1_length) / H0_weight_sum
        else:
            # H1_class_weight = 1.0
            # H0_class_weight = float(H1_length) / float(H0_length)
            H1_class_weight = 1.0 + float(H0_length) / float(H1_length)
            H0_class_weight = 1.0 + float(H1_length) / float(H0_length)

        return H1_class_weight, H0_class_weight, H1_length, H0_length

    # this computes low-level performance metrics for a model collection, i.e. the mean-squared error
    # computed on the validation dataset for each discriminant. Since the validation datasets will be held constant,
    # this is an easy way to directly compare different models

    setting_dir = sys.argv[1]
    training_dir = sys.argv[2]
    out_dir = sys.argv[3]

    # first, need to read in the trained ModelCollection:
    mconfhandler = ModelCollectionConfigFileHandler()
    mconfhandler.load_configuration(setting_dir + "settings.conf")
    mcolls = mconfhandler.GetModelCollection(weightpath=training_dir)

    confhandler = ConfigFileHandler()
    out_path = out_dir + "model_benchmark.txt"

    # for the evaluation, need to proceed in the same way as for training, but evaluate the models on the validation
    # data instead of training them on the training data

    for mcoll in mcolls:
        models, preprocessors, settings = mcoll.get_models(
        ), mcoll.get_preprocessors(), mcoll.get_settings()

        for cur_model, cur_preprocessor, cur_settings in zip(
                models, preprocessors, settings):
            val_gen = Generator(mcoll.H1_stream,
                                mcoll.H0_stream,
                                Config.branches,
                                preprocessor=cur_preprocessor,
                                chunks=1,
                                MC_weighting=False)
            val_gen.setup_validation_data()
            val_H1_classweight, val_H0_classweight, H1_length, H0_length = _compute_class_weights_lengths(
                val_gen, cur_preprocessor, False)
            print val_H1_classweight
            print val_H0_classweight
            print H1_length
            print H0_length
            val_gen.set_H1_weight(val_H1_classweight)
            val_gen.set_H0_weight(val_H0_classweight)
            val_gen.set_minimum_length(0)
            cur_model.get_keras_model().compile(optimizer=optimizers.Adam(),
                                                loss="mean_squared_error",
                                                metrics=["binary_accuracy"])
            res = cur_model.get_keras_model().evaluate_generator(
                val_gen.preprocessed_generator(), steps=1)
            print "statistics for model " + cur_model.name
            print res
            print cur_model.get_keras_model().metrics_names

            confhandler.new_section(cur_model.name)
            confhandler.set_field(cur_model.name, 'H0_val_length',
                                  str(H0_length))
            confhandler.set_field(cur_model.name, 'H1_val_length',
                                  str(H1_length))
            confhandler.set_field(cur_model.name, 'val_loss', str(res[0]))

    confhandler.save_configuration(out_path)
def main():

    def append_variables_raw(confhandler, impdict):
        section_name = impdict["discriminant"]
        confhandler.new_section(section_name)

        periodic_inputs = []
        nonperiodic_inputs = []
        for key, val in impdict.iteritems():
            if key != "discriminant":
                if "phi" in key or "Phi" in key or "xi" in key or "xistar" in key:
                    periodic_inputs.append(key)
                else:
                    nonperiodic_inputs.append(key)

        confhandler.set_field(section_name, "nonperiodic_columns", ConfigFileUtils.serialize_list(nonperiodic_inputs, lambda x: x))
        confhandler.set_field(section_name, "periodic_columns", ConfigFileUtils.serialize_list(periodic_inputs, lambda x: x))

    def convert_varname(raw):
        raw = raw.replace('(', '[')
        raw = raw.replace(')', ']')
        return raw

    def select_input_features_cumulative(H1_stream, H0_stream, discriminant_name, scalar_branches, list_branches, list_pt_limits, confhandler, df_scores, cumulative_threshold = 0.99):
        print "using cumulative threshold = " + str(cumulative_threshold)

        # temporary fix: for any discriminant that involves Z+X, do not use PF-MET (since the data / MC comparison could otherwise be biased)
        if "ZX" in model.name:
            print "blocking PFMET for model " + model.name
            scalar_branches.remove("PFMET")

        print "will select input features from: " + str(scalar_branches + list_branches)

        implist = scorer.get_sorted_feature_importance_list(H1_stream, H0_stream, scalar_branches, list_branches, list_pt_limits)

        print "implist: " + str(implist)

        # now iterate through the sorted list that has been returned and keep only the highest-ranked variables, up to the threshold
        running_sum = 0
        impdict = {}
        for key, val in implist:
            if running_sum < cumulative_threshold:
                impdict[convert_varname(key)] = [val]
                running_sum += val

        impdict["discriminant"] = discriminant_name

        print "impdict: " + str(impdict)

        append_variables_raw(confhandler, impdict)

        df = df_scores.append(pd.DataFrame.from_dict(impdict))

        print str(implist)
        return df

    if len(sys.argv) != 6:
        print "Error: exactly 5 arguments are required"
        return

    out_dir = sys.argv[1]
    campaign_name = sys.argv[2]
    MC_path = sys.argv[3]
    usemela = sys.argv[4]
    threshold = float(sys.argv[5])

    # input variables that are stored as lists
    list_branches = ["Jet", "ExtraLep"]

    # limit pt values for these lists: here, jets are only used if their pt > 30 GeV; no restrictions are placed on leptons
    list_pt_limits = [30, 0]

    # scalar (i.e. non-list) input variables
    production_branches = ["PFMET", "nCleanedJetsPt30", "nCleanedJetsPt30BTagged_bTagSF", "nExtraLep", 
                           "ZZMass_masked", "nExtraZ", "Z1Mass", "Z2Mass", "Z1Pt", "Z2Pt", "ZZMassErr", "ZZPt", "ZZEta", "ZZPhi", "Z1Flav", "Z2Flav"]
    decay_branches = ["costhetastar", "helphi", "helcosthetaZ1", "helcosthetaZ2", "phistarZ1", "xi", "xistar"]

    print "using cumulative_threshold = " + str(threshold)

    if "y" in usemela:
        MELA_branches = ["D_bkg_ME", "D_VBF2j_ggH_ME", "D_VBF1j_ggH_ME", "D_WHh_ggH_ME", "D_ZHh_ggH_ME", "D_WHh_ZHh_ME", "D_VBF2j_WHh_ME", "D_VBF2j_ZHh_ME"]
    else:
        MELA_branches = []

    scorer = BDTscorer(MC_path)
    confhandler = ConfigFileHandler()
    df_fscores = pd.DataFrame()

    # create a model collection with default input variables and default hyperparameters, just to get a list of all the models and their training data files
    mcolls = SimpleModelFactoryDynamic.GenerateSimpleModelCollections(MC_path, input_config_file = None, hyperparam_config_file = None)

    # iterate over all models that are contained in the list of model collections
    for mcoll in mcolls:
        for model_name in mcoll.model_dict.keys():
            
            model = mcoll.model_dict[model_name]
            pre = mcoll.preprocessor_dict[model_name]

            H1_stream = {}
            H0_stream = {}

            print "=========================================================================="
            print "selecting input variables for model " + model_name

            for key, val in mcoll.H1_stream.iteritems():
                H1_stream[key] = lambda row, val = val, pre = pre: val(row) and pre.cuts(row)
                print "adding preprocessor cuts on top for " + key + ": " + pre.cuts_s

            for key, val in mcoll.H0_stream.iteritems():
                H0_stream[key] = lambda row, val = val, pre = pre: val(row) and pre.cuts(row)
                print "adding preprocessor cuts on top for " + key + ": " + pre.cuts_s
            
            df_fscores = select_input_features_cumulative(H1_stream, H0_stream, model_name, production_branches + decay_branches + MELA_branches, list_branches, list_pt_limits, confhandler, df_fscores, threshold)
            print "=========================================================================="

    confhandler.save_configuration(os.path.join(out_dir, campaign_name + "_inputs.conf"))
    df_fscores.to_csv(os.path.join(out_dir, campaign_name + "_fscore_table_bkg.csv"))
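
# A short usage sketch (an assumption, not part of the original): the
# "_inputs.conf" file written above can be read back per discriminant with
# ConfigFileUtils.parse_list, mirroring the way the training-file splits are
# parsed at the beginning of this collection. The section name passed in is
# purely illustrative.
def load_selected_inputs(inputs_conf_path, discriminant_name):
    confhandler = ConfigFileHandler()
    confhandler.load_configuration(inputs_conf_path)
    sect = confhandler.get_section(discriminant_name)
    nonperiodic = ConfigFileUtils.parse_list(sect["nonperiodic_columns"], lambda x: x)
    periodic = ConfigFileUtils.parse_list(sect["periodic_columns"], lambda x: x)
    return nonperiodic, periodic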
def main():
    if len(sys.argv) != 3:
        print "Error: exactly 2 arguments are required!"
        return

    settings_path = sys.argv[1]
    run_dir = sys.argv[2]

    confhandler = ConfigFileHandler()
    confhandler.load_configuration(settings_path)
    root_file_name = confhandler.get_field("Global", "root_file_name")

    # need to merge the many individual chunks coming from the augmentation. take care that the weights are updated correctly!
    augmentation_training_chunks_dir = os.path.join(
        run_dir, "augmentation_training_chunks")
    augmentation_validation_chunks_dir = os.path.join(
        run_dir, "augmentation_validation_chunks")
    augmentation_test_chunks_dir = os.path.join(run_dir,
                                                "augmentation_test_chunks")

    augmentation_training_dir = os.path.join(run_dir, "augmentation_training")
    augmentation_validation_dir = os.path.join(run_dir,
                                               "augmentation_validation")
    augmentation_test_dir = os.path.join(run_dir, "augmentation_test")

    if not os.path.exists(augmentation_training_dir):
        os.makedirs(augmentation_training_dir)

    if not os.path.exists(augmentation_validation_dir):
        os.makedirs(augmentation_validation_dir)

    if not os.path.exists(augmentation_test_dir):
        os.makedirs(augmentation_test_dir)

    def merge_chunks(source_dir, dest_dir):
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        available_dirs = next(os.walk(source_dir))[1]
        merged_dirs = list(
            set(map(lambda x: re.sub('_chunk_.*$', '', x), available_dirs)))

        for merged_dir in merged_dirs:
            chunks = sorted([
                cur_dir for cur_dir in available_dirs
                if merged_dir + "_chunk_" in cur_dir
            ])

            dest_folder = os.path.join(dest_dir, merged_dir)
            if not os.path.exists(dest_folder):
                os.makedirs(dest_folder)

            dest_file = os.path.join(dest_folder, root_file_name)

            source_files = [
                os.path.join(source_dir, chunk, root_file_name)
                for chunk in chunks
            ]

            print "merging " + " ".join(chunks) + " into " + merged_dir

            # do the raw merging
            output = sp.check_output(["hadd", dest_file] + source_files)
            print output

            # now, need to ensure that the metadata is corrected (as hadd also modifies it in a way that is incorrect here)
            command = [
                "rootcp", "--replace", source_files[0] + ":ClassTree/Counters",
                dest_file + ":/ClassTree/Counters"
            ]
            print " ".join(command)
            output = sp.check_output(command)
            print output

    merge_chunks(augmentation_training_chunks_dir, augmentation_training_dir)
    merge_chunks(augmentation_validation_chunks_dir,
                 augmentation_validation_dir)
    merge_chunks(augmentation_test_chunks_dir, augmentation_test_dir)