Example #1
def punzi_target(priors, relevant_classes, params, mode = "S"):
    zzroot = os.environ["CMSSW_BASE"]
    bin_dir = os.path.join(zzroot, "bin/slc6_amd64_gcc630/")
    cost_function_evaluator = "run_prior_evaluator"
    
    # note: run_dir, out_dir, engine and ref_dir are expected to be defined at module level
    output = check_output([bin_dir + cost_function_evaluator, run_dir, out_dir, engine, str(params["min_iterations"]), str(params["max_iterations"]), str(priors["ggh_prior"]), str(priors["whhadr_prior"]),
              str(priors["zhhadr_prior"]), str(priors["whlept_prior"]), str(priors["zhlept_prior"]), str(priors["zhmet_prior"]),
              str(priors["tthhadr_prior"]), str(priors["tthlept_prior"]), str(priors["bkg_prior"]), str(priors["qq_prior"]), mode, ref_dir])

    if mode == "S":
        punzi_file = "Mor18_punzi_S_comp.conf"
    elif mode == "SB":
        punzi_file = "Mor18_punzi_comp.conf"

    # directly read the configuration file containing the relative Punzi improvements w.r.t. the reference
    # (the one with flat priors)
    punzihandler = ConfigFileHandler()
    punzihandler.load_configuration(os.path.join(out_dir, punzi_file))
    
    costval = 0.0
         
    # use the weighted cost function
    delta_pi = []
    for relevant_class in relevant_classes:
        delta_pi.append(float(punzihandler.get_field('Punzi', relevant_class)) - 1.0)
    
    costval = cost_func(delta_pi, 8.0, 2)

    if math.isnan(costval):
        print "caught NaN!"
        costval = -7.0
 
    return costval
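For reference, a minimal sketch of how this function might be called. All values and class names below are hypothetical, and the call assumes that run_dir, out_dir, engine, ref_dir and cost_func are defined at module level, since punzi_target relies on them.

# hypothetical usage sketch -- every value and name here is illustrative only
example_priors = {"ggh_prior": 1.0, "whhadr_prior": 1.2, "zhhadr_prior": 0.9,
                  "whlept_prior": 1.1, "zhlept_prior": 1.0, "zhmet_prior": 0.8,
                  "tthhadr_prior": 1.3, "tthlept_prior": 1.0, "bkg_prior": 1.0,
                  "qq_prior": 1.0}
example_params = {"min_iterations": 10, "max_iterations": 50}
example_classes = ["VBF2jCat", "WHhadrCat"]  # section names expected in the Punzi config file

cost = punzi_target(example_priors, example_classes, example_params, mode="S")
print "cost = " + str(cost)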
def punzi_target(priors, relevant_classes, params):
    bin_dir = "/home/llr/cms/wind/cmssw/CMSSW_9_4_2/bin/slc6_amd64_gcc630/"
    cost_function_evaluator = "run_prior_evaluator"

    output = check_output([
        bin_dir + cost_function_evaluator, run_dir, out_dir, engine,
        str(params["min_iterations"]),
        str(params["max_iterations"]),
        str(priors["ggh_prior"]),
        str(priors["whhadr_prior"]),
        str(priors["zhhadr_prior"]),
        str(priors["whlept_prior"]),
        str(priors["zhlept_prior"]),
        str(priors["zhmet_prior"]),
        str(priors["tthhadr_prior"]),
        str(priors["tthlept_prior"])
    ])

    # directly read the configuration file containing the relative Punzi improvements w.r.t. the reference
    # (the one with flat priors)
    # note: punzi_file (like run_dir, out_dir and engine) is expected to be defined at module level in this variant
    punzihandler = ConfigFileHandler()
    punzihandler.load_configuration(os.path.join(out_dir, punzi_file))

    costval = 0.0

    # use the weighted cost function
    delta_pi = []
    for relevant_class in relevant_classes:
        delta_pi.append(
            float(punzihandler.get_field('Punzi', relevant_class)) - 1.0)

    costval = cost_func(delta_pi, 8.0, 2)

    if math.isnan(costval):
        print "caught NaN!"
        costval = -7.0

    return costval
def main():
    
    if len(sys.argv) < 3:
        print "Error: at least 2 arguments are required"
        return

    campaign_dir = sys.argv[1]
    workdir = sys.argv[2]

    if len(sys.argv) >= 4:
        input_config_file = sys.argv[3]
    else:
        input_config_file = None

    # make sure that the given directory ends with a /
    if not campaign_dir.endswith('/'):
        campaign_dir += "/"
        
    confhandler = ConfigFileHandler()
    confhandler.load_configuration(campaign_dir + "campaign.conf")
    
    iterables = {}
    
    for section in confhandler.get_sections():
        if '!' in section:
            sweep_name = re.sub('!', '', section)
            sweep_sections = ConfigFileUtils.parse_list(confhandler.get_field(section, 'variables'), lambda x: x)

            # now look for the sweep variables that belong to this sweep
            for sweep_section in sweep_sections:
                # this is a section that determines a new sweep direction, possibly linked
                sweep_metadata = confhandler.get_field(sweep_section, 'variable').split(':')
                sweep_scope = sweep_metadata[0]
                sweep_parameter = sweep_metadata[1]

                # request more information
                sweep_behaviour = confhandler.get_field(sweep_section, 'behaviour')

                if ConfigFileUtils.is_dict(confhandler.get_field(sweep_section, 'start')):
                    # will need a dictionary iterable
                    start_dict = ConfigFileUtils.parse_dict(confhandler.get_field(sweep_section, 'start'), lambda x: float(x))
                    end_dict = ConfigFileUtils.parse_dict(confhandler.get_field(sweep_section, 'end'), lambda x: float(x))
                    step_dict = ConfigFileUtils.parse_dict(confhandler.get_field(sweep_section, 'step'), lambda x: float(x))

                    if sweep_name not in iterables:
                        it = SweepDimensionDict(sweep_scope, sweep_parameter, start_dict, end_dict, step_dict, sweep_behaviour)
                        iterables[sweep_name] = it
                    else:
                        iterables[sweep_name].add(sweep_scope, sweep_parameter, start_dict, end_dict, step_dict, sweep_behaviour)
                else:
                    # construct a list iterable instead
                    start_list = ConfigFileUtils.parse_list(confhandler.get_field(sweep_section, 'start'), lambda x: x)    
                    end_list = ConfigFileUtils.parse_list(confhandler.get_field(sweep_section, 'end'), lambda x: x)

                    if sweep_name not in iterables:
                        it = SweepDimensionList(sweep_scope, sweep_parameter, start_list, end_list, sweep_behaviour)
                        iterables[sweep_name] = it
                    else:
                        iterables[sweep_name].add(sweep_scope, sweep_parameter, start_list, end_list, sweep_behaviour)

    MC_path = os.path.join(workdir, "trainval/")
    model_type = confhandler.get_field('global', 'model_type')

    # get the mass point from the global config file in a way that ensures backward compatibility
    try:
        mass_point = float(confhandler.get_field('global', 'mass_point'))
    except KeyError:
        mass_point = 125.0

    if model_type == 'SimpleModel':
        # using the full mass range for training, not using the 118/130GeV cut
        mcoll = SimpleModelFactoryDynamic.GenerateSimpleModelCollections(MC_path, input_config_file = input_config_file, hyperparam_config_file = None, mass_point = mass_point)
    elif model_type == 'CombinedModel':
        mcoll = ModelFactoryFullCategorySetOptimizedInputs.GenerateCombinedModelCollections(MC_path)
    else:
        raise ValueError("unsupported model_type: " + model_type)
        
    iterate(iterables, {}, lambda it: augment_config(mcoll, campaign_dir, it))
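The iterate helper used on the last line is not part of this snippet. Below is a minimal sketch of the control flow it presumably implements, under the assumption (not shown above) that each sweep dimension can be iterated over directly to yield its values; the real SweepDimensionDict/SweepDimensionList interfaces may differ.

# sketch only -- the real 'iterate' is not shown in this example
def iterate_sketch(iterables, assignment, callback):
    if not iterables:
        # all sweep dimensions are fixed: hand the complete assignment to the callback
        callback(assignment)
        return
    remaining = dict(iterables)
    name, dimension = remaining.popitem()
    for value in dimension:
        assignment[name] = value
        iterate_sketch(remaining, assignment, callback)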
def get_loss(run, mcoll, model):
    confhandler = ConfigFileHandler()
    confhandler.load_configuration(
        os.path.join(run, "training", mcoll, "model_benchmark.txt"))
    return float(confhandler.get_field(model, 'val_loss'))
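A short, purely illustrative use of get_loss (the run directories, collection name and model name are made up): pick the run with the lowest validation loss.

# hypothetical example: select the best run according to its benchmark file
runs = ["run_0", "run_1", "run_2"]
best_run = min(runs, key=lambda run: get_loss(run, "some_collection", "some_model"))
print "lowest validation loss found in " + best_run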
def main():
    if len(sys.argv) != 3:
        print "Error: exactly 2 arguments are required!"
        return

    source_path = sys.argv[1]
    dest_path = sys.argv[2]

    # global settings:
    zzroot = os.environ["CMSSW_BASE"]
    bin_dir = os.path.join(zzroot, "bin/slc6_amd64_gcc630/")

    scrambler = os.path.join(bin_dir, "run_scrambler")
    chunk_extractor = os.path.join(bin_dir, "run_chunk_extractor")

    settings_path = os.path.join(dest_path, "settings.conf")

    confhandler = ConfigFileHandler()
    confhandler.load_configuration(settings_path)

    # load global settings from the configuration file
    root_file_name = confhandler.get_field("Global", "root_file_name")
    source_dir = confhandler.get_field("Global", "source_dir")
    chunk_size = int(confhandler.get_field("Global", "chunk_size"))

    def submit_job(cmd_dir, command):
        job_submitter = os.environ["JOB_SUBMITTER"]

        filename = str(uuid.uuid4()) + ".sh"
        file_path = os.path.join(cmd_dir, filename)
        with open(file_path, "w") as cmd_file:
            cmd_file.write("#!/bin/bash\n")
            cmd_file.write(command)

        while True:
            try:
                output = sp.check_output([job_submitter, "-short", file_path])
                break
            except sp.CalledProcessError:
                print "-------------------------------------------------"
                print " error submitting job, retrying ... "
                print "-------------------------------------------------"

        print output

    def chunk_file(in_dir, out_root, base_name, number_chunks, cmd_dir):
        # note: linspace with number_chunks points defines number_chunks - 1 intervals,
        # i.e. for number_chunks > 1 the loop below produces number_chunks - 1 output chunks
        splits = np.linspace(0, 1, number_chunks)
        in_file = os.path.join(in_dir, root_file_name)

        if number_chunks == 1:
            out_folder = os.path.join(out_root, base_name + "_chunk_0/")

            if not os.path.exists(out_folder):
                os.makedirs(out_folder)

            out_file = os.path.join(out_folder, root_file_name)

            command = " ".join([chunk_extractor, in_file, out_file, str(0.0), str(1.0), str(0)])
            submit_job(cmd_dir, command)
            print command

        else:
            for i in range(len(splits) - 1):
                start_split = splits[i]
                end_split = splits[i + 1]
            
                out_folder = os.path.join(out_root, base_name + "_chunk_" + str(i) + "/")
                if not os.path.exists(out_folder):
                    os.makedirs(out_folder)

                out_file = os.path.join(out_folder, root_file_name)
                
                command = " ".join([chunk_extractor, in_file, out_file, str(start_split), str(end_split), str(0)])
                submit_job(cmd_dir, command)
                print command

    # create the needed folders:
    train_dir = os.path.join(dest_path, "training/")
    validation_dir = os.path.join(dest_path, "validation/")
    test_dir = os.path.join(dest_path, "test/")
    trainval_dir = os.path.join(dest_path, "trainval/")
    temp_dir = os.path.join(dest_path, "temp/")

    # create these directories
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    
    if not os.path.exists(validation_dir):
        os.makedirs(validation_dir)
    
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
    
    if not os.path.exists(trainval_dir):
        os.makedirs(trainval_dir)
    
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    training_files = [cur_file for cur_file in confhandler.get_sections() if "Global" not in cur_file]
    available_files = next(os.walk(source_path))[1]
    used_files = []
    
    for training_file in training_files:
        sect = confhandler.get_section(training_file)
    
        print "--------------------------------------------------"
        print "currently splitting: " + training_file
    
        source_files = ConfigFileUtils.parse_list(sect["source"], lambda x: x)
        train_val_splits = ConfigFileUtils.parse_list(sect["train_val_split"], lambda x: float(x))
        val_test_splits = ConfigFileUtils.parse_list(sect["val_test_split"], lambda x: float(x))
    
        # first split the needed files into 3 pieces, as dictated by the splits read from the config file
        for source_file, train_val_split, val_test_split in zip(source_files, train_val_splits, val_test_splits):
        
            print "extracting 0.0 - " + str(train_val_split) + " from " + source_file
        
            dest_dir = os.path.join(train_dir, source_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
    
            output = sp.check_output([chunk_extractor, os.path.join(source_path, source_file, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(0.0), str(train_val_split)])      
            print output
        
            print "-- -- -- -- -- -- -- -- -- -- -- --"
        
            print "extracting " + str(train_val_split) + " - " + str(val_test_split) + " from " + source_file
        
            dest_dir = os.path.join(validation_dir, source_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
    
            output = sp.check_output([chunk_extractor, os.path.join(source_path, source_file, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(train_val_split), str(val_test_split)])      
            print output
        
            print "-- -- -- -- -- -- -- -- -- -- -- --"
        
            print "extracting " + str(val_test_split) + " - 1.0 from " + source_file
        
            dest_dir = os.path.join(test_dir, source_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
    
            output = sp.check_output([chunk_extractor, os.path.join(source_path, source_file, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(val_test_split), str(1.0)])      
            print output
        
            used_files.append(source_file)
    
        print "--------------------------------------------------"

    unused_files = [cur_file for cur_file in available_files if cur_file not in used_files]

    # for all files that are not used for training, split them 50:50 into validation and test ...
    for unused_file in unused_files:
        source_dir = os.path.join(source_path, unused_file)

        # ... unless they are only needed to assess systematics, i.e. are not going to be used at all during the validation step
        if "ext" in unused_file or "tuneup" in unused_file or "tunedown" in unused_file:
            print "extracting 0.0 - 1.0 from " + unused_file

            dest_dir = os.path.join(test_dir, unused_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
                
            output = sp.check_output([chunk_extractor, os.path.join(source_dir, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(0.0), str(1.0)])      
            print output

        else:
            print "extracting 0.0 - 0.5 from " + unused_file
            
            dest_dir = os.path.join(validation_dir, unused_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
    
            output = sp.check_output([chunk_extractor, os.path.join(source_dir, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(0.0), str(0.5)])      
            print output

            print "-- -- -- -- -- -- -- -- -- -- -- --"

            print "extracting 0.5 - 1.0 from " + unused_file
            
            dest_dir = os.path.join(test_dir, unused_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
    
            output = sp.check_output([chunk_extractor, os.path.join(source_dir, root_file_name),
                                      os.path.join(dest_dir, root_file_name), str(0.5), str(1.0)])      
            print output
    
    # now that all the needed files are split apart, combine them into the training
    # datasets that will end up in trainval
    for training_file in training_files:
        print "now building training dataset: " + training_file
        sect = confhandler.get_section(training_file)
        source_folders = ConfigFileUtils.parse_list(sect["source"], lambda x: x)
    
        for mode in ["training", "validation"]:

            # temp_dir is already anchored at dest_path, so use it directly
            temp_dest_folder = os.path.join(temp_dir, training_file, mode)
            temp_dest_file = os.path.join(temp_dest_folder, root_file_name)

            if not os.path.exists(temp_dest_folder):
                os.makedirs(temp_dest_folder)

            source_files = [os.path.join(dest_path, mode, cur_file, root_file_name) for cur_file in source_folders]

            print "hadd " + temp_dest_file + " " + " ".join(source_files)
            output = sp.check_output(["hadd", temp_dest_file] + source_files)      
            print output
    
            temp_scrambled_folder = os.path.join(temp_dir, "scrambled", training_file, mode)
            if not os.path.exists(temp_scrambled_folder):
                os.makedirs(temp_scrambled_folder)
            
            temp_scrambled_file = os.path.join(temp_scrambled_folder, root_file_name)
        
            print scrambler + " " + temp_dest_file + " " + temp_scrambled_file
            output = sp.check_output([scrambler, temp_dest_file, temp_scrambled_file])      
            print output
        
        trainval_dest_folder = os.path.join(trainval_dir, training_file)
        if not os.path.exists(trainval_dest_folder):
            os.makedirs(trainval_dest_folder)
        
        print "hadd " + os.path.join(trainval_dest_folder, root_file_name) + " " + os.path.join(dest_path, temp_dir, "scrambled", training_file, "training", root_file_name) + " " + os.path.join(dest_path, temp_dir, "scrambled", training_file, "validation", root_file_name)
        
        output = sp.check_output(["hadd", os.path.join(trainval_dest_folder, root_file_name),
                                 os.path.join(dest_path, temp_dir, "scrambled", training_file, "training", root_file_name),
                                 os.path.join(dest_path, temp_dir, "scrambled", training_file, "validation", root_file_name)])
        print output

    # at the end, chunk the ROOT files into many smaller ones, to keep the augmentation time short
    train_chunks_dir = os.path.join(dest_path, "training_chunks/")
    validation_chunks_dir = os.path.join(dest_path, "validation_chunks/")
    test_chunks_dir = os.path.join(dest_path, "test_chunks/")

    # create these directories
    if not os.path.exists(train_chunks_dir):
        os.makedirs(train_chunks_dir)
    
    if not os.path.exists(validation_chunks_dir):
        os.makedirs(validation_chunks_dir)
    
    if not os.path.exists(test_chunks_dir):
        os.makedirs(test_chunks_dir)

    for mode in ["training", "validation", "test"]:
        # look at each file individually and put it into chunks
        cur_dir = os.path.join(dest_path, mode)
        available_folders = next(os.walk(cur_dir))[1]

        for available_folder in available_folders:
            available_file = os.path.join(cur_dir, available_folder, root_file_name)

            number_chunks = max(1, os.path.getsize(available_file) / chunk_size)

            print "now splitting file " + available_file + " into " + str(number_chunks) + " chunks"

            out_root = os.path.join(dest_path, mode + "_chunks")
            
            chunk_file(os.path.join(dest_path, mode, available_folder), out_root, available_folder, number_chunks, temp_dir)
        
    print "done."        
def main():
    if len(sys.argv) != 3:
        print "Error: exactly 2 arguments are required!"
        return

    settings_path = sys.argv[1]
    run_dir = sys.argv[2]

    confhandler = ConfigFileHandler()
    confhandler.load_configuration(settings_path)
    root_file_name = confhandler.get_field("Global", "root_file_name")

    # need to merge the many individual chunks coming from the augmentation; take care that the weights are updated correctly!
    augmentation_training_chunks_dir = os.path.join(
        run_dir, "augmentation_training_chunks")
    augmentation_validation_chunks_dir = os.path.join(
        run_dir, "augmentation_validation_chunks")
    augmentation_test_chunks_dir = os.path.join(run_dir,
                                                "augmentation_test_chunks")

    augmentation_training_dir = os.path.join(run_dir, "augmentation_training")
    augmentation_validation_dir = os.path.join(run_dir,
                                               "augmentation_validation")
    augmentation_test_dir = os.path.join(run_dir, "augmentation_test")

    if not os.path.exists(augmentation_training_dir):
        os.makedirs(augmentation_training_dir)

    if not os.path.exists(augmentation_validation_dir):
        os.makedirs(augmentation_validation_dir)

    if not os.path.exists(augmentation_test_dir):
        os.makedirs(augmentation_test_dir)

    def merge_chunks(source_dir, dest_dir):
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        available_dirs = next(os.walk(source_dir))[1]
        merged_dirs = list(
            set(map(lambda x: re.sub('_chunk_.*$', '', x), available_dirs)))

        for merged_dir in merged_dirs:
            chunks = sorted([
                cur_dir for cur_dir in available_dirs
                if merged_dir + "_chunk_" in cur_dir
            ])

            dest_folder = os.path.join(dest_dir, merged_dir)
            if not os.path.exists(dest_folder):
                os.makedirs(dest_folder)

            dest_file = os.path.join(dest_folder, root_file_name)

            source_files = [
                os.path.join(source_dir, chunk, root_file_name)
                for chunk in chunks
            ]

            print "merging " + " ".join(chunks) + " into " + merged_dir

            # do the raw merging
            output = sp.check_output(["hadd", dest_file] + source_files)
            print output

            # now ensure that the metadata is corrected (hadd also modifies it in a way that is
            # incorrect here) by copying the Counters object from the first chunk over the merged one
            command = [
                "rootcp", "--replace", source_files[0] + ":ClassTree/Counters",
                dest_file + ":/ClassTree/Counters"
            ]
            print " ".join(command)
            output = sp.check_output(command)
            print output

    merge_chunks(augmentation_training_chunks_dir, augmentation_training_dir)
    merge_chunks(augmentation_validation_chunks_dir,
                 augmentation_validation_dir)
    merge_chunks(augmentation_test_chunks_dir, augmentation_test_dir)
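As a quick illustration of the chunk-name handling inside merge_chunks (the directory names below are made up):

import re

# hypothetical chunk directories as produced by the splitting step
available_dirs = ["ggH125_chunk_0", "ggH125_chunk_1", "VBF125_chunk_0"]
merged_dirs = list(set(map(lambda x: re.sub('_chunk_.*$', '', x), available_dirs)))
print merged_dirs   # ['ggH125', 'VBF125'] (in arbitrary order)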