Beispiel #1
0
def create_all_jobfiles(datasets: List[Dict], cache_filename: str,
                        datapath: str, chunksize: int, outpath: str):
    """Split the datasets into job descriptions and save them as JSON jobfiles.

    The list of filenames of every dataset is read from the JSON cache and
    handed to ``create_dataset_jobfiles``, which is given the number of files
    per job and the ``<outpath>/jobfiles`` directory.

    If ``<outpath>/jobfiles`` already exists the function prints a notice and
    returns without doing anything. The directory is only created after the
    cache file has been read and every dataset has been found in it, so a
    failed run does not leave an empty directory behind that would make the
    next run skip jobfile creation.

    Args:
        datasets (List[Dict]): Datasets to process; each dict is read through
            the keys "name", "era" and "is_mc".
        cache_filename (str): Input JSON file mapping "<name>_<era>" to the
            list of filenames of that dataset.
        datapath (str): Path prefix that is prepended (with "/") to every
            cached filename.
        chunksize (int): Number of files to process in each job.
        outpath (str): Output directory; jobfiles go to "<outpath>/jobfiles".

    Raises:
        Exception: If ``cache_filename`` does not exist.
        KeyError: If a dataset is missing from the cache.
        AssertionError: If no job description was produced, or the first one
            has no filenames.
    """
    jobfile_path = outpath + "/jobfiles"
    if os.path.isdir(jobfile_path):
        print(
            "Jobfiles directory {0} already exists, skipping jobfile creation".
            format(jobfile_path))
        return

    print("Loading list of filenames from {0}".format(cache_filename))
    if not os.path.isfile(cache_filename):
        raise Exception(
            "Cached dataset list of filenames not found in {0}, please run this code with --action cache"
            .format(cache_filename))
    with open(cache_filename, "r") as cache_file:
        filenames_cache = json.load(cache_file)

    # Resolve all datasets before touching the filesystem, so that a missing
    # dataset is reported without leaving a half-initialised output directory.
    resolved = []
    for dataset in sorted(datasets, key=lambda x: (x["name"], x["era"])):
        dataset_name = dataset["name"]
        dataset_era = dataset["era"]
        cache_key = dataset_name + "_" + dataset_era
        try:
            filenames_all = filenames_cache[cache_key]
        except KeyError:
            print(
                "Could not load {0} from {1}, please make sure this dataset has been added to cache"
                .format(cache_key, cache_filename),
                file=sys.stderr)
            raise
        resolved.append(
            (dataset_name, dataset_era, dataset["is_mc"], filenames_all))

    os.makedirs(jobfile_path)

    # Create a list of job files for processing
    jobfile_data = []
    seed_gen = seed_generator()
    for dataset_name, dataset_era, is_mc, filenames_all in resolved:
        filenames_all_full = [datapath + "/" + fn for fn in filenames_all]
        print(
            "Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles"
            .format(dataset_name, dataset_era, len(filenames_all_full),
                    chunksize))
        jobfile_dataset = create_dataset_jobfiles(dataset_name, dataset_era,
                                                  filenames_all_full, is_mc,
                                                  chunksize, jobfile_path,
                                                  seed_gen)
        jobfile_data += jobfile_dataset
        print("Dataset {0}_{1} consists of {2} chunks".format(
            dataset_name, dataset_era, len(jobfile_dataset)))

    assert (len(jobfile_data) > 0)
    assert (len(jobfile_data[0]["filenames"]) > 0)
def create_all_jobfiles(datasets, cache_filename, datapath, chunksize, outpath):
    """Split the datasets into job descriptions and save them as JSON jobfiles.

    If ``<outpath>/jobfiles`` already exists the function prints a notice and
    returns. Otherwise the filename cache is loaded, every dataset is looked
    up in it, and only then is the directory created and
    ``create_dataset_jobfiles`` called per dataset. Creating the directory
    last avoids leaving an empty ``jobfiles`` directory after a failed run,
    which would make every later run skip jobfile creation.

    Args:
        datasets: Iterable of dicts read through the keys "name", "era" and
            "is_mc".
        cache_filename: JSON file mapping "<name>_<era>" to a list of
            filenames.
        datapath: Prefix prepended (with "/") to every cached filename.
        chunksize: Number of files per job, forwarded to
            ``create_dataset_jobfiles``.
        outpath: Output directory; jobfiles go to "<outpath>/jobfiles".

    Raises:
        Exception: If ``cache_filename`` does not exist.
        KeyError: If a dataset is missing from the cache.
        AssertionError: If no job description was produced, or the first one
            has no filenames.
    """
    jobfile_path = outpath + "/jobfiles"
    if os.path.isdir(jobfile_path):
        print("Jobfiles directory {0} already exists, skipping jobfile creation".format(jobfile_path))
        return

    print("Loading list of filenames from {0}".format(cache_filename))
    if not os.path.isfile(cache_filename):
        raise Exception("Cached dataset list of filenames not found in {0}, please run this code with --action cache".format(
            cache_filename))
    with open(cache_filename, "r") as cache_file:
        filenames_cache = json.load(cache_file)

    # Look up every dataset first so that a missing cache entry is reported
    # before anything is written to disk.
    resolved = []
    for dataset in sorted(datasets, key=lambda x: (x["name"], x["era"])):
        dataset_name = dataset["name"]
        dataset_era = dataset["era"]
        cache_key = dataset_name + "_" + dataset_era
        try:
            filenames_all = filenames_cache[cache_key]
        except KeyError:
            print("Could not load {0} from {1}, please make sure this dataset has been added to cache".format(
                cache_key, cache_filename), file=sys.stderr)
            raise
        resolved.append((dataset_name, dataset_era, dataset["is_mc"], filenames_all))

    os.makedirs(jobfile_path)

    # Create a list of job files for processing
    jobfile_data = []
    seed_gen = seed_generator()
    for dataset_name, dataset_era, is_mc, filenames_all in resolved:
        filenames_all_full = [datapath + "/" + fn for fn in filenames_all]
        print("Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles".format(
            dataset_name, dataset_era, len(filenames_all_full), chunksize))
        jobfile_dataset = create_dataset_jobfiles(dataset_name, dataset_era,
            filenames_all_full, is_mc, chunksize, jobfile_path, seed_gen)
        jobfile_data += jobfile_dataset
        print("Dataset {0}_{1} consists of {2} chunks".format(
            dataset_name, dataset_era, len(jobfile_dataset)))

    assert(len(jobfile_data) > 0)
    assert(len(jobfile_data[0]["filenames"]) > 0)
Beispiel #3
0
def create_all_jobfiles(datasets, cache_filename, datapath, chunksize,
                        outpath):
    """Create job descriptions for all datasets from the cached filename lists.

    Loads the JSON file ``cache_filename``, iterates over ``sorted(datasets)``
    (each entry is unpacked as ``(name, era, globpattern, is_mc)``), looks up
    the filenames under the key "<name>_<era>" and calls
    ``create_dataset_jobfiles`` for each dataset with ``outpath`` as target.

    Raises:
        Exception: If ``cache_filename`` does not exist, or (in the trailing
            section) if the glob pattern matches no files.
        KeyError: If a dataset key is missing from the cache.
        AssertionError: If no job description was produced.

    NOTE(review): the statements after the two asserts look like a fragment
    of a different function that was concatenated onto this one - confirm
    against the original source before relying on them.
    """
    #Create a list of job files for processing
    jobfile_data = []
    print("Loading list of filenames from {0}".format(cache_filename))
    if not os.path.isfile(cache_filename):
        raise Exception(
            "Cached dataset list of filenames not found in {0}, please run this code with --action cache"
            .format(cache_filename))
    # NOTE(review): the file object is never closed explicitly
    filenames_cache = json.load(open(cache_filename, "r"))

    seed_gen = seed_generator()
    for dataset in sorted(datasets):
        dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
        try:
            filenames_all = filenames_cache[dataset_name + "_" + dataset_era]
        except KeyError as e:
            print(
                "Could not load {0} from {1}, please make sure this dataset has been added to cache"
                .format(dataset_name + "_" + dataset_era, cache_filename),
                file=sys.stderr)
            raise e

        # Cached names are joined to datapath with an explicit "/"
        filenames_all_full = [datapath + "/" + fn for fn in filenames_all]
        print(
            "Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles"
            .format(dataset_name, dataset_era, len(filenames_all_full),
                    chunksize))
        jobfile_dataset = create_dataset_jobfiles(dataset_name, dataset_era,
                                                  filenames_all_full, is_mc,
                                                  chunksize, outpath, seed_gen)
        jobfile_data += jobfile_dataset
        print("Dataset {0}_{1} consists of {2} chunks".format(
            dataset_name, dataset_era, len(jobfile_dataset)))

    assert (len(jobfile_data) > 0)
    assert (len(jobfile_data[0]["filenames"]) > 0)
    # NOTE(review): everything below runs once, after the loop, and therefore
    # operates on the variables of the LAST dataset only. It re-globs the
    # files, overwrites that dataset's in-memory cache entry and creates its
    # jobfiles a second time - presumably not intended here; verify.
    filenames_all = glob.glob(datapath + dataset_globpattern, recursive=True)
    # Skip any path that contains the substring "Friend"
    filenames_all = [fn for fn in filenames_all if not "Friend" in fn]
    filenames_cache[dataset_name + "_" + dataset_era] = [
        fn.replace(datapath, "") for fn in filenames_all
    ]

    if len(filenames_all) == 0:
        raise Exception(
            "Dataset {0} matched 0 files from glob pattern {1}, verify that the data files are located in {2}"
            .format(dataset_name, dataset_globpattern, datapath))

    # The key was assigned a few lines above, so this lookup cannot raise
    # KeyError at this point.
    try:
        filenames_all = filenames_cache[dataset_name + "_" + dataset_era]
    except KeyError as e:
        print(
            "Could not load {0}, please make sure this dataset has been added to cache"
            .format(dataset_name + "_" + dataset_era))
        raise e

    # Unlike in the loop above, no "/" separator is inserted here
    filenames_all_full = [datapath + fn for fn in filenames_all]
    # Overrides the chunksize argument with the per-dataset multiplier
    # (1 when the dataset has no entry in chunksize_multiplier)
    chunksize = chunksize_multiplier.get(dataset_name, 1)
    print(
        "Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles"
        .format(dataset_name, dataset_era, len(filenames_all_full), chunksize))
    # NOTE(review): called with 6 positional arguments, while the call inside
    # the loop passes 7 (including seed_gen) - confirm the helper's signature
    jobfile_dataset = create_dataset_jobfiles(dataset_name, dataset_era,
                                              filenames_all_full, is_mc,
                                              chunksize, "./")
    jobfile_data += jobfile_dataset
    print("Dataset {0}_{1} consists of {2} chunks".format(
        dataset_name, dataset_era, len(jobfile_dataset)))
def main(args, datasets):
    """Run the requested analysis steps ("cache", "analyze", "merge").

    The function
      1. filters ``datasets`` by ``args.datasets`` / ``args.eras``,
      2. builds the dictionary of analysis parameters and pickles it to
         ``<args.out>/partial_results/parameters.pkl``,
      3. for the "cache" action without explicit jobfiles, globs the dataset
         files and writes ``<args.cache_location>/datasets.json``,
      4. creates the per-dataset jobfiles and loads the ones to process,
      5. dispatches to ``run_cache``, ``run_analysis`` and
         ``merge_partial_results`` depending on ``args.action``,
      6. optionally saves a yappi profile and prints the peak memory usage.

    Args:
        args: Parsed command-line options. The attributes read here are
            do_profile, disable_tensorflow, action, pinned, do_sync,
            datasets, eras, do_factorized_jec, out, cache_location,
            jobfiles, jobfiles_load, datapath, chunksize, maxchunks and
            nthreads. ``args.use_cuda`` is overwritten with ``USE_CUPY`` and
            ``args.jobfiles`` may be filled in.
        datasets: Iterable of 4-tuples
            ``(name, era, globpattern, is_mc)``; replaced by the module-level
            ``datasets_sync`` when ``args.do_sync`` is set.

    Raises:
        Exception: If no dataset passes the name/era filter, if a dataset
            glob matches no files, or if the filename cache is missing.
        KeyError: If a dataset is not present in the filename cache.
        SystemExit: If the cache file to be written already exists.
    """
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    #use the environment variable for cupy/cuda choice
    args.use_cuda = USE_CUPY

    analysis_corrections = None
    if "analyze" in args.action:
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)

    # Optionally disable pinned memory (will be somewhat slower)
    if args.use_cuda:
        import cupy
        if not args.pinned:
            cupy.cuda.set_allocator(None)
            cupy.cuda.set_pinned_memory_allocator(None)

    #Use sync-only datasets
    if args.do_sync:
        datasets = datasets_sync

    #Filter datasets by name and era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds[0] in args.datasets:
            if args.eras is None or ds[1] in args.eras:
                datasets_to_process.append(ds)
                print("Will consider dataset", ds)
    # The check must look at the filtered list: the unfiltered input being
    # non-empty says nothing about whether the options selected anything.
    if not datasets_to_process:
        raise Exception("No datasets considered, please check the --datasets and --eras options")
    datasets = datasets_to_process

    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB
    NUMPY_LIB = hmumu_utils.NUMPY_LIB

    # All analysis definitions (cut values etc) should go here
    analysis_parameters = {
        "baseline": {

            "nPV": 0,
            "NdfPV": 4,
            "zPV": 24,

            # Will be applied with OR
            "hlt_bits": {
                "2016": ["HLT_IsoMu24", "HLT_IsoTkMu24"],
                "2017": ["HLT_IsoMu27"],
                "2018": ["HLT_IsoMu24"],
                },

            "muon_pt": 20,
            "muon_pt_leading": {"2016": 26.0, "2017": 29.0, "2018": 26.0},
            "muon_eta": 2.4,
            "muon_iso": 0.25,
            "muon_id": {"2016": "medium", "2017": "medium", "2018": "medium"},
            "muon_trigger_match_dr": 0.1,
            "muon_iso_trigger_matched": 0.15,
            "muon_id_trigger_matched": {"2016": "tight", "2017": "tight", "2018": "tight"},

            "do_rochester_corrections": True,
            "do_lepton_sf": True,

            "do_jec": True,
            "jec_tag": {"2016": "Summer16_07Aug2017_V11", "2017": "Fall17_17Nov2017_V32", "2018": "Autumn18_V16"},
            "jet_mu_dr": 0.4,
            "jet_pt_leading": {"2016": 35.0, "2017": 35.0, "2018": 35.0},
            "jet_pt_subleading": {"2016": 25.0, "2017": 25.0, "2018": 25.0},
            "jet_eta": 4.7,
            "jet_id": "tight",
            "jet_puid": "loose",
            "jet_veto_eta": [2.65, 3.139],
            "jet_veto_raw_pt": 50.0,
            "jet_btag": {"2016": 0.6321, "2017": 0.4941, "2018": 0.4184},
            "do_factorized_jec": args.do_factorized_jec,

            "cat5_dijet_inv_mass": 400.0,
            "cat5_abs_jj_deta_cut": 2.5,

            "masswindow_z_peak": [76, 106],
            "masswindow_h_sideband": [110, 150],
            "masswindow_h_peak": [115, 135],

            "inv_mass_bins": 41,

            "extra_electrons_pt": 20,
            "extra_electrons_eta": 2.5,
            "extra_electrons_iso": 0.4, #Check if we want to apply this
            "extra_electrons_id": "mvaFall17V1Iso_WP90",

            "save_dnn_vars": True,
            "dnn_vars_path": "{0}/dnn_vars".format(args.out),

            #If true, apply mjj > cut, otherwise inverse
            "vbf_filter_mjj_cut": 350,
            "vbf_filter": {
                "dy_m105_160_mg": True,
                "dy_m105_160_amc": True,
                "dy_m105_160_vbf_mg": False,
                "dy_m105_160_vbf_amc": False,
            },

            #Irene's DNN input variable order for keras
            "dnn_varlist_order": ['softJet5', 'dRmm','dEtamm','M_jj','pt_jj','eta_jj','phi_jj','M_mmjj','eta_mmjj','phi_mmjj','dEta_jj','Zep','dRmin_mj', 'dRmax_mj', 'dRmin_mmj','dRmax_mmj','dPhimm','leadingJet_pt','subleadingJet_pt', 'leadingJet_eta','subleadingJet_eta','leadingJet_qgl','subleadingJet_qgl','cthetaCS','Higgs_pt','Higgs_eta','Higgs_mass'],
            # Each entry is (low edge, high edge, number of linspace points)
            "dnn_input_histogram_bins": {
                "softJet5": (0,10,10),
                "dRmm": (0,5,41),
                "dEtamm": (-2,2,41),
                "dPhimm": (-2,2,41),
                "M_jj": (0,2000,41),
                "pt_jj": (0,400,41),
                "eta_jj": (-5,5,41),
                "phi_jj": (-5,5,41),
                "M_mmjj": (0,2000,41),
                "eta_mmjj": (-3,3,41),
                "phi_mmjj": (-3,3,41),
                "dEta_jj": (-3,3,41),
                "Zep": (-2,2,41),
                "dRmin_mj": (0,5,41),
                "dRmax_mj": (0,5,41),
                "dRmin_mmj": (0,5,41),
                "dRmax_mmj": (0,5,41),
                "leadingJet_pt": (0, 200, 41),
                "subleadingJet_pt": (0, 200, 41),
                "leadingJet_eta": (-5, 5, 41),
                "subleadingJet_eta": (-5, 5, 41),
                "leadingJet_qgl": (0, 1, 41),
                "subleadingJet_qgl": (0, 1, 41),
                "cthetaCS": (-1, 1, 41),
                "Higgs_pt": (0, 200, 41),
                "Higgs_eta": (-3, 3, 41),
                "Higgs_mass": (110, 150, 41),
                "dnn_pred": (0, 1, 1001),
                "dnn_pred2": (0, 1, 11),
                "bdt_ucsd": (-1, 1, 41),
                "bdt2j_ucsd": (-1, 1, 41),
                "bdt01j_ucsd": (-1, 1, 41),
                "MET_pt": (0, 200, 41),
                "hmmthetacs": (-1, 1, 41),
                "hmmphics": (-4, 4, 41),
            },

            "categorization_trees": {}
        },
    }
    histo_bins = {
        "muon_pt": np.linspace(0, 200, 101, dtype=np.float32),
        "npvs": np.linspace(0,100,101, dtype=np.float32),
        "dijet_inv_mass": np.linspace(0, 2000, 41, dtype=np.float32),
        "inv_mass": np.linspace(70, 150, 41, dtype=np.float32),
        "numjet": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_pt": np.linspace(0, 300, 101, dtype=np.float32),
        "jet_eta": np.linspace(-4.7, 4.7, 41, dtype=np.float32),
        "pt_balance": np.linspace(0, 5, 41, dtype=np.float32),
        "numjets": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_qgl": np.linspace(0, 1, 41, dtype=np.float32),
        "higgs_inv_mass_uncertainty": np.linspace(0, 10, 101, dtype=np.float32),
        "higgs_rel_inv_mass_uncertainty": np.linspace(0, 0.05, 101, dtype=np.float32)
    }
    for hname, bins in analysis_parameters["baseline"]["dnn_input_histogram_bins"].items():
        histo_bins[hname] = np.linspace(bins[0], bins[1], bins[2], dtype=np.float32)

    for masswindow in ["z_peak", "h_peak", "h_sideband"]:
        mw = analysis_parameters["baseline"]["masswindow_" + masswindow]
        histo_bins["inv_mass_{0}".format(masswindow)] = np.linspace(mw[0], mw[1], 41, dtype=np.float32)

    # Replaces the uniform "dnn_pred2" binning set in the loop above with
    # explicit bin edges per mass window
    histo_bins["dnn_pred2"] = {
        "h_peak": np.array([0., 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 1.0], dtype=np.float32),
        "z_peak": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], dtype=np.float32),
        "h_sideband": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0], dtype=np.float32),
    }

    analysis_parameters["baseline"]["histo_bins"] = histo_bins

    #analysis_parameters["oldjec"] = copy.deepcopy(analysis_parameters["baseline"])
    #analysis_parameters["oldjec"]["jec_tag"]["2018"] = "Autumn18_V8"

    #Run baseline analysis
    outpath = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath)
    except FileExistsError:
        pass

    with open('{0}/parameters.pkl'.format(outpath), 'wb') as handle:
        pickle.dump(analysis_parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    #Recreate dump of all filenames
    cache_filename = args.cache_location + "/datasets.json"
    if ("cache" in args.action) and (args.jobfiles is None):
        print("--action cache and no jobfiles specified, creating datasets.json dump of all filenames")
        if not os.path.isdir(args.cache_location):
            os.makedirs(args.cache_location)
        filenames_cache = {}
        for dataset in datasets:
            dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
            filenames_all = glob.glob(args.datapath + dataset_globpattern, recursive=True)
            # Skip any path that contains the substring "Friend"
            filenames_all = [fn for fn in filenames_all if "Friend" not in fn]
            # Store the names relative to datapath
            filenames_cache[dataset_name + "_" + dataset_era] = [
                fn.replace(args.datapath, "") for fn in filenames_all]

            if len(filenames_all) == 0:
                raise Exception("Dataset {0} matched 0 files from glob pattern {1}, verify that the data files are located in {2}".format(
                    dataset_name, dataset_globpattern, args.datapath
                ))

        #save all dataset filenames to a json file
        print("Creating a json dump of all the dataset filenames based on data found in {0}".format(args.datapath))
        if os.path.isfile(cache_filename):
            print("Cache file {0} already exists, we will not overwrite it to be safe.".format(cache_filename), file=sys.stderr)
            print("Delete it or change --cache-location and try again.", file=sys.stderr)
            sys.exit(1)
        with open(cache_filename, "w") as fi:
            fi.write(json.dumps(filenames_cache, indent=2))

    if ("cache" in args.action or "analyze" in args.action) and (args.jobfiles is None):
        #Create a list of job files for processing
        jobfile_data = []
        print("Loading list of filenames from {0}".format(cache_filename))
        if not os.path.isfile(cache_filename):
            raise Exception("Cached dataset list of filenames not found in {0}, please run this code with --action cache".format(
                cache_filename))
        with open(cache_filename, "r") as fi:
            filenames_cache = json.load(fi)

        for dataset in datasets:
            dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
            try:
                filenames_all = filenames_cache[dataset_name + "_" + dataset_era]
            except KeyError:
                print("Could not load {0} from {1}, please make sure this dataset has been added to cache".format(
                    dataset_name + "_" + dataset_era, cache_filename), file=sys.stderr)
                raise

            filenames_all_full = [args.datapath + "/" + fn for fn in filenames_all]
            chunksize = args.chunksize * chunksize_multiplier.get(dataset_name, 1)
            print("Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles".format(
                dataset_name, dataset_era, len(filenames_all_full), chunksize))
            jobfile_dataset = create_dataset_jobfiles(dataset_name, dataset_era,
                filenames_all_full, is_mc, chunksize, args.out)
            jobfile_data += jobfile_dataset
            print("Dataset {0}_{1} consists of {2} chunks".format(
                dataset_name, dataset_era, len(jobfile_dataset)))

        assert(len(jobfile_data) > 0)
        assert(len(jobfile_data[0]["filenames"]) > 0)

    #For each dataset, find out which chunks we want to process
    if "cache" in args.action or "analyze" in args.action:
        if args.jobfiles_load is not None:
            # One jobfile path per line
            with open(args.jobfiles_load) as fi:
                args.jobfiles = [line.strip() for line in fi]
        if args.jobfiles is None:
            print("You did not specify to process specific dataset chunks, assuming you want to process all chunks")
            print("If this is not true, please specify e.g. --jobfiles data_2018_0.json data_2018_1.json ...")
            args.jobfiles = []
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                jobfiles_dataset = glob.glob(args.out + "/jobfiles/{0}_{1}_*.json".format(dataset_name, dataset_era))
                assert(len(jobfiles_dataset) > 0)
                if args.maxchunks > 0:
                    jobfiles_dataset = jobfiles_dataset[:args.maxchunks]
                args.jobfiles += jobfiles_dataset

        #Now load the jobfiles
        assert(len(args.jobfiles) > 0)
        print("You specified --jobfiles {0}, processing only these dataset chunks".format(" ".join(args.jobfiles)))
        jobfile_data = []
        for f in args.jobfiles:
            with open(f) as fi:
                jobfile_data.append(json.load(fi))

        chunkstr = " ".join(["{0}_{1}_{2}".format(
            ch["dataset_name"], ch["dataset_era"], ch["dataset_num_chunk"])
            for ch in jobfile_data])
        print("Will process {0} dataset chunks: {1}".format(len(jobfile_data), chunkstr))
        assert(len(jobfile_data) > 0)

    #Start the profiler only in the actual data processing
    if do_prof:
        import yappi
        filename = 'analysis.prof'
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    if "cache" in args.action:
        print("Running the 'cache' step of the analysis, ROOT files will be opened and branches will be uncompressed")
        print("Will retrieve dataset filenames based on existing ROOT files on filesystem in datapath={0}".format(args.datapath))

        # Use the options passed to this function rather than a module-level
        # name; creation stays best-effort, but a failure is now reported.
        try:
            os.makedirs(args.cache_location, exist_ok=True)
        except OSError as e:
            print("Could not create cache directory {0}: {1}".format(args.cache_location, e), file=sys.stderr)

        run_cache(args, outpath, jobfile_data, analysis_parameters)

    if "analyze" in args.action:
        run_analysis(args, outpath, jobfile_data, analysis_parameters, analysis_corrections)

    if "merge" in args.action:
        # NOTE: the returned futures are not inspected, so an exception raised
        # inside merge_partial_results is not propagated to this process.
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                executor.submit(merge_partial_results, dataset_name, dataset_era, outpath)
        print("done merging")
    if do_prof:
        stats = yappi.get_func_stats()
        stats.save(filename, type='callgrind')

    import resource
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory/1024))