def create_all_jobfiles(datasets: List[Dict], cache_filename: str, datapath: str, chunksize: int, outpath: str) -> None:
    """Splits the datasets into job descriptions, specifying how many files will be processed per job.

    The job descriptions are written by ``create_dataset_jobfiles`` into the
    ``<outpath>/jobfiles`` directory for batch processing. If that directory
    already exists, nothing is done.

    The filename cache is validated and loaded *before* the jobfiles directory
    is created, so that a missing or unreadable cache does not leave behind an
    empty directory that would make every later call skip jobfile creation.

    Args:
        datasets (List[Dict]): The datasets for which to create the job files;
            each entry is read through its "name", "era" and "is_mc" keys
        cache_filename (str): Input json filename where the filenames for each dataset are loaded from
        datapath (str): Path to load the data from, prepended to every cached filename
        chunksize (int): Number of files to process in each job
        outpath (str): Path with the output directory where the jobfiles will be stored

    Raises:
        Exception: If ``cache_filename`` does not exist.
        KeyError: If a dataset is missing from the filename cache.
        AssertionError: If no job descriptions were created, or the first one lists no files.
    """
    jobfile_path = outpath + "/jobfiles"
    if os.path.isdir(jobfile_path):
        print(
            "Jobfiles directory {0} already exists, skipping jobfile creation".
            format(jobfile_path))
        return

    print("Loading list of filenames from {0}".format(cache_filename))
    if not os.path.isfile(cache_filename):
        raise Exception(
            "Cached dataset list of filenames not found in {0}, please run this code with --action cache"
            .format(cache_filename))
    # Close the cache file deterministically instead of leaking the handle
    with open(cache_filename, "r") as cache_file:
        filenames_cache = json.load(cache_file)

    # Only create the output directory once the inputs are known to be usable
    os.makedirs(jobfile_path)

    #Create a list of job files for processing
    jobfile_data = []
    seed_gen = seed_generator()

    # Sort by (name, era) so the datasets are visited in a reproducible order
    for dataset in sorted(datasets, key=lambda x: (x["name"], x["era"])):
        dataset_name = dataset["name"]
        dataset_era = dataset["era"]
        is_mc = dataset["is_mc"]
        cache_key = dataset_name + "_" + dataset_era

        try:
            filenames_all = filenames_cache[cache_key]
        except KeyError:
            print(
                "Could not load {0} from {1}, please make sure this dataset has been added to cache"
                .format(cache_key, cache_filename),
                file=sys.stderr)
            raise

        filenames_all_full = [datapath + "/" + fn for fn in filenames_all]
        print(
            "Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles"
            .format(dataset_name, dataset_era, len(filenames_all_full), chunksize))
        jobfile_dataset = create_dataset_jobfiles(
            dataset_name, dataset_era, filenames_all_full, is_mc, chunksize,
            jobfile_path, seed_gen)
        jobfile_data += jobfile_dataset
        print("Dataset {0}_{1} consists of {2} chunks".format(
            dataset_name, dataset_era, len(jobfile_dataset)))

    assert (len(jobfile_data) > 0)
    assert (len(jobfile_data[0]["filenames"]) > 0)
def create_all_jobfiles(datasets, cache_filename, datapath, chunksize, outpath):
    """Create the per-chunk job description files for all datasets.

    The job files are written by ``create_dataset_jobfiles`` into
    ``<outpath>/jobfiles``. If that directory already exists the function
    prints a notice and returns without doing anything.

    The filename cache is checked and loaded before the jobfiles directory is
    created; otherwise a missing cache would leave an empty directory behind
    and all subsequent calls would skip jobfile creation.

    Args:
        datasets: Iterable of dicts, each read through its "name", "era" and "is_mc" keys
        cache_filename: json file mapping "<name>_<era>" to a list of filenames
        datapath: Path prepended (with a "/") to every cached filename
        chunksize: Number of files per job, forwarded to ``create_dataset_jobfiles``
        outpath: Output directory in which the "jobfiles" directory is created

    Raises:
        Exception: If ``cache_filename`` does not exist.
        KeyError: If a dataset is missing from the filename cache.
        AssertionError: If no job descriptions were created, or the first one lists no files.
    """
    jobfile_path = outpath + "/jobfiles"
    if os.path.isdir(jobfile_path):
        print("Jobfiles directory {0} already exists, skipping jobfile creation".format(jobfile_path))
        return

    print("Loading list of filenames from {0}".format(cache_filename))
    if not os.path.isfile(cache_filename):
        raise Exception("Cached dataset list of filenames not found in {0}, please run this code with --action cache".format(
            cache_filename))
    # Use a context manager so the cache file handle is closed right away
    with open(cache_filename, "r") as cache_file:
        filenames_cache = json.load(cache_file)

    # Create the directory only after the cache was loaded successfully
    os.makedirs(jobfile_path)

    #Create a list of job files for processing
    jobfile_data = []
    seed_gen = seed_generator()

    # Sorted by (name, era) so the processing order is reproducible
    for dataset in sorted(datasets, key=lambda x: (x["name"], x["era"])):
        dataset_name = dataset["name"]
        dataset_era = dataset["era"]
        is_mc = dataset["is_mc"]
        cache_key = dataset_name + "_" + dataset_era

        try:
            filenames_all = filenames_cache[cache_key]
        except KeyError:
            print("Could not load {0} from {1}, please make sure this dataset has been added to cache".format(
                cache_key, cache_filename), file=sys.stderr)
            raise

        filenames_all_full = [datapath + "/" + fn for fn in filenames_all]
        print("Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles".format(
            dataset_name, dataset_era, len(filenames_all_full), chunksize))
        jobfile_dataset = create_dataset_jobfiles(
            dataset_name, dataset_era, filenames_all_full, is_mc, chunksize, jobfile_path, seed_gen)
        jobfile_data += jobfile_dataset
        print("Dataset {0}_{1} consists of {2} chunks".format(
            dataset_name, dataset_era, len(jobfile_dataset)))

    assert(len(jobfile_data) > 0)
    assert(len(jobfile_data[0]["filenames"]) > 0)
def create_all_jobfiles(datasets, cache_filename, datapath, chunksize, outpath):
    """Create the per-chunk job description files for all datasets.

    Args:
        datasets: Iterable of 4-tuples ``(name, era, globpattern, is_mc)``;
            the glob pattern is not used here because the filenames come from the cache
        cache_filename: json file mapping "<name>_<era>" to a list of filenames
        datapath: Path prepended (with a "/") to every cached filename
        chunksize: Number of files per job, forwarded to ``create_dataset_jobfiles``
        outpath: Output path forwarded to ``create_dataset_jobfiles``

    Raises:
        Exception: If ``cache_filename`` does not exist.
        KeyError: If a dataset is missing from the filename cache.
        AssertionError: If no job descriptions were created, or the first one lists no files.
    """
    #Create a list of job files for processing
    jobfile_data = []

    print("Loading list of filenames from {0}".format(cache_filename))
    if not os.path.isfile(cache_filename):
        raise Exception(
            "Cached dataset list of filenames not found in {0}, please run this code with --action cache"
            .format(cache_filename))
    # Use a context manager so the cache file handle is closed right away
    with open(cache_filename, "r") as cache_file:
        filenames_cache = json.load(cache_file)

    seed_gen = seed_generator()

    # Sorting the tuples gives a reproducible processing order
    for dataset_name, dataset_era, _, is_mc in sorted(datasets):
        cache_key = dataset_name + "_" + dataset_era
        try:
            filenames_all = filenames_cache[cache_key]
        except KeyError:
            print(
                "Could not load {0} from {1}, please make sure this dataset has been added to cache"
                .format(cache_key, cache_filename),
                file=sys.stderr)
            raise

        filenames_all_full = [datapath + "/" + fn for fn in filenames_all]
        print(
            "Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles"
            .format(dataset_name, dataset_era, len(filenames_all_full), chunksize))
        jobfile_dataset = create_dataset_jobfiles(
            dataset_name, dataset_era, filenames_all_full, is_mc, chunksize,
            outpath, seed_gen)
        jobfile_data += jobfile_dataset
        print("Dataset {0}_{1} consists of {2} chunks".format(
            dataset_name, dataset_era, len(jobfile_dataset)))

    assert (len(jobfile_data) > 0)
    assert (len(jobfile_data[0]["filenames"]) > 0)
# Find the files of this dataset on disk, dropping every path that contains "Friend"
filenames_all = [
    path for path in glob.glob(datapath + dataset_globpattern, recursive=True)
    if "Friend" not in path
]

# Store the paths in the cache with the datapath prefix removed
filenames_cache[dataset_name + "_" + dataset_era] = [
    path.replace(datapath, "") for path in filenames_all
]

if not filenames_all:
    raise Exception(
        "Dataset {0} matched 0 files from glob pattern {1}, verify that the data files are located in {2}"
        .format(dataset_name, dataset_globpattern, datapath))

# Read the list back from the cache
try:
    filenames_all = filenames_cache[dataset_name + "_" + dataset_era]
except KeyError as err:
    print(
        "Could not load {0}, please make sure this dataset has been added to cache"
        .format(dataset_name + "_" + dataset_era))
    raise err

# Re-attach the datapath prefix to get the full paths
filenames_all_full = [datapath + name for name in filenames_all]

# Per-dataset chunk size, 1 when the dataset has no entry in chunksize_multiplier
chunksize = chunksize_multiplier.get(dataset_name, 1)
print(
    "Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles"
    .format(dataset_name, dataset_era, len(filenames_all_full), chunksize))

jobfile_dataset = create_dataset_jobfiles(
    dataset_name,
    dataset_era,
    filenames_all_full,
    is_mc,
    chunksize,
    "./",
)
jobfile_data += jobfile_dataset

print("Dataset {0}_{1} consists of {2} chunks".format(
    dataset_name, dataset_era, len(jobfile_dataset)))
def main(args, datasets):
    """Run the requested analysis steps ("cache", "analyze", "merge") on the selected datasets.

    The function
      1. selects the backend and filters ``datasets`` by ``args.datasets`` / ``args.eras``,
      2. builds the analysis parameters and pickles them to ``<args.out>/partial_results/parameters.pkl``,
      3. for the "cache" action without explicit jobfiles, dumps all dataset filenames to
         ``<args.cache_location>/datasets.json``,
      4. for "cache"/"analyze" without explicit jobfiles, creates the jobfiles from that dump,
      5. loads the jobfiles to process and runs the cache / analyze / merge steps,
      6. optionally saves profiler statistics and prints the peak memory usage.

    Args:
        args: Parsed command line options. The attributes read here are do_profile,
            disable_tensorflow, action, pinned, do_sync, datasets, eras,
            do_factorized_jec, out, cache_location, jobfiles, jobfiles_load, datapath,
            chunksize, maxchunks and nthreads. ``args.use_cuda`` is overwritten with
            ``USE_CUPY`` and ``args.jobfiles`` may be filled in.
        datasets: Iterable of 4-tuples ``(name, era, globpattern, is_mc)``. Replaced by
            ``datasets_sync`` when ``args.do_sync`` is set.

    Raises:
        Exception: If no dataset passes the --datasets/--eras filter, if a dataset
            matches no files, or if the filename cache is missing.
        KeyError: If a dataset is missing from the filename cache.
        SystemExit: If the filename cache would be overwritten.
        AssertionError: If no jobfiles are found or created.
    """
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    #use the environment variable for cupy/cuda choice
    args.use_cuda = USE_CUPY

    analysis_corrections = None
    if "analyze" in args.action:
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)

    # Optionally disable pinned memory (will be somewhat slower)
    if args.use_cuda:
        import cupy
        if not args.pinned:
            cupy.cuda.set_allocator(None)
            cupy.cuda.set_pinned_memory_allocator(None)

    #Use sync-only datasets
    if args.do_sync:
        datasets = datasets_sync

    #Filter datasets by name and era
    datasets_to_process = []
    for ds in datasets:
        name_selected = args.datasets is None or ds[0] in args.datasets
        era_selected = args.eras is None or ds[1] in args.eras
        if name_selected and era_selected:
            datasets_to_process += [ds]
            print("Will consider dataset", ds)
    # The check has to look at the filtered list, otherwise a filter that
    # matches nothing would go unnoticed
    if not datasets_to_process:
        raise Exception("No datasets considered, please check the --datasets and --eras options")
    datasets = datasets_to_process

    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB

    # All analysis definitions (cut values etc) should go here
    analysis_parameters = {
        "baseline": {

            "nPV": 0,
            "NdfPV": 4,
            "zPV": 24,

            # Will be applied with OR
            "hlt_bits": {
                "2016": ["HLT_IsoMu24", "HLT_IsoTkMu24"],
                "2017": ["HLT_IsoMu27"],
                "2018": ["HLT_IsoMu24"],
            },

            "muon_pt": 20,
            "muon_pt_leading": {"2016": 26.0, "2017": 29.0, "2018": 26.0},
            "muon_eta": 2.4,
            "muon_iso": 0.25,
            "muon_id": {"2016": "medium", "2017": "medium", "2018": "medium"},
            "muon_trigger_match_dr": 0.1,
            "muon_iso_trigger_matched": 0.15,
            "muon_id_trigger_matched": {"2016": "tight", "2017": "tight", "2018": "tight"},

            "do_rochester_corrections": True,
            "do_lepton_sf": True,

            "do_jec": True,
            "jec_tag": {"2016": "Summer16_07Aug2017_V11", "2017": "Fall17_17Nov2017_V32", "2018": "Autumn18_V16"},
            "jet_mu_dr": 0.4,
            "jet_pt_leading": {"2016": 35.0, "2017": 35.0, "2018": 35.0},
            "jet_pt_subleading": {"2016": 25.0, "2017": 25.0, "2018": 25.0},
            "jet_eta": 4.7,
            "jet_id": "tight",
            "jet_puid": "loose",
            "jet_veto_eta": [2.65, 3.139],
            "jet_veto_raw_pt": 50.0,
            "jet_btag": {"2016": 0.6321, "2017": 0.4941, "2018": 0.4184},
            "do_factorized_jec": args.do_factorized_jec,

            "cat5_dijet_inv_mass": 400.0,
            "cat5_abs_jj_deta_cut": 2.5,

            "masswindow_z_peak": [76, 106],
            "masswindow_h_sideband": [110, 150],
            "masswindow_h_peak": [115, 135],

            "inv_mass_bins": 41,

            "extra_electrons_pt": 20,
            "extra_electrons_eta": 2.5,
            "extra_electrons_iso": 0.4, #Check if we want to apply this
            "extra_electrons_id": "mvaFall17V1Iso_WP90",

            "save_dnn_vars": True,
            "dnn_vars_path": "{0}/dnn_vars".format(args.out),

            #If true, apply mjj > cut, otherwise inverse
            "vbf_filter_mjj_cut": 350,
            "vbf_filter": {
                "dy_m105_160_mg": True,
                "dy_m105_160_amc": True,
                "dy_m105_160_vbf_mg": False,
                "dy_m105_160_vbf_amc": False,
            },

            #Irene's DNN input variable order for keras
            "dnn_varlist_order": ['softJet5', 'dRmm','dEtamm','M_jj','pt_jj','eta_jj','phi_jj','M_mmjj','eta_mmjj','phi_mmjj','dEta_jj','Zep','dRmin_mj', 'dRmax_mj',
                'dRmin_mmj','dRmax_mmj','dPhimm','leadingJet_pt','subleadingJet_pt',
                'leadingJet_eta','subleadingJet_eta','leadingJet_qgl','subleadingJet_qgl','cthetaCS','Higgs_pt','Higgs_eta','Higgs_mass'],

            # (start, stop, number of points) passed to np.linspace below
            "dnn_input_histogram_bins": {
                "softJet5": (0,10,10),
                "dRmm": (0,5,41),
                "dEtamm": (-2,2,41),
                "dPhimm": (-2,2,41),
                "M_jj": (0,2000,41),
                "pt_jj": (0,400,41),
                "eta_jj": (-5,5,41),
                "phi_jj": (-5,5,41),
                "M_mmjj": (0,2000,41),
                "eta_mmjj": (-3,3,41),
                "phi_mmjj": (-3,3,41),
                "dEta_jj": (-3,3,41),
                "Zep": (-2,2,41),
                "dRmin_mj": (0,5,41),
                "dRmax_mj": (0,5,41),
                "dRmin_mmj": (0,5,41),
                "dRmax_mmj": (0,5,41),
                "leadingJet_pt": (0, 200, 41),
                "subleadingJet_pt": (0, 200, 41),
                "leadingJet_eta": (-5, 5, 41),
                "subleadingJet_eta": (-5, 5, 41),
                "leadingJet_qgl": (0, 1, 41),
                "subleadingJet_qgl": (0, 1, 41),
                "cthetaCS": (-1, 1, 41),
                "Higgs_pt": (0, 200, 41),
                "Higgs_eta": (-3, 3, 41),
                "Higgs_mass": (110, 150, 41),
                "dnn_pred": (0, 1, 1001),
                "dnn_pred2": (0, 1, 11),
                "bdt_ucsd": (-1, 1, 41),
                "bdt2j_ucsd": (-1, 1, 41),
                "bdt01j_ucsd": (-1, 1, 41),
                "MET_pt": (0, 200, 41),
                "hmmthetacs": (-1, 1, 41),
                "hmmphics": (-4, 4, 41),
            },

            "categorization_trees": {}
        },
    }

    histo_bins = {
        "muon_pt": np.linspace(0, 200, 101, dtype=np.float32),
        "npvs": np.linspace(0,100,101, dtype=np.float32),
        "dijet_inv_mass": np.linspace(0, 2000, 41, dtype=np.float32),
        "inv_mass": np.linspace(70, 150, 41, dtype=np.float32),
        "numjet": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_pt": np.linspace(0, 300, 101, dtype=np.float32),
        "jet_eta": np.linspace(-4.7, 4.7, 41, dtype=np.float32),
        "pt_balance": np.linspace(0, 5, 41, dtype=np.float32),
        "numjets": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_qgl": np.linspace(0, 1, 41, dtype=np.float32),
        "higgs_inv_mass_uncertainty": np.linspace(0, 10, 101, dtype=np.float32),
        "higgs_rel_inv_mass_uncertainty": np.linspace(0, 0.05, 101, dtype=np.float32)
    }
    for hname, bins in analysis_parameters["baseline"]["dnn_input_histogram_bins"].items():
        histo_bins[hname] = np.linspace(bins[0], bins[1], bins[2], dtype=np.float32)

    for masswindow in ["z_peak", "h_peak", "h_sideband"]:
        mw = analysis_parameters["baseline"]["masswindow_" + masswindow]
        histo_bins["inv_mass_{0}".format(masswindow)] = np.linspace(mw[0], mw[1], 41, dtype=np.float32)

    # Overrides the uniform "dnn_pred2" binning set above with explicit bin edges per mass window
    histo_bins["dnn_pred2"] = {
        "h_peak": np.array([0., 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 1.0], dtype=np.float32),
        "z_peak": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], dtype=np.float32),
        "h_sideband": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0], dtype=np.float32),
    }

    analysis_parameters["baseline"]["histo_bins"] = histo_bins

    #analysis_parameters["oldjec"] = copy.deepcopy(analysis_parameters["baseline"])
    #analysis_parameters["oldjec"]["jec_tag"]["2018"] = "Autumn18_V8"

    #Run baseline analysis
    outpath = "{0}/partial_results".format(args.out)
    os.makedirs(outpath, exist_ok=True)

    with open('{0}/parameters.pkl'.format(outpath), 'wb') as handle:
        pickle.dump(analysis_parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    #Recreate dump of all filenames
    cache_filename = args.cache_location + "/datasets.json"
    if ("cache" in args.action) and (args.jobfiles is None):
        print("--action cache and no jobfiles specified, creating datasets.json dump of all filenames")
        if not os.path.isdir(args.cache_location):
            os.makedirs(args.cache_location)
        filenames_cache = {}
        for dataset_name, dataset_era, dataset_globpattern, is_mc in datasets:
            filenames_all = glob.glob(args.datapath + dataset_globpattern, recursive=True)
            filenames_all = [fn for fn in filenames_all if "Friend" not in fn]
            # The cache stores the paths with the datapath prefix removed
            filenames_cache[dataset_name + "_" + dataset_era] = [
                fn.replace(args.datapath, "") for fn in filenames_all]

            if len(filenames_all) == 0:
                raise Exception("Dataset {0} matched 0 files from glob pattern {1}, verify that the data files are located in {2}".format(
                    dataset_name, dataset_globpattern, args.datapath
                ))

        #save all dataset filenames to a json file
        print("Creating a json dump of all the dataset filenames based on data found in {0}".format(args.datapath))
        if os.path.isfile(cache_filename):
            print("Cache file {0} already exists, we will not overwrite it to be safe.".format(cache_filename), file=sys.stderr)
            print("Delete it or change --cache-location and try again.", file=sys.stderr)
            sys.exit(1)
        with open(cache_filename, "w") as fi:
            fi.write(json.dumps(filenames_cache, indent=2))

    if ("cache" in args.action or "analyze" in args.action) and (args.jobfiles is None):
        #Create a list of job files for processing
        jobfile_data = []
        print("Loading list of filenames from {0}".format(cache_filename))
        if not os.path.isfile(cache_filename):
            raise Exception("Cached dataset list of filenames not found in {0}, please run this code with --action cache".format(
                cache_filename))
        with open(cache_filename, "r") as fi:
            filenames_cache = json.load(fi)

        for dataset_name, dataset_era, dataset_globpattern, is_mc in datasets:
            try:
                filenames_all = filenames_cache[dataset_name + "_" + dataset_era]
            except KeyError:
                print("Could not load {0} from {1}, please make sure this dataset has been added to cache".format(
                    dataset_name + "_" + dataset_era, cache_filename), file=sys.stderr)
                raise

            filenames_all_full = [args.datapath + "/" + fn for fn in filenames_all]
            # Scale the requested chunk size by the per-dataset multiplier (1 if none is set)
            chunksize = args.chunksize * chunksize_multiplier.get(dataset_name, 1)
            print("Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles".format(
                dataset_name, dataset_era, len(filenames_all_full), chunksize))
            jobfile_dataset = create_dataset_jobfiles(dataset_name, dataset_era,
                filenames_all_full, is_mc, chunksize, args.out)
            jobfile_data += jobfile_dataset
            print("Dataset {0}_{1} consists of {2} chunks".format(
                dataset_name, dataset_era, len(jobfile_dataset)))

        assert(len(jobfile_data) > 0)
        assert(len(jobfile_data[0]["filenames"]) > 0)

    #For each dataset, find out which chunks we want to process
    jobfile_data = []
    if "cache" in args.action or "analyze" in args.action:
        if args.jobfiles_load is not None:
            with open(args.jobfiles_load) as fi:
                args.jobfiles = [line.strip() for line in fi]
        if args.jobfiles is None:
            print("You did not specify to process specific dataset chunks, assuming you want to process all chunks")
            print("If this is not true, please specify e.g. --jobfiles data_2018_0.json data_2018_1.json ...")
            args.jobfiles = []
            for dataset_name, dataset_era, dataset_globpattern, is_mc in datasets:
                jobfiles_dataset = glob.glob(args.out + "/jobfiles/{0}_{1}_*.json".format(dataset_name, dataset_era))
                assert(len(jobfiles_dataset) > 0)
                if args.maxchunks > 0:
                    jobfiles_dataset = jobfiles_dataset[:args.maxchunks]
                args.jobfiles += jobfiles_dataset

        #Now load the jobfiles
        assert(len(args.jobfiles) > 0)
        print("You specified --jobfiles {0}, processing only these dataset chunks".format(" ".join(args.jobfiles)))
        for jobfile in args.jobfiles:
            with open(jobfile) as fi:
                jobfile_data.append(json.load(fi))

        chunkstr = " ".join(["{0}_{1}_{2}".format(
            ch["dataset_name"], ch["dataset_era"], ch["dataset_num_chunk"])
            for ch in jobfile_data])
        print("Will process {0} dataset chunks: {1}".format(len(jobfile_data), chunkstr))
        assert(len(jobfile_data) > 0)

    #Start the profiler only in the actual data processing
    if do_prof:
        import yappi
        filename = 'analysis.prof'
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    if "cache" in args.action:
        print("Running the 'cache' step of the analysis, ROOT files will be opened and branches will be uncompressed")
        print("Will retrieve dataset filenames based on existing ROOT files on filesystem in datapath={0}".format(args.datapath))
        # Use the options passed to this function instead of a module-level
        # name, and only tolerate the "already exists" case
        os.makedirs(args.cache_location, exist_ok=True)
        run_cache(args, outpath, jobfile_data, analysis_parameters)

    if "analyze" in args.action:
        run_analysis(args, outpath, jobfile_data, analysis_parameters, analysis_corrections)

    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            futures = [
                executor.submit(merge_partial_results, dataset_name, dataset_era, outpath)
                for dataset_name, dataset_era, _, _ in datasets
            ]
            # result() re-raises an exception from the worker process, which
            # would otherwise be dropped silently
            for fut in futures:
                fut.result()
        print("done merging")

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save(filename, type='callgrind')

    import resource
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory/1024))