def test_run_analysis_mc(self):
    import hmumu_utils
    from hmumu_utils import run_analysis

    # Use the same numerical backend (numpy or cupy) as the test fixture
    hmumu_utils.NUMPY_LIB = self.NUMPY_LIB
    hmumu_utils.ha = self.ha

    job_descriptions = [{
        "dataset_name": "vbf_sync",
        "dataset_era": "2016",
        "filenames": ["data/myNanoProdMc2016_NANO.root"],
        "is_mc": True,
        "dataset_num_chunk": 0,
        "random_seed": 0
    }]

    if not os.path.exists("test_out"):
        os.makedirs("test_out")

    ret = run_analysis(
        self.cmdline_args, "test_out", job_descriptions,
        self.analysis_parameters, self.analysis_corrections,
        numev_per_chunk=10000)

    # Check the partial result written for the first (and only) chunk
    ret2 = pickle.load(open("test_out/vbf_sync_2016_0.pkl", "rb"))
    print(ret2["num_events"])
    print(ret2["genEventSumw"])
    print(ret2["baseline"]["selected_events_dimuon"])
    self.assertAlmostEqual(ret2["num_events"], 3674)
    self.assertAlmostEqual(ret2["genEventSumw"], 4.659182940800001)
    self.assertAlmostEqual(ret2["baseline"]["selected_events_dimuon"], 1561)
def test_run_analysis_data_skim(self):
    fn = "data/nano_2016_data_skim.root"
    if not os.path.isfile(fn):
        print("Data sync file {0} not found, skipping".format(fn))
        return

    from hmumu_utils import run_analysis

    job_descriptions = [{
        "dataset_name": "data_skim",
        "dataset_era": "2016",
        "filenames": [fn],
        "is_mc": False,
        "dataset_num_chunk": 0,
        "random_seed": 0
    }]

    if not os.path.exists("test_out"):
        os.makedirs("test_out")

    ret = run_analysis(
        self.cmdline_args, "test_out", job_descriptions,
        self.analysis_parameters, self.analysis_corrections,
        numev_per_chunk=10000)

    ret2 = pickle.load(open("test_out/data_skim_2016_0.pkl", "rb"))
    print(ret2["num_events"])
    print(ret2["int_lumi"])
    print(ret2["baseline"]["selected_events_dimuon"])
    self.assertAlmostEqual(ret2["num_events"], 142)
    self.assertAlmostEqual(ret2["int_lumi"], 0.130571592)
    self.assertAlmostEqual(ret2["baseline"]["selected_events_dimuon"], 41)
def test_run_analysis_mc_skim(self):
    fn = "data/myNanoProdMc2016_NANO_skim.root"
    if not os.path.isfile(fn):
        print("File {0} not found, skipping".format(fn))
        return

    from hmumu_utils import run_analysis

    job_descriptions = [{
        "dataset_name": "vbf_sync_skim",
        "dataset_era": "2016",
        "filenames": [fn],
        "is_mc": True,
        "dataset_num_chunk": 0,
        "random_seed": 0
    }]

    if not os.path.exists("test_out"):
        os.makedirs("test_out")

    ret = run_analysis(
        self.cmdline_args, "test_out", job_descriptions,
        self.analysis_parameters, self.analysis_corrections,
        numev_per_chunk=10000)

    ret2 = pickle.load(open("test_out/vbf_sync_skim_2016_0.pkl", "rb"))
    self.assertAlmostEqual(ret2["num_events"], 73903)
    self.assertAlmostEqual(ret2["genEventSumw"], 3.7593771153623963)
    self.assertAlmostEqual(ret2["baseline"]["selected_events_dimuon"], 62172)
def test_run_analysis_mc_and_data(self):
    fn_mc = "/storage/user/nlu/hmm/automaticTest/myNanoProdMc2016_NANO.root"
    if not os.path.isfile(fn_mc):
        fn_mc = "data/myNanoProdMc2016_NANO.root"
    fn_data = "data/nano_2016_data.root"

    from hmumu_utils import run_analysis

    job_descriptions = [{
        "dataset_name": "dy",
        "dataset_era": "2016",
        "filenames": [fn_mc],
        "is_mc": True,
        "dataset_num_chunk": 0,
        "random_seed": 0
    }, {
        "dataset_name": "data",
        "dataset_era": "2016",
        "filenames": [fn_data],
        "is_mc": False,
        "dataset_num_chunk": 0,
        "random_seed": 0
    }]

    outpath = "test_out"
    if os.path.exists(outpath):
        shutil.rmtree(outpath)
    os.makedirs(outpath)

    cmdline_args = copy.deepcopy(self.cmdline_args)
    cmdline_args.out = outpath

    ret = run_analysis(
        cmdline_args, outpath, job_descriptions,
        self.analysis_parameters, self.analysis_corrections,
    )

    from plotting import make_pdf_plot
    from pars import cross_sections

    res = {"data": pickle.load(open(outpath + "/data_2016_0.pkl", "rb"))}
    analysis = "baseline"
    var = "hist__dimuon__leading_muon_pt"
    era = "2016"
    int_lumi = res["data"]["int_lumi"]
    mc_samples = ["dy"]
    process_groups = [("dy", ["dy"])]

    # Per-sample cross-section weight: xs * int_lumi / sum of generator weights
    genweights = {}
    weight_xs = {}
    for mc_samp in mc_samples:
        res[mc_samp] = pickle.load(
            open(outpath + "/{0}_2016_0.pkl".format(mc_samp), "rb"))
        genweights[mc_samp] = res[mc_samp]["genEventSumw"]
        weight_xs[mc_samp] = cross_sections[mc_samp] * int_lumi / genweights[mc_samp]

    print(genweights["dy"])
    print(int_lumi)
    self.assertAlmostEqual(genweights["dy"], 4.659182940800001)
    self.assertAlmostEqual(int_lumi, 0.130571592)

    # Make a data/MC comparison plot from the per-chunk histograms
    histos = {}
    for sample in mc_samples + ["data"]:
        histos[sample] = res[sample][analysis][var]
    hdata = res["data"][analysis][var]["nominal"]
    outdir = "{0}/{1}/plots/{2}".format(outpath, analysis, era)
    plot_args = (histos, hdata, mc_samples, analysis, var, "nominal",
                 weight_xs, int_lumi, outdir, era, process_groups, {})
    make_pdf_plot(plot_args)
def main(args):
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    # Use the environment variable for the cupy/cuda choice
    args.use_cuda = USE_CUPY

    datasets = yaml.load(open(args.datasets_yaml), Loader=yaml.FullLoader)["datasets"]

    # Filter datasets by name and era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds["name"] in args.datasets:
            if args.eras is None or ds["era"] in args.eras:
                datasets_to_process += [ds]
    if len(datasets_to_process) == 0:
        raise Exception(
            "No datasets considered, please check the --datasets and --eras options")
    datasets = datasets_to_process

    # Choose either the CPU or GPU (CUDA) backend
    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB

    outpath_partial = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath_partial)
    except FileExistsError:
        print("Output path {0} already exists, not recreating".format(outpath_partial))

    # Save the analysis parameters as a pkl file
    from pars import analysis_parameters
    for analysis_name in analysis_parameters.keys():
        analysis_parameters[analysis_name]["do_factorized_jec"] = args.do_factorized_jec
        analysis_parameters[analysis_name]["dnn_vars_path"] = "{0}/dnn_vars".format(args.out)
    with open('{0}/parameters.pkl'.format(outpath_partial), 'wb') as handle:
        pickle.dump(analysis_parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Recreate the dump of all filenames
    cache_filename = "{0}/datasets.json".format(args.out)
    use_skim = False
    if args.cachepath is None:
        print("--cachepath not specified, will process unskimmed NanoAOD, which is somewhat slower!")
        print("Please see the README.md on how to skim the NanoAOD")
        datapath = args.datapath
    else:
        print("Processing skimmed NanoAOD")
        datapath = args.cachepath
        use_skim = True
    check_and_recreate_filename_cache(cache_filename, datapath, datasets, use_skim)

    # Create the jobfiles
    if args.jobfiles is None:
        create_all_jobfiles(datasets, cache_filename, datapath, args.chunksize, args.out)

    # For each dataset, find out which chunks we want to process
    if "analyze" in args.action:
        jobfile_data = load_jobfiles(
            datasets, args.jobfiles_load, args.jobfiles, args.maxchunks, args.out)

    # If we want to check what part of the code is slow, start the profiler
    # only in the actual data processing
    if do_prof:
        import yappi
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    # Run the physics analysis on all specified jobfiles
    if "analyze" in args.action:
        print("Running the 'analyze' step of the analysis, processing the events "
              "into histograms with all systematics")
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)
        run_analysis(args, outpath_partial, jobfile_data, analysis_parameters,
                     analysis_corrections)

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save("analysis.prof", type='callgrind')

    # Merge the partial results (pieces of each dataset)
    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            for dataset in datasets:
                dataset_name = dataset["name"]
                dataset_era = dataset["era"]
                executor.submit(merge_partial_results, dataset_name, dataset_era,
                                args.out, outpath_partial)
        print("done merging")

    # Print memory usage for debugging
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory / 1024))
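# The main(args) driver above only reads a fixed set of attributes from the args
# namespace. The snippet below is a minimal sketch of such a namespace, covering
# exactly the attributes main() touches; the values (and the YAML filename) are
# purely illustrative assumptions, and the real command-line parser is defined
# elsewhere in the repository.
from argparse import Namespace

sketch_args = Namespace(
    do_profile=False, disable_tensorflow=True, do_factorized_jec=False,
    datasets_yaml="datasets.yml",   # hypothetical path to the YAML with the "datasets" list
    datasets=None, eras=None,       # None means: process all names / all eras
    out="out", datapath="/data", cachepath=None,
    jobfiles=None, jobfiles_load=None,
    chunksize=1, maxchunks=-1,
    action=["analyze", "merge"], nthreads=4,
)
# main(sketch_args) would then run the "analyze" and "merge" steps end to end.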
def test_run_analysis_mc_and_data(self):
    fn_mc = "/storage/group/allcit/store/mc/RunIISummer16NanoAODv5/DYJetsToLL_M-50_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/PUMoriond17_Nano1June2019_102X_mcRun2_asymptotic_v7_ext2-v1/120000/CBCAE1AB-4AFD-D840-BE00-9E5ABD2E4A20.root"
    fn_data = "data/nano_2016_data.root"
    if not (os.path.isfile(fn_mc) and os.path.isfile(fn_data)):
        print("file {0} or {1} not found, returning".format(fn_mc, fn_data))
        return

    from hmumu_utils import run_analysis

    job_descriptions = [{
        "dataset_name": "dy",
        "dataset_era": "2016",
        "filenames": [fn_mc],
        "is_mc": True,
        "dataset_num_chunk": 0,
        "random_seed": 0
    }, {
        "dataset_name": "data",
        "dataset_era": "2016",
        "filenames": [fn_data],
        "is_mc": False,
        "dataset_num_chunk": 0,
        "random_seed": 0
    }]

    outpath = "test_out"
    if os.path.exists(outpath):
        shutil.rmtree(outpath)
    os.makedirs(outpath)

    cmdline_args = copy.deepcopy(self.cmdline_args)
    cmdline_args.out = outpath

    ret = run_analysis(
        cmdline_args, outpath, job_descriptions,
        self.analysis_parameters, self.analysis_corrections,
    )

    from plotting import make_pdf_plot
    from pars import cross_sections

    res = {"data": pickle.load(open(outpath + "/data_2016_0.pkl", "rb"))}
    analysis = "baseline"
    var = "hist__dimuon__leading_muon_pt"
    era = "2016"
    int_lumi = res["data"]["int_lumi"]
    mc_samples = ["dy"]
    process_groups = [("dy", ["dy"])]

    genweights = {}
    weight_xs = {}
    for mc_samp in mc_samples:
        res[mc_samp] = pickle.load(
            open(outpath + "/{0}_2016_0.pkl".format(mc_samp), "rb"))
        genweights[mc_samp] = res[mc_samp]["genEventSumw"]
        weight_xs[mc_samp] = cross_sections[mc_samp] * int_lumi / genweights[mc_samp]

    self.assertAlmostEqual(genweights["dy"], 6073.25342144)
    self.assertAlmostEqual(int_lumi, 5.633297364)

    histos = {}
    for sample in mc_samples + ["data"]:
        histos[sample] = res[sample][analysis][var]
    hdata = res["data"][analysis][var]["nominal"]
    outdir = "{0}/{1}/plots/{2}".format(outpath, analysis, era)
    plot_args = (histos, hdata, mc_samples, analysis, var, "nominal",
                 weight_xs, int_lumi, outdir, era, process_groups, {})
    make_pdf_plot(plot_args)
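# The two tests above consume the per-chunk pickle that run_analysis writes to
# {out}/{dataset}_{era}_{chunk}.pkl. The helper below is a small sketch of how
# to inspect such a file outside the test suite; the key names are taken from
# the assertions above, while the function name and default path are
# illustrative assumptions.
def sketch_inspect_partial_result(path="test_out/dy_2016_0.pkl"):
    import pickle
    with open(path, "rb") as fi:
        res = pickle.load(fi)
    # MC chunks carry the generator weight sum, data chunks the integrated luminosity
    print(res.get("num_events"), res.get("genEventSumw"), res.get("int_lumi"))
    # Per-analysis results: cutflow counters and histograms keyed by systematic variation
    baseline = res["baseline"]
    print(baseline["selected_events_dimuon"])
    return baseline["hist__dimuon__leading_muon_pt"]["nominal"]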
def main(args, datasets):
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    # Use the environment variable for cupy/cuda choice
    args.use_cuda = USE_CUPY

    analysis_corrections = None
    if "analyze" in args.action:
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)

    # Optionally disable pinned memory (will be somewhat slower)
    if args.use_cuda:
        import cupy
        if not args.pinned:
            cupy.cuda.set_allocator(None)
            cupy.cuda.set_pinned_memory_allocator(None)

    # Use sync-only datasets
    if args.do_sync:
        datasets = datasets_sync

    # Filter datasets by name and era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds[0] in args.datasets:
            if args.eras is None or ds[1] in args.eras:
                datasets_to_process += [ds]
                print("Will consider dataset", ds)
    if len(datasets_to_process) == 0:
        raise Exception("No datasets considered, please check the --datasets and --eras options")
    datasets = datasets_to_process

    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB
    NUMPY_LIB = hmumu_utils.NUMPY_LIB

    # All analysis definitions (cut values etc) should go here
    analysis_parameters = {
        "baseline": {
            "nPV": 0,
            "NdfPV": 4,
            "zPV": 24,

            # Will be applied with OR
            "hlt_bits": {
                "2016": ["HLT_IsoMu24", "HLT_IsoTkMu24"],
                "2017": ["HLT_IsoMu27"],
                "2018": ["HLT_IsoMu24"],
            },

            "muon_pt": 20,
            "muon_pt_leading": {"2016": 26.0, "2017": 29.0, "2018": 26.0},
            "muon_eta": 2.4,
            "muon_iso": 0.25,
            "muon_id": {"2016": "medium", "2017": "medium", "2018": "medium"},
            "muon_trigger_match_dr": 0.1,
            "muon_iso_trigger_matched": 0.15,
            "muon_id_trigger_matched": {"2016": "tight", "2017": "tight", "2018": "tight"},

            "do_rochester_corrections": True,
            "do_lepton_sf": True,

            "do_jec": True,
            "jec_tag": {"2016": "Summer16_07Aug2017_V11", "2017": "Fall17_17Nov2017_V32", "2018": "Autumn18_V16"},
            "jet_mu_dr": 0.4,
            "jet_pt_leading": {"2016": 35.0, "2017": 35.0, "2018": 35.0},
            "jet_pt_subleading": {"2016": 25.0, "2017": 25.0, "2018": 25.0},
            "jet_eta": 4.7,
            "jet_id": "tight",
            "jet_puid": "loose",
            "jet_veto_eta": [2.65, 3.139],
            "jet_veto_raw_pt": 50.0,
            "jet_btag": {"2016": 0.6321, "2017": 0.4941, "2018": 0.4184},
            "do_factorized_jec": args.do_factorized_jec,

            "cat5_dijet_inv_mass": 400.0,
            "cat5_abs_jj_deta_cut": 2.5,

            "masswindow_z_peak": [76, 106],
            "masswindow_h_sideband": [110, 150],
            "masswindow_h_peak": [115, 135],
            "inv_mass_bins": 41,

            "extra_electrons_pt": 20,
            "extra_electrons_eta": 2.5,
            "extra_electrons_iso": 0.4,  # Check if we want to apply this
            "extra_electrons_id": "mvaFall17V1Iso_WP90",

            "save_dnn_vars": True,
            "dnn_vars_path": "{0}/dnn_vars".format(args.out),

            # If true, apply mjj > cut, otherwise inverse
            "vbf_filter_mjj_cut": 350,
            "vbf_filter": {
                "dy_m105_160_mg": True,
                "dy_m105_160_amc": True,
                "dy_m105_160_vbf_mg": False,
                "dy_m105_160_vbf_amc": False,
            },

            # Irene's DNN input variable order for keras
            "dnn_varlist_order": [
                'softJet5', 'dRmm', 'dEtamm', 'M_jj', 'pt_jj', 'eta_jj', 'phi_jj',
                'M_mmjj', 'eta_mmjj', 'phi_mmjj', 'dEta_jj', 'Zep',
                'dRmin_mj', 'dRmax_mj', 'dRmin_mmj', 'dRmax_mmj', 'dPhimm',
                'leadingJet_pt', 'subleadingJet_pt', 'leadingJet_eta', 'subleadingJet_eta',
                'leadingJet_qgl', 'subleadingJet_qgl', 'cthetaCS',
                'Higgs_pt', 'Higgs_eta', 'Higgs_mass'],
            "dnn_input_histogram_bins": {
                "softJet5": (0, 10, 10),
                "dRmm": (0, 5, 41),
                "dEtamm": (-2, 2, 41),
                "dPhimm": (-2, 2, 41),
                "M_jj": (0, 2000, 41),
                "pt_jj": (0, 400, 41),
                "eta_jj": (-5, 5, 41),
                "phi_jj": (-5, 5, 41),
                "M_mmjj": (0, 2000, 41),
                "eta_mmjj": (-3, 3, 41),
                "phi_mmjj": (-3, 3, 41),
                "dEta_jj": (-3, 3, 41),
                "Zep": (-2, 2, 41),
                "dRmin_mj": (0, 5, 41),
                "dRmax_mj": (0, 5, 41),
                "dRmin_mmj": (0, 5, 41),
                "dRmax_mmj": (0, 5, 41),
                "leadingJet_pt": (0, 200, 41),
                "subleadingJet_pt": (0, 200, 41),
                "leadingJet_eta": (-5, 5, 41),
                "subleadingJet_eta": (-5, 5, 41),
                "leadingJet_qgl": (0, 1, 41),
                "subleadingJet_qgl": (0, 1, 41),
                "cthetaCS": (-1, 1, 41),
                "Higgs_pt": (0, 200, 41),
                "Higgs_eta": (-3, 3, 41),
                "Higgs_mass": (110, 150, 41),
                "dnn_pred": (0, 1, 1001),
                "dnn_pred2": (0, 1, 11),
                "bdt_ucsd": (-1, 1, 41),
                "bdt2j_ucsd": (-1, 1, 41),
                "bdt01j_ucsd": (-1, 1, 41),
                "MET_pt": (0, 200, 41),
                "hmmthetacs": (-1, 1, 41),
                "hmmphics": (-4, 4, 41),
            },

            "categorization_trees": {}
        },
    }

    histo_bins = {
        "muon_pt": np.linspace(0, 200, 101, dtype=np.float32),
        "npvs": np.linspace(0, 100, 101, dtype=np.float32),
        "dijet_inv_mass": np.linspace(0, 2000, 41, dtype=np.float32),
        "inv_mass": np.linspace(70, 150, 41, dtype=np.float32),
        "numjet": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_pt": np.linspace(0, 300, 101, dtype=np.float32),
        "jet_eta": np.linspace(-4.7, 4.7, 41, dtype=np.float32),
        "pt_balance": np.linspace(0, 5, 41, dtype=np.float32),
        "numjets": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_qgl": np.linspace(0, 1, 41, dtype=np.float32),
        "higgs_inv_mass_uncertainty": np.linspace(0, 10, 101, dtype=np.float32),
        "higgs_rel_inv_mass_uncertainty": np.linspace(0, 0.05, 101, dtype=np.float32),
    }
    for hname, bins in analysis_parameters["baseline"]["dnn_input_histogram_bins"].items():
        histo_bins[hname] = np.linspace(bins[0], bins[1], bins[2], dtype=np.float32)

    for masswindow in ["z_peak", "h_peak", "h_sideband"]:
        mw = analysis_parameters["baseline"]["masswindow_" + masswindow]
        histo_bins["inv_mass_{0}".format(masswindow)] = np.linspace(mw[0], mw[1], 41, dtype=np.float32)

    histo_bins["dnn_pred2"] = {
        "h_peak": np.array([0., 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 1.0], dtype=np.float32),
        "z_peak": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], dtype=np.float32),
        "h_sideband": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0], dtype=np.float32),
    }

    analysis_parameters["baseline"]["histo_bins"] = histo_bins

    #analysis_parameters["oldjec"] = copy.deepcopy(analysis_parameters["baseline"])
    #analysis_parameters["oldjec"]["jec_tag"]["2018"] = "Autumn18_V8"

    # Run baseline analysis
    outpath = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath)
    except FileExistsError:
        pass

    with open('{0}/parameters.pkl'.format(outpath), 'wb') as handle:
        pickle.dump(analysis_parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Recreate dump of all filenames
    cache_filename = args.cache_location + "/datasets.json"
    if ("cache" in args.action) and (args.jobfiles is None):
        print("--action cache and no jobfiles specified, creating datasets.json dump of all filenames")
        if not os.path.isdir(args.cache_location):
            os.makedirs(args.cache_location)

        filenames_cache = {}
        for dataset in datasets:
            dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
            filenames_all = glob.glob(args.datapath + dataset_globpattern, recursive=True)
            filenames_all = [fn for fn in filenames_all if "Friend" not in fn]
            filenames_cache[dataset_name + "_" + dataset_era] = [
                fn.replace(args.datapath, "") for fn in filenames_all]

            if len(filenames_all) == 0:
                raise Exception(
                    "Dataset {0} matched 0 files from glob pattern {1}, verify that the data files are located in {2}".format(
                        dataset_name, dataset_globpattern, args.datapath))

        # Save all dataset filenames to a json file
        print("Creating a json dump of all the dataset filenames based on data found in {0}".format(args.datapath))
        if os.path.isfile(cache_filename):
            print("Cache file {0} already exists, we will not overwrite it to be safe.".format(cache_filename), file=sys.stderr)
            print("Delete it or change --cache-location and try again.", file=sys.stderr)
            sys.exit(1)
        with open(cache_filename, "w") as fi:
            fi.write(json.dumps(filenames_cache, indent=2))

    if ("cache" in args.action or "analyze" in args.action) and (args.jobfiles is None):
        # Create a list of job files for processing
        jobfile_data = []
        print("Loading list of filenames from {0}".format(cache_filename))
        if not os.path.isfile(cache_filename):
            raise Exception(
                "Cached dataset list of filenames not found in {0}, please run this code with --action cache".format(
                    cache_filename))
        filenames_cache = json.load(open(cache_filename, "r"))

        for dataset in datasets:
            dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
            try:
                filenames_all = filenames_cache[dataset_name + "_" + dataset_era]
            except KeyError as e:
                print("Could not load {0} from {1}, please make sure this dataset has been added to cache".format(
                    dataset_name + "_" + dataset_era, cache_filename), file=sys.stderr)
                raise e

            filenames_all_full = [args.datapath + "/" + fn for fn in filenames_all]
            chunksize = args.chunksize * chunksize_multiplier.get(dataset_name, 1)
            print("Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles".format(
                dataset_name, dataset_era, len(filenames_all_full), chunksize))
            jobfile_dataset = create_dataset_jobfiles(
                dataset_name, dataset_era, filenames_all_full, is_mc, chunksize, args.out)
            jobfile_data += jobfile_dataset
            print("Dataset {0}_{1} consists of {2} chunks".format(
                dataset_name, dataset_era, len(jobfile_dataset)))

        assert len(jobfile_data) > 0
        assert len(jobfile_data[0]["filenames"]) > 0

    # For each dataset, find out which chunks we want to process
    if "cache" in args.action or "analyze" in args.action:
        jobfile_data = []
        if args.jobfiles_load is not None:
            args.jobfiles = [l.strip() for l in open(args.jobfiles_load).readlines()]
        if args.jobfiles is None:
            print("You did not specify to process specific dataset chunks, assuming you want to process all chunks")
            print("If this is not true, please specify e.g. --jobfiles data_2018_0.json data_2018_1.json ...")
            args.jobfiles = []
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                jobfiles_dataset = glob.glob(
                    args.out + "/jobfiles/{0}_{1}_*.json".format(dataset_name, dataset_era))
                assert len(jobfiles_dataset) > 0
                if args.maxchunks > 0:
                    jobfiles_dataset = jobfiles_dataset[:args.maxchunks]
                args.jobfiles += jobfiles_dataset

        # Now load the jobfiles
        assert len(args.jobfiles) > 0
        print("You specified --jobfiles {0}, processing only these dataset chunks".format(" ".join(args.jobfiles)))
        jobfile_data = []
        for f in args.jobfiles:
            jobfile_data += [json.load(open(f))]
        chunkstr = " ".join(["{0}_{1}_{2}".format(
            ch["dataset_name"], ch["dataset_era"], ch["dataset_num_chunk"])
            for ch in jobfile_data])
        print("Will process {0} dataset chunks: {1}".format(len(jobfile_data), chunkstr))
        assert len(jobfile_data) > 0

    # Start the profiler only in the actual data processing
    if do_prof:
        import yappi
        filename = 'analysis.prof'
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    if "cache" in args.action:
        print("Running the 'cache' step of the analysis, ROOT files will be opened and branches will be uncompressed")
        print("Will retrieve dataset filenames based on existing ROOT files on filesystem in datapath={0}".format(args.datapath))
        try:
            os.makedirs(args.cache_location)
        except Exception:
            pass
        run_cache(args, outpath, jobfile_data, analysis_parameters)

    if "analyze" in args.action:
        run_analysis(args, outpath, jobfile_data, analysis_parameters, analysis_corrections)

    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                executor.submit(merge_partial_results, dataset_name, dataset_era, outpath)
        print("done merging")

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save(filename, type='callgrind')

    # Print memory usage for debugging
    import resource
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory / 1024))
def main(args, datasets):
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    # Use the environment variable for cupy/cuda choice
    args.use_cuda = USE_CUPY

    # Optionally disable pinned memory (will be somewhat slower)
    if args.use_cuda:
        import cupy
        if not args.pinned:
            cupy.cuda.set_allocator(None)
            cupy.cuda.set_pinned_memory_allocator(None)

    # Use sync-only datasets
    if args.do_sync:
        datasets = datasets_sync

    # Filter datasets by name and era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds[0] in args.datasets:
            if args.eras is None or ds[1] in args.eras:
                datasets_to_process += [ds]
                print("Will consider dataset", ds)
    if len(datasets_to_process) == 0:
        raise Exception(
            "No datasets considered, please check the --datasets and --eras options")
    datasets = datasets_to_process

    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB
    NUMPY_LIB = hmumu_utils.NUMPY_LIB

    outpath_partial = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath_partial)
    except FileExistsError:
        print("Output path {0} already exists, not recreating".format(outpath_partial))

    # Save the parameters as a pkl file
    from pars import analysis_parameters
    for analysis_name in analysis_parameters.keys():
        analysis_parameters[analysis_name]["do_factorized_jec"] = args.do_factorized_jec
        analysis_parameters[analysis_name]["dnn_vars_path"] = "{0}/dnn_vars".format(args.out)
    with open('{0}/parameters.pkl'.format(outpath_partial), 'wb') as handle:
        pickle.dump(analysis_parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Recreate dump of all filenames
    cache_filename = args.cache_location + "/datasets.json"
    if ("cache" in args.action) and (args.jobfiles is None):
        check_and_recreate_filename_cache(cache_filename, args.cache_location, args.datapath)

    # Create the jobfiles
    if ("cache" in args.action or "analyze" in args.action) and (args.jobfiles is None):
        create_all_jobfiles(datasets, cache_filename, args.datapath, args.chunksize, args.out)

    # For each dataset, find out which chunks we want to process
    if "cache" in args.action or "analyze" in args.action:
        jobfile_data = load_jobfiles(
            datasets, args.jobfiles_load, args.jobfiles, args.maxchunks, args.out)

    # Start the profiler only in the actual data processing
    if do_prof:
        import yappi
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    if "cache" in args.action:
        print("Running the 'cache' step of the analysis, ROOT files will be opened and branches will be uncompressed")
        run_cache(args, outpath_partial, jobfile_data, analysis_parameters)

    # Run the physics analysis on all specified jobfiles
    if "analyze" in args.action:
        print("Running the 'analyze' step of the analysis, processing the events into histograms with all systematics")
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)
        run_analysis(args, outpath_partial, jobfile_data, analysis_parameters, analysis_corrections)

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save("analysis.prof", type='callgrind')

    # Merge the partial results (pieces of each dataset)
    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                executor.submit(merge_partial_results, dataset_name, dataset_era,
                                args.out, outpath_partial)
        print("done merging")

    # Print memory usage
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory / 1024))