@classmethod
def setUpClass(self):
    # setUpClass must be a classmethod so unittest runs it once per test class
    self.NUMPY_LIB, self.ha = choose_backend(use_cuda=USE_CUPY)
    import hmumu_utils
    hmumu_utils.NUMPY_LIB = self.NUMPY_LIB
    hmumu_utils.ha = self.ha

    # disable everything that requires ROOT, which is not easily available in the Travis test environment
    from pars import analysis_parameters
    self.analysis_parameters = analysis_parameters
    self.analysis_parameters["baseline"]["do_rochester_corrections"] = False
    self.analysis_parameters["baseline"]["do_lepton_sf"] = False
    self.analysis_parameters["baseline"]["save_dnn_vars"] = False
    self.analysis_parameters["baseline"]["do_bdt_ucsd"] = False
    self.analysis_parameters["baseline"]["do_bdt_pisa"] = False
    self.analysis_parameters["baseline"]["do_factorized_jec"] = False
    self.analysis_parameters["baseline"]["do_jec"] = True
    self.analysis_parameters["baseline"]["do_jer"] = {"2016": True}

    from argparse import Namespace
    self.cmdline_args = Namespace(use_cuda=USE_CUPY,
                                  datapath=".",
                                  do_fsr=False,
                                  nthreads=1,
                                  async_data=False,
                                  do_sync=False,
                                  out="test_out")

    from analysis_hmumu import AnalysisCorrections
    self.analysis_corrections = AnalysisCorrections(self.cmdline_args, True)
@classmethod
def setUpClass(self):
    self.NUMPY_LIB, self.ha = choose_backend(use_cuda=USE_CUPY)
    import hmumu_utils
    hmumu_utils.NUMPY_LIB = self.NUMPY_LIB
    hmumu_utils.ha = self.ha

    download_if_not_exists(
        "data/myNanoProdMc2016_NANO.root",
        "https://jpata.web.cern.ch/jpata/hmm/test_files/myNanoProdMc2016_NANO.root"
    )

    # Load a simple sync dataset
    self.datastructures = create_datastructure("vbf_sync", True, "2016", do_fsr=True)
    self.dataset = Dataset("vbf_sync",
                           ["data/myNanoProdMc2016_NANO.root"],
                           self.datastructures,
                           datapath="",
                           treename="Events",
                           is_mc=True)
    self.dataset.num_chunk = 0
    self.dataset.era = "2016"
    self.dataset.load_root()
    self.dataset.numpy_lib = self.NUMPY_LIB
    self.dataset.move_to_device(self.NUMPY_LIB)

    # disable everything that requires ROOT, which is not easily available in the Travis test environment
    from pars import analysis_parameters
    self.analysis_parameters = analysis_parameters
    self.analysis_parameters["baseline"]["do_rochester_corrections"] = False
    self.analysis_parameters["baseline"]["do_lepton_sf"] = False
    self.analysis_parameters["baseline"]["save_dnn_vars"] = False
    self.analysis_parameters["baseline"]["do_bdt_ucsd"] = False
    self.analysis_parameters["baseline"]["do_bdt_pisa"] = False
    self.analysis_parameters["baseline"]["do_factorized_jec"] = False
    self.analysis_parameters["baseline"]["do_jec"] = {"2016": False}
    self.analysis_parameters["baseline"]["do_jer"] = {"2016": True}

    from argparse import Namespace
    self.cmdline_args = Namespace(use_cuda=USE_CUPY,
                                  datapath=".",
                                  do_fsr=False,
                                  nthreads=1,
                                  async_data=False,
                                  do_sync=False,
                                  out="test_out")

    from analysis_hmumu import AnalysisCorrections
    self.analysis_corrections = AnalysisCorrections(self.cmdline_args, True)
class TestHistogram(unittest.TestCase):
    NUMPY_LIB, ha = choose_backend(use_cuda=USE_CUDA)

    def test_histogram(self):
        np = TestHistogram.NUMPY_LIB
        data = np.array([2, 3, 4, 5, 6, 7], dtype=np.float32)
        data[data < 2] = 0
        weights = np.ones_like(data, dtype=np.float32)

        w, w2, e = self.ha.histogram_from_vector(
            data, weights, np.array([0, 1, 2, 3, 4, 5], dtype=np.float32))
        npw, npe = np.histogram(data, np.array([0, 1, 2, 3, 4, 5]))

        hr = from_numpy((w, e))
        f = uproot.recreate("test.root")
        f["hist"] = hr

        data = np.random.normal(size=10000)
        data = np.array(data, dtype=np.float32)
        weights = np.ones_like(data, dtype=np.float32)
        w, w2, e = self.ha.histogram_from_vector(
            data, weights, np.linspace(-1, 1, 100, dtype=np.float32))
        hr = from_numpy((w, e))
        f["hist2"] = hr
        f.close()

    def test_histogram_several(self):
        np = TestHistogram.NUMPY_LIB
        data = np.array([2, 3, 4, 5, 6, 7], dtype=np.float32)
        mask = data >= 2
        data[self.NUMPY_LIB.invert(mask)] = 0
        weights = np.ones_like(data, dtype=np.float32)
        bins = np.array([0, 1, 2, 3, 4, 5], dtype=np.float32)

        w, w2, e = self.ha.histogram_from_vector(data, weights, bins)
        histograms = self.ha.histogram_from_vector_several(
            [(data, bins), (data, bins)], weights, mask)

        assert numpy.all(w == histograms[0][0])
        assert numpy.all(w == histograms[1][0])
        assert numpy.all(w2 == histograms[0][1])
        assert numpy.all(w2 == histograms[1][1])
@classmethod
def setUpClass(self):
    self.NUMPY_LIB, self.ha = choose_backend(use_cuda=USE_CUPY)
    import hmumu_utils
    hmumu_utils.NUMPY_LIB = self.NUMPY_LIB
    hmumu_utils.ha = self.ha

    # configure the corrections applied in this test
    from pars import analysis_parameters
    self.analysis_parameters = analysis_parameters
    self.analysis_parameters["baseline"]["do_rochester_corrections"] = True
    self.analysis_parameters["baseline"]["do_lepton_sf"] = True
    self.analysis_parameters["baseline"]["save_dnn_vars"] = True
    self.analysis_parameters["baseline"]["do_bdt_ucsd"] = False
    self.analysis_parameters["baseline"]["do_bdt_pisa"] = False
    self.analysis_parameters["baseline"]["do_factorized_jec"] = False
    self.analysis_parameters["baseline"]["do_jec"] = True
    self.analysis_parameters["baseline"]["do_jer"] = {"2016": True}

    from argparse import Namespace
    self.cmdline_args = Namespace(use_cuda=USE_CUPY,
                                  datapath=".",
                                  nthreads=1,
                                  do_fsr=False,
                                  async_data=False,
                                  do_sync=False,
                                  out="test_out")

    from analysis_hmumu import AnalysisCorrections
    self.analysis_corrections = AnalysisCorrections(self.cmdline_args, True)

    download_if_not_exists(
        "data/myNanoProdMc2016_NANO.root",
        "https://jpata.web.cern.ch/jpata/hmm/test_files/21-02-2020-private-nanoaod/myNanoProdMc2016_NANO.root"
    )
    download_if_not_exists(
        "data/nano_2016_data.root",
        "https://jpata.web.cern.ch/jpata/hmm/test_files/21-02-2020-private-nanoaod/nano_2016_data.root"
    )
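# The tests above rely on a download_if_not_exists helper to fetch the test NanoAOD files.
# Its real implementation lives elsewhere in the repository; the sketch below only
# illustrates the assumed behaviour (download the file once, skip it if it is already on
# disk) using the `requests` package. Treat it as a hypothetical stand-in, not the
# project's actual code.
import os
import requests

def download_if_not_exists_sketch(local_path, url):
    # skip the (large) download if the file is already present from a previous test run
    if os.path.isfile(local_path):
        return
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        with open(local_path, "wb") as fh:
            for chunk in resp.iter_content(chunk_size=1 << 20):
                fh.write(chunk)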
parser.add_argument('--path-to-model', action='store',
                    help='path to DNN model',
                    type=str, default=None, required=False)
parser.add_argument('--year', action='store',
                    choices=['2016', '2017', '2018'],
                    help='Year of data/MC samples', default='2017')
parser.add_argument('filenames', nargs=argparse.REMAINDER)
args = parser.parse_args()

# set CPU or GPU backend
NUMPY_LIB, ha = choose_backend(args.use_cuda)
lib_analysis.NUMPY_LIB, lib_analysis.ha = NUMPY_LIB, ha
NanoAODDataset.numpy_lib = NUMPY_LIB

if args.use_cuda:
    os.environ["HEPACCELERATE_CUDA"] = "1"
else:
    os.environ["HEPACCELERATE_CUDA"] = "0"

from coffea.util import USE_CUPY
from coffea.lumi_tools import LumiMask, LumiData
from coffea.lookup_tools import extractor
from coffea.btag_tools import BTagScaleFactor

# load definitions
from definitions_analysis import parameters, eraDependentParameters, samples_info
import requests
import os
import numpy as np
import json
import sys
import time
import uproot
import numba

import hepaccelerate
import hepaccelerate.kernels as kernels
from hepaccelerate.utils import Results, Dataset, Histogram, choose_backend
from tests.kernel_test import load_dataset

USE_CUDA = int(os.environ.get("HEPACCELERATE_CUDA", 0)) == 1
nplib, backend = choose_backend(use_cuda=USE_CUDA)


def time_kernel(dataset, test_kernel):
    # run once first to ensure the kernel is compiled before timing it
    test_kernel(dataset)

    n = len(dataset)

    t0 = time.time()
    for i in range(5):
        test_kernel(dataset)
    t1 = time.time()

    # average wall time over the repeated calls, reported as events per second
    dt = (t1 - t0) / 5.0
    speed = float(n) / dt
    return speed
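# Hypothetical usage sketch for time_kernel, not part of the original benchmark file:
# the kernel below sums the per-event Muon Px using the backend chosen above, following
# the sum_in_offsets call pattern used in the kernel tests. The branch names assume the
# HZZ test file layout; adjust them to whatever load_dataset actually provides.
def kernel_sum_muon_px(dataset):
    muons = dataset.structs["Muon"][0]
    mask_rows = nplib.ones(muons.numevents(), dtype=nplib.bool_)
    mask_content = nplib.ones(muons.numobjects(), dtype=nplib.bool_)
    return backend.sum_in_offsets(muons.offsets, muons.Px, mask_rows, mask_content)

# dataset = load_dataset(nplib)
# print("events/s: {0:.2E}".format(time_kernel(dataset, kernel_sum_muon_px)))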
    hist_muons_pt = Histogram(*kernels.histogram_from_vector(
        backend, leading_muon_pt[mask_events_dimuon],
        weights[mask_events_dimuon], bins))

    # save it to the output
    ret["hist_leading_muon_pt"] = hist_muons_pt
    return ret


if __name__ == "__main__":
    # choose whether or not to use the GPU backend
    use_cuda = int(os.environ.get("HEPACCELERATE_CUDA", 0)) == 1
    if use_cuda:
        import setGPU

    nplib, backend = choose_backend(use_cuda=use_cuda)

    # Load this input file
    filename = "data/HZZ.root"

    # Predefine which branches to read from the TTree and how they are grouped into objects.
    # This will be verified against the actual ROOT TTree when it is loaded.
    datastructures = {
        "Muon": [
            ("Muon_Px", "float32"), ("Muon_Py", "float32"),
            ("Muon_Pz", "float32"), ("Muon_E", "float32"),
            ("Muon_Charge", "int32"), ("Muon_Iso", "float32")
        ],
        "Jet": [
            ("Jet_Px", "float32"), ("Jet_Py", "float32"),
            ("Jet_Pz", "float32"), ("Jet_E", "float32"),
            ("Jet_btag", "float32"), ("Jet_ID", "bool")
        ],
        "EventVariables": [
            ("NPrimaryVertices", "int32"),
            ("triggerIsoMu24", "bool"),
@classmethod
def setUpClass(self):
    self.NUMPY_LIB, self.ha = choose_backend(use_cuda=USE_CUDA)
    self.use_cuda = USE_CUDA
    self.dataset = load_dataset(self.NUMPY_LIB)
def main(args):
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    # use the environment variable for the cupy/cuda choice
    args.use_cuda = USE_CUPY

    datasets = yaml.load(open(args.datasets_yaml), Loader=yaml.FullLoader)["datasets"]

    # Filter datasets by era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds["name"] in args.datasets:
            if args.eras is None or ds["era"] in args.eras:
                datasets_to_process += [ds]
    if len(datasets_to_process) == 0:
        raise Exception(
            "No datasets considered, please check the --datasets and --eras options"
        )
    datasets = datasets_to_process

    # Choose either the CPU or GPU (CUDA) backend
    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB

    outpath_partial = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath_partial)
    except FileExistsError:
        print("Output path {0} already exists, not recreating".format(outpath_partial))

    # save the parameters as a pkl file
    from pars import analysis_parameters
    for analysis_name in analysis_parameters.keys():
        analysis_parameters[analysis_name]["do_factorized_jec"] = args.do_factorized_jec
        analysis_parameters[analysis_name]["dnn_vars_path"] = "{0}/dnn_vars".format(args.out)
    with open('{0}/parameters.pkl'.format(outpath_partial), 'wb') as handle:
        pickle.dump(analysis_parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Recreate dump of all filenames
    cache_filename = "{0}/datasets.json".format(args.out)
    use_skim = False
    if args.cachepath is None:
        print("--cachepath not specified, will process unskimmed NanoAOD, which is somewhat slower!")
        print("Please see the README.md on how to skim the NanoAOD")
        datapath = args.datapath
    else:
        print("Processing skimmed NanoAOD")
        datapath = args.cachepath
        use_skim = True
    check_and_recreate_filename_cache(cache_filename, datapath, datasets, use_skim)

    # Create the jobfiles
    if args.jobfiles is None:
        create_all_jobfiles(datasets, cache_filename, datapath, args.chunksize, args.out)

    # For each dataset, find out which chunks we want to process
    if "analyze" in args.action:
        jobfile_data = load_jobfiles(datasets, args.jobfiles_load, args.jobfiles,
                                     args.maxchunks, args.out)

    # If we want to check what part of the code is slow, start the profiler
    # only in the actual data processing
    if do_prof:
        import yappi
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    # Run the physics analysis on all specified jobfiles
    if "analyze" in args.action:
        print("Running the 'analyze' step of the analysis, processing the events into histograms with all systematics")
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)
        run_analysis(args, outpath_partial, jobfile_data, analysis_parameters,
                     analysis_corrections)

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save("analysis.prof", type='callgrind')

    # Merge the partial results (pieces of each dataset)
    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            for dataset in datasets:
                dataset_name = dataset["name"]
                dataset_era = dataset["era"]
                executor.submit(merge_partial_results, dataset_name, dataset_era,
                                args.out, outpath_partial)
        print("done merging")

    # print memory usage for debugging
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory / 1024))
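# Hypothetical driver sketch, not part of the original script: the real entry point builds
# `args` with argparse. The Namespace fields below are the attributes that main(args) reads;
# all of the values are illustrative placeholders.
if __name__ == "__main__":
    from argparse import Namespace
    example_args = Namespace(
        do_profile=False, disable_tensorflow=True,
        datasets_yaml="data/datasets.yml", datasets=None, eras=["2018"],
        out="out", datapath="/store", cachepath=None,
        jobfiles=None, jobfiles_load=None,
        chunksize=1, maxchunks=1, nthreads=1,
        action=["analyze", "merge"], do_factorized_jec=False)
    main(example_args)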
class TestDataset(unittest.TestCase):
    NUMPY_LIB, ha = choose_backend(use_cuda=USE_CUDA)

    @staticmethod
    def load_dataset(num_iter=1):
        datastructures = {
            "Muon": [
                ("Muon_Px", "float32"),
                ("Muon_Py", "float32"),
                ("Muon_Pz", "float32"),
                ("Muon_E", "float32"),
                ("Muon_Charge", "int32"),
                ("Muon_Iso", "float32"),
            ],
            "Jet": [
                ("Jet_Px", "float32"),
                ("Jet_Py", "float32"),
                ("Jet_Pz", "float32"),
                ("Jet_E", "float32"),
                ("Jet_btag", "float32"),
                ("Jet_ID", "bool"),
            ],
            "EventVariables": [
                ("NPrimaryVertices", "int32"),
                ("triggerIsoMu24", "bool"),
                ("EventWeight", "float32"),
            ],
        }
        dataset = Dataset(
            "HZZ",
            num_iter * ["data/HZZ.root"],
            datastructures,
            treename="events",
            datapath="",
        )
        assert dataset.filenames[0] == "data/HZZ.root"
        assert len(dataset.filenames) == num_iter
        assert len(dataset.structs["Jet"]) == 0
        assert len(dataset.eventvars) == 0
        return dataset

    def setUp(self):
        self.dataset = self.load_dataset()

    @staticmethod
    def map_func(dataset, ifile):
        mu = dataset.structs["Muon"][ifile]
        mu_pt = np.sqrt(mu.Px**2 + mu.Py**2)
        mu_pt_pass = mu_pt > 20
        mask_rows = np.ones(mu.numevents(), dtype=np.bool)
        mask_content = np.ones(mu.numobjects(), dtype=np.bool)
        ret = TestDataset.ha.sum_in_offsets(mu.offsets,
                                            mu_pt_pass,
                                            mask_rows,
                                            mask_content,
                                            dtype=np.int8)
        return ret

    def test_dataset_map(self):
        dataset = self.load_dataset()
        dataset.load_root()
        rets = dataset.map(self.map_func)
        assert len(rets) == 1
        assert len(rets[0]) == dataset.structs["Muon"][0].numevents()
        assert np.sum(rets[0]) > 0
        return rets

    def test_dataset_compact(self):
        dataset = self.dataset
        dataset.load_root()
        memsize1 = dataset.memsize()
        rets = dataset.map(self.map_func)

        # compacting uses JaggedArray functionality and can only be done on the numpy/CPU backend
        dataset.move_to_device(np)
        rets = [TestDataset.NUMPY_LIB.asnumpy(r) for r in rets]
        dataset.compact(rets)
        dataset.move_to_device(TestDataset.NUMPY_LIB)

        memsize2 = dataset.memsize()
        assert memsize1 > memsize2
        print("compacted memory size ratio:", memsize2 / memsize1)

    @staticmethod
    def precompute_results(filename):
        fi = uproot.open(filename)
        arr = fi.get("events").array("EventWeight")
        return {"EventWeight": arr.sum()}

    def test_dataset_merge_inplace(self):
        num_iter = 10
        ds_multi = self.load_dataset(num_iter=num_iter)
        ds_multi.func_filename_precompute = self.precompute_results
        ds_multi.load_root()
        assert len(ds_multi.structs["Jet"]) == num_iter
        njet = ds_multi.num_objects_loaded("Jet")

        # compute a per-event jet energy sum taking into account the offsets
        jet_sume = TestDataset.NUMPY_LIB.hstack([
            TestDataset.ha.sum_in_offsets(
                ds_multi.structs["Jet"][i].offsets,
                ds_multi.structs["Jet"][i]["E"],
                TestDataset.NUMPY_LIB.ones(
                    ds_multi.structs["Jet"][i].numevents(),
                    dtype=TestDataset.NUMPY_LIB.bool,
                ),
                TestDataset.NUMPY_LIB.ones(
                    ds_multi.structs["Jet"][i].numobjects(),
                    dtype=TestDataset.NUMPY_LIB.bool,
                ),
            ) for i in range(num_iter)
        ])
        numevents = ds_multi.numevents()

        ds_multi.merge_inplace()
        assert len(ds_multi.structs["Jet"]) == 1
        assert ds_multi.num_objects_loaded("Jet") == njet

        jet_sume_merged = TestDataset.ha.sum_in_offsets(
            ds_multi.structs["Jet"][0].offsets,
            ds_multi.structs["Jet"][0]["E"],
            TestDataset.NUMPY_LIB.ones(ds_multi.structs["Jet"][0].numevents(),
                                       dtype=TestDataset.NUMPY_LIB.bool),
            TestDataset.NUMPY_LIB.ones(
                ds_multi.structs["Jet"][0].numobjects(),
                dtype=TestDataset.NUMPY_LIB.bool,
            ),
        )

        assert TestDataset.NUMPY_LIB.all(jet_sume_merged == jet_sume)
        assert ds_multi.numevents() == numevents
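# Plain-numpy illustration (not part of the test suite) of what sum_in_offsets computes in
# the unmasked case: content is the flat jagged-array payload and offsets mark the per-event
# boundaries, so the reduction yields one value per event, analogous to the per-event jet
# energy sums in test_dataset_merge_inplace above.
import numpy

offsets = numpy.array([0, 2, 5])                       # event 0 has 2 objects, event 1 has 3
content = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0])
per_event_sum = numpy.add.reduceat(content, offsets[:-1])
print(per_event_sum)                                   # [ 3. 12.]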
def main(args, datasets):
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    # use the environment variable for the cupy/cuda choice
    args.use_cuda = USE_CUPY

    analysis_corrections = None
    if "analyze" in args.action:
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)

    # Optionally disable pinned memory (will be somewhat slower)
    if args.use_cuda:
        import cupy
        if not args.pinned:
            cupy.cuda.set_allocator(None)
            cupy.cuda.set_pinned_memory_allocator(None)

    # Use sync-only datasets
    if args.do_sync:
        datasets = datasets_sync

    # Filter datasets by era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds[0] in args.datasets:
            if args.eras is None or ds[1] in args.eras:
                datasets_to_process += [ds]
                print("Will consider dataset", ds)
    if len(datasets_to_process) == 0:
        raise Exception("No datasets considered, please check the --datasets and --eras options")
    datasets = datasets_to_process

    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB
    NUMPY_LIB = hmumu_utils.NUMPY_LIB

    # All analysis definitions (cut values etc) should go here
    analysis_parameters = {
        "baseline": {
            "nPV": 0,
            "NdfPV": 4,
            "zPV": 24,

            # Will be applied with OR
            "hlt_bits": {
                "2016": ["HLT_IsoMu24", "HLT_IsoTkMu24"],
                "2017": ["HLT_IsoMu27"],
                "2018": ["HLT_IsoMu24"],
            },

            "muon_pt": 20,
            "muon_pt_leading": {"2016": 26.0, "2017": 29.0, "2018": 26.0},
            "muon_eta": 2.4,
            "muon_iso": 0.25,
            "muon_id": {"2016": "medium", "2017": "medium", "2018": "medium"},
            "muon_trigger_match_dr": 0.1,
            "muon_iso_trigger_matched": 0.15,
            "muon_id_trigger_matched": {"2016": "tight", "2017": "tight", "2018": "tight"},

            "do_rochester_corrections": True,
            "do_lepton_sf": True,

            "do_jec": True,
            "jec_tag": {"2016": "Summer16_07Aug2017_V11", "2017": "Fall17_17Nov2017_V32", "2018": "Autumn18_V16"},

            "jet_mu_dr": 0.4,
            "jet_pt_leading": {"2016": 35.0, "2017": 35.0, "2018": 35.0},
            "jet_pt_subleading": {"2016": 25.0, "2017": 25.0, "2018": 25.0},
            "jet_eta": 4.7,
            "jet_id": "tight",
            "jet_puid": "loose",
            "jet_veto_eta": [2.65, 3.139],
            "jet_veto_raw_pt": 50.0,
            "jet_btag": {"2016": 0.6321, "2017": 0.4941, "2018": 0.4184},
            "do_factorized_jec": args.do_factorized_jec,

            "cat5_dijet_inv_mass": 400.0,
            "cat5_abs_jj_deta_cut": 2.5,

            "masswindow_z_peak": [76, 106],
            "masswindow_h_sideband": [110, 150],
            "masswindow_h_peak": [115, 135],

            "inv_mass_bins": 41,

            "extra_electrons_pt": 20,
            "extra_electrons_eta": 2.5,
            "extra_electrons_iso": 0.4,  # Check if we want to apply this
            "extra_electrons_id": "mvaFall17V1Iso_WP90",

            "save_dnn_vars": True,
            "dnn_vars_path": "{0}/dnn_vars".format(args.out),

            # If true, apply mjj > cut, otherwise inverse
            "vbf_filter_mjj_cut": 350,
            "vbf_filter": {
                "dy_m105_160_mg": True,
                "dy_m105_160_amc": True,
                "dy_m105_160_vbf_mg": False,
                "dy_m105_160_vbf_amc": False,
            },

            # Irene's DNN input variable order for keras
            "dnn_varlist_order": [
                'softJet5', 'dRmm', 'dEtamm', 'M_jj', 'pt_jj', 'eta_jj', 'phi_jj',
                'M_mmjj', 'eta_mmjj', 'phi_mmjj', 'dEta_jj', 'Zep',
                'dRmin_mj', 'dRmax_mj', 'dRmin_mmj', 'dRmax_mmj',
                'dPhimm', 'leadingJet_pt', 'subleadingJet_pt',
                'leadingJet_eta', 'subleadingJet_eta',
                'leadingJet_qgl', 'subleadingJet_qgl',
                'cthetaCS', 'Higgs_pt', 'Higgs_eta', 'Higgs_mass'
            ],
            "dnn_input_histogram_bins": {
                "softJet5": (0, 10, 10),
                "dRmm": (0, 5, 41),
                "dEtamm": (-2, 2, 41),
                "dPhimm": (-2, 2, 41),
                "M_jj": (0, 2000, 41),
                "pt_jj": (0, 400, 41),
                "eta_jj": (-5, 5, 41),
                "phi_jj": (-5, 5, 41),
                "M_mmjj": (0, 2000, 41),
                "eta_mmjj": (-3, 3, 41),
                "phi_mmjj": (-3, 3, 41),
                "dEta_jj": (-3, 3, 41),
                "Zep": (-2, 2, 41),
                "dRmin_mj": (0, 5, 41),
                "dRmax_mj": (0, 5, 41),
                "dRmin_mmj": (0, 5, 41),
                "dRmax_mmj": (0, 5, 41),
                "leadingJet_pt": (0, 200, 41),
                "subleadingJet_pt": (0, 200, 41),
                "leadingJet_eta": (-5, 5, 41),
                "subleadingJet_eta": (-5, 5, 41),
                "leadingJet_qgl": (0, 1, 41),
                "subleadingJet_qgl": (0, 1, 41),
                "cthetaCS": (-1, 1, 41),
                "Higgs_pt": (0, 200, 41),
                "Higgs_eta": (-3, 3, 41),
                "Higgs_mass": (110, 150, 41),
                "dnn_pred": (0, 1, 1001),
                "dnn_pred2": (0, 1, 11),
                "bdt_ucsd": (-1, 1, 41),
                "bdt2j_ucsd": (-1, 1, 41),
                "bdt01j_ucsd": (-1, 1, 41),
                "MET_pt": (0, 200, 41),
                "hmmthetacs": (-1, 1, 41),
                "hmmphics": (-4, 4, 41),
            },

            "categorization_trees": {}
        },
    }

    histo_bins = {
        "muon_pt": np.linspace(0, 200, 101, dtype=np.float32),
        "npvs": np.linspace(0, 100, 101, dtype=np.float32),
        "dijet_inv_mass": np.linspace(0, 2000, 41, dtype=np.float32),
        "inv_mass": np.linspace(70, 150, 41, dtype=np.float32),
        "numjet": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_pt": np.linspace(0, 300, 101, dtype=np.float32),
        "jet_eta": np.linspace(-4.7, 4.7, 41, dtype=np.float32),
        "pt_balance": np.linspace(0, 5, 41, dtype=np.float32),
        "numjets": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_qgl": np.linspace(0, 1, 41, dtype=np.float32),
        "higgs_inv_mass_uncertainty": np.linspace(0, 10, 101, dtype=np.float32),
        "higgs_rel_inv_mass_uncertainty": np.linspace(0, 0.05, 101, dtype=np.float32)
    }
    for hname, bins in analysis_parameters["baseline"]["dnn_input_histogram_bins"].items():
        histo_bins[hname] = np.linspace(bins[0], bins[1], bins[2], dtype=np.float32)

    for masswindow in ["z_peak", "h_peak", "h_sideband"]:
        mw = analysis_parameters["baseline"]["masswindow_" + masswindow]
        histo_bins["inv_mass_{0}".format(masswindow)] = np.linspace(mw[0], mw[1], 41, dtype=np.float32)

    histo_bins["dnn_pred2"] = {
        "h_peak": np.array([0., 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 1.0], dtype=np.float32),
        "z_peak": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], dtype=np.float32),
        "h_sideband": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0], dtype=np.float32),
    }

    analysis_parameters["baseline"]["histo_bins"] = histo_bins

    #analysis_parameters["oldjec"] = copy.deepcopy(analysis_parameters["baseline"])
    #analysis_parameters["oldjec"]["jec_tag"]["2018"] = "Autumn18_V8"

    # Run baseline analysis
    outpath = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath)
    except FileExistsError as e:
        pass

    with open('{0}/parameters.pkl'.format(outpath), 'wb') as handle:
        pickle.dump(analysis_parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Recreate dump of all filenames
    cache_filename = args.cache_location + "/datasets.json"
    if ("cache" in args.action) and (args.jobfiles is None):
        print("--action cache and no jobfiles specified, creating datasets.json dump of all filenames")
        if not os.path.isdir(args.cache_location):
            os.makedirs(args.cache_location)
        filenames_cache = {}

        for dataset in datasets:
            dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
            filenames_all = glob.glob(args.datapath + dataset_globpattern, recursive=True)
            filenames_all = [fn for fn in filenames_all if not "Friend" in fn]
            filenames_cache[dataset_name + "_" + dataset_era] = [
                fn.replace(args.datapath, "") for fn in filenames_all]

            if len(filenames_all) == 0:
                raise Exception("Dataset {0} matched 0 files from glob pattern {1}, verify that the data files are located in {2}".format(
                    dataset_name, dataset_globpattern, args.datapath
                ))

        # save all dataset filenames to a json file
        print("Creating a json dump of all the dataset filenames based on data found in {0}".format(args.datapath))
        if os.path.isfile(cache_filename):
            print("Cache file {0} already exists, we will not overwrite it to be safe.".format(cache_filename), file=sys.stderr)
            print("Delete it or change --cache-location and try again.", file=sys.stderr)
            sys.exit(1)
        with open(cache_filename, "w") as fi:
            fi.write(json.dumps(filenames_cache, indent=2))

    if ("cache" in args.action or "analyze" in args.action) and (args.jobfiles is None):
        # Create a list of job files for processing
        jobfile_data = []
        print("Loading list of filenames from {0}".format(cache_filename))
        if not os.path.isfile(cache_filename):
            raise Exception("Cached dataset list of filenames not found in {0}, please run this code with --action cache".format(
                cache_filename))
        filenames_cache = json.load(open(cache_filename, "r"))

        for dataset in datasets:
            dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
            try:
                filenames_all = filenames_cache[dataset_name + "_" + dataset_era]
            except KeyError as e:
                print("Could not load {0} from {1}, please make sure this dataset has been added to cache".format(
                    dataset_name + "_" + dataset_era, cache_filename), file=sys.stderr)
                raise e

            filenames_all_full = [args.datapath + "/" + fn for fn in filenames_all]
            chunksize = args.chunksize * chunksize_multiplier.get(dataset_name, 1)
            print("Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles".format(
                dataset_name, dataset_era, len(filenames_all_full), chunksize))
            jobfile_dataset = create_dataset_jobfiles(dataset_name, dataset_era,
                                                      filenames_all_full, is_mc, chunksize, args.out)
            jobfile_data += jobfile_dataset
            print("Dataset {0}_{1} consists of {2} chunks".format(
                dataset_name, dataset_era, len(jobfile_dataset)))

        assert(len(jobfile_data) > 0)
        assert(len(jobfile_data[0]["filenames"]) > 0)

    # For each dataset, find out which chunks we want to process
    if "cache" in args.action or "analyze" in args.action:
        jobfile_data = []
        if not (args.jobfiles_load is None):
            args.jobfiles = [l.strip() for l in open(args.jobfiles_load).readlines()]
        if args.jobfiles is None:
            print("You did not specify to process specific dataset chunks, assuming you want to process all chunks")
            print("If this is not true, please specify e.g. --jobfiles data_2018_0.json data_2018_1.json ...")
            args.jobfiles = []
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                jobfiles_dataset = glob.glob(args.out + "/jobfiles/{0}_{1}_*.json".format(dataset_name, dataset_era))
                assert(len(jobfiles_dataset) > 0)
                if args.maxchunks > 0:
                    jobfiles_dataset = jobfiles_dataset[:args.maxchunks]
                args.jobfiles += jobfiles_dataset

        # Now load the jobfiles
        assert(len(args.jobfiles) > 0)
        print("You specified --jobfiles {0}, processing only these dataset chunks".format(" ".join(args.jobfiles)))
        jobfile_data = []
        for f in args.jobfiles:
            jobfile_data += [json.load(open(f))]

        chunkstr = " ".join(["{0}_{1}_{2}".format(
            ch["dataset_name"], ch["dataset_era"], ch["dataset_num_chunk"])
            for ch in jobfile_data])
        print("Will process {0} dataset chunks: {1}".format(len(jobfile_data), chunkstr))
        assert(len(jobfile_data) > 0)

    # Start the profiler only in the actual data processing
    if do_prof:
        import yappi
        filename = 'analysis.prof'
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    if "cache" in args.action:
        print("Running the 'cache' step of the analysis, ROOT files will be opened and branches will be uncompressed")
        print("Will retrieve dataset filenames based on existing ROOT files on filesystem in datapath={0}".format(args.datapath))
        try:
            os.makedirs(args.cache_location)
        except Exception as e:
            pass
        run_cache(args, outpath, jobfile_data, analysis_parameters)

    if "analyze" in args.action:
        run_analysis(args, outpath, jobfile_data, analysis_parameters, analysis_corrections)

    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                fut = executor.submit(merge_partial_results, dataset_name, dataset_era, outpath)
        print("done merging")

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save(filename, type='callgrind')

    import resource
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory / 1024))
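# Sketch of the datasets.json cache layout that the code above writes: one key per
# dataset_name + "_" + dataset_era, mapping to the matched file paths with args.datapath
# stripped off. The dataset names and paths below are placeholders, not real entries.
#
# {
#   "ggh_amcPS_2018": [
#     "/store/mc/RunIIAutumn18NanoAODv5/.../file_1.root",
#     "/store/mc/RunIIAutumn18NanoAODv5/.../file_2.root"
#   ],
#   "data_2018": [
#     "/store/data/Run2018A/.../file_1.root"
#   ]
# }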
bins = np.linspace(0, 2, 1000)
plt.hist(data1["corr_JEC"], bins=bins, histtype="step", label="our code")
plt.hist(data2["corr_JEC"], bins=bins, histtype="step", label="NanoAODTools")
plt.xlabel("JEC correction")
plt.savefig("corr_JEC.pdf")

plt.figure()
bins = np.linspace(0, 2, 1000)
plt.hist(data1["corr_JER"], bins=bins, histtype="step", label="our code")
plt.hist(data2["corr_JER"], bins=bins, histtype="step", label="NanoAODTools")
plt.xlabel("JER correction")
plt.savefig("corr_JER.pdf")


if __name__ == "__main__":
    use_cuda = False
    NUMPY_LIB, ha = choose_backend(use_cuda)
    hmumu_utils.NUMPY_LIB = np
    hmumu_utils.ha = ha

    job_desc = {
        "dataset_name": "ggh_amcPS",
        "is_mc": True,
        "dataset_era": "2018",
        "filenames": [
            "/store/mc/RunIIAutumn18NanoAODv5/GluGluHToMuMu_M125_TuneCP5_PSweights_13TeV_amcatnloFXFX_pythia8/NANOAODSIM/Nano1June2019_102X_upgrade2018_realistic_v19-v1/100000/359F045D-D71C-E84E-9BD1-0BEA8E6228C5.root",
        ],
        "dataset_num_chunk": 0,
    }

    cache_location = "/storage/user/nlu/hmm/cache2"
    datapath = "/storage/group/allcit/"

    datastructures = create_datastructure(job_desc["dataset_name"],
                                          job_desc["is_mc"],
                                          job_desc["dataset_era"])
def main(args, datasets):
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    # use the environment variable for the cupy/cuda choice
    args.use_cuda = USE_CUPY

    # Optionally disable pinned memory (will be somewhat slower)
    if args.use_cuda:
        import cupy
        if not args.pinned:
            cupy.cuda.set_allocator(None)
            cupy.cuda.set_pinned_memory_allocator(None)

    # Use sync-only datasets
    if args.do_sync:
        datasets = datasets_sync

    # Filter datasets by era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds[0] in args.datasets:
            if args.eras is None or ds[1] in args.eras:
                datasets_to_process += [ds]
                print("Will consider dataset", ds)
    if len(datasets_to_process) == 0:
        raise Exception(
            "No datasets considered, please check the --datasets and --eras options"
        )
    datasets = datasets_to_process

    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB
    NUMPY_LIB = hmumu_utils.NUMPY_LIB

    outpath_partial = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath_partial)
    except FileExistsError as e:
        print("Output path {0} already exists, not recreating".format(outpath_partial))

    # save the parameters as a pkl file
    from pars import analysis_parameters
    for analysis_name in analysis_parameters.keys():
        analysis_parameters[analysis_name]["do_factorized_jec"] = args.do_factorized_jec
        analysis_parameters[analysis_name]["dnn_vars_path"] = "{0}/dnn_vars".format(args.out)
    with open('{0}/parameters.pkl'.format(outpath_partial), 'wb') as handle:
        pickle.dump(analysis_parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Recreate dump of all filenames
    cache_filename = args.cache_location + "/datasets.json"
    if ("cache" in args.action) and (args.jobfiles is None):
        check_and_recreate_filename_cache(cache_filename, args.cache_location, args.datapath)

    # Create the jobfiles
    if ("cache" in args.action or "analyze" in args.action) and (args.jobfiles is None):
        create_all_jobfiles(datasets, cache_filename, args.datapath, args.chunksize, args.out)

    # For each dataset, find out which chunks we want to process
    if "cache" in args.action or "analyze" in args.action:
        jobfile_data = load_jobfiles(datasets, args.jobfiles_load, args.jobfiles,
                                     args.maxchunks, args.out)

    # Start the profiler only in the actual data processing
    if do_prof:
        import yappi
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    if "cache" in args.action:
        print("Running the 'cache' step of the analysis, ROOT files will be opened and branches will be uncompressed")
        run_cache(args, outpath_partial, jobfile_data, analysis_parameters)

    # Run the physics analysis on all specified jobfiles
    if "analyze" in args.action:
        print("Running the 'analyze' step of the analysis, processing the events into histograms with all systematics")
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)
        run_analysis(args, outpath_partial, jobfile_data, analysis_parameters,
                     analysis_corrections)

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save("analysis.prof", type='callgrind')

    # Merge the partial results (pieces of each dataset)
    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                fut = executor.submit(merge_partial_results, dataset_name, dataset_era,
                                      args.out, outpath_partial)
        print("done merging")

    # print memory usage
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory / 1024))