def test_run_analysis_mc(self):
        """Run the analysis on the 2016 VBF sync MC file and check the result.

        Processes ``data/myNanoProdMc2016_NANO.root`` as a single MC chunk
        into ``test_out`` and compares the event count, the sum of generator
        weights and the number of selected dimuon events stored in the
        pickled chunk result against reference values.
        """
        import hmumu_utils
        from hmumu_utils import run_analysis

        # Point hmumu_utils at the array backend held by this test case.
        hmumu_utils.NUMPY_LIB = self.NUMPY_LIB
        hmumu_utils.ha = self.ha

        job_descriptions = [{
            "dataset_name": "vbf_sync",
            "dataset_era": "2016",
            "filenames": ["data/myNanoProdMc2016_NANO.root"],
            "is_mc": True,
            "dataset_num_chunk": 0,
            "random_seed": 0
        }]

        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs("test_out", exist_ok=True)
        run_analysis(self.cmdline_args,
                     "test_out",
                     job_descriptions,
                     self.analysis_parameters,
                     self.analysis_corrections,
                     numev_per_chunk=10000)

        # Read the chunk result back; the context manager closes the file
        # instead of leaking the handle.
        with open("test_out/vbf_sync_2016_0.pkl", "rb") as fi:
            ret2 = pickle.load(fi)

        print(ret2["num_events"])
        print(ret2["genEventSumw"])
        print(ret2["baseline"]["selected_events_dimuon"])
        self.assertAlmostEqual(ret2["num_events"], 3674)
        self.assertAlmostEqual(ret2["genEventSumw"], 4.659182940800001)
        self.assertAlmostEqual(ret2["baseline"]["selected_events_dimuon"],
                               1561)
    def test_run_analysis_data_skim(self):
        """Run the analysis on the skimmed 2016 data file and check the result.

        The test is reported as skipped (rather than silently passing) when
        ``data/nano_2016_data_skim.root`` is not present. Otherwise the file
        is processed as a single data chunk into ``test_out`` and the event
        count, integrated luminosity and number of selected dimuon events in
        the pickled chunk result are compared against reference values.
        """
        fn = "data/nano_2016_data_skim.root"
        if not os.path.isfile(fn):
            # A plain return would count as a pass; make the skip visible.
            self.skipTest("Data sync file {0} not found, skipping".format(fn))

        from hmumu_utils import run_analysis

        job_descriptions = [{
            "dataset_name": "data_skim",
            "dataset_era": "2016",
            "filenames": [fn],
            "is_mc": False,
            "dataset_num_chunk": 0,
            "random_seed": 0
        }]

        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs("test_out", exist_ok=True)

        run_analysis(self.cmdline_args,
                     "test_out",
                     job_descriptions,
                     self.analysis_parameters,
                     self.analysis_corrections,
                     numev_per_chunk=10000)

        # The context manager closes the file instead of leaking the handle.
        with open("test_out/data_skim_2016_0.pkl", "rb") as fi:
            ret2 = pickle.load(fi)

        print(ret2["num_events"])
        print(ret2["int_lumi"])
        print(ret2["baseline"]["selected_events_dimuon"])
        self.assertAlmostEqual(ret2["num_events"], 142)
        self.assertAlmostEqual(ret2["int_lumi"], 0.130571592)
        self.assertAlmostEqual(ret2["baseline"]["selected_events_dimuon"], 41)
# Example #3
# 0
    def test_run_analysis_mc_skim(self):
        """Run the analysis on the skimmed 2016 VBF MC file and check the result.

        The test is reported as skipped (rather than silently passing) when
        ``data/myNanoProdMc2016_NANO_skim.root`` is not present. Otherwise
        the file is processed as a single MC chunk into ``test_out`` and the
        event count, sum of generator weights and number of selected dimuon
        events in the pickled chunk result are compared against reference
        values.
        """
        fn = "data/myNanoProdMc2016_NANO_skim.root"
        if not os.path.isfile(fn):
            # A plain return would count as a pass; make the skip visible.
            self.skipTest("File {0} not found, skipping".format(fn))

        from hmumu_utils import run_analysis

        job_descriptions = [{
            "dataset_name": "vbf_sync_skim",
            "dataset_era": "2016",
            "filenames": [fn],
            "is_mc": True,
            "dataset_num_chunk": 0,
            "random_seed": 0
        }]

        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs("test_out", exist_ok=True)

        run_analysis(self.cmdline_args,
                     "test_out",
                     job_descriptions,
                     self.analysis_parameters,
                     self.analysis_corrections,
                     numev_per_chunk=10000)

        # The context manager closes the file instead of leaking the handle.
        with open("test_out/vbf_sync_skim_2016_0.pkl", "rb") as fi:
            ret2 = pickle.load(fi)

        self.assertAlmostEqual(ret2["num_events"], 73903)
        self.assertAlmostEqual(ret2["genEventSumw"], 3.7593771153623963)
        self.assertAlmostEqual(ret2["baseline"]["selected_events_dimuon"],
                               62172)
    def test_run_analysis_mc_and_data(self):
        """Run MC and data together, check the normalisation and make a plot.

        Processes one "dy" MC chunk and one "data" chunk into a freshly
        recreated ``test_out`` directory, checks the MC sum of generator
        weights and the data integrated luminosity against reference values,
        and finally calls ``make_pdf_plot`` for the leading-muon pt histogram
        of the "baseline" analysis.
        """
        fn_mc = "/storage/user/nlu/hmm/automaticTest/myNanoProdMc2016_NANO.root"

        # Fall back to the local copy when the storage path is unavailable.
        if not os.path.isfile(fn_mc):
            fn_mc = "data/myNanoProdMc2016_NANO.root"

        fn_data = "data/nano_2016_data.root"

        from hmumu_utils import run_analysis

        job_descriptions = [{
            "dataset_name": "dy",
            "dataset_era": "2016",
            "filenames": [fn_mc],
            "is_mc": True,
            "dataset_num_chunk": 0,
            "random_seed": 0
        }, {
            "dataset_name": "data",
            "dataset_era": "2016",
            "filenames": [fn_data],
            "is_mc": False,
            "dataset_num_chunk": 0,
            "random_seed": 0
        }]

        # Start from an empty output directory.
        outpath = "test_out"
        if os.path.exists(outpath):
            shutil.rmtree(outpath)
        os.makedirs(outpath)

        # Work on a copy so the shared command-line namespace is not modified.
        cmdline_args = copy.deepcopy(self.cmdline_args)
        cmdline_args.out = outpath

        run_analysis(
            cmdline_args,
            outpath,
            job_descriptions,
            self.analysis_parameters,
            self.analysis_corrections,
        )

        from plotting import make_pdf_plot
        from pars import cross_sections

        # Context managers close the pickle files instead of leaking handles.
        with open(outpath + "/data_2016_0.pkl", "rb") as fi:
            res = {"data": pickle.load(fi)}
        analysis = "baseline"
        var = "hist__dimuon__leading_muon_pt"
        era = "2016"

        int_lumi = res["data"]["int_lumi"]
        mc_samples = ["dy"]
        process_groups = [("dy", ["dy"])]

        genweights = {}
        weight_xs = {}
        for mc_samp in mc_samples:
            with open(outpath + "/{0}_2016_0.pkl".format(mc_samp), "rb") as fi:
                res[mc_samp] = pickle.load(fi)
            genweights[mc_samp] = res[mc_samp]["genEventSumw"]
            # Per-sample weight: cross section * luminosity / sum of gen weights.
            weight_xs[mc_samp] = cross_sections[
                mc_samp] * int_lumi / genweights[mc_samp]

        print(genweights["dy"])
        print(int_lumi)
        self.assertAlmostEqual(genweights["dy"], 4.659182940800001)
        self.assertAlmostEqual(int_lumi, 0.130571592)

        histos = {}
        for sample in mc_samples + ["data"]:
            histos[sample] = res[sample][analysis][var]
        hdata = res["data"][analysis][var]["nominal"]

        outdir = "{0}/{1}/plots/{2}".format(outpath, analysis, era)
        plot_args = (histos, hdata, mc_samples, analysis, var, "nominal",
                     weight_xs, int_lumi, outdir, era, process_groups, {})

        make_pdf_plot(plot_args)
# Example #5
# 0
def main(args):
    """Prepare the inputs and run the requested 'analyze' and 'merge' steps.

    The datasets are read from the YAML file ``args.datasets_yaml`` and
    filtered by ``args.datasets`` and ``args.eras``. The analysis parameters
    are pickled to ``<args.out>/partial_results/parameters.pkl``, the
    filename cache and (unless ``args.jobfiles`` is given) the jobfiles are
    created, and then the steps named in ``args.action`` are executed.

    Args:
        args: parsed command-line namespace. ``args.use_cuda`` is overwritten
            with the module-level ``USE_CUPY`` value.

    Raises:
        Exception: if no dataset passes the --datasets / --eras filter.
            Exceptions raised by a ``merge_partial_results`` worker are
            re-raised here instead of being silently dropped.
    """
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    # use the environment variable for cupy/cuda choice
    args.use_cuda = USE_CUPY

    # Close the YAML file once it is parsed instead of leaking the handle
    with open(args.datasets_yaml) as fi:
        datasets = yaml.load(fi, Loader=yaml.FullLoader)["datasets"]

    # Filter datasets by name and era (a None filter accepts everything)
    datasets = [
        ds for ds in datasets
        if (args.datasets is None or ds["name"] in args.datasets) and (
            args.eras is None or ds["era"] in args.eras)
    ]
    if not datasets:
        raise Exception(
            "No datasets considered, please check the --datasets and --eras options"
        )

    # Choose either the CPU or GPU(CUDA) backend
    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB

    outpath_partial = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath_partial)
    except FileExistsError:
        print("Output path {0} already exists, not recreating".format(
            outpath_partial))

    # save the parameters as a pkl file
    from pars import analysis_parameters
    for analysis_name in analysis_parameters.keys():
        analysis_parameters[analysis_name][
            "do_factorized_jec"] = args.do_factorized_jec
        analysis_parameters[analysis_name][
            "dnn_vars_path"] = "{0}/dnn_vars".format(args.out)
    with open('{0}/parameters.pkl'.format(outpath_partial), 'wb') as handle:
        pickle.dump(analysis_parameters,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    # Recreate dump of all filenames
    cache_filename = "{0}/datasets.json".format(args.out)

    use_skim = False
    if args.cachepath is None:
        print(
            "--cachepath not specified, will process unskimmed NanoAOD, which is somewhat slower!"
        )
        print("Please see the README.md on how to skim the NanoAOD")
        datapath = args.datapath
    else:
        print("Processing skimmed NanoAOD")
        datapath = args.cachepath
        use_skim = True
    check_and_recreate_filename_cache(cache_filename, datapath, datasets,
                                      use_skim)

    # Create the jobfiles
    if args.jobfiles is None:
        create_all_jobfiles(datasets, cache_filename, datapath, args.chunksize,
                            args.out)

    # For each dataset, find out which chunks we want to process
    if "analyze" in args.action:
        jobfile_data = load_jobfiles(datasets, args.jobfiles_load,
                                     args.jobfiles, args.maxchunks, args.out)

    # If we want to check what part of the code is slow, start the profiler only in the actual data processing
    if do_prof:
        import yappi
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    # Run the physics analysis on all specified jobfiles
    if "analyze" in args.action:
        print(
            "Running the 'analyze' step of the analysis, processing the events into histograms with all systematics"
        )
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)
        run_analysis(args, outpath_partial, jobfile_data, analysis_parameters,
                     analysis_corrections)

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save("analysis.prof", type='callgrind')

    # Merge the partial results (pieces of each dataset)
    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            futures = [
                executor.submit(merge_partial_results, dataset["name"],
                                dataset["era"], args.out, outpath_partial)
                for dataset in datasets
            ]
            # result() re-raises any exception from the worker; without it a
            # failed merge would go unnoticed and "done merging" would still
            # be printed.
            for future in futures:
                future.result()
        print("done merging")

    # print memory usage for debugging
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory / 1024))
# Example #6
# 0
    def test_run_analysis_mc_and_data(self):
        """Run MC and data together, check the normalisation and make a plot.

        The test is reported as skipped (rather than silently passing) when
        either input file is missing. Otherwise one "dy" MC chunk and one
        "data" chunk are processed into a freshly recreated ``test_out``
        directory, the MC sum of generator weights and the data integrated
        luminosity are compared against reference values, and
        ``make_pdf_plot`` is called for the leading-muon pt histogram of the
        "baseline" analysis.
        """
        fn_mc = "/storage/group/allcit/store/mc/RunIISummer16NanoAODv5/DYJetsToLL_M-50_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/PUMoriond17_Nano1June2019_102X_mcRun2_asymptotic_v7_ext2-v1/120000/CBCAE1AB-4AFD-D840-BE00-9E5ABD2E4A20.root"
        fn_data = "data/nano_2016_data.root"

        if not (os.path.isfile(fn_mc) and os.path.isfile(fn_data)):
            # A plain return would count as a pass; make the skip visible.
            self.skipTest("file {0} or {1} not found, skipping".format(
                fn_mc, fn_data))

        from hmumu_utils import run_analysis

        job_descriptions = [{
            "dataset_name": "dy",
            "dataset_era": "2016",
            "filenames": [fn_mc],
            "is_mc": True,
            "dataset_num_chunk": 0,
            "random_seed": 0
        }, {
            "dataset_name": "data",
            "dataset_era": "2016",
            "filenames": [fn_data],
            "is_mc": False,
            "dataset_num_chunk": 0,
            "random_seed": 0
        }]

        # Start from an empty output directory.
        outpath = "test_out"
        if os.path.exists(outpath):
            shutil.rmtree(outpath)
        os.makedirs(outpath)

        # Work on a copy so the shared command-line namespace is not modified.
        cmdline_args = copy.deepcopy(self.cmdline_args)
        cmdline_args.out = outpath

        run_analysis(
            cmdline_args,
            outpath,
            job_descriptions,
            self.analysis_parameters,
            self.analysis_corrections,
        )

        from plotting import make_pdf_plot
        from pars import cross_sections

        # Context managers close the pickle files instead of leaking handles.
        with open(outpath + "/data_2016_0.pkl", "rb") as fi:
            res = {"data": pickle.load(fi)}
        analysis = "baseline"
        var = "hist__dimuon__leading_muon_pt"
        era = "2016"

        int_lumi = res["data"]["int_lumi"]
        mc_samples = ["dy"]
        process_groups = [("dy", ["dy"])]

        genweights = {}
        weight_xs = {}
        for mc_samp in mc_samples:
            with open(outpath + "/{0}_2016_0.pkl".format(mc_samp), "rb") as fi:
                res[mc_samp] = pickle.load(fi)
            genweights[mc_samp] = res[mc_samp]["genEventSumw"]
            # Per-sample weight: cross section * luminosity / sum of gen weights.
            weight_xs[mc_samp] = cross_sections[
                mc_samp] * int_lumi / genweights[mc_samp]

        self.assertAlmostEqual(genweights["dy"], 6073.25342144)
        self.assertAlmostEqual(int_lumi, 5.633297364)

        histos = {}
        for sample in mc_samples + ["data"]:
            histos[sample] = res[sample][analysis][var]
        hdata = res["data"][analysis][var]["nominal"]

        outdir = "{0}/{1}/plots/{2}".format(outpath, analysis, era)
        plot_args = (histos, hdata, mc_samples, analysis, var, "nominal",
                     weight_xs, int_lumi, outdir, era, process_groups, {})

        make_pdf_plot(plot_args)
def main(args, datasets):
    """Run the 'cache', 'analyze' and 'merge' steps selected in args.action.

    Defines the analysis parameters and histogram binnings, pickles them to
    ``<args.out>/partial_results/parameters.pkl``, optionally recreates the
    ``datasets.json`` filename cache and the per-dataset jobfiles, loads the
    jobfiles to process and then executes the requested steps.

    Args:
        args: parsed command-line namespace. ``args.use_cuda`` is overwritten
            with the module-level ``USE_CUPY`` value and ``args.jobfiles``
            may be filled in from ``args.jobfiles_load`` or from the jobfiles
            found under ``<args.out>/jobfiles``.
        datasets: iterable of ``(name, era, glob pattern, is_mc)`` tuples;
            replaced by ``datasets_sync`` when ``args.do_sync`` is set.

    Raises:
        Exception: if no dataset passes the --datasets / --eras filter, if a
            dataset matches no files, or if the filename cache is missing.
            Exceptions raised by a ``merge_partial_results`` worker are
            re-raised here instead of being silently dropped.
    """
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    #use the environment variable for cupy/cuda choice
    args.use_cuda = USE_CUPY

    analysis_corrections = None
    if "analyze" in args.action:
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)

    # Optionally disable pinned memory (will be somewhat slower)
    if args.use_cuda:
        import cupy
        if not args.pinned:
            cupy.cuda.set_allocator(None)
            cupy.cuda.set_pinned_memory_allocator(None)

    #Use sync-only datasets
    if args.do_sync:
        datasets = datasets_sync

    #Filter datasets by name and era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds[0] in args.datasets:
            if args.eras is None or ds[1] in args.eras:
                datasets_to_process += [ds]
                print("Will consider dataset", ds)
    # Check the filtered list: checking the unfiltered input would let an
    # empty selection slip through.
    if len(datasets_to_process) == 0:
        raise Exception("No datasets considered, please check the --datasets and --eras options")
    datasets = datasets_to_process

    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB

    # All analysis definitions (cut values etc) should go here
    analysis_parameters = {
        "baseline": {

            "nPV": 0,
            "NdfPV": 4,
            "zPV": 24,

            # Will be applied with OR
            "hlt_bits": {
                "2016": ["HLT_IsoMu24", "HLT_IsoTkMu24"],
                "2017": ["HLT_IsoMu27"],
                "2018": ["HLT_IsoMu24"],
                },

            "muon_pt": 20,
            "muon_pt_leading": {"2016": 26.0, "2017": 29.0, "2018": 26.0},
            "muon_eta": 2.4,
            "muon_iso": 0.25,
            "muon_id": {"2016": "medium", "2017": "medium", "2018": "medium"},
            "muon_trigger_match_dr": 0.1,
            "muon_iso_trigger_matched": 0.15,
            "muon_id_trigger_matched": {"2016": "tight", "2017": "tight", "2018": "tight"},

            "do_rochester_corrections": True,
            "do_lepton_sf": True,

            "do_jec": True,
            "jec_tag": {"2016": "Summer16_07Aug2017_V11", "2017": "Fall17_17Nov2017_V32", "2018": "Autumn18_V16"},
            "jet_mu_dr": 0.4,
            "jet_pt_leading": {"2016": 35.0, "2017": 35.0, "2018": 35.0},
            "jet_pt_subleading": {"2016": 25.0, "2017": 25.0, "2018": 25.0},
            "jet_eta": 4.7,
            "jet_id": "tight",
            "jet_puid": "loose",
            "jet_veto_eta": [2.65, 3.139],
            "jet_veto_raw_pt": 50.0,
            "jet_btag": {"2016": 0.6321, "2017": 0.4941, "2018": 0.4184},
            "do_factorized_jec": args.do_factorized_jec,

            "cat5_dijet_inv_mass": 400.0,
            "cat5_abs_jj_deta_cut": 2.5,

            "masswindow_z_peak": [76, 106],
            "masswindow_h_sideband": [110, 150],
            "masswindow_h_peak": [115, 135],

            "inv_mass_bins": 41,

            "extra_electrons_pt": 20,
            "extra_electrons_eta": 2.5,
            "extra_electrons_iso": 0.4, #Check if we want to apply this
            "extra_electrons_id": "mvaFall17V1Iso_WP90",

            "save_dnn_vars": True,
            "dnn_vars_path": "{0}/dnn_vars".format(args.out),

            #If true, apply mjj > cut, otherwise inverse
            "vbf_filter_mjj_cut": 350,
            "vbf_filter": {
                "dy_m105_160_mg": True,
                "dy_m105_160_amc": True,
                "dy_m105_160_vbf_mg": False,
                "dy_m105_160_vbf_amc": False,
            },

            #Irene's DNN input variable order for keras
            "dnn_varlist_order": ['softJet5', 'dRmm','dEtamm','M_jj','pt_jj','eta_jj','phi_jj','M_mmjj','eta_mmjj','phi_mmjj','dEta_jj','Zep','dRmin_mj', 'dRmax_mj', 'dRmin_mmj','dRmax_mmj','dPhimm','leadingJet_pt','subleadingJet_pt', 'leadingJet_eta','subleadingJet_eta','leadingJet_qgl','subleadingJet_qgl','cthetaCS','Higgs_pt','Higgs_eta','Higgs_mass'],
            "dnn_input_histogram_bins": {
                "softJet5": (0,10,10),
                "dRmm": (0,5,41),
                "dEtamm": (-2,2,41),
                "dPhimm": (-2,2,41),
                "M_jj": (0,2000,41),
                "pt_jj": (0,400,41),
                "eta_jj": (-5,5,41),
                "phi_jj": (-5,5,41),
                "M_mmjj": (0,2000,41),
                "eta_mmjj": (-3,3,41),
                "phi_mmjj": (-3,3,41),
                "dEta_jj": (-3,3,41),
                "Zep": (-2,2,41),
                "dRmin_mj": (0,5,41),
                "dRmax_mj": (0,5,41),
                "dRmin_mmj": (0,5,41),
                "dRmax_mmj": (0,5,41),
                "leadingJet_pt": (0, 200, 41),
                "subleadingJet_pt": (0, 200, 41),
                "leadingJet_eta": (-5, 5, 41),
                "subleadingJet_eta": (-5, 5, 41),
                "leadingJet_qgl": (0, 1, 41),
                "subleadingJet_qgl": (0, 1, 41),
                "cthetaCS": (-1, 1, 41),
                "Higgs_pt": (0, 200, 41),
                "Higgs_eta": (-3, 3, 41),
                "Higgs_mass": (110, 150, 41),
                "dnn_pred": (0, 1, 1001),
                "dnn_pred2": (0, 1, 11),
                "bdt_ucsd": (-1, 1, 41),
                "bdt2j_ucsd": (-1, 1, 41),
                "bdt01j_ucsd": (-1, 1, 41),
                "MET_pt": (0, 200, 41),
                "hmmthetacs": (-1, 1, 41),
                "hmmphics": (-4, 4, 41),
            },

            "categorization_trees": {}
        },
    }
    histo_bins = {
        "muon_pt": np.linspace(0, 200, 101, dtype=np.float32),
        "npvs": np.linspace(0,100,101, dtype=np.float32),
        "dijet_inv_mass": np.linspace(0, 2000, 41, dtype=np.float32),
        "inv_mass": np.linspace(70, 150, 41, dtype=np.float32),
        "numjet": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_pt": np.linspace(0, 300, 101, dtype=np.float32),
        "jet_eta": np.linspace(-4.7, 4.7, 41, dtype=np.float32),
        "pt_balance": np.linspace(0, 5, 41, dtype=np.float32),
        "numjets": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_qgl": np.linspace(0, 1, 41, dtype=np.float32),
        "higgs_inv_mass_uncertainty": np.linspace(0, 10, 101, dtype=np.float32),
        "higgs_rel_inv_mass_uncertainty": np.linspace(0, 0.05, 101, dtype=np.float32)
    }
    # Each (low, high, npoints) tuple becomes an array of bin edges
    for hname, bins in analysis_parameters["baseline"]["dnn_input_histogram_bins"].items():
        histo_bins[hname] = np.linspace(bins[0], bins[1], bins[2], dtype=np.float32)

    for masswindow in ["z_peak", "h_peak", "h_sideband"]:
        mw = analysis_parameters["baseline"]["masswindow_" + masswindow]
        histo_bins["inv_mass_{0}".format(masswindow)] = np.linspace(mw[0], mw[1], 41, dtype=np.float32)

    # Overrides the uniform "dnn_pred2" binning set in the loop above with
    # explicit bin edges per mass window
    histo_bins["dnn_pred2"] = {
        "h_peak": np.array([0., 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 1.0], dtype=np.float32),
        "z_peak": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], dtype=np.float32),
        "h_sideband": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0], dtype=np.float32),
    }

    analysis_parameters["baseline"]["histo_bins"] = histo_bins

    #analysis_parameters["oldjec"] = copy.deepcopy(analysis_parameters["baseline"])
    #analysis_parameters["oldjec"]["jec_tag"]["2018"] = "Autumn18_V8"

    #Run baseline analysis
    outpath = "{0}/partial_results".format(args.out)
    os.makedirs(outpath, exist_ok=True)

    with open('{0}/parameters.pkl'.format(outpath), 'wb') as handle:
        pickle.dump(analysis_parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    #Recreate dump of all filenames
    cache_filename = args.cache_location + "/datasets.json"
    if ("cache" in args.action) and (args.jobfiles is None):
        print("--action cache and no jobfiles specified, creating datasets.json dump of all filenames")
        if not os.path.isdir(args.cache_location):
            os.makedirs(args.cache_location)
        filenames_cache = {}
        for dataset in datasets:
            dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
            filenames_all = glob.glob(args.datapath + dataset_globpattern, recursive=True)
            filenames_all = [fn for fn in filenames_all if "Friend" not in fn]
            # Store the filenames relative to the data path
            filenames_cache[dataset_name + "_" + dataset_era] = [
                fn.replace(args.datapath, "") for fn in filenames_all]

            if len(filenames_all) == 0:
                raise Exception("Dataset {0} matched 0 files from glob pattern {1}, verify that the data files are located in {2}".format(
                    dataset_name, dataset_globpattern, args.datapath
                ))

        #save all dataset filenames to a json file
        print("Creating a json dump of all the dataset filenames based on data found in {0}".format(args.datapath))
        if os.path.isfile(cache_filename):
            print("Cache file {0} already exists, we will not overwrite it to be safe.".format(cache_filename), file=sys.stderr)
            print("Delete it or change --cache-location and try again.", file=sys.stderr)
            sys.exit(1)
        with open(cache_filename, "w") as fi:
            fi.write(json.dumps(filenames_cache, indent=2))

    if ("cache" in args.action or "analyze" in args.action) and (args.jobfiles is None):
        #Create a list of job files for processing
        jobfile_data = []
        print("Loading list of filenames from {0}".format(cache_filename))
        if not os.path.isfile(cache_filename):
            raise Exception("Cached dataset list of filenames not found in {0}, please run this code with --action cache".format(
                cache_filename))
        with open(cache_filename, "r") as fi:
            filenames_cache = json.load(fi)

        for dataset in datasets:
            dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
            try:
                filenames_all = filenames_cache[dataset_name + "_" + dataset_era]
            except KeyError:
                print("Could not load {0} from {1}, please make sure this dataset has been added to cache".format(
                    dataset_name + "_" + dataset_era, cache_filename), file=sys.stderr)
                raise

            filenames_all_full = [args.datapath + "/" + fn for fn in filenames_all]
            chunksize = args.chunksize * chunksize_multiplier.get(dataset_name, 1)
            print("Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles".format(
                dataset_name, dataset_era, len(filenames_all_full), chunksize))
            jobfile_dataset = create_dataset_jobfiles(dataset_name, dataset_era,
                filenames_all_full, is_mc, chunksize, args.out)
            jobfile_data += jobfile_dataset
            print("Dataset {0}_{1} consists of {2} chunks".format(
                dataset_name, dataset_era, len(jobfile_dataset)))

        assert(len(jobfile_data) > 0)
        assert(len(jobfile_data[0]["filenames"]) > 0)

    #For each dataset, find out which chunks we want to process
    if "cache" in args.action or "analyze" in args.action:
        if not (args.jobfiles_load is None):
            with open(args.jobfiles_load) as fi:
                args.jobfiles = [line.strip() for line in fi]
        if args.jobfiles is None:
            print("You did not specify to process specific dataset chunks, assuming you want to process all chunks")
            print("If this is not true, please specify e.g. --jobfiles data_2018_0.json data_2018_1.json ...")
            args.jobfiles = []
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                jobfiles_dataset = glob.glob(args.out + "/jobfiles/{0}_{1}_*.json".format(dataset_name, dataset_era))
                assert(len(jobfiles_dataset) > 0)
                if args.maxchunks > 0:
                    jobfiles_dataset = jobfiles_dataset[:args.maxchunks]
                args.jobfiles += jobfiles_dataset

        #Now load the jobfiles
        assert(len(args.jobfiles) > 0)
        print("You specified --jobfiles {0}, processing only these dataset chunks".format(" ".join(args.jobfiles)))
        jobfile_data = []
        for f in args.jobfiles:
            with open(f) as fi:
                jobfile_data += [json.load(fi)]

        chunkstr = " ".join(["{0}_{1}_{2}".format(
            ch["dataset_name"], ch["dataset_era"], ch["dataset_num_chunk"])
            for ch in jobfile_data])
        print("Will process {0} dataset chunks: {1}".format(len(jobfile_data), chunkstr))
        assert(len(jobfile_data) > 0)

    #Start the profiler only in the actual data processing
    if do_prof:
        import yappi
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    if "cache" in args.action:
        print("Running the 'cache' step of the analysis, ROOT files will be opened and branches will be uncompressed")
        print("Will retrieve dataset filenames based on existing ROOT files on filesystem in datapath={0}".format(args.datapath))

        # The original referenced the undefined name cmdline_args here and
        # hid the resulting NameError behind a blanket except, so the cache
        # directory was never created at this point.
        os.makedirs(args.cache_location, exist_ok=True)

        run_cache(args, outpath, jobfile_data, analysis_parameters)

    if "analyze" in args.action:
        run_analysis(args, outpath, jobfile_data, analysis_parameters, analysis_corrections)

    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            futures = []
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                futures.append(
                    executor.submit(merge_partial_results, dataset_name, dataset_era, outpath))
            # result() re-raises any exception from the worker; without it a
            # failed merge would go unnoticed.
            for fut in futures:
                fut.result()
        print("done merging")
    if do_prof:
        stats = yappi.get_func_stats()
        stats.save('analysis.prof', type='callgrind')

    import resource
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory/1024))
# Example #8
# 0
def main(args, datasets):
    """Run the 'cache', 'analyze' and 'merge' steps selected in args.action.

    Filters the datasets by ``args.datasets`` and ``args.eras``, pickles the
    analysis parameters to ``<args.out>/partial_results/parameters.pkl``,
    prepares the filename cache and jobfiles as needed, and then executes the
    requested steps.

    Args:
        args: parsed command-line namespace. ``args.use_cuda`` is overwritten
            with the module-level ``USE_CUPY`` value.
        datasets: iterable of ``(name, era, glob pattern, is_mc)`` tuples;
            replaced by ``datasets_sync`` when ``args.do_sync`` is set.

    Raises:
        Exception: if no dataset passes the --datasets / --eras filter.
            Exceptions raised by a ``merge_partial_results`` worker are
            re-raised here instead of being silently dropped.
    """
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    #use the environment variable for cupy/cuda choice
    args.use_cuda = USE_CUPY

    # Optionally disable pinned memory (will be somewhat slower)
    if args.use_cuda:
        import cupy
        if not args.pinned:
            cupy.cuda.set_allocator(None)
            cupy.cuda.set_pinned_memory_allocator(None)

    #Use sync-only datasets
    if args.do_sync:
        datasets = datasets_sync

    #Filter datasets by name and era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds[0] in args.datasets:
            if args.eras is None or ds[1] in args.eras:
                datasets_to_process += [ds]
                print("Will consider dataset", ds)
    # Check the filtered list: checking the unfiltered input would let an
    # empty selection slip through.
    if len(datasets_to_process) == 0:
        raise Exception(
            "No datasets considered, please check the --datasets and --eras options"
        )
    datasets = datasets_to_process

    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB

    outpath_partial = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath_partial)
    except FileExistsError:
        print("Output path {0} already exists, not recreating".format(
            outpath_partial))

    #save the parameters as a pkl file
    from pars import analysis_parameters
    for analysis_name in analysis_parameters.keys():
        analysis_parameters[analysis_name][
            "do_factorized_jec"] = args.do_factorized_jec
        analysis_parameters[analysis_name][
            "dnn_vars_path"] = "{0}/dnn_vars".format(args.out)

    with open('{0}/parameters.pkl'.format(outpath_partial), 'wb') as handle:
        pickle.dump(analysis_parameters,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    #Recreate dump of all filenames
    cache_filename = args.cache_location + "/datasets.json"
    if ("cache" in args.action) and (args.jobfiles is None):
        check_and_recreate_filename_cache(cache_filename, args.cache_location,
                                          args.datapath)

    #Create the jobfiles
    if ("cache" in args.action
            or "analyze" in args.action) and (args.jobfiles is None):
        create_all_jobfiles(datasets, cache_filename, args.datapath,
                            args.chunksize, args.out)

    #For each dataset, find out which chunks we want to process
    if "cache" in args.action or "analyze" in args.action:
        jobfile_data = load_jobfiles(datasets, args.jobfiles_load,
                                     args.jobfiles, args.maxchunks, args.out)

    #Start the profiler only in the actual data processing
    if do_prof:
        import yappi
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    if "cache" in args.action:
        print(
            "Running the 'cache' step of the analysis, ROOT files will be opened and branches will be uncompressed"
        )
        run_cache(args, outpath_partial, jobfile_data, analysis_parameters)

    #Run the physics analysis on all specified jobfiles
    if "analyze" in args.action:
        print(
            "Running the 'analyze' step of the analysis, processing the events into histograms with all systematics"
        )
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)
        run_analysis(args, outpath_partial, jobfile_data, analysis_parameters,
                     analysis_corrections)

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save("analysis.prof", type='callgrind')

    #Merge the partial results (pieces of each dataset)
    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            futures = []
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                futures.append(
                    executor.submit(merge_partial_results, dataset_name,
                                    dataset_era, args.out, outpath_partial))
            # result() re-raises any exception from the worker; without it a
            # failed merge would go unnoticed.
            for fut in futures:
                fut.result()
        print("done merging")

    #print memory usage
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory / 1024))