Example #1
    def setUpClass(self):
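        # choose_backend picks the array library (numpy or cupy) and the matching accelerated kernel module, depending on whether CUDA is requested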
        self.NUMPY_LIB, self.ha = choose_backend(use_cuda=USE_CUPY)

        import hmumu_utils
        hmumu_utils.NUMPY_LIB = self.NUMPY_LIB
        hmumu_utils.ha = self.ha

        # Disable everything that requires ROOT, which is not easily available in the Travis CI tests
        from pars import analysis_parameters
        self.analysis_parameters = analysis_parameters
        self.analysis_parameters["baseline"][
            "do_rochester_corrections"] = False
        self.analysis_parameters["baseline"]["do_lepton_sf"] = False
        self.analysis_parameters["baseline"]["save_dnn_vars"] = False
        self.analysis_parameters["baseline"]["do_bdt_ucsd"] = False
        self.analysis_parameters["baseline"]["do_bdt_pisa"] = False
        self.analysis_parameters["baseline"]["do_factorized_jec"] = False
        self.analysis_parameters["baseline"]["do_jec"] = True
        self.analysis_parameters["baseline"]["do_jer"] = {"2016": True}

        from argparse import Namespace
        self.cmdline_args = Namespace(use_cuda=USE_CUPY,
                                      datapath=".",
                                      do_fsr=False,
                                      nthreads=1,
                                      async_data=False,
                                      do_sync=False,
                                      out="test_out")

        from analysis_hmumu import AnalysisCorrections
        self.analysis_corrections = AnalysisCorrections(
            self.cmdline_args, True)
Example #2
    def setUpClass(self):
        self.NUMPY_LIB, self.ha = choose_backend(use_cuda=USE_CUPY)

        import hmumu_utils
        hmumu_utils.NUMPY_LIB = self.NUMPY_LIB
        hmumu_utils.ha = self.ha

        download_if_not_exists(
            "data/myNanoProdMc2016_NANO.root",
            "https://jpata.web.cern.ch/jpata/hmm/test_files/myNanoProdMc2016_NANO.root"
        )

        #Load a simple sync dataset
        self.datastructures = create_datastructure("vbf_sync",
                                                   True,
                                                   "2016",
                                                   do_fsr=True)
        self.dataset = Dataset("vbf_sync", ["data/myNanoProdMc2016_NANO.root"],
                               self.datastructures,
                               datapath="",
                               treename="Events",
                               is_mc=True)
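        # Mark this as the first (and only) chunk of the 2016 era, read the branches from the ROOT file,
        # and move the resulting arrays to the chosen device (GPU arrays when cupy is selected)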
        self.dataset.num_chunk = 0
        self.dataset.era = "2016"
        self.dataset.load_root()

        self.dataset.numpy_lib = self.NUMPY_LIB
        self.dataset.move_to_device(self.NUMPY_LIB)

        # Disable everything that requires ROOT, which is not easily available in the Travis CI tests
        from pars import analysis_parameters
        self.analysis_parameters = analysis_parameters
        self.analysis_parameters["baseline"][
            "do_rochester_corrections"] = False
        self.analysis_parameters["baseline"]["do_lepton_sf"] = False
        self.analysis_parameters["baseline"]["save_dnn_vars"] = False
        self.analysis_parameters["baseline"]["do_bdt_ucsd"] = False
        self.analysis_parameters["baseline"]["do_bdt_pisa"] = False
        self.analysis_parameters["baseline"]["do_factorized_jec"] = False
        self.analysis_parameters["baseline"]["do_jec"] = {"2016:": False}
        self.analysis_parameters["baseline"]["do_jer"] = {"2016": True}

        from argparse import Namespace
        self.cmdline_args = Namespace(use_cuda=USE_CUPY,
                                      datapath=".",
                                      do_fsr=False,
                                      nthreads=1,
                                      async_data=False,
                                      do_sync=False,
                                      out="test_out")

        from analysis_hmumu import AnalysisCorrections
        self.analysis_corrections = AnalysisCorrections(
            self.cmdline_args, True)
class TestHistogram(unittest.TestCase):
    NUMPY_LIB, ha = choose_backend(use_cuda=USE_CUDA)

    def test_histogram(self):
        np = TestHistogram.NUMPY_LIB
        data = np.array([2, 3, 4, 5, 6, 7], dtype=np.float32)
        data[data < 2] = 0
        weights = np.ones_like(data, dtype=np.float32)
        w, w2, e = self.ha.histogram_from_vector(
            data, weights, np.array([0, 1, 2, 3, 4, 5], dtype=np.float32))
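        # Reference: numpy's own histogram of the same data, for comparison with the kernel output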
        npw, npe = np.histogram(data, np.array([0, 1, 2, 3, 4, 5]))
        hr = from_numpy((w, e))
        f = uproot.recreate("test.root")
        f["hist"] = hr

        data = np.random.normal(size=10000)
        data = np.array(data, dtype=np.float32)
        weights = np.ones_like(data, dtype=np.float32)
        w, w2, e = self.ha.histogram_from_vector(
            data, weights, np.linspace(-1, 1, 100, dtype=np.float32))
        hr = from_numpy((w, e))
        f["hist2"] = hr
        f.close()

    def test_histogram_several(self):
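        # Fill the same histogram once with histogram_from_vector and twice via
        # histogram_from_vector_several, then check that contents and sums of squared weights agree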
        np = TestHistogram.NUMPY_LIB
        data = np.array([2, 3, 4, 5, 6, 7], dtype=np.float32)
        mask = data >= 2
        data[self.NUMPY_LIB.invert(mask)] = 0
        weights = np.ones_like(data, dtype=np.float32)
        bins = np.array([0, 1, 2, 3, 4, 5], dtype=np.float32)
        w, w2, e = self.ha.histogram_from_vector(data, weights, bins)

        histograms = self.ha.histogram_from_vector_several([(data, bins),
                                                            (data, bins)],
                                                           weights, mask)
        assert numpy.all(w == histograms[0][0])
        assert numpy.all(w == histograms[1][0])
        assert numpy.all(w2 == histograms[0][1])
        assert numpy.all(w2 == histograms[1][1])
    def setUpClass(self):
        self.NUMPY_LIB, self.ha = choose_backend(use_cuda=USE_CUPY)

        import hmumu_utils
        hmumu_utils.NUMPY_LIB = self.NUMPY_LIB
        hmumu_utils.ha = self.ha

        # Disable everything that requires ROOT, which is not easily available in the Travis CI tests
        from pars import analysis_parameters
        self.analysis_parameters = analysis_parameters
        self.analysis_parameters["baseline"]["do_rochester_corrections"] = True
        self.analysis_parameters["baseline"]["do_lepton_sf"] = True
        self.analysis_parameters["baseline"]["save_dnn_vars"] = True
        self.analysis_parameters["baseline"]["do_bdt_ucsd"] = False
        self.analysis_parameters["baseline"]["do_bdt_pisa"] = False
        self.analysis_parameters["baseline"]["do_factorized_jec"] = False
        self.analysis_parameters["baseline"]["do_jec"] = True
        self.analysis_parameters["baseline"]["do_jer"] = {"2016": True}

        from argparse import Namespace
        self.cmdline_args = Namespace(use_cuda=USE_CUPY,
                                      datapath=".",
                                      nthreads=1,
                                      do_fsr=False,
                                      async_data=False,
                                      do_sync=False,
                                      out="test_out")

        from analysis_hmumu import AnalysisCorrections
        self.analysis_corrections = AnalysisCorrections(
            self.cmdline_args, True)
        download_if_not_exists(
            "data/myNanoProdMc2016_NANO.root",
            "https://jpata.web.cern.ch/jpata/hmm/test_files/21-02-2020-private-nanoaod/myNanoProdMc2016_NANO.root"
        )
        download_if_not_exists(
            "data/nano_2016_data.root",
            "https://jpata.web.cern.ch/jpata/hmm/test_files/21-02-2020-private-nanoaod/nano_2016_data.root"
        )
Example #5
    parser.add_argument('--path-to-model',
                        action='store',
                        help='path to DNN model',
                        type=str,
                        default=None,
                        required=False)
    parser.add_argument('--year',
                        action='store',
                        choices=['2016', '2017', '2018'],
                        help='Year of data/MC samples',
                        default='2017')
    parser.add_argument('filenames', nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # set CPU or GPU backend
    NUMPY_LIB, ha = choose_backend(args.use_cuda)
    lib_analysis.NUMPY_LIB, lib_analysis.ha = NUMPY_LIB, ha
    NanoAODDataset.numpy_lib = NUMPY_LIB

    if args.use_cuda:
        os.environ["HEPACCELERATE_CUDA"] = "1"
    else:
        os.environ["HEPACCELERATE_CUDA"] = "0"

    from coffea.util import USE_CUPY
    from coffea.lumi_tools import LumiMask, LumiData
    from coffea.lookup_tools import extractor
    from coffea.btag_tools import BTagScaleFactor

    # load definitions
    from definitions_analysis import parameters, eraDependentParameters, samples_info
Example #6
import requests
import os
import numpy as np
import json
import sys
import time
import uproot
import numba

import hepaccelerate
import hepaccelerate.kernels as kernels
from hepaccelerate.utils import Results, Dataset, Histogram, choose_backend
from tests.kernel_test import load_dataset

USE_CUDA = int(os.environ.get("HEPACCELERATE_CUDA", 0)) == 1
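# nplib is the array module (numpy or cupy); backend provides the corresponding accelerated kernels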
nplib, backend = choose_backend(use_cuda=USE_CUDA)


def time_kernel(dataset, test_kernel):
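    # Run the kernel once to trigger any JIT compilation, then report the average throughput over five timed runs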
    # ensure it's compiled
    test_kernel(dataset)

    n = len(dataset)

    t0 = time.time()
    for i in range(5):
        test_kernel(dataset)
    t1 = time.time()

    dt = (t1 - t0) / 5.0
    speed = float(n) / dt
Example #7
    hist_muons_pt = Histogram(*kernels.histogram_from_vector(
        backend, leading_muon_pt[mask_events_dimuon],
        weights[mask_events_dimuon], bins))

    #save it to the output
    ret["hist_leading_muon_pt"] = hist_muons_pt
    return ret


if __name__ == "__main__":
    #choose whether or not to use the GPU backend
    use_cuda = int(os.environ.get("HEPACCELERATE_CUDA", 0)) == 1
    if use_cuda:
        import setGPU

    nplib, backend = choose_backend(use_cuda=use_cuda)

    #Load this input file
    filename = "data/HZZ.root"

    #Predefine which branches to read from the TTree and how they are grouped to objects
    #This will be verified against the actual ROOT TTree when it is loaded
    datastructures = {
        "Muon": [("Muon_Px", "float32"), ("Muon_Py", "float32"),
                 ("Muon_Pz", "float32"), ("Muon_E", "float32"),
                 ("Muon_Charge", "int32"), ("Muon_Iso", "float32")],
        "Jet": [("Jet_Px", "float32"), ("Jet_Py", "float32"),
                ("Jet_Pz", "float32"), ("Jet_E", "float32"),
                ("Jet_btag", "float32"), ("Jet_ID", "bool")],
        "EventVariables": [("NPrimaryVertices", "int32"),
                           ("triggerIsoMu24", "bool"),
    def setUpClass(self):
        self.NUMPY_LIB, self.ha = choose_backend(use_cuda=USE_CUDA)
        self.use_cuda = USE_CUDA
        self.dataset = load_dataset(self.NUMPY_LIB)
Example #9
def main(args):
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    # use the environment variable for cupy/cuda choice
    args.use_cuda = USE_CUPY

    datasets = yaml.load(open(args.datasets_yaml),
                         Loader=yaml.FullLoader)["datasets"]
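    # Each dataset entry in the YAML is expected to provide at least a name and an era, which are used for filtering below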

    # Filter datasets by era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds["name"] in args.datasets:
            if args.eras is None or ds["era"] in args.eras:
                datasets_to_process += [ds]
    if len(datasets_to_process) == 0:
        raise Exception(
            "No datasets considered, please check the --datasets and --eras options"
        )
    datasets = datasets_to_process

    # Choose either the CPU or GPU(CUDA) backend
    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB

    outpath_partial = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath_partial)
    except FileExistsError:
        print("Output path {0} already exists, not recreating".format(
            outpath_partial))

    # save the parameters as a pkl file
    from pars import analysis_parameters
    for analysis_name in analysis_parameters.keys():
        analysis_parameters[analysis_name][
            "do_factorized_jec"] = args.do_factorized_jec
        analysis_parameters[analysis_name][
            "dnn_vars_path"] = "{0}/dnn_vars".format(args.out)
    with open('{0}/parameters.pkl'.format(outpath_partial), 'wb') as handle:
        pickle.dump(analysis_parameters,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    # Recreate dump of all filenames
    cache_filename = "{0}/datasets.json".format(args.out)

    use_skim = False
    if args.cachepath is None:
        print(
            "--cachepath not specified, will process unskimmed NanoAOD, which is somewhat slower!"
        )
        print("Please see the README.md on how to skim the NanoAOD")
        datapath = args.datapath
    else:
        print("Processing skimmed NanoAOD")
        datapath = args.cachepath
        use_skim = True
    check_and_recreate_filename_cache(cache_filename, datapath, datasets,
                                      use_skim)

    # Create the jobfiles
    if args.jobfiles is None:
        create_all_jobfiles(datasets, cache_filename, datapath, args.chunksize,
                            args.out)

    # For each dataset, find out which chunks we want to process
    if "analyze" in args.action:
        jobfile_data = load_jobfiles(datasets, args.jobfiles_load,
                                     args.jobfiles, args.maxchunks, args.out)

    # If we want to check what part of the code is slow, start the profiler only in the actual data processing
    if do_prof:
        import yappi
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    # Run the physics analysis on all specified jobfiles
    if "analyze" in args.action:
        print(
            "Running the 'analyze' step of the analysis, processing the events into histograms with all systematics"
        )
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)
        run_analysis(args, outpath_partial, jobfile_data, analysis_parameters,
                     analysis_corrections)

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save("analysis.prof", type='callgrind')

    # Merge the partial results (pieces of each dataset)
    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            for dataset in datasets:
                dataset_name = dataset["name"]
                dataset_era = dataset["era"]
                executor.submit(merge_partial_results, dataset_name,
                                dataset_era, args.out, outpath_partial)
        print("done merging")

    # print memory usage for debugging
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory / 1024))
Example #10
class TestDataset(unittest.TestCase):
    NUMPY_LIB, ha = choose_backend(use_cuda=USE_CUDA)

    @staticmethod
    def load_dataset(num_iter=1):
        datastructures = {
            "Muon": [
                ("Muon_Px", "float32"),
                ("Muon_Py", "float32"),
                ("Muon_Pz", "float32"),
                ("Muon_E", "float32"),
                ("Muon_Charge", "int32"),
                ("Muon_Iso", "float32"),
            ],
            "Jet": [
                ("Jet_Px", "float32"),
                ("Jet_Py", "float32"),
                ("Jet_Pz", "float32"),
                ("Jet_E", "float32"),
                ("Jet_btag", "float32"),
                ("Jet_ID", "bool"),
            ],
            "EventVariables": [
                ("NPrimaryVertices", "int32"),
                ("triggerIsoMu24", "bool"),
                ("EventWeight", "float32"),
            ],
        }
        dataset = Dataset(
            "HZZ",
            num_iter * ["data/HZZ.root"],
            datastructures,
            treename="events",
            datapath="",
        )
        assert dataset.filenames[0] == "data/HZZ.root"
        assert len(dataset.filenames) == num_iter
        assert len(dataset.structs["Jet"]) == 0
        assert len(dataset.eventvars) == 0
        return dataset

    def setUp(self):
        self.dataset = self.load_dataset()

    @staticmethod
    def map_func(dataset, ifile):
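        # For one file chunk: count the muons with pT > 20 in each event by summing the boolean pass flags over the jagged offsets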
        mu = dataset.structs["Muon"][ifile]
        mu_pt = np.sqrt(mu.Px**2 + mu.Py**2)
        mu_pt_pass = mu_pt > 20
        mask_rows = np.ones(mu.numevents(), dtype=np.bool_)
        mask_content = np.ones(mu.numobjects(), dtype=np.bool_)
        ret = TestDataset.ha.sum_in_offsets(mu.offsets,
                                            mu_pt_pass,
                                            mask_rows,
                                            mask_content,
                                            dtype=np.int8)
        return ret

    def test_dataset_map(self):
        dataset = self.load_dataset()
        dataset.load_root()

        rets = dataset.map(self.map_func)
        assert len(rets) == 1
        assert len(rets[0]) == dataset.structs["Muon"][0].numevents()
        assert np.sum(rets[0]) > 0
        return rets

    def test_dataset_compact(self):
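        # Load the dataset, compute a per-event mask with map_func, compact the dataset with it, and verify that the memory footprint shrinks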
        dataset = self.dataset
        dataset.load_root()

        memsize1 = dataset.memsize()
        rets = dataset.map(self.map_func)

        # compacting uses JaggedArray functionality and can only be done on the numpy/CPU backend
        dataset.move_to_device(np)
        rets = [TestDataset.NUMPY_LIB.asnumpy(r) for r in rets]
        dataset.compact(rets)
        dataset.move_to_device(TestDataset.NUMPY_LIB)

        memsize2 = dataset.memsize()
        assert memsize1 > memsize2
        print("compacted memory size ratio:", memsize2 / memsize1)

    @staticmethod
    def precompute_results(filename):
        fi = uproot.open(filename)
        arr = fi.get("events").array("EventWeight")
        return {"EventWeight": arr.sum()}

    def test_dataset_merge_inplace(self):
        num_iter = 10

        ds_multi = self.load_dataset(num_iter=num_iter)
        ds_multi.func_filename_precompute = self.precompute_results

        ds_multi.load_root()
        assert len(ds_multi.structs["Jet"]) == num_iter
        njet = ds_multi.num_objects_loaded("Jet")

        # compute a per-event jet energy sum taking into account the offsets
        jet_sume = TestDataset.NUMPY_LIB.hstack([
            TestDataset.ha.sum_in_offsets(
                ds_multi.structs["Jet"][i].offsets,
                ds_multi.structs["Jet"][i]["E"],
                TestDataset.NUMPY_LIB.ones(
                    ds_multi.structs["Jet"][i].numevents(),
                    dtype=TestDataset.NUMPY_LIB.bool_,
                ),
                TestDataset.NUMPY_LIB.ones(
                    ds_multi.structs["Jet"][i].numobjects(),
                    dtype=TestDataset.NUMPY_LIB.bool_,
                ),
            ) for i in range(num_iter)
        ])

        numevents = ds_multi.numevents()

        ds_multi.merge_inplace()
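        # After merging, the per-file Jet structures are concatenated into a single chunk; object counts, per-event sums and the number of events must be unchanged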
        assert len(ds_multi.structs["Jet"]) == 1
        assert ds_multi.num_objects_loaded("Jet") == njet
        jet_sume_merged = TestDataset.ha.sum_in_offsets(
            ds_multi.structs["Jet"][0].offsets,
            ds_multi.structs["Jet"][0]["E"],
            TestDataset.NUMPY_LIB.ones(ds_multi.structs["Jet"][0].numevents(),
                                       dtype=TestDataset.NUMPY_LIB.bool_),
            TestDataset.NUMPY_LIB.ones(
                ds_multi.structs["Jet"][0].numobjects(),
                dtype=TestDataset.NUMPY_LIB.bool_,
            ),
        )
        assert TestDataset.NUMPY_LIB.all(jet_sume_merged == jet_sume)
        assert ds_multi.numevents() == numevents
def main(args, datasets):

    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    #use the environment variable for cupy/cuda choice
    args.use_cuda = USE_CUPY

    analysis_corrections = None
    if "analyze" in args.action:
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)

    # Optionally disable pinned memory (will be somewhat slower)
    if args.use_cuda:
        import cupy
        if not args.pinned:
            cupy.cuda.set_allocator(None)
            cupy.cuda.set_pinned_memory_allocator(None)

    #Use sync-only datasets
    if args.do_sync:
        datasets = datasets_sync

    #Filter datasets by era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds[0] in args.datasets:
            if args.eras is None or ds[1] in args.eras:
                datasets_to_process += [ds]
                print("Will consider dataset", ds)
    if len(datasets_to_process) == 0:
        raise Exception("No datasets considered, please check the --datasets and --eras options")
    datasets = datasets_to_process

    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB
    NUMPY_LIB = hmumu_utils.NUMPY_LIB 

    # All analysis definitions (cut values etc) should go here
    analysis_parameters = {
        "baseline": {

            "nPV": 0,
            "NdfPV": 4,
            "zPV": 24,

            # Will be applied with OR
            "hlt_bits": {
                "2016": ["HLT_IsoMu24", "HLT_IsoTkMu24"],
                "2017": ["HLT_IsoMu27"],
                "2018": ["HLT_IsoMu24"],
                },

            "muon_pt": 20,
            "muon_pt_leading": {"2016": 26.0, "2017": 29.0, "2018": 26.0},
            "muon_eta": 2.4,
            "muon_iso": 0.25,
            "muon_id": {"2016": "medium", "2017": "medium", "2018": "medium"},
            "muon_trigger_match_dr": 0.1,
            "muon_iso_trigger_matched": 0.15,
            "muon_id_trigger_matched": {"2016": "tight", "2017": "tight", "2018": "tight"},
 
            "do_rochester_corrections": True, 
            "do_lepton_sf": True,
            
            "do_jec": True,
            "jec_tag": {"2016": "Summer16_07Aug2017_V11", "2017": "Fall17_17Nov2017_V32", "2018": "Autumn18_V16"}, 
            "jet_mu_dr": 0.4,
            "jet_pt_leading": {"2016": 35.0, "2017": 35.0, "2018": 35.0},
            "jet_pt_subleading": {"2016": 25.0, "2017": 25.0, "2018": 25.0},
            "jet_eta": 4.7,
            "jet_id": "tight",
            "jet_puid": "loose",
            "jet_veto_eta": [2.65, 3.139],
            "jet_veto_raw_pt": 50.0,  
            "jet_btag": {"2016": 0.6321, "2017": 0.4941, "2018": 0.4184},
            "do_factorized_jec": args.do_factorized_jec,

            "cat5_dijet_inv_mass": 400.0,
            "cat5_abs_jj_deta_cut": 2.5,

            "masswindow_z_peak": [76, 106],
            "masswindow_h_sideband": [110, 150],
            "masswindow_h_peak": [115, 135],

            "inv_mass_bins": 41,

            "extra_electrons_pt": 20,
            "extra_electrons_eta": 2.5,
            "extra_electrons_iso": 0.4, #Check if we want to apply this
            "extra_electrons_id": "mvaFall17V1Iso_WP90",

            "save_dnn_vars": True,
            "dnn_vars_path": "{0}/dnn_vars".format(args.out),

            # If True for a given sample, apply the mjj > vbf_filter_mjj_cut selection; otherwise apply the inverted selection
            "vbf_filter_mjj_cut": 350,
            "vbf_filter": {
                "dy_m105_160_mg": True,
                "dy_m105_160_amc": True,
                "dy_m105_160_vbf_mg": False,
                "dy_m105_160_vbf_amc": False, 
            },

            #Irene's DNN input variable order for keras
            "dnn_varlist_order": ['softJet5', 'dRmm','dEtamm','M_jj','pt_jj','eta_jj','phi_jj','M_mmjj','eta_mmjj','phi_mmjj','dEta_jj','Zep','dRmin_mj', 'dRmax_mj', 'dRmin_mmj','dRmax_mmj','dPhimm','leadingJet_pt','subleadingJet_pt', 'leadingJet_eta','subleadingJet_eta','leadingJet_qgl','subleadingJet_qgl','cthetaCS','Higgs_pt','Higgs_eta','Higgs_mass'],
            "dnn_input_histogram_bins": {
                "softJet5": (0,10,10),
                "dRmm": (0,5,41),
                "dEtamm": (-2,2,41),
                "dPhimm": (-2,2,41),
                "M_jj": (0,2000,41),
                "pt_jj": (0,400,41),
                "eta_jj": (-5,5,41),
                "phi_jj": (-5,5,41),
                "M_mmjj": (0,2000,41),
                "eta_mmjj": (-3,3,41),
                "phi_mmjj": (-3,3,41),
                "dEta_jj": (-3,3,41),
                "Zep": (-2,2,41),
                "dRmin_mj": (0,5,41),
                "dRmax_mj": (0,5,41),
                "dRmin_mmj": (0,5,41),
                "dRmax_mmj": (0,5,41),
                "leadingJet_pt": (0, 200, 41),
                "subleadingJet_pt": (0, 200, 41),
                "leadingJet_eta": (-5, 5, 41),
                "subleadingJet_eta": (-5, 5, 41),
                "leadingJet_qgl": (0, 1, 41),
                "subleadingJet_qgl": (0, 1, 41),
                "cthetaCS": (-1, 1, 41),
                "Higgs_pt": (0, 200, 41),
                "Higgs_eta": (-3, 3, 41),
                "Higgs_mass": (110, 150, 41),
                "dnn_pred": (0, 1, 1001),
                "dnn_pred2": (0, 1, 11),
                "bdt_ucsd": (-1, 1, 41),
                "bdt2j_ucsd": (-1, 1, 41),
                "bdt01j_ucsd": (-1, 1, 41),
                "MET_pt": (0, 200, 41),
                "hmmthetacs": (-1, 1, 41),
                "hmmphics": (-4, 4, 41),
            },

            "categorization_trees": {}
        },
    }
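    # Histogram binning definitions, keyed by variable name; attached to the analysis parameters further below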
    histo_bins = {
        "muon_pt": np.linspace(0, 200, 101, dtype=np.float32),
        "npvs": np.linspace(0,100,101, dtype=np.float32),
        "dijet_inv_mass": np.linspace(0, 2000, 41, dtype=np.float32),
        "inv_mass": np.linspace(70, 150, 41, dtype=np.float32),
        "numjet": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_pt": np.linspace(0, 300, 101, dtype=np.float32),
        "jet_eta": np.linspace(-4.7, 4.7, 41, dtype=np.float32),
        "pt_balance": np.linspace(0, 5, 41, dtype=np.float32),
        "numjets": np.linspace(0, 10, 11, dtype=np.float32),
        "jet_qgl": np.linspace(0, 1, 41, dtype=np.float32),
        "higgs_inv_mass_uncertainty": np.linspace(0, 10, 101, dtype=np.float32),
        "higgs_rel_inv_mass_uncertainty": np.linspace(0, 0.05, 101, dtype=np.float32)
    }
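    # Derive additional binnings from the DNN input histogram definitions and from the mass windows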
    for hname, bins in analysis_parameters["baseline"]["dnn_input_histogram_bins"].items():
        histo_bins[hname] = np.linspace(bins[0], bins[1], bins[2], dtype=np.float32)

    for masswindow in ["z_peak", "h_peak", "h_sideband"]:
        mw = analysis_parameters["baseline"]["masswindow_" + masswindow]
        histo_bins["inv_mass_{0}".format(masswindow)] = np.linspace(mw[0], mw[1], 41, dtype=np.float32)

    histo_bins["dnn_pred2"] = {
        "h_peak": np.array([0., 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 1.0], dtype=np.float32),
        "z_peak": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], dtype=np.float32),
        "h_sideband": np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0], dtype=np.float32),
    }

    analysis_parameters["baseline"]["histo_bins"] = histo_bins

    #analysis_parameters["oldjec"] = copy.deepcopy(analysis_parameters["baseline"])
    #analysis_parameters["oldjec"]["jec_tag"]["2018"] = "Autumn18_V8"

    #Run baseline analysis
    outpath = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath)
    except FileExistsError:
        pass

    with open('{0}/parameters.pkl'.format(outpath), 'wb') as handle:
        pickle.dump(analysis_parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    #Recreate dump of all filenames
    cache_filename = args.cache_location + "/datasets.json"
    if ("cache" in args.action) and (args.jobfiles is None):
        print("--action cache and no jobfiles specified, creating datasets.json dump of all filenames")
        if not os.path.isdir(args.cache_location):
            os.makedirs(args.cache_location)
        filenames_cache = {}
        for dataset in datasets:
            dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
            filenames_all = glob.glob(args.datapath + dataset_globpattern, recursive=True)
            filenames_all = [fn for fn in filenames_all if not "Friend" in fn]
            filenames_cache[dataset_name + "_" + dataset_era] = [
                fn.replace(args.datapath, "") for fn in filenames_all]

            if len(filenames_all) == 0:
                raise Exception("Dataset {0} matched 0 files from glob pattern {1}, verify that the data files are located in {2}".format(
                    dataset_name, dataset_globpattern, args.datapath
                ))
    
        #save all dataset filenames to a json file 
        print("Creating a json dump of all the dataset filenames based on data found in {0}".format(args.datapath))
        if os.path.isfile(cache_filename):
            print("Cache file {0} already exists, we will not overwrite it to be safe.".format(cache_filename), file=sys.stderr)
            print("Delete it or change --cache-location and try again.", file=sys.stderr)
            sys.exit(1)
        with open(cache_filename, "w") as fi:
            fi.write(json.dumps(filenames_cache, indent=2))

    if ("cache" in args.action or "analyze" in args.action) and (args.jobfiles is None):
        #Create a list of job files for processing
        jobfile_data = []
        print("Loading list of filenames from {0}".format(cache_filename))
        if not os.path.isfile(cache_filename):
            raise Exception("Cached dataset list of filenames not found in {0}, please run this code with --action cache".format(
                cache_filename))
        filenames_cache = json.load(open(cache_filename, "r"))

        for dataset in datasets:
            dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
            try:
                filenames_all = filenames_cache[dataset_name + "_" + dataset_era]
            except KeyError as e:
                print("Could not load {0} from {1}, please make sure this dataset has been added to cache".format(
                    dataset_name + "_" + dataset_era, cache_filename), file=sys.stderr)
                raise e

            filenames_all_full = [args.datapath + "/" + fn for fn in filenames_all]
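            # Some datasets use a larger chunk size via the chunksize_multiplier lookup (default multiplier is 1)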
            chunksize = args.chunksize * chunksize_multiplier.get(dataset_name, 1)
            print("Saving dataset {0}_{1} with {2} files in {3} files per chunk to jobfiles".format(
                dataset_name, dataset_era, len(filenames_all_full), chunksize))
            jobfile_dataset = create_dataset_jobfiles(dataset_name, dataset_era,
                filenames_all_full, is_mc, chunksize, args.out)
            jobfile_data += jobfile_dataset
            print("Dataset {0}_{1} consists of {2} chunks".format(
                dataset_name, dataset_era, len(jobfile_dataset)))

        assert(len(jobfile_data) > 0)
        assert(len(jobfile_data[0]["filenames"]) > 0)

    #For each dataset, find out which chunks we want to process
    if "cache" in args.action or "analyze" in args.action:
        jobfile_data = []
        if not (args.jobfiles_load is None):
            args.jobfiles = [l.strip() for l in open(args.jobfiles_load).readlines()]
        if args.jobfiles is None:
            print("You did not specify to process specific dataset chunks, assuming you want to process all chunks")
            print("If this is not true, please specify e.g. --jobfiles data_2018_0.json data_2018_1.json ...")
            args.jobfiles = []
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                jobfiles_dataset = glob.glob(args.out + "/jobfiles/{0}_{1}_*.json".format(dataset_name, dataset_era))
                assert(len(jobfiles_dataset) > 0)
                if args.maxchunks > 0:
                    jobfiles_dataset = jobfiles_dataset[:args.maxchunks]
                args.jobfiles += jobfiles_dataset
       
        #Now load the jobfiles 
        assert(len(args.jobfiles) > 0)
        print("You specified --jobfiles {0}, processing only these dataset chunks".format(" ".join(args.jobfiles))) 
        jobfile_data = []
        for f in args.jobfiles:
            jobfile_data += [json.load(open(f))]

        chunkstr = " ".join(["{0}_{1}_{2}".format(
            ch["dataset_name"], ch["dataset_era"], ch["dataset_num_chunk"])
            for ch in jobfile_data])
        print("Will process {0} dataset chunks: {1}".format(len(jobfile_data), chunkstr))
        assert(len(jobfile_data) > 0)

    #Start the profiler only in the actual data processing
    if do_prof:
        import yappi
        filename = 'analysis.prof'
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    if "cache" in args.action:
        print("Running the 'cache' step of the analysis, ROOT files will be opened and branches will be uncompressed")
        print("Will retrieve dataset filenames based on existing ROOT files on filesystem in datapath={0}".format(args.datapath)) 
       
        try:
            os.makedirs(args.cache_location)
        except Exception as e:
            pass

        run_cache(args, outpath, jobfile_data, analysis_parameters)
    
    if "analyze" in args.action:
        run_analysis(args, outpath, jobfile_data, analysis_parameters, analysis_corrections)

    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                fut = executor.submit(merge_partial_results, dataset_name, dataset_era, outpath)
        print("done merging")
    if do_prof:
        stats = yappi.get_func_stats()
        stats.save(filename, type='callgrind')

    import resource
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory/1024))
    bins = np.linspace(0, 2, 1000)
    plt.hist(data1["corr_JEC"], bins=bins, histtype="step", label="our code")
    plt.hist(data2["corr_JEC"], bins=bins, histtype="step", label="NanoAODTools")
    plt.xlabel("JEC correction")
    plt.legend()
    plt.savefig("corr_JEC.pdf")

    plt.figure()
    bins = np.linspace(0, 2, 1000)
    plt.hist(data1["corr_JER"], bins=bins, histtype="step", label="our code")
    plt.hist(data2["corr_JER"], bins=bins, histtype="step", label="NanoAODTools")
    plt.xlabel("JER correction")
    plt.legend()
    plt.savefig("corr_JER.pdf")

if __name__ == "__main__":
    use_cuda = False
    NUMPY_LIB, ha = choose_backend(use_cuda)
    hmumu_utils.NUMPY_LIB = NUMPY_LIB
    hmumu_utils.ha = ha
    
    job_desc = {
        "dataset_name": "ggh_amcPS",
        "is_mc": True,
        "dataset_era": "2018",
        "filenames": ["/store/mc/RunIIAutumn18NanoAODv5/GluGluHToMuMu_M125_TuneCP5_PSweights_13TeV_amcatnloFXFX_pythia8/NANOAODSIM/Nano1June2019_102X_upgrade2018_realistic_v19-v1/100000/359F045D-D71C-E84E-9BD1-0BEA8E6228C5.root", ],
        "dataset_num_chunk": 0,
    }
    cache_location = "/storage/user/nlu/hmm/cache2"
    datapath = "/storage/group/allcit/"
    
    datastructures = create_datastructure(job_desc["dataset_name"], job_desc["is_mc"], job_desc["dataset_era"])
    
Example #13
def main(args, datasets):
    do_prof = args.do_profile
    do_tensorflow = not args.disable_tensorflow

    #use the environment variable for cupy/cuda choice
    args.use_cuda = USE_CUPY

    # Optionally disable pinned memory (will be somewhat slower)
    if args.use_cuda:
        import cupy
        if not args.pinned:
            cupy.cuda.set_allocator(None)
            cupy.cuda.set_pinned_memory_allocator(None)

    #Use sync-only datasets
    if args.do_sync:
        datasets = datasets_sync

    #Filter datasets by era
    datasets_to_process = []
    for ds in datasets:
        if args.datasets is None or ds[0] in args.datasets:
            if args.eras is None or ds[1] in args.eras:
                datasets_to_process += [ds]
                print("Will consider dataset", ds)
    if len(datasets_to_process) == 0:
        raise Exception(
            "No datasets considered, please check the --datasets and --eras options"
        )
    datasets = datasets_to_process

    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = choose_backend(args.use_cuda)
    Dataset.numpy_lib = hmumu_utils.NUMPY_LIB
    NUMPY_LIB = hmumu_utils.NUMPY_LIB

    outpath_partial = "{0}/partial_results".format(args.out)
    try:
        os.makedirs(outpath_partial)
    except FileExistsError as e:
        print("Output path {0} already exists, not recreating".format(
            outpath_partial))

    #save the parameters as a pkl file
    from pars import analysis_parameters
    for analysis_name in analysis_parameters.keys():
        analysis_parameters[analysis_name][
            "do_factorized_jec"] = args.do_factorized_jec
        analysis_parameters[analysis_name][
            "dnn_vars_path"] = "{0}/dnn_vars".format(args.out)

    with open('{0}/parameters.pkl'.format(outpath_partial), 'wb') as handle:
        pickle.dump(analysis_parameters,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    #Recreate dump of all filenames
    cache_filename = args.cache_location + "/datasets.json"
    if ("cache" in args.action) and (args.jobfiles is None):
        check_and_recreate_filename_cache(cache_filename, args.cache_location,
                                          args.datapath)

    #Create the jobfiles
    if ("cache" in args.action
            or "analyze" in args.action) and (args.jobfiles is None):
        create_all_jobfiles(datasets, cache_filename, args.datapath,
                            args.chunksize, args.out)

    #For each dataset, find out which chunks we want to process
    if "cache" in args.action or "analyze" in args.action:
        jobfile_data = load_jobfiles(datasets, args.jobfiles_load,
                                     args.jobfiles, args.maxchunks, args.out)

    #Start the profiler only in the actual data processing
    if do_prof:
        import yappi
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)

    if "cache" in args.action:
        print(
            "Running the 'cache' step of the analysis, ROOT files will be opened and branches will be uncompressed"
        )
        run_cache(args, outpath_partial, jobfile_data, analysis_parameters)

    #Run the physics analysis on all specified jobfiles
    if "analyze" in args.action:
        print(
            "Running the 'analyze' step of the analysis, processing the events into histograms with all systematics"
        )
        analysis_corrections = AnalysisCorrections(args, do_tensorflow)
        run_analysis(args, outpath_partial, jobfile_data, analysis_parameters,
                     analysis_corrections)

    if do_prof:
        stats = yappi.get_func_stats()
        stats.save("analysis.prof", type='callgrind')

    #Merge the partial results (pieces of each dataset)
    if "merge" in args.action:
        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            for dataset in datasets:
                dataset_name, dataset_era, dataset_globpattern, is_mc = dataset
                fut = executor.submit(merge_partial_results, dataset_name,
                                      dataset_era, args.out, outpath_partial)
        print("done merging")

    #print memory usage
    total_memory = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    total_memory += resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("maxrss={0} MB".format(total_memory / 1024))