Example no. 1
def gen_jet_pair_mass(df):
    gjmass = None
    gjets = df.GenJet
    gleptons = df.GenPart[(abs(df.GenPart.pdgId) == 13)
                          | (abs(df.GenPart.pdgId) == 11)
                          | (abs(df.GenPart.pdgId) == 15)]
    gl_pair = ak.cartesian({
        "jet": gjets,
        "lepton": gleptons
    },
                           axis=1,
                           nested=True)
    _, _, dr_gl = delta_r(
        gl_pair["jet"].eta,
        gl_pair["lepton"].eta,
        gl_pair["jet"].phi,
        gl_pair["lepton"].phi,
    )
    isolated = ak.all((dr_gl > 0.3), axis=-1)
    if ak.count(gjets[isolated], axis=None) > 0:
        # TODO: convert only relevant fields!
        gjet1 = ak.to_pandas(gjets[isolated]).loc[pd.IndexSlice[:, 0],
                                                  ["pt", "eta", "phi", "mass"]]
        gjet2 = ak.to_pandas(gjets[isolated]).loc[pd.IndexSlice[:, 1],
                                                  ["pt", "eta", "phi", "mass"]]
        gjet1.index = gjet1.index.droplevel("subentry")
        gjet2.index = gjet2.index.droplevel("subentry")

        gjsum = p4_sum(gjet1, gjet2)
        gjmass = gjsum.mass
    return gjmass
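
The pd.IndexSlice[:, 0] / pd.IndexSlice[:, 1] selections in gen_jet_pair_mass above rely on the (entry, subentry) MultiIndex that ak.to_pandas builds for jagged arrays. A minimal, self-contained sketch of that idiom (toy jet values, assuming awkward 1.x where ak.to_pandas is available):

import awkward as ak
import pandas as pd

# Two events: the first has two jets, the second only one
jets = ak.Array([
    [{"pt": 50.0, "mass": 10.0}, {"pt": 30.0, "mass": 8.0}],
    [{"pt": 70.0, "mass": 12.0}],
])
df = ak.to_pandas(jets)                    # MultiIndex levels: entry, subentry
jet1 = df.loc[pd.IndexSlice[:, 0], :]      # leading jet of every event
jet2 = df.loc[pd.IndexSlice[:, 1], :]      # subleading jet, where it exists
jet1.index = jet1.index.droplevel("subentry")
jet2.index = jet2.index.droplevel("subentry")
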
Example no. 2
def read_data(paths, ds_predictions, pn_predictions):
    dfs = []
    for path in paths:
        valid_jets = read_nanoaod(path)

        jet_pt = ak.to_pandas(valid_jets.pt)
        gen_jet_pt = ak.to_pandas(valid_jets.matched_gen.pt)
        gen_jet_eta = ak.to_pandas(valid_jets.matched_gen.eta)
        parton_flavour = ak.to_pandas(valid_jets.matched_gen.partonFlavour)
        hadron_flavour = ak.to_pandas(valid_jets.matched_gen.hadronFlavour)

        df = pd.concat(
            (jet_pt, gen_jet_pt, gen_jet_eta, parton_flavour, hadron_flavour),
            axis=1)
        df.columns = [
            'Jet_pt', 'GenJet_pt', 'GenJet_eta', 'GenJet_partonFlavour',
            'GenJet_hadronFlavour'
        ]

        flavour = df.GenJet_hadronFlavour.where(df.GenJet_hadronFlavour != 0,
                                                other=np.abs(
                                                    df.GenJet_partonFlavour))
        df = df.drop(columns=['GenJet_partonFlavour', 'GenJet_hadronFlavour'])
        df['flavour'] = flavour

        dfs.append(df)

    df = pd.concat(dfs, axis=0)

    df['response'] = df.Jet_pt / df.GenJet_pt
    df['ds_response'] = ds_predictions.flatten() * df.Jet_pt / df.GenJet_pt
    df['pn_response'] = pn_predictions.flatten() * df.Jet_pt / df.GenJet_pt

    return df
Example no. 3
def fill_gen_jets(df, output):
    gjets = df.GenJet
    gleptons = df.GenPart[(abs(df.GenPart.pdgId) == 13)
                          | (abs(df.GenPart.pdgId) == 11)
                          | (abs(df.GenPart.pdgId) == 15)]
    gl_pair = ak.cartesian({
        "jet": gjets,
        "lepton": gleptons
    },
                           axis=1,
                           nested=True)
    _, _, dr_gl = delta_r(
        gl_pair["jet"].eta,
        gl_pair["lepton"].eta,
        gl_pair["jet"].phi,
        gl_pair["lepton"].phi,
    )
    isolated = ak.all((dr_gl > 0.3), axis=-1)
    gjet1 = ak.to_pandas(gjets[isolated]).loc[pd.IndexSlice[:, 0],
                                              ["pt", "eta", "phi", "mass"]]
    gjet2 = ak.to_pandas(gjets[isolated]).loc[pd.IndexSlice[:, 1],
                                              ["pt", "eta", "phi", "mass"]]
    gjet1.index = gjet1.index.droplevel("subentry")
    gjet2.index = gjet2.index.droplevel("subentry")
    gjsum = p4_sum(gjet1, gjet2)
    for var in ["pt", "eta", "phi", "mass"]:
        output[f"gjet1_{var}"] = gjet1[var]
        output[f"gjet2_{var}"] = gjet2[var]
        output[f"gjj_{var}"] = gjsum[var]
    output["gjj_dEta"], output["gjj_dPhi"], output["gjj_dR"] = delta_r(
        output.gjet1_eta, output.gjet2_eta, output.gjet1_phi, output.gjet2_phi)
    return output
def test():
    simple = ak.Array([0.0, 1.1, 2.2, 3.3, 4.4, 5.5])
    assert ak.to_pandas(simple)["values"].values.tolist() == [
        0.0,
        1.1,
        2.2,
        3.3,
        4.4,
        5.5,
    ]

    index = ak.layout.Index64(np.array([3, 3, 1, 5], dtype=np.int64))
    indexed = ak.Array(ak.layout.IndexedArray64(index, simple.layout))
    assert indexed.tolist() == [3.3, 3.3, 1.1, 5.5]

    assert ak.to_pandas(indexed)["values"].values.tolist() == [3.3, 3.3, 1.1, 5.5]

    tuples = ak.Array(ak.layout.RecordArray([simple.layout, simple.layout]))
    assert ak.to_pandas(tuples)["1"].values.tolist() == [0.0, 1.1, 2.2, 3.3, 4.4, 5.5]

    offsets = ak.layout.Index64(np.array([0, 1, 1, 3, 4], dtype=np.int64))
    nested = ak.Array(ak.layout.ListOffsetArray64(offsets, indexed.layout))
    assert ak.to_pandas(nested)["values"].values.tolist() == [3.3, 3.3, 1.1, 5.5]

    offsets2 = ak.layout.Index64(np.array([0, 3, 3, 4, 6], dtype=np.int64))
    nested2 = ak.Array(ak.layout.ListOffsetArray64(offsets2, tuples.layout))

    assert ak.to_pandas(nested2)["1"].values.tolist() == [0.0, 1.1, 2.2, 3.3, 4.4, 5.5]

    recrec = ak.Array([{"x": {"y": 1}}, {"x": {"y": 2}}, {"x": {"y": 3}}])
    assert ak.to_pandas(recrec)["x", "y"].values.tolist() == [1, 2, 3]

    recrec2 = ak.Array(
        [
            {"x": {"a": 1, "b": 2}, "y": {"c": 3, "d": 4}},
            {"x": {"a": 10, "b": 20}, "y": {"c": 30, "d": 40}},
        ]
    )
    assert ak.to_pandas(recrec2)["y", "c"].values.tolist() == [3, 30]

    recrec3 = ak.Array(
        [{"x": 1, "y": {"c": 3, "d": 4}}, {"x": 10, "y": {"c": 30, "d": 40}}]
    )
    assert ak.to_pandas(recrec3)["y", "c"].values.tolist() == [3, 30]

    tuptup = ak.Array([(1.0, (1.1, 1.2)), (2.0, (2.1, 2.2)), (3.0, (3.1, 3.2))])
    assert ak.to_pandas(tuptup)["1", "0"].values.tolist() == [1.1, 2.1, 3.1]

    recrec4 = ak.Array(
        [[{"x": 1, "y": {"c": 3, "d": 4}}], [{"x": 10, "y": {"c": 30, "d": 40}}]]
    )
    assert ak.to_pandas(recrec4)["y", "c"].values.tolist() == [3, 30]
def test_broken():
    ex = ak.Array([[1, 2, 3], [], [4, 5]])
    p4 = ak.zip({"x": ex})
    p4c = ak.cartesian({"a": p4, "b": p4})
    df = ak.to_pandas(p4c)
    assert df["a", "x"].values.tolist() == [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5]
    assert df["b", "x"].values.tolist() == [1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5]
Example no. 6
def get_df_subentry2(root_file_name):
    """returns a dataframe that contains only subentry 2 data

    This subentry seems to contain all the relevant information"""

    df = pd.DataFrame()
    with rt.open(f'{root_file_name}:Hits') as tree:
        df = ak.to_pandas(tree.arrays())
    return df.xs(2, level='subentry')
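
Here df.xs(2, level='subentry') takes a cross-section of the (entry, subentry) MultiIndex produced by ak.to_pandas, keeping the third hit of every event and dropping the subentry level. A minimal sketch with a hypothetical "edep" field, assuming awkward 1.x:

import awkward as ak

hits = ak.Array([[{"edep": 0.1}, {"edep": 0.2}, {"edep": 0.3}],
                 [{"edep": 0.4}, {"edep": 0.5}, {"edep": 0.6}]])
df = ak.to_pandas(hits)                # MultiIndex: (entry, subentry)
third = df.xs(2, level="subentry")     # one row per event, indexed by entry only
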
Example no. 7
 def get_dat_glob(self):
     dat_glob = ak.to_pandas(
         self.tree.arrays([
             'ebeam', 'emeas', 'lumoff', 'lumofferr', 'runnum',
             'finalstate_id'
         ]))
     badruns = np.loadtxt('pylib/badruns.dat')
     dat_glob['badrun'] = dat_glob.runnum.isin(badruns)
     return dat_glob
Example no. 8
def fill_softjets(df, output, variables, cutoff):
    saj_df = ak.to_pandas(df.SoftActivityJet)
    saj_df["mass"] = 0.0
    nj_name = f"SoftActivityJetNjets{cutoff}"
    ht_name = f"SoftActivityJetHT{cutoff}"
    res = ak.to_pandas(df[[nj_name, ht_name]])

    res["to_correct"] = output.two_muons | (variables.njets > 0)
    _, _, dR_m1 = delta_r(saj_df.eta, output.mu1_eta, saj_df.phi,
                          output.mu1_phi)
    _, _, dR_m2 = delta_r(saj_df.eta, output.mu2_eta, saj_df.phi,
                          output.mu2_phi)
    _, _, dR_j1 = delta_r(saj_df.eta, variables.jet1_eta, saj_df.phi,
                          variables.jet1_phi)
    _, _, dR_j2 = delta_r(saj_df.eta, variables.jet2_eta, saj_df.phi,
                          variables.jet2_phi)
    saj_df["dR_m1"] = dR_m1 < 0.4
    saj_df["dR_m2"] = dR_m2 < 0.4
    saj_df["dR_j1"] = dR_j1 < 0.4
    saj_df["dR_j2"] = dR_j2 < 0.4
    dr_cols = ["dR_m1", "dR_m2", "dR_j1", "dR_j2"]
    saj_df[dr_cols] = saj_df[dr_cols].fillna(False)
    saj_df["to_remove"] = saj_df[dr_cols].sum(axis=1).astype(bool)

    saj_df_filtered = saj_df[(~saj_df.to_remove) & (saj_df.pt > cutoff)]
    footprint = saj_df[(saj_df.to_remove) & (saj_df.pt > cutoff)]
    res["njets_corrected"] = (
        saj_df_filtered.reset_index().groupby("entry")["subentry"].nunique())
    res["njets_corrected"] = res["njets_corrected"].fillna(0).astype(int)
    res["footprint"] = footprint.pt.groupby(level=[0]).sum()
    res["footprint"] = res["footprint"].fillna(0.0)
    res["ht_corrected"] = res[ht_name] - res.footprint
    res.loc[res.ht_corrected < 0, "ht_corrected"] = 0.0

    res.loc[res.to_correct, nj_name] = res.loc[res.to_correct,
                                               "njets_corrected"]

    res.loc[res.to_correct, ht_name] = res.loc[res.to_correct, "ht_corrected"]

    variables[f"nsoftjets{cutoff}"] = res[f"SoftActivityJetNjets{cutoff}"]
    variables[f"htsoft{cutoff}"] = res[f"SoftActivityJetHT{cutoff}"]
Example no. 9
def find_cluster(interactions, cluster_size_space, cluster_size_time):
    """
    Function which finds cluster within a event.

    Args:
        x (pandas.DataFrame): Subentries of event must contain the
            fields, x,y,z,time
        cluster_size_space (float): Max spatial distance between two points to
            be inside a cluster [cm].
        cluster_size_time (float): Max time distance between two points to be 
            inside a cluster [ns].
    
    Returns:
        awkward.array: Adds to interaction a cluster_ids record.
    """
    # TODO is there a better way to get the df?
    df = []
    for key in ['x', 'y', 'z', 'ed', 't']:
        df.append(ak.to_pandas(interactions[key], anonymous=key))
    df = pd.concat(df, axis=1)

    if df.empty:
        # TPC interaction is empty
        return interactions

    # Split into individual events and apply time clustering:
    groups = df.groupby('entry')

    df["time_cluster"] = np.concatenate(
        groups.apply(
            lambda x: simple_1d_clustering(x.t.values, cluster_size_time)))

    # Split into individual events and time clusters, and apply space clustering:
    df['cluster_id'] = np.zeros(len(df.index), dtype=int)

    for evt in df.index.get_level_values(0).unique():
        _df_evt = df.loc[evt]
        _t_clusters = _df_evt.time_cluster.unique()
        add_to_cluster = 0

        for _t in _t_clusters:
            _cl = _find_cluster(_df_evt[_df_evt.time_cluster == _t],
                                cluster_size_space=cluster_size_space)
            df.loc[(df.time_cluster == _t) &
                   (df.index.get_level_values(0) == evt),
                   'cluster_id'] = _cl + add_to_cluster
            add_to_cluster = max(_cl) + add_to_cluster + 1

    ci = df.loc[:, 'cluster_id'].values
    offsets = ak.num(interactions['x'])
    interactions['cluster_ids'] = reshape_awkward(ci, offsets)

    return interactions
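
The per-field conversion at the top of find_cluster (one ak.to_pandas call per key, concatenated along axis=1) uses the anonymous= argument to name each otherwise unnamed column, so all fields end up in one frame sharing the same (entry, subentry) index. A minimal sketch with made-up values, assuming awkward 1.x:

import awkward as ak
import pandas as pd

interactions = ak.Array([{"x": [1.0, 2.0], "t": [10.0, 20.0]},
                         {"x": [3.0], "t": [30.0]}])
cols = [ak.to_pandas(interactions[key], anonymous=key) for key in ("x", "t")]
df = pd.concat(cols, axis=1)           # columns "x" and "t", MultiIndex (entry, subentry)
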
Example no. 10
    def get_dat_photons(self):
        arrs = self.tree.arrays(
            ['pt', 'theta', 'phi', 'mass'],
            cut='(nt>=2)&(nks>0)&(phen>0)',
            aliases={
                'pt': 'phen*sin(phth)',
                'theta': 'phth',
                'phi': 'phphi',
                'mass': '0*phen'
            })
        vecs = vector.Array(arrs)

        df = ak.to_pandas(ak.combinations(vecs.px, 2))
        df = df.rename({'0': 'px0', '1': 'px1'}, axis=1)
        df_len = len(df)

        df = df.join(ak.to_pandas(ak.combinations(vecs.py, 2)))
        assert df_len == len(df)
        df_len = len(df)
        df = df.rename({'0': 'py0', '1': 'py1'}, axis=1)

        df = df.join(ak.to_pandas(ak.combinations(vecs.pz, 2)))
        assert df_len == len(df)
        df_len = len(df)
        df = df.rename({'0': 'pz0', '1': 'pz1'}, axis=1)

        df = df.join(ak.to_pandas(ak.combinations(vecs.E, 2)))
        assert df_len == len(df)
        df_len = len(df)
        df = df.rename({'0': 'E0', '1': 'E1'}, axis=1)

        for coord in ('x', 'y', 'z'):
            df[f'P{coord}'] = df[f'p{coord}0'] + df[f'p{coord}1']
        df['P'] = np.sqrt(df['Px']**2 + df['Py']**2 + df['Pz']**2)
        df['E'] = df['E0'] + df['E1']
        M2 = df['E']**2 - df['P']**2
        df['M'] = np.where(M2 > 0, np.sqrt(np.abs(M2)), -np.sqrt(np.abs(M2)))
        return df
Example no. 11
 def get_dat_tracks(self):
     e0 = self.tree['emeas'].array()[0]
     pidedx = '5.58030e+9 / (tptot + 40.)**3 + 2.21228e+3 - 3.77103e-1 * tptot - tdedx'
     arrs = self.tree.arrays(
         ['tz', 'tptot', 'tdedx', 'tcharge', 'trho', 'tth', 'tphi'],
         f'(nt>=2)&(nks>0)&(tnhit>6)&(abs(pidedx)<{self.cut_dedx})&(tchi2r<20)&(tchi2z<20)&(abs(tz)<{self.cut_z})&(tptot<{e0})&(tptot>40)',
         aliases={'pidedx': pidedx})
     dat_tracks = ak.to_pandas(arrs)
     dat_tracks_groups = dat_tracks.groupby('entry').agg(uniques=('tz',
                                                                  'count'),
                                                         charge=('tcharge',
                                                                 'sum'))
     idx = dat_tracks_groups.query('(uniques==2)&(charge==0)').index
     return dat_tracks.loc[idx]  #.drop('tcharge', axis=1)
Example no. 12
def fill_gen_jets(df, output):
    features = ["PT", "Eta", "Phi", "Mass"]
    gjets = df.GenJet[features]
    print(df.GenJet.fields)
    gleptons = df.MuonMedium
    # gleptons = df.GenPart[
    #    (abs(df.GenPart.pdgId) == 13)
    #    | (abs(df.GenPart.pdgId) == 11)
    #    | (abs(df.GenPart.pdgId) == 15)
    # ]
    gl_pair = ak.cartesian({"jet": gjets, "lepton": gleptons}, axis=1, nested=True)
    _, _, dr_gl = delta_r(
        gl_pair["jet"].Eta,
        gl_pair["lepton"].Eta,
        gl_pair["jet"].Phi,
        gl_pair["lepton"].Phi,
    )
    isolated = ak.all((dr_gl > 0.3), axis=-1)

    gjet1 = ak.to_pandas(gjets[isolated]).loc[pd.IndexSlice[:, 0], features]
    gjet2 = ak.to_pandas(gjets[isolated]).loc[pd.IndexSlice[:, 1], features]
    gjet1.index = gjet1.index.droplevel("subentry")
    gjet2.index = gjet2.index.droplevel("subentry")
    feat_map = {"pt": "PT", "eta": "Eta", "phi": "Phi", "mass": "Mass"}
    for var in ["pt", "eta", "phi", "mass"]:
        gjet1[var] = gjet1[feat_map[var]]
        gjet2[var] = gjet2[feat_map[var]]
    gjsum = p4_sum(gjet1, gjet2)

    for var in ["pt", "eta", "phi", "mass"]:
        output[f"gjet1_{var}"] = gjet1[var]
        output[f"gjet2_{var}"] = gjet2[var]
        output[f"gjj_{var}"] = gjsum[var]
    output["gjj_dEta"], output["gjj_dPhi"], output["gjj_dR"] = delta_r(
        output.gjet1_eta, output.gjet2_eta, output.gjet1_phi, output.gjet2_phi
    )
    return output
Example no. 13
 def get_dat_kaons(self):
     dlt_mass = 'abs(ksminv-497.6)'
     cuts = f'(nt>=2)&(nks>0)&(ksalign>{self.cut_align})&(dlt_mass<200)'
     dat_kaons = ak.to_pandas(
         self.tree.arrays([
             'ksptot', 'ksminv', 'ksalign', 'dlt_mass', 'ksvind', 'ksdpsi',
             'ksz0', 'kslen', 'ksth', 'ksphi'
         ],
                          cuts,
                          aliases={'dlt_mass': dlt_mass})).loc[:, :, :1]
     dat_kaons = dat_kaons.reset_index().drop(
         'subsubentry', axis=1).set_index(['entry', 'subentry'])
     kaons = dat_kaons.sort_values(
         by=['dlt_mass']).reset_index().drop_duplicates(
             subset=['entry'],
             keep='first').set_index(['entry', 'subentry']).index
     dat_kaons = dat_kaons.loc[kaons]
     return dat_kaons.reset_index().drop(['subentry'], axis=1).rename(
         {
             'ksvind': 'subentry'
         }, axis=1).set_index(['entry', 'subentry'])
Example no. 14
    def cellid_adj_matrix(
        self
    ):  #pytorch_geometric adj_matrix format (tensor of connected edges with dim 2xnum_of_edges)
        fnlup = osp.join(self.geometry_dir, "DetIdLUT.root")  #conf["luppath"]
        rf = uproot.open(fnlup)
        arr = rf["analyzer/tree"].arrays()
        keydf = ak.to_pandas(arr[0])
        keydf = keydf.set_index("globalid")

        # load the geometry
        geoyamlpath = osp.join(self.geometry_dir, "geometry.yaml")
        fngeopic = osp.join(
            self.geometry_dir,
            "geometry.pickle")  #conf["geoyamlpath"].strip("yaml") + "pickle"
        if os.path.isfile(fngeopic):
            with open(fngeopic, "rb") as f:
                geoD = pickle.load(f)
        else:
            with open(geoyamlpath, "r") as f:
                geoD = yaml.load(f, Loader=yaml.FullLoader)
            with open(fngeopic, "wb") as f:
                pickle.dump(geoD, f)
        graphpath = osp.join(self.geometry_dir, "edge_index.pt")
        if os.path.isfile(graphpath):
            edge_index = torch.load(graphpath)
        else:
            # Instantiate array
            edgeA = np.empty((2, 0), dtype=int)

            for originid, row in keydf.iterrows():
                for i in range(row.nneighbors + row.ngapneighbors):
                    edgeA = np.append(edgeA, [[originid], [row["n" + str(i)]]],
                                      axis=1)

            # Prune
            edgeA = edgeA[:, edgeA[0] != 0]

            edge_index = torch.tensor(edgeA, dtype=torch.long)
            torch.save(edge_index, graphpath)
        return keydf.index, edge_index
Example no. 15
    def process(self, events):

        events = events[
            ak.num(events.Jet) >
            0]  #corrects for rare case where there isn't a single jet in event
        output = self.accumulator.identity()

        # we can use a very loose preselection to filter the events. nothing is done with this presel, though
        presel = ak.num(events.Jet) >= 0

        ev = events[presel]
        dataset = ev.metadata['dataset']

        # load the config - probably not needed anymore
        # cfg = loadConfig()

        output['totalEvents']['all'] += len(events)
        output['skimmedEvents']['all'] += len(ev)

        ### For FCNC, we want electron -> tightTTH
        electron = Collections(ev, "Electron", "tightFCNC").get()
        fakeableelectron = Collections(ev, "Electron", "fakeableFCNC").get()

        muon = Collections(ev, "Muon", "tightFCNC").get()
        fakeablemuon = Collections(ev, "Muon", "fakeableFCNC").get()

        ##Jets
        Jets = events.Jet

        ## MET -> can switch to puppi MET
        met_pt = ev.MET.pt
        met_phi = ev.MET.phi

        lepton = fakeablemuon  #ak.concatenate([fakeablemuon, fakeableelectron], axis=1)
        mt_lep_met = mt(lepton.pt, lepton.phi, ev.MET.pt, ev.MET.phi)
        min_mt_lep_met = ak.min(mt_lep_met, axis=1)

        selection = PackedSelection()
        selection.add('MET<20', (ev.MET.pt < 20))
        selection.add('mt<20', min_mt_lep_met < 20)
        #selection.add('MET<19',        (ev.MET.pt<19) )
        selection_reqs = ['MET<20', 'mt<20']  #, 'MET<19']
        fcnc_reqs_d = {sel: True for sel in selection_reqs}
        fcnc_selection = selection.require(**fcnc_reqs_d)

        # define the weight
        weight = Weights(len(ev))

        if not dataset == 'MuonEG':
            # generator weight
            weight.add("weight", ev.genWeight)

        jets = getJets(
            ev, maxEta=2.4, minPt=25, pt_var='pt'
        )  #& (ak.num(jets[~match(jets, fakeablemuon, deltaRCut=1.0)])>=1)
        single_muon_sel = (ak.num(muon) == 1) & (ak.num(fakeablemuon) == 1) | (
            ak.num(muon) == 0) & (ak.num(fakeablemuon) == 1)
        single_electron_sel = (ak.num(electron) == 1) & (
            ak.num(fakeableelectron)
            == 1) | (ak.num(electron) == 0) & (ak.num(fakeableelectron) == 1)
        fcnc_muon_sel = (ak.num(
            jets[~match(jets, fakeablemuon, deltaRCut=1.0)]) >=
                         1) & fcnc_selection & single_muon_sel
        fcnc_electron_sel = (ak.num(
            jets[~match(jets, fakeableelectron, deltaRCut=1.0)]) >=
                             1) & fcnc_selection & single_electron_sel
        tight_muon_sel = (ak.num(muon) == 1) & fcnc_muon_sel
        loose_muon_sel = (ak.num(fakeablemuon) == 1) & fcnc_muon_sel
        tight_electron_sel = (ak.num(electron) == 1) & fcnc_electron_sel
        loose_electron_sel = (ak.num(fakeableelectron)
                              == 1) & fcnc_electron_sel

        output['single_mu_fakeable'].fill(
            dataset=dataset,
            pt=ak.to_numpy(ak.flatten(fakeablemuon[loose_muon_sel].conePt)),
            eta=np.abs(
                ak.to_numpy(ak.flatten(fakeablemuon[loose_muon_sel].eta))))
        output['single_mu'].fill(
            dataset=dataset,
            pt=ak.to_numpy(ak.flatten(muon[tight_muon_sel].conePt)),
            eta=np.abs(ak.to_numpy(ak.flatten(muon[tight_muon_sel].eta))))
        output['single_e_fakeable'].fill(
            dataset=dataset,
            pt=ak.to_numpy(
                ak.flatten(fakeableelectron[loose_electron_sel].conePt)),
            eta=np.abs(
                ak.to_numpy(
                    ak.flatten(fakeableelectron[loose_electron_sel].eta))))
        output['single_e'].fill(
            dataset=dataset,
            pt=ak.to_numpy(ak.flatten(electron[tight_electron_sel].conePt)),
            eta=np.abs(
                ak.to_numpy(ak.flatten(electron[tight_electron_sel].eta))))

        if self.debug:
            #create pandas dataframe for debugging
            passed_events = ev[tight_muon_sel]
            passed_muons = muon[tight_muon_sel]
            event_p = ak.to_pandas(passed_events[["event"]])
            event_p["MET_PT"] = passed_events["MET"]["pt"]
            event_p["mt"] = min_mt_lep_met[tight_muon_sel]
            event_p["num_tight_mu"] = ak.to_numpy(ak.num(muon)[tight_muon_sel])
            event_p["num_loose_mu"] = ak.num(fakeablemuon)[tight_muon_sel]
            muon_p = ak.to_pandas(
                ak.flatten(passed_muons)[[
                    "pt", "conePt", "eta", "dz", "dxy", "ptErrRel",
                    "miniPFRelIso_all", "jetRelIsoV2", "jetRelIso",
                    "jetPtRelv2"
                ]])
            #convert to numpy array for the output
            events_array = pd.concat([muon_p, event_p], axis=1)

            events_to_add = [6886009]
            for e in events_to_add:
                tmp_event = ev[ev.event == e]
                added_event = ak.to_pandas(tmp_event[["event"]])
                added_event["MET_PT"] = tmp_event["MET"]["pt"]
                added_event["mt"] = min_mt_lep_met[ev.event == e]
                added_event["num_tight_mu"] = ak.to_numpy(
                    ak.num(muon)[ev.event == e])
                added_event["num_loose_mu"] = ak.to_numpy(
                    ak.num(fakeablemuon)[ev.event == e])
                add_muon = ak.to_pandas(
                    ak.flatten(muon[ev.event == e])[[
                        "pt", "conePt", "eta", "dz", "dxy", "ptErrRel",
                        "miniPFRelIso_all", "jetRelIsoV2", "jetRelIso",
                        "jetPtRelv2"
                    ]])
                add_concat = pd.concat([add_muon, added_event], axis=1)
                events_array = pd.concat([events_array, add_concat], axis=0)

            output['muons_df'] += processor.column_accumulator(
                events_array.to_numpy())

        return output
def test_union_to_record():
    recordarray1 = ak.Array([{"x": 1, "y": 1.1}, {"x": 3, "y": 3.3}]).layout
    recordarray2 = ak.Array([{"y": 2.2, "z": 999}]).layout
    tags = ak.layout.Index8(np.array([0, 1, 0], dtype=np.int8))
    index = ak.layout.Index64(np.array([0, 0, 1], dtype=np.int64))
    unionarray = ak.layout.UnionArray8_64(tags, index,
                                          [recordarray1, recordarray2])
    assert ak.to_list(unionarray) == [
        {
            "x": 1,
            "y": 1.1
        },
        {
            "y": 2.2,
            "z": 999
        },
        {
            "x": 3,
            "y": 3.3
        },
    ]

    converted = ak._util.union_to_record(unionarray, "values")
    assert isinstance(converted, ak.layout.RecordArray)
    assert ak.to_list(converted) == [
        {
            "x": 1,
            "y": 1.1,
            "z": None
        },
        {
            "x": None,
            "y": 2.2,
            "z": 999
        },
        {
            "x": 3,
            "y": 3.3,
            "z": None
        },
    ]

    otherarray = ak.Array(["one", "two"]).layout
    tags2 = ak.layout.Index8(np.array([0, 2, 1, 2, 0], dtype=np.int8))
    index2 = ak.layout.Index64(np.array([0, 0, 0, 1, 1], dtype=np.int64))
    unionarray2 = ak.layout.UnionArray8_64(
        tags2, index2, [recordarray1, recordarray2, otherarray])
    assert ak.to_list(unionarray2) == [
        {
            "x": 1,
            "y": 1.1
        },
        "one",
        {
            "y": 2.2,
            "z": 999
        },
        "two",
        {
            "x": 3,
            "y": 3.3
        },
    ]

    converted2 = ak._util.union_to_record(unionarray2, "values")
    assert isinstance(converted2, ak.layout.RecordArray)
    assert ak.to_list(converted2) == [
        {
            "x": 1,
            "y": 1.1,
            "z": None,
            "values": None
        },
        {
            "x": None,
            "y": None,
            "z": None,
            "values": "one"
        },
        {
            "x": None,
            "y": 2.2,
            "z": 999,
            "values": None
        },
        {
            "x": None,
            "y": None,
            "z": None,
            "values": "two"
        },
        {
            "x": 3,
            "y": 3.3,
            "z": None,
            "values": None
        },
    ]

    df_unionarray = ak.to_pandas(unionarray)
    np.testing.assert_array_equal(df_unionarray["x"].values,
                                  np.array([1, np.nan, 3]))
    np.testing.assert_array_equal(df_unionarray["y"].values,
                                  np.array([1.1, 2.2, 3.3]))
    np.testing.assert_array_equal(df_unionarray["z"].values,
                                  np.array([np.nan, 999, np.nan]))

    df_unionarray2 = ak.to_pandas(unionarray2)
    np.testing.assert_array_equal(df_unionarray2["x"].values,
                                  [1, np.nan, np.nan, np.nan, 3])
    np.testing.assert_array_equal(df_unionarray2["y"].values,
                                  [1.1, np.nan, 2.2, np.nan, 3.3])
    np.testing.assert_array_equal(df_unionarray2["z"].values,
                                  [np.nan, np.nan, 999, np.nan, np.nan])
    np.testing.assert_array_equal(df_unionarray2["values"].values,
                                  ["nan", "one", "nan", "two", "nan"])
Example no. 17
import os
import pickle
import yaml
import awkward as ak
import numpy as np
import uproot
import torch
from ..config import conf
from ..utils.logger import logger
from torch_geometric.data import Data

# load the root table
fnlup = conf["luppath"]
rf = uproot.open(fnlup)
arr = rf["analyzer/tree"].arrays()
keydf = ak.to_pandas(arr[0])
keydf = keydf.set_index("globalid")

# load the geometry

fngeopic = conf["geoyamlpath"].strip("yaml") + "pickle"
if os.path.isfile(fngeopic):
    with open(fngeopic, "rb") as f:
        geoD = pickle.load(f)
else:
    with open(conf["geoyamlpath"], "r") as f:
        geoD = yaml.load(f, Loader=yaml.FullLoader)
    with open(fngeopic, "wb") as f:
        pickle.dump(geoD, f)

if os.path.isfile(conf["graphpath"]):
Example no. 18
 def write_to_df(self, events, output_name):
     df = awkward.to_pandas(events)
     df.to_pickle(output_name)
     return
Example no. 19
    def process(self, df):
        # print(df.fields)
        # numevents = len(df)
        # dataset = df.metadata["dataset"]
        output = pd.DataFrame({"event": df.Event.Number})

        output.index.name = "entry"

        output["dataset"] = df.metadata["dataset"]
        regions = df.metadata["regions"]
        # channels = df.metadata['channels']
        output["lumi_wgt"] = float(df.metadata["lumi_wgt"])
        output["mc_wgt"] = ak.to_pandas(df.Event.Weight)
        # There are multiple weights per event - need to figure this out
        # output['lhe_wgt'] = ak.to_pandas(df.Weight.Weight)
        output["year"] = "snowmass"

        # Select muons
        muons = df[parameters["muon_branch"]]
        muon_filter = ((muons.pt > parameters["muon_pt_cut"])
                       & (abs(muons.eta) < parameters["muon_eta_cut"])
                       & (muons.IsolationVar < parameters["muon_iso_cut"]))
        nmuons = ak.to_pandas(ak.count(muons[muon_filter].pt, axis=1))

        mu_map = {"PT": "pt", "Eta": "eta", "Phi": "phi", "Charge": "charge"}
        muon_columns = ["PT", "Eta", "Phi", "Charge", "IsolationVar"]

        # Convert one column at a time to preserve event indices in Pandas
        muon_feature_list = []
        for col in muon_columns:
            muon_feature = df[parameters["muon_branch"]][col]
            val = ak.to_pandas(muon_feature[muon_filter])
            muon_feature_list.append(val)

        muons = pd.concat(muon_feature_list, axis=1)
        muons.columns = muon_columns
        muons.rename(columns=mu_map, inplace=True)

        mu1 = muons.loc[muons.pt.groupby("entry").idxmax()]
        mu2 = muons.loc[muons.pt.groupby("entry").idxmin()]
        mu1.index = mu1.index.droplevel("subentry")
        mu2.index = mu2.index.droplevel("subentry")
        pass_leading_pt = mu1.pt > parameters["muon_leading_pt"]
        fill_muons(output, mu1, mu2)

        output.mm_charge = output.mu1_charge * output.mu2_charge

        # Select electrons
        electrons = df[parameters["electron_branch"]]
        electrons = electrons[
            (electrons.pt > parameters["electron_pt_cut"])
            & (abs(electrons.eta) < parameters["electron_eta_cut"])]
        nelectrons = ak.to_pandas(ak.count(electrons.pt, axis=1))

        # Select jets
        jets = df[parameters["jet_branch"]]
        mu_for_clean = df[parameters["muon_branch"]]
        mu_for_clean = mu_for_clean[
            (mu_for_clean.pt > parameters["muon_pt_cut"])
            & (mu_for_clean.IsolationVar < parameters["muon_iso_cut"])]
        _, jet_mu_dr = jets.nearest(mu_for_clean, return_metric=True)
        jet_filter = (
            ak.fill_none(jet_mu_dr > parameters["min_dr_mu_jet"], True)
            & (jets.pt > parameters["jet_pt_cut"])
            & (abs(jets.eta) < parameters["jet_eta_cut"]))
        njets = ak.to_pandas(ak.count(jets[jet_filter].pt, axis=1))

        jet_map = {"PT": "pt", "Eta": "eta", "Phi": "phi", "Mass": "mass"}
        jet_columns = ["PT", "Eta", "Phi", "Mass"]

        jet_feature_list = []
        for col in jet_columns:
            jet_feature = df[parameters["jet_branch"]][col]
            val = ak.to_pandas(jet_feature[jet_filter])
            jet_feature_list.append(val)

        jets = pd.concat(jet_feature_list, axis=1)
        jets.columns = jet_columns
        jets.rename(columns=jet_map, inplace=True)

        jets = jets.sort_values(["entry", "pt"], ascending=[True, False])
        jets.index = pd.MultiIndex.from_arrays(
            [jets.index.get_level_values(0),
             jets.groupby(level=0).cumcount()],
            names=["entry", "subentry"],
        )
        jet1 = jets.loc[pd.IndexSlice[:, 0], :]
        jet2 = jets.loc[pd.IndexSlice[:, 1], :]
        jet1.index = jet1.index.droplevel("subentry")
        jet2.index = jet2.index.droplevel("subentry")

        fill_jets(output, jet1, jet2)
        fill_gen_jets(df, output)

        # Event selection: two opposite-sign muons and no electrons
        output["nmuons"] = nmuons
        output["nelectrons"] = nelectrons
        output["njets"] = njets
        output[["nmuons", "nelectrons",
                "njets"]] = output[["nmuons", "nelectrons", "njets"]].fillna(0)

        output["event_selection"] = ((output.nmuons == 2)
                                     & (output.mm_charge == -1)
                                     & (output.nelectrons == 0)
                                     & pass_leading_pt)

        mass = output.dimuon_mass
        output["region"] = None
        output.loc[((mass > 76) & (mass < 106)), "region"] = "z-peak"
        output.loc[((mass > 110) & (mass < 115.03)) | ((mass > 135.03) &
                                                       (mass < 150)),
                   "region", ] = "h-sidebands"
        output.loc[((mass > 115.03) & (mass < 135.03)), "region"] = "h-peak"

        output = output.loc[output.event_selection, :]
        output = output.reindex(sorted(output.columns), axis=1)

        output = output[output.region.isin(regions)]
        """
        input_evts = numevents
        output_evts = output.shape[0]
        out_yield = output.lumi_wgt.sum()
        out_vbf = output[
            (output.jj_mass>400) & (output.jj_dEta>2.5) & (output.jet1_pt>35) & (output.njets>=2)
        ].lumi_wgt.sum()
        out_ggh = out_yield - out_vbf

        print(f"\n{dataset}:    {input_evts}  ->  {output_evts};    yield = {out_ggh} (ggH) + {out_vbf} (VBF) = {out_yield}")
        """

        to_return = None
        if self.apply_to_output is None:
            to_return = output
        else:
            self.apply_to_output(output)
            to_return = self.accumulator.identity()

        return to_return
Example no. 20
    def process(self, df):
        # Initialize timer
        if self.timer:
            self.timer.update()

        # Dataset name (see definitions in config/datasets.py)
        dataset = df.metadata["dataset"]
        is_mc = "data" not in dataset
        numevents = len(df)

        # ------------------------------------------------------------#
        # Apply HLT, lumimask, genweights, PU weights
        # and L1 prefiring weights
        # ------------------------------------------------------------#

        # All variables that we want to save
        # will be collected into the 'output' dataframe
        output = pd.DataFrame({"run": df.run, "event": df.event})
        output.index.name = "entry"
        output["npv"] = df.PV.npvs
        output["met"] = df.MET.pt

        # Separate dataframe to keep track on weights
        # and their systematic variations
        weights = Weights(output)

        if is_mc:
            # For MC: Apply gen.weights, pileup weights, lumi weights,
            # L1 prefiring weights
            mask = np.ones(numevents, dtype=bool)
            genweight = df.genWeight
            weights.add_weight("genwgt", genweight)
            weights.add_weight("lumi", self.lumi_weights[dataset])

            pu_wgts = pu_evaluator(
                self.pu_lookups,
                self.parameters,
                numevents,
                np.array(df.Pileup.nTrueInt),
                self.auto_pu,
            )
            weights.add_weight("pu_wgt", pu_wgts, how="all")

            if self.parameters["do_l1prefiring_wgts"]:
                if "L1PreFiringWeight" in df.fields:
                    l1pfw = l1pf_weights(df)
                    weights.add_weight("l1prefiring_wgt", l1pfw, how="all")
                else:
                    weights.add_weight("l1prefiring_wgt", how="dummy_vars")

        else:
            # For Data: apply Lumi mask
            lumi_info = LumiMask(self.parameters["lumimask"])
            mask = lumi_info(df.run, df.luminosityBlock)

        # Apply HLT to both Data and MC
        hlt_columns = [c for c in self.parameters["hlt"] if c in df.HLT.fields]
        hlt = ak.to_pandas(df.HLT[hlt_columns])
        if len(hlt_columns) == 0:
            hlt = False
        else:
            hlt = hlt[hlt_columns].sum(axis=1)

        if self.timer:
            self.timer.add_checkpoint("HLT, lumimask, PU weights")

        # ------------------------------------------------------------#
        # Update muon kinematics with Rochester correction,
        # FSR recovery and GeoFit correction
        # Raw pT and eta are stored to be used in event selection
        # ------------------------------------------------------------#

        # Save raw variables before computing any corrections
        df["Muon", "pt_raw"] = df.Muon.pt
        df["Muon", "eta_raw"] = df.Muon.eta
        df["Muon", "phi_raw"] = df.Muon.phi
        df["Muon", "pfRelIso04_all_raw"] = df.Muon.pfRelIso04_all

        # Rochester correction
        if self.do_roccor:
            apply_roccor(df, self.roccor_lookup, is_mc)
            df["Muon", "pt"] = df.Muon.pt_roch

            # variations will be in branches pt_roch_up and pt_roch_down
            # muons_pts = {
            #     'nominal': df.Muon.pt,
            #     'roch_up':df.Muon.pt_roch_up,
            #     'roch_down':df.Muon.pt_roch_down
            # }

        # for ...
        if True:  # indent reserved for loop over muon pT variations
            # According to HIG-19-006, these variations have negligible
            # effect on significance, but it's better to have them
            # implemented in the future

            # FSR recovery
            if self.do_fsr:
                has_fsr = fsr_recovery(df)
                df["Muon", "pt"] = df.Muon.pt_fsr
                df["Muon", "eta"] = df.Muon.eta_fsr
                df["Muon", "phi"] = df.Muon.phi_fsr
                df["Muon", "pfRelIso04_all"] = df.Muon.iso_fsr

            # if FSR was applied, 'pt_fsr' will be corrected pt
            # if FSR wasn't applied, just copy 'pt' to 'pt_fsr'
            df["Muon", "pt_fsr"] = df.Muon.pt

            # GeoFit correction
            if self.do_geofit and ("dxybs" in df.Muon.fields):
                apply_geofit(df, self.year, ~has_fsr)
                df["Muon", "pt"] = df.Muon.pt_fsr

            if self.timer:
                self.timer.add_checkpoint("Muon corrections")

            # --- conversion from awkward to pandas --- #
            muon_columns = [
                "pt",
                "pt_fsr",
                "eta",
                "phi",
                "charge",
                "ptErr",
                "mass",
                "pt_raw",
                "eta_raw",
                "pfRelIso04_all",
            ] + [self.parameters["muon_id"]]
            muons = ak.to_pandas(df.Muon[muon_columns])

            # --------------------------------------------------------#
            # Select muons that pass pT, eta, isolation cuts,
            # muon ID and quality flags
            # Select events with 2 OS muons, no electrons,
            # passing quality cuts and at least one good PV
            # --------------------------------------------------------#

            # Apply event quality flags
            flags = ak.to_pandas(df.Flag[self.parameters["event_flags"]])
            flags = flags[self.parameters["event_flags"]].product(axis=1)
            muons["pass_flags"] = True
            if self.parameters["muon_flags"]:
                muons["pass_flags"] = muons[
                    self.parameters["muon_flags"]].product(axis=1)

            # Define baseline muon selection (applied to pandas DF!)
            muons["selection"] = (
                (muons.pt_raw > self.parameters["muon_pt_cut"])
                & (abs(muons.eta_raw) < self.parameters["muon_eta_cut"])
                & (muons.pfRelIso04_all < self.parameters["muon_iso_cut"])
                & muons[self.parameters["muon_id"]]
                & muons.pass_flags)

            # Count muons
            nmuons = (muons[muons.selection].reset_index().groupby("entry")
                      ["subentry"].nunique())

            # Find opposite-sign muons
            mm_charge = muons.loc[muons.selection,
                                  "charge"].groupby("entry").prod()

            # Veto events with good quality electrons
            electrons = df.Electron[
                (df.Electron.pt > self.parameters["electron_pt_cut"])
                & (abs(df.Electron.eta) < self.parameters["electron_eta_cut"])
                & (df.Electron[self.parameters["electron_id"]] == 1)]
            electron_veto = ak.to_numpy(ak.count(electrons.pt, axis=1) == 0)

            # Find events with at least one good primary vertex
            good_pv = ak.to_pandas(df.PV).npvsGood > 0

            # Define baseline event selection
            output["two_muons"] = nmuons == 2
            output["event_selection"] = (mask
                                         & (hlt > 0)
                                         & (flags > 0)
                                         & (nmuons == 2)
                                         & (mm_charge == -1)
                                         & electron_veto
                                         & good_pv)

            # --------------------------------------------------------#
            # Select two leading-pT muons
            # --------------------------------------------------------#

            # Find pT-leading and subleading muons
            # This is slow for large chunk size.
            # Consider reimplementing using sort_values().groupby().nth()
            # or sort_values().drop_duplicates()
            # or using Numba
            # https://stackoverflow.com/questions/50381064/select-the-max-row-per-group-pandas-performance-issue
            muons = muons[muons.selection & (nmuons == 2)]
            mu1 = muons.loc[muons.pt.groupby("entry").idxmax()]
            mu2 = muons.loc[muons.pt.groupby("entry").idxmin()]
            mu1.index = mu1.index.droplevel("subentry")
            mu2.index = mu2.index.droplevel("subentry")

            # --------------------------------------------------------#
            # Select events with muons passing leading pT cut
            # and trigger matching (trig match not done in final vrsn)
            # --------------------------------------------------------#

            # Events where there is at least one muon passing
            # leading muon pT cut
            pass_leading_pt = mu1.pt_raw > self.parameters["muon_leading_pt"]

            # update event selection with leading muon pT cut
            output["pass_leading_pt"] = pass_leading_pt
            output[
                "event_selection"] = output.event_selection & output.pass_leading_pt

            # --------------------------------------------------------#
            # Fill dimuon and muon variables
            # --------------------------------------------------------#

            fill_muons(self, output, mu1, mu2, is_mc)

            if self.timer:
                self.timer.add_checkpoint("Event & muon selection")

        # ------------------------------------------------------------#
        # Prepare jets
        # ------------------------------------------------------------#

        prepare_jets(df, is_mc)

        # ------------------------------------------------------------#
        # Apply JEC, get JEC and JER variations
        # ------------------------------------------------------------#

        jets = df.Jet

        self.do_jec = False

        # We only need to reapply JEC for 2018 data
        # (unless new versions of JEC are released)
        if ("data" in dataset) and ("2018" in self.year):
            self.do_jec = True

        jets = apply_jec(
            df,
            jets,
            dataset,
            is_mc,
            self.year,
            self.do_jec,
            self.do_jecunc,
            self.do_jerunc,
            self.jec_factories,
            self.jec_factories_data,
        )

        # ------------------------------------------------------------#
        # Calculate other event weights
        # ------------------------------------------------------------#

        if is_mc:
            do_nnlops = self.do_nnlops and ("ggh" in dataset)
            if do_nnlops:
                nnlopsw = nnlops_weights(df, numevents, self.parameters,
                                         dataset)
                weights.add_weight("nnlops", nnlopsw)
            else:
                weights.add_weight("nnlops", how="dummy")
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
            # do_zpt = ('dy' in dataset)
            #
            # if do_zpt:
            #     zpt_weight = np.ones(numevents, dtype=float)
            #     zpt_weight[two_muons] =\
            #         self.evaluator[self.zpt_path](
            #             output['dimuon_pt'][two_muons]
            #         ).flatten()
            #     weights.add_weight('zpt_wgt', zpt_weight)
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
            do_musf = True
            if do_musf:
                muID, muIso, muTrig = musf_evaluator(self.musf_lookup,
                                                     self.year, numevents, mu1,
                                                     mu2)
                weights.add_weight("muID", muID, how="all")
                weights.add_weight("muIso", muIso, how="all")
                weights.add_weight("muTrig", muTrig, how="all")
            else:
                weights.add_weight("muID", how="dummy_all")
                weights.add_weight("muIso", how="dummy_all")
                weights.add_weight("muTrig", how="dummy_all")
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
            do_lhe = (("LHEScaleWeight" in df.fields)
                      and ("LHEPdfWeight" in df.fields)
                      and ("nominal" in self.pt_variations))
            if do_lhe:
                lhe_ren, lhe_fac = lhe_weights(df, output, dataset, self.year)
                weights.add_weight("LHERen", lhe_ren, how="only_vars")
                weights.add_weight("LHEFac", lhe_fac, how="only_vars")
            else:
                weights.add_weight("LHERen", how="dummy_vars")
                weights.add_weight("LHEFac", how="dummy_vars")
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
            do_thu = (("vbf" in dataset) and ("dy" not in dataset)
                      and ("nominal" in self.pt_variations)
                      and ("stage1_1_fine_cat_pTjet30GeV" in df.HTXS.fields))
            if do_thu:
                for i, name in enumerate(self.sths_names):
                    wgt_up = stxs_uncert(
                        i,
                        ak.to_numpy(df.HTXS.stage1_1_fine_cat_pTjet30GeV),
                        1.0,
                        self.stxs_acc_lookups,
                        self.powheg_xsec_lookup,
                    )
                    wgt_down = stxs_uncert(
                        i,
                        ak.to_numpy(df.HTXS.stage1_1_fine_cat_pTjet30GeV),
                        -1.0,
                        self.stxs_acc_lookups,
                        self.powheg_xsec_lookup,
                    )
                    thu_wgts = {"up": wgt_up, "down": wgt_down}
                    weights.add_weight("THU_VBF_" + name,
                                       thu_wgts,
                                       how="only_vars")
            else:
                for i, name in enumerate(self.sths_names):
                    weights.add_weight("THU_VBF_" + name, how="dummy_vars")
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
            do_pdf = (self.do_pdf and ("nominal" in self.pt_variations)
                      and ("dy" in dataset or "ewk" in dataset
                           or "ggh" in dataset or "vbf" in dataset)
                      and ("mg" not in dataset))
            if "2016" in self.year:
                max_replicas = 0
                if "dy" in dataset:
                    max_replicas = 100
                elif "ewk" in dataset:
                    max_replicas = 33
                else:
                    max_replicas = 100
                if do_pdf:
                    pdf_wgts = df.LHEPdfWeight[
                        :, 0:self.parameters["n_pdf_variations"]]
                for i in range(100):
                    if (i < max_replicas) and do_pdf:
                        output[f"pdf_mcreplica{i}"] = pdf_wgts[:, i]
                    else:
                        output[f"pdf_mcreplica{i}"] = np.nan
            else:
                if do_pdf:
                    pdf_wgts = df.LHEPdfWeight[
                        :, 0:self.parameters["n_pdf_variations"]][0]
                    pdf_wgts = np.array(pdf_wgts)
                    pdf_vars = {
                        "up": (1 + 2 * pdf_wgts.std()),
                        "down": (1 - 2 * pdf_wgts.std()),
                    }
                    weights.add_weight("pdf_2rms", pdf_vars, how="only_vars")
                else:
                    weights.add_weight("pdf_2rms", how="dummy_vars")
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #

        if is_mc:
            output = fill_gen_jets(df, output)

        # ------------------------------------------------------------#
        # Loop over JEC variations and fill jet variables
        # ------------------------------------------------------------#

        output.columns = pd.MultiIndex.from_product(
            [output.columns, [""]], names=["Variable", "Variation"])

        if self.timer:
            self.timer.add_checkpoint("Jet preparation & event weights")

        for v_name in self.pt_variations:
            output_updated = self.jet_loop(
                v_name,
                is_mc,
                df,
                dataset,
                mask,
                muons,
                mu1,
                mu2,
                jets,
                weights,
                numevents,
                output,
            )
            if output_updated is not None:
                output = output_updated

        if self.timer:
            self.timer.add_checkpoint("Jet loop")

        # ------------------------------------------------------------#
        # Fill outputs
        # ------------------------------------------------------------#
        mass = output.dimuon_mass
        output["region"] = None
        output.loc[((mass > 76) & (mass < 106)), "region"] = "z-peak"
        output.loc[((mass > 110) & (mass < 115.03)) | ((mass > 135.03) &
                                                       (mass < 150)),
                   "region", ] = "h-sidebands"
        output.loc[((mass > 115.03) & (mass < 135.03)), "region"] = "h-peak"
        output["dataset"] = dataset
        output["year"] = int(self.year)

        for wgt in weights.df.columns:
            skip_saving = (("nominal" not in wgt) and ("up" not in wgt)
                           and ("down" not in wgt))
            if skip_saving:
                continue
            output[f"wgt_{wgt}"] = weights.get_weight(wgt)

        columns_to_save = [
            c for c in output.columns
            if (c[0] in self.vars_to_save) or ("wgt_" in c[0]) or (
                "mcreplica" in c[0]) or (c[0] in ["region", "dataset", "year"])
            or ("gjet" in c[0]) or ("gjj" in c[0])
        ]

        output = output.loc[output.event_selection, columns_to_save]
        output = output.reindex(sorted(output.columns), axis=1)

        output.columns = [
            " ".join(col).strip() for col in output.columns.values
        ]

        output = output[output.region.isin(self.regions)]
        """
        input_evts = numevents
        output_evts = output.shape[0]
        out_yield = output.wgt_nominal.sum()
        out_vbf = output[
                (output["jj_mass nominal"]>400) & (output["jj_dEta nominal"]>2.5) & (output["jet1_pt nominal"]>35)
            ].wgt_nominal.sum()
        out_ggh = out_yield - out_vbf

        print(f"\n{dataset}:    {input_evts}  ->  {output_evts};    yield = {out_ggh} (ggH) + {out_vbf} (VBF) = {out_yield}")
        """

        to_return = None
        if self.apply_to_output is None:
            to_return = output
        else:
            self.apply_to_output(output)
            to_return = self.accumulator.identity()

        if self.timer:
            self.timer.add_checkpoint("Saving outputs")
            self.timer.summary()

        return to_return
def l1pf_weights(df):
    l1pfw = ak.to_pandas(df.L1PreFiringWeight)
    ret = {"nom": l1pfw.Nom, "up": l1pfw.Up, "down": l1pfw.Dn}
    return ret
def test():
    def key(n):
        if n in ("values", "x", "y"):
            return n
        else:
            return tuple(eval(n.replace("nan", "None").replace("null", "None")))

    def regularize(data):
        if isinstance(data, dict):
            return dict((key(n), regularize(x)) for n, x in data.items())
        else:
            return data

    array = ak.Array([[0.0, 1.1, 2.2], [], [3.3, 4.4], [5.5], [6.6, None, 8.8, 9.9]])
    assert regularize(json.loads(ak.to_pandas(array).to_json())) == {
        "values": {
            (0, 0): 0.0,
            (0, 1): 1.1,
            (0, 2): 2.2,
            (2, 0): 3.3,
            (2, 1): 4.4,
            (3, 0): 5.5,
            (4, 0): 6.6,
            (4, 1): None,
            (4, 2): 8.8,
            (4, 3): 9.9,
        }
    }

    array = ak.Array(
        [[[0.0, 1.1, 2.2], [], [3.3, 4.4]], [[5.5]], [[6.6, None, 8.8, 9.9]]]
    )
    assert regularize(json.loads(ak.to_pandas(array).to_json())) == {
        "values": {
            (0, 0, 0): 0.0,
            (0, 0, 1): 1.1,
            (0, 0, 2): 2.2,
            (0, 2, 0): 3.3,
            (0, 2, 1): 4.4,
            (1, 0, 0): 5.5,
            (2, 0, 0): 6.6,
            (2, 0, 1): None,
            (2, 0, 2): 8.8,
            (2, 0, 3): 9.9,
        }
    }

    array = ak.Array(
        [
            [[0.0, 1.1, 2.2], [], [3.3, 4.4]],
            [],
            [[5.5]],
            None,
            [[], None, [6.6, None, 8.8, 9.9]],
        ]
    )
    assert regularize(json.loads(ak.to_pandas(array).to_json())) == {
        "values": {
            (0, 0, 0): 0.0,
            (0, 0, 1): 1.1,
            (0, 0, 2): 2.2,
            (0, 2, 0): 3.3,
            (0, 2, 1): 4.4,
            (2, 0, 0): 5.5,
            (4, 2, 0): 6.6,
            (4, 2, 1): None,
            (4, 2, 2): 8.8,
            (4, 2, 3): 9.9,
        }
    }

    array = ak.Array(
        [
            [
                [{"x": 0.0, "y": []}, {"x": 1.1, "y": [1]}, {"x": 2.2, "y": [2, 2]}],
                [],
                [{"x": 3.3, "y": [3, 3, 3]}, {"x": 4.4, "y": [4, 4, 4, 4]}],
            ],
            [],
            [[{"x": 5.5, "y": [5, 5, 5, 5, 5]}]],
        ]
    )
    assert regularize(json.loads(ak.to_pandas(array).to_json())) == {
        "x": {
            (0, 0, 1, 0): 1.1,
            (0, 0, 2, 0): 2.2,
            (0, 0, 2, 1): 2.2,
            (0, 2, 0, 0): 3.3,
            (0, 2, 0, 1): 3.3,
            (0, 2, 0, 2): 3.3,
            (0, 2, 1, 0): 4.4,
            (0, 2, 1, 1): 4.4,
            (0, 2, 1, 2): 4.4,
            (0, 2, 1, 3): 4.4,
            (2, 0, 0, 0): 5.5,
            (2, 0, 0, 1): 5.5,
            (2, 0, 0, 2): 5.5,
            (2, 0, 0, 3): 5.5,
            (2, 0, 0, 4): 5.5,
        },
        "y": {
            (0, 0, 1, 0): 1,
            (0, 0, 2, 0): 2,
            (0, 0, 2, 1): 2,
            (0, 2, 0, 0): 3,
            (0, 2, 0, 1): 3,
            (0, 2, 0, 2): 3,
            (0, 2, 1, 0): 4,
            (0, 2, 1, 1): 4,
            (0, 2, 1, 2): 4,
            (0, 2, 1, 3): 4,
            (2, 0, 0, 0): 5,
            (2, 0, 0, 1): 5,
            (2, 0, 0, 2): 5,
            (2, 0, 0, 3): 5,
            (2, 0, 0, 4): 5,
        },
    }

    assert regularize(json.loads(ak.to_pandas(array, how="outer").to_json())) == {
        "x": {
            (0, 0, 0, None): 0.0,
            (0, 0, 1, 0.0): 1.1,
            (0, 0, 2, 0.0): 2.2,
            (0, 0, 2, 1.0): 2.2,
            (0, 2, 0, 0.0): 3.3,
            (0, 2, 0, 1.0): 3.3,
            (0, 2, 0, 2.0): 3.3,
            (0, 2, 1, 0.0): 4.4,
            (0, 2, 1, 1.0): 4.4,
            (0, 2, 1, 2.0): 4.4,
            (0, 2, 1, 3.0): 4.4,
            (2, 0, 0, 0.0): 5.5,
            (2, 0, 0, 1.0): 5.5,
            (2, 0, 0, 2.0): 5.5,
            (2, 0, 0, 3.0): 5.5,
            (2, 0, 0, 4.0): 5.5,
        },
        "y": {
            (0, 0, 0, None): None,
            (0, 0, 1, 0.0): 1.0,
            (0, 0, 2, 0.0): 2.0,
            (0, 0, 2, 1.0): 2.0,
            (0, 2, 0, 0.0): 3.0,
            (0, 2, 0, 1.0): 3.0,
            (0, 2, 0, 2.0): 3.0,
            (0, 2, 1, 0.0): 4.0,
            (0, 2, 1, 1.0): 4.0,
            (0, 2, 1, 2.0): 4.0,
            (0, 2, 1, 3.0): 4.0,
            (2, 0, 0, 0.0): 5.0,
            (2, 0, 0, 1.0): 5.0,
            (2, 0, 0, 2.0): 5.0,
            (2, 0, 0, 3.0): 5.0,
            (2, 0, 0, 4.0): 5.0,
        },
    }

    array = ak.Array(
        [
            [
                [{"x": 0.0, "y": 0}, {"x": 1.1, "y": 1}, {"x": 2.2, "y": 2}],
                [],
                [{"x": 3.3, "y": 3}, {"x": 4.4, "y": 4}],
            ],
            [],
            [[{"x": 5.5, "y": 5}]],
        ]
    )
    assert regularize(json.loads(ak.to_pandas(array).to_json())) == {
        "x": {
            (0, 0, 0): 0.0,
            (0, 0, 1): 1.1,
            (0, 0, 2): 2.2,
            (0, 2, 0): 3.3,
            (0, 2, 1): 4.4,
            (2, 0, 0): 5.5,
        },
        "y": {
            (0, 0, 0): 0,
            (0, 0, 1): 1,
            (0, 0, 2): 2,
            (0, 2, 0): 3,
            (0, 2, 1): 4,
            (2, 0, 0): 5,
        },
    }

    array = ak.Array(
        [
            [
                [
                    {"x": 0.0, "y": {"z": 0}},
                    {"x": 1.1, "y": {"z": 1}},
                    {"x": 2.2, "y": {"z": 2}},
                ],
                [],
                [{"x": 3.3, "y": {"z": 3}}, {"x": 4.4, "y": {"z": 4}}],
            ],
            [],
            [[{"x": 5.5, "y": {"z": 5}}]],
        ]
    )
    assert regularize(json.loads(ak.to_pandas(array).to_json())) == {
        ("x", ""): {
            (0, 0, 0): 0.0,
            (0, 0, 1): 1.1,
            (0, 0, 2): 2.2,
            (0, 2, 0): 3.3,
            (0, 2, 1): 4.4,
            (2, 0, 0): 5.5,
        },
        ("y", "z"): {
            (0, 0, 0): 0,
            (0, 0, 1): 1,
            (0, 0, 2): 2,
            (0, 2, 0): 3,
            (0, 2, 1): 4,
            (2, 0, 0): 5,
        },
    }

    one = ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]])
    two = ak.Array([[100, 200], [300], [400, 500]])
    assert [
        regularize(json.loads(x.to_json()))
        for x in ak.to_pandas({"x": one, "y": two}, how=None)
    ] == [
        {"x": {(0, 0): 1.1, (0, 1): 2.2, (0, 2): 3.3, (2, 0): 4.4, (2, 1): 5.5}},
        {"y": {(0, 0): 100, (0, 1): 200, (1, 0): 300, (2, 0): 400, (2, 1): 500}},
    ]
Esempio n. 23
0
print(wzg_dat)
columns = wzg_dat.fields
print(columns)

print(len(wzg_dat))

y = np.ones(len(wzg_dat)) * 1
#xsec =  np.ones(len(wzg_dat)) *
#gen  =

data = {'y': y}
df = pd.DataFrame(data)

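# Copy each awkward field into the frame one column at a time. This assumes every field
# in wzg_dat is flat (one value per event); a jagged field would come back from
# ak.to_pandas with an (entry, subentry) MultiIndex that would not align with df's index.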
for column in columns:
    print(column, len(wzg_dat[column]))
    df[column] = ak.to_pandas(wzg_dat[column])

display(df)

corr = df.corr()
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr,
            cmap=cmap,
            vmax=.3,
            center=0)
Esempio n. 24
0
def get_phasespace_df(timestamp, layer):
    root_file = f"../results/tracker_{timestamp}_{layer}.root:PhaseSpace"
    df = pd.DataFrame()
    with rt.open(root_file) as tree:
        df = ak.to_pandas(tree.arrays())
    return df
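# Usage sketch (hypothetical timestamp/layer values; assumes `rt` is uproot, e.g.
# `import uproot as rt`):
# df = get_phasespace_df("20210601_1200", "layer1")
# print(df.head())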
Esempio n. 25
0
    def jet_loop(
        self,
        variation,
        is_mc,
        df,
        dataset,
        mask,
        muons,
        mu1,
        mu2,
        jets,
        weights,
        numevents,
        output,
    ):
        # weights = copy.deepcopy(weights)

        if not is_mc and variation != "nominal":
            return

        variables = pd.DataFrame(index=output.index)

        jet_columns = [
            "pt", "eta", "phi", "jetId", "qgl", "puId", "mass", "btagDeepB"
        ]
        if "puId17" in df.Jet.fields:
            jet_columns += ["puId17"]
        if is_mc:
            jet_columns += ["partonFlavour", "hadronFlavour"]
        if variation == "nominal":
            if self.do_jec:
                jet_columns += ["pt_jec", "mass_jec"]
            if is_mc and self.do_jerunc:
                jet_columns += ["pt_orig", "mass_orig"]

        # Find jets that have selected muons within dR<0.4 from them
        matched_mu_pt = jets.matched_muons.pt_fsr
        matched_mu_iso = jets.matched_muons.pfRelIso04_all
        matched_mu_id = jets.matched_muons[self.parameters["muon_id"]]
        matched_mu_pass = ((matched_mu_pt > self.parameters["muon_pt_cut"])
                           & (matched_mu_iso < self.parameters["muon_iso_cut"])
                           & matched_mu_id)
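        # Convert the per-jet, per-muon pass flags to pandas and sum over the matched-muon
        # level: a jet is "clean" only when no selected muon overlaps it (sum == 0).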
        clean = ~(ak.to_pandas(matched_mu_pass).astype(float).fillna(
            0.0).groupby(level=[0, 1]).sum().astype(bool))

        # if self.timer:
        #     self.timer.add_checkpoint("Clean jets from matched muons")

        # Select particular JEC variation
        if "_up" in variation:
            unc_name = "JES_" + variation.replace("_up", "")
            if unc_name not in jets.fields:
                return
            jets = jets[unc_name]["up"][jet_columns]
        elif "_down" in variation:
            unc_name = "JES_" + variation.replace("_down", "")
            if unc_name not in jets.fields:
                return
            jets = jets[unc_name]["down"][jet_columns]
        else:
            jets = jets[jet_columns]

        # --- conversion from awkward to pandas --- #
        jets = ak.to_pandas(jets)

        if jets.index.nlevels == 3:
            # A third "subsubentry" index level sometimes appears (duplicated matches); keep only the first
            jets = jets.loc[pd.IndexSlice[:, :, 0], :]
            jets.index = jets.index.droplevel("subsubentry")

        if variation == "nominal":
            # Update pt and mass if JEC was applied
            if self.do_jec:
                jets["pt"] = jets["pt_jec"]
                jets["mass"] = jets["mass_jec"]

            # We use JER corrections only for systematics, so we shouldn't
            # update the kinematics. Use original values,
            # unless JEC were applied.
            if is_mc and self.do_jerunc and not self.do_jec:
                jets["pt"] = jets["pt_orig"]
                jets["mass"] = jets["mass_orig"]

        # ------------------------------------------------------------#
        # Apply jetID and PUID
        # ------------------------------------------------------------#

        pass_jet_id = jet_id(jets, self.parameters, self.year)
        pass_jet_puid = jet_puid(jets, self.parameters, self.year)

        # Jet PUID scale factors
        # if is_mc and False:  # disable for now
        #     puid_weight = puid_weights(
        #         self.evaluator, self.year, jets, pt_name,
        #         jet_puid_opt, jet_puid, numevents
        #     )
        #     weights.add_weight('puid_wgt', puid_weight)

        # ------------------------------------------------------------#
        # Select jets
        # ------------------------------------------------------------#
        jets["clean"] = clean

        jet_selection = (pass_jet_id
                         & pass_jet_puid
                         & (jets.qgl > -2)
                         & jets.clean
                         & (jets.pt > self.parameters["jet_pt_cut"])
                         & (abs(jets.eta) < self.parameters["jet_eta_cut"]))

        jets = jets[jet_selection]

        # if self.timer:
        #     self.timer.add_checkpoint("Selected jets")

        # ------------------------------------------------------------#
        # Fill jet-related variables
        # ------------------------------------------------------------#
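        # After ak.to_pandas the jets frame is indexed by (entry, subentry), so the number
        # of unique subentries per entry is the per-event jet multiplicity.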

        njets = jets.reset_index().groupby("entry")["subentry"].nunique()
        variables["njets"] = njets

        # one_jet = (njets > 0)
        two_jets = njets > 1

        # Sort jets by pT and reset their numbering in an event
        jets = jets.sort_values(["entry", "pt"], ascending=[True, False])
        jets.index = pd.MultiIndex.from_arrays(
            [jets.index.get_level_values(0),
             jets.groupby(level=0).cumcount()],
            names=["entry", "subentry"],
        )

        # Select two jets with highest pT
        try:
            jet1 = jets.loc[pd.IndexSlice[:, 0], :]
            jet2 = jets.loc[pd.IndexSlice[:, 1], :]
            jet1.index = jet1.index.droplevel("subentry")
            jet2.index = jet2.index.droplevel("subentry")
        except Exception:
            return

        fill_jets(output, variables, jet1, jet2)

        # if self.timer:
        #     self.timer.add_checkpoint("Filled jet variables")

        # ------------------------------------------------------------#
        # Fill soft activity jet variables
        # ------------------------------------------------------------#

        # Effect of changes in jet acceptance should be negligible,
        # no need to calculate this for each jet pT variation
        if variation == "nominal":
            fill_softjets(df, output, variables, 2)
            fill_softjets(df, output, variables, 5)

            # if self.timer:
            #     self.timer.add_checkpoint("Calculated SA variables")

        # ------------------------------------------------------------#
        # Apply remaining cuts
        # ------------------------------------------------------------#

        # Cut has to be defined here because we will use it in
        # b-tag weights calculation
        vbf_cut = (variables.jj_mass > 400) & (variables.jj_dEta >
                                               2.5) & (jet1.pt > 35)

        # ------------------------------------------------------------#
        # Calculate QGL weights, btag SF and apply btag veto
        # ------------------------------------------------------------#

        if is_mc and variation == "nominal":
            # --- QGL weights --- #
            isHerwig = "herwig" in dataset

            qgl_wgts = qgl_weights(jet1, jet2, isHerwig, output, variables,
                                   njets)
            weights.add_weight("qgl_wgt", qgl_wgts, how="all")

            # --- Btag weights --- #
            bjet_sel_mask = output.event_selection & two_jets & vbf_cut

            btag_wgt, btag_syst = btag_weights(self, self.btag_lookup,
                                               self.btag_systs, jets, weights,
                                               bjet_sel_mask)
            weights.add_weight("btag_wgt", btag_wgt)

            # --- Btag weights variations --- #
            for name, bs in btag_syst.items():
                weights.add_weight(f"btag_wgt_{name}", bs, how="only_vars")

            # if self.timer:
            #     self.timer.add_checkpoint(
            #         "Applied QGL and B-tag weights"
            #     )

        # Separate from ttH and VH phase space
        variables["nBtagLoose"] = (
            jets[(jets.btagDeepB > self.parameters["btag_loose_wp"])
                 & (abs(jets.eta) < 2.5)].reset_index().groupby(
                     "entry")["subentry"].nunique())

        variables["nBtagMedium"] = (
            jets[(jets.btagDeepB > self.parameters["btag_medium_wp"])
                 & (abs(jets.eta) < 2.5)].reset_index().groupby(
                     "entry")["subentry"].nunique())
        variables.nBtagLoose = variables.nBtagLoose.fillna(0.0)
        variables.nBtagMedium = variables.nBtagMedium.fillna(0.0)

        variables.selection = (output.event_selection
                               & (variables.nBtagLoose < 2)
                               & (variables.nBtagMedium < 1))

        # --------------------------------------------------------------#
        # Fill outputs
        # --------------------------------------------------------------#

        variables.update({"wgt_nominal": weights.get_weight("nominal")})

        # All variables are affected by jet pT because of jet selections:
        # a jet may or may not be selected depending on pT variation.

        for key, val in variables.items():
            output.loc[:, pd.IndexSlice[key, variation]] = val

        return output
Esempio n. 26
0
def main(args):

    # Read nano, micro, EB or EE cuts
    nanoaod_arr = ak.from_parquet(args.nano_input_dir)
    print("Read nanoaod: {}".format(nanoaod_arr.type))
    
    microaod_arr = uproot.concatenate(
        ["{}/*.root:diphotonDumper/trees/ggH_125_13TeV_All_$SYST".format(args.micro_input_dir)]
        )
    print("Read microaod: {}".format(microaod_arr.type))
    # Work around a misnamed field in flashgg ("lead_ch_iso_worst__uncorr" has a double underscore)
    if "lead_ch_iso_worst__uncorr" in microaod_arr.fields:
        microaod_arr["lead_ch_iso_worst_uncorr"] = microaod_arr["lead_ch_iso_worst__uncorr"]

    if args.sd == "EB":
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.lead_eta) < 1.5]
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.sublead_eta) < 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.lead_eta) < 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.sublead_eta) < 1.5]

    if args.sd == "EE":
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.lead_eta) > 1.5]
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.sublead_eta) > 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.lead_eta) > 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.sublead_eta) > 1.5]

    # Read catalogue of variables to be plotted
    with open("plots_specs.json", "r") as f:
        columns = json.load(f)

    # Create dict where keys are names of variables in nano and values are names of variables in micro
    nano_micro_names = {var["nano_col"]: var["micro_col"] for var in columns}
    nano_micro_names["event"] = "event"
    nano_micro_names["lumi"] = "lumi"

    # Event by event
    nano_dict = {k: nanoaod_arr[k] for k in nano_micro_names.keys()}
    nano_dict["lead_fixedGridRhoAll"] = nanoaod_arr["lead_fixedGridRhoAll"] # needed for XGBoost vs TMVA
    test_nano = ak.Array(nano_dict)

    test_micro = microaod_arr[nano_micro_names.values()]

    pd_nano = ak.to_pandas(test_nano)
    pd_micro = ak.to_pandas(test_micro)

    pd_nano = pd_nano.set_index(["event", "lumi"])
    pd_micro = pd_micro.set_index(["event", "lumi"])

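    # Join nano and micro rows describing the same event: both frames are indexed by
    # (event, lumi), and columns with identical names receive the _nano/_micro suffixes.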
    pd_joined = pd_nano.join(pd_micro, lsuffix="_nano", rsuffix="_micro")

    print("Joined dataframe:\n{}".format(pd_joined))

    # Remove NaN values (the event/lumi index columns are skipped)
    for nano_name, micro_name in nano_micro_names.items():
        if nano_name in ["event", "lumi"]:
            continue
        if nano_name == micro_name:
            nano_name += "_nano"
            micro_name += "_micro"
        pd_joined = pd_joined[pd_joined[nano_name].notna()]
        pd_joined = pd_joined[pd_joined[micro_name].notna()]

    # Cut over delta R
    # Here https://github.com/CoffeaTeam/coffea/blob/3db3fab23064c70d0ca63b185d51c7fa3b7849dc/coffea/nanoevents/methods/vector.py#L74
    # useful info
    deltaR_threshold = 0.1

    four_lead_nano = vector.obj(
        pt=pd_joined["lead_pt"],
        phi=pd_joined["lead_phi_nano"],
        eta=pd_joined["lead_eta_nano"],
        E=pd_joined["lead_energyRaw"]
    )

    four_sublead_nano = vector.obj(
        pt=pd_joined["sublead_pt"],
        phi=pd_joined["sublead_phi_nano"],
        eta=pd_joined["sublead_eta_nano"],
        E=pd_joined["sublead_energyRaw"]
    )

    pd_joined["deltaR_nano"] = four_lead_nano.deltaR(four_sublead_nano)

    four_lead_micro = vector.obj(
        pt=pd_joined["leadPt"],
        phi=pd_joined["lead_phi_micro"],
        eta=pd_joined["lead_eta_micro"],
        E=pd_joined["lead_SCRawE"]
    )

    four_sublead_micro = vector.obj(
        pt=pd_joined["subleadPt"],
        phi=pd_joined["sublead_phi_micro"],
        eta=pd_joined["sublead_eta_micro"],
        E=pd_joined["sublead_SCRawE"]
    )

    pd_joined["lead_deltaR"] = four_lead_nano.deltaR(four_lead_micro)
    pd_joined["sublead_deltaR"] = four_sublead_nano.deltaR(four_sublead_micro)
    pd_joined = pd_joined[pd_joined["lead_deltaR"] < deltaR_threshold]
    pd_joined = pd_joined[pd_joined["sublead_deltaR"] < deltaR_threshold]
    print("Final joined dataframe:\n{}".format(pd_joined))

    # Plot
    print("Start plotting")
    for column in columns:
        fig, (up, middle, down) = plt.subplots(
            nrows=3,
            ncols=1,
            gridspec_kw={"height_ratios": (2, 1, 1)}
            )

        nano_name = column["nano_col"]
        micro_name = column["micro_col"]

        if nano_name == micro_name:
            nano_name += "_nano"
            micro_name += "_micro"
        
        range = column["range"]

        # Up
        n, n_, n__ = up.hist(pd_joined[nano_name], bins=column["bins"], range=range, histtype="step", label="NanoAOD", linewidth=2)
        m, m_, m__ = up.hist(pd_joined[micro_name], bins=column["bins"], range=range, histtype="step", label="MicroAOD", linewidth=2)

        up.legend(fontsize=18, loc="upper right")
        up.set_xlim(range)
        up.set_xlabel(column["var"])
        up.set_ylabel("Events")
        if "log" in column:
            up.set_yscale("log")
        
        # Middle
        ylim = [0, 2]
        middle.set_ylim(ylim)
        #middle.axhline(1, xmin=range[0], xmax=range[1], color="black", alpha=0.6)
        centers = (n_[:-1] + n_[1:]) / 2
        middle.plot(centers, n / m, "k.")
        middle.set_xlim(range)
        middle.set_xlabel(column["var"])
        middle.set_ylabel(r"$n/\mu$")
        middle.grid(which="both")

        # Down
        perc_range = (-300, 300)
        perc_bins = 500
        down.hist(100 * (pd_joined[nano_name] - pd_joined[micro_name]) / pd_joined[micro_name], 
                  bins=perc_bins,
                  range=perc_range,
                  histtype="step",
                  density=True,
                  color="black",
                  linewidth=2)
        #down.set_yscale("log")
        down.set_xlabel(r"$(n_{ev} - \mu_{ev})/\mu_{ev}$ [%]")
        down.set_ylabel("Events / {}%".format((perc_range[1] - perc_range[0]) / perc_bins))

        print(column["nano_col"])
        print("nano: {}".format(np.sum(n)))
        print("micro: {}".format(np.sum(m)))
        print("diff = {}".format(abs(np.sum(n) - np.sum(m))))
        print("rel diff = {}%\n".format(100 * abs(np.sum(n) - np.sum(m)) / max(np.sum(n), np.sum(m))))

        fig.tight_layout()

        fig.savefig("{}/{}_{}.png".format(args.output_dir, column["nano_col"], args.sd), bbox_inches='tight')
        fig.savefig("{}/{}_{}.pdf".format(args.output_dir, column["nano_col"], args.sd), bbox_inches='tight')

        plt.close(fig)

    # Dump pandas dataframe to parquet file
    pd_joined.to_parquet("nano_micro_{}.parquet".format(args.sd), engine="fastparquet")
    print("Dumped dataframe to parquet file")

    # Redundant: dump separate dataframes for nano and micro with PhotonID inputs
    nano_vars = {
        "r9": "lead_r9_nano", 
        "s4": "lead_s4_nano",
        "sieie": "lead_sieie_nano",
        "etaWidth": "lead_etaWidth",
        "phiWidth": "lead_phiWidth",
        "sieip": "lead_sieip_nano",
        "pfPhoIso03": "lead_pfPhoIso03",
        "pfChargedIsoPFPV": "lead_pfChargedIsoPFPV",
        "pfChargedIsoWorstVtx": "lead_pfChargedIsoWorstVtx",

        "mva_ID": "lead_mvaID_recomputed"
        }

    micro_vars = {
        "r9": "lead_r9_micro", 
        "s4": "lead_s4_micro",
        "sieie": "lead_sieie_micro",
        "etaWidth": "lead_eta_width",
        "phiWidth": "lead_phi_width",
        "sieip": "lead_sieip_micro",
        "pfPhoIso03": "lead_pho_iso",
        "pfChargedIsoPFPV": "lead_ch_iso",
        "pfChargedIsoWorstVtx": "lead_ch_iso_worst",

        "mva_ID": "lead_mva"
        }

    nano_isos = {
        "pfPhoIso03": "lead_pfPhoIso03",
        "pfChargedIsoPFPV": "lead_pfChargedIsoPFPV",
        "pfChargedIsoWorstVtx": "lead_pfChargedIsoWorstVtx",
        "pfPhoIso03_uncorr": "lead_uncorr_pfPhoIso03",
        "pfChargedIsoPFPV_uncorr": "lead_uncorr_pfChargedIsoPFPV",
        "pfChargedIsoWorstVtx_uncorr": "lead_uncorr_pfChargedIsoWorstVtx",
        }

    micro_isos = {
        "pfPhoIso03": "lead_pho_iso",
        "pfChargedIsoPFPV": "lead_ch_iso",
        "pfChargedIsoWorstVtx": "lead_ch_iso_worst",
        "pfPhoIso03_uncorr": "lead_pho_iso_uncorr",
        "pfChargedIsoPFPV_uncorr": "lead_ch_iso_uncorr",
        "pfChargedIsoWorstVtx_uncorr": "lead_ch_iso_worst_uncorr",
       }

    nano_df = pd_joined[list(nano_vars.values())]
    nano_df.rename(columns=dict((v, k) for k, v in nano_vars.items()), inplace=True)
    nano_df.to_parquet("nano_{}.parquet".format(args.sd), engine="fastparquet")
    print("Dumped nano dataframe to parquet file")

    micro_df = pd_joined[list(micro_vars.values())]
    micro_df.rename(columns=dict((v, k) for k, v in micro_vars.items()), inplace=True)
    micro_df.to_parquet("micro_{}.parquet".format(args.sd), engine="fastparquet")
    print("Dumped micro dataframe to parquet file")

    nano_df = pd_joined[list(nano_isos.values())]
    nano_df.rename(columns=dict((v, k) for k, v in nano_isos.items()), inplace=True)
    nano_df.to_parquet("nano_{}_isos.parquet".format(args.sd), engine="fastparquet")
    print("Dumped nano dataframe for isos to parquet file")

    micro_df = pd_joined[list(micro_isos.values())]
    micro_df.rename(columns=dict((v, k) for k, v in micro_isos.items()), inplace=True)
    micro_df.to_parquet("micro_{}_isos.parquet".format(args.sd), engine="fastparquet")
    print("Dumped micro dataframe for isos to parquet file")
Esempio n. 27
0
def setupPionData(root_file_dict,branches=[], layers=[], cluster_tree='ClusterTree', 
                  balance_data=True, n_max=-1, 
                  cut_distributions=[], cut_values=[], cut_types=[],
                  match_distribution='', match_binning=(), match_log=False,
                  verbose=False, load=False, save=False, filename='', return_indices=False):

    pdata = {}
    pcells = {}
    keys = list(root_file_dict.keys())
    rng = np.random.default_rng()

    pdata_filename = filename + '_frame.h5'
    pcell_filename = filename + '_images.h5'
    selec_filename = filename + '_selections.h5'

    if(load and pathlib.Path(pdata_filename).exists() and pathlib.Path(pcell_filename).exists()):
        
        if(verbose): print('Loading pandas DataFrame and calo images from {} and {}.'.format(pdata_filename,pcell_filename))
        # Load the DataFrame and images from disk.
        pdata = {
            key: pd.read_hdf(pdata_filename,key=key)
            for key in keys
        }
        
        hf = h5.File(pcell_filename,'r')
        for key in keys:
            pcells[key] = {}
            for layer in layers:
                pcells[key][layer] = hf['{}:{}'.format(key,layer)][:]
        hf.close()
        
        if(return_indices): # TODO: Rework this a little!
            hf = h5.File(selec_filename,'r')
            indices = {key: hf[key][:] for key in keys}
            hf.close()
            
    else:
        
        # root_file_dict entries might be glob-style strings, or lists of files. We should consider both possibilities.
        arrays = {}
        for key,root_files in root_file_dict.items():
            if(type(root_files) == list):
                arrays[key] = ur.lazy([':'.join((x,cluster_tree)) for x in root_files], filter_branch=lambda x: x.name in branches)
            else:
                arrays[key] = ur.lazy(':'.join((root_files, cluster_tree)), filter_branch=lambda x: x.name in branches)

        indices = ApplyCuts(arrays, cut_distributions, cut_values, cut_types, verbose)
                                
        # Filter out clusters so that our data series match in their distribution of a user-supplied variable.
        if(match_distribution != ''):
            if(match_distribution in branches and len(match_binning) == 3):
                if(verbose): print('Matching data series on distribution: {}.'.format(match_distribution))
                                                
                binning = np.linspace(match_binning[1],match_binning[2],match_binning[0]+1)
                n_bins = len(binning) - 1
                distributions = {
                    key: np.histogram(arrays[key][match_distribution][indices[key]].to_numpy(), bins=binning)[0] # only keep bin counts
                    for key in keys
                }
                
                # Now determine how many clusters we keep in each bin, for each key.
                n_keep = np.zeros(n_bins,dtype=np.dtype('i8'))
                for i in range(n_bins):
                    n_keep[i] = int(np.min([x[i] for x in distributions.values()]))
                    
                # Now we need to throw out some clusters -- in other words, only keep some.
                # We will randomly choose which ones we keep, for each match_distribution bin,
                # for each data series (key).
                for key in keys:
                    sorted_indices = indices[key][np.argsort(arrays[key][match_distribution][indices[key]])]
                    keep_indices = []
                    bin_idx_edges = np.insert(np.cumsum(distributions[key]),0,0)
                    for i in range(n_bins):
                        index_block = sorted_indices[bin_idx_edges[i]:bin_idx_edges[i+1]] # all indices corresponding to the i'th bin of match_distribution, for this key
                        keep_indices.append(rng.choice(index_block, n_keep[i], replace=False))
                    n_before = len(indices[key])
                    indices[key] = np.hstack(keep_indices)
                    n_after = len(indices[key])
                    #if(verbose): print('\t{}, number of events: {} -> {}'.format(key, n_before, n_after))
                                    
            else: print('Warning: Requested matching of distribution \"{}\" but this variable is not among the branches you selected from the data. Skipping this step.'.format(match_distribution))            
            
        # Balance data so we have equal amounts of each category.
        # Note that if we did the matching above, we can potentially skip this as
        # balancing was implicitly done. However, we might want to take the opportunity
        # to further slim down our dataset.
        if(balance_data):
            n_max_tmp = np.min([len(x) for x in indices.values()])
            if(n_max > 0): n_max = np.minimum(n_max_tmp, n_max)
            else: n_max = n_max_tmp
            
            if(verbose): print('Balancing data: {} events per category.'.format(n_max))
            indices = {key:rng.choice(val, n_max, replace=False) for key,val in indices.items()}

        # Make a boolean mask from the indices. This speeds things up below, as opposed to passing (unsorted) lists of indices.
        for key in indices.keys():
            msk = np.zeros(len(arrays[key]),dtype=bool)
            msk[indices[key]] = True
            indices[key] = msk
    
        # Now, apply our selection indices to the arrays.
        arrays = {
            key:arrays[key][indices[key]]
            for key in keys
        }
        
        # Make the dataframes from the arrays.
        if(verbose): print('Preparing pandas DataFrame.')
        pdata = {
            key: ak.to_pandas(arrays[key][branches])
            for key in keys
        }
    
        # Re-make the arrays with just our layer info (using our selection indices again).
        arrays = {}
        for key,root_files in root_file_dict.items():
            if(type(root_files) == list):
                arrays[key] = ur.lazy([':'.join((x,cluster_tree)) for x in root_files], filter_branch=lambda x: x.name in layers)[indices[key]]
            else:
                arrays[key] = ur.lazy(':'.join((root_files, cluster_tree)), filter_branch=lambda x: x.name in layers)[indices[key]]

        
        # Make our calorimeter images.
        nentries = len(keys) * len(layers)
        i = 0
        if(verbose): qu.printProgressBarColor (i, nentries, prefix='Preparing calorimeter images.', suffix='% Complete', length=40)

        pcells = {}
        for key in keys:
            pcells[key] = {}
            for layer in layers:
                pcells[key][layer] = setupCells_new(arrays[key],layer)
                i+=1
                if(verbose): qu.printProgressBarColor (i, nentries, prefix='Preparing calorimeter images.', suffix='% Complete', length=40)
        
        # Save the dataframes and calorimeter images in HDF5 format for easy access next time.
        if(filename != '' and save):
            if(verbose): print('Saving DataFrames to {}.'.format(pdata_filename))
            for key,frame in pdata.items():
                frame.to_hdf(pdata_filename, key=key, mode='a',complevel=6)   
                
            if(verbose): print('Saving calorimeter images to {}.'.format(pcell_filename))
                
            hf = h5.File(pcell_filename, 'w')
            for key in pcells.keys():
                for layer in layers:
                    dset = hf.create_dataset('{}:{}'.format(key,layer), data=pcells[key][layer], chunks=True, compression='gzip', compression_opts=7)
            hf.close()
            
    # One may optionally also save the selected event indices. This can be useful if referring back to the original data source.
    if(return_indices):
        # Save the indices to a file.
        hf = h5.File(selec_filename, 'w')
        for key in indices.keys():
            dset = hf.create_dataset(key, data=indices[key], chunks=True, compression='gzip', compression_opts=7)
        hf.close()
        return pdata, pcells, indices # return indices
    return pdata, pcells # don't return indices
Esempio n. 28
0
    def build_dataframe(
            self,
            data_path: str,
            TTree_name: str,
            tree_dict: Dict[str, Set[str]],
            is_truth: bool,
            is_reco: bool,
            chunksize: int = 1024,
            validate_missing_events: bool = True,
            validate_duplicated_events: bool = True,
            validate_sumofweights: bool = True,
    ) -> pd.DataFrame:
        """
         Builds a dataframe

        :param data_path: path to ROOT datafile(s)
        :param TTree_name: TTree in datapath to set as default tree
        :param tree_dict: dictionary of tree: variables to extract from Datapath
        :param is_truth: whether dataset contains truth data
        :param is_reco: whether dataset contains reco data
        :param chunksize: chunksize for uproot concat method
        :param validate_missing_events: whether to check for missing events
        :param validate_duplicated_events: whether to check for duplicated events
        :param validate_sumofweights: whether to check sum of weights against weight_mc
        :return: output dataframe containing columns corresponding to necessary variables
        """
        self.logger.info(f"Building DataFrame from {data_path} ({file_utils.n_files(data_path)} file(s))...")

        # is the default tree a truth tree?
        default_tree_truth = 'truth' in TTree_name

        t1 = time.time()
        self.logger.debug(f"Extracting {tree_dict[TTree_name]} from {TTree_name} tree...")
        df = to_pandas(uproot.concatenate(data_path + ':' + TTree_name, tree_dict[TTree_name],
                                          num_workers=config.n_threads, begin_chunk_size=chunksize))
        self.logger.debug(f"Extracted {len(df)} events.")

        self.logger.debug(f"Extracting ['totalEventsWeighted', 'dsid'] from 'sumWeights' tree...")
        sumw = to_pandas(uproot.concatenate(data_path + ':sumWeights', ['totalEventsWeighted', 'dsid'],
                                            num_workers=config.n_threads, begin_chunk_size=chunksize))

        self.logger.debug(f"Calculating sum of weights and merging...")
        sumw = sumw.groupby('dsid').sum()
        df = pd.merge(df, sumw, left_on='mcChannelNumber', right_on='dsid', sort=False, copy=False)

        df.set_index(['mcChannelNumber', 'eventNumber'], inplace=True)
        df.index.names = ['DSID', 'eventNumber']
        self.logger.debug("Set DSID/eventNumber as index")

        # merge TTrees
        if validate_duplicated_events:
            validation = '1:1'
            self.logger.info(f"Validating duplicated events in tree {TTree_name}...")
            self.__drop_duplicates(df)
            self.__drop_duplicate_event_numbers(df)
        else:
            validation = 'm:m'
            self.logger.info("Skipping duplicated events validation")

        # iterate over TTrees and merge
        for tree in tree_dict:
            if tree == TTree_name:
                continue

            self.logger.debug(f"Extracting {tree_dict[tree]} from {tree} tree...")
            alt_df = to_pandas(uproot.concatenate(data_path + ":" + tree, tree_dict[tree],
                                                  num_workers=config.n_threads, begin_chunk_size=chunksize))
            self.logger.debug(f"Extracted {len(alt_df)} events.")

            alt_df.set_index(['mcChannelNumber', 'eventNumber'], inplace=True)
            alt_df.index.names = ['DSID', 'eventNumber']
            self.logger.debug("Set DSID/eventNumber as index")

            if validate_missing_events:
                self.logger.info(f"Checking for missing events in tree '{tree}'..")
                tree_is_truth = 'truth' in tree

                if tree_is_truth and not default_tree_truth:
                    if n_missing := len(df.index.difference(alt_df.index)):
                        raise Exception(
                            f"Found {n_missing} events in '{TTree_name}' tree not found in '{tree}' tree")
                    else:
                        self.logger.debug(f"All events in {TTree_name} tree found in {tree} tree")
                elif default_tree_truth and not tree_is_truth:
                    if n_missing := len(alt_df.index.difference(df.index)):
                        raise Exception(
                            f"Found {n_missing} events in '{tree}' tree not found in '{TTree_name}' tree")
                    else:
                        self.logger.debug(f"All events in {tree} tree found in {TTree_name} tree")
                else:
                    self.logger.info("Skipping missing events check: not a truth/reco tree combination")
Esempio n. 29
0
#     "pgf.texsystem": "lualatex",
#     "pgf.rcfonts": False,
#     "font.family": "serif",
#     "font.serif": [],
#     "font.sans-serif": [],
#     "font.monospace": [],
#     # "figure.figsize": [default_width, default_width * default_ratio],
#     "pgf.preamble": "\\usepackage{mymplsetup}"
# })
# plt.rcParams = plt.rcParamsDefault
plt.rcParams = plt.rcParamsDefault
mpl.rcParams.update({"font.size": 16})
# %%
rf = uproot.open("output/DetIdLUT.root")
arr = rf["analyzer/tree"].arrays()
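# arr[0] is the first (and presumably only) LUT entry; ak.to_pandas explodes its per-cell
# list fields into one row per cell (an assumption about the DetIdLUT tree layout).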
keydf = ak.to_pandas(arr[0])
keydf = keydf.set_index("globalid")
keydf.head()
# %%
# Debug code to see if the arrays are filled correctly
index = [
    "globalid",
    "detectorid",
    "subdetid",
    "layerid",
    "waferortileid.first",
    "waferortileid.second",
    "cellid.first",
    "cellid.second",
    "x",
    "y",
        return output

    def postprocess(self, accumulator):
        return accumulator
import uproot
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
import pandas as pd
class HackSchema(NanoAODSchema):
    def __init__(self, base_form):
        base_form["contents"].pop("Muon_fsrPhotonIdx", None)
        base_form["contents"].pop("Electron_photonIdx", None)
        super().__init__(base_form)
print(args.file)
f=args.file
files = {"TTBAR":[args.file]}
result = processor.run_uproot_job(
    files,
    treename="Events",
    processor_instance=MyProcessor(),
    executor=processor.iterative_executor,
    executor_args={'schema':HackSchema},
    chunksize=10000
)

l=args.loc
keys=result.keys()
finaldict={}
for key in keys:
    finaldict[key]=result[key].value
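# ak.zip packs the accumulator values into a single record array (assuming they all have
# the same length), so ak.to_pandas yields one DataFrame column per key.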
df=ak.to_pandas(ak.zip(finaldict))
df.to_csv("%s/%sDatasetWithTruths.csv"%(l,f[0:-5]))