def _recursive_zip(self, forms, hierarchy, key, final_zip=False): # print(key,'\n') for k, v in hierarchy.items(): if isinstance(v, collections.abc.Mapping): # print('if:', k,v) forms[k] = self._recursive_zip(forms.get(k, {}), hierarchy.get(k, {}), k, True) # form = zip_forms(form,"events",record_name=None) else: name = self.mixins[k] if k in self.mixins.keys() else None # print('else:', k,v,name) forms[k] = zip_forms(forms[k], k, record_name=name) if final_zip: forms = zip_forms(forms, key, record_name=None) return forms
def _build_collections(self, branch_forms): zip_groups = defaultdict(list) has_eventindex = defaultdict(bool) for key, ak_form in branch_forms.items(): # Normal fields key_fields = key.split("/")[-1].split(".") top_key = key_fields[0] sub_key = ".".join(key_fields[1:]) objname = top_key.replace("Analysis", "").replace("AuxDyn", "") # temporary hack to have the correct type for the ElementLinks # (uproot loses the type information somewhere on the way and they end up int64) if self._hack_for_elementlink_int64: try: for k in ["m_persIndex", "m_persKey"]: form = ak_form["content"]["content"]["contents"][k] form["itemsize"] = 8 form["primitive"] = "int64" except KeyError: pass zip_groups[objname].append(((key, sub_key), ak_form)) # add eventindex form, based on the first single-jagged list column if (not has_eventindex[objname] and "List" in ak_form["class"] and "List" not in ak_form["content"]["class"]): zip_groups[objname].append( ((key, "_eventindex"), self._create_eventindex_form(ak_form, key))) has_eventindex[objname] = True # zip the forms contents = {} for objname, keys_and_form in zip_groups.items(): try: contents[objname] = zip_forms( {sub_key: form for (key, sub_key), form in keys_and_form}, objname, self.mixins.get(objname, None), ) content = contents[objname]["content"] content["parameters"] = dict(content.get("parameters", {}), collection_name=objname) except NotImplementedError: warnings.warn(f"Can't zip collection {objname}") return contents
def _tlorentz_vectorize(objname, form): # first handle RecordArray if {"fE", "fP"} == form.get("contents", {}).keys(): return zip_forms( { "x": form["contents"]["fP"]["contents"]["fX"], "y": form["contents"]["fP"]["contents"]["fY"], "z": form["contents"]["fP"]["contents"]["fZ"], "t": form["contents"]["fE"], }, objname, "LorentzVector", ) # If there's no "content", like a NumpyArray, just return. # Note: this comes after checking for RecordArray. if "content" not in form: return form # Then recursively go through and update the form's content. form["content"] = _tlorentz_vectorize(objname, form["content"]) return form
def _build_collections(self, branch_forms): zip_groups = defaultdict(list) has_eventindex = defaultdict(bool) for key, ak_form in branch_forms.items(): # Normal fields key_fields = key.split("/")[-1].split(".") top_key = key_fields[0] sub_key = ".".join(key_fields[1:]) objname = top_key.replace("Analysis", "").replace("AuxDyn", "") zip_groups[objname].append(((key, sub_key), ak_form)) # add eventindex form, based on the first single-jagged list column if (not has_eventindex[objname] and "List" in ak_form["class"] and "List" not in ak_form["content"]["class"]): zip_groups[objname].append( ((key, "_eventindex"), self._create_eventindex_form(ak_form, key))) has_eventindex[objname] = True # zip the forms contents = {} for objname, keys_and_form in zip_groups.items(): try: contents[objname] = zip_forms( {sub_key: form for (key, sub_key), form in keys_and_form}, objname, self.mixins.get(objname, None), bypass=True, ) content = contents[objname]["content"] content["parameters"] = dict(content.get("parameters", {}), collection_name=objname) except NotImplementedError: warnings.warn(f"Can't zip collection {objname}") return contents
def _build_collections(self, branch_forms): def _tlorentz_vectorize(objname, form): # first handle RecordArray if {"fE", "fP"} == form.get("contents", {}).keys(): return zip_forms( { "x": form["contents"]["fP"]["contents"]["fX"], "y": form["contents"]["fP"]["contents"]["fY"], "z": form["contents"]["fP"]["contents"]["fZ"], "t": form["contents"]["fE"], }, objname, "LorentzVector", ) # If there's no "content", like a NumpyArray, just return. # Note: this comes after checking for RecordArray. if "content" not in form: return form # Then recursively go through and update the form's content. form["content"] = _tlorentz_vectorize(objname, form["content"]) return form # preprocess lorentz vectors properly (and recursively) for objname, form in branch_forms.items(): branch_forms[objname] = _tlorentz_vectorize(objname, form) # parse into high-level records (collections, list collections, and singletons) collections = set(k.split("/")[0] for k in branch_forms) collections -= set(k for k in collections if k.endswith("_size")) # Create offsets virtual arrays for name in collections: if f"{name}_size" in branch_forms: branch_forms[f"o{name}"] = transforms.counts2offsets_form( branch_forms[f"{name}_size"]) output = {} for name in collections: output[f"{name}.offsets"] = branch_forms[f"o{name}"] mixin = self.mixins.get(name, "NanoCollection") # Every delphes collection is a list offsets = branch_forms["o" + name] content = { k[2 * len(name) + 2:]: branch_forms[k] for k in branch_forms if k.startswith(name + "/" + name) } output[name] = zip_forms(content, name, record_name=mixin, offsets=offsets) # update docstrings as needed # NB: must be before flattening for easier logic for parameter in output[name]["content"]["contents"].keys(): if "parameters" not in output[name]["content"]["contents"][ parameter]: continue output[name]["content"]["contents"][parameter]["parameters"][ "__doc__"] = self.docstrings.get( parameter, output[name]["content"]["contents"][parameter] ["parameters"].get("__doc__", "no docstring available"), ) # handle branches named like [4] and [5] output[name]["content"]["contents"] = { k.replace("[", "_").replace("]", ""): v for k, v in output[name]["content"]["contents"].items() } output[name]["content"]["parameters"].update({ "__doc__": offsets["parameters"]["__doc__"], "collection_name": name, }) if name in self.singletons: # flatten! this 'promotes' the content of an inner dimension # upwards, effectively hiding one nested dimension output[name] = output[name]["content"] return output
def _build_collections(self, branch_forms): # Turn any special classes into the appropriate awkward form composite_objects = list(set(k.split("/")[0] for k in branch_forms if "/" in k)) composite_behavior = { # Dictionary for overriding the default behavior "Tracks": "LorentzVector" } for objname in composite_objects: # grab the * from "objname/objname.*" components = set( k[2 * len(objname) + 2 :] for k in branch_forms if k.startswith(objname + "/") ) if components == { "fCoordinates.fPt", "fCoordinates.fEta", "fCoordinates.fPhi", "fCoordinates.fE", }: form = zip_forms( { "pt": branch_forms.pop(f"{objname}/{objname}.fCoordinates.fPt"), "eta": branch_forms.pop( f"{objname}/{objname}.fCoordinates.fEta" ), "phi": branch_forms.pop( f"{objname}/{objname}.fCoordinates.fPhi" ), "energy": branch_forms.pop( f"{objname}/{objname}.fCoordinates.fE" ), }, objname, composite_behavior.get(objname, "PtEtaPhiELorentzVector"), ) branch_forms[objname] = form elif components == { "fCoordinates.fX", "fCoordinates.fY", "fCoordinates.fZ", }: form = zip_forms( { "x": branch_forms.pop(f"{objname}/{objname}.fCoordinates.fX"), "y": branch_forms.pop(f"{objname}/{objname}.fCoordinates.fY"), "z": branch_forms.pop(f"{objname}/{objname}.fCoordinates.fZ"), }, objname, composite_behavior.get(objname, "ThreeVector"), ) branch_forms[objname] = form else: raise ValueError( f"Unrecognized class with split branches: {components}" ) # Generating collection from branch name collections = [k for k in branch_forms if "_" in k] collections = set( [ "_".join(k.split("_")[:-1]) for k in collections if k.split("_")[-1] != "AK8" # Excluding per-event variables with AK8 variants like Mjj and MT ] ) subcollections = [] for cname in collections: items = sorted(k for k in branch_forms if k.startswith(cname + "_")) if len(items) == 0: continue # Special pattern parsing for <collection>_<subcollection>Counts branches countitems = [x for x in items if x.endswith("Counts")] subcols = set(x[:-6] for x in countitems) # List of subcollection names for subcol in subcols: items = [ k for k in items if not k.startswith(subcol) or k.endswith("Counts") ] subname = subcol[len(cname) + 1 :] subcollections.append( { "colname": cname, "subcol": subcol, "countname": subname + "Counts", "subname": subname, } ) if cname not in branch_forms: collection = zip_forms( {k[len(cname) + 1]: branch_forms.pop(k) for k in items}, cname ) branch_forms[cname] = collection else: collection = branch_forms[cname] if not collection["class"].startswith("ListOffsetArray"): raise NotImplementedError( f"{cname} isn't a jagged array, not sure what to do" ) for item in items: itemname = item[len(cname) + 1 :] collection["content"]["contents"][itemname] = branch_forms.pop( item )["content"] for sub in subcollections: nest_jagged_forms( branch_forms[sub["colname"]], branch_forms.pop(sub["subcol"]), sub["countname"], sub["subname"], ) return branch_forms
def _build_collections(self, branch_forms): # parse into high-level records (collections, list collections, and singletons) collections = set(k.split("_")[0] for k in branch_forms) collections -= set( k for k in collections if k.startswith("n") and k[1:] in collections ) isData = "GenPart" not in collections # Create offsets virtual arrays for name in collections: if "n" + name in branch_forms: branch_forms["o" + name] = transforms.counts2offsets_form( branch_forms["n" + name] ) # Create global index virtual arrays for indirection for indexer, target in self.cross_references.items(): if target.startswith("Gen") and isData: continue if indexer not in branch_forms: if self.warn_missing_crossrefs: warnings.warn( f"Missing cross-reference index for {indexer} => {target}", RuntimeWarning, ) continue if "o" + target not in branch_forms: if self.warn_missing_crossrefs: warnings.warn( f"Missing cross-reference target for {indexer} => {target}", RuntimeWarning, ) continue branch_forms[indexer + "G"] = transforms.local2global_form( branch_forms[indexer], branch_forms["o" + target] ) # Create nested indexer from Idx1, Idx2, ... arrays for name, indexers in self.nested_items.items(): if all(idx in branch_forms for idx in indexers): branch_forms[name] = transforms.nestedindex_form( [branch_forms[idx] for idx in indexers] ) # Create nested indexer from n* counts arrays for name, (local_counts, target) in self.nested_index_items.items(): if local_counts in branch_forms and "o" + target in branch_forms: branch_forms[name] = transforms.counts2nestedindex_form( branch_forms[local_counts], branch_forms["o" + target] ) # Create any special arrays for name, (fcn, args) in self.special_items.items(): if all(k in branch_forms for k in args): branch_forms[name] = fcn(*(branch_forms[k] for k in args)) output = {} for name in collections: mixin = self.mixins.get(name, "NanoCollection") if "o" + name in branch_forms and name not in branch_forms: # list collection offsets = branch_forms["o" + name] content = { k[len(name) + 1 :]: branch_forms[k] for k in branch_forms if k.startswith(name + "_") } output[name] = zip_forms( content, name, record_name=mixin, offsets=offsets ) output[name]["content"]["parameters"].update( { "__doc__": offsets["parameters"]["__doc__"], "collection_name": name, } ) elif "o" + name in branch_forms: # list singleton, can use branch's own offsets output[name] = branch_forms[name] output[name]["parameters"].update( {"__array__": mixin, "collection_name": name} ) elif name in branch_forms: # singleton output[name] = branch_forms[name] else: # simple collection output[name] = zip_forms( { k[len(name) + 1 :]: branch_forms[k] for k in branch_forms if k.startswith(name + "_") }, name, record_name=mixin, ) output[name]["parameters"].update({"collection_name": name}) return output
def _build_collections(self, branch_forms): # Turn any special classes into the appropriate awkward form composite_objects = list( set(k.split("/")[0] for k in branch_forms if "/" in k)) for objname in composite_objects: # grab the * from "objname/objname.*" components = set(k[2 * len(objname) + 2:] for k in branch_forms if k.startswith(objname + "/")) if components == { "fCoordinates.fPt", "fCoordinates.fEta", "fCoordinates.fPhi", "fCoordinates.fE", }: form = zip_forms( { "pt": branch_forms.pop( f"{objname}/{objname}.fCoordinates.fPt"), "eta": branch_forms.pop( f"{objname}/{objname}.fCoordinates.fEta"), "phi": branch_forms.pop( f"{objname}/{objname}.fCoordinates.fPhi"), "energy": branch_forms.pop( f"{objname}/{objname}.fCoordinates.fE"), }, objname, "PtEtaPhiELorentzVector", ) branch_forms[objname] = form elif components == { "fCoordinates.fX", "fCoordinates.fY", "fCoordinates.fZ", }: form = zip_forms( { "x": branch_forms.pop( f"{objname}/{objname}.fCoordinates.fX"), "y": branch_forms.pop( f"{objname}/{objname}.fCoordinates.fY"), "z": branch_forms.pop( f"{objname}/{objname}.fCoordinates.fZ"), }, objname, "Point", ) branch_forms[objname] = form else: raise ValueError( f"Unrecognized class with split branches: {components}") # Generating collection from branch name collections = [k for k in branch_forms if "_" in k] collections = set([ "_".join(k.split("_")[:-1]) for k in collections if k.split("_")[-1] != "AK8" # Excluding per-event variables with AK8 variants like Mjj and MT ]) for cname in collections: items = sorted(k for k in branch_forms if k.startswith(cname + "_")) if len(items) == 0: continue if cname == "JetsAK8": items = [ k for k in items if not k.startswith("JetsAK8_subjets") ] items.append("JetsAK8_subjetsCounts") if cname == "JetsAK8_subjets": items = [k for k in items if not k.endswith("Counts")] if cname not in branch_forms: collection = zip_forms( {k[len(cname) + 1]: branch_forms.pop(k) for k in items}, cname) branch_forms[cname] = collection else: collection = branch_forms[cname] if not collection["class"].startswith("ListOffsetArray"): raise NotImplementedError( f"{cname} isn't a jagged array, not sure what to do") for item in items: itemname = item[len(cname) + 1:] collection["content"]["contents"][ itemname] = branch_forms.pop(item)["content"] nest_jagged_forms( branch_forms["JetsAK8"], branch_forms.pop("JetsAK8_subjets"), "subjetsCounts", "subjets", ) return branch_forms