def test_read_write_hdf(tmpdir, input_arr):
    tmp_file = tmpdir / "example.h5"
    # Write
    with h5py.File(str(tmp_file), "w") as hf:
        a = awkward.JaggedArray.fromiter(input_arr)
        ah5 = awkward.hdf5(hf)
        ah5["example"] = a
    # Read (newer h5py requires an explicit mode)
    with h5py.File(str(tmp_file), "r") as hf:
        ah5 = awkward.hdf5(hf)
        b = ah5["example"]
    assert a.tolist() == b.tolist()
def save(filename, item):
    typename = type(item).__name__
    if typename in vetoed_typenames:
        return False
    if typename == "DataFrame":
        item.to_hdf(filename, key="data")
        return True
    if typename in ["ndarray", "JaggedArray"]:
        with h5py.File(filename, "w") as hf:
            ah5 = awkward.hdf5(hf)
            if typename == "JaggedArray":
                ah5["data"] = awkward_utils.ascontiguousarray(item)
            else:
                ah5["data"] = np.ascontiguousarray(item)
        return True
    # fall back to pickle for anything else
    try:
        with open(filename, "wb") as f:
            pickle.dump(item, f)
        return True
    except Exception:
        return False
def _get_from_cache(filename):
    basename = os.path.basename(filename)
    if "__StreamList" in basename:
        filenames = glob.glob(filename + "/*")
        stream_list = StreamList(
            [_get_from_cache(f) for f in sorted(filenames)])
        return stream_list
    if "__DataFrame" in basename:
        return pd.read_hdf(filename, key="data")
    # For TLorentzVectorArray from ptetaphimass
    if "JaggedArrayMethods_PtEtaPhiMassLorentzVectorArray_Table" in basename:
        content = OrderedDict()
        with h5py.File(filename, "r") as hf:
            ah5 = awkward.hdf5(hf)
            for k in ah5:
                content[k] = ah5[k]
        particles = uproot_methods.TLorentzVectorArray.from_ptetaphim(
            content["fPt"], content["fEta"], content["fPhi"],
            content["fMass"])
        for k in content:
            if k not in ["fPt", "fEta", "fPhi", "fMass"]:
                particles[k] = content[k]
        return particles
    if "__ndarray" in basename or "__JaggedArray" in basename:
        with h5py.File(filename, "r") as hf:
            ah5 = awkward.hdf5(hf)
            array = ah5["data"]
        return array
    # for pytorch nn models
    if filename.endswith(".pt"):
        model = torch.load(filename)
        model.eval()
        return model
    if filename.endswith(".pkl"):
        with open(filename, "rb") as pf:
            product = pickle.load(pf)
        return product
def load_h5(file):
    """Read an HDF5 file written by awkward; return a dict of its datasets."""
    h5arrs = {}
    file = os.path.expanduser(file)
    with h5py.File(file, "r") as hf:
        ah5 = awkward.hdf5(hf)
        for ds in ah5:
            h5arrs[ds] = ah5[ds]
    return h5arrs
def write_h5(file, uarrs, mode="w"):
    """
    Write an HDF5 file via awkward, from a dict of awkward.JaggedArrays.
    See:
    - awkward/awkward/persist.py
    - awkward/tests/test_hdf5.py
    """
    file = os.path.expanduser(file)
    with h5py.File(file, mode) as hf:
        ah5 = awkward.hdf5(hf)
        for ds in uarrs:
            awk = awkward.JaggedArray.fromiter(uarrs[ds])
            ah5[ds] = awk
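# A minimal round-trip sketch for write_h5/load_h5 above, assuming both live
# in the same module with h5py and awkward 0.x imported at module level. The
# file path and input dict are hypothetical placeholders.
def _example_h5_roundtrip():
    uarrs = {"hits": [[1.0, 2.0], [], [3.0]]}   # one jagged list per dataset
    write_h5("/tmp/example_jagged.h5", uarrs, mode="w")  # serialize via awkward.hdf5
    back = load_h5("/tmp/example_jagged.h5")             # returns {name: JaggedArray}
    assert back["hits"].tolist() == uarrs["hits"]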
def __getitem__(self, key):
    print("Getting from cache {0}...".format(key))
    file_name = self._get_file_name(key)
    if key not in self:
        raise KeyError(key + " not found in scratch cache.")
    with h5py.File(file_name, "r") as hf:
        ah5 = awkward.hdf5(hf)
        array = ah5[key]
    if isinstance(array, np.ndarray):
        print("got ndarray of length " + str(len(array)))
    else:
        print("got jagged array of length " + str(len(array)) +
              " (flattened " + str(len(array.flatten())) + ")")
    return array
def load(filename, typename):
    if typename == "DataFrame":
        return pd.read_hdf(filename, key="data")
    if typename in ["ndarray", "JaggedArray"]:
        with h5py.File(filename, "r") as hf:
            ah5 = awkward.hdf5(hf)
            array = ah5["data"]
        return array
    # fall back to pickle for anything else
    try:
        with open(filename, "rb") as pf:
            product = pickle.load(pf)
        return product
    except Exception:
        return None
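# Hedged usage sketch for the save/load pair above: persist a JaggedArray and
# read it back by type name. The path is a placeholder; assumes the module's
# imports (h5py, awkward, np, pd, pickle) and vetoed_typenames are in place.
def _example_save_load():
    arr = awkward.JaggedArray.fromiter([[1, 2], [], [3]])
    ok = save("/tmp/cache_data.h5", arr)          # returns True on success
    assert ok
    back = load("/tmp/cache_data.h5", "JaggedArray")
    assert back.tolist() == arr.tolist()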
def __setitem__(self, key, item):
    file_name = self._get_file_name(key)
    with h5py.File(file_name, "w") as hf:
        ah5 = awkward.hdf5(hf)
        ah5[key] = item
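# The __getitem__/__setitem__ methods above presumably belong to a scratch-cache
# class. A minimal sketch of such a wrapper, assuming one .h5 file per key in a
# cache directory; ScratchCache, _get_file_name, and the directory layout are
# assumptions for illustration, not the original implementation.
import os

class ScratchCache:  # hypothetical container for the dunder methods above
    def __init__(self, cache_dir="/tmp/scratch_cache"):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def _get_file_name(self, key):
        # map each cache key to a single HDF5 file
        return os.path.join(self.cache_dir, key + ".h5")

    def __contains__(self, key):
        # a key is cached iff its backing file exists
        return os.path.isfile(self._get_file_name(key))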
def write_HDF5(run=None, infile=None, mode="pandas", nevt=None):
    """
    primary writer function. contains several Majorana-specific choices.
    works on gatified or skim data. (TODO: simulation data?)
    """
    if run is None and infile is None:
        print("You must specify either a run number or input filename.")
        exit()

    # declare inputs and outputs
    if run is not None:
        from ROOT import GATDataSet
        gds = GATDataSet()
        gfile = gds.GetPathToRun(run, GATDataSet.kGatified)
        infile, tname = gfile, "mjdTree"
        ufile = uproot.open(infile)

    if infile is not None:
        ufile = uproot.open(infile)
        # auto-detect and use the name of the first TTree we find
        for uc in ufile.allclasses():
            cname, ctype = uc[0], str(uc[1])
            if "TTree" in ctype:
                tname = cname.decode("utf-8").split(";")[0]
                print("Found TTree:", tname)
                break

    # strip the path and extension off the filename to create the hfile
    if run is None:
        label = infile.split("/")[-1].split(".root")[0]
        hfile = "{}/hdf5/{}.h5".format(os.environ["MJDDATADIR"], label)
    else:
        hfile = "{}/hdf5/mjd_run{}.h5".format(os.environ["MJDDATADIR"], run)

    # these MGDO object members don't have the same number of entries
    # as the rest of the vector-valued branches, so skip them for now
    skip_names = ["i", "iH", "iL", "j", "jH", "jL", "rawRun", "c0Channels"]

    # get all relevant TTree branches & sort by data type
    event_names, hit_names = [], []
    utree = ufile[tname]
    uarrs = utree.arrays(entrystop=1)
    for k in sorted(uarrs.keys()):
        name = k.decode('utf-8')
        vals = uarrs[k]
        if isinstance(vals, np.ndarray):
            event_names.append(k)
        elif isinstance(vals, awkward.JaggedArray):
            if name in skip_names:
                continue
            hit_names.append(k)
        elif isinstance(vals, awkward.ObjectArray):
            # print("Skipping branch:", name)
            continue

    # -- write to pandas HDF5 (pytables) --
    if mode == "pandas":
        print("writing pandas hdf5.\n input:{}\n output:{}".format(
            infile, hfile))
        df_events = ufile[tname].pandas.df(event_names, entrystop=nevt)
        df_hits = ufile[tname].pandas.df(hit_names, entrystop=nevt)
        if os.path.isfile(hfile):
            os.remove(hfile)
        opts = {
            "mode": "a",  # 'r', 'r+', 'a' and 'w'
            "format": "table",  # "fixed" can't be indexed w/ data_columns
            "complib": "blosc:snappy",
            "complevel": 2,
            # "data_columns": ["ievt"]  # used for pytables' fast HDF5 dataset indexing
        }
        df_events.to_hdf(hfile, key="events", **opts)
        df_hits.to_hdf(hfile, key="hits", **opts)

    # -- write to awkward.hdf5 --
    elif mode == "awkward":
        print("Writing awkward hdf5.\n input:{}\n output:{}".format(
            infile, hfile))
        print("Warning: this mode is not well-developed and needs work")
        # FIXME: separate values, as above
        uarrs = utree.arrays(entrystop=nevt)
        # set awkward hdf5 options
        opts = {
            # "compression": 2  # hmm, doesn't work?
        }
        with h5py.File(os.path.expanduser(hfile), "w") as hf:
            awk_h5 = awkward.hdf5(hf, **opts)
            for ds in uarrs:
                if isinstance(uarrs[ds], awkward.ObjectArray):
                    print("skipping dataset:", ds.decode('utf-8'))
                    continue
                awk_h5[ds.decode('utf-8')] = uarrs[ds]
        # ehhh, it's a work in progress. probably won't need this until LEGEND

    # check the groups saved into the file
    with pd.HDFStore(hfile, 'r') as f:
        print("Keys:", f.keys())
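# Hedged example of driving write_HDF5 above without a ROOT install, by passing
# an input file instead of a run number. The filename and MJDDATADIR value are
# placeholders; assumes an hdf5/ subfolder exists under that directory.
if __name__ == "__main__":
    os.environ.setdefault("MJDDATADIR", os.path.expanduser("~/mjd"))
    write_HDF5(infile="mjd_skim.root", mode="pandas", nevt=1000)  # pytables output
    # write_HDF5(infile="mjd_skim.root", mode="awkward")          # awkward.hdf5 output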
def _save_to_cache(filename, item):
    classname = type(item).__name__
    if classname in vetoed_classnames:
        return
    if classname == "StreamList":
        if type(item[0]).__name__ in vetoed_classnames:
            return
        mkdir(filename)
        for i, subitem in enumerate(item):
            subclassname = type(subitem).__name__
            if subclassname == "StreamList":
                raise TypeError(
                    "StreamList in a StreamList found, which should not happen."
                )
            subfilename = os.path.join(
                filename,
                os.path.basename(filename.replace(classname, subclassname)) +
                "__{0:04d}".format(i))
            _save_to_cache(subfilename, subitem)
        return
    if classname == "DataFrame":
        item.to_hdf(filename + ".h5", key="data")
        return
    if classname in ["ndarray", "JaggedArray"]:
        with h5py.File(filename + ".h5", "w") as hf:
            ah5 = awkward.hdf5(hf)
            if classname == "JaggedArray":
                ah5["data"] = awkward_utils.ascontiguousarray(item)
            else:
                ah5["data"] = np.ascontiguousarray(item)
        return
    # For TLorentzVectorArray from ptetaphimass
    if (classname == "JaggedArrayMethods"
            and type(item._content).__name__ ==
            "PtEtaPhiMassLorentzVectorArray"
            and type(item._content._content).__name__ == "Table"):
        starts = item._starts
        stops = item._stops
        contents = item._content._content._contents
        filename = filename.replace(
            "JaggedArrayMethods",
            "JaggedArrayMethods_PtEtaPhiMassLorentzVectorArray_Table")
        with h5py.File(filename + ".h5", "w") as hf:
            ah5 = awkward.hdf5(hf)
            for k, v in contents.items():
                a = awkward.JaggedArray(starts, stops, v)
                ah5[k] = awkward_utils.ascontiguousarray(a)
        return
    # for pytorch nn models
    if issubclass(type(item), torch.nn.Module):
        torch.save(item, filename + ".pt")
        return
    with open(filename + ".pkl", "wb") as f:
        pickle.dump(item, f)
    return
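# Round-trip sketch for _save_to_cache/_get_from_cache above: cache a plain
# ndarray and retrieve it. The filename convention (type name embedded in the
# basename) is inferred from the dispatch logic in _get_from_cache; the cache
# path itself is a hypothetical placeholder.
import numpy as np

def _example_cache_roundtrip():
    os.makedirs("/tmp/cache", exist_ok=True)
    arr = np.arange(10, dtype=np.float32)
    base = "/tmp/cache/myarray__ndarray"      # basename carries the type tag
    _save_to_cache(base, arr)                 # writes /tmp/cache/myarray__ndarray.h5
    back = _get_from_cache(base + ".h5")      # dispatches on "__ndarray" in basename
    assert np.array_equal(back, arr)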