Example #1
def test_read_write_hdf(tmpdir, input_arr):
    tmp_file = tmpdir / "example.h5"

    # Write
    with h5py.File(str(tmp_file), "w") as hf:
        a = awkward.JaggedArray.fromiter(input_arr)
        ah5 = awkward.hdf5(hf)
        ah5["example"] = a

    # Read
    with h5py.File(str(tmp_file), "r") as hf:
        ah5 = awkward.hdf5(hf)
        b = ah5["example"]

    assert a.tolist() == b.tolist()
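
This test relies on pytest's tmpdir fixture and an input_arr fixture from the surrounding test module. A minimal standalone round trip, assuming awkward 0.x (the classic awkward-array package, where the awkward.hdf5 wrapper lives):

import awkward  # awkward 0.x
import h5py

# build a jagged array from nested Python lists
a = awkward.JaggedArray.fromiter([[1.1, 2.2], [], [3.3]])

# write through the awkward.hdf5 wrapper
with h5py.File("example.h5", "w") as hf:
    awkward.hdf5(hf)["example"] = a

# read back
with h5py.File("example.h5", "r") as hf:
    b = awkward.hdf5(hf)["example"]

assert a.tolist() == b.tolist()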
Example #2
def save(filename, item):

    typename = type(item).__name__

    if typename in vetoed_typenames:
        return False

    if typename == "DataFrame":
        item.to_hdf(filename, key="data")
        return True

    if typename in ["ndarray", "JaggedArray"]:
        with h5py.File(filename, "w") as hf:
            ah5 = awkward.hdf5(hf)
            if typename == "JaggedArray":
                ah5["data"] = awkward_utils.ascontiguousarray(item)
            else:
                ah5["data"] = np.ascontiguousarray(item)
        return True

    try:
        with open(filename, "wb") as f:
            pickle.dump(item, f)
        return True
    except Exception:
        return False
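
save() dispatches on the type name: DataFrames go through pandas' to_hdf, numpy and jagged arrays through the awkward.hdf5 wrapper, and anything else through a pickle fallback. A hedged usage sketch; vetoed_typenames and awkward_utils are names from the surrounding project and are not defined here:

import numpy as np
import pandas as pd

save("events.h5", pd.DataFrame({"x": [1, 2, 3]}))  # pandas HDF5 path
save("hits.h5", np.arange(10))                     # awkward.hdf5 path
save("config.pkl", {"threshold": 0.5})             # pickle fallback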
Example #3
def _get_from_cache(filename):

    basename = os.path.basename(filename)

    if "__StreamList" in basename:
        filenames = glob.glob(filename + "/*")
        stream_list = StreamList(
            [_get_from_cache(f) for f in sorted(filenames)])
        return stream_list

    if "__DataFrame" in basename:
        return pd.read_hdf(filename, key="data")

    # For TLorentzVectorArray from ptetaphimass
    if "JaggedArrayMethods_PtEtaPhiMassLorentzVectorArray_Table" in basename:
        content = OrderedDict()
        with h5py.File(filename, "r") as hf:
            ah5 = awkward.hdf5(hf)
            for k in ah5:
                content[k] = ah5[k]
        particles = uproot_methods.TLorentzVectorArray.from_ptetaphim(
            content["fPt"], content["fEta"], content["fPhi"], content["fMass"])
        for k in content:
            if k not in ["fPt", "fEta", "fPhi", "fMass"]:
                particles[k] = content[k]
        return particles

    if "__ndarray" in basename or "__JaggedArray" in basename:
        with h5py.File(filename, "r") as hf:
            ah5 = awkward.hdf5(hf)
            array = ah5["data"]
        return array

    # for pytorch nn models
    if filename.endswith(".pt"):
        model = torch.load(filename)
        model.eval()
        return model

    if filename.endswith(".pkl"):
        with open(filename, "rb") as pf:
            product = pickle.load(pf)
        return product
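
The PtEtaPhiMass branch above rebuilds a Lorentz-vector array from its four component arrays. A minimal illustration of that constructor, assuming the uproot-methods 0.x API and plain numpy inputs:

import numpy as np
import uproot_methods

pt = np.array([20.0, 35.5])
eta = np.array([0.1, -1.2])
phi = np.array([0.5, 2.9])
mass = np.array([0.105, 0.105])

# build the array from pt/eta/phi/mass components
particles = uproot_methods.TLorentzVectorArray.from_ptetaphim(pt, eta, phi, mass)
print(particles.pt)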
Example #4
def load_h5(file):
    """
    Read an HDF5 file created by awkward.
    """
    h5arrs = {}
    file = os.path.expanduser(file)
    with h5py.File(file, "r") as hf:
        ah5 = awkward.hdf5(hf)
        for ds in ah5:
            h5arrs[ds] = ah5[ds]
    return h5arrs
Example #5
def write_h5(file, uarrs, mode="w"):
    """
    Write awkward.JaggedArrays to an HDF5 file via awkward. See:
    - awkward/awkward/persist.py
    - awkward/tests/test_hdf5.py
    """
    file = os.path.expanduser(file)
    with h5py.File(file, mode) as hf:
        ah5 = awkward.hdf5(hf)
        for ds in uarrs:
            awk = awkward.JaggedArray.fromiter(uarrs[ds])
            ah5[ds] = awk
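
Together with load_h5 from Example #4 this gives a simple round trip:

uarrs = {"energy": [[1.1, 2.2], [3.3]], "time": [[0.5], [0.7, 0.9]]}
write_h5("~/test.h5", uarrs)
back = load_h5("~/test.h5")
print(back["energy"].tolist())  # [[1.1, 2.2], [3.3]]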
Example #6
    def __getitem__(self, key):
        print("Getting from cache {0}...".format(key))
        file_name = self._get_file_name(key)

        if key not in self:
            raise KeyError(key + " not found in scratch cache.")

        with h5py.File(file_name, "r") as hf:
            ah5 = awkward.hdf5(hf)
            array = ah5[key]

        if isinstance(array, np.ndarray):
            print("got ndarray of length " + str(len(array)))
        else:
            print("got jagged array of length " + str(len(array)) +
                  " (flattened " + str(len(array.flatten())) + ")")
        return array
Example #7
def load(filename, typename):

    if typename == "DataFrame":
        return pd.read_hdf(filename, key="data")

    if typename in ["ndarray", "JaggedArray"]:
        with h5py.File(filename, "r") as hf:
            ah5 = awkward.hdf5(hf)
            array = ah5["data"]
        return array

    try:
        with open(filename, "rb") as pf:
            product = pickle.load(pf)
            return product
    except Exception:
        return None
Example #8
    def __setitem__(self, key, item):
        file_name = self._get_file_name(key)
        with h5py.File(file_name, "w") as hf:
            ah5 = awkward.hdf5(hf)
            ah5[key] = item
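
Example #6 and Example #8 are methods of the same scratch-cache class; the _get_file_name helper and the membership test they rely on are not shown. A minimal sketch of the surrounding class, with a hypothetical one-file-per-key naming scheme:

import os

class ScratchCache:
    # hypothetical scaffolding for the __getitem__/__setitem__ methods above
    def __init__(self, cache_dir):
        self.cache_dir = cache_dir

    def _get_file_name(self, key):
        # assumed naming scheme: one HDF5 file per key
        return os.path.join(self.cache_dir, key + ".h5")

    def __contains__(self, key):
        return os.path.isfile(self._get_file_name(key))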
Example #9
def write_HDF5(run=None, infile=None, mode="pandas", nevt=None):
    """
    Primary writer function; contains several Majorana-specific choices.
    Works on gatified or skim data.  (TODO: simulation data?)
    """
    if run is None and infile is None:
        print("You must specify either a run number or input filename.")
        exit()

    # declare inputs and outputs
    if run is not None:
        from ROOT import GATDataSet
        gds = GATDataSet()
        gfile = gds.GetPathToRun(run, GATDataSet.kGatified)
        infile, tname = gfile, "mjdTree"

    if infile is not None:
        ufile = uproot.open(infile)
        # auto-detect and use the name of the first TTree we find
        for uc in ufile.allclasses():
            cname, ctype = uc[0], str(uc[1])
            if "TTree" in ctype:
                tname = cname.decode("utf-8").split(";")[0]
                print("Found TTree:", tname)
                break

    # strip the path and extension off the filename to create the hfile
    if run is None:
        label = infile.split("/")[-1].split(".root")[0]
        hfile = "{}/hdf5/{}.h5".format(os.environ["MJDDATADIR"], label)
    else:
        hfile = "{}/hdf5/mjd_run{}.h5".format(os.environ["MJDDATADIR"], run)

    # these MGDO object members don't have the same number of entries
    # as the rest of the vector-valued branches, so skip them for now
    skip_names = ["i", "iH", "iL", "j", "jH", "jL", "rawRun", "c0Channels"]

    # get all relevant TTree branches & sort by data type
    event_names, hit_names = [], []

    utree = ufile[tname]
    uarrs = utree.arrays(entrystop=1)
    for k in sorted(uarrs.keys()):
        name = k.decode('utf-8')
        vals = uarrs[k]

        if isinstance(vals, np.ndarray):
            event_names.append(k)

        elif isinstance(vals, awkward.JaggedArray):
            if name in skip_names:
                continue
            hit_names.append(k)

        elif isinstance(vals, awkward.ObjectArray):
            # print("Skipping branch:", name)
            continue

    # write to pandas HDF5 (pytables)
    if mode == "pandas":
        print("writing pandas hdf5.\n  input:{}\n  output:{}".format(
            infile, hfile))

        df_events = ufile[tname].pandas.df(event_names, entrystop=nevt)
        df_hits = ufile[tname].pandas.df(hit_names, entrystop=nevt)

        if os.path.isfile(hfile):
            os.remove(hfile)

        opts = {
            "mode": "a",  # 'r', 'r+', 'a' and 'w'
            "format": "table",  # "fixed" can't be indexed w/ data_columns
            "complib": "blosc:snappy",
            "complevel": 2,
            # "data_columns":["ievt"] # used for pytables' fast HDF5 dataset indexing
        }

        df_events.to_hdf(hfile, key="events", **opts)
        df_hits.to_hdf(hfile, key="hits", **opts)

    # -- write to awkward.hdf5 --
    elif mode == "awkward":
        print("Writing awkward hdf5.\n  input:{}\n  output:{}".format(
            infile, hfile))
        print("Warning: this mode is not well-developed and needs work")

        # FIXME: separate values, as above
        uarrs = utree.arrays(entrystop=nevt)

        # set awkward hdf5 options
        opts = {
            # "compression":2 # hmm, doesn't work?
        }
        with h5py.File(os.path.expanduser(hfile), "w") as hf:
            awk_h5 = awkward.hdf5(hf, **opts)
            for ds in uarrs:
                if isinstance(uarrs[ds], awkward.ObjectArray):
                    print("skipping dataset:", ds.decode('utf-8'))
                    continue
                awk_h5[ds.decode('utf-8')] = uarrs[ds]

        # ehhh, it's a work in progress.  probably won't need this until LEGEND

    # check the groups saved into the file
    with pd.HDFStore(hfile, 'r') as f:
        print("Keys:", f.keys())
Example #10
def _save_to_cache(filename, item):

    classname = type(item).__name__

    if classname in vetoed_classnames:
        return

    if classname == "StreamList":
        if type(item[0]).__name__ in vetoed_classnames:
            return
        mkdir(filename)
        for i, subitem in enumerate(item):
            subclassname = type(subitem).__name__
            if subclassname == "StreamList":
                raise TypeError(
                    "StreamList in a StreamList found, which should not happen."
                )
            subfilename = os.path.join(
                filename,
                os.path.basename(filename.replace(classname, subclassname)) +
                "__{0:04d}".format(i))
            _save_to_cache(subfilename, subitem)
        return

    if classname == "DataFrame":
        item.to_hdf(filename + ".h5", key="data")
        return

    if classname in ["ndarray", "JaggedArray"]:
        with h5py.File(filename + ".h5", "w") as hf:
            ah5 = awkward.hdf5(hf)
            if classname == "JaggedArray":
                ah5["data"] = awkward_utils.ascontiguousarray(item)
            else:
                ah5["data"] = np.ascontiguousarray(item)
        return

    # For TLorentzVectorArray from ptetaphimass
    if (classname == "JaggedArrayMethods" and type(
            item._content).__name__ == "PtEtaPhiMassLorentzVectorArray"
            and type(item._content._content).__name__ == "Table"):
        starts = item._starts
        stops = item._stops
        contents = item._content._content._contents
        filename = filename.replace(
            "JaggedArrayMethods",
            "JaggedArrayMethods_PtEtaPhiMassLorentzVectorArray_Table")
        with h5py.File(filename + ".h5", "w") as hf:
            ah5 = awkward.hdf5(hf)
            for k, v in contents.items():
                a = awkward.JaggedArray(starts, stops, v)
                ah5[k] = awkward_utils.ascontiguousarray(a)
        return

    # for pytorch nn models
    if issubclass(type(item), torch.nn.Module):
        torch.save(item, filename + ".pt")
        return

    with open(filename + ".pkl", "wb") as f:
        pickle.dump(item, f)
    return
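
_save_to_cache expects the class name to be encoded in the file name, which is exactly what _get_from_cache in Example #3 keys on when reading back. A hedged round-trip sketch; the cache/ directory is a placeholder, and vetoed_classnames and awkward_utils must be provided by the surrounding module:

import awkward

jagged = awkward.JaggedArray.fromiter([[1, 2], [3]])
_save_to_cache("cache/myitem__JaggedArray", jagged)      # writes cache/myitem__JaggedArray.h5
back = _get_from_cache("cache/myitem__JaggedArray.h5")   # dispatches on "__JaggedArray"
assert jagged.tolist() == back.tolist()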