def dataset(path, treepath, namespace=None, **kwargs):
    """Build an oamap Dataset whose partitions are the ROOT files matched by *path*.

    path: file path or glob pattern understood by uproot.
    treepath: path of the TTree within each file (e.g. "dir/tree").
    namespace: backend namespace key; defaults to a string derived from
        path and treepath.
    kwargs: forwarded to uproot.tree.numentries (except that "total" and
        "blocking" are forced so we get a per-file mapping synchronously).

    Raises ValueError if no TTrees match.  The schema is inferred from the
    FIRST matched file only (recorded in metadata["schemafrom"]).
    """
    import uproot
    if namespace is None:
        namespace = "root({0}, {1})".format(repr(path), repr(treepath))
    # Default to a plain FileSource with small chunks and no read cap.
    if "localsource" not in kwargs:
        kwargs["localsource"] = lambda path: uproot.source.file.FileSource(
            path, chunkbytes=8 * 1024, limitbytes=None)
    # Force numentries to return a blocking {filepath: numentries} mapping.
    kwargs["total"] = False
    kwargs["blocking"] = True
    paths2entries = uproot.tree.numentries(path, treepath, **kwargs)
    if len(paths2entries) == 0:
        raise ValueError("path {0} matched no TTrees".format(repr(path)))
    # Cumulative entry offsets: offsets[i] is the first global entry of file i.
    # NOTE: the loop variable deliberately rebinds the parameter `path`.
    offsets = [0]
    paths = []
    for path, numentries in paths2entries.items():
        offsets.append(offsets[-1] + numentries)
        paths.append(path)
    # Infer the schema from the first file; detach its doc so it can be
    # passed to the Dataset separately.
    sch = schema(paths[0], treepath, namespace=namespace)
    doc = sch.doc
    sch.doc = None
    # Dataset name is the TTree's own name: last path component, cycle stripped.
    return oamap.dataset.Dataset(treepath.split("/")[-1].split(";")[0],
                                 sch,
                                 {namespace: ROOTBackend(paths, treepath)},
                                 oamap.dataset.SingleThreadExecutor(),
                                 offsets,
                                 extension=None,
                                 packing=None,
                                 doc=doc,
                                 metadata={"schemafrom": paths[0]})
def test_Pointer(self):
    """Round-trip a cyclic linked list through a self-referential Pointer schema."""
    class Node(object):
        def __init__(self, label, successor):
            self.label = label
            self.next = successor

    # Recursive schema: the "next" field points back at the Node record itself.
    node_schema = Record({"label": Primitive("i8")}, name="Node")
    node_schema["next"] = Pointer(node_schema)

    # Three-node chain whose tail links back to the head, forming a cycle.
    head = Node(0, Node(1, Node(2, None)))
    head.next.next.next = head

    arrays = oamap.fill.fromdata(head, node_schema)
    columnar = node_schema(arrays)

    # Walk the cycle past its period (labels at depths 0 through 6),
    # comparing the original object graph against the columnar proxy.
    expected, actual = head, columnar
    for _ in range(7):
        self.assertEqual(expected.label, actual.label)
        expected = expected.next
        actual = actual.next
def check(self, value, schema=None, debug=False):
    """Fill *value* under *schema* (inferred when omitted), rebuild it from the
    columnar arrays, and assert that the JSON round trip reproduces *value*.
    With debug=True, dump the schema, arrays, and proxy along the way."""
    if schema is None:
        schema = oamap.inference.fromdata(value)
    if debug:
        print("schema: {0}".format(schema))

    filled = oamap.fill.fromdata(value, schema)
    if debug:
        print("arrays:")
        for name in sorted(filled):
            print("    {0}: {1}".format(repr(name), filled[name]))

    reconstructed = schema(filled)
    if debug:
        print("columnar: {0}".format(reconstructed))

    roundtripped = oamap.proxy.tojson(reconstructed)
    self.assertEqual(value, roundtripped)
def load(npzfile, prefix="object", delimiter="-"):
    """Reconstruct an oamap proxy from a Numpy ``.npz`` file.

    npzfile: an open ``numpy.lib.npyio.NpzFile`` or anything ``numpy.load``
        accepts (filename, file object).
    prefix: array name under which a serialized Dataset may be stored, and
        the prefix used for schema inference from array names.
    delimiter: separator used when inferring the schema from array names.

    If the file contains a 1-d uint8 array named *prefix*, it is decoded as a
    JSON-serialized Dataset and that Dataset's schema is used; otherwise the
    schema is inferred from the array names.  Raises TypeError if the input
    cannot be interpreted as an NpzFile.
    """
    if not isinstance(npzfile, numpy.lib.npyio.NpzFile):
        npzfile = numpy.load(npzfile)
    if not isinstance(npzfile, numpy.lib.npyio.NpzFile):
        raise TypeError(
            "npzfile must be a Numpy NpzFile (e.g. oamap.source.npz.load(numpy.load(\"filename.npz\")))"
        )
    try:
        datasetarray = npzfile[prefix]
        # Explicit check instead of `assert`: the fallback below relies on an
        # exception being raised here, and asserts vanish under `python -O`.
        if datasetarray.dtype != numpy.dtype(numpy.uint8) or len(datasetarray.shape) != 1:
            raise ValueError("array {0} is not a 1-d uint8 serialized Dataset".format(repr(prefix)))
        # tobytes() replaces the deprecated tostring() (removed in NumPy 2.0);
        # both return the same raw bytes.
        dataset = oamap.schema.Dataset.fromjsonstring(datasetarray.tobytes())
    except Exception:
        # No (valid) serialized Dataset under `prefix` — fall back to
        # inferring the schema from the stored array names.  `Exception`
        # (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        schema = oamap.inference.fromnames(npzfile.keys(),
                                           prefix=prefix,
                                           delimiter=delimiter)
    else:
        schema = dataset.schema
    return schema(npzfile)
def proxy(table):
    """Wrap a pyarrow Table as an oamap proxy, decoding Arrow buffers lazily.

    Column names requested by the oamap generators are expected to look like
    "column/path/N", where N is an index into the flattened sequence of Arrow
    buffers for that column's chunks.
    """
    import pyarrow

    class _ArrayDict(object):
        # Dict-like adapter: maps oamap array names onto Arrow buffer contents.
        def __init__(self, table):
            self.table = table

        def chop(self, name):
            # Split "colname/bufferindex" at the LAST slash (column names may
            # themselves contain slashes).
            slashindex = name.rindex("/")
            return name[:slashindex], int(name[slashindex + 1:])

        def frombuffer(self, chunk, bufferindex):
            # Extract buffer number `bufferindex` from one Arrow chunk,
            # converting it to the Numpy array oamap expects.
            def truncate(array, length, offset=0):
                # Arrow buffers may be padded; clip to the logical length
                # (+1 for offset buffers, which have length+1 entries).
                return array[:length + offset]

            def mask(index, length):
                buf = chunk.buffers()[index]
                if buf is None:
                    # No validity bitmap: every entry is valid, so the oamap
                    # mask is just the identity mapping.
                    return numpy.arange(length, dtype=oamap.generator.Masked.maskdtype)
                else:
                    # Expand Arrow's validity bitmap into booleans, then build
                    # oamap's index-style mask: valid entries get compacted
                    # indices, invalid ones get the sentinel maskedvalue.
                    unmasked = truncate(
                        numpy.unpackbits(
                            numpy.frombuffer(buf, dtype=numpy.uint8)).view(
                                numpy.bool_), length)
                    mask = numpy.empty(len(unmasked), dtype=oamap.generator.Masked.maskdtype)
                    mask[unmasked] = numpy.arange(unmasked.sum(), dtype=mask.dtype)
                    mask[~unmasked] = oamap.generator.Masked.maskedvalue
                    return mask

            def recurse(tpe, index, length):
                # Walk the Arrow type tree counting two buffers per level
                # (mask + offsets/data) until we reach `bufferindex`.
                if isinstance(tpe, pyarrow.lib.ListType):
                    if index == bufferindex:  # list mask
                        return mask(index, length)
                    elif index + 1 == bufferindex:  # list starts (int32 offsets)
                        return truncate(
                            numpy.frombuffer(chunk.buffers()[index + 1],
                                             dtype=numpy.int32), length, 1)
                    else:  # descend into list contents; the last offset is
                           # the flattened length of the child
                        length = truncate(
                            numpy.frombuffer(chunk.buffers()[index + 1],
                                             dtype=numpy.int32), length, 1)[-1]
                        return recurse(tpe.value_type, index + 2, length)
                elif isinstance(tpe, pyarrow.lib.DataType):
                    if index == bufferindex:  # data mask
                        return mask(index, length)
                    elif index + 1 == bufferindex:  # primitive data buffer
                        return truncate(
                            numpy.frombuffer(chunk.buffers()[index + 1],
                                             dtype=tpe.to_pandas_dtype()), length)
                    else:
                        raise AssertionError
                else:
                    # Other Arrow types (structs, unions, ...) not handled here.
                    raise NotImplementedError

            return recurse(chunk.type, 0, len(chunk))

        def getall(self, names):
            out = {}
            for name in names:
                if len(str(name)) == 0:
                    # Nameless roles describe the outermost list: the whole
                    # table is one partition of num_rows entries.
                    if isinstance(name, oamap.generator.StartsRole):
                        out[name] = numpy.array(
                            [0], dtype=oamap.generator.ListGenerator.posdtype)
                    elif isinstance(name, oamap.generator.StopsRole):
                        out[name] = numpy.array(
                            [self.table.num_rows],
                            dtype=oamap.generator.ListGenerator.posdtype)
                    else:
                        raise AssertionError
                elif isinstance(name, oamap.generator.StopsRole):
                    # Stops are the starts shifted by one; assumes the
                    # corresponding starts array was already placed in `out`
                    # earlier in this loop — TODO confirm ordering guarantee.
                    out[name] = out[name.starts][1:]
                else:
                    columnname, bufferindex = self.chop(str(name))
                    column = self.table[self.table.schema.names.index(
                        columnname)]
                    chunks = column.data.chunks
                    if len(chunks) == 0:
                        raise ValueError(
                            "Arrow column {0} has no chunks".format(
                                repr(columnname)))
                    elif len(chunks) == 1:
                        out[name] = self.frombuffer(chunks[0], bufferindex)
                    else:
                        # Multiple chunks: decode each and stitch them together.
                        out[name] = numpy.concatenate([
                            self.frombuffer(chunk, bufferindex)
                            for chunk in chunks
                        ])
            return out

    return schema(table)(_ArrayDict(table))