def open(path, mode="r", prefix="object", delimiter="-"):
    def explode(x):
        # expand plain paths and "file://" URLs with glob; other URL schemes are rejected
        parsed = urlparse(x)
        if parsed.scheme == "file" or len(parsed.scheme) == 0:
            return sorted(glob.glob(os.path.expanduser(parsed.netloc + parsed.path)))
        else:
            raise ValueError("URL scheme '{0}' not recognized".format(parsed.scheme))

    if isinstance(path, basestring):
        paths = explode(path)
    else:
        paths = [y for x in path for y in explode(x)]
    if len(paths) == 0:
        raise ValueError("no matching filenames")

    # read the schema saved under `prefix` in the first file, if present;
    # otherwise infer one from the array names
    npzfile = numpy.load(paths[0])
    try:
        datasetarray = npzfile[prefix]
        assert datasetarray.dtype == numpy.dtype(numpy.uint8) and len(datasetarray.shape) == 1
        dataset = oamap.schema.Dataset.fromjsonstring(datasetarray.tostring())
    except:
        schema = oamap.inference.fromnames(npzfile.keys(), prefix=prefix, delimiter=delimiter)
    else:
        schema = dataset.schema

    generator = schema.generator()
    listofarrays = [NumpyFileArrays(paths[0], npzfile)] + [NumpyFileArrays(x, None) for x in paths[1:]]
    return oamap.proxy.PartitionedListProxy(generator, listofarrays)
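# Hedged usage sketch (not part of the original source). Assumptions: the
# open() function above is in scope and "data/events-*.npz" names partition
# files written by the savez() function further below. The path may be a
# plain glob, a "file://" URL, or an iterable of either; each form is
# glob-expanded and sorted before the schema is read from the first file.
#
#     events = open("data/events-*.npz")
#     events = open("file://data/events-*.npz")
#     events = open(["data/events-0.npz", "data/events-1.npz"])
#     for event in events:
#         ...  # lazily materialized proxy objects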
def _proxy(tree, namespace=None, extension=oamap.extension.common):
    if namespace is None:
        namespace = "root.cmsnano({0})".format(repr(tree._context.sourcepath))
    schema = _schema(tree, namespace=namespace)
    generator = schema.generator(extension=extension)
    return oamap.proxy.ListProxy(
        generator,
        oamap.backend.root.ROOTArrays(
            tree,
            oamap.backend.root.ROOTBackend([tree._context.sourcepath],
                                           tree._context.treename,
                                           namespace)),
        generator._newcache(),
        0, 1, tree.numentries)
def _proxy(tree, namespace="", extension=oamap.extension.common): schema = _schema(tree, namespace=namespace) generator = schema.generator(extension=extension) return oamap.proxy.ListProxy(generator, ROOTArrays(tree), generator._newcache(), 0, 1, tree.numentries)
def _proxy(tree, namespace="", extension=oamap.extension.common): schema = _schema(tree, namespace=namespace) generator = schema.generator(extension=extension) return oamap.proxy.ListProxy(generator, ROOTArrays(tree, ROOTBackend([tree._context.sourcepath], tree._context.treename, namespace)), generator._newcache(), 0, 1, tree.numentries)
def fromdata(self, key, value, schema=None, inferencelimit=None, partitionlimit=None, pointer_fromequal=False):
    # infer a schema from the data if none was provided
    if schema is None:
        schema = oamap.inference.fromdata(value, limit=inferencelimit)

    if partitionlimit is not None:
        if not (isinstance(schema, oamap.schema.List) and not schema.nullable):
            raise TypeError("if partitionlimit is not None, the schema must be a partitionable List")
        if not callable(partitionlimit):
            raise TypeError("partitionlimit must be None or a callable function")

    if isinstance(schema, oamap.schema.Dataset):
        dataset = schema
        schema = dataset.schema
    else:
        dataset = oamap.schema.Dataset(schema, prefix=key)

    # resolve prefix, delimiter, and extension modules from the Dataset
    if dataset.prefix is None:
        prefix = key
    else:
        prefix = dataset.prefix

    if dataset.delimiter is None:
        delimiter = "-"
    else:
        delimiter = dataset.delimiter

    if dataset.extension is None:
        extension = import_module("oamap.extension.common")
    elif isinstance(dataset.extension, basestring):
        extension = import_module(dataset.extension)
    else:
        extension = [import_module(x) for x in dataset.extension]

    generator = schema.generator(prefix=prefix, delimiter=delimiter, extension=extension)

    if partitionlimit is None:
        # single-partition fill: write all arrays and the dataset JSON into the dbm store
        arrays = oamap.fill.fromdata(value, generator=generator, pointer_fromequal=pointer_fromequal)

        if key in self:
            del self[key]

        if dataset.partitioning is None:
            for n, x in arrays.items():
                self.dbm[_asbytes(self.ARRAY + n)] = x.tostring()
        else:
            partitionlookup = dataset.partitioning.empty_partitionlookup(delimiter)
            partitionlookup.append(arrays[generator.stops][0] - arrays[generator.starts][0], arrays.keys())
            for n, x in arrays.items():
                self.dbm[_asbytes(self.ARRAY + partitionlookup.id2name(n, 0))] = x.tostring()
            self.dbm[_asbytes(self.ARRAY + dataset.partitioning.key)] = numpy.array(partitionlookup).tostring()

        self.dbm[_asbytes(self.DATASET + key)] = dataset.tojsonstring()

    else:
        # partitioned fill: stream partitions from the iterator, refreshing the
        # partition lookup as each one is written
        dataset = dataset.copy(partitioning=dataset._get_partitioning(prefix, delimiter))
        partitionlookup = dataset.partitioning.empty_partitionlookup(delimiter)
        values = iter(value)

        if key in self:
            del self[key]

        self.dbm[_asbytes(self.ARRAY + key)] = numpy.array(partitionlookup).tostring()
        self.dbm[_asbytes(self.DATASET + key)] = dataset.tojsonstring()

        for partitionid, (numentries, arrays) in enumerate(
                oamap.fill.fromiterdata(values, generator=generator, limit=partitionlimit,
                                        pointer_fromequal=pointer_fromequal)):
            partitionlookup.append(numentries, arrays.keys())
            for n, x in arrays.items():
                self.dbm[_asbytes(self.ARRAY + partitionlookup.id2name(n, partitionid))] = x.tostring()
            self.dbm[_asbytes(self.ARRAY + dataset.partitioning.key)] = numpy.array(partitionlookup).tostring()
def fromdata(self, name, schema, *partitions, **opts):
    # keyword-only options (pop with defaults, then reject anything left over)
    pointer_fromequal = opts.pop("pointer_fromequal", False)
    namespace = opts.pop("namespace", self._namespace)
    extension = opts.pop("extension", None)
    packing = opts.pop("packing", None)
    doc = opts.pop("doc", None)
    metadata = opts.pop("metadata", None)
    if len(opts) > 0:
        raise TypeError("unrecognized options: {0}".format(" ".join(opts)))

    if namespace not in self._backends:
        self[namespace] = DictBackend()
    backend = self[namespace]

    # assign every schema node to this namespace
    def setnamespace(node):
        node.namespace = namespace
        return node
    schema = schema.replace(setnamespace)

    generator = schema.generator(prefix=backend.prefix(name),
                                 delimiter=backend.delimiter(),
                                 packing=packing)
    generator._requireall()
    roles = generator._togetall({}, generator._newcache(), True, set())

    if isinstance(schema, (oamap.schema.Record, oamap.schema.Tuple)):
        # records and tuples occupy exactly one partition
        if len(partitions) != 1:
            raise TypeError("only lists can have more or less than one partition")
        data = generator.fromdata(partitions[0])
        roles2arrays = dict((x, data._arrays[str(x)]) for x in roles)
        active = backend.instantiate(0)
        if hasattr(active, "putall"):
            active.putall(roles2arrays)
        else:
            for n, x in roles2arrays.items():
                active[str(n)] = x
        out = oamap.dataset.Data(name, generator.namedschema(), self._backends, self._executor,
                                 extension=extension, packing=packing, doc=doc, metadata=metadata)

    elif isinstance(schema, oamap.schema.List):
        # lists may span any number of partitions; track cumulative entry offsets
        offsets = [0]
        for partitionid, partition in enumerate(partitions):
            data = generator.fromdata(partition)
            roles2arrays = dict((x, data._arrays[str(x)]) for x in roles)

            # the list's starts/stops (and optional mask) arrays are dropped before writing
            startsrole = oamap.generator.StartsRole(generator.starts, generator.namespace, None)
            stopsrole = oamap.generator.StopsRole(generator.stops, generator.namespace, None)
            startsrole.stops = stopsrole
            stopsrole.starts = startsrole
            if schema.nullable:
                maskrole = oamap.generator.MaskRole(generator.mask, generator.namespace,
                                                    {startsrole: roles2arrays[startsrole],
                                                     stopsrole: roles2arrays[stopsrole]})
            del roles2arrays[startsrole]
            del roles2arrays[stopsrole]
            if schema.nullable:
                del roles2arrays[maskrole]

            active = backend.instantiate(partitionid)
            if hasattr(active, "putall"):
                active.putall(roles2arrays)
            else:
                for n, x in roles2arrays.items():
                    active[str(n)] = x
            offsets.append(offsets[-1] + len(data))

        out = oamap.dataset.Dataset(name, generator.namedschema(), self._backends, self._executor, offsets,
                                    extension=extension, packing=packing, doc=doc, metadata=metadata)

    else:
        raise TypeError("can only create datasets from proxy types (list, records, tuples)")

    self.put(name, out, namespace=namespace)
def savez(file, value, schema=None, prefix="object", delimiter=None, extension=None,
          saveschema=True, compressed=False, inferencelimit=None, pointer_fromequal=False):
    # take the schema from the proxy's generator or infer it from the data
    if schema is None:
        if isinstance(value, oamap.proxy.Proxy):
            schema = value._generator.schema
        else:
            schema = oamap.inference.fromdata(value, limit=inferencelimit)

    if isinstance(schema, oamap.schema.Dataset):
        dataset = schema
        schema = dataset.schema
    else:
        dataset = oamap.schema.Dataset(schema, prefix=prefix, delimiter="-", extension=extension)

    if dataset.partitioning is not None:
        raise ValueError("npz files do not support partitioning")

    if delimiter is None:
        if dataset.delimiter is None:
            delimiter = "-"
        else:
            delimiter = dataset.delimiter

    if extension is None:
        if dataset.extension is None:
            extension = import_module("oamap.extension.common")
        elif isinstance(dataset.extension, basestring):
            extension = import_module(dataset.extension)
        else:
            extension = [import_module(x) for x in dataset.extension]

    generator = schema.generator(prefix=prefix, delimiter=delimiter, extension=extension)

    # reuse an existing proxy's arrays if possible; otherwise fill them from the data
    if isinstance(value, oamap.proxy.Proxy) and hasattr(value._arrays, "items"):
        arrays = dict(value._arrays.items())
    elif isinstance(value, oamap.proxy.Proxy) and hasattr(value._arrays, "keys"):
        arrays = dict((n, value._arrays[n]) for n in value._arrays.keys())
    elif isinstance(value, oamap.proxy.Proxy) and hasattr(value._arrays, "__iter__"):
        arrays = dict((n, value._arrays[n]) for n in value._arrays)
    else:
        arrays = oamap.fill.fromdata(value, generator=generator, pointer_fromequal=pointer_fromequal)

    # optionally store the Dataset schema as a uint8 array under `prefix`
    if saveschema and prefix not in arrays:
        arrays[prefix] = numpy.frombuffer(dataset.tojsonstring(), dtype=numpy.uint8)

    if compressed:
        numpy.savez_compressed(file, **arrays)
    else:
        numpy.savez(file, **arrays)
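# Hedged round-trip sketch (not part of the original source). Assumptions:
# this savez() function and an npz open() reader like the one at the top of
# this section are both importable, oamap and numpy are installed, and
# "events.npz" is a writable path; the example data are illustrative only.
# The inferred schema is stored as a uint8 array under the "object" key, so
# the reader can reconstruct it without further hints.
#
#     savez("events.npz", [{"x": 1.1, "y": [1, 2]}, {"x": 2.2, "y": []}])
#     events = open("events.npz")
#     print(events[0].x)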
def fromdata(self, key, value, schema=None, inferencelimit=None, partitionlimit=None, pointer_fromequal=False):
    # infer a schema from the data if none was provided
    if schema is None:
        schema = oamap.inference.fromdata(value, limit=inferencelimit)

    if partitionlimit is not None:
        if not (isinstance(schema, oamap.schema.List) and not schema.nullable):
            raise TypeError("if partitionlimit is not None, the schema must be a partitionable List")
        if not callable(partitionlimit):
            raise TypeError("partitionlimit must be None or a callable function")

    if isinstance(schema, oamap.schema.Dataset):
        dataset = schema
        schema = dataset.schema
    else:
        dataset = oamap.schema.Dataset(schema, prefix=key)

    # resolve prefix, delimiter, and extension modules from the Dataset
    if dataset.prefix is None:
        prefix = key
    else:
        prefix = dataset.prefix

    if dataset.delimiter is None:
        delimiter = "-"
    else:
        delimiter = dataset.delimiter

    if dataset.extension is None:
        extension = import_module("oamap.extension.common")
    elif isinstance(dataset.extension, basestring):
        extension = import_module(dataset.extension)
    else:
        extension = [import_module(x) for x in dataset.extension]

    generator = schema.generator(prefix=prefix, delimiter=delimiter, extension=extension)

    if partitionlimit is None:
        # single-partition fill: write all arrays into the HDF5 group and the
        # dataset JSON into its attributes
        arrays = oamap.fill.fromdata(value, generator=generator, pointer_fromequal=pointer_fromequal)

        for n in arrays:
            if super(OAMapGroup, self).__contains__(n):
                raise RuntimeError("cannot assign to {0} (dataset exists)".format(repr(n)))

        if dataset.partitioning is None:
            for n, x in arrays.items():
                super(OAMapGroup, self).__setitem__(n, x)
        else:
            partitionlookup = dataset.partitioning.empty_partitionlookup(delimiter)
            partitionlookup.append(arrays[generator.stops][0] - arrays[generator.starts][0], arrays.keys())
            for n, x in arrays.items():
                super(OAMapGroup, self).__setitem__(partitionlookup.id2name(n, 0), x)
            super(OAMapGroup, self).__setitem__(dataset.partitioning.key, numpy.array(partitionlookup))

        self.attrs[key] = dataset.tojsonstring()

    else:
        # partitioned fill: stream partitions from the iterator, replacing the
        # stored partition lookup as each one is written
        dataset = dataset.copy(partitioning=dataset._get_partitioning(prefix, delimiter))
        partitionlookup = dataset.partitioning.empty_partitionlookup(delimiter)

        super(OAMapGroup, self).__setitem__(dataset.partitioning.key, numpy.array(partitionlookup))
        self.attrs[key] = dataset.tojsonstring()

        for partitionid, (numentries, arrays) in enumerate(
                oamap.fill.fromiterdata(value, generator=generator, limit=partitionlimit,
                                        pointer_fromequal=pointer_fromequal)):
            partitionlookup.append(numentries, arrays.keys())
            for n in arrays:
                if super(OAMapGroup, self).__contains__(n):
                    raise RuntimeError("cannot assign to {0} (dataset exists)".format(repr(n)))
            for n, x in arrays.items():
                super(OAMapGroup, self).__setitem__(partitionlookup.id2name(n, partitionid), x)
            super(OAMapGroup, self).__delitem__(dataset.partitioning.key)
            super(OAMapGroup, self).__setitem__(dataset.partitioning.key, numpy.array(partitionlookup))