def download_raw_array(self, object_name, buffersize=2**16, **kwargs):
    """Download a binary np.ndarray and return an np.ndarray object.

    This method downloads an array directly into memory, without any
    disk overhead.

    Parameters
    ----------
    object_name : str
    buffersize : int, optional (default: 2**16)

    Returns
    -------
    array : np.ndarray

    Notes
    -----
    The object must have metadata containing: shape, dtype, and a
    compression scheme (or the legacy gzip boolean flag). This is all
    handled automatically by ``upload_raw_array``.
    """
    self.exists_object(object_name, raise_err=True)
    arraystream = self.download_stream(object_name)

    shape = arraystream.metadata['shape']
    shape = tuple(map(int, shape.split(','))) if shape else ()
    dtype = np.dtype(arraystream.metadata['dtype'])
    order = arraystream.metadata.get('order', 'C')
    array = np.empty(shape, dtype=dtype, order=order)
    body = arraystream.content

    if 'gzip' in arraystream.metadata:
        # Backward compatibility: over-write the "compression" value
        # from the legacy "gzip" boolean flag.
        isgzipped = string2bool(arraystream.metadata['gzip'])
        arraystream.metadata['compression'] = 'gzip' if isgzipped else 'False'
    assert 'compression' in arraystream.metadata

    if arraystream.metadata['compression'] == 'gzip':
        # gzipped: wrap the body so it decompresses on read
        datastream = GzipInputStream(body)
    elif arraystream.metadata['compression'] in ['False', 'None']:
        # uncompressed data
        datastream = body
    else:
        # numcodecs compression: the codec cannot decode a stream, so the
        # whole payload must be read into memory first (memory hungry!)
        compression = arraystream.metadata['compression']
        decompressor = numcodecs.get_codec(dict(id=compression.lower()))
        bits = decompressor.decode(arraystream.content.read())
        array = np.frombuffer(bits, dtype=dtype)
        return array.reshape(shape)

    read_buffered(datastream, array, buffersize=buffersize)
    return array
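# The numcodecs branch above cannot stream-decode, so it reads the full
# compressed payload and rebuilds the array with np.frombuffer. A minimal,
# self-contained sketch of that decode path (the codec id 'zstd' and the
# data below are illustrative, not part of the method above):
import numcodecs
import numpy as np

original = np.arange(12, dtype='float64').reshape(3, 4)
codec = numcodecs.get_codec(dict(id='zstd'))            # config-dict codec lookup
payload = codec.encode(np.ascontiguousarray(original))  # compressed bytes

decoded = np.frombuffer(codec.decode(payload), dtype=original.dtype)
decoded = decoded.reshape(original.shape)
assert np.array_equal(decoded, original)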
import zarr
import numcodecs


def create_dataset(output_n5, template_n5, compression='same', dtype='same',
                   overwrite=True):
    """Create an empty '/s0' data set in ``output_n5``, copying the shape
    and chunking of the '/s0' data set in ``template_n5``."""
    data_set = '/s0'
    template = zarr.open(store=zarr.N5Store(template_n5), mode='r')[data_set]
    out = zarr.open(store=zarr.N5Store(output_n5), mode='a')

    if compression == 'raw':
        compressor = None
    elif compression == 'same':
        compressor = template.compressor
    else:
        compressor = numcodecs.get_codec(dict(id=compression))

    if dtype == 'same':
        dtype = template.dtype

    print("Creating n5 data set with:")
    print(f"  compressor: {compressor or 'raw'}")
    print(f"  shape:      {template.shape}")
    print(f"  chunking:   {template.chunks}")
    print(f"  dtype:      {dtype}")
    print(f"  to path:    {output_n5}{data_set}")
    out.create_dataset(data_set, shape=template.shape, chunks=template.chunks,
                       dtype=dtype, compressor=compressor, overwrite=overwrite)
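# A hedged, self-contained check of create_dataset: build a small template
# N5 container first, then clone its layout with a different codec. The
# /tmp paths are illustrative; 'gzip' is used because the N5 format only
# supports a subset of numcodecs compressors.
if __name__ == '__main__':
    template_path = '/tmp/template.n5'
    output_path = '/tmp/cloned.n5'

    root = zarr.open(store=zarr.N5Store(template_path), mode='a')
    root.create_dataset('s0', shape=(64, 64), chunks=(32, 32),
                        dtype='uint16', overwrite=True)

    # clone the template's shape/chunking, swapping in gzip compression
    create_dataset(output_path, template_path, compression='gzip')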
@classmethod
def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes:
    dtype = meta["dtype"]
    sdshape = ()
    if dtype.subdtype is not None:
        # for sub-array dtypes, fold the sub-shape into the array shape
        dtype, sdshape = dtype.subdtype
    dimension_separator = meta.get("dimension_separator")
    if dtype.hasobject:
        import numcodecs
        object_codec = numcodecs.get_codec(meta["filters"][0])
    else:
        object_codec = None

    meta = dict(
        zarr_format=cls.ZARR_FORMAT,
        shape=meta["shape"] + sdshape,
        chunks=meta["chunks"],
        dtype=cls.encode_dtype(dtype),
        compressor=meta["compressor"],
        fill_value=cls.encode_fill_value(meta["fill_value"], dtype, object_codec),
        order=meta["order"],
        filters=meta["filters"],
    )
    if dimension_separator:
        meta["dimension_separator"] = dimension_separator

    return json_dumps(meta)
@classmethod
def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]:
    meta = cls.parse_metadata(s)

    # check metadata format
    zarr_format = meta.get("zarr_format", None)
    if zarr_format != cls.ZARR_FORMAT:
        raise MetadataError("unsupported zarr format: %s" % zarr_format)

    # extract array metadata fields
    try:
        dtype = cls.decode_dtype(meta["dtype"])
        if dtype.hasobject:
            import numcodecs
            object_codec = numcodecs.get_codec(meta["filters"][0])
        else:
            object_codec = None

        dimension_separator = meta.get("dimension_separator", None)
        fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec)
        meta = dict(
            zarr_format=meta["zarr_format"],
            shape=tuple(meta["shape"]),
            chunks=tuple(meta["chunks"]),
            dtype=dtype,
            compressor=meta["compressor"],
            fill_value=fill_value,
            order=meta["order"],
            filters=meta["filters"],
        )
        if dimension_separator:
            meta["dimension_separator"] = dimension_separator
    except Exception as e:
        raise MetadataError("error decoding metadata") from e
    else:
        return meta
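# Round-trip sketch for the two classmethods above. Assumption (labeled):
# they live on zarr's v2 metadata class, exposed as zarr.meta.Metadata2 in
# recent zarr-python 2.x releases. An object dtype forces the
# numcodecs.get_codec(meta['filters'][0]) path in both directions.
import numpy as np
import numcodecs
from zarr.meta import Metadata2  # assumed import path (zarr-python 2.11+)

vlen = numcodecs.VLenUTF8()
meta = dict(
    shape=(10,), chunks=(5,), dtype=np.dtype(object),
    compressor=None, fill_value=None, order='C',
    filters=[vlen.get_config()],   # filters[0] identifies the object codec
)
encoded = Metadata2.encode_array_metadata(meta)    # JSON bytes
decoded = Metadata2.decode_array_metadata(encoded)
assert decoded['shape'] == (10,) and decoded['dtype'] == np.dtype(object)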
def upload_raw_array(self, object_name, array, compression=DO_COMPRESSION,
                     acl=DEFAULT_ACL, **metadata):
    """Upload a binary representation of a np.ndarray.

    This method reads the array content directly from memory to upload.
    It does not have any disk overhead.

    Parameters
    ----------
    object_name : str
    array : np.ndarray
    compression : str, bool
        `True` uses the configuration defaults. `False` is no compression.
        Available options are: 'gzip', 'LZ4', 'Zlib', 'Zstd', 'BZ2'
        (attend to caps). NB: Zstd appears to be the only one that
        supports >2GB arrays.
    acl : str
        ACL for the object
    **metadata : optional

    Notes
    -----
    This method also uploads the array ``dtype``, ``shape``, ``order``,
    and ``compression`` scheme as metadata.
    """
    if compression is None:
        compression = False

    # Backward compatibility
    if 'gzip' in metadata:
        warn("Deprecated keyword argument `gzip`. Use `compression='gzip'` instead",
             DeprecationWarning)
        gz = metadata.pop('gzip')
        compression = 'gzip' if gz else False

    # gzip cannot compress buffers >2GB, so pick the compressor accordingly
    large_array = array.nbytes > 2**31
    if compression is True:
        compression = COMPRESSION_LARGE if large_array else COMPRESSION_SMALL
    if large_array and compression == 'gzip':
        raise ValueError("gzip does not support compression of >2GB arrays. "
                         "Try `compression='Zstd'` instead.")

    order = 'C' if array.flags.carray else 'F'
    if ((not array.flags['%s_CONTIGUOUS' % order] and six.PY2)
            or (not array.flags['C_CONTIGUOUS'] and six.PY3)):
        warn('Non-contiguous array. Creating copy (will use extra memory)...')
        if six.PY3 and order == 'F':
            # memoryview (PY3) vs buffer (PY2) issues
            warn("PY3: Changing array from 'F' to 'C' order")
            order = 'C'
        # create contiguous copy
        array = np.array(array, order=order)

    meta = dict(dtype=array.dtype.str,
                shape=','.join(map(str, array.shape)),
                compression=str(compression),
                order=order)

    # check for conflicts between user metadata and the reserved keys above
    assert not any(k in meta for k in metadata)
    meta.update(metadata)

    if compression is False:
        filestream = StringIO(array.data)
    elif compression == 'gzip':
        if six.PY3 and array.flags['F_CONTIGUOUS']:
            # eventually, array.data below should be changed to
            # np.getbuffer(array) (not yet working in python3 numpy);
            # F-contiguous arrays break gzip in python 3
            array = array.T
        zipdata = StringIO()
        gz = GzipFile(mode='wb', fileobj=zipdata)
        gz.write(array.data)
        gz.close()
        zipdata.seek(0)
        filestream = zipdata
    elif hasattr(numcodecs, compression.lower()):
        # the specified compression scheme is available in numcodecs
        orig_nbytes = array.nbytes
        compressor = numcodecs.get_codec(dict(id=compression.lower()))
        filestream = StringIO(compressor.encode(array))
        data_nbytes = get_fileobject_size(filestream)
        print('Compressed to %0.2f%% the size'
              % (data_nbytes / float(orig_nbytes) * 100))
    else:
        raise ValueError('Unknown compression scheme: %s' % compression)

    response = self.upload_object(object_name, filestream, acl=acl, **meta)
    return response
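# The numcodecs branch above looks codecs up by their lowercased name. A
# standalone comparison of the schemes named in the docstring (the demo
# data is chosen to be highly compressible):
import numcodecs
import numpy as np

data = np.zeros((1024, 1024), dtype='float32')
for name in ['lz4', 'zlib', 'zstd', 'bz2']:
    codec = numcodecs.get_codec(dict(id=name))
    nbytes = len(codec.encode(data))
    print('%-4s -> %6.3f%% of original size' % (name, 100.0 * nbytes / data.nbytes))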
def __init__(self, codec=None, **kwargs):
    if codec is None:
        import numcodecs
        self.codec = numcodecs.get_codec(kwargs)
    else:
        self.codec = codec
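# What the `codec is None` branch above does: the keyword arguments form a
# numcodecs config dict, so a caller can write Wrapper(id='zstd', level=3)
# (the class name `Wrapper` is hypothetical). A direct demonstration of
# that lookup:
import numcodecs

config = dict(id='zstd', level=3)   # same shape as the **kwargs dict
codec = numcodecs.get_codec(config)
print(codec)                        # Zstd(level=3)
assert codec.get_config() == config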