def _copy_creation_kws(source, source_h5py, dest_h5py, create_kws):
    """Build the keyword arguments used to create the destination dataset.

    Options given explicitly in `create_kws` always take precedence; any
    option not given is filled in with a default that depends on whether the
    source and destination are h5py or zarr objects.
    """
    kws = create_kws.copy()

    # setup chunks option, preserve by default
    kws.setdefault('chunks', source.chunks)

    # setup compression options
    if source_h5py:
        if dest_h5py:
            # h5py -> h5py; preserve compression options by default
            kws.setdefault('compression', source.compression)
            kws.setdefault('compression_opts', source.compression_opts)
            kws.setdefault('shuffle', source.shuffle)
            kws.setdefault('fletcher32', source.fletcher32)
            kws.setdefault('fillvalue', source.fillvalue)
        else:
            # h5py -> zarr; use zarr default compression options
            kws.setdefault('fill_value', source.fillvalue)
    else:
        if dest_h5py:
            # zarr -> h5py; use some vaguely sensible defaults
            # N.B., 'chunks' has already been set above, so this setdefault
            # only takes effect if that default is ever removed
            kws.setdefault('chunks', True)
            kws.setdefault('compression', 'gzip')
            kws.setdefault('compression_opts', 1)
            kws.setdefault('shuffle', False)
            kws.setdefault('fillvalue', source.fill_value)
        else:
            # zarr -> zarr; preserve compression options by default
            kws.setdefault('compressor', source.compressor)
            kws.setdefault('filters', source.filters)
            kws.setdefault('order', source.order)
            kws.setdefault('fill_value', source.fill_value)

    return kws


def _copy_array_data(source, ds):
    """Copy all data from `source` into `ds`, one destination chunk at a time.

    Going chunk by chunk avoids loading everything into memory. Returns the
    number of bytes copied, computed as ``ds.size * ds.dtype.itemsize``.
    """
    shape = ds.shape
    chunks = ds.chunks
    if chunks is None:
        # The destination reports no chunk shape (h5py documents this for
        # datasets that do not use chunked storage), so treat each dimension
        # as a single chunk. max() keeps the range() step non-zero for
        # zero-length dimensions.
        chunks = tuple(max(s, 1) for s in shape)

    chunk_offsets = [range(0, s, c) for s, c in zip(shape, chunks)]
    for offset in itertools.product(*chunk_offsets):
        sel = tuple(slice(o, min(s, o + c))
                    for o, s, c in zip(offset, shape, chunks))
        ds[sel] = source[sel]

    return ds.size * ds.dtype.itemsize


def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists,
          dry_run, **create_kws):
    """Recursively copy `source` (an array/dataset or a group) into `dest`.

    Parameters
    ----------
    log : callable
        Called with a single message string for every copy or skip.
    source : group or array/dataset
        Object to copy. Anything with a ``shape`` attribute is treated as an
        array, everything else as a group.
    dest : group or None
        Group to copy into. May be None if this is a dry run.
    name : str or None
        Name to copy to within `dest`. If None, the last component of
        ``source.name`` is used.
    root : bool
        If True, a source group is copied even when `shallow` is True.
    shallow : bool
        If True, groups are only descended into when `root` is True; other
        groups are ignored without being logged or counted.
    without_attrs : bool
        If True, attributes are not copied.
    if_exists : {'raise', 'replace', 'skip', 'skip_initialized'}
        What to do if an object called `name` already exists in `dest`.
    dry_run : bool
        If True, log and count what would happen without modifying `dest`.
    **create_kws
        Passed through to ``dest.create_dataset``, overriding the defaults
        derived from `source`.

    Returns
    -------
    n_copied : int
        Number of items copied (or that would be copied in a dry run).
    n_skipped : int
        Number of items skipped.
    n_bytes_copied : int
        Number of bytes of array data actually copied.

    Raises
    ------
    ValueError
        If `if_exists` is not a valid option, or is 'skip_initialized' and
        the destination is an h5py object.
    TypeError
        If `name` is None and `source` has no name of its own.
    CopyError
        If an object is in the way in `dest` and `if_exists` is 'raise'.
    """
    # N.B., if this is a dry run, dest may be None

    # setup counting variables
    n_copied = n_skipped = n_bytes_copied = 0

    # are we copying to/from h5py?
    source_h5py = source.__module__.startswith('h5py.')
    dest_h5py = dest is not None and dest.__module__.startswith('h5py.')

    # check if_exists parameter
    valid_if_exists = ['raise', 'replace', 'skip', 'skip_initialized']
    if if_exists not in valid_if_exists:
        raise ValueError('if_exists must be one of {!r}; found {!r}'.format(
            valid_if_exists, if_exists))
    if dest_h5py and if_exists == 'skip_initialized':
        raise ValueError(
            '{!r} can only be used when copying to zarr'.format(if_exists))

    # determine name to copy to
    if name is None:
        name = source.name.split('/')[-1]
        if not name:
            # this can happen if source is the root group
            raise TypeError('source has no name, please provide the `name` '
                            'parameter to indicate a name to copy to')

    if hasattr(source, 'shape'):
        # copy a dataset/array

        # check if already exists, decide what to do
        do_copy = True
        exists = dest is not None and name in dest
        if exists:
            if if_exists == 'raise':
                raise CopyError('an object {!r} already exists in destination '
                                '{!r}'.format(name, dest.name))
            elif if_exists == 'skip':
                do_copy = False
            elif if_exists == 'skip_initialized':
                ds = dest[name]
                if ds.nchunks_initialized == ds.nchunks:
                    do_copy = False

        # take action
        if do_copy:

            # log a message about what we're going to do
            log('copy {} {} {}'.format(source.name, source.shape,
                                       source.dtype))

            if not dry_run:

                # clear the way
                if exists:
                    del dest[name]

                # create new dataset in destination
                kws = _copy_creation_kws(source, source_h5py, dest_h5py,
                                         create_kws)
                ds = dest.create_dataset(name, shape=source.shape,
                                         dtype=source.dtype, **kws)

                # copy data
                n_bytes_copied += _copy_array_data(source, ds)

                # copy attributes
                if not without_attrs:
                    ds.attrs.update(source.attrs)

            # a dry run still counts the item as copied
            n_copied += 1

        else:
            log('skip {} {} {}'.format(source.name, source.shape,
                                       source.dtype))
            n_skipped += 1

    elif root or not shallow:
        # copy a group

        # check if an array is in the way
        do_copy = True
        exists_array = (dest is not None and name in dest and
                        hasattr(dest[name], 'shape'))
        if exists_array:
            if if_exists == 'raise':
                raise CopyError('an array {!r} already exists in destination '
                                '{!r}'.format(name, dest.name))
            elif if_exists == 'skip':
                do_copy = False

        # take action
        if do_copy:

            # log action
            log('copy {}'.format(source.name))

            if not dry_run:

                # clear the way
                if exists_array:
                    del dest[name]

                # require group in destination
                grp = dest.require_group(name)

                # copy attributes
                if not without_attrs:
                    grp.attrs.update(source.attrs)

            elif dest is None or exists_array:
                # dry run with nothing usable at the destination: either
                # there is no destination, or the array in the way would be
                # replaced by a new, empty group in a real run
                grp = None

            else:
                # setup for dry run without creating any groups in the
                # destination
                grp = dest.get(name, None)

            # recurse
            for k in source.keys():
                c, s, b = _copy(
                    log, source[k], grp, name=k, root=False, shallow=shallow,
                    without_attrs=without_attrs, if_exists=if_exists,
                    dry_run=dry_run, **create_kws)
                n_copied += c
                n_skipped += s
                n_bytes_copied += b

            n_copied += 1

        else:
            log('skip {}'.format(source.name))
            n_skipped += 1

    return n_copied, n_skipped, n_bytes_copied
def copy_store(source, dest, source_path='', dest_path='', excludes=None,
               includes=None, flags=0, if_exists='raise', dry_run=False,
               log=None):
    """Copy data directly from the `source` store to the `dest` store.

    Use this function when you want to copy a group or array in the most
    efficient way, preserving all configuration and attributes. It is more
    efficient than the copy() or copy_all() functions because data are not
    de-compressed and re-compressed: the compressed chunk data for each
    array are copied directly between stores.

    Parameters
    ----------
    source : Mapping
        Store to copy data from.
    dest : MutableMapping
        Store to copy data into.
    source_path : str, optional
        Only copy data from under this path in the source store.
    dest_path : str, optional
        Copy data into this path in the destination store.
    excludes : sequence of str, optional
        One or more regular expressions which will be matched against keys
        in the source store. Any matching key will not be copied.
    includes : sequence of str, optional
        One or more regular expressions which will be matched against keys
        in the source store and will override any excludes also matching.
    flags : int, optional
        Regular expression flags used for matching excludes and includes.
    if_exists : {'raise', 'replace', 'skip'}, optional
        How to handle keys that already exist in the destination store. If
        'raise' then a CopyError is raised on the first key already present
        in the destination store. If 'replace' then any data will be
        replaced in the destination. If 'skip' then any existing keys will
        not be copied.
    dry_run : bool, optional
        If True, don't actually copy anything, just log what would have
        happened.
    log : callable, file path or file-like object, optional
        If provided, will be used to log progress information.

    Returns
    -------
    n_copied : int
        Number of items copied.
    n_skipped : int
        Number of items skipped.
    n_bytes_copied : int
        Number of bytes of data that were actually copied.

    Examples
    --------

    >>> import zarr
    >>> store1 = zarr.DirectoryStore('data/example.zarr')
    >>> root = zarr.group(store1, overwrite=True)
    >>> foo = root.create_group('foo')
    >>> bar = foo.create_group('bar')
    >>> baz = bar.create_dataset('baz', shape=100, chunks=50, dtype='i8')
    >>> import numpy as np
    >>> baz[:] = np.arange(100)
    >>> root.tree()
    /
     └── foo
         └── bar
             └── baz (100,) int64
    >>> from sys import stdout
    >>> store2 = zarr.ZipStore('data/example.zip', mode='w')
    >>> zarr.copy_store(store1, store2, log=stdout)
    copy .zgroup
    copy foo/.zgroup
    copy foo/bar/.zgroup
    copy foo/bar/baz/.zarray
    copy foo/bar/baz/0
    copy foo/bar/baz/1
    all done: 6 copied, 0 skipped, 566 bytes copied
    (6, 0, 566)
    >>> new_root = zarr.group(store2)
    >>> new_root.tree()
    /
     └── foo
         └── bar
             └── baz (100,) int64
    >>> new_root['foo/bar/baz'][:]
    array([ 0,  1,  2, ..., 97, 98, 99])
    >>> store2.close()  # zip stores need to be closed

    Notes
    -----
    Please note that this is an experimental feature. The behaviour of this
    function is still evolving and the default behaviour and/or parameters
    may change in future versions.

    """

    def _compile_patterns(patterns):
        # accept None, a single pattern string, or a sequence of patterns
        if patterns is None:
            patterns = []
        elif isinstance(patterns, str):
            patterns = [patterns]
        return [re.compile(pattern, flags) for pattern in patterns]

    # normalize both paths; a non-empty path becomes a 'path/' key prefix
    source_path = normalize_storage_path(source_path)
    dest_path = normalize_storage_path(dest_path)
    if source_path:
        source_path += '/'
    if dest_path:
        dest_path += '/'

    # compile the exclude/include filters
    excludes = _compile_patterns(excludes)
    includes = _compile_patterns(includes)

    # validate the if_exists option
    valid_if_exists = ['raise', 'replace', 'skip']
    if if_exists not in valid_if_exists:
        raise ValueError('if_exists must be one of {!r}; found {!r}'.format(
            valid_if_exists, if_exists))

    # running totals
    n_copied = 0
    n_skipped = 0
    n_bytes_copied = 0

    with _LogWriter(log) as write_log:

        for source_key in sorted(source.keys()):

            # only consider keys under the source path
            if not source_key.startswith(source_path):
                continue

            # a key is dropped when some exclude matches and no include
            # rescues it
            if (any(prog.search(source_key) for prog in excludes) and
                    not any(prog.search(source_key) for prog in includes)):
                continue

            # translate the key into the destination namespace
            dest_key = dest_path + source_key[len(source_path):]

            # label used in log messages
            if dest_key == source_key:
                descr = source_key
            else:
                descr = source_key + ' -> ' + dest_key

            # existing destination keys only matter when not replacing
            do_copy = True
            if if_exists != 'replace' and dest_key in dest:
                if if_exists == 'raise':
                    raise CopyError(
                        'key {!r} exists in destination'.format(dest_key))
                elif if_exists == 'skip':
                    do_copy = False

            if not do_copy:
                write_log('skip {}'.format(descr))
                n_skipped += 1
                continue

            write_log('copy {}'.format(descr))
            if not dry_run:
                data = source[source_key]
                n_bytes_copied += buffer_size(data)
                dest[dest_key] = data
            n_copied += 1

        # final summary, emitted while the log writer is still open
        _log_copy_summary(write_log, dry_run, n_copied, n_skipped,
                          n_bytes_copied)

    return n_copied, n_skipped, n_bytes_copied