コード例 #1
0
ファイル: convenience.py プロジェクト: grlee77/zarr-python
def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists,
          dry_run, **create_kws):
    """Recursively copy `source` (an array/dataset or a group) into `dest`.

    Any object exposing a ``shape`` attribute is treated as an array/dataset;
    everything else is treated as a group whose members are copied by
    recursing over ``source.keys()``.

    Parameters
    ----------
    log : callable
        Called with one message string per object copied or skipped.
    source : group or array
        Object to copy from. An object whose ``__module__`` starts with
        ``'h5py.'`` is handled as an h5py object.
    dest : group or None
        Group to copy into. May be None, but only during a dry run.
    name : str or None
        Name to create under `dest`. If None, the last component of
        ``source.name`` is used.
    root : bool
        True for the top-level call. A root-level group is always descended
        into, even when `shallow` is True.
    shallow : bool
        If True, groups below the root are ignored entirely (they are neither
        logged nor counted).
    without_attrs : bool
        If True, attributes are not copied.
    if_exists : {'raise', 'replace', 'skip', 'skip_initialized'}
        What to do when `name` already exists in `dest`.
        'skip_initialized' skips an existing array only when all of its
        chunks are initialized, and is rejected for h5py destinations.
    dry_run : bool
        If True, only log and count what would happen; `dest` is not modified.
    **create_kws
        Passed to ``dest.create_dataset``; these take precedence over the
        defaults derived from `source`.

    Returns
    -------
    n_copied : int
        Number of arrays and groups copied (or that would be, in a dry run).
    n_skipped : int
        Number of arrays and groups skipped.
    n_bytes_copied : int
        Number of bytes of array data written; always 0 in a dry run.

    Raises
    ------
    ValueError
        If `if_exists` is not a recognised value, or is 'skip_initialized'
        while copying to h5py.
    TypeError
        If `name` is None and `source` has an empty name.
    CopyError
        If `if_exists` is 'raise' and an object is already in the way.
    """
    # N.B., if this is a dry run, dest may be None

    # setup counting variables
    n_copied = n_skipped = n_bytes_copied = 0

    # are we copying to/from h5py?
    source_h5py = source.__module__.startswith('h5py.')
    dest_h5py = dest is not None and dest.__module__.startswith('h5py.')

    # check if_exists parameter
    valid_if_exists = ['raise', 'replace', 'skip', 'skip_initialized']
    if if_exists not in valid_if_exists:
        raise ValueError('if_exists must be one of {!r}; found {!r}'.format(
            valid_if_exists, if_exists))
    if dest_h5py and if_exists == 'skip_initialized':
        raise ValueError(
            '{!r} can only be used when copying to zarr'.format(if_exists))

    # determine name to copy to
    if name is None:
        name = source.name.split('/')[-1]
        if not name:
            # this can happen if source is the root group
            raise TypeError('source has no name, please provide the `name` '
                            'parameter to indicate a name to copy to')

    if hasattr(source, 'shape'):
        # copy a dataset/array

        # check if already exists, decide what to do
        do_copy = True
        exists = dest is not None and name in dest
        if exists:
            if if_exists == 'raise':
                raise CopyError('an object {!r} already exists in destination '
                                '{!r}'.format(name, dest.name))
            elif if_exists == 'skip':
                do_copy = False
            elif if_exists == 'skip_initialized':
                ds = dest[name]
                if ds.nchunks_initialized == ds.nchunks:
                    do_copy = False

        # take action
        if do_copy:

            # log a message about what we're going to do
            log('copy {} {} {}'.format(source.name, source.shape,
                                       source.dtype))

            if not dry_run:

                # clear the way
                if exists:
                    del dest[name]

                # setup creation keyword arguments; work on a copy so the
                # caller's keywords are not altered between recursive calls
                kws = create_kws.copy()

                # setup chunks option, preserve by default
                kws.setdefault('chunks', source.chunks)

                # setup compression options
                if source_h5py:
                    if dest_h5py:
                        # h5py -> h5py; preserve compression options by default
                        kws.setdefault('compression', source.compression)
                        kws.setdefault('compression_opts',
                                       source.compression_opts)
                        kws.setdefault('shuffle', source.shuffle)
                        kws.setdefault('fletcher32', source.fletcher32)
                        kws.setdefault('fillvalue', source.fillvalue)
                    else:
                        # h5py -> zarr; use zarr default compression options
                        kws.setdefault('fill_value', source.fillvalue)
                else:
                    if dest_h5py:
                        # zarr -> h5py; use some vaguely sensible defaults
                        # NOTE: 'chunks' has already been given a default
                        # above, so the next line never changes anything.
                        kws.setdefault('chunks', True)
                        kws.setdefault('compression', 'gzip')
                        kws.setdefault('compression_opts', 1)
                        kws.setdefault('shuffle', False)
                        kws.setdefault('fillvalue', source.fill_value)
                    else:
                        # zarr -> zarr; preserve compression options by default
                        kws.setdefault('compressor', source.compressor)
                        kws.setdefault('filters', source.filters)
                        kws.setdefault('order', source.order)
                        kws.setdefault('fill_value', source.fill_value)

                # create new dataset in destination
                ds = dest.create_dataset(name,
                                         shape=source.shape,
                                         dtype=source.dtype,
                                         **kws)

                # copy data
                shape = ds.shape
                chunks = ds.chunks
                if chunks is None:
                    # the destination reports no chunk grid (h5py does this
                    # for contiguous datasets), so there is nothing to
                    # iterate over; copy everything in a single assignment
                    ds[...] = source[...]
                else:
                    # N.B., go chunk by chunk to avoid loading everything
                    # into memory
                    chunk_offsets = [range(0, s, c)
                                     for s, c in zip(shape, chunks)]
                    for offset in itertools.product(*chunk_offsets):
                        # clip the final chunk along each axis to the shape
                        sel = tuple(
                            slice(o, min(s, o + c))
                            for o, s, c in zip(offset, shape, chunks))
                        ds[sel] = source[sel]
                n_bytes_copied += ds.size * ds.dtype.itemsize

                # copy attributes
                if not without_attrs:
                    ds.attrs.update(source.attrs)

            n_copied += 1

        else:
            log('skip {} {} {}'.format(source.name, source.shape,
                                       source.dtype))
            n_skipped += 1

    elif root or not shallow:
        # copy a group

        # check if an array is in the way
        do_copy = True
        exists_array = (dest is not None and name in dest
                        and hasattr(dest[name], 'shape'))
        if exists_array:
            if if_exists == 'raise':
                raise CopyError('an array {!r} already exists in destination '
                                '{!r}'.format(name, dest.name))
            elif if_exists == 'skip':
                do_copy = False

        # take action
        if do_copy:

            # log action
            log('copy {}'.format(source.name))

            if not dry_run:

                # clear the way
                if exists_array:
                    del dest[name]

                # require group in destination
                grp = dest.require_group(name)

                # copy attributes
                if not without_attrs:
                    grp.attrs.update(source.attrs)

            else:

                # setup for dry run without creating any groups in the
                # destination. An array in the way would have been deleted
                # and replaced by a new, empty group in a real run, so the
                # members must be processed as if nothing exists yet rather
                # than treating that array as the destination group.
                if dest is None or exists_array:
                    grp = None
                else:
                    grp = dest.get(name, None)

            # recurse
            for k in source.keys():
                c, s, b = _copy(log,
                                source[k],
                                grp,
                                name=k,
                                root=False,
                                shallow=shallow,
                                without_attrs=without_attrs,
                                if_exists=if_exists,
                                dry_run=dry_run,
                                **create_kws)
                n_copied += c
                n_skipped += s
                n_bytes_copied += b

            n_copied += 1

        else:
            log('skip {}'.format(source.name))
            n_skipped += 1

    return n_copied, n_skipped, n_bytes_copied
コード例 #2
0
ファイル: convenience.py プロジェクト: grlee77/zarr-python
def copy_store(source,
               dest,
               source_path='',
               dest_path='',
               excludes=None,
               includes=None,
               flags=0,
               if_exists='raise',
               dry_run=False,
               log=None):
    """Copy keys and their stored values directly from `source` to `dest`.

    Values are moved between the stores exactly as they are stored, so
    compressed chunk data is neither decoded nor re-encoded. This makes the
    function the most efficient way to copy a group or array together with
    all of its configuration and attributes, and cheaper than copy() or
    copy_all().

    Parameters
    ----------
    source : Mapping
        Store to read from.
    dest : MutableMapping
        Store to write into.
    source_path : str, optional
        Restrict the copy to keys found under this path in `source`.
    dest_path : str, optional
        Path in `dest` under which the copied keys are placed.
    excludes : sequence of str, optional
        Regular expression(s) searched for in each source key; a key that
        matches any of them is not copied.
    includes : sequence of str, optional
        Regular expression(s) searched for in each source key; a match
        overrides any exclude that also matched.
    flags : int, optional
        Regular expression flags used when compiling excludes and includes.
    if_exists : {'raise', 'replace', 'skip'}, optional
        Policy for keys already present in `dest`. 'raise' raises a CopyError
        on the first such key, 'replace' overwrites the existing data and
        'skip' leaves the existing key alone.
    dry_run : bool, optional
        If True, nothing is copied; the actions that would have been taken
        are only logged.
    log : callable, file path or file-like object, optional
        Where to send progress information, if anywhere.

    Returns
    -------
    n_copied : int
        Number of items copied.
    n_skipped : int
        Number of items skipped.
    n_bytes_copied : int
        Number of bytes of data that were actually copied.

    Examples
    --------

    >>> import zarr
    >>> store1 = zarr.DirectoryStore('data/example.zarr')
    >>> root = zarr.group(store1, overwrite=True)
    >>> foo = root.create_group('foo')
    >>> bar = foo.create_group('bar')
    >>> baz = bar.create_dataset('baz', shape=100, chunks=50, dtype='i8')
    >>> import numpy as np
    >>> baz[:] = np.arange(100)
    >>> root.tree()
    /
     └── foo
         └── bar
             └── baz (100,) int64
    >>> from sys import stdout
    >>> store2 = zarr.ZipStore('data/example.zip', mode='w')
    >>> zarr.copy_store(store1, store2, log=stdout)
    copy .zgroup
    copy foo/.zgroup
    copy foo/bar/.zgroup
    copy foo/bar/baz/.zarray
    copy foo/bar/baz/0
    copy foo/bar/baz/1
    all done: 6 copied, 0 skipped, 566 bytes copied
    (6, 0, 566)
    >>> new_root = zarr.group(store2)
    >>> new_root.tree()
    /
     └── foo
         └── bar
             └── baz (100,) int64
    >>> new_root['foo/bar/baz'][:]
    array([ 0,  1,  2,  ..., 97, 98, 99])
    >>> store2.close()  # zip stores need to be closed

    Notes
    -----
    This is an experimental feature. Its behaviour is still evolving, and
    the defaults and/or parameters may change in future versions.

    """

    def _compile_all(patterns):
        # accept nothing, a single pattern string, or a sequence of patterns
        if patterns is None:
            patterns = []
        elif isinstance(patterns, str):
            patterns = [patterns]
        return [re.compile(p, flags) for p in patterns]

    # turn each path into a key prefix: '' stays '', anything else gains a
    # trailing slash
    src_prefix = normalize_storage_path(source_path)
    dst_prefix = normalize_storage_path(dest_path)
    if src_prefix:
        src_prefix += '/'
    if dst_prefix:
        dst_prefix += '/'

    # compile the key filters
    exclude_progs = _compile_all(excludes)
    include_progs = _compile_all(includes)

    # validate the overwrite policy
    valid_if_exists = ['raise', 'replace', 'skip']
    if if_exists not in valid_if_exists:
        raise ValueError('if_exists must be one of {!r}; found {!r}'.format(
            valid_if_exists, if_exists))

    # running totals
    n_copied = 0
    n_skipped = 0
    n_bytes_copied = 0

    with _LogWriter(log) as emit:

        for source_key in sorted(source.keys()):

            # only keys under the source path are of interest
            if not source_key.startswith(src_prefix):
                continue

            # a key is dropped when some exclude matches and no include
            # rescues it; includes are only consulted after an exclude hit
            excluded = (
                any(prog.search(source_key) for prog in exclude_progs)
                and not any(prog.search(source_key) for prog in include_progs)
            )
            if excluded:
                continue

            # re-root the key under the destination path
            dest_key = dst_prefix + source_key[len(src_prefix):]

            # label used in log messages
            if dest_key == source_key:
                descr = source_key
            else:
                descr = source_key + ' -> ' + dest_key

            # the destination is only consulted when the policy is not
            # 'replace'
            in_the_way = if_exists != 'replace' and dest_key in dest
            if in_the_way:
                if if_exists == 'raise':
                    raise CopyError(
                        'key {!r} exists in destination'.format(dest_key))
                emit('skip {}'.format(descr))
                n_skipped += 1
                continue

            emit('copy {}'.format(descr))
            if not dry_run:
                data = source[source_key]
                n_bytes_copied += buffer_size(data)
                dest[dest_key] = data
            n_copied += 1

        # finish with a one-line summary of what happened
        _log_copy_summary(emit, dry_run, n_copied, n_skipped, n_bytes_copied)

    return n_copied, n_skipped, n_bytes_copied