Example #1
def create_array(read_only=False, **kwargs):
    # Berkeley DB (bsddb3) backed store; the temp file is removed at interpreter exit.
    path = mktemp(suffix='.dbm')
    atexit.register(os.remove, path)
    store = DBMStore(path, flag='n', open=bsddb3.btopen)
    kwargs.setdefault('compressor', Zlib(1))
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
def chromsizes_tsv_to_zarr(input, output, has_header):
    df = pd.read_csv(input, header=(0 if has_header else None), sep='\t')

    num_chroms = df.shape[0]

    columns = df.columns.values.tolist()
    chrom_names = df[columns[0]].values
    chrom_sizes = df[columns[1]].values

    df["name_len"] = df[columns[0]].apply(lambda name: len(name))
    max_name_len = int(df["name_len"].max())

    z = zarr.open(output, mode='w')
    compressor = Zlib(level=1)

    z.create_dataset("names",
                     shape=(num_chroms, ),
                     dtype=f"S{max_name_len}",
                     compressor=compressor)
    z.create_dataset("sizes",
                     shape=(num_chroms, ),
                     dtype="u4",
                     compressor=compressor)
    z["names"][:] = chrom_names
    z["sizes"][:] = chrom_sizes
Example #3
def create_array(read_only=False, **kwargs):
    # NestedDirectoryStore in a temporary directory, removed at interpreter exit.
    path = mkdtemp()
    atexit.register(shutil.rmtree, path)
    store = NestedDirectoryStore(path)
    kwargs.setdefault('compressor', Zlib(1))
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
def chromsizes_negspy_to_zarr(assembly, output, has_header):
    chrom_order = nc.get_chromorder(assembly)
    chrom_info = nc.get_chrominfo(assembly)

    chrom_rows = [{
        0: chrom_name,
        1: chrom_info.chrom_lengths[chrom_name]
    } for chrom_name in chrom_order]

    df = pd.DataFrame(columns=[0, 1], data=chrom_rows)

    num_chroms = df.shape[0]

    columns = df.columns.values.tolist()
    chrom_names = df[columns[0]].values
    chrom_sizes = df[columns[1]].values

    df["name_len"] = df[columns[0]].apply(lambda name: len(name))
    max_name_len = int(df["name_len"].max())

    z = zarr.open(output, mode='w')
    compressor = Zlib(level=1)

    z.create_dataset("names",
                     shape=(num_chroms, ),
                     dtype=f"S{max_name_len}",
                     compressor=compressor)
    z.create_dataset("sizes",
                     shape=(num_chroms, ),
                     dtype="u4",
                     compressor=compressor)
    z["names"][:] = chrom_names
    z["sizes"][:] = chrom_sizes
Example #5
    def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
        """Produce Zarr metadata for all groups and datasets in the HDF5 file.
        """
        refs = {}
        if isinstance(h5obj, h5py.Dataset):
            lggr.debug(f'HDF5 dataset: {h5obj.name}')
            if h5obj.id.get_create_plist().get_layout() == h5py.h5d.COMPACT:
                raise RuntimeError(
                    f'Compact HDF5 datasets not yet supported: <{h5obj.name} '
                    f'{h5obj.shape} {h5obj.dtype} {h5obj.nbytes} bytes>')

            if (h5obj.scaleoffset or h5obj.fletcher32 or
                    h5obj.compression in ('szip', 'lzf')):
                raise RuntimeError(
                    f'{h5obj.name} uses unsupported HDF5 filters')
            if h5obj.compression == 'gzip':
                compression = Zlib(level=h5obj.compression_opts)
            else:
                compression = None
            
            # Add filter for shuffle
            filters = []
            if h5obj.shuffle:
                filters.append(Shuffle(elementsize=h5obj.dtype.itemsize))

            # Get storage info of this HDF5 dataset...
            cinfo = self._storage_info(h5obj)
            if self._xr and h5py.h5ds.is_scale(h5obj.id) and not cinfo:
                return

            # Create a Zarr array equivalent to this HDF5 dataset...
            za = self._zroot.create_dataset(h5obj.name, shape=h5obj.shape,
                                            dtype=h5obj.dtype,
                                            chunks=h5obj.chunks or False,
                                            fill_value=h5obj.fillvalue,
                                            compression=compression,
                                            filters=filters,
                                            overwrite=True)
            lggr.debug(f'Created Zarr array: {za}')
            self._transfer_attrs(h5obj, za)

            if self._xr:
                # Do this for xarray...
                adims = self._get_array_dims(h5obj)
                za.attrs['_ARRAY_DIMENSIONS'] = adims
                lggr.debug(f'_ARRAY_DIMENSIONS = {adims}')

            # Store chunk location metadata...
            if cinfo:
                for k, v in cinfo.items():
                    self.store[za._chunk_key(k)] = [self._uri, v['offset'], v['size']]

        elif isinstance(h5obj, h5py.Group):
            lggr.debug(f'HDF5 group: {h5obj.name}')
            zgrp = self._zroot.create_group(h5obj.name)
            self._transfer_attrs(h5obj, zgrp)
Example #6
def create_array(read_only=False, **kwargs):
    # LMDB-backed store; the test is skipped when the lmdb package is not installed.
    path = mktemp(suffix='.lmdb')
    atexit_rmtree(path)
    try:
        store = LMDBStore(path, buffers=False)
    except ImportError:  # pragma: no cover
        raise SkipTest('lmdb not installed')
    kwargs.setdefault('compressor', Zlib(1))
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
Example #7
def create_array(read_only=False, **kwargs):
    # In-memory dict store with a Delta + FixedScaleOffset filter chain ahead of Zlib.
    store = dict()
    dtype = kwargs.get('dtype', None)
    filters = [
        Delta(dtype=dtype),
        FixedScaleOffset(dtype=dtype, scale=1, offset=0),
    ]
    kwargs.setdefault('filters', filters)
    compressor = Zlib(1)
    kwargs.setdefault('compressor', compressor)
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
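A small sketch of exercising the Delta + FixedScaleOffset + Zlib chain defined above (numpy is assumed to be imported as np, as in the other snippets):

# Build an int32 array through the helper and confirm the codec chain round-trips.
a = create_array(shape=100, chunks=10, dtype='i4')
a[:] = np.arange(100, dtype='i4')
assert a.filters[0].codec_id == 'delta'
assert a.compressor.codec_id == 'zlib'
assert a[10] == 10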
Example #8
def prep_source(source):
    # Populate a source hierarchy; request.param selects an h5py target ('hdf5') or a
    # zarr target, each with its own keyword spelling for compression and fill value.
    foo = source.create_group('foo')
    foo.attrs['experiment'] = 'weird science'
    baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,))
    baz.attrs['units'] = 'metres'
    if request.param == 'hdf5':
        extra_kws = dict(compression='gzip', compression_opts=3, fillvalue=84,
                         shuffle=True, fletcher32=True)
    else:
        extra_kws = dict(compressor=Zlib(3), order='F', fill_value=42,
                         filters=[Adler32()])
    source.create_dataset('spam', data=np.arange(100, 200).reshape(20, 5),
                          chunks=(10, 2), dtype='i2', **extra_kws)
    return source
Example #9
    def test_copy_array_create_options(self, source, dest):
        dest_h5py = dest.__module__.startswith('h5py.')

        # copy array, provide creation options
        compressor = Zlib(9)
        create_kws = dict(chunks=(10,))
        if dest_h5py:
            create_kws.update(compression='gzip', compression_opts=9,
                              shuffle=True, fletcher32=True, fillvalue=42)
        else:
            # v3 case has no filters argument in zarr create_kws
            create_kws.update(compressor=compressor, fill_value=42, order='F')
        copy(source['foo/bar/baz'], dest, without_attrs=True, **create_kws)
        check_copied_array(source['foo/bar/baz'], dest['baz'],
                           without_attrs=True, expect_props=create_kws)
    def test_copy_array_create_options(self):
        source = self.source
        dest = self.new_dest()

        # copy array, provide creation options
        compressor = Zlib(9)
        create_kws = dict(chunks=(10,))
        if self.dest_h5py:
            create_kws.update(compression='gzip', compression_opts=9,
                              shuffle=True, fletcher32=True, fillvalue=42)
        else:
            create_kws.update(compressor=compressor, fill_value=42, order='F',
                              filters=[Adler32()])
        copy(source['foo/bar/baz'], dest, without_attrs=True, **create_kws)
        check_copied_array(source['foo/bar/baz'], dest['baz'],
                           without_attrs=True, expect_props=create_kws)
    def translator(self, name, h5obj):
        """Produce Zarr metadata for all groups and datasets in the HDF5 file.
        """
        if isinstance(h5obj, h5py.Dataset):
            lggr.debug(f'Dataset: {h5obj.name}')
            if (h5obj.scaleoffset or h5obj.fletcher32 or h5obj.shuffle or
                    h5obj.compression in ('szip', 'lzf')):
                raise RuntimeError(
                    f'{h5obj.name} uses unsupported HDF5 filters')
            if h5obj.compression == 'gzip':
                compression = Zlib(level=h5obj.compression_opts)
            else:
                compression = None

            # Get storage info of this HDF5 dataset...
            cinfo = self.storage_info(h5obj)
            if self._xr and h5py.h5ds.is_scale(h5obj.id) and not cinfo:
                return

            # Create a Zarr array equivalent to this HDF5 dataset...
            za = self._zroot.create_dataset(h5obj.name, shape=h5obj.shape,
                                            dtype=h5obj.dtype,
                                            chunks=h5obj.chunks or False,
                                            fill_value=h5obj.fillvalue,
                                            compression=compression,
                                            overwrite=True)
            lggr.debug(f'Created Zarr array: {za}')
            self.transfer_attrs(h5obj, za)

            if self._xr:
                # Do this for xarray...
                adims = self._get_array_dims(h5obj)
                za.attrs['_ARRAY_DIMENSIONS'] = adims
                lggr.debug(f'_ARRAY_DIMENSIONS = {adims}')

            # Store chunk location metadata...
            if cinfo:
                cinfo['source'] = {'uri': self._uri,
                                   'array_name': h5obj.name}
                FileChunkStore.chunks_info(za, cinfo)

        elif isinstance(h5obj, h5py.Group):
            lggr.debug(f'Group: {h5obj.name}')
            zgrp = self._zroot.create_group(h5obj.name)
            self.transfer_attrs(h5obj, zgrp)
Example #12
import numpy as np
from numcodecs import Zlib
import zarr

if __name__ == "__main__":
    arr = np.arange(3 * 12 * 6).reshape(3, 12, 6)

    z = zarr.open(
        "dummy_data.zarr",
        mode="w",
        shape=arr.shape,
        compressor=Zlib(level=1),
        chunks=(3, 3, 3),
        dtype="<i4",
    )

    z[:, :, :] = arr
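A short read-back check for the script above, assuming it has already been run in the current working directory:

# Reopen the store read-only and verify shape, dtype and a corner of the first chunk.
z = zarr.open("dummy_data.zarr", mode="r")
assert z.shape == (3, 12, 6)
assert z.dtype == np.dtype("<i4")
print(z[0, :3, :3])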
Example #13
    def test_create_dataset(self):
        g = self.create_group()

        # create as immediate child
        d1 = g.create_dataset('foo', shape=1000, chunks=100)
        assert isinstance(d1, Array)
        assert (1000,) == d1.shape
        assert (100,) == d1.chunks
        assert 'foo' == d1.path
        assert '/foo' == d1.name
        assert g.store is d1.store

        # create as descendant
        d2 = g.create_dataset('/a/b/c/', shape=2000, chunks=200, dtype='i1',
                              compression='zlib', compression_opts=9,
                              fill_value=42, order='F')
        assert isinstance(d2, Array)
        assert (2000,) == d2.shape
        assert (200,) == d2.chunks
        assert np.dtype('i1') == d2.dtype
        assert 'zlib' == d2.compressor.codec_id
        assert 9 == d2.compressor.level
        assert 42 == d2.fill_value
        assert 'F' == d2.order
        assert 'a/b/c' == d2.path
        assert '/a/b/c' == d2.name
        assert g.store is d2.store

        # create with data
        data = np.arange(3000, dtype='u2')
        d3 = g.create_dataset('bar', data=data, chunks=300)
        assert isinstance(d3, Array)
        assert (3000,) == d3.shape
        assert (300,) == d3.chunks
        assert np.dtype('u2') == d3.dtype
        assert_array_equal(data, d3[:])
        assert 'bar' == d3.path
        assert '/bar' == d3.name
        assert g.store is d3.store

        # compression arguments handling follows...

        # compression_opts as dict
        d = g.create_dataset('aaa', shape=1000, dtype='u1',
                             compression='blosc',
                             compression_opts=dict(cname='zstd', clevel=1, shuffle=2))
        assert d.compressor.codec_id == 'blosc'
        assert 'zstd' == d.compressor.cname
        assert 1 == d.compressor.clevel
        assert 2 == d.compressor.shuffle

        # compression_opts as sequence
        d = g.create_dataset('bbb', shape=1000, dtype='u1',
                             compression='blosc',
                             compression_opts=('zstd', 1, 2))
        assert d.compressor.codec_id == 'blosc'
        assert 'zstd' == d.compressor.cname
        assert 1 == d.compressor.clevel
        assert 2 == d.compressor.shuffle

        # None compression_opts
        d = g.create_dataset('ccc', shape=1000, dtype='u1', compression='zlib')
        assert d.compressor.codec_id == 'zlib'
        assert 1 == d.compressor.level

        # None compression
        d = g.create_dataset('ddd', shape=1000, dtype='u1', compression=None)
        assert d.compressor is None

        # compressor as compression
        d = g.create_dataset('eee', shape=1000, dtype='u1', compression=Zlib(1))
        assert d.compressor.codec_id == 'zlib'
        assert 1 == d.compressor.level
Example #14
    def test_create_dataset(self):
        g = self.create_group()

        # create as immediate child
        d1 = g.create_dataset('foo', shape=1000, chunks=100)
        assert_is_instance(d1, Array)
        eq((1000, ), d1.shape)
        eq((100, ), d1.chunks)
        eq('foo', d1.path)
        eq('/foo', d1.name)
        assert_is(g.store, d1.store)

        # create as descendant
        d2 = g.create_dataset('/a/b/c/',
                              shape=2000,
                              chunks=200,
                              dtype='i1',
                              compression='zlib',
                              compression_opts=9,
                              fill_value=42,
                              order='F')
        assert_is_instance(d2, Array)
        eq((2000, ), d2.shape)
        eq((200, ), d2.chunks)
        eq(np.dtype('i1'), d2.dtype)
        eq('zlib', d2.compressor.codec_id)
        eq(9, d2.compressor.level)
        eq(42, d2.fill_value)
        eq('F', d2.order)
        eq('a/b/c', d2.path)
        eq('/a/b/c', d2.name)
        assert_is(g.store, d2.store)

        # create with data
        data = np.arange(3000, dtype='u2')
        d3 = g.create_dataset('bar', data=data, chunks=300)
        assert_is_instance(d3, Array)
        eq((3000, ), d3.shape)
        eq((300, ), d3.chunks)
        eq(np.dtype('u2'), d3.dtype)
        assert_array_equal(data, d3[:])
        eq('bar', d3.path)
        eq('/bar', d3.name)
        assert_is(g.store, d3.store)

        # compression arguments handling follows...

        # compression_opts as dict
        d = g.create_dataset('aaa',
                             shape=1000,
                             dtype='u1',
                             compression='blosc',
                             compression_opts=dict(cname='zstd',
                                                   clevel=1,
                                                   shuffle=2))
        eq(d.compressor.codec_id, 'blosc')
        eq('zstd', d.compressor.cname)
        eq(1, d.compressor.clevel)
        eq(2, d.compressor.shuffle)

        # compression_opts as sequence
        d = g.create_dataset('bbb',
                             shape=1000,
                             dtype='u1',
                             compression='blosc',
                             compression_opts=('zstd', 1, 2))
        eq(d.compressor.codec_id, 'blosc')
        eq('zstd', d.compressor.cname)
        eq(1, d.compressor.clevel)
        eq(2, d.compressor.shuffle)

        # None compression_opts
        d = g.create_dataset('ccc', shape=1000, dtype='u1', compression='zlib')
        eq(d.compressor.codec_id, 'zlib')
        eq(1, d.compressor.level)

        # None compression
        d = g.create_dataset('ddd', shape=1000, dtype='u1', compression=None)
        assert_is_none(d.compressor)

        # compressor as compression
        d = g.create_dataset('eee',
                             shape=1000,
                             dtype='u1',
                             compression=Zlib(1))
        eq(d.compressor.codec_id, 'zlib')
        eq(1, d.compressor.level)
Example #15
            overwrite=True)
group.array(name='30x20_c_>f4',
            dtype='>f4',
            data=array_30x20_c,
            chunks=(7, 13),
            overwrite=True)
group.array(name='30x20_f_>f4',
            dtype='>f4',
            data=array_30x20_f,
            chunks=(7, 13),
            order='F',
            overwrite=True)

group.array(name='30x20_c_>u8_zlib',
            dtype='>u8',
            compressor=Zlib(level=6),
            data=array_30x20_c,
            chunks=(7, 13),
            overwrite=True)
group.array(name='30x20_c_>u8_gzip',
            dtype='>u8',
            compressor=GZip(level=6),
            data=array_30x20_c,
            chunks=(7, 13),
            overwrite=True)
group.array(name='30x20_c_>u8_bz2',
            dtype='>u8',
            compressor=BZ2(level=1),
            data=array_30x20_c,
            chunks=(7, 13),
            overwrite=True)
Example #16
# add groups
compressed_grp = root_grp.create_group('compressed', overwrite=True)
filtered_grp = root_grp.create_group('filtered', overwrite=True)
comp_filt_grp = root_grp.create_group('comp_filt', overwrite=True)

# In[ ]:

# add compressed data arrays (no filters)
# deflate
a = compressed_grp.create_dataset('deflate1',
                                  shape=(200, 200),
                                  chunks=(50, 50),
                                  dtype='i4',
                                  overwrite=True,
                                  compressor=Zlib(level=1))
a[:] = data
a = compressed_grp.create_dataset('deflate9',
                                  shape=(200, 200),
                                  chunks=(50, 50),
                                  dtype='i4',
                                  overwrite=True,
                                  compressor=Zlib(level=9))
a[:] = data
# shuffle
a = compressed_grp.create_dataset('shuffle',
                                  shape=(200, 200),
                                  chunks=(50, 50),
                                  dtype='i4',
                                  overwrite=True,
                                  compressor=Shuffle())
def bigwigs_to_zarr(input_bigwig_files, output_file, starting_resolution,
                    name):
    # Short-hand for creating a DirectoryStore with a root group.
    f = zarr.open(output_file, mode='w')
    compressor = Zlib(level=1)

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    chromosomes_group = f.create_group("chromosomes")

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    # TODO: should more than chr1-chrM be used?
    chromosomes = [str(chr_name) for chr_name in chromosomes[:25]]
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8")
    chroms_cumsum_arr = np.concatenate(
        (np.array([0]), np.cumsum(chroms_length_arr)))

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))
    chrom_name_to_cumsum = dict(zip(chromosomes, chroms_cumsum_arr))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2**x) for x in range(16)]

    # Create each chromosome dataset.
    for chr_name, chr_len in chrom_name_to_length.items():
        chr_group = chromosomes_group.create_group(chr_name)
        # Create each resolution group.
        for resolution in resolutions:
            chr_shape = (num_samples, math.ceil(chr_len / resolution))
            chr_group.create_dataset(str(resolution),
                                     shape=chr_shape,
                                     dtype="f4",
                                     fill_value=np.nan,
                                     compressor=compressor)

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(list(enumerate(input_bigwig_files)),
                                  desc='bigwigs'):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(
                chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (num_samples, math.ceil(chr_len / resolution))
                    arr = bbi.fetch(bw_file,
                                    chr_name,
                                    0,
                                    chr_len,
                                    chr_shape[1],
                                    summary="sum")
                    chromosomes_group[chr_name][str(resolution)][
                        bw_index, :] = arr
        else:
            print(f"{bw_file} not is_bigwig")

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for bw_index, bw_file in enumerate(input_bigwig_files):
        row_infos.append({
            "cluster": int(bw_index + 1),
            "file": os.path.basename(bw_file)
        })

    # f.attrs should contain all tileset_info properties
    # For zarr, more attributes are used here to allow "serverless"
    f.attrs['row_infos'] = row_infos
    f.attrs['resolutions'] = sorted(resolutions, reverse=True)
    f.attrs['shape'] = [num_samples, 256]
    f.attrs['name'] = name
    f.attrs['coordSystem'] = "hg38"

    # https://github.com/zarr-developers/zarr-specs/issues/50
    f.attrs['multiscales'] = [{
        "version": "0.1",
        "name": chr_name,
        "datasets": [{"path": f"chromosomes/{chr_name}/{resolution}"}
                     for resolution in sorted(resolutions, reverse=True)],
        "type": "zarr-multivec",
        "metadata": {
            "chromoffset": int(chrom_name_to_cumsum[chr_name]),
            "chromsize": int(chr_len),
        }
    } for (chr_name, chr_len) in list(zip(chromosomes, chroms_length_arr))]
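A hedged usage sketch for bigwigs_to_zarr above; the bigwig file names are hypothetical, and pybbi, negspy and the other modules used inside the function are assumed to be importable:

# Each hypothetical bigwig file becomes one row (sample) in every chromosome/resolution array.
bigwig_files = ["sample1.bw", "sample2.bw"]
bigwigs_to_zarr(bigwig_files, "samples.multivec.zarr", starting_resolution=25,
                name="example multivec")

# Inspect the tileset-level metadata written to the root group attributes.
out = zarr.open("samples.multivec.zarr", mode="r")
print(out.attrs["name"], out.attrs["resolutions"][:3])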
Example #18
from __future__ import absolute_import, print_function, division


import numpy as np
from numpy.testing import assert_array_equal, assert_array_almost_equal


from numcodecs import (AsType, Delta, FixedScaleOffset, PackBits, Categorize, Zlib, Blosc,
                       BZ2, Quantize)
from zarr.creation import array
from zarr.compat import PY2


compressors = [
    None,
    Zlib(),
    BZ2(),
    Blosc(),
]

# TODO rely on backports and remove PY2 exclusion
if not PY2:  # pragma: py2 no cover
    from zarr.codecs import LZMA
    compressors.append(LZMA())


def test_array_with_delta_filter():

    # setup
    astype = 'u1'
    dtype = 'i8'
Example #19
def create_array(self, read_only=False, **kwargs):
    # Simplest case: plain in-memory dict store with Zlib(level=1) as the default compressor.
    store = dict()
    kwargs.setdefault('compressor', Zlib(level=1))
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
Example #20
def create_array(read_only=False, **kwargs):
    # Store backed by a user-defined MutableMapping implementation.
    store = CustomMapping()
    kwargs.setdefault('compressor', Zlib(1))
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
import click

import numpy as np
import dask.array as da
import zarr
from numcodecs import Zlib

from pathlib import Path

PYRAMID_GROUP_NAME = "sub-resolutions"
DEFAULT_COMPRESSOR = Zlib(level=1)


def pad_axis(array, dim, pad_width):
    padding = [(0, 0) if i != dim else (0, pad_width)
               for i in range(len(array.shape))]
    padded = da.pad(array, padding, "constant")
    return padded


def guess_rgb(shape):
    ndim = len(shape)
    last_dim = shape[-1]

    if ndim > 2 and last_dim < 5:
        return True
    else:
        return False


def _create_pyramid(