Example #1
    def open(self, mode: str = "r", cached: bool = True, cache_size_bytes: int = int(1e9)) -> "ChunkedDataset":
        """Opens a zarr dataset from disk from the path supplied in the constructor.

        Keyword Arguments:
            mode (str): Mode to open dataset in, default to read-only (default: {"r"})
            cached (bool): Whether to cache files read from disk using an LRU cache. (default: {True})
            cache_size_bytes (int): Size of cache in bytes (default: {1e9} (1GB))

        Raises:
            Exception: When any of the expected arrays (frames, agents, scenes) is missing or the store
                couldn't be opened.
        """
        if cached:
            self.root = zarr.open_group(
                store=zarr.LRUStoreCache(zarr.DirectoryStore(self.path), max_size=cache_size_bytes), mode=mode
            )
        else:
            self.root = zarr.open_group(self.path, mode=mode)
        self.frames = self.root[FRAME_ARRAY_KEY]
        self.agents = self.root[AGENT_ARRAY_KEY]
        self.scenes = self.root[SCENE_ARRAY_KEY]
        try:
            self.tl_faces = self.root[TL_FACE_ARRAY_KEY]
        except KeyError:
            warnings.warn(
                f"{TL_FACE_ARRAY_KEY} not found in {self.path}! Traffic lights will be disabled",
                RuntimeWarning,
                stacklevel=2,
            )
            self.tl_faces = np.empty((0,), dtype=TL_FACE_DTYPE)
        return self
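A minimal usage sketch of the method above, assuming it is l5kit's ChunkedDataset (the import path and the dataset path are assumptions, not part of the example):

from l5kit.data import ChunkedDataset  # assumed import path

# open an existing zarr dataset with a ~512 MB LRU cache in front of the directory store
dataset = ChunkedDataset("scenes/sample.zarr")  # placeholder path
dataset.open(mode="r", cached=True, cache_size_bytes=int(512e6))
print(len(dataset.scenes), len(dataset.frames), len(dataset.agents))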
Example #2
def prepare_zarr_group(dataset_id, dataset, store, table="MAIN"):
    dir_store = zarr.DirectoryStore(store)

    try:
        # Open in read/write, must exist
        group = zarr.open_group(store=dir_store, mode="r+")
    except zarr.errors.GroupNotFoundError:
        # Create, must not exist
        group = zarr.open_group(store=dir_store, mode="w-")

    group_name = f"{table}_{dataset_id}"
    ds_group = group.require_group(table).require_group(group_name)

    schema = DatasetSchema.from_dataset(dataset)

    for column, column_schema in schema.data_vars.items():
        create_array(ds_group, column, column_schema, False)

    for column, column_schema in schema.coords.items():
        create_array(ds_group, column, column_schema, True)

    ds_group.attrs.update({
        **schema.attrs, DASKMS_ATTR_KEY: {
            "chunks": dict(dataset.chunks)
        }
    })

    return ds_group
Example #3
def collect_zarr(file_name, out_dir, num_procs):
    final_zarr_file = '%s/%s' % (out_dir, file_name)

    # seed w/ job0
    job_zarr_file = '%s/job0/%s' % (out_dir, file_name)
    shutil.copytree(job_zarr_file, final_zarr_file)

    # open final
    final_zarr_open = zarr.open_group(final_zarr_file)

    for pi in range(1, num_procs):
        # open job
        job_zarr_file = '%s/job%d/%s' % (out_dir, pi, file_name)
        job_zarr_open = zarr.open_group(job_zarr_file, 'r')

        # append to final
        for key in final_zarr_open.keys():
            if key in ['percentiles', 'target_ids', 'target_labels']:
                # once is enough
                pass

            elif key[-4:] == '_pct':
                # average
                u_k1 = np.array(final_zarr_open[key])
                x_k = np.array(job_zarr_open[key])
                final_zarr_open[key] = u_k1 + (x_k - u_k1) / (pi + 1)

            else:
                # append
                final_zarr_open[key].append(job_zarr_open[key])
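The _pct branch above folds each job's array into a running average with the recurrence u_{k+1} = u_k + (x_k - u_k) / (k + 1), which is an incremental form of the arithmetic mean. A small self-contained check of that identity (NumPy only, synthetic data):

import numpy as np

rng = np.random.default_rng(0)
jobs = [rng.random(4) for _ in range(5)]  # stand-ins for per-job percentile arrays

u = jobs[0]  # seed with job 0, as collect_zarr does
for pi in range(1, len(jobs)):
    u = u + (jobs[pi] - u) / (pi + 1)

np.testing.assert_allclose(u, np.mean(jobs, axis=0))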
Example #4
def prepare_zarr_group(dataset_id, dataset, store, rechunk=False):
    try:
        # Open in read/write, must exist
        group = zarr.open_group(store=store.map, mode="r+")
    except zarr.errors.GroupNotFoundError:
        # Create, must not exist
        group = zarr.open_group(store=store.map, mode="w-")

    table_path = store.table if store.table else "MAIN"

    group_name = f"{table_path}_{dataset_id}"
    ds_group = group.require_group(table_path).require_group(group_name)

    dataset, ds_group = maybe_rechunk(dataset, ds_group, rechunk=rechunk)

    schema = DatasetSchema.from_dataset(dataset)
    schema_chunks = schema.chunks

    for column, column_schema in schema.data_vars.items():
        create_array(ds_group, column, column_schema, schema_chunks, False)

    for column, column_schema in schema.coords.items():
        create_array(ds_group, column, column_schema, schema_chunks, True)

    ds_group.attrs.update({
        **schema.attrs, DASKMS_ATTR_KEY: {
            "chunks": dict(dataset.chunks)
        }
    })

    return dataset, ds_group
Example #5
    def open(
        self,
        mode: str = "r",
        cached: bool = True,
        cache_size_bytes: int = int(1e9)) -> "ChunkedDataset":
        """Opens a zarr dataset from disk from the path supplied in the constructor.

        :param mode: Mode to open dataset in, default to read-only (default: {"r"})
        :param cached: Whether to cache files read from disk using an LRU cache. (default: {True})
        :param cache_size_bytes: Size of cache in bytes (default: {1e9} (1GB))
        """
        if cached:
            self.root = zarr.open_group(store=zarr.LRUStoreCache(
                zarr.DirectoryStore(self.path), max_size=cache_size_bytes),
                                        mode=mode)
        else:
            self.root = zarr.open_group(self.path, mode=mode)
        self.frames = self.root[FRAME_ARRAY_KEY]
        self.agents = self.root[AGENT_ARRAY_KEY]
        self.scenes = self.root[SCENE_ARRAY_KEY]
        try:
            self.tl_faces = self.root[TL_FACE_ARRAY_KEY]
        except KeyError:
            # the real issue here is that frame doesn't have traffic_light_faces_index_interval
            warnings.warn(
                f"{TL_FACE_ARRAY_KEY} not found in {self.path}! "
                f"You won't be able to use this zarr into an Ego/AgentDataset",
                RuntimeWarning,
                stacklevel=2,
            )
            self.tl_faces = np.empty((0, ), dtype=TL_FACE_DTYPE)
        return self
Example #6
def init(release_dir):
    """Initialise data resources.

    Parameters
    ----------
    release_dir : string
        Local filesystem path where data from the release are stored.

    """

    # variation
    ###########

    global callset, callset_pass
    variation_dir = os.path.join(release_dir, 'variation')

    # main callset
    callset_zarr_fn = os.path.join(variation_dir, 'main', 'zarr2',
                                   'ag1000g.phase1.ar3')
    if os.path.exists(callset_zarr_fn):
        callset = zarr.open_group(callset_zarr_fn, mode='r')

    # main callset, PASS variants only
    callset_pass_zarr_fn = os.path.join(variation_dir, 'main', 'zarr2',
                                        'ag1000g.phase1.ar3.pass')
    if os.path.exists(callset_pass_zarr_fn):
        callset_pass = zarr.open_group(callset_pass_zarr_fn, mode='r')

    # haplotypes
    ############

    global callset_phased, tbl_haplotypes, lkp_haplotypes, df_haplotypes
    haplotypes_dir = os.path.join(release_dir, 'haplotypes')

    # try HDF5 first
    callset_phased_h5_fn = os.path.join(haplotypes_dir, 'main', 'hdf5',
                                        'ag1000g.phase1.ar3.1.haplotypes.h5')
    if os.path.exists(callset_phased_h5_fn):
        callset_phased = h5py.File(callset_phased_h5_fn, mode='r')

    # prefer Zarr if available
    # N.B., the Zarr data is not consistent with HDF5 or shapeit outputs,
    # it is based on a previous phasing run.
    #
    #callset_phased_zarr_fn = os.path.join(haplotypes_dir, 'main', 'zarr2',
    #                                      'ag1000g.phase1.ar3.1.haplotypes')
    #if os.path.exists(callset_phased_zarr_fn):
    #    callset_phased = zarr.open_group(callset_phased_zarr_fn, mode='r')

    # haplotypes metadata
    haplotypes_fn = os.path.join(haplotypes_dir, 'haplotypes.meta.txt')
    if os.path.exists(haplotypes_fn):
        tbl_haplotypes = (etl.fromtsv(haplotypes_fn).convert(
            ('index', 'kt_2la', 'kt_2rb'), int))
        lkp_haplotypes = tbl_haplotypes.recordlookupone('label')
        df_haplotypes = pandas.read_csv(haplotypes_fn,
                                        sep='\t',
                                        index_col='index')
Example #7
def _concat_zarrs_optimized(
    zarr_files: List[str],
    output: PathType,
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
) -> None:
    zarr_groups = [zarr.open_group(f) for f in zarr_files]

    first_zarr_group = zarr_groups[0]

    # create the top-level group
    zarr.open_group(str(output), mode="w")

    # copy variables that are to be rechunked
    # NOTE: this uses the _to_zarr function defined here, which is needed to avoid
    # race conditions between writing the array contents and its metadata
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if var in {"variant_id", "variant_allele"}:
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"

        arr = concatenate_and_rechunk([group[var] for group in zarr_groups],
                                      dtype=dtype)
        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            str(output),
            component=var,
            overwrite=True,
            compute=False,
            fill_value=None,
            attrs=first_zarr_group[var].attrs.asdict(),
        )
        delayed.append(d)
    da.compute(*delayed)

    # copy unchanged variables and top-level metadata
    with zarr.open_group(str(output)) as output_zarr:

        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy top-level attributes
        output_zarr.attrs.update(first_zarr_group.attrs)
Example #8
    def initialize(self,
                   mode: str = "w",
                   scenes_num: int = 0,
                   frames_num: int = 0,
                   agents_num: int = 0) -> None:
        """Initializes a new zarr dataset, creating the underlying arrays.

        Keyword Arguments:
            mode (str): Mode to open dataset in, should be something that supports writing. (default: {"w"})
            scenes_num (int): pre-allocate this number of scenes
            frames_num (int): pre-allocate this number of frames
            agents_num (int): pre-allocate this number of agents
        """

        self.root = zarr.open_group(self.path, mode=mode)

        self.frames = self.root.require_dataset(FRAME_ARRAY_KEY,
                                                dtype=FRAME_DTYPE,
                                                chunks=FRAME_CHUNK_SIZE,
                                                shape=(frames_num, ))
        self.agents = self.root.require_dataset(AGENT_ARRAY_KEY,
                                                dtype=AGENT_DTYPE,
                                                chunks=AGENT_CHUNK_SIZE,
                                                shape=(agents_num, ))
        self.scenes = self.root.require_dataset(SCENE_ARRAY_KEY,
                                                dtype=SCENE_DTYPE,
                                                chunks=SCENE_CHUNK_SIZE,
                                                shape=(scenes_num, ))

        self.root.attrs["format_version"] = FORMAT_VERSION
        self.root.attrs["labels"] = LABELS
Example #9
def _overwrite_time_array_with_single_chunk(target: str, time: xr.DataArray,
                                            dim: str):
    if time is not None:
        del zarr.open_group(fsspec.get_mapper(target))[dim]
        with tempfile.TemporaryDirectory() as tmpdir:
            xr.Dataset({dim: time}).to_zarr(tmpdir)
            upload_dir(tmpdir, target)
Example #10
def load_zarr_data(zarr_fn, chrom, s1, s2, gdistkey=None):

    import zarr

    samples1 = get_sample_ids(s1)
    samples2 = get_sample_ids(s2)

    zfh = zarr.open_group(zarr_fn, mode="r")[chrom]

    samples_x = zfh["samples"][:]
    sample_name = [sid.decode() for sid in samples_x.tolist()]

    idx1 = np.array([sample_name.index(sid) for sid in samples1])
    idx2 = np.array([sample_name.index(sid) for sid in samples2])

    g = allel.GenotypeChunkedArray(zfh["calldata"]["genotype"])

    pos = allel.SortedIndex(zfh["variants"]["POS"][:])

    if gdistkey is not None:
        gdist = zfh["variants"][gdistkey][:]
    else:
        gdist = None

    return g.take(idx1, axis=1), g.take(idx2, axis=1), pos, gdist
Example #11
    def test_write_output_vars_batch(self, store_batch, model_batch1,
                                     model_batch2):
        model_batch1.state[("profile", "u")] = np.array([1.0, 2.0, 3.0])
        model_batch2.state[("profile", "u")] = np.array([4.0, 5.0, 6.0])

        model_batch1.state[("roll", "u_diff")] = np.array([-1.0, 1.0, 0.0])
        model_batch2.state[("roll", "u_diff")] = np.array([0.0, 1.0, -1.0])

        model_batch1.state[("add", "offset")] = 2.0
        model_batch2.state[("add", "offset")] = 3.0

        store_batch.write_output_vars(0, 0, model=model_batch1)
        store_batch.write_output_vars(1, 0, model=model_batch2)

        ztest = zarr.open_group(store_batch.zgroup.store, mode="r")

        assert ztest.profile__u.ndim == 3
        np.testing.assert_array_equal(
            ztest.profile__u[:, 0, :],
            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]))

        store_batch.write_output_vars(0, -1, model=model_batch1)
        store_batch.write_output_vars(1, -1, model=model_batch2)

        np.testing.assert_array_equal(ztest.add__offset[:],
                                      np.array([2.0, 3.0]))

        # test default chunk size along batch dim
        assert ztest.profile__u.chunks[0] == 1
Example #12
    def test_write_index_vars(self, store):
        store.model.state[("init_profile", "x")] = np.array([1.0, 2.0, 3.0])

        store.write_index_vars()
        ztest = zarr.open_group(store.zgroup.store, mode="r")

        np.testing.assert_array_equal(ztest.x, np.array([1.0, 2.0, 3.0]))
Example #13
def _get_zarr_group(store):
    if store is None:
        # memory store
        return None, zarr.group()
    elif isinstance(store, str):
        store = zarr.DirectoryStore(store)
    return store, zarr.open_group(store=store, mode="a")
Example #14
    def test_write_output_vars(self, in_ds, store):
        model = store.model
        model.state[("profile", "u")] = np.array([1.0, 2.0, 3.0])
        model.state[("roll", "u_diff")] = np.array([-1.0, 1.0, 0.0])
        model.state[("add", "offset")] = 2.0

        store.write_output_vars(-1, 0)

        ztest = zarr.open_group(store.zgroup.store, mode="r")

        assert ztest.profile__u.shape == (in_ds.clock.size, 3)
        np.testing.assert_array_equal(ztest.profile__u[0],
                                      np.array([1.0, 2.0, 3.0]))

        assert ztest.roll__u_diff.shape == (in_ds.out.size, 3)
        np.testing.assert_array_equal(ztest.roll__u_diff[0],
                                      np.array([-1.0, 1.0, 0.0]))

        assert ztest.add__u_diff.shape == (in_ds.out.size, )
        np.testing.assert_array_equal(ztest.add__u_diff,
                                      np.array([2.0, np.nan, np.nan]))

        # test save main clock but not out clock
        store.write_output_vars(-1, 1)
        np.testing.assert_array_equal(ztest.profile__u[1],
                                      np.array([1.0, 2.0, 3.0]))
        np.testing.assert_array_equal(ztest.roll__u_diff[1],
                                      np.array([np.nan, np.nan, np.nan]))

        # test save no-clock outputs
        store.write_output_vars(-1, -1)
        np.testing.assert_array_equal(ztest.profile__u_opp,
                                      np.array([-1.0, -2.0, -3.0]))
        assert ztest.add__offset[()] == 2.0
Example #15
    def _build_output(self, ds, ds0, fss):
        import zarr
        out = {}
        ds.to_zarr(out, chunk_store={}, compute=False)  # fills in metadata&coords
        z = zarr.open_group(out, mode='a')
        for dim in self.extra_dims.union(self.concat_dims):
            # derived and concatenated dims stored as absolute data
            z[dim][:] = ds[dim].values
        for dim in self.same_dims:
            # duplicated coordinates stored as references just once
            out.update({k: v for k, v in fss[0].references.items() if k.startswith(dim)})
        for variable in ds.variables:
            if variable in ds.dims:
                # already handled
                continue
            var, var0 = ds[variable], ds0[variable]
            assert var.dims[-len(var0.dims):] == var0.dims

            concats = {d: 0 for d in self.concat_dims}
            for i, fs in enumerate(fss):
                for k, v in fs.references.items():
                    start, part = os.path.split(k)
                    if start != variable or part in ['.zgroup', '.zarray', '.zattrs']:
                        # OK, so we go through all the keys multiple times
                        continue
                    if var.shape == var0.shape:
                        out[k] = v  # copy
                    else:
                        out[f"{start}/{i}.{part}"] = v
        return out
Example #16
def read_vcfzarr(path: PathType) -> xr.Dataset:
    """Read a VCF Zarr file created using scikit-allel.

    Loads VCF variant, sample, and genotype data as Dask arrays within a Dataset
    from a Zarr file created using scikit-allel's ``vcf_to_zarr`` function.

    Since ``vcf_to_zarr`` does not preserve phasing information, there is no
    :data:`sgkit.variables.call_genotype_phased_spec` variable in the resulting dataset.

    Parameters
    ----------
    path
        Path to the Zarr file.

    Returns
    -------
    A dataset containing the following variables:

    - :data:`sgkit.variables.variant_id_spec` (variants)
    - :data:`sgkit.variables.variant_contig_spec` (variants)
    - :data:`sgkit.variables.variant_position_spec` (variants)
    - :data:`sgkit.variables.variant_allele_spec` (variants)
    - :data:`sgkit.variables.sample_id_spec` (samples)
    - :data:`sgkit.variables.call_genotype_spec` (variants, samples, ploidy)
    - :data:`sgkit.variables.call_genotype_mask_spec` (variants, samples, ploidy)
    """

    vcfzarr = zarr.open_group(str(path), mode="r")

    # don't fix strings since it requires a pass over the whole dataset
    return _vcfzarr_to_dataset(vcfzarr, fix_strings=False)
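A hedged usage sketch: create a scikit-allel Zarr store with vcf_to_zarr, then load it with the reader above (exposed as sgkit.read_vcfzarr in the releases that ship it). The file names are placeholders, and it is assumed that vcf_to_zarr's default fields cover the variants/samples/calldata groups this reader expects.

import allel
import sgkit

allel.vcf_to_zarr("sample.vcf", "sample.vcf.zarr")  # placeholder paths
ds = sgkit.read_vcfzarr("sample.vcf.zarr")
print(ds.call_genotype.dims)  # ('variants', 'samples', 'ploidy')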
Example #17
def http_pull_file(remote_file, remote_mtime, local_file, LIST, CLOBBER, MODE):
    #-- split extension from input local file
    fileBasename, fileExtension = os.path.splitext(local_file)
    #-- copy HDF5 file from server into new zarr file
    if (fileExtension == '.h5'):
        local_file = '{0}.zarr'.format(fileBasename)
    #-- if file exists in file system: check if remote file is newer
    TEST = False
    OVERWRITE = ' (clobber)'
    #-- check if local version of file exists
    if os.access(local_file, os.F_OK):
        #-- check last modification time of local file
        local_mtime = os.stat(local_file).st_mtime
        #-- if remote file is newer: overwrite the local file
        if (remote_mtime > local_mtime):
            TEST = True
            OVERWRITE = ' (overwrite)'
    else:
        TEST = True
        OVERWRITE = ' (new)'
    #-- if file does not exist locally, is to be overwritten, or CLOBBER is set
    if TEST or CLOBBER:
        #-- output string for printing files transferred
        output = '{0} -->\n\t{1}{2}\n'.format(remote_file, local_file,
                                              OVERWRITE)
        #-- if executing copy command (not only printing the files)
        if not LIST and (fileExtension == '.h5'):
            #-- Create and submit request. There are a wide range of exceptions
            #-- that can be thrown here, including HTTPError and URLError.
            request = urllib2.Request(remote_file)
            fid = io.BytesIO(urllib2.urlopen(request).read())
            #-- copy everything from the HDF5 file to the zarr file
            with h5py.File(fid, 'r') as source:
                dest = zarr.open_group(local_file, mode='w')
                #-- value checks on output zarr
                if not hasattr(dest, 'create_dataset'):
                    raise ValueError(
                        'dest must be a group, got {!r}'.format(dest))
                #-- for each key in the root of the hdf5 file structure
                for k in source.keys():
                    copy_from_HDF5(source[k], dest, name=k)
            #-- keep remote modification time of file and local access time
            os.utime(local_file, (os.stat(local_file).st_atime, remote_mtime))
            os.chmod(local_file, MODE)
        elif not LIST:
            #-- Create and submit request. There are a wide range of exceptions
            #-- that can be thrown here, including HTTPError and URLError.
            request = urllib2.Request(remote_file)
            response = urllib2.urlopen(request)
            #-- chunked transfer encoding size
            CHUNK = 16 * 1024
            #-- copy contents to local file using chunked transfer encoding
            #-- transfer should work properly with ascii and binary data formats
            with open(local_file, 'wb') as f:
                shutil.copyfileobj(response, f, CHUNK)
            #-- keep remote modification time of file and local access time
            os.utime(local_file, (os.stat(local_file).st_atime, remote_mtime))
            os.chmod(local_file, MODE)
        #-- return the output string
        return output
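The helper copy_from_HDF5 used above is not shown; a minimal stand-in (an assumption, not the original implementation) can delegate to zarr.copy, which accepts h5py groups and datasets as sources:

import h5py
import zarr

def copy_from_HDF5(source, dest, name):
    """Copy an h5py group or dataset into an open zarr group."""
    zarr.copy(source, dest, name=name)

with h5py.File("input.h5", "r") as src:  # placeholder file
    out = zarr.open_group("output.zarr", mode="w")
    for k in src.keys():
        copy_from_HDF5(src[k], out, name=k)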
Example #18
    def __init__(self,
                 data_path,
                 volume_indices,
                 nb_io_workers=1,
                 nb_proc_workers=0,
                 downscale=False,
                 return_vol_idx=False,
                 num_consecutive=None):
        self.data_path = data_path
        self.volume_indices = volume_indices
        self.nb_io_workers = nb_io_workers
        self.nb_proc_workers = nb_proc_workers
        self.downscale = downscale
        self.return_vol_idx = return_vol_idx
        self.num_consecutive = num_consecutive

        try:
            zgroup = zarr.open_group(data_path, mode='r')
        except:
            print("Failed to open data: {}".format(data_path))
            raise

        # Assemble volumes and corresponding segmentations
        self.volumes = []
        self.segmentations = []
        for idx in self.volume_indices:
            subgroup = zgroup[str(idx)]
            self.volumes.append(subgroup['volume'])
            self.segmentations.append(subgroup['segmentation'])

        # Length
        self.num_volumes = len(self.volumes)
        assert (len(self.segmentations) == self.num_volumes)
Example #19
def main():
    usage = 'usage: %prog [options] <in_h5_file> <out_zarr_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='chunk_size', default=None, type='int')
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input HDF5 and output Zarr.')
    else:
        hdf5_file = args[0]
        zarr_file = args[1]

    # open files
    h5_in = h5py.File(hdf5_file, 'r')
    zarr_out = zarr.open_group(zarr_file, 'w')

    # foreach chromosome
    for chrom in h5_in.keys():
        if options.verbose:
            print(chrom)

        # read values
        x = np.array(h5_in[chrom], dtype='float16')

        # write compressed into the Zarr group
        z = zarr_out.create_dataset(chrom, data=x, shape=x.shape, dtype='float16', chunks=options.chunk_size)
        if options.verbose:
            print(z)

    # close files
    h5_in.close()
Example #20
    def __init__(self,
                 zarr_root,
                 zarr_group,
                 fold_num,
                 conv2d=True,
                 transpose=False,
                 **kwargs):
        'Initialization'

        # Zarr dataset handling
        zarr_root = zarr.open_group(str(zarr_root), mode='r')

        # Get cross-validation fold metadata
        zarr_fold = zarr_root[f'{zarr_group}/folds/fold{fold_num}']

        # Get metadata
        self.metadata = zarr_root[zarr_group].attrs.asdict()
        self.scene_labels = self.metadata['scene_labels']

        # Normalization data
        self.norm_data = {}
        self.norm_data['mean'] = zarr_fold['norm_data']['mean'][:]
        self.norm_data['std'] = zarr_fold['norm_data']['std'][:]

        # Set features dimensions
        self.set_dim(transpose, conv2d)
Example #21
    def from_schema(
        store: zarr.ABSStore,
        schema: xr.Dataset,
        dims: Sequence[str],
        coords: Mapping[str, xr.DataArray],
    ) -> "ZarrMapping":
        """Initialize a ZarrMapping using an xarray dataset as a template

        Args:
            store: A object implementing the mutable mapping interface required
                by zarr.open_group
            schema: A template for the datasets that will be inserted into the
                ZarrMapping.
            dims: The list of dimensions that will be managed by the zarr
                mapping. The zarr dataset produced by ZarrMapping will have
                these dimensions prepended to the list of dimensions of
                each variable in the schema object.
            coords: the coordinate labels corresponding to the dimensions in dims

        Returns:
            an initialized ZarrMapping object

        """
        group = zarr.open_group(store, mode="w")
        coords = {
            name: xr.DataArray(coords[name], name=name, dims=[name]) for name in coords
        }
        _create_zarr(dims, coords, group, schema)
        return ZarrMapping(store)
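A hedged sketch of calling the classmethod above; ZarrMapping itself comes from the surrounding module (not shown here), so this is illustrative only, and the paths and variable names are made up:

import numpy as np
import xarray as xr
import zarr

schema = xr.Dataset({"t": (["y", "x"], np.zeros((2, 2)))})
times = xr.DataArray([0, 1, 2], dims=["time"], name="time")

store = zarr.DirectoryStore("mapping.zarr")  # placeholder path
mapping = ZarrMapping.from_schema(store, schema, dims=["time"], coords={"time": times})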
Example #22
    def test_write_global_vars(self):
        # ensure that variable metadata (dims, etc.) is properly accessed for global references

        @xs.process
        class Foo:
            var = xs.variable(dims="x", global_name="global_var", intent="out")

        @xs.process
        class Bar:
            var = xs.global_ref("global_var")

        model = xs.Model({"foo": Foo, "bar": Bar})

        in_ds = xs.create_setup(
            model=model,
            clocks={"clock": [0, 1]},
            output_vars={"bar__var": None},
        )

        store = ZarrSimulationStore(in_ds, model)

        model.state[("foo", "var")] = np.array([1, 2, 3])
        store.write_output_vars(-1, -1)

        ztest = zarr.open_group(store.zgroup.store, mode="r")
        np.testing.assert_array_equal(ztest.bar__var, np.array([1, 2, 3]))
Example #23
def main():
    usage = 'usage: %prog [options] <in_zarr_file> <out_h5_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input Zarr and output HDF5.')
    else:
        zarr_file = args[0]
        hdf5_file = args[1]

    # open files
    zarr_in = zarr.open_group(zarr_file, 'r')
    h5_out = h5py.File(hdf5_file, 'w')

    # foreach chromosome
    for chrom in sorted(zarr_in.keys()):
        if options.verbose:
            print(chrom)

        # read values
        x = np.array(zarr_in[chrom])

        # write lzf-compressed into HDF5
        h5_out.create_dataset(chrom, data=x, dtype='float16', chunks=True, compression='lzf', shuffle=True)

    # close files
    h5_out.close()
Example #24
    def initialize(
        self, mode: str = "w", num_scenes: int = 0, num_frames: int = 0, num_agents: int = 0, num_tl_faces: int = 0
    ) -> "ChunkedDataset":
        """Initializes a new zarr dataset, creating the underlying arrays.

        Keyword Arguments:
            mode (str): Mode to open dataset in, should be something that supports writing. (default: {"w"})
            num_scenes (int): pre-allocate this number of scenes
            num_frames (int): pre-allocate this number of frames
            num_agents (int): pre-allocate this number of agents
            num_tl_faces (int): pre-allocate this number of traffic lights
        """

        self.root = zarr.open_group(self.path, mode=mode)

        self.frames = self.root.require_dataset(
            FRAME_ARRAY_KEY, dtype=FRAME_DTYPE, chunks=FRAME_CHUNK_SIZE, shape=(num_frames,)
        )
        self.agents = self.root.require_dataset(
            AGENT_ARRAY_KEY, dtype=AGENT_DTYPE, chunks=AGENT_CHUNK_SIZE, shape=(num_agents,)
        )
        self.scenes = self.root.require_dataset(
            SCENE_ARRAY_KEY, dtype=SCENE_DTYPE, chunks=SCENE_CHUNK_SIZE, shape=(num_scenes,)
        )
        self.tl_faces = self.root.require_dataset(
            TL_FACE_ARRAY_KEY, dtype=TL_FACE_DTYPE, chunks=TL_FACE_CHUNK_SIZE, shape=(num_tl_faces,)
        )

        self.root.attrs["format_version"] = FORMAT_VERSION
        self.root.attrs["labels"] = PERCEPTION_LABELS
        return self
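A brief usage sketch for the writer above, again assuming l5kit's ChunkedDataset (the import path and the output path are assumptions):

from l5kit.data import ChunkedDataset  # assumed import path

new_dt = ChunkedDataset("new_scenes.zarr")  # placeholder output path
new_dt.initialize(num_scenes=1, num_frames=10, num_agents=100, num_tl_faces=0)
print(new_dt.root.attrs["format_version"], len(new_dt.frames))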
Example #25
    def open_group(
        cls,
        store,
        mode="r",
        synchronizer=None,
        group=None,
        consolidated=False,
        consolidate_on_close=False,
        chunk_store=None,
        append_dim=None,
        write_region=None,
    ):
        import zarr

        # zarr doesn't support pathlib.Path objects yet. zarr-python#601
        if isinstance(store, pathlib.Path):
            store = os.fspath(store)

        open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
        if chunk_store:
            open_kwargs["chunk_store"] = chunk_store

        if consolidated:
            # TODO: an option to pass the metadata_key keyword
            zarr_group = zarr.open_consolidated(store, **open_kwargs)
        else:
            zarr_group = zarr.open_group(store, **open_kwargs)
        return cls(zarr_group, consolidate_on_close, append_dim, write_region)
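This classmethod is internal to xarray's zarr backend; user code normally reaches it through the public entry point, roughly as in this sketch (placeholder store path):

import xarray as xr

ds = xr.open_zarr("data.zarr", consolidated=False)  # placeholder store path
print(ds)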
Example #26
def main(cfg):
    # data_preprocessing
    data_dir = cfg["data_dir"]
    out_dir = cfg["out_dir"]
    dat = HubmapDataset(data_dir, out_dir)
    # db_zarr
    db = zarr.open_group(store=zarr.DirectoryStore(cfg["zarr_db_dir"]),
                         mode="r")

    # results dir
    results_dir = dat.path.out / "normalization" / cfg["version"]
    if cfg["version"] == "debug":
        os.makedirs(results_dir, exist_ok=True)
    else:
        try:
            os.makedirs(results_dir, exist_ok=False)
        except FileExistsError:
            raise Exception(f"Version {cfg['version']} exists!")
    # save config
    dat.jsn_dump(cfg, results_dir / "config.json")

    # tiles
    tile_dct = dat.pkl_load(dat.path.out / "tiles" / cfg['tiles_version'] /
                            "tile_dct.pkl")
    tiles = tile_dct["train_df"]

    # mean
    sum = np.zeros(3)
    N = (tiles["tile"].apply(lambda x: x[1] - x[0]) *
         tiles["tile"].apply(lambda x: x[3] - x[2])).sum()

    for _, row in tqdm(tiles.iterrows(), total=len(tiles)):
        id_ = row["id"]
        c = row["tile"]

        slc = np.s_[c[0]:c[1], c[2]:c[3]]
        img = db[id_]["img"][slc] / 255

        sum += img.sum(axis=(0, 1))

    mean = sum / N
    dat.pkl_dump(mean, results_dir / "mean.pkl")

    # std
    diff_squared = np.zeros(3)

    for _, row in tqdm(tiles.iterrows(), total=len(tiles)):
        id_ = row["id"]
        c = row["tile"]

        slc = np.s_[c[0]:c[1], c[2]:c[3]]
        img = db[id_]["img"][slc] / 255

        diff_squared += ((img - mean)**2).sum(axis=(0, 1))

    std = np.sqrt(diff_squared / N)
    dat.pkl_dump(std, results_dir / "std.pkl")

    print(f"MEAN: {mean}")
    print(f"STD: {std}")
Example #27
    def open_group(cls, store, mode='r', synchronizer=None, group=None,
                   consolidated=False, consolidate_on_close=False):
        import zarr
        min_zarr = '2.2'

        if LooseVersion(zarr.__version__) < min_zarr:  # pragma: no cover
            raise NotImplementedError("Zarr version %s or greater is "
                                      "required by xarray. See zarr "
                                      "installation "
                                      "http://zarr.readthedocs.io/en/stable/"
                                      "#installation" % min_zarr)

        if consolidated or consolidate_on_close:
            if LooseVersion(
                    zarr.__version__) <= '2.2.1.dev2':  # pragma: no cover
                raise NotImplementedError("Zarr version 2.2.1.dev2 or greater "
                                          "is required by for consolidated "
                                          "metadata.")

        open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
        if consolidated:
            # TODO: an option to pass the metadata_key keyword
            zarr_group = zarr.open_consolidated(store, **open_kwargs)
        else:
            zarr_group = zarr.open_group(store, **open_kwargs)
        return cls(zarr_group, consolidate_on_close)
Example #28
def append_slice(store: Union[str, MutableMapping],
                 dataslice: xr.Dataset,
                 dimension: str = "time") -> None:
    """
    Append data slice to existing zarr dataset.

    :param store: A zarr store.
    :param dataslice: Data slice to insert
    :param dimension: name of dimension perpendicular to the slice
    """

    # Unfortunately slice.to_zarr(store, mode='a', append_dim='time') will
    # replace global attributes of store with attributes of slice (xarray
    # bug?), which are usually empty in our case. Hence, we must save our old
    # attributes in a copy of slice.
    ds = zarr.open_group(store, mode='r')
    dataslice = dataslice.copy()
    dataslice.attrs.update(ds.attrs)
    if 'coordinates' in dataslice.attrs:
        # Remove 'coordinates', otherwise we get ValueError: cannot serialize
        # coordinates because the global attribute 'coordinates' already
        # exists from next slice.to_zarr(...) call.
        dataslice.attrs.pop('coordinates')

    dataslice.to_zarr(store, mode='a', append_dim=dimension)
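A minimal, hedged sketch of how append_slice might be used: write a first slice normally, then append later slices along "time" while keeping the store's global attributes (paths and variable names are made up):

import numpy as np
import xarray as xr

base = xr.Dataset({"sst": ("time", np.zeros(1))}, coords={"time": [0]}, attrs={"title": "demo"})
base.to_zarr("cube.zarr", mode="w")  # placeholder store path

next_slice = xr.Dataset({"sst": ("time", np.ones(1))}, coords={"time": [1]})
append_slice("cube.zarr", next_slice, dimension="time")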
Example #29
def get_singletons(zarr_folder, chrom, samples, start=-9, stop=-9):
    callset = zarr.open_group(zarr_folder, mode='r')

    pos = callset[chrom]['variants']['POS']
    # pdb.set_trace()
    ref = callset[chrom]['variants']['REF']
    alt = callset[chrom]['variants']['ALT']
    ids = callset[chrom]['variants']['ID']

    gt = allel.GenotypeDaskArray(
        callset[str(chrom)]['calldata']['GT'])  # Retrieve genotype data
    gt = gt.take(samples,
                 axis=1).compute()  # subset data to samples of interest

    ac = gt.count_alleles()

    if start == -9: start = min(pos)
    if stop == -9: stop = max(pos)

    flt = ac.is_singleton(1)
    pos2 = pos.get_mask_selection(flt)
    gf = gt.compress(flt, axis=0)
    sing_dict = {p: i for p, i in zip(pos2, np.where(gf.is_het())[1])}
    ind_dict = {}
    for key, value in sing_dict.items():
        if value in ind_dict:
            ind_dict[value].append(key)
        else:
            ind_dict[value] = [key]

    return ind_dict, gt, ids, ref, alt, pos, start, stop
Example #30
    def test_convert_to_zarr(self):
        input_vcf_path = "./tests/data/trio.2010_06.ychr.genotypes.vcf"
        output_zarr_path = "trio.2010_06.ychr.genotypes.zarr"

        # Attempt to remove local file in case a previous unit test failed to do so (prevents false positive)
        if os.path.isdir(output_zarr_path):
            shutil.rmtree(output_zarr_path)

        if os.path.isfile(input_vcf_path):
            # Setup test settings for Zarr conversion
            vcf_to_zarr_config = config.VCFtoZarrConfigurationRepresentation()
            vcf_to_zarr_config.fields = 'variants/numalt'
            vcf_to_zarr_config.enabled = True
            vcf_to_zarr_config.compressor = "Blosc"
            vcf_to_zarr_config.blosc_compression_algorithm = "zstd"
            vcf_to_zarr_config.blosc_compression_level = 1
            vcf_to_zarr_config.blosc_shuffle_mode = -1

            # Convert VCF file to Zarr
            data_service.convert_to_zarr(input_vcf_path=input_vcf_path,
                                         output_zarr_path=output_zarr_path,
                                         conversion_config=vcf_to_zarr_config)

            # Load the Zarr data from storage for testing
            callset = zarr.open_group(output_zarr_path, mode="r")
            numalt = callset['variants/numalt']
            self.assertEqual(np.size(numalt), 959)
            self.assertEqual(np.max(numalt), 1)
        else:
            self.fail("Test data file does not exist. Please ensure the file exists and try running test again")

        # Remove the Zarr test data
        if os.path.isdir(output_zarr_path):
            shutil.rmtree(output_zarr_path)
Example #31
def initialize_output_zarr(out_dir, sad_stats, snps, target_ids,
                           target_labels):
    """Initialize an output Zarr file for SAD stats."""

    num_targets = len(target_ids)
    num_snps = len(snps)

    sad_out = zarr.open_group('%s/sad.zarr' % out_dir, 'w')

    # write SNPs
    sad_out.create_dataset('snp',
                           data=[snp.rsid for snp in snps],
                           chunks=(32768, ))

    # write targets
    sad_out.create_dataset('target_ids', data=target_ids, compressor=None)
    sad_out.create_dataset('target_labels',
                           data=target_labels,
                           compressor=None)

    # initialize SAD stats
    for sad_stat in sad_stats:
        sad_out.create_dataset(sad_stat,
                               shape=(num_snps, num_targets),
                               chunks=(128, num_targets),
                               dtype='float16')

    return sad_out
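A hedged usage sketch; the SNP objects only need an rsid attribute here, so a namedtuple stands in for whatever the surrounding pipeline uses (the output directory and stat names are made up):

from collections import namedtuple

SNP = namedtuple("SNP", ["rsid"])
snps = [SNP("rs1"), SNP("rs2")]

sad_out = initialize_output_zarr("out", ["SAD", "SAR"], snps,
                                 target_ids=["t0", "t1"], target_labels=["a", "b"])
print(list(sad_out.array_keys()))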
Example #32
def append_time_slice(store: Union[str, MutableMapping],
                      time_slice: xr.Dataset,
                      chunk_sizes: Dict[str, int] = None):
    """
    Append time slice to existing zarr dataset.

    :param store: A zarr store.
    :param time_slice: Time slice to insert
    :param chunk_sizes: desired chunk sizes
    """
    if chunk_sizes:
        time_slice = chunk_dataset(time_slice, chunk_sizes, format_name='zarr')

    # Unfortunately time_slice.to_zarr(store, mode='a', append_dim='time') will replace global attributes of store
    # with attributes of time_slice (xarray bug?), which are usually empty in our case.
    # Hence, we must save our old attributes in a copy of time_slice.
    ds = zarr.open_group(store, mode='r')
    time_slice = time_slice.copy()
    time_slice.attrs.update(ds.attrs)
    if 'coordinates' in time_slice.attrs:
        # Remove 'coordinates', otherwise we get
        # ValueError: cannot serialize coordinates because the global attribute 'coordinates' already exists
        # from next time_slice.to_zarr(...) call.
        time_slice.attrs.pop('coordinates')

    time_slice.to_zarr(store, mode='a', append_dim='time')
    unchunk_dataset(store, coords_only=True)
Example #33
def extract2D(dataset, datatable, row_idx, col_idx, two_d_properties):
    zarr_file = zarr.DirectoryStore(os.path.join(config.BASEDIR, '2D_data', dataset + '_' + datatable + '.zarr'))
    root_group = zarr.open_group(zarr_file)
    two_d_properties = dict((prop, da.from_array(root_group[prop], chunks=root_group[prop].chunks, fancy=False)) for prop in two_d_properties)
    if len(col_idx) == 0 or len(row_idx) == 0:
        two_d_result = {}
        for prop in list(two_d_properties.keys()):
            two_d_result[prop] = np.array([], dtype=two_d_properties[prop].dtype)
    else:
        two_d_result = select_by_list(two_d_properties, row_idx, col_idx)
    return two_d_result
Example #34
    def open_group(cls, store, mode='r', synchronizer=None, group=None,
                   writer=None):
        import zarr
        min_zarr = '2.2'

        if LooseVersion(zarr.__version__) < min_zarr:  # pragma: no cover
            raise NotImplementedError("Zarr version %s or greater is "
                                      "required by xarray. See zarr "
                                      "installation "
                                      "http://zarr.readthedocs.io/en/stable/"
                                      "#installation" % min_zarr)
        zarr_group = zarr.open_group(store=store, mode=mode,
                                     synchronizer=synchronizer, path=group)
        return cls(zarr_group, writer=writer)
Example #35
def main():
    usage = 'usage: %prog [options] <in_zarr_file> <out_bw_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input Zarr and output BigWig.')
    else:
        zarr_file = args[0]
        bw_file = args[1]

    # open files
    zarr_in = zarr.open_group(zarr_file, 'r')
    bw_out = pyBigWig.open(bw_file, 'w')

    # construct header
    header = []
    chroms = sorted(zarr_in.keys())
    for chrom in chroms:
        # chromosome and length
        header.append((chrom,len(zarr_in[chrom])))

    # write header
    bw_out.addHeader(header)

    for chrom, length in header:
        if options.verbose:
            print(chrom)

        # read values
        x = np.array(zarr_in[chrom])

        # write values into the BigWig
        bw_out.addEntries(chrom, 0, values=x, span=1, step=1)

    # close files
    bw_out.close()