Example #1
def write_multiscale(pyramid: List, group: zarr.Group) -> None:
    """Write a pyramid with multiscale metadata to disk."""
    paths = []
    for path, dataset in enumerate(pyramid):
        group.create_dataset(str(path), data=dataset)
        paths.append({"path": str(path)})

    multiscales = [{"version": "0.1", "datasets": paths}]
    group.attrs["multiscales"] = multiscales
Example #2
    def write_array(self, group: zarr.Group, name: str,
                    array: np.ndarray) -> None:
        if array.dtype.kind == 'V':
            self.write_dataframe(group, name, array)
        else:
            dtype = str if array.dtype.kind == 'O' else array.dtype
            group.create_dataset(name,
                                 data=array,
                                 shape=array.shape,
                                 chunks=calc_chunk(array.shape),
                                 dtype=dtype,
                                 compressor=COMPRESSOR,
                                 overwrite=True)
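
write_array relies on two module-level names, COMPRESSOR and calc_chunk, that this listing does not show. A purely hypothetical stand-in so the snippet can be read in isolation (these are not the project's real definitions):

from numcodecs import Blosc

COMPRESSOR = Blosc(cname='lz4', clevel=5)

def calc_chunk(shape, max_elements=1_000_000):
    # Illustrative only: cap the leading axis so one chunk holds at most
    # max_elements items, keeping the remaining axes whole.
    if len(shape) == 0:
        return None
    rest = 1
    for n in shape[1:]:
        rest *= n
    lead = max(1, min(shape[0], max_elements // max(rest, 1)))
    return (lead,) + tuple(shape[1:])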
Example #3
def write_multiscale(
    pyramid: List,
    group: zarr.Group,
    chunks: Union[Tuple[Any, ...], int] = None,
) -> None:
    """Write a pyramid with multiscale metadata to disk."""
    paths = []
    for path, dataset in enumerate(pyramid):
        # TODO: chunks here could be different per layer
        group.create_dataset(str(path), data=dataset, chunks=chunks)
        paths.append({"path": str(path)})

    multiscales = [{"version": "0.2", "datasets": paths}]
    group.attrs["multiscales"] = multiscales
Example #4
def create_zarr_count_assay(z: zarr.Group,
                            assay_name: str,
                            chunk_size: Tuple[int, int],
                            n_cells: int,
                            feat_ids: Union[np.ndarray, List[str]],
                            feat_names: Union[np.ndarray, List[str]],
                            dtype: str = 'uint32') -> zarr.hierarchy:
    """
    Creates and returns a Zarr array with name 'counts'.

    Args:
        z (zarr.Group): Zarr group under which the assay group is created.
        assay_name (str): Name of the new assay group.
        chunk_size (Tuple[int, int]): Chunk shape of the counts matrix.
        n_cells (int): Number of cells (rows of the counts matrix).
        feat_ids (Union[np.ndarray, List[str]]): Identifiers of the features (columns).
        feat_names (Union[np.ndarray, List[str]]): Names of the features.
        dtype (str): Data type of the counts matrix. Default: 'uint32'.

    Returns:
        A Zarr array.
    """
    g = z.create_group(assay_name, overwrite=True)
    g.attrs['is_assay'] = True
    g.attrs['misc'] = {}
    create_zarr_obj_array(g, 'featureData/ids', feat_ids)
    create_zarr_obj_array(g, 'featureData/names', feat_names)
    create_zarr_obj_array(g, 'featureData/I',
                          [True for _ in range(len(feat_ids))], 'bool')
    return create_zarr_dataset(g,
                               'counts',
                               chunk_size,
                               dtype, (n_cells, len(feat_ids)),
                               overwrite=True)
Example #5
def create_zarr_dataset(g: zarr.Group,
                        name: str,
                        chunks: tuple,
                        dtype: Any,
                        shape: Tuple,
                        overwrite: bool = True) -> zarr.hierarchy:
    """
    Creates and returns a Zarr array.

    Args:
        g (zarr.Group): Zarr group under which the array is created.
        name (str): Name of the new array.
        chunks (tuple): Chunk shape of the array.
        dtype (Any): Data type of the array.
        shape (Tuple): Shape of the array.
        overwrite (bool): Whether to overwrite an existing array with the same name.

    Returns:
        A Zarr Array.
    """
    from numcodecs import Blosc

    compressor = Blosc(cname='lz4', clevel=5, shuffle=Blosc.BITSHUFFLE)
    return g.create_dataset(name,
                            chunks=chunks,
                            dtype=dtype,
                            shape=shape,
                            compressor=compressor,
                            overwrite=overwrite)
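
An illustrative call (the group and sizes are made up):

import zarr

root = zarr.group()  # in-memory store
counts = create_zarr_dataset(root, 'counts', chunks=(1000, 1000), dtype='uint32', shape=(10000, 2000))
print(counts.chunks)  # (1000, 1000)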
Example #6
def create_zarr_obj_array(g: zarr.Group,
                          name: str,
                          data,
                          dtype: Union[str, Any] = None,
                          overwrite: bool = True) -> zarr.hierarchy:
    """
    Creates and returns a Zarr object array.

    A Zarr object array can contain any type of object.
    https://zarr.readthedocs.io/en/stable/tutorial.html#object-arrays

    Args:
        g (zarr.Group): Zarr group under which the array is created.
        name (str): Name of the new object array.
        data: Values to store in the array.
        dtype (Union[str, Any]): Data type; if None, a fixed-width unicode dtype
            wide enough for the longest value is used.
        overwrite (bool): Whether to overwrite an existing array with the same name.

    Returns:
        A Zarr object Array.
    """
    data = np.array(data)
    if dtype is None or dtype == object:
        dtype = 'U' + str(max([len(str(x)) for x in data]))
    if np.issubdtype(data.dtype, np.dtype('S')):
        data = data.astype('U')
        dtype = data.dtype
    return g.create_dataset(name,
                            data=data,
                            chunks=(100000, ),
                            shape=len(data),
                            dtype=dtype,
                            overwrite=overwrite)
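
A quick sketch of the dtype inference (names are illustrative): string inputs end up as a fixed-width unicode array sized to the longest value.

import zarr

root = zarr.group()
ids = create_zarr_obj_array(root, 'ids', ['gene1', 'gene22', 'gene333'])
print(ids.dtype)  # <U7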
Example #7
    def write_dataframe(self, group: zarr.Group, name: str,
                        df: Union[pd.DataFrame, np.recarray]) -> None:
        data_type = 'data_frame' if isinstance(df, pd.DataFrame) else 'record_array'

        sub_group = group.create_group(name, overwrite=True)
        attrs_dict = {'data_type': data_type}
        cols = list(df.columns if data_type == 'data_frame' else df.dtype.names)
        attrs_dict['columns'] = cols
        if data_type == 'data_frame':
            attrs_dict['index_name'] = df.index.name if df.index.name is not None else 'index'
            sub_group.create_group(
                '_categories', overwrite=True
            )  # create a group to store category keys for categorical columns
            self.write_series(sub_group, '_index', df.index.values, data_type)

        for col in cols:
            self.write_series(
                sub_group, col,
                (df[col].values if data_type == 'data_frame' else df[col]),
                data_type)

        sub_group.attrs.update(**attrs_dict)
Example #8
    def write_csr(self, group: zarr.Group, name: str,
                  matrix: csr_matrix) -> None:
        sub_group = group.create_group(name, overwrite=True)
        sub_group.attrs.update(data_type='csr_matrix', shape=matrix.shape)
        sub_group.create_dataset('data',
                                 data=matrix.data,
                                 shape=matrix.data.shape,
                                 chunks=calc_chunk(matrix.data.shape),
                                 dtype=matrix.data.dtype,
                                 compressor=COMPRESSOR,
                                 overwrite=True)
        sub_group.create_dataset('indices',
                                 data=matrix.indices,
                                 shape=matrix.indices.shape,
                                 chunks=calc_chunk(matrix.indices.shape),
                                 dtype=matrix.indices.dtype,
                                 compressor=COMPRESSOR,
                                 overwrite=True)
        sub_group.create_dataset('indptr',
                                 data=matrix.indptr,
                                 shape=matrix.indptr.shape,
                                 chunks=calc_chunk(matrix.indptr.shape),
                                 dtype=matrix.indptr.dtype,
                                 compressor=COMPRESSOR,
                                 overwrite=True)
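
The three datasets written above are exactly the components scipy's constructor takes, so reading the matrix back can be sketched as follows (the store path and group name are hypothetical):

import zarr
from scipy.sparse import csr_matrix

sub = zarr.open_group('data.zarr', mode='r')['X']
matrix = csr_matrix((sub['data'][...], sub['indices'][...], sub['indptr'][...]),
                    shape=tuple(sub.attrs['shape']))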
Example #9
    def write_series(self, group: zarr.Group, name: str, array: np.ndarray,
                     data_type: str) -> None:
        if data_type == 'data_frame':
            if not is_categorical_dtype(array) and name != '_index' and is_string_dtype(array):
                keywords = set(array)
                if len(keywords) <= array.size / 10.0:  # at least 10x reduction
                    array = pd.Categorical(array, categories=natsorted(keywords))

            if is_categorical_dtype(array):
                # write category keys
                categories = group.require_group('_categories')
                values = array.categories.values
                if isinstance(values[0], bytes):
                    values = np.array([x.decode() for x in values],
                                      dtype=object)
                dtype = str if values.dtype.kind == 'O' else values.dtype
                categories.create_dataset(name,
                                          data=values,
                                          shape=values.shape,
                                          chunks=calc_chunk(values.shape),
                                          dtype=dtype,
                                          compressor=COMPRESSOR,
                                          overwrite=True)
                # write codes
                codes_arr = group.create_dataset(name,
                                                 data=array.codes,
                                                 shape=array.codes.shape,
                                                 chunks=calc_chunk(
                                                     array.codes.shape),
                                                 dtype=array.codes.dtype,
                                                 compressor=COMPRESSOR,
                                                 overwrite=True)
                codes_arr.attrs['ordered'] = array.ordered

                return None

        dtype = str if array.dtype.kind == 'O' else array.dtype
        group.create_dataset(name,
                             data=array,
                             shape=array.shape,
                             chunks=calc_chunk(array.shape),
                             dtype=dtype,
                             compressor=COMPRESSOR,
                             overwrite=True)
Example #10
def _set_time_units_like(source_store: zarr.Group, target_store: zarr.Group):
    """Modify all time-like variables in source_store to use same units as
    corresponding variable in target_store. The provided source_store must be
    opened in a mode such that it can be modified (e.g. mode='r+')"""
    for variable, source_array in source_store.items():
        target_array = target_store[variable]
        if "units" in source_array.attrs and "since" in source_array.attrs[
                "units"]:
            _set_array_time_units_like(source_array, target_array)
Example #11
def _init_coord(group: zarr.Group, coord: xr.DataArray):
    # fill_value=NaN is needed below for xr.open_zarr to successfully load this
    # coordinate if decode_cf=True. Otherwise, time=0 gets filled in as NaN,
    # which is very confusing.
    arr = np.asarray(coord)
    out_array = group.array(
        name=coord.name, data=arr, fill_value=_fill_value(arr.dtype)
    )
    out_array.attrs.update(coord.attrs)
    _set_dims(out_array, coord.dims)
Example #12
def read_group(group: zarr.Group):
    if "encoding-type" in group.attrs:
        enctype = group.attrs["encoding-type"]
        EncodingVersions[enctype].check(group.name, group.attrs["encoding-version"])
        if enctype == "dataframe":
            return read_dataframe(group)
        elif enctype == "csr_matrix":
            return read_csr(group)
        elif enctype == "csc_matrix":
            return read_csc(group)
        # At the moment, just treat raw as normal group
    return {k: read_attribute(group[k]) for k in group.keys()}
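
read_group dispatches on the "encoding-type" attribute that AnnData-style zarr stores record on each group. A hedged usage sketch (the file name is hypothetical):

import zarr

root = zarr.open_group('adata.zarr', mode='r')
obs = read_group(root['obs'])  # a DataFrame when encoding-type == "dataframe"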
Example #13
def conv_chrom(fname: str, block_size: int,
               root: Group, chrom: int) -> None:
    num_positions = 0
    with open(fname) as tfam:
        for line in tfam:  # redo this with generators
            tokens = line.rstrip().split(' ')
            l_chrom = int(tokens[0])
            if l_chrom < chrom:
                continue
            elif l_chrom > chrom:
                break
            num_positions += 1
    tfam = open(fname)  # second pass over the file; closed after the main loop
    chrom_group = root.create_group(f'chromosome-{chrom}')
    all_calls = chrom_group.zeros('calls', shape=(num_positions, NUM_SAMPLES),
                                  dtype='B')
    block = []
    all_positions = []
    all_alleles = []
    current_position = 0
    for line in tfam:  # redo this with generators
        tokens = line.rstrip().split(' ')
        l_chrom = int(tokens[0])
        if l_chrom < chrom:
            continue
        elif l_chrom > chrom:
            break
        position = int(tokens[3])
        all_positions.append(position)
        calls = tokens[4:]
        alleles = ''.join(set(calls) - set(['0']))  # unique alleles; '0' marks a missing call
        if len(alleles) == 1:
            alleles += alleles
        all_alleles.append(alleles)
        sample_calls = np.empty(shape=NUM_SAMPLES, dtype='B')
        for sample in range(NUM_SAMPLES):
            a1, a2 = calls[2 * sample: 2 * sample + 2]
            try:
                sample_calls[sample] = encode_alleles(a1, a2, alleles)
            except Exception:
                print(chrom, current_position, sample)
                raise
        block.append(sample_calls)
        current_position += 1
        if current_position % block_size == 0:
            all_calls[current_position - block_size:current_position, :] = block
            block = []
    tfam.close()
    if len(block) > 0:
        all_calls[- len(block):, :] = block
    chrom_group.array('positions', all_positions)
    chrom_group.array('alleles', all_alleles)
Example #14
    def read_mapping(self, group: zarr.Group) -> dict:
        res_dict = {}

        if 'scalar' in group.attrs:
            res_dict.update(group.attrs['scalar'])
        
        for key in group.array_keys():
            res_dict[key] = self.read_array(group, key)
        
        for key in group.group_keys():
            sub_group = group[key]
            data_type = sub_group.attrs['data_type']
            value = None
            if data_type == 'data_frame' or data_type == 'record_array':
                value = self.read_dataframe(sub_group)
            elif data_type == 'csr_matrix':
                value = self.read_csr(sub_group)
            else:
                assert data_type == 'dict'
                value = self.read_mapping(sub_group)
            res_dict[key] = value

        return res_dict
Example #15
def write_multiscale(
    pyramid: List,
    group: zarr.Group,
    chunks: Union[Tuple[Any, ...], int] = None,
    fmt: Format = CurrentFormat(),
    axes: Union[str, List[str]] = None,
) -> None:
    """
    Write a pyramid with multiscale metadata to disk.

    Parameters
    ----------
    pyramid: List of np.ndarray
      The image data to save, largest level first.
      All image arrays MUST be at most 5-dimensional, with dimensions
      ordered (t, c, z, y, x).
    group: zarr.Group
      The group within the zarr store to store the data in.
    chunks: int or tuple of ints
      Size of the saved chunks to store the image.
    fmt: Format
      The format of the ome_zarr data which should be used.
      Defaults to the most current.
    axes: str or list of str
      The names of the axes, e.g. "tczyx". Not needed for v0.1 or v0.2,
      or for v0.3 if the data is 2D or 5D. Otherwise this must be provided.
    """

    dims = len(pyramid[0].shape)
    axes = _validate_axes_names(dims, axes, fmt)

    paths = []
    for path, dataset in enumerate(pyramid):
        # TODO: chunks here could be different per layer
        group.create_dataset(str(path), data=dataset, chunks=chunks)
        paths.append(str(path))
    write_multiscales_metadata(group, paths, fmt, axes)
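
Relative to the 0.2 variant above, this version validates axis names. For a 3D pyramid under format 0.3 they must be given explicitly (a sketch with made-up data):

import numpy as np
import zarr

pyramid = [np.zeros((10, 256, 256)), np.zeros((10, 128, 128))]
group = zarr.group(store=zarr.DirectoryStore("image.zarr"))
write_multiscale(pyramid, group, chunks=(1, 128, 128), axes="zyx")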
Example #16
def create_zarr_obj_array(
    g: zarr.Group,
    name: str,
    data,
    dtype: Union[str, Any] = None,
    overwrite: bool = True,
    chunk_size: int = 100000,
) -> zarr.hierarchy:
    """
    Creates and returns a Zarr object array.

    A Zarr object array can contain any type of object.
    https://zarr.readthedocs.io/en/stable/tutorial.html#object-arrays

    Args:
        g (zarr.Group): Zarr group under which the array is created.
        name (str): Name of the new object array.
        data: Values to store in the array.
        dtype (Union[str, Any]): Data type; if None, a fixed-width unicode dtype
            wide enough for the longest value is used.
        overwrite (bool): Whether to overwrite an existing array with the same name.
        chunk_size (int): Number of elements per chunk; None or False stores the
            array unchunked.

    Returns:
        A Zarr object Array.
    """

    from numcodecs import Blosc

    compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.BITSHUFFLE)

    data = np.array(data)
    if dtype is None or dtype == object:
        dtype = "U" + str(max([len(str(x)) for x in data]))
    if np.issubdtype(data.dtype, np.dtype("S")):
        data = data.astype("U")
        dtype = data.dtype
    if chunk_size is None or chunk_size is False:
        chunks = False
    else:
        chunks = (chunk_size, )
    return g.create_dataset(
        name,
        data=data,
        chunks=chunks,
        shape=len(data),
        dtype=dtype,
        overwrite=overwrite,
        compressor=compressor,
    )
Example #17
def _init_data_var(
    group: zarr.Group,
    array: xr.DataArray,
    start_shape: Sequence[int],
    start_chunks,
    dims,
):
    shape = tuple(start_shape) + array.data.shape
    chunks = tuple(start_chunks) + array.data.shape
    out_array = group.empty(
        name=array.name, shape=shape, chunks=chunks, dtype=array.dtype
    )
    out_array.attrs.update(array.attrs)
    dims = list(dims) + list(array.dims)
    _set_dims(out_array, dims)
Example #18
    def read_dataframe(self, group: zarr.Group) -> Union[pd.DataFrame, np.ndarray]:
        columns = group.attrs.get('columns', None)
        if columns is None:
            columns = [col for col in group.array_keys() if col != '_index']

        if group.attrs['data_type'] == 'data_frame':
            data = {col: self.read_series(group, col) for col in columns}
            _index = self.read_series(group, '_index')
            index = pd.Index(_index, name=group.attrs['index_name'], dtype=_index.dtype)
            # Passing columns=columns to the constructor would slow down generation.
            df = pd.DataFrame(data=data, index=index)
            return df
        else:
            array = np.rec.fromarrays([self.read_series(group, col) for col in columns],
                                      names=columns)
            return array
Example #19
    def write_unimodal_data(self, group: zarr.Group, name: str, data: UnimodalData, overwrite: bool = True) -> None:
        """ Write UnimodalData
        """
        sub_group = group.require_group(name, overwrite=overwrite)
        attrs_dict = {'data_type': 'UnimodalData', '_cur_matrix': data.current_matrix()}
        sub_group.attrs.update(**attrs_dict)

        self.write_dataframe(sub_group, 'barcode_metadata', data.barcode_metadata)
        self.write_dataframe(sub_group, 'feature_metadata', data.feature_metadata)

        if overwrite or data.matrices.is_dirty():
            self.write_mapping(sub_group, 'matrices', data.matrices, overwrite=overwrite)
        if overwrite or data.metadata.is_dirty():
            self.write_mapping(sub_group, 'metadata', data.metadata, overwrite=overwrite)
        if overwrite or data.barcode_multiarrays.is_dirty():
            self.write_mapping(sub_group, 'barcode_multiarrays', data.barcode_multiarrays, overwrite=overwrite)
        if overwrite or data.feature_multiarrays.is_dirty():
            self.write_mapping(sub_group, 'feature_multiarrays', data.feature_multiarrays, overwrite=overwrite)
Example #20
    def write_mapping(self,
                      group: zarr.Group,
                      name: str,
                      mapping: dict,
                      overwrite=True) -> None:
        sub_group = group.require_group(name, overwrite=overwrite)
        scalar_dict = sub_group.attrs.pop(
            'scalar', {})  # if overwrite == True, there should be no 'scalar'

        def _write_one_pair(key, value):
            if is_scalar(value):
                scalar_dict[key] = value
            elif isinstance(value, np.ndarray):
                self.write_array(sub_group, key, value)
            elif isinstance(value, pd.DataFrame):
                self.write_dataframe(sub_group, key, value)
            elif is_dict_like(value):
                self.write_mapping(sub_group, key, value)
            elif issparse(value):
                assert isinstance(value, csr_matrix)
                self.write_csr(sub_group, key, value)
            else:
                # assume value is list-like (or categorical); convert it to np.ndarray
                self.write_array(
                    sub_group, key,
                    value.astype(str)
                    if is_categorical_dtype(value) else np.array(value))

        if overwrite:
            for key, value in mapping.items():
                _write_one_pair(key, value)
        else:
            for key in mapping.deleted:
                if key in scalar_dict:
                    del scalar_dict[key]
                else:
                    del sub_group[key]
            for key in mapping.modified:
                _write_one_pair(key, mapping[key])

        attrs_dict = {'data_type': 'dict'}
        if len(scalar_dict) > 0:
            attrs_dict['scalar'] = scalar_dict
        sub_group.attrs.update(**attrs_dict)
Example #21
def _shift_store(group: zarr.Group, dim: str, n_shift: int):
    """Shift local zarr store which represents an xarray dataset by n_shift along dim
    
    Args:
        group: zarr Group for an xarray dataset backed by a DirectoryStore
        dim: name of dimension of xarray dataset along which to shift zarr
        n_shift: how far to shift. The chunk size along dim of every array in group
            must evenly divide n_shift.

    Note:
        The zarr store represented by group will no longer be valid after this
        function is called, since its chunk keys will no longer start at 0. It is
        intended that the output of this function be copied into another zarr
        store as a method of appending.
    """
    for array in group.values():
        if dim in array.attrs[XARRAY_DIM_NAMES_ATTR]:
            axis = array.attrs[XARRAY_DIM_NAMES_ATTR].index(dim)
            _shift_array(array, axis, n_shift)
Example #22
def _assert_chunks_match(source_group: zarr.Group, target_group: zarr.Group,
                         dim: str):
    """Ensure chunks for source and target groups are valid for appending.
    
    Specifically:
        1. all arrays in source_group have corresponding arrays in target_group.
        2. chunk size is same for each array in source and target group.
        3. dim length is a multiple of chunk size for target group.
        
    In addition, log a warning if dim length is not a multiple of chunk size for source
    group."""
    for key, source_array in source_group.items():
        if key not in target_group:
            raise KeyError(
                f"Cannot append {source_array} because there is no corresponding array "
                f"in {target_group}.")
        if dim != key and dim in source_array.attrs[XARRAY_DIM_NAMES_ATTR]:
            axis = source_array.attrs[XARRAY_DIM_NAMES_ATTR].index(dim)
            target_array = target_group[key]
            _assert_array_chunks_match(source_array, target_array, axis)
Example #23
def _create_zarr(
    dims: Sequence[str],
    coords: Mapping[str, xr.DataArray],
    group: zarr.Group,
    template: xr.Dataset,
):
    ds = template
    group.attrs.update(ds.attrs)

    start_shape = [len(coords[dim]) for dim in dims]
    start_chunks = (1,) * len(start_shape)

    for name in ds:
        _init_data_var(group, ds[name], start_shape, start_chunks, dims)

    for name in ds.coords:
        _init_coord(group, ds[name])

    for name in coords:
        _init_coord(group, coords[name])

    group.attrs["DIMS"] = dims
Example #24
def write_multiscales_metadata(
    group: zarr.Group,
    paths: List[str],
    fmt: Format = CurrentFormat(),
    axes: List[str] = None,
) -> None:
    """
    Write the multiscales metadata in the group.

    Parameters
    ----------
    group: zarr.Group
      the group within the zarr store to write the metadata in.
    paths: list of str
      The list of paths to the datasets for this multiscale image.
    fmt: Format
      The format of the ome_zarr data which should be used.
      Defaults to the most current.
    axes: list of str
      The names of the axes, e.g. ["t", "c", "z", "y", "x"].
      Ignored for versions 0.1 and 0.2. Required for version 0.3 or greater.
    """

    multiscales = [
        {
            "version": fmt.version,
            "datasets": [{"path": str(p)} for p in paths],
        }
    ]
    if axes is not None:
        if fmt.version in ("0.1", "0.2"):
            LOGGER.info("axes ignored for version 0.1 or 0.2")
        else:
            _validate_axes(axes, fmt)
            multiscales[0]["axes"] = axes
    group.attrs["multiscales"] = multiscales
Example #25
    def read_array(self, group: zarr.Group, name: str) -> Union[np.ndarray, np.recarray]:
        if name in group.group_keys():
            return self.read_dataframe(group[name])
        else:
            return group[name][...]
Example #26
def _get_dim_size(group: zarr.Group, dim: str):
    """Get length of dim, assuming it is same for all arrays that contain it"""
    for array in group.values():
        if dim in array.attrs[XARRAY_DIM_NAMES_ATTR]:
            axis = array.attrs[XARRAY_DIM_NAMES_ATTR].index(dim)
            return array.shape[axis]