Example No. 1
def test_broadcast_arrays_dask(d1_chunks):
    d1 = dsa.empty((5, 25), chunks=d1_chunks)
    d2 = dsa.empty((1, 25), chunks=(1, 25))

    d1b, d2b = broadcast_arrays(d1, d2)
    assert d1b.shape == (5, 25)
    assert d2b.shape == (5, 25)
    assert d1b.chunks == d1_chunks
    assert d2b.chunks == d1_chunks
Example No. 2
 def setup(self):
     r = rnd()
     self.a = da.empty(shape=(2000000, 200, 2),
                       dtype='i1',
                       chunks=(10000, 100, 2))
     self.c = r.randint(0, 2, size=self.a.shape[0], dtype=bool)
     self.s = sorted(r.choice(self.a.shape[1], size=100, replace=False))
Example No. 3
def convert_probability_to_call(
    ds: Dataset,
    call_genotype_probability: str = variables.call_genotype_probability,
    threshold: float = 0.9,
    merge: bool = True,
) -> Dataset:
    """
    Convert genotype probabilities to hard calls.

    Parameters
    ----------
    ds
        Dataset containing genotype probabilities, such as from :func:`sgkit.io.bgen.read_bgen`.
    call_genotype_probability
        Genotype probability variable to be converted as defined by
        :data:`sgkit.variables.call_genotype_probability_spec`.
    threshold
        Probability threshold in [0, 1] that must be met or exceeded by at least one genotype
        probability in order for any calls to be made -- all values will be -1 (missing)
        otherwise. Setting this value to less than or equal to 0 disables any effect it has.
        Default value is 0.9.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the following variables:

    - `call_genotype` (variants, samples, ploidy): Converted hard calls.
        Defined by :data:`sgkit.variables.call_genotype_spec`.

    - `call_genotype_mask` (variants, samples, ploidy): Mask for converted hard calls.
        Defined by :data:`sgkit.variables.call_genotype_mask_spec`.
    """
    if not (0 <= threshold <= 1):
        raise ValueError(
            f"Threshold must be float in [0, 1], not {threshold}.")
    variables.validate(
        ds,
        {call_genotype_probability: variables.call_genotype_probability_spec})
    if ds.dims["genotypes"] != 3:
        raise NotImplementedError(
            f"Hard call conversion only supported for diploid, biallelic genotypes; "
            f"num genotypes in provided probabilities array = {ds.dims['genotypes']}."
        )
    GP = da.asarray(ds[call_genotype_probability])
    # Remove chunking in genotypes dimension, if present
    if len(GP.chunks[2]) > 1:
        GP = GP.rechunk((None, None, -1))
    K = da.empty(2, dtype=np.uint8)
    GT = _convert_probability_to_call(GP, K, threshold)
    new_ds = create_dataset({
        variables.call_genotype: (("variants", "samples", "ploidy"), GT),
        variables.call_genotype_mask:
        (("variants", "samples", "ploidy"), GT < 0),
    })
    return conditional_merge_datasets(ds, new_ds, merge)
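A hedged usage sketch for the function above; the dataset `ds` is an assumption for illustration, and the output variable names follow the docstring.

# Hypothetical usage of convert_probability_to_call; `ds` is assumed to hold a
# (variants, samples, genotypes=3) call_genotype_probability variable,
# e.g. produced by sgkit.io.bgen.read_bgen.
called = convert_probability_to_call(ds, threshold=0.9)
gt = called["call_genotype"]          # hard calls, -1 where no probability reached the threshold
mask = called["call_genotype_mask"]   # True where a call is missing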
Example No. 4
def cf_y_x():
    return xr.DataArray(
        da.empty((Y_DIM_SIZE, X_DIM_SIZE)),
        dims=("y", "x"),
        attrs={
            "grid_mapping": "a_grid_map_var",
        },
    )
Example No. 5
def duck_empty(dims, sizes, dtype="float64", chunks=None):
    """Return an empty DataArray based on a numpy or dask backend, depending on the chunks argument."""
    shape = [sizes[dim] for dim in dims]
    if chunks:
        chnks = [chunks.get(dim, (sizes[dim], )) for dim in dims]
        content = dsk.empty(shape, chunks=chnks, dtype=dtype)
    else:
        content = np.empty(shape, dtype=dtype)
    return xr.DataArray(content, dims=dims)
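A brief usage sketch for duck_empty; the dims, sizes, and chunk tuples below are made up for illustration. Passing a chunks dict selects the dask backend, omitting it selects numpy.

# Illustrative only; chunk values are per-dimension tuples, as duck_empty expects.
sizes = {"time": 10, "y": 4, "x": 6}
lazy = duck_empty(("time", "y", "x"), sizes, chunks={"time": (5, 5)})  # dask-backed
eager = duck_empty(("time", "y", "x"), sizes)                          # numpy-backed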
Example No. 6
def raw_coords_lats1d_lons1d():
    return xr.DataArray(
        da.empty((Y_DIM_SIZE, X_DIM_SIZE)),
        dims=("lats", "lons"),
        coords={
            "lons": da.linspace(25, 35, X_DIM_SIZE),
            "lats": da.linspace(25, 35, Y_DIM_SIZE),
        },
    )
Example No. 7
def label_adjacency_graph(labels, nlabels, depth, iou_threshold):
    all_mappings = [da.empty((2, 0), dtype=np.int32, chunks=1)]

    slices_and_axes = get_slices_and_axes(labels.chunks, labels.shape, depth)
    for face_slice, axis in slices_and_axes:
        face = labels[face_slice]
        mapped = _across_block_iou_delayed(face, axis, iou_threshold)
        all_mappings.append(mapped)

    i, j = da.concatenate(all_mappings, axis=1)
    result = _label._to_csr_matrix(i, j, nlabels + 1)
    return result
Example No. 8
def regrid_chunk(
    block_src_data,
    block_src_y_pnts,
    block_src_low_y_bnds,
    block_src_upp_y_bnds,
    src_y_coord_metadata,
    src_x_coord,
    src_cube_metadata,
    tgt_y_coord,
    tgt_x_coord,
    tgt_y_slices,
    tgt_cube_metadata,
    y_dim,
    x_dim,
    scheme,
    block_info=None,
):
    # Construct source and target cubes.
    block_src_y_coord = iris.coords.DimCoord(
        block_src_y_pnts.ravel(),
        bounds=np.hstack((
            block_src_low_y_bnds,
            block_src_upp_y_bnds,
        )),
    )
    block_src_y_coord.metadata = src_y_coord_metadata

    src_cube = iris.cube.Cube(
        block_src_data,
        dim_coords_and_dims=[(block_src_y_coord, y_dim), (src_x_coord, x_dim)],
    )
    src_cube.metadata = src_cube_metadata

    tgt_y_slice = tgt_y_slices[block_info[0]["chunk-location"][0]]
    block_tgt_y_coord = tgt_y_coord[tgt_y_slice]

    tgt_shape = (
        block_tgt_y_coord.shape[0],
        tgt_x_coord.shape[0],
    )
    tgt_cube = iris.cube.Cube(
        da.empty(tgt_shape),
        dim_coords_and_dims=[(block_tgt_y_coord, y_dim), (tgt_x_coord, x_dim)],
    )
    tgt_cube.metadata = tgt_cube_metadata

    # Regrid and ensure that there are 2 dimensions.
    reg_data = src_cube.regrid(tgt_cube, scheme).data.reshape(tgt_shape)
    return reg_data
Example No. 9
    def read_5d(filename: str,
                sizes: Tuple[int, int, int, int, int],
                s: int,
                mdata: czimd.CziMetadata,
                remove_Adim: bool = True) -> np.ndarray:

        array_md = da.empty([
            sizes[0], sizes[1], sizes[2], sizes[3], sizes[4],
            3 if mdata.isRGB else 1
        ],
                            dtype=mdata.npdtype)

        # open the CZI document for reading the image data
        with pyczi.open_czi(filename) as czidoc:

            # read array for the scene
            for t, z, c in product(range(sizes[0]), range(sizes[1]),
                                   range(sizes[2])):

                if mdata.image.SizeS is None:
                    image2d = czidoc.read()
                else:
                    image2d = czidoc.read(plane={
                        'T': t,
                        'Z': z,
                        'C': c
                    },
                                          scene=s)

                # crop the plane if the stored CZI dimensions exceed the expected image size
                if mdata.pyczi_dims["X"][
                        1] > mdata.image.SizeX or mdata.pyczi_dims["Y"][
                            1] > mdata.image.SizeY:
                    image2d = image2d[..., 0:mdata.image.SizeY,
                                      0:mdata.image.SizeX, :]

                array_md[t, z, c, ...] = image2d

        if remove_Adim:
            array_md = np.squeeze(array_md, axis=-1)

        return array_md
Example No. 10
def gx_y_x():
    crs = CRS.from_epsg(4326)
    return xr.DataArray(
        da.empty((Y_DIM_SIZE, X_DIM_SIZE)),
        dims=("y", "x"),
        attrs={
            "grid_mapping": "spatial_ref",
        },
        coords={
            "spatial_ref": xr.DataArray(
                0,
                attrs={
                    "crs_wkt": crs.to_wkt(),
                    "spatial_ref": crs.to_wkt(),
                },
            ),
            "y": da.linspace(0, 15000, X_DIM_SIZE),
            "x": da.linspace(-15000, 10000, Y_DIM_SIZE),
        },
    )
Example No. 11
def label_adjacency_graph(labels, structure, nlabels):
    """
    Adjacency graph of labels between chunks of ``labels``.

    Each chunk in ``labels`` has been labeled independently, and the labels
    in different chunks are guaranteed to be unique.

    Here we construct a graph connecting labels in different chunks that
    correspond to the same logical label in the global volume. This is true
    if the two labels "touch" across the block face as defined by the input
    ``structure``.

    Parameters
    ----------
    labels : dask array of int
        The input labeled array, where each chunk is independently labeled.
    structure : array of bool
        Structuring element, shape (3,) * labels.ndim.
    nlabels : delayed int
        The total number of labels in ``labels`` *before* correcting for
        global consistency.

    Returns
    -------
    mat : delayed scipy.sparse.csr_matrix
        This matrix has value 1 at (i, j) if label i is connected to
        label j in the global volume, 0 everywhere else.
    """
    faces = _chunk_faces(labels.chunks, labels.shape)
    all_mappings = [da.empty((2, 0), dtype=LABEL_DTYPE, chunks=1)]
    for face_slice in faces:
        face = labels[face_slice]
        mapped = _across_block_label_grouping_delayed(face, structure)
        all_mappings.append(mapped)
    all_mappings = da.concatenate(all_mappings, axis=1)
    i, j = all_mappings
    mat = _to_csr_matrix(i, j, nlabels + 1)
    return mat
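The _to_csr_matrix helper is not shown here; a minimal sketch of the idea, assuming scipy is available, is to scatter ones at every (i, j) label pair.

# Illustrative sketch only (not the library's implementation): build the sparse
# adjacency matrix described in the docstring from computed (i, j) label pairs.
import numpy as np
from scipy import sparse

def to_csr_matrix_sketch(i, j, n):
    # value 1 at (i, j) for every cross-chunk label pair, 0 everywhere else
    v = np.ones_like(i, dtype=np.uint8)
    return sparse.coo_matrix((v, (i, j)), shape=(n, n)).tocsr()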
Example No. 12
def strfunc_from_pdf1(rxs, pdf, values, order, absolute=False):
    """Compute the structure function of the specified order from the PDF,
    for the increments module.

    """
    if absolute:
        values = abs(values)

    irx_max = rxs.size

    n = pdf.shape[1]
    dpdf = da.from_array(pdf, chunks=(1, n))
    dvalues = da.from_array(values, chunks=(1, n))
    S_order = da.empty(rxs.shape, chunks=1)
    print(f"S_order {S_order.shape}")
    print(f"pdf {pdf.shape}")
    print(f"values {values.shape}")
    for irx in range(irx_max):
        S_order[irx] = da.sum(
            dpdf[irx] * dvalues[irx]**order) * np.abs(dvalues[irx, 1] -
                                                      dvalues[irx, 0])

    return S_order.compute()
Example No. 13
def count_cohort_alleles(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    sample_cohort: Hashable = variables.sample_cohort,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort allele counts from per-sample allele counts, or genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    sample_cohort
        Input variable name holding sample_cohort as defined by
        :data:`sgkit.variables.sample_cohort_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.cohort_allele_count_spec`
    of allele counts with shape (variants, cohorts, alleles) and values corresponding to
    the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2   S3
    variants
    0         0/0  1/0  1/0  0/1
    1         1/0  0/1  0/0  1/0
    2         1/1  0/0  1/0  0/1
    3         1/0  1/1  1/1  1/0
    4         1/0  0/0  1/0  1/1

    >>> sg.count_cohort_alleles(ds)["cohort_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[3, 1],
            [2, 2]],
    <BLANKLINE>
            [[2, 2],
            [3, 1]],
    <BLANKLINE>
            [[2, 2],
            [2, 2]],
    <BLANKLINE>
            [[1, 3],
            [1, 3]],
    <BLANKLINE>
            [[3, 1],
            [1, 3]]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    n_variants = ds.dims["variants"]
    n_alleles = ds.dims["alleles"]

    AC, SC = da.asarray(ds[call_allele_count]), da.asarray(ds[sample_cohort])
    n_cohorts = SC.max().compute() + 1  # 0-based indexing
    C = da.empty(n_cohorts, dtype=np.uint8)

    G = da.asarray(ds.call_genotype)
    shape = (G.chunks[0], n_cohorts, n_alleles)

    AC = da.map_blocks(_count_cohort_alleles, AC, SC, C, chunks=shape, dtype=np.int32)
    assert_array_shape(
        AC, n_variants, n_cohorts * AC.numblocks[1], n_alleles * AC.numblocks[2]
    )

    # Stack the blocks and sum across them
    # (which will only work because each chunk is guaranteed to have same size)
    AC = da.stack([AC.blocks[:, i] for i in range(AC.numblocks[1])]).sum(axis=0)
    assert_array_shape(AC, n_variants, n_cohorts, n_alleles)

    new_ds = create_dataset(
        {variables.cohort_allele_count: (("variants", "cohorts", "alleles"), AC)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
Example No. 14
def count_call_alleles(ds: Dataset, merge: bool = True) -> Dataset:
    """Compute per sample allele counts from genotype calls.

    Parameters
    ----------
    ds : Dataset
        Genotype call dataset such as from
        `sgkit.create_genotype_call_dataset`.
    merge : bool, optional
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset. Output variables will
        overwrite any input variables with the same name, and a warning
        will be issued in this case.
        If False, return only the computed output variables.

    Returns
    -------
    Dataset
        Array `call_allele_count` of allele counts with
        shape (variants, samples, alleles) and values corresponding to
        the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> from sgkit.testing import simulate_genotype_call_dataset
    >>> ds = simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.count_call_alleles(ds)["call_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[1, 1],
            [0, 2]],
    <BLANKLINE>
           [[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[2, 0],
            [2, 0]]], dtype=uint8)
    """
    n_alleles = ds.dims["alleles"]
    G = da.asarray(ds["call_genotype"])
    shape = (G.chunks[0], G.chunks[1], n_alleles)
    N = da.empty(n_alleles, dtype=np.uint8)
    new_ds = Dataset(
        {
            "call_allele_count": (
                ("variants", "samples", "alleles"),
                da.map_blocks(
                    count_alleles, G, N, chunks=shape, drop_axis=2, new_axis=2
                ),
            )
        }
    )
    return merge_datasets(ds, new_ds) if merge else new_ds
Example No. 15
def concatenate_and_rechunk(
    zarrs: Sequence[zarr.Array],
    chunks: Optional[Tuple[int, ...]] = None,
    dtype: DType = None,
) -> da.Array:
    """Perform a concatenate and rechunk operation on a collection of Zarr arrays
    to produce an array with a uniform chunking, suitable for saving as
    a single Zarr array.

    In contrast to Dask's ``rechunk`` method, the Dask computation graph
    is embarrassingly parallel and will make efficient use of memory,
    since no Zarr chunks are cached by the Dask scheduler.

    The Zarr arrays must have matching shapes except in the first
    dimension.

    Parameters
    ----------
    zarrs
        Collection of Zarr arrays to concatenate.
    chunks : Optional[Tuple[int, ...]], optional
        The chunks to apply to the concatenated arrays. If not specified
        the chunks for the first array will be applied to the concatenated
        array.
    dtype
        The dtype of the concatenated array, by default the same as the
        first array.

    Returns
    -------
    A Dask array, suitable for saving as a single Zarr array.

    Raises
    ------
    ValueError
        If the Zarr arrays do not have matching shapes (except in the first
        dimension).
    """

    if len(set([z.shape[1:] for z in zarrs])) > 1:
        shapes = [z.shape for z in zarrs]
        raise ValueError(
            f"Zarr arrays must have matching shapes (except in the first dimension): {shapes}"
        )

    lengths = np.array([z.shape[0] for z in zarrs])
    lengths0 = np.insert(lengths, 0, 0,
                         axis=0)  # type: ignore[no-untyped-call]
    offsets = np.cumsum(lengths0)
    total_length = offsets[-1]

    shape = (total_length, *zarrs[0].shape[1:])
    chunks = chunks or zarrs[0].chunks
    dtype = dtype or zarrs[0].dtype

    ar = da.empty(shape, chunks=chunks)

    def load_chunk(
        x: ArrayLike,
        zarrs: Sequence[zarr.Array],
        offsets: ArrayLike,
        block_info: Dict[Any, Any],
    ) -> ArrayLike:
        return _slice_zarrs(zarrs, offsets, block_info[0]["array-location"])

    return ar.map_blocks(load_chunk, zarrs=zarrs, offsets=offsets, dtype=dtype)
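A hedged usage sketch for concatenate_and_rechunk; the two small in-memory Zarr arrays below are made up for illustration.

# Illustrative only: concatenate two toy Zarr arrays that match in all but the first dimension.
import numpy as np
import zarr

z1 = zarr.array(np.arange(12).reshape(4, 3), chunks=(2, 3))
z2 = zarr.array(np.arange(12, 21).reshape(3, 3), chunks=(2, 3))
combined = concatenate_and_rechunk([z1, z2], chunks=(5, 3))
assert combined.shape == (7, 3)
result = combined.compute()  # materializes the uniformly chunked array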
Example No. 16
    def computeTimeChunk(self, time, dt):
        """Load a chunk of three data time steps into the FieldSet.
        This is used when FieldSet uses data imported from netcdf,
        with default option deferred_load. The loaded time steps are at or immediatly before time
        and the two time steps immediately following time if dt is positive (and inversely for negative dt)
        :param time: Time around which the FieldSet chunks are to be loaded. Time is provided as a double, relatively to Fieldset.time_origin
        :param dt: time step of the integration scheme
        """
        signdt = np.sign(dt)
        nextTime = np.infty if dt > 0 else -np.infty

        for g in self.gridset.grids:
            g.update_status = 'not_updated'
        for f in self.get_fields():
            if type(f) in [VectorField, NestedField, SummedField] or not f.grid.defer_load:
                continue
            if f.grid.update_status == 'not_updated':
                nextTime_loc = f.grid.computeTimeChunk(f, time, signdt)
                if time == nextTime_loc and signdt != 0:
                    raise TimeExtrapolationError(time, field=f, msg='In fset.computeTimeChunk')
            nextTime = min(nextTime, nextTime_loc) if signdt >= 0 else max(nextTime, nextTime_loc)

        for f in self.get_fields():
            if type(f) in [VectorField, NestedField, SummedField] or not f.grid.defer_load or f.dataFiles is None:
                continue
            g = f.grid
            if g.update_status == 'first_updated':  # First load of data
                data = da.empty((g.tdim, g.zdim, g.ydim-2*g.meridional_halo, g.xdim-2*g.zonal_halo), dtype=np.float32)
                f.loaded_time_indices = range(3)
                for tind in f.loaded_time_indices:
                    for fb in f.filebuffers:
                        if fb is not None:
                            fb.dataset.close()

                    data = f.computeTimeChunk(data, tind)
                data = f.rescale_and_set_minmax(data)
                f.data = f.reshape(data)
                if not f.chunk_set:
                    f.chunk_setup()
                if len(g.load_chunk) > 0:
                    g.load_chunk = np.where(g.load_chunk == 2, 1, g.load_chunk)
                    g.load_chunk = np.where(g.load_chunk == 3, 0, g.load_chunk)
            elif g.update_status == 'updated':
                data = da.empty((g.tdim, g.zdim, g.ydim-2*g.meridional_halo, g.xdim-2*g.zonal_halo), dtype=np.float32)
                if signdt >= 0:
                    f.loaded_time_indices = [2]
                    f.filebuffers[0].dataset.close()
                    f.filebuffers[:2] = f.filebuffers[1:]
                    data = f.computeTimeChunk(data, 2)
                else:
                    f.loaded_time_indices = [0]
                    f.filebuffers[2].dataset.close()
                    f.filebuffers[1:] = f.filebuffers[:2]
                    data = f.computeTimeChunk(data, 0)
                data = f.rescale_and_set_minmax(data)
                if signdt >= 0:
                    data = f.reshape(data)[2:, :]
                    f.data = da.concatenate([f.data[1:, :], data], axis=0)
                else:
                    data = f.reshape(data)[0:1, :]
                    f.data = da.concatenate([data, f.data[:2, :]], axis=0)
                if len(g.load_chunk) > 0:
                    if signdt >= 0:
                        for block_id in range(len(g.load_chunk)):
                            if g.load_chunk[block_id] == 2:
                                if f.data_chunks[block_id] is None:
                                    # file chunks were never loaded.
                                    # happens when field not called by kernel, but shares a grid with another field called by kernel
                                    break
                                block = f.get_block(block_id)
                                f.data_chunks[block_id][:2] = f.data_chunks[block_id][1:]
                                f.data_chunks[block_id][2] = np.array(f.data.blocks[(slice(3),)+block][2])
                    else:
                        for block_id in range(len(g.load_chunk)):
                            if g.load_chunk[block_id] == 2:
                                if f.data_chunks[block_id] is None:
                                    # file chunks were never loaded.
                                    # happens when field not called by kernel, but shares a grid with another field called by kernel
                                    break
                                block = f.get_block(block_id)
                                f.data_chunks[block_id][1:] = f.data_chunks[block_id][:2]
                                f.data_chunks[block_id][0] = np.array(f.data.blocks[(slice(3),)+block][0])
        # do user-defined computations on fieldset data
        if self.compute_on_defer:
            self.compute_on_defer(self)

        if abs(nextTime) == np.infty or np.isnan(nextTime):  # Second happens when dt=0
            return nextTime
        else:
            nSteps = int((nextTime - time) / dt)
            if nSteps == 0:
                return nextTime
            else:
                return time + nSteps * dt
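The rolling three-step time window above is advanced by dropping the oldest loaded step and concatenating the newly read one; a standalone sketch of that pattern (array contents are illustrative):

# Standalone illustration of the rolling 3-step window used in computeTimeChunk.
import dask.array as da
import numpy as np

data = da.from_array(np.arange(12).reshape(3, 4), chunks=(1, 4))        # 3 loaded time steps
new_step = da.from_array(np.arange(100, 104).reshape(1, 4), chunks=(1, 4))

# forward in time (signdt >= 0): drop the oldest step, append the new one
data = da.concatenate([data[1:], new_step], axis=0)
assert data.shape == (3, 4)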
Example No. 17
def misc_t_z_y_x():
    return xr.DataArray(
        da.empty((TIME_DIM_SIZE, ALT_DIM_SIZE, Y_DIM_SIZE, X_DIM_SIZE)),
        dims=("t", "z", "y", "x"),
    )
Example No. 18
def misc_y_x_z():
    return xr.DataArray(
        da.empty((Y_DIM_SIZE, X_DIM_SIZE, ALT_DIM_SIZE)),
        dims=("y", "x", "z"),
    )
Example No. 19
def geotiff_y_x_bands():
    return xr.DataArray(
        da.empty((Y_DIM_SIZE, X_DIM_SIZE, OTHER_DIM_SIZE)),
        dims=("y", "x", "bands"),
    )
Example No. 20
def geotiff_bands_y_x():
    return xr.DataArray(
        da.empty((OTHER_DIM_SIZE, Y_DIM_SIZE, X_DIM_SIZE)),
        dims=("bands", "y", "x"),
    )
Example No. 21
def geotiff_b_a():
    return xr.DataArray(
        da.empty((Y_DIM_SIZE, X_DIM_SIZE)),
        dims=("a", "b"),
    )
Example No. 22
def wrapper(func,
            time,
            params=1,
            chunks=None,
            dtype='float',
            output='xarray',
            name='z',
            **kwargs):
    """ Wraps timeseries generation code in order to distribute the generation

    Parameters
    ----------
        func: method
            Method wrapped; its signature needs to be func(p1, p2, ..., time, draws=1, **kwargs)
            where p1, p2 are dimensioning parameters that are neither time nor draw.
            The minimal signature is func(time, draws=1, **kwargs)
        time: int, np.ndarray, tuple
            Number of time steps, time array, tuple (T, dt)
        params: int, dict, optional
            Parameters that will lead to dimensions or that are required to
            generate the time series
        chunks: dict, optional
            Associated chunks
        seed: int, optional
            numpy seed
        name: str, optional
            output name, may be required if multiple variables are correlated
            (dask otherwise will assume they are one and the same)
        **kwargs:
            passed to func
    """

    if isinstance(time, int):
        time = np.arange(time)
    elif isinstance(time, tuple):
        time = np.arange(0., time[0], time[1])
    else:
        time = np.array(time)
    Nt = time.size

    if isinstance(params, dict):
        dims = {}
        for d, v in params.items():
            if d=='draw' and isinstance(v, int):
                dims[d] = np.arange(v)
            else:
                dims[d] = np.array(v, ndmin=1)
    else:
        dims = {'draw': np.array(range(params), ndmin=1)}
    dims['time'] = time
    Nd = len(dims)
    shape = tuple(v.size for d, v in dims.items())

    xr_chunks = {d: 'auto' for d in dims}
    xr_chunks['time'] = -1
    if chunks:
        xr_chunks.update(**chunks)
    da_chunks = tuple(xr_chunks[d] for d in dims)

    # transform dimensions into dask arrays with appropriate forms
    # Note: adding name to dimension names below is pretty critical if
    #   multiple calls to wrapper are made.
    #   dask will create a single object ... danger
    dims_da = tuple(da.from_array(dims[d]
                                  .reshape(tuple(dims[d].size if i==j else 1 for j in range(Nd))),
                                  chunks=tuple(xr_chunks[d] if i==j else -1 for j in range(Nd)),
                                  name=name+d
                                 )
                     for i, d in enumerate(dims)
                    )

    # wraps func to reinit numpy seed from chunk number
    def _func(*args, seed=None, block_info=None, **kwargs):
        if seed is None:
            seed = np.random.randint(0,2**32-1)
        np.random.seed(seed+block_info[0]['num-chunks'][0])
        return func(*args[1:],
                    draws=args[0].shape[-2],
                    seed=seed,
                    **kwargs)

    x = da.empty(shape=shape, chunks=da_chunks)
    dims_da = tuple(d for d in dims_da if d.name!=name+'draw')
    x = x.map_blocks(_func, *dims_da, **kwargs, dtype=dtype)
    x = x.squeeze()
    dims = {d: v for d, v in dims.items() if v.size>1}

    # put result in an xarray DataArray
    if output=='xarray':
        x = xr.DataArray(x, dims=tuple(dims), coords=dims).rename(name)
    elif output=='dask_dd':
        assert x.ndim<3, 'Data generated is not 2D and cannot be transformed' \
                +' into a dataframe'
        to_index = lambda d: (dd
                              .from_array(dims[d], columns=d)
                              .to_frame()
                              .set_index(d)
                              .index
                             )
        if shape[0]==1:
            i=to_index('time')
            c='draw'
        else:
            i=to_index('draw')
            c=time
        x = dd.from_dask_array(x, index=i, columns=c)

    return x
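A hedged sketch of how wrapper might be called; the toy white-noise generator and its argument handling are assumptions for illustration, not part of the original code.

# Hypothetical generator; it only uses the sizes of the arguments it receives.
def white_noise(time, draws=1, seed=None, **kwargs):
    return np.random.randn(draws, time.size)

# 10 draws of a 365-step series, returned as an xarray DataArray named "noise"
ts = wrapper(white_noise, time=365, params=10, name="noise")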
Example No. 23
 def test_non_square_datasets(self):
     data_array_dask = da.ones((6, 16, 100, 50), chunks=(2, 2, 25, 25))
     peak_array_dask = da.empty((6, 16), chunks=(2, 2), dtype=object)
     dt._intensity_peaks_image(data_array_dask, peak_array_dask, 5)
Example No. 24
def count_call_alleles(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    merge: bool = True,
) -> Dataset:
    """Compute per sample allele counts from genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_allele_count_spec`
    of allele counts with shape (variants, samples, alleles) and values corresponding to
    the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.count_call_alleles(ds)["call_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[1, 1],
            [0, 2]],
    <BLANKLINE>
           [[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[2, 0],
            [2, 0]]], dtype=uint8)
    """
    variables.validate(ds, {call_genotype: variables.call_genotype_spec})
    n_alleles = ds.dims["alleles"]
    G = da.asarray(ds[call_genotype])
    shape = (G.chunks[0], G.chunks[1], n_alleles)
    N = da.empty(n_alleles, dtype=np.uint8)
    new_ds = create_dataset(
        {
            variables.call_allele_count: (
                ("variants", "samples", "alleles"),
                da.map_blocks(
                    count_alleles, G, N, chunks=shape, drop_axis=2, new_axis=2
                ),
            )
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)
Example No. 25
 def test_different_chunks(self):
     data_array_dask = da.ones((6, 16, 100, 50), chunks=(6, 4, 50, 25))
     peak_array_dask = da.empty((6, 16), chunks=(3, 2), dtype=object)
     dt._intensity_peaks_image(data_array_dask, peak_array_dask, 5)
Example No. 26
 def time_fancy(self):
     a = da.empty(shape=(2000000, 200, 2), dtype='i1',
                  chunks=(10000, 100, 2))
     c = np.random.randint(0, 2, size=a.shape[0], dtype=bool)
     s = sorted(np.random.choice(a.shape[1], size=100, replace=False))
     a[c][:, s]
Example No. 27
def statistic(stat, images, band, num_process, chunksize, feedback):
    # create an empty initial wrapper raster to manage the dask parallel
    # processing in chunks and store the result
    wrapper_array = da.empty(Image.wrapper_shape, chunks=chunksize)
    chunksize = wrapper_array.chunks[0][0]

    # call built-in numpy statistical functions with a specified axis;
    # axis=2 computes along the 'depth' axis, per pixel,
    # so the return is n by m, the shape of each band.

    # Compute the median
    if stat == 'median':

        def stat_func(stack_chunk, metadata):
            return np.nanmedian(stack_chunk, axis=2)

    # Compute the arithmetic mean
    if stat == 'mean':

        def stat_func(stack_chunk, metadata):
            return np.nanmean(stack_chunk, axis=2)

    # Compute the geometric mean
    if stat == 'gmean':

        def stat_func(stack_chunk, metadata):
            product = np.nanprod(stack_chunk, axis=2)
            count = np.count_nonzero(np.nan_to_num(stack_chunk), axis=2)
            gmean = np.array([p**(1.0 / c) for p, c in zip(product, count)])
            gmean[gmean == 1] = np.nan
            return gmean

    # Compute the maximum value
    if stat == 'max':

        def stat_func(stack_chunk, metadata):
            return np.nanmax(stack_chunk, axis=2)

    # Compute the minimum value
    if stat == 'min':

        def stat_func(stack_chunk, metadata):
            return np.nanmin(stack_chunk, axis=2)

    # Compute the standard deviation
    if stat == 'std':

        def stat_func(stack_chunk, metadata):
            return np.nanstd(stack_chunk, axis=2)

    # Compute the valid pixels
    # this counts the valid data (no NaNs) across the z-axis
    if stat == 'valid_pixels':

        def stat_func(stack_chunk, metadata):
            return stack_chunk.shape[2] - np.isnan(stack_chunk).sum(axis=2)

    # Compute the percentile NN
    if stat.startswith('percentile_'):
        p = int(stat.split('_')[1])

        def stat_func(stack_chunk, metadata):
            return np.nanpercentile(stack_chunk, p, axis=2)

    # Compute the last valid pixel
    if stat == 'last_pixel':

        def last_pixel(pixel_time_series, index_sort):
            if np.isnan(pixel_time_series).all():
                return np.nan
            for index in index_sort:
                if not np.isnan(pixel_time_series[index]):
                    return pixel_time_series[index]

        def stat_func(stack_chunk, metadata):
            index_sort = np.argsort(
                metadata['date'])[::-1]  # from the most recent to the oldest
            return np.apply_along_axis(last_pixel, 2, stack_chunk, index_sort)

    # Compute the julian day of the last valid pixel
    if stat == 'jday_last_pixel':

        def jday_last_pixel(pixel_time_series, index_sort, jdays):
            if np.isnan(pixel_time_series).all():
                return 0  # np.nan would be better, but there is a bug with multiprocessing when returning NaN here
            for index in index_sort:
                if not np.isnan(pixel_time_series[index]):
                    return jdays[index]

        def stat_func(stack_chunk, metadata):
            index_sort = np.argsort(
                metadata['date'])[::-1]  # from the most recent to the oldest
            return np.apply_along_axis(jday_last_pixel, 2, stack_chunk,
                                       index_sort, metadata['jday'])

    # Compute the julian day of the median value
    if stat == 'jday_median':

        def jday_median(pixel_time_series, index_sort, jdays):
            if np.isnan(pixel_time_series).all():
                return 0  # np.nan would be better, but there is a bug with multiprocessing when returning NaN here
            jdays = [
                jdays[index] for index in index_sort
                if not np.isnan(pixel_time_series[index])
            ]
            return np.ceil(np.median(jdays))

        def stat_func(stack_chunk, metadata):
            index_sort = np.argsort(
                metadata['date'])  # from the oldest to most recent
            return np.apply_along_axis(jday_median, 2, stack_chunk, index_sort,
                                       metadata['jday'])

    # Compute the trimmed median with lower limit and upper limit
    if stat.startswith('trim_mean_'):
        # TODO: check this stats when the time series have few data
        lower = int(stat.split('_')[2])
        upper = int(stat.split('_')[3])

        def trim_mean(pixel_time_series):
            if np.isnan(pixel_time_series).all():
                return 0  # np.nan would be better, but there is a bug with multiprocessing when returning NaN here
            pts = pixel_time_series[~np.isnan(pixel_time_series)]
            if len(pts) <= 2:
                return np.percentile(pts, (lower + upper) / 2)
            return np.mean(pts[(pts >= np.percentile(pts, lower))
                               & (pts <= np.percentile(pts, upper))])

        def stat_func(stack_chunk, metadata):
            return np.apply_along_axis(trim_mean, 2, stack_chunk)

    # Compute the linear trend using least-squares method
    if stat == 'linear_trend':

        def linear_trend(pixel_time_series, index_sort, date_list):
            if np.isnan(pixel_time_series).all() or len(
                    pixel_time_series[~np.isnan(pixel_time_series)]) == 1:
                return np.nan
            # Unix timestamp in days
            x = [
                int(int(date_list[index].strftime("%s")) / 86400)
                for index in index_sort
            ]
            x = [i - x[0] for i in x]  # diff from minimum
            pts = np.array([pixel_time_series[index] for index in index_sort])
            y = np.ma.array(pts, mask=np.isnan(pts))

            ssxm, ssxym, ssyxm, ssym = np.ma.cov(x, y, bias=1).flat
            slope = ssxym / ssxm
            return slope * 1000000

        def stat_func(stack_chunk, metadata):
            index_sort = np.argsort(
                metadata['date'])  # from the oldest to most recent
            return np.apply_along_axis(linear_trend, 2, stack_chunk,
                                       index_sort, metadata['date'])

    # Compute the statistical for the respective chunk
    def calc(block, block_id=None, chunksize=None):
        if feedback.isCanceled():
            return

        yc = block_id[0] * chunksize
        yc_size = block.shape[0]
        xc = block_id[1] * chunksize
        xc_size = block.shape[1]

        # make stack reading all images only in specific chunk
        chunks_list = [
            image.get_chunk_in_wrapper(band, xc, xc_size, yc, yc_size)
            for image in images
        ]
        # delete empty chunks
        mask_none = [x is not None for x in chunks_list]
        chunks_list = np.array([i for i in chunks_list if i is not None])

        if not chunks_list.size:
            # all chunks are empty, return the chunk with nan
            return np.full((yc_size, xc_size), np.nan)

        # some statistics require extra metadata (acquisition dates, julian days)
        metadata = {}
        if stat in [
                "last_pixel", "jday_last_pixel", "jday_median", "linear_trend"
        ]:
            metadata["date"] = np.array([image.date
                                         for image in images])[mask_none]
        if stat in ["jday_last_pixel", "jday_median"]:
            metadata["jday"] = np.array([image.jday
                                         for image in images])[mask_none]

        stack_chunk = np.stack(chunks_list, axis=2)
        return stat_func(stack_chunk, metadata)

    # process
    with ProgressBar(feedback=feedback):
        map_blocks = da.map_blocks(calc,
                                   wrapper_array,
                                   chunks=wrapper_array.chunks,
                                   chunksize=chunksize,
                                   dtype=float)
        result_array = map_blocks.compute(num_workers=num_process,
                                          scheduler="threads")

    return result_array
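The core pattern above is mapping a per-chunk reduction over an empty wrapper array, using block_id to locate each chunk in the full raster; a standalone sketch (the stacked data and chunk size are made up):

# Standalone sketch of the chunk-wise reduction pattern used in statistic().
import dask.array as da
import numpy as np

stack = np.random.rand(100, 100, 5)              # e.g. 5 images stacked along axis 2
wrapper_arr = da.empty(stack.shape[:2], chunks=50)

def calc_median(block, block_id=None, chunksize=50):
    y0 = block_id[0] * chunksize
    x0 = block_id[1] * chunksize
    chunk = stack[y0:y0 + block.shape[0], x0:x0 + block.shape[1], :]
    return np.nanmedian(chunk, axis=2)

result = da.map_blocks(calc_median, wrapper_arr, chunks=wrapper_arr.chunks,
                       chunksize=50, dtype=float).compute()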
Example No. 28
def geotiff_y_x():
    return xr.DataArray(
        da.empty((Y_DIM_SIZE, X_DIM_SIZE)),
        dims=("y", "x"),
    )
Example No. 29
 def setup(self):
     self.N = 100000
     self.a = da.empty(shape=(self.N, ), dtype='i1', chunks=[1] * self.N)
Example No. 30
def geotiff_x_y():
    # transposed data
    return xr.DataArray(
        da.empty((X_DIM_SIZE, Y_DIM_SIZE)),
        dims=("x", "y"),
    )