Code example #1
# Imports and parametrization reconstructed so the snippet runs standalone;
# the axis values below are representative, not necessarily the exact fixtures.
import numpy as np
import pytest

import dask.array as da
from dask.array.utils import assert_eq, same_keys


@pytest.mark.parametrize("is_func", [True, False])
@pytest.mark.parametrize("axis", [None, 0, -1, (0, -1)])
def test_squeeze(is_func, axis):
    a = np.arange(10)[None, :, None, None]
    d = da.from_array(a, chunks=(1, 3, 1, 1))

    if is_func:
        a_s = np.squeeze(a, axis=axis)
        d_s = da.squeeze(d, axis=axis)
    else:
        a_s = a.squeeze(axis=axis)
        d_s = d.squeeze(axis=axis)

    assert_eq(d_s, a_s)
    assert same_keys(d_s, da.squeeze(d, axis=axis))

    if axis is None:
        axis = tuple(range(a.ndim))
    else:
        axis = axis if isinstance(axis, tuple) else (axis,)
        axis = tuple(i % a.ndim for i in axis)
    axis = tuple(
        i for i, c in enumerate(d.chunks) if i in axis and len(c) == 1
    )

    exp_d_s_chunks = tuple(
        c for i, c in enumerate(d.chunks) if i not in axis
    )
    assert d_s.chunks == exp_d_s_chunks
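
A quick, hedged illustration (not part of the test suite) of what the test above verifies: da.squeeze drops only length-1 axes, and the chunk structure of the remaining axes is preserved.

import numpy as np
import dask.array as da

a = np.arange(10)[None, :, None, None]      # shape (1, 10, 1, 1)
d = da.from_array(a, chunks=(1, 3, 1, 1))   # chunks ((1,), (3, 3, 3, 1), (1,), (1,))

print(da.squeeze(d).shape)                  # (10,)      - all singleton axes removed
print(da.squeeze(d, axis=0).shape)          # (10, 1, 1) - only axis 0 removed
print(da.squeeze(d, axis=(0, -1)).chunks)   # ((3, 3, 3, 1), (1,)) - kept axes keep their chunks
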
Code example #2
File: im_utils.py Project: manzt/wsireg
def tifffile_dask_backend(image_filepath,
                          largest_series,
                          preprocessing,
                          force_rgb=None):
    """
    Read image with tifffile and use dask to read data into memory

    Parameters
    ----------
    image_filepath: str
        path to the image file
    largest_series: int
        index of the largest series in the image
    preprocessing: dict or None
        optional read-time pre-processing applied while loading, e.g.
        - greyscale conversion (at the tile level)
        - reading an individual channel or a range of channels (at the tile level)
    force_rgb: bool or None
        if None, RGB-ness is guessed from the image shape; if True, the image
        is treated as RGB even when its shape does not look interleaved

    Returns
    -------
    image: sitk.Image
        image ready for other registration pre-processing

    """
    print("using dask backend")
    zarr_series = imread(image_filepath, aszarr=True, series=largest_series)
    zarr_store = zarr.open(zarr_series)

    dask_im = da.squeeze(da.from_zarr(zarr_get_base_pyr_layer(zarr_store)))
    if force_rgb is None:
        is_rgb = guess_rgb(dask_im.shape)
        is_interleaved = is_rgb
    elif force_rgb is True and guess_rgb(dask_im.shape) is False:
        is_rgb = True
        is_interleaved = False

    if is_rgb:
        if preprocessing is not None:
            image = grayscale(dask_im, is_interleaved=is_interleaved).compute()

            image = sitk.GetImageFromArray(image)
        else:
            image = dask_im.compute()
            if is_interleaved is False:
                image = np.rollaxis(image, 0, 3)
            image = sitk.GetImageFromArray(image, isVector=True)

    elif len(dask_im.shape) == 2:
        image = sitk.GetImageFromArray(dask_im.compute())

    else:
        if preprocessing is not None:
            if (preprocessing.get("ch_indices") is not None
                    and len(dask_im.shape) > 2):
                chs = np.asarray(preprocessing.get('ch_indices'))
                dask_im = dask_im[chs, :, :]

        image = sitk.GetImageFromArray(np.squeeze(dask_im.compute()))

    return image
Code example #3
def tifffile_dask_backend(image_filepath,
                          largest_series,
                          preprocessing,
                          force_rgb=None):
    """
    Read image with tifffile and use dask to read data into memory

    Parameters
    ----------
    image_filepath: str
        path to the image file
    largest_series: int
        index of the largest series in the image
    preprocessing: dict or None
        optional read-time pre-processing applied while loading, e.g.
        - greyscale conversion (at the tile level)
        - reading an individual channel or a range of channels (at the tile level)
    force_rgb: bool or None
        if None, RGB-ness is guessed from the image shape; if True, the image
        is treated as RGB even when its shape does not look interleaved

    Returns
    -------
    image: sitk.Image
        image ready for other registration pre-processing

    """
    print("using dask backend")
    zarr_series = imread(image_filepath, aszarr=True, series=largest_series)
    zarr_store = zarr.open(zarr_series)
    dask_im = da.squeeze(da.from_zarr(zarr_get_base_pyr_layer(zarr_store)))
    return read_preprocess_array(dask_im,
                                 preprocessing=preprocessing,
                                 force_rgb=force_rgb)
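
The two examples above share the same read pattern. Below is a minimal, hedged sketch of that pattern: the file path is hypothetical and the base pyramid level is picked naively, standing in for wsireg's zarr_get_base_pyr_layer helper.

import zarr
import dask.array as da
from tifffile import imread

store = imread("example.ome.tiff", aszarr=True)   # hypothetical file
z = zarr.open(store, mode="r")

# a multiscale TIFF opens as a zarr group; assume group member "0" is the base level
base = z["0"] if isinstance(z, zarr.Group) else z
dask_im = da.squeeze(da.from_zarr(base))          # lazy array with singleton axes dropped
print(dask_im.shape, dask_im.chunksize)
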
Code example #4
 def _matvec(self, x):
     x = da.squeeze(x.reshape(self.nsl, self.ny, self.nz))
     if self.chunks[0] is not None:
         x = x.rechunk(self.chunks[0])
     if self.nz == 1:
         x = x[..., np.newaxis]
     y = da.matmul(self.G, x)
     return y.ravel()
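
A hedged sketch of the reshape / batched-matmul / ravel pattern in _matvec above, with made-up sizes in place of the operator's nsl, ny, nz attributes:

import numpy as np
import dask.array as da

nsl, nx, ny, nz = 4, 6, 5, 3
G = da.from_array(np.random.rand(nsl, nx, ny), chunks=(2, nx, ny))
x = da.from_array(np.random.rand(nsl * ny * nz), chunks=ny * nz)

x3 = da.squeeze(x.reshape(nsl, ny, nz))   # squeeze is a no-op here because nz > 1
y = da.matmul(G, x3)                      # block-wise (nx, ny) @ (ny, nz) per slice
print(y.shape, y.ravel().shape)           # (4, 6, 3) (72,)
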
Code example #5
File: im_utils.py Project: manzt/wsireg
def tf_zarr_read_single_ch(image_filepath,
                           channel_idx,
                           is_rgb,
                           is_rgb_interleaved=False):
    """
    Reads a single channel using zarr or dask in combination with tifffile

    Parameters
    ----------
    image_filepath: str
        file path to image
    channel_idx: int
        index of the channel to be read
    is_rgb: bool
        whether the image is RGB
    is_rgb_interleaved: bool
        whether the RGB channels are interleaved on the last axis

    Returns
    -------
    im:np.ndarray
        image as a np.ndarray
    """
    largest_series = tf_get_largest_series(image_filepath)
    zarr_im = zarr.open(
        imread(image_filepath, aszarr=True, series=largest_series))
    zarr_im = zarr_get_base_pyr_layer(zarr_im)
    try:
        im = da.squeeze(da.from_zarr(zarr_im))
        if is_rgb and is_rgb_interleaved is True:
            im = im[:, :, channel_idx].compute()
        elif len(im.shape) > 2:
            im = im[channel_idx, :, :].compute()
        else:
            im = im.compute()

    except ValueError:
        im = zarr_im
        if is_rgb is True and is_rgb_interleaved is True:
            im = im[:, :, channel_idx]
        elif len(im.shape) > 2:
            im = im[channel_idx, :, :].compute()
        else:
            im = im.compute()
    return im
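
A hedged sketch of the channel-slicing logic above: after da.squeeze, an interleaved RGB image keeps its channels on the last axis, while a channel-first (CYX) stack keeps them on the first, so the index position differs between the two branches.

import numpy as np
import dask.array as da

rgb = da.squeeze(da.from_array(np.random.rand(1, 32, 32, 3), chunks=(1, 16, 16, 3)))
cyx = da.squeeze(da.from_array(np.random.rand(4, 1, 32, 32), chunks=(1, 1, 16, 16)))

print(rgb[:, :, 1].compute().shape)   # (32, 32) - interleaved RGB, channel last
print(cyx[1, :, :].compute().shape)   # (32, 32) - multichannel, channel first
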
Code example #6
File: regenie.py Project: timothymillar/sgkit
    def apply(X: Array, YP: Array, BX: Array, BYP: Array) -> Array:
        # Collapse selected variant blocks and alphas into single
        # new covariate dimension
        assert YP.shape[2] == BYP.shape[2]
        n_group_covar = n_covar + BYP.shape[2] * n_alpha_1

        BYP = BYP.reshape((n_outcome, n_sample_block, -1))
        BG = da.concatenate((BX, BYP), axis=-1)
        BG = BG.rechunk((-1, None, -1))
        assert_block_shape(BG, 1, n_sample_block, 1)
        assert_chunk_shape(BG, n_outcome, 1, n_group_covar)
        assert_array_shape(BG, n_outcome, n_sample_block, n_group_covar)

        YP = YP.reshape((n_outcome, n_sample, -1))
        XYP = da.broadcast_to(X, (n_outcome, n_sample, n_covar))
        XG = da.concatenate((XYP, YP), axis=-1)
        XG = XG.rechunk((-1, None, -1))
        assert_block_shape(XG, 1, n_sample_block, 1)
        assert_chunk_shape(XG, n_outcome, sample_chunks[0], n_group_covar)
        assert_array_shape(XG, n_outcome, n_sample, n_group_covar)

        YG = da.map_blocks(
            # Block chunks:
            # (n_outcome, sample_chunks[0], n_group_covar) @
            # (n_outcome, n_group_covar, 1) [after transpose]
            lambda x, b: x @ b.transpose((0, 2, 1)),
            XG,
            BG,
            chunks=(n_outcome, sample_chunks, 1),
        )
        assert_block_shape(YG, 1, n_sample_block, 1)
        assert_chunk_shape(YG, n_outcome, sample_chunks[0], 1)
        assert_array_shape(YG, n_outcome, n_sample, 1)
        YG = da.squeeze(YG, axis=-1).T
        assert_block_shape(YG, n_sample_block, 1)
        assert_chunk_shape(YG, sample_chunks[0], n_outcome)
        assert_array_shape(YG, n_sample, n_outcome)
        return YG
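
A minimal, hedged sketch (synthetic shapes, not the regenie ones) of the map_blocks batched-matmul idiom used above: each (outcome, sample-block, covariate) chunk of XG is multiplied by the matching (outcome, covariate, 1) chunk of BG after a transpose, and the trailing singleton axis is squeezed away at the end.

import numpy as np
import dask.array as da

n_outcome, n_sample, n_covar = 2, 8, 3
sample_chunks = (4, 4)

XG = da.from_array(np.random.rand(n_outcome, n_sample, n_covar),
                   chunks=(n_outcome, sample_chunks, n_covar))
BG = da.from_array(np.random.rand(n_outcome, len(sample_chunks), n_covar),
                   chunks=(n_outcome, 1, n_covar))

YG = da.map_blocks(
    lambda x, b: x @ b.transpose((0, 2, 1)),   # (o, s, c) @ (o, c, 1) per block pair
    XG,
    BG,
    chunks=(n_outcome, sample_chunks, 1),
)
YG = da.squeeze(YG, axis=-1).T
print(YG.shape)   # (8, 2) == (n_sample, n_outcome)
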
Code example #7
    def _resample_tile(
        self,
        ch_idx: int,
        tile_resampler: sitk.ResampleImageFilter,
        x_max: int,
        x_min: int,
        y_max: int,
        y_min: int,
    ) -> Optional[sitk.Image]:
        """Resample tile or don't if it is outside of the moving
        image space."""

        if x_min == 0 and x_max == 0:
            return

        if y_min == 0 and y_max == 0:
            return

        if self.reg_image.is_rgb:
            image = self.reg_image.dask_image[y_min:y_max, x_min:x_max, :]
            image = sitk.GetImageFromArray(image, isVector=True)
        elif self.reg_image.n_ch == 1:
            image = da.squeeze(self.reg_image.dask_image)[
                y_min:y_max, x_min:x_max
            ]
            image = sitk.GetImageFromArray(image, isVector=False)
        else:
            image = self.reg_image.dask_image[ch_idx, y_min:y_max, x_min:x_max]
            image = sitk.GetImageFromArray(image, isVector=False)
        image.SetSpacing((self.reg_image.image_res, self.reg_image.image_res))
        image.SetOrigin(
            image.TransformIndexToPhysicalPoint([int(x_min), int(y_min)])
        )

        tile_resampled = tile_resampler.Execute(image)

        return tile_resampled
Code example #8
File: scaled_legacy.py Project: TNonet/lmdec
    def _center_x(self, x, dx, transpose: bool = False) -> da.core.Array:
        """ Centers the product of matrix multiplication instead of center the matrix

        Let A be a matrix of shape (n by p) with non zero column means, U of shape (p,).

        Matrix B could be constructed as follows with zero column mean.
            B = A - 1'U where 1 is a 1 vector. And 1'U is an outer product of shape (n by p)
        However, this is inefficient if only the matrix product of B, with a matrix x is needed.
        Instead `_center_x` implements:

            Ax - Ux
             ^    ^- dx being passed in,
             |
             x being passed in
        with efficient broadcasting.


        Parameters
        ----------
        x : array_like
            Usually the product Ax that needs to be centered
        dx : array_like
            Usually the original x before being multiplied by A
        transpose : bool
            Flag indicating whether the product is A'x or Ax; adjusts dimensions accordingly

        Returns
        -------
        x_centered: array_like
        """
        if transpose:
            # Computes mu1'x_k_h
            return x - da.squeeze(
                da.outer(self._array_moment.center_vector, dx.sum(axis=0)))
        else:
            return x - self._array_moment.center_vector.dot(dx)
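
A small numerical check, with made-up sizes and names, of the identity _center_x relies on: with B = A - 1'U, both Bx and B'x can be reproduced from Ax and A'x without ever materialising the centered matrix.

import numpy as np
import dask.array as da

n, p, k = 6, 4, 2
A = da.from_array(np.random.rand(n, p), chunks=(3, p))
u = A.mean(axis=0)                            # column means, shape (p,)
x = da.from_array(np.random.rand(p, k), chunks=(p, k))
xt = da.from_array(np.random.rand(n, k), chunks=(3, k))

B = A - da.ones((n, 1)) * u                   # centered matrix, for reference only

# forward product: Bx == Ax - (u . x), broadcast over rows
print(np.allclose((B @ x).compute(), (A @ x - u.dot(x)).compute()))      # True

# transposed product: B'xt == A'xt - outer(u, column sums of xt)
rhs_t = A.T @ xt - da.squeeze(da.outer(u, xt.sum(axis=0)))
print(np.allclose((B.T @ xt).compute(), rhs_t.compute()))                # True
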
Code example #9
def to_raster(data,
              filename,
              readxsize=None,
              readysize=None,
              use_dask_store=False,
              separate=False,
              out_block_type='zarr',
              keep_blocks=False,
              verbose=0,
              overwrite=False,
              gdal_cache=512,
              scheduler='mpool',
              n_jobs=1,
              n_workers=None,
              n_threads=None,
              n_chunks=None,
              overviews=False,
              resampling='nearest',
              use_client=False,
              address=None,
              total_memory=48,
              **kwargs):
    """
    Writes a ``dask`` array to a raster file

    Args:
        data (DataArray): The ``xarray.DataArray`` to write.
        filename (str): The output file name to write to.
        readxsize (Optional[int]): The size of column chunks to read. If not given, ``readxsize`` defaults to Dask
            chunk size.
        readysize (Optional[int]): The size of row chunks to read. If not given, ``readysize`` defaults to Dask
            chunk size.
        separate (Optional[bool]): Whether to write blocks as separate files. Otherwise, write to a single file.
        use_dask_store (Optional[bool]): Whether to use ``dask.array.store`` to save with Dask task graphs.
        out_block_type (Optional[str]): The output block type. Choices are ['gtiff', 'zarr'].
            Only used if ``separate`` = ``True``.
        keep_blocks (Optional[bool]): Whether to keep the blocks stored on disk. Only used if ``separate`` = ``True``.
        verbose (Optional[int]): The verbosity level.
        overwrite (Optional[bool]): Whether to overwrite an existing file.
        gdal_cache (Optional[int]): The ``GDAL`` cache size (in MB).
        scheduler (Optional[str]): The ``concurrent.futures`` scheduler to use. Choices are ['processes', 'threads', 'mpool'].

            mpool: process pool of workers using ``multiprocessing.Pool``
            processes: process pool of workers using ``concurrent.futures``
            threads: thread pool of workers using ``concurrent.futures``

        n_jobs (Optional[int]): The total number of parallel jobs.
        n_workers (Optional[int]): The number of processes.
        n_threads (Optional[int]): The number of threads.
        n_chunks (Optional[int]): The chunk size of windows. If not given, equal to ``n_workers`` x 50.
        overviews (Optional[bool or list]): Whether to build overview layers.
        resampling (Optional[str]): The resampling method for overviews when ``overviews`` is ``True`` or a ``list``.
            Choices are ['average', 'bilinear', 'cubic', 'cubic_spline', 'gauss', 'lanczos', 'max', 'med', 'min', 'mode', 'nearest'].
        use_client (Optional[bool]): Whether to use a ``dask`` client.
        address (Optional[str]): A cluster address to pass to client. Only used when ``use_client`` = ``True``.
        total_memory (Optional[int]): The total memory (in GB) required when ``use_client`` = ``True``.
        kwargs (Optional[dict]): Additional keyword arguments to pass to ``rasterio.write``.

    Returns:
        ``dask.delayed`` object

    Examples:
        >>> import geowombat as gw
        >>>
        >>> # Use 8 parallel workers
        >>> with gw.open('input.tif') as ds:
        >>>     gw.to_raster(ds, 'output.tif', n_jobs=8)
        >>>
        >>> # Use 4 process workers and 2 thread workers
        >>> with gw.open('input.tif') as ds:
        >>>     gw.to_raster(ds, 'output.tif', n_workers=4, n_threads=2)
        >>>
        >>> # Control the window chunks passed to concurrent.futures
        >>> with gw.open('input.tif') as ds:
        >>>     gw.to_raster(ds, 'output.tif', n_workers=4, n_threads=2, n_chunks=16)
        >>>
        >>> # Compress the output and build overviews
        >>> with gw.open('input.tif') as ds:
        >>>     gw.to_raster(ds, 'output.tif', n_jobs=8, overviews=True, compress='lzw')
    """

    if MKL_LIB:
        __ = MKL_LIB.MKL_Set_Num_Threads(n_threads)

    pfile = Path(filename)

    if scheduler.lower() == 'mpool':
        pool_executor = multi.Pool
    elif scheduler.lower() == 'processes':
        pool_executor = concurrent.futures.ProcessPoolExecutor
    else:
        pool_executor = concurrent.futures.ThreadPoolExecutor

    if overwrite:

        if pfile.is_file():
            pfile.unlink()

    if pfile.is_file():
        logger.warning('  The output file already exists.')
        return

    if not is_dask_collection(data.data):
        logger.exception('  The data should be a dask array.')

    if use_client:

        if address:
            cluster_object = _cluster_dummy
        else:
            cluster_object = LocalCluster

        client_object = Client

    else:

        cluster_object = _cluster_dummy
        client_object = _client_dummy

    if isinstance(n_workers, int) and isinstance(n_threads, int):
        n_jobs = n_workers * n_threads
    else:

        n_workers = n_jobs
        n_threads = 1

    mem_per_core = int(total_memory / n_workers)

    if not isinstance(n_chunks, int):
        n_chunks = n_workers * 50

    if not isinstance(readxsize, int):
        readxsize = data.gw.col_chunks

    if not isinstance(readysize, int):
        readysize = data.gw.row_chunks

    chunksize = (data.gw.row_chunks, data.gw.col_chunks)

    # Force tiled outputs with no file sharing
    kwargs['sharing'] = False

    if data.gw.tiled:
        kwargs['tiled'] = True

    if 'compress' in kwargs:

        # Store the compression type because
        #   it is removed in concurrent writing
        compress = True
        compress_type = kwargs['compress']
        del kwargs['compress']

    elif isinstance(data.gw.compress,
                    str) and data.gw.compress.lower() in ['lzw', 'deflate']:

        compress = True
        compress_type = data.gw.compress

    else:
        compress = False

    if 'nodata' not in kwargs:

        if isinstance(data.gw.nodata, int) or isinstance(
                data.gw.nodata, float):
            kwargs['nodata'] = data.gw.nodata

    if 'blockxsize' not in kwargs:
        kwargs['blockxsize'] = data.gw.col_chunks

    if 'blockysize' not in kwargs:
        kwargs['blockysize'] = data.gw.row_chunks

    if 'bigtiff' not in kwargs:
        kwargs['bigtiff'] = data.gw.bigtiff

    if 'driver' not in kwargs:
        kwargs['driver'] = data.gw.driver

    if 'count' not in kwargs:
        kwargs['count'] = data.gw.nbands

    if 'width' not in kwargs:
        kwargs['width'] = data.gw.ncols

    if 'height' not in kwargs:
        kwargs['height'] = data.gw.nrows

    if separate:

        d_name = pfile.parent
        sub_dir = d_name.joinpath('sub_tmp_')
        zarr_file = sub_dir.joinpath('data.zarr').as_posix()

        sub_dir.mkdir(parents=True, exist_ok=True)

        root = zarr.open(zarr_file, mode='w')

    else:

        root = None

        if verbose > 0:
            logger.info('  Creating the file ...\n')

        with rio.open(filename, mode='w', **kwargs) as rio_dst:
            pass

    if verbose > 0:
        logger.info('  Writing data to file ...\n')

    with rio.Env(GDAL_CACHEMAX=gdal_cache):

        if not use_dask_store:

            windows = get_window_offsets(data.gw.nrows,
                                         data.gw.ncols,
                                         readysize,
                                         readxsize,
                                         return_as='list')

            n_windows = len(windows)

            # Iterate over the windows in chunks
            for wchunk in range(0, n_windows, n_chunks):

                window_slice = windows[wchunk:wchunk + n_chunks]
                n_windows_slice = len(window_slice)

                if verbose > 0:

                    logger.info('  Windows {:,d}--{:,d} of {:,d} ...'.format(
                        wchunk + 1, wchunk + n_windows_slice, n_windows))

                if len(data.shape) == 2:
                    data_gen = ((data[w.row_off:w.row_off + w.height,
                                      w.col_off:w.col_off + w.width], filename,
                                 w, n_threads, separate, chunksize, root)
                                for w in window_slice)
                elif len(data.shape) == 3:
                    data_gen = ((data[:, w.row_off:w.row_off + w.height,
                                      w.col_off:w.col_off + w.width], filename,
                                 w, n_threads, separate, chunksize, root)
                                for w in window_slice)
                else:
                    data_gen = ((data[:, :, w.row_off:w.row_off + w.height,
                                      w.col_off:w.col_off + w.width], filename,
                                 w, n_threads, separate, chunksize, root)
                                for w in window_slice)

                with pool_executor(n_workers) as executor:

                    if scheduler == 'mpool':

                        for zarr_file in tqdm(executor.imap_unordered(
                                _write_xarray, data_gen),
                                              total=n_windows_slice):
                            pass

                    else:

                        for zarr_file in tqdm(executor.map(
                                _write_xarray, data_gen),
                                              total=n_windows_slice):
                            pass

            # if overviews:
            #
            #     if not isinstance(overviews, list):
            #         overviews = [2, 4, 8, 16]
            #
            #     if resampling not in ['average', 'bilinear', 'cubic', 'cubic_spline',
            #                           'gauss', 'lanczos', 'max', 'med', 'min', 'mode', 'nearest']:
            #
            #         logger.warning("  The resampling method is not supported by rasterio. Setting to 'nearest'")
            #
            #         resampling = 'nearest'
            #
            #     if verbose > 0:
            #         logger.info('  Building pyramid overviews ...')
            #
            #     rio_dst.build_overviews(overviews, getattr(Resampling, resampling))
            #     rio_dst.update_tags(ns='overviews', resampling=resampling)

        else:

            with cluster_object(
                    n_workers=n_workers,
                    threads_per_worker=n_threads,
                    scheduler_port=0,
                    processes=False,
                    memory_limit='{:d}GB'.format(mem_per_core)) as cluster:

                cluster_address = address if address else cluster

                with client_object(address=cluster_address) as client:

                    with WriteDaskArray(filename,
                                        overwrite=overwrite,
                                        separate=separate,
                                        out_block_type=out_block_type,
                                        keep_blocks=keep_blocks,
                                        gdal_cache=gdal_cache,
                                        **kwargs) as dst:

                        # Store the data and return a lazy evaluator
                        res = da.store(da.squeeze(data.data),
                                       dst,
                                       lock=False,
                                       compute=False)

                        if verbose > 0:
                            logger.info('  Writing data to file ...')

                        # Send the data to file
                        #
                        # *Note that the progress bar will
                        #   not work with a client.
                        if use_client:
                            res.compute(num_workers=n_jobs)
                        else:

                            with ProgressBar():
                                res.compute(num_workers=n_jobs)

                        if verbose > 0:
                            logger.info('  Finished writing data to file.')

                        out_block_type = dst.out_block_type
                        keep_blocks = dst.keep_blocks
                        zarr_file = dst.zarr_file
                        sub_dir = dst.sub_dir

        if compress:

            if verbose > 0:
                logger.info('  Compressing output file ...')

            if separate:

                group_keys = list(root.group_keys())
                n_groups = len(group_keys)

                if out_block_type.lower() == 'zarr':
                    # root = zarr.open(zarr_file, mode='r')
                    open_file = zarr_file
                else:

                    outfiles = sorted(
                        fnmatch.filter(os.listdir(sub_dir), '*.tif'))
                    outfiles = [os.path.join(sub_dir, fn) for fn in outfiles]

                    # data_gen = ((fn, None, 'gtiff') for fn in outfiles)

                kwargs['compress'] = compress_type

                n_windows = len(group_keys)

                # Compress into one file
                with rio.open(filename, mode='w', **kwargs) as dst_:

                    # Iterate over the windows in chunks
                    for wchunk in range(0, n_groups, n_chunks):

                        group_keys_slice = group_keys[wchunk:wchunk + n_chunks]
                        n_windows_slice = len(group_keys_slice)

                        if verbose > 0:

                            logger.info(
                                '  Windows {:,d}--{:,d} of {:,d} ...'.format(
                                    wchunk + 1, wchunk + n_windows_slice,
                                    n_windows))

                        ################################################
                        data_gen = ((open_file, group, 'zarr')
                                    for group in group_keys_slice)

                        # for f in tqdm(executor.map(_compressor, data_gen), total=n_windows_slice):
                        #     pass
                        #
                        # futures = [executor.submit(_compress_dummy, iter_[0], iter_[1], None) for iter_ in data_gen]
                        #
                        # for f in tqdm(concurrent.futures.as_completed(futures), total=n_windows_slice):
                        #
                        #     out_window, out_block = f.result()
                        #
                        #     dst_.write(np.squeeze(out_block),
                        #                window=out_window,
                        #                indexes=out_indexes_)
                        ################################################

                        # data_gen = ((root, group, 'zarr') for group in group_keys_slice)

                        # for f, g, t in tqdm(data_gen, total=n_windows_slice):
                        #
                        #     out_window, out_indexes, out_block = _block_read_func(f, g, t)

                        # executor.map(_block_write_func, data_gen)

                        with concurrent.futures.ProcessPoolExecutor(
                                max_workers=n_workers) as executor:

                            # Submit all of the tasks as futures
                            futures = [
                                executor.submit(_block_read_func, f, g, t)
                                for f, g, t in data_gen
                            ]

                            for f in tqdm(
                                    concurrent.futures.as_completed(futures),
                                    total=n_windows_slice):

                                out_window, out_indexes, out_block = f.result()

                                dst_.write(out_block,
                                           window=out_window,
                                           indexes=out_indexes)

                        futures = None

                if not keep_blocks:
                    shutil.rmtree(sub_dir)

            else:

                p = Path(filename)

                d_name = p.parent
                f_base, f_ext = os.path.splitext(p.name)

                ld = string.ascii_letters + string.digits
                rstr = ''.join(random.choice(ld) for i in range(0, 9))

                temp_file = d_name.joinpath('{}_temp_{}{}'.format(
                    f_base, rstr, f_ext))

                compress_raster(filename,
                                temp_file.as_posix(),
                                n_jobs=n_jobs,
                                gdal_cache=gdal_cache,
                                compress=compress_type)

                temp_file.rename(filename)

            if verbose > 0:
                logger.info('  Finished compressing')

    if verbose > 0:
        logger.info('\nFinished writing the data.')
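
A hedged, much-reduced sketch of the use_dask_store branch above: the array is squeezed, handed to a writable target (a plain zarr array here, standing in for geowombat's WriteDaskArray), and execution is deferred with compute=False so it can run under a progress bar or a distributed client.

import numpy as np
import zarr
import dask.array as da
from dask.diagnostics import ProgressBar

data = da.from_array(np.random.rand(1, 256, 256), chunks=(1, 64, 64))
dst = zarr.zeros((256, 256), chunks=(64, 64), dtype="f8")

res = da.store(da.squeeze(data), dst, lock=False, compute=False)   # lazy evaluator

with ProgressBar():
    res.compute(num_workers=4)

print(np.allclose(np.asarray(dst), np.squeeze(data.compute())))    # True
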
Code example #10
    def as_array(self, axes=None, stitched=False, verbose=True, **kwargs):
        """
        Read all image data as one big Dask array, with the last two axes as y, x and the preceding axes depending on the data.
        The dask array is made up of memory-mapped numpy arrays, so the dataset does not need to be able to fit into RAM.
        If the data doesn't fully fill out the array (e.g. not every z-slice collected at every time point), zeros will
        be added automatically.

        To convert data into a numpy array, call np.asarray() on the returned result. However, doing so will bring the
        data into RAM, so it may be better to do this on only a slice of the array at a time.

        Parameters
        ----------
        axes : list
            list of axes names over which to iterate and merge into a stacked array. If None, all axes will be used
        stitched : bool
            If true and tiles were acquired in a grid, lay out adjacent tiles next to one another (Default value = False)
        verbose : bool
            If True print updates on progress loading the image
        **kwargs :
            names and integer positions of axes on which to slice data
        Returns
        -------
        dataset : dask array
        """

        w = self.image_width if not stitched else self._tile_width
        h = self.image_height if not stitched else self._tile_height
        self._empty_tile = (np.zeros(
            (h, w), self.dtype) if self.bytes_per_pixel != 3 else np.zeros(
                (h, w, 3), self.dtype))
        self._count = 1
        total = np.prod([len(v) for v in self.axes.values()])

        def recurse_axes(loop_axes, point_axes):
            """
            Used to create a nested list of images, with each nesting level corresponding to a particular axis.
            Each time this function is recursively called, it will descend one level deeper. The recursive calls
            can be thought of as a tree structure, where each depth level of the tree is one axis, and it has a
            branch (i.e. a subsequent call of recurse_axes) corresponding to every value of the next axis.

            :param loop_axes: The remaining axes that need to be looped over (i.e. the innermost ones)
            :param point_axes: The axes that have been assigned values already by a previous call of this function

            :return: Nested list of images
            """
            if len(loop_axes.values()) == 0:
                # There are no more axes over which to loop (i.e. we're at the maximum depth), so return
                # the image defined by point_axes, or a blank image if it is undefined (so that the full
                # nested list will have the expected rectangular shape)
                if verbose:
                    print("\rAdding data chunk {} of {}".format(
                        self._count, total),
                          end="")
                self._count += 1
                if None not in point_axes.values() and self.has_image(
                        **point_axes):
                    recurse_axes.empty = False  # track that actual data was read
                    if stitched:
                        img = self.read_image(**point_axes, memmapped=True)
                        if self.half_overlap[0] != 0:
                            img = img[
                                self.half_overlap[0]:-self.half_overlap[0],
                                self.half_overlap[1]:-self.half_overlap[1], ]
                        return img
                    else:
                        return self.read_image(**point_axes, memmapped=True)
                else:
                    # return np.zeros((self.image_height, self.image_width), self.dtype)
                    return self._empty_tile
            else:
                # Still have axes over which to loop
                # do row and col first because it makes stitching faster
                if "row" in loop_axes.keys() and stitched:
                    axis = "row"
                elif "column" in loop_axes.keys() and stitched:
                    axis = "column"
                else:
                    # Take the next axis in the list that needs to be looped over
                    axis = list(loop_axes.keys())[0]

                # copy so multiple calls don't collide on the same data structure
                remaining_loop_axes = loop_axes.copy()
                if (axis == "row" or axis == "column") and stitched:
                    # do these both at once
                    del remaining_loop_axes["row"]
                    del remaining_loop_axes["column"]
                else:
                    # remove because this axis is now being assigned a point value
                    del remaining_loop_axes[axis]
                if (axis == "row" or axis == "column") and stitched:
                    # Do stitching along existing axis
                    # Stitch tiles acquired in a grid (i.e. data acquired by Micro-Magellan or in multi-res mode)
                    self.half_overlap = (self.overlap[0] // 2,
                                         self.overlap[1] // 2)

                    # get spatial layout of position indices
                    row_values = np.array(list(self.axes["row"]))
                    column_values = np.array(list(self.axes["column"]))

                    # make nested list of rows and columns
                    blocks = []
                    for row in row_values:
                        blocks.append([])
                        for column in column_values:
                            valed_axes = point_axes.copy()
                            if verbose:
                                print(
                                    "\rAdding data chunk {} of {}".format(
                                        self._count, total),
                                    end="",
                                )
                            valed_axes["row"] = row
                            valed_axes["column"] = column

                            blocks[-1].append(
                                da.stack(
                                    recurse_axes(remaining_loop_axes,
                                                 valed_axes)))

                    rgb = self.bytes_per_pixel == 3 and self.dtype == np.uint8
                    if rgb:
                        stitched_array = np.concatenate(
                            [
                                np.concatenate(
                                    row, axis=len(blocks[0][0].shape) - 2)
                                for row in blocks
                            ],
                            axis=len(blocks[0][0].shape) - 3,
                        )
                    else:
                        stitched_array = da.block(blocks)
                    return stitched_array
                else:
                    # Do stacking along a new axis (i.e. not stitching along an existing one)
                    blocks = []
                    # Loop through every value of the next axis (i.e. create new branches of the tree)
                    for val in loop_axes[axis]:
                        # Copy to avoid unexpected errors by multiple calls
                        valed_axes = point_axes.copy()
                        # Move this axis from one that needs to be looped over to one that has a discrete value.
                        valed_axes[axis] = val
                        blocks.append(
                            recurse_axes(remaining_loop_axes, valed_axes))
                    return blocks

        if axes is None:
            axes = self.axes.keys()
        axes_to_slice = kwargs
        axes_to_stack_or_stitch = {
            key: self.axes[key]
            for key in axes if key not in kwargs.keys()
        }

        recurse_axes.empty = True
        blocks = recurse_axes(axes_to_stack_or_stitch, axes_to_slice)
        if recurse_axes.empty:
            # No actual data in any of the tiles
            return None

        if verbose:
            print(
                "\rStacking tiles...         "
            )  # extra space otherwise there is no space after the "Adding data chunk {} {}"
        # import time
        # s = time.time()
        array = da.stack(blocks, allow_unknown_chunksizes=False)
        # e = time.time()
        # print(e - s)
        if verbose:
            print("\rDask array opened")
        # remove singleton axes
        array = da.squeeze(array)
        return array
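
A hedged toy version of the assembly step above: nested lists of lazy tiles are combined with da.block (for grid stitching) or da.stack (for a new axis), and da.squeeze drops whatever singleton axes remain.

import numpy as np
import dask.array as da

def tile(value):
    return da.from_array(np.full((4, 4), value), chunks=(4, 4))

blocks = [[tile(0), tile(1)],
          [tile(2), tile(3)]]                  # 2 x 2 grid of tiles

stitched = da.block(blocks)                    # shape (8, 8)
stacked = da.stack([stitched, stitched + 10])  # new leading axis, shape (2, 8, 8)

array = da.squeeze(stacked[:, None, ...])      # squeeze removes the singleton axis again
print(stitched.shape, stacked.shape, array.shape)   # (8, 8) (2, 8, 8) (2, 8, 8)
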
Code example #11
def make_regression(n_samples=100,
                    n_features=100,
                    n_informative=10,
                    n_targets=1,
                    bias=0.0,
                    effective_rank=None,
                    tail_strength=0.5,
                    noise=0.0,
                    shuffle=False,
                    coef=False,
                    random_state=None,
                    n_parts=1,
                    n_samples_per_part=None):
    """Generate a random regression problem.
    The input set can either be well conditioned (by default) or have a low
    rank-fat tail singular profile.

    The output is generated by applying a (potentially biased) random linear
    regression model with "n_informative" nonzero regressors to the previously
    generated input and some gaussian centered noise with some adjustable
    scale.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.
    n_features : int, optional (default=100)
        The number of features.
    n_informative : int, optional (default=10)
        The number of informative features, i.e., the number of features used
        to build the linear model used to generate the output.
    n_targets : int, optional (default=1)
        The number of regression targets, i.e., the dimension of the y output
        vector associated with a sample. By default, the output is a scalar.
    bias : float, optional (default=0.0)
        The bias term in the underlying linear model.
    effective_rank : int or None, optional (default=None)
        if not None:
            The approximate number of singular vectors required to explain most
            of the input data by linear combinations. Using this kind of
            singular spectrum in the input allows the generator to reproduce
            the correlations often observed in practice.
        if None:
            The input set is well conditioned, centered and gaussian with
            unit variance.
    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular values
        profile if "effective_rank" is not None.
    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.
    shuffle : boolean, optional (default=False)
        Shuffle the samples and the features.
    coef : boolean, optional (default=False)
        If True, the coefficients of the underlying linear model are returned.
    random_state : int, CuPy RandomState instance, Dask RandomState instance
                   or None (default)
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
    n_parts : int, optional (default=1)
        The number of parts of work.

    Returns
    -------
    X : Dask-CuPy array of shape [n_samples, n_features]
        The input samples.
    y : Dask-CuPy array of shape [n_samples] or [n_samples, n_targets]
        The output values.
    coef : Dask-CuPy array of shape [n_features]
           or [n_features, n_targets], optional
        The coefficient of the underlying linear model. It is returned only if
        coef is True.
    """
    n_informative = min(n_features, n_informative)
    rs = create_rs_generator(random_state)

    if n_samples_per_part is None:
        n_samples_per_part = max(1, int(n_samples / n_parts))

    if effective_rank is None:
        # Randomly generate a well conditioned input set
        X = rs.standard_normal(
            (n_samples, n_features),
            chunks=(n_samples_per_part, (n_informative,
                                         n_features - n_informative)))

    else:
        # Randomly generate a low rank, fat tail input set
        X = make_low_rank_matrix(n_samples=n_samples,
                                 n_features=n_features,
                                 effective_rank=effective_rank,
                                 tail_strength=tail_strength,
                                 random_state=rs,
                                 n_parts=n_parts)
        X = X.rechunk({
            0: n_samples_per_part,
            1: (n_informative, n_features - n_informative)
        })

    # Generate a ground truth model with only n_informative features being non
    # zeros (the other features are not correlated to y and should be ignored
    # by a sparsifying regularizers such as L1 or elastic net)

    ground_truth = 100.0 * rs.standard_normal(
        (n_informative, n_targets), chunks=(n_samples_per_part, -1))

    y = da.dot(X[:, :n_informative], ground_truth) + bias
    X = X.rechunk((None, -1))

    if n_informative != n_features:
        zeroes = 0.0 * rs.standard_normal(
            (n_features - n_informative, n_targets))
        ground_truth = da.concatenate([ground_truth, zeroes], axis=0)
        ground_truth = ground_truth.rechunk(-1)

    # Add noise
    if noise > 0.0:
        y += rs.normal(scale=noise, size=y.shape)

    # Randomly permute samples and features
    if shuffle:
        samples_indices = np.random.permutation(n_samples)
        X = X[samples_indices, :]
        y = y[samples_indices, :]

        features_indices = np.random.permutation(n_features)
        X = X[:, features_indices]
        ground_truth = ground_truth[features_indices, :]

    y = da.squeeze(y)

    if coef:
        ground_truth = da.squeeze(ground_truth)
        return X, y, ground_truth

    else:
        return X, y
Code example #12
File: regression.py Project: st071300/cuML
def make_regression(n_samples=100,
                    n_features=100,
                    n_informative=10,
                    n_targets=1,
                    bias=0.0,
                    effective_rank=None,
                    tail_strength=0.5,
                    noise=0.0,
                    shuffle=False,
                    coef=False,
                    random_state=None,
                    n_parts=1,
                    n_samples_per_part=None,
                    order='F',
                    dtype='float32',
                    client=None,
                    use_full_low_rank=True):
    """
    Generate a random regression problem.

    The input set can either be well conditioned (by default) or have a low
    rank-fat tail singular profile.

    The output is generated by applying a (potentially biased) random linear
    regression model with "n_informative" nonzero regressors to the previously
    generated input and some gaussian centered noise with some adjustable
    scale.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.
    n_features : int, optional (default=100)
        The number of features.
    n_informative : int, optional (default=10)
        The number of informative features, i.e., the number of features used
        to build the linear model used to generate the output.
    n_targets : int, optional (default=1)
        The number of regression targets, i.e., the dimension of the y output
        vector associated with a sample. By default, the output is a scalar.
    bias : float, optional (default=0.0)
        The bias term in the underlying linear model.
    effective_rank : int or None, optional (default=None)
        if not None:
            The approximate number of singular vectors required to explain most
            of the input data by linear combinations. Using this kind of
            singular spectrum in the input allows the generator to reproduce
            the correlations often observed in practice.

        if None:
            The input set is well conditioned, centered and gaussian with
            unit variance.

    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular values
        profile if "effective_rank" is not None.
    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.
    shuffle : boolean, optional (default=False)
        Shuffle the samples and the features.
    coef : boolean, optional (default=False)
        If True, the coefficients of the underlying linear model are returned.
    random_state : int, CuPy RandomState instance, Dask RandomState instance \
                   or None (default)
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
    n_parts : int, optional (default=1)
        The number of parts of work.
    order : str, optional (default='F')
        Row-major or Col-major
    dtype: str, optional (default='float32')
        dtype of generated data
    use_full_low_rank : boolean (default=True)
        Whether to use the entire dataset to generate the low rank matrix.
        If False, it creates a low rank covariance and uses the
        corresponding covariance to generate a multivariate normal
        distribution on the remaining chunks

    Returns
    -------
    X : Dask-CuPy array of shape [n_samples, n_features]
        The input samples.
    y : Dask-CuPy array of shape [n_samples] or [n_samples, n_targets]
        The output values.
    coef : Dask-CuPy array of shape [n_features] \
           or [n_features, n_targets], optional
        The coefficient of the underlying linear model. It is returned only if
        coef is True.

    Notes
    -----
    Known Performance Limitations:
     1. When `effective_rank` is set and `use_full_low_rank` is True, \
        we cannot generate order `F` by construction, and an explicit \
        transpose is performed on each part. This may cause memory to spike \
        (other parameters make order `F` by construction)
     2. When `n_targets > 1` and `order = 'F'` as above, we have to \
        explicity transpose the `y` array. If `coef = True`, then we also \
        explicity transpose the `ground_truth` array
     3. When `shuffle = True` and `order = F`, there are memory spikes to \
        shuffle the `F` order arrays

    .. note:: If out-of-memory errors are encountered in any of the above
        configurations, try increasing the `n_parts` parameter.
    """

    client = get_client(client=client)

    n_informative = min(n_features, n_informative)
    rs = _create_rs_generator(random_state)

    if n_samples_per_part is None:
        n_samples_per_part = max(1, int(n_samples / n_parts))

    data_chunksizes = [n_samples_per_part] * n_parts

    data_chunksizes[-1] += (n_samples % n_parts)

    data_chunksizes = tuple(data_chunksizes)

    if effective_rank is None:
        # Randomly generate a well conditioned input set
        if order == 'F':
            X = _f_order_standard_normal(client, rs, data_chunksizes,
                                         n_features, dtype)

        elif order == 'C':
            X = rs.standard_normal((n_samples, n_features),
                                   chunks=(data_chunksizes, -1),
                                   dtype=dtype)

    else:
        # Randomly generate a low rank, fat tail input set
        if use_full_low_rank:
            X = make_low_rank_matrix(n_samples=n_samples,
                                     n_features=n_features,
                                     effective_rank=effective_rank,
                                     tail_strength=tail_strength,
                                     random_state=rs,
                                     n_parts=n_parts,
                                     n_samples_per_part=n_samples_per_part,
                                     dtype=dtype)

            X = X.rechunk({0: data_chunksizes, 1: -1})
        else:
            seed = int(rs.randint(n_samples).compute())
            covar = _make_low_rank_covariance(client, n_features,
                                              effective_rank, tail_strength,
                                              seed, n_parts,
                                              n_samples_per_part, dtype)
            X = _data_from_multivariate_normal(client, rs, covar,
                                               data_chunksizes, n_features,
                                               dtype)

        X = _convert_to_order(client, X, data_chunksizes, order, n_features,
                              dtype)

    # Generate a ground truth model with only n_informative features being non
    # zeros (the other features are not correlated to y and should be ignored
    # by a sparsifying regularizers such as L1 or elastic net)
    ground_truth = 100.0 * rs.standard_normal((n_informative, n_targets),
                                              chunks=(n_samples_per_part, -1),
                                              dtype=dtype)

    y = da.dot(X[:, :n_informative], ground_truth) + bias

    if n_informative != n_features:
        zeroes = 0.0 * rs.standard_normal(
            (n_features - n_informative, n_targets), dtype=dtype)
        ground_truth = da.concatenate([ground_truth, zeroes], axis=0)

    ground_truth = ground_truth.rechunk(-1)

    # Add noise
    if noise > 0.0:
        y += rs.normal(scale=noise, size=y.shape, dtype=dtype)

    # Randomly permute samples and features
    if shuffle:
        features_indices = np.random.permutation(n_features)
        X, y = _shuffle(client, rs, X, y, data_chunksizes, n_features,
                        features_indices, n_targets, dtype)

        ground_truth = ground_truth[features_indices, :]

    y = da.squeeze(y)

    if order == 'F' and n_targets > 1:
        y = _convert_to_order(client, y, y.chunks[0], order, n_targets, dtype)
        if coef:
            ground_truth = _convert_to_order(client, ground_truth,
                                             ground_truth.chunks[0], order,
                                             n_targets, dtype)

    if coef:
        ground_truth = da.squeeze(ground_truth)
        return X, y, ground_truth

    else:
        return X, y
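
A hedged usage sketch of the final da.squeeze in both make_regression variants above: with a single target, y comes out of the matrix product as an (n_samples, 1) column and squeeze collapses it to 1-D.

import numpy as np
import dask.array as da

X = da.random.random((100, 5), chunks=(25, 5))
coef = da.from_array(np.random.rand(5, 1), chunks=(5, 1))   # n_targets == 1

y = da.dot(X, coef)          # shape (100, 1)
y = da.squeeze(y)            # shape (100,)
print(X.shape, y.shape)      # (100, 5) (100,)
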
Code example #13
def MDD(G,
        d,
        dt=0.004,
        dr=1.,
        nfmax=None,
        wav=None,
        twosided=True,
        adjoint=False,
        dottest=False,
        saveGt=False,
        add_negative=True,
        **kwargs_cgls):
    r"""Multi-dimensional deconvolution.

    Solve multi-dimensional deconvolution problem using the
    :py:func:`pylops_distributed.optimization.cg.cgls` iterative solver.

    Parameters
    ----------
    G : :obj:`dask.array.ndarray`
        Multi-dimensional convolution kernel in frequency domain of size
        :math:`[n_{f,max} \times n_s \times n_r]`
    d : :obj:`dask.array.ndarray`
        Data in time domain :math:`[n_t \times n_s (\times n_vs)]` if
        ``twosided=False`` or ``twosided=True`` and ``add_negative=True``
        (with only positive times) or size
        :math:`[2*n_t-1 \times n_s (\times n_vs)]` if ``twosided=True``
    dt : :obj:`float`, optional
        Sampling of time integration axis
    dr : :obj:`float`, optional
        Sampling of receiver integration axis
    nfmax : :obj:`int`, optional
        Index of max frequency to include in deconvolution process
    wav : :obj:`numpy.ndarray`, optional
        Wavelet to convolve to the inverted model and psf
        (must be centered around its index in the middle of the array).
        If ``None``, the outputs of the inversion are returned directly.
    twosided : :obj:`bool`, optional
        MDC operator and data both negative and positive time (``True``)
        or only positive (``False``)
    add_negative : :obj:`bool`, optional
        Add negative side to MDC operator and data (``True``) or not
        (``False``); in the latter case the operator and data must already be
        provided with both positive and negative sides. To be used only with
        ``twosided=True``.
    adjoint : :obj:`bool`, optional
        Compute and return adjoint(s)
    dottest : :obj:`bool`, optional
        Apply dot-test
    saveGt : :obj:`bool`, optional
        Save ``G`` and ``G^H`` to speed up the computation of adjoint of
        :class:`pylops_distributed.signalprocessing.Fredholm1` (``True``) or
        create ``G^H`` on-the-fly (``False``) Note that ``saveGt=True`` will be
        faster but double the amount of required memory
    **kwargs_cgls
        Arbitrary keyword arguments for
        :py:func:`pylops_distributed.optimization.cg.cgls` solver

    Returns
    -------
    minv : :obj:`dask.array.ndarray`
        Inverted model of size :math:`[n_t \times n_r (\times n_{vs})]`
        for ``twosided=False`` or
        :math:`[2*n_t-1 \times n_r (\times n_vs)]` for ``twosided=True``
    madj : :obj:`dask.array.ndarray`
        Adjoint model of size :math:`[n_t \times n_r (\times n_{vs})]`
        for ``twosided=False`` or
        :math:`[2*n_t-1 \times n_r (\times n_r) ]` for ``twosided=True``

    See Also
    --------
    MDC : Multi-dimensional convolution

    Notes
    -----
    Refer to :class:`pylops.waveeqprocessing.MDD` for implementation
    details. Note that this implementation is currently missing the ``wav``
    and ``causality_precond=False`` options.

    """
    nf, ns, nr = G.shape
    nt = d.shape[0]
    if len(d.shape) == 2:
        nv = 1
    else:
        nv = d.shape[2]
    if twosided:
        if add_negative:
            nt2 = 2 * nt - 1
        else:
            nt2 = nt
            nt = (nt2 + 1) // 2
        nfmax_allowed = int(np.ceil((nt2 + 1) / 2))
    else:
        nt2 = nt
        nfmax_allowed = nt

    # Fix nfmax to be at maximum equal to half of the size of fft samples
    if nfmax is None or nfmax > nfmax_allowed:
        nfmax = nfmax_allowed
        logging.warning('nfmax set equal to ceil[(nt+1)/2=%d]' % nfmax)

    # Add negative part to data and model
    if twosided and add_negative:
        if nv == 1:
            d = da.concatenate((da.zeros((nt - 1, ns)), d), axis=0)
        else:
            d = da.concatenate((da.zeros((nt - 1, ns, nv)), d), axis=0)
        d = d.rechunk(d.shape)

    # Define MDC linear operator
    MDCop = MDC(G, nt2, nv=nv, dt=dt, dr=dr, twosided=twosided, saveGt=saveGt)
    if dottest:
        Dottest(MDCop, nt2 * ns * nv, nt2 * nr * nv, verb=True)

    # Adjoint
    if adjoint:
        madj = MDCop.H * d.flatten()
        madj = da.squeeze(madj.reshape(nt2, nr, nv))

    # Inverse
    minv = cgls(MDCop, d.flatten(), **kwargs_cgls)[0]
    minv = da.squeeze(minv.reshape(nt2, nr, nv))
    #if wav is not None:
    #    minv = sp_convolve1d(minv, wav, axis=-1)

    if adjoint:
        return minv, madj
    else:
        return minv
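
A hedged sketch of the reshape-and-squeeze step at the end of MDD: the solver returns a flattened vector, which is reshaped to (nt2, nr, nv); with a single virtual source (nv == 1) the trailing axis is dropped.

import numpy as np
import dask.array as da

nt2, nr, nv = 10, 6, 1
minv_flat = da.from_array(np.random.rand(nt2 * nr * nv), chunks=nt2 * nr)

minv = da.squeeze(minv_flat.reshape(nt2, nr, nv))
print(minv.shape)   # (10, 6) instead of (10, 6, 1)
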
Code example #14
def segment_from_directory(
        directory, 
        suffix,
        affinities_channels, 
        centroids_channel, 
        thresholding_channel, 
        scale = (4, 1, 1),
        w_scale=None, 
        compactness=0.,
        display=True, 
        validation=False, 
        **kwargs
        #
    ):
    images, _, output, GT = get_dataset(directory, 
                                        GT=True, 
                                        validation=validation)
    images = da.squeeze(images)
    print(output.shape)
    segmentations = []
    masks = []
    scores = {'GT | Output' : [], 'Output | GT' : []}
    for i in range(output.shape[0]):
        gt = GT[i].compute()
        seg, _, mask = segment_output_image(
                output[i], 
                affinities_channels, 
                centroids_channel, 
                thresholding_channel, 
                scale=w_scale, 
                compactness=0.)
        vi = variation_of_information(gt, seg)
        scores['GT | Output'].append(vi[0])
        scores['Output | GT'].append(vi[1])
        seg = da.from_array(seg)
        segmentations.append(seg)
        masks.append(mask)
    segmentations = da.stack(segmentations)
    masks = da.stack(masks)
    # Save the VI data
    scores = pd.DataFrame(scores)
    if validation:
        s = 'validation_VI.csv'
    else:
        s = '_VI.csv'
    s_path = os.path.join(directory, suffix + s)
    scores.to_csv(s_path)
    gt_o = scores['GT | Output'].mean()
    o_gt = scores['Output | GT'].mean()
    print(f'Conditional entropy H(GT|Output): {gt_o}')
    print(f'Conditional entropy H(Output|GT): {o_gt}')
    if display:
        # Now Display
        z_affs = output[:, affinities_channels[0], ...]
        y_affs = output[:, affinities_channels[1], ...]
        x_affs = output[:, affinities_channels[2], ...]
        c = output[:, thresholding_channel, ...]
        cl = output[:, centroids_channel, ...]
        v_scale = [1] * len(images.shape)
        v_scale[-3:] = scale
        print(images.shape, v_scale, z_affs.shape, masks.shape)
        v = napari.Viewer()
        v.add_image(images, name='Input images', blending='additive', visible=True, scale=v_scale)
        v.add_image(c, name='Thresholding channel', blending='additive', visible=False, scale=v_scale)
        v.add_image(cl, name='Centroids channel', blending='additive', visible=False, scale=v_scale)
        v.add_image(z_affs, name='z affinities', blending='additive', visible=False, scale=v_scale, 
                    colormap='bop purple')
        v.add_image(y_affs, name='y affinities', blending='additive', visible=False, scale=v_scale, 
                    colormap='bop orange')
        v.add_image(x_affs, name='x affinities', blending='additive', visible=False, scale=v_scale, 
                    colormap='bop blue')
        v.add_labels(masks, name='Masks', blending='additive', visible=False, scale=v_scale)
        v.add_labels(GT, name='Ground truth', blending='additive', visible=False, scale=v_scale)
        v.add_labels(segmentations, name='Segmentations', blending='additive', visible=True, 
                     scale=v_scale)
        napari.run()
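A reduced sketch of the stacking pattern used above, where each frame is segmented eagerly and the results are wrapped back into a single dask array (shapes are hypothetical; the scoring and napari display are omitted):

import numpy as np
import dask.array as da

# Per-frame results computed in NumPy are re-wrapped with da.from_array and
# stacked along a new leading axis, mirroring the segmentations/masks handling above.
frames = [np.zeros((8, 8), dtype=np.uint32) for _ in range(4)]
stacked = da.stack([da.from_array(f, chunks=(8, 8)) for f in frames])
assert stacked.shape == (4, 8, 8)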
Code example #15
File: linearfilters.py Project: xy6g13/xscale
    def boundary_weights(self,
                         mode='reflect',
                         mask=None,
                         drop_dims=[],
                         compute=False):
        """
		Compute the boundary weights

		Parameters
		----------
		mode : {'reflect', 'periodic', 'any-constant'}, optional
			The mode parameter determines how the array borders are handled.
			Default is 'reflect'.
		mask : array-like, optional
			Specify the mask, if None the mask is inferred from missing values
		drop_dims : list, optional
			Specify dimensions along which the weights do not need to be
			computed
		compute : bool, optional
			If True, the computation is performed after the dask graph has
			been made. If False, only the dask graph is made and the computation
			will be performed later on.

		Returns
		-------
		weights : xarray.DataArray
			Return a DataArray containing the weights
		"""
        if mode == 'periodic':
            mode_conv = 'wrap'
        else:
            mode_conv = mode
        # Normalize coefficients
        coeffs = self.coefficients / self.coefficients.sum()
        if drop_dims:
            new_coeffs = da.squeeze(
                coeffs, axis=[self.obj.get_axis_num(di) for di in drop_dims])
        else:
            new_coeffs = coeffs
        new_obj = self.obj.isel(**{di: 0 for di in drop_dims}).squeeze()
        #depth = {new_obj.get_axis_num(di): self.order[di] // 2
        #         for di in self.dims}
        boundary = {self._obj.get_axis_num(di): mode for di in self.dims}
        if mask is None:
            mask = da.notnull(new_obj.data)
        conv = lambda x: im.convolve(x, new_coeffs, mode=mode_conv)
        weights = mask.astype(float).map_overlap(conv,
                                                 depth=self._depth,
                                                 boundary=boundary,
                                                 trim=True)

        res = xr.DataArray(mask * weights,
                           dims=new_obj.dims,
                           coords=new_obj.coords,
                           name='boundary_weights')
        res = res.where(res != 0)
        if compute:
            with ProgressBar():
                out = res.compute()
        else:
            out = res
        return out
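The drop_dims branch above squeezes the filter coefficients along the axes of the dropped dimensions before building the convolution kernel. A minimal standalone sketch of that squeeze step, assuming a 3-D coefficient array with a singleton leading axis (no xscale dependency):

import dask.array as da

# Hypothetical normalized coefficients with a singleton leading (e.g. time) axis.
coeffs = da.ones((1, 5, 5), chunks=(1, 5, 5))
coeffs = coeffs / coeffs.sum()

drop_axes = (0,)  # axis numbers of the dimensions listed in drop_dims
kernel = da.squeeze(coeffs, axis=drop_axes)
assert kernel.shape == (5, 5)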
Code example #16
def main(argv=None):

    #     cluster = LocalCluster(dashboard_address=None)
    #     client = Client(cluster, memory_limit='{}GB'.format(FLAGS.memory_limit),
    #                     processes=False)

    K.set_floatx('float32')

    chunk_size = FLAGS.chunk_size

    # Read data set
    hdf5_file = h5py.File(FLAGS.data_file, 'r')
    images, labels, _ = hdf52dask(hdf5_file,
                                  FLAGS.group,
                                  chunk_size,
                                  shuffle=FLAGS.shuffle,
                                  seed=FLAGS.seed,
                                  pct=FLAGS.pct)
    n_images = images.shape[0]
    n_batches = int(np.ceil(n_images / float(FLAGS.batch_size)))

    # Data augmentation parameters
    daug_params_file = get_daug_scheme_path(FLAGS.daug_params, FLAGS.data_file)
    daug_params = yaml.load(open(daug_params_file, 'r'),
                            Loader=yaml.FullLoader)
    nodaug_params_file = get_daug_scheme_path('nodaug.yml', FLAGS.data_file)
    nodaug_params = yaml.load(open(nodaug_params_file, 'r'),
                              Loader=yaml.FullLoader)

    # Initialize the network model
    model_filename = FLAGS.model
    model = load_model(model_filename)

    # Print the model summary
    model.summary()

    # Get relevant layers
    if FLAGS.store_input:
        layer_regex = '({}|.*input.*)'.format(FLAGS.layer_regex)
    else:
        layer_regex = FLAGS.layer_regex

    layers = [
        layer.name for layer in model.layers
        if re.compile(layer_regex).match(layer.name)
    ]

    # Create batch generators
    n_daug_rep = FLAGS.n_daug_rep
    n_diff_per_batch = int(FLAGS.batch_size / n_daug_rep)
    image_gen_daug = get_generator(images, **daug_params)
    batch_gen_daug = batch_generator(image_gen_daug,
                                     images,
                                     labels,
                                     batch_size=n_diff_per_batch,
                                     aug_per_im=n_daug_rep,
                                     shuffle=False)
    image_gen_nodaug = get_generator(images, **nodaug_params)
    batch_gen_nodaug = batch_generator(image_gen_nodaug,
                                       images,
                                       labels,
                                       FLAGS.batch_size,
                                       aug_per_im=1,
                                       shuffle=False)

    # Outputs
    if FLAGS.output_dir == '-1':
        FLAGS.output_dir = os.path.dirname(FLAGS.model)

    output_hdf5 = h5py.File(
        os.path.join(FLAGS.output_dir, FLAGS.output_mse_matrix_hdf5), 'w')
    output_pickle = os.path.join(FLAGS.output_dir, FLAGS.output_pickle)
    df_init_idx = 0
    df = pd.DataFrame()

    # Iterate over the layers
    for layer_idx, layer_name in enumerate(layers):

        # Reload the model
        if layer_idx > 0:
            K.clear_session()
            model = load_model(model_filename)

        layer = model.get_layer(layer_name)

        # Rename input layer
        if re.compile('.*input.*').match(layer_name):
            layer_name = 'input'

        hdf5_layer = output_hdf5.create_group(layer_name)

        activation_function = K.function(
            [model.input, K.learning_phase()], [layer.output])

        print('\nComputing pairwise similarity at layer {}'.format(layer_name))

        # Compute activations of original data (without augmentation)
        a_nodaug_da = get_activations(activation_function, batch_gen_nodaug)
        a_nodaug_da = da.squeeze(a_nodaug_da)
        a_nodaug_da = da.rechunk(a_nodaug_da,
                                 (chunk_size, ) + (a_nodaug_da.shape[1:]))
        dim_activations = a_nodaug_da.shape[1]

        # Compute matrix of pairwise similarities
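        # The expansion ||a_i - a_j||^2 = ||a_i||^2 - 2 * a_i . a_j + ||a_j||^2
        # is used below: r holds the squared norms as a column vector, the dot
        # product supplies the cross terms, and dividing by dim_activations
        # turns squared distances into per-feature MSE values.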
        r = da.reshape(da.sum(da.square(a_nodaug_da), axis=1), (-1, 1))
        mse_matrix = (r - 2 * da.dot(a_nodaug_da,
                                     da.transpose(a_nodaug_da)) \
                     + da.transpose(r)) / dim_activations

        # Compute activations with augmentation
        a_daug_da = get_activations(activation_function, batch_gen_daug)
        a_daug_da = da.rechunk(a_daug_da, (chunk_size, dim_activations, 1))

        # Compute similarity of augmentations with respect to the
        # activations of the original data
        a_nodaug_da = da.repeat(da.reshape(a_nodaug_da,
                                           a_nodaug_da.shape + (1, )),
                                repeats=n_daug_rep,
                                axis=2)
        a_nodaug_da = da.rechunk(a_nodaug_da, (chunk_size, dim_activations, 1))
        mse_daug = da.mean(da.square(a_nodaug_da - a_daug_da), axis=1)

        # Compute invariance score
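        # Each augmented sample's MSE against its original is normalised by the
        # mean pairwise MSE of that original with all samples (mse_sum / n_images),
        # so invariance values close to 1 mean the activation barely changes
        # under augmentation.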
        mse_sum = da.repeat(da.reshape(da.sum(mse_matrix, axis=1),
                                       (n_images, 1)),
                            repeats=n_daug_rep,
                            axis=1)
        mse_sum = da.rechunk(mse_sum, (chunk_size, 1))
        invariance = 1 - n_images * da.divide(mse_daug, mse_sum)

        print('Dimensionality activations: {}x{}x{}'.format(
            n_images, dim_activations, n_daug_rep))

        # Store HDF5 file
        if FLAGS.output_mse_matrix_hdf5:
            mse_matrix_ds = hdf5_layer.create_dataset(
                'mse_matrix',
                shape=mse_matrix.shape,
                chunks=mse_matrix.chunksize,
                dtype=K.floatx())
            mse_daug_ds = hdf5_layer.create_dataset('mse_daug',
                                                    shape=mse_daug.shape,
                                                    chunks=mse_daug.chunksize,
                                                    dtype=K.floatx())
            invariance_ds = hdf5_layer.create_dataset(
                'invariance',
                shape=invariance.shape,
                chunks=invariance.chunksize,
                dtype=K.floatx())
            time_init = time()
            with ProgressBar(dt=1):
                da.store([mse_matrix, mse_daug, invariance],
                         [mse_matrix_ds, mse_daug_ds, invariance_ds])
            time_end = time()
            print('Elapsed time: {}'.format(time_end - time_init))

            invariance = np.ravel(
                np.asarray(output_hdf5[layer_name]['invariance']))
        else:
            time_init = time()
            invariance = da.ravel(invariance).compute()
            time_end = time()
            print('Elapsed time: {}'.format(time_end - time_init))

        # Update pandas data frame for plotting
        df_end_idx = df_init_idx + n_images * n_daug_rep
        d = pd.DataFrame(
            {
                'Layer': layer_name,
                'sample': np.repeat(np.arange(n_images), n_daug_rep),
                'n_daug': np.tile(np.arange(n_daug_rep), n_images),
                'invariance': invariance
            },
            index=np.arange(df_init_idx, df_end_idx).tolist())
        df = df.append(d)
        df_init_idx = df_end_idx

    pickle.dump(df, open(output_pickle, 'wb'))
    output_hdf5.close()