Example #1
def coarsen(f):
    '''
    Create data pyramid.
    '''
    grid = f['resolutions']['1']['values']
    top_n = grid.shape[0]
    tile_size = 256

    max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
    max_width = tile_size * 2**max_zoom

    chunk_size = tile_size * 16
    curr_size = grid.shape
    dask_dset = da.from_array(grid, chunks=(chunk_size, chunk_size))

    r = f['resolutions']
    curr_resolution = 1

    while curr_resolution < 2**max_zoom:
        curr_size = tuple(np.array(curr_size) / 2)
        print('coarsening')
        curr_resolution *= 2

        print("curr_size:", curr_size)
        g = r.create_group(str(curr_resolution))
        values = g.require_dataset('values',
                                   curr_size,
                                   dtype='f4',
                                   compression='lzf',
                                   fillvalue=np.nan)

        dask_dset = dask_dset.rechunk((chunk_size, chunk_size))
        dask_dset = da.coarsen(np.nansum, dask_dset, {0: 2, 1: 2})
        da.store(dask_dset, values)
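Stripped of the file-specific setup, the pattern this example relies on — coarsen by 2x2 block sums, then stream the result into an HDF5 dataset with da.store — can be sketched standalone (file and dataset names here are made up):

import dask.array as da
import h5py
import numpy as np

# Hypothetical 1024x1024 base layer, chunked to match the tile grid
grid = da.random.random((1024, 1024), chunks=(256, 256))
halved = da.coarsen(np.nansum, grid, {0: 2, 1: 2})  # 512x512 aggregate

with h5py.File("pyramid.h5", "w") as f:
    dset = f.create_dataset("values", shape=halved.shape, dtype="f4")
    da.store(halved, dset)  # writes block by block, nothing fully materialised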
Example #2
 def _vis_xformer(index):
     """
     Transform katdal visibilities indexed by ``index``
     into AIPS visibilities.
     """
     if isinstance(self._katds.vis, DaskLazyIndexer):
         arrays = [
             self._katds.vis, self._katds.weights, self._katds.flags
         ]
         vis, weights, flags = [
             dask_getitem(array.dataset, np.s_[index, :, :])
             for array in arrays
         ]
     else:
         vis = da.from_array(self._katds.vis[index])
         weights = da.from_array(self._katds.weights[index])
         flags = da.from_array(self._katds.flags[index])
     # Apply flags by negating weights
     weights = da.where(flags, -32767.0, weights)
     # Split complex vis dtype into real and imaginary parts
     vis_dtype = vis.dtype.type(0).real.dtype
     vis = vis.view(vis_dtype).reshape(vis.shape + (2, ))
     out_array = np.empty(weights.shape + (3, ), dtype=vis_dtype)
     da.store([vis, weights], [out_array[..., 0:2], out_array[..., 2]],
              lock=False)
     return out_array
Example #3
    def store(self, dask, raster_path, **kwargs):
        """
        Store a computed dask array into a new raster path. The format of the output raster is interpreted by GDAL,
        and additional creation options are implicitly added for a GeoTiff output to make it cloud optimized.

        .. Note::
                The dask should have been created using the same Mosaic instance that is used to store it. Also, 
                if the dask graph includes subsequent reductions or slicing, the dask may not fit the mosaic spatial
                definition.

        :param dask.Array dask: Input dask array
        :param str raster_path: Path of the output raster to create.

        :param kwargs: Used for additional creation options, ex. { 'BIGTIFF': 'YES' }
        """
        raster = create_raster_source(raster_path, self.top, self.left,
                                      self.shape, self.csx, self.csy, self.sr,
                                      self.dtype, self.nodata, self.chunks,
                                      **kwargs)

        da.store([dask.reshape(self.shape).rechunk(self.chunk_tuple)],
                 [raster])

        # Create internal overviews
        overview_resampling_method = kwargs.get('overview_resampling_method',
                                                'nearest')
        cmd = f'gdaladdo -r {overview_resampling_method} "{raster_path}"'
        subprocess.call(cmd, shell=True)
Example #4
def coarsen(f, type, tile_size=256):
    '''
    Create data pyramid.
    '''
    grid = f['resolutions']['1'][type]
    top_n = grid.shape[0]
    max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))

    chunk_size = tile_size * 16
    curr_size = grid.shape
    dask_dset = da.from_array(grid, chunks=(chunk_size, chunk_size))

    r = f['resolutions']
    curr_resolution = 1

    while curr_resolution < 2 ** max_zoom:
        curr_size = tuple(np.array(curr_size) / 2)
        print('coarsening')
        curr_resolution *= 2

        print("curr_size:", curr_size)
        group_name = '{}{}'.format(
            curr_resolution, '' if type == 'values' else '-' + type
        )
        g = r.create_group(group_name)
        values = g.require_dataset(type, curr_size, dtype='f4',
                                   compression='lzf', fillvalue=np.nan)

        dask_dset = dask_dset.rechunk((chunk_size, chunk_size))
        dask_dset = da.coarsen(np.nansum, dask_dset, {0: 2, 1: 2})
        da.store(dask_dset, values)
Example #5
    def _save_in_hdf5_object(self, f, tag="tomo"):
        if "class_name" not in f.attrs.keys():
            f.attrs["class_name"] = self.__class__.__name__
            f.attrs["module_name"] = self.__module__
        if "params" not in f.keys() and self.params is not None:
            self.params._save_as_hdf5(hdf5_parent=f)

        if tag in f:
            grp = f[tag]
        else:
            grp = f.create_group(tag)

        grp.attrs["class_name"] = self.__class__.__name__
        grp.attrs["module_name"] = self.__module__
        for attr in self._attrs_to_save:
            grp.attrs[attr] = getattr(self, attr)

        for k in self._keys_to_save:
            data = getattr(self, k)
            print(f"Saving {type(data)} {k}...")
            if isinstance(data, da.core.Array):
                dataset = grp.require_dataset(f"/{tag}/{k}",
                                              shape=data.shape,
                                              dtype=data.dtype)
                da.store(data, dataset)
            else:
                grp.create_dataset(k, data=data)
Example #6
def copy(source: GreyOrdinates, target: GreyOrdinates):
    """
    Copies information from source to target.
    """
    if source.data.shape != target.data.shape:
        raise ValueError("Source and target shape do not match")

    if isinstance(target.data, zarr.Array):
        target.data[:] = source.data
    else:
        chunks = getattr(target.data, 'chunks',
                         getattr(source.data, 'chunks', 'auto'))
        for dataset in target.data, source.data:
            if not hasattr(dataset, 'chunks'):
                if chunks == 'auto':
                    chunks = [1, 1]
                else:
                    chunks = list(chunks)
                chunks[np.argmin(dataset.strides)] = None
        logger.info(
            f"Adopted chunk size: {tuple(chunks)} for CIFTI with shape {tuple(source.data.shape)}"
        )

        data = source.as_dask(tuple(chunks))
        da.store(data, target.data)
Example #7
def write_raster(path, array, **kwargs):
    """Write a dask array to a raster file

    If the array is 2D, it is written to band 1.
    If the array is 3D, each slice is written to its own band.

    Arguments:
        path {string} -- path of raster to write
        array {dask.array.Array} -- band array
        kwargs {dict} -- keyword arguments to delegate to rasterio.open

    Examples:
        # Write a single band raster
        >>> red_band = read_raster_band("test.tif", band=1)
        >>> write_raster("new.tif", red_band)

        # Write a multiband raster
        >>> img = read_raster("test.tif")
        >>> new_img = process(img)
        >>> write_raster("new.tif", new_img)

    """
    if len(array.shape) != 2 and len(array.shape) != 3:
        raise TypeError('invalid shape (must be either 2d or 3d)')

    if is_dask_collection(array):
        with RasterioDataset(path, 'w', **kwargs) as dst:
            da.store(array, dst, lock=True)
    else:
        with rasterio.open(path, 'w', **kwargs) as dst:
            if len(array.shape) == 2:
                dst.write(array, 1)
            else:
                dst.write(array)
Example #8
def overwrite_dataset(group, data, key, signal_axes=None, **kwds):
    if signal_axes is None:
        chunks = True
    else:
        chunks = get_signal_chunks(data.shape, data.dtype, signal_axes)

    maxshape = tuple(None for _ in data.shape)

    got_data = False
    while not got_data:
        try:
            these_kwds = kwds.copy()
            these_kwds.update(dict(shape=data.shape,
                                   dtype=data.dtype,
                                   exact=True,
                                   maxshape=maxshape,
                                   chunks=chunks,
                                   shuffle=True,))

            dset = group.require_dataset(key, **these_kwds)
            got_data = True
        except TypeError:
            # if the shape or dtype/etc do not match,
            # we delete the old one and create new in the next loop run
            del group[key]
    if dset == data:
        # just a reference to already created thing
        pass
    else:
        if isinstance(data, da.Array):
            da.store(data.rechunk(dset.chunks), dset)
        else:
            da.store(da.from_array(data, chunks=dset.chunks), dset)
Example #9
    def get(cls, arrays, keep, out=None):
        """Extract several arrays from the underlying dataset.

        This is a variant of :meth:`__getitem__` that pulls from several arrays
        jointly. This can be significantly more efficient if intermediate dask
        nodes can be shared.

        Parameters
        ----------
        arrays : list of :class:`DaskLazyIndexer`
            Arrays to index
        keep : NumPy index expression
            Second-stage index as a valid index or slice specification
            (supports arbitrary slicing or advanced indexing on any dimension)
        out : list of :class:`np.ndarray`
            If specified, output arrays in which to store results. It must be
            the same length as `arrays` and each array must have the
            appropriate shape and dtype.

        Returns
        -------
        out : sequence of :class:`numpy.ndarray`
            Extracted output array (computed from the final dask version)
        """
        kept = [dask_getitem(array.dataset, keep) for array in arrays]
        # Workaround for https://github.com/dask/dask/issues/3595
        # This is equivalent to da.compute(kept), but does not allocate
        # excessive memory.
        if out is None:
            out = [np.empty(array.shape, array.dtype) for array in kept]
        da.store(kept, out, lock=False)
        return out
Example #10
def coarsen(f, tile_size=256):
    """
    Create data pyramid.
    """
    grid = f["resolutions"]["1"]["values"]
    top_n = grid.shape[0]

    max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
    max_width = tile_size * 2**max_zoom

    chunk_size = tile_size * 16
    curr_size = grid.shape
    dask_dset = da.from_array(grid, chunks=(chunk_size, chunk_size))

    r = f["resolutions"]
    curr_resolution = 1

    while curr_resolution < 2**max_zoom:
        curr_size = tuple(np.array(curr_size) / 2)
        print("coarsening")
        curr_resolution *= 2

        print("curr_size:", curr_size)
        g = r.create_group(str(curr_resolution))
        values = g.require_dataset("values",
                                   curr_size,
                                   dtype="f4",
                                   compression="lzf",
                                   fillvalue=np.nan)

        dask_dset = dask_dset.rechunk((chunk_size, chunk_size))
        dask_dset = da.coarsen(np.nansum, dask_dset, {0: 2, 1: 2})
        da.store(dask_dset, values)
Example #11
def load(dataset, indices, vis, weights, flags):
    """Load data from lazy indexers into existing storage.

    This is optimised for the MVF v4 case where we can use dask directly
    to eliminate one copy, and also load vis, flags and weights in parallel.
    In older formats it causes an extra copy.

    Parameters
    ----------
    dataset : :class:`katdal.DataSet`
        Input dataset, possibly with an existing selection
    indices : tuple
        Index expression for subsetting the dataset
    vis, weights, flags : array-like
        Outputs, which must have the correct shape and type
    """
    if isinstance(dataset.vis, DaskLazyIndexer):
        da.store([
            dataset.vis.dask_getitem(indices),
            dataset.weights.dask_getitem(indices),
            dataset.flags.dask_getitem(indices)
        ], [vis, weights, flags],
                 lock=False)
    else:
        vis[:] = dataset.vis[indices]
        weights[:] = dataset.weights[indices]
        flags[:] = dataset.flags[indices]
Example #12
def dataarray_to_gridded_product(ds, grid_def, overwrite_existing=False):
    info = ds.attrs.copy()
    info.pop("area", None)
    if ds.ndim == 3:
        # RGB composite
        if ds.shape[0] in [3, 4]:
            channels = ds.shape[0]
        else:
            # unpreferred array orientation
            channels = ds.shape[-1]
            ds = np.rollaxis(ds, 2)
    else:
        channels = 1

    if np.issubdtype(np.dtype(ds.dtype), np.floating):
        dtype = np.float32
    else:
        dtype = ds.dtype

    p2g_metadata = {
        "product_name": info["name"],
        "satellite": info["platform_name"].lower(),
        "instrument": info["sensor"].lower() if isinstance(info["sensor"], str)
        else list(info["sensor"])[0].lower(),
        "data_kind": info["standard_name"],
        "begin_time": info["start_time"],
        "end_time": info["end_time"],
        "fill_value": np.nan,
        # "swath_columns": cols,
        # "swath_rows": rows,
        "rows_per_scan": info["rows_per_scan"],
        "data_type": dtype,
        "channels": channels,
        "grid_definition": grid_def,
    }
    info.update(p2g_metadata)

    filename = info["name"] + ".dat"
    info["grid_data"] = filename
    if os.path.isfile(filename):
        if not overwrite_existing:
            LOG.error("Binary file already exists: %s" % (filename, ))
            raise RuntimeError("Binary file already exists: %s" % (filename, ))
        else:
            LOG.warning("Binary file already exists, will overwrite: %s",
                        filename)
    p2g_arr = np.memmap(filename, mode="w+", dtype=dtype, shape=ds.shape)
    da.store(ds.data.astype(dtype), p2g_arr)
    return containers.GriddedProduct(**info)
Example #13
 def __getitem__(self, keep):
     kept = self.dask_getitem(keep)
     # Workaround for https://github.com/dask/dask/issues/3595
     # This is equivalent to kept.compute(), but does not
     # allocate excessive memory.
     out = np.empty(kept.shape, kept.dtype)
     da.store(kept, out, lock=False)
     return out
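The same workaround works for any dask array, not just katdal indexers: pre-allocate a NumPy buffer and let da.store fill it in place of compute(). A minimal sketch:

import dask.array as da
import numpy as np

x = da.random.random((4096, 4096), chunks=(512, 512))
kept = x[::2]                           # any lazy selection
out = np.empty(kept.shape, kept.dtype)  # pre-allocated result buffer
da.store(kept, out, lock=False)         # fills `out` chunk by chunk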
Example #14
def write_data_variables(data_vars, nco):
    for name, variable in data_vars.items():
        try:
            with dask.set_options(get=dask.async.get_sync):
                da.store(variable.data, nco[name], lock=True)
        except ValueError:
            nco[name][:] = netcdf_writer.netcdfy_data(variable.values)
        nco.sync()
Example #15
 def _store_data(data, dset, group, key, chunks):
     if isinstance(data, da.Array):
         if data.chunks != dset.chunks:
             data = data.rechunk(dset.chunks)
         da.store(data, dset)
     elif data.flags.c_contiguous:
         dset.write_direct(data)
     else:
         dset[:] = data
Example #16
def overwrite_dataset(group, data, key, signal_axes=None, chunks=None, **kwds):
    if chunks is None:
        if isinstance(data, da.Array):
            # For lazy dataset, by default, we use the current dask chunking
            chunks = tuple([c[0] for c in data.chunks])
        else:
            # If signal_axes=None, use automatic h5py chunking, otherwise
            # optimise the chunking to contain at least one signal per chunk
            chunks = get_signal_chunks(data.shape, data.dtype, signal_axes)

    if np.issubdtype(data.dtype, np.dtype('U')):
        # Saving numpy unicode type is not supported in h5py
        data = data.astype(np.dtype('S'))
    if data.dtype == np.dtype('O'):
        # For saving ragged array
        # http://docs.h5py.org/en/stable/special.html#arbitrary-vlen-data
        group.require_dataset(key,
                              chunks,
                              dtype=h5py.special_dtype(vlen=data[0].dtype),
                              **kwds)
        group[key][:] = data[:]

    maxshape = tuple(None for _ in data.shape)

    got_data = False
    while not got_data:
        try:
            these_kwds = kwds.copy()
            these_kwds.update(dict(shape=data.shape,
                                   dtype=data.dtype,
                                   exact=True,
                                   maxshape=maxshape,
                                   chunks=chunks,
                                   shuffle=True,))

            # If chunks is True, the `chunks` attribute of `dset` below
            # contains the chunk shape guessed by h5py
            dset = group.require_dataset(key, **these_kwds)
            got_data = True
        except TypeError:
            # if the shape or dtype/etc do not match,
            # we delete the old one and create new in the next loop run
            del group[key]
    if dset == data:
        # just a reference to already created thing
        pass
    else:
        _logger.info(f"Chunks used for saving: {dset.chunks}")
        if isinstance(data, da.Array):
            if data.chunks != dset.chunks:
                data = data.rechunk(dset.chunks)
            da.store(data, dset)
        elif data.flags.c_contiguous:
            dset.write_direct(data)
        else:
            dset[:] = data
Example #17
 def sync(self):
     if self.sources:
         import dask.array as da
         import dask
         if StrictVersion(dask.__version__) > StrictVersion('0.8.1'):
             da.store(self.sources, self.targets, lock=threading.Lock())
         else:
             da.store(self.sources, self.targets)
         self.sources = []
         self.targets = []
Example #18
 def sync(self):
     if self.sources:
         import dask.array as da
         import dask
         if LooseVersion(dask.__version__) > LooseVersion('0.8.1'):
             da.store(self.sources, self.targets, lock=GLOBAL_LOCK)
         else:
             da.store(self.sources, self.targets)
         self.sources = []
         self.targets = []
Example #19
 def sync(self):
     if self.sources:
         import dask.array as da
         import dask
         if StrictVersion(dask.__version__) > StrictVersion('0.8.1'):
             da.store(self.sources, self.targets, lock=threading.Lock())
         else:
             da.store(self.sources, self.targets)
         self.sources = []
         self.targets = []
Example #20
    def sync(self):
        if self.sources:
            import dask.array as da
            import dask

            if StrictVersion(dask.__version__) > StrictVersion("0.8.1"):
                da.store(self.sources, self.targets, lock=GLOBAL_LOCK)
            else:
                da.store(self.sources, self.targets)
            self.sources = []
            self.targets = []
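The lock in these sync methods matters because targets such as h5py or netCDF4 files are not thread-safe: a single shared lock serialises the writes while the chunks are still computed in parallel. A minimal sketch with an HDF5 target:

import threading

import dask.array as da
import h5py

x = da.ones((1000, 1000), chunks=(250, 250))
with h5py.File("synced.h5", "w") as f:
    dset = f.create_dataset("x", shape=x.shape, dtype=x.dtype)
    da.store(x, dset, lock=threading.Lock())  # one write at a time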
Example #21
def area_to_swath_def(area, overwrite_existing=False):
    lons = area.lons
    lats = area.lats
    name = area.name
    name = name.replace(":", "")
    if lons.ndim == 1:
        rows, cols = lons.shape[0], 1
    else:
        rows, cols = lons.shape
    info = {
        "swath_name": name,
        "longitude": name + "_lon.dat",
        "latitude": name + "_lat.dat",
        "swath_rows": rows,
        "swath_columns": cols,
        "data_type": lons.dtype,
        "fill_value": np.nan,
    }
    if hasattr(area, "attrs"):
        info.update(area.attrs)

    # Write lons to disk
    filename = info["longitude"]
    if os.path.isfile(filename):
        if not overwrite_existing:
            LOG.error("Binary file already exists: %s" % (filename, ))
            raise RuntimeError("Binary file already exists: %s" % (filename, ))
        else:
            LOG.warning("Binary file already exists, will overwrite: %s",
                        filename)
    LOG.info("Writing longitude data to disk cache...")
    lon_arr = np.memmap(filename,
                        mode="w+",
                        dtype=lons.dtype,
                        shape=lons.shape)
    da.store(lons.data, lon_arr)

    # Write lats to disk
    filename = info["latitude"]
    if os.path.isfile(filename):
        if not overwrite_existing:
            LOG.error("Binary file already exists: %s" % (filename, ))
            raise RuntimeError("Binary file already exists: %s" % (filename, ))
        else:
            LOG.warning("Binary file already exists, will overwrite: %s",
                        filename)
    LOG.info("Writing latitude data to disk cache...")
    lat_arr = np.memmap(filename,
                        mode="w+",
                        dtype=lats.dtype,
                        shape=lats.shape)
    da.store(lats.data, lat_arr)
    return containers.SwathDefinition(**info)
Example #22
def get_scalar_outputs(dobj, nelem_in_yr, var_fcast, verif_data_attr,
                       out_types, use_dask=False, ):
    
    if use_dask:
        truth_data = dobj.reset_data(verif_data_attr)
    else:
        truth_data = getattr(dobj, verif_data_attr)
        
    truth_1yr = truth_data[nelem_in_yr:]
    truth_init = truth_data[:-nelem_in_yr]

    curr_var_output = {}

    for out_type in out_types:

        fcast_factor, verif_factor = get_scalar_factor(dobj, out_type,
                                                       verif_data_attr)

        var_out = var_fcast @ fcast_factor
        truth_init_out = truth_init @ verif_factor
        truth_1yr_out = truth_1yr @ verif_factor

        # Standardize PDO Index relative to truth output
        if out_type == 'pdo':
            truth_1yr_out, std_dev = _standardize_series(truth_1yr_out)
            var_out, _ = _standardize_series(var_out, std_dev=std_dev)
            truth_init_out, _ = _standardize_series(truth_init_out,
                                                    std_dev=std_dev)
            
        if use_dask:
            t_truth_1yr_out = np.empty(truth_1yr_out.shape)
            t_truth_init_out = np.empty(truth_init_out.shape)

            dask_vars = [truth_1yr_out, truth_init_out]
            dask_outs = [t_truth_1yr_out, t_truth_init_out]

            if ST.is_dask_array(var_out):
                t_var_out = np.empty(var_out.shape)
                dask_vars.append(var_out)
                dask_outs.append(t_var_out)

            da.store(dask_vars, dask_outs)

            truth_1yr_out = t_truth_1yr_out
            truth_init_out = t_truth_init_out

            if ST.is_dask_array(var_out):
                var_out = t_var_out

        curr_var_output[out_type] = {'fcast': var_out,
                                     't0': truth_init_out,
                                     '1yr': truth_1yr_out}
    return curr_var_output
Example #23
    def save_image(self, img, filename=None, compute=True, dtype=None, fill_value=None, **kwargs):
        filename = filename or self.get_filename(
            data_type=dtype_to_str(dtype), rows=img.data.shape[0], columns=img.data.shape[1], **img.data.attrs
        )

        data = self._prep_data(img.data, dtype, fill_value)

        logger.info("Saving product %s to binary file %s", img.data.attrs["p2g_name"], filename)
        dst = np.memmap(filename, shape=img.data.shape, dtype=dtype, mode="w+")
        if compute:
            da.store(data, dst)
            return
        return [[data], [dst]]
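When compute=False, the [[data], [dst]] pair returned above is presumably collected together with other products so everything can be written in one da.store pass; the generic shape of that pattern (the output file name is made up):

import dask.array as da
import numpy as np

# e.g. accumulated from several save_image(..., compute=False) calls
x = da.arange(1_000_000, chunks=100_000)
dst = np.memmap("x.dat", mode="w+", dtype=x.dtype, shape=x.shape)
sources, targets = [x], [dst]
da.store(sources, targets)  # one graph execution for all pending writes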
Example #24
def do_stack_task(config, task):
    global_attributes = config['global_attributes']
    global_attributes['history'] = get_history_attribute(config, task)

    variable_params = config['variable_params']

    output_filename = Path(task['output_filename'])
    tile = task['tile']

    data = datacube.api.GridWorkflow.load(
        tile, dask_chunks=config['storage']['chunking'])

    unwrapped_datasets = xr_apply(tile.sources,
                                  _unwrap_dataset_list,
                                  dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    nco = create_netcdf_storage_unit(output_filename, data.crs, data.coords,
                                     data.data_vars, variable_params,
                                     global_attributes)

    for name, variable in data.data_vars.items():
        try:
            with dask.set_options(get=dask.async.get_sync):
                da.store(variable.data, nco[name], lock=True)
        except ValueError:
            nco[name][:] = netcdf_writer.netcdfy_data(variable.values)
        nco.sync()

    nco.close()

    def update_dataset_location(labels, dataset):
        new_dataset = copy.copy(dataset)
        new_dataset.local_uri = output_filename.absolute().as_uri()
        return [new_dataset]

    updated_datasets = xr_apply(unwrapped_datasets,
                                update_dataset_location,
                                dtype='O')
    new_tile = datacube.api.Tile(sources=updated_datasets, geobox=tile.geobox)

    new_data = datacube.api.GridWorkflow.load(
        new_tile, dask_chunks=config['storage']['chunking'])

    if not data.identical(new_data):
        _LOG.error("Mismatch found for %s, not indexing", output_filename)
        raise ValueError("Mismatch found for %s, not indexing" %
                         output_filename)

    return unwrapped_datasets, output_filename.absolute().as_uri()
Example #25
def store(sources, targets):
    """
    Adapted from dask.array.store
    :param sources: sources dask arrays
    :param targets: target data store locations
    :return: None
    """
    # For debugging
    # -------------
    # for source, target in zip(sources, targets):
    #     da.store(source, target, compute=True)
    # return
    # -------------

    da.store(sources, targets, compute=True)
Example #26
def da_yxbt_sink(
    bands: Tuple[da.Array, ...], chunks: Tuple[int, ...], name="yxbt"
) -> da.Array:
    """
    each band is in <t,y,x>
    output is <y,x,b,t>

    eval(bands) |> transpose(YXBT) |> Store(RAM) |> DaskArray(RAM, chunks)
    """
    tk = tokenize(*bands)

    b = bands[0]
    dtype = b.dtype
    nt, ny, nx = b.shape
    nb = len(bands)
    shape = (ny, nx, nb, nt)

    token = Cache.dask_new(shape, dtype, f"{name}_alloc")

    sinks = [dask.delayed(_YXBTSink)(token, idx) for idx in range(nb)]
    fut = da.store(bands, sinks, lock=False, compute=False)
    sink_name = f"{name}_collect-{tk}"
    dsk = dict(fut.dask)
    dsk[sink_name] = (lambda *x: x[0], token.key, *fut.dask[fut.key])
    dsk = HighLevelGraph.from_collections(sink_name, dsk, dependencies=sinks)
    token_done = Delayed(sink_name, dsk)

    return _da_from_mem(token_done, shape=shape, dtype=dtype, chunks=chunks, name=name)
Example #27
def split_hdf5_multiple(arr, out_dirpath, nb_blocks, file_list):
    """
    Arguments:
    ----------
        arr: Array to split
        file_list: Empty list to store output files' objects.
        nb_blocks: Nb blocks we want to extract. None = all blocks.
    """
    arr_dict = get_arr_chunks(
        arr, nb_blocks, as_dict=True)  # get array blocks as dask array objects

    datasets = list()
    arr_list = list()
    for key, arr_block in arr_dict.items():
        i, j, k = key
        filename = f'{i}_{j}_{k}.hdf5'
        filepath = os.path.join(out_dirpath, filename)
        if os.path.isfile(filepath):
            os.remove(filepath)
        file_list.append(h5py.File(filepath, 'w'))

        datasets.append(file_list[-1].create_dataset('/data',
                                                     shape=arr_block.shape))
        arr_list.append(arr_block)
    return da.store(arr_list, datasets, compute=False)
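The value returned here is only a task graph; the caller has to compute it while the handles collected in file_list are still open, and close them afterwards. The bare pattern, with a hypothetical file name, looks like this:

import dask.array as da
import h5py

arr = da.random.random((512, 512), chunks=(256, 256))
f = h5py.File("block.h5", "w")
dset = f.create_dataset("data", shape=arr.shape, dtype=arr.dtype)
task = da.store(arr, dset, compute=False)  # graph only, nothing written yet
task.compute()                             # trigger the writes
f.close()                                  # safe to close only after computing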
Example #28
def da_yxbt_sink(bands: Tuple[da.Array, ...],
                 chunks: Tuple[int, ...],
                 name="yxbt") -> da.Array:
    """
    each band is in <t,y,x>
    output is <y,x,b,t>

    eval(bands) |> transpose(YXBT) |> Store(RAM) |> DaskArray(RAM, chunks)
    """
    b = bands[0]
    dtype = b.dtype
    nt, ny, nx = b.shape
    nb = len(bands)
    shape = (ny, nx, nb, nt)

    token = dask.delayed(Cache.new)(shape, dtype)

    sinks = [dask.delayed(_YXBTSink)(token, idx) for idx in range(nb)]
    fut = da.store(bands, sinks, lock=False, compute=False)

    return _da_from_mem(with_deps(token, fut),
                        shape=shape,
                        dtype=dtype,
                        chunks=chunks,
                        name=name)
Example #29
def da_mem_sink(xx: da.Array,
                chunks: Tuple[int, ...],
                name="memsink") -> da.Array:
    """
    It's a kind of fancy rechunk for special needs.

    Assumptions
    - Single worker only
    - ``xx`` can fit in RAM of the worker

    Note that every output chunk depends on ALL of input chunks.

    On some Dask worker:
    - Fully evaluate ``xx`` and serialize to RAM
    - Present in RAM view of the result with a different chunking regime

    A common use case would be to load a large collection (>50% of RAM) that
    needs to be processed by some non-Dask code as a whole. A simple
    ``do_stuff(xx.compute())`` would not work as duplicating RAM is not an
    option in that scenario. Normal rechunk might also run out of RAM and
    introduces large memory copy overhead as all input chunks need to be cached
    then re-assembled into a different chunking structure.
    """
    token = dask.delayed(Cache.new)(xx.shape, xx.dtype)

    sink = dask.delayed(CachedArray)(token)
    fut = da.store(xx, sink, lock=False, compute=False)

    return _da_from_mem(with_deps(token, fut),
                        shape=xx.shape,
                        dtype=xx.dtype,
                        chunks=chunks,
                        name=name)
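Leaving aside the library-specific Cache/_da_from_mem plumbing, the underlying idea can be sketched in-process with a plain NumPy buffer as the sink (an illustration of the concept only, not this library's implementation):

import dask.array as da
import numpy as np

xx = da.random.random((2000, 2000), chunks=(100, 2000))  # input chunking
buf = np.empty(xx.shape, xx.dtype)           # the in-RAM sink
da.store(xx, buf)                            # evaluate xx exactly once into the buffer
yy = da.from_array(buf, chunks=(2000, 100))  # same data, new chunking regime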
Example #30
def apply_store(B, O, R, volumestokeep, reconstructed_array, outputimgdir,
                case_index):
    # creations of data for dask store function
    d_arrays, d_regions = compute_zones(B, O, R, volumestokeep)
    out_files = list()  # to keep outfiles open during processing
    sources = list()
    targets = list()
    regions = list()
    for outfile_index in range(9):
        sliceslistoflist = d_arrays[outfile_index]

        # create file
        out_file = h5py.File('./' + str(outfile_index) + '.hdf5', 'w')
        out_files.append(out_file)

        # create dset
        dset = out_file.create_dataset('/data', shape=O)

        for i, st in enumerate(sliceslistoflist):
            tmp_array = reconstructed_array[st[0], st[1], st[2]]
            print("shape:", tmp_array.shape)
            reg = d_regions[outfile_index][i]
            tmp_array = tmp_array.rechunk(tmp_array.shape)

            sources.append(tmp_array)
            targets.append(dset)
            regions.append(reg)

    # storage: creation of task graph
    task = da.store(sources, targets, regions=regions, compute=False)
    filename = os.path.join(outputimgdir,
                            'after_store' + str(case_index) + '.png')
    task.visualize(optimize_graph=False, filename=filename)
    return task
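The regions= argument used here tells da.store, for each source, which slab of its target to write into; a self-contained sketch of that mechanism (file and dataset names are made up):

import dask.array as da
import h5py
import numpy as np

top = da.ones((4, 8), chunks=(4, 4))
bottom = da.zeros((4, 8), chunks=(4, 4))

with h5py.File("regions.h5", "w") as f:
    dset = f.create_dataset("data", shape=(8, 8), dtype="f8")
    da.store([top, bottom], [dset, dset],
             regions=[np.s_[0:4, :], np.s_[4:8, :]])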
Example #31
def dataarray_to_gridded_product(ds, grid_def, overwrite_existing=False):
    info = ds.attrs.copy()
    info.pop("area", None)
    if ds.ndim == 3:
        # RGB composite
        if ds.shape[0] in [3, 4]:
            channels = ds.shape[0]
        else:
            # unpreferred array orientation
            channels = ds.shape[-1]
            ds = np.rollaxis(ds, 2)
    else:
        channels = 1

    if np.issubdtype(np.dtype(ds.dtype), np.floating):
        dtype = np.float32
    else:
        dtype = ds.dtype

    p2g_metadata = {
        "product_name": info["name"],
        "satellite": info["platform_name"].lower(),
        "instrument": info["sensor"].lower() if isinstance(info["sensor"], str) else list(info["sensor"])[0].lower(),
        "data_kind": info["standard_name"],
        "begin_time": info["start_time"],
        "end_time": info["end_time"],
        "fill_value": np.nan,
        # "swath_columns": cols,
        # "swath_rows": rows,
        "rows_per_scan": info["rows_per_scan"],
        "data_type": dtype,
        "channels": channels,
        "grid_definition": grid_def,
    }
    info.update(p2g_metadata)

    filename = info["name"] + ".dat"
    info["grid_data"] = filename
    if os.path.isfile(filename):
        if not overwrite_existing:
            LOG.error("Binary file already exists: %s" % (filename,))
            raise RuntimeError("Binary file already exists: %s" % (filename,))
        else:
            LOG.warning("Binary file already exists, will overwrite: %s", filename)
    p2g_arr = np.memmap(filename, mode="w+", dtype=dtype, shape=ds.shape)
    da.store(ds.data.astype(dtype), p2g_arr)
    return containers.GriddedProduct(**info)
Example #32
    def save_datasets(self, datasets, compute=True, **kwargs):
        """Save all datasets to one or more files.

        Subclasses can use this method to save all datasets to one single
        file or optimize the writing of individual datasets. By default
        this simply calls `save_dataset` for each dataset provided.

        Args:
            datasets (iterable): Iterable of `xarray.DataArray` objects to
                                 save using this writer.
            compute (bool): If `True` (default), compute all of the saves to
                            disk. If `False` then the return value is either
                            a `dask.delayed.Delayed` object or two lists to
                            be passed to a `dask.array.store` call.
                            See return values below for more details.
            **kwargs: Keyword arguments to pass to `save_dataset`. See that
                      documentation for more details.

        Returns:
            Value returned depends on `compute` keyword argument. If
            `compute` is `True` the value is the result of a either a
            `dask.array.store` operation or a `dask.delayed.Delayed` compute,
            typically this is `None`. If `compute` is `False` then the
            result is either a `dask.delayed.Delayed` object that can be
            computed with `delayed.compute()` or a two element tuple of
            sources and targets to be passed to `dask.array.store`. If
            `targets` is provided then it is the caller's responsibility to
            close any objects that have a "close" method.

        """
        sources = []
        targets = []
        for ds in datasets:
            res = self.save_dataset(ds, compute=False, **kwargs)
            if isinstance(res, tuple):
                # source, target to be passed to da.store
                sources.append(res[0])
                targets.append(res[1])
            else:
                # delayed object
                sources.append(res)

        # we have targets, we should save sources to targets
        if targets and compute:
            LOG.info("Computing and writing results...")
            res = da.store(sources, targets)
            for target in targets:
                if hasattr(target, 'close'):
                    target.close()
            return res
        elif targets:
            return sources, targets

        delayed = dask.delayed(sources)
        if compute:
            LOG.info("Computing and writing results...")
            return delayed.compute()
        return delayed
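As the docstring spells out, with compute=False and array targets the caller finishes the store and closes the targets itself; a hedged sketch of that calling side (the writer and datasets objects are assumed to exist already):

import dask.array as da

res = writer.save_datasets(datasets, compute=False)  # hypothetical writer instance
if isinstance(res, tuple):        # (sources, targets) for dask.array.store
    sources, targets = res
    da.store(sources, targets)
    for t in targets:
        if hasattr(t, "close"):
            t.close()
else:                             # a dask.delayed.Delayed object
    res.compute()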
Example #33
    def work(self):
        import dask.array as da
        import numpy as np
        import h5py
        from luigi.file import atomic_file

        fs = [h5py.File(f.path, mode='r') for f in self.input()]

        # Verify all H5s have the same structure
        datasets, groups, samples = ([[] for x in fs], [[] for x in fs],
                                     [[] for x in fs])
        for i, f in enumerate(fs):
            f.visititems(lambda n, o: datasets[i].append(n) if isinstance(
                o, h5py.Dataset) else groups[i].append(n))
            samples[i] = f['samples'][:]
        if not (all([set(datasets[0]) == set(x) for x in datasets]) and
                np.all([np.array_equal(s, samples[0]) for s in samples])):
            raise Exception(
                "All HDF5 files must have the same groups/datasets/samples!")
        datasets, groups, samples = datasets[0], groups[0], samples[0]

        # Drop Samples dataset and handle separately
        datasets = [x for x in datasets if x != 'samples']
        combined = {
            d: da.concatenate([da.from_array(f[d], chunks=100000) for f in fs])
            for d in datasets
        }

        shapes = [(np.sum([f.get(d).shape
                           for f in fs], axis=0)[0], *fs[0].get(d).shape[1:])
                  for d in datasets]
        dtypes = [fs[0].get(d).dtype for d in datasets]

        # Handles Samples dataset
        datasets.append('samples')
        combined.update({'samples': da.from_array(fs[0]['samples'], chunks=1)})
        shapes.append(samples.shape)
        dtypes.append(samples.dtype)

        af = atomic_file(self.output().path)
        fout = h5py.File(af.tmp_path, 'w')

        # Set up group structure
        for g in groups:
            fout.create_group(g)

        # Create the datasets
        out_datasets = {}
        for p, dtype, shape in zip(datasets, dtypes, shapes):
            g, d = os.path.split(p)
            out_datasets[p] = (fout[g] if g else fout).create_dataset(
                d, shape=shape, dtype=dtype, chunks=True, compression='gzip')
        for k in combined.keys():
            s = da.store(combined[k], out_datasets[k], compute=False)
            s.compute(num_workers=self.n_cpu)
            print("Done " + k)

        af.move_to_final_destination()
Example #34
def area_to_swath_def(area, overwrite_existing=False):
    lons = area.lons
    lats = area.lats
    name = area.name
    name = name.replace(":", "")
    if lons.ndim == 1:
        rows, cols = lons.shape[0], 1
    else:
        rows, cols = lons.shape
    info = {
        "swath_name": name,
        "longitude": name + "_lon.dat",
        "latitude": name + "_lat.dat",
        "swath_rows": rows,
        "swath_columns": cols,
        "data_type": lons.dtype,
        "fill_value": np.nan,
    }
    if hasattr(area, "attrs"):
        info.update(area.attrs)

    # Write lons to disk
    filename = info["longitude"]
    if os.path.isfile(filename):
        if not overwrite_existing:
            LOG.error("Binary file already exists: %s" % (filename,))
            raise RuntimeError("Binary file already exists: %s" % (filename,))
        else:
            LOG.warning("Binary file already exists, will overwrite: %s", filename)
    LOG.info("Writing longitude data to disk cache...")
    lon_arr = np.memmap(filename, mode="w+", dtype=lons.dtype, shape=lons.shape)
    da.store(lons.data, lon_arr)

    # Write lats to disk
    filename = info["latitude"]
    if os.path.isfile(filename):
        if not overwrite_existing:
            LOG.error("Binary file already exists: %s" % (filename,))
            raise RuntimeError("Binary file already exists: %s" % (filename,))
        else:
            LOG.warning("Binary file already exists, will overwrite: %s", filename)
    LOG.info("Writing latitude data to disk cache...")
    lat_arr = np.memmap(filename, mode="w+", dtype=lats.dtype, shape=lats.shape)
    da.store(lats.data, lat_arr)
    return containers.SwathDefinition(**info)
Example #35
 def sync(self, compute=True):
     if self.sources:
         import dask.array as da
         delayed_store = da.store(self.sources, self.targets,
                                  lock=self.lock, compute=compute,
                                  flush=True)
         self.sources = []
         self.targets = []
         return delayed_store
Example #36
    def save_datasets(self, datasets, compute=True, **kwargs):
        """Save all datasets to one or more files.

        Subclasses can use this method to save all datasets to one single
        file or optimize the writing of individual datasets. By default
        this simply calls `save_dataset` for each dataset provided.

        Args:
            datasets (iterable): Iterable of `xarray.DataArray` objects to
                                 save using this writer.
            compute (bool): If `True` (default), compute all of the saves to
                            disk. If `False` then the return value is either
                            a `dask.delayed.Delayed` object or two lists to
                            be passed to a `dask.array.store` call.
                            See return values below for more details.
            **kwargs: Keyword arguments to pass to `save_dataset`. See that
                      documentation for more details.

        Returns:
            Value returned depends on `compute` keyword argument. If
            `compute` is `True` the value is the result of a either a
            `dask.array.store` operation or a `dask.delayed.Delayed` compute,
            typically this is `None`. If `compute` is `False` then the
            result is either a `dask.delayed.Delayed` object that can be
            computed with `delayed.compute()` or a two element tuple of
            sources and targets to be passed to `dask.array.store`. If
            `targets` is provided then it is the caller's responsibility to
            close any objects that have a "close" method.

        """
        sources = []
        targets = []
        for ds in datasets:
            res = self.save_dataset(ds, compute=False, **kwargs)
            if isinstance(res, tuple):
                # source, target to be passed to da.store
                sources.append(res[0])
                targets.append(res[1])
            else:
                # delayed object
                sources.append(res)

        # we have targets, we should save sources to targets
        if targets and compute:
            res = da.store(sources, targets)
            for target in targets:
                if hasattr(target, 'close'):
                    target.close()
            return res
        elif targets:
            return sources, targets

        delayed = dask.delayed(sources)
        if compute:
            return delayed.compute()
        return delayed
Example #37
    def do(f):
        dataset = f[path]
        gpath = os.path.dirname('/' + path)
        g = f[gpath]

        d = da.from_array(dataset, chunks=_good_chunk(dataset))

        name = os.path.basename(dataset.name)
        tmp_name = name + '_tmp_cfm92askj3'
        if tmp_name in g:
            del g[tmp_name]
        tmp_d = g.create_dataset(tmp_name, shape=dataset.shape,
                                 dtype=dataset.dtype, chunks=chunks,
                                 compression=compression)
        da.store(d, tmp_d)

        del g[name]
        g[name] = g[tmp_name]
        del g[tmp_name]
Example #38
 def test_simple_delayed_write(self):
     """Test writing can be delayed."""
     import dask.array as da
     from satpy.writers.geotiff import GeoTIFFWriter
     datasets = self._get_test_datasets()
     w = GeoTIFFWriter(base_dir=self.base_dir)
     # when we switch to rio_save on XRImage then this will be sources
     # and targets
     res = w.save_datasets(datasets, compute=False)
     # this will fail if rasterio isn't installed
     self.assertIsInstance(res, tuple)
     # two lists, sources and destinations
     self.assertEqual(len(res), 2)
     self.assertIsInstance(res[0], list)
     self.assertIsInstance(res[1], list)
     self.assertIsInstance(res[0][0], da.Array)
     da.store(res[0], res[1])
     for target in res[1]:
         if hasattr(target, 'close'):
             target.close()
Example #39
def overwrite_dataset(group, data, key, signal_axes=None, chunks=None, **kwds):
    if chunks is None:
        if signal_axes is None:
            # Use automatic h5py chunking
            chunks = True
        else:
            # Optimise the chunking to contain at least one signal per chunk
            chunks = get_signal_chunks(data.shape, data.dtype, signal_axes)

    maxshape = tuple(None for _ in data.shape)

    got_data = False
    while not got_data:
        try:
            these_kwds = kwds.copy()
            these_kwds.update(dict(shape=data.shape,
                                   dtype=data.dtype,
                                   exact=True,
                                   maxshape=maxshape,
                                   chunks=chunks,
                                   shuffle=True,))

            # If chunks is True, the `chunks` attribute of `dset` below
            # contains the chunk shape guessed by h5py
            dset = group.require_dataset(key, **these_kwds)
            got_data = True
        except TypeError:
            # if the shape or dtype/etc do not match,
            # we delete the old one and create new in the next loop run
            del group[key]
    if dset == data:
        # just a reference to already created thing
        pass
    else:
        _logger.info("Chunks used for saving: %s" % str(dset.chunks))
        if isinstance(data, da.Array):
            da.store(data.rechunk(dset.chunks), dset)
        elif data.flags.c_contiguous:
            dset.write_direct(data)
        else:
            dset[:] = data
Example #40
 def sync(self, compute=True):
     if self.sources:
         import dask.array as da
         # TODO: consider wrapping targets with dask.delayed, if this makes
         # for any discernable difference in perforance, e.g.,
         # targets = [dask.delayed(t) for t in self.targets]
         delayed_store = da.store(self.sources, self.targets,
                                  lock=self.lock, compute=compute,
                                  flush=True)
         self.sources = []
         self.targets = []
         return delayed_store
Example #41
    def load(s, measure, dset_name, transpose_lst, df_attr='demog_df'):
        ''' given measure, h5 dataset name, transpose list: load data '''

        df = getattr(s, df_attr)

        if measure in dir(s):
            print(measure, 'already loaded')
            if df.shape[0] != getattr(s, measure).shape[0]:
                print('shape of loaded data does not match demogs, reloading')
            else:
                return np.array([])

        dsets = [h5py.File(fn, 'r')[dset_name] for fn in df['path'].values]
        arrays = [da.from_array(dset, chunks=dset.shape) for dset in dsets]
        stack = da.stack(arrays, axis=-1)  # concatenate along last axis
        stack = stack.transpose(transpose_lst)  # do transposition

        data = np.empty(stack.shape)
        da.store(stack, data)
        print(data.shape)
        return data
Example #42
def fft_to_hdf5(x, filename, axis=-1, chunksize=2**26, available_memory=(4 * 1024**3), cache=None):
    """Simple wrapper for DAFT FFT function that writes to HDF5

    This function calls the DAFT function, but also performs the computation of
    the FFT, and outputs the result into the requested HDF5 file

    Parameters
    ----------
    x : array_like
        Input array, can be complex.
    filename : string
        Relative or absolute path to HDF5 file.  If this string contains a
        colon, the preceding part is taken as the filename, while the following
        part is taken as the dataset group name.  The default group name is 'X'.
    axis : int, optional
        Axis over which to compute the FFT. If not given, the last axis is used.
    chunksize : int, optional
        Chunksize to use when splitting up the input array.  Default is 2**26,
        which is about 64MB -- a reasonable target that reduces memory usage.
    available_memory : int, optional
        Maximum amount of RAM to use for caching during computation.  Defaults
        to 4*1024**3, which is 4GB.

    """
    from h5py import File
    from dask import set_options
    from dask.array import store
    if cache is None:
        from chest import Chest  # For more flexible caching
        cache = Chest(available_memory=available_memory)
    if ':' in filename:
        filename, groupname = filename.split(':')
    else:
        groupname = 'X'
    X_dask = DAFT(x, axis=axis, chunksize=chunksize)
    with set_options(cache=cache):
        with File(filename, 'w') as f:
            output = f.create_dataset(groupname, shape=X_dask.shape, dtype=X_dask.dtype)
            store(X_dask, output)
    return
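A hedged usage sketch, assuming fft_to_hdf5 (and the DAFT function it wraps) are importable from the same module and that the chest package is installed:

import numpy as np

x = np.random.randn(2**20) + 1j * np.random.randn(2**20)
# everything after the colon becomes the dataset name ('X' if omitted)
fft_to_hdf5(x, "spectrum.h5:fft", chunksize=2**20)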
Example #43
def dask_detrend_data(data, output_arr):
    """
    Detrend data using a linear fit.

    Parameters
    ----------
    data: dask.array
        Input dataset to detrend.  Assumes leading axis is sampling dimension.
    output_arr: ndarray-like
        Output array with same shape as data to store detrended data.

    Notes
    -----
    This is a very expensive operation on a large dataset and may slow down
    if forced to spill onto the disk cache.  It does not currently take into
    account X data.  Instead, it creates a dummy array (using arange) for the
    sampling points.
    """

    dummy_time = np.arange(data.shape[0])[:, None]
    dummy_time = da.from_array(dummy_time, chunks=dummy_time.shape)

    # intercept handling
    x_offset = dummy_time.mean(axis=0)
    x_centered = dummy_time - x_offset
    y_offset = data.mean(axis=0)
    y_centered = data - y_offset

    coefs, resid, rank, s = da.linalg.lstsq(x_centered, y_centered)

    intercepts = y_offset - x_offset*coefs
    predict = da.dot(dummy_time, coefs) + intercepts
    detrended = data - predict

    da.store(detrended, output_arr)

    return output_arr
Example #44
def _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor, blen,
                 **kwargs):
    """Perform the evaluation in blocks."""

    if not blen:
        # Compute the optimal block size (in elements)
        # The next is based on experiments with bench/ctable-query.py
        # and the 'movielens-bench' repository
        if vm == "numexpr":
            bsize = 2**23
        elif vm == "dask":
            bsize = 2**25
        else:  # python
            bsize = 2**21
        blen = int(bsize / typesize)
        # Protection against too large atomsizes
        if blen == 0:
            blen = 1

    if vm == "dask":
        if 'da' in vars:
            raise NameError(
                "'da' is reserved as a prefix for dask.array. "
                "Please use another prefix")
        for name in vars:
            var = vars[name]
            if is_sequence_like(var):
                vars[name] = da.from_array(var, chunks=(blen,) + var.shape[1:])
        # Build the expression graph
        vars['da'] = da
        da_expr = _eval(expression, vars)
        if out_flavor in ("bcolz", "carray") and da_expr.shape:
            result = bcolz.zeros(da_expr.shape, da_expr.dtype, **kwargs)
            # Store while compute expression graph
            da.store(da_expr, result)
            return result
        else:
            # Store while compute
            return np.array(da_expr)

    # Check whether we have a re_evaluate() function in numexpr
    re_evaluate = bcolz.numexpr_here and hasattr(bcolz.numexpr, "re_evaluate")

    vars_ = {}
    # Get containers for vars
    maxndims = 0
    for name in vars:
        var = vars[name]
        if is_sequence_like(var):
            ndims = len(var.shape) + len(var.dtype.shape)
            if ndims > maxndims:
                maxndims = ndims
            if len(var) > blen and hasattr(var, "_getrange"):
                shape = (blen, ) + var.shape[1:]
                vars_[name] = np.empty(shape, dtype=var.dtype)

    for i in xrange(0, vlen, blen):
        # Fill buffers for vars
        for name in vars:
            var = vars[name]
            if is_sequence_like(var) and len(var) > blen:
                if hasattr(var, "_getrange"):
                    if i+blen < vlen:
                        var._getrange(i, blen, vars_[name])
                    else:
                        vars_[name] = var[i:]
                else:
                    vars_[name] = var[i:i+blen]
            else:
                if hasattr(var, "__getitem__"):
                    vars_[name] = var[:]
                else:
                    vars_[name] = var

        # Perform the evaluation for this block
        if vm == "python":
            res_block = _eval(expression, vars_)
        else:
            if i == 0 or not re_evaluate:
                try:
                    res_block = bcolz.numexpr.evaluate(expression,
                                                       local_dict=vars_)
                except ValueError:
                    # numexpr cannot handle this, so fall back to "python" vm
                    warnings.warn(
                        "numexpr cannot handle this expression: falling back "
                        "to the 'python' virtual machine.  You can choose "
                        "another virtual machine by using the `vm` parameter.")
                    return _eval_blocks(
                        expression, vars, vlen, typesize, "python",
                        out_flavor, blen, **kwargs)
            else:
                res_block = bcolz.numexpr.re_evaluate(local_dict=vars_)

        if i == 0:
            # Detection of reduction operations
            scalar = False
            dim_reduction = False
            if len(res_block.shape) == 0:
                scalar = True
                result = res_block
                continue
            elif len(res_block.shape) < maxndims:
                dim_reduction = True
                result = res_block
                continue
            # Get a decent default for expectedlen
            if out_flavor in ("bcolz", "carray"):
                nrows = kwargs.pop('expectedlen', vlen)
                result = bcolz.carray(res_block, expectedlen=nrows, **kwargs)
            else:
                out_shape = list(res_block.shape)
                out_shape[0] = vlen
                result = np.empty(out_shape, dtype=res_block.dtype)
                result[:blen] = res_block
        else:
            if scalar or dim_reduction:
                result += res_block
            elif out_flavor in ("bcolz", "carray"):
                result.append(res_block)
            else:
                result[i:i+blen] = res_block

    if isinstance(result, bcolz.carray):
        result.flush()
    if scalar:
        return result[()]
    return result
Example #45
def calc_anomaly(data, yrsize, climo=None, output_arr=None):
    """
    Calculate the anomaly for the given data.  Right now it assumes sub-annual data
    input so that the climatology subtracts means for each month instead
    of the mean of the entire series.

    Note: May take yrsize argument out and leave it to user to format data
    as to take the desired anomaly.

    Parameters
    ----------
    data: ndarray
        Input data to calculate the anomaly from.  Leading dimension should be
        the temporal axis.
    yrsize: int
        Number of elements that compose a full year.  Used to reshape the data
        time axis to num years x size year for climatology purposes.
    climo: ndarray, optional
        User-provided climatology to subtract from the data.  Must be
        broadcastable over the time-dimension of data
    output_arr: ndarray-like, optional
        Array to place output of anomaly calculation that supports 
        ndarray-like slicing.  This is required for dask array input.
    Returns
    -------
    anomaly: ndarray-like
        Data converted to its anomaly form.
    climo: ndarray
        The calculated climatology that was subtracted from the data
    """

    yrsize = int(yrsize)
    if not yrsize >= 1:
        raise ValueError('yrsize must be an integer >= 1')

    # Reshape to take monthly mean
    old_shp = data.shape
    new_shp = (old_shp[0]//yrsize, yrsize, old_shp[1])
    data = data.reshape(new_shp)

    # Use of data[:] should work for ndarray or ndarray-like
    if climo is None:
        climo = data.mean(axis=0, keepdims=True)

    if is_dask_array(data):
        if output_arr is None:
            raise ValueError('calc_anomaly requires an output array keyword '
                             'argument when operating on a Dask array.')

        anomaly = data - climo
        old_shp_anom = anomaly.reshape(old_shp)
        da.store(old_shp_anom, output_arr)
        out_climo = climo.compute()
    else:
        # Evaluate with numexpr, then restore the original (time x space)
        # shape before filling the output array.
        anomaly = np.squeeze(ne.evaluate('data - climo')).reshape(old_shp)
        if output_arr is not None:
            output_arr[:] = anomaly
        else:
            output_arr = anomaly
        out_climo = climo

    return output_arr, out_climo
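
A minimal usage sketch for the dask path of calc_anomaly, assuming a hypothetical HDF5 file with a monthly (time x space) dataset named 'tas' whose time length is a multiple of 12:

import h5py
import dask.array as da

with h5py.File('tas_monthly.h5', 'r+') as f:   # hypothetical file layout
    data = da.from_array(f['tas'], chunks=(120, 10000))
    out = f.require_dataset('tas_anom', shape=f['tas'].shape, dtype='f4')
    anomaly, climo = calc_anomaly(data, yrsize=12, output_arr=out)
    print(climo.shape)   # (1, 12, n_space)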
Exemple #46
0
def run_mean(data, window_size, trim_edge=None, output_arr=None):
    """
    A function for calculating the running mean on data.

    Parameters
    ----------
    data: ndarray
        Data matrix to perform running mean over. Expected to be in time(row) x
        space(column) format. And that samples span full years.
    window_size: int
        Size of the window to compute the running mean over.
    trim_edge: int, optional
        Remove specified items from the start and end of the sampling
        dimension of the running mean.  Otherwise the window_size/2 items at
        the start and the end will have reflected padding effects.
    output_arr: ndarray-like, optional
        Array to place output of running mean that supports
        ndarray-like slicing.  This is required for dask array input.

    Returns
    -------
    result: ndarray-like
        Running mean result of the given data, trimmed by trim_edge elements
        at each end if requested.
    """

    sample_len = data.shape[0]
    if sample_len < window_size:
        raise ValueError("Window size must be smaller than or equal to the "
                         "length of the time dimension of the data.")

    if trim_edge is not None:
        sample_len -= trim_edge*2

        if sample_len < 1:
            raise ValueError('Not enough data to trim edges. Please try with '
                             'trim_edge=None')

    weights = [1.0 / window_size for _ in range(window_size)]
    if is_dask_array(data):
        if output_arr is None:
            raise ValueError('run_mean requires an output array keyword '
                             'argument when operating on a Dask array.')

        def _run_mean_block(block):
            return convolve1d(block, weights, axis=0)

        pad = window_size // 2
        ghost = da.ghost.ghost(data, depth={0: pad}, boundary={0: 'reflect'})
        filt = ghost.map_blocks(_run_mean_block)
        unpadded = da.ghost.trim_internal(filt, {0: pad})
        if trim_edge is not None:
            unpadded = unpadded[trim_edge:-trim_edge]

        da.store(unpadded, output_arr)
    else:
        res = convolve1d(data, weights, axis=0)
        if trim_edge:
            res = res[trim_edge:-trim_edge]

        if output_arr is not None:
            output_arr[:] = res
        else:
            output_arr = res

    return output_arr
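
A small usage sketch of run_mean on an in-memory array (the dask path only differs in that output_arr must be supplied); the data here are synthetic:

import numpy as np

data = np.random.rand(1200, 50)    # e.g. 100 years of monthly values at 50 points

# 12-point running mean, trimming 6 samples from each end to drop the padded edges
smoothed = run_mean(data, window_size=12, trim_edge=6)
print(smoothed.shape)   # (1188, 50)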
Exemple #47
0
def dataarray_to_swath_product(ds, swath_def, overwrite_existing=False):
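    """Convert an xarray DataArray into polar2grid SwathProduct objects.

    Each band's data is cached to a flat binary file on disk with np.memmap
    and da.store before the SwathProduct metadata is yielded.
    """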
    info = ds.attrs.copy()
    info.pop("area")
    if ds.ndim == 3:
        # RGB composite
        if ds.shape[0] in [3, 4]:
            channels = ds.shape[0]
        else:
            # unpreferred array orientation
            channels = ds.shape[-1]
            ds = np.rollaxis(ds, 2)
    else:
        channels = 1

    if ds.ndim == 1:
        rows, cols = ds.shape[0], 1
    else:
        rows, cols = ds.shape[-2:]
    if np.issubdtype(np.dtype(ds.dtype), np.floating):
        dtype = np.float32
    else:
        dtype = ds.dtype

    if isinstance(info["sensor"], bytes):
        info["sensor"] = info["sensor"].decode("utf-8")

    p2g_metadata = {
        "product_name": info["name"],
        "satellite": info["platform_name"].lower(),
        "instrument": info["sensor"].lower() if isinstance(info["sensor"], str) else list(info["sensor"])[0].lower(),
        "data_kind": info["standard_name"],
        "begin_time": info["start_time"],
        "end_time": info["end_time"],
        "fill_value": np.nan,
        "swath_columns": cols,
        "swath_rows": rows,
        "rows_per_scan": info.get("rows_per_scan", rows),
        "data_type": dtype,
        "swath_definition": swath_def,
        "channels": channels,
    }

    info.update(p2g_metadata)

    if channels == 1:
        filename = info["name"] + ".dat"
        info["swath_data"] = filename
        if os.path.isfile(filename):
            if not overwrite_existing:
                LOG.error("Binary file already exists: %s" % (filename,))
                raise RuntimeError("Binary file already exists: %s" % (filename,))
            else:
                LOG.warning("Binary file already exists, will overwrite: %s", filename)
        LOG.info("Writing band data to disk cache...")
        p2g_arr = np.memmap(filename, mode="w+", dtype=dtype, shape=ds.shape)
        ds = ds.where(ds.notnull(), np.nan)
        da.store(ds.data.astype(dtype), p2g_arr)
        yield containers.SwathProduct(**info)
    else:
        for chn_idx in range(channels):
            tmp_info = info.copy()
            tmp_info["product_name"] = info["product_name"] + "_rgb_{:d}".format(chn_idx)
            filename = tmp_info["product_name"] + ".dat"
            tmp_info["swath_data"] = filename
            if os.path.isfile(filename):
                if not overwrite_existing:
                    LOG.error("Binary file already exists: %s" % (filename,))
                    raise RuntimeError("Binary file already exists: %s" % (filename,))
                else:
                    LOG.warning("Binary file already exists, will overwrite: %s", filename)
            LOG.info("Writing band data to disk cache...")
            p2g_arr = np.memmap(filename, mode="w+", dtype=dtype, shape=ds.shape[-2:])
            da.store(ds.data[chn_idx].astype(dtype), p2g_arr)
            yield containers.SwathProduct(**tmp_info)
Exemple #48
0
    def rio_save(self, filename, fformat=None, fill_value=None,
                 dtype=np.uint8, compute=True, tags=None,
                 keep_palette=False, cmap=None,
                 **format_kwargs):
        """Save the image using rasterio.

        Overviews can be added to the file using the `overviews` kwarg, eg::

          img.rio_save('myfile.tif', overviews=[2, 4, 8, 16])

        """
        fformat = fformat or os.path.splitext(filename)[1][1:4]
        drivers = {'jpg': 'JPEG',
                   'png': 'PNG',
                   'tif': 'GTiff',
                   'jp2': 'JP2OpenJPEG'}
        driver = drivers.get(fformat, fformat)

        if tags is None:
            tags = {}

        data, mode = self.finalize(fill_value, dtype=dtype,
                                   keep_palette=keep_palette, cmap=cmap)
        data = data.transpose('bands', 'y', 'x')
        data.attrs = self.data.attrs

        crs = None
        gcps = None
        transform = None
        if driver in ['GTiff', 'JP2OpenJPEG']:
            if not np.issubdtype(data.dtype, np.floating):
                format_kwargs.setdefault('compress', 'DEFLATE')
            photometric_map = {
                'RGB': 'RGB',
                'RGBA': 'RGB',
                'CMYK': 'CMYK',
                'CMYKA': 'CMYK',
                'YCBCR': 'YCBCR',
                'YCBCRA': 'YCBCR',
            }
            if mode.upper() in photometric_map:
                format_kwargs.setdefault('photometric',
                                         photometric_map[mode.upper()])

            try:
                crs = rasterio.crs.CRS(data.attrs['area'].proj_dict)
                west, south, east, north = data.attrs['area'].area_extent
                height, width = data.sizes['y'], data.sizes['x']
                transform = rasterio.transform.from_bounds(west, south,
                                                           east, north,
                                                           width, height)

            except KeyError:  # No area
                logger.info("Couldn't create geotransform")
            except AttributeError:
                try:
                    gcps = data.attrs['area'].lons.attrs['gcps']
                    crs = data.attrs['area'].lons.attrs['crs']
                except KeyError:
                    logger.info("Couldn't create geotransform")

            if "start_time" in data.attrs:
                stime = data.attrs['start_time']
                stime_str = stime.strftime("%Y:%m:%d %H:%M:%S")
                tags.setdefault('TIFFTAG_DATETIME', stime_str)
        elif driver == 'JPEG' and 'A' in mode:
            raise ValueError('JPEG does not support alpha')

        # FIXME add metadata
        r_file = RIOFile(filename, 'w', driver=driver,
                         width=data.sizes['x'], height=data.sizes['y'],
                         count=data.sizes['bands'],
                         dtype=dtype,
                         nodata=fill_value,
                         crs=crs,
                         transform=transform,
                         gcps=gcps,
                         **format_kwargs)
        r_file.open()
        if not keep_palette:
            r_file.colorinterp = color_interp(data)
        r_file.rfile.update_tags(**tags)

        if keep_palette and cmap is not None:
            if data.dtype != 'uint8':
                raise ValueError('Rasterio only supports 8-bit colormaps')
            try:
                from trollimage.colormap import Colormap
                cmap = cmap.to_rio() if isinstance(cmap, Colormap) else cmap
                r_file.rfile.write_colormap(1, cmap)
            except AttributeError:
                raise ValueError("Colormap is not formatted correctly")

        if compute:
            # write data to the file now
            res = da.store(data.data, r_file)
            r_file.close()
            return res
        # provide the data object and the opened file so the caller can
        # store them when they would like. Caller is responsible for
        # closing the file
        return data.data, r_file
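
A sketch of the deferred-write path, assuming img is an XRImage-style object exposing the method above and that the file name is a placeholder:

import dask.array as da

# With compute=False the caller stores the data and closes the file explicitly.
data, r_file = img.rio_save('scene.tif', compute=False)
da.store(data, r_file)
r_file.close()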
Exemple #49
0
def calc_eofs(data, num_eigs, ret_pcs=False, var_stats_dict=None):
    """
    Method to calculate the EOFs of a given dataset.  This assumes data comes
    in as an m x n matrix where m is the temporal dimension and n is the
    spatial dimension.

    Parameters
    ----------
    data: ndarray
        Dataset to calculate EOFs from
    num_eigs: int
        Number of eigenvalues/vectors to return.  Must be less than min(m, n).
    ret_pcs: bool, optional
        Return principal component matrix along with EOFs
    var_stats_dict: dict, optional
        Dictionary target to store some simple statistics about the EOF
        calculation.  Note: variance statistics are not currently available
        when operating on a dask array.

    Returns
    -------
    eofs: ndarray
        The eofs (as column vectors) of the data with dimensions n x k where
        k is the num_eigs.
    svals: ndarray
        Singular values from the svd decomposition.  Returned as a row vector
        in order from largest to smallest.
    pcs: ndarray, optional
        Principal component time series (num_eigs x m), returned only when
        ret_pcs is True.
    """

    if is_dask_array(data):
        pcs, full_svals, eofs = da.linalg.svd_compressed(data, num_eigs)

        out_svals = np.zeros(num_eigs)
        out_eofs = np.zeros((num_eigs, data.shape[1]))
        out_pcs = np.zeros((data.shape[0], num_eigs))
        da.store([eofs, full_svals, pcs], [out_eofs, out_svals, out_pcs])

        out_eofs = out_eofs.T
        out_pcs = out_pcs.T

        if var_stats_dict is not None:
            logger.warning('Cannot currently provide variance statistics for '
                           'EOFs computed on a dask array.')

    else:
        eofs, full_svals, pcs = svd(data[:].T, full_matrices=False)
        out_eofs = eofs[:, :num_eigs]
        out_svals = full_svals[:num_eigs]
        out_pcs = pcs[:num_eigs]

    # variance stats (not currently available for dask input)
    if var_stats_dict is not None and not is_dask_array(data):
        try:
            nt = data.shape[0]
            ns = data.shape[1]
            eig_vals = (full_svals ** 2) / (nt * ns)
            total_var = eig_vals.sum()
            var_expl_by_mode = eig_vals / total_var
            var_expl_by_retained = var_expl_by_mode[0:num_eigs].sum()

            var_stats_dict['nt'] = nt
            var_stats_dict['ns'] = ns
            var_stats_dict['eigvals'] = eig_vals
            var_stats_dict['num_ret_modes'] = num_eigs
            var_stats_dict['total_var'] = total_var
            var_stats_dict['var_expl_by_mode'] = var_expl_by_mode
            var_stats_dict['var_expl_by_ret'] = var_expl_by_retained
        except TypeError as e:
            print('Must pass a dictionary to var_stats_dict in order to '
                  'output variance statistics.')
            print(e)

    if ret_pcs:
        return out_eofs, out_svals, out_pcs
    else:
        return out_eofs, out_svals
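
A usage sketch on synthetic in-memory data (the dask path works the same way but skips the variance statistics):

import numpy as np

field = np.random.rand(600, 2000)    # 600 time samples x 2000 spatial points

stats = {}
eofs, svals, pcs = calc_eofs(field, num_eigs=10, ret_pcs=True,
                             var_stats_dict=stats)
print(eofs.shape, svals.shape, pcs.shape)    # (2000, 10) (10,) (10, 600)
print(stats['var_expl_by_ret'])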
Exemple #50
0
def valid_images_to_hdf5(directory, width=224, height=224, channels=3):
    '''
    Build the arrays needed for validating the neural network using
    out-of-core processing.  Reads every image under directory, resizes it,
    and caches the image data and file names in validation_files.h5.
    Returns the list of image file names that were written.
    '''
    
    validationList, _ = get_list_of_validation_files(directory)  # Pass directory containing validation images
    print('Creating the hdf5 file...')
    len_array = len(validationList)
    with h5py.File('validation_files.h5', 'w') as hf:
        dset = hf.create_dataset('validation_array', (len_array, channels, width, height), chunks=True)
        img_names = hf.create_dataset('image_names', (len_array,), chunks=True, dtype='S40')

    print('There are ', len(validationList), ' files in the validation list.')
    print('Breaking the validation list into chunks of 10,000...')
    chunkedList = get_chunks(validationList, 10000)    # Break the list of files into chunks of 10000

    if channels == 3:
        all_names = []       # Base names of every image written so far
        offset = 0           # Row offset into the pre-allocated hdf5 datasets
        for i, chunk in enumerate(chunkedList):
            print('Creating an empty array to store images...')
            X = create_holding_array(chunk, width=width, height=height,
                                     channels=channels)    # Empty array for this chunk
            filenames = []
            for j, validFile in enumerate(chunk):
                print('Reading file #:  ', j)
                filenames.append(os.path.basename(validFile))
                img = misc.imread(validFile)                                 # Read the image
                img = misc.imresize(img, size=(width, height, channels))    # Resize with color channel = 3
                X[j] = img
            asciiList = [n.encode("ascii", "ignore") for n in filenames]
            X1 = np.transpose(X, (0, 3, 1, 2))    # Reorder to (samples, channels, width, height)
            del X
            print(X1.shape)
            X_da = da.from_array(X1, chunks=1000)
            print('Opening validation_files.h5...')
            with h5py.File('validation_files.h5', 'r+') as hf:
                print('Storing this chunk of images in the hdf5 file...')
                da.store(X_da, hf['validation_array'],
                         regions=(slice(offset, offset + X1.shape[0]),))
                print('Storing the corresponding image names...')
                hf['image_names'][offset:offset + len(asciiList)] = np.array(asciiList, dtype='S40')
            offset += X1.shape[0]
            all_names.extend(filenames)

        print('Done.')
        return all_names

    else:     # If the number of channels is not 3
        print('Could not create dataset and resize validation images...')
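
A usage sketch with a placeholder directory; the helpers referenced above (get_list_of_validation_files, get_chunks, create_holding_array) are assumed to come from the same module:

names = valid_images_to_hdf5('/data/validation_images', width=224, height=224)
print(len(names), 'images cached to validation_files.h5')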
Exemple #51
0
def to_geotiff(arr, path='./output.tif', proj=None, spec=None, bands=None, **kwargs):
    ''' Write out a geotiff file of the image

    Args:
        arr: the image (dask-backed gbdxtools image) to write out
        path (str): path to write the geotiff file to, default is ./output.tif
        proj (str): EPSG string of projection to reproject to
        spec (str): if set to 'rgb', write out a color-balanced 8-bit RGB tif
        bands (list): list of bands to export. If spec='rgb' this defaults to the RGB bands

    Returns:
        str: path the geotiff was written to'''
        
    assert has_rasterio, "To create geotiff images please install rasterio" 

    try:
        img_md = arr.rda.metadata["image"]
        x_size = img_md["tileXSize"]
        y_size = img_md["tileYSize"]
    except (AttributeError, KeyError):
        x_size = kwargs.get("chunk_size", 256)
        y_size = kwargs.get("chunk_size", 256)

    try:
        tfm = kwargs['transform'] if 'transform' in kwargs else arr.affine
    except AttributeError:  # image has no affine transform
        tfm = None

    dtype = arr.dtype.name if arr.dtype.name != 'int8' else 'uint8' 

    if spec is not None and spec.lower() == 'rgb':
        if bands is None:
            bands = arr._rgb_bands
        # skip if already DRA'ed
        if not arr.options.get('dra'):
            # add the RDA HistogramDRA op to get a RGB 8-bit image
            from gbdxtools.rda.interface import RDA
            rda = RDA()
            dra = rda.HistogramDRA(arr)
            # Reset the bounds and select the bands on the new Dask
            arr = dra.aoi(bbox=arr.bounds)
        arr = arr[bands,...].astype(np.uint8)
        dtype = 'uint8'
    else:
        if bands is not None:
            arr = arr[bands,...]
    meta = {
        'width': arr.shape[2],
        'height': arr.shape[1],
        'count': arr.shape[0],
        'dtype': dtype,
        'driver': 'GTiff',
        'transform': tfm
    }
    if proj is not None:
        meta["crs"] = {'init': proj}

    if "tiled" in kwargs and kwargs["tiled"]:
        meta.update(blockxsize=x_size, blockysize=y_size, tiled="yes")

    with rasterio.open(path, "w", **meta) as dst:
        writer = rio_writer(dst)
        result = store(arr, writer, compute=False)
        result.compute(scheduler=threaded_get)
    
    return path
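
A usage sketch, assuming img is an already-constructed gbdxtools image (path and projection are placeholders):

out_path = to_geotiff(img, path='./scene_rgb.tif', spec='rgb', proj='EPSG:4326')
print(out_path)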
Exemple #52
0
 def sync(self):
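     """Write any queued dask sources to their targets with da.store, then clear both queues."""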
     if self.sources:
         import dask.array as da
         da.store(self.sources, self.targets)
         self.sources = []
         self.targets = []