コード例 #1
0
ファイル: parallel.py プロジェクト: creare-com/podpac
    def eval(self, coordinates, **kwargs):
        """Evaluate the node over ``coordinates``, mirroring the result into a zarr array.

        Parameters
        ----------
        coordinates : Coordinates
            Request coordinates for the evaluation.
        **kwargs
            Only ``output`` is used: an optional pre-allocated output array
            passed through to the parent class ``eval``.

        Returns
        -------
        The parent ``eval`` result, or the zarr file object when the parent
        returned ``None``.
        """
        output = kwargs.get("output")
        # Output shape: the request's shape, unless an explicit zarr shape is configured.
        if self.zarr_shape is None:
            self._shape = coordinates.shape
        else:
            self._shape = tuple(self.zarr_shape.values())

        # initialize zarr file
        # Per-dimension chunk sizes; zarr_chunks overrides the node's generic chunks.
        # NOTE(review): assumes iterating `coordinates` yields dimension names -- confirm.
        if self.zarr_chunks is None:
            chunks = [self.chunks[d] for d in coordinates]
        else:
            chunks = [self.zarr_chunks[d] for d in coordinates]
        self._chunks = chunks
        zf, data_key, zn = self.initialize_zarr_array(self._shape, chunks)
        self.dataset = zf
        self.zarr_data_key = data_key
        self.zarr_node = zn
        # NOTE(review): bare attribute access -- presumably forces lazy
        # initialization of the zarr node's keys; confirm it is needed.
        zn.keys

        # eval
        _log.debug("Starting parallel eval.")
        # Request dims that are not chunked by this node.
        missing_dims = [
            d for d in coordinates.dims if d not in self.chunks.keys()
        ]
        if self.zarr_coordinates is not None:
            # Also treat any explicitly-supplied zarr dims as non-chunked.
            missing_dims = missing_dims + [
                d for d in self.zarr_coordinates.dims if d not in missing_dims
            ]
            set_coords = merge_dims(
                [coordinates.drop(missing_dims), self.zarr_coordinates])
        else:
            set_coords = coordinates.drop(missing_dims)
        # NOTE(review): the transpose() return value is discarded here; if
        # transpose is not in-place this line has no effect -- confirm intent.
        set_coords.transpose(*coordinates.dims)

        self.set_zarr_coordinates(set_coords, data_key)
        if self.list_dir:
            # list_dir takes a single key; unwrap when data_key is a list.
            dk = data_key
            if isinstance(dk, list):
                dk = dk[0]
            self._list_dir = self.zarr_node.list_dir(dk)

        output = super(ZarrOutputMixin, self).eval(coordinates, output=output)

        # fill in the coordinates, this is guaranteed to be correct even if the user messed up.
        if output is not None:
            self.set_zarr_coordinates(Coordinates.from_xarray(output),
                                      data_key)
        else:
            return zf

        return output
コード例 #2
0
 def find_coordinates(self):
     '''
     {native_coordinates}

     Notes
     -----
     These coordinates are computed, assuming dataset is regular.
     '''
     # Irregular products cannot be described by this computed regular grid.
     if self.product in SMAP_IRREGULAR_COORDINATES:
         raise Exception("Native coordinates too large. Try using get_filename_coordinates_sources().")

     shared_coords = self.get_shared_coordinates()
     file_start_times = self.get_source_coordinates()['time'].coordinates
     first_file_times = self.sources[0].get_source_coordinates()['time'].coordinates
     # Within-file time offsets, relative to the first file's start time.
     within_file_offsets = first_file_times - file_start_times[0]
     # Outer sum: every file start combined with every within-file offset.
     all_times = (file_start_times[:, None] + within_file_offsets[None, :]).ravel()
     return [merge_dims([podpac.Coordinates([all_times], ['time']), shared_coords])]
コード例 #3
0
ファイル: rasterio_source.py プロジェクト: creare-com/podpac
    def get_data_overviews(self, coordinates, coordinates_index):
        """Read data using the rasterio overview closest to the request resolution.

        Parameters
        ----------
        coordinates : Coordinates
            Request coordinates (must contain "lat" and "lon").
        coordinates_index : tuple
            Index arrays into the source data for the lat/lon dims.

        Returns
        -------
        UnitsDataArray output with the overview data filled in.

        Raises
        ------
        NotImplementedError
            If the lat/lon request coordinates are neither uniform nor monotonic.
        """
        # Figure out how much coarser the request is than the actual data
        reduction_factor = np.inf
        for c in ["lat", "lon"]:
            crd = coordinates[c]
            if crd.size == 1:
                # A single point has no spacing; fall back to full resolution.
                reduction_factor = 0
                break
            if isinstance(crd, UniformCoordinates1d):
                min_delta = crd.step
            elif isinstance(crd, ArrayCoordinates1d) and crd.is_monotonic:
                min_delta = crd.deltas.min()
            else:
                raise NotImplementedError(
                    "The Rasterio node with prefer_overviews=True currently does not support request coordinates type {}"
                    .format(coordinates))
            reduction_factor = min(
                reduction_factor,
                np.abs(min_delta / self.coordinates[c].step
                       )  # self.coordinates is always uniform
            )
        # Find the overview that's closest to this reduction factor
        if (reduction_factor < 2) or (len(
                self.overviews) == 0):  # Then we shouldn't use an overview
            overview = 1
            overview_level = None
        else:
            diffs = reduction_factor - np.array(self.overviews)
            if self.prefer_overviews_closest:
                diffs = np.abs(diffs)
            else:
                # Only consider overviews at least as fine as the request.
                diffs[diffs < 0] = np.inf
            # Compute the argmin once (was previously computed twice).
            overview_level = np.argmin(diffs)
            overview = self.overviews[overview_level]

        # Now read the data
        inds = coordinates_index
        if overview_level is None:
            dataset = self.dataset
        else:
            dataset = self.open_dataset(self.source, overview_level)
        try:
            # read data within coordinates_index window at the resolution of the overview
            # Rasterio will then automatically pull from the overview
            window = (
                ((inds[0].min() // overview),
                 int(np.ceil(inds[0].max() / overview) + 1)),
                ((inds[1].min() // overview),
                 int(np.ceil(inds[1].max() / overview) + 1)),
            )
            slc = (slice(window[0][0], window[0][1],
                         1), slice(window[1][0], window[1][1], 1))
            new_coords = Coordinates.from_geotransform(
                dataset.transform.to_gdal(),
                dataset.shape,
                crs=self.coordinates.crs)
            new_coords = new_coords[slc]
            missing_coords = self.coordinates.drop(["lat", "lon"])
            new_coords = merge_dims([new_coords, missing_coords])
            new_coords = new_coords.transpose(*self.coordinates.dims)
            coordinates_shape = new_coords.shape[:2]

            # The following lines are *nearly* copied/pasted from get_data
            if self.outputs is not None:  # read all the bands
                raster_data = dataset.read(out_shape=(len(self.outputs), ) +
                                           coordinates_shape,
                                           window=window)
                raster_data = np.moveaxis(raster_data, 0, 2)
            else:  # read the requested band
                raster_data = dataset.read(self.band,
                                           out_shape=coordinates_shape,
                                           window=window)

            # set raster data to output array
            data = self.create_output_array(new_coords)
            data.data.ravel()[:] = raster_data.ravel()
        except Exception as e:
            # BUGFIX: previously the error was logged and swallowed, and the
            # function fell through to `return data` with `data` unbound,
            # raising a confusing NameError. Log and re-raise instead.
            _logger.error(
                "Error occurred when reading overview with Rasterio: {}".
                format(e))
            raise
        finally:
            # Only close datasets opened here; self.dataset is owned by the node.
            if overview_level is not None:
                dataset.close()

        return data
コード例 #4
0
ファイル: parallel.py プロジェクト: creare-com/podpac
    def eval(self, coordinates, **kwargs):
        """Evaluate the node over ``coordinates`` in parallel chunks using a thread pool.

        Parameters
        ----------
        coordinates : Coordinates
            Request coordinates; evaluated chunk-by-chunk per ``self.chunks``.
        **kwargs
            Only ``output`` is used: an optional pre-allocated output array.

        Returns
        -------
        The filled output array (when ``self.fill_output``), otherwise the
        ``output`` passed in (possibly None).
        """
        output = kwargs.get("output")
        # Make a thread pool to manage queue
        pool = ThreadPool(processes=self.number_of_workers)

        if output is None and self.fill_output:
            output = self.create_output_array(coordinates)

        # Chunk shape: chunked dims use the configured size, others the full size.
        shape = []
        for d in coordinates.dims:
            if d in self.chunks:
                shape.append(self.chunks[d])
            else:
                shape.append(coordinates[d].size)

        results = []
        for i, (coords, slc) in enumerate(coordinates.iterchunks(shape, True)):
            # Allow restarting a partially-completed run at chunk start_i.
            if i < self.start_i:
                _log.debug(
                    "Skipping {} since it is less than self.start_i ({})".
                    format(i, self.start_i))
                continue

            out = None
            if self.fill_output and output is not None:
                out = output[slc]
            with self._lock:
                _log.debug("Added {} to worker pool".format(i))
                _log.debug("Node eval with coords: {}, {}".format(slc, coords))
                results.append(
                    pool.apply_async(self.eval_source, [coords, slc, out, i]))

        _log.info("Added all chunks to worker pool. Now waiting for results.")
        start_time = time.time()

        def _elapsed():
            # Human-readable wall time since the workers were queued, e.g. "1234 milliseconds".
            return str(
                np.timedelta64(int(1000 * (time.time() - start_time)),
                               "ms").astype(object))

        for i, res in enumerate(results):
            _log.info("({}): Waiting for results: {} / {}".format(
                _elapsed(), i + 1, len(results)))

            # Try to get the results / wait for the results
            try:
                o, slc = res.get()
            except Exception as e:
                # Record the failure and keep processing the remaining chunks.
                o = None
                slc = None
                self.errors.append((i, res, e))
                _log.warning("({}) {} failed with exception {}".format(
                    _elapsed(), i, e))

            # BUGFIX: the formatted elapsed-time string was computed here but the
            # raw float was logged instead; log the formatted time as intended.
            _log.info("({}) Finished result: {} / {}".format(
                _elapsed(), i + 1, len(results)))

            # Fill output
            if self.fill_output:
                if output is None:
                    # Reconstruct full output coordinates: chunked dims from the
                    # request, remaining dims from the worker's output.
                    missing_dims = [
                        d for d in coordinates.dims
                        if d not in self.chunks.keys()
                    ]
                    coords = coordinates.drop(missing_dims)
                    missing_coords = Coordinates.from_xarray(o).drop(
                        list(self.chunks.keys()))
                    coords = merge_dims([coords, missing_coords])
                    coords = coords.transpose(*coordinates.dims)
                    output = self.create_output_array(coords)
                output[slc] = o

        _log.info("Completed parallel execution.")
        pool.close()
        # BUGFIX: wait for worker threads to shut down before returning.
        pool.join()

        return output
コード例 #5
0
    def interpolate(self, source_coordinates, source_data, eval_coordinates, output_data):
        """Interpolate data from requested coordinates to source coordinates

        Parameters
        ----------
        source_coordinates : :class:`podpac.Coordinates`
            Coordinates of the source data.
        source_data : podpac.core.units.UnitsDataArray
            The source data to interpolate from.
        eval_coordinates : :class:`podpac.Coordinates`
            Coordinates at which the output is requested.
        output_data : podpac.core.units.UnitsDataArray
            Pre-allocated output array; filled in place and returned.

        Returns
        -------
        podpac.core.units.UnitDataArray
            returns the new output UnitDataArray of interpolated data

        Raises
        ------
        InterpolationException
            Raises InterpolationException when interpolator definition can't support all the dimensions
            of the requested coordinates
        """

        # loop through multiple outputs if necessary
        # (recurse once per output variable, with "output" dropped from the data)
        if "output" in output_data.dims:
            for output in output_data.coords["output"]:
                output_data.sel(output=output)[:] = self.interpolate(
                    source_coordinates,
                    source_data.sel(output=output).drop("output"),
                    eval_coordinates,
                    output_data.sel(output=output).drop("output"),
                )
            return output_data

        ## drop already-selected output variable
        # if "output" in output_data.coords:
        # source_data = source_data.drop("output")
        # output_data = output_data.drop("output")

        # short circuit if the source data and requested coordinates are of shape == 1
        if source_data.size == 1 and eval_coordinates.size == 1:
            output_data.data[:] = source_data.data.flatten()[0]
            return output_data

        # short circuit if source_coordinates contains eval_coordinates
        # TODO handle stacked issubset of unstacked case
        #      this case is currently skipped because of the set(eval_coordinates) == set(source_coordinates)))
        if eval_coordinates.issubset(source_coordinates) and set(eval_coordinates) == set(source_coordinates):
            if any(isinstance(c, StackedCoordinates) and c.ndim > 1 for c in eval_coordinates.values()):
                # TODO AFFINE
                # currently this is bypassing the short-circuit in the shaped stacked coordinates case
                pass
            else:
                # Exact selection is sufficient; nearest-neighbor interp is a
                # no-op here, with .sel as a fallback when .interp is unsupported.
                try:
                    data = source_data.interp(output_data.coords, method="nearest")
                except (NotImplementedError, ValueError):
                    data = source_data.sel(output_data.coords)

                output_data.data[:] = data.transpose(*output_data.dims)
                return output_data

        # Ordered mapping of udim tuples -> interpolator instances that can handle them.
        interpolator_queue = self._select_interpolator_queue(
            source_coordinates, eval_coordinates, "can_interpolate", strict=True
        )

        # for debugging purposes, save the last defined interpolator queue
        self._last_interpolator_queue = interpolator_queue

        # reset interpolation parameters
        for k in self._interpolation_params:
            self._interpolation_params[k] = False

        # iterate through each dim tuple in the queue
        # NOTE(review): if the queue is empty or every entry is skipped below,
        # `interp_data` is never bound and the assignment after the loop raises
        # NameError -- confirm whether that case can occur.
        dtype = output_data.dtype
        attrs = source_data.attrs
        for udims, interpolator in interpolator_queue.items():
            # TODO move the above short-circuits into this loop
            if all([ud not in source_coordinates.udims for ud in udims]):
                # Skip this udim if it's not part of the source coordinates (can happen with default)
                continue
            # Check if parameters are being used
            for k in self._interpolation_params:
                self._interpolation_params[k] = hasattr(interpolator, k) or self._interpolation_params[k]

            # interp_coordinates are essentially intermediate eval_coordinates
            interp_dims = [dim for dim, c in source_coordinates.items() if set(c.dims).issubset(udims)]
            other_dims = [dim for dim, c in eval_coordinates.items() if not set(c.dims).issubset(udims)]
            interp_coordinates = merge_dims(
                [source_coordinates.drop(interp_dims), eval_coordinates.drop(other_dims)], validate_crs=False
            )
            interp_data = UnitsDataArray.create(interp_coordinates, dtype=dtype)
            interp_data = interpolator.interpolate(
                udims, source_coordinates, source_data, interp_coordinates, interp_data
            )

            # prepare for the next iteration
            # (this stage's result becomes the next stage's source)
            source_data = interp_data.transpose(*interp_coordinates.xdims)
            source_data.attrs = attrs
            source_coordinates = interp_coordinates

        output_data.data = interp_data.transpose(*output_data.dims)

        # Throw warnings for unused parameters
        for k in self._interpolation_params:
            if self._interpolation_params[k]:
                continue
            _logger.warning("The interpolation parameter '{}' was ignored during interpolation.".format(k))

        return output_data
コード例 #6
0
    def iteroutputs(self, coordinates):
        """Evaluate each applicable source at ``coordinates``, yielding each output.

        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            Coordinates to evaluate at compositor sources

        Yields
        ------
        :class:`podpac.core.units.UnitsDataArray`
            Output from source node eval method
        """
        # downselect sources based on coordinates
        src_subset = self.select_sources(coordinates)

        if len(src_subset) == 0:
            yield self.create_output_array(coordinates)
            return

        # Set the interpolation properties for sources
        # (the trait check is loop-invariant, so it is hoisted out of the loop)
        if self.interpolation is not None and trait_is_defined(self, 'interpolation'):
            for s in src_subset.ravel():
                s.interpolation = self.interpolation

        # Optimization: if coordinates complete and source coords is 1D,
        # set native_coordinates unless they are set already
        # WARNING: this assumes
        #              native_coords = source_coords + shared_coordinates
        #         NOT  native_coords = shared_coords + source_coords
        if self.is_source_coordinates_complete and self.source_coordinates.ndim == 1:
            coords_subset = list(
                self.source_coordinates.intersect(
                    coordinates, outer=True).coords.values())[0]
            coords_dim = list(self.source_coordinates.dims)[0]
            for s, c in zip(src_subset, coords_subset):
                nc = merge_dims([
                    Coordinates(np.atleast_1d(c), dims=[coords_dim]),
                    self.shared_coordinates
                ])

                if trait_is_defined(s, 'native_coordinates') is False:
                    s.native_coordinates = nc

        if self.threaded:
            # TODO pool of pre-allocated scratch space
            def f(src):
                # Evaluate one source at the full request coordinates.
                return src.eval(coordinates)

            pool = ThreadPool(processes=self.n_threads)
            try:
                results = [pool.apply_async(f, [src]) for src in src_subset]

                for src, res in zip(src_subset, results):
                    yield res.get()
                    #src._output = None # free up memory
            finally:
                # BUGFIX: release the pool's worker threads (previously leaked);
                # the finally ensures cleanup even if the consumer abandons the
                # generator mid-iteration.
                pool.close()
                pool.join()

        else:
            output = None  # scratch space
            for src in src_subset:
                output = src.eval(coordinates, output)
                yield output