def eval(self, coordinates, **kwargs):
    output = kwargs.get("output")
    if self.zarr_shape is None:
        self._shape = coordinates.shape
    else:
        self._shape = tuple(self.zarr_shape.values())

    # initialize zarr file
    if self.zarr_chunks is None:
        chunks = [self.chunks[d] for d in coordinates]
    else:
        chunks = [self.zarr_chunks[d] for d in coordinates]
    self._chunks = chunks
    zf, data_key, zn = self.initialize_zarr_array(self._shape, chunks)
    self.dataset = zf
    self.zarr_data_key = data_key
    self.zarr_node = zn
    zn.keys

    # eval
    _log.debug("Starting parallel eval.")
    missing_dims = [d for d in coordinates.dims if d not in self.chunks.keys()]
    if self.zarr_coordinates is not None:
        missing_dims = missing_dims + [d for d in self.zarr_coordinates.dims if d not in missing_dims]
        set_coords = merge_dims([coordinates.drop(missing_dims), self.zarr_coordinates])
    else:
        set_coords = coordinates.drop(missing_dims)
    set_coords.transpose(*coordinates.dims)

    self.set_zarr_coordinates(set_coords, data_key)

    if self.list_dir:
        dk = data_key
        if isinstance(dk, list):
            dk = dk[0]
        self._list_dir = self.zarr_node.list_dir(dk)

    output = super(ZarrOutputMixin, self).eval(coordinates, output=output)

    # fill in the coordinates; this is guaranteed to be correct even if the user messed up
    if output is not None:
        self.set_zarr_coordinates(Coordinates.from_xarray(output), data_key)
    else:
        return zf
    return output
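# --- Hedged example -------------------------------------------------------
# `initialize_zarr_array` is not shown above. The sketch below only
# illustrates the kind of zarr call such a helper presumably wraps; the
# store path, data key, and dtype are hypothetical, not podpac's actual API.
import numpy as np
import zarr


def sketch_initialize_zarr_array(shape, chunks, path="output.zarr", data_key="data"):
    # open (or create) a zarr group on disk and allocate a chunked array,
    # NaN-filled so unwritten chunks read back as missing data
    group = zarr.open_group(path, mode="a")
    arr = group.require_dataset(
        data_key, shape=shape, chunks=tuple(chunks), dtype="float64", fill_value=np.nan
    )
    return group, data_key, arr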
def find_coordinates(self):
    '''
    {native_coordinates}

    Notes
    -----
    These coordinates are computed assuming the dataset is regular.
    '''
    if self.product in SMAP_IRREGULAR_COORDINATES:
        raise Exception("Native coordinates too large. Try using get_filename_coordinates_sources().")

    shared = self.get_shared_coordinates()
    partial_sources = self.get_source_coordinates()["time"].coordinates
    complete_source_0 = self.sources[0].get_source_coordinates()["time"].coordinates
    offset = complete_source_0 - partial_sources[0]
    full_times = (partial_sources[:, None] + offset[None, :]).ravel()
    return [merge_dims([podpac.Coordinates([full_times], ["time"]), shared])]
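# --- Hedged example -------------------------------------------------------
# Worked illustration (synthetic values) of the broadcasting step above: each
# per-file start time is combined with the within-file offsets taken from the
# first source to reconstruct the full, regular time grid.
import numpy as np

partial_sources = np.array([0.0, 24.0, 48.0])    # per-file start times (e.g. hours)
complete_source_0 = np.array([0.0, 3.0, 6.0])    # all times within the first file
offset = complete_source_0 - partial_sources[0]  # [0., 3., 6.]
full_times = (partial_sources[:, None] + offset[None, :]).ravel()
# -> [ 0.,  3.,  6., 24., 27., 30., 48., 51., 54.]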
def get_data_overviews(self, coordinates, coordinates_index):
    # Figure out how much coarser the request is than the actual data
    reduction_factor = np.inf
    for c in ["lat", "lon"]:
        crd = coordinates[c]
        if crd.size == 1:
            reduction_factor = 0
            break
        if isinstance(crd, UniformCoordinates1d):
            min_delta = crd.step
        elif isinstance(crd, ArrayCoordinates1d) and crd.is_monotonic:
            min_delta = crd.deltas.min()
        else:
            raise NotImplementedError(
                "The Rasterio node with prefer_overviews=True currently does not support request coordinates type {}".format(
                    coordinates
                )
            )
        reduction_factor = min(
            reduction_factor,
            np.abs(min_delta / self.coordinates[c].step),  # self.coordinates is always uniform
        )

    # Find the overview that's closest to this reduction factor
    if (reduction_factor < 2) or (len(self.overviews) == 0):
        # Then we shouldn't use an overview
        overview = 1
        overview_level = None
    else:
        diffs = reduction_factor - np.array(self.overviews)
        if self.prefer_overviews_closest:
            diffs = np.abs(diffs)
        else:
            diffs[diffs < 0] = np.inf
        overview_level = np.argmin(diffs)
        overview = self.overviews[overview_level]

    # Now read the data
    inds = coordinates_index
    if overview_level is None:
        dataset = self.dataset
    else:
        dataset = self.open_dataset(self.source, overview_level)

    try:
        # read data within coordinates_index window at the resolution of the overview
        # Rasterio will then automatically pull from the overview
        window = (
            (inds[0].min() // overview, int(np.ceil(inds[0].max() / overview) + 1)),
            (inds[1].min() // overview, int(np.ceil(inds[1].max() / overview) + 1)),
        )
        slc = (slice(window[0][0], window[0][1], 1), slice(window[1][0], window[1][1], 1))
        new_coords = Coordinates.from_geotransform(
            dataset.transform.to_gdal(), dataset.shape, crs=self.coordinates.crs
        )
        new_coords = new_coords[slc]
        missing_coords = self.coordinates.drop(["lat", "lon"])
        new_coords = merge_dims([new_coords, missing_coords])
        new_coords = new_coords.transpose(*self.coordinates.dims)
        coordinates_shape = new_coords.shape[:2]

        # The following lines are *nearly* copied/pasted from get_data
        if self.outputs is not None:  # read all the bands
            raster_data = dataset.read(out_shape=(len(self.outputs),) + coordinates_shape, window=window)
            raster_data = np.moveaxis(raster_data, 0, 2)
        else:  # read the requested band
            raster_data = dataset.read(self.band, out_shape=coordinates_shape, window=window)

        # set raster data to output array
        data = self.create_output_array(new_coords)
        data.data.ravel()[:] = raster_data.ravel()
    except Exception as e:
        _logger.error("Error occurred when reading overview with Rasterio: {}".format(e))
    if overview_level is not None:
        dataset.close()
    return data
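# --- Hedged example -------------------------------------------------------
# Standalone illustration (synthetic numbers) of the overview-selection rule
# above. With a request ~13x coarser than the data and overviews [2, 4, 8, 16],
# the "closest" rule picks 16, while the default rule never uses an overview
# coarser than the request and therefore picks 8.
import numpy as np

overviews = np.array([2, 4, 8, 16])
reduction_factor = 13.0

diffs = reduction_factor - overviews             # [11.,  9.,  5., -3.]
closest = overviews[np.argmin(np.abs(diffs))]    # 16 (prefer_overviews_closest=True)
diffs[diffs < 0] = np.inf
not_exceeding = overviews[np.argmin(diffs)]      # 8  (default behavior)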
def eval(self, coordinates, **kwargs):
    output = kwargs.get("output")
    # Make a thread pool to manage queue
    pool = ThreadPool(processes=self.number_of_workers)

    if output is None and self.fill_output:
        output = self.create_output_array(coordinates)

    shape = []
    for d in coordinates.dims:
        if d in self.chunks:
            shape.append(self.chunks[d])
        else:
            shape.append(coordinates[d].size)

    results = []
    # inputs = []
    i = 0
    for coords, slc in coordinates.iterchunks(shape, True):
        # inputs.append(coords)
        if i < self.start_i:
            _log.debug("Skipping {} since it is less than self.start_i ({})".format(i, self.start_i))
            i += 1
            continue

        out = None
        if self.fill_output and output is not None:
            out = output[slc]
        with self._lock:
            _log.debug("Added {} to worker pool".format(i))
            _log.debug("Node eval with coords: {}, {}".format(slc, coords))
            results.append(pool.apply_async(self.eval_source, [coords, slc, out, i]))
        i += 1

    _log.info("Added all chunks to worker pool. Now waiting for results.")
    start_time = time.time()
    for i, res in enumerate(results):
        # _log.debug('Waiting for results: {} {}'.format(i, inputs[i]))
        dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object))
        _log.info("({}): Waiting for results: {} / {}".format(dt, i + 1, len(results)))

        # Try to get the results / wait for the results
        try:
            o, slc = res.get()
        except Exception as e:
            o = None
            slc = None
            self.errors.append((i, res, e))
            dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object))
            _log.warning("({}) {} failed with exception {}".format(dt, i, e))

        dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object))
        _log.info("({}) Finished result: {} / {}".format(dt, i + 1, len(results)))

        # Fill output
        if self.fill_output:
            if output is None:
                missing_dims = [d for d in coordinates.dims if d not in self.chunks.keys()]
                coords = coordinates.drop(missing_dims)
                missing_coords = Coordinates.from_xarray(o).drop(list(self.chunks.keys()))
                coords = merge_dims([coords, missing_coords])
                coords = coords.transpose(*coordinates.dims)
                output = self.create_output_array(coords)
            output[slc] = o

    _log.info("Completed parallel execution.")
    pool.close()

    return output
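# --- Hedged example -------------------------------------------------------
# Minimal standalone sketch of the fan-out / fan-in pattern used above:
# submit chunk evaluations with apply_async, then block on each result in
# order, collecting failures instead of aborting. All names are illustrative.
from multiprocessing.pool import ThreadPool


def sketch_parallel_eval(chunks, eval_chunk, n_workers=4):
    pool = ThreadPool(processes=n_workers)
    results = [pool.apply_async(eval_chunk, [chunk]) for chunk in chunks]
    outputs, errors = [], []
    for i, res in enumerate(results):
        try:
            outputs.append(res.get())   # blocks until chunk i is finished
        except Exception as e:          # record the failure and keep going
            outputs.append(None)
            errors.append((i, e))
    pool.close()
    pool.join()
    return outputs, errors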
def interpolate(self, source_coordinates, source_data, eval_coordinates, output_data):
    """Interpolate data from the source coordinates to the requested evaluation coordinates

    Parameters
    ----------
    source_coordinates : :class:`podpac.Coordinates`
        Coordinates of the source data
    source_data : podpac.core.units.UnitsDataArray
        Source data to interpolate
    eval_coordinates : :class:`podpac.Coordinates`
        Coordinates at which the output is requested
    output_data : podpac.core.units.UnitsDataArray
        Pre-allocated output array that is filled with the interpolated data

    Returns
    -------
    podpac.core.units.UnitsDataArray
        returns the new output UnitsDataArray of interpolated data

    Raises
    ------
    InterpolationException
        Raises InterpolationException when the interpolator definition can't support all the
        dimensions of the requested coordinates
    """
    # loop through multiple outputs if necessary
    if "output" in output_data.dims:
        for output in output_data.coords["output"]:
            output_data.sel(output=output)[:] = self.interpolate(
                source_coordinates,
                source_data.sel(output=output).drop("output"),
                eval_coordinates,
                output_data.sel(output=output).drop("output"),
            )
        return output_data

    ## drop already-selected output variable
    # if "output" in output_data.coords:
    #     source_data = source_data.drop("output")
    #     output_data = output_data.drop("output")

    # short circuit if the source data and requested coordinates are of shape == 1
    if source_data.size == 1 and eval_coordinates.size == 1:
        output_data.data[:] = source_data.data.flatten()[0]
        return output_data

    # short circuit if source_coordinates contains eval_coordinates
    # TODO handle stacked issubset of unstacked case
    # this case is currently skipped because of the set(eval_coordinates) == set(source_coordinates) check
    if eval_coordinates.issubset(source_coordinates) and set(eval_coordinates) == set(source_coordinates):
        if any(isinstance(c, StackedCoordinates) and c.ndim > 1 for c in eval_coordinates.values()):
            # TODO AFFINE
            # currently this is bypassing the short-circuit in the shaped stacked coordinates case
            pass
        else:
            try:
                data = source_data.interp(output_data.coords, method="nearest")
            except (NotImplementedError, ValueError):
                data = source_data.sel(output_data.coords)
            output_data.data[:] = data.transpose(*output_data.dims)
            return output_data

    interpolator_queue = self._select_interpolator_queue(
        source_coordinates, eval_coordinates, "can_interpolate", strict=True
    )

    # for debugging purposes, save the last defined interpolator queue
    self._last_interpolator_queue = interpolator_queue

    # reset interpolation parameters
    for k in self._interpolation_params:
        self._interpolation_params[k] = False

    # iterate through each dim tuple in the queue
    dtype = output_data.dtype
    attrs = source_data.attrs
    for udims, interpolator in interpolator_queue.items():
        # TODO move the above short-circuits into this loop
        if all([ud not in source_coordinates.udims for ud in udims]):
            # Skip this udim if it's not part of the source coordinates (can happen with default)
            continue

        # Check if parameters are being used
        for k in self._interpolation_params:
            self._interpolation_params[k] = hasattr(interpolator, k) or self._interpolation_params[k]

        # interp_coordinates are essentially intermediate eval_coordinates
        interp_dims = [dim for dim, c in source_coordinates.items() if set(c.dims).issubset(udims)]
        other_dims = [dim for dim, c in eval_coordinates.items() if not set(c.dims).issubset(udims)]
        interp_coordinates = merge_dims(
            [source_coordinates.drop(interp_dims), eval_coordinates.drop(other_dims)], validate_crs=False
        )
        interp_data = UnitsDataArray.create(interp_coordinates, dtype=dtype)
        interp_data = interpolator.interpolate(udims, source_coordinates, source_data, interp_coordinates, interp_data)

        # prepare for the next iteration
        source_data = interp_data.transpose(*interp_coordinates.xdims)
        source_data.attrs = attrs
        source_coordinates = interp_coordinates

    output_data.data = interp_data.transpose(*output_data.dims)

    # Throw warnings for unused parameters
    for k in self._interpolation_params:
        if self._interpolation_params[k]:
            continue
        _logger.warning("The interpolation parameter '{}' was ignored during interpolation.".format(k))

    return output_data
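# --- Hedged example -------------------------------------------------------
# Standalone xarray sketch of the cascade above: interpolate one group of
# dimensions at a time, feeding each intermediate result (defined on a mix of
# source and requested coordinates) into the next step. Values are synthetic
# and this is not podpac's interpolator API.
import numpy as np
import xarray as xr

src = xr.DataArray(
    np.random.rand(4, 5),
    coords={"lat": np.linspace(0, 3, 4), "time": np.arange(5.0)},
    dims=["lat", "time"],
)
eval_lat = np.linspace(0.5, 2.5, 3)
eval_time = np.array([0.5, 2.5])

# step 1: interpolate lat only -> intermediate array on (requested lat, source time)
step1 = src.interp(lat=eval_lat, method="linear")
# step 2: interpolate time on the intermediate result -> (requested lat, requested time)
step2 = step1.interp(time=eval_time, method="nearest")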
def iteroutputs(self, coordinates):
    """Iterate over the output of each source node evaluated at the requested coordinates

    Parameters
    ----------
    coordinates : :class:`podpac.Coordinates`
        Coordinates at which to evaluate the compositor sources

    Yields
    ------
    :class:`podpac.core.units.UnitsDataArray`
        Output from each source node's eval method
    """
    # downselect sources based on coordinates
    src_subset = self.select_sources(coordinates)

    if len(src_subset) == 0:
        yield self.create_output_array(coordinates)
        return

    # Set the interpolation properties for sources
    if self.interpolation is not None:
        for s in src_subset.ravel():
            if trait_is_defined(self, "interpolation"):
                s.interpolation = self.interpolation

    # Optimization: if coordinates complete and source coords is 1D,
    # set native_coordinates unless they are set already
    # WARNING: this assumes
    #     native_coords = source_coords + shared_coordinates
    # NOT native_coords = shared_coords + source_coords
    if self.is_source_coordinates_complete and self.source_coordinates.ndim == 1:
        coords_subset = list(self.source_coordinates.intersect(coordinates, outer=True).coords.values())[0]
        coords_dim = list(self.source_coordinates.dims)[0]
        for s, c in zip(src_subset, coords_subset):
            nc = merge_dims([Coordinates(np.atleast_1d(c), dims=[coords_dim]), self.shared_coordinates])
            if trait_is_defined(s, "native_coordinates") is False:
                s.native_coordinates = nc

    if self.threaded:
        # TODO pool of pre-allocated scratch space
        # TODO: docstring?
        def f(src):
            return src.eval(coordinates)

        pool = ThreadPool(processes=self.n_threads)
        results = [pool.apply_async(f, [src]) for src in src_subset]

        for src, res in zip(src_subset, results):
            yield res.get()
            # src._output = None  # free up memory
    else:
        output = None  # scratch space
        for src in src_subset:
            output = src.eval(coordinates, output)
            yield output
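# --- Hedged example -------------------------------------------------------
# Sketch of how a compositor might consume `iteroutputs`: take the first
# non-NaN value at each pixel from successive source outputs and stop once
# everything is filled. `node` and `coords` are assumed to exist, and the
# first-valid rule shown here is only one possible compositing strategy.
import numpy as np


def sketch_composite_first_valid(node, coords):
    composite = None
    for out in node.iteroutputs(coords):
        if composite is None:
            composite = out.copy()
            continue
        mask = np.isnan(composite.data)
        composite.data[mask] = out.data[mask]
        if not np.isnan(composite.data).any():
            break  # every pixel filled; skip remaining sources
    return composite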