def test_indexer(data, x, expected_pos, expected_idx=None) -> None:
    """Check that remapping label ``x`` yields the expected positional indexer and index."""
    positional, index_vars = indexing.remap_label_indexers(data, {"x": x})
    new_index, _ = index_vars.get("x", (None, None))
    if new_index is not None:
        new_index = new_index.to_pandas_index()
    assert_array_equal(positional.get("x"), expected_pos)
    assert_array_equal(new_index, expected_idx)
def index(da, indexers):
    """Label-select from *da*, slicing the underlying dask array lazily.

    Remaps the label indexers to positional ones and applies them to the raw
    dask data via ``dask_safeslice``, then attaches that data to the selected
    DataArray so coordinates/metadata stay consistent.
    """
    from xarray.core.indexing import remap_label_indexers

    if not isinstance(da, xr.DataArray):
        raise TypeError(f"Expected DataArray. Received {type(da).__name__}")

    positional, _new_indexes = remap_label_indexers(da, indexers)
    sliced_data = dask_safeslice(da.data, list(positional.values()))
    # TODO: avoid the sel. That could be slow
    return da.sel(**indexers).copy(data=sliced_data)
def test_indexer(data, x, expected_pos, expected_idx=None):
    """Assert positional and index remapping for label ``x`` (unittest-style asserts)."""
    # NOTE(review): `self` is a free name here — this helper only works when
    # defined inside a TestCase method so `self` is captured from the closure.
    remapped = indexing.remap_label_indexers(data, {'x': x})
    pos, idx = remapped
    self.assertArrayEqual(pos.get('x'), expected_pos)
    self.assertArrayEqual(idx.get('x'), expected_idx)
def test_indexer(data, x, expected_pos, expected_idx=None):
    """Assert positional and index remapping for label ``x``."""
    remapped_pos, remapped_idx = indexing.remap_label_indexers(data, {'x': x})
    assert_array_equal(remapped_pos.get('x'), expected_pos)
    assert_array_equal(remapped_idx.get('x'), expected_idx)
def run(self, config: Config):
    """Run every configured stream against this object's xarray dataset.

    Generator: yields one ``ContextResult`` per (context, stream) pair, carrying
    the stream's QC run results plus the time/z/lat/lon value subsets used and a
    boolean mask (``subset_indexes``) locating the subset in the original array.

    NOTE(review): ``results`` is built but never populated before being
    returned from this generator — presumably a leftover; confirm with callers.
    """
    # Magic for nested key generation
    # https://stackoverflow.com/a/27809959
    results = defaultdict(lambda: defaultdict(odict))

    # do_close flags whether this call owns the dataset handle and must close it.
    do_close, ds = self._open()

    for context in config.contexts:
        for stream_id, stream_config in context.streams.items():

            # Find any var specific kwargs to pass onto the run
            if stream_id not in ds.variables:
                L.warning(
                    f'{stream_id} is not a variable in the xarray dataset, skipping'
                )
                continue

            # Because the variables could have different dimensions
            # we calculate the coordiantes and subset for each
            # This is xarray style subsetting, so will look something like:
            # {
            #     'time': slice(datetime.datetime(2020, 1, 1, 0, 0), datetime.datetime(2020, 4, 1, 0, 0), None)
            # }
            label_indexes = {}
            subset_kwargs = {}

            # Region subset
            # TODO: yeah this does nothing right now

            # Subset against the passed in lat/lons variable keys
            # and build up the subset dict to apply later

            # Time subset: only applied when the stream is time-indexed and the
            # context actually supplies a bounded window.
            if self.time_var in ds[stream_id].coords:
                if context.window.starting and context.window.ending:
                    label_indexes[self.time_var] = slice(
                        context.window.starting, context.window.ending)

            subset_stream = ds[stream_id].sel(**label_indexes)

            # For each axis variable (time/z/lat/lon) the lookup strategy is the
            # same three-way fallback: coordinate on the subset stream, then a
            # dataset variable with matching dims, then one with matching size.
            if self.time_var in subset_stream.coords:
                # Already subset with the stream, best case. Good netCDF file.
                subset_kwargs['tinp'] = subset_stream.coords[self.time_var].values
            elif self.time_var in ds.variables and ds[self.time_var].dims == ds[stream_id].dims:
                # Same dimensions as the stream, so use the same subset
                subset_kwargs['tinp'] = ds[self.time_var].sel(**label_indexes).values
            elif self.time_var in ds.variables and ds[self.time_var].size == ds[stream_id].size:
                # Not specifically connected, but hey, the user asked for it
                subset_kwargs['tinp'] = ds[self.time_var].sel(**label_indexes).values

            if self.z_var in subset_stream.coords:
                # Already subset with the stream, best case. Good netCDF file.
                subset_kwargs['zinp'] = subset_stream.coords[self.z_var].values
            elif self.z_var in ds.variables and ds[self.z_var].dims == ds[stream_id].dims:
                # Same dimensions as the stream, so use the same subset
                subset_kwargs['zinp'] = ds[self.z_var].sel(**label_indexes).values
            elif self.z_var in ds.variables and ds[self.z_var].size == ds[stream_id].size:
                # Not specifically connected, but hey, the user asked for it
                subset_kwargs['zinp'] = ds[self.z_var].sel(**label_indexes).values

            if self.lat_var in subset_stream.coords:
                # Already subset with the stream, best case. Good netCDF file.
                subset_kwargs['lat'] = subset_stream.coords[self.lat_var].values
            elif self.lat_var in ds.variables and ds[self.lat_var].dims == ds[stream_id].dims:
                # Same dimensions as the stream, so use the same subset
                subset_kwargs['lat'] = ds[self.lat_var].sel(**label_indexes).values
            elif self.lat_var in ds.variables and ds[self.lat_var].size == ds[stream_id].size:
                # Not specifically connected, but hey, the user asked for it
                subset_kwargs['lat'] = ds[self.lat_var].sel(**label_indexes).values

            if self.lon_var in subset_stream.coords:
                # Already subset with the stream, best case. Good netCDF file.
                subset_kwargs['lon'] = subset_stream.coords[self.lon_var].values
            elif self.lon_var in ds.variables and ds[self.lon_var].dims == ds[stream_id].dims:
                # Same dimensions as the stream, so use the same subset
                subset_kwargs['lon'] = ds[self.lon_var].sel(**label_indexes).values
            elif self.lon_var in ds.variables and ds[self.lon_var].size == ds[stream_id].size:
                # Not specifically connected, but hey, the user asked for it
                subset_kwargs['lon'] = ds[self.lon_var].sel(**label_indexes).values

            data_input = subset_stream.values
            run_result = stream_config.run(**subset_kwargs, **dict(inp=data_input))

            # Here we turn the labeled xarray indexes into boolean index arrays that numpy
            # can use to subset a basic array. This takes each labeled index, converts it to
            # its integer index representation (label -> integers) and then matches the keys
            # on each label with the dimension of the data variable. This result should be
            # able to be used on the original data feed AS IS using a direct subset notation
            # data[subset_indexes]. I'm pretty sure this works and if it doesn't blame my cat.

            # We start by subsetting nothing
            subset_indexes = np.full_like(ds[stream_id].values, 0, dtype=bool)

            int_indexes, _ = remap_label_indexers(ds[stream_id], label_indexes)

            # Initial slicer will select everything. This selects all values in a dimension
            # if there are no labeled indexes for it.
            slicers = [slice(None) for x in range(ds[stream_id].ndim)]
            for index_key, index_value in int_indexes.items():
                if index_key in ds[stream_id].dims:
                    slicers[ds[stream_id].dims.index(index_key)] = index_value

            # We started with an empty subset_indexes, now set to True what we actually subset
            # using the labeled dimensions.
            # Casting to a tuple to handle a numpy deprecation:
            # FutureWarning: Using a non-tuple sequence for multidimensional indexing is
            # deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will
            # be interpreted as an array index, `arr[np.array(seq)]`, which will result either
            # in an error or a different result.
            subset_indexes[tuple(slicers)] = True

            yield ContextResult(
                results=run_result,
                stream_id=stream_id,
                subset_indexes=subset_indexes,
                data=data_input,
                # Fall back to empty arrays of the conventional dtype when an
                # axis variable could not be located in the dataset.
                tinp=subset_kwargs.get(
                    'tinp', pd.Series(dtype='datetime64[ns]').values),
                zinp=subset_kwargs.get('zinp', pd.Series(dtype='float64').values),
                lat=subset_kwargs.get('lat', pd.Series(dtype='float64').values),
                lon=subset_kwargs.get('lon', pd.Series(dtype='float64').values),
            )

    if do_close is True:
        ds.close()

    return results
def test_indexer(data, x, expected_pos, expected_idx=None):
    """Assert positional and index remapping for label ``x``."""
    result = indexing.remap_label_indexers(data, {"x": x})
    pos, idx = result
    assert_array_equal(pos.get("x"), expected_pos)
    assert_array_equal(idx.get("x"), expected_idx)
def test_indexer(x):
    """Remap label ``x`` against the enclosing-scope ``data`` object."""
    # NOTE(review): `data` is not a parameter — it is captured from the
    # enclosing scope (closure or module global).
    labels = {'x': x}
    return indexing.remap_label_indexers(data, labels)
def test_indexer(data, x, expected_pos, expected_idx=None):
    """Assert positional and index remapping for label ``x`` (unittest-style asserts)."""
    # NOTE(review): `self` is a free name here — this helper only works when
    # defined inside a TestCase method so `self` is captured from the closure.
    remapped = indexing.remap_label_indexers(data, {"x": x})
    pos, idx = remapped
    self.assertArrayEqual(pos.get("x"), expected_pos)
    self.assertArrayEqual(idx.get("x"), expected_idx)