Ejemplo n.º 1
0
 def test_indexer(data, x, expected_pos, expected_idx=None) -> None:
     """Check what ``remap_label_indexers`` returns for a label on ``"x"``.

     Calls ``indexing.remap_label_indexers(data, {"x": x})`` and asserts that
     the positional indexer for ``"x"`` equals ``expected_pos``. The first
     element of the new index entry for ``"x"`` (``None`` when there is no
     entry) is converted with ``to_pandas_index()`` when it is not ``None``
     and compared against ``expected_idx``.
     """
     label_query = {"x": x}
     positions, index_vars = indexing.remap_label_indexers(data, label_query)

     # Each entry is unpacked as a pair; only the first element is checked.
     new_index, _ = index_vars.get("x", (None, None))
     if new_index is not None:
         new_index = new_index.to_pandas_index()

     assert_array_equal(positions.get("x"), expected_pos)
     assert_array_equal(new_index, expected_idx)
Ejemplo n.º 2
0
def index(da, indexers):
    """Select ``indexers`` from ``da`` with the data sliced by ``dask_safeslice``.

    The label indexers are converted to positional ones with xarray's
    ``remap_label_indexers``; the positional values are handed, in mapping
    order, to ``dask_safeslice`` together with ``da.data``. The result of
    ``da.sel(**indexers)`` is then copied with that sliced data substituted.

    Raises:
        TypeError: if ``da`` is not an ``xr.DataArray``.
    """
    from xarray.core.indexing import remap_label_indexers

    if not isinstance(da, xr.DataArray):
        raise TypeError(f"Expected DataArray. Received {type(da).__name__}")

    # Only the positional indexers are needed; the new indexes are discarded.
    positional, _ = remap_label_indexers(da, indexers)
    sliced_data = dask_safeslice(da.data, list(positional.values()))

    # TODO: avoid the sel. That could be slow
    selected = da.sel(**indexers)
    return selected.copy(data=sliced_data)
Ejemplo n.º 3
0
 def test_indexer(data, x, expected_pos, expected_idx=None):
     """Assert the positions and new index produced for a label on ``'x'``.

     ``self`` is not a parameter; it is taken from the enclosing scope,
     which is not visible here.
     """
     remapped = indexing.remap_label_indexers(data, {'x': x})
     positions, new_indexes = remapped
     self.assertArrayEqual(positions.get('x'), expected_pos)
     self.assertArrayEqual(new_indexes.get('x'), expected_idx)
Ejemplo n.º 4
0
 def test_indexer(data, x, expected_pos, expected_idx=None):
     """Assert the positions and new index produced for a label on ``'x'``.

     Both values are looked up with ``.get('x')``, so a missing entry is
     compared as ``None``.
     """
     remapped = indexing.remap_label_indexers(data, {'x': x})
     positions, new_indexes = remapped
     assert_array_equal(positions.get('x'), expected_pos)
     assert_array_equal(new_indexes.get('x'), expected_idx)
Ejemplo n.º 5
0
    def run(self, config: Config):
        """Run every configured stream against its variable in the dataset.

        This is a generator. For each context in ``config.contexts`` and each
        ``(stream_id, stream_config)`` pair in that context's ``streams``:

        * stream ids that are not variables of the dataset are skipped with a
          warning;
        * the variable is label-subset by the context window when it has the
          time coordinate and both ``window.starting`` and ``window.ending``
          are truthy;
        * the time / z / lat / lon inputs are collected where available and
          ``stream_config.run`` is called with them plus ``inp``;
        * a boolean mask shaped like the full (unsubset) variable is built,
          with ``True`` where the label subset selected data.

        The dataset returned by ``self._open()`` is closed in a ``finally``
        clause (when ``_open`` reported ``do_close`` as ``True``), so it is
        also released if a stream raises or the generator is closed before
        being exhausted.

        Args:
            config: Object exposing ``contexts``; each context exposes
                ``streams`` (a mapping) and ``window``.

        Yields:
            ContextResult: one per processed stream. Coordinate inputs that
            could not be found are replaced by empty arrays.

        Returns:
            An empty nested ``defaultdict``. It is never populated here and,
            because this is a generator, is only visible to callers as
            ``StopIteration.value``; it is kept so the return value does not
            change.
        """
        # Magic for nested key generation
        # https://stackoverflow.com/a/27809959
        results = defaultdict(lambda: defaultdict(odict))

        do_close, ds = self._open()

        try:
            for context in config.contexts:

                for stream_id, stream_config in context.streams.items():

                    # Find any var specific kwargs to pass onto the run
                    if stream_id not in ds.variables:
                        L.warning(
                            f'{stream_id} is not a variable in the xarray dataset, skipping'
                        )
                        continue

                    stream = ds[stream_id]

                    # Because the variables could have different dimensions
                    # we calculate the coordinates and subset for each
                    # This is xarray style subsetting, so will look something like:
                    # {
                    #     'time': slice(datetime.datetime(2020, 1, 1, 0, 0), datetime.datetime(2020, 4, 1, 0, 0), None)
                    # }
                    label_indexes = {}
                    subset_kwargs = {}

                    # Region subset
                    # TODO: yeah this does nothing right now
                    # Subset against the passed in lat/lons variable keys
                    # and build up the subset dict to apply later

                    # Time subset
                    if self.time_var in stream.coords:
                        if context.window.starting and context.window.ending:
                            label_indexes[self.time_var] = slice(
                                context.window.starting, context.window.ending)

                    subset_stream = stream.sel(**label_indexes)

                    # Keyword passed to the stream run -> dataset variable name.
                    coordinate_vars = (
                        ('tinp', self.time_var),
                        ('zinp', self.z_var),
                        ('lat', self.lat_var),
                        ('lon', self.lon_var),
                    )
                    for kwarg_name, var_name in coordinate_vars:
                        if var_name in subset_stream.coords:
                            # Already subset with the stream, best case.
                            # Good netCDF file.
                            subset_kwargs[kwarg_name] = subset_stream.coords[
                                var_name].values
                        elif var_name in ds.variables and (
                                # Same dimensions as the stream, so use the
                                # same subset
                                ds[var_name].dims == stream.dims
                                # Not specifically connected, but hey, the
                                # user asked for it
                                or ds[var_name].size == stream.size):
                            subset_kwargs[kwarg_name] = ds[var_name].sel(
                                **label_indexes).values

                    data_input = subset_stream.values
                    run_result = stream_config.run(**subset_kwargs,
                                                   inp=data_input)

                    # Here we turn the labeled xarray indexes into boolean index arrays that numpy
                    # can use to subset a basic array. This takes each labeled index, converts it to
                    # its integer index representation (label -> integers) and then matches the keys
                    # on each label with the dimension of the data variable. This result should be
                    # able to be used on the original data feed AS IS using a direct subset notation
                    # data[subset_indexes].
                    # We start by subsetting nothing
                    subset_indexes = np.zeros_like(stream.values, dtype=bool)
                    int_indexes, _ = remap_label_indexers(stream, label_indexes)

                    # Initial slicer will select everything. This selects all values in a dimension
                    # if there are no labeled indexes for it.
                    slicers = [slice(None)] * stream.ndim
                    for index_key, index_value in int_indexes.items():
                        if index_key in stream.dims:
                            slicers[stream.dims.index(index_key)] = index_value

                    # We started with an empty subset_indexes, now set to True what we actually
                    # subset using the labeled dimensions.
                    #
                    # Casting to a tuple to handle a numpy deprecation:
                    # FutureWarning: Using a non-tuple sequence for multidimensional indexing is
                    # deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will
                    # be interpreted as an array index, `arr[np.array(seq)]`, which will result either
                    # in an error or a different result.
                    subset_indexes[tuple(slicers)] = True

                    yield ContextResult(
                        results=run_result,
                        stream_id=stream_id,
                        subset_indexes=subset_indexes,
                        data=data_input,
                        tinp=subset_kwargs.get(
                            'tinp',
                            pd.Series(dtype='datetime64[ns]').values),
                        zinp=subset_kwargs.get(
                            'zinp',
                            pd.Series(dtype='float64').values),
                        lat=subset_kwargs.get(
                            'lat',
                            pd.Series(dtype='float64').values),
                        lon=subset_kwargs.get(
                            'lon',
                            pd.Series(dtype='float64').values),
                    )
        finally:
            # Runs on normal exhaustion, on an exception from a stream, and
            # when the generator is closed early by its consumer.
            if do_close is True:
                ds.close()

        return results
Ejemplo n.º 6
0
 def test_indexer(data, x, expected_pos, expected_idx=None):
     """Assert the positions and new index produced for a label on ``"x"``.

     Both values are looked up with ``.get("x")``, so a missing entry is
     compared as ``None``.
     """
     remapped = indexing.remap_label_indexers(data, {"x": x})
     positions, new_indexes = remapped
     assert_array_equal(positions.get("x"), expected_pos)
     assert_array_equal(new_indexes.get("x"), expected_idx)
Ejemplo n.º 7
0
 def test_indexer(x):
     """Return ``remap_label_indexers`` applied to ``data`` for label ``x``.

     ``data`` is not a parameter; it is taken from the enclosing scope,
     which is not visible here.
     """
     label_query = {'x': x}
     remapped = indexing.remap_label_indexers(data, label_query)
     return remapped
Ejemplo n.º 8
0
 def test_indexer(data, x, expected_pos, expected_idx=None):
     """Assert the positions and new index produced for a label on ``"x"``.

     ``self`` is not a parameter; it is taken from the enclosing scope,
     which is not visible here.
     """
     remapped = indexing.remap_label_indexers(data, {"x": x})
     positions, new_indexes = remapped
     self.assertArrayEqual(positions.get("x"), expected_pos)
     self.assertArrayEqual(new_indexes.get("x"), expected_idx)