def test_resample_f_all(self):
    # Resampling with frequency 'all' and methods 'min'/'max' is expected to
    # yield a single time step with one "<var>_<method>" variable per
    # input variable and method.
    resampled_cube = resample_in_time(self.input_cube, 'all', ['min', 'max'])
    self.assertIsNot(resampled_cube, self.input_cube)

    # Expected value at the first (lat, lon) cell of each output variable,
    # listed in the order in which the variables are checked.
    expected_first_cell = [
        ('temperature_min', 272.0),
        ('temperature_max', 274.9),
        ('precipitation_min', 114.2),
        ('precipitation_max', 120.0),
    ]
    var_names = [var_name for var_name, _ in expected_first_cell]
    cube_dims = ('time', 'lat', 'lon')
    cube_shape = (1, 180, 360)

    # Presence of the time coordinate and all derived variables
    for name in ['time'] + var_names:
        self.assertIn(name, resampled_cube)

    # Dimension names
    self.assertEqual(('time',), resampled_cube['time'].dims)
    for var_name in var_names:
        self.assertEqual(cube_dims, resampled_cube[var_name].dims)

    # Shapes
    self.assertEqual((1,), resampled_cube['time'].shape)
    for var_name in var_names:
        self.assertEqual(cube_shape, resampled_cube[var_name].shape)

    # Values along time at the first (lat, lon) cell
    for var_name, expected_value in expected_first_cell:
        np.testing.assert_allclose(
            resampled_cube[var_name].values[..., 0, 0],
            np.array([expected_value]))

    # The derived schema must agree with the individual variables
    schema = CubeSchema.new(resampled_cube)
    self.assertEqual(3, schema.ndim)
    self.assertEqual(cube_dims, schema.dims)
    self.assertEqual(cube_shape, schema.shape)
def test_resample_in_time_with_time_chunk_size(self):
    # A given time_chunk_size must show up as the "time" chunk size of the
    # resampled cube, while the spatial chunking is expected to be (90, 180).
    schema = CubeSchema.new(
        resample_in_time(self.input_cube,
                         '2D',
                         ['min', 'max'],
                         time_chunk_size=5))

    self.assertEqual(3, schema.ndim)
    self.assertEqual(('time', 'lat', 'lon'), schema.dims)
    self.assertEqual((33, 180, 360), schema.shape)
    self.assertEqual((5, 90, 180), schema.chunks)
def test_without_inputs(self):
    # Computes a cube from a schema only (no input cubes) and verifies that
    # the cube function is invoked lazily, i.e. only once values are requested.
    recorded_calls = []

    def my_cube_func(input_params: Dict[str, Any] = None,
                     dim_coords: Dict[str, np.ndarray] = None,
                     dim_ranges: Dict[str, Tuple[int, int]] = None) -> CubeFuncOutput:
        # Remember the arguments of every invocation so they can be counted.
        recorded_calls.append((input_params, dim_coords, dim_ranges))
        # Block extent per dimension is (end - start) of its index range.
        block_shape = tuple(dim_ranges[dim_name][1] - dim_ranges[dim_name][0]
                            for dim_name in ('time', 'lat', 'lon'))
        return np.full(block_shape,
                       input_params['fill_value'],
                       dtype=np.float64)

    output_cube = compute_cube(my_cube_func,
                               input_cube_schema=CubeSchema.new(self.cube),
                               input_params=dict(fill_value=0.74))
    self.assertIsInstance(output_cube, xr.Dataset)
    self.assertIn('output', output_cube.data_vars)

    output_var = output_cube.output
    # Nothing has been computed yet
    self.assertEqual(0, len(recorded_calls))
    self.assertEqual(('time', 'lat', 'lon'), output_var.dims)
    self.assertEqual((6, 180, 360), output_var.shape)

    # Accessing the values triggers the computation, one call per chunk
    values = output_var.values
    self.assertEqual(2 * 2 * 4, len(recorded_calls))
    self.assertEqual((6, 180, 360), values.shape)
    self.assertAlmostEqual(0.74, values[0, 0, 0])
    self.assertAlmostEqual(0.74, values[-1, -1, -1])
def compute_dataset(cube_func: CubeFunc,
                    *input_cubes: xr.Dataset,
                    input_cube_schema: CubeSchema = None,
                    input_var_names: Sequence[str] = None,
                    input_params: Dict[str, Any] = None,
                    output_var_name: str = 'output',
                    output_var_dims: AbstractSet[str] = None,
                    output_var_dtype: Any = np.float64,
                    output_var_attrs: Dict[str, Any] = None,
                    vectorize: bool = None,
                    cube_asserted: bool = False) -> xr.Dataset:
    """
    Compute a new output dataset with a single variable named *output_var_name*
    from variables named *input_var_names* contained in zero, one, or more
    input data cubes in *input_cubes* using a cube factory function *cube_func*.

    *cube_func* is called for each of the chunks of the input variables.
    It is expected to return a chunk block which is of type ``np.ndarray``.

    If *input_cubes* is not empty, *cube_func* receives variables as specified
    by *input_var_names*. If *input_cubes* is empty, *input_var_names* must be
    empty too, and *input_cube_schema* must be given, so that a new cube can
    be created.

    The full signature of *cube_func* is::

        def cube_func(*input_vars: np.ndarray,
                      input_params: Dict[str, Any] = None,
                      dim_coords: Dict[str, np.ndarray] = None,
                      dim_ranges: Dict[str, Tuple[int, int]] = None) -> np.ndarray:
            pass

    The arguments are:

    * ``input_vars``: the variables according to the given *input_var_names*;
    * ``input_params``: is this call's *input_params*, a mapping from
      parameter name to value;
    * ``dim_coords``: a mapping from dimension names to the current chunk's
      coordinate arrays;
    * ``dim_ranges``: a mapping from dimension names to the current chunk's
      index ranges.

    Only the ``input_vars`` argument is mandatory. The keyword arguments
    ``input_params``, ``dim_coords``, ``dim_ranges`` do not need to be present
    at all; they are passed only if *cube_func* declares them.

    Note that *cube_func* may also be invoked once with empty input arrays
    (a test call issued by ``xarray.apply_ufunc``). In that case every value
    in ``dim_ranges`` is the empty tuple ``()`` and the arrays in
    ``dim_coords`` are empty along the cube dimensions.

    *output_var_dims* may be given in the case where *cube_func* reduces
    dimensions. All dimensions of an input variable that are not contained in
    *output_var_dims* are passed to ``xarray.apply_ufunc`` as core dimensions
    of that variable, and the variable is rechunked (with a warning) so that
    it has a single chunk along each of them.
    TODO: describe new output_var_dims in more detail.

    :param cube_func: The cube factory function.
    :param input_cubes: An optional sequence of input cube datasets,
        must be provided if *input_cube_schema* is not.
    :param input_cube_schema: An optional input cube schema,
        must be provided if *input_cubes* is not.
    :param input_var_names: A sequence of variable names
    :param input_params: Optional dictionary with processing parameters
        passed to *cube_func*.
    :param output_var_name: Optional name of the output variable,
        defaults to ``'output'``.
    :param output_var_dims: Optional set of names of the output dimensions,
        used in the case *cube_func* reduces dimensions.
    :param output_var_dtype: Optional numpy datatype of the output variable,
        defaults to ``np.float64``.
    :param output_var_attrs: Optional metadata attributes for the
        output variable.
    :param vectorize: Whether all *input_cubes* have the same variables which
        are concatenated and passed as vectors to *cube_func*.
        Not implemented yet.
    :param cube_asserted: If False, the *input_cubes* will be verified,
        otherwise they are expected to be valid cubes.
    :return: A new dataset that contains the computed output variable.
    :raise NotImplementedError: if *vectorize* is not None.
    :raise ValueError: if neither *input_cubes* nor *input_cube_schema* is
        given, or if a name in *input_var_names* is not found in any cube.
    """
    if vectorize is not None:
        # TODO: support vectorize = all cubes have same variables and cube_func
        #   receives variables as vectors (with extra dim)
        raise NotImplementedError('vectorize is not supported yet')

    # Validate every input cube exactly once.
    if not cube_asserted:
        for cube in input_cubes:
            assert_cube(cube)

    # Check compatibility of inputs
    if input_cubes:
        first_cube = input_cubes[0]
        input_cube_schema = CubeSchema.new(first_cube)
        for cube in input_cubes:
            # Identity check on purpose: "!=" on datasets is an element-wise
            # comparison that yields a dataset, not a boolean.
            if cube is not first_cube:
                # Deriving the schema at least ensures that one exists for
                # every additional cube.
                # TODO (forman): broadcast all cubes to same shape,
                #   rechunk to same chunks
                CubeSchema.new(cube)
    elif input_cube_schema is None:
        raise ValueError('input_cube_schema must be given')

    output_var_name = output_var_name or 'output'

    # Collect named input variables, raise if not found.
    # The first cube that contains a variable wins.
    input_var_names = input_var_names or []
    input_vars = []
    for var_name in input_var_names:
        input_var = next((cube[var_name]
                          for cube in input_cubes
                          if var_name in cube.data_vars),
                         None)
        if input_var is None:
            raise ValueError(
                f'variable {var_name!r} not found in any of cubes')
        input_vars.append(input_var)

    # Find out, if cube_func uses any of _PREDEFINED_KEYWORDS
    has_input_params, has_dim_coords, has_dim_ranges = _inspect_cube_func(
        cube_func, input_var_names)

    def cube_func_wrapper(index_chunk, *input_var_chunks):
        # Note, xarray.apply_ufunc does a test call with empty input arrays,
        # so index_chunk.size == 0 is a valid case
        empty_call = index_chunk.size == 0

        # TODO: when output_var_dims is given, index_chunk must be reordered
        #   as core dimensions are moved to the end of index_chunk and
        #   input_var_chunks
        if not empty_call:
            index_chunk = index_chunk.ravel()
            # A (start, end) pair is required for every cube dimension.
            if index_chunk.size < 2 * input_cube_schema.ndim:
                warnings.warn(
                    f"unexpected index_chunk of size {index_chunk.size} received!"
                )
                return None

        dim_ranges = None
        if has_dim_ranges or has_dim_coords:
            dim_ranges = {}
            for i in range(input_cube_schema.ndim):
                dim_name = input_cube_schema.dims[i]
                if empty_call:
                    # No index information is available for the test call.
                    dim_ranges[dim_name] = ()
                else:
                    dim_ranges[dim_name] = (int(index_chunk[2 * i + 0]),
                                            int(index_chunk[2 * i + 1]))

        dim_coords = None
        if has_dim_coords:
            dim_coords = {}
            for coord_var_name, coord_var in input_cube_schema.coords.items():
                coord_slices = [slice(None)] * coord_var.ndim
                for i in range(input_cube_schema.ndim):
                    dim_name = input_cube_schema.dims[i]
                    if dim_name in coord_var.dims:
                        j = coord_var.dims.index(dim_name)
                        dim_range = dim_ranges[dim_name]
                        # slice() without arguments raises TypeError, so the
                        # empty test call selects an empty coordinate range.
                        coord_slices[j] = (slice(*dim_range) if dim_range
                                           else slice(0, 0))
                dim_coords[coord_var_name] = coord_var[tuple(
                    coord_slices)].values

        # Pass only those keyword arguments that cube_func declares.
        kwargs = {}
        if has_input_params:
            kwargs['input_params'] = input_params
        if has_dim_ranges:
            kwargs['dim_ranges'] = dim_ranges
        if has_dim_coords:
            kwargs['dim_coords'] = dim_coords

        return cube_func(*input_var_chunks, **kwargs)

    index_var = _gen_index_var(input_cube_schema)

    all_input_vars = [index_var] + input_vars

    input_core_dims = None
    if output_var_dims:
        input_core_dims = []
        has_warned = False
        for i, input_var in enumerate(all_input_vars):
            # Dimensions that do not appear in the output are core dimensions.
            var_core_dims = [
                dim for dim in input_var.dims if dim not in output_var_dims
            ]
            must_rechunk = False
            if var_core_dims and input_var.chunks:
                for var_core_dim in var_core_dims:
                    dim_index = input_var.dims.index(var_core_dim)
                    dim_chunk_size = input_var.chunks[dim_index][0]
                    dim_shape_size = input_var.shape[dim_index]
                    # A core dimension must consist of one single chunk.
                    if dim_chunk_size != dim_shape_size:
                        must_rechunk = True
                        break
            if must_rechunk:
                if not has_warned:
                    warnings.warn(
                        f'Input variables must not be chunked in dimension(s): {", ".join(var_core_dims)}.\n'
                        f'Rechunking applies, which may drastically decrease runtime performance '
                        f'and increase memory usage.')
                    has_warned = True
                all_input_vars[i] = input_var.chunk(
                    {var_core_dim: -1 for var_core_dim in var_core_dims})
            input_core_dims.append(var_core_dims)

    output_var = xr.apply_ufunc(cube_func_wrapper,
                                *all_input_vars,
                                dask='parallelized',
                                input_core_dims=input_core_dims,
                                output_dtypes=[output_var_dtype])
    if output_var_attrs:
        output_var.attrs.update(output_var_attrs)
    return xr.Dataset({output_var_name: output_var},
                      coords=input_cube_schema.coords)
def compute_cube(cube_func: CubeFunc,
                 *input_cubes: xr.Dataset,
                 input_cube_schema: CubeSchema = None,
                 input_var_names: Sequence[str] = None,
                 input_params: Dict[str, Any] = None,
                 output_var_name: str = 'output',
                 output_var_dtype: Any = np.float64,
                 output_var_attrs: Dict[str, Any] = None,
                 vectorize: bool = None,
                 cube_asserted: bool = False) -> xr.Dataset:
    """
    Compute a new output data cube with a single variable named
    *output_var_name* from variables named *input_var_names* contained in
    zero, one, or more input data cubes in *input_cubes* using a cube factory
    function *cube_func*.

    *cube_func* is called for each of the chunks of the input variables.
    It is expected to return a chunk block which is of type ``np.ndarray``.

    If *input_cubes* is not empty, *cube_func* receives variables as specified
    by *input_var_names*. If *input_cubes* is empty, *input_var_names* must be
    empty too, and *input_cube_schema* must be given, so that a new cube can
    be created.

    The full signature of *cube_func* is::

        def cube_func(*input_vars: np.ndarray,
                      input_params: Dict[str, Any] = None,
                      dim_coords: Dict[str, np.ndarray] = None,
                      dim_ranges: Dict[str, Tuple[int, int]] = None) -> np.ndarray:
            pass

    The arguments are:

    * ``input_vars``: the variables according to the given *input_var_names*;
    * ``input_params``: is this call's *input_params*, a mapping from
      parameter name to value;
    * ``dim_coords``: a mapping from dimension names to the current chunk's
      coordinate arrays;
    * ``dim_ranges``: a mapping from dimension names to the current chunk's
      index ranges.

    Only the ``input_vars`` argument is mandatory. The keyword arguments
    ``input_params``, ``dim_coords``, ``dim_ranges`` do not need to be present
    at all; they are passed only if *cube_func* declares them.

    :param cube_func: The cube factory function.
    :param input_cubes: An optional sequence of input cube datasets,
        must be provided if *input_cube_schema* is not.
    :param input_cube_schema: An optional input cube schema,
        must be provided if *input_cubes* is not.
    :param input_var_names: A sequence of variable names
    :param input_params: Optional dictionary with processing parameters
        passed to *cube_func*.
    :param output_var_name: Optional name of the output variable,
        defaults to ``'output'``.
    :param output_var_dtype: Optional numpy datatype of the output variable,
        defaults to ``np.float64``.
    :param output_var_attrs: Optional metadata attributes for the
        output variable.
    :param vectorize: Whether all *input_cubes* have the same variables which
        are concatenated and passed as vectors to *cube_func*.
        Not implemented yet.
    :param cube_asserted: If False, the *input_cubes* will be verified,
        otherwise they are expected to be valid cubes.
    :return: A new dataset that contains the computed output variable.
    :raise NotImplementedError: if *vectorize* is not None.
    :raise ValueError: if neither *input_cubes* nor *input_cube_schema* is
        given, or if a name in *input_var_names* is not found in any cube.
    """
    if vectorize is not None:
        raise NotImplementedError('vectorize is not supported yet')

    # Validate every input cube exactly once.
    if not cube_asserted:
        for cube in input_cubes:
            assert_cube(cube)

    if input_cubes:
        first_cube = input_cubes[0]
        input_cube_schema = CubeSchema.new(first_cube)
        for cube in input_cubes:
            # Identity check on purpose: "!=" on datasets is an element-wise
            # comparison that yields a dataset, not a boolean.
            if cube is not first_cube:
                # Deriving the schema at least ensures that one exists for
                # every additional cube.
                # TODO (forman): broadcast all cubes to same shape,
                #   rechunk to same chunks
                CubeSchema.new(cube)
    elif input_cube_schema is None:
        raise ValueError('input_cube_schema must be given')

    if output_var_name is None:
        output_var_name = 'output'

    # Collect the named input variables; the first cube containing a
    # variable wins.
    input_var_names = input_var_names or []
    input_vars = []
    for var_name in input_var_names:
        var = next((cube[var_name]
                    for cube in input_cubes
                    if var_name in cube.data_vars),
                   None)
        if var is None:
            raise ValueError(f'variable {var_name!r} not found in any of cubes')
        input_vars.append(var)

    # Find out which of the optional keyword arguments cube_func declares.
    has_input_params, has_dim_coords, has_dim_ranges = _inspect_cube_func(cube_func, input_var_names)

    def cube_func_wrapper(index_chunk, *input_var_chunks):
        index_chunk = index_chunk.ravel()
        # A (start, end) pair is required for every cube dimension. Chunks
        # that do not provide them are reported and skipped without calling
        # cube_func.
        if index_chunk.size < 2 * input_cube_schema.ndim:
            warnings.warn(f"weird index_chunk of size {index_chunk.size} received!")
            return None

        dim_ranges = None
        if has_dim_ranges or has_dim_coords:
            dim_ranges = {}
            for i in range(input_cube_schema.ndim):
                dim_name = input_cube_schema.dims[i]
                dim_ranges[dim_name] = (int(index_chunk[2 * i + 0]),
                                        int(index_chunk[2 * i + 1]))

        dim_coords = None
        if has_dim_coords:
            dim_coords = {}
            for coord_var_name, coord_var in input_cube_schema.coords.items():
                coord_slices = [slice(None)] * coord_var.ndim
                for i in range(input_cube_schema.ndim):
                    dim_name = input_cube_schema.dims[i]
                    if dim_name in coord_var.dims:
                        j = coord_var.dims.index(dim_name)
                        coord_slices[j] = slice(*dim_ranges[dim_name])
                dim_coords[coord_var_name] = coord_var[tuple(coord_slices)].values

        # Pass only those keyword arguments that cube_func declares.
        kwargs = {}
        if has_input_params:
            kwargs['input_params'] = input_params
        if has_dim_ranges:
            kwargs['dim_ranges'] = dim_ranges
        if has_dim_coords:
            kwargs['dim_coords'] = dim_coords

        return cube_func(*input_var_chunks, **kwargs)

    index_var = _gen_index_var(input_cube_schema)

    output_var = xr.apply_ufunc(cube_func_wrapper,
                                index_var,
                                *input_vars,
                                dask='parallelized',
                                output_dtypes=[output_var_dtype])
    if output_var_attrs:
        output_var.attrs.update(output_var_attrs)
    return xr.Dataset({output_var_name: output_var},
                      coords=input_cube_schema.coords)
def resample_in_time(cube: xr.Dataset,
                     frequency: str,
                     method: Union[str, Sequence[str]],
                     offset=None,
                     base: int = 0,
                     tolerance=None,
                     interp_kind=None,
                     time_chunk_size=None,
                     var_names: Sequence[str] = None,
                     metadata: Dict[str, Any] = None,
                     cube_asserted: bool = False) -> xr.Dataset:
    """
    Resample a xcube dataset in the time dimension.

    Every variable of the result is named ``"<var_name>_<method>"``, one for
    each input variable and each given resampling method.

    :param cube: The xcube dataset.
    :param frequency: Temporal aggregation frequency.
        Use format "<count><offset>" where <offset> is one of
        'H', 'D', 'W', 'M', 'Q', 'Y'.
    :param method: Resampling method or sequence of resampling methods.
    :param offset: Offset used to adjust the resampled time labels.
        Uses same syntax as *frequency*.
    :param base: For frequencies that evenly subdivide 1 day, the "origin"
        of the aggregated intervals. For example, for '24H' frequency,
        base could range from 0 through 23.
    :param tolerance: Time tolerance for selective upsampling methods.
        Defaults to *frequency*.
    :param interp_kind: Kind of interpolation if *method* is 'interpolation'.
    :param time_chunk_size: If it is a non-negative integer, the chunk size
        to be used for the "time" dimension.
    :param var_names: Variable names to include.
    :param metadata: Output metadata.
    :param cube_asserted: If False, *cube* will be verified,
        otherwise it is expected to be a valid cube.
    :return: A new xcube dataset resampled in time.
    """
    if not cube_asserted:
        assert_cube(cube)

    if var_names:
        cube = select_vars(cube, var_names)

    resampler = cube.resample(skipna=True,
                              closed='left',
                              label='left',
                              keep_attrs=True,
                              time=frequency,
                              loffset=offset,
                              base=base)

    # Normalise *method* into a list of method names
    method_names = [method] if isinstance(method, str) else list(method)

    # One resampled dataset per method, variables suffixed by the method name
    per_method_cubes = []
    for method_name in method_names:
        resample_func = getattr(resampler, method_name)
        method_kwargs = get_method_kwargs(method_name, frequency, interp_kind, tolerance)
        method_cube = resample_func(**method_kwargs)
        renamed_vars = {}
        for var_name in method_cube.data_vars:
            renamed_vars[var_name] = f'{var_name}_{method_name}'
        per_method_cubes.append(method_cube.rename(renamed_vars))

    if len(per_method_cubes) == 1:
        resampled_cube = per_method_cubes[0]
    else:
        resampled_cube = xr.merge(per_method_cubes)

    # TODO: add time_bnds to resampled_ds
    resampled_cube.attrs.update(metadata or {})
    # TODO: add other time_coverage_ attributes
    # The time coverage is taken from the source cube and always overrides
    # same-named entries of *metadata*.
    resampled_cube.attrs.update(time_coverage_start='%s' % cube.time[0],
                                time_coverage_end='%s' % cube.time[-1])

    # Start from the source cube's chunking, optionally overriding "time"
    schema = CubeSchema.new(cube)
    chunk_sizes = {}
    for axis in range(schema.ndim):
        chunk_sizes[schema.dims[axis]] = schema.chunks[axis]

    if isinstance(time_chunk_size, int) and time_chunk_size >= 0:
        chunk_sizes['time'] = time_chunk_size

    return resampled_cube.chunk(chunk_sizes)