def test_concat_multiindex(self):
    x = pd.MultiIndex.from_product([[1, 2, 3], ['a', 'b']])
    expected = Dataset({'x': x})
    actual = concat([expected.isel(x=slice(2)),
                     expected.isel(x=slice(2, None))], 'x')
    assert expected.equals(actual)
    assert isinstance(actual.x.to_index(), pd.MultiIndex)
def test_to_dask_dataframe(self):
    # Test conversion of Datasets to dask DataFrames
    x = da.from_array(np.random.randn(10), chunks=4)
    y = np.arange(10, dtype='uint8')
    t = list('abcdefghij')
    ds = Dataset(OrderedDict([('a', ('t', x)),
                              ('b', ('t', y)),
                              ('t', ('t', t))]))
    expected_pd = pd.DataFrame({'a': x, 'b': y},
                               index=pd.Index(t, name='t'))

    # test if 1-D index is correctly set up
    expected = dd.from_pandas(expected_pd, chunksize=4)
    actual = ds.to_dask_dataframe(set_index=True)
    # test if we have dask dataframes
    assert isinstance(actual, dd.DataFrame)
    # use the .equals from pandas to check dataframes are equivalent
    assert_frame_equal(expected.compute(), actual.compute())

    # test if no index is given
    expected = dd.from_pandas(expected_pd.reset_index(drop=False), chunksize=4)
    actual = ds.to_dask_dataframe(set_index=False)
    assert isinstance(actual, dd.DataFrame)
    assert_frame_equal(expected.compute(), actual.compute())
def test_open_and_do_math(self):
    original = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as tmp:
        original.to_netcdf(tmp)
        with open_mfdataset(tmp) as ds:
            actual = 1.0 * ds
            self.assertDatasetAllClose(original, actual)
def radec2azel(scale: xarray.Dataset,
               latlon: Tuple[float, float],
               time: datetime = None) -> xarray.Dataset:
    if latlon is None or not isinstance(scale, xarray.Dataset):
        return None

    if time is None:
        with fits.open(scale.filename, mode='readonly') as f:
            try:
                t = f[0].header['FRAME']  # TODO this only works from Solis?
            except KeyError:
                logging.error('no time given in file or manually, cannot compute az/el')
                return None
        time = parse(t)
        logging.info('using FITS header for time')
    elif isinstance(time, datetime):
        pass
    elif isinstance(time, (float, int)):  # assume UT1_Unix
        time = datetime.utcfromtimestamp(time)
    else:  # user override of frame time
        time = parse(time)

    print('image time:', time)

    # %% knowing camera location, time, and sky coordinates observed,
    #    convert to az/el for each pixel
    az, el = pymap3d.radec2azel(scale['ra'], scale['dec'], latlon[0], latlon[1], time)
    # %% collect output
    scale['az'] = (('y', 'x'), az)
    scale['el'] = (('y', 'x'), el)
    scale.attrs['lat'] = latlon[0]
    scale.attrs['lon'] = latlon[1]
    scale.attrs['time'] = time

    return scale
def adjust_temporal_attrs_impl(ds: xr.Dataset) -> xr.Dataset:
    """
    Adjust the global temporal attributes of the dataset by doing some
    introspection of the dataset and adjusting the appropriate attributes
    accordingly.

    In case the determined attributes do not exist in the dataset, these will
    be added.

    For more information on suggested global attributes see
    `Attribute Convention for Data Discovery
    <http://wiki.esipfed.org/index.php/Attribute_Convention_for_Data_Discovery>`_

    :param ds: Dataset to adjust
    :return: Adjusted dataset
    """
    temporal_attrs = _get_temporal_cf_attrs_from_var(ds)

    if temporal_attrs:
        ds = ds.copy()
        # Align temporal attributes with the ones from the shallow Dataset copy
        for key in temporal_attrs:
            if temporal_attrs[key] is not None:
                ds.attrs[key] = temporal_attrs[key]
            else:
                ds.attrs.pop(key, None)

    return ds
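# Illustrative sketch (not part of the library): the kind of ACDD temporal attributes
# that adjust_temporal_attrs_impl is expected to fill in, such as 'time_coverage_start'
# and 'time_coverage_end'. Computed here directly from a toy dataset's 'time' coordinate;
# the real introspection lives in _get_temporal_cf_attrs_from_var.
import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=12, freq='MS')
toy = xr.Dataset({'tas': ('time', np.random.rand(12))}, coords={'time': time})
toy.attrs['time_coverage_start'] = pd.Timestamp(toy.time.values[0]).isoformat()
toy.attrs['time_coverage_end'] = pd.Timestamp(toy.time.values[-1]).isoformat()
# e.g. {'time_coverage_start': '2000-01-01T00:00:00', 'time_coverage_end': '2000-12-01T00:00:00'}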
def test_roundtrip_object_dtype(self):
    floats = np.array([0.0, 0.0, 1.0, 2.0, 3.0], dtype=object)
    floats_nans = np.array([np.nan, np.nan, 1.0, 2.0, 3.0], dtype=object)
    letters = np.array(['ab', 'cdef', 'g'], dtype=object)
    letters_nans = np.array(['ab', 'cdef', np.nan], dtype=object)
    all_nans = np.array([np.nan, np.nan], dtype=object)
    original = Dataset({'floats': ('a', floats),
                        'floats_nans': ('a', floats_nans),
                        'letters': ('b', letters),
                        'letters_nans': ('b', letters_nans),
                        'all_nans': ('c', all_nans),
                        'nan': ([], np.nan)})
    expected = original.copy(deep=True)
    if isinstance(self, Only32BitTypes):
        # for netCDF3 tests, expect the results to come back as characters
        expected['letters_nans'] = expected['letters_nans'].astype('S')
        expected['letters'] = expected['letters'].astype('S')
    with self.roundtrip(original) as actual:
        try:
            self.assertDatasetIdentical(expected, actual)
        except AssertionError:
            # Most stores use '' for nans in strings, but some don't.
            # First try the ideal case (where the store returns exactly
            # the original Dataset), then try a more realistic case.
            # ScipyDataTest, NetCDF3ViaNetCDF4DataTest and NetCDF4DataTest
            # all end up using this case.
            expected['letters_nans'][-1] = ''
            self.assertDatasetIdentical(expected, actual)
def test_coordinates_encoding(self):
    def equals_latlon(obj):
        return obj == 'lat lon' or obj == 'lon lat'

    original = Dataset({'temp': ('x', [0, 1]), 'precip': ('x', [0, -1])},
                       {'lat': ('x', [2, 3]), 'lon': ('x', [4, 5])})
    with self.roundtrip(original) as actual:
        self.assertDatasetIdentical(actual, original)
    with create_tmp_file() as tmp_file:
        original.to_netcdf(tmp_file)
        with open_dataset(tmp_file, decode_coords=False) as ds:
            self.assertTrue(equals_latlon(ds['temp'].attrs['coordinates']))
            self.assertTrue(equals_latlon(ds['precip'].attrs['coordinates']))
            self.assertNotIn('coordinates', ds.attrs)
            self.assertNotIn('coordinates', ds['lat'].attrs)
            self.assertNotIn('coordinates', ds['lon'].attrs)

    modified = original.drop(['temp', 'precip'])
    with self.roundtrip(modified) as actual:
        self.assertDatasetIdentical(actual, modified)
    with create_tmp_file() as tmp_file:
        modified.to_netcdf(tmp_file)
        with open_dataset(tmp_file, decode_coords=False) as ds:
            self.assertTrue(equals_latlon(ds.attrs['coordinates']))
            self.assertNotIn('coordinates', ds['lat'].attrs)
            self.assertNotIn('coordinates', ds['lon'].attrs)
def test_roundtrip_strings_with_fill_value(self):
    values = np.array(['ab', 'cdef', np.nan], dtype=object)
    encoding = {'_FillValue': np.string_('X'), 'dtype': np.dtype('S1')}
    original = Dataset({'x': ('t', values, {}, encoding)})
    expected = original.copy(deep=True)
    expected['x'][:2] = values[:2].astype('S')
    with self.roundtrip(original) as actual:
        self.assertDatasetIdentical(expected, actual)

    original = Dataset({'x': ('t', values, {}, {'_FillValue': '\x00'})})
    if not isinstance(self, Only32BitTypes):
        # these stores can save unicode strings
        expected = original.copy(deep=True)
    if isinstance(self, BaseNetCDF4Test):
        # netCDF4 can't keep track of an empty _FillValue for VLEN
        # variables
        expected['x'][-1] = ''
    elif (isinstance(self, (NetCDF3ViaNetCDF4DataTest,
                            NetCDF4ClassicViaNetCDF4DataTest)) or
          (has_netCDF4 and type(self) is GenericNetCDFDataTest)):
        # netCDF4 can't keep track of an empty _FillValue for nc3, either:
        # https://github.com/Unidata/netcdf4-python/issues/273
        expected['x'][-1] = np.string_('')
    with self.roundtrip(original) as actual:
        self.assertDatasetIdentical(expected, actual)
def state_to_xarray(state):
    '''Convert a dictionary of climlab.Field objects to xarray.Dataset

    Input: dictionary of climlab.Field objects
    (e.g. process.state or process.diagnostics dictionary)

    Output: xarray.Dataset object with all spatial axes,
    including 'bounds' axes indicating cell boundaries in each spatial dimension.

    Any items in the dictionary that are not instances of climlab.Field
    are ignored.'''
    from climlab.domain.field import Field

    ds = Dataset()
    for name, field in state.items():
        if isinstance(field, Field):
            ds[name] = Field_to_xarray(field)
            dom = field.domain
            for axname, ax in dom.axes.items():
                bounds_name = axname + '_bounds'
                ds.coords[bounds_name] = DataArray(ax.bounds, dims=[bounds_name],
                                                   coords={bounds_name: ax.bounds})
                try:
                    ds[bounds_name].attrs['units'] = ax.units
                except AttributeError:
                    # the axis may not define units; skip the attribute in that case
                    pass
        else:
            warnings.warn('{} excluded from Dataset because it is not a Field variable.'.format(name))
    return ds
def _preprocess_dataset(self, ds: Dataset):
    # Convert specific data variables to coordinate variables
    for var_name in EXTRA_COORDS_VAR_NAMES:
        if var_name in ds.data_vars:
            ds.set_coords(var_name, inplace=True)
    # print(ds)
    return ds
def test_save_mfdataset_roundtrip(self):
    original = Dataset({'foo': ('x', np.random.randn(10))})
    datasets = [original.isel(x=slice(5)),
                original.isel(x=slice(5, 10))]
    with create_tmp_file() as tmp1:
        with create_tmp_file() as tmp2:
            save_mfdataset(datasets, [tmp1, tmp2])
            with open_mfdataset([tmp1, tmp2]) as actual:
                self.assertDatasetIdentical(actual, original)
def test_dask_layers_and_dependencies():
    ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk()

    x = dask.delayed(ds)
    assert set(x.__dask_graph__().dependencies).issuperset(
        ds.__dask_graph__().dependencies)
    assert set(x.foo.__dask_graph__().dependencies).issuperset(
        ds.__dask_graph__().dependencies)
def adjust_spatial_attrs_impl(ds: xr.Dataset, allow_point: bool) -> xr.Dataset:
    """
    Adjust the global spatial attributes of the dataset by doing some
    introspection of the dataset and adjusting the appropriate attributes
    accordingly.

    In case the determined attributes do not exist in the dataset, these will
    be added.

    For more information on suggested global attributes see
    `Attribute Convention for Data Discovery
    <http://wiki.esipfed.org/index.php/Attribute_Convention_for_Data_Discovery>`_

    :param ds: Dataset to adjust
    :param allow_point: Whether to accept single point cells
    :return: Adjusted dataset
    """
    copied = False

    for dim in ('lon', 'lat'):
        geo_spatial_attrs = _get_geo_spatial_cf_attrs_from_var(ds, dim, allow_point=allow_point)
        if geo_spatial_attrs:
            # Copy any new attributes into the shallow Dataset copy
            for key in geo_spatial_attrs:
                if geo_spatial_attrs[key] is not None:
                    if not copied:
                        ds = ds.copy()
                        copied = True
                    ds.attrs[key] = geo_spatial_attrs[key]

    lon_min = ds.attrs.get('geospatial_lon_min')
    lat_min = ds.attrs.get('geospatial_lat_min')
    lon_max = ds.attrs.get('geospatial_lon_max')
    lat_max = ds.attrs.get('geospatial_lat_max')

    if lon_min is not None and lat_min is not None and lon_max is not None and lat_max is not None:
        if not copied:
            ds = ds.copy()

        ds.attrs['geospatial_bounds'] = 'POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))'. \
            format(lon_min, lat_min, lon_min, lat_max, lon_max, lat_max,
                   lon_max, lat_min, lon_min, lat_min)

        # Determination of the following attributes from introspection in a general
        # way is ambiguous, hence it is safer to drop them than to risk preserving
        # out of date attributes.
        drop = ['geospatial_bounds_crs', 'geospatial_bounds_vertical_crs',
                'geospatial_vertical_min', 'geospatial_vertical_max',
                'geospatial_vertical_positive', 'geospatial_vertical_units',
                'geospatial_vertical_resolution']

        for key in drop:
            ds.attrs.pop(key, None)

    return ds
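# Hedged sketch: assuming _get_geo_spatial_cf_attrs_from_var has filled in the ACDD
# min/max attributes, the 'geospatial_bounds' string is built exactly as in the
# function above. The extents below are hypothetical values for illustration.
lon_min, lat_min, lon_max, lat_max = -10.0, 40.0, 10.0, 60.0
polygon = 'POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))'.format(
    lon_min, lat_min, lon_min, lat_max, lon_max, lat_max,
    lon_max, lat_min, lon_min, lat_min)
# -> 'POLYGON((-10.0 40.0, -10.0 60.0, 10.0 60.0, 10.0 40.0, -10.0 40.0))'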
def test_weakrefs(self):
    example = Dataset({'foo': ('x', np.arange(5.0))})
    expected = example.rename({'foo': 'bar', 'x': 'y'})

    with create_tmp_file() as tmp_file:
        example.to_netcdf(tmp_file, engine='scipy')
        on_disk = open_dataset(tmp_file, engine='pynio')
        actual = on_disk.rename({'foo': 'bar', 'x': 'y'})
        del on_disk  # trigger garbage collection
        self.assertDatasetIdentical(actual, expected)
def test_variable_order(self):
    # doesn't work with scipy or h5py :(
    ds = Dataset()
    ds['a'] = 1
    ds['z'] = 2
    ds['b'] = 3
    ds.coords['c'] = 4

    with self.roundtrip(ds) as actual:
        self.assertEqual(list(ds), list(actual))
def test_persist_Dataset(self):
    ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk()
    ds = ds + 1
    n = len(ds.foo.data.dask)

    ds2 = ds.persist()

    assert len(ds2.foo.data.dask) == 1
    assert len(ds.foo.data.dask) == n  # doesn't mutate in place
def test_dataset_pickle(self):
    ds1 = Dataset({'a': DataArray(build_dask_array())})
    ds1.compute()
    self.assertFalse(ds1['a']._in_memory)
    self.assertEquals(kernel_call_count, 1)
    ds2 = pickle.loads(pickle.dumps(ds1))
    self.assertEquals(kernel_call_count, 1)
    self.assertDatasetIdentical(ds1, ds2)
    self.assertFalse(ds1['a']._in_memory)
    self.assertFalse(ds2['a']._in_memory)
def test_concat_encoding(self):
    # Regression test for GH1297
    ds = Dataset({'foo': (['x', 'y'], np.random.random((2, 3))),
                  'bar': (['x', 'y'], np.random.random((2, 3)))},
                 {'x': [0, 1]})
    foo = ds['foo']
    foo.encoding = {"complevel": 5}
    ds.encoding = {"unlimited_dims": 'x'}
    assert concat([foo, foo], dim="x").encoding == foo.encoding
    assert concat([ds, ds], dim="x").encoding == ds.encoding
def test_concat_coords(self):
    data = Dataset({"foo": ("x", np.random.randn(10))})
    expected = data.assign_coords(c=("x", [0] * 5 + [1] * 5))
    objs = [data.isel(x=slice(5)).assign_coords(c=0),
            data.isel(x=slice(5, None)).assign_coords(c=1)]
    for coords in ["different", "all", ["c"]]:
        actual = concat(objs, dim="x", coords=coords)
        self.assertDatasetIdentical(expected, actual)
    for coords in ["minimal", []]:
        with self.assertRaisesRegexp(ValueError, "not equal across"):
            concat(objs, dim="x", coords=coords)
def diff(ds: xr.Dataset,
         ds2: xr.Dataset,
         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate the difference of two datasets (ds - ds2). This is done by
    matching variable names in the two datasets against each other and taking
    the difference of matching variables.

    If lat/lon/time extents differ between the datasets, the default behavior
    is to take the intersection of the datasets and run subtraction on that.
    However, broadcasting is possible. E.g. ds(lat/lon/time) - ds(lat/lon) is
    valid. In this case the subtrahend will be stretched to the size of
    ds(lat/lon/time) so that it can be subtracted. This also works if the
    subtrahend is a single time slice of arbitrary temporal position. In this
    case, the time dimension will be squeezed out, leaving a lat/lon dataset.

    :param ds: The minuend dataset
    :param ds2: The subtrahend dataset
    :param monitor: a progress monitor.
    :return: The difference dataset
    """
    try:
        # Times do not intersect
        if 0 == len(ds.time - ds2.time) and \
                len(ds.time) == len(ds2.time):  # Times are the same length
            # If the datasets don't intersect in time dimension, a naive difference
            # would return empty data variables. Hence, the time coordinate has to
            # be dropped beforehand
            ds = ds.drop('time')
            ds2 = ds2.drop('time')
            return ds - ds2
    except AttributeError:
        # It is likely that the one operand is a lat/lon array that can be
        # broadcast against the other operand
        pass

    try:
        if 1 == len(ds2.time):
            # The subtrahend is a single time-slice -> squeeze 'time' dimension to
            # be able to broadcast it along the minuend
            ds2 = ds2.squeeze('time', drop=True)
    except AttributeError:
        # Doesn't have a time dimension already
        pass
    except TypeError as e:
        if 'unsized object' in str(e):
            # The 'time' variable is a scalar
            pass
        else:
            raise TypeError(str(e))

    with monitor.observing("Subtract datasets"):
        diff = ds - ds2

    return diff
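# Minimal standalone sketch of the broadcasting case described in the docstring above,
# using plain xarray arithmetic (the real operator adds time-intersection handling and
# progress monitoring). Variable names here are hypothetical.
import numpy as np
import xarray as xr

minuend = xr.Dataset({'t2m': (('time', 'lat', 'lon'), np.ones((3, 2, 2)))},
                     coords={'time': [0, 1, 2], 'lat': [0., 1.], 'lon': [0., 1.]})
# single time slice of arbitrary temporal position, with 'time' squeezed out
subtrahend = minuend.isel(time=[1]).squeeze('time', drop=True)
result = minuend - subtrahend  # subtrahend is stretched along 'time' by broadcasting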
def test_basic_compute():
    ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk({'x': 2})
    for get in [dask.threaded.get, dask.multiprocessing.get,
                dask.local.get_sync, None]:
        with dask.set_options(get=get):
            ds.compute()
            ds.foo.compute()
            ds.foo.variable.compute()
def test_concat_coords(self):
    data = Dataset({'foo': ('x', np.random.randn(10))})
    expected = data.assign_coords(c=('x', [0] * 5 + [1] * 5))
    objs = [data.isel(x=slice(5)).assign_coords(c=0),
            data.isel(x=slice(5, None)).assign_coords(c=1)]
    for coords in ['different', 'all', ['c']]:
        actual = concat(objs, dim='x', coords=coords)
        self.assertDatasetIdentical(expected, actual)
    for coords in ['minimal', []]:
        with self.assertRaisesRegexp(ValueError, 'not equal across'):
            concat(objs, dim='x', coords=coords)
def test_to_dask_dataframe_2D_set_index(self):
    # This will fail until dask implements MultiIndex support
    w = da.from_array(np.random.randn(2, 3), chunks=(1, 2))
    ds = Dataset({'w': (('x', 'y'), w)})
    ds['x'] = ('x', np.array([0, 1], np.int64))
    ds['y'] = ('y', list('abc'))

    expected = ds.compute().to_dataframe()
    actual = ds.to_dask_dataframe(set_index=True)
    assert isinstance(actual, dd.DataFrame)
    assert_frame_equal(expected, actual.compute())
def test_preprocess_mfdataset(self):
    original = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as tmp:
        original.to_netcdf(tmp)

        def preprocess(ds):
            return ds.assign_coords(z=0)

        expected = preprocess(original)
        with open_mfdataset(tmp, preprocess=preprocess) as actual:
            self.assertDatasetIdentical(expected, actual)
def test_simultaneous_compute(self):
    ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk()

    count = [0]

    def counting_get(*args, **kwargs):
        count[0] += 1
        return dask.get(*args, **kwargs)

    ds.load(get=counting_get)
    assert count[0] == 1
def _normalize_lon_360(ds: xr.Dataset) -> xr.Dataset:
    """
    Fix the longitude of the given dataset ``ds`` so that it ranges from -180 to +180 degrees.

    :param ds: The dataset whose longitudes may be given in the range 0 to 360.
    :return: The fixed dataset or the original dataset.
    """

    if 'lon' not in ds.coords:
        return ds

    lon_var = ds.coords['lon']

    if len(lon_var.shape) != 1:
        return ds

    lon_size = lon_var.shape[0]
    if lon_size < 2:
        return ds

    lon_size_05 = lon_size // 2
    lon_values = lon_var.values
    if not np.any(lon_values[lon_size_05:] > 180.):
        return ds

    delta_lon = lon_values[1] - lon_values[0]

    var_names = [var_name for var_name in ds.data_vars]

    ds = ds.assign_coords(lon=xr.DataArray(np.linspace(-180. + 0.5 * delta_lon,
                                                       +180. - 0.5 * delta_lon,
                                                       lon_size),
                                           dims=ds['lon'].dims,
                                           attrs=dict(long_name='longitude',
                                                      standard_name='longitude',
                                                      units='degrees east')))

    ds = adjust_spatial_attrs_impl(ds, True)

    new_vars = dict()
    for var_name in var_names:
        var = ds[var_name]
        if len(var.dims) >= 1 and var.dims[-1] == 'lon':
            values = np.copy(var.values)
            temp = np.copy(values[..., : lon_size_05])
            values[..., : lon_size_05] = values[..., lon_size_05:]
            values[..., lon_size_05:] = temp
            # import matplotlib.pyplot as plt
            # im = values[(len(values.shape) - 2) * [0] + [slice(None), slice(None)]]
            # plt.imshow(im)
            new_vars[var_name] = xr.DataArray(values, dims=var.dims,
                                              attrs=var.attrs, encoding=var.encoding)

    return ds.assign(**new_vars)
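# Toy numpy-only illustration of the half-swap performed above: data stored on
# lon = 0..360 is reordered so it lines up with a cell-centred lon = -180..180 axis.
import numpy as np

lon_360 = np.array([0., 90., 180., 270.])
values = np.array([10., 11., 12., 13.])        # one value per longitude
half = lon_360.size // 2
rolled = np.concatenate([values[half:], values[:half]])          # -> [12., 13., 10., 11.]
lon_180 = np.linspace(-180. + 45., 180. - 45., lon_360.size)     # -> [-135., -45., 45., 135.]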
def test_to_dask_dataframe_no_coordinate(self):
    x = da.from_array(np.random.randn(10), chunks=4)
    ds = Dataset({'x': ('dim_0', x)})

    expected = ds.compute().to_dataframe().reset_index()
    actual = ds.to_dask_dataframe()
    assert isinstance(actual, dd.DataFrame)
    assert_frame_equal(expected, actual.compute())

    expected = ds.compute().to_dataframe()
    actual = ds.to_dask_dataframe(set_index=True)
    assert isinstance(actual, dd.DataFrame)
    assert_frame_equal(expected, actual.compute())
def test_open_dataset(self):
    original = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as tmp:
        original.to_netcdf(tmp)
        with open_dataset(tmp, chunks={'x': 5}) as actual:
            self.assertIsInstance(actual.foo.variable.data, da.Array)
            self.assertEqual(actual.foo.variable.data.chunks, ((5, 5),))
            self.assertDatasetIdentical(original, actual)
        with open_dataset(tmp, chunks=5) as actual:
            self.assertDatasetIdentical(original, actual)
        with open_dataset(tmp) as actual:
            self.assertIsInstance(actual.foo.variable.data, np.ndarray)
            self.assertDatasetIdentical(original, actual)
def test_simultaneous_compute(self):
    ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk()

    count = [0]

    def counting_get(*args, **kwargs):
        count[0] += 1
        return dask.get(*args, **kwargs)

    with dask.set_options(get=counting_get):
        ds.load()
    self.assertEqual(count[0], 1)
def test_lock(self):
    original = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as tmp:
        original.to_netcdf(tmp, format='NETCDF3_CLASSIC')
        with open_dataset(tmp, chunks=10) as ds:
            task = ds.foo.data.dask[ds.foo.data.name, 0]
            self.assertIsInstance(task[-1], type(Lock()))
        with open_mfdataset(tmp) as ds:
            task = ds.foo.data.dask[ds.foo.data.name, 0]
            self.assertIsInstance(task[-1], type(Lock()))
        with open_mfdataset(tmp, engine='scipy') as ds:
            task = ds.foo.data.dask[ds.foo.data.name, 0]
            self.assertNotIsInstance(task[-1], type(Lock()))
def test_combine_coords_join(self, join, expected):
    objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1], "y": [1]})]
    actual = combine_nested(objs, concat_dim="x", join=join)
    assert_identical(expected, actual)
def test_invalid_units_raises_eagerly(self):
    ds = Dataset({'time': ('time', [0, 1], {'units': 'foobar since 123'})})
    with self.assertRaisesRegexp(ValueError, 'unable to decode time'):
        decode_cf(ds)
def test_lazy_dataset(self):
    lazy_ds = Dataset({'foo': (('x', 'y'), self.data)})
    self.assertIsInstance(lazy_ds.foo.variable.data, da.Array)
def test_no_dimension_coords(self):
    ds0 = Dataset({"foo": ("x", [0, 1])})
    ds1 = Dataset({"foo": ("x", [2, 3])})
    with raises_regex(ValueError, "Could not find any dimension"):
        _infer_concat_order_from_coords([ds1, ds0])
def test_concat_promote_shape(self):
    # mixed dims within variables
    objs = [Dataset({}, {'x': 0}), Dataset({'x': [1]})]
    actual = concat(objs, 'x')
    expected = Dataset({'x': [0, 1]})
    assert_identical(actual, expected)

    objs = [Dataset({'x': [0]}), Dataset({}, {'x': 1})]
    actual = concat(objs, 'x')
    assert_identical(actual, expected)

    # mixed dims between variables
    objs = [Dataset({'x': [2], 'y': 3}), Dataset({'x': [4], 'y': 5})]
    actual = concat(objs, 'x')
    expected = Dataset({'x': [2, 4], 'y': ('x', [3, 5])})
    assert_identical(actual, expected)

    # mixed dims in coord variable
    objs = [Dataset({'x': [0]}, {'y': -1}),
            Dataset({'x': [1]}, {'y': ('x', [-2])})]
    actual = concat(objs, 'x')
    expected = Dataset({'x': [0, 1]}, {'y': ('x', [-1, -2])})
    assert_identical(actual, expected)

    # scalars with mixed lengths along concat dim -- values should repeat
    objs = [Dataset({'x': [0]}, {'y': -1}),
            Dataset({'x': [1, 2]}, {'y': -2})]
    actual = concat(objs, 'x')
    expected = Dataset({'x': [0, 1, 2]}, {'y': ('x', [-1, -2, -2])})
    assert_identical(actual, expected)

    # broadcast 1d x 1d -> 2d
    objs = [Dataset({'z': ('x', [-1])}, {'x': [0], 'y': [0]}),
            Dataset({'z': ('y', [1])}, {'x': [1], 'y': [0]})]
    actual = concat(objs, 'x')
    expected = Dataset({'z': (('x', 'y'), [[-1], [1]])},
                       {'x': [0, 1], 'y': [0]})
    assert_identical(actual, expected)
def test_empty_input(self):
    assert_identical(Dataset(), combine_by_coords([]))
def rolling_cumsum(ds: xr.Dataset, rolling_window: int = 3) -> xr.Dataset:
    ds_window = (ds.rolling(time=rolling_window, center=True)
                 .sum()
                 .dropna(dim='time', how='all'))
    return ds_window
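# Hedged usage sketch for rolling_cumsum above: a centred 3-step rolling sum over
# 'time', with the all-NaN edge steps dropped. The toy dataset and values are
# illustrative only.
import numpy as np
import pandas as pd
import xarray as xr

toy = xr.Dataset({'precip': ('time', np.arange(6, dtype=float))},
                 coords={'time': pd.date_range('2000-01-01', periods=6, freq='MS')})
windowed = rolling_cumsum(toy, rolling_window=3)
# 'precip' becomes [3., 6., 9., 12.] on the four interior time steps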
def _create_lookup_table(self, xr: xarray.Dataset):
    lookup = []
    if not self._disable_pbar:
        LOGGER.info("Create lookup table and convert to pytorch tensor")
    for basin in tqdm(self.basins, file=sys.stdout, disable=self._disable_pbar):

        # store data of each frequency as numpy array of shape [time steps, features]
        x_d, x_s, y = {}, {}, {}

        # keys: frequencies, values: array mapping each lowest-frequency
        # sample to its corresponding sample in this frequency
        frequency_maps = {}
        lowest_freq = utils.sort_frequencies(self.frequencies)[0]

        # converting from xarray to pandas DataFrame because resampling is much faster in pandas.
        df_native = xr.sel(basin=basin).to_dataframe()
        for freq in self.frequencies:
            if isinstance(self.cfg.dynamic_inputs, list):
                dynamic_cols = self.cfg.dynamic_inputs
            else:
                dynamic_cols = self.cfg.dynamic_inputs[freq]

            df_resampled = df_native[dynamic_cols + self.cfg.target_variables +
                                     self.cfg.static_inputs].resample(freq).mean()
            x_d[freq] = df_resampled[dynamic_cols].values
            y[freq] = df_resampled[self.cfg.target_variables].values
            if self.cfg.static_inputs:
                x_s[freq] = df_resampled[self.cfg.static_inputs].values

            # number of frequency steps in one lowest-frequency step
            frequency_factor = pd.to_timedelta(lowest_freq) // pd.to_timedelta(freq)
            # array position i is the last entry of this frequency that belongs to the lowest-frequency sample i.
            frequency_maps[freq] = np.arange(len(df_resampled) // frequency_factor) \
                * frequency_factor + (frequency_factor - 1)

        # store first date of sequence to be able to restore dates during inference
        if not self.is_train:
            self.period_starts[basin] = pd.to_datetime(xr.sel(basin=basin)["date"].values[0])

        # we can ignore the deprecation warning about lists because we don't use the passed lists
        # after the validate_samples call. The alternative numba.typed.Lists is still experimental.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)

            # checks inputs and outputs for each sequence. valid: flag = 1, invalid: flag = 0
            # manually unroll the dicts into lists to make sure the order of frequencies is consistent.
            # during inference, we want all samples with sufficient history (even if input is NaN), so
            # we pass x_d, x_s, y as None.
            flag = validate_samples(x_d=[x_d[freq] for freq in self.frequencies] if self.is_train else None,
                                    x_s=[x_s[freq] for freq in self.frequencies] if self.is_train and x_s else None,
                                    y=[y[freq] for freq in self.frequencies] if self.is_train else None,
                                    frequency_maps=[frequency_maps[freq] for freq in self.frequencies],
                                    seq_length=self.seq_len,
                                    predict_last_n=self._predict_last_n)

        valid_samples = np.argwhere(flag == 1)
        for f in valid_samples:
            # store pointer to basin and the sample's index in each frequency
            lookup.append((basin, [frequency_maps[freq][int(f)] for freq in self.frequencies]))

        self.x_d[basin] = {freq: torch.from_numpy(_x_d.astype(np.float32)) for freq, _x_d in x_d.items()}
        self.y[basin] = {freq: torch.from_numpy(_y.astype(np.float32)) for freq, _y in y.items()}
        if x_s:
            self.x_s[basin] = {freq: torch.from_numpy(_x_s.astype(np.float32)) for freq, _x_s in x_s.items()}

    self.lookup_table = {i: elem for i, elem in enumerate(lookup)}
    self.num_samples = len(self.lookup_table)
def test_min_count_dataset(func):
    da = construct_dataarray(2, dtype=float, contains_nan=True, dask=False)
    ds = Dataset({'var1': da}, coords={'scalar': 0})
    actual = getattr(ds, func)(dim='x', skipna=True, min_count=3)['var1']
    expected = getattr(ds['var1'], func)(dim='x', skipna=True, min_count=3)
    assert_allclose(actual, expected)
def test_invalid_time_units_raises_eagerly(self) -> None:
    ds = Dataset({"time": ("time", [0, 1], {"units": "foobar since 123"})})
    with pytest.raises(ValueError, match=r"unable to decode time"):
        decode_cf(ds)
def test_concat_dim_is_variable(self):
    objs = [Dataset({'x': 0}), Dataset({'x': 1})]
    coord = Variable('y', [3, 4])
    expected = Dataset({'x': ('y', [0, 1]), 'y': [3, 4]})
    actual = concat(objs, coord)
    assert_identical(actual, expected)
def test_concat_loads_variables(self):
    # Test that concat() computes not-in-memory variables at most once
    # and loads them in the output, while leaving the input unaltered.
    d1 = build_dask_array('d1')
    c1 = build_dask_array('c1')
    d2 = build_dask_array('d2')
    c2 = build_dask_array('c2')
    d3 = build_dask_array('d3')
    c3 = build_dask_array('c3')
    # Note: c is a non-index coord.
    # Index coords are loaded by IndexVariable.__init__.
    ds1 = Dataset(data_vars={'d': ('x', d1)}, coords={'c': ('x', c1)})
    ds2 = Dataset(data_vars={'d': ('x', d2)}, coords={'c': ('x', c2)})
    ds3 = Dataset(data_vars={'d': ('x', d3)}, coords={'c': ('x', c3)})

    assert kernel_call_count == 0
    out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='different',
                    coords='different')
    # each kernel is computed exactly once
    assert kernel_call_count == 6
    # variables are loaded in the output
    assert isinstance(out['d'].data, np.ndarray)
    assert isinstance(out['c'].data, np.ndarray)

    out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='all', coords='all')
    # no extra kernel calls
    assert kernel_call_count == 6
    assert isinstance(out['d'].data, dask.array.Array)
    assert isinstance(out['c'].data, dask.array.Array)

    out = xr.concat([ds1, ds2, ds3], dim='n', data_vars=['d'], coords=['c'])
    # no extra kernel calls
    assert kernel_call_count == 6
    assert isinstance(out['d'].data, dask.array.Array)
    assert isinstance(out['c'].data, dask.array.Array)

    out = xr.concat([ds1, ds2, ds3], dim='n', data_vars=[], coords=[])
    # variables are loaded once as we are validating that they're identical
    assert kernel_call_count == 12
    assert isinstance(out['d'].data, np.ndarray)
    assert isinstance(out['c'].data, np.ndarray)

    out = xr.concat([ds1, ds2, ds3], dim='n', data_vars='different',
                    coords='different', compat='identical')
    # compat=identical doesn't do any more kernel calls than compat=equals
    assert kernel_call_count == 18
    assert isinstance(out['d'].data, np.ndarray)
    assert isinstance(out['c'].data, np.ndarray)

    # When the test for different turns true halfway through,
    # stop computing variables as it would not have any benefit
    ds4 = Dataset(data_vars={'d': ('x', [2.0])}, coords={'c': ('x', [2.0])})
    out = xr.concat([ds1, ds2, ds4, ds3], dim='n', data_vars='different',
                    coords='different')
    # the variables of ds1 and ds2 were computed, but those of ds3 didn't
    assert kernel_call_count == 22
    assert isinstance(out['d'].data, dask.array.Array)
    assert isinstance(out['c'].data, dask.array.Array)

    # the data of ds1 and ds2 was loaded into numpy and then
    # concatenated to the data of ds3. Thus, only ds3 is computed now.
    out.compute()
    assert kernel_call_count == 24

    # Finally, test that originals are unaltered
    assert ds1['d'].data is d1
    assert ds1['c'].data is c1
    assert ds2['d'].data is d2
    assert ds2['c'].data is c2
    assert ds3['d'].data is d3
    assert ds3['c'].data is c3
def test_nested_concat_too_many_dims_at_once(self):
    objs = [Dataset({"x": [0], "y": [1]}), Dataset({"y": [0], "x": [1]})]
    with pytest.raises(ValueError, match="not equal across datasets"):
        combine_nested(objs, concat_dim="x", coords="minimal")
def test_nested_concat(self):
    objs = [Dataset({"x": [0]}), Dataset({"x": [1]})]
    expected = Dataset({"x": [0, 1]})
    actual = combine_nested(objs, concat_dim="x")
    assert_identical(expected, actual)
    actual = combine_nested(objs, concat_dim=["x"])
    assert_identical(expected, actual)
    actual = combine_nested([actual], concat_dim=None)
    assert_identical(expected, actual)
    actual = combine_nested([actual], concat_dim="x")
    assert_identical(expected, actual)

    objs = [Dataset({"x": [0, 1]}), Dataset({"x": [2]})]
    actual = combine_nested(objs, concat_dim="x")
    expected = Dataset({"x": [0, 1, 2]})
    assert_identical(expected, actual)

    # ensure combine_nested handles non-sorted variables
    objs = [Dataset({"x": ("a", [0]), "y": ("a", [0])}),
            Dataset({"y": ("a", [1]), "x": ("a", [1])})]
    actual = combine_nested(objs, concat_dim="a")
    expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1])})
    assert_identical(expected, actual)

    objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1]})]
    actual = combine_nested(objs, concat_dim="x")
    expected = Dataset({"x": [0, 1], "y": [0]})
    assert_identical(expected, actual)
def test_combine_by_coords(self):
    objs = [Dataset({"x": [0]}), Dataset({"x": [1]})]
    actual = combine_by_coords(objs)
    expected = Dataset({"x": [0, 1]})
    assert_identical(expected, actual)

    actual = combine_by_coords([actual])
    assert_identical(expected, actual)

    objs = [Dataset({"x": [0, 1]}), Dataset({"x": [2]})]
    actual = combine_by_coords(objs)
    expected = Dataset({"x": [0, 1, 2]})
    assert_identical(expected, actual)

    # ensure auto_combine handles non-sorted variables
    objs = [Dataset({"x": ("a", [0]), "y": ("a", [0]), "a": [0]}),
            Dataset({"x": ("a", [1]), "y": ("a", [1]), "a": [1]})]
    actual = combine_by_coords(objs)
    expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1]), "a": [0, 1]})
    assert_identical(expected, actual)

    objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})]
    actual = combine_by_coords(objs)
    expected = Dataset({"x": [0, 1], "y": [0, 1]})
    assert_equal(actual, expected)

    objs = [Dataset({"x": 0}), Dataset({"x": 1})]
    with raises_regex(ValueError, "Could not find any dimension coordinates"):
        combine_by_coords(objs)

    objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})]
    with raises_regex(ValueError, "Every dimension needs a coordinate"):
        combine_by_coords(objs)

def test_empty_input(self):
    assert_identical(Dataset(), combine_by_coords([]))
def create_bout_ds(syn_data_type='random', lengths=(6, 2, 4, 7), num=0, nxpe=1,
                   nype=1, xproc=0, yproc=0, guards={}):
    # Set the shape of the data in this dataset
    t_length, x_length, y_length, z_length = lengths
    mxg = guards.get('x', 0)
    myg = guards.get('y', 0)
    x_length += 2*mxg
    y_length += 2*myg
    shape = (t_length, x_length, y_length, z_length)

    # calculate global nx, ny and nz
    nx = nxpe*lengths[1] + 2*mxg
    ny = nype*lengths[2]
    nz = 1*lengths[3]

    # Fill with some kind of synthetic data
    if syn_data_type == 'random':
        # Each dataset contains unique random noise
        np.random.seed(seed=num)
        data = np.random.randn(*shape)
    elif syn_data_type == 'linear':
        # Variables increase linearly across entire domain
        data = DataArray(-np.ones(shape), dims=('t', 'x', 'y', 'z'))

        t_array = DataArray((nx - 2*mxg)*ny*nz*np.arange(t_length, dtype=float), dims='t')
        x_array = DataArray(ny*nz*(xproc*lengths[1] + mxg + np.arange(lengths[1], dtype=float)), dims='x')
        y_array = DataArray(nz*(yproc*lengths[2] + myg + np.arange(lengths[2], dtype=float)), dims='y')
        z_array = DataArray(np.arange(z_length, dtype=float), dims='z')

        data[:, mxg:x_length-mxg, myg:y_length-myg, :] = (
            t_array + x_array + y_array + z_array
        )
    elif syn_data_type == 'stepped':
        # Each dataset contains a different number depending on the filename
        data = np.ones(shape) * num
    elif isinstance(syn_data_type, int):
        data = np.ones(shape) * syn_data_type
    else:
        raise ValueError('Not a recognised choice of type of synthetic bout data.')

    T = DataArray(data, dims=['t', 'x', 'y', 'z'])
    n = DataArray(data, dims=['t', 'x', 'y', 'z'])
    ds = Dataset({'n': n, 'T': T})

    # BOUT_VERSION needed so that we know that number of points in z is MZ,
    # not MZ-1 (as it was in BOUT++ before v4.0)
    ds['BOUT_VERSION'] = 4.3

    # Include grid data
    ds['NXPE'] = nxpe
    ds['NYPE'] = nype
    ds['NZPE'] = 1
    ds['PE_XIND'] = xproc
    ds['PE_YIND'] = yproc
    ds['MYPE'] = num

    ds['MXG'] = mxg
    ds['MYG'] = myg
    ds['nx'] = nx
    ds['ny'] = ny
    ds['nz'] = nz
    ds['MZ'] = 1*lengths[3]
    ds['MXSUB'] = lengths[1]
    ds['MYSUB'] = lengths[2]
    ds['MZSUB'] = lengths[3]
    ds['ixseps1'] = nx
    ds['ixseps2'] = nx
    ds['jyseps1_1'] = 0
    ds['jyseps1_2'] = ny
    ds['jyseps2_1'] = ny//2 - 1
    ds['jyseps2_2'] = ny//2 - 1
    ds['ny_inner'] = ny//2

    one = DataArray(np.ones((x_length, y_length)), dims=['x', 'y'])
    zero = DataArray(np.zeros((x_length, y_length)), dims=['x', 'y'])

    ds['zperiod'] = 1
    ds['ZMIN'] = 0.
    ds['ZMAX'] = 2.*np.pi
    ds['g11'] = one
    ds['g22'] = one
    ds['g33'] = one
    ds['g12'] = zero
    ds['g13'] = zero
    ds['g23'] = zero
    ds['g_11'] = one
    ds['g_22'] = one
    ds['g_33'] = one
    ds['g_12'] = zero
    ds['g_13'] = zero
    ds['g_23'] = zero
    ds['G1'] = zero
    ds['G2'] = zero
    ds['G3'] = zero
    ds['J'] = one
    ds['Bxy'] = one
    ds['zShift'] = zero

    ds['dx'] = 0.5*one
    ds['dy'] = 2.*one
    ds['dz'] = 0.7

    ds['iteration'] = t_length
    ds['t_array'] = DataArray(np.arange(t_length, dtype=float)*10., dims='t')

    return ds
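# Hedged usage sketch: build one synthetic BOUT++ processor file with linearly
# increasing data. All keyword arguments exist in create_bout_ds's signature above;
# the particular values are arbitrary.
example_ds = create_bout_ds(syn_data_type='linear', lengths=(6, 2, 4, 7),
                            num=0, nxpe=2, nype=1, xproc=1, yproc=0, guards={'x': 2})
assert example_ds['n'].dims == ('t', 'x', 'y', 'z')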
class TestCombineAuto: def test_combine_by_coords(self): objs = [Dataset({"x": [0]}), Dataset({"x": [1]})] actual = combine_by_coords(objs) expected = Dataset({"x": [0, 1]}) assert_identical(expected, actual) actual = combine_by_coords([actual]) assert_identical(expected, actual) objs = [Dataset({"x": [0, 1]}), Dataset({"x": [2]})] actual = combine_by_coords(objs) expected = Dataset({"x": [0, 1, 2]}) assert_identical(expected, actual) # ensure auto_combine handles non-sorted variables objs = [ Dataset({ "x": ("a", [0]), "y": ("a", [0]), "a": [0] }), Dataset({ "x": ("a", [1]), "y": ("a", [1]), "a": [1] }), ] actual = combine_by_coords(objs) expected = Dataset({ "x": ("a", [0, 1]), "y": ("a", [0, 1]), "a": [0, 1] }) assert_identical(expected, actual) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})] actual = combine_by_coords(objs) expected = Dataset({"x": [0, 1], "y": [0, 1]}) assert_equal(actual, expected) objs = [Dataset({"x": 0}), Dataset({"x": 1})] with raises_regex(ValueError, "Could not find any dimension coordinates"): combine_by_coords(objs) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] with raises_regex(ValueError, "Every dimension needs a coordinate"): combine_by_coords(objs) def test_empty_input(self): assert_identical(Dataset(), combine_by_coords([])) @pytest.mark.parametrize( "join, expected", [ ("outer", Dataset({ "x": [0, 1], "y": [0, 1] })), ("inner", Dataset({ "x": [0, 1], "y": [] })), ("left", Dataset({ "x": [0, 1], "y": [0] })), ("right", Dataset({ "x": [0, 1], "y": [1] })), ], ) def test_combine_coords_join(self, join, expected): objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1], "y": [1]})] actual = combine_nested(objs, concat_dim="x", join=join) assert_identical(expected, actual) def test_combine_coords_join_exact(self): objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1], "y": [1]})] with raises_regex(ValueError, "indexes along dimension"): combine_nested(objs, concat_dim="x", join="exact") @pytest.mark.parametrize( "combine_attrs, expected", [ ("drop", Dataset({ "x": [0, 1], "y": [0, 1] }, attrs={})), ( "no_conflicts", Dataset({ "x": [0, 1], "y": [0, 1] }, attrs={ "a": 1, "b": 2 }), ), ("override", Dataset({ "x": [0, 1], "y": [0, 1] }, attrs={"a": 1})), ], ) def test_combine_coords_combine_attrs(self, combine_attrs, expected): objs = [ Dataset({ "x": [0], "y": [0] }, attrs={"a": 1}), Dataset({ "x": [1], "y": [1] }, attrs={ "a": 1, "b": 2 }), ] actual = combine_nested(objs, concat_dim="x", join="outer", combine_attrs=combine_attrs) assert_identical(expected, actual) if combine_attrs == "no_conflicts": objs[1].attrs["a"] = 2 with raises_regex(ValueError, "combine_attrs='no_conflicts'"): actual = combine_nested(objs, concat_dim="x", join="outer", combine_attrs=combine_attrs) def test_combine_coords_combine_attrs_identical(self): objs = [ Dataset({ "x": [0], "y": [0] }, attrs={"a": 1}), Dataset({ "x": [1], "y": [1] }, attrs={"a": 1}), ] expected = Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1}) actual = combine_nested(objs, concat_dim="x", join="outer", combine_attrs="identical") assert_identical(expected, actual) objs[1].attrs["b"] = 2 with raises_regex(ValueError, "combine_attrs='identical'"): actual = combine_nested(objs, concat_dim="x", join="outer", combine_attrs="identical") def test_combine_nested_combine_attrs_drop_conflicts(self): objs = [ Dataset({ "x": [0], "y": [0] }, attrs={ "a": 1, "b": 2, "c": 3 }), Dataset({ "x": [1], "y": [1] }, attrs={ "a": 1, "b": 0, "d": 3 }), ] expected = Dataset({ "x": [0, 1], "y": 
[0, 1] }, attrs={ "a": 1, "c": 3, "d": 3 }) actual = combine_nested(objs, concat_dim="x", join="outer", combine_attrs="drop_conflicts") assert_identical(expected, actual) def test_infer_order_from_coords(self): data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] actual = combine_by_coords(objs) expected = data assert expected.broadcast_equals(actual) def test_combine_leaving_bystander_dimensions(self): # Check non-monotonic bystander dimension coord doesn't raise # ValueError on combine (https://github.com/pydata/xarray/issues/3150) ycoord = ["a", "c", "b"] data = np.random.rand(7, 3) ds1 = Dataset( data_vars=dict(data=(["x", "y"], data[:3, :])), coords=dict(x=[1, 2, 3], y=ycoord), ) ds2 = Dataset( data_vars=dict(data=(["x", "y"], data[3:, :])), coords=dict(x=[4, 5, 6, 7], y=ycoord), ) expected = Dataset( data_vars=dict(data=(["x", "y"], data)), coords=dict(x=[1, 2, 3, 4, 5, 6, 7], y=ycoord), ) actual = combine_by_coords((ds1, ds2)) assert_identical(expected, actual) def test_combine_by_coords_previously_failed(self): # In the above scenario, one file is missing, containing the data for # one year's data for one variable. datasets = [ Dataset({ "a": ("x", [0]), "x": [0] }), Dataset({ "b": ("x", [0]), "x": [0] }), Dataset({ "a": ("x", [1]), "x": [1] }), ] expected = Dataset({ "a": ("x", [0, 1]), "b": ("x", [0, np.nan]) }, {"x": [0, 1]}) actual = combine_by_coords(datasets) assert_identical(expected, actual) def test_combine_by_coords_still_fails(self): # concat can't handle new variables (yet): # https://github.com/pydata/xarray/issues/508 datasets = [ Dataset({"x": 0}, {"y": 0}), Dataset({"x": 1}, { "y": 1, "z": 1 }) ] with pytest.raises(ValueError): combine_by_coords(datasets, "y") def test_combine_by_coords_no_concat(self): objs = [Dataset({"x": 0}), Dataset({"y": 1})] actual = combine_by_coords(objs) expected = Dataset({"x": 0, "y": 1}) assert_identical(expected, actual) objs = [Dataset({"x": 0, "y": 1}), Dataset({"y": np.nan, "z": 2})] actual = combine_by_coords(objs) expected = Dataset({"x": 0, "y": 1, "z": 2}) assert_identical(expected, actual) def test_check_for_impossible_ordering(self): ds0 = Dataset({"x": [0, 1, 5]}) ds1 = Dataset({"x": [2, 3]}) with raises_regex( ValueError, "does not have monotonic global indexes along dimension x"): combine_by_coords([ds1, ds0]) def test_combine_by_coords_incomplete_hypercube(self): # test that this succeeds with default fill_value x1 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [0]}) x2 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [1], "x": [0]}) x3 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [1]}) actual = combine_by_coords([x1, x2, x3]) expected = Dataset( {"a": (("y", "x"), [[1, 1], [1, np.nan]])}, coords={ "y": [0, 1], "x": [0, 1] }, ) assert_identical(expected, actual) # test that this fails if fill_value is None with pytest.raises(ValueError): combine_by_coords([x1, x2, x3], fill_value=None)
def postprocess(self, frame: xr.Dataset):
    import arpes.xarray_extensions  # pylint: disable=unused-import, redefined-outer-name

    frame = super().postprocess(frame)
    return frame.assign_attrs(frame.S.spectrum.attrs)
def test_empty_input(self):
    assert_identical(Dataset(), combine_nested([], concat_dim="x"))
def postprocess_final(self, data: xr.Dataset, scan_desc: dict = None):
    # attach the 'spectrum_type'
    # TODO move this logic into xarray extensions and customize here
    # only as necessary
    coord_names = tuple(sorted([c for c in data.dims if c != 'cycle']))

    spectrum_type = None
    if any(d in coord_names for d in {'x', 'y', 'z'}):
        coord_names = tuple(c for c in coord_names if c not in {'x', 'y', 'z'})
        spectrum_types = {
            ('eV',): 'spem',
            ('eV', 'phi'): 'ucut',
        }
        spectrum_type = spectrum_types.get(coord_names)
    else:
        spectrum_types = {
            ('eV',): 'xps',
            ('eV', 'phi', 'theta'): 'map',
            ('eV', 'phi', 'psi'): 'map',
            ('beta', 'eV', 'phi'): 'map',
            ('eV', 'hv', 'phi'): 'hv_map',
            ('eV', 'phi'): 'cut',
        }
        spectrum_type = spectrum_types.get(coord_names)

    if 'phi' not in data.coords:
        # XPS
        data.coords['phi'] = 0
        for s in data.S.spectra:
            s.coords['phi'] = 0

    if spectrum_type is not None:
        data.attrs['spectrum_type'] = spectrum_type
        if 'spectrum' in data.data_vars:
            data.spectrum.attrs['spectrum_type'] = spectrum_type

    ls = [data] + data.S.spectra
    for l in ls:
        for k, key_fn in self.ATTR_TRANSFORMS.items():
            if k in l.attrs:
                transformed = key_fn(l.attrs[k])
                if isinstance(transformed, dict):
                    l.attrs.update(transformed)
                else:
                    l.attrs[k] = transformed

    for l in ls:
        for k, v in self.MERGE_ATTRS.items():
            if k not in l.attrs:
                l.attrs[k] = v

    for l in ls:
        for c in self.ENSURE_COORDS_EXIST:
            if c not in l.coords:
                if c in l.attrs:
                    l.coords[c] = l.attrs[c]
                else:
                    warnings.warn(
                        f'Could not assign coordinate {c} from attributes, assigning np.nan instead.'
                    )
                    l.coords[c] = np.nan

    for l in ls:
        if 'chi' in l.coords and 'chi_offset' not in l.attrs:
            l.attrs['chi_offset'] = l.coords['chi'].item()

    return data
class TestNestedCombine: def test_nested_concat(self): objs = [Dataset({"x": [0]}), Dataset({"x": [1]})] expected = Dataset({"x": [0, 1]}) actual = combine_nested(objs, concat_dim="x") assert_identical(expected, actual) actual = combine_nested(objs, concat_dim=["x"]) assert_identical(expected, actual) actual = combine_nested([actual], concat_dim=None) assert_identical(expected, actual) actual = combine_nested([actual], concat_dim="x") assert_identical(expected, actual) objs = [Dataset({"x": [0, 1]}), Dataset({"x": [2]})] actual = combine_nested(objs, concat_dim="x") expected = Dataset({"x": [0, 1, 2]}) assert_identical(expected, actual) # ensure combine_nested handles non-sorted variables objs = [ Dataset({ "x": ("a", [0]), "y": ("a", [0]) }), Dataset({ "y": ("a", [1]), "x": ("a", [1]) }), ] actual = combine_nested(objs, concat_dim="a") expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1])}) assert_identical(expected, actual) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1]})] actual = combine_nested(objs, concat_dim="x") expected = Dataset({"x": [0, 1], "y": [0]}) assert_identical(expected, actual) @pytest.mark.parametrize( "join, expected", [ ("outer", Dataset({ "x": [0, 1], "y": [0, 1] })), ("inner", Dataset({ "x": [0, 1], "y": [] })), ("left", Dataset({ "x": [0, 1], "y": [0] })), ("right", Dataset({ "x": [0, 1], "y": [1] })), ], ) def test_combine_nested_join(self, join, expected): objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1], "y": [1]})] actual = combine_nested(objs, concat_dim="x", join=join) assert_identical(expected, actual) def test_combine_nested_join_exact(self): objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1], "y": [1]})] with raises_regex(ValueError, "indexes along dimension"): combine_nested(objs, concat_dim="x", join="exact") def test_empty_input(self): assert_identical(Dataset(), combine_nested([], concat_dim="x")) # Fails because of concat's weird treatment of dimension coords, see #2975 @pytest.mark.xfail def test_nested_concat_too_many_dims_at_once(self): objs = [Dataset({"x": [0], "y": [1]}), Dataset({"y": [0], "x": [1]})] with pytest.raises(ValueError, match="not equal across datasets"): combine_nested(objs, concat_dim="x", coords="minimal") def test_nested_concat_along_new_dim(self): objs = [ Dataset({ "a": ("x", [10]), "x": [0] }), Dataset({ "a": ("x", [20]), "x": [0] }), ] expected = Dataset({"a": (("t", "x"), [[10], [20]]), "x": [0]}) actual = combine_nested(objs, concat_dim="t") assert_identical(expected, actual) # Same but with a DataArray as new dim, see GH #1988 and #2647 dim = DataArray([100, 150], name="baz", dims="baz") expected = Dataset({ "a": (("baz", "x"), [[10], [20]]), "x": [0], "baz": [100, 150] }) actual = combine_nested(objs, concat_dim=dim) assert_identical(expected, actual) def test_nested_merge(self): data = Dataset({"x": 0}) actual = combine_nested([data, data, data], concat_dim=None) assert_identical(data, actual) ds1 = Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) ds2 = Dataset({"a": ("x", [2, 3]), "x": [1, 2]}) expected = Dataset({"a": ("x", [1, 2, 3]), "x": [0, 1, 2]}) actual = combine_nested([ds1, ds2], concat_dim=None) assert_identical(expected, actual) actual = combine_nested([ds1, ds2], concat_dim=[None]) assert_identical(expected, actual) tmp1 = Dataset({"x": 0}) tmp2 = Dataset({"x": np.nan}) actual = combine_nested([tmp1, tmp2], concat_dim=None) assert_identical(tmp1, actual) actual = combine_nested([tmp1, tmp2], concat_dim=[None]) assert_identical(tmp1, actual) # Single object, with a concat_dim 
explicitly provided # Test the issue reported in GH #1988 objs = [Dataset({"x": 0, "y": 1})] dim = DataArray([100], name="baz", dims="baz") actual = combine_nested(objs, concat_dim=[dim]) expected = Dataset({ "x": ("baz", [0]), "y": ("baz", [1]) }, {"baz": [100]}) assert_identical(expected, actual) # Just making sure that auto_combine is doing what is # expected for non-scalar values, too. objs = [Dataset({"x": ("z", [0, 1]), "y": ("z", [1, 2])})] dim = DataArray([100], name="baz", dims="baz") actual = combine_nested(objs, concat_dim=[dim]) expected = Dataset( { "x": (("baz", "z"), [[0, 1]]), "y": (("baz", "z"), [[1, 2]]) }, {"baz": [100]}, ) assert_identical(expected, actual) def test_concat_multiple_dims(self): objs = [ [ Dataset({"a": (("x", "y"), [[0]])}), Dataset({"a": (("x", "y"), [[1]])}) ], [ Dataset({"a": (("x", "y"), [[2]])}), Dataset({"a": (("x", "y"), [[3]])}) ], ] actual = combine_nested(objs, concat_dim=["x", "y"]) expected = Dataset({"a": (("x", "y"), [[0, 1], [2, 3]])}) assert_identical(expected, actual) def test_concat_name_symmetry(self): """Inspired by the discussion on GH issue #2777""" da1 = DataArray(name="a", data=[[0]], dims=["x", "y"]) da2 = DataArray(name="b", data=[[1]], dims=["x", "y"]) da3 = DataArray(name="a", data=[[2]], dims=["x", "y"]) da4 = DataArray(name="b", data=[[3]], dims=["x", "y"]) x_first = combine_nested([[da1, da2], [da3, da4]], concat_dim=["x", "y"]) y_first = combine_nested([[da1, da3], [da2, da4]], concat_dim=["y", "x"]) assert_identical(x_first, y_first) def test_concat_one_dim_merge_another(self): data = create_test_data() data1 = data.copy(deep=True) data2 = data.copy(deep=True) objs = [ [ data1.var1.isel(dim2=slice(4)), data2.var1.isel(dim2=slice(4, 9)) ], [ data1.var2.isel(dim2=slice(4)), data2.var2.isel(dim2=slice(4, 9)) ], ] expected = data[["var1", "var2"]] actual = combine_nested(objs, concat_dim=[None, "dim2"]) assert_identical(expected, actual) def test_auto_combine_2d(self): ds = create_test_data partway1 = concat([ds(0), ds(3)], dim="dim1") partway2 = concat([ds(1), ds(4)], dim="dim1") partway3 = concat([ds(2), ds(5)], dim="dim1") expected = concat([partway1, partway2, partway3], dim="dim2") datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]] result = combine_nested(datasets, concat_dim=["dim1", "dim2"]) assert_equal(result, expected) def test_auto_combine_2d_combine_attrs_kwarg(self): ds = create_test_data partway1 = concat([ds(0), ds(3)], dim="dim1") partway2 = concat([ds(1), ds(4)], dim="dim1") partway3 = concat([ds(2), ds(5)], dim="dim1") expected = concat([partway1, partway2, partway3], dim="dim2") expected_dict = {} expected_dict["drop"] = expected.copy(deep=True) expected_dict["drop"].attrs = {} expected_dict["no_conflicts"] = expected.copy(deep=True) expected_dict["no_conflicts"].attrs = { "a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, } expected_dict["override"] = expected.copy(deep=True) expected_dict["override"].attrs = {"a": 1} datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]] datasets[0][0].attrs = {"a": 1} datasets[0][1].attrs = {"a": 1, "b": 2} datasets[0][2].attrs = {"a": 1, "c": 3} datasets[1][0].attrs = {"a": 1, "d": 4} datasets[1][1].attrs = {"a": 1, "e": 5} datasets[1][2].attrs = {"a": 1, "f": 6} with raises_regex(ValueError, "combine_attrs='identical'"): result = combine_nested(datasets, concat_dim=["dim1", "dim2"], combine_attrs="identical") for combine_attrs in expected_dict: result = combine_nested(datasets, concat_dim=["dim1", "dim2"], combine_attrs=combine_attrs) assert_identical(result, 
expected_dict[combine_attrs]) def test_combine_nested_missing_data_new_dim(self): # Your data includes "time" and "station" dimensions, and each year's # data has a different set of stations. datasets = [ Dataset({ "a": ("x", [2, 3]), "x": [1, 2] }), Dataset({ "a": ("x", [1, 2]), "x": [0, 1] }), ] expected = Dataset( {"a": (("t", "x"), [[np.nan, 2, 3], [1, 2, np.nan]])}, {"x": [0, 1, 2]}) actual = combine_nested(datasets, concat_dim="t") assert_identical(expected, actual) def test_invalid_hypercube_input(self): ds = create_test_data datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4)]] with raises_regex(ValueError, "sub-lists do not have consistent lengths"): combine_nested(datasets, concat_dim=["dim1", "dim2"]) datasets = [[ds(0), ds(1)], [[ds(3), ds(4)]]] with raises_regex(ValueError, "sub-lists do not have consistent depths"): combine_nested(datasets, concat_dim=["dim1", "dim2"]) datasets = [[ds(0), ds(1)], [ds(3), ds(4)]] with raises_regex(ValueError, "concat_dims has length"): combine_nested(datasets, concat_dim=["dim1"]) def test_merge_one_dim_concat_another(self): objs = [ [ Dataset({"foo": ("x", [0, 1])}), Dataset({"bar": ("x", [10, 20])}) ], [ Dataset({"foo": ("x", [2, 3])}), Dataset({"bar": ("x", [30, 40])}) ], ] expected = Dataset({ "foo": ("x", [0, 1, 2, 3]), "bar": ("x", [10, 20, 30, 40]) }) actual = combine_nested(objs, concat_dim=["x", None], compat="equals") assert_identical(expected, actual) # Proving it works symmetrically objs = [ [Dataset({"foo": ("x", [0, 1])}), Dataset({"foo": ("x", [2, 3])})], [ Dataset({"bar": ("x", [10, 20])}), Dataset({"bar": ("x", [30, 40])}) ], ] actual = combine_nested(objs, concat_dim=[None, "x"], compat="equals") assert_identical(expected, actual) def test_combine_concat_over_redundant_nesting(self): objs = [[Dataset({"x": [0]}), Dataset({"x": [1]})]] actual = combine_nested(objs, concat_dim=[None, "x"]) expected = Dataset({"x": [0, 1]}) assert_identical(expected, actual) objs = [[Dataset({"x": [0]})], [Dataset({"x": [1]})]] actual = combine_nested(objs, concat_dim=["x", None]) expected = Dataset({"x": [0, 1]}) assert_identical(expected, actual) objs = [[Dataset({"x": [0]})]] actual = combine_nested(objs, concat_dim=[None, None]) expected = Dataset({"x": [0]}) assert_identical(expected, actual) @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, { "a": 2, "b": 1 }]) def test_combine_nested_fill_value(self, fill_value): datasets = [ Dataset({ "a": ("x", [2, 3]), "b": ("x", [-2, 1]), "x": [1, 2] }), Dataset({ "a": ("x", [1, 2]), "b": ("x", [3, -1]), "x": [0, 1] }), ] if fill_value == dtypes.NA: # if we supply the default, we expect the missing value for a # float array fill_value_a = fill_value_b = np.nan elif isinstance(fill_value, dict): fill_value_a = fill_value["a"] fill_value_b = fill_value["b"] else: fill_value_a = fill_value_b = fill_value expected = Dataset( { "a": (("t", "x"), [[fill_value_a, 2, 3], [1, 2, fill_value_a]]), "b": (("t", "x"), [[fill_value_b, -2, 1], [3, -1, fill_value_b]]), }, {"x": [0, 1, 2]}, ) actual = combine_nested(datasets, concat_dim="t", fill_value=fill_value) assert_identical(expected, actual)
def test_convert_units(self, typename, variant): if typename == "Variable": if variant != "data": pytest.skip("Variable doesn't store coordinates") data = np.linspace(0, 1, 3) * unit_registry.m obj = Variable(dims="x", data=data) units = {None: unit_registry.mm} expected_units = units elif typename == "DataArray": unit_variants = { "data": (unit_registry.Pa, 1, 1), "dims": (1, unit_registry.s, 1), "coords": (1, 1, unit_registry.m), } data_unit, dim_unit, coord_unit = unit_variants.get(variant) coords = { "data": {}, "dims": { "x": [0, 1, 2] * dim_unit }, "coords": { "u": ("x", [10, 3, 4] * coord_unit) }, } obj = DataArray( dims="x", data=np.linspace(0, 1, 3) * data_unit, coords=coords.get(variant), ) template = { **{ obj.name: None }, **{name: None for name in obj.coords}, } units = { "data": { None: unit_registry.hPa }, "dims": { "x": unit_registry.ms }, "coords": { "u": unit_registry.mm }, }.get(variant) expected_units = {**template, **units} elif typename == "Dataset": unit_variants = { "data": ((unit_registry.s, unit_registry.kg), 1, 1), "dims": ((1, 1), unit_registry.s, 1), "coords": ((1, 1), 1, unit_registry.m), } (data_unit1, data_unit2), dim_unit, coord_unit = unit_variants.get(variant) coords = { "data": {}, "dims": { "x": [0, 1, 2] * dim_unit }, "coords": { "u": ("x", [10, 3, 4] * coord_unit) }, } obj = Dataset( data_vars={ "a": ("x", np.linspace(-1, 1, 3) * data_unit1), "b": ("x", np.linspace(1, 2, 3) * data_unit2), }, coords=coords.get(variant), ) template = { **{name: None for name in obj.data_vars.keys()}, **{name: None for name in obj.coords.keys()}, } units = { "data": { "a": unit_registry.ms, "b": unit_registry.g }, "dims": { "x": unit_registry.ms }, "coords": { "u": unit_registry.mm }, }.get(variant) expected_units = {**template, **units} actual = conversion.convert_units(obj, units) assert conversion.extract_units(actual) == expected_units assert_equal(obj, actual)
def test_no_concatenation_needed(self):
    ds = Dataset({"foo": ("x", [0, 1])})
    expected = {(): ds}
    actual, concat_dims = _infer_concat_order_from_coords([ds])
    assert_combined_tile_ids_equal(expected, actual)
    assert concat_dims == []
hdng = DataArray(hdngens, coords=coords1, dims=dims1)
ptch = DataArray(ptchens, coords=coords1, dims=dims1)
roll = DataArray(rollens, coords=coords1, dims=dims1)
p = DataArray(pens, coords=coords1, dims=dims1)

# data_vars = dict(uwrawnotilt=uwrawnotilt, vwrawnotilt=vwrawnotilt, uwraw=uwraw,
#                  vwraw=vwraw, uwnotilt=uwnotilt, vwnotilt=vwnotilt, uw=uw, vw=vw,
#                  tke=tke, aniso=aniso, u=u, v=v, w5=w5, urms=urms, vrms=vrms,
#                  w5rms=w5rms, uz=uz, vz=vz, hdng=hdng, ptch=ptch, roll=roll, p=p)
data_vars = dict(uwrawnotilt=uwrawnotilt, vwrawnotilt=vwrawnotilt, uwraw=uwraw,
                 vwraw=vwraw, uwnotilt=uwnotilt, vwnotilt=vwnotilt, uw=uw, vw=vw,
                 tke=tke, u=u, v=v, w5=w5, urms=urms, vrms=vrms, w5rms=w5rms,
                 uz=uz, vz=vz, hdng=hdng, ptch=ptch, roll=roll, p=p)

Dataset(data_vars=data_vars, coords=coords).to_netcdf(fname_rs_out)
class TestXarrayFunctions: @pytest.mark.parametrize( "obj", ( pytest.param(Variable("x", np.linspace(0, 1, 5)), id="Variable"), pytest.param( DataArray( data=np.linspace(0, 1, 5), dims="x", coords={"u": ("x", np.arange(5))}, ), id="DataArray", ), pytest.param( Dataset( { "a": ("x", np.linspace(-1, 1, 5)), "b": ("x", np.linspace(0, 1, 5)), }, coords={"u": ("x", np.arange(5))}, ), id="Dataset", ), ), ) @pytest.mark.parametrize( "units", ( pytest.param({ None: None, "u": None }, id="no units"), pytest.param({ None: unit_registry.m, "u": None }, id="data units"), pytest.param({ None: None, "u": unit_registry.s }, id="coord units"), ), ) def test_attach_units(self, obj, units): if isinstance(obj, Variable) and "u" in units: pytest.skip(msg="variables don't have coordinates") if isinstance(obj, Dataset): units = units.copy() data_units = units.pop(None) units.update({"a": data_units, "b": data_units}) actual = conversion.attach_units(obj, units) assert conversion.extract_units(actual) == units @pytest.mark.parametrize( ["obj", "units"], ( pytest.param( DataArray(dims="x", coords={ "x": [], "u": ("x", []) }), { None: "hPa", "x": "m" }, id="DataArray", ), pytest.param( Dataset( data_vars={ "a": ("x", []), "b": ("x", []) }, coords={ "x": [], "u": ("x", []) }, ), { "a": "K", "b": "hPa", "u": "m" }, id="Dataset", ), pytest.param(Variable("x", []), {None: "hPa"}, id="Variable"), ), ) def test_attach_unit_attributes(self, obj, units): actual = conversion.attach_unit_attributes(obj, units) assert units == filter_none_values( conversion.extract_unit_attributes(actual)) @pytest.mark.parametrize( "variant", ( "data", pytest.param( "dims", marks=pytest.mark.xfail(reason="indexes don't support units")), "coords", ), ) @pytest.mark.parametrize("typename", ("Variable", "DataArray", "Dataset")) def test_convert_units(self, typename, variant): if typename == "Variable": if variant != "data": pytest.skip("Variable doesn't store coordinates") data = np.linspace(0, 1, 3) * unit_registry.m obj = Variable(dims="x", data=data) units = {None: unit_registry.mm} expected_units = units elif typename == "DataArray": unit_variants = { "data": (unit_registry.Pa, 1, 1), "dims": (1, unit_registry.s, 1), "coords": (1, 1, unit_registry.m), } data_unit, dim_unit, coord_unit = unit_variants.get(variant) coords = { "data": {}, "dims": { "x": [0, 1, 2] * dim_unit }, "coords": { "u": ("x", [10, 3, 4] * coord_unit) }, } obj = DataArray( dims="x", data=np.linspace(0, 1, 3) * data_unit, coords=coords.get(variant), ) template = { **{ obj.name: None }, **{name: None for name in obj.coords}, } units = { "data": { None: unit_registry.hPa }, "dims": { "x": unit_registry.ms }, "coords": { "u": unit_registry.mm }, }.get(variant) expected_units = {**template, **units} elif typename == "Dataset": unit_variants = { "data": ((unit_registry.s, unit_registry.kg), 1, 1), "dims": ((1, 1), unit_registry.s, 1), "coords": ((1, 1), 1, unit_registry.m), } (data_unit1, data_unit2), dim_unit, coord_unit = unit_variants.get(variant) coords = { "data": {}, "dims": { "x": [0, 1, 2] * dim_unit }, "coords": { "u": ("x", [10, 3, 4] * coord_unit) }, } obj = Dataset( data_vars={ "a": ("x", np.linspace(-1, 1, 3) * data_unit1), "b": ("x", np.linspace(1, 2, 3) * data_unit2), }, coords=coords.get(variant), ) template = { **{name: None for name in obj.data_vars.keys()}, **{name: None for name in obj.coords.keys()}, } units = { "data": { "a": unit_registry.ms, "b": unit_registry.g }, "dims": { "x": unit_registry.ms }, "coords": { "u": unit_registry.mm }, }.get(variant) 
expected_units = {**template, **units} actual = conversion.convert_units(obj, units) assert conversion.extract_units(actual) == expected_units assert_equal(obj, actual) @pytest.mark.parametrize( "units", ( pytest.param({ None: None, "u": None }, id="no units"), pytest.param({ None: unit_registry.m, "u": None }, id="data units"), pytest.param({ None: None, "u": unit_registry.s }, id="coord units"), pytest.param({ None: unit_registry.m, "u": unit_registry.s }, id="data and coord units"), ), ) @pytest.mark.parametrize("typename", ("Variable", "DataArray", "Dataset")) def test_extract_units(self, typename, units): if typename == "Variable": data_units = units.get(None) or 1 data = np.linspace(0, 1, 2) * data_units units = units.copy() units.pop("u") obj = Variable("x", data) elif typename == "DataArray": data_units = units.get(None) or 1 data = np.linspace(0, 1, 2) * data_units coord_units = units.get("u") or 1 coords = {"u": ("x", np.arange(2) * coord_units)} obj = DataArray(data, dims="x", coords=coords) elif typename == "Dataset": data_units = units.get(None) data1 = np.linspace(-1, 1, 2) * (data_units or 1) data2 = np.linspace(0, 1, 2) * (data_units or 1) coord_units = units.get("u") or 1 coords = {"u": ("x", np.arange(2) * coord_units)} units = units.copy() units.pop(None) units.update({"a": data_units, "b": data_units}) obj = Dataset({ "a": ("x", data1), "b": ("x", data2) }, coords=coords) assert conversion.extract_units(obj) == units @pytest.mark.parametrize( ["obj", "expected"], ( pytest.param( DataArray( coords={ "x": ("x", [], { "units": "m" }), "u": ("x", [], { "units": "s" }), }, attrs={"units": "hPa"}, dims="x", ), { "x": "m", "u": "s", None: "hPa" }, id="DataArray", ), pytest.param( Dataset( data_vars={ "a": ("x", [], { "units": "K" }), "b": ("x", [], { "units": "hPa" }), }, coords={ "x": ("x", [], { "units": "m" }), "u": ("x", [], { "units": "s" }), }, ), { "a": "K", "b": "hPa", "x": "m", "u": "s" }, id="Dataset", ), pytest.param(Variable("x", [], {"units": "hPa"}), {None: "hPa"}, id="Variable"), ), ) def test_extract_unit_attributes(self, obj, expected): actual = conversion.extract_unit_attributes(obj) assert expected == actual @pytest.mark.parametrize( "obj", ( pytest.param(Variable("x", [0, 4, 3] * unit_registry.m), id="Variable"), pytest.param( DataArray( dims="x", data=[0, 4, 3] * unit_registry.m, coords={"u": ("x", [2, 3, 4] * unit_registry.s)}, ), id="DataArray", ), pytest.param( Dataset( data_vars={ "a": ("x", [3, 2, 5] * unit_registry.Pa), "b": ("x", [0, 2, -1] * unit_registry.kg), }, coords={"u": ("x", [2, 3, 4] * unit_registry.s)}, ), id="Dataset", ), ), ) def test_strip_units(self, obj): if isinstance(obj, Variable): expected_units = {None: None} elif isinstance(obj, DataArray): expected_units = {None: None} expected_units.update({name: None for name in obj.coords.keys()}) elif isinstance(obj, Dataset): expected_units = {name: None for name in obj.variables.keys()} actual = conversion.strip_units(obj) assert conversion.extract_units(actual) == expected_units @pytest.mark.parametrize( ["obj", "expected"], ( pytest.param( DataArray( coords={ "x": ("x", [], { "units": "m" }), "u": ("x", [], { "units": "s" }), }, attrs={"units": "hPa"}, dims="x", ), { "x": "m", "u": "s", None: "hPa" }, id="DataArray", ), pytest.param( Dataset( data_vars={ "a": ("x", [], { "units": "K" }), "b": ("x", [], { "units": "hPa" }), }, coords={ "x": ("x", [], { "units": "m" }), "u": ("x", [], { "units": "s" }), }, ), { "a": "K", "b": "hPa", "x": "m", "u": "s" }, id="Dataset", ), 
pytest.param(Variable("x", [], {"units": "hPa"}), {None: "hPa"}, id="Variable"), ), ) def test_strip_unit_attributes(self, obj, expected): actual = conversion.strip_unit_attributes(obj) # stripping removes every unit attribute, so the result should be empty regardless of the parametrized value expected = {} assert (filter_none_values( conversion.extract_unit_attributes(actual)) == expected)
def test_invalid_coordinates(self): # regression test for GH308 original = Dataset({'foo': ('t', [1, 2], {'coordinates': 'invalid'})}) actual = conventions.decode_cf(original) self.assertDatasetIdentical(original, actual)
def update_time_slice(store: Union[str, MutableMapping], insert_index: int, time_slice: xr.Dataset, mode: str, chunk_sizes: Dict[str, int] = None): """ Update an existing zarr dataset with a new time slice. :param store: A zarr store. :param insert_index: Time index at which to insert or replace the slice :param time_slice: Time slice to insert :param mode: Update mode, 'insert' or 'replace' :param chunk_sizes: Desired chunk sizes, keyed by dimension name """ if mode not in ('insert', 'replace'): raise ValueError(f'illegal mode value: {mode!r}') insert_mode = mode == 'insert' time_var_names = [] encoding = {} with xr.open_zarr(store) as cube: for var_name in cube.variables: var = cube[var_name] if var.ndim >= 1 and 'time' in var.dims: if var.dims[0] != 'time': raise ValueError( f"dimension 'time' of variable {var_name!r} must be first dimension" ) time_var_names.append(var_name) enc = dict(cube[var_name].encoding) # xarray 0.17+ supports engine-preferred chunks if exposed by the backend, and zarr does so, # but passing the new 'preferred_chunks' back when writing to zarr raises an error # saying that 'preferred_chunks' is an unsupported encoding, so drop it here if 'preferred_chunks' in enc: del enc['preferred_chunks'] encoding[var_name] = enc if chunk_sizes: time_slice = chunk_dataset(time_slice, chunk_sizes, format_name='zarr') temp_dir = tempfile.TemporaryDirectory(prefix='xcube-time-slice-', suffix='.zarr') time_slice.to_zarr(temp_dir.name, encoding=encoding) slice_root_group = zarr.open(temp_dir.name, mode='r') slice_arrays = dict(slice_root_group.arrays()) cube_root_group = zarr.open(store, mode='r+') for var_name, var_array in cube_root_group.arrays(): if var_name in time_var_names: slice_array = slice_arrays[var_name] if insert_mode: # Add one empty time step empty = zarr.creation.empty(slice_array.shape, dtype=var_array.dtype) var_array.append(empty, axis=0) # Shift contents var_array[insert_index + 1:, ...] = var_array[insert_index:-1, ...] # Replace slice var_array[insert_index, ...] = slice_array[0] unchunk_dataset(store, coords_only=True)
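# A minimal usage sketch for update_time_slice, assuming a cube at './cube.zarr' already exists
# with 'time' as the leading dimension; the variable layout, the make_slice helper, and the chunk
# sizes below are illustrative assumptions, not part of the original source.
import numpy as np
import pandas as pd
import xarray as xr

def make_slice(timestamp: str) -> xr.Dataset:
    # hypothetical single-time-step dataset matching the existing cube's layout
    return xr.Dataset(
        {'sst': (('time', 'lat', 'lon'), np.random.rand(1, 180, 360))},
        coords={'time': pd.to_datetime([timestamp]),
                'lat': np.linspace(-89.5, 89.5, 180),
                'lon': np.linspace(-179.5, 179.5, 360)})

# insert a new time step at index 5, shifting later steps back by one
update_time_slice('./cube.zarr', 5, make_slice('2021-01-06'), mode='insert',
                  chunk_sizes={'time': 1, 'lat': 90, 'lon': 90})

# overwrite the slice that now sits at index 5
update_time_slice('./cube.zarr', 5, make_slice('2021-01-06'), mode='replace')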
def test_combine_coords_join_exact(self): objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1], "y": [1]})] with raises_regex(ValueError, "indexes along dimension"): combine_nested(objs, concat_dim="x", join="exact")
def test_auto_combine(self, combine): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] actual = combine(objs) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) actual = combine([actual]) assert_identical(expected, actual) objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] actual = combine(objs) expected = Dataset({'x': [0, 1, 2]}) assert_identical(expected, actual) # ensure auto_combine handles non-sorted variables objs = [ Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))])) ] actual = combine(objs) expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) assert_identical(expected, actual) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] with raises_regex(ValueError, 'too many .* dimensions'): combine(objs) objs = [Dataset({'x': 0}), Dataset({'x': 1})] with raises_regex(ValueError, 'cannot infer dimension'): combine(objs) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] with pytest.raises(KeyError): combine(objs)
def apply(self, data): # build an 8-bit value array for each configured band and assemble them into an image Dataset imgdata = {} for band in self.components: imgdata[band] = (data.dims, self.get_8bit_value(data, band)) imgdataset = Dataset(imgdata) return imgdataset