def test_decode_cf_with_drop_variables(self):
    """decode_cf drops variables named by ``drop_variables`` (str or tuple)."""
    original = Dataset({
        't': ('t', [0, 1, 2], {'units': 'days since 2000-01-01'}),
        'x': ("x", [9, 8, 7], {'units': 'km'}),
        'foo': (('t', 'x'), [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
                {'units': 'bar'}),
        'y': ('t', [5, 10, -999], {'_FillValue': -999})
    })
    expected = Dataset({
        't': pd.date_range('2000-01-01', periods=3),
        # 'x' itself is dropped; its dimension gets a default integer index
        'x': ("x", [0, 1, 2]),
        'foo': (('t', 'x'), [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
                {'units': 'bar'}),
        'y': ('t', [5, 10, np.nan])
    })
    # both tuple and bare-string forms of drop_variables must work
    actual = conventions.decode_cf(original, drop_variables=("x", ))
    actual2 = conventions.decode_cf(original, drop_variables="x")
    self.assertDatasetIdentical(expected, actual)
    self.assertDatasetIdentical(expected, actual2)
def test_roundtrip_object_dtype(self):
    """Round-trip object-dtype arrays (floats, strings, NaNs) through the store."""
    floats = np.array([0.0, 0.0, 1.0, 2.0, 3.0], dtype=object)
    floats_nans = np.array([np.nan, np.nan, 1.0, 2.0, 3.0], dtype=object)
    letters = np.array(['ab', 'cdef', 'g'], dtype=object)
    letters_nans = np.array(['ab', 'cdef', np.nan], dtype=object)
    all_nans = np.array([np.nan, np.nan], dtype=object)
    original = Dataset({'floats': ('a', floats),
                        'floats_nans': ('a', floats_nans),
                        'letters': ('b', letters),
                        'letters_nans': ('b', letters_nans),
                        'all_nans': ('c', all_nans),
                        'nan': ([], np.nan)})
    expected = original.copy(deep=True)
    if isinstance(self, Only32BitTypes):
        # for netCDF3 tests, expect the results to come back as characters
        expected['letters_nans'] = expected['letters_nans'].astype('S')
        expected['letters'] = expected['letters'].astype('S')
    with self.roundtrip(original) as actual:
        try:
            self.assertDatasetIdentical(expected, actual)
        except AssertionError:
            # Most stores use '' for nans in strings, but some don't.
            # First try the ideal case (where the store returns exactly
            # the original Dataset), then try a more realistic case.
            # ScipyDataTest, NetCDF3ViaNetCDF4DataTest and NetCDF4DataTest
            # all end up using this case.
            expected['letters_nans'][-1] = ''
            self.assertDatasetIdentical(expected, actual)
def test_groupby(self):
    """Group a Dataset by a coordinate: groups, iteration, apply, and errors."""
    data = Dataset({'x': ('x', list('abc')),
                    'c': ('x', [0, 1, 0]),
                    'z': (['x', 'y'], np.random.randn(3, 5))})
    groupby = data.groupby('x')
    self.assertEqual(len(groupby), 3)
    expected_groups = {'a': 0, 'b': 1, 'c': 2}
    self.assertEqual(groupby.groups, expected_groups)
    expected_items = [('a', data.indexed(x=0)),
                      ('b', data.indexed(x=1)),
                      ('c', data.indexed(x=2))]
    self.assertEqual(list(groupby), expected_items)
    # applying the identity with squeeze=False must reproduce the dataset
    identity = lambda x: x
    for k in ['x', 'c', 'y']:
        actual = data.groupby(k, squeeze=False).apply(identity)
        self.assertEqual(data, actual)
    data = create_test_data()
    for n, (t, sub) in enumerate(list(data.groupby('dim1'))[:3]):
        self.assertEqual(data['dim1'][n], t)
        self.assertVariableEqual(data['var1'][n], sub['var1'])
        self.assertVariableEqual(data['var2'][n], sub['var2'])
        self.assertVariableEqual(data['var3'][:, n], sub['var3'])
    # TODO: test the other edge cases
    with self.assertRaisesRegexp(ValueError, 'must be 1 dimensional'):
        data.groupby('var1')
    with self.assertRaisesRegexp(ValueError, 'length does not match'):
        data.groupby(data['dim1'][:3])
def test_groupby_sum(self):
    """Sum within groups via reduce(np.sum) and the sum() shortcut, both axes."""
    array = self.make_groupby_example_array()
    grouped = array.groupby('abc')
    # full reduction over all remaining dimensions
    expected_sum_all = Dataset(
        {'foo': Variable(['abc'], np.array([self.x[:, :9].sum(),
                                            self.x[:, 10:].sum(),
                                            self.x[:, 9:10].sum()]).T),
         'abc': Variable(['abc'], np.array(['a', 'b', 'c']))})['foo']
    self.assertDataArrayAllClose(expected_sum_all, grouped.reduce(np.sum))
    self.assertDataArrayAllClose(expected_sum_all, grouped.sum())
    expected = DataArray([array['y'].values[idx].sum() for idx
                          in [slice(9), slice(10, None), slice(9, 10)]],
                         [['a', 'b', 'c']], ['abc'])
    actual = array['y'].groupby('abc').apply(np.sum)
    self.assertDataArrayAllClose(expected, actual)
    actual = array['y'].groupby('abc').sum()
    self.assertDataArrayAllClose(expected, actual)
    # reduction over the 'y' dimension only
    expected_sum_axis1 = Dataset(
        {'foo': (['x', 'abc'], np.array([self.x[:, :9].sum(1),
                                         self.x[:, 10:].sum(1),
                                         self.x[:, 9:10].sum(1)]).T),
         'x': self.ds['x'],
         'abc': Variable(['abc'], np.array(['a', 'b', 'c']))})['foo']
    self.assertDataArrayAllClose(expected_sum_axis1,
                                 grouped.reduce(np.sum, 'y'))
    self.assertDataArrayAllClose(expected_sum_axis1, grouped.sum('y'))
def setUp(self):
    """Create a 10x20 random Dataset plus the fixtures used by the tests."""
    self.attrs = {'attr1': 'value1', 'attr2': 2929}
    self.x = np.random.random((10, 20))
    self.v = Variable(['x', 'y'], self.x)
    # same data as self.v but with attributes attached
    self.va = Variable(['x', 'y'], self.x, self.attrs)
    self.ds = Dataset({'foo': self.v})
    self.dv = self.ds['foo']
def test_roundtrip_strings_with_fill_value(self):
    """Round-trip string arrays whose missing entries use a _FillValue."""
    values = np.array(['ab', 'cdef', np.nan], dtype=object)
    encoding = {'_FillValue': np.string_('X'), 'dtype': np.dtype('S1')}
    original = Dataset({'x': ('t', values, {}, encoding)})
    expected = original.copy(deep=True)
    # the non-missing entries come back as byte strings
    expected['x'][:2] = values[:2].astype('S')
    with self.roundtrip(original) as actual:
        self.assertDatasetIdentical(expected, actual)

    original = Dataset({'x': ('t', values, {}, {'_FillValue': '\x00'})})
    if not isinstance(self, Only32BitTypes):
        # these stores can save unicode strings
        expected = original.copy(deep=True)
    if isinstance(self, BaseNetCDF4Test):
        # netCDF4 can't keep track of an empty _FillValue for VLEN
        # variables
        expected['x'][-1] = ''
    elif (isinstance(self, (NetCDF3ViaNetCDF4DataTest,
                            NetCDF4ClassicViaNetCDF4DataTest))
          or (has_netCDF4 and type(self) is GenericNetCDFDataTest)):
        # netCDF4 can't keep track of an empty _FillValue for nc3, either:
        # https://github.com/Unidata/netcdf4-python/issues/273
        expected['x'][-1] = np.string_('')
    with self.roundtrip(original) as actual:
        self.assertDatasetIdentical(expected, actual)
def test_roundtrip_object_dtype(self):
    """Round-trip object dtype; netCDF3 stores coerce strings to characters."""
    floats = np.array([0.0, 0.0, 1.0, 2.0, 3.0], dtype=object)
    floats_nans = np.array([np.nan, np.nan, 1.0, 2.0, 3.0], dtype=object)
    letters = np.array(['ab', 'cdef', 'g'], dtype=object)
    letters_nans = np.array(['ab', 'cdef', np.nan], dtype=object)
    all_nans = np.array([np.nan, np.nan], dtype=object)
    original = Dataset({'floats': ('a', floats),
                        'floats_nans': ('a', floats_nans),
                        'letters': ('b', letters),
                        'letters_nans': ('b', letters_nans),
                        'all_nans': ('c', all_nans),
                        'nan': ([], np.nan)})
    if PY3 and type(self) is ScipyDataTest:
        # see the note under test_zero_dimensional_variable
        del original['nan']
    expected = original.copy(deep=True)
    # stores represent NaN in string arrays as ''
    expected['letters_nans'][-1] = ''
    if type(self) is not NetCDF4DataTest:
        # for netCDF3 tests, expect the results to come back as characters
        expected['letters_nans'] = expected['letters_nans'].astype('S')
        expected['letters'] = expected['letters'].astype('S')
    with self.roundtrip(original) as actual:
        self.assertDatasetIdentical(expected, actual)
def test_coordinates_encoding(self):
    """The CF 'coordinates' attribute is written on save and decoded on load."""
    def equals_latlon(obj):
        # attribute order within 'coordinates' is not guaranteed
        return obj == 'lat lon' or obj == 'lon lat'

    original = Dataset({'temp': ('x', [0, 1]), 'precip': ('x', [0, -1])},
                       {'lat': ('x', [2, 3]), 'lon': ('x', [4, 5])})
    with self.roundtrip(original) as actual:
        self.assertDatasetIdentical(actual, original)
    with create_tmp_file() as tmp_file:
        original.to_netcdf(tmp_file)
        with open_dataset(tmp_file, decode_coords=False) as ds:
            self.assertTrue(equals_latlon(ds['temp'].attrs['coordinates']))
            self.assertTrue(equals_latlon(ds['precip'].attrs['coordinates']))
            self.assertNotIn('coordinates', ds.attrs)
            self.assertNotIn('coordinates', ds['lat'].attrs)
            self.assertNotIn('coordinates', ds['lon'].attrs)

    modified = original.drop(['temp', 'precip'])
    with self.roundtrip(modified) as actual:
        self.assertDatasetIdentical(actual, modified)
    with create_tmp_file() as tmp_file:
        modified.to_netcdf(tmp_file)
        with open_dataset(tmp_file, decode_coords=False) as ds:
            # with no data variables, 'coordinates' lands in global attrs
            self.assertTrue(equals_latlon(ds.attrs['coordinates']))
            self.assertNotIn('coordinates', ds['lat'].attrs)
            self.assertNotIn('coordinates', ds['lon'].attrs)
def test_coordinates_encoding(self):
    """The CF 'coordinates' attribute is written on save and decoded on load."""
    def equals_latlon(obj):
        # attribute order within 'coordinates' is not guaranteed
        return obj == 'lat lon' or obj == 'lon lat'

    original = Dataset({'temp': ('x', [0, 1]),
                        'precip': ('x', [0, -1])},
                       {'lat': ('x', [2, 3]),
                        'lon': ('x', [4, 5])})
    with self.roundtrip(original) as actual:
        self.assertDatasetIdentical(actual, original)
    with create_tmp_file() as tmp_file:
        original.to_netcdf(tmp_file)
        with open_dataset(tmp_file, decode_coords=False) as ds:
            self.assertTrue(equals_latlon(ds['temp'].attrs['coordinates']))
            self.assertTrue(
                equals_latlon(ds['precip'].attrs['coordinates']))
            self.assertNotIn('coordinates', ds.attrs)
            self.assertNotIn('coordinates', ds['lat'].attrs)
            self.assertNotIn('coordinates', ds['lon'].attrs)

    modified = original.drop(['temp', 'precip'])
    with self.roundtrip(modified) as actual:
        self.assertDatasetIdentical(actual, modified)
    with create_tmp_file() as tmp_file:
        modified.to_netcdf(tmp_file)
        with open_dataset(tmp_file, decode_coords=False) as ds:
            # with no data variables, 'coordinates' lands in global attrs
            self.assertTrue(equals_latlon(ds.attrs['coordinates']))
            self.assertNotIn('coordinates', ds['lat'].attrs)
            self.assertNotIn('coordinates', ds['lon'].attrs)
def test_roundtrip_coordinates(self):
    """Coordinates survive a round-trip; user 'coordinates' attrs are rejected."""
    original = Dataset({'foo': ('x', [0, 1])},
                       {'x': [2, 3], 'y': ('a', [42]), 'z': ('x', [4, 5])})
    with self.roundtrip(original) as actual:
        self.assertDatasetIdentical(original, actual)

    expected = original.drop('foo')
    with self.roundtrip(expected) as actual:
        self.assertDatasetIdentical(expected, actual)

    # 'coordinates' is reserved for the CF encoding machinery, so a
    # user-set value (global or per-variable) must refuse to serialize
    expected = original.copy()
    expected.attrs['coordinates'] = 'something random'
    with self.assertRaisesRegexp(ValueError, 'cannot serialize'):
        with self.roundtrip(expected):
            pass

    expected = original.copy(deep=True)
    expected['foo'].attrs['coordinates'] = 'something random'
    with self.assertRaisesRegexp(ValueError, 'cannot serialize'):
        with self.roundtrip(expected):
            pass
def test_encoding_kwarg(self):
    """The ``encoding`` save argument is applied, validated, and not sticky."""
    ds = Dataset({'x': ('y', np.arange(10.0))})
    kwargs = dict(encoding={'x': {'dtype': 'f4'}})
    with self.roundtrip(ds, save_kwargs=kwargs) as actual:
        self.assertEqual(actual.x.encoding['dtype'], 'f4')
    # the in-memory dataset's encoding must not be modified by saving
    self.assertEqual(ds.x.encoding, {})

    kwargs = dict(encoding={'x': {'foo': 'bar'}})
    with self.assertRaisesRegexp(ValueError, 'unexpected encoding'):
        with self.roundtrip(ds, save_kwargs=kwargs) as actual:
            pass

    kwargs = dict(encoding={'x': 'foo'})
    with self.assertRaisesRegexp(ValueError, 'must be castable'):
        with self.roundtrip(ds, save_kwargs=kwargs) as actual:
            pass

    kwargs = dict(encoding={'invalid': {}})
    with self.assertRaises(KeyError):
        with self.roundtrip(ds, save_kwargs=kwargs) as actual:
            pass

    # encoding also controls time units on save
    ds = Dataset({'t': pd.date_range('2000-01-01', periods=3)})
    units = 'days since 1900-01-01'
    kwargs = dict(encoding={'t': {'units': units}})
    with self.roundtrip(ds, save_kwargs=kwargs) as actual:
        self.assertEqual(actual.t.encoding['units'], units)
        self.assertDatasetIdentical(actual, ds)
def test_open_and_do_math(self):
    """Arithmetic works on a dataset opened through open_mfdataset."""
    source = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as path:
        source.to_netcdf(path)
        with open_mfdataset(path) as opened:
            result = 1.0 * opened
            self.assertDatasetAllClose(source, result)
def test_coordinate(self):
    """Assigning a 1-d variable named after its dimension creates a coordinate."""
    a = Dataset()
    vec = np.random.random((10, ))
    attributes = {'foo': 'bar'}
    a['x'] = ('x', vec, attributes)
    self.assertTrue('x' in a.coordinates)
    self.assertIsInstance(a.coordinates['x'].as_index, pd.Index)
    self.assertVariableEqual(a.coordinates['x'], a.variables['x'])
    b = Dataset()
    b['x'] = ('x', vec, attributes)
    self.assertVariableEqual(a['x'], b['x'])
    self.assertEqual(a.dimensions, b.dimensions)
    # this should work
    a['x'] = ('x', vec[:5])
    a['z'] = ('x', np.arange(5))
    with self.assertRaises(ValueError):
        # now it shouldn't, since there is a conflicting length
        a['x'] = ('x', vec[:4])
    arr = np.random.random((10, 1, ))
    scal = np.array(0)
    with self.assertRaises(ValueError):
        # coordinate variables must be exactly 1-dimensional
        a['y'] = ('y', arr)
    with self.assertRaises(ValueError):
        a['y'] = ('y', scal)
    self.assertTrue('y' not in a.dimensions)
def test_roundtrip_object_dtype(self):
    """Round-trip object dtype; tolerate stores that map string NaN to ''."""
    floats = np.array([0.0, 0.0, 1.0, 2.0, 3.0], dtype=object)
    floats_nans = np.array([np.nan, np.nan, 1.0, 2.0, 3.0], dtype=object)
    letters = np.array(['ab', 'cdef', 'g'], dtype=object)
    letters_nans = np.array(['ab', 'cdef', np.nan], dtype=object)
    all_nans = np.array([np.nan, np.nan], dtype=object)
    original = Dataset({'floats': ('a', floats),
                        'floats_nans': ('a', floats_nans),
                        'letters': ('b', letters),
                        'letters_nans': ('b', letters_nans),
                        'all_nans': ('c', all_nans),
                        'nan': ([], np.nan)})
    if PY3 and type(self) is ScipyDataTest:
        # see the note under test_zero_dimensional_variable
        del original['nan']
    expected = original.copy(deep=True)
    if type(self) in [NetCDF3ViaNetCDF4DataTest, ScipyDataTest]:
        # for netCDF3 tests, expect the results to come back as characters
        expected['letters_nans'] = expected['letters_nans'].astype('S')
        expected['letters'] = expected['letters'].astype('S')
    with self.roundtrip(original) as actual:
        try:
            self.assertDatasetIdentical(expected, actual)
        except AssertionError:
            # Most stores use '' for nans in strings, but some don't.
            # First try the ideal case (where the store returns exactly
            # the original Dataset), then try a more realistic case.
            # ScipyDataTest, NetCDF3ViaNetCDF4DataTest and NetCDF4DataTest
            # all end up using this case.
            expected['letters_nans'][-1] = ''
            self.assertDatasetIdentical(expected, actual)
def test_pipe_tuple_error(self):
    """pipe((func, 'name'), ...) raises when 'name' is also passed explicitly."""
    ds = Dataset({'A': ('x', [1, 2, 3])})

    def second(x, y):
        return y

    with self.assertRaises(ValueError):
        ds.pipe((second, 'y'), x=1, y=0)
    with self.assertRaises(ValueError):
        ds.A.pipe((second, 'y'), x=1, y=0)
def test_pipe_tuple(self):
    """pipe((func, 'name'), arg) passes the piped object as keyword 'name'."""
    ds = Dataset({'A': ('x', [1, 2, 3])})

    def second(x, y):
        return y

    result = ds.pipe((second, 'y'), 0)
    self.assertDatasetIdentical(result, ds)
    result = ds.A.pipe((second, 'y'), 0)
    self.assertDataArrayIdentical(result, ds.A)
def test_pipe_tuple_error(self):
    """pipe((func, 'name'), ...) must raise if 'name' is also given explicitly."""
    df = Dataset({"A": ("x", [1, 2, 3])})
    f = lambda x, y: y
    with self.assertRaises(ValueError):
        df.pipe((f, "y"), x=1, y=0)
    with self.assertRaises(ValueError):
        df.A.pipe((f, "y"), x=1, y=0)
def test_preprocess_mfdataset(self):
    """open_mfdataset applies the ``preprocess`` callable to each dataset."""
    original = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as tmp:
        original.to_netcdf(tmp)
        preprocess = lambda ds: ds.assign_coords(z=0)
        expected = preprocess(original)
        with open_mfdataset(tmp, preprocess=preprocess) as actual:
            self.assertDatasetIdentical(expected, actual)
def test_pipe_tuple(self):
    """pipe((func, 'name'), arg) passes the piped object as keyword 'name'."""
    df = Dataset({"A": ("x", [1, 2, 3])})
    f = lambda x, y: y
    result = df.pipe((f, "y"), 0)
    self.assertDatasetIdentical(result, df)
    result = df.A.pipe((f, "y"), 0)
    self.assertDataArrayIdentical(result, df.A)
def test_save_mfdataset_roundtrip(self):
    """save_mfdataset + open_mfdataset reconstruct the original dataset."""
    original = Dataset({'foo': ('x', np.random.randn(10))})
    datasets = [original.isel(x=slice(5)),
                original.isel(x=slice(5, 10))]
    with create_tmp_file() as tmp1:
        with create_tmp_file() as tmp2:
            save_mfdataset(datasets, [tmp1, tmp2])
            with open_mfdataset([tmp1, tmp2]) as actual:
                self.assertDatasetIdentical(actual, original)
def test_reduce_argmin(self):
    """argmin reduces to a 0-d variable; regression test for #205."""
    ds = Dataset({'a': ('x', [0, 1])})
    expected = Dataset({'a': ([], 0)})
    # with and without an explicit dimension
    actual = ds.argmin()
    self.assertDatasetIdentical(expected, actual)
    actual = ds.argmin('x')
    self.assertDatasetIdentical(expected, actual)
def test_pipe(self):
    """Dataset.pipe and DataArray.pipe forward positional arguments."""
    ds = Dataset({"A": ("x", [1, 2, 3])})

    def power(x, y):
        return x ** y

    squared = ds.pipe(power, 2)
    expected = Dataset({"A": ("x", [1, 4, 9])})
    self.assertDatasetIdentical(squared, expected)
    squared_da = ds.A.pipe(power, 2)
    self.assertDataArrayIdentical(squared_da, expected.A)
def test_pipe(self):
    """Dataset.pipe and DataArray.pipe forward positional arguments."""
    df = Dataset({'A': ('x', [1, 2, 3])})
    f = lambda x, y: x**y
    result = df.pipe(f, 2)
    expected = Dataset({'A': ('x', [1, 4, 9])})
    self.assertDatasetIdentical(result, expected)
    result = df.A.pipe(f, 2)
    self.assertDataArrayIdentical(result, expected.A)
def test_weakrefs(self):
    """A dataset derived from an open file stays usable after the file
    handle object is deleted and garbage collected."""
    example = Dataset({'foo': ('x', np.arange(5.0))})
    expected = example.rename({'foo': 'bar', 'x': 'y'})
    with create_tmp_file() as tmp_file:
        example.to_netcdf(tmp_file, engine='scipy')
        on_disk = open_dataset(tmp_file, engine='pynio')
        actual = on_disk.rename({'foo': 'bar', 'x': 'y'})
        del on_disk  # trigger garbage collection
        self.assertDatasetIdentical(actual, expected)
def test_variable_order(self):
    """Insertion order of variables and coords survives a round-trip."""
    # doesn't work with scipy or h5py :(
    ds = Dataset()
    ds['a'] = 1
    ds['z'] = 2
    ds['b'] = 3
    ds.coords['c'] = 4
    with self.roundtrip(ds) as actual:
        self.assertEqual(list(ds), list(actual))
def test_lock(self):
    """Chunked reads schedule dask tasks guarded by a threading lock."""
    original = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as tmp:
        original.to_netcdf(tmp)
        with open_dataset(tmp, chunks=10) as ds:
            # the last element of the getter task is the lock instance
            task = ds.foo.data.dask[ds.foo.data.name, 0]
            self.assertIsInstance(task[-1], type(Lock()))
        with open_mfdataset(tmp) as ds:
            task = ds.foo.data.dask[ds.foo.data.name, 0]
            self.assertIsInstance(task[-1], type(Lock()))
def test_concat_constant_index(self):
    """Concatenating datasets sharing a constant scalar coord; GH425."""
    ds1 = Dataset({'foo': 1.5}, {'y': 1})
    ds2 = Dataset({'foo': 2.5}, {'y': 1})
    expected = Dataset({'foo': ('y', [1.5, 2.5]), 'y': [1, 1]})
    for mode in ['different', 'all', ['foo']]:
        actual = concat([ds1, ds2], 'y', data_vars=mode)
        self.assertDatasetIdentical(expected, actual)
    with self.assertRaisesRegexp(ValueError, 'not equal across datasets'):
        # 'minimal' refuses to concatenate the differing data variable
        concat([ds1, ds2], 'y', data_vars='minimal')
def test_groupby_returns_new_type(self):
    """groupby.apply returns whatever type the mapped function produces."""
    data = Dataset({'z': (['x', 'y'], np.random.randn(3, 5))})
    # Dataset groupby mapping to DataArray
    actual = data.groupby('x').apply(lambda ds: ds['z'])
    expected = data['z']
    self.assertDataArrayIdentical(expected, actual)
    # DataArray groupby mapping to Dataset
    actual = data['z'].groupby('x').apply(lambda x: x.to_dataset())
    expected = data
    self.assertDatasetIdentical(expected, actual)
def test_dims(self):
    """Assigning dims in place renames them (and the name of an index array)."""
    arr = self.dv
    self.assertEqual(arr.dims, ('x', 'y'))
    arr.dims = ('w', 'z')
    self.assertEqual(arr.dims, ('w', 'z'))
    x = Dataset({'x': ('x', np.arange(5))})['x']
    x.dims = ('y',)
    self.assertEqual(x.dims, ('y',))
    # renaming an index array's dimension also renames the array itself
    self.assertEqual(x.name, 'y')
def test_open_dataset(self):
    """open_dataset yields dask arrays with ``chunks`` and numpy without."""
    original = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as tmp:
        original.to_netcdf(tmp)
        with open_dataset(tmp, chunks={'x': 5}) as actual:
            self.assertIsInstance(actual.foo.variable.data, da.Array)
            self.assertEqual(actual.foo.variable.data.chunks, ((5, 5),))
            self.assertDatasetAllClose(original, actual)
        with open_dataset(tmp) as actual:
            self.assertIsInstance(actual.foo.variable.data, np.ndarray)
            self.assertDatasetAllClose(original, actual)
def test_dataset(self):
    """decode_cf decodes times, fill values, and 'coordinates' attributes."""
    original = Dataset({
        't': ('t', [0, 1, 2], {'units': 'days since 2000-01-01'}),
        'foo': ('t', [0, 0, 0], {'coordinates': 'y', 'units': 'bar'}),
        'y': ('t', [5, 10, -999], {'_FillValue': -999})
    })
    # times decoded, -999 -> NaN, and 'y' promoted to a coordinate
    expected = Dataset({'foo': ('t', [0, 0, 0], {'units': 'bar'})},
                       {'t': pd.date_range('2000-01-01', periods=3),
                        'y': ('t', [5.0, 10.0, np.nan])})
    actual = conventions.decode_cf(original)
    self.assertDatasetIdentical(expected, actual)
def test_concat_coords(self):
    """The ``coords`` argument controls which coordinates are concatenated."""
    data = Dataset({'foo': ('x', np.random.randn(10))})
    expected = data.assign_coords(c=('x', [0] * 5 + [1] * 5))
    objs = [data.isel(x=slice(5)).assign_coords(c=0),
            data.isel(x=slice(5, None)).assign_coords(c=1)]
    for coords in ['different', 'all', ['c']]:
        actual = concat(objs, dim='x', coords=coords)
        self.assertDatasetIdentical(expected, actual)
    for coords in ['minimal', []]:
        # scalar coord 'c' differs between inputs, so it can't be skipped
        with self.assertRaisesRegexp(ValueError, 'not equal across'):
            concat(objs, dim='x', coords=coords)
def test_squeeze(self):
    """squeeze drops length-1 dimensions, optionally restricted to a subset.

    Fix: ``dict.iteritems()`` is Python-2-only and raises AttributeError
    on Python 3; ``items()`` behaves identically on both.
    """
    data = Dataset({'foo': (['x', 'y', 'z'], [[[1], [2]]])})
    for args in [[], [['x']], [['x', 'z']]]:
        def get_args(v):
            # only pass the dimensions that actually exist on each variable
            return [set(args[0]) & set(v.dimensions)] if args else []
        expected = Dataset({k: v.squeeze(*get_args(v))
                            for k, v in data.variables.items()})
        self.assertDatasetIdentical(expected, data.squeeze(*args))
    # invalid squeeze: 'y' has length 2, so it cannot be squeezed
    with self.assertRaisesRegexp(ValueError, 'cannot select a dimension'):
        data.squeeze('y')
def test_dataset_repr_with_netcdf4_datetimes(self):
    """repr shows object dtype for undecodable times; regression test for #347."""
    # a 'noleap' calendar cannot be represented as datetime64
    attrs = {'units': 'days since 0001-01-01', 'calendar': 'noleap'}
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', 'unable to decode time')
        ds = decode_cf(Dataset({'time': ('time', [0, 1], attrs)}))
        self.assertIn('(time) object', repr(ds))
    attrs = {'units': 'days since 1900-01-01'}
    ds = decode_cf(Dataset({'time': ('time', [0, 1], attrs)}))
    self.assertIn('(time) datetime64[ns]', repr(ds))
def test_simultaneous_compute(self):
    """Loading all chunked variables triggers a single dask computation."""
    ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).chunk()
    # mutable cell so the closure can count calls
    count = [0]

    def counting_get(*args, **kwargs):
        count[0] += 1
        return dask.get(*args, **kwargs)

    with dask.set_options(get=counting_get):
        ds.load()
    self.assertEqual(count[0], 1)
def test_simultaneous_compute(self):
    """Loading two chunked variables at once runs only one dask compute."""
    dataset = Dataset({"foo": ("x", range(5)),
                       "bar": ("x", range(5))}).chunk()
    calls = [0]

    def tracking_get(*args, **kwargs):
        calls[0] += 1
        return dask.get(*args, **kwargs)

    with dask.set_options(get=tracking_get):
        dataset.load()
    self.assertEqual(calls[0], 1)
def test_init(self):
    """Dataset() validates conflicting sizes and index dimensionality."""
    var1 = Variable('x', 2 * np.arange(100))
    var2 = Variable('x', np.arange(1000))
    var3 = Variable(['x', 'y'], np.arange(1000).reshape(100, 10))
    with self.assertRaisesRegexp(ValueError, 'but already exists'):
        # 'x' would need two different lengths
        Dataset({'a': var1, 'b': var2})
    with self.assertRaisesRegexp(ValueError, 'must be defined with 1-d'):
        Dataset({'a': var1, 'x': var3})
    # verify handling of DataArrays
    expected = Dataset({'x': var1, 'z': var3})
    actual = Dataset({'z': expected['z']})
    self.assertDatasetIdentical(expected, actual)
def test_lazy_load(self):
    """Variables loaded from a store are not read until data is accessed."""
    store = InaccessibleVariableDataStore()
    store.set_dimension('dim', 10)
    store.set_variable('dim', XArray(('dim'), np.arange(10)))
    store.set_variable('var', XArray(('dim'), np.random.uniform(size=10)))
    ds = Dataset()
    # accessing .data must raise, proving nothing was eagerly read
    ds = ds.load_store(store, decode_cf=False)
    self.assertRaises(UnexpectedDataAccess, lambda: ds['var'].data)
    ds = ds.load_store(store, decode_cf=True)
    self.assertRaises(UnexpectedDataAccess, lambda: ds['var'].data)
def test_simultaneous_compute(self):
    """Loading all blocked variables triggers a single dask computation."""
    ds = Dataset({'foo': ('x', range(5)),
                  'bar': ('x', range(5))}).reblock()
    # 0-d array used as a mutable counter for the closure
    count = np.array(0)

    def counting_get(*args, **kwargs):
        count[...] += 1
        return dask.get(*args, **kwargs)

    with dask.set_options(get=counting_get):
        ds.load_data()
    self.assertEqual(count, 1)
def test_reset_coords(self):
    """reset_coords moves coordinates back to data variables or drops them."""
    data = DataArray(np.zeros((3, 4)),
                     {'bar': ('x', ['a', 'b', 'c']),
                      'baz': ('y', range(4))},
                     dims=['x', 'y'], name='foo')
    # no argument: all non-index coords become data variables
    actual = data.reset_coords()
    expected = Dataset({'foo': (['x', 'y'], np.zeros((3, 4))),
                        'bar': ('x', ['a', 'b', 'c']),
                        'baz': ('y', range(4))})
    self.assertDatasetIdentical(actual, expected)

    actual = data.reset_coords(['bar', 'baz'])
    self.assertDatasetIdentical(actual, expected)

    # resetting a single coord leaves the other as a coordinate
    actual = data.reset_coords('bar')
    expected = Dataset({'foo': (['x', 'y'], np.zeros((3, 4))),
                        'bar': ('x', ['a', 'b', 'c'])},
                       {'baz': ('y', range(4))})
    self.assertDatasetIdentical(actual, expected)

    actual = data.reset_coords(['bar'])
    self.assertDatasetIdentical(actual, expected)

    # drop=True removes coords entirely and keeps the DataArray type
    actual = data.reset_coords(drop=True)
    expected = DataArray(np.zeros((3, 4)), dims=['x', 'y'], name='foo')
    self.assertDataArrayIdentical(actual, expected)

    actual = data.copy()
    actual.reset_coords(drop=True, inplace=True)
    self.assertDataArrayIdentical(actual, expected)

    actual = data.reset_coords('bar', drop=True)
    expected = DataArray(np.zeros((3, 4)), {'baz': ('y', range(4))},
                         dims=['x', 'y'], name='foo')
    self.assertDataArrayIdentical(actual, expected)

    # error cases
    with self.assertRaisesRegexp(ValueError, 'cannot reset coord'):
        data.reset_coords(inplace=True)
    with self.assertRaises(KeyError):
        data.reset_coords('foo', drop=True)
    with self.assertRaisesRegexp(ValueError, 'cannot be found'):
        data.reset_coords('not_found')
    with self.assertRaisesRegexp(ValueError, 'cannot remove index'):
        data.reset_coords('y')
def test_concat_coords(self):
    """The ``coords`` argument controls which coordinates are concatenated."""
    data = Dataset({'foo': ('x', np.random.randn(10))})
    expected = data.assign_coords(c=('x', [0] * 5 + [1] * 5))
    objs = [
        data.isel(x=slice(5)).assign_coords(c=0),
        data.isel(x=slice(5, None)).assign_coords(c=1)
    ]
    for coords in ['different', 'all', ['c']]:
        actual = concat(objs, dim='x', coords=coords)
        self.assertDatasetIdentical(expected, actual)
    for coords in ['minimal', []]:
        # scalar coord 'c' differs between inputs, so it can't be skipped
        with self.assertRaisesRegexp(ValueError, 'not equal across'):
            concat(objs, dim='x', coords=coords)
def test_lock(self):
    """netCDF backends put a lock in their dask tasks; scipy does not."""
    original = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as tmp:
        original.to_netcdf(tmp, format='NETCDF3_CLASSIC')
        with open_dataset(tmp, chunks=10) as ds:
            # the last element of the getter task is the lock instance
            task = ds.foo.data.dask[ds.foo.data.name, 0]
            self.assertIsInstance(task[-1], type(Lock()))
        with open_mfdataset(tmp) as ds:
            task = ds.foo.data.dask[ds.foo.data.name, 0]
            self.assertIsInstance(task[-1], type(Lock()))
        with open_mfdataset(tmp, engine='scipy') as ds:
            task = ds.foo.data.dask[ds.foo.data.name, 0]
            self.assertNotIsInstance(task[-1], type(Lock()))
def test_open_dataset(self):
    """open_dataset honours dict and int ``chunks`` and defaults to numpy."""
    original = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as tmp:
        original.to_netcdf(tmp)
        with open_dataset(tmp, chunks={'x': 5}) as actual:
            self.assertIsInstance(actual.foo.variable.data, da.Array)
            self.assertEqual(actual.foo.variable.data.chunks, ((5, 5), ))
            self.assertDatasetIdentical(original, actual)
        with open_dataset(tmp, chunks=5) as actual:
            # an integer chunk size applies to every dimension
            self.assertDatasetIdentical(original, actual)
        with open_dataset(tmp) as actual:
            self.assertIsInstance(actual.foo.variable.data, np.ndarray)
            self.assertDatasetIdentical(original, actual)
def test_to_and_from_dataframe(self):
    """Round-trip between Dataset and pandas DataFrame, incl. MultiIndex."""
    x = np.random.randn(10)
    y = np.random.randn(10)
    t = list('abcdefghij')
    ds = Dataset(
        OrderedDict([
            ('a', ('t', x)),
            ('b', ('t', y)),
            ('t', ('t', t)),
        ]))
    expected = pd.DataFrame(np.array([x, y]).T, columns=['a', 'b'],
                            index=pd.Index(t, name='t'))
    actual = ds.to_dataframe()
    # use the .equals method to check all DataFrame metadata
    assert expected.equals(actual), (expected, actual)
    # check roundtrip
    self.assertDatasetIdentical(ds, Dataset.from_dataframe(actual))

    # test a case with a MultiIndex
    w = np.random.randn(2, 3)
    ds = Dataset({'w': (('x', 'y'), w)})
    ds['y'] = ('y', list('abc'))
    exp_index = pd.MultiIndex.from_arrays(
        [[0, 0, 0, 1, 1, 1], ['a', 'b', 'c', 'a', 'b', 'c']],
        names=['x', 'y'])
    expected = pd.DataFrame(w.reshape(-1), columns=['w'], index=exp_index)
    actual = ds.to_dataframe()
    self.assertTrue(expected.equals(actual))
    # check roundtrip
    self.assertDatasetIdentical(ds, Dataset.from_dataframe(actual))
def test_dataset():
    """Build a synthetic Dataset with all the dimensions GCMDataset expects.

    Oceanic parameters, cartesian coordinates, doubly periodic: vertical
    (Z*) and horizontal (X/Y*) grids plus their uniform spacings.

    Fix: ``dyG`` (a y-direction spacing) was filled with ``dx``; every
    other spacing variable pairs dx with x-direction and dy with
    y-direction, so this was a copy-paste bug.
    """
    H = 5000.
    Lx = 4e6
    Ly = 3e6
    Nz = 10
    Nx = 25
    Ny = 20
    dz = H / Nz
    dx = Lx / Nx
    dy = Ly / Ny
    ds = Dataset()
    ds.attrs['H'] = H
    ds.attrs['Lx'] = Lx
    ds.attrs['Ly'] = Ly
    ds.attrs['Nz'] = Nz
    ds.attrs['Nx'] = Nx
    ds.attrs['Ny'] = Ny
    ds.attrs['dz'] = dz
    ds.attrs['dx'] = dx
    ds.attrs['dy'] = dy
    # vertical grid: cell centers (Z), interfaces (Zp1), lower/upper faces
    ds['Z'] = ('Z', dz/2 + dz*np.arange(Nz))
    ds['Zp1'] = ('Zp1', dz*np.arange(Nz+1))
    ds['Zl'] = ('Zl', dz*np.arange(Nz))
    ds['Zu'] = ('Zu', dz + dz*np.arange(Nz))
    # vertical spacing (half cells at top and bottom for drC)
    ds['drF'] = ('Z', np.full(Nz, dz))
    ds['drC'] = ('Zp1', np.hstack([dz/2, np.full(Nz-1, dz), dz/2]))
    # horizontal grid: cell centers (X, Y) and corners (Xp1, Yp1)
    ds['X'] = ('X', dx/2 + dx*np.arange(Nx))
    ds['Xp1'] = ('Xp1', dx*np.arange(Nx))
    ds['Y'] = ('Y', dy/2 + dy*np.arange(Ny))
    ds['Yp1'] = ('Yp1', dy*np.arange(Ny))
    xc, yc = np.meshgrid(ds.X, ds.Y)
    xg, yg = np.meshgrid(ds.Xp1, ds.Yp1)
    ds['XC'] = (('Y', 'X'), xc)
    ds['YC'] = (('Y', 'X'), yc)
    ds['XG'] = (('Yp1', 'Xp1'), xg)
    ds['YG'] = (('Yp1', 'Xp1'), yg)
    # horizontal spacing (uniform grid)
    ds['dxC'] = (('Y', 'Xp1'), np.full((Ny, Nx), dx))
    ds['dyC'] = (('Yp1', 'X'), np.full((Ny, Nx), dy))
    ds['dxG'] = (('Yp1', 'X'), np.full((Ny, Nx), dx))
    # BUG FIX: was np.full((Ny, Nx), dx) — dyG must use dy
    ds['dyG'] = (('Y', 'Xp1'), np.full((Ny, Nx), dy))
    return ds
def test_unselect(self):
    """unselect removes the named variables; no argument is a no-op."""
    data = create_test_data()
    self.assertEqual(data, data.unselect())
    expected = Dataset({k: data[k] for k in data if k != 'time'})
    actual = data.unselect('time')
    self.assertEqual(expected, actual)
    # removing 'dim1' also removes the variables defined along it
    expected = Dataset({k: data[k] for k in ['dim2', 'dim3', 'time']})
    actual = data.unselect('dim1')
    self.assertEqual(expected, actual)
    with self.assertRaisesRegexp(ValueError, 'does not exist in this'):
        data.unselect('not_found_here')
def test_concat_errors(self):
    """concat raises informative errors for each invalid argument combination."""
    data = create_test_data()
    split_data = [data.isel(dim1=slice(3)),
                  data.isel(dim1=slice(3, None))]
    with self.assertRaisesRegexp(ValueError, 'must supply at least one'):
        concat([], 'dim1')
    with self.assertRaisesRegexp(ValueError, 'are not coordinates'):
        concat([data, data], 'new_dim', coords=['not_found'])
    with self.assertRaisesRegexp(ValueError, 'global attributes not'):
        data0, data1 = deepcopy(split_data)
        data1.attrs['foo'] = 'bar'
        concat([data0, data1], 'dim1', compat='identical')
    # differing attrs are fine under the weaker 'equals' comparison
    self.assertDatasetIdentical(
        data, concat([data0, data1], 'dim1', compat='equals'))
    with self.assertRaisesRegexp(ValueError, 'encountered unexpected'):
        data0, data1 = deepcopy(split_data)
        data1['foo'] = ('bar', np.random.randn(10))
        concat([data0, data1], 'dim1')
    with self.assertRaisesRegexp(ValueError, 'not equal across datasets'):
        data0, data1 = deepcopy(split_data)
        data1['dim2'] = 2 * data1['dim2']
        concat([data0, data1], 'dim1', coords='minimal')
    with self.assertRaisesRegexp(ValueError, 'it is not 1-dimensional'):
        # note: reuses data0/data1 with the modified 'dim2' from above
        concat([data0, data1], 'dim1')
    with self.assertRaisesRegexp(ValueError, 'compat.* invalid'):
        concat(split_data, 'dim1', compat='foobar')
    with self.assertRaisesRegexp(ValueError, 'unexpected value for'):
        concat([data, data], 'new_dim', coords='foobar')
    with self.assertRaisesRegexp(
            ValueError, 'coordinate in some datasets but not others'):
        concat([Dataset({'x': 0}), Dataset({'x': [1]})], dim='z')
    with self.assertRaisesRegexp(
            ValueError, 'coordinate in some datasets but not others'):
        concat([Dataset({'x': 0}), Dataset({}, {'x': 1})], dim='z')
    # removed keyword arguments must raise, not be silently ignored
    with self.assertRaisesRegexp(ValueError, 'no longer a valid'):
        concat([data, data], 'new_dim', mode='different')
    with self.assertRaisesRegexp(ValueError, 'no longer a valid'):
        concat([data, data], 'new_dim', concat_over='different')
def roundtrip(self, data, decode_cf=True):
    """Dump ``data`` to an in-memory CF-encoded store and yield it back.

    NOTE(review): this is a generator used as a context manager elsewhere
    in the tests — presumably decorated with ``contextlib.contextmanager``
    outside this view; confirm at the definition site.
    """
    store = CFEncodedInMemoryStore()
    data.dump_to_store(store)
    if decode_cf:
        yield conventions.decode_cf(store)
    else:
        yield Dataset.load_store(store)
def create_test_data(seed=None):
    """Return a small Dataset fixture, reproducible for a given ``seed``."""
    rs = np.random.RandomState(seed)
    _vars = {'var1': ['dim1', 'dim2'],
             'var2': ['dim1', 'dim2'],
             'var3': ['dim3', 'dim1']}
    _dims = {'dim1': 8, 'dim2': 9, 'dim3': 10}
    obj = Dataset()
    obj['time'] = ('time', pd.date_range('2000-01-01', periods=20))
    obj['dim1'] = ('dim1', np.arange(_dims['dim1']))
    obj['dim2'] = ('dim2', 0.5 * np.arange(_dims['dim2']))
    obj['dim3'] = ('dim3', list('abcdefghij'))
    # sorted() keeps creation order deterministic across dict orderings
    for v, dims in sorted(_vars.items()):
        data = rs.normal(size=tuple(_dims[d] for d in dims))
        obj[v] = (dims, data, {'foo': 'variable'})
    obj.coords['numbers'] = ('dim3', [0, 1, 2, 0, 0, 1, 1, 2, 2, 3])
    return obj
def test_groupby(self):
    """Basic groupby: length, groups mapping, iteration, identity apply."""
    data = Dataset({'x': ('x', list('abc')),
                    'c': ('x', [0, 1, 0]),
                    'z': (['x', 'y'], np.random.randn(3, 5))})
    groupby = data.groupby('x')
    self.assertEqual(len(groupby), 3)
    expected_groups = {'a': 0, 'b': 1, 'c': 2}
    self.assertEqual(groupby.groups, expected_groups)
    expected_items = [('a', data.isel(x=0)),
                      ('b', data.isel(x=1)),
                      ('c', data.isel(x=2))]
    self.assertEqual(list(groupby), expected_items)
    # applying the identity with squeeze=False must reproduce the dataset
    identity = lambda x: x
    for k in ['x', 'c', 'y']:
        actual = data.groupby(k, squeeze=False).apply(identity)
        self.assertEqual(data, actual)
def test_open_mfdataset(self):
    """open_mfdataset combines multiple files into one chunked dataset."""
    original = Dataset({'foo': ('x', np.random.randn(10))})
    with create_tmp_file() as tmp1:
        with create_tmp_file() as tmp2:
            original.isel(x=slice(5)).to_netcdf(tmp1)
            original.isel(x=slice(5, 10)).to_netcdf(tmp2)
            with open_mfdataset([tmp1, tmp2]) as actual:
                self.assertIsInstance(actual.foo.variable.data, da.Array)
                self.assertEqual(actual.foo.variable.data.chunks,
                                 ((5, 5),))
                self.assertDatasetAllClose(original, actual)
            # explicit chunks are applied within each file's segment
            with open_mfdataset([tmp1, tmp2], chunks={'x': 3}) as actual:
                self.assertEqual(actual.foo.variable.data.chunks,
                                 ((3, 2, 3, 2),))
    with self.assertRaisesRegexp(IOError, 'no files to open'):
        open_mfdataset('foo-bar-baz-*.nc')
def test_write_store(self):
    """Data dumped to a store and loaded back compares close to the original."""
    expected = create_test_data()
    with self.create_store() as store:
        expected.dump_to_store(store)
        # the test data contains times. In case the store
        # cf_encodes them we need to cf_decode them.
        actual = Dataset.load_store(store, cf_decoder)
        self.assertDatasetAllClose(expected, actual)
def test_zero_dimensional_variable(self):
    """A scalar (0-d) variable with attrs survives a store round-trip."""
    expected = create_test_data()
    expected['xray_awesomeness'] = ([], np.array(1.e9),
                                    {'units': 'units of awesome'})
    with self.create_store() as store:
        expected.dump_to_store(store)
        actual = Dataset.load_store(store)
        self.assertDatasetAllClose(expected, actual)