def test_dict_equiv(self):
    x = OrderedDict()
    x['a'] = 3
    x['b'] = np.array([1, 2, 3])
    y = OrderedDict()
    y['b'] = np.array([1.0, 2.0, 3.0])
    y['a'] = 3
    self.assertTrue(utils.dict_equiv(x, y))  # two nparrays are equal
    y['b'] = [1, 2, 3]  # np.array not the same as a list
    self.assertTrue(utils.dict_equiv(x, y))  # nparray == list
    x['b'] = [1.0, 2.0, 3.0]
    self.assertTrue(utils.dict_equiv(x, y))  # list vs. list
    x['c'] = None
    self.assertFalse(utils.dict_equiv(x, y))  # new key in x
    x['c'] = np.nan
    y['c'] = np.nan
    self.assertTrue(utils.dict_equiv(x, y))  # as intended, nan is nan
    x['c'] = np.inf
    y['c'] = np.inf
    self.assertTrue(utils.dict_equiv(x, y))  # inf == inf
    y = dict(y)
    self.assertTrue(utils.dict_equiv(x, y))  # different dictionary types are fine
    y['b'] = 3 * np.arange(3)
    self.assertFalse(utils.dict_equiv(x, y))  # not equal when arrays differ
def test_auto_combine(self):
    objs = [Dataset({'x': [0]}), Dataset({'x': [1]})]
    actual = auto_combine(objs)
    expected = Dataset({'x': [0, 1]})
    self.assertDatasetIdentical(expected, actual)

    actual = auto_combine([actual])
    self.assertDatasetIdentical(expected, actual)

    objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})]
    actual = auto_combine(objs)
    expected = Dataset({'x': [0, 1, 2]})
    self.assertDatasetIdentical(expected, actual)

    # ensure auto_combine handles non-sorted variables
    objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])),
            Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))]
    actual = auto_combine(objs)
    expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])})
    self.assertDatasetIdentical(expected, actual)

    objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})]
    with self.assertRaisesRegexp(ValueError, 'too many .* dimensions'):
        auto_combine(objs)

    objs = [Dataset({'x': 0}), Dataset({'x': 1})]
    with self.assertRaisesRegexp(ValueError, 'cannot infer dimension'):
        auto_combine(objs)

    objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})]
    with self.assertRaises(KeyError):
        auto_combine(objs)
def test_unstack(self):
    v = Variable('z', [0, 1, 2, 3], {'foo': 'bar'})

    actual = v.unstack(z=OrderedDict([('x', 2), ('y', 2)]))
    expected = Variable(('x', 'y'), [[0, 1], [2, 3]], v.attrs)
    self.assertVariableIdentical(actual, expected)

    actual = v.unstack(z=OrderedDict([('x', 4), ('y', 1)]))
    expected = Variable(('x', 'y'), [[0], [1], [2], [3]], v.attrs)
    self.assertVariableIdentical(actual, expected)

    actual = v.unstack(z=OrderedDict([('x', 4)]))
    expected = Variable('x', [0, 1, 2, 3], v.attrs)
    self.assertVariableIdentical(actual, expected)
def _initialize_rho_trsp_dataset(cds, rho, lat_vals=None):
    """Create an xarray Dataset with time, depth, and latitude dims

    Parameters
    ----------
    cds : xarray Dataset
        Must contain the coordinates 'k' and (optionally) 'time'
    rho : xarray DataArray
        Containing the density field to be binned and made into our new
        vertical coordinate
    lat_vals : int or array of ints, optional
        latitude value(s) rounded to the nearest degree specifying where
        to compute transport

    Returns
    -------
    ds : xarray Dataset
        zero-valued Dataset with time, depth, and latitude dimensions
    """

    # Create density bins
    rho_bin_edges, rho_bin_centers = get_rho_bins(rho.min().values,
                                                  rho.max().values,
                                                  len(cds['k']))
    Nrho = len(rho_bin_centers)
    k_rho = np.arange(Nrho)
    k_rho_f = np.arange(len(rho_bin_edges))

    coords = OrderedDict()
    dims = ()

    if 'time' in cds:
        coords.update({'time': cds['time'].values})
        dims += ('time', )
        if lat_vals is not None:
            zeros = np.zeros((len(cds['time'].values), Nrho, len(lat_vals)))
        else:
            zeros = np.zeros((len(cds['time'].values), Nrho))
    else:
        if lat_vals is not None:
            zeros = np.zeros((Nrho, len(lat_vals)))
        else:
            zeros = np.zeros((Nrho))

    coords.update({'k_rho': k_rho})
    dims += ('k_rho', )

    if lat_vals is not None:
        coords.update({'lat': lat_vals})
        dims += ('lat', )

    da = xr.DataArray(data=zeros, coords=coords, dims=dims)

    # This could be much cleaner, and should mirror the
    # xgcm notation.
    ds = da.to_dataset(name='trsp')
    ds['rho_c'] = rho_bin_centers
    ds['rho_f'] = rho_bin_edges
    ds['k_rho_f'] = k_rho_f

    return ds
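# Hypothetical usage sketch (added for illustration; `cds` and `rho` stand in
# for an ECCO grid dataset and a density DataArray and are not defined here).
# Shows the shape of the container the initializer above returns.
#
#   lat_vals = np.arange(-30, 31)
#   ds = _initialize_rho_trsp_dataset(cds, rho, lat_vals=lat_vals)
#   # ds['trsp'] has dims ('time', 'k_rho', 'lat') when 'time' is in cds, and
#   # ds['rho_c'] / ds['rho_f'] hold the density bin centers / edges.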
def _get_all_data_variables(data_dir, layers):
    """Put all the relevant data metadata into one big dictionary."""
    allvars = [state_variables]
    allvars.append(package_state_variables)

    # add others from available_diagnostics.log
    fname = os.path.join(data_dir, 'available_diagnostics.log')
    if os.path.exists(fname):
        diag_file = fname
    else:
        warnings.warn("Couldn't find available_diagnostics.log "
                      "in %s. Using default version." % data_dir)
        from .default_diagnostics import diagnostics
        diag_file = StringIO(diagnostics)
    available_diags = parse_available_diagnostics(diag_file, layers)
    allvars.append(available_diags)
    metadata = _concat_dicts(allvars)

    # Now add the suffix '-T' to every diagnostic. This is a somewhat hacky
    # way to increase the coverage of possible output filenames.
    # But it doesn't work in python3!!!
    extra_metadata = OrderedDict()
    for name, val in metadata.items():
        newname = name + '-T'
        extra_metadata[newname] = val
    metadata = _concat_dicts([metadata, extra_metadata])

    # now fill in aliases
    for alias, original in aliases.items():
        metadata[alias] = metadata[original]

    return metadata
def test_to_dask_dataframe(self):
    # Test conversion of Datasets to dask DataFrames
    x = da.from_array(np.random.randn(10), chunks=4)
    y = np.arange(10, dtype='uint8')
    t = list('abcdefghij')

    ds = Dataset(
        OrderedDict([('a', ('t', x)), ('b', ('t', y)), ('t', ('t', t))]))

    expected_pd = pd.DataFrame({'a': x, 'b': y},
                               index=pd.Index(t, name='t'))

    # test if 1-D index is correctly set up
    expected = dd.from_pandas(expected_pd, chunksize=4)
    actual = ds.to_dask_dataframe(set_index=True)
    # test if we have dask dataframes
    self.assertIsInstance(actual, dd.DataFrame)

    # use the .equals from pandas to check dataframes are equivalent
    assert_frame_equal(expected.compute(), actual.compute())

    # test if no index is given
    expected = dd.from_pandas(expected_pd.reset_index(drop=False),
                              chunksize=4)
    actual = ds.to_dask_dataframe(set_index=False)
    self.assertIsInstance(actual, dd.DataFrame)
    assert_frame_equal(expected.compute(), actual.compute())
def test_to_dask_dataframe_coordinates(self):
    # Test if coordinate is also a dask array
    x = da.from_array(np.random.randn(10), chunks=4)
    t = da.from_array(np.arange(10) * 2, chunks=4)

    ds = Dataset(OrderedDict([('a', ('t', x)), ('t', ('t', t))]))

    expected_pd = pd.DataFrame({'a': x}, index=pd.Index(t, name='t'))
    expected = dd.from_pandas(expected_pd, chunksize=4)
    actual = ds.to_dask_dataframe(set_index=True)
    self.assertIsInstance(actual, dd.DataFrame)
    assert_frame_equal(expected.compute(), actual.compute())
def test_reduce_keep_attrs(self):
    _attrs = {'units': 'test', 'long_name': 'testing'}
    v = Variable(['x', 'y'], self.d, _attrs)

    # Test dropped attrs
    vm = v.mean()
    self.assertEqual(len(vm.attrs), 0)
    self.assertEqual(vm.attrs, OrderedDict())

    # Test kept attrs
    vm = v.mean(keep_attrs=True)
    self.assertEqual(len(vm.attrs), len(_attrs))
    self.assertEqual(vm.attrs, _attrs)
def test_to_dask_dataframe_not_daskarray(self):
    # Test if DataArray is not a dask array
    x = np.random.randn(10)
    y = np.arange(10, dtype='uint8')
    t = list('abcdefghij')

    ds = Dataset(
        OrderedDict([('a', ('t', x)), ('b', ('t', y)), ('t', ('t', t))]))

    expected = pd.DataFrame({'a': x, 'b': y}, index=pd.Index(t, name='t'))

    actual = ds.to_dask_dataframe(set_index=True)
    self.assertIsInstance(actual, dd.DataFrame)
    assert_frame_equal(expected, actual.compute())
def _initialize_trsp_data_array(cds, lat_vals):
    """Create an xarray DataArray with time, depth, and latitude dims

    Parameters
    ----------
    cds : xarray Dataset
        contains LLC coordinates 'k' and (optionally) 'time'
    lat_vals : int or array of ints
        latitude value(s) rounded to the nearest degree specifying where
        to compute transport

    Returns
    -------
    ds_out : xarray Dataset
        Dataset with the variables
            'trsp_z'
                zero-valued DataArray with time (optional), depth, and
                latitude dimensions
            'Z'
                the original depth coordinate
    """

    coords = OrderedDict()
    dims = ()

    if 'time' in cds:
        coords.update({'time': cds['time'].values})
        dims += ('time', )
        zeros = np.zeros(
            (len(cds['time'].values), len(cds['k'].values), len(lat_vals)))
    else:
        zeros = np.zeros((len(cds['k'].values), len(lat_vals)))

    coords.update({'k': cds['k'].values})
    coords.update({'lat': lat_vals})
    dims += ('k', 'lat')

    xda = xr.DataArray(data=zeros, coords=coords, dims=dims)

    # Convert to dataset to add Z coordinate
    xds = xda.to_dataset(name='trsp_z')
    xds['Z'] = cds['Z']
    xds = xds.set_coords('Z')

    return xds
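# Illustrative usage sketch (added; not part of the original module). Builds a
# tiny stand-in grid dataset just to show the shape of the zero-filled output;
# the coordinate values below are made up.
def _example_initialize_trsp():
    import numpy as np
    import xarray as xr
    dummy_cds = xr.Dataset(coords={'k': np.arange(5),
                                   'Z': ('k', -10.0 * np.arange(5)),
                                   'time': np.arange(3)})
    out = _initialize_trsp_data_array(dummy_cds, lat_vals=np.array([-26, 0, 26]))
    # out['trsp_z'] has dims ('time', 'k', 'lat') and is all zeros;
    # out['Z'] carries over the original depth coordinate.
    return out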
def _make_layers_variables(layer_name):
    """Translate metadata template to actual variable metadata."""
    from .variables import layers_grid_variables
    lvars = OrderedDict()
    layer_num = layer_name[0]
    # should always be int
    assert isinstance(int(layer_num), int)
    layer_id = 'l' + layer_num
    for key, vals in layers_grid_variables.items():
        # replace the name template with the actual name
        # e.g. layer_NAME_bounds -> layer_1RHO_bounds
        varname = key.replace('NAME', layer_name)
        metadata = _recursively_replace(vals, 'NAME', layer_name)
        # now fix dimension
        metadata['dims'] = [metadata['dims'][0].replace('l', layer_id)]
        lvars[varname] = metadata
    return lvars
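# Worked example (added for illustration; the template dimension name 'l_b' is
# an assumption, chosen so the substitutions mirror the comments above).
def _example_layers_naming():
    layer_name = '1RHO'
    layer_id = 'l' + layer_name[0]                              # 'l1'
    varname = 'layer_NAME_bounds'.replace('NAME', layer_name)   # 'layer_1RHO_bounds'
    dimname = 'l_b'.replace('l', layer_id)                      # 'l1_b'
    return varname, dimname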
def test_expand_dims(self):
    v = Variable(['x'], [0, 1])

    actual = v.expand_dims(['x', 'y'])
    expected = Variable(['x', 'y'], [[0], [1]])
    self.assertVariableIdentical(actual, expected)

    actual = v.expand_dims(['y', 'x'])
    self.assertVariableIdentical(actual, expected.T)

    actual = v.expand_dims(OrderedDict([('x', 2), ('y', 2)]))
    expected = Variable(['x', 'y'], [[0, 0], [1, 1]])
    self.assertVariableIdentical(actual, expected)

    v = Variable(['foo'], [0, 1])
    actual = v.expand_dims('foo')
    expected = v
    self.assertVariableIdentical(actual, expected)

    with self.assertRaisesRegexp(ValueError, 'must be a superset'):
        v.expand_dims(['z'])
def drop(self, labels, dim=None, inplace=False):
    """Drop variables or index labels from this dataset.
    Based on xarray.dataset.drop, but adds inplace option.

    Parameters
    ----------
    labels : scalar or list of scalars
        Name(s) of variables or index labels to drop.
    dim : None or str, optional
        Dimension along which to drop index labels.
        By default (if ``dim is None``), drops variables rather than
        index labels.
    inplace : bool, optional
        Whether the original dataset should be modified or a new one
        created.

    Returns
    -------
    dropped : Dataset (self if inplace=True)
    """
    if utils.is_scalar(labels):
        labels = [labels]
    if dim is None:
        self._assert_all_in_dataset(labels)
        drop = set(labels)
        variables = OrderedDict(
            (k, v) for k, v in iteritems(self._variables) if k not in drop)
        coord_names = set(k for k in self._coord_names if k in variables)
        result = self._replace_vars_and_dims(variables, coord_names,
                                             inplace=inplace)
    else:
        try:
            index = self.indexes[dim]
        except KeyError:
            raise ValueError('dimension %r does not have coordinate labels'
                             % dim)
        new_index = index.drop(labels)
        result = self.loc[{dim: new_index}]
    return self if inplace else result
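# Hypothetical usage sketch (added for illustration; the dataset and variable
# names are placeholders, not taken from this codebase).
#
#   ds2 = ds.drop('temperature')                 # returns a new dataset
#   ds.drop('temperature', inplace=True)         # modifies and returns `ds`
#   ds3 = ds.drop([2000, 2001], dim='year')      # drop index labels along 'year'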
def _initialize_section_trsp_data_array(cds):
    """Create an xarray DataArray with time and depth dims

    Parameters
    ----------
    cds : xarray Dataset
        contains LLC coordinates 'k' and (optionally) 'time'

    Returns
    -------
    ds_out : xarray Dataset
        Dataset with the variables
            'trsp_z'
                zero-valued DataArray with time (optional) and depth
                dimensions
            'Z'
                the original depth coordinate
    """

    coords = OrderedDict()
    dims = ()

    if 'time' in cds:
        coords.update({'time': cds['time'].values})
        dims += ('time',)
        zeros = np.zeros((len(cds['time'].values), len(cds['k'].values)))
    else:
        zeros = np.zeros((len(cds['k'].values)))

    coords.update({'k': cds['k'].values})
    dims += ('k',)

    xda = xr.DataArray(data=zeros, coords=coords, dims=dims)

    # Convert to dataset to add Z coordinate
    xds = xda.to_dataset(name='trsp_z')
    xds['Z'] = cds['Z']
    xds = xds.set_coords('Z')

    return xds
def __init__(self, data_dir, grid_dir=None, iternum=None, delta_t=1,
             read_grid=True, file_prefixes=None, ref_date=None,
             calendar=None, geometry='sphericalpolar', endian='>',
             ignore_unknown_vars=False, default_dtype=np.dtype('f4'),
             nx=None, ny=None, nz=None, llc_method="smallchunks",
             levels=None, extra_metadata=None, extra_variables=None):
    """
    This is not a user-facing class. See open_mdsdataset for argument
    documentation. The only ones which are distinct are listed below.

    Parameters
    ----------
    iternum : int, optional
        The iteration timestep number to read.
    file_prefixes : list
        The prefixes of the data files to be read.
    """
    self.geometry = geometry.lower()
    allowed_geometries = [
        'cartesian', 'sphericalpolar', 'llc', 'cs', 'curvilinear'
    ]
    if self.geometry not in allowed_geometries:
        raise ValueError('Unexpected value for parameter `geometry`. '
                         'It must be one of the following: %s'
                         % allowed_geometries)

    # the directory where the files live
    self.data_dir = data_dir
    self.grid_dir = grid_dir if (grid_dir is not None) else data_dir
    self.extra_variables = extra_variables
    self._ignore_unknown_vars = ignore_unknown_vars

    # The endianness of the files
    # By default, MITgcm does big endian
    if endian not in ['>', '<', '=']:
        raise ValueError("Invalid byte order (endian=%s)" % endian)
    self.endian = endian
    if default_dtype is not None:
        self.default_dtype = np.dtype(default_dtype).newbyteorder(endian)
    else:
        self.default_dtype = default_dtype

    # storage dicts for variables and attributes
    self._variables = OrderedDict()
    self._attributes = OrderedDict()
    self._dimensions = []

    # the dimensions are theoretically the same for all datasets
    [self._dimensions.append(k) for k in dimensions]
    self.llc = (self.geometry == 'llc')
    self.cs = (self.geometry == 'cs')

    if nz is None:
        self.nz = _guess_model_nz(self.grid_dir)
    else:
        self.nz = nz

    # if user passes extra_metadata, this should have priority
    user_metadata = True if extra_metadata is not None else False

    # put in local variable to make it more readable
    if extra_metadata is not None and 'has_faces' in extra_metadata:
        has_faces = extra_metadata['has_faces']
    else:
        has_faces = False

    # --------------- LEGACY ----------------------
    if self.llc:
        has_faces = True
        if extra_metadata is None or 'ny_facets' not in extra_metadata:
            # default to llc90, we only need number of facets
            # and we cannot know nx at this point
            llc = get_extra_metadata(domain='llc', nx=90)
            extra_metadata = llc
    if self.cs:
        has_faces = True
        if extra_metadata is None or 'ny_facets' not in extra_metadata:
            # default to cs32, we only need number of facets
            # and we cannot know nx at this point
            cs = get_extra_metadata(domain='cs', nx=32)
            extra_metadata = cs
    # --------------- /LEGACY ----------------------

    # we don't need to know ny if using llc
    if has_faces and (nx is not None):
        ny = nx

    # Now we need to figure out the horizontal dimensions nx, ny
    # nface is the number of llc faces
    if (nx is not None) and (ny is not None):
        # we have been passed enough information to determine the
        # dimensions without reading any files
        self.ny, self.nx = ny, nx
        self.nface = len(extra_metadata['face_facets']) if has_faces \
            else None
    else:
        # have to peek at the grid file metadata
        self.nface, self.ny, self.nx = (
            _guess_model_horiz_dims(self.grid_dir, is_llc=self.llc,
                                    is_cs=self.cs))

    # --------------- LEGACY ----------------------
    if self.llc:
        if not user_metadata:
            # if user didn't provide metadata, we default to llc
            llc = get_extra_metadata(domain='llc', nx=self.nx)
            extra_metadata = llc
    if self.cs:
        if not user_metadata:
            # if user didn't provide metadata, we default to cs
            cs = get_extra_metadata(domain='cs', nx=self.nx)
            extra_metadata = cs
    # --------------- /LEGACY ----------------------

    self.layers = _guess_layers(data_dir)

    if has_faces:
        nyraw = self.nx * self.nface
    else:
        nyraw = self.ny
    self.default_shape_3D = (self.nz, nyraw, self.nx)
    self.default_shape_2D = (nyraw, self.nx)
    self.llc_method = llc_method

    # Now set up the corresponding coordinates.
    # Rather than assuming the dimension names, we use Comodo conventions
    # to parse the dimension metadata.
    # http://pycomodo.forge.imag.fr/norm.html
    irange = np.arange(self.nx)
    jrange = np.arange(self.ny)
    if levels is None:
        krange = np.arange(self.nz)
        krange_p1 = np.arange(self.nz + 1)
    else:
        krange = levels
        krange_p1 = levels + [levels[-1] + 1]

    # the keys are `standard_name` attribute
    dimension_data = {
        "x_grid_index": irange,
        "x_grid_index_at_u_location": irange,
        "x_grid_index_at_f_location": irange,
        "y_grid_index": jrange,
        "y_grid_index_at_v_location": jrange,
        "y_grid_index_at_f_location": jrange,
        "z_grid_index": krange,
        "z_grid_index_at_lower_w_location": krange,
        "z_grid_index_at_upper_w_location": krange,
        "z_grid_index_at_w_location": krange_p1,
    }

    for dim in self._dimensions:
        dim_meta = dimensions[dim]
        dims = dim_meta['dims']
        attrs = dim_meta['attrs']
        data = dimension_data[attrs['standard_name']]
        dim_variable = xr.Variable(dims, data, attrs)
        self._variables[dim] = dim_variable

    # possibly add the llc dimension
    # seems sloppy to hard code this here
    # TODO: move this metadata to variables.py
    if has_faces:
        self._dimensions.append(FACE_DIMNAME)
        data = np.arange(self.nface)
        attrs = {'standard_name': 'face_index'}
        dims = [FACE_DIMNAME]
        self._variables[FACE_DIMNAME] = xr.Variable(dims, data, attrs)

    # do the same for layers
    for layer_name, n_layer in self.layers.items():
        for suffix, offset in zip(['bounds', 'center', 'interface'],
                                  [0, -1, -2]):
            # e.g. "layer_1RHO_bounds"
            # dimname = 'layer_' + layer_name + '_' + suffix
            # e.g. "l1_b"
            dimname = 'l' + layer_name[0] + '_' + suffix[0]
            self._dimensions.append(dimname)
            data = np.arange(n_layer + offset)
            # we should figure out a way to properly populate the layers
            # attributes
            attrs = {'standard_name':
                     layer_name + '_layer_grid_index_at_layer_' + suffix,
                     'swap_dim': 'layer_' + layer_name + '_' + suffix}
            dim_variable = xr.Variable([dimname], data, attrs)
            self._variables[dimname] = dim_variable

    # maybe add a time dimension
    if iternum is not None:
        self.time_dim_name = 'time'
        self._dimensions.append(self.time_dim_name)
        # a variable for iteration number
        self._variables['iter'] = xr.Variable(
            (self.time_dim_name, ),
            [iternum],
            {'standard_name': 'timestep',
             'long_name': 'model timestep number'})
        self._variables[self.time_dim_name] = _iternum_to_datetime_variable(
            iternum, delta_t, ref_date, calendar, self.time_dim_name)

    # build lookup tables for variable metadata
    self._all_grid_variables = _get_all_grid_variables(
        self.geometry, self.grid_dir, self.layers)
    self._all_data_variables = _get_all_data_variables(
        self.data_dir, self.grid_dir, self.layers, self.extra_variables)

    # The rest of the data has to be read from disk.
    # The list `prefixes` specifies file prefixes from which to infer
    # The problem with this is that some prefixes are single variables
    # while some are multi-variable diagnostics files.
    prefixes = []
    if read_grid:
        prefixes = prefixes + list(self._all_grid_variables.keys())

    # add data files
    prefixes = (prefixes +
                _get_all_matching_prefixes(data_dir, iternum,
                                           file_prefixes))

    for p in prefixes:
        # use a generator to loop through the variables in each file
        for (vname, dims, data, attrs) in \
                self.load_from_prefix(p, iternum, extra_metadata):
            # print(vname, dims, data.shape)
            # Sizes of grid variables can vary between mitgcm versions.
            # Check for such inconsistency and correct if so
            (vname, dims, data, attrs) = self.fix_inconsistent_variables(
                vname, dims, data, attrs)

            # Create masks from hFac variables
            data = self.calc_masks(vname, data)

            thisvar = xr.Variable(dims, data, attrs)
            self._variables[vname] = thisvar
def _concat_dicts(list_of_dicts):
    result = OrderedDict()
    for eachdict in list_of_dicts:
        for k, v in eachdict.items():
            result[k] = v
    return result
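# Minimal usage sketch (added for illustration, not part of the original
# module): later dictionaries overwrite earlier ones on key collisions, and
# insertion order is preserved.
def _example_concat_dicts():
    from collections import OrderedDict
    merged = _concat_dicts([OrderedDict([('a', 1), ('b', 2)]),
                            OrderedDict([('b', 3)])])
    assert list(merged.items()) == [('a', 1), ('b', 3)]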
dimensions = OrderedDict(
    # x direction
    i=dict(dims=['i'],
           attrs=dict(standard_name="x_grid_index", axis='X',
                      long_name="x-dimension of the t grid",
                      swap_dim='XC')),
    i_g=dict(dims=['i_g'],
             attrs=dict(standard_name="x_grid_index_at_u_location",
                        axis='X', long_name="x-dimension of the u grid",
                        c_grid_axis_shift=-0.5, swap_dim='XG')),
    # i_z = dict(dims=['i_z'], swap_dim='XG', attrs=dict(
    #     standard_name="x_grid_index_at_f_location", axis='X',
    #     long_name="x-dimension of the f grid", c_grid_axis_shift=-0.5)),
    # y direction
    j=dict(dims=['j'],
           attrs=dict(standard_name="y_grid_index", axis='Y',
                      long_name="y-dimension of the t grid",
                      swap_dim='YC')),
    j_g=dict(dims=['j_g'],
             attrs=dict(standard_name="y_grid_index_at_v_location",
                        axis='Y', long_name="y-dimension of the v grid",
                        c_grid_axis_shift=-0.5, swap_dim='YG')),
    # j_z = dict(dims=['j_z'], swap_dim='YG', attrs=dict(
    #     standard_name="y_grid_index_at_f_location", axis='Y',
    #     long_name="y-dimension of the f grid", c_grid_axis_shift=-0.5)),
    # z direction
    k=dict(dims=['k'],
           attrs=dict(standard_name="z_grid_index", axis="Z",
                      long_name="z-dimension of the t grid",
                      swap_dim='Z')),
    k_u=dict(dims=['k_u'],
             attrs=dict(standard_name="z_grid_index_at_lower_w_location",
                        axis="Z", long_name="z-dimension of the w grid",
                        c_grid_axis_shift=0.5, swap_dim='Zu')),
    k_l=dict(dims=['k_l'],
             attrs=dict(standard_name="z_grid_index_at_upper_w_location",
                        axis="Z", long_name="z-dimension of the w grid",
                        c_grid_axis_shift=-0.5, swap_dim='Zl')),
    # this is complicated because it is offset in both directions -
    # allowed by comodo?
    k_p1=dict(dims=['k_p1'],
              attrs=dict(standard_name="z_grid_index_at_w_location",
                         axis="Z", long_name="z-dimension of the w grid",
                         c_grid_axis_shift=(-0.5, 0.5), swap_dim='Zp1')))
def _make_data_array(data_tiles, iVar, jVar, kVar,
                     less_output=False, dim4=None, dim5=None):
    """Non user facing function to make a data array from tiled numpy/dask
    array and strings denoting grid location

    Note that here, I'm including the "tiles" dimension... so dim4 refers to
    index vector d_4, and dim5 refers to index d_5
    No user should have to deal with this though

    Parameters
    ----------
    data_tiles : numpy/dask array
        Probably loaded from binary via mds_io.read_bin_to_tiles and
        rearranged in llc_tiles_to_xda
    iVar : string
        denote x grid location, 'i' or 'i_g'
    jVar : string
        denote y grid location, 'j' or 'j_g'
    kVar : string
        denote z grid location, 'k' only implemented for now.
        possible to implement 'k_u' for e.g. vertical velocity ... at some
        point
    less_output : boolean, optional
        debugging flag, False => print more
    dim4 : string, optional
        Specify the fourth dimension, either 'depth' or 'time'
    dim5 : string, optional
        Specify the fifth dimension, either 'depth' or 'time'

    Returns
    -------
    da : xarray DataArray
    """

    # Save shape and num dimensions for below
    data_shape = data_tiles.shape
    Ndims = len(data_shape)

    # Create minimal coordinate information
    i = np.arange(data_shape[-1])
    j = np.arange(data_shape[-2])
    tiles = np.arange(data_shape[-3])
    d_4 = []
    d_5 = []
    if len(data_shape) > 3:
        if dim4 is None:
            raise TypeError(
                "Please specify 4th dimension as dim4='depth' or dim4='time'")
        d_4 = np.arange(data_shape[-4])
    if len(data_shape) > 4:
        if dim5 is None:
            raise TypeError(
                "Please specify 5th dimension as dim5='depth' or dim5='time'")
        d_5 = np.arange(data_shape[-5])

    # Determine how to handle fourth dimension
    if dim4 == 'depth':
        fourthDimIsDepth = True
    else:
        fourthDimIsDepth = False

    # We can't say much about tile or time dimension
    tile_attrs = OrderedDict([('standard_name', 'face_index')])
    time_attrs = OrderedDict([('standard_name', 'time'),
                              ('long_name', 'Time'), ('axis', 'T')])

    # Create dims tuple, which will at least have
    # e.g. ('tile','j','i') for a 'c' variable
    dims = ('tile', jVar, iVar)

    # Coordinates will be a dictionary of 1 dimensional xarray DataArrays
    # each with their own set of attributes
    coords = OrderedDict()

    if Ndims > 3:
        if fourthDimIsDepth:
            # Only add depth
            dims = (kVar, ) + dims
            k_da = xr.DataArray(data=d_4, coords={kVar: d_4}, dims=(kVar, ),
                                attrs=dimensions[kVar]['attrs'])
            coords[kVar] = k_da
        else:
            # Only add time
            dims = ('time', ) + dims
            time_da = xr.DataArray(data=d_4, coords={'time': d_4},
                                   dims=('time', ), attrs=time_attrs)
            coords['time'] = time_da

    if Ndims > 4:
        if fourthDimIsDepth:
            # Now add time
            dims = ('time', ) + dims
            time_da = xr.DataArray(data=d_5, coords={'time': d_5},
                                   dims=('time', ), attrs=time_attrs)
            coords['time'] = time_da
        else:
            # Now add depth
            dims = (kVar, ) + dims
            k_da = xr.DataArray(data=d_5, coords={kVar: d_5}, dims=(kVar, ),
                                attrs=dimensions[kVar]['attrs'])
            coords[kVar] = k_da

    # Now add the standard coordinates
    tile_da = xr.DataArray(data=tiles, coords={'tile': tiles},
                           dims=('tile', ), attrs=tile_attrs)
    j_da = xr.DataArray(data=j, coords={jVar: j}, dims=(jVar, ),
                        attrs=dimensions[jVar]['attrs'])
    i_da = xr.DataArray(data=i, coords={iVar: i}, dims=(iVar, ),
                        attrs=dimensions[iVar]['attrs'])

    coords['tile'] = tile_da
    coords[jVar] = j_da
    coords[iVar] = i_da

    return xr.DataArray(data=data_tiles, coords=coords, dims=dims)
from .common import broadcast_1d_array
from .util.gridspec import _get_model_info, prof_altitude

#: Hard-coded dimension variables to use with any Dataset read in
BASE_DIMENSIONS = OrderedDict(
    lon=dict(dims=['lon', ],
             attrs={'standard_name': 'longitude', 'axis': 'X'}),
    lat=dict(dims=['lat', ],
             attrs={'standard_name': 'latitude', 'axis': 'Y'}),
    time=dict(dims=['time', ], attrs={}),
    nv=dict(dims=['nv', ], attrs={}),
)

#: CF/COARDS recommended dimension order; non-spatiotemporal dimensions
#: should precede these.
DIM_ORDER_PRIORITY = ['time', 'lev', 'lat', 'lon']
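# Illustrative sketch (added; not from the original module) of how
# DIM_ORDER_PRIORITY can be used to put a variable's dimensions into the
# recommended CF/COARDS order. The DataArray below is a made-up example.
def _example_dim_reorder():
    import numpy as np
    import xarray as xr
    da = xr.DataArray(np.zeros((3, 2, 4)), dims=['lon', 'time', 'lat'])
    target_dims = [d for d in DIM_ORDER_PRIORITY if d in da.dims]
    return da.transpose(*target_dims)   # dims become ('time', 'lat', 'lon')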
def __init__(self, filename, fields=[], categories=[], fix_cf=False, mode='r', endian='>', diaginfo_file='', tracerinfo_file='', use_mmap=False, dask_delayed=False): # Track the metadata accompanying this dataset. dir_path = os.path.abspath(os.path.dirname(filename)) if not dir_path: dir_path = os.getcwd() if not tracerinfo_file: tracerinfo_file = os.path.join(dir_path, 'tracerinfo.dat') if not os.path.exists(tracerinfo_file): tracerinfo_file = '' self.tracerinfo_file = tracerinfo_file if not diaginfo_file: diaginfo_file = os.path.join(dir_path, 'diaginfo.dat') if not os.path.exists(diaginfo_file): diaginfo_file = '' self.diaginfo_file = diaginfo_file self.filename = filename self.fsize = os.path.getsize(self.filename) self.mode = mode if not mode.startswith('r'): raise ValueError( "Currently only know how to 'r(b)'ead bpch files.") # Check endianness flag if endian not in ['>', '<', '=']: raise ValueError("Invalid byte order (endian={})".format(endian)) self.endian = endian # Open the raw output file, but don't yet read all the data self._mmap = use_mmap self._dask = dask_delayed self._bpch = BPCHFile(self.filename, self.mode, self.endian, tracerinfo_file=tracerinfo_file, diaginfo_file=diaginfo_file, eager=False, use_mmap=self._mmap, dask_delayed=self._dask) self.fields = fields self.categories = categories # Peek into the raw output file and read the header and metadata # so that we can get a head start at building the output grid self._bpch._read_metadata() self._bpch._read_header() # Parse the binary file and prepare to add variables to the DataStore self._bpch._read_var_data() # Create storage dicts for variables and attributes, to be used later # when xarray needs to access the data self._variables = OrderedDict() self._attributes = OrderedDict() self._attributes.update(self._bpch._attributes) self._dimensions = [d for d in BASE_DIMENSIONS] # Begin constructing the coordinate dimensions shared by the # output dataset variables dim_coords = {} self.ctm_info = CTMGrid.from_model(self._attributes['modelname'], resolution=self._attributes['res']) # Add vertical dimensions self._dimensions.append(dict(dims=[ 'lev', ], attrs={'axis': 'Z'})) self._dimensions.append(dict(dims=[ 'lev_trop', ], attrs={'axis': 'Z'})) self._dimensions.append(dict(dims=[ 'lev_edge', ], attrs={'axis': 'Z'})) eta_centers = self.ctm_info.eta_centers sigma_centers = self.ctm_info.sigma_centers # Add time dimensions self._dimensions.append( dict(dims=[ 'time', ], attrs={ 'axis': 'T', 'long_name': 'time', 'standard_name': 'time' })) # Add lat/lon dimensions self._dimensions.append( dict(dims=[ 'lon', ], attrs={ 'axis': 'X', 'long_name': 'longitude coordinate', 'standard_name': 'longitude' })) self._dimensions.append( dict(dims=[ 'lat', ], attrs={ 'axis': 'y', 'long_name': 'latitude coordinate', 'standard_name': 'latitude' })) if eta_centers is not None: lev_vals = eta_centers lev_attrs = { 'standard_name': 'atmosphere_hybrid_sigma_pressure_coordinate', 'axis': 'Z' } else: lev_vals = sigma_centers lev_attrs = { 'standard_name': 'atmosphere_hybrid_sigma_pressure_coordinate', 'axis': 'Z' } self._variables['lev'] = xr.Variable([ 'lev', ], lev_vals, lev_attrs) ## Latitude / Longitude # TODO: Add lon/lat bounds # Detect if we're on a nested grid; in that case, we'll have a displaced # origin set in the variable attributes we previously read ref_key = list(self._bpch.var_attrs.keys())[0] ref_attrs = self._bpch.var_attrs[ref_key] self.is_nested = (ref_attrs['origin'] != (1, 1, 1)) lon_centers = self.ctm_info.lon_centers 
lat_centers = self.ctm_info.lat_centers if self.is_nested: ix, iy, _ = ref_attrs['origin'] nx, ny, *_ = ref_attrs['original_shape'] # Correct i{x,y} for IDL->Python indexing (1-indexed -> 0-indexed) ix -= 1 iy -= 1 lon_centers = lon_centers[ix:ix + nx] lat_centers = lat_centers[iy:iy + ny] self._variables['lon'] = xr.Variable(['lon'], lon_centers, { 'long_name': 'longitude', 'units': 'degrees_east' }) self._variables['lat'] = xr.Variable(['lat'], lat_centers, { 'long_name': 'latitude', 'units': 'degrees_north' }) # TODO: Fix longitudes if ctm_grid.center180 # Add variables from the parsed BPCH file to our DataStore for vname in list(self._bpch.var_data.keys()): var_data = self._bpch.var_data[vname] var_attr = self._bpch.var_attrs[vname] if fields and (var_attr['name'] not in fields): continue if categories and (var_attr['category'] not in categories): continue # Process dimensions dims = [ 'time', 'lon', 'lat', ] dshape = var_attr['original_shape'] if len(dshape) == 3: # Process the vertical coordinate. A few things can happen here: # 1) We have cell-centered values on the "Nlayer" grid; we can take these variables and map them to 'lev' # 2) We have edge value on an "Nlayer" + 1 grid; we can take these and use them with 'lev_edge' # 3) We have troposphere values on "Ntrop"; we can take these and use them with 'lev_trop', but we won't have coordinate information yet # All other cases we do not handle yet; this includes the aircraft emissions and a few other things. Note that tracer sources do not have a vertical coord to worry about! nlev = dshape[-1] grid_nlev = self.ctm_info.Nlayers grid_ntrop = self.ctm_info.Ntrop try: if nlev == grid_nlev: dims.append('lev') elif nlev == grid_nlev + 1: dims.append('lev_edge') elif nlev == grid_ntrop: dims.append('lev_trop') else: continue except AttributeError: warnings.warn("Couldn't resolve grid_spec vertical layout") continue # xarray Variables are thin wrappers for numpy.ndarrays, or really # any object that extends the ndarray interface. A critical part of # the original ndarray interface is that the underlying data has to # be contiguous in memory. We can enforce this to happen by # concatenating each bundle in the variable data bundles we read # from the bpch file data = self._concat([v.data for v in var_data]) # Is the variable time-invariant? If it is, kill the time dim. # Here, we mean it only as one sample in the dataset. if data.shape[0] == 1: dims = dims[1:] data = data.squeeze() # Create a variable containing this data var = xr.Variable(dims, data, var_attr) # Shuffle dims for CF/COARDS compliance if requested # TODO: For this to work, we have to force a load of the data. # Is there a way to re-write BPCHDataProxy so that that's not # necessary? # Actually, we can't even force a load becase var.data is a # numpy.ndarray. Weird. # if fix_dims: # target_dims = [d for d in DIM_ORDER_PRIORITY if d in dims] # var = var.transpose(*target_dims) self._variables[vname] = var # Try to add a time dimension # TODO: Time units? if (len(var_data) > 1) and 'time' not in self._variables: time_bnds = np.asarray([v.time for v in var_data]) times = time_bnds[:, 0] self._variables['time'] = xr.Variable( [ 'time', ], times, { 'bounds': 'time_bnds', 'units': cf.CTM_TIME_UNIT_STR }) self._variables['time_bnds'] = xr.Variable( ['time', 'nv'], time_bnds, {'units': cf.CTM_TIME_UNIT_STR}) self._variables['nv'] = xr.Variable([ 'nv', ], [0, 1])
class BPCHFile(object): """ A file object for representing BPCH data on disk Attributes ---------- fp : FortranFile A pointer to the open unformatted Fortran binary output (the original bpch file) var_data, var_attrs : dict Containers of `BPCHDataBundle`s and dicts, respectively, holding the accessor functions to the raw bpch data and their associated metadata """ def __init__(self, filename, mode='rb', endian='>', diaginfo_file='', tracerinfo_file='', eager=False, use_mmap=False, dask_delayed=False): """ Load a BPCHFile Parameters ---------- filename : str Path to the bpch file on disk mode : str Mode string to pass to the file opener; this is currently fixed to "rb" and all other values will be rejected endian : str {">", "<", ":"} Endian-ness of the Fortran output file {tracerinfo, diaginfo}_file : str Path to the tracerinfo.dat and diaginfo.dat files containing metadata pertaining to the output in the bpch file being read. eager : bool Flag to immediately read variable data; if "False", then nothing will be read from the file and you'll need to do so manually use_mmap : bool Use memory-mapping to read data from file dask_delayed : bool Use dask to create delayed references to the data-reading functions """ self.mode = mode if not mode.startswith('r'): raise ValueError("Currently only know how to 'r(b)'ead bpch files.") self.filename = filename self.fsize = os.path.getsize(self.filename) self.endian = endian # Open a pointer to the file self.fp = FortranFile(self.filename, self.mode, self.endian) dir_path = os.path.abspath(os.path.dirname(filename)) if not dir_path: dir_path = os.getcwd() if not tracerinfo_file: tracerinfo_file = os.path.join(dir_path, "tracerinfo.dat") if not os.path.exists(tracerinfo_file): tracerinfo_file = '' self.tracerinfo_file = tracerinfo_file if not diaginfo_file: diaginfo_file = os.path.join(dir_path, "diaginfo.dat") if not os.path.exists(diaginfo_file): diaginfo_file = '' self.diaginfo_file = diaginfo_file # Container to record file metadata self._attributes = OrderedDict() # Don't necessarily need to save diag/tracer_dict yet self.diaginfo_df, _ = get_diaginfo(self.diaginfo_file) self.tracerinfo_df, _ = get_tracerinfo(self.tracerinfo_file) # Container for bundles contained in the output file. self.var_data = {} self.var_attrs = {} # Critical information for accessing file contents self._header_pos = None # Data loading strategy self.use_mmap = use_mmap self.dask_delayed = dask_delayed # Control eager versus deferring reading self.eager = eager if (mode.startswith('r') and self.eager): self._read() def close(self): """ Close this bpch file. """ if not self.fp.closed: for v in list(self.var_data): del self.var_data[v] self.fp.close() def __enter__(self): return self def __exit__(self, type, value, traceback): self.close() def _read(self): """ Parse the entire bpch file on disk and set up easy access to meta- and data blocks. """ self._read_metadata() self._read_header() self._read_var_data() def _read_metadata(self): """ Read the main metadata packaged within a bpch file, indicating the output filetype and its title. """ filetype = self.fp.readline().strip() filetitle = self.fp.readline().strip() # Decode to UTF string, if possible try: filetype = str(filetype, 'utf-8') filetitle = str(filetitle, 'utf-8') except: # TODO: Handle this edge-case of converting file metadata more elegantly. 
pass self.__setattr__('filetype', filetype) self.__setattr__('filetitle', filetitle) def _read_header(self): """ Process the header information (data model / grid spec) """ self._header_pos = self.fp.tell() line = self.fp.readline('20sffii') modelname, res0, res1, halfpolar, center180 = line self._attributes.update({ "modelname": str(modelname, 'utf-8').strip(), "halfpolar": halfpolar, "center180": center180, "res": (res0, res1) }) self.__setattr__('modelname', modelname) self.__setattr__('res', (res0, res1)) self.__setattr__('halfpolar', halfpolar) self.__setattr__('center180', center180) # Re-wind the file self.fp.seek(self._header_pos) def _read_var_data(self): """ Iterate over the block of this bpch file and return handlers in the form of `BPCHDataBundle`s for access to the data contained therein. """ var_bundles = OrderedDict() var_attrs = OrderedDict() n_vars = 0 while self.fp.tell() < self.fsize: var_attr = OrderedDict() # read first and second header lines line = self.fp.readline('20sffii') modelname, res0, res1, halfpolar, center180 = line line = self.fp.readline('40si40sdd40s7i') category_name, number, unit, tau0, tau1, reserved = line[:6] dim0, dim1, dim2, dim3, dim4, dim5, skip = line[6:] var_attr['number'] = number # Decode byte-strings to utf-8 category_name = str(category_name, 'utf-8') var_attr['category'] = category_name.strip() unit = str(unit, 'utf-8') # get additional metadata from tracerinfo / diaginfo try: cat_df = self.diaginfo_df[ self.diaginfo_df.name == category_name.strip() ] # TODO: Safer logic for handling case where more than one # tracer metadata match was made # if len(cat_df > 1): # raise ValueError( # "More than one category matching {} found in " # "diaginfo.dat".format( # category_name.strip() # ) # ) # Safe now to select the only row in the DataFrame cat = cat_df.T.squeeze() tracer_num = int(cat.offset) + int(number) diag_df = self.tracerinfo_df[ self.tracerinfo_df.tracer == tracer_num ] # TODO: Safer logic for handling case where more than one # tracer metadata match was made # if len(diag_df > 1): # raise ValueError( # "More than one tracer matching {:d} found in " # "tracerinfo.dat".format(tracer_num) # ) # Safe now to select only row in the DataFrame diag = diag_df.T.squeeze() diag_attr = diag.to_dict() if not unit.strip(): # unit may be empty in bpch unit = diag_attr['unit'] # but not in tracerinfo var_attr.update(diag_attr) except: diag = {'name': '', 'scale': 1} var_attr.update(diag) var_attr['unit'] = unit vname = diag['name'] fullname = category_name.strip() + "_" + vname # parse metadata, get data or set a data proxy if dim2 == 1: data_shape = (dim0, dim1) # 2D field else: data_shape = (dim0, dim1, dim2) var_attr['original_shape'] = data_shape # Add proxy time dimension to shape data_shape = tuple([1, ] + list(data_shape)) origin = (dim3, dim4, dim5) var_attr['origin'] = origin timelo, timehi = cf.tau2time(tau0), cf.tau2time(tau1) pos = self.fp.tell() # Note that we don't pass a dtype, and assume everything is # single-fp floats with the correct endian, as hard-coded var_bundle = BPCHDataBundle( data_shape, self.endian, self.filename, pos, [timelo, timehi], metadata=var_attr, use_mmap=self.use_mmap, dask_delayed=self.dask_delayed ) self.fp.skipline() # Save the data as a "bundle" for concatenating in the final step if fullname in var_bundles: var_bundles[fullname].append(var_bundle) else: var_bundles[fullname] = [var_bundle, ] var_attrs[fullname] = var_attr n_vars += 1 self.var_data = var_bundles self.var_attrs = var_attrs
def test_stack_unstack_consistency(self):
    v = Variable(['x', 'y'], [[0, 1], [2, 3]])
    actual = (v.stack(z=('x', 'y'))
               .unstack(z=OrderedDict([('x', 2), ('y', 2)])))
    self.assertVariableIdentical(actual, v)
def _dataset_multi_concat(datasets, dim, data_vars, coords, compat, positions,
                          join="outer"):
    """
    Concatenate a sequence of datasets along a dimension, trying concatenation
    along alternate dimensions when the chosen dimension is not present.

    This function is based on _dataset_concat from xarray.core.concat.py in
    xarray 0.15. It includes a modification to drop mismatched coordinates
    from datasets instead of throwing a ValueError. This drop removes the
    variable from coordinates, but it remains a variable in the dataset.
    """
    # Make sure we're working on a copy (we'll be loading variables)
    datasets = [ds.copy() for ds in datasets]

    # determine what dimensions we will be concatenating over, including the
    # preferred dim and any alternatives when the preferred dim is absent
    dims = _find_concat_dims(datasets, dim)
    dims, coordinates = _calc_concat_dims_coords(dims)

    datasets = align(*datasets, join=join, copy=False, exclude=dims)

    dim_coords, dims_sizes, coord_names, data_names = _parse_datasets(datasets)
    dim_names = set(dim_coords)
    unlabeled_dims = dim_names - coord_names

    both_data_and_coords = coord_names & data_names
    if both_data_and_coords:
        # Instead of throwing a ValueError, make the coordinates match by
        # removing the mismatched coordinate
        for ds in datasets:
            for variable in both_data_and_coords:
                if variable in ds.coords:
                    # This makes the variable no longer a coordinate, but does
                    # not remove it from the dataset entirely
                    ds._coord_names.remove(variable)
                    coord_names.discard(variable)

    # we don't want the concat dimensions in the result dataset yet
    for dim in dims:
        dim_coords.pop(dim, None)
        dims_sizes.pop(dim, None)

    # case where concat dimension is a coordinate or data_var but not a
    # dimension
    if (dim in coord_names or dim in data_names) and dim not in dim_names:
        datasets = [ds.expand_dims(dim) for ds in datasets]

    # determine which variables to concatenate
    concat_over, equals, concat_dim_lengths = _calc_concat_over(
        datasets, dims, dim_names, data_vars, coords, compat)

    # determine which variables to merge, and then merge them according to
    # compat
    variables_to_merge = (coord_names | data_names) - concat_over - dim_names

    result_vars = {}
    if variables_to_merge:
        to_merge = {var: [] for var in variables_to_merge}

        for ds in datasets:
            for var in variables_to_merge:
                if var in ds:
                    to_merge[var].append(ds.variables[var])

        for var in variables_to_merge:
            result_vars[var] = unique_variable(var, to_merge[var],
                                               compat=compat,
                                               equals=equals.get(var, None))
    else:
        result_vars = {}
    result_vars.update(dim_coords)

    # assign attrs and encoding from first dataset
    result_attrs = datasets[0].attrs
    result_encoding = datasets[0].encoding

    # check that global attributes are fixed across all datasets if necessary
    for ds in datasets[1:]:
        if compat == "identical" and not utils.dict_equiv(ds.attrs,
                                                          result_attrs):
            raise ValueError("Dataset global attributes not equal.")

    # we've already verified everything is consistent; now, calculate
    # shared dimension sizes so we can expand the necessary variables
    def ensure_common_dims(vars):
        # ensure each variable with the given name shares the same
        # dimensions and the same shape for all of them except along the
        # concat dimension
        common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))
        # find the first concat dimension available in vars
        concat_dim = [x for x in dims if x in common_dims][0]
        if not concat_dim:
            # none of the concat dims are present - add the first one
            dim = dims[0]
            common_dims = (dim, ) + common_dims
            concat_dim = dim
        for var, dim_len in zip(vars, concat_dim_lengths[concat_dim]):
            if var.dims != common_dims:
                common_shape = tuple(dims_sizes.get(d, dim_len)
                                     for d in common_dims)
                var = var.expand_dims(common_dims, common_shape)
            yield var

    # stack up each variable to fill-out the dataset (in order)
    # n.b. this loop preserves variable order, needed for groupby.
    for k in datasets[0].variables:
        if k in concat_over:
            try:
                vars = ensure_common_dims([ds.variables[k] for ds in datasets])
            except KeyError:
                raise ValueError("%r is not present in all datasets." % k)
            # get the dimension to concatenate this variable on - choose
            # first applicable dim from dims
            dim = _get_concat_dim(dims, [ds.variables[k] for ds in datasets])
            combined = concat_vars(vars, dim, positions)
            assert isinstance(combined, Variable)
            result_vars[k] = combined

    result = Dataset(result_vars, attrs=result_attrs)
    absent_coord_names = coord_names - set(result.variables)
    if absent_coord_names:
        raise ValueError(
            "Variables %r are coordinates in some datasets but not others."
            % absent_coord_names)

    # current versions of dataset.set_coords and dataset.drop force a
    # _assert_all_in_dataset check that we don't want
    # xarray 0.15 has the option to disable this via errors='ignore', but for
    # now just call the underlying logic
    # result = result.set_coords(coord_names, errors='ignore')
    result._coord_names.update(coord_names)
    result.encoding = result_encoding

    # result = result.drop(unlabeled_dims, errors='ignore')
    drop = set(unlabeled_dims)
    variables = OrderedDict((k, v) for k, v in iteritems(result._variables)
                            if k not in drop)
    coord_names = set(k for k in result._coord_names if k in variables)
    result._replace_vars_and_dims(variables, coord_names)

    for coord in coordinates:
        if coord:
            # add concat dimension last to ensure that it's in the final
            # Dataset
            result[coord.name] = coord

    return result
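# Hypothetical usage sketch (added for illustration). The argument values
# mirror xarray.concat's defaults and are assumptions, not taken from this
# module; the helper is assumed to be importable alongside its private
# dependencies above.
def _example_multi_concat():
    import xarray as xr
    ds_a = xr.Dataset({'v': ('x', [1, 2])})
    ds_b = xr.Dataset({'v': ('x', [3, 4, 5])})
    return _dataset_multi_concat([ds_a, ds_b], dim='x',
                                 data_vars='all', coords='different',
                                 compat='equals', positions=None)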
def open_bpchdataset(filename, fields=[], categories=[],
                     tracerinfo_file='tracerinfo.dat',
                     diaginfo_file='diaginfo.dat',
                     endian=">", decode_cf=True, memmap=True, dask=True,
                     return_store=False):
    """ Open a GEOS-Chem BPCH file output as an xarray Dataset.

    Parameters
    ----------
    filename : string
        Path to the output file to read in.
    {tracerinfo,diaginfo}_file : string, optional
        Path to the metadata "info" .dat files which are used to decipher
        the metadata corresponding to each variable in the output dataset.
        If not provided, will look for them in the current directory or
        fall back on a generic set.
    fields : list, optional
        List of a subset of variable names to return. This can substantially
        improve read performance. Note that the field here is just the tracer
        name - not the category, e.g. 'O3' instead of 'IJ-AVG-$_O3'.
    categories : list, optional
        List a subset of variable categories to look through. This can
        substantially improve read performance.
    endian : {'=', '>', '<'}, optional
        Endianness of file on disk. By default, "big endian" (">") is assumed.
    decode_cf : bool
        Enforce CF conventions for variable names, units, and other metadata
    default_dtype : numpy.dtype, optional
        Default datatype for variables encoded in file on disk
        (single-precision float by default).
    memmap : bool
        Flag indicating that data should be memory-mapped from disk instead
        of eagerly loaded into memory
    dask : bool
        Flag indicating that data reading should be deferred (delayed) to
        construct a task-graph for later execution
    return_store : bool
        Also return the underlying DataStore to the user

    Returns
    -------
    ds : xarray.Dataset
        Dataset containing the requested fields (or the entire file), with
        data contained in proxy containers for access later.
    store : xarray.AbstractDataStore
        Underlying DataStore which handles the loading and processing of
        bpch files on disk
    """
    store = BPCHDataStore(filename, fields=fields, categories=categories,
                          tracerinfo_file=tracerinfo_file,
                          diaginfo_file=diaginfo_file, endian=endian,
                          use_mmap=memmap, dask_delayed=dask)

    ds = xr.Dataset.load_store(store)

    # Record what the file object underlying the store which we culled this
    # Dataset from is so that we can clean it up later
    ds._file_obj = store._bpch

    # Handle CF corrections
    if decode_cf:
        decoded_vars = OrderedDict()
        rename_dict = {}
        for v in ds:
            cf_name = cf.get_valid_varname(v)
            rename_dict[v] = cf_name
            new_var = cf.enforce_cf_variable(ds[v])
            decoded_vars[cf_name] = new_var
        ds = xr.Dataset(decoded_vars, attrs=ds.attrs.copy())
        # ds.rename(rename_dict, inplace=True)

        # TODO: There's a bug with xr.decode_cf which eagerly loads data.
        #       Re-enable this once that bug is fixed
        # Note that we do not need to decode the times because we explicitly
        # kept track of them as we parsed the data.
        # ds = xr.decode_cf(ds, decode_times=False)

    # Set attributes for CF conventions
    ts = get_timestamp()
    ds.attrs.update(dict(
        Conventions='CF1.6',
        source=filename,
        tracerinfo=tracerinfo_file,
        diaginfo=diaginfo_file,
        filetype=store._bpch.filetype,
        filetitle=store._bpch.filetitle,
        history=("{}: Processed/loaded by xbpch-{} from {}".format(
            ts, ver, filename)),
    ))

    # To immediately load the data from the BPCHDataProxy payloads, need
    # to execute ds.data_vars for some reason...
    if return_store:
        return ds, store
    else:
        return ds
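# Hypothetical usage sketch (added for illustration). The output filename is a
# placeholder; the field and category values are the examples given in the
# docstring above, and the .dat files are assumed to sit next to the output.
def _example_open_bpch():
    ds = open_bpchdataset("ND49_output.bpch",
                          fields=["O3"], categories=["IJ-AVG-$"],
                          tracerinfo_file="tracerinfo.dat",
                          diaginfo_file="diaginfo.dat")
    print(ds.data_vars)   # inspect which tracers were decoded
    return ds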