Example #1
    def test_dict_equiv(self):
        x = OrderedDict()
        x['a'] = 3
        x['b'] = np.array([1, 2, 3])
        y = OrderedDict()
        y['b'] = np.array([1.0, 2.0, 3.0])
        y['a'] = 3
        self.assertTrue(utils.dict_equiv(x, y))  # two nparrays are equal
        y['b'] = [1, 2, 3]  # np.array not the same as a list
        self.assertTrue(utils.dict_equiv(x, y))  # nparray == list
        x['b'] = [1.0, 2.0, 3.0]
        self.assertTrue(utils.dict_equiv(x, y))  # list vs. list
        x['c'] = None
        self.assertFalse(utils.dict_equiv(x, y))  # new key in x
        x['c'] = np.nan
        y['c'] = np.nan
        self.assertTrue(utils.dict_equiv(x, y))  # as intended, nan is nan
        x['c'] = np.inf
        y['c'] = np.inf
        self.assertTrue(utils.dict_equiv(x, y))  # inf == inf
        y = dict(y)
        self.assertTrue(utils.dict_equiv(x, y))  # different dictionary types are fine
        y['b'] = 3 * np.arange(3)
        self.assertFalse(utils.dict_equiv(x, y))  # not equal when arrays differ
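For context, here is a minimal sketch of a dict_equiv-style helper consistent with the assertions above. It is only an illustration of the behaviour the test exercises, not xarray's actual utils.dict_equiv:

import numpy as np

def dict_equiv_sketch(first, second):
    """Illustrative only: keys must match and values must compare equal,
    treating array-likes elementwise and NaN as equivalent to NaN."""
    def equivalent(a, b):
        if isinstance(a, (np.ndarray, list)) or isinstance(b, (np.ndarray, list)):
            a, b = np.asarray(a), np.asarray(b)
            return a.shape == b.shape and bool(np.all((a == b) | ((a != a) & (b != b))))
        return a == b or (a != a and b != b)  # second clause: NaN equals NaN
    return set(first) == set(second) and all(
        equivalent(first[k], second[k]) for k in first)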
Example #2
    def test_auto_combine(self):
        objs = [Dataset({'x': [0]}), Dataset({'x': [1]})]
        actual = auto_combine(objs)
        expected = Dataset({'x': [0, 1]})
        self.assertDatasetIdentical(expected, actual)

        actual = auto_combine([actual])
        self.assertDatasetIdentical(expected, actual)

        objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})]
        actual = auto_combine(objs)
        expected = Dataset({'x': [0, 1, 2]})
        self.assertDatasetIdentical(expected, actual)

        # ensure auto_combine handles non-sorted variables
        objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])),
                Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))]
        actual = auto_combine(objs)
        expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])})
        self.assertDatasetIdentical(expected, actual)

        objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})]
        with self.assertRaisesRegexp(ValueError, 'too many .* dimensions'):
            auto_combine(objs)

        objs = [Dataset({'x': 0}), Dataset({'x': 1})]
        with self.assertRaisesRegexp(ValueError, 'cannot infer dimension'):
            auto_combine(objs)

        objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})]
        with self.assertRaises(KeyError):
            auto_combine(objs)
Example #3
    def test_unstack(self):
        v = Variable('z', [0, 1, 2, 3], {'foo': 'bar'})
        actual = v.unstack(z=OrderedDict([('x', 2), ('y', 2)]))
        expected = Variable(('x', 'y'), [[0, 1], [2, 3]], v.attrs)
        self.assertVariableIdentical(actual, expected)

        actual = v.unstack(z=OrderedDict([('x', 4), ('y', 1)]))
        expected = Variable(('x', 'y'), [[0], [1], [2], [3]], v.attrs)
        self.assertVariableIdentical(actual, expected)

        actual = v.unstack(z=OrderedDict([('x', 4)]))
        expected = Variable('x', [0, 1, 2, 3], v.attrs)
        self.assertVariableIdentical(actual, expected)
Example #4
def _initialize_rho_trsp_dataset(cds, rho, lat_vals=None):
    """Create an xarray Dataset with time, depth, and latitude dims

    Parameters
    ----------
    cds : xarray Dataset
        Must contain the coordinates 'k' and (optionally) 'time'
    rho : xarray DataArray
        Containing the density field to be binned and made into our new vertical coordinate
    lat_vals : int or array of ints, optional
        latitude value(s) rounded to the nearest degree
        specifying where to compute transport

    Returns
    -------
    ds : xarray Dataset
        zero-valued Dataset with time, depth, and latitude dimensions
    """

    # Create density bins
    rho_bin_edges, rho_bin_centers = get_rho_bins(rho.min().values,
                                                  rho.max().values,
                                                  len(cds['k']))
    Nrho = len(rho_bin_centers)
    k_rho = np.arange(Nrho)
    k_rho_f = np.arange(len(rho_bin_edges))

    coords = OrderedDict()
    dims = ()

    if 'time' in cds:
        coords.update({'time': cds['time'].values})
        dims += ('time', )
        if lat_vals is not None:
            zeros = np.zeros((len(cds['time'].values), Nrho, len(lat_vals)))
        else:
            zeros = np.zeros((len(cds['time'].values), Nrho))
    else:
        if lat_vals is not None:
            zeros = np.zeros((Nrho, len(lat_vals)))
        else:
            zeros = np.zeros((Nrho))

    coords.update({'k_rho': k_rho})
    dims += ('k_rho', )
    if lat_vals is not None:
        coords.update({'lat': lat_vals})
        dims += ('lat', )

    da = xr.DataArray(data=zeros, coords=coords, dims=dims)

    # This could be much cleaner, and should mirror the
    # xgcm notation.
    ds = da.to_dataset(name='trsp')
    ds['rho_c'] = rho_bin_centers
    ds['rho_f'] = rho_bin_edges
    ds['k_rho_f'] = k_rho_f

    return ds
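The core pattern here, growing an OrderedDict of coordinates in step with a dims tuple and handing both to xr.DataArray, can be shown in isolation. The sizes below are made up and not tied to get_rho_bins:

import numpy as np
import xarray as xr
from collections import OrderedDict

coords = OrderedDict()
dims = ()
coords['time'] = np.arange(3)
dims += ('time', )
coords['k_rho'] = np.arange(5)
dims += ('k_rho', )
coords['lat'] = [26, 30]
dims += ('lat', )
da = xr.DataArray(data=np.zeros((3, 5, 2)), coords=coords, dims=dims)
# da.dims == ('time', 'k_rho', 'lat'), filled with zeros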
Example #5
def _get_all_data_variables(data_dir, layers):
    """"Put all the relevant data metadata into one big dictionary."""
    allvars = [state_variables]
    allvars.append(package_state_variables)
    # add others from available_diagnostics.log
    fname = os.path.join(data_dir, 'available_diagnostics.log')
    if os.path.exists(fname):
        diag_file = fname
    else:
        warnings.warn("Couldn't find available_diagnostics.log "
                      "in %s. Using default version." % data_dir)
        from .default_diagnostics import diagnostics
        diag_file = StringIO(diagnostics)
    available_diags = parse_available_diagnostics(diag_file, layers)
    allvars.append(available_diags)
    metadata = _concat_dicts(allvars)

    # Now add the suffix '-T' to every diagnostic. This is a somewhat hacky
    # way to increase the coverage of possible output filenames.
    # But it doesn't work in python3!!!
    extra_metadata = OrderedDict()
    for name, val in metadata.items():
        newname = name + '-T'
        extra_metadata[newname] = val
    metadata = _concat_dicts([metadata, extra_metadata])

    # now fill in aliases
    for alias, original in aliases.items():
        metadata[alias] = metadata[original]

    return metadata
Example #6
    def test_to_dask_dataframe(self):
        # Test conversion of Datasets to dask DataFrames
        x = da.from_array(np.random.randn(10), chunks=4)
        y = np.arange(10, dtype='uint8')
        t = list('abcdefghij')

        ds = Dataset(
            OrderedDict([('a', ('t', x)), ('b', ('t', y)), ('t', ('t', t))]))

        expected_pd = pd.DataFrame({
            'a': x,
            'b': y
        },
                                   index=pd.Index(t, name='t'))

        # test if 1-D index is correctly set up
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        # test if we have dask dataframes
        self.assertIsInstance(actual, dd.DataFrame)

        # use pandas' assert_frame_equal to check the dataframes are equivalent
        assert_frame_equal(expected.compute(), actual.compute())

        # test if no index is given
        expected = dd.from_pandas(expected_pd.reset_index(drop=False),
                                  chunksize=4)

        actual = ds.to_dask_dataframe(set_index=False)

        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected.compute(), actual.compute())
Example #7
    def test_to_dask_dataframe_coordinates(self):
        # Test if coordinate is also a dask array
        x = da.from_array(np.random.randn(10), chunks=4)
        t = da.from_array(np.arange(10) * 2, chunks=4)

        ds = Dataset(OrderedDict([('a', ('t', x)), ('t', ('t', t))]))

        expected_pd = pd.DataFrame({'a': x}, index=pd.Index(t, name='t'))
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected.compute(), actual.compute())
Example #8
    def test_reduce_keep_attrs(self):
        _attrs = {'units': 'test', 'long_name': 'testing'}

        v = Variable(['x', 'y'], self.d, _attrs)

        # Test dropped attrs
        vm = v.mean()
        self.assertEqual(len(vm.attrs), 0)
        self.assertEqual(vm.attrs, OrderedDict())

        # Test kept attrs
        vm = v.mean(keep_attrs=True)
        self.assertEqual(len(vm.attrs), len(_attrs))
        self.assertEqual(vm.attrs, _attrs)
Example #9
    def test_to_dask_dataframe_not_daskarray(self):
        # Test if DataArray is not a dask array
        x = np.random.randn(10)
        y = np.arange(10, dtype='uint8')
        t = list('abcdefghij')

        ds = Dataset(
            OrderedDict([('a', ('t', x)), ('b', ('t', y)), ('t', ('t', t))]))

        expected = pd.DataFrame({'a': x, 'b': y}, index=pd.Index(t, name='t'))

        actual = ds.to_dask_dataframe(set_index=True)
        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())
Example #10
def _initialize_trsp_data_array(cds, lat_vals):
    """Create an xarray DataArray with time, depth, and latitude dims

    Parameters
    ----------
    cds : xarray Dataset
        contains LLC coordinates 'k' and (optionally) 'time'
    lat_vals : int or array of ints
        latitude value(s) rounded to the nearest degree
        specifying where to compute transport

    Returns
    -------
    ds_out : xarray Dataset
        Dataset with the variables
            'trsp_z'
                zero-valued DataArray with time (optional), 
                depth, and latitude dimensions
            'Z'
                the original depth coordinate
    """

    coords = OrderedDict()
    dims = ()

    if 'time' in cds:
        coords.update({'time': cds['time'].values})
        dims += ('time', )
        zeros = np.zeros(
            (len(cds['time'].values), len(cds['k'].values), len(lat_vals)))
    else:
        zeros = np.zeros((len(cds['k'].values), len(lat_vals)))

    coords.update({'k': cds['k'].values})
    coords.update({'lat': lat_vals})

    dims += ('k', 'lat')

    xda = xr.DataArray(data=zeros, coords=coords, dims=dims)

    # Convert to dataset to add Z coordinate
    xds = xda.to_dataset(name='trsp_z')
    xds['Z'] = cds['Z']
    xds = xds.set_coords('Z')

    return xds
Example #11
def _make_layers_variables(layer_name):
    """Translate metadata template to actual variable metadata."""
    from .variables import layers_grid_variables
    lvars = OrderedDict()
    layer_num = layer_name[0]
    # should always be int
    assert isinstance(int(layer_num), int)
    layer_id = 'l' + layer_num
    for key, vals in layers_grid_variables.items():
        # replace the name template with the actual name
        # e.g. layer_NAME_bounds -> layer_1RHO_bounds
        varname = key.replace('NAME', layer_name)
        metadata = _recursively_replace(vals, 'NAME', layer_name)
        # now fix dimension
        metadata['dims'] = [metadata['dims'][0].replace('l', layer_id)]
        lvars[varname] = metadata
    return lvars
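For reference, a minimal sketch of the kind of recursive replacement a helper like _recursively_replace performs; this is illustrative only and the real implementation may differ:

def recursively_replace_sketch(obj, old, new):
    # Replace `old` with `new` inside strings nested in dicts, lists, and tuples.
    if isinstance(obj, str):
        return obj.replace(old, new)
    if isinstance(obj, dict):
        return type(obj)((k, recursively_replace_sketch(v, old, new))
                         for k, v in obj.items())
    if isinstance(obj, (list, tuple)):
        return type(obj)(recursively_replace_sketch(v, old, new) for v in obj)
    return obj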
Example #12
    def test_expand_dims(self):
        v = Variable(['x'], [0, 1])
        actual = v.expand_dims(['x', 'y'])
        expected = Variable(['x', 'y'], [[0], [1]])
        self.assertVariableIdentical(actual, expected)

        actual = v.expand_dims(['y', 'x'])
        self.assertVariableIdentical(actual, expected.T)

        actual = v.expand_dims(OrderedDict([('x', 2), ('y', 2)]))
        expected = Variable(['x', 'y'], [[0, 0], [1, 1]])
        self.assertVariableIdentical(actual, expected)

        v = Variable(['foo'], [0, 1])
        actual = v.expand_dims('foo')
        expected = v
        self.assertVariableIdentical(actual, expected)

        with self.assertRaisesRegexp(ValueError, 'must be a superset'):
            v.expand_dims(['z'])
Example #13
def drop(self, labels, dim=None, inplace=False):
    """Drop variables or index labels from this dataset. Based on xarray.dataset.drop, but adds inplace option.

    Parameters
    ----------
    labels : scalar or list of scalars
        Name(s) of variables or index labels to drop.
    dim : None or str, optional
        Dimension along which to drop index labels. By default (if
        ``dim is None``), drops variables rather than index labels.
    inplace : bool, optional
        Whether to modify the original dataset in place or return a new one.

    Returns
    -------
    dropped : Dataset (self if inplace=True)
    """
    if utils.is_scalar(labels):
        labels = [labels]
    if dim is None:
        self._assert_all_in_dataset(labels)
        drop = set(labels)
        variables = OrderedDict(
            (k, v) for k, v in iteritems(self._variables) if k not in drop)
        coord_names = set(k for k in self._coord_names if k in variables)
        result = self._replace_vars_and_dims(variables,
                                             coord_names,
                                             inplace=inplace)
    else:
        try:
            index = self.indexes[dim]
        except KeyError:
            raise ValueError('dimension %r does not have coordinate labels' %
                             dim)
        new_index = index.drop(labels)
        result = self.loc[{dim: new_index}]

    return self if inplace else result
Example #14
def _initialize_section_trsp_data_array(cds):
    """Create an xarray DataArray with time, depth, and latitude dims

    Parameters
    ----------
    cds : xarray Dataset
        contains LLC coordinates 'k' and (optionally) 'time'

    Returns
    -------
    ds_out : xarray Dataset
        Dataset with the variables
            'trsp_z'
                zero-valued DataArray with time (optional) and 
                depth dimensions
            'Z'
                the original depth coordinate
    """

    coords = OrderedDict()
    dims = ()

    if 'time' in cds:
        coords.update( {'time': cds['time'].values} )
        dims += ('time',)
        zeros = np.zeros((len(cds['time'].values),
                          len(cds['k'].values)))
    else:
        zeros = np.zeros((len(cds['k'].values)))

    coords.update( {'k': cds['k'].values} )

    dims += ('k',)

    xda = xr.DataArray(data=zeros, coords=coords, dims=dims)

    # Convert to dataset to add Z coordinate
    xds = xda.to_dataset(name='trsp_z')
    xds['Z'] = cds['Z']
    xds = xds.set_coords('Z')

    return xds
Example #15
    def __init__(self,
                 data_dir,
                 grid_dir=None,
                 iternum=None,
                 delta_t=1,
                 read_grid=True,
                 file_prefixes=None,
                 ref_date=None,
                 calendar=None,
                 geometry='sphericalpolar',
                 endian='>',
                 ignore_unknown_vars=False,
                 default_dtype=np.dtype('f4'),
                 nx=None,
                 ny=None,
                 nz=None,
                 llc_method="smallchunks",
                 levels=None,
                 extra_metadata=None,
                 extra_variables=None):
        """
        This is not a user-facing class. See open_mdsdataset for argument
        documentation. Only the arguments that differ are documented here.

        Parameters
        ----------
        iternum : int, optional
            The iteration timestep number to read.
        file_prefixes : list
            The prefixes of the data files to be read.
        """

        self.geometry = geometry.lower()
        allowed_geometries = [
            'cartesian', 'sphericalpolar', 'llc', 'cs', 'curvilinear'
        ]
        if self.geometry not in allowed_geometries:
            raise ValueError('Unexpected value for parameter `geometry`. '
                             'It must be one of the following: %s' %
                             allowed_geometries)

        # the directory where the files live
        self.data_dir = data_dir
        self.grid_dir = grid_dir if (grid_dir is not None) else data_dir
        self.extra_variables = extra_variables
        self._ignore_unknown_vars = ignore_unknown_vars

        # The endianness of the files
        # By default, MITgcm does big endian
        if endian not in ['>', '<', '=']:
            raise ValueError("Invalid byte order (endian=%s)" % endian)
        self.endian = endian
        if default_dtype is not None:
            self.default_dtype = np.dtype(default_dtype).newbyteorder(endian)
        else:
            self.default_dtype = default_dtype

        # storage dicts for variables and attributes
        self._variables = OrderedDict()
        self._attributes = OrderedDict()
        self._dimensions = []

        # the dimensions are theoretically the same for all datasets
        self._dimensions.extend(dimensions)
        self.llc = (self.geometry == 'llc')
        self.cs = (self.geometry == 'cs')

        if nz is None:
            self.nz = _guess_model_nz(self.grid_dir)
        else:
            self.nz = nz

        # if user passes extra_metadata, this should have priority
        user_metadata = extra_metadata is not None

        # put in local variable to make it more readable
        if extra_metadata is not None and 'has_faces' in extra_metadata:
            has_faces = extra_metadata['has_faces']
        else:
            has_faces = False

        # --------------- LEGACY ----------------------
        if self.llc:
            has_faces = True
            if extra_metadata is None or 'ny_facets' not in extra_metadata:
                # default to llc90, we only need number of facets
                # and we cannot know nx at this point
                llc = get_extra_metadata(domain='llc', nx=90)
                extra_metadata = llc
        if self.cs:
            has_faces = True
            if extra_metadata is None or 'ny_facets' not in extra_metadata:
                # default to cs32, we only need number of facets
                # and we cannot know nx at this point
                cs = get_extra_metadata(domain='cs', nx=32)
                extra_metadata = cs
        # --------------- /LEGACY ----------------------

        # we don't need to know ny if using llc
        if has_faces and (nx is not None):
            ny = nx

        # Now we need to figure out the horizontal dimensions nx, ny
        # nface is the number of llc faces
        if (nx is not None) and (ny is not None):
            # we have been passed enough information to determine the
            # dimensions without reading any files
            self.ny, self.nx = ny, nx
            self.nface = len(extra_metadata['face_facets']) if has_faces \
                else None
        else:
            # have to peek at the grid file metadata
            self.nface, self.ny, self.nx = (_guess_model_horiz_dims(
                self.grid_dir, is_llc=self.llc, is_cs=self.cs))

        # --------------- LEGACY ----------------------
        if self.llc:
            if not user_metadata:
                # if user didn't provide metadata, we default to llc
                llc = get_extra_metadata(domain='llc', nx=self.nx)
                extra_metadata = llc
        if self.cs:
            if not user_metadata:
                # if user didn't provide metadata, we default to cs
                cs = get_extra_metadata(domain='cs', nx=self.nx)
                extra_metadata = cs
        # --------------- /LEGACY ----------------------

        self.layers = _guess_layers(data_dir)

        if has_faces:
            nyraw = self.nx * self.nface
        else:
            nyraw = self.ny
        self.default_shape_3D = (self.nz, nyraw, self.nx)
        self.default_shape_2D = (nyraw, self.nx)
        self.llc_method = llc_method

        # Now set up the corresponding coordinates.
        # Rather than assuming the dimension names, we use Comodo conventions
        # to parse the dimension metadata.
        # http://pycomodo.forge.imag.fr/norm.html
        irange = np.arange(self.nx)
        jrange = np.arange(self.ny)
        if levels is None:
            krange = np.arange(self.nz)
            krange_p1 = np.arange(self.nz + 1)
        else:
            krange = levels
            krange_p1 = levels + [levels[-1] + 1]
        # the keys are `standard_name` attribute
        dimension_data = {
            "x_grid_index": irange,
            "x_grid_index_at_u_location": irange,
            "x_grid_index_at_f_location": irange,
            "y_grid_index": jrange,
            "y_grid_index_at_v_location": jrange,
            "y_grid_index_at_f_location": jrange,
            "z_grid_index": krange,
            "z_grid_index_at_lower_w_location": krange,
            "z_grid_index_at_upper_w_location": krange,
            "z_grid_index_at_w_location": krange_p1,
        }

        for dim in self._dimensions:
            dim_meta = dimensions[dim]
            dims = dim_meta['dims']
            attrs = dim_meta['attrs']
            data = dimension_data[attrs['standard_name']]
            dim_variable = xr.Variable(dims, data, attrs)
            self._variables[dim] = dim_variable

        # possibly add the llc dimension
        # seems sloppy to hard code this here
        # TODO: move this metadata to variables.py
        if has_faces:
            self._dimensions.append(FACE_DIMNAME)
            data = np.arange(self.nface)
            attrs = {'standard_name': 'face_index'}
            dims = [FACE_DIMNAME]
            self._variables[FACE_DIMNAME] = xr.Variable(dims, data, attrs)

        # do the same for layers
        for layer_name, n_layer in self.layers.items():
            for suffix, offset in zip(['bounds', 'center', 'interface'],
                                      [0, -1, -2]):
                # e.g. "layer_1RHO_bounds"
                # dimname = 'layer_' + layer_name + '_' + suffix
                # e.g. "l1_b"
                dimname = 'l' + layer_name[0] + '_' + suffix[0]
                self._dimensions.append(dimname)
                data = np.arange(n_layer + offset)
                # we should figure out a way to properly populate the layers
                # attributes
                attrs = {
                    'standard_name':
                    layer_name + '_layer_grid_index_at_layer_' + suffix,
                    'swap_dim': 'layer_' + layer_name + '_' + suffix
                }
                dim_variable = xr.Variable([dimname], data, attrs)
                self._variables[dimname] = dim_variable

        # maybe add a time dimension
        if iternum is not None:
            self.time_dim_name = 'time'
            self._dimensions.append(self.time_dim_name)
            # a variable for iteration number
            self._variables['iter'] = xr.Variable(
                (self.time_dim_name, ), [iternum], {
                    'standard_name': 'timestep',
                    'long_name': 'model timestep number'
                })
            self._variables[
                self.time_dim_name] = _iternum_to_datetime_variable(
                    iternum, delta_t, ref_date, calendar, self.time_dim_name)

        # build lookup tables for variable metadata
        self._all_grid_variables = _get_all_grid_variables(
            self.geometry, self.grid_dir, self.layers)
        self._all_data_variables = _get_all_data_variables(
            self.data_dir, self.grid_dir, self.layers, self.extra_variables)

        # The rest of the data has to be read from disk.
        # The list `prefixes` specifies file prefixes from which to infer
        # The problem with this is that some prefixes are single variables
        # while some are multi-variable diagnostics files.
        prefixes = []
        if read_grid:
            prefixes = prefixes + list(self._all_grid_variables.keys())

        # add data files
        prefixes = (
            prefixes +
            _get_all_matching_prefixes(data_dir, iternum, file_prefixes))

        for p in prefixes:
            # use a generator to loop through the variables in each file
            for (vname, dims, data, attrs) in \
                    self.load_from_prefix(p, iternum, extra_metadata):
                # print(vname, dims, data.shape)
                # Sizes of grid variables can vary between mitgcm versions.
                # Check for such inconsistency and correct if so
                (vname, dims, data, attrs) = self.fix_inconsistent_variables(
                    vname, dims, data, attrs)

                # Create masks from hFac variables
                data = self.calc_masks(vname, data)

                thisvar = xr.Variable(dims, data, attrs)
                self._variables[vname] = thisvar
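This class is not constructed directly by users; the docstring points to open_mdsdataset, which in xmitgcm is used roughly as follows. The paths, prefixes, and iteration number below are placeholders, so treat this as a sketch rather than a verified call:

from xmitgcm import open_mdsdataset

ds = open_mdsdataset('./run', grid_dir='./grid', iters=[72],
                     prefix=['T', 'S'], geometry='llc', nx=90)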
Example #16
def _concat_dicts(list_of_dicts):
    result = OrderedDict()
    for eachdict in list_of_dicts:
        for k, v in eachdict.items():
            result[k] = v
    return result
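A quick illustration of the merge semantics (later dictionaries win on duplicate keys), using throwaway inputs:

from collections import OrderedDict

d1 = OrderedDict([('a', 1), ('b', 2)])
d2 = OrderedDict([('b', 20), ('c', 3)])
merged = _concat_dicts([d1, d2])
# OrderedDict([('a', 1), ('b', 20), ('c', 3)]) -- d2's value for 'b' overrides d1's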
Example #17
dimensions = OrderedDict(
    # x direction
    i=dict(dims=['i'],
           attrs=dict(standard_name="x_grid_index",
                      axis='X',
                      long_name="x-dimension of the t grid",
                      swap_dim='XC')),
    i_g=dict(dims=['i_g'],
             attrs=dict(standard_name="x_grid_index_at_u_location",
                        axis='X',
                        long_name="x-dimension of the u grid",
                        c_grid_axis_shift=-0.5,
                        swap_dim='XG')),
    # i_z = dict(dims=['i_z'], swap_dim='XG', attrs=dict(
    #             standard_name="x_grid_index_at_f_location", axis='X',
    #             long_name="x-dimension of the f grid", c_grid_axis_shift=-0.5)),
    # y direction
    j=dict(dims=['j'],
           attrs=dict(standard_name="y_grid_index",
                      axis='Y',
                      long_name="y-dimension of the t grid",
                      swap_dim='YC')),
    j_g=dict(dims=['j_g'],
             attrs=dict(standard_name="y_grid_index_at_v_location",
                        axis='Y',
                        long_name="y-dimension of the v grid",
                        c_grid_axis_shift=-0.5,
                        swap_dim='YG')),
    # j_z = dict(dims=['j_z'], swap_dim='YG', attrs=dict(
    #             standard_name="y_grid_index_at_f_location", axis='Y',
    #             long_name="y-dimension of the f grid", c_grid_axis_shift=-0.5)),
    # x direction
    k=dict(dims=['k'],
           attrs=dict(standard_name="z_grid_index",
                      axis="Z",
                      long_name="z-dimension of the t grid",
                      swap_dim='Z')),
    k_u=dict(dims=['k_u'],
             attrs=dict(standard_name="z_grid_index_at_lower_w_location",
                        axis="Z",
                        long_name="z-dimension of the w grid",
                        c_grid_axis_shift=0.5,
                        swap_dim='Zu')),
    k_l=dict(dims=['k_l'],
             attrs=dict(standard_name="z_grid_index_at_upper_w_location",
                        axis="Z",
                        long_name="z-dimension of the w grid",
                        c_grid_axis_shift=-0.5,
                        swap_dim='Zl')),
    # this is complicated because it is offset in both directions - allowed by comodo?
    k_p1=dict(dims=['k_p1'],
              attrs=dict(standard_name="z_grid_index_at_w_location",
                         axis="Z",
                         long_name="z-dimension of the w grid",
                         c_grid_axis_shift=(-0.5, 0.5),
                         swap_dim='Zp1')))
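This template is consumed by building one xr.Variable per entry, as in the loop in Example #15. A stripped-down sketch of that pattern (the index sizes here are arbitrary):

import numpy as np
import xarray as xr

variables = {}
index_data = {'i': np.arange(90), 'j': np.arange(90), 'k': np.arange(50)}
for dim in ('i', 'j', 'k'):
    meta = dimensions[dim]
    variables[dim] = xr.Variable(meta['dims'], index_data[dim], meta['attrs'])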
Example #18
def _make_data_array(data_tiles,
                     iVar,
                     jVar,
                     kVar,
                     less_output=False,
                     dim4=None,
                     dim5=None):
    """Non user facing function to make a data array from tiled numpy/dask array 
    and strings denoting grid location

    Note that here, I'm including the "tiles" dimension... 
    so dim4 refers to index vector d_4, and dim5 refers to index d_5
    No user should have to deal with this though

    Parameters
    ----------
    data_tiles : numpy/dask array
        Probably loaded from binary via mds_io.read_bin_to_tiles and rearranged
        in llc_tiles_to_xda
    iVar : string
        denotes x grid location, 'i' or 'i_g'
    jVar : string
        denotes y grid location, 'j' or 'j_g'
    kVar : string
        denotes vertical grid location; only 'k' is implemented for now.
        possible to implement 'k_u' for e.g. vertical velocity ... at some point
    less_output : boolean, optional
        debugging flag; False => print more output
    dim4 : string, optional
        Specify the fourth dimension, either 'depth' or 'time'
    dim5 : string, optional
        Specify the fifth dimension, either 'depth' or 'time'

    Returns
    -------
    da : xarray DataArray
    """

    # Save shape and num dimensions for below
    data_shape = data_tiles.shape
    Ndims = len(data_shape)

    # Create minimal coordinate information
    i = np.arange(data_shape[-1])
    j = np.arange(data_shape[-2])
    tiles = np.arange(data_shape[-3])
    d_4 = []
    d_5 = []
    if len(data_shape) > 3:
        if dim4 is None:
            raise TypeError(
                "Please specify 4th dimension as dim4='depth' or dim4='time'")
        d_4 = np.arange(data_shape[-4])

    if len(data_shape) > 4:
        if dim5 is None:
            raise TypeError(
                "Please specify 5th dimension as dim5='depth' or dim5='time'")
        d_5 = np.arange(data_shape[-5])

    # Determine how to handle fourth dimension
    if dim4 == 'depth':
        fourthDimIsDepth = True
    else:
        fourthDimIsDepth = False

    # We can't say much about tile or time dimension
    tile_attrs = OrderedDict([('standard_name', 'face_index')])
    time_attrs = OrderedDict([('standard_name', 'time'), ('long_name', 'Time'),
                              ('axis', 'T')])

    # Create dims tuple, which will at least have
    # e.g. ('tile','j','i') for a 'c' variable
    dims = ('tile', jVar, iVar)

    # Coordinates will be a dictionary of 1 dimensional xarray DataArrays
    # each with their own set of attributes
    coords = OrderedDict()

    if Ndims > 3:
        if fourthDimIsDepth:
            # Only add depth
            dims = (kVar, ) + dims
            k_da = xr.DataArray(data=d_4,
                                coords={kVar: d_4},
                                dims=(kVar, ),
                                attrs=dimensions[kVar]['attrs'])
            coords[kVar] = k_da

        else:
            # Only add time
            dims = ('time', ) + dims
            time_da = xr.DataArray(data=d_4,
                                   coords={'time': d_4},
                                   dims=('time', ),
                                   attrs=time_attrs)
            coords['time'] = time_da

    if Ndims > 4:
        if fourthDimIsDepth:
            # Now add time
            dims = ('time', ) + dims
            time_da = xr.DataArray(data=d_5,
                                   coords={'time': d_5},
                                   dims=('time', ),
                                   attrs=time_attrs)
            coords['time'] = time_da

        else:
            # Now add depth
            dims = (kVar, ) + dims
            k_da = xr.DataArray(data=d_5,
                                coords={kVar: d_5},
                                dims=(kVar, ),
                                attrs=dimensions[kVar]['attrs'])
            coords[kVar] = k_da

    # Now add the standard coordinates
    tile_da = xr.DataArray(data=tiles,
                           coords={'tile': tiles},
                           dims=('tile', ),
                           attrs=tile_attrs)
    j_da = xr.DataArray(data=j,
                        coords={jVar: j},
                        dims=(jVar, ),
                        attrs=dimensions[jVar]['attrs'])
    i_da = xr.DataArray(data=i,
                        coords={iVar: i},
                        dims=(iVar, ),
                        attrs=dimensions[iVar]['attrs'])

    coords['tile'] = tile_da
    coords[jVar] = j_da
    coords[iVar] = i_da

    return xr.DataArray(data=data_tiles, coords=coords, dims=dims)
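A minimal call sketch with random data on 13 tiles of 90x90. It is purely illustrative and relies on the module-level dimensions dictionary shown in Example #17 being in scope:

import numpy as np

data_tiles = np.random.rand(13, 90, 90)
da = _make_data_array(data_tiles, 'i', 'j', 'k')
# da.dims == ('tile', 'j', 'i'); a 4th leading axis would require dim4='depth' or 'time'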
Example #19
from .common import broadcast_1d_array
from .util.gridspec import _get_model_info, prof_altitude

#: Hard-coded dimension variables to use with any Dataset read in
BASE_DIMENSIONS = OrderedDict(
    lon=dict(dims=[
        'lon',
    ],
             attrs={
                 'standard_name': 'longitude',
                 'axis': 'X',
             }),
    lat=dict(
        dims=[
            'lat',
        ],
        attrs={
            'standard_name': 'latitude',
            'axis': 'Y',
        },
    ),
    time=dict(dims=[
        'time',
    ], attrs={}),
    nv=dict(dims=[
        'nv',
    ], attrs={}),
)

#: CF/COARDS recommended dimension order; non-spatiotemporal dimensions
#: should precede these.
DIM_ORDER_PRIORITY = ['time', 'lev', 'lat', 'lon']
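The priority list is typically used to reorder a variable's dimensions before output, along the lines of the commented-out transpose logic in Example #20. A sketch of that reordering:

dims = ['lev', 'time', 'lat', 'lon', 'tracer']
ordered = ([d for d in dims if d not in DIM_ORDER_PRIORITY]
           + [d for d in DIM_ORDER_PRIORITY if d in dims])
# ['tracer', 'time', 'lev', 'lat', 'lon']: non-spatiotemporal dims first, then T, Z, Y, X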
Example #20
    def __init__(self,
                 filename,
                 fields=[],
                 categories=[],
                 fix_cf=False,
                 mode='r',
                 endian='>',
                 diaginfo_file='',
                 tracerinfo_file='',
                 use_mmap=False,
                 dask_delayed=False):

        # Track the metadata accompanying this dataset.
        dir_path = os.path.abspath(os.path.dirname(filename))
        if not dir_path:
            dir_path = os.getcwd()
        if not tracerinfo_file:
            tracerinfo_file = os.path.join(dir_path, 'tracerinfo.dat')
            if not os.path.exists(tracerinfo_file):
                tracerinfo_file = ''
        self.tracerinfo_file = tracerinfo_file
        if not diaginfo_file:
            diaginfo_file = os.path.join(dir_path, 'diaginfo.dat')
            if not os.path.exists(diaginfo_file):
                diaginfo_file = ''
        self.diaginfo_file = diaginfo_file

        self.filename = filename
        self.fsize = os.path.getsize(self.filename)
        self.mode = mode
        if not mode.startswith('r'):
            raise ValueError(
                "Currently only know how to 'r(b)'ead bpch files.")

        # Check endianness flag
        if endian not in ['>', '<', '=']:
            raise ValueError("Invalid byte order (endian={})".format(endian))
        self.endian = endian

        # Open the raw output file, but don't yet read all the data
        self._mmap = use_mmap
        self._dask = dask_delayed
        self._bpch = BPCHFile(self.filename,
                              self.mode,
                              self.endian,
                              tracerinfo_file=tracerinfo_file,
                              diaginfo_file=diaginfo_file,
                              eager=False,
                              use_mmap=self._mmap,
                              dask_delayed=self._dask)
        self.fields = fields
        self.categories = categories

        # Peek into the raw output file and read the header and metadata
        # so that we can get a head start at building the output grid
        self._bpch._read_metadata()
        self._bpch._read_header()

        # Parse the binary file and prepare to add variables to the DataStore
        self._bpch._read_var_data()

        # Create storage dicts for variables and attributes, to be used later
        # when xarray needs to access the data
        self._variables = OrderedDict()
        self._attributes = OrderedDict()
        self._attributes.update(self._bpch._attributes)
        self._dimensions = [d for d in BASE_DIMENSIONS]

        # Begin constructing the coordinate dimensions shared by the
        # output dataset variables
        dim_coords = {}
        self.ctm_info = CTMGrid.from_model(self._attributes['modelname'],
                                           resolution=self._attributes['res'])

        # Add vertical dimensions
        self._dimensions.append(dict(dims=[
            'lev',
        ], attrs={'axis': 'Z'}))
        self._dimensions.append(dict(dims=[
            'lev_trop',
        ], attrs={'axis': 'Z'}))
        self._dimensions.append(dict(dims=[
            'lev_edge',
        ], attrs={'axis': 'Z'}))
        eta_centers = self.ctm_info.eta_centers
        sigma_centers = self.ctm_info.sigma_centers

        # Add time dimensions
        self._dimensions.append(
            dict(dims=[
                'time',
            ],
                 attrs={
                     'axis': 'T',
                     'long_name': 'time',
                     'standard_name': 'time'
                 }))

        # Add lat/lon dimensions
        self._dimensions.append(
            dict(dims=[
                'lon',
            ],
                 attrs={
                     'axis': 'X',
                     'long_name': 'longitude coordinate',
                     'standard_name': 'longitude'
                 }))
        self._dimensions.append(
            dict(dims=[
                'lat',
            ],
                 attrs={
                     'axis': 'Y',
                     'long_name': 'latitude coordinate',
                     'standard_name': 'latitude'
                 }))

        if eta_centers is not None:
            lev_vals = eta_centers
            lev_attrs = {
                'standard_name': 'atmosphere_hybrid_sigma_pressure_coordinate',
                'axis': 'Z'
            }
        else:
            lev_vals = sigma_centers
            lev_attrs = {
                'standard_name': 'atmosphere_hybrid_sigma_pressure_coordinate',
                'axis': 'Z'
            }
        self._variables['lev'] = xr.Variable([
            'lev',
        ], lev_vals, lev_attrs)

        ## Latitude / Longitude
        # TODO: Add lon/lat bounds

        # Detect if we're on a nested grid; in that case, we'll have a displaced
        # origin set in the variable attributes we previously read
        ref_key = list(self._bpch.var_attrs.keys())[0]
        ref_attrs = self._bpch.var_attrs[ref_key]
        self.is_nested = (ref_attrs['origin'] != (1, 1, 1))

        lon_centers = self.ctm_info.lon_centers
        lat_centers = self.ctm_info.lat_centers

        if self.is_nested:
            ix, iy, _ = ref_attrs['origin']
            nx, ny, *_ = ref_attrs['original_shape']
            # Correct i{x,y} for IDL->Python indexing (1-indexed -> 0-indexed)
            ix -= 1
            iy -= 1
            lon_centers = lon_centers[ix:ix + nx]
            lat_centers = lat_centers[iy:iy + ny]

        self._variables['lon'] = xr.Variable(['lon'], lon_centers, {
            'long_name': 'longitude',
            'units': 'degrees_east'
        })
        self._variables['lat'] = xr.Variable(['lat'], lat_centers, {
            'long_name': 'latitude',
            'units': 'degrees_north'
        })
        # TODO: Fix longitudes if ctm_grid.center180

        # Add variables from the parsed BPCH file to our DataStore
        for vname in list(self._bpch.var_data.keys()):

            var_data = self._bpch.var_data[vname]
            var_attr = self._bpch.var_attrs[vname]

            if fields and (var_attr['name'] not in fields):
                continue
            if categories and (var_attr['category'] not in categories):
                continue

            # Process dimensions
            dims = [
                'time',
                'lon',
                'lat',
            ]
            dshape = var_attr['original_shape']
            if len(dshape) == 3:
                # Process the vertical coordinate. A few things can happen here:
                # 1) We have cell-centered values on the "Nlayer" grid; we can
                #    take these variables and map them to 'lev'
                # 2) We have edge values on an "Nlayer" + 1 grid; we can take
                #    these and use them with 'lev_edge'
                # 3) We have troposphere values on "Ntrop"; we can take these
                #    and use them with 'lev_trop', but we won't have coordinate
                #    information yet
                # All other cases we do not handle yet; this includes the
                # aircraft emissions and a few other things. Note that tracer
                # sources do not have a vertical coord to worry about!
                nlev = dshape[-1]
                grid_nlev = self.ctm_info.Nlayers
                grid_ntrop = self.ctm_info.Ntrop
                try:
                    if nlev == grid_nlev:
                        dims.append('lev')
                    elif nlev == grid_nlev + 1:
                        dims.append('lev_edge')
                    elif nlev == grid_ntrop:
                        dims.append('lev_trop')
                    else:
                        continue
                except AttributeError:
                    warnings.warn("Couldn't resolve grid_spec vertical layout")
                    continue

            # xarray Variables are thin wrappers for numpy.ndarrays, or really
            # any object that extends the ndarray interface. A critical part of
            # the original ndarray interface is that the underlying data has to
            # be contiguous in memory. We can enforce this to happen by
            # concatenating each bundle in the variable data bundles we read
            # from the bpch file
            data = self._concat([v.data for v in var_data])

            # Is the variable time-invariant? If it is, kill the time dim.
            # Here, we mean it only as one sample in the dataset.
            if data.shape[0] == 1:
                dims = dims[1:]
                data = data.squeeze()

            # Create a variable containing this data
            var = xr.Variable(dims, data, var_attr)

            # Shuffle dims for CF/COARDS compliance if requested
            # TODO: For this to work, we have to force a load of the data.
            #       Is there a way to re-write BPCHDataProxy so that that's not
            #       necessary?
            #       Actually, we can't even force a load because var.data is a
            #       numpy.ndarray. Weird.
            # if fix_dims:
            #     target_dims = [d for d in DIM_ORDER_PRIORITY if d in dims]
            #     var = var.transpose(*target_dims)

            self._variables[vname] = var

            # Try to add a time dimension
            # TODO: Time units?
            if (len(var_data) > 1) and 'time' not in self._variables:
                time_bnds = np.asarray([v.time for v in var_data])
                times = time_bnds[:, 0]

                self._variables['time'] = xr.Variable(
                    [
                        'time',
                    ], times, {
                        'bounds': 'time_bnds',
                        'units': cf.CTM_TIME_UNIT_STR
                    })
                self._variables['time_bnds'] = xr.Variable(
                    ['time', 'nv'], time_bnds, {'units': cf.CTM_TIME_UNIT_STR})
                self._variables['nv'] = xr.Variable([
                    'nv',
                ], [0, 1])
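In xbpch this store backs the public reader open_bpchdataset. A rough usage sketch follows; the file paths are placeholders and the keyword names should be checked against the installed version:

import xbpch

ds = xbpch.open_bpchdataset('ctm.bpch',
                            tracerinfo_file='tracerinfo.dat',
                            diaginfo_file='diaginfo.dat')
print(ds.data_vars)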
Example #21
class BPCHFile(object):
    """ A file object for representing BPCH data on disk

    Attributes
    ----------
    fp : FortranFile
        A pointer to the open unformatted Fortran binary output (the original
        bpch file)
    var_data, var_attrs : dict
        Containers of `BPCHDataBundle`s and dicts, respectively, holding
        the accessor functions to the raw bpch data and their associated
        metadata

    """

    def __init__(self, filename, mode='rb', endian='>',
                 diaginfo_file='', tracerinfo_file='', eager=False,
                 use_mmap=False, dask_delayed=False):
        """ Load a BPCHFile

        Parameters
        ----------
        filename : str
            Path to the bpch file on disk
        mode : str
            Mode string to pass to the file opener; this is currently fixed to
            "rb" and all other values will be rejected
        endian : str {">", "<", ":"}
            Endian-ness of the Fortran output file
        {tracerinfo, diaginfo}_file : str
            Path to the tracerinfo.dat and diaginfo.dat files containing
            metadata pertaining to the output in the bpch file being read.
        eager : bool
            Flag to immediately read variable data; if "False", then nothing
            will be read from the file and you'll need to do so manually
        use_mmap : bool
            Use memory-mapping to read data from file
        dask_delayed : bool
            Use dask to create delayed references to the data-reading functions
        """

        self.mode = mode
        if not mode.startswith('r'):
            raise ValueError("Currently only know how to 'r(b)'ead bpch files.")

        self.filename = filename
        self.fsize = os.path.getsize(self.filename)
        self.endian = endian

        # Open a pointer to the file
        self.fp = FortranFile(self.filename, self.mode, self.endian)

        dir_path = os.path.abspath(os.path.dirname(filename))
        if not dir_path:
            dir_path = os.getcwd()
        if not tracerinfo_file:
            tracerinfo_file = os.path.join(dir_path, "tracerinfo.dat")
            if not os.path.exists(tracerinfo_file):
                tracerinfo_file = ''
        self.tracerinfo_file = tracerinfo_file
        if not diaginfo_file:
            diaginfo_file = os.path.join(dir_path, "diaginfo.dat")
            if not os.path.exists(diaginfo_file):
                diaginfo_file = ''
        self.diaginfo_file = diaginfo_file

        # Container to record file metadata
        self._attributes = OrderedDict()

        # Don't necessarily need to save diag/tracer_dict yet
        self.diaginfo_df, _ = get_diaginfo(self.diaginfo_file)
        self.tracerinfo_df, _ = get_tracerinfo(self.tracerinfo_file)

        # Container for bundles contained in the output file.
        self.var_data = {}
        self.var_attrs = {}

        # Critical information for accessing file contents
        self._header_pos = None

        # Data loading strategy
        self.use_mmap = use_mmap
        self.dask_delayed = dask_delayed

        # Control eager versus deferring reading
        self.eager = eager
        if (mode.startswith('r') and self.eager):
            self._read()

    def close(self):
        """ Close this bpch file.

        """

        if not self.fp.closed:
            for v in list(self.var_data):
                del self.var_data[v]

            self.fp.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def _read(self):
        """ Parse the entire bpch file on disk and set up easy access to meta-
        and data blocks.

        """

        self._read_metadata()
        self._read_header()
        self._read_var_data()

    def _read_metadata(self):
        """ Read the main metadata packaged within a bpch file, indicating
        the output filetype and its title.

        """

        filetype = self.fp.readline().strip()
        filetitle = self.fp.readline().strip()
        # Decode to UTF string, if possible
        try:
            filetype = str(filetype, 'utf-8')
            filetitle = str(filetitle, 'utf-8')
        except:
            # TODO: Handle this edge-case of converting file metadata more elegantly.
            pass

        self.__setattr__('filetype', filetype)
        self.__setattr__('filetitle', filetitle)

    def _read_header(self):
        """ Process the header information (data model / grid spec) """

        self._header_pos = self.fp.tell()

        line = self.fp.readline('20sffii')
        modelname, res0, res1, halfpolar, center180 = line
        self._attributes.update({
            "modelname": str(modelname, 'utf-8').strip(),
            "halfpolar": halfpolar,
            "center180": center180,
            "res": (res0, res1)
        })
        self.__setattr__('modelname', modelname)
        self.__setattr__('res', (res0, res1))
        self.__setattr__('halfpolar', halfpolar)
        self.__setattr__('center180', center180)

        # Re-wind the file
        self.fp.seek(self._header_pos)


    def _read_var_data(self):
        """ Iterate over the block of this bpch file and return handlers
        in the form of `BPCHDataBundle`s for access to the data contained
        therein.

        """

        var_bundles = OrderedDict()
        var_attrs = OrderedDict()

        n_vars = 0

        while self.fp.tell() < self.fsize:

            var_attr = OrderedDict()

            # read first and second header lines
            line = self.fp.readline('20sffii')
            modelname, res0, res1, halfpolar, center180 = line

            line = self.fp.readline('40si40sdd40s7i')
            category_name, number, unit, tau0, tau1, reserved = line[:6]
            dim0, dim1, dim2, dim3, dim4, dim5, skip = line[6:]
            var_attr['number'] = number

            # Decode byte-strings to utf-8
            category_name = str(category_name, 'utf-8')
            var_attr['category'] = category_name.strip()
            unit = str(unit, 'utf-8')

            # get additional metadata from tracerinfo / diaginfo
            try:
                cat_df = self.diaginfo_df[
                    self.diaginfo_df.name == category_name.strip()
                ]
                # TODO: Safer logic for handling case where more than one
                #       tracer metadata match was made
                # if len(cat_df > 1):
                #     raise ValueError(
                #         "More than one category matching {} found in "
                #         "diaginfo.dat".format(
                #             category_name.strip()
                #         )
                #     )
                # Safe now to select the only row in the DataFrame
                cat = cat_df.T.squeeze()

                tracer_num = int(cat.offset) + int(number)
                diag_df = self.tracerinfo_df[
                    self.tracerinfo_df.tracer == tracer_num
                ]
                # TODO: Safer logic for handling case where more than one
                #       tracer metadata match was made
                # if len(diag_df > 1):
                #     raise ValueError(
                #         "More than one tracer matching {:d} found in "
                #         "tracerinfo.dat".format(tracer_num)
                #     )
                # Safe now to select only row in the DataFrame
                diag = diag_df.T.squeeze()
                diag_attr = diag.to_dict()

                if not unit.strip():  # unit may be empty in bpch
                    unit = diag_attr['unit']  # but not in tracerinfo
                var_attr.update(diag_attr)
            except:
                diag = {'name': '', 'scale': 1}
                var_attr.update(diag)
            var_attr['unit'] = unit

            vname = diag['name']
            fullname = category_name.strip() + "_" + vname

            # parse metadata, get data or set a data proxy
            if dim2 == 1:
                data_shape = (dim0, dim1)         # 2D field
            else:
                data_shape = (dim0, dim1, dim2)
            var_attr['original_shape'] = data_shape

            # Add proxy time dimension to shape
            data_shape = tuple([1, ] + list(data_shape))
            origin = (dim3, dim4, dim5)
            var_attr['origin'] = origin

            timelo, timehi = cf.tau2time(tau0), cf.tau2time(tau1)

            pos = self.fp.tell()
            # Note that we don't pass a dtype, and assume everything is
            # single-fp floats with the correct endian, as hard-coded
            var_bundle = BPCHDataBundle(
                data_shape, self.endian, self.filename, pos, [timelo, timehi],
                metadata=var_attr,
                use_mmap=self.use_mmap, dask_delayed=self.dask_delayed
            )
            self.fp.skipline()

            # Save the data as a "bundle" for concatenating in the final step
            if fullname in var_bundles:
                var_bundles[fullname].append(var_bundle)
            else:
                var_bundles[fullname] = [var_bundle, ]
                var_attrs[fullname] = var_attr
                n_vars += 1

        self.var_data = var_bundles
        self.var_attrs = var_attrs
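A short usage sketch of the class as a context manager, based only on the methods shown above; the file names are placeholders:

with BPCHFile('ctm.bpch', tracerinfo_file='tracerinfo.dat',
              diaginfo_file='diaginfo.dat', eager=True) as bpch:
    for fullname, bundles in bpch.var_data.items():
        print(fullname, len(bundles), bpch.var_attrs[fullname].get('unit'))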
Example #22
    def _read_var_data(self):
        """ Iterate over the block of this bpch file and return handlers
        in the form of `BPCHDataBundle`s for access to the data contained
        therein.

        """

        var_bundles = OrderedDict()
        var_attrs = OrderedDict()

        n_vars = 0

        while self.fp.tell() < self.fsize:

            var_attr = OrderedDict()

            # read first and second header lines
            line = self.fp.readline('20sffii')
            modelname, res0, res1, halfpolar, center180 = line

            line = self.fp.readline('40si40sdd40s7i')
            category_name, number, unit, tau0, tau1, reserved = line[:6]
            dim0, dim1, dim2, dim3, dim4, dim5, skip = line[6:]
            var_attr['number'] = number

            # Decode byte-strings to utf-8
            category_name = str(category_name, 'utf-8')
            var_attr['category'] = category_name.strip()
            unit = str(unit, 'utf-8')

            # get additional metadata from tracerinfo / diaginfo
            try:
                cat_df = self.diaginfo_df[
                    self.diaginfo_df.name == category_name.strip()
                ]
                # TODO: Safer logic for handling case where more than one
                #       tracer metadata match was made
                # if len(cat_df > 1):
                #     raise ValueError(
                #         "More than one category matching {} found in "
                #         "diaginfo.dat".format(
                #             category_name.strip()
                #         )
                #     )
                # Safe now to select the only row in the DataFrame
                cat = cat_df.T.squeeze()

                tracer_num = int(cat.offset) + int(number)
                diag_df = self.tracerinfo_df[
                    self.tracerinfo_df.tracer == tracer_num
                ]
                # TODO: Safer logic for handling case where more than one
                #       tracer metadata match was made
                # if len(diag_df > 1):
                #     raise ValueError(
                #         "More than one tracer matching {:d} found in "
                #         "tracerinfo.dat".format(tracer_num)
                #     )
                # Safe now to select only row in the DataFrame
                diag = diag_df.T.squeeze()
                diag_attr = diag.to_dict()

                if not unit.strip():  # unit may be empty in bpch
                    unit = diag_attr['unit']  # but not in tracerinfo
                var_attr.update(diag_attr)
            except:
                diag = {'name': '', 'scale': 1}
                var_attr.update(diag)
            var_attr['unit'] = unit

            vname = diag['name']
            fullname = category_name.strip() + "_" + vname

            # parse metadata, get data or set a data proxy
            if dim2 == 1:
                data_shape = (dim0, dim1)         # 2D field
            else:
                data_shape = (dim0, dim1, dim2)
            var_attr['original_shape'] = data_shape

            # Add proxy time dimension to shape
            data_shape = tuple([1, ] + list(data_shape))
            origin = (dim3, dim4, dim5)
            var_attr['origin'] = origin

            timelo, timehi = cf.tau2time(tau0), cf.tau2time(tau1)

            pos = self.fp.tell()
            # Note that we don't pass a dtype, and assume everything is
            # single-fp floats with the correct endian, as hard-coded
            var_bundle = BPCHDataBundle(
                data_shape, self.endian, self.filename, pos, [timelo, timehi],
                metadata=var_attr,
                use_mmap=self.use_mmap, dask_delayed=self.dask_delayed
            )
            self.fp.skipline()

            # Save the data as a "bundle" for concatenating in the final step
            if fullname in var_bundles:
                var_bundles[fullname].append(var_bundle)
            else:
                var_bundles[fullname] = [var_bundle, ]
                var_attrs[fullname] = var_attr
                n_vars += 1

        self.var_data = var_bundles
        self.var_attrs = var_attrs
Example #23
    def __init__(self, filename, mode='rb', endian='>',
                 diaginfo_file='', tracerinfo_file='', eager=False,
                 use_mmap=False, dask_delayed=False):
        """ Load a BPCHFile

        Parameters
        ----------
        filename : str
            Path to the bpch file on disk
        mode : str
            Mode string to pass to the file opener; only read modes are
            currently supported, and any mode not starting with "r" will be
            rejected
        endian : str {">", "<", ":"}
            Endian-ness of the Fortran output file
        {tracerinfo, diaginfo}_file : str
            Path to the tracerinfo.dat and diaginfo.dat files containing
            metadata pertaining to the output in the bpch file being read.
        eager : bool
            Flag to immediately read variable data; if False, nothing will be
            read from the file and you'll need to do so manually
        use_mmap : bool
            Use memory-mapping to read data from file
        dask_delayed : bool
            Use dask to create delayed references to the data-reading functions
        """

        self.mode = mode
        if not mode.startswith('r'):
            raise ValueError("Currently only know how to 'r(b)'ead bpch files.")

        self.filename = filename
        self.fsize = os.path.getsize(self.filename)
        self.endian = endian

        # Open a pointer to the file
        self.fp = FortranFile(self.filename, self.mode, self.endian)

        dir_path = os.path.abspath(os.path.dirname(filename))
        if not dir_path:
            dir_path = os.getcwd()
        if not tracerinfo_file:
            tracerinfo_file = os.path.join(dir_path, "tracerinfo.dat")
            if not os.path.exists(tracerinfo_file):
                tracerinfo_file = ''
        self.tracerinfo_file = tracerinfo_file
        if not diaginfo_file:
            diaginfo_file = os.path.join(dir_path, "diaginfo.dat")
            if not os.path.exists(diaginfo_file):
                diaginfo_file = ''
        self.diaginfo_file = diaginfo_file

        # Container to record file metadata
        self._attributes = OrderedDict()

        # Don't necessarily need to save diag/tracer_dict yet
        self.diaginfo_df, _ = get_diaginfo(self.diaginfo_file)
        self.tracerinfo_df, _ = get_tracerinfo(self.tracerinfo_file)

        # Container for bundles contained in the output file.
        self.var_data = {}
        self.var_attrs = {}

        # Critical information for accessing file contents
        self._header_pos = None

        # Data loading strategy
        self.use_mmap = use_mmap
        self.dask_delayed = dask_delayed

        # Control eager versus deferring reading
        self.eager = eager
        if (mode.startswith('r') and self.eager):
            self._read()
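
A small, self-contained sketch (not part of the constructor above) of the metadata-file lookup it performs: when no explicit path is given, a tracerinfo.dat / diaginfo.dat file is searched for next to the bpch file and silently skipped when absent. The helper name below is hypothetical.

import os

def _resolve_info_file(bpch_filename, explicit_path, default_name):
    # Prefer an explicit path when one is given
    if explicit_path:
        return explicit_path
    # Otherwise look next to the bpch file, falling back to the working directory
    dir_path = os.path.abspath(os.path.dirname(bpch_filename)) or os.getcwd()
    candidate = os.path.join(dir_path, default_name)
    # Return an empty string when the metadata file cannot be found
    return candidate if os.path.exists(candidate) else ''

# e.g. _resolve_info_file("runs/ctm.bpch", "", "tracerinfo.dat")
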
Ejemplo n.º 24
0
    def test_stack_unstack_consistency(self):
        v = Variable(['x', 'y'], [[0, 1], [2, 3]])
        actual = (v.stack(z=('x', 'y'))
                   .unstack(z=OrderedDict([('x', 2), ('y', 2)])))
        self.assertVariableIdentical(actual, v)
Ejemplo n.º 25
0
def _dataset_multi_concat(
    datasets,
    dim,
    data_vars,
    coords,
    compat,
    positions,
    join="outer",
):
    """
    Concatenate a sequence of datasets along a dimension, trying concatenation along alternate dimensions when the 
    chosen dimension is not present. This function is based on _dataset_concat from xarray.core.concat.py in xarray 
    0.15. It includes a modification to drop mismatched coordinates from datasets instead of throwing a ValueError. 
    This drop removes the variable from coordinates, but it remains a variable in the dataset.
    """
    # Make sure we're working on a copy (we'll be loading variables)
    datasets = [ds.copy() for ds in datasets]

    # determine what dimensions we will be concatenating over, including the preferred dim and any alternatives when
    # the preferred dim is absent
    dims = _find_concat_dims(datasets, dim)
    dims, coordinates = _calc_concat_dims_coords(dims)

    datasets = align(*datasets, join=join, copy=False, exclude=dims)

    dim_coords, dims_sizes, coord_names, data_names = _parse_datasets(datasets)
    dim_names = set(dim_coords)
    unlabeled_dims = dim_names - coord_names
    both_data_and_coords = coord_names & data_names
    if both_data_and_coords:
        # Instead of throwing a ValueError, make the coordinates match by removing the mismatched coordinate
        for ds in datasets:
            for variable in both_data_and_coords:
                if variable in ds.coords:
                    # This makes the variable no longer a coordinate, but does not remove it from the dataset entirely
                    ds._coord_names.remove(variable)
                    coord_names.discard(variable)

    # we don't want the concat dimensions in the result dataset yet
    for dim in dims:
        dim_coords.pop(dim, None)
        dims_sizes.pop(dim, None)

        # case where concat dimension is a coordinate or data_var but not a dimension
        if (dim in coord_names or dim in data_names) and dim not in dim_names:
            datasets = [ds.expand_dims(dim) for ds in datasets]

    # determine which variables to concatenate
    concat_over, equals, concat_dim_lengths = _calc_concat_over(
        datasets, dims, dim_names, data_vars, coords, compat)

    # determine which variables to merge, and then merge them according to compat
    variables_to_merge = (coord_names | data_names) - concat_over - dim_names

    result_vars = {}
    if variables_to_merge:
        to_merge = {var: [] for var in variables_to_merge}

        for ds in datasets:
            for var in variables_to_merge:
                if var in ds:
                    to_merge[var].append(ds.variables[var])

        for var in variables_to_merge:
            result_vars[var] = unique_variable(var,
                                               to_merge[var],
                                               compat=compat,
                                               equals=equals.get(var, None))

    result_vars.update(dim_coords)

    # assign attrs and encoding from first dataset
    result_attrs = datasets[0].attrs
    result_encoding = datasets[0].encoding

    # check that global attributes are fixed across all datasets if necessary
    for ds in datasets[1:]:
        if compat == "identical" and not utils.dict_equiv(
                ds.attrs, result_attrs):
            raise ValueError("Dataset global attributes not equal.")

    # we've already verified everything is consistent; now, calculate
    # shared dimension sizes so we can expand the necessary variables
    def ensure_common_dims(vars):
        # ensure each variable with the given name shares the same
        # dimensions and the same shape for all of them except along the
        # concat dimension
        common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))

        # find the first concat dimension available in vars; indexing the list
        # directly would raise IndexError when none are present
        available = [x for x in dims if x in common_dims]
        if available:
            concat_dim = available[0]
        else:
            # none of the concat dims are present - add the first one
            concat_dim = dims[0]
            common_dims = (concat_dim,) + common_dims

        for var, dim_len in zip(vars, concat_dim_lengths[concat_dim]):
            if var.dims != common_dims:
                common_shape = tuple(
                    dims_sizes.get(d, dim_len) for d in common_dims)
                var = var.expand_dims(common_dims, common_shape)
            yield var

    # stack up each variable to fill-out the dataset (in order)
    # n.b. this loop preserves variable order, needed for groupby.
    for k in datasets[0].variables:
        if k in concat_over:
            try:
                vars = ensure_common_dims([ds.variables[k] for ds in datasets])
            except KeyError:
                raise ValueError("%r is not present in all datasets." % k)
            # get the dimension to concatenate this variable on - choose first applicable dim from dims
            dim = _get_concat_dim(dims, [ds.variables[k] for ds in datasets])
            combined = concat_vars(vars, dim, positions)
            assert isinstance(combined, Variable)
            result_vars[k] = combined

    result = Dataset(result_vars, attrs=result_attrs)
    absent_coord_names = coord_names - set(result.variables)
    if absent_coord_names:
        raise ValueError(
            "Variables %r are coordinates in some datasets but not others." %
            absent_coord_names)
    # Current versions of Dataset.set_coords and Dataset.drop force an
    # _assert_all_in_dataset check that we don't want; xarray 0.15 can disable
    # it via errors='ignore', but for now just call the underlying logic
    # result = result.set_coords(coord_names, errors='ignore')
    result._coord_names.update(coord_names)
    result.encoding = result_encoding

    # result = result.drop(unlabeled_dims, errors='ignore')
    drop = set(unlabeled_dims)
    variables = OrderedDict(
        (k, v) for k, v in result._variables.items() if k not in drop)
    coord_names = set(k for k in result._coord_names if k in variables)
    result._replace_vars_and_dims(variables, coord_names)

    for coord in coordinates:
        if coord:
            # add the concat dimension last to ensure that it's in the final Dataset
            result[coord.name] = coord

    return result
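
A brief sketch (not from the source) of the coordinate-demotion behaviour described in the docstring, expressed with xarray's public API: when a variable is a coordinate in only some of the datasets, demoting it with reset_coords lets an ordinary concat succeed instead of raising. The variable and dimension names here are made up.

import xarray as xr

ds1 = xr.Dataset({"t": ("x", [10.0, 11.0])}, coords={"x": [0, 1]})
ds2 = xr.Dataset({"t": ("x", [12.0, 13.0])}, coords={"x": [2, 3]}).set_coords("t")

# 't' is a coordinate only in ds2, which plain xr.concat refuses to combine,
# so demote it to an ordinary data variable before concatenating
combined = xr.concat([ds1, ds2.reset_coords("t")], dim="x")
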
Ejemplo n.º 26
0
def open_bpchdataset(filename,
                     fields=[],
                     categories=[],
                     tracerinfo_file='tracerinfo.dat',
                     diaginfo_file='diaginfo.dat',
                     endian=">",
                     decode_cf=True,
                     memmap=True,
                     dask=True,
                     return_store=False):
    """ Open a GEOS-Chem BPCH file output as an xarray Dataset.

    Parameters
    ----------
    filename : string
        Path to the output file to read in.
    {tracerinfo,diaginfo}_file : string, optional
        Path to the metadata "info" .dat files which are used to decipher
        the metadata corresponding to each variable in the output dataset.
        If not provided, will look for them in the current directory or
        fall back on a generic set.
    fields : list, optional
        List of a subset of variable names to return. This can substantially
        improve read performance. Note that the field here is just the tracer
        name - not the category, e.g. 'O3' instead of 'IJ-AVG-$_O3'.
    categories : list, optional
        List a subset of variable categories to look through. This can
        substantially improve read performance.
    endian : {'=', '>', '<'}, optional
        Endianness of file on disk. By default, "big endian" (">") is assumed.
    decode_cf : bool
        Enforce CF conventions for variable names, units, and other metadata
    default_dtype : numpy.dtype, optional
        Default datatype for variables encoded in file on disk (single-precision
        float by default).
    memmap : bool
        Flag indicating that data should be memory-mapped from disk instead of
        eagerly loaded into memory
    dask : bool
        Flag indicating that data reading should be deferred (delayed) to
        construct a task-graph for later execution
    return_store : bool
        Also return the underlying DataStore to the user

    Returns
    -------
    ds : xarray.Dataset
        Dataset containing the requested fields (or the entire file), with data
        contained in proxy containers for access later.
    store : xarray.AbstractDataStore
        Underlying DataStore which handles the loading and processing of
        bpch files on disk

    """

    store = BPCHDataStore(filename,
                          fields=fields,
                          categories=categories,
                          tracerinfo_file=tracerinfo_file,
                          diaginfo_file=diaginfo_file,
                          endian=endian,
                          use_mmap=memmap,
                          dask_delayed=dask)
    ds = xr.Dataset.load_store(store)
    # Record the file object underlying the store this Dataset was read from,
    # so that we can clean it up later
    ds._file_obj = store._bpch

    # Handle CF corrections
    if decode_cf:
        decoded_vars = OrderedDict()
        rename_dict = {}
        for v in ds:
            cf_name = cf.get_valid_varname(v)
            rename_dict[v] = cf_name
            new_var = cf.enforce_cf_variable(ds[v])
            decoded_vars[cf_name] = new_var
        ds = xr.Dataset(decoded_vars, attrs=ds.attrs.copy())

        # ds.rename(rename_dict, inplace=True)

        # TODO: There's a bug with xr.decode_cf which eagerly loads data.
        #       Re-enable this once that bug is fixed
        # Note that we do not need to decode the times because we explicitly
        # kept track of them as we parsed the data.
        # ds = xr.decode_cf(ds, decode_times=False)

    # Set attributes for CF conventions
    ts = get_timestamp()
    ds.attrs.update(
        dict(
            Conventions='CF1.6',
            source=filename,
            tracerinfo=tracerinfo_file,
            diaginfo=diaginfo_file,
            filetype=store._bpch.filetype,
            filetitle=store._bpch.filetitle,
            history=("{}: Processed/loaded by xbpch-{} from {}".format(
                ts, ver, filename)),
        ))

    # To immediately load the data from the BPCHDataProxy payloads, the caller
    # currently needs to access ds.data_vars explicitly
    if return_store:
        return ds, store
    else:
        return ds
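
A minimal usage sketch based on the signature above; the bpch file name is a hypothetical placeholder and the import assumes open_bpchdataset is exposed at the top level of the xbpch package.

import xbpch

ds = xbpch.open_bpchdataset(
    "ctm.bpch",                         # hypothetical GEOS-Chem output file
    fields=["O3", "NO2"],               # tracer names only, not full category names
    tracerinfo_file="tracerinfo.dat",
    diaginfo_file="diaginfo.dat",
    memmap=True, dask=True,             # defer reads via memory-mapping + dask
)
print(ds.data_vars)                     # accessing data_vars loads the proxy payloads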