Example 1
def inplace_isel(dataset, **indexers):
    invalid = [k for k in indexers if k not in dataset.dims]
    if invalid:
        raise ValueError("dimensions %r do not exist" % invalid)

    # all indexers should be int, slice or np.ndarrays
    indexers = [
        (k, (np.asarray(v) if not isinstance(v,
                                             (int, np.integer, slice)) else v))
        for k, v in iteritems(indexers)
    ]

    for name, var in iteritems(dataset._variables):
        var_indexers = dict((k, v) for k, v in indexers if k in var.dims)
        dataset._variables[name] = var.isel(**var_indexers)
    return dataset
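
A quick usage sketch for inplace_isel above, assuming numpy, xarray, and the iteritems py2/3 compatibility shim used by the snippet are in scope; the dataset and variable names are illustrative only:

import numpy as np
import xarray as xr

ds = xr.Dataset({'temperature': (('x', 'y'), np.arange(12.0).reshape(3, 4))})

# keep only the first two rows along 'x'; ds itself is mutated and returned
inplace_isel(ds, x=slice(2))
# with the xarray version this snippet targets, ds now has x == 2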
Example 2
    def test_concat(self):
        # TODO: simplify and split this test case

        # drop the third dimension to keep things relatively understandable
        data = create_test_data()
        for k in list(data):
            if 'dim3' in data[k].dims:
                del data[k]

        split_data = [data.isel(dim1=slice(3)),
                      data.isel(dim1=slice(3, None))]
        self.assertDatasetIdentical(data, concat(split_data, 'dim1'))

        def rectify_dim_order(dataset):
            # return a new dataset with all variable dimensions transposed into
            # the order in which they are found in `data`
            return Dataset(dict((k, v.transpose(*data[k].dims))
                                for k, v in iteritems(dataset.data_vars)),
                           dataset.coords, attrs=dataset.attrs)

        for dim in ['dim1', 'dim2']:
            datasets = [g for _, g in data.groupby(dim, squeeze=False)]
            self.assertDatasetIdentical(data, concat(datasets, dim))

        dim = 'dim2'
        self.assertDatasetIdentical(
            data, concat(datasets, data[dim]))
        self.assertDatasetIdentical(
            data, concat(datasets, data[dim], coords='minimal'))

        datasets = [g for _, g in data.groupby(dim, squeeze=True)]
        concat_over = [k for k, v in iteritems(data.coords)
                       if dim in v.dims and k != dim]
        actual = concat(datasets, data[dim], coords=concat_over)
        self.assertDatasetIdentical(data, rectify_dim_order(actual))

        actual = concat(datasets, data[dim], coords='different')
        self.assertDatasetIdentical(data, rectify_dim_order(actual))

        # make sure the coords argument behaves as expected
        data.coords['extra'] = ('dim4', np.arange(3))
        for dim in ['dim1', 'dim2']:
            datasets = [g for _, g in data.groupby(dim, squeeze=True)]
            actual = concat(datasets, data[dim], coords='all')
            expected = np.array([data['extra'].values
                                 for _ in range(data.dims[dim])])
            self.assertArrayEqual(actual['extra'].values, expected)

            actual = concat(datasets, data[dim], coords='different')
            self.assertDataArrayEqual(data['extra'], actual['extra'])
            actual = concat(datasets, data[dim], coords='minimal')
            self.assertDataArrayEqual(data['extra'], actual['extra'])

        # verify that the dim argument takes precedence over
        # concatenating dataset variables of the same name
        dim = (2 * data['dim1']).rename('dim1')
        datasets = [g for _, g in data.groupby('dim1', squeeze=False)]
        expected = data.copy()
        expected['dim1'] = dim
        self.assertDatasetIdentical(expected, concat(datasets, dim))
Example 3
    def test_open_encodings(self):
        # Create a netCDF file with explicit time units
        # and make sure it makes it into the encodings
        # and survives a round trip
        with create_tmp_file() as tmp_file:
            with nc4.Dataset(tmp_file, 'w') as ds:
                ds.createDimension('time', size=10)
                ds.createVariable('time', np.int32, dimensions=('time', ))
                units = 'days since 1999-01-01'
                ds.variables['time'].setncattr('units', units)
                ds.variables['time'][:] = np.arange(10) + 4

            expected = Dataset()

            time = pd.date_range('1999-01-05', periods=10)
            encoding = {'units': units, 'dtype': np.dtype('int32')}
            expected['time'] = ('time', time, {}, encoding)

            with open_dataset(tmp_file) as actual:
                self.assertVariableEqual(actual['time'], expected['time'])
                actual_encoding = dict(
                    (k, v) for k, v in iteritems(actual['time'].encoding)
                    if k in expected['time'].encoding)
                self.assertDictEqual(actual_encoding,
                                     expected['time'].encoding)
Example 4
def rectify_dim_order(dataset):
    # return a new dataset with all variable dimensions transposed into
    # the order in which they are found in `data`
    return Dataset(dict((k, v.transpose(*data[k].dims))
                        for k, v in iteritems(dataset.data_vars)),
                   dataset.coords,
                   attrs=dataset.attrs)
Example 5
def null_wrap(ds):
    """
    Given a data store this wraps each variable in a NullWrapper so that
    it appears to be out of memory.
    """
    variables = dict((k, Variable(v.dims, NullWrapper(v.values), v.attrs))
                     for k, v in iteritems(ds))
    return InMemoryDataStore(variables=variables, attributes=ds.attrs)
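
NullWrapper above is a test helper; a minimal stand-in consistent with this usage might look like the following sketch. This is a hypothetical reconstruction for illustration, not the actual class from xarray's test suite:

import numpy as np

class NullWrapper(object):
    """Wrap an ndarray so it appears not to be loaded in memory yet.

    Hypothetical minimal version for illustration only; the real helper
    in xarray's test suite may expose a richer lazy-array interface.
    """
    def __init__(self, array):
        self.array = np.asarray(array)

    # expose just enough of the ndarray interface for Variable to accept it
    @property
    def dtype(self):
        return self.array.dtype

    @property
    def shape(self):
        return self.array.shape

    @property
    def ndim(self):
        return self.array.ndim

    def __getitem__(self, key):
        # data is only materialized on explicit indexing
        return self.array[key]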
Example 6
def rectify_dim_order(dataset):
    # return a new dataset with all variable dimensions transposed into
    # the order in which they are found in `data`
    return Dataset(
        dict((k, v.transpose(*data[k].dims)) for k, v in iteritems(dataset.data_vars)),
        dataset.coords,
        attrs=dataset.attrs,
    )
Example 7
    def test_compression_encoding(self):
        data = create_test_data()
        data['var2'].encoding.update({'zlib': True,
                                      'chunksizes': (5, 5),
                                      'fletcher32': True,
                                      'original_shape': data.var2.shape})
        with self.roundtrip(data) as actual:
            for k, v in iteritems(data['var2'].encoding):
                self.assertEqual(v, actual['var2'].encoding[k])

        # regression test for #156
        expected = data.isel(dim1=0)
        with self.roundtrip(expected) as actual:
            self.assertDatasetEqual(expected, actual)
Example 8
    def test_compression_encoding(self):
        data = create_test_data()
        data['var2'].encoding.update({
            'zlib': True,
            'chunksizes': (5, 5),
            'fletcher32': True,
            'original_shape': data.var2.shape
        })
        with self.roundtrip(data) as actual:
            for k, v in iteritems(data['var2'].encoding):
                self.assertEqual(v, actual['var2'].encoding[k])

        # regression test for #156
        expected = data.isel(dim1=0)
        with self.roundtrip(expected) as actual:
            self.assertDatasetEqual(expected, actual)
Example 9
def drop(self, labels, dim=None, inplace=False):
    """Drop variables or index labels from this dataset. Based on xarray.dataset.drop, but adds inplace option.

    Parameters
    ----------
    labels : scalar or list of scalars
        Name(s) of variables or index labels to drop.
    dim : None or str, optional
        Dimension along which to drop index labels. By default (if
        ``dim is None``), drops variables rather than index labels.
    inplace : bool, optional
        If True, modify the original dataset; otherwise create a new one.

    Returns
    -------
    dropped : Dataset (self if inplace=True)
    """
    if utils.is_scalar(labels):
        labels = [labels]
    if dim is None:
        self._assert_all_in_dataset(labels)
        drop = set(labels)
        variables = OrderedDict(
            (k, v) for k, v in iteritems(self._variables) if k not in drop)
        coord_names = set(k for k in self._coord_names if k in variables)
        result = self._replace_vars_and_dims(variables,
                                             coord_names,
                                             inplace=inplace)
    else:
        try:
            index = self.indexes[dim]
        except KeyError:
            raise ValueError('dimension %r does not have coordinate labels' %
                             dim)
        new_index = index.drop(labels)
        result = self.loc[{dim: new_index}]

    return self if inplace else result
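
A usage sketch for this drop variant, assuming it is called as a plain function on an xarray.Dataset and that its utils, OrderedDict, and iteritems dependencies are in scope (it targets an older xarray that still has _replace_vars_and_dims). Note that the label-dropping branch always builds a new dataset via .loc, so inplace=True only takes effect on the variable-dropping path; names below are illustrative:

import numpy as np
import xarray as xr

ds = xr.Dataset({'pressure': (('y',), np.arange(4.0)),
                 'label': (('y',), list('abcd'))},
                coords={'y': [10, 20, 30, 40]})

# variable path (dim=None): removes 'label' from ds itself
drop(ds, 'label', inplace=True)

# label path (dim given): returns a new dataset without the y == 20 row;
# inplace=True would have no effect on this branch
smaller = drop(ds, 20, dim='y')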
Example 10
    def test_open_encodings(self):
        # Create a netCDF file with explicit time units
        # and make sure it makes it into the encodings
        # and survives a round trip
        with create_tmp_file() as tmp_file:
            with nc4.Dataset(tmp_file, 'w') as ds:
                ds.createDimension('time', size=10)
                ds.createVariable('time', np.int32, dimensions=('time',))
                units = 'days since 1999-01-01'
                ds.variables['time'].setncattr('units', units)
                ds.variables['time'][:] = np.arange(10) + 4

            expected = Dataset()

            time = pd.date_range('1999-01-05', periods=10)
            encoding = {'units': units, 'dtype': np.dtype('int32')}
            expected['time'] = ('time', time, {}, encoding)

            with open_dataset(tmp_file) as actual:
                self.assertVariableEqual(actual['time'], expected['time'])
                actual_encoding = dict((k, v) for k, v in
                                       iteritems(actual['time'].encoding)
                                       if k in expected['time'].encoding)
                self.assertDictEqual(actual_encoding, expected['time'].encoding)
Example 11
def _dataset_multi_concat(
    datasets,
    dim,
    data_vars,
    coords,
    compat,
    positions,
    join="outer",
):
    """
    Concatenate a sequence of datasets along a dimension, trying concatenation along alternate dimensions when the 
    chosen dimension is not present. This function is based on _dataset_concat from xarray.core.concat.py in xarray 
    0.15. It includes a modification to drop mismatched coordinates from datasets instead of throwing a ValueError. 
    This drop removes the variable from coordinates, but it remains a variable in the dataset.
    """
    # Make sure we're working on a copy (we'll be loading variables)
    datasets = [ds.copy() for ds in datasets]

    # determine what dimensions we will be concatenating over, including the preferred dim and any alternatives when
    # the preferred dim is absent
    dims = _find_concat_dims(datasets, dim)
    dims, coordinates = _calc_concat_dims_coords(dims)

    datasets = align(*datasets, join=join, copy=False, exclude=dims)

    dim_coords, dims_sizes, coord_names, data_names = _parse_datasets(datasets)
    dim_names = set(dim_coords)
    unlabeled_dims = dim_names - coord_names
    both_data_and_coords = coord_names & data_names
    if both_data_and_coords:
        # Instead of throwing a ValueError, make the coordinates match by removing the mismatched coordinate
        for ds in datasets:
            for variable in both_data_and_coords:
                if variable in ds.coords:
                    # This makes the variable no longer a coordinate, but does not remove it from the dataset entirely
                    ds._coord_names.remove(variable)
                    coord_names.discard(variable)

    # we don't want the concat dimensions in the result dataset yet
    for dim in dims:
        dim_coords.pop(dim, None)
        dims_sizes.pop(dim, None)

        # case where concat dimension is a coordinate or data_var but not a dimension
        if (dim in coord_names or dim in data_names) and dim not in dim_names:
            datasets = [ds.expand_dims(dim) for ds in datasets]

    # determine which variables to concatenate
    concat_over, equals, concat_dim_lengths = _calc_concat_over(
        datasets, dims, dim_names, data_vars, coords, compat)

    # determine which variables to merge, and then merge them according to compat
    variables_to_merge = (coord_names | data_names) - concat_over - dim_names

    result_vars = {}
    if variables_to_merge:
        to_merge = {var: [] for var in variables_to_merge}

        for ds in datasets:
            for var in variables_to_merge:
                if var in ds:
                    to_merge[var].append(ds.variables[var])

        for var in variables_to_merge:
            result_vars[var] = unique_variable(var,
                                               to_merge[var],
                                               compat=compat,
                                               equals=equals.get(var, None))

    result_vars.update(dim_coords)

    # assign attrs and encoding from first dataset
    result_attrs = datasets[0].attrs
    result_encoding = datasets[0].encoding

    # check that global attributes are fixed across all datasets if necessary
    for ds in datasets[1:]:
        if compat == "identical" and not utils.dict_equiv(
                ds.attrs, result_attrs):
            raise ValueError("Dataset global attributes not equal.")

    # we've already verified everything is consistent; now, calculate
    # shared dimension sizes so we can expand the necessary variables
    def ensure_common_dims(vars):
        # ensure each variable with the given name shares the same
        # dimensions and the same shape for all of them except along the
        # concat dimension
        common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))

        # find the first concat dimension available in vars
        matching_dims = [x for x in dims if x in common_dims]
        if matching_dims:
            concat_dim = matching_dims[0]
        else:
            # none of the concat dims are present - add the first one
            dim = dims[0]
            common_dims = (dim,) + common_dims
            concat_dim = dim

        for var, dim_len in zip(vars, concat_dim_lengths[concat_dim]):
            if var.dims != common_dims:
                common_shape = tuple(
                    dims_sizes.get(d, dim_len) for d in common_dims)
                var = var.expand_dims(common_dims, common_shape)
            yield var

    # stack up each variable to fill-out the dataset (in order)
    # n.b. this loop preserves variable order, needed for groupby.
    for k in datasets[0].variables:
        if k in concat_over:
            try:
                vars = ensure_common_dims([ds.variables[k] for ds in datasets])
            except KeyError:
                raise ValueError("%r is not present in all datasets." % k)
            # get the dimension to concatenate this variable on - choose first applicable dim from dims
            dim = _get_concat_dim(dims, [ds.variables[k] for ds in datasets])
            combined = concat_vars(vars, dim, positions)
            assert isinstance(combined, Variable)
            result_vars[k] = combined

    result = Dataset(result_vars, attrs=result_attrs)
    absent_coord_names = coord_names - set(result.variables)
    if absent_coord_names:
        raise ValueError(
            "Variables %r are coordinates in some datasets but not others." %
            absent_coord_names)
    # current versions of dataset.set_coords and dataset.drop force a
    # _assert_all_in_dataset check that we don't want; xarray 0.15 can disable
    # it via errors='ignore', but for now just call the underlying logic
    # result = result.set_coords(coord_names, errors='ignore')
    result._coord_names.update(coord_names)
    result.encoding = result_encoding

    # result = result.drop(unlabeled_dims, errors='ignore')
    drop = set(unlabeled_dims)
    variables = OrderedDict(
        (k, v) for k, v in iteritems(result._variables) if k not in drop)
    coord_names = set(k for k in result._coord_names if k in variables)
    result._replace_vars_and_dims(variables, coord_names)

    for coord in coordinates:
        if coord:
            # add the concat dimension last to ensure that it's in the final Dataset
            result[coord.name] = coord

    return result
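
For comparison, the public API call that _dataset_multi_concat generalizes is xarray.concat; plain concat requires the concat dimension to be meaningful for every input, whereas the helper above also falls back to alternate dimensions and drops mismatched coordinates. A runnable sketch with illustrative names:

import numpy as np
import xarray as xr

a = xr.Dataset({'wind': (('time',), np.arange(3.0))}, coords={'time': [0, 1, 2]})
b = xr.Dataset({'wind': (('time',), np.arange(3.0, 6.0))}, coords={'time': [3, 4, 5]})

combined = xr.concat([a, b], dim='time', data_vars='all',
                     coords='different', compat='equals', join='outer')
print(combined.sizes['time'])  # 6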