Example #1
    def _get_chunksizes(self, chunksizes):
        """Return the chunk sizes as an int tuple, if valid.

        We expect a list/tuple of 3 integers.

        :param chunksizes: the raw chunksizes parameter (a list or tuple of
          integers) to be validated.
        :return: the validated chunk sizes as a tuple of integers.
        """
        if not isinstance(chunksizes, (list, tuple, set)):
            raise DatacubeException(
                'Dataset contains invalid chunking values, cannot write to storage.'
            )
        try:
            chunksizes = tuple(map(int, chunksizes))
        except (TypeError, ValueError):  # non-numeric values cannot be coerced to int
            raise DatacubeException(
                'Dataset contains invalid chunking values, cannot write to storage.'
            )
        if not chunksizes:
            raise DatacubeException(
                'Dataset contains invalid chunking values, cannot write to storage.'
            )
        return chunksizes
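
The sketch below illustrates how this validator behaves, assuming you have an instance of the driver class that defines `_get_chunksizes`. The `demo` helper and the exception import location are assumptions for illustration only, not part of the example above.

from datacube.utils import DatacubeException  # assumed location of the exception


def demo(driver):
    # `driver` stands for an instance of the class defining _get_chunksizes.
    # Integer-like sequences are normalised to a tuple of ints.
    print(driver._get_chunksizes([1, 200, 200]))    # -> (1, 200, 200)
    print(driver._get_chunksizes(('1', '2', '3')))  # -> (1, 2, 3)

    # Wrong container types, non-numeric values and empty sequences all raise.
    for bad in ({'t': 1}, ['a', 'b'], []):
        try:
            driver._get_chunksizes(bad)
        except DatacubeException:
            print('rejected:', bad)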
Example #2
def write_dataset_to_netcdf(dataset,
                            filename,
                            global_attributes=None,
                            variable_params=None,
                            netcdfparams=None):
    """
    Write a Data Cube style xarray Dataset to a NetCDF file

    Requires a spatial Dataset, with attached coordinates and global crs attribute.

    :param `xarray.Dataset` dataset: The dataset to write to file
    :param filename: Output filename
    :param global_attributes: Global file attributes. dict of attr_name: attr_value
    :param variable_params: dict of variable_name: {param_name: param_value, [...]}
                            Allows setting storage and compression options per variable.
                            See `netCDF4.Dataset.createVariable` for the
                            available parameters.
    :param netcdfparams: Optional params affecting netCDF file creation
    """
    global_attributes = global_attributes or {}
    variable_params = variable_params or {}
    filename = Path(filename)

    if not dataset.data_vars.keys():
        raise DatacubeException('Cannot save empty dataset to disk.')

    if dataset.geobox is None:
        raise DatacubeException(
            'Dataset geobox property is None, cannot write to NetCDF file.')

    if dataset.geobox.crs is None:
        raise DatacubeException(
            'Dataset geobox.crs property is None, cannot write to NetCDF file.'
        )

    nco = create_netcdf_storage_unit(filename, dataset.geobox.crs,
                                     dataset.coords, dataset.data_vars,
                                     variable_params, global_attributes,
                                     netcdfparams)

    for name, variable in dataset.data_vars.items():
        nco[name][:] = netcdf_writer.netcdfy_data(variable.values)

    nco.close()
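
A minimal usage sketch for write_dataset_to_netcdf follows. It assumes a configured Data Cube index; the product name, the query extents and the import path are illustrative only (the function has moved between modules across datacube versions).

import datacube
from datacube.drivers.netcdf import write_dataset_to_netcdf  # path may differ per version

dc = datacube.Datacube()
dataset = dc.load(product='ls8_nbar_albers',          # hypothetical product name
                  x=(148.0, 148.2), y=(-35.4, -35.2),
                  time=('2018-01-01', '2018-02-01'))

# Per-variable options are passed straight through to netCDF4.Dataset.createVariable.
variable_params = {name: {'zlib': True, 'complevel': 4} for name in dataset.data_vars}

write_dataset_to_netcdf(dataset,
                        'example_output.nc',
                        global_attributes={'title': 'Example export'},
                        variable_params=variable_params)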
Example #3
    def write_dataset_to_storage(self,
                                 dataset,
                                 filename,
                                 global_attributes=None,
                                 variable_params=None,
                                 storage_config=None,
                                 **kwargs):
        """See :meth:`datacube.drivers.driver.write_dataset_to_storage`

        :param `xarray.Dataset` dataset:
        :param filename: Output filename
        :param global_attributes: Global file attributes. dict of attr_name: attr_value
        :param variable_params: dict of variable_name: {param_name: param_value, [...]}

        :return: Dictionary of metadata recording the s3 storage information;
          this is required for indexing in particular.

        """
        if storage_config is None:
            storage_config = {}

        # TODO: handle missing variable params
        if variable_params is None:
            raise DatacubeException(
                'Missing configuration parameters, cannot write to storage.')
        filename = Path(filename)
        if not dataset.data_vars.keys():
            raise DatacubeException('Cannot save empty dataset to storage.')

        if not hasattr(dataset, 'crs'):
            raise DatacubeException(
                'Dataset does not contain CRS, cannot write to storage.')

        if 'bucket' not in storage_config:
            raise DatacubeException(
                'Expect `bucket` to be set in the storage config')

        bucket = storage_config['bucket']

        # TODO: Should write all data variables to disk, not just configured variables
        outputs = {}
        for band, param in variable_params.items():
            output = {}
            # TODO: Should not assume presence of any kind of parameter
            if 'chunksizes' not in param:
                raise DatacubeException(
                    'Missing `chunksizes` parameter, cannot write to storage.')
            output['chunk_size'] = self._get_chunksizes(param['chunksizes'])
            output['bucket'] = bucket
            self.storage.filepath = bucket  # For the s3_test driver only TODO: is this still needed?
            output['base_name'] = '%s_%s' % (filename.stem, band)
            key_maps = self.storage.put_array_in_s3(dataset[band].values,
                                                    output['chunk_size'],
                                                    output['base_name'],
                                                    output['bucket'], True)
            output['key_maps'] = [{
                's3_key': s3_key,
                'chunk': chunk,
                'chunk_id': chunk_id,
                'compression': None,
                'index_min': self._get_index(chunk, dataset[band].coords,
                                             dataset[band].dims, 'min'),
                'index_max': self._get_index(chunk, dataset[band].coords,
                                             dataset[band].dims, 'max'),
            } for (s3_key, chunk, chunk_id) in key_maps]
            output['dimensions'] = dataset[band].dims
            output['macro_shape'] = dataset[band].shape
            output['numpy_type'] = dataset[band].dtype.str
            (output['regular_dims'], output['regular_index'],
             output['irregular_index']) = self.get_reg_irreg_indices(
                 dataset[band].coords)

            self.logger.info(
                'Wrote %d chunks of size %s to s3 bucket: %s, base_name: %s',
                len(output['key_maps']), output['chunk_size'],
                output['bucket'], output['base_name'])
            outputs[band] = output
        return outputs
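
The following sketch summarises the inputs this method expects and the shape of the metadata it returns, based on the code above. `driver` and `dataset` are assumptions: an instance of the s3 driver and a spatial xarray.Dataset with a crs attribute, for example as produced by dc.load().

def demo(driver, dataset):
    # One entry per band to write; `chunksizes` is required by the driver.
    variable_params = {band: {'chunksizes': (1, 200, 200)}
                       for band in dataset.data_vars}
    storage_config = {'bucket': 'my-datacube-bucket'}  # hypothetical bucket name

    outputs = driver.write_dataset_to_storage(dataset,
                                              'ls8_example.nc',  # stem becomes the base_name prefix
                                              variable_params=variable_params,
                                              storage_config=storage_config)

    # For each band the returned metadata includes, among other fields:
    #   outputs[band]['chunk_size']   -> (1, 200, 200)
    #   outputs[band]['base_name']    -> 'ls8_example_<band>'
    #   outputs[band]['key_maps']     -> one record per chunk written to s3
    #   outputs[band]['macro_shape']  -> dataset[band].shape
    return outputs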