def _get_chunksizes(self, chunksizes):
    """Return the chunk sizes as an int tuple, if valid.

    We expect a list/tuple of 3 integers; only the element types and
    non-emptiness are validated here.

    :param chunksizes: the raw chunksizes parameter, to be validated.
    :return tuple chunksizes: the validated chunksizes as a tuple of integers.
    """
    error = 'Dataset contains invalid chunking values, cannot write to storage.'
    if not isinstance(chunksizes, (list, tuple, set)):
        raise DatacubeException(error)
    try:
        chunksizes = tuple(map(int, chunksizes))
    except (TypeError, ValueError):
        # Elements that cannot be coerced to int (e.g. None or non-numeric strings)
        raise DatacubeException(error)
    if not chunksizes:
        raise DatacubeException(error)
    return chunksizes
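
# Usage sketch for _get_chunksizes (illustrative only; `driver` is a hypothetical
# instance of this driver class, not defined in this module):
#
#     driver._get_chunksizes([5, 200, 200])    # -> (5, 200, 200)
#     driver._get_chunksizes((1, '200', 200))  # -> (1, 200, 200), elements coerced via int()
#     driver._get_chunksizes('5,200,200')      # raises DatacubeException (not a list/tuple/set)
#     driver._get_chunksizes([])               # raises DatacubeException (empty)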
def write_dataset_to_netcdf(dataset, filename, global_attributes=None, variable_params=None,
                            netcdfparams=None):
    """
    Write a Data Cube style xarray Dataset to a NetCDF file.

    Requires a spatial Dataset, with attached coordinates and a global crs attribute.

    :param `xarray.Dataset` dataset:
    :param filename: Output filename
    :param global_attributes: Global file attributes. dict of attr_name: attr_value
    :param variable_params: dict of variable_name: {param_name: param_value, [...]}
        Allows setting storage and compression options per variable.
        See `netCDF4.Dataset.createVariable` for available parameters.
    :param netcdfparams: Optional params affecting netCDF file creation
    """
    global_attributes = global_attributes or {}
    variable_params = variable_params or {}
    filename = Path(filename)

    if not dataset.data_vars.keys():
        raise DatacubeException('Cannot save empty dataset to disk.')

    if dataset.geobox is None:
        raise DatacubeException('Dataset geobox property is None, cannot write to NetCDF file.')

    if dataset.geobox.crs is None:
        raise DatacubeException('Dataset geobox.crs property is None, cannot write to NetCDF file.')

    nco = create_netcdf_storage_unit(filename,
                                     dataset.geobox.crs,
                                     dataset.coords,
                                     dataset.data_vars,
                                     variable_params,
                                     global_attributes,
                                     netcdfparams)

    for name, variable in dataset.data_vars.items():
        nco[name][:] = netcdf_writer.netcdfy_data(variable.values)

    nco.close()
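
# Usage sketch for write_dataset_to_netcdf (illustrative only; the product and measurement
# names below are hypothetical, and `dc.load` is assumed to return a spatial xarray.Dataset
# with a geobox and crs attached):
#
#     import datacube
#
#     dc = datacube.Datacube(app='netcdf-export')
#     ds = dc.load(product='ls8_nbar_albers', measurements=['red', 'nir'],
#                  x=(149.0, 149.1), y=(-35.3, -35.2))
#     write_dataset_to_netcdf(ds, 'ls8_subset.nc',
#                             global_attributes={'title': 'LS8 NBAR subset'},
#                             variable_params={'red': {'zlib': True, 'chunksizes': (5, 200, 200)},
#                                              'nir': {'zlib': True, 'chunksizes': (5, 200, 200)}})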
def write_dataset_to_storage(self, dataset, filename,
                             global_attributes=None,
                             variable_params=None,
                             storage_config=None,
                             **kwargs):
    """See :meth:`datacube.drivers.driver.write_dataset_to_storage`

    :param `xarray.Dataset` dataset:
    :param filename: Output filename
    :param global_attributes: Global file attributes. dict of attr_name: attr_value
    :param variable_params: dict of variable_name: {param_name: param_value, [...]}
    :return: Dictionary of metadata describing the s3 storage information. This is
        required for indexing in particular.
    """
    if storage_config is None:
        storage_config = {}

    # TODO: handle missing variable params
    if variable_params is None:
        raise DatacubeException('Missing configuration parameters, cannot write to storage.')

    filename = Path(filename)

    if not dataset.data_vars.keys():
        raise DatacubeException('Cannot save empty dataset to storage.')

    if not hasattr(dataset, 'crs'):
        raise DatacubeException('Dataset does not contain CRS, cannot write to storage.')

    if 'bucket' not in storage_config:
        raise DatacubeException('Expect `bucket` to be set in the storage config')
    bucket = storage_config['bucket']

    # TODO: Should write all data variables to disk, not just configured variables
    outputs = {}
    for band, param in variable_params.items():
        output = {}

        # TODO: Should not assume presence of any kind of parameter
        if 'chunksizes' not in param:
            raise DatacubeException('Missing `chunksizes` parameter, cannot write to storage.')
        output['chunk_size'] = self._get_chunksizes(param['chunksizes'])

        output['bucket'] = bucket
        self.storage.filepath = bucket  # For the s3_test driver only. TODO: is this still needed?

        output['base_name'] = '%s_%s' % (filename.stem, band)
        key_maps = self.storage.put_array_in_s3(dataset[band].values,
                                                output['chunk_size'],
                                                output['base_name'],
                                                output['bucket'],
                                                True)
        output['key_maps'] = [{'s3_key': s3_key,
                               'chunk': chunk,
                               'chunk_id': chunk_id,
                               'compression': None,
                               'index_min': self._get_index(chunk, dataset[band].coords,
                                                            dataset[band].dims, 'min'),
                               'index_max': self._get_index(chunk, dataset[band].coords,
                                                            dataset[band].dims, 'max')}
                              for (s3_key, chunk, chunk_id) in key_maps]
        output['dimensions'] = dataset[band].dims
        output['macro_shape'] = dataset[band].shape
        output['numpy_type'] = dataset[band].dtype.str
        (output['regular_dims'],
         output['regular_index'],
         output['irregular_index']) = self.get_reg_irreg_indices(dataset[band].coords)

        self.logger.info('Wrote %d chunks of size %s to s3 bucket: %s, base_name: %s',
                         len(output['key_maps']), output['chunk_size'],
                         output['bucket'], output['base_name'])
        outputs[band] = output

    return outputs
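
# Usage sketch for write_dataset_to_storage (illustrative only; `s3_driver` is a hypothetical
# instance of this driver, and the dataset, band and bucket names are placeholders):
#
#     metadata = s3_driver.write_dataset_to_storage(
#         ds, 'ls8_nbar_2017_tile',
#         variable_params={'red': {'chunksizes': (1, 200, 200)},
#                          'nir': {'chunksizes': (1, 200, 200)}},
#         storage_config={'bucket': 'my-datacube-bucket'})
#
#     # metadata['red']['key_maps'] then lists the S3 keys written for the 'red' band,
#     # each with its chunk slice, chunk id and min/max index bounds used for indexing.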