Example #1
def adjust_spatial_attrs(ds: xr.Dataset, allow_point: bool = False) -> xr.Dataset:
    """
    Adjust the global spatial attributes of the dataset by introspecting the
    dataset and setting the appropriate attributes accordingly.

    If a determined attribute does not yet exist in the dataset, it is added.

    For more information on suggested global attributes see
    `Attribute Convention for Data Discovery <http://wiki.esipfed.org/index.php/Attribute_Convention_for_Data_Discovery>`_

    :param ds: Dataset to adjust
    :param allow_point: Whether a dataset containing a single point is allowed
    :return: Adjusted dataset
    """
    return adjust_spatial_attrs_impl(ds, allow_point=allow_point)
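
A minimal usage sketch (not from the project; the synthetic dataset and coordinate values are made up). It assumes adjust_spatial_attrs_impl can introspect ordinary lat/lon coordinate variables; the geospatial_* attribute names are the ones the _make_local examples below read back:

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'sst': (('lat', 'lon'), np.zeros((4, 8)))},
    coords={'lat': np.linspace(-43.75, 43.75, 4),
            'lon': np.linspace(-168.75, 168.75, 8)})

ds = adjust_spatial_attrs(ds)

# The _make_local examples below read these ACDD attributes back:
for attr in ('geospatial_lat_min', 'geospatial_lat_max',
             'geospatial_lon_min', 'geospatial_lon_max'):
    print(attr, ds.attrs.get(attr))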
Example #2
File: local.py Project: Evadzi/cate
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = compression_level > 0

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.id, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path,
                                                   local_relative_filepath)

            remote_absolute_filepath = os.path.join(
                self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                if not time_range or (time_coverage_start >= time_range[0]
                                      and time_coverage_end <= time_range[1]):
                    if region or var_names:

                        do_update_of_variables_meta_info_once = True
                        do_update_of_region_meta_info_once = True

                        remote_dataset = None
                        try:
                            remote_dataset = xr.open_dataset(remote_absolute_filepath)

                            if var_names:
                                remote_dataset = remote_dataset.drop(
                                    [var_name for var_name in remote_dataset.data_vars.keys()
                                     if var_name not in var_names])

                            if region:
                                remote_dataset = normalize_impl(remote_dataset)
                                remote_dataset = adjust_spatial_attrs_impl(
                                    subset_spatial_impl(remote_dataset, region),
                                    allow_point=False)

                                if do_update_of_region_meta_info_once:
                                    # geospatial_* attrs were set by adjust_spatial_attrs_impl above
                                    local_ds.meta_info['bbox_maxx'] = remote_dataset.attrs['geospatial_lon_max']
                                    local_ds.meta_info['bbox_minx'] = remote_dataset.attrs['geospatial_lon_min']
                                    local_ds.meta_info['bbox_maxy'] = remote_dataset.attrs['geospatial_lat_max']
                                    local_ds.meta_info['bbox_miny'] = remote_dataset.attrs['geospatial_lat_min']
                                    do_update_of_region_meta_info_once = False

                            if compression_enabled:
                                for sel_var_name in remote_dataset.variables.keys():
                                    remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                            remote_dataset.to_netcdf(local_absolute_filepath)

                            child_monitor.progress(work=1, msg=str(time_coverage_start))
                        finally:
                            if do_update_of_variables_meta_info_once and remote_dataset is not None:
                                variables_info = local_ds.meta_info.get('variables', [])
                                local_ds.meta_info['variables'] = [
                                    var_info for var_info in variables_info
                                    if var_info.get('name') in remote_dataset.variables.keys()
                                    and var_info.get('name') not in remote_dataset.dims.keys()]
                                do_update_of_variables_meta_info_once = False

                            local_ds.add_dataset(
                                os.path.join(local_id, file_name),
                                (time_coverage_start, time_coverage_end))

                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath,
                                    local_absolute_filepath)
                        local_ds.add_dataset(
                            local_relative_filepath,
                            (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id
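
An aside on the compression pattern above: xarray honours per-variable encoding when writing netCDF, so setting 'zlib' and 'complevel' in each variable's .encoding makes to_netcdf() produce deflate-compressed netCDF-4 output. A standalone sketch (data and file name are made up):

import numpy as np
import xarray as xr

ds = xr.Dataset({'sst': (('lat', 'lon'), np.random.rand(180, 360))})
for var in ds.variables.values():
    # Same keys as encoding_update above; complevel 5 is an arbitrary choice.
    var.encoding.update({'zlib': True, 'complevel': 5})
ds.to_netcdf('sst_compressed.nc')  # hypothetical output path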
Example #3
    def _make_local(self,
                    local_ds: LocalDataSource,
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id
        time_range = TimeRangeLike.convert(time_range)
        var_names = VarNamesLike.convert(var_names)

        excluded_variables = get_exclude_variables_fix_known_issues(self.id)

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
        compression_enabled = compression_level > 0

        do_update_of_verified_time_coverage_start_once = True
        verified_time_coverage_start = None
        verified_time_coverage_end = None

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({'zlib': True, 'complevel': compression_level})

        if region or var_names:
            protocol = _ODP_PROTOCOL_OPENDAP
        else:
            protocol = _ODP_PROTOCOL_HTTP

        local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        selected_file_list = self._find_files(time_range)
        if not selected_file_list:
            msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(self.id)
            if time_range is not None:
                msg += ' in given time range {}'.format(TimeRangeLike.format(time_range))
            raise DataAccessError(msg)
        try:
            if protocol == _ODP_PROTOCOL_OPENDAP:

                do_update_of_variables_meta_info_once = True
                do_update_of_region_meta_info_once = True

                files = self._get_urls_list(selected_file_list, protocol)
                monitor.start('Sync ' + self.id, total_work=len(files))
                for idx, dataset_uri in enumerate(files):
                    child_monitor = monitor.child(work=1)

                    file_name = os.path.basename(dataset_uri)
                    local_filepath = os.path.join(local_path, file_name)

                    time_coverage_start = selected_file_list[idx][1]
                    time_coverage_end = selected_file_list[idx][2]

                    child_monitor.start(label=file_name, total_work=1)

                    remote_dataset = xr.open_dataset(
                        dataset_uri,
                        drop_variables=[variable.get('name') for variable in excluded_variables])
                    if var_names:
                        remote_dataset = remote_dataset.drop([var_name for var_name in remote_dataset.data_vars.keys()
                                                              if var_name not in var_names])

                    if region:
                        remote_dataset = normalize_impl(remote_dataset)
                        remote_dataset = adjust_spatial_attrs_impl(subset_spatial_impl(remote_dataset, region),
                                                                   allow_point=False)

                        if do_update_of_region_meta_info_once:
                            local_ds.meta_info['bbox_minx'] = remote_dataset.attrs['geospatial_lon_min']
                            local_ds.meta_info['bbox_maxx'] = remote_dataset.attrs['geospatial_lon_max']
                            local_ds.meta_info['bbox_maxy'] = remote_dataset.attrs['geospatial_lat_max']
                            local_ds.meta_info['bbox_miny'] = remote_dataset.attrs['geospatial_lat_min']
                            do_update_of_region_meta_info_once = False

                    if compression_enabled:
                        for sel_var_name in remote_dataset.variables.keys():
                            remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                    remote_dataset.to_netcdf(local_filepath)

                    child_monitor.progress(work=1, msg=str(time_coverage_start))

                    if do_update_of_variables_meta_info_once:
                        variables_info = local_ds.meta_info.get('variables', [])
                        local_ds.meta_info['variables'] = [
                            var_info for var_info in variables_info
                            if var_info.get('name') in remote_dataset.variables.keys()
                            and var_info.get('name') not in remote_dataset.dims.keys()]
                        do_update_of_variables_meta_info_once = False

                    local_ds.add_dataset(os.path.join(local_id, file_name),
                                         (time_coverage_start, time_coverage_end))

                    if do_update_of_verified_time_coverage_start_once:
                        verified_time_coverage_start = time_coverage_start
                        do_update_of_verified_time_coverage_start_once = False
                    verified_time_coverage_end = time_coverage_end
                    child_monitor.done()
            else:
                outdated_file_list = []
                for file_rec in selected_file_list:
                    filename, _, _, file_size, url = file_rec
                    dataset_file = os.path.join(local_path, filename)
                    # TODO (forman, 20160915): perform better checks on whether
                    #   dataset_file is outdated, incomplete, or corrupted.
                    #   The JSON also includes "checksum" and "checksum_type"
                    #   fields (see the checksum sketch after this example).
                    if not os.path.isfile(dataset_file) or (file_size and os.path.getsize(dataset_file) != file_size):
                        outdated_file_list.append(file_rec)

                if outdated_file_list:
                    with monitor.starting('Sync ' + self.id, len(outdated_file_list)):
                        bytes_to_download = sum([file_rec[3] for file_rec in outdated_file_list])
                        dl_stat = _DownloadStatistics(bytes_to_download)

                        file_number = 1

                        for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                            dataset_file = os.path.join(local_path, filename)
                            sub_monitor = monitor.child(work=1.0)

                            # noinspection PyUnusedLocal
                            def reporthook(block_number, read_size, total_file_size):
                                dl_stat.handle_chunk(read_size)
                                sub_monitor.progress(work=read_size, msg=str(dl_stat))

                            sub_monitor_msg = "file %d of %d" % (file_number, len(outdated_file_list))
                            with sub_monitor.starting(sub_monitor_msg, file_size):
                                urllib.request.urlretrieve(url[protocol], filename=dataset_file, reporthook=reporthook)
                            file_number += 1
                            local_ds.add_dataset(os.path.join(local_id, filename), (coverage_from, coverage_to))

                            if do_update_of_verified_time_coverage_start_once:
                                verified_time_coverage_start = coverage_from
                                do_update_of_verified_time_coverage_start_once = False
                            verified_time_coverage_end = coverage_to
        except (OSError, ValueError) as e:
            raise DataAccessError("Copying remote data source failed: {}".format(e), source=self) from e
        local_ds.meta_info['temporal_coverage_start'] = TimeLike.format(verified_time_coverage_start)
        local_ds.meta_info['temporal_coverage_end'] = TimeLike.format(verified_time_coverage_end)
        local_ds.meta_info['exclude_variables'] = excluded_variables
        local_ds.save(True)
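
The TODO in the HTTP branch notes that each file record's JSON also carries "checksum" and "checksum_type" fields that are not verified yet. A hedged sketch of such a check, assuming checksum_type names a hashlib algorithm (e.g. 'MD5'); the helper is hypothetical, not part of the project:

import hashlib

def file_matches_checksum(path: str, checksum: str, checksum_type: str) -> bool:
    """Return True if the file at `path` hashes to the given hex digest."""
    digest = hashlib.new(checksum_type.lower())  # e.g. 'MD5' -> md5
    with open(path, 'rb') as stream:
        for chunk in iter(lambda: stream.read(1 << 20), b''):  # 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == checksum.lower()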
Example #4
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
        compression_enabled = compression_level > 0

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({'zlib': True, 'complevel': compression_level})

        local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.id, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path, local_relative_filepath)

            remote_absolute_filepath = os.path.join(self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                if not time_range or (time_coverage_start >= time_range[0]
                                      and time_coverage_end <= time_range[1]):
                    if region or var_names:

                        do_update_of_variables_meta_info_once = True
                        do_update_of_region_meta_info_once = True

                        remote_dataset = None
                        try:
                            remote_dataset = xr.open_dataset(remote_absolute_filepath)

                            if var_names:
                                remote_dataset = remote_dataset.drop(
                                    [var_name for var_name in remote_dataset.data_vars.keys()
                                     if var_name not in var_names])

                            if region:
                                remote_dataset = normalize_impl(remote_dataset)
                                remote_dataset = adjust_spatial_attrs_impl(subset_spatial_impl(remote_dataset, region),
                                                                           allow_point=False)

                                if do_update_of_region_meta_info_once:
                                    # geospatial_* attrs were set by adjust_spatial_attrs_impl above
                                    local_ds.meta_info['bbox_maxx'] = remote_dataset.attrs['geospatial_lon_max']
                                    local_ds.meta_info['bbox_minx'] = remote_dataset.attrs['geospatial_lon_min']
                                    local_ds.meta_info['bbox_maxy'] = remote_dataset.attrs['geospatial_lat_max']
                                    local_ds.meta_info['bbox_miny'] = remote_dataset.attrs['geospatial_lat_min']
                                    do_update_of_region_meta_info_once = False

                            if compression_enabled:
                                for sel_var_name in remote_dataset.variables.keys():
                                    remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                            remote_dataset.to_netcdf(local_absolute_filepath)

                            child_monitor.progress(work=1, msg=str(time_coverage_start))
                        finally:
                            if do_update_of_variables_meta_info_once and remote_dataset is not None:
                                variables_info = local_ds.meta_info.get('variables', [])
                                local_ds.meta_info['variables'] = [
                                    var_info for var_info in variables_info
                                    if var_info.get('name') in remote_dataset.variables.keys()
                                    and var_info.get('name') not in remote_dataset.dims.keys()]
                                # noinspection PyUnusedLocal
                                do_update_of_variables_meta_info_once = False

                            local_ds.add_dataset(os.path.join(local_id, file_name),
                                                 (time_coverage_start, time_coverage_end))

                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath, local_absolute_filepath)
                        local_ds.add_dataset(local_relative_filepath, (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id
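
Both _make_local variants drive progress reporting through the same monitor pattern: the parent monitor is started with the total work, each file gets a child monitor for one unit of that work, and both are closed with done(). A hedged recap (the import path is an assumption, and Monitor.NONE, the default in the signatures above, is taken to be a no-op monitor):

from cate.util.monitor import Monitor

def sync_files(files, monitor: Monitor = Monitor.NONE):
    monitor.start('Sync', total_work=len(files))
    for name in files:
        child_monitor = monitor.child(work=1)  # one unit of the parent's work
        # ... copy or subset a single file here ...
        child_monitor.progress(work=1, msg=name)
        child_monitor.done()
    monitor.done()

sync_files(['a.nc', 'b.nc'])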