Example #1
def data_frame_subset(gdf: gpd.GeoDataFrame,
                      region_op: str = 'intersects',
                      region: PolygonLike.TYPE = None,
                      var_names: VarNamesLike.TYPE = None) -> gpd.GeoDataFrame:
    """
    Create a GeoDataFrame subset from given variables (data frame columns) and/or region.

    :param gdf: A GeoDataFrame.
    :param region_op: The geometric operation to be performed if *region* is given.
    :param region: A region polygon used to filter rows.
    :param var_names: The variables (columns) to select.
    :return: A GeoDataFrame subset.
    """

    region = PolygonLike.convert(region)

    var_names = VarNamesLike.convert(var_names)

    if not var_names and not region:
        return gdf

    if var_names:
        if 'geometry' not in var_names:
            var_names = ['geometry'] + var_names
        gdf = gdf[var_names]

    if region and region_op:
        geom_str = PolygonLike.format(region)
        gdf = data_frame_query(gdf, f'@{region_op}("{geom_str}")')

    return gdf
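A minimal usage sketch for the operation above. The import path of data_frame_subset is an assumption, and the input GeoDataFrame is made up for illustration; it is not part of the original source.

# Usage sketch (illustrative only; adjust the assumed import path as needed).
import geopandas as gpd
from shapely.geometry import Point

from cate.ops.data_frame import data_frame_subset  # assumed module path

# A tiny GeoDataFrame with two attribute columns and point geometries.
gdf = gpd.GeoDataFrame(
    {'population': [100, 200, 300], 'name': ['a', 'b', 'c']},
    geometry=[Point(0.5, 0.5), Point(5.0, 5.0), Point(0.2, 0.8)])

# Keep only the 'population' column ('geometry' is re-added automatically)
# and only the rows intersecting the bounding box "0, 0, 1, 1".
subset = data_frame_subset(gdf,
                           region_op='intersects',
                           region='0, 0, 1, 1',
                           var_names=['population'])
print(subset)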
Example #2
def anomaly_internal(ds: xr.Dataset,
                     time_range: TimeRangeLike.TYPE = None,
                     region: PolygonLike.TYPE = None,
                     monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate anomaly using as reference data the mean of an optional region
    and time slice from the given dataset. If no time slice/spatial region is
    given, the operation will calculate anomaly using the mean of the whole
    dataset as the reference.

    This is done for each data array in the dataset.
    :param ds: The dataset to calculate anomalies from
    :param time_range: Time range to use for reference data
    :param region: Spatial region to use for reference data
    :param monitor: a progress monitor.
    :return: The anomaly dataset
    """
    ref = ds.copy()
    if time_range:
        time_range = TimeRangeLike.convert(time_range)
        ref = subset_temporal(ref, time_range)
    if region:
        region = PolygonLike.convert(region)
        ref = subset_spatial(ref, region)
    with monitor.observing("Calculating anomaly"):
        ref = ref.mean(keep_attrs=True, skipna=True)
        diff = ds - ref
    return diff
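A hedged usage sketch for anomaly_internal. The import path and the input file name are assumptions; the constraint strings follow the TimeRangeLike and PolygonLike conventions seen elsewhere in these examples.

# Usage sketch (illustrative only; import path and file name assumed).
import xarray as xr

from cate.ops.anomaly import anomaly_internal  # assumed module path

ds = xr.open_dataset('sst_monthly.nc')  # hypothetical monthly dataset

# Reference mean over 1981-2010 and a tropical Pacific box; the result is the
# dataset minus that reference, computed for each data variable.
anom = anomaly_internal(ds,
                        time_range='1981-01-01,2010-12-31',
                        region='-170, -5, -120, 5')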
Example #3
    def spatial_coverage(self):
        if not self._spatial_coverage and \
                set(self._meta_info.keys()).issuperset({'bbox_minx', 'bbox_miny', 'bbox_maxx', 'bbox_maxy'}):
            self._spatial_coverage = PolygonLike.convert(",".join([
                self._meta_info.get('bbox_minx'),
                self._meta_info.get('bbox_miny'),
                self._meta_info.get('bbox_maxx'),
                self._meta_info.get('bbox_maxy')])
            )
        return self._spatial_coverage
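The property above simply joins four bounding-box strings into "minx,miny,maxx,maxy" and lets PolygonLike.convert turn that into a shapely polygon. A small sketch of that conversion, assuming PolygonLike is importable from cate.core.types:

# Sketch of the bbox-string-to-polygon conversion (import path assumed).
from cate.core.types import PolygonLike

meta_info = {'bbox_minx': '-180.0', 'bbox_miny': '-90.0',
             'bbox_maxx': '180.0', 'bbox_maxy': '90.0'}
poly = PolygonLike.convert(",".join([meta_info['bbox_minx'],
                                     meta_info['bbox_miny'],
                                     meta_info['bbox_maxx'],
                                     meta_info['bbox_maxy']]))
print(poly.bounds)  # -> (-180.0, -90.0, 180.0, 90.0)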
Example #4
def enso(ds: xr.Dataset,
         var: VarName.TYPE,
         file: str,
         region: str = 'N3.4',
         custom_region: PolygonLike.TYPE = None,
         threshold: float = None,
         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Calculate ENSO index, which is defined as a five month running mean of
    anomalies of monthly means of SST data in the given region.

    :param ds: A monthly SST dataset
    :param file: Path to the reference data file e.g. a climatology. A suitable reference dataset
    can be generated using the long_term_average operation
    :param var: Dataset variable to use for index calculation
    :param region: Region for index calculation, the default is Nino3.4
    :param custom_region: If 'custom' is chosen as the 'region', this parameter
    has to be provided to set the desired region.
    :param threshold: If given, boolean El Nino/La Nina time series will be
    calculated and added to the output, according to the given threshold.
    An anomaly larger than the positive value of the threshold indicates
    El Nino, and an anomaly smaller than the negative of the given threshold
    indicates La Nina.
    :param monitor: a progress monitor.
    :return: A data frame that contains the index time series.
    """
    regions = {'N1+2': '-90, -10, -80, 0',
               'N3': '-150, -5, -90, 5',
               'N3.4': '-170, -5, -120, 5',
               'N4': '160, -5, -150, 5',
               'custom': custom_region}
    converted_region = PolygonLike.convert(regions[region])
    if not converted_region:
        raise ValidationError('No region has been provided to ENSO index calculation')

    name = 'ENSO ' + region + ' Index'
    if 'custom' == region:
        name = 'ENSO Index over ' + PolygonLike.format(converted_region)

    return _generic_index_calculation(ds, var, converted_region, 5, file, name, threshold, monitor)
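A hedged usage sketch for the enso operation. The import path, file names and threshold value are assumptions; the reference file is expected to be a climatology such as one produced by the long_term_average operation.

# Usage sketch (illustrative only; import path and file names assumed).
import xarray as xr

from cate.ops.index import enso  # assumed module path

sst = xr.open_dataset('sst_monthly.nc')    # hypothetical monthly SST dataset
enso_df = enso(sst,
               var='sst',
               file='sst_climatology.nc',  # hypothetical reference file
               region='N3.4',              # or region='custom' plus custom_region='...'
               threshold=0.5)
print(enso_df.head())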
Example #5
    def make_local(self,
                   local_name: str,
                   local_id: str = None,
                   time_range: TimeRangeLike.TYPE = None,
                   region: PolygonLike.TYPE = None,
                   var_names: VarNamesLike.TYPE = None,
                   monitor: Monitor = Monitor.NONE) -> Optional[DataSource]:

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(var_names) if var_names else None

        local_store = DATA_STORE_REGISTRY.get_data_store('local')
        if not local_store:
            add_to_data_store_registry()
            local_store = DATA_STORE_REGISTRY.get_data_store('local')
        if not local_store:
            raise ValueError('Cannot initialize `local` DataStore')

        _uuid = LocalDataStore.generate_uuid(ref_id=self.id, time_range=time_range, region=region, var_names=var_names)

        if not local_name or len(local_name) == 0:
            local_name = "local.{}.{}".format(self.id, _uuid)
            existing_ds_list = local_store.query(ds_id=local_name)
            if len(existing_ds_list) == 1:
                return existing_ds_list[0]
        else:
            existing_ds_list = local_store.query(ds_id='local.%s' % local_name)
            if len(existing_ds_list) == 1:
                if existing_ds_list[0].meta_info.get('uuid', None) == _uuid:
                    return existing_ds_list[0]
                else:
                    raise ValueError('Datastore {} already contains dataset {}'.format(local_store.id, local_name))

        local_meta_info = self.meta_info.copy()
        local_meta_info['ref_uuid'] = local_meta_info.get('uuid', None)
        local_meta_info['uuid'] = _uuid

        local_ds = local_store.create_data_source(local_name, region, local_name,
                                                  time_range=time_range, var_names=var_names,
                                                  meta_info=self.meta_info.copy())
        if local_ds:
            if not local_ds.is_complete:
                self._make_local(local_ds, time_range, region, var_names, monitor=monitor)

            if local_ds.is_empty:
                local_store.remove_data_source(local_ds)
                return None

            local_store.register_ds(local_ds)
            return local_ds
        return None
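A hedged usage sketch for make_local. Here ds_source stands for any remote DataSource obtained from a data store query, and a configured 'local' data store is assumed; all names and constraint values are placeholders.

# Usage sketch (illustrative only; ds_source is a hypothetical remote DataSource).
local_ds = ds_source.make_local('sm_subset',
                                time_range='2001-01-01,2001-12-31',
                                region='10, 20, 30, 40',
                                var_names=['sm'])
if local_ds is not None:
    print(local_ds.id)  # expected to look like 'local.sm_subset'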
Example #6
    def generate_uuid(cls, ref_id: str,
                      time_range: Optional[TimeRange] = None,
                      region: Optional[shapely.geometry.Polygon] = None,
                      var_names: Optional[VarNames] = None) -> str:

        if time_range:
            ref_id += TimeRangeLike.format(time_range)
        if region:
            ref_id += PolygonLike.format(region)
        if var_names:
            ref_id += VarNamesLike.format(var_names)

        return str(uuid.uuid3(_NAMESPACE, ref_id))
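The id generated above is deterministic: the reference id plus the formatted constraints form a name string that uuid3 hashes against a fixed namespace, so the same subset request always maps to the same UUID. A toy sketch of that idea; the real _NAMESPACE value is not shown in these examples, so a stand-in is used.

# Toy sketch of deterministic id generation; _NAMESPACE is a stand-in here.
import uuid

_NAMESPACE = uuid.uuid5(uuid.NAMESPACE_DNS, 'example.local')  # hypothetical namespace

name = 'esacci.SOILMOISTURE.day.L3S' + '2001-01-01,2001-12-31' + 'sm'
print(uuid.uuid3(_NAMESPACE, name))  # identical inputs always yield the same UUID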
Example #7
    def generate_title(cls, title: str,
                       time_range: Optional[TimeRange] = None,
                       region: Optional[shapely.geometry.Polygon] = None,
                       var_names: Optional[VarNames] = None) -> str:

        if time_range:
            title += " [TimeRange:{}]".format(TimeRangeLike.format(time_range))
        if region:
            title += " [Region:{}]".format(PolygonLike.format(region))
        if var_names:
            title += " [Variables:{}]".format(VarNamesLike.format(var_names))

        return title
Example #8
    def test_convert(self):
        self.assertEqual(PolygonLike.convert(None), None)
        self.assertEqual(PolygonLike.convert(''), None)
        coords = [(10.4, 20.2), (30.8, 20.2), (30.8, 40.8), (10.4, 40.8)]
        self.assertTrue(PolygonLike.convert(coords).equals(Polygon(coords)))
        self.assertTrue(PolygonLike.convert([10.4, 20.2, 30.8, 40.8]).equals(Polygon(coords)))

        with self.assertRaises(ValidationError) as err:
            PolygonLike.convert('aaa')
        self.assertEqual(str(err.exception),
                         "Value cannot be converted into a 'PolygonLike': "
                         "Invalid geometry WKT format.")
Example #9
    def __init__(self,
                 ds_id: str,
                 files: Union[Sequence[str], OrderedDict],
                 data_store: 'LocalDataStore',
                 temporal_coverage: TimeRangeLike.TYPE = None,
                 spatial_coverage: PolygonLike.TYPE = None,
                 variables: VarNamesLike.TYPE = None,
                 meta_info: dict = None,
                 status: DataSourceStatus = None):
        self._id = ds_id
        if isinstance(files, Sequence):
            self._files = OrderedDict.fromkeys(files)
        else:
            self._files = files
        self._data_store = data_store

        initial_temporal_coverage = TimeRangeLike.convert(temporal_coverage) if temporal_coverage else None
        if not initial_temporal_coverage:
            files_number = len(self._files.items())
            if files_number > 0:
                files_range = list(self._files.values())
                if files_range:
                    if isinstance(files_range[0], Tuple):
                        initial_temporal_coverage = TimeRangeLike.convert(tuple([files_range[0][0],
                                                                                 files_range[files_number - 1][1]]))
                    elif isinstance(files_range[0], datetime):
                        initial_temporal_coverage = TimeRangeLike.convert((files_range[0],
                                                                           files_range[files_number - 1]))

        self._temporal_coverage = initial_temporal_coverage
        self._spatial_coverage = PolygonLike.convert(spatial_coverage) if spatial_coverage else None
        self._variables = VarNamesLike.convert(variables) if variables else []

        self._meta_info = meta_info if meta_info else OrderedDict()

        if self._variables and not self._meta_info.get('variables', None):
            self._meta_info['variables'] = [
                {'name': var_name,
                 'units': '',
                 'long_name': '',
                 'standard_name': ''
                 } for var_name in self._variables]

        self._status = status if status else DataSourceStatus.READY
Example #10
def _generic_index_calculation(ds: xr.Dataset,
                               var: VarName.TYPE,
                               region: PolygonLike.TYPE,
                               window: int,
                               file: str,
                               name: str,
                               threshold: float = None,
                               monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    A generic index calculation, where an index is defined as the anomaly,
    relative to the given reference, of a moving average (with the given
    window size) over the given region of the given variable of the given
    dataset.

    :param ds: Dataset from which to calculate the index
    :param var: Variable from which to calculate index
    :param region: Spatial subset from which to calculate the index
    :param window: Window size for the moving average
    :param file: Path to the reference file
    :param threshold: Absolute threshold that indicates an ENSO event
    :param name: Name of the index
    :param monitor: a progress monitor.
    :return: A data frame that contains the index timeseries
    """
    var = VarName.convert(var)
    region = PolygonLike.convert(region)

    with monitor.starting("Calculate the index", total_work=2):
        ds = select_var(ds, var)
        ds_subset = subset_spatial(ds, region)
        anom = anomaly_external(ds_subset, file, monitor=monitor.child(1))
        with monitor.child(1).observing("Calculate mean"):
            ts = anom.mean(dim=['lat', 'lon'])
        df = pd.DataFrame(data=ts[var].values, columns=[name], index=ts.time)
        retval = df.rolling(window=window, center=True).mean().dropna()

    if threshold is None:
        return retval

    retval['El Nino'] = pd.Series((retval[name] > threshold),
                                  index=retval.index)
    retval['La Nina'] = pd.Series((retval[name] < -threshold),
                                  index=retval.index)
    return retval
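A self-contained toy illustration of the smoothing and thresholding steps used above: a five-month centred moving average of a synthetic anomaly series, with El Nino/La Nina flags derived from a +/-0.5 threshold (the data and threshold are made up).

# Toy illustration of the rolling-mean and threshold logic (synthetic data).
import numpy as np
import pandas as pd

index = pd.date_range('2000-01-01', periods=24, freq='MS')
anom = pd.DataFrame({'ENSO N3.4 Index': np.random.randn(24)}, index=index)

smoothed = anom.rolling(window=5, center=True).mean().dropna()
smoothed['El Nino'] = smoothed['ENSO N3.4 Index'] > 0.5
smoothed['La Nina'] = smoothed['ENSO N3.4 Index'] < -0.5
print(smoothed.head())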
Example #11
    def test_load_datasource_from_json_dict(self):
        test_data = {
            'name': 'local.test_name2',
            "meta_info": {
                "temporal_coverage_start": "2001-01-01T00:00:00",
                "temporal_coverage_end": "2001-01-31T23:59:59",
                "bbox_maxx": "180.0",
                "bbox_maxy": "90.0",
                "bbox_minx": "-180.0",
                "bbox_miny": "-90.0",
                "variables": [
                    {
                        "name": "var_1",
                        "units": "kelvin",
                        "long_name": "var_1 long name..",
                        "standard_name": "std_var_1"
                    },
                    {
                        "name": "var_2",
                        "units": "celsius",
                        "long_name": "var_2 long name..",
                        "standard_name": "std_var_2"
                    }
                ]
            },
            'files': [['file_1', '2002-02-01 00:00:00', '2002-02-01 23:59:59'],
                      ['file_2', '2002-03-01 00:00:00', '2002-03-01 23:59:59']]
        }
        data_source = LocalDataSource.from_json_dict(json_dict=test_data, data_store=self.data_store)
        self.assertIsNotNone(data_source)
        self.assertEqual(data_source.temporal_coverage(),
                         TimeRangeLike.convert("{},{}".format(
                             test_data.get('meta_info').get('temporal_coverage_start'),
                             test_data.get('meta_info').get('temporal_coverage_end')))
                         )
        self.assertEqual(data_source.spatial_coverage(),
                         PolygonLike.convert(",".join([
                             test_data.get('meta_info').get('bbox_minx'),
                             test_data.get('meta_info').get('bbox_miny'),
                             test_data.get('meta_info').get('bbox_maxx'),
                             test_data.get('meta_info').get('bbox_maxy'),
                         ])))
        self.assertListEqual(data_source.variables_info, test_data.get('meta_info').get('variables'))
Example #12
    def test_convert(self):
        self.assertEqual(PolygonLike.convert(None), None)
        self.assertEqual(PolygonLike.convert(''), None)
        coords = [(10.4, 20.2), (30.8, 20.2), (30.8, 40.8), (10.4, 40.8)]
        self.assertTrue(PolygonLike.convert(coords).equals(Polygon(coords)))
        self.assertTrue(PolygonLike.convert([10.4, 20.2, 30.8, 40.8])
                        .equals(Polygon(coords)))

        with self.assertRaises(ValidationError) as err:
            PolygonLike.convert('aaa')
        self.assertEqual(
            str(err.exception),
            "Value cannot be converted into a 'PolygonLike': "
            "Invalid geometry WKT format.")
Example #13
    def open_dataset(self,
                     time_range: TimeRangeLike.TYPE = None,
                     region: PolygonLike.TYPE = None,
                     var_names: VarNamesLike.TYPE = None,
                     protocol: str = None) -> Any:
        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(var_names) if var_names else None

        selected_file_list = self._find_files(time_range)
        if not selected_file_list:
            msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(
                self.id)
            if time_range is not None:
                msg += ' in given time range {}'.format(
                    TimeRangeLike.format(time_range))
            raise DataAccessError(msg)

        files = self._get_urls_list(selected_file_list, _ODP_PROTOCOL_OPENDAP)
        try:
            ds = open_xarray_dataset(files)
            if region:
                ds = normalize_impl(ds)
                ds = subset_spatial_impl(ds, region)
            if var_names:
                ds = ds.drop([
                    var_name for var_name in ds.data_vars.keys()
                    if var_name not in var_names
                ])
            return ds

        except OSError as e:
            if time_range:
                raise DataAccessError(
                    "Cannot open remote dataset for time range {}:\n"
                    "{}".format(TimeRangeLike.format(time_range), e),
                    source=self) from e
            else:
                raise DataAccessError("Cannot open remote dataset:\n"
                                      "{}".format(
                                          TimeRangeLike.format(time_range), e),
                                      source=self) from e
Example #14
    def __init__(self,
                 name: str,
                 files: Union[Sequence[str], OrderedDict],
                 data_store: 'LocalDataStore',
                 temporal_coverage: TimeRangeLike.TYPE = None,
                 spatial_coverage: PolygonLike.TYPE = None,
                 variables: VarNamesLike.TYPE = None,
                 reference_type: str = None,
                 reference_name: str = None):
        self._name = name
        if isinstance(files, Sequence):
            self._files = OrderedDict.fromkeys(files)
        else:
            self._files = files
        self._data_store = data_store

        initial_temporal_coverage = TimeRangeLike.convert(
            temporal_coverage) if temporal_coverage else None
        if not initial_temporal_coverage:
            files_number = len(self._files.items())
            if files_number > 0:
                files_range = list(self._files.values())
                if files_range:
                    if isinstance(files_range[0], Tuple):
                        initial_temporal_coverage = TimeRangeLike.convert(
                            tuple([
                                files_range[0][0],
                                files_range[files_number - 1][1]
                            ]))
                    elif isinstance(files_range[0], datetime):
                        initial_temporal_coverage = TimeRangeLike.convert(
                            (files_range[0], files_range[files_number - 1]))

        self._temporal_coverage = initial_temporal_coverage
        self._spatial_coverage = PolygonLike.convert(
            spatial_coverage) if spatial_coverage else None
        self._variables = VarNamesLike.convert(
            variables) if variables else None

        self._reference_type = reference_type if reference_type else None
        self._reference_name = reference_name
Example #15
def _generic_index_calculation(ds: xr.Dataset,
                               var: VarName.TYPE,
                               region: PolygonLike.TYPE,
                               window: int,
                               file: str,
                               name: str,
                               threshold: float = None) -> pd.DataFrame:
    """
    A generic index calculation, where an index is defined as the anomaly,
    relative to the given reference, of a moving average (with the given
    window size) over the given region of the given variable of the given
    dataset.

    :param ds: Dataset from which to calculate the index
    :param var: Variable from which to calculate index
    :param region: Spatial subset from which to calculate the index
    :param window: Window size for the moving average
    :param file: Path to the reference file
    :param threshold: Absolute threshold that indicates an ENSO event
    :param name: Name of the index
    """
    var = VarName.convert(var)
    region = PolygonLike.convert(region)

    ds = select_var(ds, var)
    ds_subset = subset_spatial(ds, region)
    anom = anomaly_external(ds_subset, file)
    ts = anom.mean(dim=['lat', 'lon'])
    df = pd.DataFrame(data=ts[var].values, columns=[name], index=ts.time)
    retval = df.rolling(window=window, center=True).mean().dropna()

    if threshold is None:
        return retval

    retval['El Nino'] = pd.Series((retval[name] > threshold),
                                  index=retval.index)
    retval['La Nina'] = pd.Series((retval[name] < -threshold),
                                  index=retval.index)
    return retval
Example #16
def anomaly_internal(ds: xr.Dataset,
                     time_range: TimeRangeLike.TYPE = None,
                     region: PolygonLike.TYPE = None) -> xr.Dataset:
    """
    Calculate anomaly using as reference data the mean of an optional region
    and time slice from the given dataset. If no time slice/spatial region is
    given, the operation will calculate anomaly using the mean of the whole
    dataset as the reference.

    This is done for each data array in the dataset.
    :param ds: The dataset to calculate anomalies from
    :param time_range: Time range to use for reference data
    :param region: Spatial region to use for reference data
    :return: The anomaly dataset
    """
    ref = ds.copy()
    if time_range:
        time_range = TimeRangeLike.convert(time_range)
        ref = subset_temporal(ref, time_range)
    if region:
        region = PolygonLike.convert(region)
        ref = subset_spatial(ref, region)
    ref = ref.mean(keep_attrs=True, skipna=True)
    return ds - ref
Example #17
    def open_dataset(self,
                     time_range: TimeRangeLike.TYPE = None,
                     region: PolygonLike.TYPE = None,
                     var_names: VarNamesLike.TYPE = None,
                     protocol: str = None) -> Any:
        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(var_names) if var_names else None

        selected_file_list = self._find_files(time_range)
        if not selected_file_list:
            msg = 'Data source \'{}\' does not seem to have any data files'.format(
                self.name)
            if time_range is not None:
                msg += ' in given time range {}'.format(
                    TimeRangeLike.format(time_range))
            raise IOError(msg)

        files = self._get_urls_list(selected_file_list, _ODP_PROTOCOL_OPENDAP)
        try:
            ds = open_xarray_dataset(files)
            if region:
                [lon_min, lat_min, lon_max, lat_max] = region.bounds
                ds = ds.sel(drop=False,
                            lat=slice(lat_min, lat_max),
                            lon=slice(lon_min, lon_max))
            if var_names:
                ds = ds.drop([
                    var_name for var_name in ds.variables.keys()
                    if var_name not in var_names
                ])
            return ds

        except OSError as e:
            raise IOError("Files: {} caused:\nOSError({}): {}".format(
                files, e.errno, e.strerror))
Example #18
    def _make_local(self,
                    local_ds: LocalDataSource,
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        # local_name = local_ds.name
        local_id = local_ds.name

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(
            var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        if region or var_names:
            protocol = _ODP_PROTOCOL_OPENDAP
        else:
            protocol = _ODP_PROTOCOL_HTTP

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        selected_file_list = self._find_files(time_range)

        if protocol == _ODP_PROTOCOL_OPENDAP:

            files = self._get_urls_list(selected_file_list, protocol)
            monitor.start('Sync ' + self.name, total_work=len(files))
            for idx, dataset_uri in enumerate(files):
                child_monitor = monitor.child(work=1)

                file_name = os.path.basename(dataset_uri)
                local_filepath = os.path.join(local_path, file_name)

                time_coverage_start = selected_file_list[idx][1]
                time_coverage_end = selected_file_list[idx][2]

                remote_netcdf = None
                local_netcdf = None
                try:
                    remote_netcdf = NetCDF4DataStore(dataset_uri)

                    local_netcdf = NetCDF4DataStore(local_filepath,
                                                    mode='w',
                                                    persist=True)
                    local_netcdf.set_attributes(remote_netcdf.get_attrs())

                    remote_dataset = xr.Dataset.load_store(remote_netcdf)

                    process_region = False
                    if region:
                        geo_lat_min = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_min')
                        geo_lat_max = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_max')
                        geo_lon_min = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_min')
                        geo_lon_max = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_max')

                        geo_lat_res = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_resolution')
                        geo_lon_res = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_resolution')
                        if not (isnan(geo_lat_min) or isnan(geo_lat_max)
                                or isnan(geo_lon_min) or isnan(geo_lon_max)
                                or isnan(geo_lat_res) or isnan(geo_lon_res)):
                            process_region = True

                            [lon_min, lat_min, lon_max,
                             lat_max] = region.bounds

                            lat_min = floor(
                                (lat_min - geo_lat_min) / geo_lat_res)
                            lat_max = ceil(
                                (lat_max - geo_lat_min) / geo_lat_res)
                            lon_min = floor(
                                (lon_min - geo_lon_min) / geo_lon_res)
                            lon_max = ceil(
                                (lon_max - geo_lon_min) / geo_lon_res)

                            # TODO (kbernat): check why dataset.sel fails!
                            remote_dataset = remote_dataset.isel(
                                drop=False,
                                lat=slice(lat_min, lat_max),
                                lon=slice(lon_min, lon_max))

                            geo_lat_max = lat_max * geo_lat_res + geo_lat_min
                            geo_lat_min += lat_min * geo_lat_res
                            geo_lon_max = lon_max * geo_lon_res + geo_lon_min
                            geo_lon_min += lon_min * geo_lon_res

                    if not var_names:
                        var_names = [
                            var_name
                            for var_name in remote_netcdf.variables.keys()
                        ]
                    var_names.extend([
                        coord_name
                        for coord_name in remote_dataset.coords.keys()
                        if coord_name not in var_names
                    ])
                    child_monitor.start(label=file_name,
                                        total_work=len(var_names))
                    for sel_var_name in var_names:
                        var_dataset = remote_dataset.drop([
                            var_name
                            for var_name in remote_dataset.variables.keys()
                            if var_name != sel_var_name
                        ])
                        if compression_enabled:
                            var_dataset.variables.get(
                                sel_var_name).encoding.update(encoding_update)
                        local_netcdf.store_dataset(var_dataset)
                        child_monitor.progress(work=1, msg=sel_var_name)
                    if process_region:
                        local_netcdf.set_attribute('geospatial_lat_min',
                                                   geo_lat_min)
                        local_netcdf.set_attribute('geospatial_lat_max',
                                                   geo_lat_max)
                        local_netcdf.set_attribute('geospatial_lon_min',
                                                   geo_lon_min)
                        local_netcdf.set_attribute('geospatial_lon_max',
                                                   geo_lon_max)

                finally:
                    if remote_netcdf:
                        remote_netcdf.close()
                    if local_netcdf:
                        local_netcdf.close()
                        local_ds.add_dataset(
                            os.path.join(local_id, file_name),
                            (time_coverage_start, time_coverage_end))

                child_monitor.done()
        else:
            outdated_file_list = []
            for file_rec in selected_file_list:
                filename, _, _, file_size, url = file_rec
                dataset_file = os.path.join(local_path, filename)
                # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                # ... outdated or incomplete or corrupted.
                # JSON also includes "checksum" and "checksum_type" fields.
                if not os.path.isfile(dataset_file) or (
                        file_size
                        and os.path.getsize(dataset_file) != file_size):
                    outdated_file_list.append(file_rec)

            if outdated_file_list:
                with monitor.starting('Sync ' + self.name,
                                      len(outdated_file_list)):
                    bytes_to_download = sum(
                        [file_rec[3] for file_rec in outdated_file_list])
                    dl_stat = _DownloadStatistics(bytes_to_download)

                    file_number = 1

                    for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                        if monitor.is_cancelled():
                            raise InterruptedError
                        dataset_file = os.path.join(local_path, filename)
                        sub_monitor = monitor.child(work=1.0)

                        # noinspection PyUnusedLocal
                        def reporthook(block_number, read_size,
                                       total_file_size):
                            dl_stat.handle_chunk(read_size)
                            if monitor.is_cancelled():
                                raise InterruptedError
                            sub_monitor.progress(work=read_size,
                                                 msg=str(dl_stat))

                        sub_monitor_msg = "file %d of %d" % (
                            file_number, len(outdated_file_list))
                        with sub_monitor.starting(sub_monitor_msg, file_size):
                            urllib.request.urlretrieve(url[protocol],
                                                       filename=dataset_file,
                                                       reporthook=reporthook)
                        file_number += 1
                        local_ds.add_dataset(os.path.join(local_id, filename),
                                             (coverage_from, coverage_to))
        local_ds.save()
        monitor.done()
Example #19
    def test_make_local_and_update(self):

        reference_path = os.path.join(os.path.dirname(__file__),
                                      os.path.normpath('resources/datasources/local/files/'))

        def find_files_mock(_, time_range):

            def build_file_item(item_name: str, date_from: datetime, date_to: datetime, size: int):

                return [item_name, date_from, date_to, size,
                        {'OPENDAP': os.path.join(reference_path, item_name),
                         'HTTPServer': 'file:' + urllib.request.pathname2url(os.path.join(reference_path, item_name))}]

            reference_files = {
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781114000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 14, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 14, 23, 59),
                    'size': 21511378
                },
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781115000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 15, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 15, 23, 59),
                    'size': 21511378
                },
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781116000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 16, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 16, 23, 59),
                    'size': 21511378
                }
            }

            reference_files_list = []

            for reference_file in reference_files.items():
                file_name = reference_file[0]
                file_date_from = reference_file[1].get('date_from')
                file_date_to = reference_file[1].get('date_to')
                file_size = reference_file[1].get('size')
                if time_range:
                    if file_date_from >= time_range[0] and file_date_to <= time_range[1]:
                        reference_files_list.append(build_file_item(file_name,
                                                                    file_date_from,
                                                                    file_date_to,
                                                                    file_size))
                else:
                    reference_files_list.append(build_file_item(file_name,
                                                                file_date_from,
                                                                file_date_to,
                                                                file_size))
            return reference_files_list

        with unittest.mock.patch('cate.ds.esa_cci_odp.EsaCciOdpDataSource._find_files', find_files_mock):
            with unittest.mock.patch.object(EsaCciOdpDataStore, 'query', return_value=[]):
                try:
                    new_ds = self.data_source.make_local('local_ds_test', None,
                                                         (datetime.datetime(1978, 11, 14, 0, 0),
                                                          datetime.datetime(1978, 11, 15, 23, 59)))
                except Exception:
                    raise ValueError(reference_path, os.listdir(reference_path))

                self.assertEqual(new_ds.name, 'local.local_ds_test')
                self.assertEqual(new_ds.temporal_coverage(),
                                 (datetime.datetime(1978, 11, 14, 0, 0),
                                  datetime.datetime(1978, 11, 15, 23, 59)))

                self.data_source.update_local(new_ds.name, (datetime.datetime(1978, 11, 15, 00, 00),
                                                            datetime.datetime(1978, 11, 16, 23, 59)))
                self.assertEqual(new_ds.temporal_coverage(), TimeRangeLike.convert(
                                 (datetime.datetime(1978, 11, 15, 0, 0),
                                  datetime.datetime(1978, 11, 16, 23, 59))))

                self.data_source.update_local(new_ds.name, (datetime.datetime(1978, 11, 14, 00, 00),
                                                            datetime.datetime(1978, 11, 15, 23, 59)))
                self.assertEqual(new_ds.temporal_coverage(), TimeRangeLike.convert(
                                 (datetime.datetime(1978, 11, 14, 0, 0),
                                  datetime.datetime(1978, 11, 15, 23, 59))))

                with self.assertRaises(ValueError) as context:
                    self.data_source.update_local("wrong_ds_name", (datetime.datetime(1978, 11, 15, 00, 00),
                                                                    datetime.datetime(1978, 11, 16, 23, 59)))
                self.assertTrue("Couldn't find local DataSource", context.exception.args[0])

                new_ds_w_one_variable = self.data_source.make_local(
                    'local_ds_test_2', None, (datetime.datetime(1978, 11, 14, 0, 0),
                                              datetime.datetime(1978, 11, 15, 23, 59)), None, ['sm'])
                self.assertEqual(new_ds_w_one_variable.name, 'local.local_ds_test_2')
                ds = new_ds_w_one_variable.open_dataset()
                self.assertSetEqual(set(ds.variables), {'sm', 'lat', 'lon', 'time'})

                new_ds_w_region = self.data_source.make_local(
                    'from_local_to_local_region', None, (datetime.datetime(1978, 11, 14, 0, 0),
                                                         datetime.datetime(1978, 11, 15, 23, 59)),
                    "10,10,20,20", ['sm'])  # type: LocalDataSource
                self.assertEqual(new_ds_w_region.name, 'local.from_local_to_local_region')
                self.assertEqual(new_ds_w_region.spatial_coverage(), PolygonLike.convert("10,10,20,20"))
                data_set = new_ds_w_region.open_dataset()
                self.assertSetEqual(set(data_set.variables), {'sm', 'lat', 'lon', 'time'})
Example #20
0
def plot_map(ds: xr.Dataset,
             var: VarName.TYPE = None,
             indexers: DictLike.TYPE = None,
             time: TimeLike.TYPE = None,
             region: PolygonLike.TYPE = None,
             projection: str = 'PlateCarree',
             central_lon: float = 0.0,
             title: str = None,
             properties: DictLike.TYPE = None,
             file: str = None) -> Figure:
    """
    Create a geographic map plot for the variable given by dataset *ds* and variable name *var*.

    Plots the given variable from the given dataset on a map with coastal lines.
    In case no variable name is given, the first encountered variable in the
    dataset is plotted. In case no *time* is given, the first time slice
    is taken. It is also possible to set extents of the plot. If no extents
    are given, a global plot is created.

    The plot can either be shown using pyplot functionality, or saved,
    if a path is given. The following file formats for saving the plot
    are supported: eps, jpeg, jpg, pdf, pgf, png, ps, raw, rgba, svg,
    svgz, tif, tiff

    :param ds: the dataset containing the variable to plot
    :param var: the variable's name
    :param indexers: Optional indexers into data array of *var*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "layer=4".
    :param time: time slice index to plot, can be a string "YYYY-MM-DD" or an integer number
    :param region: Region to plot
    :param projection: name of a global projection, see http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html
    :param central_lon: central longitude of the projection in degrees
    :param title: an optional title
    :param properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5)"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.contourf.html
    :param file: path to a file in which to save the plot
    :return: a matplotlib figure object or None if in IPython mode
    """
    if not isinstance(ds, xr.Dataset):
        raise NotImplementedError(
            'Only gridded datasets are currently supported')

    var_name = None
    if not var:
        for key in ds.data_vars.keys():
            var_name = key
            break
    else:
        var_name = VarName.convert(var)
    var = ds[var_name]

    time = TimeLike.convert(time)
    indexers = DictLike.convert(indexers) or {}
    properties = DictLike.convert(properties) or {}

    extents = None
    region = PolygonLike.convert(region)
    if region:
        lon_min, lat_min, lon_max, lat_max = region.bounds
        if not _check_bounding_box(lat_min, lat_max, lon_min, lon_max):
            raise ValueError(
                'Provided plot extents do not form a valid bounding box '
                'within [-180.0,+180.0,-90.0,+90.0]')
        extents = [lon_min, lon_max, lat_min, lat_max]

    # See http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html#
    if projection == 'PlateCarree':
        proj = ccrs.PlateCarree(central_longitude=central_lon)
    elif projection == 'LambertCylindrical':
        proj = ccrs.LambertCylindrical(central_longitude=central_lon)
    elif projection == 'Mercator':
        proj = ccrs.Mercator(central_longitude=central_lon)
    elif projection == 'Miller':
        proj = ccrs.Miller(central_longitude=central_lon)
    elif projection == 'Mollweide':
        proj = ccrs.Mollweide(central_longitude=central_lon)
    elif projection == 'Orthographic':
        proj = ccrs.Orthographic(central_longitude=central_lon)
    elif projection == 'Robinson':
        proj = ccrs.Robinson(central_longitude=central_lon)
    elif projection == 'Sinusoidal':
        proj = ccrs.Sinusoidal(central_longitude=central_lon)
    elif projection == 'NorthPolarStereo':
        proj = ccrs.NorthPolarStereo(central_longitude=central_lon)
    elif projection == 'SouthPolarStereo':
        proj = ccrs.SouthPolarStereo(central_longitude=central_lon)
    else:
        raise ValueError('illegal projection: "%s"' % projection)

    figure = plt.figure(figsize=(8, 4))
    ax = plt.axes(projection=proj)
    if extents:
        ax.set_extent(extents)
    else:
        ax.set_global()

    ax.coastlines()
    var_data = _get_var_data(var,
                             indexers,
                             time=time,
                             remaining_dims=('lon', 'lat'))
    var_data.plot.contourf(ax=ax, transform=proj, **properties)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
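A hedged usage sketch for plot_map. The import path, dataset file and variable name are assumptions; region is a "lon_min, lat_min, lon_max, lat_max" string as accepted by PolygonLike.

# Usage sketch (illustrative only; import path and file names assumed).
import xarray as xr

from cate.ops.plot import plot_map  # assumed module path

ds = xr.open_dataset('sst_monthly.nc')   # hypothetical input file
fig = plot_map(ds,
               var='sst',
               time='2001-06-01',
               region='-20, 30, 40, 70',  # lon_min, lat_min, lon_max, lat_max
               projection='Mollweide',
               title='North Atlantic SST',
               file='sst_map.png')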
Example #21
    def test_make_local_and_update(self):

        soilmoisture_data_sources = self.data_store.query(
            query_expr='esacci.SOILMOISTURE.day.L3S.SSMV.multi-sensor.multi-platform.COMBINED.02-1.r1')
        soilmoisture_data_source = soilmoisture_data_sources[0]

        reference_path = os.path.join(os.path.dirname(__file__),
                                      os.path.normpath('resources/datasources/local/files/'))

        def find_files_mock(_, time_range):

            def build_file_item(item_name: str, date_from: datetime, date_to: datetime, size: int):

                return [item_name, date_from, date_to, size,
                        {'OPENDAP': os.path.join(reference_path, item_name),
                         'HTTPServer': 'file:' + urllib.request.pathname2url(os.path.join(reference_path, item_name))}]

            reference_files = {
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781114000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 14, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 14, 23, 59),
                    'size': 21511378
                },
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781115000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 15, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 15, 23, 59),
                    'size': 21511378
                },
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781116000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 16, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 16, 23, 59),
                    'size': 21511378
                }
            }

            reference_files_list = []

            for reference_file in reference_files.items():
                file_name = reference_file[0]
                file_date_from = reference_file[1].get('date_from')
                file_date_to = reference_file[1].get('date_to')
                file_size = reference_file[1].get('size')
                if time_range:
                    if file_date_from >= time_range[0] and file_date_to <= time_range[1]:
                        reference_files_list.append(build_file_item(file_name,
                                                                    file_date_from,
                                                                    file_date_to,
                                                                    file_size))
                else:
                    reference_files_list.append(build_file_item(file_name,
                                                                file_date_from,
                                                                file_date_to,
                                                                file_size))
            return reference_files_list

        with unittest.mock.patch('cate.ds.esa_cci_odp.EsaCciOdpDataSource._find_files', find_files_mock):
            with unittest.mock.patch.object(EsaCciOdpDataStore, 'query', return_value=[]):

                new_ds_title = 'local_ds_test'
                new_ds_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                           datetime.datetime(1978, 11, 16, 23, 59)))
                try:
                    new_ds = soilmoisture_data_source.make_local(new_ds_title, time_range=new_ds_time_range)
                except Exception:
                    raise ValueError(reference_path, os.listdir(reference_path))
                self.assertIsNotNone(new_ds)

                self.assertEqual(new_ds.id, "local.%s" % new_ds_title)
                self.assertEqual(new_ds.temporal_coverage(), new_ds_time_range)

                new_ds_w_one_variable_title = 'local_ds_test_var'
                new_ds_w_one_variable_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                          datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_one_variable_var_names = VarNamesLike.convert(['sm'])

                new_ds_w_one_variable = soilmoisture_data_source.make_local(
                    new_ds_w_one_variable_title,
                    time_range=new_ds_w_one_variable_time_range,
                    var_names=new_ds_w_one_variable_var_names
                )
                self.assertIsNotNone(new_ds_w_one_variable)

                self.assertEqual(new_ds_w_one_variable.id, "local.%s" % new_ds_w_one_variable_title)
                ds = new_ds_w_one_variable.open_dataset()

                new_ds_w_one_variable_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(ds.variables),
                                    set(new_ds_w_one_variable_var_names))

                new_ds_w_region_title = 'from_local_to_local_region'
                new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                    datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    region=new_ds_w_region_spatial_coverage)

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)

                new_ds_w_region_title = 'from_local_to_local_region_one_var'
                new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                    datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_var_names = VarNamesLike.convert(['sm'])
                new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    var_names=new_ds_w_region_var_names,
                    region=new_ds_w_region_spatial_coverage)

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)
                data_set = new_ds_w_region.open_dataset()
                new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(data_set.variables), set(new_ds_w_region_var_names))

                new_ds_w_region_title = 'from_local_to_local_region_two_var_sm_uncertainty'
                new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                    datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_var_names = VarNamesLike.convert(['sm', 'sm_uncertainty'])
                new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    var_names=new_ds_w_region_var_names,
                    region=new_ds_w_region_spatial_coverage)

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)
                data_set = new_ds_w_region.open_dataset()
                new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(data_set.variables), set(new_ds_w_region_var_names))

                empty_ds_timerange = (datetime.datetime(2017, 12, 1, 0, 0), datetime.datetime(2017, 12, 31, 23, 59))
                with self.assertRaises(DataAccessError) as cm:
                    soilmoisture_data_source.make_local('empty_ds', time_range=empty_ds_timerange)
                self.assertEqual(f'Data source "{soilmoisture_data_source.id}" does not'
                                 f' seem to have any datasets in given'
                                 f' time range {TimeRangeLike.format(empty_ds_timerange)}',
                                 str(cm.exception))

                new_ds_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                           datetime.datetime(1978, 11, 14, 23, 59)))

                new_ds = soilmoisture_data_source.make_local("title_test_copy", time_range=new_ds_time_range)
                self.assertIsNotNone(new_ds)
                self.assertEqual(new_ds.meta_info['title'], soilmoisture_data_source.meta_info['title'])

                title = "Title Test!"
                new_ds = soilmoisture_data_source.make_local("title_test_set", title, time_range=new_ds_time_range)
                self.assertIsNotNone(new_ds)
                self.assertEqual(new_ds.meta_info['title'], title)
Example #22
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(
            var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.id, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path,
                                                   local_relative_filepath)

            remote_absolute_filepath = os.path.join(
                self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                if not time_range or (time_coverage_start >= time_range[0]
                                      and time_coverage_end <= time_range[1]):
                    if region or var_names:

                        do_update_of_variables_meta_info_once = True
                        do_update_of_region_meta_info_once = True

                        try:
                            remote_dataset = xr.open_dataset(
                                remote_absolute_filepath)

                            if var_names:
                                remote_dataset = remote_dataset.drop([
                                    var_name for var_name in
                                    remote_dataset.data_vars.keys()
                                    if var_name not in var_names
                                ])

                            if region:
                                remote_dataset = normalize_impl(remote_dataset)
                                remote_dataset = subset_spatial_impl(
                                    remote_dataset, region)
                                geo_lon_min, geo_lat_min, geo_lon_max, geo_lat_max = region.bounds

                                remote_dataset.attrs['geospatial_lat_min'] = geo_lat_min
                                remote_dataset.attrs['geospatial_lat_max'] = geo_lat_max
                                remote_dataset.attrs['geospatial_lon_min'] = geo_lon_min
                                remote_dataset.attrs['geospatial_lon_max'] = geo_lon_max
                                if do_update_of_region_meta_info_once:
                                    local_ds.meta_info['bbox_maxx'] = geo_lon_max
                                    local_ds.meta_info['bbox_minx'] = geo_lon_min
                                    local_ds.meta_info['bbox_maxy'] = geo_lat_max
                                    local_ds.meta_info['bbox_miny'] = geo_lat_min
                                    do_update_of_region_meta_info_once = False

                            if compression_enabled:
                                for sel_var_name in remote_dataset.variables.keys():
                                    remote_dataset.variables.get(
                                        sel_var_name).encoding.update(encoding_update)

                            remote_dataset.to_netcdf(local_absolute_filepath)

                            child_monitor.progress(
                                work=1, msg=str(time_coverage_start))
                        finally:
                            if do_update_of_variables_meta_info_once:
                                variables_info = local_ds.meta_info.get(
                                    'variables', [])
                                local_ds.meta_info['variables'] = [
                                    var_info for var_info in variables_info
                                    if var_info.get('name') in remote_dataset.variables
                                    and var_info.get('name') not in remote_dataset.dims
                                ]
                                do_update_of_variables_meta_info_once = False

                            local_ds.add_dataset(
                                os.path.join(local_id, file_name),
                                (time_coverage_start, time_coverage_end))

                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath,
                                    local_absolute_filepath)
                        local_ds.add_dataset(
                            local_relative_filepath,
                            (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id
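
A note on the compression handling above: xarray keeps NetCDF write options in each variable's encoding dictionary, so setting 'zlib' and 'complevel' before calling to_netcdf is what makes the local copy deflate-compressed. The following minimal sketch shows the same idea in isolation; the dataset and compression level are made up for illustration and are not taken from the example.

import numpy as np
import xarray as xr

# Hypothetical stand-in for the remote dataset.
ds = xr.Dataset({'sm': (('lat', 'lon'), np.random.rand(180, 360))})

compression_level = 9  # would normally come from configuration
encoding_update = dict()
if compression_level > 0:
    encoding_update.update({'zlib': True, 'complevel': compression_level})

# Apply the encoding to every variable before writing, as _make_local does above.
for var_name in ds.variables:
    ds.variables[var_name].encoding.update(encoding_update)

ds.to_netcdf('local_copy.nc')
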
Example #23
0
    def test_accepts(self):
        self.assertTrue(PolygonLike.accepts(""))
        self.assertTrue(PolygonLike.accepts(" \t"))
        self.assertTrue(PolygonLike.accepts("0.0,0.0,1.1,1.1"))
        self.assertTrue(PolygonLike.accepts("0.0, 0.0, 1.1, 1.1"))

        coords = [(10.4, 20.2), (30.8, 20.2), (30.8, 40.8), (10.4, 40.8)]
        pol = Polygon(coords)
        self.assertTrue(PolygonLike.accepts(coords))
        self.assertTrue(PolygonLike.accepts(pol))
        self.assertTrue(PolygonLike.accepts(pol.wkt))
        self.assertTrue(PolygonLike.accepts(pol.bounds))

        self.assertFalse(PolygonLike.accepts("0.0,aaa,1.1,1.1"))
        self.assertFalse(PolygonLike.accepts("0.0, aaa, 1.1, 1.1"))
        self.assertFalse(PolygonLike.accepts([(0.0, 0.0), (0.0, 1.0), (1.0, 'aaa'), (1.0, 0.0)]))
        self.assertFalse(PolygonLike.accepts([(0.0, 0.0), (0.0, 1.0), 'Guten Morgen, Berlin!', (1.0, 0.0)]))
        self.assertFalse(PolygonLike.accepts(Polygon([(0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0)])))
        self.assertFalse(PolygonLike.accepts('MULTIPOLYGON()'))
        self.assertFalse(PolygonLike.accepts('Something_in_the_month_of_May'))
        self.assertFalse(PolygonLike.accepts(1.0))
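
The inputs accepted above (a "minx, miny, maxx, maxy" string, a list of coordinate pairs, a shapely Polygon, its WKT text, or its bounds tuple) all carry enough information to be normalized to a shapely Polygon. The sketch below is a simplified, hypothetical stand-in for that conversion, written only to illustrate the idea; it is not the actual PolygonLike implementation.

from shapely.geometry import Polygon, box
from shapely.wkt import loads as wkt_loads


def to_polygon(value):
    """Illustrative stand-in for PolygonLike.convert (not the real code)."""
    if isinstance(value, Polygon):
        return value
    if isinstance(value, str):
        if value.lstrip().upper().startswith('POLYGON'):
            return wkt_loads(value)  # WKT form
        x1, y1, x2, y2 = (float(t) for t in value.split(','))
        return box(x1, y1, x2, y2)  # "minx, miny, maxx, maxy" form
    if len(value) == 4 and all(isinstance(v, (int, float)) for v in value):
        return box(*value)  # bounds tuple
    return Polygon(value)  # list of (lon, lat) pairs


to_polygon('10.4, 20.2, 30.8, 40.8').bounds  # -> (10.4, 20.2, 30.8, 40.8)
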
Example #24
0
    def test_format(self):
        self.assertEqual(PolygonLike.format(None), '')
        coords = [(10.4, 20.2), (30.8, 20.2), (30.8, 40.8), (10.4, 40.8)]
        pol = PolygonLike.convert(coords)
        self.assertEqual(PolygonLike.format(pol),
                         'POLYGON ((10.4 20.2, 30.8 20.2, 30.8 40.8, 10.4 40.8, 10.4 20.2))')
Example #25
0
    def test_json(self):
        self.assertEqual(PolygonLike.from_json("-10, -10, 10, 10"),
                         "-10, -10, 10, 10")
Example #26
0
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        # local_name = local_ds.name
        local_id = local_ds.name

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(
            var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.name, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path,
                                                   local_relative_filepath)

            remote_absolute_filepath = os.path.join(
                self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                remote_netcdf = None
                local_netcdf = None
                if not time_range or (time_coverage_start >= time_range[0]
                                      and time_coverage_end <= time_range[1]):
                    if region or var_names:
                        try:
                            remote_netcdf = NetCDF4DataStore(
                                remote_absolute_filepath)

                            local_netcdf = NetCDF4DataStore(
                                local_absolute_filepath,
                                mode='w',
                                persist=True)
                            local_netcdf.set_attributes(
                                remote_netcdf.get_attrs())

                            remote_dataset = xr.Dataset.load_store(
                                remote_netcdf)

                            process_region = False
                            if region:
                                geo_lat_min = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lat_min')
                                geo_lat_max = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lat_max')
                                geo_lon_min = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lon_min')
                                geo_lon_max = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lon_max')

                                geo_lat_res = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs,
                                    'geospatial_lat_resolution')
                                geo_lon_res = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs,
                                    'geospatial_lon_resolution')
                                if not (isnan(geo_lat_min)
                                        or isnan(geo_lat_max)
                                        or isnan(geo_lon_min)
                                        or isnan(geo_lon_max)
                                        or isnan(geo_lat_res)
                                        or isnan(geo_lon_res)):
                                    process_region = True

                                    # shapely .bounds order is (lon_min, lat_min, lon_max, lat_max)
                                    lon_min, lat_min, lon_max, lat_max = region.bounds

                                    lat_min = floor(
                                        (lat_min - geo_lat_min) / geo_lat_res)
                                    lat_max = ceil(
                                        (lat_max - geo_lat_min) / geo_lat_res)
                                    lon_min = floor(
                                        (lon_min - geo_lon_min) / geo_lon_res)
                                    lon_max = ceil(
                                        (lon_max - geo_lon_min) / geo_lon_res)

                                    # TODO (kbernat): check why dataset.sel fails!
                                    remote_dataset = remote_dataset.isel(
                                        drop=False,
                                        lat=slice(lat_min, lat_max),
                                        lon=slice(lon_min, lon_max))

                                    geo_lat_max = lat_max * geo_lat_res + geo_lat_min
                                    geo_lat_min += lat_min * geo_lat_res
                                    geo_lon_max = lon_max * geo_lon_res + geo_lon_min
                                    geo_lon_min += lon_min * geo_lon_res

                            if not var_names:
                                var_names = [
                                    var_name for var_name in
                                    remote_netcdf.variables.keys()
                                ]
                            var_names.extend([
                                coord_name
                                for coord_name in remote_dataset.coords.keys()
                                if coord_name not in var_names
                            ])
                            child_monitor.start(label=file_name,
                                                total_work=len(var_names))
                            for sel_var_name in var_names:
                                var_dataset = remote_dataset.drop([
                                    var_name for var_name in
                                    remote_dataset.variables.keys()
                                    if var_name != sel_var_name
                                ])
                                if compression_enabled:
                                    var_dataset.variables.get(
                                        sel_var_name).encoding.update(
                                            encoding_update)
                                local_netcdf.store_dataset(var_dataset)
                                child_monitor.progress(work=1,
                                                       msg=sel_var_name)
                            if process_region:
                                local_netcdf.set_attribute(
                                    'geospatial_lat_min', geo_lat_min)
                                local_netcdf.set_attribute(
                                    'geospatial_lat_max', geo_lat_max)
                                local_netcdf.set_attribute(
                                    'geospatial_lon_min', geo_lon_min)
                                local_netcdf.set_attribute(
                                    'geospatial_lon_max', geo_lon_max)
                        finally:
                            if remote_netcdf:
                                remote_netcdf.close()
                            if local_netcdf:
                                local_netcdf.close()
                                local_ds.add_dataset(
                                    local_relative_filepath,
                                    (time_coverage_start, time_coverage_end))
                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath,
                                    local_absolute_filepath)
                        local_ds.add_dataset(
                            local_relative_filepath,
                            (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id
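
The variant above avoids label-based selection (see the TODO about dataset.sel) and instead converts the geographic bounds of the region into integer index ranges, using the grid origin and resolution read from the file's global attributes. Below is a small sketch of that arithmetic under the assumption of a regular grid whose first cell starts at geospatial_lat_min/geospatial_lon_min; the grid, resolution and region values are illustrative only.

from math import ceil, floor

import numpy as np
import xarray as xr

# Hypothetical 0.25-degree global grid standing in for the remote dataset.
lat = np.arange(-89.875, 90.0, 0.25)
lon = np.arange(-179.875, 180.0, 0.25)
ds = xr.Dataset({'sm': (('lat', 'lon'), np.zeros((lat.size, lon.size)))},
                coords={'lat': lat, 'lon': lon})

geo_lat_min, geo_lon_min = -90.0, -180.0  # grid origin (from global attributes)
geo_lat_res = geo_lon_res = 0.25          # grid resolution (from global attributes)
lon_min, lat_min, lon_max, lat_max = 10.0, 20.0, 30.0, 40.0  # region.bounds order

# Geographic bounds -> index bounds, as in the method above.
lat_i0 = floor((lat_min - geo_lat_min) / geo_lat_res)
lat_i1 = ceil((lat_max - geo_lat_min) / geo_lat_res)
lon_i0 = floor((lon_min - geo_lon_min) / geo_lon_res)
lon_i1 = ceil((lon_max - geo_lon_min) / geo_lon_res)

subset = ds.isel(lat=slice(lat_i0, lat_i1), lon=slice(lon_i0, lon_i1))
print(subset.lat.values.min(), subset.lat.values.max())  # ~20.125 .. ~39.875
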
Example #27
0
    def test_make_local(self):
        data_source = self._local_data_store.query('local_w_temporal')[0]

        with unittest.mock.patch.object(EsaCciOdpDataStore,
                                        'query',
                                        return_value=[]):
            new_ds_title = 'from_local_to_local'
            new_ds_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 15, 23, 59)))
            new_ds = data_source.make_local(new_ds_title,
                                            time_range=new_ds_time_range)
            self.assertIsNotNone(new_ds)

            self.assertEqual(new_ds.id, "local.%s" % new_ds_title)
            self.assertEqual(
                new_ds.temporal_coverage(),
                TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                       datetime.datetime(1978, 11, 15, 23, 59))))

            new_ds_2_title = 'from_local_to_local_var'
            new_ds_2_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 15, 23, 59)))
            new_ds_2_vars = VarNamesLike.convert(['sm'])

            new_ds_w_one_variable = data_source.make_local(
                new_ds_2_title,
                time_range=new_ds_2_time_range,
                var_names=new_ds_2_vars)
            self.assertIsNotNone(new_ds_w_one_variable)
            self.assertEqual(new_ds_w_one_variable.id,
                             "local.%s" % new_ds_2_title)
            data_set = new_ds_w_one_variable.open_dataset()
            self.assertSetEqual(set(data_set.variables),
                                {'sm', 'lat', 'lon', 'time'})

            new_ds_3_title = 'from_local_to_local_range'
            new_ds_3_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 15, 23, 59)))
            new_ds_3_vars = VarNamesLike.convert(['sm'])
            new_ds_3_region = PolygonLike.convert("10,10,20,20")

            new_ds_w_region = data_source.make_local(
                new_ds_3_title,
                time_range=new_ds_3_time_range,
                var_names=new_ds_3_vars,
                region=new_ds_3_region)  # type: LocalDataSource
            self.assertIsNotNone(new_ds_w_region)
            self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_3_title)
            self.assertEqual(new_ds_w_region.spatial_coverage(),
                             PolygonLike.convert("10,10,20,20"))
            data_set = new_ds_w_region.open_dataset()
            self.assertSetEqual(set(data_set.variables),
                                {'sm', 'lat', 'lon', 'time'})

            no_data = data_source.make_local(
                'no_data',
                time_range=(datetime.datetime(2020, 11, 14, 0, 0),
                            datetime.datetime(2020, 11, 15, 23, 59)))
            self.assertIsNone(no_data)
Example #28
0
def plot_map(ds: xr.Dataset,
             var: VarName.TYPE = None,
             index: DictLike.TYPE = None,
             time: Union[str, int] = None,
             region: PolygonLike.TYPE = None,
             projection: str = 'PlateCarree',
             central_lon: float = 0.0,
             file: str = None) -> None:
    """
    Plot the given variable from the given dataset on a map with coastlines.
    If no variable name is given, the first variable encountered in the
    dataset is plotted. If no time index is given, the first time slice is
    taken. The plot extents can also be set; if no extents are given, a
    global plot is created.

    The plot is either shown using pyplot functionality or, if a file path
    is given, saved to that file. The following file formats for saving the
    plot are supported: eps, jpeg, jpg, pdf, pgf, png, ps, raw, rgba, svg,
    svgz, tif, tiff

    :param ds: xr.Dataset to plot
    :param var: variable name in the dataset to plot
    :param index: Optional index into the variable's data array. The *index* is a dictionary
                  that maps the variable's dimension names to constant labels. For example,
                  ``lat`` and ``lon`` are given in decimal degrees, while a ``time`` value may be provided as
                  datetime object or a date string. *index* may also be a comma-separated string of key-value pairs,
                  e.g. "lat=12.4, time='2012-05-02'".
    :param time: time slice index to plot
    :param region: Region to plot
    :param projection: name of a global projection, see http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html
    :param central_lon: central longitude of the projection in degrees
    :param file: path to a file in which to save the plot
    """
    if not isinstance(ds, xr.Dataset):
        raise NotImplementedError('Only raster datasets are currently '
                                  'supported')

    var_name = None
    if not var:
        for key in ds.data_vars.keys():
            var_name = key
            break
    else:
        var_name = VarName.convert(var)

    var = ds[var_name]
    index = DictLike.convert(index)

    # 0 is a valid index, hence test if time is None
    if time is not None and isinstance(time, int) and 'time' in var.coords:
        time = var.coords['time'][time]

    if time:
        if not index:
            index = dict()
        index['time'] = time

    for dim_name in var.dims:
        if dim_name not in ('lat', 'lon'):
            if not index:
                index = dict()
            if dim_name not in index:
                index[dim_name] = 0

    if region is None:
        lat_min = -90.0
        lat_max = 90.0
        lon_min = -180.0
        lon_max = 180.0
    else:
        region = PolygonLike.convert(region)
        lon_min, lat_min, lon_max, lat_max = region.bounds

    if not _check_bounding_box(lat_min, lat_max, lon_min, lon_max):
        raise ValueError(
            'Provided plot extents do not form a valid bounding box '
            'within [-180.0,+180.0,-90.0,+90.0]')
    extents = [lon_min, lon_max, lat_min, lat_max]

    # See http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html#
    if projection == 'PlateCarree':
        proj = ccrs.PlateCarree(central_longitude=central_lon)
    elif projection == 'LambertCylindrical':
        proj = ccrs.LambertCylindrical(central_longitude=central_lon)
    elif projection == 'Mercator':
        proj = ccrs.Mercator(central_longitude=central_lon)
    elif projection == 'Miller':
        proj = ccrs.Miller(central_longitude=central_lon)
    elif projection == 'Mollweide':
        proj = ccrs.Mollweide(central_longitude=central_lon)
    elif projection == 'Orthographic':
        proj = ccrs.Orthographic(central_longitude=central_lon)
    elif projection == 'Robinson':
        proj = ccrs.Robinson(central_longitude=central_lon)
    elif projection == 'Sinusoidal':
        proj = ccrs.Sinusoidal(central_longitude=central_lon)
    elif projection == 'NorthPolarStereo':
        proj = ccrs.NorthPolarStereo(central_longitude=central_lon)
    elif projection == 'SouthPolarStereo':
        proj = ccrs.SouthPolarStereo(central_longitude=central_lon)
    else:
        raise ValueError('illegal projection')

    try:
        if index:
            var_data = var.sel(**index)
        else:
            var_data = var
    except ValueError:
        var_data = var

    fig = plt.figure(figsize=(16, 8))
    ax = plt.axes(projection=proj)
    if extents:
        ax.set_extent(extents)
    else:
        ax.set_global()

    ax.coastlines()
    var_data.plot.contourf(ax=ax, transform=proj)
    if file:
        fig.savefig(file)
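
A brief usage sketch for plot_map follows; the file name, variable name and region are illustrative, and the import path is an assumption rather than something stated in the example.

import xarray as xr

from cate.ops.plot import plot_map  # assumed import path

ds = xr.open_dataset('soilmoisture_19781114.nc')  # hypothetical local file

# Plot the first time slice of 'sm' over a lon/lat box and save it as a PNG.
plot_map(ds,
         var='sm',
         time=0,
         region='10, 20, 30, 40',  # lon_min, lat_min, lon_max, lat_max
         projection='Robinson',
         file='sm_19781114.png')
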
Example #29
0
    def test_make_local_and_update(self):

        soilmoisture_data_sources = self.data_store.query(
            query_expr='esacci.SOILMOISTURE.day.L3S.SSMV.multi-sensor.multi-platform.COMBINED.02-1.r1')
        soilmoisture_data_source = soilmoisture_data_sources[0]

        reference_path = os.path.join(os.path.dirname(__file__),
                                      os.path.normpath('resources/datasources/local/files/'))

        def find_files_mock(_, time_range):

            def build_file_item(item_name: str, date_from: datetime, date_to: datetime, size: int):

                return [item_name, date_from, date_to, size,
                        {'OPENDAP': os.path.join(reference_path, item_name),
                         'HTTPServer': 'file:' + urllib.request.pathname2url(os.path.join(reference_path, item_name))}]

            reference_files = {
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781114000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 14, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 14, 23, 59),
                    'size': 21511378
                },
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781115000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 15, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 15, 23, 59),
                    'size': 21511378
                },
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781116000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 16, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 16, 23, 59),
                    'size': 21511378
                }
            }

            reference_files_list = []

            for reference_file in reference_files.items():
                file_name = reference_file[0]
                file_date_from = reference_file[1].get('date_from')
                file_date_to = reference_file[1].get('date_to')
                file_size = reference_file[1].get('size')
                if time_range:
                    if file_date_from >= time_range[0] and file_date_to <= time_range[1]:
                        reference_files_list.append(build_file_item(file_name,
                                                                    file_date_from,
                                                                    file_date_to,
                                                                    file_size))
                else:
                    reference_files_list.append(build_file_item(file_name,
                                                                file_date_from,
                                                                file_date_to,
                                                                file_size))
            return reference_files_list

        with unittest.mock.patch('cate.ds.esa_cci_odp.EsaCciOdpDataSource._find_files', find_files_mock):
            with unittest.mock.patch.object(EsaCciOdpDataStore, 'query', return_value=[]):

                new_ds_title = 'local_ds_test'
                new_ds_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                           datetime.datetime(1978, 11, 16, 23, 59)))
                try:
                    new_ds = soilmoisture_data_source.make_local(new_ds_title, time_range=new_ds_time_range)
                except Exception:
                    raise ValueError(reference_path, os.listdir(reference_path))
                self.assertIsNotNone(new_ds)

                self.assertEqual(new_ds.id, "local.%s" % new_ds_title)
                self.assertEqual(new_ds.temporal_coverage(), new_ds_time_range)

                new_ds_w_one_variable_title = 'local_ds_test_var'
                new_ds_w_one_variable_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                          datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_one_variable_var_names = VarNamesLike.convert(['sm'])

                new_ds_w_one_variable = soilmoisture_data_source.make_local(
                    new_ds_w_one_variable_title,
                    time_range=new_ds_w_one_variable_time_range,
                    var_names=new_ds_w_one_variable_var_names
                )
                self.assertIsNotNone(new_ds_w_one_variable)

                self.assertEqual(new_ds_w_one_variable.id, "local.%s" % new_ds_w_one_variable_title)
                ds = new_ds_w_one_variable.open_dataset()

                new_ds_w_one_variable_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(ds.variables),
                                    set(new_ds_w_one_variable_var_names))

                new_ds_w_region_title = 'from_local_to_local_region'
                new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                    datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    region=new_ds_w_region_spatial_coverage)  # type: LocalDataSource

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)

                new_ds_w_region_title = 'from_local_to_local_region_one_var'
                new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                    datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_var_names = VarNamesLike.convert(['sm'])
                new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    var_names=new_ds_w_region_var_names,
                    region=new_ds_w_region_spatial_coverage)  # type: LocalDataSource

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)
                data_set = new_ds_w_region.open_dataset()
                new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(data_set.variables), set(new_ds_w_region_var_names))

                new_ds_w_region_title = 'from_local_to_local_region_two_var_sm_uncertainty'
                new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                    datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_var_names = VarNamesLike.convert(['sm', 'sm_uncertainty'])
                new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    var_names=new_ds_w_region_var_names,
                    region=new_ds_w_region_spatial_coverage)  # type: LocalDataSource

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)
                data_set = new_ds_w_region.open_dataset()
                new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(data_set.variables), set(new_ds_w_region_var_names))

                empty_ds_timerange = (datetime.datetime(2017, 12, 1, 0, 0), datetime.datetime(2017, 12, 31, 23, 59))
                with self.assertRaises(DataAccessError) as cm:
                    soilmoisture_data_source.make_local('empty_ds', time_range=empty_ds_timerange)
                self.assertEqual(f'Data source "{soilmoisture_data_source.id}" does not'
                                 f' seem to have any datasets in given'
                                 f' time range {TimeRangeLike.format(empty_ds_timerange)}',
                                 str(cm.exception))

                new_ds_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                           datetime.datetime(1978, 11, 14, 23, 59)))

                new_ds = soilmoisture_data_source.make_local("title_test_copy", time_range=new_ds_time_range)
                self.assertIsNotNone(new_ds)
                self.assertEqual(new_ds.meta_info['title'], soilmoisture_data_source.meta_info['title'])

                title = "Title Test!"
                new_ds = soilmoisture_data_source.make_local("title_test_set", title, time_range=new_ds_time_range)
                self.assertIsNotNone(new_ds)
                self.assertEqual(new_ds.meta_info['title'], title)
Example #30
0
    def _make_local(self,
                    local_ds: LocalDataSource,
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id
        time_range = TimeRangeLike.convert(time_range)
        region = PolygonLike.convert(region)
        var_names = VarNamesLike.convert(var_names)

        time_range, region, var_names = self._apply_make_local_fixes(
            time_range, region, var_names)

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        do_update_of_verified_time_coverage_start_once = True
        verified_time_coverage_start = None
        verified_time_coverage_end = None

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        if region or var_names:
            protocol = _ODP_PROTOCOL_OPENDAP
        else:
            protocol = _ODP_PROTOCOL_HTTP

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        selected_file_list = self._find_files(time_range)
        if not selected_file_list:
            msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(
                self.id)
            if time_range is not None:
                msg += ' in given time range {}'.format(
                    TimeRangeLike.format(time_range))
            raise DataAccessError(msg)
        try:
            if protocol == _ODP_PROTOCOL_OPENDAP:

                do_update_of_variables_meta_info_once = True
                do_update_of_region_meta_info_once = True

                files = self._get_urls_list(selected_file_list, protocol)
                monitor.start('Sync ' + self.id, total_work=len(files))
                for idx, dataset_uri in enumerate(files):
                    child_monitor = monitor.child(work=1)

                    file_name = os.path.basename(dataset_uri)
                    local_filepath = os.path.join(local_path, file_name)

                    time_coverage_start = selected_file_list[idx][1]
                    time_coverage_end = selected_file_list[idx][2]

                    try:
                        child_monitor.start(label=file_name, total_work=1)

                        remote_dataset = xr.open_dataset(dataset_uri)

                        if var_names:
                            remote_dataset = remote_dataset.drop([
                                var_name for var_name in
                                remote_dataset.data_vars.keys()
                                if var_name not in var_names
                            ])

                        if region:
                            remote_dataset = normalize_impl(remote_dataset)
                            remote_dataset = subset_spatial_impl(
                                remote_dataset, region)
                            geo_lon_min, geo_lat_min, geo_lon_max, geo_lat_max = region.bounds

                            remote_dataset.attrs['geospatial_lat_min'] = geo_lat_min
                            remote_dataset.attrs['geospatial_lat_max'] = geo_lat_max
                            remote_dataset.attrs['geospatial_lon_min'] = geo_lon_min
                            remote_dataset.attrs['geospatial_lon_max'] = geo_lon_max
                            if do_update_of_region_meta_info_once:
                                local_ds.meta_info['bbox_maxx'] = geo_lon_max
                                local_ds.meta_info['bbox_minx'] = geo_lon_min
                                local_ds.meta_info['bbox_maxy'] = geo_lat_max
                                local_ds.meta_info['bbox_miny'] = geo_lat_min
                                do_update_of_region_meta_info_once = False

                        if compression_enabled:
                            for sel_var_name in remote_dataset.variables.keys():
                                remote_dataset.variables.get(
                                    sel_var_name).encoding.update(encoding_update)

                        remote_dataset.to_netcdf(local_filepath)

                        child_monitor.progress(work=1,
                                               msg=str(time_coverage_start))
                    finally:
                        if do_update_of_variables_meta_info_once:
                            variables_info = local_ds.meta_info.get(
                                'variables', [])
                            local_ds.meta_info['variables'] = [
                                var_info for var_info in variables_info
                                if var_info.get('name') in remote_dataset.variables
                                and var_info.get('name') not in remote_dataset.dims
                            ]
                            do_update_of_variables_meta_info_once = False

                        local_ds.add_dataset(
                            os.path.join(local_id, file_name),
                            (time_coverage_start, time_coverage_end))

                        if do_update_of_verified_time_coverage_start_once:
                            verified_time_coverage_start = time_coverage_start
                            do_update_of_verified_time_coverage_start_once = False
                        verified_time_coverage_end = time_coverage_end
                    child_monitor.done()
            else:
                outdated_file_list = []
                for file_rec in selected_file_list:
                    filename, _, _, file_size, url = file_rec
                    dataset_file = os.path.join(local_path, filename)
                    # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                    # ... outdated or incomplete or corrupted.
                    # JSON also includes "checksum" and "checksum_type" fields.
                    if not os.path.isfile(dataset_file) or (
                            file_size
                            and os.path.getsize(dataset_file) != file_size):
                        outdated_file_list.append(file_rec)

                if outdated_file_list:
                    with monitor.starting('Sync ' + self.id,
                                          len(outdated_file_list)):
                        bytes_to_download = sum(
                            [file_rec[3] for file_rec in outdated_file_list])
                        dl_stat = _DownloadStatistics(bytes_to_download)

                        file_number = 1

                        for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                            dataset_file = os.path.join(local_path, filename)
                            sub_monitor = monitor.child(work=1.0)

                            # noinspection PyUnusedLocal
                            def reporthook(block_number, read_size,
                                           total_file_size):
                                dl_stat.handle_chunk(read_size)
                                sub_monitor.progress(work=read_size,
                                                     msg=str(dl_stat))

                            sub_monitor_msg = "file %d of %d" % (
                                file_number, len(outdated_file_list))
                            with sub_monitor.starting(sub_monitor_msg,
                                                      file_size):
                                urllib.request.urlretrieve(
                                    url[protocol],
                                    filename=dataset_file,
                                    reporthook=reporthook)
                            file_number += 1
                            local_ds.add_dataset(
                                os.path.join(local_id, filename),
                                (coverage_from, coverage_to))

                            if do_update_of_verified_time_coverage_start_once:
                                verified_time_coverage_start = coverage_from
                                do_update_of_verified_time_coverage_start_once = False
                            verified_time_coverage_end = coverage_to
        except OSError as e:
            raise DataAccessError(
                "Copying remote data source failed: {}".format(e),
                source=self) from e
        local_ds.meta_info['temporal_coverage_start'] = TimeLike.format(
            verified_time_coverage_start)
        local_ds.meta_info['temporal_coverage_end'] = TimeLike.format(
            verified_time_coverage_end)
        local_ds.save(True)
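
In the HTTP branch above, progress reporting relies on the reporthook callback of urllib.request.urlretrieve, which is invoked once per downloaded block. Below is a minimal sketch of that mechanism with a plain console printout in place of the Monitor/_DownloadStatistics pair; the URL and file name are placeholders, not real endpoints.

import urllib.request


def reporthook(block_number, block_size, total_file_size):
    # Called by urlretrieve after each block; all sizes are in bytes.
    downloaded = block_number * block_size  # approximate; the last block may be smaller
    if total_file_size > 0:
        fraction = min(downloaded / total_file_size, 1.0)
        print('\rdownloaded {:.1%}'.format(fraction), end='')


url = 'https://example.com/data/some_file.nc'  # placeholder URL
urllib.request.urlretrieve(url, filename='some_file.nc', reporthook=reporthook)
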
Example #31
0
    def make_local(self,
                   local_name: str,
                   local_id: str = None,
                   time_range: TimeRangeLike.TYPE = None,
                   region: PolygonLike.TYPE = None,
                   var_names: VarNamesLike.TYPE = None,
                   monitor: Monitor = Monitor.NONE) -> Optional[DataSource]:

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(var_names) if var_names else None

        ds_id = local_name
        title = local_id

        local_store = DATA_STORE_REGISTRY.get_data_store('local')
        if not local_store:
            add_to_data_store_registry()
            local_store = DATA_STORE_REGISTRY.get_data_store('local')
        if not local_store:
            raise ValueError('Cannot initialize `local` DataStore')

        uuid = LocalDataStore.generate_uuid(ref_id=self.id,
                                            time_range=time_range,
                                            region=region,
                                            var_names=var_names)

        if not ds_id or len(ds_id) == 0:
            ds_id = "local.{}.{}".format(self.id, uuid)
            existing_ds_list = local_store.query(ds_id=ds_id)
            if len(existing_ds_list) == 1:
                return existing_ds_list[0]
        else:
            existing_ds_list = local_store.query(ds_id='local.%s' % ds_id)
            if len(existing_ds_list) == 1:
                if existing_ds_list[0].meta_info.get('uuid', None) == uuid:
                    return existing_ds_list[0]
                else:
                    raise ValueError(
                        'Datastore {} already contains dataset {}'.format(
                            local_store.id, ds_id))

        local_meta_info = self.meta_info.copy()
        local_meta_info['ref_uuid'] = local_meta_info.get('uuid', None)
        local_meta_info['uuid'] = uuid

        local_ds = local_store.create_data_source(ds_id,
                                                  title=title,
                                                  time_range=time_range,
                                                  region=region,
                                                  var_names=var_names,
                                                  meta_info=local_meta_info,
                                                  lock_file=True)
        if local_ds:
            if not local_ds.is_complete:
                try:
                    self._make_local(local_ds,
                                     time_range,
                                     region,
                                     var_names,
                                     monitor=monitor)
                except Cancellation as c:
                    local_store.remove_data_source(local_ds)
                    raise c
                except Exception as e:
                    if local_ds.is_empty:
                        local_store.remove_data_source(local_ds)
                    raise e

            if local_ds.is_empty:
                local_store.remove_data_source(local_ds)
                return None

            local_store.register_ds(local_ds)
            return local_ds
        else:
            return None
Example #32
0
    def test_accepts(self):
        self.assertTrue(PolygonLike.accepts("0.0,0.0,1.1,1.1"))
        self.assertTrue(PolygonLike.accepts("0.0, 0.0, 1.1, 1.1"))

        coords = [(0.0, 0.0), (0.0, 1.0), (1.0, 1.0), (1.0, 0.0)]
        pol = Polygon(coords)
        self.assertTrue(PolygonLike.accepts(coords))
        self.assertTrue(PolygonLike.accepts(pol))
        self.assertTrue(PolygonLike.accepts(pol.wkt))

        self.assertFalse(PolygonLike.accepts("0.0,aaa,1.1,1.1"))
        self.assertFalse(PolygonLike.accepts("0.0, aaa, 1.1, 1.1"))

        coords = [(0.0, 0.0), (0.0, 1.0), (1.0, 'aaa'), (1.0, 0.0)]
        self.assertFalse(PolygonLike.accepts(coords))

        coords = [(0.0, 0.0), (0.0, 1.0), 'Guten Morgen, Berlin!', (1.0, 0.0)]
        self.assertFalse(PolygonLike.accepts(coords))

        invalid = Polygon([(0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0)])
        self.assertFalse(PolygonLike.accepts(invalid))

        wkt = 'MULTIPOLYGON()'
        self.assertFalse(PolygonLike.accepts(wkt))

        invalid = 'Something_something_in_the_month_of_May'
        self.assertFalse(PolygonLike.accepts(invalid))

        self.assertFalse(PolygonLike.accepts(1.0))
Example #33
0
    def test_accepts(self):
        self.assertTrue(PolygonLike.accepts(""))
        self.assertTrue(PolygonLike.accepts(" \t"))
        self.assertTrue(PolygonLike.accepts("0.0,0.0,1.1,1.1"))
        self.assertTrue(PolygonLike.accepts("0.0, 0.0, 1.1, 1.1"))

        coords = [(10.4, 20.2), (30.8, 20.2), (30.8, 40.8), (10.4, 40.8)]
        pol = Polygon(coords)
        self.assertTrue(PolygonLike.accepts(coords))
        self.assertTrue(PolygonLike.accepts(pol))
        self.assertTrue(PolygonLike.accepts(pol.wkt))
        self.assertTrue(PolygonLike.accepts(pol.bounds))

        self.assertFalse(PolygonLike.accepts("0.0,aaa,1.1,1.1"))
        self.assertFalse(PolygonLike.accepts("0.0, aaa, 1.1, 1.1"))
        self.assertFalse(
            PolygonLike.accepts([(0.0, 0.0), (0.0, 1.0), (1.0, 'aaa'),
                                 (1.0, 0.0)]))
        self.assertFalse(
            PolygonLike.accepts([(0.0, 0.0), (0.0, 1.0),
                                 'Guten Morgen, Berlin!', (1.0, 0.0)]))
        self.assertFalse(
            PolygonLike.accepts(
                Polygon([(0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0)])))
        self.assertFalse(PolygonLike.accepts('MULTIPOLYGON()'))
        self.assertFalse(PolygonLike.accepts('Something_in_the_month_of_May'))
        self.assertFalse(PolygonLike.accepts(1.0))
Example #34
0
    def test_format(self):
        coords = [(0.0, 0.0), (0.0, 1.0), (1.0, 1.0), (1.0, 0.0)]
        pol = PolygonLike.convert(coords)
        self.assertEqual('POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))',
                         PolygonLike.format(pol))
Example #35
0
    def test_json(self):
        self.assertEqual(PolygonLike.from_json("-10, -10, 10, 10"),
                         "-10, -10, 10, 10")
Example #36
0
def subset_spatial(ds: xr.Dataset,
                   region: PolygonLike.TYPE,
                   mask: bool = True) -> xr.Dataset:
    """
    Extract a spatial subset of the given dataset.

    :param ds: Dataset to subset
    :param region: Spatial region to subset
    :param mask: If True, values that fall within the bounding box of the polygon
                 but outside the polygon itself are masked with NaN.
    :return: Subset dataset
    """
    # Get the bounding box
    region = PolygonLike.convert(region)
    lon_min, lat_min, lon_max, lat_max = region.bounds

    # Validate the bounding box
    if (not (-90 <= lat_min <= 90)) or \
            (not (-90 <= lat_max <= 90)) or \
            (not (-180 <= lon_min <= 180)) or \
            (not (-180 <= lon_max <= 180)):
        raise ValueError('Provided polygon extends outside of geospatial'
                         ' bounds: latitude [-90;90], longitude [-180;180]')

    simple_polygon = False
    if region.equals(box(lon_min, lat_min, lon_max, lat_max)):
        # Don't do the computationally intensive masking if the provided
        # region is a simple box-polygon, for which there will be nothing to
        # mask.
        simple_polygon = True

    crosses_antimeridian = _crosses_antimeridian(region)

    if crosses_antimeridian and not simple_polygon:
        # Unlikely but plausible
        raise NotImplementedError('Spatial subset over the International Date'
                                  ' Line is currently implemented for simple,'
                                  ' rectangular polygons only.')

    if crosses_antimeridian:
        # Shapely messes up longitudes if the polygon crosses the antimeridian
        lon_min, lon_max = lon_max, lon_min

        # Can't perform a simple selection with slice, hence we have to
        # construct an appropriate longitude indexer for selection
        lon_left_of_idl = slice(lon_min, 180)
        lon_right_of_idl = slice(-180, lon_max)
        lon_index = xr.concat((ds.lon.sel(lon=lon_right_of_idl),
                               ds.lon.sel(lon=lon_left_of_idl)), dim='lon')
        indexers = {'lon': lon_index, 'lat': slice(lat_min, lat_max)}
        retset = ds.sel(**indexers)

        if mask:
            # Preserve the original longitude dimension, masking elements that
            # do not belong to the polygon with NaN.
            return retset.reindex_like(ds.lon)
        else:
            # Return the dataset with no NaNs and with a disjoint longitude
            # dimension
            return retset

    if not mask or simple_polygon:
        # The polygon doesn't cross the IDL, it is a simple box -> Use a simple slice
        lat_slice = slice(lat_min, lat_max)
        lon_slice = slice(lon_min, lon_max)
        indexers = {'lat': lat_slice, 'lon': lon_slice}
        return ds.sel(**indexers)

    # Create the mask array. The result of this is a lon/lat DataArray where
    # all values falling in the region or on its boundary are denoted with True
    # and all the rest with False
    lonm, latm = np.meshgrid(ds.lon.values, ds.lat.values)
    mask = np.array([Point(lon, lat).intersects(region) for lon, lat in
                     zip(lonm.ravel(), latm.ravel())], dtype=bool)
    mask = xr.DataArray(mask.reshape(lonm.shape),
                        coords={'lon': ds.lon, 'lat': ds.lat},
                        dims=['lat', 'lon'])

    # Mask values outside the polygon with NaN, crop the dataset
    return ds.where(mask, drop=True)
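
A short usage sketch for subset_spatial: a plain lon/lat box takes the fast slicing path, while a non-rectangular polygon triggers the point-in-polygon masking described above. The dataset, the polygon and the import path are illustrative assumptions.

import numpy as np
import xarray as xr

from cate.ops.subset import subset_spatial  # assumed import path

# Hypothetical 1-degree global dataset.
lat = np.arange(-89.5, 90.0, 1.0)
lon = np.arange(-179.5, 180.0, 1.0)
ds = xr.Dataset({'sm': (('lat', 'lon'), np.random.rand(lat.size, lon.size))},
                coords={'lat': lat, 'lon': lon})

# Simple box region ("lon_min, lat_min, lon_max, lat_max"): plain slicing, no masking.
box_subset = subset_spatial(ds, '10, 20, 30, 40')

# Non-rectangular polygon (WKT): cells inside the bounding box but outside the
# polygon are masked with NaN, because mask=True is the default.
masked_subset = subset_spatial(ds, 'POLYGON ((10 20, 30 20, 30 40, 10 20))')
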