Example No. 1
    def open_dataset(self,
                     time_range: TimeRangeLike.TYPE = None,
                     region: PolygonLike.TYPE = None,
                     var_names: VarNamesLike.TYPE = None,
                     protocol: str = None) -> Any:
        time_range = TimeRangeLike.convert(time_range) if time_range else None
        var_names = VarNamesLike.convert(var_names) if var_names else None

        selected_file_list = self._find_files(time_range)
        if not selected_file_list:
            msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(self.id)
            if time_range is not None:
                msg += ' in given time range {}'.format(TimeRangeLike.format(time_range))
            raise DataAccessError(msg)

        files = self._get_urls_list(selected_file_list, _ODP_PROTOCOL_OPENDAP)
        try:
            ds = open_xarray_dataset(files)
            if region:
                ds = normalize_impl(ds)
                ds = subset_spatial_impl(ds, region)
            if var_names:
                ds = ds.drop([var_name for var_name in ds.data_vars.keys() if var_name not in var_names])
            return ds

        except OSError as e:
            if time_range:
                raise DataAccessError("Cannot open remote dataset for time range {}:\n"
                                      "{}"
                                      .format(TimeRangeLike.format(time_range), e), source=self) from e
            else:
                raise DataAccessError("Cannot open remote dataset:\n"
                                      "{}"
                                      .format(TimeRangeLike.format(time_range), e), source=self) from e
Example No. 2
 def open_dataset(self,
                  time_range: TimeRangeLike.TYPE = None,
                  region: PolygonLike.TYPE = None,
                  var_names: VarNamesLike.TYPE = None,
                  protocol: str = None) -> Any:
     time_range = TimeRangeLike.convert(time_range) if time_range else None
     if region:
         region = PolygonLike.convert(region)
     if var_names:
         var_names = VarNamesLike.convert(var_names)
     paths = []
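      # Collect the file paths whose time stamps or time intervals fall within
      # the requested time range (or all known files if no range is given).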
     if time_range:
         time_series = list(self._files.values())
         file_paths = list(self._files.keys())
         for i in range(len(time_series)):
             if time_series[i]:
                 if isinstance(time_series[i], Tuple) and \
                         time_series[i][0] >= time_range[0] and \
                         time_series[i][1] <= time_range[1]:
                     paths.extend(self._resolve_file_path(file_paths[i]))
                 elif isinstance(
                         time_series[i], datetime
                 ) and time_range[0] <= time_series[i] < time_range[1]:
                     paths.extend(self._resolve_file_path(file_paths[i]))
     else:
         for file in self._files.items():
             paths.extend(self._resolve_file_path(file[0]))
     if paths:
         paths = sorted(set(paths))
         try:
             ds = open_xarray_dataset(paths)
             if region:
                 ds = normalize_impl(ds)
                 ds = subset_spatial_impl(ds, region)
             if var_names:
                 ds = ds.drop([
                     var_name for var_name in ds.data_vars.keys()
                     if var_name not in var_names
                 ])
             return ds
         except OSError as e:
             if time_range:
                 raise DataAccessError(
                     "Cannot open local dataset for time range {}:\n"
                     "{}".format(TimeRangeLike.format(time_range), e),
                     source=self) from e
             else:
                 raise DataAccessError("Cannot open local dataset:\n"
                                       "{}".format(e),
                                       source=self) from e
     else:
         if time_range:
             raise DataAccessError(
                 "No local datasets available for\nspecified time range {}".
                 format(TimeRangeLike.format(time_range)),
                 source=self)
         else:
             raise DataAccessError("No local datasets available",
                                   source=self)
Example No. 3
 def _load_json_file(json_path: str):
     if os.path.isfile(json_path):
         try:
             with open(json_path) as fp:
                 return json.load(fp=fp) or {}
         except json.decoder.JSONDecodeError as e:
             raise DataAccessError(
                 "Cannot load data source config from {}".format(
                     json_path)) from e
     else:
         raise DataAccessError(
             "Data source config does not exists: {}".format(json_path))
Example No. 4
def _fetch_solr_json(base_url,
                     query_args,
                     offset=0,
                     limit=3500,
                     timeout=10,
                     monitor: Monitor = Monitor.NONE):
    """
    Return JSON value read from paginated Solr web-service.
    """
    combined_json_dict = None
    num_found = -1
    # we don't know ahead of time how many requests are necessary
    with monitor.starting("Loading", 10):
        while True:
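            # Request one page of Solr results and merge its 'docs' into the
            # combined response until all 'numFound' records have been read.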
            monitor.progress(work=1)
            paging_query_args = dict(query_args or {})
            # noinspection PyArgumentList
            paging_query_args.update(offset=offset,
                                     limit=limit,
                                     format='application/solr+json')
            url = base_url + '?' + urllib.parse.urlencode(paging_query_args)
            try:
                with urllib.request.urlopen(url, timeout=timeout) as response:
                    json_text = response.read()
                    json_dict = json.loads(json_text.decode('utf-8'))
                    if num_found == -1:
                        num_found = json_dict.get('response',
                                                  {}).get('numFound', 0)
                    if not combined_json_dict:
                        combined_json_dict = json_dict
                        if num_found < limit:
                            break
                    else:
                        docs = json_dict.get('response', {}).get('docs', [])
                        combined_json_dict.get('response',
                                               {}).get('docs', []).extend(docs)
                        if num_found < offset + limit:
                            break
            except (urllib.error.HTTPError, urllib.error.URLError) as e:
                raise DataAccessError(
                    "Downloading CCI Open Data Portal index failed: {}\n{}".
                    format(e, base_url)) from e
            except socket.timeout:
                raise DataAccessError(
                    "Downloading CCI Open Data Portal index failed: connection timeout\n{}"
                    .format(base_url))
            offset += limit
    return combined_json_dict
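For reference only, a direct call to _fetch_solr_json could look like the sketch below; the query arguments mirror those passed through _load_or_fetch_json in Example No. 5, and the limit value is purely illustrative.

query_args = dict(type='Dataset', replica='false', latest='true', project='esacci')
solr_json = _fetch_solr_json(_ESGF_CEDA_URL, query_args, limit=1000)
num_found = solr_json.get('response', {}).get('numFound', 0)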
Example No. 5
    def _load_index(self):
        try:
            esgf_json_dict = _load_or_fetch_json(
                _fetch_solr_json,
                fetch_json_args=[
                    _ESGF_CEDA_URL,
                    dict(type='Dataset',
                         replica='false',
                         latest='true',
                         project='esacci')
                ],
                cache_used=self._index_cache_used,
                cache_dir=get_metadata_store_path(),
                cache_json_filename='dataset-list.json',
                cache_timestamp_filename='dataset-list-timestamp.json',
                cache_expiration_days=self._index_cache_expiration_days)

            cci_catalogue_service = EsaCciCatalogueService(_CSW_CEDA_URL)
            csw_json_dict = _load_or_fetch_json(
                cci_catalogue_service.getrecords,
                fetch_json_args=[],
                cache_used=self._index_cache_used,
                cache_dir=get_metadata_store_path(),
                cache_json_filename='catalogue.json',
                cache_timestamp_filename='catalogue-timestamp.json',
                cache_expiration_days=self._index_cache_expiration_days)
        except DataAccessError as e:
            raise DataAccessError(
                "Cannot download CCI Open Data Portal ECV index:\n{}".format(
                    e),
                source=self) from e

        self._csw_data = csw_json_dict
        self._esgf_data = esgf_json_dict
Example No. 6
    def test_with_source(self):
        store = SimpleDataStore('hihi', [])
        try:
            raise DataAccessError("haha", source=store)
        except DataAccessError as e:
            self.assertEqual(str(e), 'Data store "hihi": haha')
            self.assertIs(e.source, store)
            self.assertIs(e.cause, None)

        source = SimpleDataSource('hehe')
        try:
            raise DataAccessError("haha", source=source)
        except DataAccessError as e:
            self.assertEqual(str(e), 'Data source "hehe": haha')
            self.assertIs(e.source, source)
            self.assertIs(e.cause, None)
Example No. 7
 def test_plain(self):
     try:
         raise DataAccessError("haha")
     except DataAccessError as e:
         self.assertEqual(str(e), "haha")
         self.assertEqual(e.source, None)
         self.assertEqual(e.cause, None)
Example No. 8
 def test_with_cause(self):
     e1 = ValueError("a > 5")
     try:
         raise DataAccessError("hoho") from e1
     except DataAccessError as e2:
         self.assertEqual(str(e2), "hoho")
         self.assertIs(e2.source, None)
         self.assertIs(e2.cause, e1)
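The three tests above only exercise the public surface of DataAccessError: the message, the optional source keyword argument, and the cause set via raise ... from. A minimal sketch consistent with those tests follows; it is not the library's actual implementation, and the is_data_store attribute used to pick the message prefix is an assumption.

class DataAccessError(Exception):
    """Minimal sketch consistent with the tests above; not the library's actual code."""

    def __init__(self, message: str, source=None):
        self._source = source
        if source is not None:
            # Assumption: the 'Data store'/'Data source' prefix is derived from
            # the kind and id of the given source object.
            kind = 'Data store' if getattr(source, 'is_data_store', False) else 'Data source'
            message = '{} "{}": {}'.format(kind, source.id, message)
        super().__init__(message)

    @property
    def source(self):
        return self._source

    @property
    def cause(self):
        # 'raise DataAccessError(...) from e' stores the original exception here.
        return self.__cause__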
Example No. 9
 def _save_data_source(self, data_source):
     json_dict = data_source.to_json_dict()
     dump_kwargs = dict(indent='  ', default=self._json_default_serializer)
     file_name = os.path.join(self._store_dir, data_source.id + '.json')
     try:
         with open(file_name, 'w') as fp:
             json.dump(json_dict, fp, **dump_kwargs)
     except EnvironmentError as e:
         raise DataAccessError("Couldn't save data source config file {}\n"
                               "{}".format(file_name, e),
                               source=self) from e
Example No. 10
    def _make_local(self,
                    local_ds: LocalDataSource,
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id
        time_range = TimeRangeLike.convert(time_range)
        region = PolygonLike.convert(region)
        var_names = VarNamesLike.convert(var_names)

        time_range, region, var_names = self._apply_make_local_fixes(
            time_range, region, var_names)

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = compression_level > 0

        do_update_of_verified_time_coverage_start_once = True
        verified_time_coverage_start = None
        verified_time_coverage_end = None

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

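        # Spatial subsetting or variable selection requires OPeNDAP access;
        # otherwise the full files are downloaded via plain HTTP.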
        if region or var_names:
            protocol = _ODP_PROTOCOL_OPENDAP
        else:
            protocol = _ODP_PROTOCOL_HTTP

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        selected_file_list = self._find_files(time_range)
        if not selected_file_list:
            msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(
                self.id)
            if time_range is not None:
                msg += ' in given time range {}'.format(
                    TimeRangeLike.format(time_range))
            raise DataAccessError(msg)
        try:
            if protocol == _ODP_PROTOCOL_OPENDAP:
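                # OPeNDAP: open each remote granule with xarray, optionally subset
                # it by variables and region, and write it to a local NetCDF file.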

                do_update_of_variables_meta_info_once = True
                do_update_of_region_meta_info_once = True

                files = self._get_urls_list(selected_file_list, protocol)
                monitor.start('Sync ' + self.id, total_work=len(files))
                for idx, dataset_uri in enumerate(files):
                    child_monitor = monitor.child(work=1)

                    file_name = os.path.basename(dataset_uri)
                    local_filepath = os.path.join(local_path, file_name)

                    time_coverage_start = selected_file_list[idx][1]
                    time_coverage_end = selected_file_list[idx][2]

                    try:
                        child_monitor.start(label=file_name, total_work=1)

                        remote_dataset = xr.open_dataset(dataset_uri)

                        if var_names:
                            remote_dataset = remote_dataset.drop([
                                var_name for var_name in
                                remote_dataset.data_vars.keys()
                                if var_name not in var_names
                            ])

                        if region:
                            remote_dataset = normalize_impl(remote_dataset)
                            remote_dataset = subset_spatial_impl(
                                remote_dataset, region)
                            geo_lon_min, geo_lat_min, geo_lon_max, geo_lat_max = region.bounds

                            remote_dataset.attrs[
                                'geospatial_lat_min'] = geo_lat_min
                            remote_dataset.attrs[
                                'geospatial_lat_max'] = geo_lat_max
                            remote_dataset.attrs[
                                'geospatial_lon_min'] = geo_lon_min
                            remote_dataset.attrs[
                                'geospatial_lon_max'] = geo_lon_max
                            if do_update_of_region_meta_info_once:
                                local_ds.meta_info['bbox_maxx'] = geo_lon_max
                                local_ds.meta_info['bbox_minx'] = geo_lon_min
                                local_ds.meta_info['bbox_maxy'] = geo_lat_max
                                local_ds.meta_info['bbox_miny'] = geo_lat_min
                                do_update_of_region_meta_info_once = False

                        if compression_enabled:
                            for sel_var_name in remote_dataset.variables.keys():
                                remote_dataset.variables.get(
                                    sel_var_name).encoding.update(
                                        encoding_update)

                        remote_dataset.to_netcdf(local_filepath)

                        child_monitor.progress(work=1,
                                               msg=str(time_coverage_start))
                    finally:
                        if do_update_of_variables_meta_info_once:
                            variables_info = local_ds.meta_info.get(
                                'variables', [])
                            local_ds.meta_info['variables'] = [
                                var_info for var_info in variables_info
                                if var_info.get('name') in remote_dataset.
                                variables.keys() and var_info.get(
                                    'name') not in remote_dataset.dims.keys()
                            ]
                            do_update_of_variables_meta_info_once = False

                        local_ds.add_dataset(
                            os.path.join(local_id, file_name),
                            (time_coverage_start, time_coverage_end))

                        if do_update_of_verified_time_coverage_start_once:
                            verified_time_coverage_start = time_coverage_start
                            do_update_of_verified_time_coverage_start_once = False
                        verified_time_coverage_end = time_coverage_end
                    child_monitor.done()
            else:
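                # HTTP: download only the files that are missing locally or whose
                # size does not match the size reported by the remote file record.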
                outdated_file_list = []
                for file_rec in selected_file_list:
                    filename, _, _, file_size, url = file_rec
                    dataset_file = os.path.join(local_path, filename)
                    # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                    # ... outdated or incomplete or corrupted.
                    # JSON also includes "checksum" and "checksum_type" fields.
                    if not os.path.isfile(dataset_file) or (
                            file_size
                            and os.path.getsize(dataset_file) != file_size):
                        outdated_file_list.append(file_rec)

                if outdated_file_list:
                    with monitor.starting('Sync ' + self.id,
                                          len(outdated_file_list)):
                        bytes_to_download = sum(
                            [file_rec[3] for file_rec in outdated_file_list])
                        dl_stat = _DownloadStatistics(bytes_to_download)

                        file_number = 1

                        for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                            dataset_file = os.path.join(local_path, filename)
                            sub_monitor = monitor.child(work=1.0)

                            # noinspection PyUnusedLocal
                            def reporthook(block_number, read_size,
                                           total_file_size):
                                dl_stat.handle_chunk(read_size)
                                sub_monitor.progress(work=read_size,
                                                     msg=str(dl_stat))

                            sub_monitor_msg = "file %d of %d" % (
                                file_number, len(outdated_file_list))
                            with sub_monitor.starting(sub_monitor_msg,
                                                      file_size):
                                urllib.request.urlretrieve(
                                    url[protocol],
                                    filename=dataset_file,
                                    reporthook=reporthook)
                            file_number += 1
                            local_ds.add_dataset(
                                os.path.join(local_id, filename),
                                (coverage_from, coverage_to))

                            if do_update_of_verified_time_coverage_start_once:
                                verified_time_coverage_start = coverage_from
                                do_update_of_verified_time_coverage_start_once = False
                            verified_time_coverage_end = coverage_to
        except OSError as e:
            raise DataAccessError(
                "Copying remote data source failed: {}".format(e),
                source=self) from e
        local_ds.meta_info['temporal_coverage_start'] = TimeLike.format(
            verified_time_coverage_start)
        local_ds.meta_info['temporal_coverage_end'] = TimeLike.format(
            verified_time_coverage_end)
        local_ds.save(True)
Example No. 11
def _load_or_fetch_json(fetch_json_function,
                        fetch_json_args: list = None,
                        fetch_json_kwargs: dict = None,
                        cache_used: bool = False,
                        cache_dir: str = None,
                        cache_json_filename: str = None,
                        cache_timestamp_filename: str = None,
                        cache_expiration_days: float = 1.0) -> Sequence:
    """
    Return (JSON) value of fetch_json_function or return value of a cached JSON file.
    """
    json_obj = None
    cache_json_file = None

    if cache_used:
        if cache_dir is None:
            raise ValueError(
                'if cache_used argument is True, cache_dir argument must not be None'
            )
        if cache_json_filename is None:
            raise ValueError(
                'if cache_used argument is True, cache_json_filename argument must not be None'
            )
        if cache_timestamp_filename is None:
            raise ValueError(
                'if cache_used argument is True, cache_timestamp_filename argument must not be None'
            )
        if cache_expiration_days is None:
            raise ValueError(
                'if cache_used argument is True, cache_expiration_days argument must not be None'
            )

        cache_json_file = os.path.join(cache_dir, cache_json_filename)
        cache_timestamp_file = os.path.join(cache_dir,
                                            cache_timestamp_filename)

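        # Default timestamp far in the past so that a missing timestamp file
        # forces a fresh fetch below.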
        timestamp = datetime(year=2000, month=1, day=1)
        if os.path.exists(cache_timestamp_file):
            with open(cache_timestamp_file) as fp:
                timestamp_text = fp.read()
                timestamp = datetime.strptime(timestamp_text,
                                              _TIMESTAMP_FORMAT)

        time_diff = datetime.now() - timestamp
        time_diff_days = time_diff.days + time_diff.seconds / 3600. / 24.
        if time_diff_days < cache_expiration_days:
            if os.path.exists(cache_json_file):
                with open(cache_json_file) as fp:
                    json_text = fp.read()
                    json_obj = json.loads(json_text)

    if json_obj is None:
        # noinspection PyArgumentList
        try:
            # noinspection PyArgumentList
            json_obj = fetch_json_function(*(fetch_json_args or []),
                                           **(fetch_json_kwargs or {}))
            if cache_used:
                os.makedirs(cache_dir, exist_ok=True)
                # noinspection PyUnboundLocalVariable
                with open(cache_json_file, 'w') as fp:
                    fp.write(json.dumps(json_obj, indent='  '))
                # noinspection PyUnboundLocalVariable
                with open(cache_timestamp_file, 'w') as fp:
                    fp.write(datetime.utcnow().strftime(_TIMESTAMP_FORMAT))
        except Exception as e:
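            # If the fetch fails but a (possibly stale) cached copy exists,
            # fall back to it instead of raising.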
            if cache_json_file and os.path.exists(cache_json_file):
                with open(cache_json_file) as fp:
                    json_text = fp.read()
                    json_obj = json.loads(json_text)
            else:
                if isinstance(e, DataAccessError):
                    raise DataAccessError(
                        "Cannot fetch information from CCI Open Data Portal server."
                    ) from e
                else:
                    raise e

    return json_obj
Example No. 12
 def test_plain(self):
     try:
         raise DataAccessError("haha")
     except DataAccessError as e:
         self.assertEqual(str(e), "haha")
         self.assertIsInstance(e, Exception)
Example No. 13
    def create_data_source(self,
                           data_source_id: str,
                           region: PolygonLike.TYPE = None,
                           title: str = None,
                           time_range: TimeRangeLike.TYPE = None,
                           var_names: VarNamesLike.TYPE = None,
                           meta_info: OrderedDict = None,
                           lock_file: bool = False):
        self._init_data_sources()

        if title:
            if not meta_info:
                meta_info = OrderedDict()
            meta_info['title'] = title

        if not data_source_id.startswith('%s.' % self.id):
            data_source_id = '%s.%s' % (self.id, data_source_id)

        lock_filename = '{}.lock'.format(data_source_id)
        lock_filepath = os.path.join(self._store_dir, lock_filename)
        pid = os.getpid()
        create_time = int(psutil.Process(pid).create_time() * 1000000)

        data_source = None
        for ds in self._data_sources:
            if ds.id == data_source_id:
                if lock_file and os.path.isfile(lock_filepath):
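                    # Another process may be creating this data source; the lock
                    # file holds its '<pid>:<create_time>' record (see below,
                    # where the lock file is written).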
                    with open(lock_filepath, 'r') as lock_file:
                        writer_pid = lock_file.readline()
                        if writer_pid:
                            writer_create_time = -1
                            if ":" in writer_pid:
                                writer_pid, writer_timestamp = [
                                    int(val) for val in writer_pid.split(":")
                                ]
                            else:
                                writer_pid = int(writer_pid)
                                writer_timestamp = writer_create_time
                            if psutil.pid_exists(
                                    writer_pid) and writer_pid != pid:
                                if writer_timestamp > writer_create_time:
                                    writer_create_time = int(
                                        psutil.Process(
                                            writer_pid).create_time() *
                                        1000000)
                                if writer_create_time == writer_timestamp:
                                    raise DataAccessError(
                                        'Data source "{}" is currently being created by another '
                                        'process (pid:{})'.format(
                                            ds.id, writer_pid),
                                        source=self)
                            # ds.temporal_coverage() == time_range and
                            if ds.spatial_coverage() == region \
                                    and ds.variables_info == var_names:
                                data_source = ds
                                data_source.set_completed(False)
                                break
                raise DataAccessError(
                    'Data source "{}" already exists.'.format(data_source_id),
                    source=self)
        if not data_source:
            data_source = LocalDataSource(data_source_id,
                                          files=[],
                                          data_store=self,
                                          spatial_coverage=region,
                                          variables=var_names,
                                          temporal_coverage=time_range,
                                          meta_info=meta_info,
                                          status=DataSourceStatus.PROCESSING)
            data_source.set_completed(False)
            self._save_data_source(data_source)

        if lock_file:
            with open(lock_filepath, 'w') as lock_file:
                lock_file.write("{}:{}".format(pid, create_time))

        return data_source