def _download_stations() -> Tuple[BytesIO, int]:
    """
    Download the ECCC station list, using a Google Drive mirror with a
    GitHub-hosted copy as fallback.

    :return: CSV payload, source identifier
    """
    gdrive_url = "https://drive.google.com/uc?id=1HDRnj41YBWpMioLPwAFiLlK4SK8NV72C"
    http_url = (
        "https://github.com/earthobservations/testdata/raw/main/ftp.tor.ec.gc.ca/Pub/"
        "Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv.gz"
    )

    payload = None
    source = None
    try:
        payload = download_file(gdrive_url, CacheExpiry.METAINDEX)
        source = 0
    except Exception:
        log.exception(f"Unable to access Google drive server at {gdrive_url}")

        # Fall back to different source.
        try:
            response = download_file(http_url, CacheExpiry.METAINDEX)
            with gzip.open(response, mode="rb") as f:
                payload = BytesIO(f.read())
            source = 1
        except Exception:
            log.exception(f"Unable to access HTTP server at {http_url}")

    if payload is None:
        raise FailedDownload("Unable to acquire ECCC station list")

    return payload, source

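# A minimal usage sketch (hypothetical caller, not part of the module): unpack the
# returned payload and source identifier and read the CSV with pandas. The layout
# of the ECCC "Station Inventory EN" file (preamble lines, encoding) is an
# assumption and may need adjusting.
import pandas as pd

payload, source = _download_stations()
stations = pd.read_csv(payload)  # assumption: adjust skiprows if preamble lines precede the header
print(f"source={source}, number of stations={len(stations)}")
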
def _collect_station_parameter(self, station_id: str, parameter: Enum, dataset: Enum) -> pd.DataFrame:
    """
    Collect data for one station and parameter from the REST API: the station
    endpoint lists the available measures, the matching measure is resolved and
    its readings are downloaded.

    :param station_id: station id of the station being queried
    :param parameter: parameter being queried
    :param dataset: dataset being queried
    :return: pandas DataFrame with data
    """
    endpoint = self._base_url.format(station_id=station_id)

    payload = download_file(endpoint, CacheExpiry.NO_CACHE)

    measures_list = json.loads(payload.read())["items"][0]["measures"]
    # A single measure is returned as a plain dict, so normalize it to a list.
    if isinstance(measures_list, dict):
        measures_list = [measures_list]

    measures_list = pd.Series(measures_list)
    measures_list = measures_list[
        measures_list.map(
            lambda measure: measure["parameterName"].lower().replace(" ", "")
            == parameter.value.lower().replace("_", "")
        )
    ]

    try:
        # Use positional access: after filtering, the first match may not carry label 0.
        measure_dict = measures_list.iloc[0]
    except IndexError:
        return pd.DataFrame()

    values_endpoint = f"{measure_dict['@id']}/readings.json"

    payload = download_file(values_endpoint, CacheExpiry.FIVE_MINUTES)

    readings = json.loads(payload.read())["items"]

    df = pd.DataFrame.from_records(readings)

    return df.loc[:, ["dateTime", "value"]].rename(
        columns={"dateTime": Columns.DATE.value, "value": Columns.VALUE.value}
    )

def _collect_station_parameter(self, station_id: str, parameter: Enum, dataset: Enum) -> pd.DataFrame: """ Method to collect data for station parameter from WSV Pegelonline following its open REST-API at https://pegelonline.wsv.de/webservices/rest-api/v2/stations/ :param station_id: station_id string :param parameter: parameter enumeration :param dataset: dataset enumeration :return: pandas DataFrame with data """ url = self._endpoint.format(station_id=station_id, parameter=parameter.value) try: response = download_file(url, CacheExpiry.NO_CACHE) except FileNotFoundError: return pd.DataFrame() df = pd.read_json(response) df = df.rename(columns={ "timestamp": Columns.DATE.value, "value": Columns.VALUE.value }) df[Columns.PARAMETER.value] = parameter.value.lower() return df
def read_pdf(url):
    """Download a PDF document from the given URL and return its text content."""
    text = StringIO()
    response = download_file(url, CacheExpiry.NO_CACHE)
    # Uses the legacy PyPDF2 (< 3.0) API: PdfFileReader, numPages, getPage and
    # extractText were removed in later releases.
    pdf = PyPDF2.PdfFileReader(response)
    for page_number in range(pdf.numPages):
        page = pdf.getPage(page_number)
        result = page.extractText()
        text.write(result)
    return text.getvalue()

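# A minimal sketch (an assumption, not part of the module) of the same routine
# written against the current pypdf / PyPDF2 >= 3.0 API, where the legacy names
# used above were removed; download_file and CacheExpiry are the helpers used
# throughout this module.
def read_pdf_modern(url):
    from pypdf import PdfReader  # or: from PyPDF2 import PdfReader (PyPDF2 >= 3.0)

    response = download_file(url, CacheExpiry.NO_CACHE)
    reader = PdfReader(response)
    # reader.pages replaces numPages/getPage, extract_text() replaces extractText().
    return "".join(page.extract_text() for page in reader.pages)
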
def _download_radolan_data(remote_radolan_filepath: str) -> BytesIO:
    """
    Function (cached) that downloads the RADOLAN_CDC file.

    Args:
        remote_radolan_filepath: the file path to the file on the DWD server

    Returns:
        the file in binary, either an archive of one file or an archive of multiple files
    """
    return download_file(remote_radolan_filepath, ttl=CacheExpiry.TWELVE_HOURS)

def _create_meta_index_for_subdaily_extreme_wind(period: Period) -> pd.DataFrame:
    """Create metadata DataFrame for subdaily wind extremes.

    :param period: period for which metadata is acquired
    :return: pandas.DataFrame with combined information for both 3-hourly (fx3)
        and 6-hourly (fx6) wind extremes
    """
    parameter_path = build_path_to_parameter(DwdObservationDataset.WIND_EXTREME, Resolution.SUBDAILY, period)

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
        ],
    )

    files_server = list_remote_files_fsspec(url, ttl=CacheExpiry.METAINDEX)

    # Find the fx3 and fx6 meta files among the files listed on the server.
    meta_file_fx3 = _find_meta_file(files_server, url, ["fx3", "beschreibung", "txt"])
    meta_file_fx6 = _find_meta_file(files_server, url, ["fx6", "beschreibung", "txt"])

    try:
        meta_file_fx3 = download_file(meta_file_fx3, ttl=CacheExpiry.METAINDEX)
    except InvalidURL as e:
        raise InvalidURL(f"Error: reading metadata {meta_file_fx3} file failed.") from e

    try:
        meta_file_fx6 = download_file(meta_file_fx6, ttl=CacheExpiry.METAINDEX)
    except InvalidURL as e:
        raise InvalidURL(f"Error: reading metadata {meta_file_fx6} file failed.") from e

    df_fx3 = _read_meta_df(meta_file_fx3)
    df_fx6 = _read_meta_df(meta_file_fx6)

    # Keep only fx6 stations that also provide fx3 data before combining.
    df_fx6 = df_fx6.loc[df_fx6[Columns.STATION_ID.value].isin(df_fx3[Columns.STATION_ID.value].tolist()), :]

    return pd.concat([df_fx3, df_fx6])

def _all(self) -> pd.DataFrame: """ Create meta data DataFrame from available station list :return: """ payload = download_file(self._url, CacheExpiry.METAINDEX) df = pd.read_fwf( StringIO(payload.read().decode(encoding="latin-1")), skiprows=4, skip_blank_lines=True, colspecs=[ (0, 5), (6, 11), (12, 17), (18, 22), (23, 44), (45, 51), (52, 58), (59, 64), (65, 71), (72, 76), ], na_values=["----"], header=None, dtype="str", ) df = df[(df.iloc[:, 0] != "=====") & (df.iloc[:, 0] != "TABLE") & (df.iloc[:, 0] != "clu")] df = df.iloc[:, [2, 3, 4, 5, 6, 7]] df.columns = [ Columns.STATION_ID.value, Columns.ICAO_ID.value, Columns.NAME.value, Columns.LATITUDE.value, Columns.LONGITUDE.value, Columns.HEIGHT.value, ] # Convert coordinates from degree minutes to decimal degrees df[Columns.LATITUDE.value] = df[Columns.LATITUDE.value].astype( float).apply(convert_dm_to_dd) df[Columns.LONGITUDE.value] = df[Columns.LONGITUDE.value].astype( float).apply(convert_dm_to_dd) return df.reindex(columns=self._base_columns)
def _download_generic_data(url: str) -> Generator[RadarResult, None, None]:
    """
    Download radar data.

    :param url: The URL to the file on the DWD server
    :return: The file in binary, either an archive of one file or an archive of multiple files.
    """
    ttl = CacheExpiry.FIVE_MINUTES
    if not should_cache_download(url):
        ttl = CacheExpiry.NO_CACHE
    data = download_file(url, ttl=ttl)

    # RadarParameter.FX_REFLECTIVITY
    if url.endswith(Extension.TAR_BZ2.value):
        tfs = TarFileSystem(data, compression="bz2")
        for file in tfs.glob("*"):
            yield RadarResult(
                data=tfs.open(file).read(),
                timestamp=get_date_from_filename(file.name),
                filename=file.name,
            )

    # RadarParameter.WN_REFLECTIVITY, RADAR_PARAMETERS_SWEEPS (BUFR)  # noqa: E800
    elif url.endswith(Extension.BZ2.value):
        with bz2.BZ2File(data, mode="rb") as archive:
            data = BytesIO(archive.read())
            yield RadarResult(url=url, data=data, timestamp=get_date_from_filename(url))

    # RADAR_PARAMETERS_RADVOR
    elif url.endswith(Extension.GZ.value):
        with gzip.GzipFile(fileobj=data, mode="rb") as archive:
            data = BytesIO(archive.read())
            yield RadarResult(url=url, data=data, timestamp=get_date_from_filename(url))

    else:
        yield RadarResult(url=url, data=data, timestamp=get_date_from_filename(url))

def fetch_dynamic_frequency(self, station_id, parameter: Enum, dataset: Enum) -> str:
    """
    Get the frequency string for a station and parameter from WSV Pegelonline. The
    frequency is provided in each station dict returned by the REST API under
    "equidistance".

    :param station_id: station_id string
    :param parameter: parameter enumeration
    :param dataset: dataset enumeration
    :return: frequency as string, e.g. "15min" -> Literal["1min", "5min", "15min", "60min"]
    """
    url = self._station_endpoint.format(station_id=station_id, parameter=parameter.value)

    response = download_file(url)

    station_dict = json.load(response)

    return f"{station_dict['equidistance']}min"

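# A minimal usage sketch (hypothetical, not part of the class): the returned
# frequency string such as "15min" is a valid pandas frequency alias and can be
# turned into an offset or used directly to build a date range.
import pandas as pd

offset = pd.tseries.frequencies.to_offset("15min")
dates = pd.date_range("2021-01-01", periods=4, freq=offset)
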
def _download_metadata_file_for_1minute_precipitation(metadata_file: str) -> BytesIO:
    """Download a metadata file and return its content as a BytesIO object.

    As the 1-minute precipitation data requires a large number of requests (there
    are approximately 1000 different metadata files to open), reading a file is
    attempted up to three times to assure success.

    Args:
        metadata_file: the file that shall be downloaded and returned as bytes

    Return:
        A BytesIO object to which the opened file was written beforehand.
    """
    try:
        return download_file(metadata_file, ttl=CacheExpiry.NO_CACHE)
    except InvalidURL as e:
        raise InvalidURL(f"Reading metadata {metadata_file} file failed.") from e

def _collect_station_parameter(self, station_id: str, parameter, dataset) -> pd.DataFrame:
    """
    Collection method for NOAA GHCN data. Parameter and dataset can be ignored as
    data is provided as a whole.

    :param station_id: station id of the station being queried
    :param parameter: parameter being queried
    :param dataset: dataset being queried
    :return: dataframe with read data
    """
    url = self._base_url.format(station_id=station_id)

    file = download_file(url, CacheExpiry.FIVE_MINUTES)

    df = pd.read_csv(file, sep=",", dtype=str)

    # Drop station metadata and quality attribute columns, keep only the values.
    meta_columns = [
        "LATITUDE",
        "LONGITUDE",
        "ELEVATION",
        "NAME",
    ]
    meta_columns.extend(filter(lambda col: col.endswith("_ATTRIBUTES"), df.columns))

    df = df.drop(columns=meta_columns)

    df = df.rename(columns=str.lower).rename(
        columns={"station": Columns.STATION_ID.value, "date": Columns.DATE.value}
    )

    timezone_ = self._get_timezone_from_station(station_id)

    # Parse dates, localize them to the station's timezone and convert to UTC.
    df[Columns.DATE.value] = (
        pd.to_datetime(df[Columns.DATE.value], infer_datetime_format=True)
        .dt.tz_localize(timezone_, ambiguous=True)
        .dt.tz_convert(pytz.UTC)
    )

    return self._apply_factors(df)

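# A minimal sketch (an assumption, not the project's actual implementation) of what
# _apply_factors used above might do: GHCN-Daily stores several elements in tenths
# of their unit (e.g. tenths of degrees Celsius, tenths of millimetres), so matching
# value columns are scaled by 0.1. The column names and factor mapping are assumptions.
import pandas as pd


def _apply_factors_sketch(df: pd.DataFrame) -> pd.DataFrame:
    factors = {"tmax": 0.1, "tmin": 0.1, "prcp": 0.1}
    for column, factor in factors.items():
        if column in df.columns:
            df[column] = df[column].astype(float) * factor
    return df
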
def _all(self) -> pd.DataFrame: """ Get stations listing UK environment agency data :return: """ def _check_parameter_and_period(measures: Union[dict, List[dict]], resolution_as_int: int, parameters: List[str]): # default: daily, for groundwater stations if type(measures) != list: measures = [measures] return (pd.Series(measures).map(lambda measure: measure.get( "period", 86400) == resolution_as_int and measure[ "observedProperty"]["label"] in parameters).any()) log.info(f"Acquiring station listing from {self.endpoint}") response = download_file(self.endpoint, CacheExpiry.FIVE_MINUTES) payload = json.loads(response.read())["items"] df = pd.DataFrame.from_dict(payload) parameters = [ PARAMETER_MAPPING[parameter.value] for parameter, _ in self.parameter ] df.measures.apply(_check_parameter_and_period, resolution_as_int=self._resolution_as_int, parameters=parameters) # filter for stations that have wanted resolution and parameter combinations df = df[df.measures.apply(_check_parameter_and_period, resolution_as_int=self._resolution_as_int, parameters=parameters)] return df.rename( columns={ "label": Columns.NAME.value, "lat": Columns.LATITUDE.value, "long": Columns.LONGITUDE.value, "notation": Columns.STATION_ID.value, }).rename(columns=str.lower)
def __download_climate_observations_data(remote_file: str) -> bytes:
    """Download a station data archive and return the content of its 'produkt' file."""
    try:
        file = download_file(remote_file, ttl=CacheExpiry.FIVE_MINUTES)
    except InvalidURL as e:
        raise InvalidURL(f"Error: the station data {remote_file} could not be reached.") from e
    except Exception as e:
        raise FailedDownload(f"Download failed for {remote_file}") from e

    try:
        zfs = ZipFileSystem(file)
    except BadZipFile as e:
        raise BadZipFile(f"The archive of {remote_file} seems to be corrupted.") from e

    product_file = zfs.glob("produkt*")

    if len(product_file) != 1:
        raise ProductFileNotFound(f"The archive of {remote_file} does not hold a 'produkt' file.")

    return zfs.open(product_file[0]).read()

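# A minimal usage sketch (hypothetical helper, not part of the module): the bytes
# returned above are the content of a DWD "produkt" file, a semicolon-separated
# text table that can be read with pandas. The missing-value marker -999 is an
# assumption and may differ per dataset.
from io import BytesIO

import pandas as pd


def parse_product_file(payload: bytes) -> pd.DataFrame:
    return pd.read_csv(BytesIO(payload), sep=";", skipinitialspace=True, na_values=["-999"])
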
def _create_meta_index_for_climate_observations(
    dataset: DwdObservationDataset,
    resolution: Resolution,
    period: Period,
) -> pd.DataFrame:
    """Function used to create meta index DataFrame parsed from the text files that
    are located in each data section of the station data directory of the weather
    service.

    Args:
        dataset: observation measure
        resolution: frequency/granularity of measurement interval
        period: current, recent or historical files

    Return:
        DataFrame with parsed columns of the corresponding text file. Columns are
        translated into English and data is not yet complete as file existence is
        not checked.
    """
    parameter_path = build_path_to_parameter(dataset, resolution, period)

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
        ],
    )

    files_server = list_remote_files_fsspec(url, ttl=CacheExpiry.METAINDEX)

    # Find the one meta file from the files listed on the server
    meta_file = _find_meta_file(files_server, url, ["beschreibung", "txt"])

    try:
        file = download_file(meta_file, ttl=CacheExpiry.METAINDEX)
    except InvalidURL as e:
        raise InvalidURL(f"Error: reading metadata {meta_file} file failed.") from e

    return _read_meta_df(file)

def _all(self) -> pd.DataFrame: """ Method to acquire station listing, :return: DataFrame with all stations_result """ listings_url = ( "https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/doc/ghcnd-stations.txt" ) listings_file = download_file(listings_url, CacheExpiry.TWELVE_HOURS) # https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn df = pd.read_fwf( listings_file, dtype=str, header=None, colspecs=[(0, 11), (12, 20), (21, 30), (31, 37), (38, 40), (41, 71), (80, 85)], ) df.columns = [ Columns.STATION_ID.value, Columns.LATITUDE.value, Columns.LONGITUDE.value, Columns.HEIGHT.value, Columns.STATE.value, Columns.NAME.value, Columns.WMO_ID.value, ] inventory_url = "http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-inventory.txt" inventory_file = download_file(inventory_url, CacheExpiry.TWELVE_HOURS) inventory_df = pd.read_fwf( inventory_file, header=None, colspecs=[(0, 11), (36, 40), (41, 45)], ) inventory_df.columns = [ Columns.STATION_ID.value, Columns.FROM_DATE.value, Columns.TO_DATE.value ] inventory_df = (inventory_df.groupby(Columns.STATION_ID.value).agg({ Columns.FROM_DATE.value: min, Columns.TO_DATE.value: max }).reset_index()) inventory_df[Columns.FROM_DATE.value] = pd.to_datetime( inventory_df[Columns.FROM_DATE.value], format="%Y", errors="coerce") inventory_df[Columns.TO_DATE.value] = pd.to_datetime( inventory_df[Columns.TO_DATE.value], format="%Y", errors="coerce") inventory_df[Columns.TO_DATE.value] += YearEnd() return df.merge(inventory_df, how="left", left_on=Columns.STATION_ID.value, right_on=Columns.STATION_ID.value)
def _all(self): """ Method to get stations_result for WSV Pegelonline. It involves reading the REST API, doing some transformations and adding characteristic values in extra columns if given for each station. :return: """ def _extract_ts(ts_list: List[dict], ) -> FLOAT_9_TIMES: """ Function to extract water level related information namely gauge zero and characteristic values from timeseries dict given for each station. :param ts_list: list of dictionaries with each dictionary holding information for one characteristic value / gauge zero :return: tuple with values given in exact order """ ts_water = None for ts in ts_list: if ts["shortname"] == "W": ts_water = ts break if not ts_water: return pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA gauge_datum = ts_water.get("gaugeZero", {}).get("value", pd.NA) characteristic_values = ts_water.get("characteristicValues") or { } # could be empty list so ensure dict if characteristic_values: characteristic_values = ( pd.DataFrame.from_dict(characteristic_values).set_index( "shortname").loc[:, "value"].to_dict()) m_i = characteristic_values.get("M_I", pd.NA) m_ii = characteristic_values.get("M_II", pd.NA) m_iii = characteristic_values.get("M_III", pd.NA) mnw = characteristic_values.get("MNW", pd.NA) mw = characteristic_values.get("MW", pd.NA) mhw = characteristic_values.get("MHW", pd.NA) hhw = characteristic_values.get("HHW", pd.NA) hsw = characteristic_values.get("HSW", pd.NA) return gauge_datum, m_i, m_ii, m_iii, mnw, mw, mhw, hhw, hsw response = download_file(self._endpoint, CacheExpiry.ONE_HOUR) df = pd.read_json(response) df = df.rename(columns={ "number": "station_id", "shortname": "name", "km": "river_kilometer" }) df.loc[:, "water"] = df["water"].map(lambda x: x["shortname"]) timeseries = df.pop("timeseries") # Get available parameters per station df["ts"] = timeseries.apply( lambda ts_list: {t["shortname"].lower() for t in ts_list}) parameters = {par.value.lower() for par, ds in self.parameter} # Filter out stations_result that do not have any of the parameters requested df = df.loc[ df["ts"].map(lambda par: not not par.intersection(parameters)), :] df[[ "gauge_datum", "m_i", "m_ii", "m_iii", "mnw", "mw", "mhw", "hhw", "hsw" ]] = timeseries.apply(func=_extract_ts).apply(pd.Series) return df
def _collect_station_parameter(
    self, station_id: str, parameter: EcccObservationParameter, dataset: Enum
) -> pd.DataFrame:
    """
    :param station_id: station id being queried
    :param parameter: parameter being queried
    :param dataset: dataset of query, can be skipped as ECCC has a unique dataset
    :return: pandas.DataFrame with data
    """
    meta = self.sr.df[self.sr.df[Columns.STATION_ID.value] == station_id]

    name, from_date, to_date = (
        meta[
            [
                Columns.NAME.value,
                Columns.FROM_DATE.value,
                Columns.TO_DATE.value,
            ]
        ]
        .values.flatten()
        .tolist()
    )

    # Start and end year from the station metadata.
    start_year = None if pd.isna(from_date) else from_date.year
    end_year = None if pd.isna(to_date) else to_date.year

    # start_date and end_date from the request.
    start_date = self.sr.stations.start_date
    end_date = self.sr.stations.end_date

    start_year = start_year and max(start_year, start_date and start_date.year or start_year)
    end_year = end_year and min(end_year, end_date and end_date.year or end_year)

    # Following lines may partially be based on @Zeitsperre's canada-climate-python
    # code at https://github.com/Zeitsperre/canada-climate-python/blob/
    # master/ECCC_stations_fulldownload.py
    data = []

    # Check that the station has a first and last year value.
    if start_year and end_year:
        for url in self._create_file_urls(station_id, start_year, end_year):
            log.info(f"Acquiring file from {url}")

            payload = download_file(url, CacheExpiry.NO_CACHE)

            df_temp = pd.read_csv(payload)

            df_temp = df_temp.rename(columns=str.lower)

            df_temp = df_temp.drop(
                columns=[
                    "longitude (x)",
                    "latitude (y)",
                    "station name",
                    "climate id",
                    "year",
                    "month",
                    "day",
                    "time (lst)",
                ],
                errors="ignore",
            )

            data.append(df_temp)

    try:
        df = pd.concat(data)
    except ValueError:
        df = pd.DataFrame()

    df = df.rename(
        columns={
            "date/time (lst)": Columns.DATE.value,
            "date/time": Columns.DATE.value,
        }
    )

    df = df.reset_index(drop=True)
    df = df.drop(columns=["data quality"], errors="ignore")

    df[Columns.STATION_ID.value] = station_id

    return df
