Example #1
    def _download_stations() -> Tuple[BytesIO, int]:
        """
        Download station list from ECCC FTP server.

        :return: CSV payload, source identifier
        """

        gdrive_url = "https://drive.google.com/uc?id=1HDRnj41YBWpMioLPwAFiLlK4SK8NV72C"
        http_url = (
            "https://github.com/earthobservations/testdata/raw/main/ftp.tor.ec.gc.ca/Pub/"
            "Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv.gz"
        )

        payload = None
        source = None
        try:
            payload = download_file(gdrive_url, CacheExpiry.METAINDEX)
            source = 0
        except Exception:
            log.exception(f"Unable to access Google drive server at {gdrive_url}")

            # Fall back to different source.
            try:
                response = download_file(http_url, CacheExpiry.METAINDEX)
                with gzip.open(response, mode="rb") as f:
                    payload = BytesIO(f.read())
                source = 1
            except Exception:
                log.exception(f"Unable to access HTTP server at {http_url}")

        if payload is None:
            raise FailedDownload("Unable to acquire ECCC stations_result list")

        return payload, source
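
The cascade above is a reusable pattern: try the primary source, fall back to a mirror, and raise only once every source has failed. A minimal generic sketch of the same idea (download_with_fallback is a hypothetical helper, not part of the library above):

import logging
from io import BytesIO
from typing import Callable, List

log = logging.getLogger(__name__)


def download_with_fallback(urls: List[str], fetch: Callable[[str], BytesIO]) -> BytesIO:
    """Try each URL in order and return the first payload that downloads."""
    for url in urls:
        try:
            return fetch(url)
        except Exception:
            # Log and move on; only give up once every source has failed.
            log.exception(f"Unable to access {url}, trying next source")
    raise RuntimeError("All download sources failed")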
Example #2
    def _collect_station_parameter(self, station_id: str, parameter: Enum,
                                   dataset: Enum) -> pd.DataFrame:
        endpoint = self._base_url.format(station_id=station_id)
        payload = download_file(endpoint, CacheExpiry.NO_CACHE)

        measures_list = json.loads(payload.read())["items"][0]["measures"]

        if isinstance(measures_list, dict):
            measures_list = [measures_list]

        measures_list = pd.Series(measures_list)

        measures_list = measures_list[
            measures_list.map(
                lambda measure: measure["parameterName"].lower().replace(" ", "")
                == parameter.value.lower().replace("_", "")
            )
        ]

        try:
            # use positional access: after boolean filtering the label 0 may be gone
            measure_dict = measures_list.iloc[0]
        except IndexError:
            return pd.DataFrame()

        values_endpoint = f"{measure_dict['@id']}/readings.json"

        payload = download_file(values_endpoint, CacheExpiry.FIVE_MINUTES)

        readings = json.loads(payload.read())["items"]

        df = pd.DataFrame.from_records(readings)

        return df.loc[:, ["dateTime", "value"]].rename(
            columns={"dateTime": Columns.DATE.value, "value": Columns.VALUE.value}
        )
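
The filter works by normalizing both names before comparing them: lowercase everything and drop the separator characters, so "Water Level" matches "water_level". A standalone sketch of that comparison (normalize_name is a hypothetical helper):

def normalize_name(name: str) -> str:
    # Lowercase and strip spaces/underscores so spelling variants compare equal.
    return name.lower().replace(" ", "").replace("_", "")


assert normalize_name("Water Level") == normalize_name("water_level")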
Example #3
    def _collect_station_parameter(self, station_id: str, parameter: Enum,
                                   dataset: Enum) -> pd.DataFrame:
        """
        Method to collect data for station parameter from WSV Pegelonline following its open REST-API at
        https://pegelonline.wsv.de/webservices/rest-api/v2/stations/
        :param station_id: station_id string
        :param parameter: parameter enumeration
        :param dataset: dataset enumeration
        :return: pandas DataFrame with data
        """
        url = self._endpoint.format(station_id=station_id,
                                    parameter=parameter.value)

        try:
            response = download_file(url, CacheExpiry.NO_CACHE)
        except FileNotFoundError:
            return pd.DataFrame()

        df = pd.read_json(response)

        df = df.rename(columns={
            "timestamp": Columns.DATE.value,
            "value": Columns.VALUE.value
        })

        df[Columns.PARAMETER.value] = parameter.value.lower()

        return df
Example #4
def read_pdf(url):
    text = StringIO()
    response = download_file(url, CacheExpiry.NO_CACHE)
    pdf = PyPDF2.PdfFileReader(response)
    for page_number in range(pdf.numPages):
        page = pdf.getPage(page_number)
        result = page.extractText()
        text.write(result)
    return text.getvalue()
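
PdfFileReader, numPages, getPage and extractText are legacy PyPDF2 names that were removed in PyPDF2 3.x. A sketch of the same routine against the modern pypdf API, assuming the same download_file helper is in scope:

from io import StringIO

from pypdf import PdfReader


def read_pdf_modern(url):
    text = StringIO()
    response = download_file(url, CacheExpiry.NO_CACHE)
    reader = PdfReader(response)
    for page in reader.pages:
        # extract_text() is the modern replacement for extractText()
        text.write(page.extract_text())
    return text.getvalue()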
Example #5
def _download_radolan_data(remote_radolan_filepath: str) -> BytesIO:
    """
    Function (cached) that downloads the RADOLAN_CDC file.

    Args:
        remote_radolan_filepath: the file path to the file on the DWD server

    Returns:
        the file in binary, either an archive of one file or an archive of multiple
        files
    """
    return download_file(remote_radolan_filepath, ttl=CacheExpiry.TWELVE_HOURS)
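
CacheExpiry values such as TWELVE_HOURS express a time-to-live for the cached payload. A toy sketch of TTL caching around a download_file-like callable; this illustrates the idea, it is not the library's implementation:

import time
from io import BytesIO
from typing import Callable, Dict, Tuple


def ttl_cached(fetch: Callable[[str], BytesIO], ttl_seconds: float) -> Callable[[str], BytesIO]:
    cache: Dict[str, Tuple[float, bytes]] = {}

    def wrapper(url: str) -> BytesIO:
        now = time.monotonic()
        hit = cache.get(url)
        if hit is not None and now - hit[0] < ttl_seconds:
            return BytesIO(hit[1])  # serve a fresh copy from the cache
        payload = fetch(url).read()
        cache[url] = (now, payload)
        return BytesIO(payload)

    return wrapper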
Example #6
def _create_meta_index_for_subdaily_extreme_wind(period: Period) -> pd.DataFrame:
    """Create metadata DataFrame for subdaily wind extreme

    :param period: period for which metadata is acquired
    :return: pandas.DataFrame with combined information for both 3hourly (fx3) and 6hourly (fx6) wind extremes
    """
    parameter_path = build_path_to_parameter(DwdObservationDataset.WIND_EXTREME, Resolution.SUBDAILY, period)

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
        ],
    )

    files_server = list_remote_files_fsspec(url, ttl=CacheExpiry.METAINDEX)

    # Find the one meta file from the files listed on the server
    meta_file_fx3 = _find_meta_file(files_server, url, ["fx3", "beschreibung", "txt"])
    meta_file_fx6 = _find_meta_file(files_server, url, ["fx6", "beschreibung", "txt"])

    try:
        meta_file_fx3 = download_file(meta_file_fx3, ttl=CacheExpiry.METAINDEX)
    except InvalidURL as e:
        raise InvalidURL(f"Error: reading metadata {meta_file_fx3} file failed.") from e

    try:
        meta_file_fx6 = download_file(meta_file_fx6, ttl=CacheExpiry.METAINDEX)
    except InvalidURL as e:
        raise InvalidURL(f"Error: reading metadata {meta_file_fx6} file failed.") from e

    df_fx3 = _read_meta_df(meta_file_fx3)
    df_fx6 = _read_meta_df(meta_file_fx6)

    df_fx6 = df_fx6.loc[df_fx6[Columns.STATION_ID.value].isin(df_fx3[Columns.STATION_ID.value].tolist()), :]

    return pd.concat([df_fx3, df_fx6])
Example #7
    def _all(self) -> pd.DataFrame:
        """
        Create meta data DataFrame from available station list

        :return:
        """
        payload = download_file(self._url, CacheExpiry.METAINDEX)

        df = pd.read_fwf(
            StringIO(payload.read().decode(encoding="latin-1")),
            skiprows=4,
            skip_blank_lines=True,
            colspecs=[
                (0, 5),
                (6, 11),
                (12, 17),
                (18, 22),
                (23, 44),
                (45, 51),
                (52, 58),
                (59, 64),
                (65, 71),
                (72, 76),
            ],
            na_values=["----"],
            header=None,
            dtype="str",
        )

        df = df[
            (df.iloc[:, 0] != "=====")
            & (df.iloc[:, 0] != "TABLE")
            & (df.iloc[:, 0] != "clu")
        ]

        df = df.iloc[:, [2, 3, 4, 5, 6, 7]]

        df.columns = [
            Columns.STATION_ID.value,
            Columns.ICAO_ID.value,
            Columns.NAME.value,
            Columns.LATITUDE.value,
            Columns.LONGITUDE.value,
            Columns.HEIGHT.value,
        ]

        # Convert coordinates from degree minutes to decimal degrees
        df[Columns.LATITUDE.value] = df[Columns.LATITUDE.value].astype(float).apply(convert_dm_to_dd)
        df[Columns.LONGITUDE.value] = df[Columns.LONGITUDE.value].astype(float).apply(convert_dm_to_dd)

        return df.reindex(columns=self._base_columns)
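
convert_dm_to_dd turns coordinates given as degrees and minutes into decimal degrees. One plausible implementation, assuming a DD.MM encoding where e.g. 52.30 means 52°30'; the library's actual convention may differ:

def convert_dm_to_dd(dm: float) -> float:
    # 52.30 -> 52 degrees 30 minutes -> 52.5 decimal degrees
    degrees = int(dm)
    minutes = round((dm - degrees) * 100)
    return degrees + minutes / 60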
Example #8
def _download_generic_data(url: str) -> Generator[RadarResult, None, None]:
    """
    Download radar data.

    :param url:         The URL to the file on the DWD server

    :return:            The file in binary, either an archive of one file
                        or an archive of multiple files.
    """

    ttl = CacheExpiry.FIVE_MINUTES
    if not should_cache_download(url):
        ttl = CacheExpiry.NO_CACHE

    data = download_file(url, ttl=ttl)

    # RadarParameter.FX_REFLECTIVITY
    if url.endswith(Extension.TAR_BZ2.value):
        tfs = TarFileSystem(data, compression="bz2")
        for file in tfs.glob("*"):
            yield RadarResult(
                data=tfs.open(file).read(),
                timestamp=get_date_from_filename(file.name),
                filename=file.name,
            )

    # RadarParameter.WN_REFLECTIVITY, RADAR_PARAMETERS_SWEEPS (BUFR)  # noqa: E800
    elif url.endswith(Extension.BZ2.value):
        with bz2.BZ2File(data, mode="rb") as archive:
            data = BytesIO(archive.read())
            yield RadarResult(url=url,
                              data=data,
                              timestamp=get_date_from_filename(url))

    # RADAR_PARAMETERS_RADVOR
    elif url.endswith(Extension.GZ.value):
        with gzip.GzipFile(fileobj=data, mode="rb") as archive:
            data = BytesIO(archive.read())
            yield RadarResult(url=url,
                              data=data,
                              timestamp=get_date_from_filename(url))

    else:
        yield RadarResult(url=url,
                          data=data,
                          timestamp=get_date_from_filename(url))
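
Stripped of the RadarResult plumbing, the branching just picks a decompression codec from the URL suffix. A condensed sketch of that dispatch (decompress_payload is a hypothetical helper; the .tar.bz2 multi-file case is omitted):

import bz2
import gzip
from io import BytesIO


def decompress_payload(url: str, data: BytesIO) -> BytesIO:
    # Choose the codec from the URL suffix; pass unknown formats through untouched.
    if url.endswith(".bz2"):
        return BytesIO(bz2.decompress(data.read()))
    if url.endswith(".gz"):
        return BytesIO(gzip.decompress(data.read()))
    return data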
Example #9
    def fetch_dynamic_frequency(self, station_id, parameter: Enum,
                                dataset: Enum) -> str:
        """
        Method to get the frequency string for a station and parameter from WSV Pegelonline. The frequency is given at
        each station dict queried from the REST-API under "equidistance"
        :param station_id: station_id string
        :param parameter: parameter enumeration
        :param dataset: dataset enumeration
        :return: frequency as string e.g. "15min" -> Literal["1min", "5min", "15min", "60min"]
        """
        url = self._station_endpoint.format(station_id=station_id,
                                            parameter=parameter.value)

        response = download_file(url)

        station_dict = json.load(response)

        return f"{station_dict['equidistance']}min"
Example #10
def _download_metadata_file_for_1minute_precipitation(metadata_file: str) -> BytesIO:
    """A function that simply opens a filepath with help of the urllib library and then
    writes the content to a BytesIO object and returns this object. For this case as it
    opens lots of requests (there are approx 1000 different files to open for
    1minute data), it will do the same at most three times for one file to assure
    success reading the file.

    Args:
        metadata_file (str) - the file that shall be downloaded and returned as bytes.

    Return:
        A BytesIO object to which the opened file was written beforehand.

    """
    try:
        return download_file(metadata_file, ttl=CacheExpiry.NO_CACHE)
    except InvalidURL as e:
        raise InvalidURL(f"Reading metadata {metadata_file} file failed.") from e
Example #11
    def _collect_station_parameter(self, station_id: str, parameter,
                                   dataset) -> pd.DataFrame:
        """
        Collection method for NOAA GHCN data. Parameter and dataset can be ignored as data
        is provided as a whole.

        :param station_id: station id of the station being queried
        :param parameter: parameter being queried
        :param dataset: dataset being queried
        :return: dataframe with read data
        """
        url = self._base_url.format(station_id=station_id)

        file = download_file(url, CacheExpiry.FIVE_MINUTES)

        df = pd.read_csv(file, sep=",", dtype=str)

        meta_columns = [
            "LATITUDE",
            "LONGITUDE",
            "ELEVATION",
            "NAME",
        ]

        meta_columns.extend(filter(lambda col: col.endswith("_ATTRIBUTES"), df.columns))

        df = df.drop(columns=meta_columns)

        df = df.rename(columns=str.lower).rename(
            columns={"station": Columns.STATION_ID.value, "date": Columns.DATE.value}
        )

        timezone_ = self._get_timezone_from_station(station_id)

        # Parse dates once, localize to the station's timezone, then convert to UTC.
        # (The former astype("datetime64") was redundant and the deprecated
        # infer_datetime_format argument is no longer needed.)
        df[Columns.DATE.value] = (
            pd.to_datetime(df[Columns.DATE.value])
            .dt.tz_localize(timezone_, ambiguous=True)
            .dt.tz_convert(pytz.UTC)
        )

        return self._apply_factors(df)
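
Localizing to the station's zone and then converting to UTC is the standard two-step timezone pattern in pandas. A self-contained sketch with a made-up timezone:

import pandas as pd

dates = pd.to_datetime(pd.Series(["2021-06-01 12:00", "2021-12-01 12:00"]))
utc = dates.dt.tz_localize("America/Toronto", ambiguous=True).dt.tz_convert("UTC")
# the June reading becomes 16:00 UTC (EDT), the December one 17:00 UTC (EST)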
Example #12
    def _all(self) -> pd.DataFrame:
        """
        Get stations listing UK environment agency data
        :return:
        """
        def _check_parameter_and_period(measures: Union[dict, List[dict]],
                                        resolution_as_int: int,
                                        parameters: List[str]) -> bool:
            # default period: daily (86400 s), as used for groundwater stations
            if not isinstance(measures, list):
                measures = [measures]
            return (
                pd.Series(measures)
                .map(
                    lambda measure: measure.get("period", 86400) == resolution_as_int
                    and measure["observedProperty"]["label"] in parameters
                )
                .any()
            )

        log.info(f"Acquiring station listing from {self.endpoint}")

        response = download_file(self.endpoint, CacheExpiry.FIVE_MINUTES)

        payload = json.loads(response.read())["items"]

        df = pd.DataFrame.from_dict(payload)

        parameters = [
            PARAMETER_MAPPING[parameter.value]
            for parameter, _ in self.parameter
        ]

        # filter for stations that have the wanted resolution and parameter combinations
        df = df[df.measures.apply(_check_parameter_and_period,
                                  resolution_as_int=self._resolution_as_int,
                                  parameters=parameters)]

        return df.rename(
            columns={
                "label": Columns.NAME.value,
                "lat": Columns.LATITUDE.value,
                "long": Columns.LONGITUDE.value,
                "notation": Columns.STATION_ID.value,
            }).rename(columns=str.lower)
Example #13
def __download_climate_observations_data(remote_file: str) -> bytes:
    try:
        file = download_file(remote_file, ttl=CacheExpiry.FIVE_MINUTES)
    except InvalidURL as e:
        raise InvalidURL(f"Error: the station data {remote_file} could not be reached.") from e
    except Exception as e:
        raise FailedDownload(f"Download failed for {remote_file}") from e

    try:
        zfs = ZipFileSystem(file)
    except BadZipFile as e:
        raise BadZipFile(f"The archive of {remote_file} seems to be corrupted.") from e

    product_file = zfs.glob("produkt*")

    if len(product_file) != 1:
        raise ProductFileNotFound(f"The archive of {remote_file} does not hold a 'produkt' file.")

    return zfs.open(product_file[0]).read()
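
For readers without fsspec, the same unpacking can be written against the standard library's zipfile; a sketch assuming the archive holds exactly one produkt* member:

import fnmatch
import zipfile
from io import BytesIO


def read_product_file(archive: BytesIO) -> bytes:
    with zipfile.ZipFile(archive) as zf:
        matches = fnmatch.filter(zf.namelist(), "produkt*")
        if len(matches) != 1:
            raise ValueError("expected exactly one 'produkt' file in the archive")
        return zf.read(matches[0])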
Example #14
def _create_meta_index_for_climate_observations(
    dataset: DwdObservationDataset,
    resolution: Resolution,
    period: Period,
) -> pd.DataFrame:
    """Function used to create meta index DataFrame parsed from the text files that are
    located in each data section of the station data directory of the weather service.

    Args:
        dataset: observation measure
        resolution: frequency/granularity of measurement interval
        period: current, recent or historical files
    Return:
        DataFrame with parsed columns of the corresponding text file. Columns are
        translated into English and data is not yet complete as file existence is
        not checked.

    """
    parameter_path = build_path_to_parameter(dataset, resolution, period)

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
        ],
    )

    files_server = list_remote_files_fsspec(url, ttl=CacheExpiry.METAINDEX)

    # Find the one meta file from the files listed on the server
    meta_file = _find_meta_file(files_server, url, ["beschreibung", "txt"])

    try:
        file = download_file(meta_file, ttl=CacheExpiry.METAINDEX)
    except InvalidURL as e:
        raise InvalidURL(f"Error: reading metadata {meta_file} file failed.") from e

    return _read_meta_df(file)
Example #15
    def _all(self) -> pd.DataFrame:
        """
        Method to acquire station listing,
        :return: DataFrame with all stations_result
        """
        listings_url = (
            "https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/doc/ghcnd-stations.txt"
        )

        listings_file = download_file(listings_url, CacheExpiry.TWELVE_HOURS)

        # https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn
        df = pd.read_fwf(
            listings_file,
            dtype=str,
            header=None,
            colspecs=[(0, 11), (12, 20), (21, 30), (31, 37), (38, 40), (41, 71), (80, 85)],
        )

        df.columns = [
            Columns.STATION_ID.value,
            Columns.LATITUDE.value,
            Columns.LONGITUDE.value,
            Columns.HEIGHT.value,
            Columns.STATE.value,
            Columns.NAME.value,
            Columns.WMO_ID.value,
        ]

        inventory_url = "http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-inventory.txt"

        inventory_file = download_file(inventory_url, CacheExpiry.TWELVE_HOURS)

        inventory_df = pd.read_fwf(
            inventory_file,
            header=None,
            colspecs=[(0, 11), (36, 40), (41, 45)],
        )

        inventory_df.columns = [
            Columns.STATION_ID.value,
            Columns.FROM_DATE.value,
            Columns.TO_DATE.value,
        ]

        inventory_df = (
            inventory_df.groupby(Columns.STATION_ID.value)
            .agg({Columns.FROM_DATE.value: "min", Columns.TO_DATE.value: "max"})
            .reset_index()
        )

        inventory_df[Columns.FROM_DATE.value] = pd.to_datetime(
            inventory_df[Columns.FROM_DATE.value], format="%Y", errors="coerce"
        )
        inventory_df[Columns.TO_DATE.value] = pd.to_datetime(
            inventory_df[Columns.TO_DATE.value], format="%Y", errors="coerce"
        )

        inventory_df[Columns.TO_DATE.value] += YearEnd()

        return df.merge(inventory_df, how="left", on=Columns.STATION_ID.value)
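
The inventory aggregation collapses several rows per station into a single first/last-year range. A self-contained sketch of the same groupby with made-up values:

import pandas as pd

inventory = pd.DataFrame({
    "station_id": ["A", "A", "B"],
    "from_date": [1961, 1980, 2001],
    "to_date": [1990, 2020, 2010],
})
ranges = inventory.groupby("station_id").agg({"from_date": "min", "to_date": "max"}).reset_index()
# station A spans 1961-2020, station B spans 2001-2010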
Example #16
    def _all(self):
        """
        Method to get stations_result for WSV Pegelonline. It involves reading the REST API, doing some transformations and
        adding characteristic values in extra columns if given for each station.
        :return:
        """
        def _extract_ts(ts_list: List[dict]) -> FLOAT_9_TIMES:
            """
            Function to extract water level related information namely gauge zero and characteristic values
            from timeseries dict given for each station.
            :param ts_list: list of dictionaries with each dictionary holding information for one
                characteristic value / gauge zero
            :return: tuple with values given in exact order
            """
            ts_water = None
            for ts in ts_list:
                if ts["shortname"] == "W":
                    ts_water = ts
                    break

            if not ts_water:
                return (pd.NA,) * 9

            gauge_datum = ts_water.get("gaugeZero", {}).get("value", pd.NA)

            # characteristicValues could be an empty list, so coerce to a dict
            characteristic_values = ts_water.get("characteristicValues") or {}

            if characteristic_values:
                characteristic_values = (
                    pd.DataFrame.from_dict(characteristic_values)
                    .set_index("shortname")
                    .loc[:, "value"]
                    .to_dict()
                )

            m_i = characteristic_values.get("M_I", pd.NA)
            m_ii = characteristic_values.get("M_II", pd.NA)
            m_iii = characteristic_values.get("M_III", pd.NA)
            mnw = characteristic_values.get("MNW", pd.NA)
            mw = characteristic_values.get("MW", pd.NA)
            mhw = characteristic_values.get("MHW", pd.NA)
            hhw = characteristic_values.get("HHW", pd.NA)
            hsw = characteristic_values.get("HSW", pd.NA)

            return gauge_datum, m_i, m_ii, m_iii, mnw, mw, mhw, hhw, hsw

        response = download_file(self._endpoint, CacheExpiry.ONE_HOUR)

        df = pd.read_json(response)

        df = df.rename(columns={
            "number": "station_id",
            "shortname": "name",
            "km": "river_kilometer"
        })

        df.loc[:, "water"] = df["water"].map(lambda x: x["shortname"])

        timeseries = df.pop("timeseries")

        # Get available parameters per station
        df["ts"] = timeseries.apply(
            lambda ts_list: {t["shortname"].lower()
                             for t in ts_list})

        parameters = {par.value.lower() for par, ds in self.parameter}

        # Filter out stations that do not have any of the parameters requested
        df = df.loc[df["ts"].map(lambda par: bool(par.intersection(parameters))), :]

        df[["gauge_datum", "m_i", "m_ii", "m_iii", "mnw", "mw", "mhw", "hhw", "hsw"]] = (
            timeseries.apply(func=_extract_ts).apply(pd.Series)
        )

        return df
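
The DataFrame round-trip that builds the shortname-to-value mapping can also be written as a plain dict comprehension. A sketch with made-up characteristic values:

characteristic_values = [
    {"shortname": "MNW", "value": 102.0},
    {"shortname": "MHW", "value": 480.0},
]
mapping = {cv["shortname"]: cv["value"] for cv in characteristic_values}
# {'MNW': 102.0, 'MHW': 480.0} -- equivalent to the from_dict/set_index/to_dict chain above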
Example #17
    def _collect_station_parameter(
        self, station_id: str, parameter: EcccObservationParameter, dataset: Enum
    ) -> pd.DataFrame:
        """

        :param station_id: station id being queried
        :param parameter: parameter being queried
        :param dataset: dataset of query, can be skipped as ECCC has unique dataset
        :return: pandas.DataFrame with data
        """
        meta = self.sr.df[self.sr.df[Columns.STATION_ID.value] == station_id]

        name, from_date, to_date = (
            meta[
                [
                    Columns.NAME.value,
                    Columns.FROM_DATE.value,
                    Columns.TO_DATE.value,
                ]
            ]
            .values.flatten()
            .tolist()
        )

        # start and end year from station
        start_year = None if pd.isna(from_date) else from_date.year
        end_year = None if pd.isna(to_date) else to_date.year

        # start_date and end_date from request
        start_date = self.sr.stations.start_date
        end_date = self.sr.stations.end_date

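        # Clamp the station's years to the requested period; keep None when the station has no records.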
        start_year = start_year and max(start_year, start_date and start_date.year or start_year)
        end_year = end_year and min(end_year, end_date and end_date.year or end_year)

        # Following lines may partially be based on @Zeitsperre's canada-climate-python
        # code at https://github.com/Zeitsperre/canada-climate-python/blob/
        # master/ECCC_stations_fulldownload.py
        data = []

        # check that station has a first and last year value
        if start_year and end_year:
            for url in self._create_file_urls(station_id, start_year, end_year):
                log.info(f"Acquiring file from {url}")

                payload = download_file(url, CacheExpiry.NO_CACHE)

                df_temp = pd.read_csv(payload)

                df_temp = df_temp.rename(columns=str.lower)

                df_temp = df_temp.drop(
                    columns=[
                        "longitude (x)",
                        "latitude (y)",
                        "station name",
                        "climate id",
                        "year",
                        "month",
                        "day",
                        "time (lst)",
                    ],
                    errors="ignore",
                )

                data.append(df_temp)

        try:
            df = pd.concat(data)
        except ValueError:
            df = pd.DataFrame()

        df = df.rename(
            columns={
                "date/time (lst)": Columns.DATE.value,
                "date/time": Columns.DATE.value,
            }
        )

        df = df.reset_index(drop=True)

        df = df.drop(columns=["data quality"], errors="ignore")

        df[Columns.STATION_ID.value] = station_id

        return df
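
The two year-clamping lines above are dense: they keep None when the station has no records and otherwise narrow the station's own years to the requested window. An equivalent, spelled-out sketch (clamp_years is a hypothetical helper):

from datetime import datetime
from typing import Optional, Tuple


def clamp_years(
    start_year: Optional[int],
    end_year: Optional[int],
    start_date: Optional[datetime],
    end_date: Optional[datetime],
) -> Tuple[Optional[int], Optional[int]]:
    # Mirrors `start_year and max(start_year, start_date and start_date.year or start_year)`.
    if start_year is not None and start_date is not None:
        start_year = max(start_year, start_date.year)
    if end_year is not None and end_date is not None:
        end_year = min(end_year, end_date.year)
    return start_year, end_year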