Example No. 1
def _create_file_index_for_dwd_server(
    parameter: Parameter,
    time_resolution: TimeResolution,
    period_type: PeriodType,
    cdc_base: DWDCDCBase,
) -> pd.DataFrame:
    """
    Function to create a file index of the DWD station data, which usually is shipped as
    zipped/archived data. The file index is created for an individual set of parameters.
    Args:
        parameter: parameter of Parameter enumeration
        time_resolution: time resolution of TimeResolution enumeration
        period_type: period type of PeriodType enumeration
        cdc_base: base path e.g. climate_observations/germany
    Returns:
        file index in a pandas.DataFrame with sets of parameters and station id
    """
    parameter_path = build_path_to_parameter(parameter, time_resolution,
                                             period_type)

    url = reduce(urljoin,
                 [DWD_SERVER, DWD_CDC_PATH, cdc_base.value, parameter_path])

    files_server = list_remote_files(url, recursive=True)

    files_server = pd.DataFrame(files_server,
                                columns=[DWDMetaColumns.FILENAME.value],
                                dtype="str")

    return files_server
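
The reduce(urljoin, [...]) idiom above folds urljoin over the path segments one pair at a time. Note that urljoin only appends a relative segment when the base ends with a slash, so each constant is expected to carry a trailing slash. A minimal, self-contained sketch with invented segment values standing in for DWD_SERVER, DWD_CDC_PATH, cdc_base.value and parameter_path:

from functools import reduce
from urllib.parse import urljoin

# Invented segments; urljoin drops the last path component of a base that
# does not end in "/", hence the trailing slashes on every segment.
segments = [
    "https://opendata.dwd.de/",
    "climate_environment/CDC/",
    "observations_germany/climate/",
    "daily/kl/historical/",
]

print(reduce(urljoin, segments))
# https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/historical/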
Example No. 2
def _create_meta_index_for_climate_observations(
    parameter_set: DwdObservationDataset,
    resolution: Resolution,
    period: Period,
) -> pd.DataFrame:
    """Function used to create meta index DataFrame parsed from the text files that are
    located in each data section of the station data directory of the weather service.

    Args:
        parameter_set: observation measure
        resolution: frequency/granularity of measurement interval
        period: current, recent or historical files
    Return:
        DataFrame with parsed columns of the corresponding text file. Columns are
        translated into English and data is not yet complete as file existence is
        not checked.

    """
    parameter_path = build_path_to_parameter(parameter_set, resolution, period)

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
        ],
    )

    files_server = list_remote_files(url, recursive=True)

    # Find the one meta file from the files listed on the server
    meta_file = _find_meta_file(files_server, url)

    try:
        file = download_file_from_dwd(meta_file)
    except InvalidURL as e:
        raise InvalidURL(
            f"Reading metadata file {meta_file} failed.") from e

    meta_index = pd.read_fwf(
        filepath_or_buffer=file,
        colspecs=METADATA_FIXED_COLUMN_WIDTH,
        skiprows=[1],
        dtype=str,
        encoding="ISO-8859-1",
    )

    # Fix column names, as header is not aligned to fixed column widths
    meta_index.columns = "".join([
        column for column in meta_index.columns
        if "unnamed" not in column.lower()
    ]).split(" ")

    meta_index = meta_index.rename(columns=str.lower)

    meta_index = meta_index.rename(columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING)

    return meta_index
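
The column-name fix above deserves a closer look: pd.read_fwf splits the header row at the same fixed column widths as the data, producing name fragments (plus "Unnamed: N" placeholders for all-blank regions), and the join/split rejoins the fragments and re-splits them on the single spaces of the original header. A self-contained sketch with an invented two-column file:

import io

import pandas as pd

# Invented fixed-width sample whose header row is not aligned to the
# column widths, mimicking the DWD station metadata files.
raw = ("Stations_id von_datum\n"
       "----------- ---------\n"
       "00001       19370101\n")

df = pd.read_fwf(io.StringIO(raw),
                 colspecs=[(0, 6), (6, 21)],
                 skiprows=[1],
                 dtype=str)

# The header fragments come back as ["Statio", "ns_id von_datum"];
# rejoining and splitting on " " restores the intended names. Real DWD
# files additionally yield "Unnamed: N" columns, which are filtered out.
df.columns = "".join(
    column for column in df.columns if "unnamed" not in column.lower()
).split(" ")

print(df.columns.tolist())  # ['Stations_id', 'von_datum']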
Example No. 3
def _create_meta_index_for_1minute_historical_precipitation() -> pd.DataFrame:
    """
    A helping function to create a raw index of metadata for stations of the set of
    parameters as given. This raw metadata is then used by other functions. This
    second/alternative function must be used for high resolution data, where the
    metadata is not available as file but instead saved in external files per each
    station.
    - especially for precipitation/1_minute/historical!

    """

    parameter_path = f"{TimeResolution.MINUTE_1.value}/{Parameter.PRECIPITATION.value}/"

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
            META_DATA_FOLDER,
        ],
    )

    metadata_file_paths = list_remote_files(url, recursive=False)

    station_ids = [
        re.findall(STATION_ID_REGEX, file).pop(0)
        for file in metadata_file_paths
    ]

    meta_index_df = pd.DataFrame(columns=METADATA_COLUMNS)

    with ThreadPoolExecutor() as executor:
        metadata_files = executor.map(
            _download_metadata_file_for_1minute_precipitation,
            metadata_file_paths)

    with ThreadPoolExecutor() as executor:
        metadata_dfs = executor.map(_parse_geo_metadata,
                                    zip(metadata_files, station_ids))

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    meta_index_df = pd.concat([meta_index_df, *metadata_dfs],
                              ignore_index=True)

    missing_to_date_index = pd.isnull(
        meta_index_df[DWDMetaColumns.TO_DATE.value])

    meta_index_df.loc[missing_to_date_index,
                      DWDMetaColumns.TO_DATE.value] = pd.Timestamp(
                          dt.date.today() -
                          dt.timedelta(days=1)).strftime("%Y%m%d")

    meta_index_df = meta_index_df.astype(METADATA_DTYPE_MAPPING)

    # Drop empty state column again as it will be merged later on
    meta_index_df = meta_index_df.drop(labels=DWDMetaColumns.STATE.value,
                                       axis=1)

    return meta_index_df
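
The two executor.map calls above form a simple download-then-parse pipeline; map preserves input order, which keeps the downloaded files aligned with station_ids in the zip. A minimal, self-contained sketch of the pattern with stand-in workers (not the actual DWD helpers):

from concurrent.futures import ThreadPoolExecutor

import pandas as pd

def download_stub(path: str) -> str:
    # Stand-in for _download_metadata_file_for_1minute_precipitation.
    return f"content of {path}"

def parse_stub(args) -> pd.DataFrame:
    # Stand-in for _parse_geo_metadata; receives (file_content, station_id).
    content, station_id = args
    return pd.DataFrame({"station_id": [station_id], "raw": [content]})

paths = ["Metadaten_00001.txt", "Metadaten_00002.txt"]
station_ids = ["00001", "00002"]

with ThreadPoolExecutor() as executor:
    files = executor.map(download_stub, paths)

with ThreadPoolExecutor() as executor:
    dfs = executor.map(parse_stub, zip(files, station_ids))

print(pd.concat(list(dfs), ignore_index=True))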
Example No. 4
    def get_url_latest(self, url):
        urls = list_remote_files(url, False)
        try:
            # Select the first URL that carries the LATEST marker.
            return list(filter(lambda url_: "LATEST" in url_, urls))[0]
        except IndexError as e:
            raise KeyError(f"Unable to find LATEST file within {url}") from e
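
For the common case of picking the first match, next() with a default avoids the try/except entirely; a minimal sketch with invented URLs in the style of the MOSMIX file listings:

# Invented URLs, not taken from a live DWD listing.
urls = [
    "https://opendata.dwd.de/weather/local_forecasts/mos/MOSMIX_S/"
    "all_stations/kml/MOSMIX_S_2021010112_240.kmz",
    "https://opendata.dwd.de/weather/local_forecasts/mos/MOSMIX_S/"
    "all_stations/kml/MOSMIX_S_LATEST_240.kmz",
]

latest = next((url_ for url_ in urls if "LATEST" in url_), None)
if latest is None:
    raise KeyError(f"Unable to find LATEST file within {urls}")
print(latest)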
Example No. 5
def test_list_files_of_climate_observations():
    files_server = list_remote_files(
        "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/"
        "annual/kl/recent",
        recursive=False,
    )

    assert (
        "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/"
        "annual/kl/recent/jahreswerte_KL_01048_akt.zip" in files_server)
Example No. 6
    def get_url_for_date(url: str, date: Union[datetime,
                                               DWDForecastDate]) -> str:
        """
        Method to get a file url based on the MOSMIX-S/MOSMIX-L url and the date that is
        used for filtering.

        Args:
            url:    MOSMIX-S/MOSMIX-L path on the dwd server
            date:   date used for filtering of the available files

        Returns:
            file url based on the filtering
        """
        urls = list_remote_files(url, False)

        if date == DWDForecastDate.LATEST:
            try:
                url = list(filter(lambda url_: "LATEST" in url_.upper(),
                                  urls))[0]
                return url
            except IndexError as e:
                raise IndexError(
                    f"Unable to find LATEST file within {url}") from e

        df_urls = pd.DataFrame({"URL": urls})

        df_urls[DWDMetaColumns.DATETIME.value] = df_urls["URL"].apply(
            lambda url_: url_.split("/")[-1].split("_")[2].replace(".kmz", ""))

        df_urls = df_urls[df_urls[DWDMetaColumns.DATETIME.value] != "LATEST"]

        df_urls[DWDMetaColumns.DATETIME.value] = pd.to_datetime(
            df_urls[DWDMetaColumns.DATETIME.value],
            format=DatetimeFormat.YMDH.value)

        df_urls = df_urls.loc[df_urls[DWDMetaColumns.DATETIME.value] == date]

        if df_urls.empty:
            raise IndexError(f"Unable to find {date} file within {url}")

        return df_urls["URL"].item()
Example No. 7
def create_fileindex_radar(
    parameter: DwdRadarParameter,
    site: Optional[DwdRadarSite] = None,
    fmt: Optional[DwdRadarDataFormat] = None,
    subset: Optional[DwdRadarDataSubset] = None,
    resolution: Optional[DwdRadarResolution] = None,
    period: Optional[DwdRadarPeriod] = None,
    parse_datetime: bool = False,
) -> pd.DataFrame:
    """
    Function to create a file index of the DWD radar data, which is shipped as
    bin bufr or odim-hdf5 data. The file index is created for a single parameter.

    :param parameter:       The radar moment to request
    :param site:            Site/station if parameter is one of
                            RADAR_PARAMETERS_SITES
    :param fmt:          Data format (BINARY, BUFR, HDF5)
    :param subset:          The subset (simple or polarimetric) for HDF5 data.
    :param resolution: Time resolution for RadarParameter.RADOLAN_CDC,
                            either daily or hourly or 5 minutes.
    :param period:     Period type for RadarParameter.RADOLAN_CDC
    :param parse_datetime:  Whether to parse datetimes from file names

    :return:                File index as pandas.DataFrame with FILENAME
                            and DATETIME columns
    """

    parameter_path = build_path_to_parameter(
        parameter=parameter,
        site=site,
        fmt=fmt,
        subset=subset,
        resolution=resolution,
        period=period,
    )

    url = urljoin(DWD_SERVER, parameter_path)

    files_server = list_remote_files(url, recursive=True)

    files_server = pd.DataFrame(files_server,
                                columns=[DwdColumns.FILENAME.value],
                                dtype="str")

    # Some directories have both "---bin" and "---bufr" files within the same directory,
    # so we need to filter here by designated RadarDataFormat. Example:
    # https://opendata.dwd.de/weather/radar/sites/px/boo/
    if fmt is not None:
        if fmt == DwdRadarDataFormat.BINARY:
            files_server = files_server[files_server[
                DwdColumns.FILENAME.value].str.contains("--bin")]
        elif fmt == DwdRadarDataFormat.BUFR:
            files_server = files_server[files_server[
                DwdColumns.FILENAME.value].str.contains("--buf")]

    # Decode datetime of file for filtering.
    if parse_datetime:
        files_server[DwdColumns.DATETIME.value] = files_server[
            DwdColumns.FILENAME.value].apply(get_date_from_filename)

        files_server = files_server.dropna()

    return files_server
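
The format filter and the optional datetime parsing can be seen in isolation on invented filenames; a minimal sketch, with a hypothetical stand-in for get_date_from_filename:

import re

import pandas as pd

# Invented filenames in the style of the "px" site directory cited above.
files_server = pd.DataFrame({"FILENAME": [
    "raa00-px_10015-2101041330-boo---bin",
    "raa00-px_10015-2101041330-boo---buf",
]})

# Keep only the BINARY variant, mirroring the DwdRadarDataFormat.BINARY branch.
files_server = files_server[files_server["FILENAME"].str.contains("--bin")]

def parse_datetime_from_name(name):
    # Hypothetical stand-in for get_date_from_filename: pull the
    # yymmddHHMM token between the dashes.
    match = re.search(r"-(\d{10})-", name)
    return pd.to_datetime(match.group(1), format="%y%m%d%H%M") if match else None

files_server = files_server.assign(
    DATETIME=files_server["FILENAME"].apply(parse_datetime_from_name))
print(files_server)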