def _create_file_index_for_dwd_server(
    parameter: Parameter,
    time_resolution: TimeResolution,
    period_type: PeriodType,
    cdc_base: DWDCDCBase,
) -> pd.DataFrame:
    """
    Function to create a file index of the DWD station data, which usually is
    shipped as zipped/archived data. The file index is created for an individual
    set of parameters.

    Args:
        parameter: parameter of Parameter enumeration
        time_resolution: time resolution of TimeResolution enumeration
        period_type: period type of PeriodType enumeration
        cdc_base: base path e.g. climate_observations/germany

    Returns:
        file index in a pandas.DataFrame with sets of parameters and station id
    """
    parameter_path = build_path_to_parameter(parameter, time_resolution, period_type)

    url = reduce(urljoin, [DWD_SERVER, DWD_CDC_PATH, cdc_base.value, parameter_path])

    files_server = list_remote_files(url, recursive=True)

    files_server = pd.DataFrame(
        files_server, columns=[DWDMetaColumns.FILENAME.value], dtype="str"
    )

    return files_server
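# Usage sketch (illustrative, not part of the library): ``reduce(urljoin, ...)``
# only composes the URL as intended when every path component ends with a
# trailing slash, because ``urljoin`` otherwise replaces the last path segment
# of its base. The components below are hypothetical examples of such parts.
from functools import reduce
from urllib.parse import urljoin

parts = [
    "https://opendata.dwd.de/",
    "climate_environment/CDC/",
    "observations_germany/climate/",
    "annual/kl/recent/",
]
print(reduce(urljoin, parts))
# -> https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/annual/kl/recent/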
def _create_meta_index_for_climate_observations(
    parameter_set: DwdObservationDataset,
    resolution: Resolution,
    period: Period,
) -> pd.DataFrame:
    """Function used to create a meta index DataFrame parsed from the text files
    that are located in each data section of the station data directory of the
    weather service.

    Args:
        parameter_set: observation measure
        resolution: frequency/granularity of measurement interval
        period: current, recent or historical files

    Returns:
        DataFrame with parsed columns of the corresponding text file. Columns are
        translated into English and the data is not yet complete as file existence
        is not checked.
    """
    parameter_path = build_path_to_parameter(parameter_set, resolution, period)

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
        ],
    )

    files_server = list_remote_files(url, recursive=True)

    # Find the one meta file from the files listed on the server
    meta_file = _find_meta_file(files_server, url)

    try:
        file = download_file_from_dwd(meta_file)
    except InvalidURL as e:
        raise InvalidURL(f"Error: reading metadata file {meta_file} failed.") from e

    meta_index = pd.read_fwf(
        filepath_or_buffer=file,
        colspecs=METADATA_FIXED_COLUMN_WIDTH,
        skiprows=[1],
        dtype=str,
        encoding="ISO-8859-1",
    )

    # Fix column names, as the header is not aligned to the fixed column widths:
    # drop the "Unnamed: N" placeholders, re-join the sliced header pieces and
    # split them again on single spaces.
    meta_index.columns = "".join(
        [column for column in meta_index.columns if "unnamed" not in column.lower()]
    ).split(" ")

    meta_index = meta_index.rename(columns=str.lower)
    meta_index = meta_index.rename(columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING)

    return meta_index
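# Sketch of the header fix above (toy data, not the real station list): the
# header of the DWD station files is space-separated while the data rows are
# fixed-width, so ``read_fwf`` cuts the header mid-word and the pieces have to
# be re-joined and split again. Column widths here are made up for the demo.
import io

import pandas as pd

raw = (
    "Stations_id von_datum\n"
    "----------- ---------\n"
    "00001       19370101 \n"
)
toy = pd.read_fwf(io.StringIO(raw), colspecs=[(0, 8), (8, 21)], skiprows=[1], dtype=str)
# Header slices are "Stations" and "_id von_datum"; joining and re-splitting
# recovers the intended names.
toy.columns = "".join(
    [column for column in toy.columns if "unnamed" not in column.lower()]
).split(" ")
print(toy.columns.tolist())  # -> ['Stations_id', 'von_datum']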
def _create_meta_index_for_1minute_historical_precipitation() -> pd.DataFrame:
    """
    A helper function to create a raw index of metadata for the stations of a
    given set of parameters. This raw metadata is then used by other functions.
    This second/alternative function has to be used for high resolution data,
    where the metadata is not available as one file but instead saved in a
    separate file per station - especially for precipitation/1_minute/historical!
    """
    parameter_path = f"{TimeResolution.MINUTE_1.value}/{Parameter.PRECIPITATION.value}/"

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
            META_DATA_FOLDER,
        ],
    )

    metadata_file_paths = list_remote_files(url, recursive=False)

    station_ids = [
        re.findall(STATION_ID_REGEX, file).pop(0) for file in metadata_file_paths
    ]

    meta_index_df = pd.DataFrame(columns=METADATA_COLUMNS)

    with ThreadPoolExecutor() as executor:
        metadata_files = executor.map(
            _download_metadata_file_for_1minute_precipitation, metadata_file_paths
        )

    with ThreadPoolExecutor() as executor:
        metadata_dfs = executor.map(
            _parse_geo_metadata, zip(metadata_files, station_ids)
        )

    # DataFrame.append is deprecated; concatenate the per-station frames instead.
    meta_index_df = pd.concat([meta_index_df, *metadata_dfs], ignore_index=True)

    missing_to_date_index = pd.isnull(meta_index_df[DWDMetaColumns.TO_DATE.value])

    # Stations without an end date are assumed to report up to yesterday.
    meta_index_df.loc[missing_to_date_index, DWDMetaColumns.TO_DATE.value] = pd.Timestamp(
        dt.date.today() - dt.timedelta(days=1)
    ).strftime("%Y%m%d")

    meta_index_df = meta_index_df.astype(METADATA_DTYPE_MAPPING)

    # Drop empty state column again as it will be merged later on
    meta_index_df = meta_index_df.drop(labels=DWDMetaColumns.STATE.value, axis=1)

    return meta_index_df
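# Order note (illustrative sketch, ``fetch`` is a made-up stand-in for the real
# download helper): the two ``executor.map`` calls above rely on the fact that
# ``map`` yields results in the order of its inputs, which is what keeps the
# downloaded files aligned with ``station_ids`` in the ``zip``.
from concurrent.futures import ThreadPoolExecutor


def fetch(path: str) -> str:
    return f"content of {path}"


paths = ["metadaten_1.txt", "metadaten_2.txt", "metadaten_3.txt"]

with ThreadPoolExecutor() as executor:
    results = list(executor.map(fetch, paths))

assert results == [fetch(path) for path in paths]  # order matches the inputs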
def get_url_latest(self, url):
    urls = list_remote_files(url, False)

    try:
        # The indexing is the only operation here that can fail, so catch the
        # IndexError explicitly instead of using a bare except.
        return list(filter(lambda url_: "LATEST" in url_, urls))[0]
    except IndexError as e:
        raise KeyError(f"Unable to find LATEST file within {url}") from e
def test_list_files_of_climate_observations():
    files_server = list_remote_files(
        "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/"
        "annual/kl/recent",
        recursive=False,
    )

    assert (
        "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/"
        "annual/kl/recent/jahreswerte_KL_01048_akt.zip" in files_server
    )
def get_url_for_date(url: str, date: Union[datetime, DWDForecastDate]) -> str:
    """
    Method to get a file url based on the MOSMIX-S/MOSMIX-L url and the date
    that is used for filtering.

    Args:
        url: MOSMIX-S/MOSMIX-L path on the dwd server
        date: date used for filtering of the available files

    Returns:
        file url based on the filtering
    """
    urls = list_remote_files(url, False)

    if date == DWDForecastDate.LATEST:
        try:
            return list(filter(lambda url_: "LATEST" in url_.upper(), urls))[0]
        except IndexError as e:
            raise IndexError(f"Unable to find LATEST file within {url}") from e

    df_urls = pd.DataFrame({"URL": urls})

    # The run datetime is the third underscore-separated token of the file name.
    df_urls[DWDMetaColumns.DATETIME.value] = df_urls["URL"].apply(
        lambda url_: url_.split("/")[-1].split("_")[2].replace(".kmz", "")
    )

    df_urls = df_urls[df_urls[DWDMetaColumns.DATETIME.value] != "LATEST"]

    df_urls[DWDMetaColumns.DATETIME.value] = pd.to_datetime(
        df_urls[DWDMetaColumns.DATETIME.value], format=DatetimeFormat.YMDH.value
    )

    df_urls = df_urls.loc[df_urls[DWDMetaColumns.DATETIME.value] == date]

    if df_urls.empty:
        raise IndexError(f"Unable to find {date} file within {url}")

    return df_urls["URL"].item()
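# Sketch of the date filtering above, with hypothetical MOSMIX-style file
# names; DatetimeFormat.YMDH is assumed to equal "%Y%m%d%H".
import pandas as pd

urls = [
    "https://example.invalid/MOSMIX_S_2021070309_240.kmz",
    "https://example.invalid/MOSMIX_S_2021070312_240.kmz",
    "https://example.invalid/MOSMIX_S_LATEST_240.kmz",
]
df = pd.DataFrame({"URL": urls})
df["DATETIME"] = df["URL"].apply(
    lambda u: u.split("/")[-1].split("_")[2].replace(".kmz", "")
)
df = df[df["DATETIME"] != "LATEST"]
df["DATETIME"] = pd.to_datetime(df["DATETIME"], format="%Y%m%d%H")
print(df.loc[df["DATETIME"] == pd.Timestamp("2021-07-03 12:00"), "URL"].item())
# -> https://example.invalid/MOSMIX_S_2021070312_240.kmz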
def create_fileindex_radar(
    parameter: DwdRadarParameter,
    site: Optional[DwdRadarSite] = None,
    fmt: Optional[DwdRadarDataFormat] = None,
    subset: Optional[DwdRadarDataSubset] = None,
    resolution: Optional[DwdRadarResolution] = None,
    period: Optional[DwdRadarPeriod] = None,
    parse_datetime: bool = False,
) -> pd.DataFrame:
    """
    Function to create a file index of the DWD radar data, which is shipped as
    bin, bufr or ODIM-HDF5 data. The file index is created for a single parameter.

    :param parameter:       The radar moment to request
    :param site:            Site/station if parameter is one of RADAR_PARAMETERS_SITES
    :param fmt:             Data format (BINARY, BUFR, HDF5)
    :param subset:          The subset (simple or polarimetric) for HDF5 data.
    :param resolution:      Time resolution for RadarParameter.RADOLAN_CDC,
                            either daily or hourly or 5 minutes.
    :param period:          Period type for RadarParameter.RADOLAN_CDC
    :param parse_datetime:  Whether to parse datetimes from file names

    :return:                File index as pandas.DataFrame with FILENAME and DATETIME columns
    """
    parameter_path = build_path_to_parameter(
        parameter=parameter,
        site=site,
        fmt=fmt,
        subset=subset,
        resolution=resolution,
        period=period,
    )

    url = urljoin(DWD_SERVER, parameter_path)

    files_server = list_remote_files(url, recursive=True)

    files_server = pd.DataFrame(
        files_server, columns=[DwdColumns.FILENAME.value], dtype="str"
    )

    # Some directories have both "---bin" and "---bufr" files within the same directory,
    # so we need to filter here by designated RadarDataFormat. Example:
    # https://opendata.dwd.de/weather/radar/sites/px/boo/
    if fmt is not None:
        if fmt == DwdRadarDataFormat.BINARY:
            files_server = files_server[
                files_server[DwdColumns.FILENAME.value].str.contains("--bin")
            ]
        elif fmt == DwdRadarDataFormat.BUFR:
            files_server = files_server[
                files_server[DwdColumns.FILENAME.value].str.contains("--buf")
            ]

    # Decode the datetime of each file name for filtering.
    if parse_datetime:
        files_server[DwdColumns.DATETIME.value] = files_server[
            DwdColumns.FILENAME.value
        ].apply(get_date_from_filename)

        files_server = files_server.dropna()

    return files_server
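# Sketch of the format filter above (hypothetical file names modelled on the
# "---bin"/"---buf" suffixes mentioned in the comment): a plain substring match
# on the FILENAME column separates the two formats.
import pandas as pd

files = pd.DataFrame(
    {
        "FILENAME": [
            "raa00-px_10015-2106011200-boo---bin",
            "raa00-px_10015-2106011200-boo---buf",
        ]
    }
)
binary_only = files[files["FILENAME"].str.contains("--bin")]
print(binary_only["FILENAME"].tolist())  # only the "---bin" entry remains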