def _download_dwd_data(
        download_specification: Tuple[Union[str, Path], Union[str, Path]]):
    """
    Download the station data file named by the 'select_dwd' function.

    It checks the shortened filepath (just the zipfile) for its parameters,
    creates the full filepath and downloads the file(s) according to the
    set up folder.

    Args:
        download_specification: contains path to file that should be
            downloaded and the path to the folder to store the files

    Returns:
        stores data on local file system

    Raises:
        NameError: if the download fails for any reason (kept for backwards
            compatibility with existing callers; the underlying exception is
            chained as the cause).
    """
    remote_file, folder = download_specification

    create_folder(subfolder=SUB_FOLDER_STATIONDATA, folder=folder)

    file_server = create_remote_file_name(remote_file)
    file_local = create_local_file_name(remote_file, folder)

    try:
        # Open connection with ftp server
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            ftp_file_download(ptr := ftp, Path(file_server), Path(file_local)) \
                if False else ftp_file_download(ftp,
                                                Path(file_server),
                                                Path(file_local))
    except Exception as e:
        # Chain the original exception so the real cause (network error,
        # missing file, permissions, ...) is not silently discarded.
        raise NameError(
            f"The file\n {file_local} \n couldn't be downloaded!") from e
def create_metaindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType) -> pd.DataFrame:
    """
    Create a simple metadata DataFrame parsed from the text files that are
    located in each data section of the station data directory of the
    weather service.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files

    Return:
        DataFrame with parsed columns of the corresponding text file.
        Columns are translated into English and data is not yet complete as
        file existence is not checked.

    Raises:
        ftplib.Error: if the file list cannot be retrieved from the server.
        urllib.error.URLError: if reading the metadata file fails.
    """
    server_path = PurePosixPath(DWD_PATH,
                                time_resolution.value,
                                parameter.value,
                                period_type.value)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(remote_path=str(server_path),
                                          also_subfolders=False)
    except ftplib.all_errors as e:
        # ftplib.all_errors is a *tuple* of exception classes and cannot be
        # raised itself (the original code would die with a TypeError here).
        # Raise a concrete ftplib.Error instead -- it is part of all_errors,
        # so existing callers that catch all_errors still work.
        raise ftplib.Error("Error: couldn't retrieve filelist from server.\n"
                           f"{str(e)}") from e

    metafile_server = [
        file for file in files_server
        if find_all_matchstrings_in_string(file.lower(),
                                           METADATA_MATCHSTRINGS)
    ].pop(0)

    # str.lstrip removes a leading *character set*, not a prefix, and can
    # therefore eat characters of the following path segment. Remove the
    # base-path prefix explicitly instead (including the separator).
    if metafile_server.startswith(DWD_PATH):
        metafile_server = metafile_server[len(DWD_PATH):].lstrip('/')
    metafile_server = create_remote_file_name(metafile_server)

    try:
        with urllib.request.urlopen(metafile_server) as request:
            file = BytesIO(request.read())
    except urllib.error.URLError as e:
        # Chain the cause so the original network error is preserved.
        raise urllib.error.URLError("Error: reading metadata file failed.\n"
                                    f"{str(e)}") from e

    metaindex = pd.read_fwf(filepath_or_buffer=file,
                            colspecs=METADATA_FIXED_COLUMN_WIDTH,
                            skiprows=[1],
                            dtype=str,
                            encoding="ISO-8859-1")

    # Fix column names, as header is not aligned to fixed column widths
    metaindex.columns = "".join([
        column for column in metaindex.columns
        if "unnamed" not in column.lower()
    ]).split(" ")

    metaindex = metaindex.rename(columns=str.upper).rename(
        columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING)

    return metaindex.astype(METADATA_DTYPE_MAPPING)
def create_file_index_for_dwd_server(parameter: Parameter,
                                     time_resolution: TimeResolution,
                                     period_type: PeriodType) -> pd.DataFrame:
    """
    Function (cached) to create a file index of the DWD station data. The
    file index is created for an individual set of parameters.

    Args:
        parameter: parameter of Parameter enumeration
        time_resolution: time resolution of TimeResolution enumeration
        period_type: period type of PeriodType enumeration

    Returns:
        file index in a pandas.DataFrame with sets of parameters and
        station id

    Raises:
        ftplib.Error: if the file list cannot be retrieved from the server.
    """
    server_path = PurePosixPath(DWD_PATH) / time_resolution.value / \
        parameter.value / period_type.value

    # todo: replace with global requests.Session creating the index
    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(remote_path=str(server_path),
                                          also_subfolders=True)
    except ftplib.all_errors as e:
        # The original `raise e("...")` called the caught exception
        # *instance*, which itself raises TypeError. Raise a proper
        # ftplib.Error (member of all_errors) chained to the cause instead.
        raise ftplib.Error("Creating file index currently not possible.") \
            from e

    files_server = pd.DataFrame(files_server,
                                columns=[DWDMetaColumns.FILENAME.value],
                                dtype='str')

    # Filter for .zip files
    files_server = files_server[files_server.FILENAME.str.endswith(
        ARCHIVE_FORMAT)]

    # Strip the server base path; regex=False makes the replacement literal
    # instead of depending on the pandas regex default.
    files_server.loc[:, DWDMetaColumns.FILENAME.value] = files_server.loc[
        :, DWDMetaColumns.FILENAME.value].str.replace(
            DWD_PATH + '/', '', regex=False)

    # The file name (last path segment) carries the station id.
    file_names = files_server.loc[:, DWDMetaColumns.FILENAME.value].str.split(
        "/").apply(lambda strings: strings[-1])

    files_server.loc[:, DWDMetaColumns.STATION_ID.value] = file_names.apply(
        lambda x: re.findall(STATID_REGEX, x).pop(0))

    files_server.loc[:, DWDMetaColumns.STATION_ID.value] = files_server.loc[
        :, DWDMetaColumns.STATION_ID.value].astype(int)

    files_server = files_server.sort_values(
        by=[DWDMetaColumns.STATION_ID.value, DWDMetaColumns.FILENAME.value])

    return files_server.loc[:, [DWDMetaColumns.STATION_ID.value,
                                DWDMetaColumns.FILENAME.value]]
def metaindex_for_1minute_data(
        parameter: Parameter,
        time_resolution: TimeResolution) -> pd.DataFrame:
    """
    A helping function to create a raw index of metadata for stations of the
    set of parameters as given. This raw metadata is then used by other
    functions. This second/alternative function must be used for high
    resolution data, where the metadata is not available as file but instead
    saved in external files per each station.
    - especially for precipitation/1_minute/historical!

    Args:
        parameter: observation measure
        time_resolution: must be TimeResolution.MINUTE_1

    Returns:
        DataFrame with metadata for all stations, sorted by station id.
    """
    assert time_resolution == TimeResolution.MINUTE_1, \
        "Wrong TimeResolution, only 1 minute is valid "

    metadata_path = PurePosixPath(DWD_PATH,
                                  time_resolution.value,
                                  parameter.value,
                                  FTP_METADATA_NAME)

    with FTP(DWD_SERVER) as ftp:
        ftp.login()
        metadata_filepaths = ftp.list_files(remote_path=str(metadata_path),
                                            also_subfolders=False)

    # str.lstrip removes a leading *character set*, not a prefix; strip the
    # base path (and its trailing separator) explicitly instead.
    metadata_filepaths = [
        create_remote_file_name(
            file[len(DWD_PATH):].lstrip('/')
            if file.startswith(DWD_PATH) else file)
        for file in metadata_filepaths
    ]

    statids = [
        re.findall(STATID_REGEX, file).pop(0) for file in metadata_filepaths
    ]

    metaindex_df = pd.DataFrame(None, columns=METADATA_COLUMNS)

    # Run the pool as a context manager so worker processes are terminated;
    # the original created two Pool() objects and never closed either one.
    with Pool() as pool:
        metadata_files = pool.map(download_metadata_file_for_1minute_data,
                                  metadata_filepaths)
        metadata_dfs = pool.map(combine_geo_and_par_file_to_metadata_df,
                                zip(metadata_files, statids))

    # DataFrame.append is deprecated (removed in pandas 2.x); concat does
    # the same row-wise combination.
    metaindex_df = pd.concat([metaindex_df, *metadata_dfs],
                             ignore_index=True)

    metaindex_df = metaindex_df.astype(METADATA_DTYPE_MAPPING)

    return metaindex_df.sort_values(
        DWDColumns.STATION_ID.value).reset_index(drop=True)
def create_fileindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType,
                     folder: str = DWD_FOLDER_MAIN) -> None:
    """
    A function to receive current files on server as list excluding
    description files and only containing those files that have measuring
    data.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local base folder where the file list CSV is stored

    Returns:
        None; the file list is written to disk as CSV.

    Raises:
        ftplib.Error: if the file list cannot be retrieved from the server.
    """
    # Check for folder and create if necessary
    create_folder(subfolder=DWD_FOLDER_METADATA, folder=folder)

    filelist_local_path = Path(folder,
                               DWD_FOLDER_METADATA,
                               f"{FILELIST_NAME}_{parameter.value}_"
                               f"{time_resolution.value}_"
                               f"{period_type.value}{DATA_FORMAT}")

    server_path = PurePosixPath(DWD_PATH,
                                time_resolution.value,
                                parameter.value,
                                period_type.value)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(remote_path=str(server_path),
                                          also_subfolders=True)
    except ftplib.all_errors as e:
        # ftplib.all_errors is a tuple of exception classes and cannot be
        # raised itself; raise a concrete ftplib.Error (member of the tuple,
        # so callers catching all_errors still work) chained to the cause.
        raise ftplib.Error(
            "Error: creating a filelist currently not possible.\n"
            f"{str(e)}") from e

    files_server = pd.DataFrame(files_server,
                                columns=[DWDColumns.FILENAME.value],
                                dtype='str')

    # str.lstrip strips a leading *character set*, not a prefix, and can eat
    # characters of the following path segment; remove the prefix explicitly.
    path_prefix = DWD_PATH + '/'
    files_server.loc[:, DWDColumns.FILENAME.value] = \
        files_server.loc[:, DWDColumns.FILENAME.value].apply(
            lambda filename: filename[len(path_prefix):]
            if filename.startswith(path_prefix) else filename)

    # Keep only archive files; regex=False so the "." in the archive suffix
    # is matched literally rather than as a regex wildcard.
    files_server = files_server[files_server.FILENAME.str.contains(
        ARCHIVE_FORMAT, regex=False)]

    files_server.loc[:, DWDColumns.FILEID.value] = files_server.index

    # The file name (last path segment) carries the station id.
    file_names = files_server.iloc[:, 0].str.split("/").apply(
        lambda string: string[-1])

    files_server.loc[:, DWDColumns.STATION_ID.value] = file_names.apply(
        lambda x: re.findall(STATID_REGEX, x).pop(0))

    # Reorder to FILEID, STATION_ID, FILENAME.
    files_server = files_server.iloc[:, [1, 2, 0]]

    files_server.iloc[:, 1] = files_server.iloc[:, 1].astype(int)

    files_server = files_server.sort_values(by=[DWDColumns.STATION_ID.value])

    remove_old_file(file_type=FILELIST_NAME,
                    parameter=parameter,
                    time_resolution=time_resolution,
                    period_type=period_type,
                    file_postfix=DATA_FORMAT,
                    folder=folder,
                    subfolder=DWD_FOLDER_METADATA)

    files_server.to_csv(path_or_buf=filelist_local_path,
                        header=True,
                        index=False)
def metaindex_for_1minute_data(parameter: Parameter,
                               time_resolution: TimeResolution,
                               folder) -> pd.DataFrame:
    """
    A helping function to create a raw index of metadata for stations of the
    set of parameters as given. This raw metadata is then used by other
    functions. This second/alternative function must be used for high
    resolution data, where the metadata is not available as file but instead
    saved in external files per each station.
    - especially for precipitation/1_minute/historical!

    Args:
        parameter: observation measure
        time_resolution: must be TimeResolution.MINUTE_1
        folder: local base folder; per-station metadata zips are temporarily
            stored under its SUB_FOLDER_METADATA subfolder and deleted again

    Returns:
        DataFrame with one metadata row per station, typed per
        META_INDEX_DTYPES and sorted by station id.
    """
    assert time_resolution == TimeResolution.MINUTE_1, \
        "Wrong TimeResolution, only 1 minute is valid "

    metadata_path = PurePosixPath(DWD_PATH,
                                  time_resolution.value,
                                  parameter.value,
                                  FTP_METADATA_NAME)
    metadata_path = str(metadata_path)

    # List all per-station metadata archives on the server.
    with FTP(DWD_SERVER) as ftp:
        ftp.login()
        metadata_server = ftp.nlst(metadata_path)

    # Local target path for each server file (same base name).
    metadata_local = [
        str(Path(folder, SUB_FOLDER_METADATA, metadata_file.split("/")[-1]))
        for metadata_file in metadata_server
    ]

    metadata_df = pd.DataFrame(None, columns=METADATA_1MIN_COLUMNS)

    # Download, parse and discard one zip archive per station.
    # NOTE(review): a fresh FTP connection is opened for every file here --
    # presumably to avoid server-side timeouts; confirm before reusing one.
    for metafile_server, metafile_local in tqdm(zip(metadata_server,
                                                    metadata_local),
                                                total=len(metadata_server)):
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            ftp.download(metafile_server, metafile_local)

        with ZipFile(metafile_local) as zip_file:
            zip_file_files = zip_file.infolist()

            zip_file_files = [
                zip_file_file.filename for zip_file_file in zip_file_files
            ]

            # Pick the first member matching the geography match strings.
            file_geo = [
                zip_file_file for zip_file_file in zip_file_files
                if all([
                    matchstring in zip_file_file.lower()
                    for matchstring in METADATA_1MIN_GEO_MATCHSTRINGS
                ])
            ].pop(0)

            # Pick the first member matching the parameter match strings.
            file_par = [
                zip_file_file for zip_file_file in zip_file_files
                if all([
                    matchstring in zip_file_file.lower()
                    for matchstring in METADATA_1MIN_PAR_MATCHSTRINGS
                ])
            ].pop(0)

            # Fall back to the slower python engine on decoding problems.
            with zip_file.open(file_geo) as file_opened:
                try:
                    geo_file = parse_zipped_data_into_df(file_opened)
                except UnicodeDecodeError:
                    geo_file = parse_zipped_data_into_df(file_opened,
                                                         engine='python')

            with zip_file.open(file_par) as file_opened:
                try:
                    par_file = parse_zipped_data_into_df(file_opened)
                except UnicodeDecodeError:
                    par_file = parse_zipped_data_into_df(file_opened,
                                                         engine='python')

        # The downloaded archive is no longer needed once parsed.
        Path(metafile_local).unlink()

        # Translate German column headers to English where a mapping exists.
        geo_file.columns = [
            GERMAN_TO_ENGLISH_COLUMNS_MAPPING.get(name.strip().upper(),
                                                  name.strip().upper())
            for name in geo_file.columns
        ]
        par_file.columns = [
            GERMAN_TO_ENGLISH_COLUMNS_MAPPING.get(name.strip().upper(),
                                                  name.strip().upper())
            for name in par_file.columns
        ]

        # Keep only the most recent geography row; derive the overall
        # observation period from the parameter file's date range.
        geo_file = geo_file.iloc[[-1], :]
        par_file = par_file.loc[:, [FROM_DATE_NAME, TO_DATE_NAME]].dropna()
        geo_file[FROM_DATE_NAME] = par_file[FROM_DATE_NAME].min()
        geo_file[TO_DATE_NAME] = par_file[TO_DATE_NAME].max()
        geo_file = geo_file.loc[:, METADATA_1MIN_COLUMNS]
        metadata_df = metadata_df.append(geo_file, ignore_index=True)

    # Positional dtype mapping: station id, from/to dates, station height,
    # latitude/longitude, station name -- TODO confirm column order.
    columns = metadata_df.columns
    META_INDEX_DTYPES = {
        columns[0]: int,
        columns[1]: datetime64,
        columns[2]: datetime64,
        columns[3]: float,
        columns[4]: float,
        columns[5]: float,
        columns[6]: str
    }

    metadata_df = metadata_df.astype(META_INDEX_DTYPES)

    metadata_df = metadata_df.sort_values(STATION_ID_NAME).reset_index(
        drop=True)

    return metadata_df
def create_metaindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType) -> pd.DataFrame:
    """
    Create a metadata index DataFrame by downloading and parsing the
    fixed-width station description file of the given data set from the
    DWD server, with columns translated to English and typed.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files

    Return:
        DataFrame with the parsed station metadata.

    Raises:
        NameError: if the file list or the metadata file cannot be read
            (kept for backwards compatibility; the underlying exception is
            chained as the cause).
    """
    server_path = PurePosixPath(DWD_PATH,
                                time_resolution.value,
                                parameter.value,
                                period_type.value)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(path=str(server_path))
    except Exception as e:
        # Chain the cause instead of discarding it.
        raise NameError("Couldn't retrieve filelist from server") from e

    metafile_server = [
        file for file in files_server
        if all([
            matchstring in file.lower()
            for matchstring in METADATA_MATCHSTRINGS
        ])
    ].pop(0)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            file = ftp.read_file_to_bytes(metafile_server)
    except Exception as e:
        raise NameError(
            "Reading metadata file currently is not possible. "
            "Try again!") from e

    metaindex = pd.read_fwf(filepath_or_buffer=file,
                            colspecs=METADATA_FIXED_COLUMN_WIDTH,
                            skiprows=[1],
                            dtype=str)

    # The header row is not aligned to the fixed column widths, so drop the
    # auto-generated "Unnamed: n" columns and re-split the joined header.
    metaindex_colnames = [
        colname for colname in metaindex.columns
        if "unnamed" not in colname.lower()
    ]
    metaindex_colnames_fixed = "".join(metaindex_colnames).split(" ")
    metaindex.columns = [
        GERMAN_TO_ENGLISH_COLUMNS_MAPPING.get(name.upper(), name.upper())
        for name in metaindex_colnames_fixed
    ]

    # Positional dtype mapping: station id, from/to dates, station height,
    # latitude/longitude, station name, state -- TODO confirm column order.
    columns = metaindex.columns
    META_INDEX_DTYPES = {
        columns[0]: int,
        columns[1]: datetime64,
        columns[2]: datetime64,
        columns[3]: float,
        columns[4]: float,
        columns[5]: float,
        columns[6]: str,
        columns[7]: str
    }

    metaindex = metaindex.astype(META_INDEX_DTYPES)

    return metaindex
def create_fileindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType,
                     folder: str = MAIN_FOLDER):
    """
    A function to receive current files on server as list excluding
    description files and only containing those files that have measuring
    data.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local base folder where the file list CSV is stored

    Returns:
        None; the file list is written to disk as CSV.

    Raises:
        NameError: if the file list cannot be downloaded (kept for
            backwards compatibility; the underlying exception is chained).
    """
    # Check for folder and create if necessary
    create_folder(subfolder=SUB_FOLDER_METADATA, folder=folder)

    filelist_local_path = Path(folder,
                               SUB_FOLDER_METADATA,
                               f"{FILELIST_NAME}_{parameter.value}_"
                               f"{time_resolution.value}_"
                               f"{period_type.value}{DATA_FORMAT}")
    # Normalize to forward slashes so the CSV path is platform independent.
    filelist_local_path = str(filelist_local_path).replace('\\', '/')

    # PurePosixPath guarantees forward slashes for the *server* path on any
    # OS (equivalent to the former Path + os.sep + backslash-replace dance,
    # and consistent with the sibling functions); keep the trailing slash.
    server_path = PurePosixPath(DWD_PATH,
                                time_resolution.value,
                                parameter.value,
                                period_type.value)
    server_path = f"{server_path}/"

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(path=server_path)
    except Exception as e:
        # Chain the cause instead of discarding it.
        raise NameError(
            "Download of fileslist file currently not possible. "
            "Try again!") from e

    files_server = pd.DataFrame(files_server)

    files_server.columns = [FILENAME_NAME]

    files_server.loc[:, FILENAME_NAME] = files_server.loc[:, FILENAME_NAME] \
        .apply(str)

    # str.lstrip strips a leading *character set*, not a prefix, and can eat
    # characters of the following path segment; remove the prefix explicitly.
    path_prefix = DWD_PATH + '/'
    files_server.loc[:, FILENAME_NAME] = \
        files_server.loc[:, FILENAME_NAME].apply(
            lambda filename: filename[len(path_prefix):]
            if filename.startswith(path_prefix) else filename)

    files_server = files_server[files_server.FILENAME.str.contains(
        ARCHIVE_FORMAT)]

    files_server \
        .insert(loc=1,
                column=FILEID_NAME,
                value=files_server.index)

    # The station id sits at a period-type dependent position of the
    # underscore-separated file name.
    files_server \
        .insert(loc=2,
                column=STATION_ID_NAME,
                value=files_server.iloc[:, 0].str.split('_')
                .apply(lambda string:
                       string[STRING_STATID_COL.get(period_type, None)]))

    # Reorder to FILEID, STATION_ID, FILENAME.
    files_server = files_server.iloc[:, [1, 2, 0]]

    files_server.iloc[:, 1] = files_server.iloc[:, 1].apply(int)

    files_server = files_server.sort_values(by=[STATION_ID_NAME])

    remove_old_file(file_type=FILELIST_NAME,
                    parameter=parameter,
                    time_resolution=time_resolution,
                    period_type=period_type,
                    file_postfix=DATA_FORMAT,
                    folder=folder,
                    subfolder=SUB_FOLDER_METADATA)

    files_server.to_csv(path_or_buf=filelist_local_path,
                        header=True,
                        index=False)

    return None