Example #1
0
def _download_dwd_data(download_specification: Tuple[Union[str, Path],
                                                     Union[str, Path]]):
    """
    Download the station data file named by the 'select_dwd' function.

    Creates the station-data subfolder if needed, builds the remote and
    local file paths from the shortened file path, and fetches the file
    from the DWD FTP server.

    Args:
        download_specification: tuple of (path to the remote file that
            should be downloaded, path to the local folder to store it in)

    Returns:
        None; stores the downloaded data on the local file system.

    Raises:
        NameError: if the file could not be downloaded (original cause
            is chained onto the exception).
    """
    remote_file, folder = download_specification

    create_folder(subfolder=SUB_FOLDER_STATIONDATA, folder=folder)

    file_server = create_remote_file_name(remote_file)
    file_local = create_local_file_name(remote_file, folder)

    try:
        # Open connection with ftp server; anonymous login.
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            ftp_file_download(ftp, Path(file_server), Path(file_local))

    except Exception as e:
        # Keep NameError for backward compatibility with existing callers,
        # but chain the real cause so it is not lost in the traceback.
        raise NameError(
            f"The file\n {file_local} \n couldn't be downloaded!") from e
def create_metainfo_fpath(folder: str, parameter: Parameter,
                          period_type: PeriodType,
                          time_resolution: TimeResolution) -> Path:
    """ Build the path to the local metadata file, creating its folder. """
    base_folder = correct_folder_path(folder)

    create_folder(subfolder=SUB_FOLDER_METADATA, folder=base_folder)

    # File name is assembled from the request parameters plus the
    # configured data format suffix.
    metafile_name = (f"{METADATA_NAME}_{parameter.value}_"
                     f"{time_resolution.value}_{period_type.value}"
                     f"{DATA_FORMAT}")

    return Path(base_folder, SUB_FOLDER_METADATA, metafile_name)
Example #3
0
def create_fileindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType,
                     folder: str = DWD_FOLDER_MAIN) -> None:
    """
    Receive the current measuring-data files on the server as a list
    (excluding description files) and store it as a local CSV file index.

    Args:
        parameter: the weather parameter of the requested data
        time_resolution: the temporal resolution of the requested data
        period_type: the period type of the requested data
        folder: local main folder in which the index is stored

    Returns:
        None; writes the file index CSV to the metadata subfolder.

    Raises:
        OSError: if the file list could not be retrieved from the server
            (original FTP error is chained).
    """
    # Check for folder and create if necessary
    create_folder(subfolder=DWD_FOLDER_METADATA, folder=folder)

    filelist_local_path = Path(
        folder, DWD_FOLDER_METADATA, f"{FILELIST_NAME}_{parameter.value}_"
        f"{time_resolution.value}_"
        f"{period_type.value}{DATA_FORMAT}")

    server_path = PurePosixPath(DWD_PATH, time_resolution.value,
                                parameter.value, period_type.value)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(remote_path=str(server_path),
                                          also_subfolders=True)

    except ftplib.all_errors as e:
        # BUGFIX: ftplib.all_errors is a *tuple* of exception classes and
        # cannot itself be raised; doing so raised a TypeError that masked
        # the real FTP failure. Raise a concrete exception and chain the
        # original cause instead.
        raise OSError(
            "Error: creating a filelist currently not possible.\n"
            f"{str(e)}") from e

    files_server = pd.DataFrame(files_server,
                                columns=[DWDColumns.FILENAME.value],
                                dtype='str')

    # BUGFIX: str.lstrip(DWD_PATH + '/') strips a *character set*, not a
    # prefix, and could therefore also remove leading characters of the
    # file name itself. Remove the exact path prefix instead.
    path_prefix = DWD_PATH + '/'
    files_server.loc[:, DWDColumns.FILENAME.value] = files_server.loc[
        :, DWDColumns.FILENAME.value].apply(
            lambda filename: filename[len(path_prefix):]
            if filename.startswith(path_prefix) else filename)

    # Keep only archive files (those contain the actual measuring data).
    files_server = files_server[files_server.FILENAME.str.contains(
        ARCHIVE_FORMAT)]

    files_server.loc[:, DWDColumns.FILEID.value] = files_server.index

    # Extract the station id from the bare file name (last path component).
    file_names = files_server.iloc[:, 0].str.split("/").apply(
        lambda string: string[-1])

    files_server.loc[:, DWDColumns.STATION_ID.value] = file_names.apply(
        lambda x: re.findall(STATID_REGEX, x).pop(0))

    # Reorder to (FILEID, STATION_ID, FILENAME).
    files_server = files_server.iloc[:, [1, 2, 0]]

    files_server.iloc[:, 1] = files_server.iloc[:, 1].astype(int)

    files_server = files_server.sort_values(by=[DWDColumns.STATION_ID.value])

    # Replace any previously stored index before writing the new one.
    remove_old_file(file_type=FILELIST_NAME,
                    parameter=parameter,
                    time_resolution=time_resolution,
                    period_type=period_type,
                    file_postfix=DATA_FORMAT,
                    folder=folder,
                    subfolder=DWD_FOLDER_METADATA)

    files_server.to_csv(path_or_buf=filelist_local_path,
                        header=True,
                        index=False)
def _parse_dwd_data(files_in_bytes: Optional[List[Tuple[str, BytesIO]]],
                    prefer_local: bool, folder: str, write_file: bool,
                    request_string: str) -> pd.DataFrame:
    """
    A wrapping function that only handles data for one station id. The files
    passed to it are thus related to this id. This is important for storing
    the data locally, as the DataFrame that is stored should only hold one
    station at a time.

    :param files_in_bytes: the files belonging to one station
    :param prefer_local: if it should preferably be loaded from a local file
    :param folder: the folder where the local file is stored
    :param write_file: if the freshly parsed data should be written to the
        local HDF store
    :param request_string: key identifying the request in the HDF store
    :return: the dataframe with data from that station; can be empty if no
        data is provided or the local file is not found or has no data in it
    """
    loaded_locally = False
    data = None

    # If preferred locally, try now to read from this data
    if prefer_local:
        try:
            data = pd.read_hdf(Path(folder, STATIONDATA_NAME) /
                               f"{STATIONDATA_NAME}{H5_FORMAT}",
                               key=request_string)

            # dtypes are mapped manually to ensure expected dtypes
            data = data.astype(create_stationdata_dtype_mapping(data.columns))

            loaded_locally = True
        except (FileNotFoundError, OSError):
            print(
                f"Error: There seems to be no file "
                f"{Path(folder, STATIONDATA_NAME) / f'{STATIONDATA_NAME}{H5_FORMAT}'}. Data will be loaded freshly."
            )
        except KeyError:
            print(
                f"Error: The requested data for {request_string} does not yet exist in local store. Data will be "
                f"loaded freshly.")

    if not loaded_locally:
        data = []
        for _statid, filename, file_in_bytes in files_in_bytes:
            try:
                data_file = pd.read_csv(
                    filepath_or_buffer=file_in_bytes,
                    sep=STATIONDATA_SEP,
                    na_values=NA_STRING,
                    dtype=
                    "str"  # dtypes are mapped manually to ensure expected dtypes
                )

                data.append(data_file)
            except pd.errors.ParserError as e:
                # BUGFIX: the message previously hard-coded "(unknown)"
                # although the file name is available from the loop tuple.
                print(
                    f"Error: The file for {filename} could not be parsed to a DataFrame and will be skipped. \n"
                    f"Message: {str(e)}")
            except ValueError:
                print(
                    "Error: file from files_in_bytes is None. No data is parsed."
                )

        try:
            data = pd.concat(data).reset_index(drop=True)
        except ValueError:
            # pd.concat raises ValueError on an empty list -> no usable data.
            return pd.DataFrame()

        # Normalize column names and map German headers to English.
        data = data.rename(columns=str.upper).rename(
            columns=str.strip).rename(GERMAN_TO_ENGLISH_COLUMNS_MAPPING)

        data = data.astype(create_stationdata_dtype_mapping(data.columns))

    # Only persist data that was freshly parsed, never re-write local reads.
    if write_file and not loaded_locally:
        try:
            create_folder(STATIONDATA_NAME, folder)

            data.to_hdf(Path(folder, STATIONDATA_NAME) /
                        f"{STATIONDATA_NAME}{H5_FORMAT}",
                        key=request_string)
        except FileNotFoundError:
            print(
                f"Error: File for station data could not be created at "
                f"{str(Path(folder, STATIONDATA_NAME, f'{STATIONDATA_NAME}{H5_FORMAT}'))}. "
                f"Data for {request_string} could not be written.")

    return data
Example #5
0
def create_fileindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType,
                     folder: str = MAIN_FOLDER):
    """
    Receive the current measuring-data files on the server as a list
    (excluding description files) and store it as a local CSV file index.

    Args:
        parameter: the weather parameter of the requested data
        time_resolution: the temporal resolution of the requested data
        period_type: the period type of the requested data
        folder: local main folder in which the index is stored

    Returns:
        None; writes the file index CSV to the metadata subfolder.

    Raises:
        NameError: if the file list could not be downloaded (original
            cause is chained onto the exception).
    """
    # Check for folder and create if necessary
    create_folder(subfolder=SUB_FOLDER_METADATA, folder=folder)

    filelist_local_path = Path(
        folder, SUB_FOLDER_METADATA, f"{FILELIST_NAME}_{parameter.value}_"
        f"{time_resolution.value}_"
        f"{period_type.value}{DATA_FORMAT}")

    # Normalize to forward slashes so the path is valid on the FTP side
    # regardless of the local OS separator.
    filelist_local_path = str(filelist_local_path).replace('\\', '/')

    server_path = Path(DWD_PATH, time_resolution.value, parameter.value,
                       period_type.value)

    server_path = f"{server_path}{os.sep}".replace('\\', '/')

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(path=server_path)

    except Exception as e:
        # Keep NameError for backward compatibility with existing callers,
        # but chain the real cause so it is not lost in the traceback.
        raise NameError(
            "Download of fileslist file currently not possible. Try again!"
        ) from e

    files_server = pd.DataFrame(files_server)

    files_server.columns = [FILENAME_NAME]

    files_server.loc[:, FILENAME_NAME] = files_server.loc[:, FILENAME_NAME] \
        .apply(str)

    # BUGFIX: str.lstrip(DWD_PATH + '/') strips a *character set*, not a
    # prefix, and could therefore also remove leading characters of the
    # file name itself. Remove the exact path prefix instead.
    path_prefix = DWD_PATH + '/'
    files_server.loc[:, FILENAME_NAME] = files_server.loc[
        :, FILENAME_NAME].apply(
            lambda filename: filename[len(path_prefix):]
            if filename.startswith(path_prefix) else filename)

    # Keep only archive files (those contain the actual measuring data).
    files_server = files_server[files_server.FILENAME.str.contains(
        ARCHIVE_FORMAT)]

    files_server \
        .insert(loc=1,
                column=FILEID_NAME,
                value=files_server.index)

    # Station id position within the underscore-split file name depends
    # on the period type.
    files_server \
        .insert(loc=2,
                column=STATION_ID_NAME,
                value=files_server.iloc[:, 0].str.split('_')
                .apply(lambda string: string[STRING_STATID_COL.get(period_type, None)]))

    # Reorder to (FILEID, STATION_ID, FILENAME).
    files_server = files_server.iloc[:, [1, 2, 0]]

    files_server.iloc[:, 1] = files_server.iloc[:, 1].apply(int)

    files_server = files_server.sort_values(by=[STATION_ID_NAME])

    # Replace any previously stored index before writing the new one.
    remove_old_file(file_type=FILELIST_NAME,
                    parameter=parameter,
                    time_resolution=time_resolution,
                    period_type=period_type,
                    file_postfix=DATA_FORMAT,
                    folder=folder,
                    subfolder=SUB_FOLDER_METADATA)

    files_server.to_csv(path_or_buf=filelist_local_path,
                        header=True,
                        index=False)