Beispiel #1
0
def metadata_for_dwd_data(parameter: Parameter,
                          time_resolution: TimeResolution,
                          period_type: PeriodType,
                          folder: str = MAIN_FOLDER,
                          write_file: bool = True,
                          create_new_filelist: bool = False) -> pd.DataFrame:
    """
    A main function to retrieve metadata for a set of parameters that creates a
        corresponding csv.

    If a metadata file for the requested combination already exists locally and
    ``create_new_filelist`` is false, that file is read and returned unchanged.
    Otherwise the metadata index is built (1-minute data uses its own index
    builder), file presence information is added and the result is optionally
    written to the local file system.

    STATE information is added to metadata for cases where there's no such named
    column (e.g. STATE) in the dataframe.
    For this purpose we use daily precipitation data. That has two reasons:
     - daily precipitation data has a STATE information combined with a city
     - daily precipitation data is the most common data served by the DWD

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where files should be stored
        write_file: writes the meta data file to the local file system
        create_new_filelist: if true: a new file_list for metadata will
         be created

    Returns:
        the metadata, either read from the existing csv file or freshly built

    Raises:
        TypeError: if one of the arguments does not have the expected type
    """
    # Explicit checks instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    expected_types = (
        ("parameter", parameter, Parameter),
        ("time_resolution", time_resolution, TimeResolution),
        ("period_type", period_type, PeriodType),
        ("folder", folder, str),
        ("write_file", write_file, bool),
        ("create_new_filelist", create_new_filelist, bool),
    )
    for arg_name, arg_value, arg_type in expected_types:
        if not isinstance(arg_value, arg_type):
            raise TypeError(
                f"Error: '{arg_name}' is not of type {arg_type.__name__}.")

    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    file_path = create_metainfo_fpath(folder, parameter, period_type,
                                      time_resolution)

    # Serve the stored metadata unless a rebuild was explicitly requested.
    if check_file_exist(file_path) and not create_new_filelist:
        return pd.read_csv(filepath_or_buffer=file_path)

    if time_resolution == TimeResolution.MINUTE_1:
        metainfo = metaindex_for_1minute_data(parameter=parameter,
                                              time_resolution=time_resolution,
                                              folder=folder)
    else:
        metainfo = create_metaindex(parameter=parameter,
                                    time_resolution=time_resolution,
                                    period_type=period_type)

    if STATE_NAME not in metainfo.columns:
        # Borrow the STATE column from the daily precipitation metadata and
        # attach it via the station name (see docstring for the rationale).
        mdp = metadata_for_dwd_data(Parameter.PRECIPITATION_MORE,
                                    TimeResolution.DAILY,
                                    PeriodType.HISTORICAL,
                                    folder=folder,
                                    write_file=False,
                                    create_new_filelist=False)

        metainfo = metainfo.merge(mdp.loc[:, [STATIONNAME_NAME, STATE_NAME]],
                                  on=STATIONNAME_NAME).reset_index(drop=True)

    metainfo = add_filepresence(metainfo=metainfo,
                                parameter=parameter,
                                time_resolution=time_resolution,
                                period_type=period_type,
                                folder=folder,
                                create_new_filelist=create_new_filelist)

    # NOTE(review): because of ``not create_new_filelist`` a rebuilt index
    # (create_new_filelist=True) is never written here - confirm this is
    # intended before changing it.
    if write_file and not check_file_exist(file_path) and not \
            create_new_filelist:
        remove_old_file(file_type=METADATA_NAME,
                        file_postfix=DATA_FORMAT,
                        parameter=parameter,
                        time_resolution=time_resolution,
                        period_type=period_type,
                        folder=folder,
                        subfolder=SUB_FOLDER_METADATA)

        metainfo.to_csv(path_or_buf=file_path, header=True, index=False)

    return metainfo
Beispiel #2
0
def create_fileindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType,
                     folder: str = DWD_FOLDER_MAIN) -> None:
    """
    A function to receive current files on server as list excluding description
    files and only containing those files that have measuring data.

    The listing is filtered to entries containing ARCHIVE_FORMAT, extended by
    a file id and the station id parsed from the file name, sorted by station
    id and written as csv into the metadata subfolder of ``folder``.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where files should be stored

    Raises:
        ftplib.Error: if the file listing cannot be retrieved from the server
    """
    # Check for folder and create if necessary
    create_folder(subfolder=DWD_FOLDER_METADATA, folder=folder)

    filelist_local_path = Path(
        folder, DWD_FOLDER_METADATA, f"{FILELIST_NAME}_{parameter.value}_"
        f"{time_resolution.value}_"
        f"{period_type.value}{DATA_FORMAT}")

    server_path = PurePosixPath(DWD_PATH, time_resolution.value,
                                parameter.value, period_type.value)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(remote_path=str(server_path),
                                          also_subfolders=True)

    except ftplib.all_errors as e:
        # ``ftplib.all_errors`` is a tuple of exception classes and cannot be
        # raised itself; ``ftplib.Error`` is a member of that tuple, so
        # callers catching ``ftplib.all_errors`` still catch this one.
        raise ftplib.Error(
            "Error: creating a filelist currently not possible.\n"
            f"{str(e)}") from e

    filename_col = DWDColumns.FILENAME.value
    fileid_col = DWDColumns.FILEID.value
    station_id_col = DWDColumns.STATION_ID.value

    files_server = pd.DataFrame(files_server,
                                columns=[filename_col],
                                dtype='str')

    server_prefix = DWD_PATH.strip('/') + '/'

    def relative_to_server_root(filename: str) -> str:
        # str.lstrip(prefix) strips a *set of characters*, not a prefix, and
        # would also eat leading characters of the remaining path. Only the
        # exact server root is removed here.
        filename = filename.lstrip('/')
        if filename.startswith(server_prefix):
            return filename[len(server_prefix):]
        return filename

    files_server[filename_col] = files_server[filename_col].apply(
        relative_to_server_root)

    # Keep archives only; copy so the following column assignments work on an
    # independent frame instead of a view of the unfiltered one.
    files_server = files_server[files_server[filename_col].str.contains(
        ARCHIVE_FORMAT)].copy()

    # The file id is the position of the file in the unfiltered listing.
    files_server[fileid_col] = files_server.index

    file_names = files_server[filename_col].str.split("/").str[-1]

    files_server[station_id_col] = file_names.apply(
        lambda name: re.findall(STATID_REGEX, name)[0]).astype(int)

    files_server = files_server[[fileid_col, station_id_col, filename_col]]

    files_server = files_server.sort_values(by=[station_id_col])

    remove_old_file(file_type=FILELIST_NAME,
                    parameter=parameter,
                    time_resolution=time_resolution,
                    period_type=period_type,
                    file_postfix=DATA_FORMAT,
                    folder=folder,
                    subfolder=DWD_FOLDER_METADATA)

    files_server.to_csv(path_or_buf=filelist_local_path,
                        header=True,
                        index=False)
Beispiel #3
0
def metadata_for_dwd_data(parameter: Parameter,
                          time_resolution: TimeResolution,
                          period_type: PeriodType,
                          folder: str = DWD_FOLDER_MAIN,
                          write_file: bool = True,
                          create_new_filelist: bool = False) -> pd.DataFrame:
    """
    A main function to retrieve metadata for a set of parameters that creates a
        corresponding csv.

    If a metadata file for the requested combination already exists locally and
    ``create_new_filelist`` is false, that file is read and returned unchanged.

    STATE information is added to metadata for cases where the STATE column of
    the dataframe contains no values at all.
    For this purpose we use daily precipitation data. That has two reasons:
     - daily precipitation data has a STATE information combined with a city
     - daily precipitation data is the most common data served by the DWD

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where files should be stored
        write_file: writes the meta data file to the local file system
        create_new_filelist: if true: a new file_list for metadata will
         be created

    Returns:
        the metadata, either read from the existing csv file or freshly built

    Raises:
        TypeError: if one of the arguments does not have the expected type
    """

    if not isinstance(parameter, Parameter):
        raise TypeError("Error: 'parameter' is not of type Parameter(Enum).")
    if not isinstance(time_resolution, TimeResolution):
        raise TypeError(
            "Error: 'time_resolution' is not of type TimeResolution(Enum).")
    if not isinstance(period_type, PeriodType):
        raise TypeError(
            "Error: 'period_type' is not of type PeriodType(Enum).")
    if not isinstance(folder, str):
        raise TypeError("Error: 'folder' is not a string.")
    if not isinstance(write_file, bool):
        raise TypeError("Error: 'write_file' is not a bool.")
    if not isinstance(create_new_filelist, bool):
        raise TypeError("Error: 'create_new_filelist' is not a bool.")

    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    file_path = create_metainfo_fpath(folder, parameter, period_type,
                                      time_resolution)

    # Serve the stored metadata unless a rebuild was explicitly requested.
    if check_file_exist(file_path) and not create_new_filelist:
        return pd.read_csv(filepath_or_buffer=file_path)

    if time_resolution == TimeResolution.MINUTE_1:
        metainfo = metaindex_for_1minute_data(parameter=parameter,
                                              time_resolution=time_resolution)
    else:
        metainfo = create_metaindex(parameter=parameter,
                                    time_resolution=time_resolution,
                                    period_type=period_type)

    station_id_col = DWDColumns.STATION_ID.value
    state_col = DWDColumns.STATE.value

    if all(pd.isnull(metainfo[state_col])):
        # @todo avoid calling function in function -> we have to build a function around to manage missing data
        mdp = metadata_for_dwd_data(Parameter.PRECIPITATION_MORE,
                                    TimeResolution.DAILY,
                                    PeriodType.HISTORICAL,
                                    folder=folder,
                                    write_file=False,
                                    create_new_filelist=False)

        # Columns are addressed by the enum *value* everywhere else in this
        # function; indexing with the enum member itself is not the same key.
        stateinfo = pd.merge(
            metainfo[[station_id_col]],
            mdp.loc[:, [station_id_col, state_col]],
            on=station_id_col,
            how="left")

        # Assignment aligns on the index of ``stateinfo`` (a fresh range
        # index after the merge).
        metainfo[state_col] = stateinfo[state_col]

    metainfo = add_filepresence(metainfo=metainfo,
                                parameter=parameter,
                                time_resolution=time_resolution,
                                period_type=period_type,
                                folder=folder,
                                create_new_filelist=create_new_filelist)

    # NOTE(review): because of ``not create_new_filelist`` a rebuilt index
    # (create_new_filelist=True) is never written here - confirm this is
    # intended before changing it.
    if write_file and not check_file_exist(file_path) and not \
            create_new_filelist:
        remove_old_file(file_type=METADATA_NAME,
                        file_postfix=DATA_FORMAT,
                        parameter=parameter,
                        time_resolution=time_resolution,
                        period_type=period_type,
                        folder=folder,
                        subfolder=DWD_FOLDER_METADATA)

        metainfo.to_csv(path_or_buf=file_path, header=True, index=False)

    return metainfo
Beispiel #4
0
def metadata_for_dwd_data(parameter: Union[Parameter, str],
                          time_resolution: Union[TimeResolution, str],
                          period_type: Union[PeriodType, str],
                          folder: str = DWD_FOLDER_MAIN,
                          write_file: bool = True,
                          create_new_file_index: bool = False) -> pd.DataFrame:
    """
    A main function to retrieve metadata for a set of parameters that creates a
        corresponding csv.

    STATE information is added to metadata for cases where the STATE column of
    the pandas.DataFrame contains no values at all.
    For this purpose we use daily precipitation data. That has two reasons:
     - daily precipitation data has a STATE information combined with a city
     - daily precipitation data is the most common data served by the DWD

    Args:
        parameter: observation measure, given as enumeration member or as a
         value accepted by the Parameter enumeration
        time_resolution: frequency/granularity of measurement interval, given
         as enumeration member or as a value accepted by TimeResolution
        period_type: recent or historical files, given as enumeration member
         or as a value accepted by PeriodType
        folder: local file system folder where files should be stored
        write_file: writes the meta data file to the local file system
        create_new_file_index: if true: the file index cache is reset so a
         new file_list for metadata will be created

    Returns:
        pandas.DataFrame with metadata for selected parameters
    """
    if create_new_file_index:
        reset_file_index_cache()

    # Accept plain values as well as enumeration members.
    parameter = Parameter(parameter)
    time_resolution = TimeResolution(time_resolution)
    period_type = PeriodType(period_type)

    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    file_path = create_metainfo_fpath(folder,
                                      parameter,
                                      period_type,
                                      time_resolution)

    if time_resolution == TimeResolution.MINUTE_1:
        metainfo = metaindex_for_1minute_data(parameter=parameter,
                                              time_resolution=time_resolution)
    else:
        metainfo = create_metaindex(parameter=parameter,
                                    time_resolution=time_resolution,
                                    period_type=period_type)

    station_id_col = DWDMetaColumns.STATION_ID.value
    state_col = DWDMetaColumns.STATE.value

    if all(pd.isnull(metainfo[state_col])):
        # @todo avoid calling function in function -> we have to build a function around to manage missing data
        mdp = metadata_for_dwd_data(Parameter.PRECIPITATION_MORE,
                                    TimeResolution.DAILY,
                                    PeriodType.HISTORICAL,
                                    create_new_file_index=False)

        # Columns are addressed by the enum *value* everywhere else in this
        # function; indexing with the enum member itself is not the same key.
        stateinfo = pd.merge(metainfo[[station_id_col]],
                             mdp.loc[:, [station_id_col, state_col]],
                             on=station_id_col,
                             how="left")

        # Assignment aligns on the index of ``stateinfo`` (a fresh range
        # index after the merge).
        metainfo[state_col] = stateinfo[state_col]

    metainfo = add_filepresence(metainfo=metainfo,
                                parameter=parameter,
                                time_resolution=time_resolution,
                                period_type=period_type)

    # The csv is only written when a new index was requested and no metadata
    # file exists at the target path yet.
    if write_file and not file_path.is_file() and create_new_file_index:
        remove_old_file(file_type=METADATA_NAME,
                        file_postfix=DATA_FORMAT,
                        parameter=parameter,
                        time_resolution=time_resolution,
                        period_type=period_type,
                        folder=folder,
                        subfolder=DWD_FOLDER_METADATA)

        metainfo.to_csv(path_or_buf=file_path,
                        header=True,
                        index=False)

    return metainfo
Beispiel #5
0
def create_fileindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType,
                     folder: str = MAIN_FOLDER) -> None:
    """
    A function to receive current files on server as list excluding description
    files and only containing those files that have measuring data.

    The listing is filtered to entries containing ARCHIVE_FORMAT, extended by
    a file id and the station id taken from the underscore-separated file
    name, sorted by station id and written as csv into the metadata subfolder
    of ``folder``.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where files should be stored

    Raises:
        NameError: if the file listing cannot be retrieved from the server
    """
    # Check for folder and create if necessary
    create_folder(subfolder=SUB_FOLDER_METADATA, folder=folder)

    # Forward slashes on every platform.
    filelist_local_path = Path(
        folder, SUB_FOLDER_METADATA, f"{FILELIST_NAME}_{parameter.value}_"
        f"{time_resolution.value}_"
        f"{period_type.value}{DATA_FORMAT}").as_posix()

    # The server expects a forward-slash path with a trailing separator.
    server_path = Path(DWD_PATH, time_resolution.value, parameter.value,
                       period_type.value).as_posix() + '/'

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(path=server_path)

    except Exception as err:
        # Exception type kept for backward compatibility; the original error
        # is chained so the actual cause stays visible in the traceback.
        raise NameError(
            "Download of fileslist file currently not possible. Try again!"
        ) from err

    # Passing the column name to the constructor also works for an empty
    # listing, where assigning ``columns`` afterwards would fail.
    files_server = pd.DataFrame(files_server, columns=[FILENAME_NAME])

    server_prefix = DWD_PATH.strip('/') + '/'

    def relative_to_server_root(filename) -> str:
        # str.lstrip(prefix) strips a *set of characters*, not a prefix, and
        # would also eat leading characters of the remaining path. Only the
        # exact server root is removed here.
        filename = str(filename).lstrip('/')
        if filename.startswith(server_prefix):
            return filename[len(server_prefix):]
        return filename

    files_server[FILENAME_NAME] = files_server[FILENAME_NAME].apply(
        relative_to_server_root)

    # Keep archives only; copy so the following column assignments work on an
    # independent frame instead of a view of the unfiltered one.
    files_server = files_server[files_server[FILENAME_NAME].str.contains(
        ARCHIVE_FORMAT)].copy()

    # The file id is the position of the file in the unfiltered listing.
    files_server[FILEID_NAME] = files_server.index

    # Position of the station id within the underscore-separated name; it
    # depends on the period type.
    statid_position = STRING_STATID_COL.get(period_type, None)

    files_server[STATION_ID_NAME] = files_server[FILENAME_NAME] \
        .str.split('_') \
        .apply(lambda parts: parts[statid_position]) \
        .astype(int)

    files_server = files_server[[FILEID_NAME, STATION_ID_NAME, FILENAME_NAME]]

    files_server = files_server.sort_values(by=[STATION_ID_NAME])

    remove_old_file(file_type=FILELIST_NAME,
                    parameter=parameter,
                    time_resolution=time_resolution,
                    period_type=period_type,
                    file_postfix=DATA_FORMAT,
                    folder=folder,
                    subfolder=SUB_FOLDER_METADATA)

    files_server.to_csv(path_or_buf=filelist_local_path,
                        header=True,
                        index=False)