Example #1
def dwd_stations(
    parameter: str = Query(default=None),
    resolution: str = Query(default=None),
    period: str = Query(default=None),
    sql: str = Query(default=None),
):

    parameter = parse_enumeration_from_template(parameter, Parameter)
    resolution = parse_enumeration_from_template(resolution, TimeResolution)
    period = parse_enumeration_from_template(period, PeriodType)

    # Data acquisition.
    df = metadata_for_climate_observations(
        parameter=parameter,
        time_resolution=resolution,
        period_type=period,
    )

    # Postprocessing.
    df = df.dwd.lower()

    if sql is not None:
        df = df.io.sql(sql)

    return make_json_response(df.io.to_dict())
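
A minimal way to exercise this endpoint is FastAPI's TestClient. In the sketch
below, the module exposing the app and the route path are assumptions, since
the snippet does not show the app object or its route decorator:

from fastapi.testclient import TestClient

from myapp import app  # hypothetical module exposing the FastAPI app

client = TestClient(app)
response = client.get(
    "/api/dwd/stations",  # assumed route for dwd_stations
    params={"parameter": "kl", "resolution": "daily", "period": "recent"},
)
print(response.json())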

Example #2
def test_parse_enumeration_from_template():
    assert (
        parse_enumeration_from_template("climate_summary", Parameter)
        == Parameter.CLIMATE_SUMMARY
    )
    assert (
        parse_enumeration_from_template("kl", Parameter)
        == Parameter.CLIMATE_SUMMARY
    )

    with pytest.raises(InvalidParameter):
        parse_enumeration_from_template("climate", Parameter)

Example #3
    def __init__(
        self,
        time_resolution: Union[str, TimeResolution],
        date_times: Optional[Union[str, List[Union[str, datetime]]]] = None,
        start_date: Optional[Union[str, datetime]] = None,
        end_date: Optional[Union[str, datetime]] = None,
        prefer_local: bool = False,
        write_file: bool = False,
        folder: Union[str, Path] = DWD_FOLDER_MAIN,
    ) -> None:
        """

        :param time_resolution: Time resolution enumeration, either hourly or daily
        :param date_times:      List of datetimes for which RADOLAN is requested.
                                Minutes have to be defined (HOUR:50); otherwise the
                                datetimes are rounded to minute 50, as that is when
                                the data is provided.
        :param start_date:      Alternative to date_times, giving the start of a
                                date range
        :param end_date:        Alternative to date_times, giving the end of a
                                date range
        :param prefer_local:    Whether RADOLAN data should preferably be loaded
                                from disk, for processing purposes
        :param write_file:      Whether the file should be stored on drive
        :param folder:          Folder where to store RADOLAN data

        :return:                Nothing for now.
        """
        time_resolution = parse_enumeration_from_template(
            time_resolution, TimeResolution
        )

        if time_resolution not in (TimeResolution.HOURLY, TimeResolution.DAILY):
            raise ValueError("RADOLAN only supports hourly and daily resolution.")

        self.time_resolution = time_resolution

        if date_times == "latest":
            file_index_radolan = create_file_index_for_radolan(time_resolution)

            self.date_times = pd.Series(
                file_index_radolan[DWDMetaColumns.DATETIME.value][-1:]
            )
        elif date_times:
            self.date_times = pd.Series(
                pd.to_datetime(date_times, infer_datetime_format=True)
            )
        else:
            self.date_times = pd.Series(
                pd.date_range(
                    pd.to_datetime(start_date, infer_datetime_format=True),
                    pd.to_datetime(end_date, infer_datetime_format=True),
                )
            )

        self.date_times = self.date_times.dt.floor(freq="H") + pd.Timedelta(minutes=50)

        self.date_times = self.date_times.drop_duplicates().sort_values()

        self.prefer_local = prefer_local
        self.write_file = write_file
        self.folder = folder
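
The enclosing class name is not part of this snippet; assuming it is called
something like DWDRadolanRequest, a usage sketch demonstrating the rounding to
minute 50 might look like this:

# Hypothetical usage; the class name DWDRadolanRequest is an assumption.
request = DWDRadolanRequest(
    time_resolution="hourly",
    date_times=["2020-09-01 12:00", "2020-09-01 13:00"],
)
print(request.date_times)  # floored to the hour plus 50 minutes: 12:50, 13:50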

Example #4
def metadata_for_climate_observations(
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
) -> pd.DataFrame:
    """
    A main function to retrieve metadata for a set of parameters that creates a
    corresponding csv.
    STATE information is added to metadata for cases where there's no such named
    column (e.g. STATE) in the pandas.DataFrame.
    For this purpose we use daily precipitation data. That has two reasons:

    - daily precipitation data has a STATE information combined with a city
    - daily precipitation data is the most common data served by the DWD

    :param parameter:               Observation measure
    :param time_resolution:         Frequency/granularity of measurement interval
    :param period_type:             Recent or historical files

    :return: List of stations for selected parameters
    """

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution,
                                                      TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    meta_index = create_meta_index_for_climate_observations(
        parameter, time_resolution, period_type)

    meta_index[DWDMetaColumns.HAS_FILE.value] = False

    file_index = create_file_index_for_climate_observations(
        parameter, time_resolution, period_type)

    meta_index.loc[
        meta_index[DWDMetaColumns.STATION_ID.value].isin(
            file_index[DWDMetaColumns.STATION_ID.value]
        ),
        DWDMetaColumns.HAS_FILE.value,
    ] = True

    return meta_index
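
A hedged usage sketch; the string spellings follow the other examples on this
page and are resolved by parse_enumeration_from_template:

metadata = metadata_for_climate_observations(
    parameter="kl",
    time_resolution="daily",
    period_type="recent",
)
# Keep only stations for which a data file actually exists on the server.
metadata = metadata[metadata[DWDMetaColumns.HAS_FILE.value]]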

Example #5
def get_nearby_stations_by_number(
    latitude: float,
    longitude: float,
    num_stations_nearby: int,
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    minimal_available_date: Optional[Union[datetime, str]] = None,
    maximal_available_date: Optional[Union[datetime, str]] = None,
) -> pd.DataFrame:
    """
    Provides a list of weather station ids for the requested data

    :param latitude:                Latitude of location to search for nearest
                                    weather station
    :param longitude:               Longitude of location to search for nearest
                                    weather station
    :param minimal_available_date:  Start date of timespan where measurements
                                    should be available
    :param maximal_available_date:  End date of timespan where measurements
                                    should be available
    :param parameter:               Observation measure
    :param time_resolution:         Frequency/granularity of measurement interval
    :param period_type:             Recent or historical files
    :param num_stations_nearby:     Number of stations that should be nearby

    :return:                        DataFrames with valid stations in radius per
                                    requested location

    """
    if num_stations_nearby <= 0:
        raise ValueError("'num_stations_nearby' has to be at least 1.")

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution,
                                                      TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid.")

    minimal_available_date = (minimal_available_date
                              if not minimal_available_date
                              or isinstance(minimal_available_date, datetime)
                              else parse_datetime(minimal_available_date))
    maximal_available_date = (maximal_available_date
                              if not maximal_available_date
                              or isinstance(maximal_available_date, datetime)
                              else parse_datetime(maximal_available_date))

    if minimal_available_date and maximal_available_date:
        if minimal_available_date > maximal_available_date:
            raise ValueError("'minimal_available_date' has to be before "
                             "'maximal_available_date'")

    coords = Coordinates(np.array(latitude), np.array(longitude))

    metadata = metadata_for_climate_observations(parameter, time_resolution,
                                                 period_type)

    # Filter only for stations that have a file
    metadata = metadata[metadata[DWDMetaColumns.HAS_FILE.value].values]

    if minimal_available_date:
        metadata = metadata[
            metadata[DWDMetaColumns.FROM_DATE.value] <= minimal_available_date]

    if maximal_available_date:
        metadata = metadata[
            metadata[DWDMetaColumns.TO_DATE.value] >= maximal_available_date]

    metadata = metadata.reset_index(drop=True)

    distances, indices_nearest_neighbours = _derive_nearest_neighbours(
        metadata.LAT.values, metadata.LON.values, coords, num_stations_nearby)

    distances = pd.Series(distances)
    indices_nearest_neighbours = pd.Series(indices_nearest_neighbours)

    # If num_stations_nearby is higher than the actual number of stations,
    # additional indices and distances are returned and have to be filtered out
    distances = distances[:min(metadata.shape[0], num_stations_nearby)]
    indices_nearest_neighbours = indices_nearest_neighbours[:min(
        metadata.shape[0], num_stations_nearby)]

    distances_km = np.array(distances * KM_EARTH_RADIUS)

    metadata_location = metadata.iloc[
        indices_nearest_neighbours, :].reset_index(drop=True)

    metadata_location[DWDMetaColumns.DISTANCE_TO_LOCATION.value] = distances_km

    if metadata_location.empty:
        logger.warning(f"No weather stations were found for coordinate "
                       f"{latitude}°N and {longitude}°E ")

    return metadata_location
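
A usage sketch with illustrative coordinates near Frankfurt am Main; the
argument spellings follow the other examples on this page:

stations = get_nearby_stations_by_number(
    latitude=50.11,
    longitude=8.68,
    num_stations_nearby=5,
    parameter="kl",
    time_resolution="daily",
    period_type="recent",
    minimal_available_date="2019-01-01",
)
print(stations[DWDMetaColumns.DISTANCE_TO_LOCATION.value])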

Example #6
    def __init__(
        self,
        station_ids: Union[str, int, List[Union[int, str]]],
        parameter: Union[str, Parameter, List[Union[str, Parameter]]],
        time_resolution: Union[str, TimeResolution],
        period_type: Union[Union[None, str, PeriodType],
                           List[Union[str, PeriodType]]] = None,
        start_date: Union[None, str, Timestamp] = None,
        end_date: Union[None, str, Timestamp] = None,
        prefer_local: bool = False,
        write_file: bool = False,
        folder: Union[str, Path] = DWD_FOLDER_MAIN,
        tidy_data: bool = True,
        humanize_column_names: bool = False,
    ) -> None:
        """
        Class with mostly flexible arguments to define a request regarding DWD data.
        Special handling for period type: if start_date/end_date are given, all
        period types are considered and merged together, and the data is filtered
        for the given dates afterwards.

        :param station_ids: definition of stations by str, int or list of str/int,
                            will be parsed to list of int
        :param parameter:           Observation measure
        :param time_resolution:     Frequency/granularity of measurement interval
        :param period_type:         Recent or historical files (optional), if None
                                    and start_date and end_date None, all period
                                    types are used
        :param start_date:          Replacement for period type to define exact time
                                    of requested data, if used, period type will be set
                                    to all period types (hist, recent, now)
        :param end_date:            Replacement for period type to define exact time
                                    of requested data, if used, period type will be set
                                    to all period types (hist, recent, now)
        :param prefer_local:        Definition if data should rather be taken from a
                                    local source
        :param write_file:          Should data be written to a local file
        :param folder:              Place where file lists (and station data) are stored
        :param tidy_data:           Reshape DataFrame to a more tidy
                                    and row-based version of data
        :param humanize_column_names: Replace column names by more meaningful ones
        """

        try:
            self.station_ids = pd.Series(station_ids).astype(int).tolist()
        except ValueError:
            raise ValueError(
                "List of station ids cannot be parsed to integers.")

        self.parameter = (pd.Series(parameter).apply(
            parse_enumeration_from_template, args=(Parameter, )).tolist())

        self.time_resolution = parse_enumeration_from_template(
            time_resolution, TimeResolution)

        # If any date is given, or if no period type is given, use all period
        # types and filter for the dates afterwards
        if start_date or end_date or not period_type:
            self.period_type = [*PeriodType]
        # Otherwise parse the given period type(s)
        else:
            self.period_type = (pd.Series(period_type).apply(
                parse_enumeration_from_template,
                args=(PeriodType, )).sort_values().tolist())

        if start_date or end_date:
            # If only one date given, make the other one equal
            if not start_date:
                start_date = end_date

            if not end_date:
                end_date = start_date

            self.start_date = Timestamp(dateparser.parse(start_date))
            self.end_date = Timestamp(dateparser.parse(end_date))

            if not self.start_date <= self.end_date:
                raise StartDateEndDateError(
                    "Error: 'start_date' must be smaller or equal to 'end_date'."
                )
        else:
            self.start_date = start_date
            self.end_date = end_date

        self.prefer_local = prefer_local
        self.write_file = write_file
        self.folder = folder
        # If more than one parameter is requested, automatically tidy data
        self.tidy_data = len(self.parameter) > 1 or tidy_data
        self.humanize_column_names = humanize_column_names
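
Example #7 below passes these constructor arguments to a class named
DWDStationRequest; under that assumption, instantiation might look like this:

# Usage sketch; station ids and parameter spellings are illustrative.
request = DWDStationRequest(
    station_ids=[1048, "4411"],       # mixed str/int, coerced to int
    parameter=["kl", "more_precip"],  # more than one parameter forces tidy_data
    time_resolution="daily",
    start_date="2020-01-01",          # giving dates selects all period types
    end_date="2020-06-30",
)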

Example #7
def dwd_readings(
    station: str = Query(default=None),
    parameter: str = Query(default=None),
    resolution: str = Query(default=None),
    period: str = Query(default=None),
    date: str = Query(default=None),
    sql: str = Query(default=None),
):
    """
    Acquire data from DWD.

    # TODO: Obtain lat/lon distance/number information.

    :param station:     Comma-separated list of station identifiers.
    :param parameter:   Observation measure
    :param resolution:  Frequency/granularity of measurement interval
    :param period:      Recent or historical files
    :param date:        Date or date range
    :param sql:         SQL expression
    :return:        JSON response with the requested readings
    """

    if station is None:
        raise HTTPException(
            status_code=400, detail="Query argument 'station' is required"
        )

    if parameter is None or resolution is None or period is None:
        raise HTTPException(
            status_code=400,
            detail="Query arguments 'parameter', 'resolution' "
            "and 'period' are required",
        )

    station_ids = map(int, read_list(station))
    parameter = parse_enumeration_from_template(parameter, Parameter)
    resolution = parse_enumeration_from_template(resolution, TimeResolution)
    period = parse_enumeration_from_template(period, PeriodType)

    # Data acquisition.
    request = DWDStationRequest(
        station_ids=station_ids,
        parameter=parameter,
        time_resolution=resolution,
        period_type=period,
        tidy_data=True,
        humanize_column_names=True,
    )

    # Postprocessing.
    df = request.collect_safe()

    if date is not None:
        df = df.dwd.filter_by_date(date, resolution)

    df = df.dwd.lower()

    if sql is not None:
        df = df.io.sql(sql)

    data = json.loads(df.to_json(orient="records", date_format="iso"))
    return make_json_response(data)
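
As with Example #1, a request sketch; the host and mount path are assumptions,
since the snippet does not include the route decorator:

import requests

response = requests.get(
    "http://localhost:8000/api/dwd/readings",  # assumed host and path
    params={
        "station": "1048,4411",
        "parameter": "kl",
        "resolution": "daily",
        "period": "recent",
        "date": "2020-01-01",
    },
)
print(response.json())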

Example #8
def collect_climate_observations_data(
    station_ids: List[int],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    folder: Union[str, Path] = DWD_FOLDER_MAIN,
    prefer_local: bool = False,
    write_file: bool = False,
    tidy_data: bool = True,
    humanize_column_names: bool = False,
    run_download_only: bool = False,
) -> Optional[pd.DataFrame]:
    """
    Function that organizes the complete pipeline of data collection, either
    from the internet or from a local file. It therefore goes through every given
    station id and, depending on the parameters, either tries to get the data
    from the local store and/or, if that fails, from the internet. Finally, if
    wanted, it will store the data in an HDF file.

    :param station_ids:             Station ids for which data is requested
    :param parameter:               Parameter as enumeration
    :param time_resolution:         Time resolution as enumeration
    :param period_type:             Period type as enumeration
    :param folder:                  Folder for local file interaction
    :param prefer_local:            Local data should be preferred
    :param write_file:              Write data to local storage
    :param tidy_data:               Tidy up data so that there's only one set of values
                                    for a datetime in a row, e.g. station_id, parameter,
                                    element, datetime, value, quality.
    :param humanize_column_names:   Yield column names for human consumption
    :param run_download_only:       Run only the download and storing process

    :return:                        All the data for the given station ids.
    """
    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    # List for collected pandas DataFrames per each station id
    data = []
    for station_id in set(station_ids):

        # Just for logging.
        request_string = _build_local_store_key(
            station_id, parameter, time_resolution, period_type
        )

        if prefer_local:
            # Try restoring data
            station_data = restore_climate_observations(
                station_id, parameter, time_resolution, period_type, folder
            )

            # When successful append data and continue with next iteration
            if not station_data.empty:
                log.info(f"Data for {request_string} restored from local.")

                data.append(station_data)

                continue

        log.info(f"Acquiring observations data for {request_string}")

        remote_files = create_file_list_for_climate_observations(
            [station_id], parameter, time_resolution, period_type
        )

        if len(remote_files) == 0:
            log.info(f"No files found for {request_string}. Station will be skipped.")
            continue

        filenames_and_files = download_climate_observations_data_parallel(remote_files)

        station_data = parse_climate_observations_data(
            filenames_and_files, parameter, time_resolution
        )

        if write_file:
            store_climate_observations(
                station_data,
                station_id,
                parameter,
                time_resolution,
                period_type,
                folder,
            )

        data.append(station_data)

    if run_download_only:
        return None

    try:
        data = pd.concat(data)
    except ValueError:
        return pd.DataFrame()

    if tidy_data:
        data = _tidy_up_data(data, parameter)

    # Assign meaningful column names (humanized).
    if humanize_column_names:
        hcnm = create_humanized_column_names_mapping(time_resolution, parameter)
        if tidy_data:
            data[DWDMetaColumns.ELEMENT.value] = data[
                DWDMetaColumns.ELEMENT.value
            ].apply(lambda x: hcnm[x])
        else:
            data = data.rename(columns=hcnm)

    return data
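
A usage sketch for the full pipeline; the enum string spellings follow the
other examples on this page:

df = collect_climate_observations_data(
    station_ids=[1048],
    parameter="kl",
    time_resolution="daily",
    period_type="historical",
    prefer_local=True,   # try the local store before downloading
    write_file=True,     # cache downloads for subsequent runs
)
print(df.head())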