Ejemplo n.º 1
0
def test_parse_enumeration_from_template():
    """Check that known templates resolve and unknown ones are rejected."""
    # Both spellings have to resolve to the very same enumeration member
    for template in ("climate_summary", "kl"):
        parsed = parse_enumeration_from_template(template, Parameter)
        assert parsed == Parameter.CLIMATE_SUMMARY

    # A string that matches no member must raise instead of guessing
    with pytest.raises(InvalidParameter):
        parse_enumeration_from_template("climate", Parameter)
Ejemplo n.º 2
0
def metadata_for_climate_observations(
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    create_new_meta_index: bool = False,
    create_new_file_index: bool = False,
) -> pd.DataFrame:
    """
    Retrieve the station metadata for a parameter/resolution/period request
    and flag every station for which an entry exists in the file index.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        create_new_meta_index: if true, the meta index cache is reset before
            the meta index is created
        create_new_file_index: if true, the file index cache is reset before
            the file index is created
    Returns:
        pandas.DataFrame with the meta index of the request, extended by a
        boolean HAS_FILE column
    """
    if create_new_meta_index:
        reset_meta_index_cache()
    if create_new_file_index:
        reset_file_index_cache()

    # Accept both enumeration members and their string templates
    request = (
        parse_enumeration_from_template(parameter, Parameter),
        parse_enumeration_from_template(time_resolution, TimeResolution),
        parse_enumeration_from_template(period_type, PeriodType),
    )

    station_id_column = DWDMetaColumns.STATION_ID.value
    has_file_column = DWDMetaColumns.HAS_FILE.value

    meta_index = create_meta_index_for_climate_observations(*request)

    # Start pessimistic: no station has a file until the file index says so
    meta_index[has_file_column] = False

    file_index = create_file_index_for_climate_observations(*request)

    listed_in_file_index = meta_index.loc[:, station_id_column].isin(
        file_index[station_id_column])
    meta_index.loc[listed_in_file_index, has_file_column] = True

    return meta_index
Ejemplo n.º 3
0
    def __init__(
        self,
        time_resolution: Union[str, TimeResolution],
        date_times: Optional[Union[str, List[Union[str, datetime]]]] = None,
        start_date: Optional[Union[str, datetime]] = None,
        end_date: Optional[Union[str, datetime]] = None,
        prefer_local: bool = False,
        write_file: bool = False,
        folder: Union[str, Path] = DWD_FOLDER_MAIN,
    ):
        """
        Set up a RADOLAN request.

        The requested timestamps are taken from the first matching source:
        the literal string "latest" (last entry of the RADOLAN file index),
        an explicit ``date_times`` value, or otherwise the range spanned by
        ``start_date`` and ``end_date``.

        Args:
            time_resolution: str or time resolution enumeration, only hourly
                and daily are accepted
            date_times: "latest" or one/several datetimes (or datetime strings)
            start_date: start of the requested range, used if no date_times
            end_date: end of the requested range, used if no date_times
            prefer_local: stored on the instance as given
            write_file: stored on the instance as given
            folder: stored on the instance as given

        Raises:
            ValueError: if the time resolution is neither hourly nor daily
        """
        resolution = parse_enumeration_from_template(
            time_resolution, TimeResolution
        )

        supported_resolutions = (TimeResolution.HOURLY, TimeResolution.DAILY)
        if resolution not in supported_resolutions:
            raise ValueError("RADOLAN only supports hourly and daily resolution.")

        self.time_resolution = resolution

        # Determine the raw timestamps first, wrap them in a Series afterwards
        if date_times == "latest":
            radolan_file_index = create_file_index_for_radolan(resolution)
            requested = radolan_file_index[DWDMetaColumns.DATETIME.value][-1:]
        elif date_times:
            requested = pd.to_datetime(date_times, infer_datetime_format=True)
        else:
            range_start = pd.to_datetime(start_date, infer_datetime_format=True)
            range_end = pd.to_datetime(end_date, infer_datetime_format=True)
            requested = pd.date_range(range_start, range_end)

        self.date_times = pd.Series(requested)

        # Snap every timestamp to minute 50 of its hour
        # NOTE(review): presumably matches the RADOLAN file timestamps - confirm
        minute_offset = pd.Timedelta(minutes=50)
        self.date_times = self.date_times.dt.floor(freq="H") + minute_offset

        # Snapping can create duplicates, keep each timestamp once and ordered
        self.date_times = self.date_times.drop_duplicates().sort_values()

        self.prefer_local = prefer_local
        self.write_file = write_file
        self.folder = folder
Ejemplo n.º 4
0
def get_nearby_stations(
    latitude: float,
    longitude: float,
    minimal_available_date: Union[datetime, str],
    maximal_available_date: Union[datetime, str],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    num_stations_nearby: Optional[int] = None,
    max_distance_in_km: Optional[float] = None,
) -> pd.DataFrame:
    """
    Provides the metadata of weather stations near a location for the
    requested data, restricted to stations whose availability covers the
    requested timespan.

    Args:
        latitude: latitude of location to search for nearest
            weather station
        longitude: longitude of location to search for nearest
            weather station
        minimal_available_date: Start date of timespan where measurements
            should be available
        maximal_available_date: End date of timespan where measurements
            should be available
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        num_stations_nearby: Number of stations that should be nearby
        max_distance_in_km: alternative filtering criteria, maximum
            distance to location in km

    Returns:
        DataFrame with the metadata rows of the selected stations and an
        additional "DISTANCE_TO_LOCATION" column

    Raises:
        ValueError: if both 'num_stations_nearby' and 'max_distance_in_km'
            are set, or if 'num_stations_nearby' is 0
        InvalidParameterCombination: if parameter, time resolution and
            period type do not form a valid combination
    """
    # The two filter criteria are mutually exclusive
    if num_stations_nearby and max_distance_in_km:
        raise ValueError(
            "Either set 'num_stations_nearby' or 'max_distance_in_km'.")

    if num_stations_nearby == 0:
        raise ValueError("'num_stations_nearby' has to be at least 1.")

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution,
                                                      TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    # Dates may be handed over as strings, datetime objects are used as given
    if not isinstance(minimal_available_date, datetime):
        minimal_available_date = parse_datetime(minimal_available_date)
    if not isinstance(maximal_available_date, datetime):
        maximal_available_date = parse_datetime(maximal_available_date)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid.")

    coords = Coordinates(np.array(latitude), np.array(longitude))

    metadata = metadata_for_climate_observations(parameter, time_resolution,
                                                 period_type)

    # Keep only stations that cover the whole requested timespan; the index is
    # reset so that positional neighbour indices can be used as labels below
    metadata = metadata[
        (metadata[DWDMetaColumns.FROM_DATE.value] <= minimal_available_date)
        & (metadata[DWDMetaColumns.TO_DATE.value] >= maximal_available_date
           )].reset_index(drop=True)

    # For distance filtering make normal query including all stations
    if max_distance_in_km:
        num_stations_nearby = metadata.shape[0]

    distances, indices_nearest_neighbours = _derive_nearest_neighbours(
        metadata.LAT.values, metadata.LON.values, coords, num_stations_nearby)

    # Require list of indices for consistency
    # Cast to np.array required for subset
    indices_nearest_neighbours = np.array(
        cast_to_list(indices_nearest_neighbours))
    distances_km = np.array(distances * KM_EARTH_RADIUS)

    # Filter for distance based on calculated distances
    if max_distance_in_km:
        _in_max_distance_indices = np.where(
            distances_km <= max_distance_in_km)[0]
        indices_nearest_neighbours = indices_nearest_neighbours[
            _in_max_distance_indices]
        distances_km = distances_km[_in_max_distance_indices]

    # The indices are always a numpy array at this point, so they can be used
    # for selection directly. Copy the selection so that adding the distance
    # column does not write into a view of `metadata`.
    metadata_location = metadata.loc[indices_nearest_neighbours, :].copy()
    metadata_location["DISTANCE_TO_LOCATION"] = distances_km

    if metadata_location.empty:
        logger.warning(f"No weather station was found for coordinate "
                       f"{latitude}°N and {longitude}°E ")

    return metadata_location
Ejemplo n.º 5
0
    def __init__(
        self,
        station_ids: Union[str, int, List[Union[int, str]]],
        parameter: Union[str, Parameter, List[Union[str, Parameter]]],
        time_resolution: Union[str, TimeResolution],
        period_type: Union[
            Union[None, str, PeriodType], List[Union[None, str, PeriodType]]
        ] = None,
        start_date: Union[None, str, Timestamp] = None,
        end_date: Union[None, str, Timestamp] = None,
        prefer_local: bool = False,
        write_file: bool = False,
        folder: Union[str, Path] = DWD_FOLDER_MAIN,
        tidy_data: bool = True,
        humanize_column_names: bool = False,
        create_new_file_index: bool = False,
    ) -> None:
        """
        Class with mostly flexible arguments to define a request regarding DWD data.
        Special handling for period type: if a period type is given it is used and
        start_date/end_date are left unset. Only if no period type is given, all
        period types are considered and start_date/end_date are parsed and stored.

        Args:
            station_ids: definition of stations by str, int or list of str/int,
            will be parsed to list of int
            parameter: str or parameter enumeration (or a list of those) defining
            the requested parameter(s)
            time_resolution: str or time resolution enumeration defining the requested
            time resolution
            period_type: str or period type enumeration (or a list of those) defining
            the requested period type(s)
            start_date: replacement for period type to define exact time of
            requested data
            end_date: replacement for period type to define exact time of requested data
            prefer_local: definition if data should rather be taken from a local source
            write_file: should data be written to a local file
            folder: place where file lists (and station data) are stored
            tidy_data: reshape DataFrame to a more tidy, row based version of data,
            always enabled if more than one parameter is requested
            humanize_column_names: replace column names by more meaningful ones
            create_new_file_index: definition if the file index should be recreated

        Raises:
            ValueError: if neither a period type nor a start/end date is given, or
            if the station ids can not be parsed to integers
            StartDateEndDateError: if 'start_date' is later than 'end_date'
        """

        if not (period_type or start_date or end_date):
            raise ValueError(
                "Define either a 'period_type' or one of or both 'start_date' and "
                "'end_date' and leave 'period_type' empty!"
            )

        try:
            self.station_ids = [
                int(station_id) for station_id in cast_to_list(station_ids)
            ]
        except ValueError as err:
            raise ValueError(
                "List of station id's can not be parsed to integers."
            ) from err

        self.parameter = [
            parse_enumeration_from_template(p, Parameter)
            for p in cast_to_list(parameter)
        ]

        self.time_resolution = parse_enumeration_from_template(
            time_resolution, TimeResolution
        )

        # start date and end date required for collect_data in any case
        self.start_date = None
        self.end_date = None

        if period_type:
            # For the case that a period_type is given, parse the period type(s),
            # None entries are kept as they are
            self.period_type = [
                None if pt is None else parse_enumeration_from_template(pt, PeriodType)
                for pt in cast_to_list(period_type)
            ]

            # Additional sorting required for self.period_type to ensure that for
            # multiple periods the data is first sourced from historical
            # NOTE(review): relies on the entries being mutually orderable - confirm
            self.period_type = sorted(self.period_type)

        else:
            # working with ranges of data means expecting data to be laying between
            # periods, thus including all periods
            self.period_type = [
                PeriodType.HISTORICAL,
                PeriodType.RECENT,
                PeriodType.NOW,
            ]

            # If only one date given, make the other one equal
            if not start_date:
                start_date = end_date

            if not end_date:
                end_date = start_date

            self.start_date = Timestamp(dateparser.parse(start_date))
            self.end_date = Timestamp(dateparser.parse(end_date))

            if not self.start_date <= self.end_date:
                raise StartDateEndDateError(
                    "Error: 'start_date' must be smaller or equal to 'end_date'."
                )

        self.prefer_local = prefer_local
        self.write_file = write_file
        self.folder = folder
        # If more than one parameter requested, automatically tidy data
        self.tidy_data = len(self.parameter) > 1 or tidy_data
        self.humanize_column_names = humanize_column_names
        self.create_new_file_index = create_new_file_index
Ejemplo n.º 6
0
def get_nearby_stations(
    latitudes: Union[List[float], np.ndarray],
    longitudes: Union[List[float], np.ndarray],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    num_stations_nearby: Optional[int] = None,
    max_distance_in_km: Optional[float] = None,
) -> Tuple[List[int], List[List[float]]]:
    """
    Provides a list of weather station ids for the requested data
    Args:
        latitudes: latitudes of locations to search for nearest
            weather station
        longitudes: longitudes of locations to search for nearest
            weather station
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        num_stations_nearby: Number of stations that should be nearby
        max_distance_in_km: alternative filtering criteria, maximum
            distance to location in km

    Returns:
        list of stations ids for the given locations/coordinate pairs and
        a list of distances in kilometer to the weather station

    Raises:
        ValueError: if both 'num_stations_nearby' and 'max_distance_in_km'
            are set, or if 'num_stations_nearby' is 0
        InvalidParameterCombination: if parameter, time resolution and
            period type do not form a valid combination
    """
    # The two filter criteria are mutually exclusive
    if num_stations_nearby and max_distance_in_km:
        raise ValueError(
            "Either set 'num_stations_nearby' or 'max_distance_in_km'.")

    if num_stations_nearby == 0:
        raise ValueError("'num_stations_nearby' has to be at least 1.")

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution,
                                                      TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid.")

    if not isinstance(latitudes, list):
        latitudes = np.array(latitudes)

    # Convert the longitudes themselves; assigning them to `latitudes` would
    # silently replace the latitudes of the request
    if not isinstance(longitudes, list):
        longitudes = np.array(longitudes)

    coords = Coordinates(latitudes, longitudes)

    metadata = metadata_for_dwd_data(parameter, time_resolution, period_type)

    # For distance filtering make normal query including all stations
    if max_distance_in_km:
        num_stations_nearby = metadata.shape[0]

    distances, indices_nearest_neighbours = _derive_nearest_neighbours(
        metadata.LAT.values, metadata.LON.values, coords, num_stations_nearby)

    # Make sure to get list of lists [[dist1_1, dist1_2], [dist2_1, dist2_2]]
    if num_stations_nearby == 1:
        distances = np.array([distances])
    else:
        distances = distances.T

    if np.max(indices_nearest_neighbours.shape) > 1:
        indices_nearest_neighbours = indices_nearest_neighbours[0]

    # Require list of indices for consistency
    # Cast to np.array required for subset
    indices_nearest_neighbours = np.array(
        cast_to_list(indices_nearest_neighbours))

    distances_km = np.array(distances * KM_EARTH_RADIUS)

    # Filter for distance based on calculated distances
    if max_distance_in_km:
        # A station is kept only if its largest distance over all requested
        # locations is still within the limit
        indices_stations_in_distance = (np.max(distances_km, axis=1) <=
                                        max_distance_in_km)

        # Reduce stations to those in distance
        distances_km = distances_km[indices_stations_in_distance]
        indices_nearest_neighbours = indices_nearest_neighbours[
            indices_stations_in_distance]

    return (
        metadata.loc[indices_nearest_neighbours,
                     DWDMetaColumns.STATION_ID.value].values.tolist(),
        distances_km.tolist(),
    )
Ejemplo n.º 7
0
def collect_climate_observations_data(
    station_ids: List[int],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    folder: Union[str, Path] = DWD_FOLDER_MAIN,
    prefer_local: bool = False,
    write_file: bool = False,
    tidy_data: bool = True,
    humanize_column_names: bool = False,
    run_download_only: bool = False,
    create_new_file_index: bool = False,
) -> Optional[pd.DataFrame]:
    """
    Run the complete data collection pipeline for a set of stations.

    Every distinct station id is handled on its own: if local data is
    preferred a restore from the local store is attempted first, otherwise
    (or if nothing could be restored) the remote files are listed, downloaded
    and parsed. On request the parsed data is written to the local store.

    Args:
        station_ids: station ids that are trying to be loaded
        parameter: parameter as enumeration or its string template
        time_resolution: time resolution as enumeration or its string template
        period_type: period type as enumeration or its string template
        folder: folder for local file interaction
        prefer_local: boolean for if local data should be preferred
        write_file: boolean to write data to local storage
        tidy_data: boolean to tidy up data so that there's only one set of
            values for a datetime in a row
            e.g. station_id, parameter, element, datetime, value, quality
        humanize_column_names: boolean to yield column names better for
            human consumption
        run_download_only: boolean to run only the download and storing
            process, nothing is returned then
        create_new_file_index: boolean if the file index cache should be
            reset before the data selection

    Returns:
        a pandas DataFrame with all the data given by the station ids, an
        empty DataFrame if nothing could be concatenated, or None if
        run_download_only is set

    Raises:
        InvalidParameterCombination: if parameter, time resolution and
            period type do not form a valid combination
    """
    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    combination_is_valid = check_parameters(parameter, time_resolution, period_type)
    if not combination_is_valid:
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    if create_new_file_index:
        reset_file_index_cache()

    # One DataFrame per station id, duplicates in station_ids are handled once
    frames = []
    for station_id in set(station_ids):
        request_string = _build_local_store_key(
            station_id, parameter, time_resolution, period_type
        )

        if prefer_local:
            restored = restore_climate_observations(
                station_id, parameter, time_resolution, period_type, folder
            )

            # A non-empty restore makes the remote lookup unnecessary
            if not restored.empty:
                log.info(f"Data for {request_string} restored from local.")
                frames.append(restored)
                continue

        log.info(f"Data for {request_string} will be collected from internet.")

        remote_files = create_file_list_for_climate_observations(
            [station_id], parameter, time_resolution, period_type
        )

        if len(remote_files) == 0:
            log.info(f"No files found for {request_string}. Station will be skipped.")
            continue

        downloaded = download_climate_observations_data_parallel(remote_files)

        parsed = parse_climate_observations_data(
            downloaded, parameter, time_resolution
        )

        if write_file:
            store_climate_observations(
                parsed,
                station_id,
                parameter,
                time_resolution,
                period_type,
                folder,
            )

        frames.append(parsed)

    if run_download_only:
        return None

    # pd.concat raises a ValueError if there is nothing to concatenate
    try:
        data = pd.concat(frames)
    except ValueError:
        return pd.DataFrame()

    if tidy_data:
        data = _tidy_up_data(data, parameter)

    if not humanize_column_names:
        return data

    # Assign meaningful column names (humanized).
    column_names = create_humanized_column_names_mapping(time_resolution, parameter)

    # Wide data carries the names in the columns, tidy data in the element column
    if not tidy_data:
        return data.rename(columns=column_names)

    element_column = DWDMetaColumns.ELEMENT.value
    data[element_column] = data[element_column].apply(
        lambda element: column_names[element]
    )

    return data