Ejemplo n.º 1
0
    def __init__(
        self,
        parameter_set: Union[str, DWDObservationParameterSet],
        resolution: Union[str, DWDObservationResolution],
        period: Union[str, DWDObservationPeriod] = None,
        start_date: Union[None, str, Timestamp] = None,
        end_date: Union[None, str, Timestamp] = None,
    ):
        """
        Validate the requested combination and store it on the instance.

        :param parameter_set: parameter set given as string or enumeration
        :param resolution: resolution given as string or enumeration
        :param period: period given as string or enumeration
        :param start_date: start date, forwarded to the parent initialiser
        :param end_date: end date, forwarded to the parent initialiser
        :raises InvalidParameterCombination: if
            ``check_dwd_observations_parameter_set`` rejects the combination
        """
        super().__init__(start_date=start_date, end_date=end_date)

        # Resolve every argument to a member of its enumeration.
        parameter_set_enum = parse_enumeration_from_template(
            parameter_set, DWDObservationParameterSet
        )
        resolution_enum = parse_enumeration_from_template(
            resolution, DWDObservationResolution
        )
        period_enum = parse_enumeration_from_template(period, DWDObservationPeriod)

        combination_is_valid = check_dwd_observations_parameter_set(
            parameter_set_enum, resolution_enum, period_enum
        )
        if not combination_is_valid:
            message = (
                f"The combination of {parameter_set_enum.value}, {resolution_enum.value}, "
                f"{period_enum.value} is invalid."
            )
            raise InvalidParameterCombination(message)

        # The parameter set is exposed under the attribute name ``parameter``.
        self.parameter = parameter_set_enum
        self.resolution = resolution_enum
        self.period = period_enum
Ejemplo n.º 2
0
    def __init__(
        self,
        parameter_set: Union[str, DWDObservationParameterSet],
        resolution: Union[str, DWDObservationResolution],
        period: Union[str, DWDObservationPeriod] = None,
        start_date: Union[None, str, Timestamp] = None,
        end_date: Union[None, str, Timestamp] = None,
    ):
        """
        Parse the request arguments, validate them and keep them as attributes.

        :param parameter_set: parameter set str/enumeration
        :param resolution: resolution str/enumeration
        :param period: period str/enumeration
        :param start_date: start date to limit the stations
        :param end_date: end date to limit the stations
        :raises InvalidParameterCombination: if the parsed parameter set,
            resolution and period do not pass
            ``check_dwd_observations_parameter_set``
        """
        super().__init__(start_date=start_date, end_date=end_date)

        # Strings and enumeration members are both accepted; convert each one
        # to its enumeration member before validating.
        parameter_set, resolution, period = (
            parse_enumeration_from_template(
                parameter_set, DWDObservationParameterSet),
            parse_enumeration_from_template(
                resolution, DWDObservationResolution),
            parse_enumeration_from_template(period, DWDObservationPeriod),
        )

        # TODO: move to _all and replace error with logging + empty dataframe
        is_valid = check_dwd_observations_parameter_set(
            parameter_set, resolution, period)
        if not is_valid:
            error_text = (
                f"The combination of {parameter_set.value}, {resolution.value}, "
                f"{period.value} is invalid.")
            raise InvalidParameterCombination(error_text)

        self.parameter, self.resolution, self.period = (
            parameter_set, resolution, period)
Ejemplo n.º 3
0
def collect_climate_observations_data(
    station_id: int,
    parameter_set: DWDObservationParameterSet,
    resolution: DWDObservationResolution,
    period: DWDObservationPeriod,
) -> pd.DataFrame:
    """
    Collect the observation data of a single station.

    The combination of parameter set, resolution and period is validated
    first. Then the list of remote files for the station is created, the files
    are downloaded and parsed, and finally the field types of the parsed data
    are coerced.

    :param station_id:      station id that is being loaded
    :param parameter_set:   Parameter set as enumeration
    :param resolution:      Time resolution as enumeration
    :param period:          Period type as enumeration

    :return:                The parsed and type-coerced data of the station,
                            or an empty DataFrame if no remote files were
                            found.
    :raises InvalidParameterCombination: if the combination does not pass
                            ``check_dwd_observations_parameter_set``
    """
    combination_is_valid = check_dwd_observations_parameter_set(
        parameter_set, resolution, period)
    if not combination_is_valid:
        raise InvalidParameterCombination(
            f"Invalid combination: {parameter_set.value} / {resolution.value} / "
            f"{period.value}")

    remote_files = create_file_list_for_climate_observations(
        station_id, parameter_set, resolution, period)

    if len(remote_files) != 0:
        downloaded = download_climate_observations_data_parallel(remote_files)
        parsed = parse_climate_observations_data(
            downloaded, parameter_set, resolution)
        return coerce_field_types(parsed, resolution)

    # Nothing to download: report the skipped station and hand back an empty
    # frame instead of raising.
    parameter_identifier = build_parameter_set_identifier(
        parameter_set, resolution, period, station_id)
    log.info(
        f"No files found for {parameter_identifier}. Station will be skipped."
    )
    return pd.DataFrame()
Ejemplo n.º 4
0
def get_nearby_stations_by_number(
    latitude: float,
    longitude: float,
    num_stations_nearby: int,
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    minimal_available_date: Optional[Union[datetime, str]] = None,
    maximal_available_date: Optional[Union[datetime, str]] = None,
) -> pd.DataFrame:
    """
    Provides the metadata of the weather stations nearest to a location.

    :param latitude:                Latitude of location to search for nearest
                                    weather station
    :param longitude:               Longitude of location to search for nearest
                                    weather station
    :param num_stations_nearby:     Number of stations that should be nearby
    :param parameter:               Observation measure
    :param time_resolution:         Frequency/granularity of measurement interval
    :param period_type:             Recent or historical files
    :param minimal_available_date:  Start date of timespan where measurements
                                    should be available (optional)
    :param maximal_available_date:  End date of timespan where measurements
                                    should be available (optional)

    :return:                        DataFrame with the metadata of the nearest
                                    stations and an additional column holding
                                    the distance to the requested location

    :raises ValueError:             if 'num_stations_nearby' is smaller than 1
                                    or 'minimal_available_date' lies after
                                    'maximal_available_date'
    :raises InvalidParameterCombination: if the combination of parameter,
                                    time resolution and period type does not
                                    pass ``check_parameters``
    """
    if num_stations_nearby <= 0:
        raise ValueError("'num_stations_nearby' has to be at least 1.")

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution,
                                                      TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid.")

    # Each date is normalised on its own: falsy values (e.g. None) and
    # datetime instances are kept, everything else is parsed. The decision for
    # the maximal date must depend on the maximal date itself, not on the
    # minimal one.
    if minimal_available_date and not isinstance(minimal_available_date,
                                                 datetime):
        minimal_available_date = parse_datetime(minimal_available_date)
    if maximal_available_date and not isinstance(maximal_available_date,
                                                 datetime):
        maximal_available_date = parse_datetime(maximal_available_date)

    if minimal_available_date and maximal_available_date:
        if minimal_available_date > maximal_available_date:
            raise ValueError("'minimal_available_date' has to be before "
                             "'maximal_available_date'")

    coords = Coordinates(np.array(latitude), np.array(longitude))

    metadata = metadata_for_climate_observations(parameter, time_resolution,
                                                 period_type)

    # Filter only for stations that have a file
    metadata = metadata[metadata[DWDMetaColumns.HAS_FILE.value].values]

    # Keep only stations whose availability covers the requested dates
    if minimal_available_date:
        metadata = metadata[
            metadata[DWDMetaColumns.FROM_DATE.value] <= minimal_available_date]

    if maximal_available_date:
        metadata = metadata[
            metadata[DWDMetaColumns.TO_DATE.value] >= maximal_available_date]

    # Positional indices returned below have to match the row positions
    metadata = metadata.reset_index(drop=True)

    distances, indices_nearest_neighbours = _derive_nearest_neighbours(
        metadata.LAT.values, metadata.LON.values, coords, num_stations_nearby)

    distances = pd.Series(distances)
    indices_nearest_neighbours = pd.Series(indices_nearest_neighbours)

    # If num_stations_nearby is higher than the actual amount of stations,
    # further indices and distances are added which have to be filtered out
    num_results = min(metadata.shape[0], num_stations_nearby)
    distances = distances[:num_results]
    indices_nearest_neighbours = indices_nearest_neighbours[:num_results]

    distances_km = np.array(distances * KM_EARTH_RADIUS)

    metadata_location = metadata.iloc[
        indices_nearest_neighbours, :].reset_index(drop=True)

    metadata_location[DWDMetaColumns.DISTANCE_TO_LOCATION.value] = distances_km

    if metadata_location.empty:
        logger.warning(f"No weather stations were found for coordinate "
                       f"{latitude}°N and {longitude}°E ")

    return metadata_location
Ejemplo n.º 5
0
def get_nearby_stations(
    latitude: float,
    longitude: float,
    minimal_available_date: Union[datetime, str],
    maximal_available_date: Union[datetime, str],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    num_stations_nearby: Optional[int] = None,
    max_distance_in_km: Optional[float] = None,
) -> pd.DataFrame:
    """
    Provides the metadata of weather stations near a location, selected either
    by number of stations or by maximum distance.

    Args:
        latitude: latitude of location to search for nearest
            weather station
        longitude: longitude of location to search for nearest
            weather station
        minimal_available_date: Start date of timespan where measurements
            should be available
        maximal_available_date: End date of timespan where measurements
            should be available
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        num_stations_nearby: Number of stations that should be nearby
        max_distance_in_km: alternative filtering criteria, maximum
            distance to location in km

    Returns:
        DataFrame with the metadata of the selected stations and an
        additional "DISTANCE_TO_LOCATION" column

    Raises:
        ValueError: if both 'num_stations_nearby' and 'max_distance_in_km'
            are set, or if 'num_stations_nearby' is smaller than 1
        InvalidParameterCombination: if the combination of parameter, time
            resolution and period type does not pass ``check_parameters``
    """
    # The two selection criteria are mutually exclusive.
    if num_stations_nearby and max_distance_in_km:
        raise ValueError(
            "Either set 'num_stations_nearby' or 'max_distance_in_km'.")

    # Reject zero as well as negative counts; None means "not set" and is
    # handled by the distance criterion below.
    # NOTE(review): if neither criterion is set, None is forwarded as the
    # number of neighbours - confirm whether that case should be rejected here.
    if num_stations_nearby is not None and num_stations_nearby <= 0:
        raise ValueError("'num_stations_nearby' has to be at least 1.")

    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution,
                                                      TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    # Anything that is not already a datetime is handed to the parser.
    if not isinstance(minimal_available_date, datetime):
        minimal_available_date = parse_datetime(minimal_available_date)
    if not isinstance(maximal_available_date, datetime):
        maximal_available_date = parse_datetime(maximal_available_date)

    if not check_parameters(parameter, time_resolution, period_type):
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid.")

    coords = Coordinates(np.array(latitude), np.array(longitude))

    metadata = metadata_for_climate_observations(parameter, time_resolution,
                                                 period_type)

    # Keep only stations whose availability covers the requested timespan;
    # the index is reset so that labels equal row positions.
    covers_timespan = (
        (metadata[DWDMetaColumns.FROM_DATE.value] <= minimal_available_date)
        & (metadata[DWDMetaColumns.TO_DATE.value] >= maximal_available_date))
    metadata = metadata[covers_timespan].reset_index(drop=True)

    # For distance filtering make normal query including all stations
    if max_distance_in_km:
        num_stations_nearby = metadata.shape[0]

    distances, indices_nearest_neighbours = _derive_nearest_neighbours(
        metadata.LAT.values, metadata.LON.values, coords, num_stations_nearby)

    # Require list of indices for consistency
    # Cast to np.array required for subset
    indices_nearest_neighbours = np.array(
        cast_to_list(indices_nearest_neighbours))
    distances_km = np.array(distances * KM_EARTH_RADIUS)

    # Filter for distance based on calculated distances
    if max_distance_in_km:
        _in_max_distance_indices = np.where(
            distances_km <= max_distance_in_km)[0]
        indices_nearest_neighbours = indices_nearest_neighbours[
            _in_max_distance_indices]
        distances_km = distances_km[_in_max_distance_indices]

    # indices_nearest_neighbours is always an np.ndarray at this point, so it
    # can be used for label-based selection directly.
    metadata_location = metadata.loc[indices_nearest_neighbours, :]
    metadata_location["DISTANCE_TO_LOCATION"] = distances_km

    if metadata_location.empty:
        logger.warning(f"No weather station was found for coordinate "
                       f"{latitude}°N and {longitude}°E ")

    return metadata_location
Ejemplo n.º 6
0
def collect_climate_observations_data(
    station_ids: List[int],
    parameter: Union[Parameter, str],
    time_resolution: Union[TimeResolution, str],
    period_type: Union[PeriodType, str],
    folder: Union[str, Path] = DWD_FOLDER_MAIN,
    prefer_local: bool = False,
    write_file: bool = False,
    tidy_data: bool = True,
    humanize_column_names: bool = False,
    run_download_only: bool = False,
    create_new_file_index: bool = False,
) -> Optional[pd.DataFrame]:
    """
    Run the data collection pipeline for a set of stations.

    For every distinct station id the data is either restored from the local
    store (when ``prefer_local`` is set and the restored data is not empty) or
    downloaded and parsed from the remote files. Freshly downloaded data is
    optionally written to the local store. The per-station results are
    concatenated and optionally tidied and given humanized names.

    Args:
        station_ids: station ids that are trying to be loaded
        parameter: parameter as enumeration or string
        time_resolution: time resolution as enumeration or string
        period_type: period type as enumeration or string
        folder: folder for local file interaction
        prefer_local: boolean for if local data should be preferred
        write_file: boolean to write data to local storage
        tidy_data: boolean to tidy up data so that there's only one set of
            values for a datetime in a row,
            e.g. station_id, parameter, element, datetime, value, quality
        humanize_column_names: boolean to yield column names better for
            human consumption
        run_download_only: boolean to run only the download and storing
            process; nothing is returned in that case
        create_new_file_index: boolean if to create a new file index for the
            data selection

    Returns:
        a pandas DataFrame with all the data given by the station ids, an
        empty DataFrame if concatenating the collected data raises a
        ValueError, or None if ``run_download_only`` is set

    Raises:
        InvalidParameterCombination: if the combination of parameter, time
            resolution and period type does not pass ``check_parameters``
    """
    parameter = parse_enumeration_from_template(parameter, Parameter)
    time_resolution = parse_enumeration_from_template(time_resolution, TimeResolution)
    period_type = parse_enumeration_from_template(period_type, PeriodType)

    combination_is_valid = check_parameters(parameter, time_resolution, period_type)
    if not combination_is_valid:
        raise InvalidParameterCombination(
            f"The combination of {parameter.value}, {time_resolution.value}, "
            f"{period_type.value} is invalid."
        )

    if create_new_file_index:
        reset_file_index_cache()

    # One DataFrame per station id that yielded data
    collected_frames = []
    for station_id in set(station_ids):
        request_string = _build_local_store_key(
            station_id, parameter, time_resolution, period_type
        )

        if prefer_local:
            # Local data wins when present; otherwise fall through to download
            restored = restore_climate_observations(
                station_id, parameter, time_resolution, period_type, folder
            )
            if not restored.empty:
                log.info(f"Data for {request_string} restored from local.")
                collected_frames.append(restored)
                continue

        log.info(f"Data for {request_string} will be collected from internet.")

        remote_files = create_file_list_for_climate_observations(
            [station_id], parameter, time_resolution, period_type
        )
        if len(remote_files) == 0:
            log.info(f"No files found for {request_string}. Station will be skipped.")
            continue

        downloaded = download_climate_observations_data_parallel(remote_files)
        fetched = parse_climate_observations_data(
            downloaded, parameter, time_resolution
        )

        if write_file:
            store_climate_observations(
                fetched, station_id, parameter, time_resolution, period_type, folder
            )

        collected_frames.append(fetched)

    if run_download_only:
        return None

    # A ValueError from concatenation is answered with an empty frame
    try:
        combined = pd.concat(collected_frames)
    except ValueError:
        return pd.DataFrame()

    if tidy_data:
        combined = _tidy_up_data(combined, parameter)

    if not humanize_column_names:
        return combined

    # Assign meaningful (humanized) names: tidy data carries the names as
    # values of the element column, untidy data carries them as column labels.
    hcnm = create_humanized_column_names_mapping(time_resolution, parameter)
    if tidy_data:
        element_column = DWDMetaColumns.ELEMENT.value
        combined[element_column] = combined[element_column].apply(
            lambda x: hcnm[x]
        )
        return combined

    return combined.rename(columns=hcnm)