def test_build_parameter_identifier():
    """The identifier is composed as <set>/<resolution>/<period>/station_id_<id>."""
    identifier = build_parameter_set_identifier(
        DWDObservationParameterSet.CLIMATE_SUMMARY,
        DWDObservationResolution.DAILY,
        DWDObservationPeriod.HISTORICAL,
        1,
    )

    assert identifier == "kl/daily/historical/station_id_1"
def hdf5_key(self, station_id: int) -> str:
    """
    Build the HDF5 key for a single station from the configured parameters.

    :param station_id: Station id of data
    :return: Key for storing data into HDF5 file.
    """
    return build_parameter_set_identifier(
        self.parameter_set,
        self.resolution,
        self.period,
        station_id,
    )
def collect_climate_observations_data(
    station_id: int,
    parameter_set: DWDObservationParameterSet,
    resolution: DWDObservationResolution,
    period: DWDObservationPeriod,
) -> pd.DataFrame:
    """
    Organize the complete pipeline of data collection for one station.

    Validates the parameter combination, builds the remote file list,
    downloads the files in parallel, parses them and coerces the field types.

    :param station_id: station id that is being loaded
    :param parameter_set: Parameter as enumeration
    :param resolution: Time resolution as enumeration
    :param period: Period type as enumeration

    :return: All the data given by the station ids.

    :raises InvalidParameterCombination: if the set/resolution/period
        combination is not offered by the DWD.
    """
    if not check_dwd_observations_parameter_set(parameter_set, resolution, period):
        raise InvalidParameterCombination(
            f"Invalid combination: {parameter_set.value} / {resolution.value} / "
            f"{period.value}"
        )

    remote_files = create_file_list_for_climate_observations(
        station_id, parameter_set, resolution, period
    )

    if len(remote_files) == 0:
        # Nothing to download for this station; report and hand back an
        # empty frame so callers can simply skip it.
        parameter_identifier = build_parameter_set_identifier(
            parameter_set, resolution, period, station_id
        )
        log.info(
            f"No files found for {parameter_identifier}. Station will be skipped."
        )
        return pd.DataFrame()

    filenames_and_files = download_climate_observations_data_parallel(remote_files)

    observations = parse_climate_observations_data(
        filenames_and_files, parameter_set, resolution
    )

    return coerce_field_types(observations, resolution)
def _collect_data(
    self, station_id: int, parameter_set: DWDObservationParameterSet
) -> pd.DataFrame:
    """
    Collect data of one parameter set for a single station.

    Manages restoring from and persisting to local storage, remote
    collection, and the combination of the different periods into one frame.

    Args:
        station_id: station id for which parameter is collected
        parameter_set: chosen parameter that is collected

    Returns:
        pandas.DataFrame for given parameter of station
    """
    df_parameter = pd.DataFrame()

    for period_type in self.periods:
        parameter_identifier = build_parameter_set_identifier(
            parameter_set, self.resolution, period_type, station_id
        )

        storage = None
        if self.storage:
            storage = self.storage.hdf5(
                parameter=parameter_set,
                resolution=self.resolution,
                period=period_type,
            )

            df_period = storage.restore(station_id)

            if not df_period.empty:
                # Locally stored data found, no remote request necessary.
                # NOTE: DataFrame.append was deprecated in pandas 1.4 and
                # removed in 2.0 - use pd.concat instead.
                df_parameter = pd.concat([df_parameter, df_period])
                continue

        log.info(f"Acquiring observations data for {parameter_identifier}.")

        try:
            df_period = collect_climate_observations_data(
                station_id, parameter_set, self.resolution, period_type
            )
        except InvalidParameterCombination:
            # Use .value for the period as well, consistent with the other
            # enum interpolations (previously printed the enum repr).
            log.info(
                f"Invalid combination {parameter_set.value}/"
                f"{self.resolution.value}/{period_type.value} is skipped."
            )
            df_period = pd.DataFrame()

        if self.storage and self.storage.persist:
            storage.store(station_id=station_id, df=df_period)

        # Filter out values which already are in the DataFrame
        try:
            df_period = df_period[
                ~df_period[DWDMetaColumns.DATE.value].isin(
                    df_parameter[DWDMetaColumns.DATE.value]
                )
            ]
        except KeyError:
            # One of the frames lacks the date column (e.g. it is empty);
            # there is nothing to deduplicate in that case.
            pass

        df_parameter = pd.concat([df_parameter, df_period])

    if self.tidy_data:
        df_parameter = df_parameter.dwd.tidy_up_data()
        df_parameter.insert(2, DWDMetaColumns.PARAMETER.value, parameter_set.name)

    # Assign meaningful column names (humanized).
    if self.humanize_column_names:
        hcnm = self._create_humanized_column_names_mapping(
            self.resolution, parameter_set
        )
        if self.tidy_data:
            df_parameter[DWDMetaColumns.ELEMENT.value] = df_parameter[
                DWDMetaColumns.ELEMENT.value
            ].apply(lambda x: hcnm[x])
        else:
            df_parameter = df_parameter.rename(columns=hcnm)

    return df_parameter
def _collect_station_parameter(
    self,
    station_id: str,
    parameter: Tuple[
        Union[DWDObservationParameter, DWDObservationParameterSet],
        DWDObservationParameterSet,
    ],
) -> pd.DataFrame:
    """
    Collect data for one specified parameter of a single station.

    Manages collection and combination of the different periods; for high
    resolutions the historical period is split into several date-range
    files that are collected one by one.

    Args:
        station_id: station id for which parameter is collected
        parameter: chosen parameter-parameter_set combination that is
            collected

    Returns:
        pandas.DataFrame for given parameter of station
    """
    parameter, parameter_set = parameter

    periods_and_date_ranges = []

    for period in self.periods:
        if (
            self.resolution in HIGH_RESOLUTIONS
            and period == DWDObservationPeriod.HISTORICAL
        ):
            date_ranges = self._get_historical_date_ranges(
                station_id, parameter_set
            )

            for date_range in date_ranges:
                periods_and_date_ranges.append((period, date_range))
        else:
            periods_and_date_ranges.append((period, None))

    parameter_df = pd.DataFrame()

    for period, date_range in periods_and_date_ranges:
        parameter_identifier = build_parameter_set_identifier(
            parameter_set, self.resolution, period, station_id, date_range
        )

        log.info(f"Acquiring observations data for {parameter_identifier}.")

        # TODO: integrate collect_climate_observations_data in class
        try:
            period_df = collect_climate_observations_data(
                station_id, parameter_set, self.resolution, period, date_range
            )
        except InvalidParameterCombination:
            # Use .value for the period as well, consistent with the other
            # enum interpolations (previously printed the enum repr).
            log.info(
                f"Invalid combination {parameter_set.value}/"
                f"{self.resolution.value}/{period.value} is skipped."
            )
            period_df = pd.DataFrame()

        # Filter out values which already are in the DataFrame
        try:
            period_df = period_df[
                ~period_df[DWDMetaColumns.DATE.value].isin(
                    parameter_df[DWDMetaColumns.DATE.value]
                )
            ]
        except KeyError:
            # Date column missing (e.g. one frame is empty); nothing to
            # deduplicate in that case.
            pass

        # NOTE: DataFrame.append was deprecated in pandas 1.4 and removed
        # in 2.0 - use pd.concat instead.
        parameter_df = pd.concat([parameter_df, period_df])

    if self.tidy_data:
        parameter_df = parameter_df.dwd.tidy_up_data()

    # TODO: remove this column and rather move it into metadata of resulting
    #  data model
    parameter_df.insert(2, DWDMetaColumns.PARAMETER_SET.value, parameter_set.name)
    parameter_df[DWDMetaColumns.PARAMETER_SET.value] = parameter_df[
        DWDMetaColumns.PARAMETER_SET.value
    ].astype("category")

    if parameter not in DWDObservationParameterSet:
        # A single parameter was requested: narrow the parameter-set data
        # down to the rows of that one parameter.
        parameter_df = parameter_df[
            parameter_df[DWDMetaColumns.PARAMETER.value] == parameter.value
        ]

    return parameter_df