def get_stations(cls, **retrieval_kwargs):
    """Retrieve a list of measuring stations.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """

    # Retrieve and reshape data
    stations = retrieve(stations_cache_file, API_ENDPOINTS["stations"],
                        "station metadata", **retrieval_kwargs)
    stations = (stations
                .drop(columns=["geometry.type", "type"])
                .rename(columns={"properties.id": "id",
                                 "properties.label": "label"})
                .set_index("id"))

    # Split coordinates into columns. GeoJSON lists coordinates in
    # (longitude, latitude) order, so the columns are swapped here.
    coords = pd.DataFrame([row for row in stations["geometry.coordinates"]],
                          index=stations.index)
    stations[["lat", "lon"]] = coords[[1, 0]]
    stations.drop(columns=["geometry.coordinates"], inplace=True)

    cls.stations = stations
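# Usage sketch (hypothetical names; assumes this classmethod is exposed on
# an IRCELINE ``Metadata`` class and the module-level cache paths are
# configured):
#
#     >>> Metadata.get_stations()
#     >>> Metadata.stations[["label", "lat", "lon"]].head()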
def __init__(cls, **retrieval_kwargs):
    """Retrieve sensor information from the InfluencAir project.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Raises:
        KeyError if sheet structure does not match listed columns
    """
    sensor_info = retrieve(cache_file=sensor_info_cache_file,
                           url=SENSOR_SHEET_DOWNLOAD_URL,
                           label="InfluencAir sensor information",
                           read_func=pd.read_csv,
                           read_func_kwargs={"header": 1, "dtype": "object"},
                           call_rate_limiter=google_call_rate_limiter,
                           **retrieval_kwargs)
    try:
        sensor_info = (sensor_info[["Chip ID", "PM Sensor ID",
                                    "Hum/Temp Sensor ID", "Label", "Address",
                                    "Floor", "Side (Street/Garden)"]]
                       .rename(columns={"Side (Street/Garden)": "Side"}))
    except KeyError as error:
        raise KeyError("Could not get columns. Check if the structure or "
                       "labels of the InfluencAir sensor Google Sheet "
                       "have changed.") from error
    cls.sensors = sensor_info
    cls.initialized = True
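# Usage sketch (hypothetical; this initializer appears to populate
# class-level state on an InfluencAir ``Metadata`` class rather than an
# instance):
#
#     >>> Metadata()                 # populates Metadata.sensors
#     >>> Metadata.sensors[["Chip ID", "Label"]].head()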
def get_phenomena(cls, **retrieval_kwargs):
    """Retrieve a list of measured phenomena.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """
    phenomena = retrieve(phenomena_cache_file, API_ENDPOINTS["phenomena"],
                         "phenomenon metadata", **retrieval_kwargs)
    phenomena["id"] = phenomena["id"].astype("int")
    phenomena = phenomena.set_index("id").sort_index()
    cls.phenomena = phenomena
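# Usage sketch (hypothetical names, as above):
#
#     >>> Metadata.get_phenomena()
#     >>> Metadata.phenomena.head()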
def get_metadata(self, **retrieval_kwargs):
    """Get sensor metadata and current measurements from cache or
    luftdaten.info API.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Warns:
        UserWarning if sensor does not appear to be online
    """

    # Get and cache metadata and measurements of past five minutes
    filename = os.path.basename(self.metadata_url.rstrip("/")) + ".json"
    filepath = os.path.join(cache_dir, filename)
    parsed = retrieve(cache_file=filepath,
                      url=self.metadata_url,
                      label=("sensor {} metadata from luftdaten.info"
                             .format(self.sensor_id)),
                      call_rate_limiter=call_rate_limiter,
                      **retrieval_kwargs)

    try:
        metadata = (parsed
                    .drop(columns=["sensordatavalues", "timestamp"])
                    .iloc[0])
    except (ValueError, AttributeError, KeyError):
        warnings.warn("Sensor metadata could not be retrieved")
    else:
        metadata.name = "metadata"
        self.metadata = metadata

        # Extract metadata into corresponding properties
        self.sensor_type = metadata["sensor.sensor_type.name"]
        self.lat = float(metadata["location.latitude"])
        self.lon = float(metadata["location.longitude"])
        self.label = "at " + utils.label_coordinates(self.lat, self.lon)

        # Extract most current measurements
        current = parsed["sensordatavalues"].iloc[-1]
        current = (json_normalize(current)
                   .replace({"P1": "pm10", "P2": "pm2.5"})
                   .set_index("value_type")["value"])

        # Replace out-of-range placeholder values with NaN
        current = pd.to_numeric(current).replace([999.9, 1999.9],
                                                 float("nan"))
        self.current_measurements = dict(current)
        self.phenomena = list(current.index)
        self.units = {phenomenon: UNITS[phenomenon]
                      for phenomenon in UNITS
                      if phenomenon in self.phenomena}
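# Usage sketch (hypothetical sensor id; assumes a luftdaten.info ``Sensor``
# class exposing this method):
#
#     >>> sensor = Sensor("12345")
#     >>> sensor.get_metadata()
#     >>> sensor.sensor_type, sensor.lat, sensor.lon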
def get_time_series(cls, **retrieval_kwargs):
    """Retrieve information on available time series: a collection of
    station & phenomenon combinations.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """

    def get_phenomenon_name(label):
        """Extract phenomenon name from time series label.

        Labels have the form "<phenomenon> <series id> - <station>":
        the station part is split off first, then the trailing series
        id is dropped.
        """
        phenomenon_name_series_id = label.split(sep=" - ", maxsplit=1)[0]
        phenomenon_name = phenomenon_name_series_id.rsplit(maxsplit=1)[0]
        return phenomenon_name

    # Retrieve and reshape data
    time_series = retrieve(cache_file=time_series_cache_file,
                           url=API_ENDPOINTS["time series"],
                           label="time series metadata",
                           call_rate_limiter=call_rate_limiter,
                           **retrieval_kwargs)
    time_series["id"] = time_series["id"].astype("int")
    time_series = (time_series
                   .set_index("id")
                   .drop(columns=["station.geometry.type", "station.type"])
                   .rename(columns={"station.properties.id": "station_id",
                                    "station.properties.label":
                                        "station_label",
                                    "uom": "unit"}))

    # Extract phenomenon names from labels
    labels = time_series["label"]
    time_series["phenomenon"] = labels.apply(get_phenomenon_name)

    # Split coordinates into columns. GeoJSON lists coordinates in
    # (longitude, latitude) order, so the columns are swapped here.
    coords = pd.DataFrame(
        [row for row in time_series["station.geometry.coordinates"]],
        index=time_series.index)
    time_series[["station_lat", "station_lon"]] = coords[[1, 0]]

    # Sort and drop columns
    time_series = time_series[["label", "phenomenon", "unit", "station_id",
                               "station_label", "station_lat",
                               "station_lon"]]

    # Clean unit descriptors
    time_series["unit"] = (time_series["unit"]
                           .str.replace("m3", "m³")
                           .str.replace("ug", "µg"))
    time_series.loc[time_series["phenomenon"] == "temperature",
                    "unit"] = "°C"

    cls.time_series = time_series
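# Usage sketch (hypothetical names, as above):
#
#     >>> Metadata.get_time_series()
#     >>> is_temp = Metadata.time_series["phenomenon"] == "temperature"
#     >>> Metadata.time_series[is_temp].head()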
def get_latest_measurement(self, **retrieval_kwargs):
    """Retrieve the most recent measurement of the time series.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """
    sensor_id = self.sensor_id

    # Query today's data; derive both date strings from UTC so they are
    # consistent with each other
    today_date = pd.to_datetime("today", utc=True).normalize()
    today = today_date.strftime("%Y-%m-%d")
    tomorrow = (today_date + pd.Timedelta(days=1)).strftime("%Y-%m-%d")

    # Download the data
    url = API_ENDPOINTS["data pattern"].format(time_series_id=sensor_id,
                                               start=today, end=tomorrow)
    filename = ("irceline_{time_series_id}_{start_date}_{end_date}.json"
                .format(time_series_id=sensor_id, start_date=today,
                        end_date=tomorrow))
    filepath = os.path.join(cache_dir, filename)

    # Retrieve and parse data
    data = retrieve(filepath, url, "IRCELINE timeseries data",
                    **retrieval_kwargs)
    data = pd.DataFrame.from_dict(data.loc[0, "values"])

    # Drop missing-value placeholders and keep the last entry
    data = data[data["value"] != "NaN"]
    last_measurement = data["value"].iloc[-1]
    return last_measurement
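# Usage sketch (hypothetical time series id; assumes an IRCELINE ``Sensor``
# class where sensor_id identifies a time series):
#
#     >>> sensor = Sensor("6522")
#     >>> sensor.get_latest_measurement()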
def get_measurements(self, start_date, end_date, **retrieval_kwargs):
    """Retrieve time series data.

    Args:
        start_date: date string in ISO 8601 (YYYY-MM-DD) format.
            Interpreted as UTC.
        end_date: date string like start_date. If the current date or a
            future date is entered, end will be truncated so that only
            complete days are downloaded.
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Raises:
        ValueError if start_date is later than end_date
    """

    # Make start and end timezone aware and truncate time values
    query_start_date = pd.to_datetime(start_date, format="%Y-%m-%d",
                                      utc=True).normalize()
    query_end_date = (pd.to_datetime(end_date, format="%Y-%m-%d",
                                     utc=True).normalize()
                      + pd.Timedelta(days=1))  # To include end_date data

    # Check validity of input and truncate end date if needed
    today = pd.to_datetime("today", utc=True)
    if query_end_date > today:
        warnings.warn("Resetting end_date to yesterday")
        yesterday = today - pd.Timedelta(days=1)
        end_date = yesterday.strftime("%Y-%m-%d")
        query_end_date = today.normalize()  # 00:00 today, to include all
                                            # of yesterday's data
    if query_start_date > query_end_date:
        raise ValueError("end_date must be greater than or equal to "
                         "start_date")

    # IRCELINE API takes local times. Convert start and end accordingly.
    query_start_local = query_start_date.tz_convert("Europe/Brussels")
    query_start_local_str = query_start_local.strftime("%Y-%m-%dT%H")
    query_end_local = query_end_date.tz_convert("Europe/Brussels")
    query_end_local -= pd.Timedelta(1, "s")  # Make end of interval exclusive
    query_end_local_str = query_end_local.strftime("%Y-%m-%dT%H:%M:%S")

    url = API_ENDPOINTS["data pattern"].format(
        time_series_id=self.sensor_id,
        start=query_start_local_str,
        end=query_end_local_str)

    # TODO: Split response into days and cache as daily files; check cache
    #       day by day. Find longest missing intervals to make as few
    #       requests as possible.
    filename = ("irceline_{time_series_id}_{start_date}_{end_date}.json"
                .format(time_series_id=self.sensor_id,
                        start_date=start_date, end_date=end_date))
    filepath = os.path.join(cache_dir, filename)

    # TODO: Check day by day if data are cached

    # Retrieve and parse data
    data = retrieve(filepath, url, "IRCELINE timeseries data",
                    **retrieval_kwargs)
    data = pd.DataFrame.from_dict(data.loc[0, "values"])
    if len(data) == 0:
        return
    data["value"] = data["value"].astype("float")
    data = data.rename(columns={"value": self.metadata["phenomenon"]})

    # Convert Unix timestamps to datetimes and then to periods for index
    data.index = (pd.to_datetime(data["timestamp"], unit="ms", utc=True)
                  .dt.to_period(freq="h"))
    data.index.name = "Period"
    data = data.drop(columns=["timestamp"])

    self.measurements = data
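# Usage sketch (hypothetical id; assumes self.metadata["phenomenon"] has
# been populated by a prior metadata call):
#
#     >>> sensor = Sensor("6522")
#     >>> sensor.get_measurements(start_date="2019-01-01",
#     ...                         end_date="2019-01-07")
#     >>> sensor.measurements.head()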
def get_measurements(self, start_date, end_date, **retrieval_kwargs):
    """Get measurement data of the sensor in a given period.

    Data are read from cache if available, or downloaded from
    luftdaten.info and saved to cache as retrieved, and then cleaned for
    self.measurements. If the instance already has data associated with
    it, calling this method replaces them.

    Args:
        start_date: first date of data to retrieve, in ISO 8601
            (YYYY-MM-DD) format
        end_date: last date of data to retrieve, in ISO 8601
            (YYYY-MM-DD) format
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """
    sid = self.sensor_id
    if self.sensor_type is None:
        self.sensor_type = input("Type of sensor {} has not been set yet. "
                                 "Enter sensor type: ".format(sid))
    stype = self.sensor_type.lower()

    # Get and process the data file for each date in the requested range
    daily_data = []
    for date in pd.date_range(start_date, end_date):
        date_iso = date.strftime("%Y-%m-%d")
        filename = ARCHIVE_FILENAME_PATTERN.format(date=date_iso,
                                                   sensor_type=stype,
                                                   sensor_id=sid)
        filepath = os.path.join(cache_dir, filename)
        url = ARCHIVE_URL_PATTERN.format(date=date_iso, filename=filename)
        data = retrieve(cache_file=filepath,
                        url=url,
                        label=("luftdaten.info data for sensor {} on {}"
                               .format(sid, date_iso)),
                        read_func=pd.read_csv,
                        call_rate_limiter=call_rate_limiter,
                        read_func_kwargs={"sep": ";"},
                        **retrieval_kwargs)
        if data is None:
            continue

        # Parse timestamps and make them timezone aware
        timestamps = pd.to_datetime(data["timestamp"], utc=True)

        # Reformat data according to sensor type
        data.set_index(timestamps, inplace=True)
        if self.sensor_type in ("SDS011", "HPM"):
            data = (data[["P1", "P2"]]
                    .rename(columns={"P1": "pm10", "P2": "pm2.5"}))
        elif self.sensor_type == "DHT22":
            data = data[["temperature", "humidity"]]
        else:
            raise NotImplementedError("No data parsing method implemented "
                                      "for sensor type {}"
                                      .format(self.sensor_type))
        daily_data.append(data)

    # If daily data were retrieved, concatenate them to a single dataframe
    if daily_data:
        self.measurements = pd.concat(daily_data)
    else:
        self.measurements = None
        print("No data for sensor", sid)
        return

    # Remove duplicates
    duplicates = self.measurements.index.duplicated(keep="last")
    self.measurements = self.measurements[~duplicates]
    self.measurements.sort_index(inplace=True)

    self.clean_measurements()
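# Usage sketch (hypothetical sensor; assumes get_metadata has already set
# the sensor type, so no interactive prompt is needed):
#
#     >>> sensor = Sensor("12345")
#     >>> sensor.get_metadata()
#     >>> sensor.get_measurements(start_date="2019-01-01",
#     ...                         end_date="2019-01-07")
#     >>> sensor.measurements["pm10"].describe()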