def extract_airqo_hourly_data_from_api(start_time: str, end_time: str) -> list:
    """Pull hourly events for every AirQo device in the given window.

    Devices without a ``name`` key are skipped with a log message, as are
    devices that report no measurements. The nested API columns are renamed
    to flat names before the records are returned.
    """
    airqo_api = AirQoApi()
    devices = list(airqo_api.get_devices(tenant="airqo"))

    if not devices:
        print("devices empty")
        return []

    collected_events = []
    for device in devices:
        try:
            if "name" not in device.keys():
                print(f"name missing in device keys : {device}")
                continue

            device_name = device["name"]
            device_events = airqo_api.get_events(
                tenant="airqo",
                start_time=start_time,
                frequency="hourly",
                end_time=end_time,
                device=device_name,
            )

            if not device_events:
                print(
                    f"No measurements for {device_name} : startTime {start_time} : endTime : {end_time}"
                )
                continue

            collected_events.extend(device_events)
        except Exception as ex:
            traceback.print_exc()
            print(ex)

    measurements_df = pd.json_normalize(collected_events)
    # flatten the nested "<field>.value" column names coming from the API
    measurements_df.rename(
        columns={
            "internalTemperature.value": "internalTemperature",
            "internalHumidity.value": "internalHumidity",
            "externalTemperature.value": "temperature",
            "externalHumidity.value": "humidity",
            "externalPressure.value": "pressure",
            "speed.value": "windSpeed",
            "altitude.value": "altitude",
            "battery.value": "battery",
            "satellites.value": "satellites",
            "hdop.value": "hdop",
            "pm10.value": "pm10",
            "s2_pm10.value": "s2_pm10",
            "s2_pm2_5.value": "s2_pm2_5",
            "average_pm2_5.calibratedValue": "calibrated_pm2_5",
        },
        inplace=True,
    )
    return measurements_df.to_dict(orient="records")
def load(inputs: dict):
    """Post the KCCA measurements carried in ``inputs['data']`` to the events API."""
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_api import AirQoApi

    measurements = un_fill_nan(inputs.get("data"))
    AirQoApi().save_events(measurements=measurements, tenant="kcca")
def load(kcca_data: dict, **kwargs):
    """Route transformed KCCA hourly data to the configured destination.

    The destination is read from the triggering ``dag_run``'s conf and
    defaults to ``"bigquery"``. Valid values: ``bigquery``,
    ``message-broker``, ``api``; anything else raises.
    """
    from airqo_etl_utils.kcca_utils import (
        transform_kcca_measurements_for_api,
        transform_kcca_hourly_data_for_bigquery,
        transform_kcca_data_for_message_broker,
    )
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_api import AirQoApi
    from airqo_etl_utils.message_broker import KafkaBrokerClient
    from airqo_etl_utils.bigquery_api import BigQueryApi
    from airqo_etl_utils.config import configuration

    data = un_fill_nan(kcca_data.get("data"))

    try:
        destination = kwargs.get("dag_run").conf["destination"]
    except (KeyError, AttributeError, TypeError):
        # dag_run may be absent (None), conf may be None, or the key may be
        # missing — the original `except KeyError` let the first two crash.
        destination = "bigquery"

    if destination == "bigquery":
        kcca_transformed_data = transform_kcca_hourly_data_for_bigquery(data)
        big_query_api = BigQueryApi()
        big_query_api.save_data(
            data=kcca_transformed_data,
            table=big_query_api.hourly_measurements_table,
        )
    elif destination == "message-broker":
        kcca_transformed_data = transform_kcca_data_for_message_broker(
            data=data, frequency="hourly"
        )
        info = {
            "data": kcca_transformed_data,
            "action": "insert",
            "tenant": "kcca",
        }
        kafka = KafkaBrokerClient()
        kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)
    elif destination == "api":
        kcca_transformed_data = transform_kcca_measurements_for_api(data)
        airqo_api = AirQoApi()
        airqo_api.save_events(measurements=kcca_transformed_data, tenant="kcca")
    else:
        raise Exception(
            "Invalid data destination. Valid values are bigquery, message-broker and api"
        )
def send_raw_measurements_to_api(airqo_data: dict):
    """Restructure raw AirQo measurements for the API and post them."""
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_utils import restructure_airqo_data
    from airqo_etl_utils.airqo_api import AirQoApi

    raw_measurements = un_fill_nan(airqo_data.get("data"))
    restructured = restructure_airqo_data(data=raw_measurements, destination="api")
    AirQoApi().save_events(measurements=restructured, tenant="airqo")
def send_hourly_measurements_to_api(inputs: dict):
    """Transform KCCA hourly measurements into the API shape and post them."""
    from airqo_etl_utils.kcca_utils import transform_kcca_measurements_for_api
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_api import AirQoApi

    raw_measurements = un_fill_nan(inputs.get("data"))
    api_measurements = transform_kcca_measurements_for_api(raw_measurements)
    AirQoApi().save_events(measurements=api_measurements, tenant="kcca")
def create_empty_insights():
    """Generate placeholder hourly and daily insights for every AirQo site.

    NOTE(review): relies on module/template-level ``start_date_time`` and
    ``end_date_time`` names that are not parameters — confirm they are
    injected by the surrounding DAG context.
    """
    from airqo_etl_utils.airqo_api import AirQoApi
    from airqo_etl_utils.commons import fill_nan
    import random
    import pandas as pd
    from airqo_etl_utils.date import (
        date_to_str_days,
        date_to_str_hours,
    )

    airqo_api = AirQoApi()
    sites = airqo_api.get_sites(tenant="airqo")
    insights = []

    def _append_placeholders(freq_alias, formatter, frequency_label):
        # one random placeholder record per site per period
        for date in pd.date_range(start_date_time, end_date_time, freq=freq_alias):
            date_time = formatter(date)
            for site in sites:
                try:
                    insights.append(
                        {
                            "time": date_time,
                            "pm2_5": random.uniform(50.0, 150.0),
                            "pm10": random.uniform(50.0, 150.0),
                            "empty": True,
                            "frequency": frequency_label,
                            "forecast": False,
                            "siteId": site["_id"],
                        }
                    )
                except Exception as ex:
                    print(ex)

    _append_placeholders("1H", date_to_str_hours, "HOURLY")
    _append_placeholders("24H", date_to_str_days, "DAILY")

    return dict({"data": fill_nan(data=insights)})
def extract_airqo_devices_deployment_history() -> list:
    """Build a per-device timeline of deployment periods from maintenance logs.

    For each device, deployment logs are sorted by date; each log's site is
    considered active until the next log's date (the last period ends "now").
    Devices with fewer than two logs, or that never changed site, are skipped.

    Returns a list of dicts with device, device_id, start_time, end_time,
    site_id (times as strings via date_to_str).
    """
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="airqo")
    devices_history = []
    for device in devices:
        try:
            maintenance_logs = airqo_api.get_maintenance_logs(
                tenant="airqo", device=device["name"], activity_type="deployment"
            )
            if not maintenance_logs or len(maintenance_logs) <= 1:
                continue

            log_df = pd.DataFrame(maintenance_logs)
            log_df = log_df.dropna(subset=["date"])
            # fillna(method=...) is deprecated/removed in modern pandas;
            # bfill()/ffill() are the equivalent fills
            log_df["site_id"] = log_df["site_id"].bfill().ffill()
            log_df = log_df.dropna(subset=["site_id"])

            log_df["start_time"] = pd.to_datetime(log_df["date"])
            log_df = log_df.sort_values(by="start_time")
            # each deployment lasts until the next one begins; the final one
            # is open-ended, so cap it at "now"
            log_df["end_time"] = log_df["start_time"].shift(-1)
            log_df["end_time"] = log_df["end_time"].fillna(datetime.utcnow())
            log_df["start_time"] = log_df["start_time"].apply(lambda x: date_to_str(x))
            log_df["end_time"] = log_df["end_time"].apply(lambda x: date_to_str(x))

            # a device that never moved has no history worth recording
            if len(set(log_df["site_id"].tolist())) == 1:
                continue

            for _, raw in log_df.iterrows():
                devices_history.append(
                    {
                        "device": raw["device"],
                        "device_id": device["_id"],
                        "start_time": raw["start_time"],
                        "end_time": raw["end_time"],
                        "site_id": raw["site_id"],
                    }
                )
        except Exception as ex:
            print(ex)
            traceback.print_exc()

    return devices_history
def load(airqo_data: dict, **kwargs):
    """Route restructured AirQo hourly data to the configured destination.

    The destination is read from the triggering ``dag_run``'s conf and
    defaults to ``"bigquery"``. Valid values: ``bigquery``,
    ``message-broker``, ``api``; anything else raises.
    """
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.bigquery_api import BigQueryApi
    from airqo_etl_utils.airqo_api import AirQoApi
    from airqo_etl_utils.airqo_utils import restructure_airqo_data
    from airqo_etl_utils.config import configuration
    from airqo_etl_utils.message_broker import KafkaBrokerClient

    data = un_fill_nan(airqo_data.get("data"))

    try:
        destination = kwargs.get("dag_run").conf["destination"]
    except (KeyError, AttributeError, TypeError):
        # dag_run may be absent (None), conf may be None, or the key may be
        # missing — the original `except KeyError` let the first two crash.
        destination = "bigquery"

    if destination == "bigquery":
        airqo_restructured_data = restructure_airqo_data(
            data=data, destination="bigquery"
        )
        big_query_api = BigQueryApi()
        big_query_api.save_data(
            data=airqo_restructured_data,
            table=big_query_api.hourly_measurements_table,
        )
    elif destination == "message-broker":
        airqo_restructured_data = restructure_airqo_data(
            data=data, destination="message-broker"
        )
        info = {
            "data": airqo_restructured_data,
            "action": "insert",
            "tenant": "airqo",
        }
        kafka = KafkaBrokerClient()
        kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)
    elif destination == "api":
        airqo_restructured_data = restructure_airqo_data(
            data=data, destination="api"
        )
        airqo_api = AirQoApi()
        airqo_api.save_events(measurements=airqo_restructured_data, tenant="airqo")
    else:
        raise Exception(
            "Invalid data destination. Valid values are bigquery, message-broker and api"
        )
def get_airqo_data(freq: str, start_time: str = None, end_time: str = None) -> list:
    """Download AirQo device events over the window and format them as insights.

    Defaults to the last seven days when no window is given. The window is
    walked in API-frequency-sized chunks per device; per-chunk failures are
    logged and skipped.
    """
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="airqo", all_devices=False)
    measurements = []

    # default window: the seven days ending now
    start = str_to_date(start_time) if start_time else datetime.utcnow() - timedelta(days=7)
    end = str_to_date(end_time) if end_time else datetime.utcnow()

    if freq == "daily":
        start_time = date_to_str_days(start)
        end_time = date_to_str_days(end)
    else:
        start_time = date_to_str_hours(start)
        end_time = date_to_str_hours(end)

    frequency = get_airqo_api_frequency(freq=freq)
    dates = pd.date_range(start_time, end_time, freq=frequency)
    last_date_time = dates.values[-1]

    for device in devices:
        for date in dates:
            start = date_to_str(date)
            window_end = date + timedelta(hours=dates.freq.n)
            # clamp the final chunk to the requested end of window
            if np.datetime64(window_end) > last_date_time:
                end = end_time
            else:
                end = date_to_str(window_end)

            try:
                events = airqo_api.get_events(
                    tenant="airqo",
                    start_time=start,
                    frequency=freq,
                    end_time=end,
                    device=device["name"],
                )
                measurements.extend(events)
            except Exception as ex:
                print(ex)
                traceback.print_exc()

    return format_measurements_to_insights(data=measurements)
def calibrate_using_api(measurements: list) -> list:
    """Attach calibrated PM2.5/PM10 values to measurements, grouped by time.

    Each timestamp group is sent to the calibration endpoint as one batch;
    returned values are matched back to rows by device_id. Failures are
    logged and the affected group/value is skipped.
    """
    if not measurements:
        return []

    airqo_api = AirQoApi()
    calibrated_measurements = []

    for _, time_group in pd.DataFrame(measurements).groupby("time"):
        try:
            group_df = time_group
            date_time = group_df.iloc[0]["time"]
            calibrated_values = airqo_api.get_calibrated_values(
                time=date_time, calibrate_body=group_df.to_dict(orient="records")
            )
            for value in calibrated_values:
                try:
                    row_mask = (group_df["device_id"] == value["device_id"]) & (
                        group_df["time"] == date_time
                    )
                    group_df.loc[row_mask, "calibrated_pm2_5"] = value["calibrated_PM2.5"]
                    group_df.loc[row_mask, "calibrated_pm10"] = value["calibrated_PM10"]
                except Exception as ex:
                    traceback.print_exc()
                    print(ex)
                    continue
            calibrated_measurements.extend(group_df.to_dict(orient="records"))
        except Exception as ex:
            traceback.print_exc()
            print(ex)
            continue

    return calibrated_measurements
def get_weather_data_from_tahmo(start_time=None, end_time=None, tenant="airqo"):
    """Fetch raw TAHMO readings for the stations nearest to the tenant's sites.

    Sites without a nearest_tahmo_station are ignored. The window is walked
    in frequency-sized chunks; results with invalid dates are filtered out
    before returning records.
    """
    airqo_api = AirQoApi()
    station_codes = []
    for site in airqo_api.get_sites(tenant=tenant):
        try:
            if "nearest_tahmo_station" in dict(site).keys():
                station_codes.append(site["nearest_tahmo_station"]["code"])
        except Exception as ex:
            print(ex)

    tahmo_api = TahmoApi()
    frequency = get_frequency(start_time=start_time, end_time=end_time)
    dates = pd.date_range(start_time, end_time, freq=frequency)
    last_date_time = dates.values[-1]

    measurements = []
    for date in dates:
        start = date_to_str(date)
        window_end = date + timedelta(hours=dates.freq.n)
        # clamp the final chunk to the requested end of window
        if np.datetime64(window_end) > last_date_time:
            end = end_time
        else:
            end = date_to_str(window_end)

        print(start + " : " + end)
        measurements.extend(tahmo_api.get_measurements(start, end, station_codes))

    if not measurements:
        empty_df = pd.DataFrame([], columns=["value", "variable", "station", "time"])
        return empty_df.to_dict(orient="records")

    measurements_df = pd.DataFrame(data=measurements)
    clean_measurements_df = remove_invalid_dates(
        dataframe=measurements_df, start_time=start_time, end_time=end_time
    )
    return clean_measurements_df.to_dict(orient="records")
def get_forecast_data(tenant: str) -> list:
    """Fetch next-hour forecasts for every deployed device of a tenant.

    Devices without a site are skipped (logged); devices without a
    device_number cannot be queried and are skipped silently. Forecast rows
    with NaN pm2_5 are dropped before returning records.
    """
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant=tenant, all_devices=False)
    forecast_frames = []
    # forecasts are requested for one hour from now (unix seconds)
    time = int((datetime.utcnow() + timedelta(hours=1)).timestamp())

    for device in devices:
        device_dict = dict(device)
        device_number = device_dict.get("device_number", None)
        site = device_dict.get("site", None)
        if not site:
            print(f"device {device_number} isn't attached to a site.")
            continue
        site_id = site["_id"]

        if device_number:
            forecast = airqo_api.get_forecast(channel_id=device_number, timestamp=time)
            if forecast:
                forecast_df = pd.DataFrame(forecast)

                forecast_cleaned_df = pd.DataFrame(columns=insights_columns)
                forecast_cleaned_df["time"] = forecast_df["prediction_time"]
                forecast_cleaned_df["pm2_5"] = forecast_df["prediction_value"]
                forecast_cleaned_df["pm10"] = forecast_df["prediction_value"]
                forecast_cleaned_df["siteId"] = site_id
                forecast_cleaned_df["frequency"] = "hourly"
                forecast_cleaned_df["forecast"] = True
                forecast_cleaned_df["empty"] = False

                forecast_frames.append(forecast_cleaned_df)

    # DataFrame.append was removed in pandas 2.0 — accumulate frames and
    # concatenate once instead
    if forecast_frames:
        forecast_measurements = pd.concat(forecast_frames, ignore_index=True)
    else:
        forecast_measurements = pd.DataFrame(data=[], columns=insights_columns)

    forecast_measurements["time"] = forecast_measurements["time"].apply(
        lambda x: predict_time_to_string(x)
    )
    forecast_measurements = forecast_measurements[
        forecast_measurements["pm2_5"].notna()
    ]

    return forecast_measurements.to_dict(orient="records")
def map_site_ids_to_historical_measurements(data: list, deployment_logs: list) -> list:
    """Rewrite each measurement's site_id using the device's deployment history.

    A measurement keeps its device's current site unless a deployment log
    interval covers the measurement time, in which case the logged site wins
    (the last matching log applies). Measurements whose device cannot be
    found are dropped. Returns the input unchanged when either argument is
    empty.
    """
    if not deployment_logs or not data:
        return data

    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="airqo")
    mapped_data = []

    devices_logs_df = pd.DataFrame(deployment_logs)
    devices_logs_df["start_time"] = devices_logs_df["start_time"].apply(
        lambda x: str_to_date(x)
    )
    devices_logs_df["end_time"] = devices_logs_df["end_time"].apply(
        lambda x: str_to_date(x)
    )

    data = un_fill_nan(data)
    data_df = pd.DataFrame(data)

    for _, data_row in data_df.iterrows():
        device = get_device(devices, device_id=data_row["device_id"])
        if not device:
            continue

        # a device without a site previously crashed with AttributeError;
        # default to None and let deployment logs supply the site where known
        site = device.get("site") or {}
        site_id = site.get("_id")
        time = str_to_date(data_row["time"])
        device_logs = devices_logs_df[devices_logs_df["device_id"] == device.get("_id")]

        if not device_logs.empty:
            for _, log in device_logs.iterrows():
                if log["start_time"] <= time <= log["end_time"]:
                    site_id = log["site_id"]

        data_row["site_id"] = site_id
        mapped_data.append(data_row.to_dict())

    return mapped_data
def query_insights_data(freq: str, start_date_time: str, end_date_time: str, forecast=False, all_data=False) -> list:
    """Page through the app-insights API over the window in frequency-sized chunks.

    Per-chunk API failures are logged and skipped; successful results are
    concatenated into one list.
    """
    airqo_api = AirQoApi()
    insights = []
    frequency = get_frequency(start_time=start_date_time, end_time=end_date_time)
    dates = pd.date_range(start_date_time, end_date_time, freq=frequency)
    last_date_time = dates.values[-1]

    for date in dates:
        start = date_to_str(date)
        window_end = date + timedelta(hours=dates.freq.n)
        # clamp the final chunk to the requested end of window
        if np.datetime64(window_end) > last_date_time:
            end = end_date_time
        else:
            end = date_to_str(window_end)

        try:
            insights.extend(
                airqo_api.get_app_insights(
                    start_time=start,
                    frequency=freq,
                    end_time=end,
                    forecast=forecast,
                    all_data=all_data,
                )
            )
        except Exception as ex:
            print(ex)
            traceback.print_exc()

    return insights
def extract_sites_meta_data(tenant=None) -> list:
    """Return site metadata records with location fields renamed to site_* names."""
    # columns to keep, in output order
    keep = [
        "_id", "latitude", "tenant", "longitude", "name",
        "bearing_to_kampala_center", "landform_90",
        "distance_to_kampala_center", "altitude", "landform_270", "aspect",
        "description", "distance_to_nearest_tertiary_road",
        "distance_to_nearest_primary_road", "distance_to_nearest_road",
        "distance_to_nearest_residential_road",
        "distance_to_nearest_secondary_road",
        "distance_to_nearest_unclassified_road", "country", "region",
        "parish", "sub_county", "county", "district", "city",
    ]
    renames = {
        "_id": "site_id",
        "latitude": "site_latitude",
        "longitude": "site_longitude",
        "description": "site_description",
        "altitude": "site_altitude",
        "name": "site_name",
        "distance_to_nearest_tertiary_road": "site_distance_to_nearest_tertiary_road",
        "distance_to_nearest_primary_road": "site_distance_to_nearest_primary_road",
        "distance_to_nearest_road": "site_distance_to_nearest_road",
        "distance_to_nearest_residential_road": "site_distance_to_nearest_residential_road",
        "distance_to_nearest_secondary_road": "site_distance_to_nearest_secondary_road",
        "distance_to_nearest_unclassified_road": "site_distance_to_nearest_unclassified_road",
        "bearing_to_kampala_center": "site_bearing_to_kampala_center",
        "landform_90": "site_landform_90",
        "distance_to_kampala_center": "site_distance_to_kampala_center",
        "landform_270": "site_landform_270",
        "aspect": "site_aspect",
    }

    sites = AirQoApi().get_sites(tenant=tenant)
    sites_df = pd.DataFrame(sites)[keep].rename(columns=renames)
    sites_df.reset_index(drop=True, inplace=True)
    return sites_df.to_dict(orient="records")
def transform_kcca_data_for_message_broker(data: list, frequency: str) -> list:
    """Flatten raw KCCA measurement rows into per-device records for the broker.

    Each row is matched to a registered KCCA device/site by its deviceCode;
    rows that resolve to neither a site nor a device id are skipped.
    Returns a list of flat measurement dicts.
    """
    restructured_data = []
    data_df = pd.DataFrame(data)
    columns = list(data_df.columns)
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="kcca")
    for _, data_row in data_df.iterrows():
        device_name = data_row["deviceCode"]
        site_id, device_id = get_site_and_device_id(devices, device_name=device_name)
        if not site_id and not device_id:
            continue
        # coordinates arrive as a "[lon, lat]"-style string; strip the
        # brackets and split on the comma
        location = str(data_row["location.coordinates"])
        location = location.replace("[", "").replace("]", "")
        location_coordinates = location.split(",")
        device_data = dict({
            "time": frequency_time(dateStr=data_row["time"], frequency=frequency),
            "tenant": "kcca",
            "site_id": site_id,
            "device_id": device_id,
            "device_number": 0,
            "device": device_name,
            # index 1 is latitude, index 0 longitude (longitude-first order)
            "latitude": location_coordinates[1],
            "longitude": location_coordinates[0],
            "pm2_5": get_column_value(
                column="characteristics.pm2_5ConcMass.value",
                columns=columns,
                series=data_row,
            ),
            "pm10": get_column_value(
                column="characteristics.pm10ConcMass.value",
                columns=columns,
                series=data_row,
            ),
            "s1_pm2_5": get_column_value(
                column="characteristics.pm2_5ConcMass.raw",
                columns=columns,
                series=data_row,
            ),
            "s1_pm10": get_column_value(
                column="characteristics.pm10ConcMass.raw",
                columns=columns,
                series=data_row,
            ),
            # explicitly null — not populated from the KCCA payload
            "s2_pm2_5": None,
            "s2_pm10": None,
            "pm2_5_calibrated_value": get_column_value(
                column="characteristics.pm2_5ConcMass.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "pm10_calibrated_value": get_column_value(
                column="characteristics.pm10ConcMass.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "altitude": get_column_value(
                column="characteristics.altitude.value",
                columns=columns,
                series=data_row,
            ),
            "wind_speed": get_column_value(
                column="characteristics.windSpeed.value",
                columns=columns,
                series=data_row,
            ),
            "external_temperature": get_column_value(
                column="characteristics.temperature.value",
                columns=columns,
                series=data_row,
            ),
            "external_humidity": get_column_value(
                column="characteristics.relHumid.value",
                columns=columns,
                series=data_row,
            ),
        })
        restructured_data.append(device_data)
    return restructured_data
def transform_kcca_hourly_data_for_bigquery(data: list) -> list:
    """Map raw KCCA measurement rows to the BigQuery hourly-measurements schema.

    Rows whose deviceCode cannot be matched to a site are skipped. The final
    DataFrame is built against BigQueryApi().hourly_measurements_columns so
    the output records follow that schema.
    """
    restructured_data = []
    data_df = pd.DataFrame(data)
    columns = list(data_df.columns)
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="kcca")
    for _, data_row in data_df.iterrows():
        device_name = data_row["deviceCode"]
        site_id, _ = get_site_and_device_id(devices, device_name=device_name)
        if not site_id:
            continue
        # coordinates arrive as a "[lon, lat]"-style string; strip the
        # brackets and split on the comma
        location = str(data_row["location.coordinates"])
        location = location.replace("[", "").replace("]", "")
        location_coordinates = location.split(",")
        device_data = dict({
            "timestamp": str_to_date(data_row["time"]),
            "tenant": "kcca",
            "site_id": site_id,
            "device_number": 0,
            "device": device_name,
            # index 1 is latitude, index 0 longitude (longitude-first order)
            "latitude": location_coordinates[1],
            "longitude": location_coordinates[0],
            "pm2_5": get_column_value(
                column="characteristics.pm2_5ConcMass.value",
                columns=columns,
                series=data_row,
            ),
            "s1_pm2_5": get_column_value(column="s1_pm2_5", columns=columns, series=data_row),
            "s2_pm2_5": get_column_value(column="s2_pm2_5", columns=columns, series=data_row),
            "pm2_5_raw_value": get_column_value(
                column="characteristics.pm2_5ConcMass.raw",
                columns=columns,
                series=data_row,
            ),
            "pm2_5_calibrated_value": get_column_value(
                column="characteristics.pm2_5ConcMass.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "pm10": get_column_value(
                column="characteristics.pm10ConcMass.value",
                columns=columns,
                series=data_row,
            ),
            "s1_pm10": get_column_value(column="s1_pm10", columns=columns, series=data_row),
            "s2_pm10": get_column_value(column="s2_pm10", columns=columns, series=data_row),
            "pm10_raw_value": get_column_value(
                column="characteristics.pm10ConcMass.raw",
                columns=columns,
                series=data_row,
            ),
            "pm10_calibrated_value": get_column_value(
                column="characteristics.pm10ConcMass.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "no2": get_column_value(
                column="characteristics.no2Conc.value",
                columns=columns,
                series=data_row,
            ),
            "no2_raw_value": get_column_value(
                column="characteristics.no2Conc.raw",
                columns=columns,
                series=data_row,
            ),
            "no2_calibrated_value": get_column_value(
                column="characteristics.no2Conc.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "pm1": get_column_value(
                column="characteristics.pm1ConcMass.value",
                columns=columns,
                series=data_row,
            ),
            "pm1_raw_value": get_column_value(
                column="characteristics.pm1ConcMass.raw",
                columns=columns,
                series=data_row,
            ),
            "pm1_calibrated_value": get_column_value(
                column="characteristics.pm1ConcMass.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "altitude": get_column_value(
                column="characteristics.altitude.value",
                columns=columns,
                series=data_row,
            ),
            "wind_speed": get_column_value(
                column="characteristics.windSpeed.value",
                columns=columns,
                series=data_row,
            ),
            "external_temperature": get_column_value(
                column="characteristics.temperature.value",
                columns=columns,
                series=data_row,
            ),
            "external_humidity": get_column_value(
                column="characteristics.relHumid.value",
                columns=columns,
                series=data_row,
            ),
        })
        restructured_data.append(device_data)
    # building against the schema column list keeps column order and adds
    # any schema columns missing from the records as NaN
    return pd.DataFrame(
        columns=BigQueryApi().hourly_measurements_columns, data=restructured_data
    ).to_dict(orient="records")
def resample_weather_data(data: list, frequency: str):
    """Resample raw TAHMO station readings and map them to AirQo devices.

    Keeps only the te (temperature), rh (humidity) and ws (wind speed)
    variables, resamples each per station at `frequency`, outer-joins them
    on time and duplicates each station's series for every device attached
    to that station. Per-station failures are logged and skipped.
    """
    weather_raw_data = pd.DataFrame(data)
    if weather_raw_data.empty:
        return weather_raw_data.to_dict(orient="records")

    airqo_api = AirQoApi()
    sites = airqo_api.get_sites(tenant="airqo")
    # only sites that know their nearest TAHMO station can be mapped
    valid_sites = list(
        filter(lambda x: "nearest_tahmo_station" in dict(x).keys(), sites))

    # to include site id
    # devices = get_devices_or_sites(configuration.AIRQO_BASE_URL, tenant='airqo', sites=False)

    temperature = weather_raw_data.loc[
        weather_raw_data["variable"] == "te", ["value", "variable", "station", "time"]]
    humidity = weather_raw_data.loc[
        weather_raw_data["variable"] == "rh", ["value", "variable", "station", "time"]]
    wind_speed = weather_raw_data.loc[
        weather_raw_data["variable"] == "ws", ["value", "variable", "station", "time"]]

    humidity["value"] = pd.to_numeric(humidity["value"], errors="coerce")
    # scale humidity by 100 — presumably fraction -> percent; confirm the
    # upstream TAHMO units before changing
    humidity["value"] = humidity["value"].apply(lambda x: x * 100)

    data = pd.concat([temperature, humidity, wind_speed])
    data.reset_index(inplace=True)

    devices_weather_data = []
    data["value"] = pd.to_numeric(data["value"], errors="coerce", downcast="float")
    # NOTE(review): non-numeric values become 0 here, not NaN — verify this
    # is the intended treatment of missing readings
    data = data.fillna(0)
    data_station_gps = data.groupby("station")
    for _, station_group in data_station_gps:
        device_weather_data = []
        station = station_group.iloc[0]["station"]
        try:
            # resampling station values
            temperature = resample_data(
                station_group.loc[station_group["variable"] == "te", ["value", "time"]],
                frequency,
            )
            temperature.columns = ["temperature", "time"]
            humidity = resample_data(
                station_group.loc[station_group["variable"] == "rh", ["value", "time"]],
                frequency,
            )
            humidity.columns = ["humidity", "time"]
            wind_speed = resample_data(
                station_group.loc[station_group["variable"] == "ws", ["value", "time"]],
                frequency,
            )
            wind_speed.columns = ["wind_speed", "time"]

            # outer-join the three variables on their shared time axis
            data_frames = [temperature, humidity, wind_speed]
            station_df = reduce(
                lambda left, right: pd.merge(left, right, on=["time"], how="outer"),
                data_frames,
            )
            station_df["frequency"] = frequency

            # mapping device to station
            station_devices = get_device_ids_from_station(station, valid_sites)
            if len(station_devices) == 0:
                continue
            # every device at this station gets its own copy of the series
            for device_id in station_devices:
                device_station_df = station_df.copy(deep=True)
                device_station_df["device_id"] = device_id
                device_weather_data.extend(
                    device_station_df.to_dict(orient="records"))
        except Exception as ex:
            print(ex)
            traceback.print_exc()
            continue

        # to include site id
        # device_station_data_df = pd.DataFrame(device_weather_data)
        # device_station_data_df['site_id'] = device_station_data_df['device_id'].apply(
        #     lambda x: get_device_site_id(x, devices))
        # devices_weather_data.extend(device_station_data_df.to_dict(orient='records'))

        devices_weather_data.extend(device_weather_data)

    # pd.DataFrame(devices_weather_data).to_csv(path_or_buf='devices_weather.csv', index=False)
    return devices_weather_data
def transform_kcca_measurements_for_api(unclean_data) -> list:
    """Convert raw KCCA readings into the nested events payload the API expects.

    Rows are grouped per deviceCode; groups whose device resolves to neither
    a site nor a device id are dropped. The KCCA outputFrequency values
    "hour"/"day" map to "hourly"/"daily"; anything else becomes "raw".
    """
    data = pd.DataFrame(unclean_data)
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="kcca")
    device_gps = data.groupby("deviceCode")
    cleaned_measurements = []
    for _, group in device_gps:
        device_name = group.iloc[0]["deviceCode"]
        site_id, device_id = get_site_and_device_id(devices, device_name=device_name)
        if not site_id and not device_id:
            continue
        transformed_data = []
        columns = group.columns
        for index, row in group.iterrows():
            # coordinates arrive as a "[lon, lat]"-style string; strip the
            # brackets and split on the comma
            location = str(row["location.coordinates"])
            location = location.replace("[", "").replace("]", "")
            location_coordinates = location.split(",")
            # normalise the KCCA frequency labels to the API's vocabulary
            frequency = str(row.get("outputFrequency", "raw"))
            if frequency.lower() == "hour":
                frequency = "hourly"
            elif frequency.lower() == "day":
                frequency = "daily"
            else:
                frequency = "raw"
            row_data = dict({
                "frequency": frequency,
                "time": frequency_time(dateStr=row.get("time"), frequency=frequency),
                "tenant": "kcca",
                "site_id": site_id,
                "device_id": device_id,
                "device": row["deviceCode"],
                "location": dict({
                    "longitude": dict({"value": to_double(location_coordinates[0])}),
                    "latitude": dict({"value": to_double(location_coordinates[1])}),
                }),
                # note the raw/value naming flip: the API "value" comes from
                # the KCCA ".raw" column and "calibratedValue" from ".value"
                "pm2_5": {
                    "value": get_valid_column_value(
                        column_name="characteristics.pm2_5ConcMass.raw",
                        series=row,
                        columns_names=columns,
                        data_name="pm2_5",
                    ),
                    "calibratedValue": get_valid_column_value(
                        column_name="characteristics.pm2_5ConcMass.value",
                        series=row,
                        columns_names=columns,
                        data_name="pm2_5",
                    ),
                },
                "pm1": {
                    "value": get_valid_column_value(
                        column_name="characteristics.pm1ConcMass.raw",
                        series=row,
                        columns_names=columns,
                        data_name=None,
                    ),
                    "calibratedValue": get_valid_column_value(
                        column_name="characteristics.pm1ConcMass.value",
                        series=row,
                        columns_names=columns,
                        data_name=None,
                    ),
                },
                "pm10": {
                    "value": get_valid_column_value(
                        column_name="characteristics.pm10ConcMass.raw",
                        series=row,
                        columns_names=columns,
                        data_name="pm10",
                    ),
                    "calibratedValue": get_valid_column_value(
                        column_name="characteristics.pm10ConcMass.value",
                        series=row,
                        columns_names=columns,
                        data_name="pm10",
                    ),
                },
                "externalTemperature": {
                    "value": get_valid_column_value(
                        column_name="characteristics.temperature.value",
                        series=row,
                        columns_names=columns,
                        data_name="externalTemperature",
                    ),
                },
                "externalHumidity": {
                    "value": get_valid_column_value(
                        column_name="characteristics.relHumid.value",
                        series=row,
                        columns_names=columns,
                        data_name="externalHumidity",
                    ),
                },
                "no2": {
                    "value": get_valid_column_value(
                        column_name="characteristics.no2Conc.raw",
                        series=row,
                        columns_names=columns,
                        data_name=None,
                    ),
                    "calibratedValue": get_valid_column_value(
                        column_name="characteristics.no2Conc.value",
                        series=row,
                        columns_names=columns,
                        data_name=None,
                    ),
                },
                "speed": {
                    "value": get_valid_column_value(
                        column_name="characteristics.windSpeed.value",
                        series=row,
                        columns_names=columns,
                        data_name=None,
                    ),
                },
            })
            transformed_data.append(row_data)
        if transformed_data:
            cleaned_measurements.extend(transformed_data)
    return cleaned_measurements
def extract_airqo_data_from_thingspeak(
    start_time: str, end_time: str, all_devices: bool
) -> list:
    """Download raw AirQo channel feeds from ThingSpeak for the given window.

    The window is walked in frequency-sized chunks per device channel.
    field1..field7 carry sensor readings; field8, when present, is a
    comma-separated bundle of 12 extra values that overrides/extends them.
    Per-device failures are logged and skipped; records outside the window
    are removed before returning.
    """
    thingspeak_base_url = configuration.THINGSPEAK_CHANNEL_URL
    airqo_api = AirQoApi()
    airqo_devices = airqo_api.get_devices(tenant="airqo", all_devices=all_devices)
    read_keys = airqo_api.get_read_keys(devices=airqo_devices)
    channels_data = []
    frequency = get_frequency(start_time=start_time, end_time=end_time)

    def get_field_8_value(x: str, position: int):
        # field8 is a comma-separated bundle; return the value at `position`
        # or None when the bundle is missing/short
        try:
            values = x.split(",")
            return values[position]
        except Exception as exc:
            print(exc)
            return None

    dates = pd.date_range(start_time, end_time, freq=frequency)
    last_date_time = dates.values[len(dates.values) - 1]
    for device in airqo_devices:
        try:
            channel_id = str(device["device_number"])
            for date in dates:
                start = date_to_str(date)
                end_date_time = date + timedelta(hours=dates.freq.n)
                # clamp the final chunk to the requested end of window
                if np.datetime64(end_date_time) > last_date_time:
                    end = end_time
                else:
                    end = date_to_str(end_date_time)

                read_key = read_keys[str(channel_id)]
                channel_url = f"{thingspeak_base_url}{channel_id}/feeds.json?start={start}&end={end}&api_key={read_key}"
                print(f"{channel_url}")

                data = json.loads(
                    requests.get(channel_url, timeout=100.0).content.decode("utf-8")
                )
                if (data != -1) and ("feeds" in data):
                    dataframe = pd.DataFrame(data["feeds"])
                    if dataframe.empty:
                        print(
                            f"{channel_id} does not have data between {start} and {end}"
                        )
                        continue

                    channel_df = pd.DataFrame(
                        data=[],
                        columns=[
                            "time",
                            "s1_pm2_5",
                            "s2_pm2_5",
                            "s1_pm10",
                            "device_id",
                            "site_id",
                            "s2_pm10",
                            "latitude",
                            "longitude",
                            "altitude",
                            "wind_speed",
                            "satellites",
                            "hdop",
                            "internalTemperature",
                            "internalHumidity",
                            "battery",
                            "temperature",
                            "humidity",
                            "pressure",
                            "externalAltitude",
                        ],
                    )

                    # fields 1-7 hold the primary sensor readings
                    channel_df["s1_pm2_5"] = dataframe["field1"].apply(
                        lambda x: get_valid_value(x, "pm2_5")
                    )
                    channel_df["s1_pm10"] = dataframe["field2"].apply(
                        lambda x: get_valid_value(x, "pm10")
                    )
                    channel_df["s2_pm2_5"] = dataframe["field3"].apply(
                        lambda x: get_valid_value(x, "pm2_5")
                    )
                    channel_df["s2_pm10"] = dataframe["field4"].apply(
                        lambda x: get_valid_value(x, "pm10")
                    )
                    channel_df["latitude"] = dataframe["field5"].apply(
                        lambda x: get_valid_value(x, "latitude")
                    )
                    channel_df["longitude"] = dataframe["field6"].apply(
                        lambda x: get_valid_value(x, "longitude")
                    )
                    channel_df["battery"] = dataframe["field7"].apply(
                        lambda x: get_valid_value(x, "battery")
                    )

                    # field8, when present, bundles 12 extra values and
                    # overrides latitude/longitude from fields 5/6
                    if "field8" in dataframe.columns:
                        try:
                            channel_df["latitude"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 0), "latitude"
                                )
                            )
                            channel_df["longitude"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 1), "longitude"
                                )
                            )
                            channel_df["altitude"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 2), "altitude"
                                )
                            )
                            channel_df["wind_speed"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 3), "wind_speed"
                                )
                            )
                            channel_df["satellites"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 4), "satellites"
                                )
                            )
                            channel_df["hdop"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 5), "hdop"
                                )
                            )
                            channel_df["internalTemperature"] = dataframe[
                                "field8"
                            ].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 6), "externalTemperature"
                                )
                            )
                            channel_df["internalHumidity"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 7), "externalHumidity"
                                )
                            )
                            channel_df["temperature"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 8), "externalTemperature"
                                )
                            )
                            channel_df["humidity"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 9), "externalHumidity"
                                )
                            )
                            channel_df["pressure"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 10), "pressure"
                                )
                            )
                            channel_df["externalAltitude"] = dataframe["field8"].apply(
                                lambda x: get_valid_value(
                                    get_field_8_value(x, 11), "altitude"
                                )
                            )
                        except Exception as ex:
                            traceback.print_exc()
                            print(ex)

                    channel_df["time"] = dataframe["created_at"]
                    channel_df["device_id"] = device["_id"]
                    # NOTE(review): devices without a site will raise KeyError
                    # here and be skipped by the outer except — confirm intended
                    channel_df["site_id"] = device["site"]["_id"]
                    channel_df["device_number"] = device["device_number"]
                    channel_df["device"] = device["name"]
                    channel_df["frequency"] = "raw"

                    channels_data.extend(channel_df.to_dict(orient="records"))
        except Exception as ex:
            print(ex)
            traceback.print_exc()

    channel_data_df = pd.DataFrame(channels_data)
    clean_channel_data_df = remove_invalid_dates(
        dataframe=channel_data_df, start_time=start_time, end_time=end_time
    )
    return clean_channel_data_df.to_dict(orient="records")