def merge_data(averaged_airqo_data: dict, weather_data: dict):
    from airqo_etl_utils.commons import un_fill_nan, fill_nan
    from airqo_etl_utils.airqo_utils import merge_airqo_and_weather_data

    hourly_airqo_data = un_fill_nan(averaged_airqo_data.get("data"))
    hourly_weather_data = un_fill_nan(weather_data.get("data"))

    merged_measurements = merge_airqo_and_weather_data(
        airqo_data=hourly_airqo_data, weather_data=hourly_weather_data
    )

    return {"data": fill_nan(data=merged_measurements)}

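# Note on the {"data": ...} convention used by every task in this module:
# these callables exchange records through Airflow XCom, which serialises
# payloads to JSON, and NaN is not valid JSON. We assume this is what
# fill_nan/un_fill_nan in airqo_etl_utils.commons take care of. The pair
# below is a minimal, hypothetical stand-in (not the library's actual
# implementation) that illustrates the round trip:


def _fill_nan_sketch(records: list) -> list:
    import math

    # Replace float NaN with a JSON-safe None before the XCom push.
    return [
        {key: None if isinstance(value, float) and math.isnan(value) else value
         for key, value in row.items()}
        for row in records
    ]


def _un_fill_nan_sketch(records: list) -> list:
    # Restore placeholders to NaN so pandas treats them as missing again.
    return [
        {key: float("nan") if value is None else value for key, value in row.items()}
        for row in records
    ]
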
def load(inputs: dict):
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_api import AirQoApi

    kcca_data = un_fill_nan(inputs.get("data"))

    airqo_api = AirQoApi()
    airqo_api.save_events(measurements=kcca_data, tenant="kcca")

def average_data_by_hour(raw_data: dict):
    from airqo_etl_utils.airqo_utils import average_airqo_data
    from airqo_etl_utils.commons import fill_nan, un_fill_nan

    raw_airqo_data = un_fill_nan(raw_data.get("data"))
    average_data = average_airqo_data(data=raw_airqo_data, frequency="hourly")

    return {"data": fill_nan(data=average_data)}

def transform(inputs: dict):
    from airqo_etl_utils.kcca_utils import transform_kcca_measurements_for_api
    from airqo_etl_utils.commons import un_fill_nan, fill_nan

    data = un_fill_nan(inputs.get("data"))
    cleaned_data = transform_kcca_measurements_for_api(data)

    return {"data": fill_nan(data=cleaned_data)}

def load(insights_data: dict):
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.app_insights_utils import save_insights_data

    empty_insights_data = un_fill_nan(insights_data.get("data"))

    save_insights_data(
        insights_data=empty_insights_data, action="insert", partition=2
    )

def average_data(inputs: dict):
    from airqo_etl_utils.commons import un_fill_nan, fill_nan
    from airqo_etl_utils.airqo_utils import average_airqo_measurements

    data = un_fill_nan(inputs.get("data"))
    averaged_data = average_airqo_measurements(data=data, frequency="daily")

    return {"data": fill_nan(data=averaged_data)}

def load(kcca_data: dict, **kwargs):
    from airqo_etl_utils.kcca_utils import (
        transform_kcca_measurements_for_api,
        transform_kcca_hourly_data_for_bigquery,
        transform_kcca_data_for_message_broker,
    )
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_api import AirQoApi
    from airqo_etl_utils.message_broker import KafkaBrokerClient
    from airqo_etl_utils.bigquery_api import BigQueryApi
    from airqo_etl_utils.config import configuration

    data = un_fill_nan(kcca_data.get("data"))

    # Destination is supplied at trigger time via dag_run.conf; default to
    # BigQuery when no conf is present. AttributeError is caught as well so
    # that a missing dag_run (kwargs.get returning None) also falls back.
    try:
        dag_run = kwargs.get("dag_run")
        destination = dag_run.conf["destination"]
    except (KeyError, AttributeError):
        destination = "bigquery"

    if destination == "bigquery":
        kcca_transformed_data = transform_kcca_hourly_data_for_bigquery(data)

        big_query_api = BigQueryApi()
        big_query_api.save_data(
            data=kcca_transformed_data,
            table=big_query_api.hourly_measurements_table,
        )

    elif destination == "message-broker":
        kcca_transformed_data = transform_kcca_data_for_message_broker(
            data=data, frequency="hourly"
        )

        info = {"data": kcca_transformed_data, "action": "insert", "tenant": "kcca"}

        kafka = KafkaBrokerClient()
        kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)

    elif destination == "api":
        kcca_transformed_data = transform_kcca_measurements_for_api(data)

        airqo_api = AirQoApi()
        airqo_api.save_events(measurements=kcca_transformed_data, tenant="kcca")

    else:
        raise Exception(
            "Invalid data destination. Valid values are bigquery, message-broker and api"
        )

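# How the destination above is supplied in practice: dag_run.conf is populated
# when the DAG is triggered manually or via the REST API. A sketch (the DAG id
# here is hypothetical):
#
#   airflow dags trigger kcca_hourly_measurements --conf '{"destination": "api"}'
#
# Scheduled runs carry no conf, so the except branch falls back to "bigquery".
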
def load(data: dict):
    from airqo_etl_utils.app_insights_utils import (
        save_insights_data,
        create_insights_data,
    )
    from airqo_etl_utils.commons import un_fill_nan

    insights_list = un_fill_nan(data.get("data"))
    insights_data = create_insights_data(data=insights_list)

    save_insights_data(insights_data=insights_data, action="save")

def calibrate(inputs: dict):
    from airqo_etl_utils.commons import un_fill_nan, fill_nan
    from airqo_etl_utils.airqo_utils import calibrate_hourly_airqo_measurements

    data = un_fill_nan(inputs.get("data"))
    airqo_calibrated_data = calibrate_hourly_airqo_measurements(measurements=data)

    return {"data": fill_nan(data=airqo_calibrated_data)}

def filter_insights(empty_insights_data: dict, available_insights_data: dict):
    from airqo_etl_utils.commons import fill_nan, un_fill_nan
    import pandas as pd

    insights_data_df = pd.DataFrame(
        data=un_fill_nan(available_insights_data.get("data"))
    )
    empty_insights_data_df = pd.DataFrame(
        data=un_fill_nan(empty_insights_data.get("data"))
    )

    # Concatenating both frames and dropping every duplicated
    # (siteId, time, frequency) key keeps only rows that appear in exactly
    # one of the two inputs.
    insights_data = pd.concat(
        [empty_insights_data_df, insights_data_df]
    ).drop_duplicates(keep=False, subset=["siteId", "time", "frequency"])

    return {"data": fill_nan(data=insights_data.to_dict(orient="records"))}

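# filter_insights above leans on a pandas idiom that is easy to misread:
# concatenating two frames and then dropping *all* rows whose key is
# duplicated (keep=False) yields the symmetric difference on that key. Rows
# present in both inputs cancel out; only unmatched rows survive. A small,
# self-contained example with illustrative data:


def _anti_join_sketch():
    import pandas as pd

    left = pd.DataFrame(
        {"siteId": ["a", "b"], "time": [1, 2], "frequency": ["hourly", "hourly"]}
    )
    right = pd.DataFrame({"siteId": ["b"], "time": [2], "frequency": ["hourly"]})

    diff = pd.concat([left, right]).drop_duplicates(
        keep=False, subset=["siteId", "time", "frequency"]
    )
    # diff now holds only the ("a", 1, "hourly") row: the ("b", 2, "hourly")
    # key appeared in both frames, so both copies were dropped.
    return diff
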
def send_raw_measurements_to_api(airqo_data: dict):
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_utils import restructure_airqo_data
    from airqo_etl_utils.airqo_api import AirQoApi

    data = un_fill_nan(airqo_data.get("data"))
    airqo_restructured_data = restructure_airqo_data(data=data, destination="api")

    airqo_api = AirQoApi()
    airqo_api.save_events(measurements=airqo_restructured_data, tenant="airqo")

def load(forecast: dict, transformed_forecast: dict):
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.app_insights_utils import save_insights_data
    import pandas as pd

    forecast_insights_data = un_fill_nan(forecast.get("data"))
    transformed_forecast_data = un_fill_nan(transformed_forecast.get("data"))

    forecast_insights_data_df = pd.DataFrame(forecast_insights_data)
    transformed_forecast_data_df = pd.DataFrame(transformed_forecast_data)

    insights_data = pd.concat(
        [forecast_insights_data_df, transformed_forecast_data_df],
        ignore_index=True,
    )

    save_insights_data(
        insights_data=insights_data.to_dict(orient="records"),
        action="save",
        partition=1,
    )

def send_hourly_measurements_to_api(inputs: dict):
    from airqo_etl_utils.kcca_utils import transform_kcca_measurements_for_api
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_api import AirQoApi

    data = un_fill_nan(inputs.get("data"))
    kcca_data = transform_kcca_measurements_for_api(data)

    airqo_api = AirQoApi()
    airqo_api.save_events(measurements=kcca_data, tenant="kcca")

def send_hourly_measurements_to_bigquery(kcca_data: dict):
    from airqo_etl_utils.kcca_utils import transform_kcca_hourly_data_for_bigquery
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.bigquery_api import BigQueryApi

    data = un_fill_nan(kcca_data.get("data"))
    kcca_restructured_data = transform_kcca_hourly_data_for_bigquery(data)

    big_query_api = BigQueryApi()
    big_query_api.save_data(
        data=kcca_restructured_data,
        table=big_query_api.hourly_measurements_table,
    )

def send_raw_measurements_to_bigquery(airqo_data: dict):
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_utils import restructure_airqo_data
    from airqo_etl_utils.bigquery_api import BigQueryApi

    data = un_fill_nan(airqo_data.get("data"))
    airqo_restructured_data = restructure_airqo_data(
        data=data, destination="bigquery"
    )

    big_query_api = BigQueryApi()
    big_query_api.save_raw_measurements(airqo_restructured_data)

def map_site_ids(airqo_data: dict, deployment_logs: dict):
    from airqo_etl_utils.commons import un_fill_nan, fill_nan
    from airqo_etl_utils.airqo_utils import map_site_ids_to_historical_measurements

    data = un_fill_nan(airqo_data.get("data"))
    logs = deployment_logs.get("data")

    restructured_data = map_site_ids_to_historical_measurements(
        data=data, deployment_logs=logs
    )

    return {"data": fill_nan(data=restructured_data)}

def save_to_bigquery(inputs: dict):
    from airqo_etl_utils.bigquery_api import BigQueryApi
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.weather_data_utils import (
        transform_weather_data_for_bigquery,
    )

    weather_data = un_fill_nan(inputs.get("data"))
    bigquery_data = transform_weather_data_for_bigquery(data=weather_data)

    big_query_api = BigQueryApi()
    big_query_api.save_data(
        data=bigquery_data, table=big_query_api.hourly_weather_table
    )

def load(airqo_data: dict, **kwargs):
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.bigquery_api import BigQueryApi
    from airqo_etl_utils.airqo_api import AirQoApi
    from airqo_etl_utils.airqo_utils import restructure_airqo_data
    from airqo_etl_utils.config import configuration
    from airqo_etl_utils.message_broker import KafkaBrokerClient

    data = un_fill_nan(airqo_data.get("data"))

    # As in the kcca load() above: read the destination from dag_run.conf and
    # default to BigQuery. AttributeError is caught too in case dag_run is
    # absent from kwargs.
    try:
        dag_run = kwargs.get("dag_run")
        destination = dag_run.conf["destination"]
    except (KeyError, AttributeError):
        destination = "bigquery"

    if destination == "bigquery":
        airqo_restructured_data = restructure_airqo_data(
            data=data, destination="bigquery"
        )

        big_query_api = BigQueryApi()
        big_query_api.save_data(
            data=airqo_restructured_data,
            table=big_query_api.hourly_measurements_table,
        )

    elif destination == "message-broker":
        airqo_restructured_data = restructure_airqo_data(
            data=data, destination="message-broker"
        )

        info = {
            "data": airqo_restructured_data,
            "action": "insert",
            "tenant": "airqo",
        }

        kafka = KafkaBrokerClient()
        kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)

    elif destination == "api":
        airqo_restructured_data = restructure_airqo_data(
            data=data, destination="api"
        )

        airqo_api = AirQoApi()
        airqo_api.save_events(measurements=airqo_restructured_data, tenant="airqo")

    else:
        raise Exception(
            "Invalid data destination. Valid values are bigquery, message-broker and api"
        )

def update_app_insights(airqo_data: dict):
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_utils import restructure_airqo_data
    from airqo_etl_utils.message_broker import KafkaBrokerClient
    from airqo_etl_utils.config import configuration

    data = un_fill_nan(airqo_data.get("data"))
    insights_data = restructure_airqo_data(data=data, destination="app-insights")

    info = {"data": insights_data, "action": "save"}

    kafka = KafkaBrokerClient()
    kafka.send_data(
        info=info, topic=configuration.INSIGHTS_MEASUREMENTS_TOPIC, partition=0
    )

def send_hourly_measurements_to_message_broker(airqo_data: dict):
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.config import configuration
    from airqo_etl_utils.message_broker import KafkaBrokerClient
    from airqo_etl_utils.airqo_utils import restructure_airqo_data

    data = un_fill_nan(airqo_data.get("data"))
    airqo_restructured_data = restructure_airqo_data(
        data=data, destination="message-broker"
    )

    info = {"data": airqo_restructured_data, "action": "insert", "tenant": "airqo"}

    kafka = KafkaBrokerClient()
    kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)

def send_hourly_measurements_to_message_broker(airqo_data: dict):
    from airqo_etl_utils.kcca_utils import transform_kcca_data_for_message_broker
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.config import configuration
    from airqo_etl_utils.message_broker import KafkaBrokerClient

    data = un_fill_nan(airqo_data.get("data"))
    kcca_restructured_data = transform_kcca_data_for_message_broker(
        data=data, frequency="hourly"
    )

    info = {"data": kcca_restructured_data, "action": "insert", "tenant": "kcca"}

    kafka = KafkaBrokerClient()
    kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)

def map_site_ids_to_historical_measurements(data: list, deployment_logs: list) -> list:
    # Imports added so the function is self-contained; get_device and
    # str_to_date are assumed to live in airqo_etl_utils.commons alongside
    # un_fill_nan, matching the tasks above.
    import pandas as pd

    from airqo_etl_utils.airqo_api import AirQoApi
    from airqo_etl_utils.commons import get_device, str_to_date, un_fill_nan

    if not deployment_logs or not data:
        return data

    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="airqo")
    mapped_data = []

    devices_logs_df = pd.DataFrame(deployment_logs)
    devices_logs_df["start_time"] = devices_logs_df["start_time"].apply(
        lambda x: str_to_date(x)
    )
    devices_logs_df["end_time"] = devices_logs_df["end_time"].apply(
        lambda x: str_to_date(x)
    )

    data = un_fill_nan(data)
    data_df = pd.DataFrame(data)

    for _, data_row in data_df.iterrows():
        device = get_device(devices, device_id=data_row["device_id"])

        if not device:
            continue

        # Default to the device's current site; override it below if the
        # measurement falls inside a historical deployment window.
        site_id = device.get("site").get("_id")
        time = str_to_date(data_row["time"])
        device_logs = devices_logs_df[
            devices_logs_df["device_id"] == device.get("_id")
        ]

        if not device_logs.empty:
            for _, log in device_logs.iterrows():
                if log["start_time"] <= time <= log["end_time"]:
                    site_id = log["site_id"]

        data_row["site_id"] = site_id
        mapped_data.append(data_row.to_dict())

    return mapped_data
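

# For reference, the deployment-log records that
# map_site_ids_to_historical_measurements expects look roughly like the
# sketch below (field names inferred from the lookups above; values are
# illustrative only):
#
#   {
#       "device_id": "device_mongo_id",
#       "site_id": "site_mongo_id",
#       "start_time": "2022-01-01T00:00:00Z",
#       "end_time": "2022-02-01T00:00:00Z",
#   }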