def query_hourly_measurements(start_date_time: str, end_date_time: str) -> list:
    """Fetch hourly device measurements from BigQuery as a list of records.

    Columns ``latitude``/``longitude``/``device`` are renamed to the
    ``device_*`` naming used downstream. An empty result still yields a
    record list with the expected schema (i.e. an empty list).
    """
    big_query_api = BigQueryApi()
    measurements = big_query_api.get_hourly_data(
        start_date_time=start_date_time,
        end_date_time=end_date_time,
        columns=hourly_measurements_columns,
        table=big_query_api.hourly_measurements_table,
    )

    if measurements.empty:
        # keep the schema stable when the query returns no rows
        measurements = pd.DataFrame(data=[], columns=hourly_measurements_columns)

    measurements = measurements.rename(
        columns={
            "latitude": "device_latitude",
            "longitude": "device_longitude",
            "device": "device_name",
        }
    )
    return measurements.to_dict(orient="records")
def load(kcca_data: dict, **kwargs):
    """Load transformed KCCA measurements into the configured destination.

    The destination is read from the Airflow ``dag_run`` conf (key
    ``destination``); when absent it defaults to ``bigquery``. Valid values
    are ``bigquery``, ``message-broker`` and ``api``.

    :param kcca_data: dict whose ``data`` key holds the extracted measurements.
    :param kwargs: Airflow context; may contain ``dag_run``.
    :raises Exception: if the resolved destination is not one of the valid values.
    """
    from airqo_etl_utils.kcca_utils import (
        transform_kcca_measurements_for_api,
        transform_kcca_hourly_data_for_bigquery,
        transform_kcca_data_for_message_broker,
    )
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_api import AirQoApi
    from airqo_etl_utils.message_broker import KafkaBrokerClient
    from airqo_etl_utils.bigquery_api import BigQueryApi
    from airqo_etl_utils.config import configuration

    data = un_fill_nan(kcca_data.get("data"))

    # BUGFIX: kwargs.get("dag_run") may be None (e.g. outside a triggered
    # run), in which case accessing .conf raises AttributeError/TypeError,
    # which the original `except KeyError` did not catch. Catch all three
    # so the function falls back to the bigquery default instead of crashing.
    try:
        dag_run = kwargs.get("dag_run")
        destination = dag_run.conf["destination"]
    except (KeyError, AttributeError, TypeError):
        destination = "bigquery"

    if destination == "bigquery":
        kcca_transformed_data = transform_kcca_hourly_data_for_bigquery(data)
        big_query_api = BigQueryApi()
        big_query_api.save_data(
            data=kcca_transformed_data,
            table=big_query_api.hourly_measurements_table,
        )
    elif destination == "message-broker":
        kcca_transformed_data = transform_kcca_data_for_message_broker(
            data=data, frequency="hourly"
        )
        info = {
            "data": kcca_transformed_data,
            "action": "insert",
            "tenant": "kcca",
        }
        kafka = KafkaBrokerClient()
        kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)
    elif destination == "api":
        kcca_transformed_data = transform_kcca_measurements_for_api(data)
        airqo_api = AirQoApi()
        airqo_api.save_events(measurements=kcca_transformed_data, tenant="kcca")
    else:
        raise Exception(
            "Invalid data destination. Valid values are bigquery, message-broker and api"
        )
def send_hourly_measurements_to_bigquery(kcca_data: dict):
    """Transform hourly KCCA measurements and persist them to BigQuery.

    :param kcca_data: dict whose ``data`` key holds the extracted measurements.
    """
    from airqo_etl_utils.kcca_utils import transform_kcca_hourly_data_for_bigquery
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.bigquery_api import BigQueryApi

    measurements = un_fill_nan(kcca_data.get("data"))
    bigquery_ready_data = transform_kcca_hourly_data_for_bigquery(measurements)

    api = BigQueryApi()
    api.save_data(data=bigquery_ready_data, table=api.hourly_measurements_table)
def send_raw_measurements_to_bigquery(airqo_data: dict):
    """Restructure raw AirQo measurements and persist them to BigQuery.

    :param airqo_data: dict whose ``data`` key holds the extracted measurements.
    """
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.airqo_utils import restructure_airqo_data
    from airqo_etl_utils.bigquery_api import BigQueryApi

    measurements = un_fill_nan(airqo_data.get("data"))
    bigquery_ready_data = restructure_airqo_data(
        data=measurements, destination="bigquery"
    )
    BigQueryApi().save_raw_measurements(bigquery_ready_data)
def query_hourly_weather_data(start_date_time: str, end_date_time: str) -> list:
    """Fetch hourly weather measurements from BigQuery as a list of records.

    Returns an empty list (with the expected schema applied) when the query
    matches no rows.
    """
    big_query_api = BigQueryApi()
    weather = big_query_api.get_hourly_data(
        start_date_time=start_date_time,
        end_date_time=end_date_time,
        columns=hourly_weather_columns,
        table=big_query_api.hourly_weather_table,
    )

    if weather.empty:
        # preserve the column schema even for an empty result set
        weather = pd.DataFrame(data=[], columns=hourly_weather_columns)

    return weather.to_dict(orient="records")
def save_to_bigquery(inputs: dict):
    """Transform weather data and persist it to the hourly weather table.

    :param inputs: dict whose ``data`` key holds the weather measurements.
    """
    from airqo_etl_utils.bigquery_api import BigQueryApi
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.weather_data_utils import transform_weather_data_for_bigquery

    raw_weather = un_fill_nan(inputs.get("data"))
    transformed = transform_weather_data_for_bigquery(data=raw_weather)

    api = BigQueryApi()
    api.save_data(data=transformed, table=api.hourly_weather_table)
def load(airqo_data: dict, **kwargs):
    """Load restructured AirQo measurements into the configured destination.

    The destination is read from the Airflow ``dag_run`` conf (key
    ``destination``); when absent it defaults to ``bigquery``. Valid values
    are ``bigquery``, ``message-broker`` and ``api``.

    :param airqo_data: dict whose ``data`` key holds the measurements.
    :param kwargs: Airflow context; may contain ``dag_run``.
    :raises Exception: if the resolved destination is not one of the valid values.
    """
    from airqo_etl_utils.commons import un_fill_nan
    from airqo_etl_utils.bigquery_api import BigQueryApi
    from airqo_etl_utils.airqo_api import AirQoApi
    from airqo_etl_utils.airqo_utils import restructure_airqo_data
    from airqo_etl_utils.config import configuration
    from airqo_etl_utils.message_broker import KafkaBrokerClient

    data = un_fill_nan(airqo_data.get("data"))

    # BUGFIX: kwargs.get("dag_run") may be None (e.g. outside a triggered
    # run), so .conf access raises AttributeError/TypeError, which the
    # original `except KeyError` did not cover — the task crashed instead
    # of defaulting to bigquery. Catch all three failure modes.
    try:
        dag_run = kwargs.get("dag_run")
        destination = dag_run.conf["destination"]
    except (KeyError, AttributeError, TypeError):
        destination = "bigquery"

    if destination == "bigquery":
        airqo_restructured_data = restructure_airqo_data(
            data=data, destination="bigquery"
        )
        big_query_api = BigQueryApi()
        big_query_api.save_data(
            data=airqo_restructured_data,
            table=big_query_api.hourly_measurements_table,
        )
    elif destination == "message-broker":
        airqo_restructured_data = restructure_airqo_data(
            data=data, destination="message-broker"
        )
        info = {
            "data": airqo_restructured_data,
            "action": "insert",
            "tenant": "airqo",
        }
        kafka = KafkaBrokerClient()
        kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)
    elif destination == "api":
        airqo_restructured_data = restructure_airqo_data(
            data=data, destination="api"
        )
        airqo_api = AirQoApi()
        airqo_api.save_events(measurements=airqo_restructured_data, tenant="airqo")
    else:
        raise Exception(
            "Invalid data destination. Valid values are bigquery, message-broker and api"
        )
def data_warehouse(start_date_time: str, end_date_time: str):
    """Dry-run of the data-warehouse pipeline, writing each stage to CSV.

    Queries hourly measurements, hourly weather data and site metadata,
    merges them, validates the merged frame against the analytics schema,
    and writes every intermediate plus the final result to CSV files in
    the working directory.
    """
    from airqo_etl_utils.data_warehouse_utils import (
        query_hourly_measurements,
        query_hourly_weather_data,
        extract_sites_meta_data,
        merge_measurements_weather_sites,
    )
    from airqo_etl_utils.bigquery_api import BigQueryApi

    def snapshot(records, file_name: str):
        # persist an intermediate output for inspection
        pd.DataFrame(records).to_csv(path_or_buf=file_name, index=False)

    device_measurements = query_hourly_measurements(
        start_date_time=start_date_time,
        end_date_time=end_date_time,
    )
    snapshot(device_measurements, "hourly_device_measurements.csv")

    weather_measurements = query_hourly_weather_data(
        start_date_time=start_date_time,
        end_date_time=end_date_time,
    )
    snapshot(weather_measurements, "hourly_weather_measurements.csv")

    sites = extract_sites_meta_data()
    snapshot(sites, "sites_meta_data.csv")

    merged = merge_measurements_weather_sites(
        measurements_data=device_measurements,
        weather_data=weather_measurements,
        sites=sites,
    )

    bigquery_api = BigQueryApi()
    validated = bigquery_api.validate_data(
        dataframe=pd.DataFrame(merged),
        columns=bigquery_api.analytics_columns,
        numeric_columns=bigquery_api.analytics_numeric_columns,
        table=bigquery_api.analytics_table,
    )
    validated.to_csv(path_or_buf="data_warehouse.csv", index=False)
def create_insights_data_from_bigquery(start_date_time: str, end_date_time: str) -> list:
    """Build app-insights records from hourly BigQuery measurements.

    Adds the ``forecast``/``empty``/``frequency`` flags expected by the
    insights consumer and renames ``site_id``/``timestamp`` to the
    camel-case names it uses.
    """
    from airqo_etl_utils.bigquery_api import BigQueryApi

    bigquery_api = BigQueryApi()
    insights = bigquery_api.get_hourly_data(
        start_date_time=start_date_time,
        end_date_time=end_date_time,
        columns=["pm2_5", "pm10", "site_id", "timestamp"],
        table=bigquery_api.hourly_measurements_table,
    )

    insights = insights.assign(forecast=False, empty=False, frequency="hourly")
    insights = insights.rename(columns={"site_id": "siteId", "timestamp": "time"})
    return insights.to_dict(orient="records")
def kcca_hourly_measurements(start_date_time: str, end_date_time: str):
    """Dry-run of the KCCA hourly pipeline, writing each stage to CSV.

    Extracts raw KCCA measurements, then produces the API, message-broker
    and BigQuery variants, validating the BigQuery frame against the
    hourly-measurements schema. Each output is written to a CSV file.
    """
    from airqo_etl_utils.kcca_utils import (
        extract_kcca_measurements,
        transform_kcca_measurements_for_api,
        transform_kcca_data_for_message_broker,
        transform_kcca_hourly_data_for_bigquery,
    )
    from airqo_etl_utils.bigquery_api import BigQueryApi

    def snapshot(records, file_name: str):
        # persist an intermediate output for inspection
        pd.DataFrame(records).to_csv(path_or_buf=file_name, index=False)

    unclean_data = extract_kcca_measurements(
        start_time=start_date_time, end_time=end_date_time, freq="hourly"
    )
    snapshot(unclean_data, "outputs/kcca_unclean_data.csv")

    # API
    snapshot(
        transform_kcca_measurements_for_api(unclean_data),
        "kcca_cleaned_data.csv",
    )

    # Message Broker
    snapshot(
        transform_kcca_data_for_message_broker(unclean_data, frequency="hourly"),
        "kcca_message_broker_data.csv",
    )

    # Big Query
    bigquery_api = BigQueryApi()
    validated = bigquery_api.validate_data(
        dataframe=pd.DataFrame(
            transform_kcca_hourly_data_for_bigquery(data=unclean_data)
        ),
        columns=bigquery_api.hourly_measurements_columns,
        numeric_columns=bigquery_api.hourly_measurements_numeric_columns,
        table=bigquery_api.hourly_measurements_table,
    )
    validated.to_csv(path_or_buf="kcca_data_for_bigquery.csv", index=False)
def weather_data(start_date_time: str, end_date_time: str):
    """Dry-run of the weather pipeline, writing each stage to CSV.

    Extracts raw weather data from TAHMO, resamples it to hourly, attaches
    site information, and validates the BigQuery-shaped frame against the
    hourly-weather schema. Each stage's output is written to a CSV file.
    """
    from airqo_etl_utils.weather_data_utils import (
        resample_weather_data,
        query_weather_data_from_tahmo,
        add_site_info_to_weather_data,
        transform_weather_data_for_bigquery,
    )
    from airqo_etl_utils.bigquery_api import BigQueryApi

    def snapshot(records, file_name: str):
        # persist an intermediate output for inspection
        pd.DataFrame(records).to_csv(path_or_buf=file_name, index=False)

    raw = query_weather_data_from_tahmo(
        start_date_time=start_date_time, end_date_time=end_date_time
    )
    snapshot(raw, "raw_weather_data.csv")

    hourly = resample_weather_data(data=raw, frequency="hourly")
    snapshot(hourly, "hourly_weather_data.csv")

    with_sites = add_site_info_to_weather_data(data=hourly)
    snapshot(with_sites, "sites_weather_data.csv")

    bigquery_api = BigQueryApi()
    validated = bigquery_api.validate_data(
        dataframe=pd.DataFrame(transform_weather_data_for_bigquery(data=with_sites)),
        columns=bigquery_api.hourly_weather_columns,
        numeric_columns=bigquery_api.hourly_weather_numeric_columns,
        table=bigquery_api.hourly_weather_table,
    )
    validated.to_csv(path_or_buf="bigquery_weather_data.csv", index=False)
def airqo_hourly_measurements(start_date_time: str, end_date_time: str):
    """Dry-run of the AirQo hourly pipeline, writing each stage to CSV.

    Extracts device data from ThingSpeak, averages it hourly, merges in
    TAHMO weather data, calibrates the merged measurements, and emits the
    API, BigQuery (schema-validated) and message-broker variants, writing
    every intermediate and final output to CSV files.
    """
    from airqo_etl_utils.airqo_utils import (
        extract_airqo_data_from_thingspeak,
        average_airqo_data,
        extract_airqo_weather_data_from_tahmo,
        merge_airqo_and_weather_data,
        calibrate_hourly_airqo_measurements,
        restructure_airqo_data,
    )
    from airqo_etl_utils.bigquery_api import BigQueryApi

    def snapshot(records, file_name: str):
        # persist an intermediate output for inspection
        pd.DataFrame(records).to_csv(path_or_buf=file_name, index=False)

    # extract device data
    raw_data = extract_airqo_data_from_thingspeak(
        start_time=start_date_time, end_time=end_date_time, all_devices=False
    )
    snapshot(raw_data, "raw_airqo_data.csv")

    averaged = average_airqo_data(data=raw_data, frequency="hourly")
    snapshot(averaged, "averaged_airqo_data.csv")

    # extract weather data
    weather = extract_airqo_weather_data_from_tahmo(
        start_time=start_date_time, end_time=end_date_time
    )
    snapshot(weather, "tahmo_weather_data.csv")

    # merge measurements with weather
    merged = merge_airqo_and_weather_data(airqo_data=averaged, weather_data=weather)
    snapshot(merged, "merged_airqo_data.csv")

    # calibrate
    calibrated = calibrate_hourly_airqo_measurements(measurements=merged)
    snapshot(calibrated, "calibrated_airqo_data.csv")

    # restructure for the API
    snapshot(
        restructure_airqo_data(data=calibrated, destination="api"),
        "airqo_data_for_api.csv",
    )

    # restructure for BigQuery and validate against the hourly schema
    bigquery_api = BigQueryApi()
    validated = bigquery_api.validate_data(
        dataframe=pd.DataFrame(
            restructure_airqo_data(data=calibrated, destination="bigquery")
        ),
        columns=bigquery_api.hourly_measurements_columns,
        numeric_columns=bigquery_api.hourly_measurements_numeric_columns,
        table=bigquery_api.hourly_measurements_table,
    )
    validated.to_csv(path_or_buf="airqo_data_for_bigquery.csv", index=False)

    # restructure for the message broker
    snapshot(
        restructure_airqo_data(data=calibrated, destination="message-broker"),
        "airqo_data_for_message_broker.csv",
    )
def restructure_airqo_data_for_bigquery(data: list) -> list:
    """Reshape AirQo measurement records into the BigQuery hourly schema.

    Each input row contributes one output record; the fixed identity
    fields are read directly, while the measurement fields are looked up
    tolerantly via ``get_column_value`` (missing columns become nulls).
    The result is aligned to ``BigQueryApi().hourly_measurements_columns``.
    """
    # output field -> source column for tolerant lookups
    measurement_fields = {
        "latitude": "latitude",
        "longitude": "longitude",
        "pm2_5": "pm2_5",
        "s1_pm2_5": "s1_pm2_5",
        "s2_pm2_5": "s2_pm2_5",
        "pm2_5_raw_value": "raw_pm2_5",
        "pm2_5_calibrated_value": "calibrated_pm2_5",
        "pm10": "pm10",
        "s1_pm10": "s1_pm10",
        "s2_pm10": "s2_pm10",
        "pm10_raw_value": "raw_pm10",
        "pm10_calibrated_value": "calibrated_pm10",
        "altitude": "altitude",
        "wind_speed": "wind_speed",
        "external_temperature": "temperature",
        "external_humidity": "humidity",
    }

    data_df = pd.DataFrame(data)
    columns = list(data_df.columns)

    restructured = []
    for _, row in data_df.iterrows():
        record = {
            "timestamp": str_to_date(row["time"]),
            "tenant": "airqo",
            "site_id": row["site_id"],
            "device_number": row["device_number"],
            "device": row["device"],
        }
        for field, source_column in measurement_fields.items():
            record[field] = get_column_value(
                column=source_column, columns=columns, series=row
            )
        restructured.append(record)

    return pd.DataFrame(
        columns=BigQueryApi().hourly_measurements_columns, data=restructured
    ).to_dict(orient="records")
def transform_kcca_hourly_data_for_bigquery(data: list) -> list:
    """Reshape KCCA measurement records into the BigQuery hourly schema.

    Rows whose device cannot be resolved to a site id are skipped.
    Coordinates are parsed from the ``location.coordinates`` string
    (``[longitude, latitude]`` order). Measurement fields are looked up
    tolerantly via ``get_column_value`` (missing columns become nulls),
    and the result is aligned to
    ``BigQueryApi().hourly_measurements_columns``.
    """
    # output field -> source column for tolerant lookups
    measurement_fields = {
        "pm2_5": "characteristics.pm2_5ConcMass.value",
        "s1_pm2_5": "s1_pm2_5",
        "s2_pm2_5": "s2_pm2_5",
        "pm2_5_raw_value": "characteristics.pm2_5ConcMass.raw",
        "pm2_5_calibrated_value": "characteristics.pm2_5ConcMass.calibratedValue",
        "pm10": "characteristics.pm10ConcMass.value",
        "s1_pm10": "s1_pm10",
        "s2_pm10": "s2_pm10",
        "pm10_raw_value": "characteristics.pm10ConcMass.raw",
        "pm10_calibrated_value": "characteristics.pm10ConcMass.calibratedValue",
        "no2": "characteristics.no2Conc.value",
        "no2_raw_value": "characteristics.no2Conc.raw",
        "no2_calibrated_value": "characteristics.no2Conc.calibratedValue",
        "pm1": "characteristics.pm1ConcMass.value",
        "pm1_raw_value": "characteristics.pm1ConcMass.raw",
        "pm1_calibrated_value": "characteristics.pm1ConcMass.calibratedValue",
        "altitude": "characteristics.altitude.value",
        "wind_speed": "characteristics.windSpeed.value",
        "external_temperature": "characteristics.temperature.value",
        "external_humidity": "characteristics.relHumid.value",
    }

    data_df = pd.DataFrame(data)
    columns = list(data_df.columns)
    devices = AirQoApi().get_devices(tenant="kcca")

    restructured = []
    for _, row in data_df.iterrows():
        device_name = row["deviceCode"]
        site_id, _ = get_site_and_device_id(devices, device_name=device_name)
        if not site_id:
            # device not registered against a site — skip this row
            continue

        # "[lon, lat]" string -> ["lon", " lat"]
        coordinates = (
            str(row["location.coordinates"]).replace("[", "").replace("]", "").split(",")
        )

        record = {
            "timestamp": str_to_date(row["time"]),
            "tenant": "kcca",
            "site_id": site_id,
            "device_number": 0,
            "device": device_name,
            "latitude": coordinates[1],
            "longitude": coordinates[0],
        }
        for field, source_column in measurement_fields.items():
            record[field] = get_column_value(
                column=source_column, columns=columns, series=row
            )
        restructured.append(record)

    return pd.DataFrame(
        columns=BigQueryApi().hourly_measurements_columns, data=restructured
    ).to_dict(orient="records")