def extract():
    """Pull daily KCCA measurements for the three days leading up to now.

    The window runs from ``utcnow() - 3 days`` to ``utcnow()``, with both
    bounds formatted by ``date_to_str_days``.

    Returns:
        dict: ``{"data": ...}`` holding the extracted measurements after they
        have been passed through ``fill_nan``.
    """
    from airqo_etl_utils.date import date_to_str_days
    from airqo_etl_utils.kcca_utils import extract_kcca_measurements
    from airqo_etl_utils.commons import fill_nan
    from datetime import datetime, timedelta

    window_start = date_to_str_days(datetime.utcnow() - timedelta(days=3))
    window_end = date_to_str_days(datetime.utcnow())

    measurements = extract_kcca_measurements(
        start_time=window_start,
        end_time=window_end,
        freq="daily",
    )
    return {"data": fill_nan(data=measurements)}
def get_airqo_data(freq: str, start_time: str = None, end_time: str = None) -> list:
    """Fetch AirQo device events over a time window and format them as insights.

    The window is split into consecutive chunks with ``pd.date_range`` and the
    events API is queried once per device per chunk. A failing request is
    printed (message and traceback) and skipped, so one bad chunk does not
    abort the whole extraction.

    Args:
        freq: Frequency label forwarded to the events API; ``"daily"`` makes
            the window bounds day-formatted, any other value hour-formatted.
        start_time: Optional window start; defaults to seven days before now.
        end_time: Optional window end; defaults to now.

    Returns:
        Whatever ``format_measurements_to_insights`` produces for the
        collected events.
    """
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="airqo", all_devices=False)
    measurements = []

    if start_time:
        window_start = str_to_date(start_time)
    else:
        window_start = datetime.utcnow() - timedelta(days=7)
    if end_time:
        window_end = str_to_date(end_time)
    else:
        window_end = datetime.utcnow()

    # Normalise both bounds to the granularity implied by ``freq``.
    if freq == "daily":
        start_time = date_to_str_days(window_start)
        end_time = date_to_str_days(window_end)
    else:
        start_time = date_to_str_hours(window_start)
        end_time = date_to_str_hours(window_end)

    frequency = get_airqo_api_frequency(freq=freq)
    dates = pd.date_range(start_time, end_time, freq=frequency)
    last_date_time = dates.values[-1]

    for device in devices:
        for chunk_start in dates:
            query_start = date_to_str(chunk_start)

            # NOTE(review): the step is taken as ``freq.n`` hours, which
            # assumes the API frequency is hour-based -- TODO confirm.
            chunk_stop = chunk_start + timedelta(hours=dates.freq.n)

            # Never query beyond the requested end of the window.
            query_end = (
                end_time
                if np.datetime64(chunk_stop) > last_date_time
                else date_to_str(chunk_stop)
            )

            try:
                measurements.extend(
                    airqo_api.get_events(
                        tenant="airqo",
                        start_time=query_start,
                        frequency=freq,
                        end_time=query_end,
                        device=device["name"],
                    )
                )
            except Exception as ex:
                print(ex)
                traceback.print_exc()

    return format_measurements_to_insights(data=measurements)
def create_empty_insights():
    """Build placeholder hourly and daily insights for every AirQo site.

    One record is produced per site per step between ``start_date_time`` and
    ``end_date_time``: first at a ``"1H"`` step, then at a ``"24H"`` step.
    Those two names are not defined in this function; they are read from the
    surrounding scope. Each record is flagged ``"empty": True`` and carries
    random ``pm2_5``/``pm10`` values drawn from ``[50.0, 150.0]``.

    Returns:
        dict: ``{"data": ...}`` holding the records after ``fill_nan``.
    """
    from airqo_etl_utils.airqo_api import AirQoApi
    from airqo_etl_utils.commons import fill_nan
    import random
    import pandas as pd
    from airqo_etl_utils.date import (
        date_to_str_days,
        date_to_str_hours,
    )

    airqo_api = AirQoApi()
    sites = airqo_api.get_sites(tenant="airqo")
    insights = []

    def append_placeholders(step, label, to_str):
        # Add one placeholder per (timestamp, site); a site that cannot be
        # turned into a record is printed and skipped.
        for moment in pd.date_range(start_date_time, end_date_time, freq=step):
            formatted_time = to_str(moment)
            for site in sites:
                try:
                    insights.append(
                        {
                            "time": formatted_time,
                            "pm2_5": random.uniform(50.0, 150.0),
                            "pm10": random.uniform(50.0, 150.0),
                            "empty": True,
                            "frequency": label,
                            "forecast": False,
                            "siteId": site["_id"],
                        }
                    )
                except Exception as ex:
                    print(ex)

    append_placeholders("1H", "HOURLY", date_to_str_hours)
    append_placeholders("24H", "DAILY", date_to_str_days)

    return {"data": fill_nan(data=insights)}
def time_values(**kwargs):
    """Resolve the ``(start_time, end_time)`` window for a DAG run.

    The window is taken from ``dag_run.conf["startTime"]`` and
    ``dag_run.conf["endTime"]`` when both are present. Otherwise it defaults
    to the day 24 hours before the current UTC time: the start is that
    moment formatted by ``date_to_str_days`` and the end is the same date at
    ``23:59:59Z``.

    Args:
        **kwargs: Task context; only the ``dag_run`` entry is read.

    Returns:
        tuple: ``(start_time, end_time)``.
    """
    from datetime import datetime, timedelta

    dag_run = kwargs.get("dag_run")
    # A missing dag_run or an unset/None conf is treated like an empty conf,
    # so the default window is used instead of raising AttributeError or
    # TypeError.
    conf = getattr(dag_run, "conf", None) or {}

    try:
        start_time = conf["startTime"]
        end_time = conf["endTime"]
    except KeyError:
        # Only the fallback needs the project date helper.
        from airqo_etl_utils.date import date_to_str_days

        previous_day = datetime.utcnow() - timedelta(hours=24)
        start_time = date_to_str_days(previous_day)
        # The time of day is a literal: the original "%23" put a stray "%"
        # in front of it, producing an invalid strftime directive.
        end_time = datetime.strftime(previous_day, "%Y-%m-%dT23:59:59Z")

    return start_time, end_time
def resample_data(data: pd.DataFrame, frequency: str) -> pd.DataFrame:
    """Average measurements into hourly or daily bins on the ``time`` column.

    Rows without a ``time`` value are dropped and the remaining times are
    parsed with ``pd.to_datetime``. A ``frequency`` of ``"daily"`` (case
    insensitive) selects 24-hour bins; anything else selects one-hour bins.
    When both ``latitude`` and ``longitude`` columns are present, each
    averaged row has its coordinates overwritten with those of the first
    original row whose formatted time equals the bin's formatted time.

    Args:
        data: Frame with at least a ``time`` column.
        frequency: ``"daily"`` for daily averages, otherwise hourly.

    Returns:
        The averaged frame, with ``time`` rendered as a string by
        ``date_to_str`` and a fresh positional index.
    """
    data = data.dropna(subset=["time"])
    data["time"] = pd.to_datetime(data["time"])
    data = data.sort_index(axis=0)

    has_coordinates = "latitude" in data.columns and "longitude" in data.columns
    if has_coordinates:
        original_df = data[["time", "latitude", "longitude"]]
    else:
        original_df = data[["time"]]

    is_daily = frequency.lower() == "daily"
    resample_value = "24H" if is_daily else "1H"

    averages = pd.DataFrame(data.resample(resample_value, on="time").mean())
    averages["time"] = averages.index
    averages["time"] = averages["time"].apply(date_to_str)
    averages = averages.reset_index(drop=True)

    # Format the original timestamps at the bin granularity so they can be
    # compared against the bin labels below.
    bin_formatter = date_to_str_days if is_daily else date_to_str_hours
    original_df["time"] = original_df["time"].apply(bin_formatter)

    if not has_coordinates:
        return averages

    def reset_latitude_or_longitude(time: str, field: str):
        matches = original_df.loc[original_df["time"] == time]
        if matches.empty:
            # NOTE(review): with no matching row the time string itself is
            # returned as the coordinate -- confirm this is intended.
            return time
        column = "latitude" if field == "latitude" else "longitude"
        return matches.iloc[0][column]

    for coordinate in ("latitude", "longitude"):
        averages[coordinate] = averages.apply(
            lambda row, coordinate=coordinate: reset_latitude_or_longitude(
                row["time"], coordinate
            ),
            axis=1,
        )

    return averages
def insights_cleanup_etl():
    """Insert placeholder insights wherever no stored insight exists.

    Four tasks are wired together: build placeholder insights for every site,
    query the stored insights for the same window, keep only the
    ``(siteId, time, frequency)`` combinations that occur exactly once across
    both sets, and save those records with ``action="insert"``.
    """
    from airqo_etl_utils.date import (
        date_to_str_days,
        first_day_of_week,
        last_day_of_week,
        first_day_of_month,
        last_day_of_month,
    )

    # The window bounds are computed once, when this function body runs, and
    # shared by the tasks below through closure.
    window_first_day = first_day_of_week(
        first_day_of_month(date_time=datetime.now())
    )
    start_date_time = date_to_str_days(window_first_day)
    window_last_day = last_day_of_week(
        last_day_of_month(date_time=datetime.now())
    )
    end_date_time = date_to_str_days(window_last_day)

    # Task: one placeholder record per site per hour, then per site per day.
    @task(multiple_outputs=True)
    def create_empty_insights():
        from airqo_etl_utils.airqo_api import AirQoApi
        from airqo_etl_utils.commons import fill_nan
        import random
        import pandas as pd
        from airqo_etl_utils.date import (
            date_to_str_days,
            date_to_str_hours,
        )

        airqo_api = AirQoApi()
        sites = airqo_api.get_sites(tenant="airqo")
        insights = []

        def append_placeholders(step, label, to_str):
            # A site that cannot be turned into a record is printed and
            # skipped.
            for moment in pd.date_range(
                start_date_time, end_date_time, freq=step
            ):
                formatted_time = to_str(moment)
                for site in sites:
                    try:
                        insights.append(
                            {
                                "time": formatted_time,
                                "pm2_5": random.uniform(50.0, 150.0),
                                "pm10": random.uniform(50.0, 150.0),
                                "empty": True,
                                "frequency": label,
                                "forecast": False,
                                "siteId": site["_id"],
                            }
                        )
                    except Exception as ex:
                        print(ex)

        append_placeholders("1H", "HOURLY", date_to_str_hours)
        append_placeholders("24H", "DAILY", date_to_str_days)

        return {"data": fill_nan(data=insights)}

    # Task: fetch the insights already stored for the window.
    @task(multiple_outputs=True)
    def query_insights_data():
        # Aliased so the library function is not confused with this task.
        from airqo_etl_utils.app_insights_utils import (
            query_insights_data as fetch_stored_insights,
        )
        from airqo_etl_utils.commons import fill_nan

        stored_insights = fetch_stored_insights(
            start_date_time=start_date_time,
            end_date_time=end_date_time,
            all_data=True,
            freq="",
        )
        return {"data": fill_nan(data=stored_insights)}

    # Task: drop every (siteId, time, frequency) key that appears more than
    # once in the combined placeholder + stored records.
    @task(multiple_outputs=True)
    def filter_insights(empty_insights_data: dict,
                        available_insights_data: dict):
        from airqo_etl_utils.commons import fill_nan, un_fill_nan
        import pandas as pd

        stored_df = pd.DataFrame(
            data=un_fill_nan(available_insights_data.get("data"))
        )
        placeholder_df = pd.DataFrame(
            data=un_fill_nan(empty_insights_data.get("data"))
        )

        combined = pd.concat([placeholder_df, stored_df])
        unmatched = combined.drop_duplicates(
            keep=False, subset=["siteId", "time", "frequency"]
        )
        return {"data": fill_nan(data=unmatched.to_dict(orient="records"))}

    # Task: persist the remaining records.
    @task()
    def load(insights_data: dict):
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.app_insights_utils import save_insights_data

        records = un_fill_nan(insights_data.get("data"))
        save_insights_data(insights_data=records, action="insert", partition=2)

    placeholders = create_empty_insights()
    stored = query_insights_data()
    missing = filter_insights(
        empty_insights_data=placeholders,
        available_insights_data=stored,
    )
    load(insights_data=missing)
def measurement_time_to_string(time: str, daily=False):
    """Parse ``time`` with ``str_to_date`` and re-format it.

    Args:
        time: Timestamp string accepted by ``str_to_date``.
        daily: When truthy the result comes from ``date_to_str_days``,
            otherwise from ``date_to_str_hours``.

    Returns:
        The re-formatted timestamp.
    """
    parsed = str_to_date(time)
    formatter = date_to_str_days if daily else date_to_str_hours
    return formatter(parsed)