Exemple #1
0
    def extract():
        """Fetch daily KCCA measurements for the trailing three-day window.

        The window runs from three days before the current UTC time up to
        the current UTC time; both bounds are rendered with
        ``date_to_str_days`` before being handed to
        ``extract_kcca_measurements`` with ``freq="daily"``.

        Returns:
            A dict with a single ``"data"`` key holding the measurements
            after they have been passed through ``fill_nan``.
        """
        from airqo_etl_utils.date import date_to_str_days
        from airqo_etl_utils.kcca_utils import extract_kcca_measurements
        from airqo_etl_utils.commons import fill_nan
        from datetime import datetime, timedelta

        window_start = date_to_str_days(datetime.utcnow() - timedelta(days=3))
        window_end = date_to_str_days(datetime.utcnow())

        measurements = extract_kcca_measurements(
            start_time=window_start,
            end_time=window_end,
            freq="daily",
        )

        return {"data": fill_nan(data=measurements)}
Exemple #2
0
def get_airqo_data(freq: str,
                   start_time: str = None,
                   end_time: str = None) -> list:
    """Collect events for every AirQo device and convert them to insights.

    The requested period is cut into consecutive windows with
    ``pd.date_range`` and one ``get_events`` call is made per device and
    per window. A failing call is printed (message and traceback) and
    skipped, so the remaining windows and devices are still processed.

    Args:
        freq: Requested frequency. ``"daily"`` makes the period bounds
            be rendered with ``date_to_str_days``; any other value uses
            ``date_to_str_hours``. The value is also forwarded to the
            events API as ``frequency``.
        start_time: Start of the period as a string understood by
            ``str_to_date``. Defaults to seven days before the current
            UTC time when falsy.
        end_time: End of the period as a string understood by
            ``str_to_date``. Defaults to the current UTC time when falsy.

    Returns:
        The result of ``format_measurements_to_insights`` applied to all
        collected events.
    """
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="airqo", all_devices=False)
    measurements = []

    if start_time:
        period_start = str_to_date(start_time)
    else:
        period_start = datetime.utcnow() - timedelta(days=7)
    period_end = str_to_date(end_time) if end_time else datetime.utcnow()

    # Both bounds are rendered with the same formatter, chosen by frequency.
    render_bound = date_to_str_days if freq == "daily" else date_to_str_hours
    start_time = render_bound(period_start)
    end_time = render_bound(period_end)

    windows = pd.date_range(start_time,
                            end_time,
                            freq=get_airqo_api_frequency(freq=freq))
    final_window_start = windows.values[-1]

    for device in devices:
        for window_start in windows:
            query_start = date_to_str(window_start)
            window_end = window_start + timedelta(hours=windows.freq.n)

            # A window that would run past the last generated timestamp is
            # clamped to the overall end of the period.
            query_end = (end_time
                         if np.datetime64(window_end) > final_window_start
                         else date_to_str(window_end))

            try:
                measurements.extend(
                    airqo_api.get_events(
                        tenant="airqo",
                        start_time=query_start,
                        frequency=freq,
                        end_time=query_end,
                        device=device["name"],
                    ))
            except Exception as ex:
                print(ex)
                traceback.print_exc()

    return format_measurements_to_insights(data=measurements)
Exemple #3
0
    def create_empty_insights():
        """Build placeholder insights for every site and every time slot.

        For each timestamp between ``start_date_time`` and
        ``end_date_time`` (names taken from the enclosing scope), first at
        hourly and then at daily resolution, one record per site is
        produced. Each record is flagged ``"empty": True`` and carries
        random ``pm2_5`` / ``pm10`` values drawn uniformly from
        [50.0, 150.0].

        Returns:
            A dict with a single ``"data"`` key holding the records after
            they have been passed through ``fill_nan``.
        """
        from airqo_etl_utils.airqo_api import AirQoApi

        from airqo_etl_utils.commons import fill_nan
        import random
        import pandas as pd
        from airqo_etl_utils.date import (
            date_to_str_days,
            date_to_str_hours,
        )

        sites = AirQoApi().get_sites(tenant="airqo")
        placeholders = []

        def add_placeholders(step, label, render_time):
            # One record per (timestamp, site); a site that cannot be
            # processed is reported and skipped.
            for moment in pd.date_range(start_date_time,
                                        end_date_time,
                                        freq=step):
                stamp = render_time(moment)
                for site in sites:
                    try:
                        placeholders.append({
                            "time": stamp,
                            "pm2_5": random.uniform(50.0, 150.0),
                            "pm10": random.uniform(50.0, 150.0),
                            "empty": True,
                            "frequency": label,
                            "forecast": False,
                            "siteId": site["_id"],
                        })
                    except Exception as ex:
                        print(ex)

        add_placeholders("1H", "HOURLY", date_to_str_hours)
        add_placeholders("24H", "DAILY", date_to_str_days)

        return {"data": fill_nan(data=placeholders)}
    def time_values(**kwargs):
        """Resolve the ``(start_time, end_time)`` window for a run.

        When the ``dag_run`` keyword argument carries a ``conf`` mapping
        with both ``"startTime"`` and ``"endTime"``, those values are
        returned unchanged. In every other case (no ``dag_run``, no
        ``conf``, or either key missing) the window defaults to the
        calendar day 24 hours before the current UTC time.

        Args:
            **kwargs: Keyword context; only ``dag_run`` is read.

        Returns:
            A ``(start_time, end_time)`` tuple. For the default window,
            ``start_time`` is ``date_to_str_days`` of the previous day and
            ``end_time`` is that same date at ``23:59:59Z``.
        """
        from datetime import datetime, timedelta

        # ``dag_run`` may be absent and ``conf`` may be None on runs that
        # were not triggered with a configuration; treat both as "no
        # override" instead of failing.
        conf = getattr(kwargs.get("dag_run"), "conf", None) or {}
        if "startTime" in conf and "endTime" in conf:
            return conf["startTime"], conf["endTime"]

        # Only the fallback needs the project helper, so import it here.
        from airqo_etl_utils.date import date_to_str_days

        previous_day = datetime.utcnow() - timedelta(hours=24)
        start_time = date_to_str_days(previous_day)
        # The time of day is a literal: it must not be preceded by '%',
        # which would make "%2" an invalid strftime directive.
        end_time = previous_day.strftime("%Y-%m-%dT23:59:59Z")

        return start_time, end_time
Exemple #5
0
def resample_data(data: pd.DataFrame, frequency: str) -> pd.DataFrame:
    """Average a measurements frame into daily or hourly buckets.

    Rows without a ``time`` value are dropped, ``time`` is parsed with
    ``pd.to_datetime`` and the frame is resampled on it, taking the mean
    of each bucket. The caller's frame is not modified.

    Args:
        data: Frame with a ``time`` column and, optionally, both
            ``latitude`` and ``longitude`` columns.
        frequency: ``"daily"`` (case-insensitive) selects 24-hour
            buckets; any other value selects 1-hour buckets.

    Returns:
        A frame with one row per bucket and a fresh 0..n-1 index. Its
        ``time`` column holds the bucket start rendered by
        ``date_to_str``. When the input has both coordinate columns,
        ``latitude`` and ``longitude`` are overwritten with the values of
        the first input row whose bucket label equals the row's ``time``.
    """
    data = data.dropna(subset=["time"])
    # assign() returns a new frame, so no slice is written to in place.
    data = data.assign(time=pd.to_datetime(data["time"])).sort_index(axis=0)

    has_coordinates = ("latitude" in data.columns
                       and "longitude" in data.columns)

    resample_value = "24H" if frequency.lower() == "daily" else "1H"
    averages = pd.DataFrame(data.resample(resample_value, on="time").mean())

    averages["time"] = averages.index
    averages["time"] = averages["time"].apply(date_to_str)
    averages = averages.reset_index(drop=True)

    if not has_coordinates:
        return averages

    # Label every raw row with its bucket; only "24H" and "1H" can occur.
    to_bucket_label = (date_to_str_days
                       if resample_value == "24H" else date_to_str_hours)
    coordinates = data[["time", "latitude", "longitude"]].copy()
    coordinates["time"] = coordinates["time"].apply(to_bucket_label)

    # Keep the first raw row per label so each bucket is resolved with a
    # single dict lookup instead of a scan over the whole frame.
    first_per_label = coordinates.drop_duplicates(
        subset="time", keep="first").set_index("time")

    for field in ("latitude", "longitude"):
        by_label = first_per_label[field].to_dict()
        # A label without a matching raw row keeps the label itself as
        # the value, exactly as before.
        # NOTE(review): that puts a time string in a coordinate column,
        # and it only matches at all if date_to_str renders the same
        # text as the hour/day formatters - confirm against the helpers.
        averages[field] = [
            by_label.get(label, label) for label in averages["time"]
        ]

    return averages
Exemple #6
0
def insights_cleanup_etl():
    """Wire up the tasks that insert placeholder insights for missing slots.

    Four tasks are defined and chained: build placeholder ("empty")
    insights for every site and time slot, query the insights already
    stored for the same period, keep only the records whose key is not
    shared between the two sets, and save the result.
    """
    from airqo_etl_utils.date import (
        date_to_str_days,
        first_day_of_week,
        last_day_of_week,
        first_day_of_month,
        last_day_of_month,
    )

    # Period covered by the clean-up, evaluated once when this function
    # runs. Presumably it spans from the start of the week containing the
    # first day of the current month to the end of the week containing the
    # last day - confirm against the date helpers.
    start_date_time = date_to_str_days(
        first_day_of_week(first_day_of_month(date_time=datetime.now())))
    end_date_time = date_to_str_days(
        last_day_of_week(last_day_of_month(date_time=datetime.now())))

    @task(multiple_outputs=True)
    def create_empty_insights():
        """Build one placeholder insight per site and per time slot.

        Hourly slots are generated first, then daily slots. Each record is
        flagged ``"empty": True`` and carries random ``pm2_5`` / ``pm10``
        values drawn uniformly from [50.0, 150.0].

        Returns:
            A dict whose ``"data"`` key holds the records after
            ``fill_nan``.
        """

        from airqo_etl_utils.airqo_api import AirQoApi

        from airqo_etl_utils.commons import fill_nan
        import random
        import pandas as pd
        from airqo_etl_utils.date import (
            date_to_str_days,
            date_to_str_hours,
        )

        airqo_api = AirQoApi()
        sites = airqo_api.get_sites(tenant="airqo")
        insights = []

        # Hourly placeholders.
        dates = pd.date_range(start_date_time, end_date_time, freq="1H")
        for date in dates:
            date_time = date_to_str_hours(date)
            for site in sites:
                try:
                    hourly_insight = {
                        "time": date_time,
                        "pm2_5": random.uniform(50.0, 150.0),
                        "pm10": random.uniform(50.0, 150.0),
                        "empty": True,
                        "frequency": "HOURLY",
                        "forecast": False,
                        "siteId": site["_id"],
                    }
                    insights.append(hourly_insight)
                except Exception as ex:
                    # A site that cannot be processed is reported and skipped.
                    print(ex)

        # Daily placeholders.
        dates = pd.date_range(start_date_time, end_date_time, freq="24H")
        for date in dates:
            date_time = date_to_str_days(date)
            for site in sites:
                try:
                    daily_insight = {
                        "time": date_time,
                        "pm2_5": random.uniform(50.0, 150.0),
                        "pm10": random.uniform(50.0, 150.0),
                        "empty": True,
                        "frequency": "DAILY",
                        "forecast": False,
                        "siteId": site["_id"],
                    }
                    insights.append(daily_insight)
                except Exception as ex:
                    # A site that cannot be processed is reported and skipped.
                    print(ex)

        return dict({"data": fill_nan(data=insights)})

    @task(multiple_outputs=True)
    def query_insights_data():
        """Query the stored insights for the clean-up period.

        Returns:
            A dict whose ``"data"`` key holds the query result after
            ``fill_nan``.
        """
        # The imported helper shadows this task's own name inside the body.
        from airqo_etl_utils.app_insights_utils import query_insights_data

        from airqo_etl_utils.commons import fill_nan

        all_insights_data = query_insights_data(
            start_date_time=start_date_time,
            end_date_time=end_date_time,
            all_data=True,
            freq="",
        )

        return dict({"data": fill_nan(data=all_insights_data)})

    @task(multiple_outputs=True)
    def filter_insights(empty_insights_data: dict,
                        available_insights_data: dict):
        """Keep only records whose key occurs once across both inputs.

        Args:
            empty_insights_data: Output of ``create_empty_insights``.
            available_insights_data: Output of ``query_insights_data``.

        Returns:
            A dict whose ``"data"`` key holds the remaining records after
            ``fill_nan``.
        """

        from airqo_etl_utils.commons import fill_nan, un_fill_nan

        import pandas as pd

        insights_data_df = pd.DataFrame(
            data=un_fill_nan(available_insights_data.get("data")))
        empty_insights_data_df = pd.DataFrame(
            data=un_fill_nan(empty_insights_data.get("data")))

        # keep=False removes every row whose (siteId, time, frequency) key
        # appears more than once, so a placeholder survives only when no
        # stored insight has the same key.
        # NOTE(review): a stored insight without a matching placeholder
        # also survives and is passed on to ``load`` - confirm intended.
        insights_data = pd.concat([empty_insights_data_df,
                                   insights_data_df]).drop_duplicates(
                                       keep=False,
                                       subset=["siteId", "time", "frequency"])

        return dict(
            {"data": fill_nan(data=insights_data.to_dict(orient="records"))})

    @task()
    def load(insights_data: dict):
        """Save the filtered records with ``save_insights_data``.

        Args:
            insights_data: Output of ``filter_insights``.
        """
        from airqo_etl_utils.commons import un_fill_nan

        empty_insights_data = un_fill_nan(insights_data.get("data"))
        from airqo_etl_utils.app_insights_utils import save_insights_data

        save_insights_data(insights_data=empty_insights_data,
                           action="insert",
                           partition=2)

    # Task wiring: placeholders and stored data feed the filter, whose
    # output is loaded.
    empty_insights = create_empty_insights()
    available_insights = query_insights_data()
    filtered_insights = filter_insights(
        empty_insights_data=empty_insights,
        available_insights_data=available_insights)
    load(insights_data=filtered_insights)
Exemple #7
0
def measurement_time_to_string(time: str, daily=False):
    """Parse a time string and re-render it at day or hour resolution.

    Args:
        time: Time string understood by ``str_to_date``.
        daily: When truthy the result comes from ``date_to_str_days``;
            otherwise from ``date_to_str_hours``.

    Returns:
        The value returned by the selected formatter.
    """
    parsed = str_to_date(time)
    render = date_to_str_days if daily else date_to_str_hours
    return render(parsed)