def extract_airqo_hourly_data_from_api(start_time: str, end_time: str) -> list:
    """Fetch hourly events for every AirQo device between start_time and end_time.

    Args:
        start_time: start of the query window (API date string).
        end_time: end of the query window (API date string).

    Returns:
        A list of measurement records (dicts) with nested API fields flattened
        and renamed to friendlier keys; an empty list when no devices exist.
    """
    airqo_api = AirQoApi()
    devices_list = list(airqo_api.get_devices(tenant="airqo"))
    hourly_events = []

    if not devices_list:
        print("devices empty")
        return []

    for device in devices_list:
        try:
            # A device without a name cannot be queried for events.
            if "name" not in device:
                print(f"name missing in device keys : {device}")
                continue

            device_name = device["name"]
            events = airqo_api.get_events(
                tenant="airqo",
                start_time=start_time,
                frequency="hourly",
                end_time=end_time,
                device=device_name,
            )

            if not events:
                print(
                    f"No measurements for {device_name} : startTime {start_time} : endTime : {end_time}"
                )
                continue

            hourly_events.extend(events)
        except Exception as ex:
            traceback.print_exc()
            print(ex)

    # Flatten nested event payloads (e.g. "pm10.value") into flat columns.
    device_measurements = pd.json_normalize(hourly_events)
    column_mappings = {
        "internalTemperature.value": "internalTemperature",
        "internalHumidity.value": "internalHumidity",
        "externalTemperature.value": "temperature",
        "externalHumidity.value": "humidity",
        "externalPressure.value": "pressure",
        "speed.value": "windSpeed",
        "altitude.value": "altitude",
        "battery.value": "battery",
        "satellites.value": "satellites",
        "hdop.value": "hdop",
        "pm10.value": "pm10",
        "s2_pm10.value": "s2_pm10",
        "s2_pm2_5.value": "s2_pm2_5",
        "average_pm2_5.calibratedValue": "calibrated_pm2_5",
    }

    device_measurements.rename(columns=column_mappings, inplace=True)

    return device_measurements.to_dict(orient="records")
Example #2
0
    def load(inputs: dict):
        """Persist the measurements carried in *inputs* to the events API (tenant kcca)."""
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.airqo_api import AirQoApi

        measurements = un_fill_nan(inputs.get("data"))
        AirQoApi().save_events(measurements=measurements, tenant="kcca")
Example #3
0
    def load(kcca_data: dict, **kwargs):
        """Route transformed KCCA hourly data to one destination.

        The destination is read from the triggering DAG run's configuration
        (``dag_run.conf["destination"]``) and defaults to "bigquery" when the
        key is absent or no dag_run was supplied.

        Raises:
            Exception: if the configured destination is not one of
                bigquery, message-broker or api.
        """
        from airqo_etl_utils.kcca_utils import (
            transform_kcca_measurements_for_api,
            transform_kcca_hourly_data_for_bigquery,
            transform_kcca_data_for_message_broker,
        )
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.airqo_api import AirQoApi
        from airqo_etl_utils.message_broker import KafkaBrokerClient
        from airqo_etl_utils.bigquery_api import BigQueryApi
        from airqo_etl_utils.config import configuration

        data = un_fill_nan(kcca_data.get("data"))

        # kwargs.get("dag_run") can be None (e.g. direct invocation); then
        # ".conf" raises AttributeError/TypeError rather than KeyError, so
        # catch all three to keep the documented default.
        try:
            destination = kwargs.get("dag_run").conf["destination"]
        except (KeyError, AttributeError, TypeError):
            destination = "bigquery"

        if destination == "bigquery":
            kcca_transformed_data = transform_kcca_hourly_data_for_bigquery(data)
            big_query_api = BigQueryApi()
            big_query_api.save_data(
                data=kcca_transformed_data,
                table=big_query_api.hourly_measurements_table,
            )

        elif destination == "message-broker":
            kcca_transformed_data = transform_kcca_data_for_message_broker(
                data=data, frequency="hourly"
            )
            info = {
                "data": kcca_transformed_data,
                "action": "insert",
                "tenant": "kcca",
            }
            kafka = KafkaBrokerClient()
            kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)

        elif destination == "api":
            kcca_transformed_data = transform_kcca_measurements_for_api(data)
            airqo_api = AirQoApi()
            airqo_api.save_events(measurements=kcca_transformed_data, tenant="kcca")

        else:
            raise Exception(
                "Invalid data destination. Valid values are bigquery, message-broker and api"
            )
    def send_raw_measurements_to_api(airqo_data: dict):
        """Restructure raw AirQo measurements and push them to the events API."""
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.airqo_utils import restructure_airqo_data
        from airqo_etl_utils.airqo_api import AirQoApi

        raw_measurements = un_fill_nan(airqo_data.get("data"))
        restructured = restructure_airqo_data(data=raw_measurements, destination="api")
        AirQoApi().save_events(measurements=restructured, tenant="airqo")
Example #5
0
    def send_hourly_measurements_to_api(inputs: dict):
        """Transform KCCA hourly data for the API and persist it (tenant kcca)."""
        from airqo_etl_utils.kcca_utils import transform_kcca_measurements_for_api
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.airqo_api import AirQoApi

        raw_data = un_fill_nan(inputs.get("data"))
        api_ready_data = transform_kcca_measurements_for_api(raw_data)
        AirQoApi().save_events(measurements=api_ready_data, tenant="kcca")
Example #6
0
    def create_empty_insights():
        """Generate placeholder ("empty") hourly and daily insights for every site.

        Each site gets one insight per hour and one per day over the window,
        with random pm2_5/pm10 values and the "empty" flag set.

        NOTE(review): relies on ``start_date_time`` and ``end_date_time`` which
        are not defined in this scope — presumably injected globals or DAG
        parameters; confirm before reusing this method elsewhere.

        Returns:
            dict: {"data": <nan-filled list of insight dicts>}.
        """
        from airqo_etl_utils.airqo_api import AirQoApi

        from airqo_etl_utils.commons import fill_nan
        import random
        import pandas as pd
        from airqo_etl_utils.date import (
            date_to_str_days,
            date_to_str_hours,
        )

        airqo_api = AirQoApi()
        sites = airqo_api.get_sites(tenant="airqo")
        insights = []

        # One random-valued "empty" insight per site per hour of the window.
        dates = pd.date_range(start_date_time, end_date_time, freq="1H")
        for date in dates:
            date_time = date_to_str_hours(date)
            for site in sites:
                try:
                    hourly_insight = {
                        "time": date_time,
                        "pm2_5": random.uniform(50.0, 150.0),
                        "pm10": random.uniform(50.0, 150.0),
                        "empty": True,
                        "frequency": "HOURLY",
                        "forecast": False,
                        "siteId": site["_id"],
                    }
                    insights.append(hourly_insight)
                except Exception as ex:
                    print(ex)

        # Same idea at daily granularity.
        dates = pd.date_range(start_date_time, end_date_time, freq="24H")
        for date in dates:
            date_time = date_to_str_days(date)
            for site in sites:
                try:
                    daily_insight = {
                        "time": date_time,
                        "pm2_5": random.uniform(50.0, 150.0),
                        "pm10": random.uniform(50.0, 150.0),
                        "empty": True,
                        "frequency": "DAILY",
                        "forecast": False,
                        "siteId": site["_id"],
                    }
                    insights.append(daily_insight)
                except Exception as ex:
                    print(ex)

        return dict({"data": fill_nan(data=insights)})
def extract_airqo_devices_deployment_history() -> list:
    """Build a per-device timeline of deployment sites from maintenance logs.

    For every AirQo device, deployment logs are sorted chronologically and
    each log's start time is paired with the next log's start time (or "now"
    for the latest log) to form [start_time, end_time] windows per site.

    Returns:
        A list of dicts with keys device, device_id, start_time, end_time and
        site_id. Devices with fewer than two logs, or that never changed
        site, contribute nothing.
    """
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="airqo")
    devices_history = []

    for device in devices:
        try:
            maintenance_logs = airqo_api.get_maintenance_logs(
                tenant="airqo", device=device["name"], activity_type="deployment"
            )

            # A single deployment yields no start/end window.
            if not maintenance_logs or len(maintenance_logs) <= 1:
                continue

            log_df = pd.DataFrame(maintenance_logs)
            log_df = log_df.dropna(subset=["date"])

            # bfill/ffill replaces the deprecated fillna(method=...) form.
            log_df["site_id"] = log_df["site_id"].bfill().ffill()
            log_df = log_df.dropna(subset=["site_id"])

            log_df["start_time"] = pd.to_datetime(log_df["date"])
            log_df = log_df.sort_values(by="start_time")
            # Each deployment ends when the next one starts; the newest one
            # is considered active until now.
            log_df["end_time"] = log_df["start_time"].shift(-1)
            log_df["end_time"] = log_df["end_time"].fillna(datetime.utcnow())

            log_df["start_time"] = log_df["start_time"].apply(date_to_str)
            log_df["end_time"] = log_df["end_time"].apply(date_to_str)

            # Skip devices that were only ever deployed at one site.
            if log_df["site_id"].nunique() == 1:
                continue

            for _, raw in log_df.iterrows():
                devices_history.append(
                    {
                        "device": raw["device"],
                        "device_id": device["_id"],
                        "start_time": raw["start_time"],
                        "end_time": raw["end_time"],
                        "site_id": raw["site_id"],
                    }
                )

        except Exception as ex:
            print(ex)
            traceback.print_exc()

    return devices_history
    def load(airqo_data: dict, **kwargs):
        """Route restructured AirQo hourly data to one destination.

        The destination is read from the triggering DAG run's configuration
        (``dag_run.conf["destination"]``) and defaults to "bigquery" when the
        key is absent or no dag_run was supplied.

        Raises:
            Exception: if the configured destination is not one of
                bigquery, message-broker or api.
        """
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.bigquery_api import BigQueryApi
        from airqo_etl_utils.airqo_api import AirQoApi
        from airqo_etl_utils.airqo_utils import restructure_airqo_data
        from airqo_etl_utils.config import configuration
        from airqo_etl_utils.message_broker import KafkaBrokerClient

        data = un_fill_nan(airqo_data.get("data"))

        # kwargs.get("dag_run") can be None; then ".conf" raises
        # AttributeError/TypeError rather than KeyError, so catch all three
        # to keep the documented default.
        try:
            destination = kwargs.get("dag_run").conf["destination"]
        except (KeyError, AttributeError, TypeError):
            destination = "bigquery"

        if destination == "bigquery":
            airqo_restructured_data = restructure_airqo_data(
                data=data, destination="bigquery"
            )
            big_query_api = BigQueryApi()
            big_query_api.save_data(
                data=airqo_restructured_data,
                table=big_query_api.hourly_measurements_table,
            )

        elif destination == "message-broker":
            airqo_restructured_data = restructure_airqo_data(
                data=data, destination="message-broker"
            )
            info = {
                "data": airqo_restructured_data,
                "action": "insert",
                "tenant": "airqo",
            }
            kafka = KafkaBrokerClient()
            kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)

        elif destination == "api":
            airqo_restructured_data = restructure_airqo_data(
                data=data, destination="api"
            )
            airqo_api = AirQoApi()
            airqo_api.save_events(measurements=airqo_restructured_data, tenant="airqo")

        else:
            raise Exception(
                "Invalid data destination. Valid values are bigquery, message-broker and api"
            )
Example #9
0
def get_airqo_data(freq: str,
                   start_time: str = None,
                   end_time: str = None) -> list:
    """Fetch AirQo device events over a window and format them as insights.

    Args:
        freq: "daily" or "hourly"; controls both the API frequency mapping
            and how the window bounds are formatted.
        start_time: optional window start; defaults to 7 days ago (UTC).
        end_time: optional window end; defaults to now (UTC).

    Returns:
        The fetched measurements formatted via format_measurements_to_insights.
    """
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="airqo", all_devices=False)
    measurements = []

    start = (str_to_date(start_time) if start_time
             else datetime.utcnow() - timedelta(days=7))
    end = str_to_date(end_time) if end_time else datetime.utcnow()

    start_time = (date_to_str_days(start)
                  if freq == "daily" else date_to_str_hours(start))
    end_time = (date_to_str_days(end)
                if freq == "daily" else date_to_str_hours(end))

    frequency = get_airqo_api_frequency(freq=freq)
    dates = pd.date_range(start_time, end_time, freq=frequency)
    last_date_time = dates.values[-1]

    for device in devices:
        for date in dates:
            start = date_to_str(date)
            end_date_time = date + timedelta(hours=dates.freq.n)

            # Clamp the final chunk to the requested window end.
            if np.datetime64(end_date_time) > last_date_time:
                end = end_time
            else:
                end = date_to_str(end_date_time)

            try:
                events = airqo_api.get_events(
                    tenant="airqo",
                    start_time=start,
                    frequency=freq,
                    end_time=end,
                    device=device["name"],
                )
                measurements.extend(events)

            except Exception as ex:
                print(ex)
                traceback.print_exc()

    return format_measurements_to_insights(data=measurements)
def calibrate_using_api(measurements: list) -> list:
    """Attach calibrated PM2.5/PM10 values to measurements via the calibrate API.

    Measurements are grouped by "time"; each group is sent as one calibration
    request and the returned values are written back onto the matching
    device/time rows as "calibrated_pm2_5" and "calibrated_pm10".

    Args:
        measurements: list of measurement dicts; each is assumed to carry
            "time" and "device_id" keys (used for grouping and matching).

    Returns:
        The measurements as dicts, with calibrated columns filled where the
        API returned values; [] for empty input. Groups that fail are skipped.
    """
    if not measurements:
        return []

    data_df = pd.DataFrame(measurements)

    data_df_groups = data_df.groupby("time")
    airqo_api = AirQoApi()
    calibrated_measurements = []

    for _, time_group in data_df_groups:

        try:
            data = time_group
            # All rows in a group share the same timestamp.
            date_time = data.iloc[0]["time"]

            calibrate_body = data.to_dict(orient="records")

            calibrated_values = airqo_api.get_calibrated_values(
                time=date_time, calibrate_body=calibrate_body
            )

            # Write each calibrated value back onto the row(s) for the same
            # device at this timestamp; a bad value skips only that value.
            for value in calibrated_values:
                try:
                    data.loc[
                        (data["device_id"] == value["device_id"])
                        & (data["time"] == date_time),
                        "calibrated_pm2_5",
                    ] = value["calibrated_PM2.5"]
                    data.loc[
                        (data["device_id"] == value["device_id"])
                        & (data["time"] == date_time),
                        "calibrated_pm10",
                    ] = value["calibrated_PM10"]
                except Exception as ex:
                    traceback.print_exc()
                    print(ex)
                    continue

            calibrated_measurements.extend(data.to_dict(orient="records"))

        except Exception as ex:
            traceback.print_exc()
            print(ex)
            continue

    return calibrated_measurements
Example #11
0
def get_weather_data_from_tahmo(start_time=None,
                                end_time=None,
                                tenant="airqo"):
    """Fetch TAHMO weather measurements for a tenant's nearest stations.

    Args:
        start_time: window start (date string understood by pd.date_range).
        end_time: window end (date string).
        tenant: tenant whose sites determine the TAHMO station codes.

    Returns:
        List of measurement records ({value, variable, station, time}) with
        out-of-window dates removed; [] when the stations return no data.
    """
    airqo_api = AirQoApi()
    airqo_sites = airqo_api.get_sites(tenant=tenant)
    station_codes = []
    for site in airqo_sites:
        try:
            # Only sites that have a nearest TAHMO station can be queried.
            if "nearest_tahmo_station" in dict(site):
                station_codes.append(site["nearest_tahmo_station"]["code"])
        except Exception as ex:
            print(ex)

    measurements = []
    tahmo_api = TahmoApi()

    frequency = get_frequency(start_time=start_time, end_time=end_time)
    dates = pd.date_range(start_time, end_time, freq=frequency)
    last_date_time = dates.values[-1]

    for date in dates:
        start = date_to_str(date)
        end_date_time = date + timedelta(hours=dates.freq.n)

        # Clamp the final chunk to the requested window end.
        if np.datetime64(end_date_time) > last_date_time:
            end = end_time
        else:
            end = date_to_str(end_date_time)

        print(start + " : " + end)

        range_measurements = tahmo_api.get_measurements(start, end, station_codes)
        measurements.extend(range_measurements)

    # Guard clause: nothing to clean when the stations produced no data
    # (equivalent to the empty DataFrame's to_dict(orient="records")).
    if not measurements:
        return []

    measurements_df = pd.DataFrame(data=measurements)
    clean_measurements_df = remove_invalid_dates(
        dataframe=measurements_df, start_time=start_time, end_time=end_time
    )
    return clean_measurements_df.to_dict(orient="records")
Example #12
0
def get_forecast_data(tenant: str) -> list:
    """Fetch next-hour forecasts for a tenant's devices as insight records.

    Args:
        tenant: tenant whose devices are queried for forecasts.

    Returns:
        List of insight dicts (time, pm2_5, pm10, siteId, frequency,
        forecast, empty); rows lacking a pm2_5 prediction are dropped.
    """
    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant=tenant, all_devices=False)

    forecast_frames = []
    # Forecasts are requested for the hour after "now" (unix seconds).
    time = int((datetime.utcnow() + timedelta(hours=1)).timestamp())

    for device in devices:
        device_dict = dict(device)
        device_number = device_dict.get("device_number", None)
        site = device_dict.get("site", None)
        if not site:
            print(f"device {device_number} isn't attached to  a site.")
            continue
        site_id = site["_id"]

        if device_number:
            forecast = airqo_api.get_forecast(channel_id=device_number,
                                              timestamp=time)
            if forecast:
                forecast_df = pd.DataFrame(forecast)

                forecast_cleaned_df = pd.DataFrame(columns=insights_columns)
                forecast_cleaned_df["time"] = forecast_df["prediction_time"]
                forecast_cleaned_df["pm2_5"] = forecast_df["prediction_value"]
                forecast_cleaned_df["pm10"] = forecast_df["prediction_value"]
                forecast_cleaned_df["siteId"] = site_id
                forecast_cleaned_df["frequency"] = "hourly"
                forecast_cleaned_df["forecast"] = True
                forecast_cleaned_df["empty"] = False

                forecast_frames.append(forecast_cleaned_df)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    forecast_measurements = (
        pd.concat(forecast_frames, ignore_index=True)
        if forecast_frames
        else pd.DataFrame(data=[], columns=insights_columns)
    )

    forecast_measurements["time"] = forecast_measurements["time"].apply(
        predict_time_to_string
    )
    forecast_measurements = forecast_measurements[
        forecast_measurements["pm2_5"].notna()
    ]

    return forecast_measurements.to_dict(orient="records")
def map_site_ids_to_historical_measurements(data: list, deployment_logs: list) -> list:
    """Re-assign site_ids on historical measurements using deployment windows.

    For each measurement, the device's current site is used as the default;
    if a deployment log window contains the measurement's timestamp, the
    log's site_id wins.

    Args:
        data: measurement records, each with "device_id" and "time" keys.
        deployment_logs: records with device_id, site_id, start_time, end_time.

    Returns:
        The measurements with corrected "site_id" values; the input unchanged
        when either argument is empty.
    """
    if not deployment_logs or not data:
        return data

    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="airqo")

    mapped_data = []

    devices_logs_df = pd.DataFrame(deployment_logs)
    devices_logs_df["start_time"] = devices_logs_df["start_time"].apply(str_to_date)
    devices_logs_df["end_time"] = devices_logs_df["end_time"].apply(str_to_date)

    data = un_fill_nan(data)
    data_df = pd.DataFrame(data)

    for _, data_row in data_df.iterrows():
        device = get_device(devices, device_id=data_row["device_id"])

        if not device:
            continue

        # Guard against devices without an attached site (previously crashed
        # with AttributeError on .get("site").get("_id")).
        site_id = (device.get("site") or {}).get("_id")
        time = str_to_date(data_row["time"])
        device_logs = devices_logs_df[devices_logs_df["device_id"] == device.get("_id")]

        if not device_logs.empty:
            for _, log in device_logs.iterrows():
                if log["start_time"] <= time <= log["end_time"]:
                    site_id = log["site_id"]

        data_row["site_id"] = site_id

        mapped_data.append(data_row.to_dict())

    return mapped_data
Example #14
0
def query_insights_data(freq: str,
                        start_date_time: str,
                        end_date_time: str,
                        forecast=False,
                        all_data=False) -> list:
    """Query app insights over [start_date_time, end_date_time] in chunks.

    The window is split into sub-ranges (size chosen by get_frequency) and
    each sub-range is queried separately; failed sub-queries are logged and
    skipped.

    Args:
        freq: insights frequency passed through to the API.
        start_date_time: window start (date string).
        end_date_time: window end (date string).
        forecast: whether to include forecast insights.
        all_data: whether to request all data regardless of other filters.

    Returns:
        Combined list of insight records from all successful sub-queries.
    """
    airqo_api = AirQoApi()
    insights = []

    frequency = get_frequency(start_time=start_date_time,
                              end_time=end_date_time)
    dates = pd.date_range(start_date_time, end_date_time, freq=frequency)
    last_date_time = dates.values[-1]

    for date in dates:
        start = date_to_str(date)
        query_end_date_time = date + timedelta(hours=dates.freq.n)

        # Clamp the final chunk to the requested window end.
        if np.datetime64(query_end_date_time) > last_date_time:
            end = end_date_time
        else:
            end = date_to_str(query_end_date_time)

        try:
            api_results = airqo_api.get_app_insights(
                start_time=start,
                frequency=freq,
                end_time=end,
                forecast=forecast,
                all_data=all_data,
            )
            insights.extend(api_results)

        except Exception as ex:
            print(ex)
            traceback.print_exc()

    return insights
def extract_sites_meta_data(tenant=None) -> list:
    """Fetch site metadata for *tenant* and return records with site_-prefixed keys.

    Keeps a fixed set of metadata columns from the sites API and renames the
    site-specific ones with a ``site_`` prefix; administrative columns
    (country, region, ...) keep their original names.
    """
    api = AirQoApi()
    sites_frame = pd.DataFrame(api.get_sites(tenant=tenant))

    kept_columns = [
        "_id",
        "latitude",
        "tenant",
        "longitude",
        "name",
        "bearing_to_kampala_center",
        "landform_90",
        "distance_to_kampala_center",
        "altitude",
        "landform_270",
        "aspect",
        "description",
        "distance_to_nearest_tertiary_road",
        "distance_to_nearest_primary_road",
        "distance_to_nearest_road",
        "distance_to_nearest_residential_road",
        "distance_to_nearest_secondary_road",
        "distance_to_nearest_unclassified_road",
        "country",
        "region",
        "parish",
        "sub_county",
        "county",
        "district",
        "city",
    ]

    renamed_columns = {
        "_id": "site_id",
        "latitude": "site_latitude",
        "longitude": "site_longitude",
        "description": "site_description",
        "altitude": "site_altitude",
        "name": "site_name",
        "distance_to_nearest_tertiary_road": "site_distance_to_nearest_tertiary_road",
        "distance_to_nearest_primary_road": "site_distance_to_nearest_primary_road",
        "distance_to_nearest_road": "site_distance_to_nearest_road",
        "distance_to_nearest_residential_road": "site_distance_to_nearest_residential_road",
        "distance_to_nearest_secondary_road": "site_distance_to_nearest_secondary_road",
        "distance_to_nearest_unclassified_road": "site_distance_to_nearest_unclassified_road",
        "bearing_to_kampala_center": "site_bearing_to_kampala_center",
        "landform_90": "site_landform_90",
        "distance_to_kampala_center": "site_distance_to_kampala_center",
        "landform_270": "site_landform_270",
        "aspect": "site_aspect",
    }

    result = (
        sites_frame[kept_columns]
        .rename(columns=renamed_columns)
        .reset_index(drop=True)
    )
    return result.to_dict(orient="records")
def transform_kcca_data_for_message_broker(data: list, frequency: str) -> list:
    """Restructure raw KCCA measurements into message-broker event records.

    Rows whose deviceCode cannot be resolved to both a site and a device id
    are dropped. The s2 sensor fields are always None and device_number is
    always 0 for KCCA devices (no channel numbers).

    Args:
        data: raw KCCA measurement records (flattened, with dotted
            "characteristics.*" and "location.coordinates" keys).
        frequency: frequency label used to normalise each record's time.

    Returns:
        A list of event dicts keyed for the message broker.
    """
    restructured_data = []

    data_df = pd.DataFrame(data)
    columns = list(data_df.columns)

    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="kcca")

    for _, data_row in data_df.iterrows():
        device_name = data_row["deviceCode"]
        site_id, device_id = get_site_and_device_id(devices,
                                                    device_name=device_name)
        if not site_id and not device_id:
            continue

        # "location.coordinates" is "[lon, lat]"; strip brackets and split.
        location = str(data_row["location.coordinates"])
        location = location.replace("[", "").replace("]", "")
        location_coordinates = location.split(",")

        device_data = dict({
            "time":
            frequency_time(dateStr=data_row["time"], frequency=frequency),
            "tenant":
            "kcca",
            "site_id":
            site_id,
            "device_id":
            device_id,
            "device_number":
            0,
            "device":
            device_name,
            "latitude":
            location_coordinates[1],
            "longitude":
            location_coordinates[0],
            "pm2_5":
            get_column_value(
                column="characteristics.pm2_5ConcMass.value",
                columns=columns,
                series=data_row,
            ),
            "pm10":
            get_column_value(
                column="characteristics.pm10ConcMass.value",
                columns=columns,
                series=data_row,
            ),
            "s1_pm2_5":
            get_column_value(
                column="characteristics.pm2_5ConcMass.raw",
                columns=columns,
                series=data_row,
            ),
            "s1_pm10":
            get_column_value(
                column="characteristics.pm10ConcMass.raw",
                columns=columns,
                series=data_row,
            ),
            "s2_pm2_5":
            None,
            "s2_pm10":
            None,
            "pm2_5_calibrated_value":
            get_column_value(
                column="characteristics.pm2_5ConcMass.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "pm10_calibrated_value":
            get_column_value(
                column="characteristics.pm10ConcMass.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "altitude":
            get_column_value(
                column="characteristics.altitude.value",
                columns=columns,
                series=data_row,
            ),
            "wind_speed":
            get_column_value(
                column="characteristics.windSpeed.value",
                columns=columns,
                series=data_row,
            ),
            "external_temperature":
            get_column_value(
                column="characteristics.temperature.value",
                columns=columns,
                series=data_row,
            ),
            "external_humidity":
            get_column_value(
                column="characteristics.relHumid.value",
                columns=columns,
                series=data_row,
            ),
        })

        restructured_data.append(device_data)

    return restructured_data
def transform_kcca_hourly_data_for_bigquery(data: list) -> list:
    """Restructure raw KCCA measurements into BigQuery hourly-table rows.

    Rows whose deviceCode cannot be resolved to a site are dropped. The final
    records are re-ordered/padded against the BigQuery hourly measurements
    column list, so missing fields appear as NaN columns.

    Args:
        data: raw KCCA measurement records (flattened, with dotted
            "characteristics.*" and "location.coordinates" keys).

    Returns:
        A list of row dicts matching BigQueryApi().hourly_measurements_columns.
    """
    restructured_data = []

    data_df = pd.DataFrame(data)
    columns = list(data_df.columns)

    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="kcca")

    for _, data_row in data_df.iterrows():
        device_name = data_row["deviceCode"]
        site_id, _ = get_site_and_device_id(devices, device_name=device_name)
        if not site_id:
            continue

        # "location.coordinates" is "[lon, lat]"; strip brackets and split.
        location = str(data_row["location.coordinates"])
        location = location.replace("[", "").replace("]", "")
        location_coordinates = location.split(",")

        device_data = dict({
            "timestamp":
            str_to_date(data_row["time"]),
            "tenant":
            "kcca",
            "site_id":
            site_id,
            "device_number":
            0,
            "device":
            device_name,
            "latitude":
            location_coordinates[1],
            "longitude":
            location_coordinates[0],
            "pm2_5":
            get_column_value(
                column="characteristics.pm2_5ConcMass.value",
                columns=columns,
                series=data_row,
            ),
            "s1_pm2_5":
            get_column_value(column="s1_pm2_5",
                             columns=columns,
                             series=data_row),
            "s2_pm2_5":
            get_column_value(column="s2_pm2_5",
                             columns=columns,
                             series=data_row),
            "pm2_5_raw_value":
            get_column_value(
                column="characteristics.pm2_5ConcMass.raw",
                columns=columns,
                series=data_row,
            ),
            "pm2_5_calibrated_value":
            get_column_value(
                column="characteristics.pm2_5ConcMass.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "pm10":
            get_column_value(
                column="characteristics.pm10ConcMass.value",
                columns=columns,
                series=data_row,
            ),
            "s1_pm10":
            get_column_value(column="s1_pm10",
                             columns=columns,
                             series=data_row),
            "s2_pm10":
            get_column_value(column="s2_pm10",
                             columns=columns,
                             series=data_row),
            "pm10_raw_value":
            get_column_value(
                column="characteristics.pm10ConcMass.raw",
                columns=columns,
                series=data_row,
            ),
            "pm10_calibrated_value":
            get_column_value(
                column="characteristics.pm10ConcMass.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "no2":
            get_column_value(
                column="characteristics.no2Conc.value",
                columns=columns,
                series=data_row,
            ),
            "no2_raw_value":
            get_column_value(
                column="characteristics.no2Conc.raw",
                columns=columns,
                series=data_row,
            ),
            "no2_calibrated_value":
            get_column_value(
                column="characteristics.no2Conc.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "pm1":
            get_column_value(
                column="characteristics.pm1ConcMass.value",
                columns=columns,
                series=data_row,
            ),
            "pm1_raw_value":
            get_column_value(
                column="characteristics.pm1ConcMass.raw",
                columns=columns,
                series=data_row,
            ),
            "pm1_calibrated_value":
            get_column_value(
                column="characteristics.pm1ConcMass.calibratedValue",
                columns=columns,
                series=data_row,
            ),
            "altitude":
            get_column_value(
                column="characteristics.altitude.value",
                columns=columns,
                series=data_row,
            ),
            "wind_speed":
            get_column_value(
                column="characteristics.windSpeed.value",
                columns=columns,
                series=data_row,
            ),
            "external_temperature":
            get_column_value(
                column="characteristics.temperature.value",
                columns=columns,
                series=data_row,
            ),
            "external_humidity":
            get_column_value(
                column="characteristics.relHumid.value",
                columns=columns,
                series=data_row,
            ),
        })

        restructured_data.append(device_data)

    # Conform records to the BigQuery schema column order.
    return pd.DataFrame(columns=BigQueryApi().hourly_measurements_columns,
                        data=restructured_data).to_dict(orient="records")
Example #18
0
def resample_weather_data(data: list, frequency: str) -> list:
    """Resample raw TAHMO station readings and fan them out per AirQo device.

    The input is long-format: each record carries a ``variable`` code
    (``te`` = temperature, ``rh`` = relative humidity, ``ws`` = wind speed),
    a ``station``, a ``time`` and a ``value``.  Per station, each variable is
    resampled to ``frequency`` via ``resample_data``, the three series are
    outer-merged on ``time``, and the merged rows are duplicated once for
    every device mapped to that station.

    :param data: list of raw weather measurement dicts.
    :param frequency: resampling frequency, passed through to ``resample_data``
        and stamped onto every output record.
    :return: list of per-device weather record dicts; ``[]`` for empty input.
    """
    weather_raw_data = pd.DataFrame(data)
    if weather_raw_data.empty:
        # Nothing to resample; an empty frame serialises to an empty list.
        return weather_raw_data.to_dict(orient="records")

    airqo_api = AirQoApi()
    sites = airqo_api.get_sites(tenant="airqo")
    # Only sites that declare a nearest TAHMO station can be mapped to devices.
    valid_sites = list(
        filter(lambda x: "nearest_tahmo_station" in dict(x).keys(), sites))

    # to include site id
    # devices = get_devices_or_sites(configuration.AIRQO_BASE_URL, tenant='airqo', sites=False)

    # Split the long-format frame into one sub-frame per weather variable.
    temperature = weather_raw_data.loc[
        weather_raw_data["variable"] == "te",
        ["value", "variable", "station", "time"]]
    humidity = weather_raw_data.loc[weather_raw_data["variable"] == "rh",
                                    ["value", "variable", "station", "time"]]
    wind_speed = weather_raw_data.loc[weather_raw_data["variable"] == "ws",
                                      ["value", "variable", "station", "time"]]

    # NOTE(review): rh is scaled x100 here — presumably the feed reports a
    # fraction and downstream expects percent; confirm against the TAHMO API.
    humidity["value"] = pd.to_numeric(humidity["value"], errors="coerce")
    humidity["value"] = humidity["value"].apply(lambda x: x * 100)

    # Re-combine so numeric coercion / NaN-fill is applied uniformly.
    data = pd.concat([temperature, humidity, wind_speed])
    data.reset_index(inplace=True)
    devices_weather_data = []

    data["value"] = pd.to_numeric(data["value"],
                                  errors="coerce",
                                  downcast="float")
    # NOTE(review): unparseable/missing readings become 0.0 here, which is
    # indistinguishable from a true zero reading downstream — confirm intended.
    data = data.fillna(0)

    data_station_gps = data.groupby("station")

    for _, station_group in data_station_gps:

        device_weather_data = []
        station = station_group.iloc[0]["station"]

        try:

            # resampling station values
            # assumes resample_data returns its columns in [value, time]
            # order — the rename below relies on that; TODO confirm.
            temperature = resample_data(
                station_group.loc[station_group["variable"] == "te",
                                  ["value", "time"]],
                frequency,
            )
            temperature.columns = ["temperature", "time"]
            humidity = resample_data(
                station_group.loc[station_group["variable"] == "rh",
                                  ["value", "time"]],
                frequency,
            )
            humidity.columns = ["humidity", "time"]
            wind_speed = resample_data(
                station_group.loc[station_group["variable"] == "ws",
                                  ["value", "time"]],
                frequency,
            )
            wind_speed.columns = ["wind_speed", "time"]

            data_frames = [temperature, humidity, wind_speed]

            # Outer-merge on time so a timestamp present in any one variable
            # still yields an output row.
            station_df = reduce(
                lambda left, right: pd.merge(
                    left, right, on=["time"], how="outer"),
                data_frames,
            )
            station_df["frequency"] = frequency

            # mapping device to station
            station_devices = get_device_ids_from_station(station, valid_sites)

            if len(station_devices) == 0:
                continue

            # Duplicate the station's merged rows for every mapped device.
            for device_id in station_devices:
                device_station_df = station_df.copy(deep=True)
                device_station_df["device_id"] = device_id
                device_weather_data.extend(
                    device_station_df.to_dict(orient="records"))

        except Exception as ex:
            # Best-effort per station: log and move on to the next one.
            print(ex)
            traceback.print_exc()
            continue

        # to include site id
        # device_station_data_df = pd.DataFrame(device_weather_data)
        # device_station_data_df['site_id'] = device_station_data_df['device_id'].apply(
        #     lambda x: get_device_site_id(x, devices))
        # devices_weather_data.extend(device_station_data_df.to_dict(orient='records'))

        devices_weather_data.extend(device_weather_data)

    # pd.DataFrame(devices_weather_data).to_csv(path_or_buf='devices_weather.csv', index=False)

    return devices_weather_data
def transform_kcca_measurements_for_api(unclean_data) -> list:
    """Restructure raw KCCA readings into the AirQo API measurement shape.

    Groups the raw records by ``deviceCode``, resolves each device's AirQo
    ``site_id``/``device_id`` (skipping devices that resolve to neither), and
    emits one nested measurement dict per raw row, with pollutant values pulled
    through ``get_valid_column_value``.

    :param unclean_data: iterable of raw KCCA measurement records.
    :return: list of measurement dicts ready for the AirQo API.
    """
    measurements = pd.DataFrame(unclean_data)
    airqo_api = AirQoApi()
    kcca_devices = airqo_api.get_devices(tenant="kcca")

    cleaned_measurements = []

    for _, device_group in measurements.groupby("deviceCode"):
        device_name = device_group.iloc[0]["deviceCode"]

        site_id, device_id = get_site_and_device_id(kcca_devices,
                                                    device_name=device_name)
        if not site_id and not device_id:
            # Device is unknown to the AirQo registry; drop its readings.
            continue

        column_names = device_group.columns

        for _, row in device_group.iterrows():

            def reading(column_name, data_name):
                # Thin wrapper so each measurement entry below stays compact.
                return get_valid_column_value(
                    column_name=column_name,
                    series=row,
                    columns_names=column_names,
                    data_name=data_name,
                )

            # "location.coordinates" arrives as "[lon, lat]"; strip brackets.
            coordinates = (str(row["location.coordinates"])
                           .replace("[", "")
                           .replace("]", "")
                           .split(","))

            # Normalise the device's output frequency to AirQo vocabulary.
            frequency = {
                "hour": "hourly",
                "day": "daily",
            }.get(str(row.get("outputFrequency", "raw")).lower(), "raw")

            cleaned_measurements.append({
                "frequency": frequency,
                "time": frequency_time(dateStr=row.get("time"),
                                       frequency=frequency),
                "tenant": "kcca",
                "site_id": site_id,
                "device_id": device_id,
                "device": row["deviceCode"],
                "location": {
                    "longitude": {"value": to_double(coordinates[0])},
                    "latitude": {"value": to_double(coordinates[1])},
                },
                "pm2_5": {
                    "value":
                    reading("characteristics.pm2_5ConcMass.raw", "pm2_5"),
                    "calibratedValue":
                    reading("characteristics.pm2_5ConcMass.value", "pm2_5"),
                },
                "pm1": {
                    "value":
                    reading("characteristics.pm1ConcMass.raw", None),
                    "calibratedValue":
                    reading("characteristics.pm1ConcMass.value", None),
                },
                "pm10": {
                    "value":
                    reading("characteristics.pm10ConcMass.raw", "pm10"),
                    "calibratedValue":
                    reading("characteristics.pm10ConcMass.value", "pm10"),
                },
                "externalTemperature": {
                    "value":
                    reading("characteristics.temperature.value",
                            "externalTemperature"),
                },
                "externalHumidity": {
                    "value":
                    reading("characteristics.relHumid.value",
                            "externalHumidity"),
                },
                "no2": {
                    "value":
                    reading("characteristics.no2Conc.raw", None),
                    "calibratedValue":
                    reading("characteristics.no2Conc.value", None),
                },
                "speed": {
                    "value":
                    reading("characteristics.windSpeed.value", None),
                },
            })

    return cleaned_measurements
def extract_airqo_data_from_thingspeak(
    start_time: str, end_time: str, all_devices: bool
) -> list:
    """Download raw AirQo device feeds from ThingSpeak for a time window.

    The window is split into sub-ranges (sized by ``get_frequency``); each
    device's channel is fetched per sub-range, field values are validated via
    ``get_valid_value``, and the packed ``field8`` string is unpacked into its
    individual readings.  Per-device failures are logged and skipped.

    :param start_time: window start (string, format expected by ``date_to_str``
        and ``remove_invalid_dates``).
    :param end_time: window end.
    :param all_devices: forwarded to ``AirQoApi.get_devices``.
    :return: list of raw measurement dicts within [start_time, end_time].
    """
    thingspeak_base_url = configuration.THINGSPEAK_CHANNEL_URL

    airqo_api = AirQoApi()
    airqo_devices = airqo_api.get_devices(tenant="airqo", all_devices=all_devices)
    read_keys = airqo_api.get_read_keys(devices=airqo_devices)

    channels_data = []

    frequency = get_frequency(start_time=start_time, end_time=end_time)

    def get_field_8_value(x: str, position: int):
        # field8 packs several comma-separated readings into one string;
        # return the item at `position`, or None when the value is missing,
        # not a string, or too short.
        try:
            values = x.split(",")
            return values[position]
        except Exception as exc:
            print(exc)
            return None

    # Direct ThingSpeak fields: (source field, output column, validator name).
    simple_field_mapping = [
        ("field1", "s1_pm2_5", "pm2_5"),
        ("field2", "s1_pm10", "pm10"),
        ("field3", "s2_pm2_5", "pm2_5"),
        ("field4", "s2_pm10", "pm10"),
        ("field5", "latitude", "latitude"),
        ("field6", "longitude", "longitude"),
        ("field7", "battery", "battery"),
    ]

    # field8 sub-values: (output column, position within field8, validator
    # name for get_valid_value).  Note internal temperature/humidity reuse the
    # external validators, as in the original per-column code.
    field_8_mapping = [
        ("latitude", 0, "latitude"),
        ("longitude", 1, "longitude"),
        ("altitude", 2, "altitude"),
        ("wind_speed", 3, "wind_speed"),
        ("satellites", 4, "satellites"),
        ("hdop", 5, "hdop"),
        ("internalTemperature", 6, "externalTemperature"),
        ("internalHumidity", 7, "externalHumidity"),
        ("temperature", 8, "externalTemperature"),
        ("humidity", 9, "externalHumidity"),
        ("pressure", 10, "pressure"),
        ("externalAltitude", 11, "altitude"),
    ]

    dates = pd.date_range(start_time, end_time, freq=frequency)
    last_date_time = dates.values[-1]
    for device in airqo_devices:
        try:

            channel_id = str(device["device_number"])

            for date in dates:

                start = date_to_str(date)
                end_date_time = date + timedelta(hours=dates.freq.n)

                # Clamp the final sub-range to the requested window end.
                if np.datetime64(end_date_time) > last_date_time:
                    end = end_time
                else:
                    end = date_to_str(end_date_time)

                read_key = read_keys[str(channel_id)]

                channel_url = f"{thingspeak_base_url}{channel_id}/feeds.json?start={start}&end={end}&api_key={read_key}"
                print(f"{channel_url}")

                data = json.loads(
                    requests.get(channel_url, timeout=100.0).content.decode("utf-8")
                )
                if (data != -1) and ("feeds" in data):
                    dataframe = pd.DataFrame(data["feeds"])

                    if dataframe.empty:
                        print(
                            f"{channel_id} does not have data between {start} and {end}"
                        )
                        continue

                    channel_df = pd.DataFrame(
                        data=[],
                        columns=[
                            "time",
                            "s1_pm2_5",
                            "s2_pm2_5",
                            "s1_pm10",
                            "device_id",
                            "site_id",
                            "s2_pm10",
                            "latitude",
                            "longitude",
                            "altitude",
                            "wind_speed",
                            "satellites",
                            "hdop",
                            "internalTemperature",
                            "internalHumidity",
                            "battery",
                            "temperature",
                            "humidity",
                            "pressure",
                            "externalAltitude",
                        ],
                    )

                    # Default values (lambda keyword args) bind per iteration,
                    # avoiding Python's late-binding closure pitfall.
                    for field, column, validator in simple_field_mapping:
                        channel_df[column] = dataframe[field].apply(
                            lambda x, name=validator: get_valid_value(x, name)
                        )

                    if "field8" in dataframe.columns:
                        try:
                            # field8, when present, overrides the GPS columns
                            # set from field5/field6 above.
                            for column, position, validator in field_8_mapping:
                                channel_df[column] = dataframe["field8"].apply(
                                    lambda x, pos=position, name=validator:
                                    get_valid_value(
                                        get_field_8_value(x, pos), name
                                    )
                                )

                        except Exception as ex:
                            traceback.print_exc()
                            print(ex)

                    channel_df["time"] = dataframe["created_at"]
                    channel_df["device_id"] = device["_id"]
                    channel_df["site_id"] = device["site"]["_id"]
                    channel_df["device_number"] = device["device_number"]
                    channel_df["device"] = device["name"]
                    channel_df["frequency"] = "raw"

                    channels_data.extend(channel_df.to_dict(orient="records"))

        except Exception as ex:
            # Best-effort per device: log and continue with the next one.
            print(ex)
            traceback.print_exc()

    channel_data_df = pd.DataFrame(channels_data)
    clean_channel_data_df = remove_invalid_dates(
        dataframe=channel_data_df, start_time=start_time, end_time=end_time
    )
    return clean_channel_data_df.to_dict(orient="records")