コード例 #1
0
def query_hourly_measurements(start_date_time: str,
                              end_date_time: str) -> list:
    """Fetch hourly device measurements from BigQuery as a list of records.

    Device-location columns are renamed (latitude -> device_latitude,
    longitude -> device_longitude, device -> device_name) before returning.

    :param start_date_time: start of the query window.
    :param end_date_time: end of the query window.
    :return: list of row dicts; empty list when the query yields no rows.
    """
    api = BigQueryApi()

    measurements = api.get_hourly_data(
        start_date_time=start_date_time,
        end_date_time=end_date_time,
        columns=hourly_measurements_columns,
        table=api.hourly_measurements_table,
    )

    # Guarantee the expected column set even when the query returns nothing.
    if measurements.empty:
        measurements = pd.DataFrame(data=[], columns=hourly_measurements_columns)

    renamed = measurements.rename(
        columns={
            "latitude": "device_latitude",
            "longitude": "device_longitude",
            "device": "device_name",
        }
    )

    return renamed.to_dict(orient="records")
コード例 #2
0
    def load(kcca_data: dict, **kwargs):
        """Load transformed KCCA measurements into the configured destination.

        The destination is read from the Airflow ``dag_run`` configuration
        (``dag_run.conf["destination"]``) and defaults to ``"bigquery"``.
        Valid destinations: ``bigquery``, ``message-broker``, ``api``.

        :param kcca_data: dict with a ``"data"`` key holding raw measurements
            (NaN placeholders are removed via ``un_fill_nan``).
        :param kwargs: Airflow task context; ``dag_run`` is used when present.
        :raises Exception: when the configured destination is not recognised.
        """
        from airqo_etl_utils.kcca_utils import (
            transform_kcca_measurements_for_api,
            transform_kcca_hourly_data_for_bigquery,
            transform_kcca_data_for_message_broker,
        )
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.airqo_api import AirQoApi
        from airqo_etl_utils.message_broker import KafkaBrokerClient
        from airqo_etl_utils.bigquery_api import BigQueryApi
        from airqo_etl_utils.config import configuration

        data = un_fill_nan(kcca_data.get("data"))

        # Bug fix: kwargs.get("dag_run") returns None when the key is absent,
        # and None.conf raises AttributeError, which the original
        # `except KeyError` did not catch. Catch all three failure modes and
        # fall back to the bigquery default.
        dag_run = kwargs.get("dag_run")
        try:
            destination = dag_run.conf["destination"]
        except (AttributeError, KeyError, TypeError):
            destination = "bigquery"

        if destination == "bigquery":

            kcca_transformed_data = transform_kcca_hourly_data_for_bigquery(
                data)

            big_query_api = BigQueryApi()
            big_query_api.save_data(
                data=kcca_transformed_data,
                table=big_query_api.hourly_measurements_table,
            )

        elif destination == "message-broker":

            kcca_transformed_data = transform_kcca_data_for_message_broker(
                data=data, frequency="hourly")

            # Envelope expected by the measurements Kafka topic.
            info = {
                "data": kcca_transformed_data,
                "action": "insert",
                "tenant": "kcca"
            }

            kafka = KafkaBrokerClient()
            kafka.send_data(info=info,
                            topic=configuration.HOURLY_MEASUREMENTS_TOPIC)

        elif destination == "api":
            kcca_transformed_data = transform_kcca_measurements_for_api(data)
            airqo_api = AirQoApi()
            airqo_api.save_events(measurements=kcca_transformed_data,
                                  tenant="kcca")

        else:
            raise Exception(
                "Invalid data destination. Valid values are bigquery, message-broker and api"
            )
コード例 #3
0
    def send_hourly_measurements_to_bigquery(kcca_data: dict):
        """Persist hourly KCCA measurements to the BigQuery hourly table.

        :param kcca_data: dict with a ``"data"`` key holding raw measurements.
        """
        from airqo_etl_utils.kcca_utils import transform_kcca_hourly_data_for_bigquery
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.bigquery_api import BigQueryApi

        measurements = un_fill_nan(kcca_data.get("data"))
        bigquery_rows = transform_kcca_hourly_data_for_bigquery(measurements)

        api = BigQueryApi()
        api.save_data(data=bigquery_rows, table=api.hourly_measurements_table)
コード例 #4
0
    def send_raw_measurements_to_bigquery(airqo_data: dict):
        """Persist raw AirQo measurements to BigQuery.

        :param airqo_data: dict with a ``"data"`` key holding raw measurements.
        """
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.airqo_utils import restructure_airqo_data
        from airqo_etl_utils.bigquery_api import BigQueryApi

        measurements = un_fill_nan(airqo_data.get("data"))
        bigquery_rows = restructure_airqo_data(
            data=measurements, destination="bigquery"
        )

        api = BigQueryApi()
        api.save_raw_measurements(bigquery_rows)
コード例 #5
0
def query_hourly_weather_data(start_date_time: str,
                              end_date_time: str) -> list:
    """Fetch hourly weather measurements from BigQuery as a list of records.

    :param start_date_time: start of the query window.
    :param end_date_time: end of the query window.
    :return: list of row dicts; empty list when the query yields no rows.
    """
    api = BigQueryApi()

    weather = api.get_hourly_data(
        start_date_time=start_date_time,
        end_date_time=end_date_time,
        columns=hourly_weather_columns,
        table=api.hourly_weather_table,
    )

    # An empty frame with the expected columns produces an empty record list,
    # matching the non-empty path's output shape.
    if weather.empty:
        weather = pd.DataFrame(data=[], columns=hourly_weather_columns)

    return weather.to_dict(orient="records")
コード例 #6
0
    def save_to_bigquery(inputs: dict):
        """Persist hourly weather measurements to the BigQuery weather table.

        :param inputs: dict with a ``"data"`` key holding raw weather records.
        """
        from airqo_etl_utils.bigquery_api import BigQueryApi
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.weather_data_utils import (
            transform_weather_data_for_bigquery,
        )

        raw_records = un_fill_nan(inputs.get("data"))
        bigquery_rows = transform_weather_data_for_bigquery(data=raw_records)

        api = BigQueryApi()
        api.save_data(data=bigquery_rows, table=api.hourly_weather_table)
コード例 #7
0
    def load(airqo_data: dict, **kwargs):
        """Load restructured AirQo measurements into the configured destination.

        The destination is read from the Airflow ``dag_run`` configuration
        (``dag_run.conf["destination"]``) and defaults to ``"bigquery"``.
        Valid destinations: ``bigquery``, ``message-broker``, ``api``.

        :param airqo_data: dict with a ``"data"`` key holding raw measurements
            (NaN placeholders are removed via ``un_fill_nan``).
        :param kwargs: Airflow task context; ``dag_run`` is used when present.
        :raises Exception: when the configured destination is not recognised.
        """
        from airqo_etl_utils.commons import un_fill_nan
        from airqo_etl_utils.bigquery_api import BigQueryApi
        from airqo_etl_utils.airqo_api import AirQoApi
        from airqo_etl_utils.airqo_utils import restructure_airqo_data
        from airqo_etl_utils.config import configuration
        from airqo_etl_utils.message_broker import KafkaBrokerClient

        data = un_fill_nan(airqo_data.get("data"))

        # Bug fix: kwargs.get("dag_run") returns None when the key is absent,
        # and None.conf raises AttributeError, which the original
        # `except KeyError` did not catch. Catch all three failure modes and
        # fall back to the bigquery default.
        dag_run = kwargs.get("dag_run")
        try:
            destination = dag_run.conf["destination"]
        except (AttributeError, KeyError, TypeError):
            destination = "bigquery"

        if destination == "bigquery":
            airqo_restructured_data = restructure_airqo_data(
                data=data, destination="bigquery"
            )
            big_query_api = BigQueryApi()
            big_query_api.save_data(
                data=airqo_restructured_data,
                table=big_query_api.hourly_measurements_table,
            )

        elif destination == "message-broker":
            airqo_restructured_data = restructure_airqo_data(
                data=data, destination="message-broker"
            )

            # Envelope expected by the measurements Kafka topic.
            info = {
                "data": airqo_restructured_data,
                "action": "insert",
                "tenant": "airqo",
            }
            kafka = KafkaBrokerClient()
            kafka.send_data(info=info, topic=configuration.HOURLY_MEASUREMENTS_TOPIC)
        elif destination == "api":
            airqo_restructured_data = restructure_airqo_data(
                data=data, destination="api"
            )
            airqo_api = AirQoApi()
            airqo_api.save_events(measurements=airqo_restructured_data, tenant="airqo")
        else:
            raise Exception(
                "Invalid data destination. Valid values are bigquery, message-broker and api"
            )
コード例 #8
0
ファイル: main.py プロジェクト: airqo-platform/AirQo-api
def data_warehouse(start_date_time: str, end_date_time: str):
    """Build the analytics data-warehouse CSV for the given time window.

    Queries device measurements, weather data, and site metadata; merges
    them; validates the merged frame against the BigQuery analytics schema;
    and writes each intermediate stage plus the final result to CSV files
    in the working directory.

    :param start_date_time: start of the query window.
    :param end_date_time: end of the query window.
    """
    from airqo_etl_utils.data_warehouse_utils import (
        query_hourly_measurements,
        query_hourly_weather_data,
        extract_sites_meta_data,
        merge_measurements_weather_sites,
    )
    from airqo_etl_utils.bigquery_api import BigQueryApi

    def _dump(records, file_name: str):
        # Snapshot an intermediate stage to CSV for inspection.
        pd.DataFrame(records).to_csv(path_or_buf=file_name, index=False)

    device_measurements = query_hourly_measurements(
        start_date_time=start_date_time,
        end_date_time=end_date_time,
    )
    _dump(device_measurements, "hourly_device_measurements.csv")

    weather_measurements = query_hourly_weather_data(
        start_date_time=start_date_time,
        end_date_time=end_date_time,
    )
    _dump(weather_measurements, "hourly_weather_measurements.csv")

    sites = extract_sites_meta_data()
    _dump(sites, "sites_meta_data.csv")

    merged = merge_measurements_weather_sites(
        measurements_data=device_measurements,
        weather_data=weather_measurements,
        sites=sites,
    )

    api = BigQueryApi()
    validated = api.validate_data(
        dataframe=pd.DataFrame(merged),
        columns=api.analytics_columns,
        numeric_columns=api.analytics_numeric_columns,
        table=api.analytics_table,
    )
    validated.to_csv(path_or_buf="data_warehouse.csv", index=False)
コード例 #9
0
def create_insights_data_from_bigquery(start_date_time: str,
                                       end_date_time: str) -> list:
    """Build insights records from hourly BigQuery measurements.

    Each record is flagged as actual (non-forecast, non-empty) hourly data,
    with ``site_id``/``timestamp`` renamed to the ``siteId``/``time`` keys
    expected by the insights consumers.

    :param start_date_time: start of the query window.
    :param end_date_time: end of the query window.
    :return: list of insight dicts.
    """
    from airqo_etl_utils.bigquery_api import BigQueryApi

    api = BigQueryApi()

    insights = api.get_hourly_data(
        start_date_time=start_date_time,
        end_date_time=end_date_time,
        columns=["pm2_5", "pm10", "site_id", "timestamp"],
        table=api.hourly_measurements_table,
    )

    # Mark every row as real (not forecast), populated, hourly data.
    insights = insights.assign(forecast=False, empty=False, frequency="hourly")
    insights = insights.rename(columns={"site_id": "siteId",
                                        "timestamp": "time"})
    return insights.to_dict(orient="records")
コード例 #10
0
ファイル: main.py プロジェクト: airqo-platform/AirQo-api
def kcca_hourly_measurements(start_date_time: str, end_date_time: str):
    """Run the KCCA hourly pipeline and dump each destination payload to CSV.

    Extracts raw KCCA measurements, transforms them for each destination
    (API, message broker, BigQuery), validates the BigQuery payload against
    the hourly measurements schema, and writes every stage to a CSV file.

    :param start_date_time: start of the extraction window.
    :param end_date_time: end of the extraction window.
    """
    from airqo_etl_utils.kcca_utils import (
        extract_kcca_measurements,
        transform_kcca_measurements_for_api,
        transform_kcca_data_for_message_broker,
        transform_kcca_hourly_data_for_bigquery,
    )
    from airqo_etl_utils.bigquery_api import BigQueryApi

    def _dump(records, file_name: str):
        # Snapshot an intermediate stage to CSV for inspection.
        pd.DataFrame(records).to_csv(path_or_buf=file_name, index=False)

    raw_data = extract_kcca_measurements(start_time=start_date_time,
                                         end_time=end_date_time,
                                         freq="hourly")
    # NOTE(review): only this snapshot targets an "outputs/" directory while
    # the others write to the working directory — looks inconsistent; confirm
    # whether the prefix is intentional before changing it.
    _dump(raw_data, "outputs/kcca_unclean_data.csv")

    # API
    api_data = transform_kcca_measurements_for_api(raw_data)
    _dump(api_data, "kcca_cleaned_data.csv")

    # Message Broker
    broker_data = transform_kcca_data_for_message_broker(
        raw_data, frequency="hourly")
    _dump(broker_data, "kcca_message_broker_data.csv")

    # Big Query
    api = BigQueryApi()
    validated = api.validate_data(
        dataframe=pd.DataFrame(
            transform_kcca_hourly_data_for_bigquery(data=raw_data)),
        columns=api.hourly_measurements_columns,
        numeric_columns=api.hourly_measurements_numeric_columns,
        table=api.hourly_measurements_table,
    )
    validated.to_csv(path_or_buf="kcca_data_for_bigquery.csv", index=False)
コード例 #11
0
ファイル: main.py プロジェクト: airqo-platform/AirQo-api
def weather_data(start_date_time: str, end_date_time: str):
    """Run the weather pipeline and dump each stage to CSV.

    Queries raw TAHMO weather data, resamples it to hourly frequency,
    attaches site information, validates the BigQuery payload against the
    hourly weather schema, and writes every stage to a CSV file.

    :param start_date_time: start of the query window.
    :param end_date_time: end of the query window.
    """
    from airqo_etl_utils.weather_data_utils import (
        resample_weather_data,
        query_weather_data_from_tahmo,
        add_site_info_to_weather_data,
        transform_weather_data_for_bigquery,
    )
    from airqo_etl_utils.bigquery_api import BigQueryApi

    def _dump(records, file_name: str):
        # Snapshot an intermediate stage to CSV for inspection.
        pd.DataFrame(records).to_csv(path_or_buf=file_name, index=False)

    raw_data = query_weather_data_from_tahmo(
        start_date_time=start_date_time, end_date_time=end_date_time)
    _dump(raw_data, "raw_weather_data.csv")

    hourly_data = resample_weather_data(data=raw_data, frequency="hourly")
    _dump(hourly_data, "hourly_weather_data.csv")

    sites_data = add_site_info_to_weather_data(data=hourly_data)
    _dump(sites_data, "sites_weather_data.csv")

    api = BigQueryApi()
    validated = api.validate_data(
        dataframe=pd.DataFrame(
            transform_weather_data_for_bigquery(data=sites_data)),
        columns=api.hourly_weather_columns,
        numeric_columns=api.hourly_weather_numeric_columns,
        table=api.hourly_weather_table,
    )
    validated.to_csv(path_or_buf="bigquery_weather_data.csv", index=False)
コード例 #12
0
ファイル: main.py プロジェクト: airqo-platform/AirQo-api
def airqo_hourly_measurements(start_date_time: str, end_date_time: str):
    """Run the AirQo hourly pipeline and dump each stage to CSV.

    Extracts raw ThingSpeak data, averages it hourly, merges in TAHMO
    weather data, calibrates the merged measurements, restructures the
    result for each destination (API, BigQuery, message broker), and writes
    every stage to a CSV file. The BigQuery payload is additionally
    validated against the hourly measurements schema.

    :param start_date_time: start of the extraction window.
    :param end_date_time: end of the extraction window.
    """
    from airqo_etl_utils.airqo_utils import (
        extract_airqo_data_from_thingspeak,
        average_airqo_data,
        extract_airqo_weather_data_from_tahmo,
        merge_airqo_and_weather_data,
        calibrate_hourly_airqo_measurements,
        restructure_airqo_data,
    )
    from airqo_etl_utils.bigquery_api import BigQueryApi

    def _dump(records, file_name: str):
        # Snapshot an intermediate stage to CSV for inspection.
        pd.DataFrame(records).to_csv(path_or_buf=file_name, index=False)

    # extract_airqo_data
    raw_data = extract_airqo_data_from_thingspeak(
        start_time=start_date_time, end_time=end_date_time, all_devices=False)
    _dump(raw_data, "raw_airqo_data.csv")

    averaged_data = average_airqo_data(data=raw_data, frequency="hourly")
    _dump(averaged_data, "averaged_airqo_data.csv")

    # extract_weather_data
    weather = extract_airqo_weather_data_from_tahmo(
        start_time=start_date_time, end_time=end_date_time)
    _dump(weather, "tahmo_weather_data.csv")

    # merge_data
    merged_data = merge_airqo_and_weather_data(
        airqo_data=averaged_data, weather_data=weather)
    _dump(merged_data, "merged_airqo_data.csv")

    # calibrate data
    calibrated_data = calibrate_hourly_airqo_measurements(
        measurements=merged_data)
    _dump(calibrated_data, "calibrated_airqo_data.csv")

    # restructure data for api
    _dump(restructure_airqo_data(data=calibrated_data, destination="api"),
          "airqo_data_for_api.csv")

    # restructure data for bigquery (validated against the table schema)
    api = BigQueryApi()
    validated = api.validate_data(
        dataframe=pd.DataFrame(
            restructure_airqo_data(data=calibrated_data,
                                   destination="bigquery")),
        columns=api.hourly_measurements_columns,
        numeric_columns=api.hourly_measurements_numeric_columns,
        table=api.hourly_measurements_table,
    )
    validated.to_csv(path_or_buf="airqo_data_for_bigquery.csv", index=False)

    # restructure data for message broker
    _dump(restructure_airqo_data(data=calibrated_data,
                                 destination="message-broker"),
          "airqo_data_for_message_broker.csv")
コード例 #13
0
def restructure_airqo_data_for_bigquery(data: list) -> list:
    """Restructure AirQo measurement records for the BigQuery hourly table.

    Required fields (``time``, ``site_id``, ``device_number``, ``device``)
    are read directly from each row; all other fields are looked up
    defensively via ``get_column_value`` so missing columns do not raise.

    :param data: list of measurement dicts (one per device/hour).
    :return: list of dicts aligned with the BigQuery hourly measurements
        column set.
    """
    data_df = pd.DataFrame(data)
    columns = list(data_df.columns)

    # Output field -> source column for every defensively-read value.
    optional_fields = {
        "latitude": "latitude",
        "longitude": "longitude",
        "pm2_5": "pm2_5",
        "s1_pm2_5": "s1_pm2_5",
        "s2_pm2_5": "s2_pm2_5",
        "pm2_5_raw_value": "raw_pm2_5",
        "pm2_5_calibrated_value": "calibrated_pm2_5",
        "pm10": "pm10",
        "s1_pm10": "s1_pm10",
        "s2_pm10": "s2_pm10",
        "pm10_raw_value": "raw_pm10",
        "pm10_calibrated_value": "calibrated_pm10",
        "altitude": "altitude",
        "wind_speed": "wind_speed",
        "external_temperature": "temperature",
        "external_humidity": "humidity",
    }

    restructured_data = []
    for _, row in data_df.iterrows():
        record = {
            "timestamp": str_to_date(row["time"]),
            "tenant": "airqo",
            "site_id": row["site_id"],
            "device_number": row["device_number"],
            "device": row["device"],
        }
        for field, source_column in optional_fields.items():
            record[field] = get_column_value(
                column=source_column, columns=columns, series=row
            )
        restructured_data.append(record)

    return pd.DataFrame(
        columns=BigQueryApi().hourly_measurements_columns, data=restructured_data
    ).to_dict(orient="records")
コード例 #14
0
def transform_kcca_hourly_data_for_bigquery(data: list) -> list:
    """Restructure KCCA measurement records for the BigQuery hourly table.

    Rows whose device has no registered site are skipped. Coordinates are
    parsed from the ``location.coordinates`` column; all measurement values
    are looked up defensively via ``get_column_value`` so missing columns
    do not raise.

    :param data: list of raw KCCA measurement dicts.
    :return: list of dicts aligned with the BigQuery hourly measurements
        column set.
    """
    data_df = pd.DataFrame(data)
    columns = list(data_df.columns)

    airqo_api = AirQoApi()
    devices = airqo_api.get_devices(tenant="kcca")

    # Output field -> source column for every defensively-read value.
    value_columns = {
        "pm2_5": "characteristics.pm2_5ConcMass.value",
        "s1_pm2_5": "s1_pm2_5",
        "s2_pm2_5": "s2_pm2_5",
        "pm2_5_raw_value": "characteristics.pm2_5ConcMass.raw",
        "pm2_5_calibrated_value": "characteristics.pm2_5ConcMass.calibratedValue",
        "pm10": "characteristics.pm10ConcMass.value",
        "s1_pm10": "s1_pm10",
        "s2_pm10": "s2_pm10",
        "pm10_raw_value": "characteristics.pm10ConcMass.raw",
        "pm10_calibrated_value": "characteristics.pm10ConcMass.calibratedValue",
        "no2": "characteristics.no2Conc.value",
        "no2_raw_value": "characteristics.no2Conc.raw",
        "no2_calibrated_value": "characteristics.no2Conc.calibratedValue",
        "pm1": "characteristics.pm1ConcMass.value",
        "pm1_raw_value": "characteristics.pm1ConcMass.raw",
        "pm1_calibrated_value": "characteristics.pm1ConcMass.calibratedValue",
        "altitude": "characteristics.altitude.value",
        "wind_speed": "characteristics.windSpeed.value",
        "external_temperature": "characteristics.temperature.value",
        "external_humidity": "characteristics.relHumid.value",
    }

    restructured_data = []
    for _, row in data_df.iterrows():
        device_name = row["deviceCode"]
        site_id, _ = get_site_and_device_id(devices, device_name=device_name)
        if not site_id:
            # Measurements from devices without a registered site are dropped.
            continue

        # "location.coordinates" is a "[longitude, latitude]"-style string;
        # strip the brackets and split on the comma.
        coordinates = (str(row["location.coordinates"])
                       .replace("[", "")
                       .replace("]", "")
                       .split(","))

        record = {
            "timestamp": str_to_date(row["time"]),
            "tenant": "kcca",
            "site_id": site_id,
            # KCCA devices have no numeric identifier; use 0 as a placeholder.
            "device_number": 0,
            "device": device_name,
            "latitude": coordinates[1],
            "longitude": coordinates[0],
        }
        for field, source_column in value_columns.items():
            record[field] = get_column_value(
                column=source_column, columns=columns, series=row
            )
        restructured_data.append(record)

    return pd.DataFrame(columns=BigQueryApi().hourly_measurements_columns,
                        data=restructured_data).to_dict(orient="records")