Code Example #1
def generate_and_run_glean_query(task_id,
                                 product,
                                 destination_project_id,
                                 destination_dataset_id="glam_etl",
                                 source_project_id="moz-fx-data-shared-prod",
                                 docker_image="mozilla/bigquery-etl:latest",
                                 gcp_conn_id="google_cloud_derived_datasets",
                                 **kwargs):
    """
    :param task_id:                     Airflow task id
    :param product:                     Product name of glean app
    :param destination_project_id:      Project to store derived tables
    :param destination_dataset_id:      Name of the dataset to store derived tables
    :param source_project_id:           Project containing the source datasets
    :param docker_image:                Docker image
    :param gcp_conn_id:                 Airflow GCP connection
    """
    env_vars = {
        "PRODUCT": product,
        "SRC_PROJECT": source_project_id,
        "PROJECT": destination_project_id,
        "DATASET": destination_dataset_id,
        "SUBMISSION_DATE": "{{ ds }}",
    }

    return gke_command(
        task_id=task_id,
        cmds=["bash", "-c"],
        env_vars=env_vars,
        command=["script/glam/generate_glean_sql && script/glam/run_glam_sql"],
        docker_image=docker_image,
        gcp_conn_id=gcp_conn_id,
        **kwargs
    )
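
A minimal usage sketch for the helper above, assuming an existing DAG object named dag; the task id, product, and destination project values are illustrative only (similar calls appear in later examples).

# A minimal sketch, assuming generate_and_run_glean_query (defined above) is in
# scope; the product and project values are illustrative assumptions.
fenix_incremental = generate_and_run_glean_query(
    task_id="incremental_org_mozilla_fenix",   # hypothetical task id
    product="org_mozilla_fenix",               # hypothetical Glean app name
    destination_project_id="glam-fenix-dev",   # hypothetical destination project
    dag=dag,                                   # assumes an existing DAG object
)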
Code Example #2
def export(
        bq_dataset_id,
        task_id,
        bq_project,
        s3_prefix,
        version,
        s3_bucket="moz-fx-data-us-west-2-leanplum-export",
        gcs_bucket="moz-fx-data-prod-external-data",
        table_prefix=None,
        gcp_conn_id="google_cloud_derived_datasets",
        gke_location="us-central1-a",
        gke_cluster_name="bq-load-gke-1",
        gke_namespace="default",
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/leanplum-data-export:latest",
        aws_conn_id="aws_data_iam_s3",
        **kwargs):
    """ Export a day of data from Leanplum for a single application,
        and make it available in BigQuery.

    See bug 1588654 for information on which buckets and datasets
    these tables should live in.

    :param str bq_dataset_id:        [Required] BigQuery default dataset id
    :param str task_id:              [Required] The task ID for this task
    :param str bq_project:           [Required] The project to create tables in
    :param str s3_prefix:            [Required] Prefix for data in the S3 bucket
    :param str s3_bucket:            S3 bucket to retrieve streaming exports from
    :param str gcs_bucket:           GCS bucket to export data to
    :param str table_prefix:         Prefix of tables in BigQuery
    :param str version:              [Required] Version of the destination table
    :param str gcp_conn_id:          Airflow connection id for GCP access
    :param str gke_location:         GKE cluster location
    :param str gke_cluster_name:     GKE cluster name
    :param str gke_namespace:        GKE cluster namespace
    :param str docker_image:         docker image to use
    :param str aws_conn_id:          Airflow connection id for AWS access
    :param Dict[str, Any] kwargs:    Additional keyword arguments for
                                     GKEPodOperator

    :return: GKEPodOperator
    """
    args = [
        "leanplum-data-export", "export-leanplum", "--date", "{{ ds_nodash }}",
        "--bucket", gcs_bucket, "--bq-dataset", bq_dataset_id, "--project",
        bq_project, "--s3-bucket", s3_bucket, "--version", version, "--prefix",
        s3_prefix
    ]

    if table_prefix is not None:
        args += ["--table-prefix", table_prefix]

    return gke_command(task_id=task_id,
                       docker_image=docker_image,
                       command=args,
                       gcp_conn_id=gcp_conn_id,
                       gke_location=gke_location,
                       gke_cluster_name=gke_cluster_name,
                       gke_namespace=gke_namespace,
                       aws_conn_id=aws_conn_id,
                       **kwargs)
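
A hedged sketch of calling export() for a single application follows; the task id, dataset, and S3 prefix are hypothetical, and dag is assumed to be an existing DAG object.

# A minimal sketch, assuming export (defined above) is in scope; the values
# below are illustrative placeholders, not taken from a real DAG.
leanplum_export = export(
    task_id="leanplum_export_firefox_ios",   # hypothetical task id
    bq_dataset_id="firefox_ios_external",    # hypothetical BigQuery dataset
    bq_project="moz-fx-data-shared-prod",
    s3_prefix="firefox_ios",                 # hypothetical S3 prefix
    version="1",
    dag=dag,                                 # assumes an existing DAG object
)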
Code Example #3
def generate_and_run_desktop_query(task_id,
                                   project_id,
                                   source_dataset_id,
                                   sample_size,
                                   overwrite,
                                   probe_type,
                                   destination_dataset_id=None,
                                   process=None,
                                   docker_image="mozilla/bigquery-etl:latest",
                                   gcp_conn_id="google_cloud_derived_datasets",
                                   **kwargs):
    """
    :param task_id:                     Airflow task id
    :param project_id:                  GCP project to write to
    :param source_dataset_id:           Bigquery dataset to read from in queries
    :param sample_size:                 Value to use for windows release client sampling
    :param overwrite:                   Overwrite the destination table
    :param probe_type:                  Probe type to generate query
    :param destination_dataset_id:      Bigquery dataset to write results to.  Defaults to source_dataset_id
    :param process:                     Process to filter probes for.  Gets all processes by default.
    :param docker_image:                Docker image
    :param gcp_conn_id:                 Airflow GCP connection
    """
    if destination_dataset_id is None:
        destination_dataset_id = source_dataset_id
    env_vars = {
        "PROJECT": project_id,
        "PROD_DATASET": source_dataset_id,
        "DATASET": destination_dataset_id,
        "SUBMISSION_DATE": "{{ ds }}",
        "RUN_QUERY": "t",
    }
    if not overwrite:
        env_vars["APPEND"] = "t"

    command = [
        "script/glam/generate_and_run_desktop_sql",
        probe_type,
        sample_size,
    ]
    if process is not None:
        command.append(process)

    return gke_command(
        task_id=task_id,
        cmds=["bash"],
        env_vars=env_vars,
        command=command,
        docker_image=docker_image,
        gcp_conn_id=gcp_conn_id,
        **kwargs
    )
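
A hedged usage sketch follows; the sampling value and dataset names are illustrative assumptions, and dag is assumed to be an existing DAG object.

# A minimal sketch, assuming generate_and_run_desktop_query (defined above) is
# in scope; the sampling value and dataset names are illustrative only.
scalar_aggregates = generate_and_run_desktop_query(
    task_id="clients_daily_scalar_aggregates",
    project_id="moz-fx-data-shared-prod",
    source_dataset_id="telemetry_derived",
    sample_size="10",     # hypothetical Windows release sampling percentage
    overwrite=True,       # replace the partition instead of appending
    probe_type="scalar",
    dag=dag,              # assumes an existing DAG object
)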
Code Example #4
) as dag:

    pocket_derived__rolling_monthly_active_user_counts__v1 = bigquery_etl_query(
        task_id="pocket_derived__rolling_monthly_active_user_counts__v1",
        destination_table="rolling_monthly_active_user_counts_v1",
        dataset_id="pocket_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter=None,
        depends_on_past=False,
        parameters=["submission_date:DATE:{{ds}}"],
        dag=dag,
    )

    pocket_derived__rolling_monthly_active_user_counts_history__v1 = gke_command(
        task_id="pocket_derived__rolling_monthly_active_user_counts_history__v1",
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/pocket_derived/rolling_monthly_active_user_counts_history_v1/query.py",
        ]
        + ["--date", "{{ ds }}"],
        docker_image="mozilla/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
    )

    pocket_derived__rolling_monthly_active_user_counts__v1.set_upstream(
        pocket_derived__rolling_monthly_active_user_counts_history__v1
    )
Code Example #5
        execution_delta=timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    wait_for_copy_deduplicate_all = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    stable_table_sizes = gke_command(
        task_id="stable_table_sizes",
        command=[
            "python",
            "sql/monitoring/stable_table_sizes_v1/query.py",
            "--date",
            "{{ ds }}",
        ],
        docker_image="mozilla/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"])

    stable_table_sizes.set_upstream(wait_for_copy_deduplicate_main_ping)
    stable_table_sizes.set_upstream(wait_for_copy_deduplicate_all)
Code Example #6
default_args = {
    "owner": "*****@*****.**",
    "start_date": datetime.datetime(2020, 10, 9, 0, 0),
    "end_date": None,
    "email": ["*****@*****.**"],
    "depends_on_past": False,
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 0,
}

with DAG(
    "bqetl_experimenter_experiments_import",
    default_args=default_args,
    schedule_interval="*/10 * * * *",
    doc_md=docs,
) as dag:

    monitoring__experimenter_experiments__v1 = gke_command(
        task_id="monitoring__experimenter_experiments__v1",
        command=[
            "python",
            "sql/moz-fx-data-experiments/monitoring/experimenter_experiments_v1/query.py",
        ]
        + [],
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**"],
    )
Code Example #7
        destination_table="experiment_enrollment_aggregates_v1",
        dataset_id="telemetry_derived",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"])

    gen_query_task_id = "experiment_enrollment_aggregates_live_generate_query"

    # setting xcom_push to True outputs this query to an xcom
    experiment_enrollment_aggregates_live_generate_query = gke_command(
        task_id=gen_query_task_id,
        command=[
            "python",
            "sql/telemetry_derived/experiment_enrollment_aggregates_live/view.sql.py",
            "--submission-date",
            "{{ ds }}",
            "--json-output",
            "--wait-seconds",
            "15",
        ],
        docker_image="mozilla/bigquery-etl:latest",
        xcom_push=True,
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"])

    experiment_enrollment_aggregates_live_run_query = bigquery_xcom_query(
        task_id="experiment_enrollment_aggregates_live_run_query",
        destination_table=None,
        dataset_id="telemetry_derived",
        xcom_task_id=gen_query_task_id,
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"])
Code Example #8
        storage_bucket=storage_bucket,
        default_args=subdag_args,
    ),
)

trim_database = gke_command(
    task_id="trim_database",
    cmds=["python"],
    command=[
        "-m",
        "mozaggregator.trim_db",
        "--retention-period",
        f"{365*2}",  # 2 year retention
        "--postgres-db",
        "telemetry",
        "--postgres-user",
        "root",
        "--postgres-pass",
        "{{ var.value.mozaggregator_postgres_pass }}",
        "--postgres-host",
        "{{ var.value.mozaggregator_postgres_host }}",
        # TODO: uncomment this after a successful run
        # "--no-dry-run",
    ],
    docker_image="mozilla/python_mozaggregator:latest",
    dag=dag,
)

mozaggregator2bq_extract = gke_command(
    task_id="mozaggregator2bq_extract",
    name="mozaggregator2bq_extract",
    command=["bin/backfill"],
Code Example #9
    # list of datasets to execute query for and export
    experiment_datasets = [
        "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_other_events_overall_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_cumulative_population_estimate_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_overall_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_unenrollment_overall_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_ad_clicks_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_search_count_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_search_with_ads_count_v1"
    ]

    export_monitoring_data = gke_command(
        task_id="export_enrollments_monitoring_data",
        command=[
            "python",
            "script/experiments/export_experiment_monitoring_data.py",
            "--datasets"
        ] + experiment_datasets,
        docker_image=docker_image,
        is_delete_operator_pod=True,
    )

    for dataset in experiment_datasets:
        task_id = dataset.split(".")[-1]

        query_etl = bigquery_etl_query(
            task_id=task_id,
            destination_table=task_id,
            dataset_id="telemetry_derived",
            project_id="moz-fx-data-shared-prod",
            owner="*****@*****.**",
            email=["*****@*****.**", "*****@*****.**"],
Code Example #10
File: leanplum.py Project: willkg/telemetry-airflow
def get_messages(
        task_id,
        app_id,
        client_key,
        bq_project,
        bq_dataset_id,
        version,
        table_prefix=None,
        gcp_conn_id="google_cloud_derived_datasets",
        gke_location="us-central1-a",
        gke_cluster_name="bq-load-gke-1",
        gke_namespace="default",
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/leanplum-data-export:latest",
        **kwargs):
    """Get all leanplum messages and save to bigquery,

    :param str app_id:              [Required] Leanplum app id
    :param str client_key:          [Required] Leanplum content read-only key for the app
    :param str task_id:             [Required] The task ID for this task
    :param str bq_project:          [Required] The project to create tables in
    :param str bq_dataset_id:       [Required] BigQuery dataset id
    :param str table_prefix:        Prefix of tables in Bigquery
    :param str version:             [Required] Version of the destination table
    :param str gcp_conn_id:         Airflow connection id for GCP access
    :param str gke_location:        GKE cluster location
    :param str gke_cluster_name:    GKE cluster name
    :param str gke_namespace:       GKE cluster namespace
    :param str docker_image:        docker image to use
    :param Dict[str, Any] kwargs:   Additional keyword arguments for
                                    GKEPodOperator

    :return: GKEPodOperator
    """
    args = [
        "leanplum-data-export",
        "get-messages",
        "--date",
        "{{ ds }}",
        "--app-id",
        app_id,
        "--client-key",
        client_key,
        "--project",
        bq_project,
        "--bq-dataset",
        bq_dataset_id,
        "--version",
        version,
    ]

    if table_prefix is not None:
        args += ["--table-prefix", table_prefix]

    return gke_command(task_id=task_id,
                       docker_image=docker_image,
                       command=args,
                       gcp_conn_id=gcp_conn_id,
                       gke_location=gke_location,
                       gke_cluster_name=gke_cluster_name,
                       gke_namespace=gke_namespace,
                       **kwargs)
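
A hedged sketch of calling get_messages() for one app follows; the Airflow Variable names and dataset below are hypothetical placeholders, and dag is assumed to be an existing DAG object.

# A minimal sketch, assuming get_messages (defined above) is in scope; the
# Variable names and dataset are hypothetical, not from the source.
firefox_ios_messages = get_messages(
    task_id="get_messages_firefox_ios",                            # hypothetical
    app_id="{{ var.value.leanplum_firefox_ios_app_id }}",          # assumed Variable
    client_key="{{ var.value.leanplum_firefox_ios_client_key }}",  # assumed Variable
    bq_project="moz-fx-data-shared-prod",
    bq_dataset_id="firefox_ios_external",                          # hypothetical dataset
    version="1",
    dag=dag,                                                       # assumes an existing DAG
)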
Code Example #11
    project_id=project_id,
    owner="*****@*****.**",
    depends_on_past=True,
    arguments=("--replace", ),
    dag=dag,
)

scalar_percentiles = gke_command(
    task_id="scalar_percentiles",
    command=[
        "python3",
        "script/glam/run_scalar_agg_clustered_query.py",
        "--submission-date",
        "{{ds}}",
        "--dst-table",
        "scalar_percentiles_v1",
        "--project",
        project_id,
        "--dataset",
        dataset_id,
    ],
    docker_image="mozilla/bigquery-etl:latest",
    dag=dag,
)

# This task runs first and replaces the relevant partition, followed
# by the next task below that appends to the same partition of the same table.
clients_daily_histogram_aggregates_parent = generate_and_run_desktop_query(
    task_id="clients_daily_histogram_aggregates_parent",
    project_id=project_id,
    source_dataset_id=dataset_id,
Code Example #12
    ],
    parent_dag_name=dag.dag_id,
    dag_name="addon_aggregates_export",
    default_args=default_args),
                                         task_id="addon_aggregates_export",
                                         executor=GetDefaultExecutor(),
                                         dag=dag)

main_summary_experiments_get_experiment_list = gke_command(
    task_id="main_summary_experiments_get_experiment_list",
    command=[
        "python3",
        "templates/telemetry_derived/experiments_v1/get_experiment_list.py",
        "{{ds}}"
    ],
    docker_image="mozilla/bigquery-etl:latest",
    xcom_push=True,
    owner="*****@*****.**",
    email=[
        "*****@*****.**", "*****@*****.**",
        "*****@*****.**", "*****@*****.**"
    ],
    dag=dag)

main_summary_experiments = bigquery_etl_query(
    task_id="main_summary_experiments",
    destination_table="experiments_v1",
    parameters=(
        "experiment_list:ARRAY<STRING>:{{task_instance.xcom_pull('main_summary_experiments_get_experiment_list') | tojson}}",
    ),
    project_id="moz-fx-data-shared-prod",
Code Example #13
        dag=dag,
    )

    mozilla_vpn_derived__survey_cancellation_of_service__v1 = gke_command(
        task_id="mozilla_vpn_derived__survey_cancellation_of_service__v1",
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/mozilla_vpn_derived/survey_cancellation_of_service_v1/query.py",
        ] + [
            "--date",
            "{{ ds }}",
            "--survey_id",
            "5111573",
            "--api_token",
            "{{ var.value.surveygizmo_api_token }}",
            "--api_secret",
            "{{ var.value.surveygizmo_api_secret }}",
            "--destination_table",
            "moz-fx-data-shared-prod.mozilla_vpn_derived.survey_cancellation_of_service_v1",
        ],
        docker_image="mozilla/bigquery-etl:latest",
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
    )

    mozilla_vpn_derived__survey_intercept_q3__v1 = gke_command(
        task_id="mozilla_vpn_derived__survey_intercept_q3__v1",
Code Example #14
# This task runs first and replaces the relevant partition, followed
# by the next two tasks that append to the same partition of the same table.
clients_daily_scalar_aggregates = gke_command(
    task_id="clients_daily_scalar_aggregates",
    owner="*****@*****.**",
    email=[
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
    ],
    cmds=["bash"],
    env_vars={
        "PROJECT": project_id,
        "PROD_DATASET": dataset_id,
        "DATASET": dataset_id,
        "SUBMISSION_DATE": "{{ ds }}",
        "RUN_QUERY": "t",
    },
    command=[
        "script/glam/generate_and_run_desktop_sql",
        "scalar",
        PERCENT_RELEASE_WINDOWS_SAMPLING,
    ],
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    dag=dag,
)

clients_daily_keyed_scalar_aggregates = gke_command(
    task_id="clients_daily_keyed_scalar_aggregates",
Code Example #15
File: shredder.py Project: willkg/telemetry-airflow
    # dags run schedule_interval after ds, and end date should be one day
    # before the dag runs, so 28-1 = 27 days after ds.
    "--end-date={{macros.ds_add(ds, 27)}}",
    # start date should be two schedule intervals before end date, to avoid
    # race conditions with downstream tables and pings received shortly after a
    # deletion request.
    "--start-date={{macros.ds_add(ds, 27-28*2)}}",
]
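
# Worked example of the date arithmetic above (illustrative values only),
# assuming ds = 2021-06-01 and a 28-day schedule interval:
#   end date   = ds + 27 days          -> 2021-06-28 (one day before the run)
#   start date = ds + (27 - 28*2) days -> 2021-05-03 (two intervals before end)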

# main_v4 is cheaper to handle in a project without flat-rate query pricing
on_demand = gke_command(
    task_id="on_demand",
    name="shredder-on-demand",
    command=base_command + [
        "--parallelism=2",
        "--billing-project=moz-fx-data-shredder",
        "--only=telemetry_stable.main_v4",
    ],
    docker_image=docker_image,
    is_delete_operator_pod=True,
    dag=dag,
)

# handle main_summary separately to ensure that it doesn't slow everything else
# down and also to avoid timeout errors related to queueing when running more
# than 2 DML DELETE statements at once on a single table
flat_rate_main_summary = gke_command(
    task_id="flat_rate_main_summary",
    name="shredder-flat-rate-main-summary",
    command=base_command + [
        "--parallelism=2",
        "--billing-project=moz-fx-data-bq-batch-prod",
Code Example #16
GA_PROPERTIES = [
    ("65789850", "www_mozilla_org"),
    ("66602784", "blog_mozilla_org"),
    ("65912487", "support_mozilla_org"),
    ("180612539", "monitor_firefox_com"),
    ("220432379", "vpn_mozilla_org"),
    ("65887927", "hacks_mozilla_org"),
    ("66726481", "developer_mozilla_org"),
]

with DAG(
        "copy_ga_sessions",
        default_args=default_args,
        schedule_interval="0 21 * * *",
) as dag:
    for property_id, property_name in GA_PROPERTIES:
        commands = [
            "python3", "script/marketing/copy_ga_sessions.py", "--start-date",
            "{{ ds }}", "--src-project", "ga-mozilla-org-prod-001",
            "--dst-project", "moz-fx-data-marketing-prod", "--overwrite",
            property_id
        ]

        copy_ga_sessions = gke_command(
            task_id=f"copy_ga_sessions_{property_name}",
            command=commands,
            docker_image="mozilla/bigquery-etl:latest",
            gcp_conn_id="google_cloud_derived_datasets",
            dag=dag,
        )
Code Example #17
    email=[
        "*****@*****.**", "*****@*****.**",
        "*****@*****.**"
    ],
    dag=dag)

gcloud_docker_image = "google/cloud-sdk:263.0.0-slim"
main_summary_dataproc_bucket = "gs://moz-fx-data-derived-datasets-parquet-tmp"
main_ping_bigquery_export_prefix = main_summary_dataproc_bucket + "/export"
main_ping_bigquery_export_dest = main_ping_bigquery_export_prefix + "/submission_date={{ds}}/document_namespace=telemetry/document_type=main/document_version=4/*.avro"  # noqa
main_ping_bigquery_export = gke_command(
    task_id="main_ping_bigquery_extract",
    command=[
        "bq",
        "extract",
        "--destination_format=AVRO",
        "moz-fx-data-shared-prod:payload_bytes_decoded.telemetry_telemetry__main_v4${{ds_nodash}}",
        main_ping_bigquery_export_dest,
    ],
    docker_image=gcloud_docker_image,
    dag=dag,
)

main_summary_dataproc = SubDagOperator(
    subdag=moz_dataproc_jar_runner(
        parent_dag_name="main_summary",
        dag_name="main_summary_dataproc",
        default_args=default_args,
        cluster_name="main-summary-{{ds}}",
        image_version="1.3",
        worker_machine_type="n1-standard-8",
        num_preemptible_workers=40,
Code Example #18
    "retry_delay": datetime.timedelta(minutes=30),
}

dag_name = "attitudes_daily"

with models.DAG(dag_name,
                schedule_interval="0 3 * * *",
                default_args=default_args) as dag:

    surveygizmo_attitudes_daily_import = gke_command(
        task_id="surveygizmo_attitudes_daily_import",
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/telemetry_derived/surveygizmo_daily_attitudes/import_responses.py",
            "--date", "{{ ds }}", "--survey_id",
            Variable.get("surveygizmo_daily_attitudes_survey_id"),
            "--sg_api_token",
            Variable.get("surveygizmo_api_token"), "--sg_api_secret",
            Variable.get("surveygizmo_api_secret"), "--destination_table",
            "moz-fx-data-shared-prod.telemetry_derived.survey_gizmo_daily_attitudes"
        ],
        docker_image="mozilla/bigquery-etl:latest")

    wait_for_copy_deduplicate = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(hours=2),
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
        email_on_retry=False,
Code Example #19
from airflow import DAG
from datetime import timedelta, datetime
from utils.gcp import gke_command

default_args = {
    "owner": "*****@*****.**",
    "email": ["*****@*****.**", "*****@*****.**"],
    "depends_on_past": False,
    "start_date": datetime(2020, 6, 11),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
    "retry_delay": timedelta(minutes=10),
}

with DAG("stripe", default_args=default_args,
         schedule_interval="@daily") as dag:
    gke_command(
        task_id="stripe_import_events",
        command=[
            "bqetl",
            "stripe",
            "import",
            "--date={{ ds }}",
            "--api-key={{ var.value.stripe_api_key }}",
            "--resource=Event",
            "--table=moz-fx-data-shared-prod.stripe_external.events_v1",
        ],
        docker_image="mozilla/bigquery-etl:latest",
    )
Code Example #20
default_args = {
    "owner": "*****@*****.**",
    "start_date": datetime.datetime(2021, 3, 18, 0, 0),
    "end_date": None,
    "email": ["*****@*****.**", "*****@*****.**"],
    "depends_on_past": False,
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
}

with DAG(
        "bqetl_firefox_ios",
        default_args=default_args,
        schedule_interval="0 4 * * *",
        doc_md=docs,
) as dag:

    org_mozilla_ios_firefox__unified_metrics__v1 = gke_command(
        task_id="org_mozilla_ios_firefox__unified_metrics__v1",
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/org_mozilla_ios_firefox/unified_metrics_v1/query.py",
        ] + [],
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
    )
Code Example #21
            default_args=default_args,
            project='moz-fx-data-shared-prod',
            dataset='telemetry',
            table_or_view='fenix_events_v1',
            s3_prefix='fenix',
        ),
        task_id=fenix_task_id
    )

    shredder_fenix = gke_command(
        task_id="shredder_amplitude_fenix",
        name="shredder-amplitude-fenix",
        command=[
            "script/shredder_amplitude",
            "--date={{ ds }}",
            "--api-key={{ var.value.fenix_amplitude_api_key }}",
            "--secret-key={{ var.value.fenix_amplitude_secret_key }}",
            "--table-id=moz-fx-data-shared-prod.org_mozilla_fenix_stable.deletion_request_v1",
            "--device-id-field=client_info.client_id",
        ],
        docker_image="mozilla/bigquery-etl:latest",
        dag=dag,
    )

    rocket_android_task_id = 'rocket_android_amplitude_export'
    rocket_args = default_args.copy()
    rocket_args["start_date"] = datetime.datetime(2019, 12, 2)
    SubDagOperator(
        subdag=export_to_amplitude(
            dag_name=rocket_android_task_id,
            parent_dag_name=dag_name,
            default_args=rocket_args,
Code Example #22
        default_args=subdag_args,
    ),
)

trim_database = gke_command(
    task_id="trim_database",
    cmds=["bash"],
    command=[
        "python",
        "-m",
        "mozaggregator.trim_db",
        "--retention-period",
        f"{365*2}",  # 2 year retention
        "--postgres-db",
        "telemetry",
        "--postgres-user",
        "root",
        "--postgres-pass",
        "{{ var.value.mozaggregator_postgres_pass }}",
        "--postgres-host",
        "{{ var.value.mozaggregator_postgres_host }}",
        # TODO: uncomment this after a successful run
        # "--no-dry-run",
    ],
    docker_image="mozilla/python_mozaggregator:latest",
    dag=dag,
)

prerelease_telemetry_aggregate_view_dataproc >> trim_database

# export to avro, if necessary
Code Example #23
from airflow import DAG
from datetime import timedelta, datetime
from utils.gcp import gke_command

default_args = {
    "owner": "*****@*****.**",
    "email": ["*****@*****.**"],
    "depends_on_past": False,
    "start_date": datetime(2020, 6, 11),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
    "retry_delay": timedelta(minutes=30),
}

with DAG("mozfun", default_args=default_args, schedule_interval="0 1 * * *") as dag:
    docker_image = "mozilla/bigquery-etl:latest"

    publish_public_udfs = gke_command(
        task_id="publish_public_udfs",
        command=["script/publish_public_udfs"],
        docker_image=docker_image
    )
Code Example #24
    "bqetl_app_store_connect",
    default_args=default_args,
    schedule_interval="0 20 * * *",
    doc_md=docs,
) as dag:

    apple_app_store__report_subscriber_detailed__v13 = gke_command(
        task_id="apple_app_store__report_subscriber_detailed__v13",
        command=[
            "python",
            "sql/moz-fx-data-marketing-prod/apple_app_store/report_subscriber_detailed_v13/query.py",
        ]
        + [
            "--key-id",
            "{{ var.value.app_store_connect_key_id }}",
            "--issuer-id",
            "{{ var.value.app_store_connect_issuer_id }}",
            "--private-key",
            "{{ var.value.app_store_connect_private_key }}",
            "--vendor-number",
            "{{ var.value.app_store_connect_vendor_number }}",
            "--date",
            "{{ ds }}",
            "--table",
            "moz-fx-data-marketing-prod.apple_app_store.report_subscriber_detailed_v13",
        ],
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**"],
    )
Code Example #25
    "retries": 2,
}

with DAG(
    "bqetl_monitoring",
    default_args=default_args,
    schedule_interval="0 2 * * *",
    doc_md=docs,
) as dag:

    monitoring_derived__average_ping_sizes__v1 = gke_command(
        task_id="monitoring_derived__average_ping_sizes__v1",
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/monitoring_derived/average_ping_sizes_v1/query.py",
        ]
        + ["--date", "{{ ds }}"],
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**"],
    )

    monitoring_derived__bigquery_etl_scheduled_queries_cost__v1 = gke_command(
        task_id="monitoring_derived__bigquery_etl_scheduled_queries_cost__v1",
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_etl_scheduled_queries_cost_v1/query.py",
        ]
        + ["--date", "{{ ds }}"],
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
        owner="*****@*****.**",
Code Example #26
        artifact_bucket=artifact_bucket,
        storage_bucket=storage_bucket,
        default_args=subdag_args,
    ),
)

# export to avro, if necessary
if EXPORT_TO_AVRO:
    gke_command(
        task_id="export_main_avro",
        cmds=["bash"],
        command=[
            "bin/export-avro.sh",
            "moz-fx-data-shared-prod",
            "moz-fx-data-shared-prod:analysis",
            "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease",
            "main_v4",
            "'nightly', 'beta'",
            "{{ ds }}",
        ],
        docker_image="mozilla/python_mozaggregator:latest",
        dag=dag,
    ).set_downstream(prerelease_telemetry_aggregate_view_dataproc)

    gke_command(
        task_id="export_saved_session_avro",
        cmds=["bash"],
        command=[
            "bin/export-avro.sh",
            "moz-fx-data-shared-prod",
            "moz-fx-data-shared-prod:analysis",
Code Example #27
    sum(LOGICAL_MAPPING.values(), []))
for product in final_products:
    query = generate_and_run_glean_query(
        task_id=f"incremental_{product}",
        product=product,
        destination_project_id=PROJECT,
        env_vars=dict(STAGE="incremental"),
        dag=dag,
    )
    # get the dependencies for the logical mapping, or just pass through the
    # daily query unmodified
    for dependency in LOGICAL_MAPPING.get(product, [product]):
        mapping[dependency] >> query

    export = gke_command(
        task_id=f"export_{product}",
        cmds=["bash"],
        env_vars={
            "SRC_PROJECT": PROJECT,
            "DATASET": "glam_etl",
            "PRODUCT": product,
            "BUCKET": BUCKET,
        },
        command=["script/glam/export_csv"],
        docker_image="mozilla/bigquery-etl:latest",
        gcp_conn_id="google_cloud_derived_datasets",
        dag=dag,
    )

    query >> export
Code Example #28
wait_for_copy_deduplicate = ExternalTaskSensor(
    task_id="wait_for_copy_deduplicate",
    external_dag_id="copy_deduplicate",
    external_task_id="copy_deduplicate_all",
    execution_delta=timedelta(hours=-1),
    check_existence=True,
    dag=dag,
)

run_sql = gke_command(
    task_id="run_sql",
    cmds=["bash"],
    env_vars={
        "DATASET": "glam_etl",
        "SUBMISSION_DATE": "{{ ds }}"
    },
    command=["script/glam/run_glam_sql"],
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    dag=dag,
)

export_csv = gke_command(
    task_id="export_csv",
    cmds=["bash"],
    env_vars={"DATASET": "glam_etl"},
    command=["script/glam/export_csv"],
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    dag=dag,
)
Code Example #29
        image=docker_image,
        dag=dag,
    )

    wait_for_telemetry_derived__ssl_ratios__v1 = ExternalTaskSensor(
        task_id="wait_for_telemetry_derived__ssl_ratios__v1",
        external_dag_id="bqetl_ssl_ratios",
        external_task_id="telemetry_derived__ssl_ratios__v1",
        execution_delta=datetime.timedelta(seconds=7200),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    export_public_data_json_telemetry_derived__ssl_ratios__v1.set_upstream(
        wait_for_telemetry_derived__ssl_ratios__v1
    )

    public_data_gcs_metadata = gke_command(
        task_id="public_data_gcs_metadata",
        command=["script/publish_public_data_gcs_metadata"],
        docker_image=docker_image,
        dag=dag,
    )

    public_data_gcs_metadata.set_upstream(
        [
            export_public_data_json_telemetry_derived__ssl_ratios__v1,
        ]
    )
Code Example #30
    task_id="wait_for_copy_deduplicate",
    external_dag_id="copy_deduplicate",
    external_task_id="copy_deduplicate_all",
    execution_delta=timedelta(hours=1),
    check_existence=True,
    dag=dag,
)

for product in PRODUCTS:
    query = generate_and_run_glean_query(
        task_id=product,
        product=product,
        destination_project_id="glam-fenix-dev",
        dag=dag,
    )

    export = gke_command(
        task_id="export_{}".format(product),
        cmds=["bash"],
        env_vars={
            "DATASET": "glam_etl",
            "PRODUCT": product,
        },
        command=["script/glam/export_csv"],
        docker_image="mozilla/bigquery-etl:latest",
        gcp_conn_id="google_cloud_derived_datasets",
        dag=dag,
    )

    wait_for_copy_deduplicate >> query >> export