def generate_and_run_glean_query(
    task_id,
    product,
    destination_project_id,
    destination_dataset_id="glam_etl",
    source_project_id="moz-fx-data-shared-prod",
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    **kwargs,
):
    """
    :param task_id: Airflow task id
    :param product: Product name of glean app
    :param destination_project_id: Project to store derived tables
    :param destination_dataset_id: Name of the dataset to store derived tables
    :param source_project_id: Project containing the source datasets
    :param docker_image: Docker image
    :param gcp_conn_id: Airflow GCP connection
    """
    env_vars = {
        "PRODUCT": product,
        "SRC_PROJECT": source_project_id,
        "PROJECT": destination_project_id,
        "DATASET": destination_dataset_id,
        "SUBMISSION_DATE": "{{ ds }}",
    }
    return gke_command(
        task_id=task_id,
        cmds=["bash", "-c"],
        env_vars=env_vars,
        command=["script/glam/generate_glean_sql && script/glam/run_glam_sql"],
        docker_image=docker_image,
        gcp_conn_id=gcp_conn_id,
        **kwargs,
    )
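# A minimal usage sketch (assumed, not from the source): wiring the helper into a DAG.
# The DAG id, schedule, default_args, and the product name "org_mozilla_fenix" are
# illustrative; the destination project mirrors the dev project used elsewhere here.
from datetime import datetime, timedelta

from airflow import DAG

default_args = {
    "owner": "*****@*****.**",
    "start_date": datetime(2020, 10, 1),
    "retries": 2,
    "retry_delay": timedelta(minutes=30),
}

with DAG("glam_glean_example", default_args=default_args, schedule_interval="0 2 * * *") as dag:
    # Generates the glean GLAM SQL for one product and runs it for {{ ds }}.
    glam_fenix = generate_and_run_glean_query(
        task_id="daily_org_mozilla_fenix",
        product="org_mozilla_fenix",
        destination_project_id="glam-fenix-dev",
        dag=dag,
    )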
def export(
    bq_dataset_id,
    task_id,
    bq_project,
    s3_prefix,
    version,
    s3_bucket="moz-fx-data-us-west-2-leanplum-export",
    gcs_bucket="moz-fx-data-prod-external-data",
    table_prefix=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/leanplum-data-export:latest",
    aws_conn_id="aws_data_iam_s3",
    **kwargs,
):
    """
    Export a day of data from Leanplum for a single application,
    and make it available in BigQuery.

    See bug 1588654 for information on which buckets and datasets
    these tables should live in.

    :param str bq_dataset_id: [Required] BigQuery default dataset id
    :param str task_id: [Required] The task ID for this task
    :param str bq_project: [Required] The project to create tables in
    :param str s3_prefix: Prefix for data in the s3 bucket
    :param str s3_bucket: [Required] S3 bucket to retrieve streaming exports from
    :param str gcs_bucket: GCS bucket to export data to
    :param str table_prefix: Prefix of tables in BigQuery
    :param str version: Version of the destination table
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param str aws_conn_id: Airflow connection id for AWS access
    :param Dict[str, Any] kwargs: Additional keyword arguments for GKEPodOperator

    :return: GKEPodOperator
    """
    args = [
        "leanplum-data-export", "export-leanplum",
        "--date", "{{ ds_nodash }}",
        "--bucket", gcs_bucket,
        "--bq-dataset", bq_dataset_id,
        "--project", bq_project,
        "--s3-bucket", s3_bucket,
        "--version", version,
        "--prefix", s3_prefix,
    ]

    if table_prefix is not None:
        args += ["--table-prefix", table_prefix]

    return gke_command(
        task_id=task_id,
        docker_image=docker_image,
        command=args,
        gcp_conn_id=gcp_conn_id,
        gke_location=gke_location,
        gke_cluster_name=gke_cluster_name,
        gke_namespace=gke_namespace,
        aws_conn_id=aws_conn_id,
        **kwargs,
    )
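# A minimal usage sketch (assumed): exporting one Leanplum application's streaming
# export into BigQuery. The dataset id, s3 prefix, and version are placeholders, and
# `dag` is assumed to be an existing DAG object in the calling module.
firefox_ios_export = export(
    bq_dataset_id="firefox_ios",
    task_id="firefox_ios_leanplum_export",
    bq_project="moz-fx-data-shared-prod",
    s3_prefix="firefox_ios",
    version="1",
    dag=dag,
)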
def generate_and_run_desktop_query(
    task_id,
    project_id,
    source_dataset_id,
    sample_size,
    overwrite,
    probe_type,
    destination_dataset_id=None,
    process=None,
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    **kwargs,
):
    """
    :param task_id: Airflow task id
    :param project_id: GCP project to write to
    :param source_dataset_id: BigQuery dataset to read from in queries
    :param sample_size: Value to use for Windows release client sampling
    :param overwrite: Overwrite the destination table
    :param probe_type: Probe type to generate query
    :param destination_dataset_id: BigQuery dataset to write results to.
        Defaults to source_dataset_id
    :param process: Process to filter probes for. Gets all processes by default.
    :param docker_image: Docker image
    :param gcp_conn_id: Airflow GCP connection
    """
    if destination_dataset_id is None:
        destination_dataset_id = source_dataset_id

    env_vars = {
        "PROJECT": project_id,
        "PROD_DATASET": source_dataset_id,
        "DATASET": destination_dataset_id,
        "SUBMISSION_DATE": "{{ ds }}",
        "RUN_QUERY": "t",
    }
    if not overwrite:
        env_vars["APPEND"] = "t"

    command = [
        "script/glam/generate_and_run_desktop_sql",
        probe_type,
        sample_size,
    ]
    if process is not None:
        command.append(process)

    return gke_command(
        task_id=task_id,
        cmds=["bash"],
        env_vars=env_vars,
        command=command,
        docker_image=docker_image,
        gcp_conn_id=gcp_conn_id,
        **kwargs,
    )
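# A minimal usage sketch (assumed), mirroring the replace-then-append pattern used by
# the GLAM desktop tasks below: the first call overwrites the {{ ds }} partition, the
# second appends to it. PERCENT_RELEASE_WINDOWS_SAMPLING is assumed to be a string
# constant such as "10", and `dag` an existing DAG object.
clients_daily_histogram_aggregates_parent = generate_and_run_desktop_query(
    task_id="clients_daily_histogram_aggregates_parent",
    project_id="moz-fx-data-shared-prod",
    source_dataset_id="telemetry_derived",
    sample_size=PERCENT_RELEASE_WINDOWS_SAMPLING,
    overwrite=True,
    probe_type="histogram",
    process="parent",
    dag=dag,
)

clients_daily_histogram_aggregates_content = generate_and_run_desktop_query(
    task_id="clients_daily_histogram_aggregates_content",
    project_id="moz-fx-data-shared-prod",
    source_dataset_id="telemetry_derived",
    sample_size=PERCENT_RELEASE_WINDOWS_SAMPLING,
    overwrite=False,
    probe_type="histogram",
    process="content",
    dag=dag,
)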
) as dag:

    pocket_derived__rolling_monthly_active_user_counts__v1 = bigquery_etl_query(
        task_id="pocket_derived__rolling_monthly_active_user_counts__v1",
        destination_table="rolling_monthly_active_user_counts_v1",
        dataset_id="pocket_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter=None,
        depends_on_past=False,
        parameters=["submission_date:DATE:{{ds}}"],
        dag=dag,
    )

    pocket_derived__rolling_monthly_active_user_counts_history__v1 = gke_command(
        task_id="pocket_derived__rolling_monthly_active_user_counts_history__v1",
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/pocket_derived/rolling_monthly_active_user_counts_history_v1/query.py",
        ]
        + ["--date", "{{ ds }}"],
        docker_image="mozilla/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
    )

    pocket_derived__rolling_monthly_active_user_counts__v1.set_upstream(
        pocket_derived__rolling_monthly_active_user_counts_history__v1
    )
        execution_delta=timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    wait_for_copy_deduplicate_all = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    stable_table_sizes = gke_command(
        task_id="stable_table_sizes",
        command=[
            "python",
            "sql/monitoring/stable_table_sizes_v1/query.py",
            "--date",
            "{{ ds }}",
        ],
        docker_image="mozilla/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
    )

    stable_table_sizes.set_upstream(wait_for_copy_deduplicate_main_ping)
    stable_table_sizes.set_upstream(wait_for_copy_deduplicate_all)
default_args = {
    "owner": "*****@*****.**",
    "start_date": datetime.datetime(2020, 10, 9, 0, 0),
    "end_date": None,
    "email": ["*****@*****.**"],
    "depends_on_past": False,
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 0,
}

with DAG(
    "bqetl_experimenter_experiments_import",
    default_args=default_args,
    schedule_interval="*/10 * * * *",
    doc_md=docs,
) as dag:

    monitoring__experimenter_experiments__v1 = gke_command(
        task_id="monitoring__experimenter_experiments__v1",
        command=[
            "python",
            "sql/moz-fx-data-experiments/monitoring/experimenter_experiments_v1/query.py",
        ]
        + [],
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**"],
    )
destination_table="experiment_enrollment_aggregates_v1", dataset_id="telemetry_derived", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"]) gen_query_task_id = "experiment_enrollment_aggregates_live_generate_query" # setting xcom_push to True outputs this query to an xcom experiment_enrollment_aggregates_live_generate_query = gke_command( task_id=gen_query_task_id, command=[ "python", "sql/telemetry_derived/experiment_enrollment_aggregates_live/view.sql.py", "--submission-date", "{{ ds }}", "--json-output", "--wait-seconds", "15", ], docker_image="mozilla/bigquery-etl:latest", xcom_push=True, owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"]) experiment_enrollment_aggregates_live_run_query = bigquery_xcom_query( task_id="experiment_enrollment_aggregates_live_run_query", destination_table=None, dataset_id="telemetry_derived", xcom_task_id=gen_query_task_id, owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"])
        storage_bucket=storage_bucket,
        default_args=subdag_args,
    ),
)

trim_database = gke_command(
    task_id="trim_database",
    cmds=["python"],
    command=[
        "-m",
        "mozaggregator.trim_db",
        "--retention-period",
        f"{365*2}",  # 2 year retention
        "--postgres-db",
        "telemetry",
        "--postgres-user",
        "root",
        "--postgres-pass",
        "{{ var.value.mozaggregator_postgres_pass }}",
        "--postgres-host",
        "{{ var.value.mozaggregator_postgres_host }}",
        # TODO: uncomment this after a successful run
        # "--no-dry-run",
    ],
    docker_image="mozilla/python_mozaggregator:latest",
    dag=dag,
)

mozaggregator2bq_extract = gke_command(
    task_id="mozaggregator2bq_extract",
    name="mozaggregator2bq_extract",
    command=["bin/backfill"],
    # list of datasets to execute query for and export
    experiment_datasets = [
        "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_other_events_overall_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_cumulative_population_estimate_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_enrollment_overall_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_unenrollment_overall_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_ad_clicks_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_search_count_v1",
        "moz-fx-data-shared-prod.telemetry_derived.experiment_cumulative_search_with_ads_count_v1",
    ]

    export_monitoring_data = gke_command(
        task_id="export_enrollments_monitoring_data",
        command=[
            "python",
            "script/experiments/export_experiment_monitoring_data.py",
            "--datasets",
        ]
        + experiment_datasets,
        docker_image=docker_image,
        is_delete_operator_pod=True,
    )

    for dataset in experiment_datasets:
        task_id = dataset.split(".")[-1]

        query_etl = bigquery_etl_query(
            task_id=task_id,
            destination_table=task_id,
            dataset_id="telemetry_derived",
            project_id="moz-fx-data-shared-prod",
            owner="*****@*****.**",
            email=["*****@*****.**", "*****@*****.**"],
def get_messages(
    task_id,
    app_id,
    client_key,
    bq_project,
    bq_dataset_id,
    version,
    table_prefix=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/leanplum-data-export:latest",
    **kwargs,
):
    """
    Get all Leanplum messages and save them to BigQuery.

    :param str app_id: [Required] Leanplum app id
    :param str client_key: [Required] Leanplum content read-only key for the app
    :param str task_id: [Required] The task ID for this task
    :param str bq_project: [Required] The project to create tables in
    :param str bq_dataset_id: [Required] BigQuery dataset id
    :param str table_prefix: Prefix of tables in BigQuery
    :param str version: Version of the destination table
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param Dict[str, Any] kwargs: Additional keyword arguments for GKEPodOperator

    :return: GKEPodOperator
    """
    args = [
        "leanplum-data-export", "get-messages",
        "--date", "{{ ds }}",
        "--app-id", app_id,
        "--client-key", client_key,
        "--project", bq_project,
        "--bq-dataset", bq_dataset_id,
        "--version", version,
    ]

    if table_prefix is not None:
        args += ["--table-prefix", table_prefix]

    return gke_command(
        task_id=task_id,
        docker_image=docker_image,
        command=args,
        gcp_conn_id=gcp_conn_id,
        gke_location=gke_location,
        gke_cluster_name=gke_cluster_name,
        gke_namespace=gke_namespace,
        **kwargs,
    )
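# A minimal usage sketch (assumed): loading one app's Leanplum messages. The Airflow
# Variable names and dataset id are illustrative placeholders, and `dag` is assumed to
# be an existing DAG object.
firefox_ios_messages = get_messages(
    task_id="firefox_ios_get_messages",
    app_id="{{ var.value.leanplum_firefox_ios_app_id }}",
    client_key="{{ var.value.leanplum_firefox_ios_client_key }}",
    bq_project="moz-fx-data-shared-prod",
    bq_dataset_id="firefox_ios",
    version="1",
    dag=dag,
)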
    project_id=project_id,
    owner="*****@*****.**",
    depends_on_past=True,
    arguments=("--replace",),
    dag=dag,
)

scalar_percentiles = gke_command(
    task_id="scalar_percentiles",
    command=[
        "python3",
        "script/glam/run_scalar_agg_clustered_query.py",
        "--submission-date",
        "{{ds}}",
        "--dst-table",
        "scalar_percentiles_v1",
        "--project",
        project_id,
        "--dataset",
        dataset_id,
    ],
    docker_image="mozilla/bigquery-etl:latest",
    dag=dag,
)

# This task runs first and replaces the relevant partition, followed
# by the next task below that appends to the same partition of the same table.
clients_daily_histogram_aggregates_parent = generate_and_run_desktop_query(
    task_id="clients_daily_histogram_aggregates_parent",
    project_id=project_id,
    source_dataset_id=dataset_id,
        ],
        parent_dag_name=dag.dag_id,
        dag_name="addon_aggregates_export",
        default_args=default_args),
    task_id="addon_aggregates_export",
    executor=GetDefaultExecutor(),
    dag=dag)

main_summary_experiments_get_experiment_list = gke_command(
    task_id="main_summary_experiments_get_experiment_list",
    command=[
        "python3",
        "templates/telemetry_derived/experiments_v1/get_experiment_list.py",
        "{{ds}}",
    ],
    docker_image="mozilla/bigquery-etl:latest",
    xcom_push=True,
    owner="*****@*****.**",
    email=[
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
    ],
    dag=dag)

main_summary_experiments = bigquery_etl_query(
    task_id="main_summary_experiments",
    destination_table="experiments_v1",
    parameters=(
        "experiment_list:ARRAY<STRING>:{{task_instance.xcom_pull('main_summary_experiments_get_experiment_list') | tojson}}",
    ),
    project_id="moz-fx-data-shared-prod",
        dag=dag,
    )

    mozilla_vpn_derived__survey_cancellation_of_service__v1 = gke_command(
        task_id="mozilla_vpn_derived__survey_cancellation_of_service__v1",
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/mozilla_vpn_derived/survey_cancellation_of_service_v1/query.py",
        ]
        + [
            "--date",
            "{{ ds }}",
            "--survey_id",
            "5111573",
            "--api_token",
            "{{ var.value.surveygizmo_api_token }}",
            "--api_secret",
            "{{ var.value.surveygizmo_api_secret }}",
            "--destination_table",
            "moz-fx-data-shared-prod.mozilla_vpn_derived.survey_cancellation_of_service_v1",
        ],
        docker_image="mozilla/bigquery-etl:latest",
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
    )

    mozilla_vpn_derived__survey_intercept_q3__v1 = gke_command(
        task_id="mozilla_vpn_derived__survey_intercept_q3__v1",
# This task runs first and replaces the relevant partition, followed
# by the next two tasks that append to the same partition of the same table.
clients_daily_scalar_aggregates = gke_command(
    task_id="clients_daily_scalar_aggregates",
    owner="*****@*****.**",
    email=[
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
    ],
    cmds=["bash"],
    env_vars={
        "PROJECT": project_id,
        "PROD_DATASET": dataset_id,
        "DATASET": dataset_id,
        "SUBMISSION_DATE": "{{ ds }}",
        "RUN_QUERY": "t",
    },
    command=[
        "script/glam/generate_and_run_desktop_sql",
        "scalar",
        PERCENT_RELEASE_WINDOWS_SAMPLING,
    ],
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    dag=dag,
)

clients_daily_keyed_scalar_aggregates = gke_command(
    task_id="clients_daily_keyed_scalar_aggregates",
    # dags run schedule_interval after ds, and end date should be one day
    # before the dag runs, so 28-1 = 27 days after ds.
    "--end-date={{macros.ds_add(ds, 27)}}",
    # start date should be two schedule intervals before end date, to avoid
    # race conditions with downstream tables and pings received shortly after a
    # deletion request. (a worked example of this arithmetic follows this snippet)
    "--start-date={{macros.ds_add(ds, 27-28*2)}}",
]

# main_v4 is cheaper to handle in a project without flat-rate query pricing
on_demand = gke_command(
    task_id="on_demand",
    name="shredder-on-demand",
    command=base_command
    + [
        "--parallelism=2",
        "--billing-project=moz-fx-data-shredder",
        "--only=telemetry_stable.main_v4",
    ],
    docker_image=docker_image,
    is_delete_operator_pod=True,
    dag=dag,
)

# handle main_summary separately to ensure that it doesn't slow everything else
# down and also to avoid timeout errors related to queueing when running more
# than 2 DML DELETE statements at once on a single table
flat_rate_main_summary = gke_command(
    task_id="flat_rate_main_summary",
    name="shredder-flat-rate-main-summary",
    command=base_command
    + [
        "--parallelism=2",
        "--billing-project=moz-fx-data-bq-batch-prod",
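# Illustrative check of the date arithmetic in base_command above (assuming the stated
# 28-day schedule interval): for ds = 2021-01-01 the deletion window ends one day before
# the next run and starts two schedule intervals before that end date.
from datetime import date, timedelta

ds_example = date(2021, 1, 1)
end_date = ds_example + timedelta(days=27)             # 2021-01-28 == macros.ds_add(ds, 27)
start_date = ds_example + timedelta(days=27 - 28 * 2)  # 2020-12-03 == macros.ds_add(ds, -29)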
GA_PROPERTIES = [
    ("65789850", "www_mozilla_org"),
    ("66602784", "blog_mozilla_org"),
    ("65912487", "support_mozilla_org"),
    ("180612539", "monitor_firefox_com"),
    ("220432379", "vpn_mozilla_org"),
    ("65887927", "hacks_mozilla_org"),
    ("66726481", "developer_mozilla_org"),
]

with DAG(
    "copy_ga_sessions",
    default_args=default_args,
    schedule_interval="0 21 * * *",
) as dag:
    for property_id, property_name in GA_PROPERTIES:
        commands = [
            "python3",
            "script/marketing/copy_ga_sessions.py",
            "--start-date",
            "{{ ds }}",
            "--src-project",
            "ga-mozilla-org-prod-001",
            "--dst-project",
            "moz-fx-data-marketing-prod",
            "--overwrite",
            property_id,
        ]

        copy_ga_sessions = gke_command(
            task_id=f"copy_ga_sessions_{property_name}",
            command=commands,
            docker_image="mozilla/bigquery-etl:latest",
            gcp_conn_id="google_cloud_derived_datasets",
            dag=dag,
        )
email=[ "*****@*****.**", "*****@*****.**", "*****@*****.**" ], dag=dag) gcloud_docker_image = "google/cloud-sdk:263.0.0-slim" main_summary_dataproc_bucket = "gs://moz-fx-data-derived-datasets-parquet-tmp" main_ping_bigquery_export_prefix = main_summary_dataproc_bucket + "/export" main_ping_bigquery_export_dest = main_ping_bigquery_export_prefix + "/submission_date={{ds}}/document_namespace=telemetry/document_type=main/document_version=4/*.avro" # noqa main_ping_bigquery_export = gke_command( task_id="main_ping_bigquery_extract", command=[ "bq", "extract", "--destination_format=AVRO", "moz-fx-data-shared-prod:payload_bytes_decoded.telemetry_telemetry__main_v4${{ds_nodash}}", main_ping_bigquery_export_dest, ], docker_image=gcloud_docker_image, dag=dag, ) main_summary_dataproc = SubDagOperator( subdag=moz_dataproc_jar_runner( parent_dag_name="main_summary", dag_name="main_summary_dataproc", default_args=default_args, cluster_name="main-summary-{{ds}}", image_version="1.3", worker_machine_type="n1-standard-8", num_preemptible_workers=40,
"retry_delay": datetime.timedelta(minutes=30), } dag_name = "attitudes_daily" with models.DAG(dag_name, schedule_interval="0 3 * * *", default_args=default_args) as dag: surveygizmo_attitudes_daily_import = gke_command( task_id="surveygizmo_attitudes_daily_import", command=[ "python", "sql/moz-fx-data-shared-prod/telemetry_derived/surveygizmo_daily_attitudes/import_responses.py", "--date", "{{ ds }}", "--survey_id", Variable.get("surveygizmo_daily_attitudes_survey_id"), "--sg_api_token", Variable.get("surveygizmo_api_token"), "--sg_api_secret", Variable.get("surveygizmo_api_secret"), "--destination_table", "moz-fx-data-shared-prod.telemetry_derived.survey_gizmo_daily_attitudes" ], docker_image="mozilla/bigquery-etl:latest") wait_for_copy_deduplicate = ExternalTaskSensor( task_id="wait_for_copy_deduplicate", external_dag_id="copy_deduplicate", external_task_id="copy_deduplicate_all", execution_delta=datetime.timedelta(hours=2), mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR", email_on_retry=False,
from airflow import DAG
from datetime import timedelta, datetime
from utils.gcp import gke_command

default_args = {
    "owner": "*****@*****.**",
    "email": ["*****@*****.**", "*****@*****.**"],
    "depends_on_past": False,
    "start_date": datetime(2020, 6, 11),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
    "retry_delay": timedelta(minutes=10),
}

with DAG("stripe", default_args=default_args, schedule_interval="@daily") as dag:
    gke_command(
        task_id="stripe_import_events",
        command=[
            "bqetl",
            "stripe",
            "import",
            "--date={{ ds }}",
            "--api-key={{ var.value.stripe_api_key }}",
            "--resource=Event",
            "--table=moz-fx-data-shared-prod.stripe_external.events_v1",
        ],
        docker_image="mozilla/bigquery-etl:latest",
    )
default_args = {
    "owner": "*****@*****.**",
    "start_date": datetime.datetime(2021, 3, 18, 0, 0),
    "end_date": None,
    "email": ["*****@*****.**", "*****@*****.**"],
    "depends_on_past": False,
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
}

with DAG(
    "bqetl_firefox_ios",
    default_args=default_args,
    schedule_interval="0 4 * * *",
    doc_md=docs,
) as dag:

    org_mozilla_ios_firefox__unified_metrics__v1 = gke_command(
        task_id="org_mozilla_ios_firefox__unified_metrics__v1",
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/org_mozilla_ios_firefox/unified_metrics_v1/query.py",
        ]
        + [],
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
    )
        default_args=default_args,
        project='moz-fx-data-shared-prod',
        dataset='telemetry',
        table_or_view='fenix_events_v1',
        s3_prefix='fenix',
    ),
    task_id=fenix_task_id,
)

shredder_fenix = gke_command(
    task_id="shredder_amplitude_fenix",
    name="shredder-amplitude-fenix",
    command=[
        "script/shredder_amplitude",
        "--date={{ ds }}",
        "--api-key={{ var.value.fenix_amplitude_api_key }}",
        "--secret-key={{ var.value.fenix_amplitude_secret_key }}",
        "--table-id=moz-fx-data-shared-prod.org_mozilla_fenix_stable.deletion_request_v1",
        "--device-id-field=client_info.client_id",
    ],
    docker_image="mozilla/bigquery-etl:latest",
    dag=dag,
)

rocket_android_task_id = 'rocket_android_amplitude_export'
rocket_args = default_args.copy()
rocket_args["start_date"] = datetime.datetime(2019, 12, 2)

SubDagOperator(
    subdag=export_to_amplitude(
        dag_name=rocket_android_task_id,
        parent_dag_name=dag_name,
        default_args=rocket_args,
        default_args=subdag_args,
    ),
)

trim_database = gke_command(
    task_id="trim_database",
    cmds=["bash"],
    command=[
        "python",
        "-m",
        "mozaggregator.trim_db",
        "--retention-period",
        f"{365*2}",  # 2 year retention
        "--postgres-db",
        "telemetry",
        "--postgres-user",
        "root",
        "--postgres-pass",
        "{{ var.value.mozaggregator_postgres_pass }}",
        "--postgres-host",
        "{{ var.value.mozaggregator_postgres_host }}",
        # TODO: uncomment this after a successful run
        # "--no-dry-run",
    ],
    docker_image="mozilla/python_mozaggregator:latest",
    dag=dag,
)

prerelease_telemetry_aggregate_view_dataproc >> trim_database

# export to avro, if necessary
from airflow import DAG
from datetime import timedelta, datetime
from utils.gcp import gke_command

default_args = {
    "owner": "*****@*****.**",
    "email": ["*****@*****.**"],
    "depends_on_past": False,
    "start_date": datetime(2020, 6, 11),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
    "retry_delay": timedelta(minutes=30),
}

with DAG("mozfun", default_args=default_args, schedule_interval="0 1 * * *") as dag:
    docker_image = "mozilla/bigquery-etl:latest"

    publish_public_udfs = gke_command(
        task_id="publish_public_udfs",
        command=["script/publish_public_udfs"],
        docker_image=docker_image,
    )
"bqetl_app_store_connect", default_args=default_args, schedule_interval="0 20 * * *", doc_md=docs, ) as dag: apple_app_store__report_subscriber_detailed__v13 = gke_command( task_id="apple_app_store__report_subscriber_detailed__v13", command=[ "python", "sql/moz-fx-data-marketing-prod/apple_app_store/report_subscriber_detailed_v13/query.py", ] + [ "--key-id", "{{ var.value.app_store_connect_key_id }}", "--issuer-id", "{{ var.value.app_store_connect_issuer_id }}", "--private-key", "{{ var.value.app_store_connect_private_key }}", "--vendor-number", "{{ var.value.app_store_connect_vendor_number }}", "--date", "{{ ds }}", "--table", "moz-fx-data-marketing-prod.apple_app_store.report_subscriber_detailed_v13", ], docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", owner="*****@*****.**", email=["*****@*****.**"], )
"retries": 2, } with DAG( "bqetl_monitoring", default_args=default_args, schedule_interval="0 2 * * *", doc_md=docs, ) as dag: monitoring_derived__average_ping_sizes__v1 = gke_command( task_id="monitoring_derived__average_ping_sizes__v1", command=[ "python", "sql/moz-fx-data-shared-prod/monitoring_derived/average_ping_sizes_v1/query.py", ] + ["--date", "{{ ds }}"], docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", owner="*****@*****.**", email=["*****@*****.**"], ) monitoring_derived__bigquery_etl_scheduled_queries_cost__v1 = gke_command( task_id="monitoring_derived__bigquery_etl_scheduled_queries_cost__v1", command=[ "python", "sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_etl_scheduled_queries_cost_v1/query.py", ] + ["--date", "{{ ds }}"], docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", owner="*****@*****.**",
        artifact_bucket=artifact_bucket,
        storage_bucket=storage_bucket,
        default_args=subdag_args,
    ),
)

# export to avro, if necessary
if EXPORT_TO_AVRO:
    gke_command(
        task_id="export_main_avro",
        cmds=["bash"],
        command=[
            "bin/export-avro.sh",
            "moz-fx-data-shared-prod",
            "moz-fx-data-shared-prod:analysis",
            "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease",
            "main_v4",
            "'nightly', 'beta'",
            "{{ ds }}",
        ],
        docker_image="mozilla/python_mozaggregator:latest",
        dag=dag,
    ).set_downstream(prerelease_telemetry_aggregate_view_dataproc)

    gke_command(
        task_id="export_saved_session_avro",
        cmds=["bash"],
        command=[
            "bin/export-avro.sh",
            "moz-fx-data-shared-prod",
            "moz-fx-data-shared-prod:analysis",
    sum(LOGICAL_MAPPING.values(), [])
)

for product in final_products:
    query = generate_and_run_glean_query(
        task_id=f"incremental_{product}",
        product=product,
        destination_project_id=PROJECT,
        env_vars=dict(STAGE="incremental"),
        dag=dag,
    )
    # get the dependencies for the logical mapping, or just pass through the
    # daily query unmodified
    for dependency in LOGICAL_MAPPING.get(product, [product]):
        mapping[dependency] >> query

    export = gke_command(
        task_id=f"export_{product}",
        cmds=["bash"],
        env_vars={
            "SRC_PROJECT": PROJECT,
            "DATASET": "glam_etl",
            "PRODUCT": product,
            "BUCKET": BUCKET,
        },
        command=["script/glam/export_csv"],
        docker_image="mozilla/bigquery-etl:latest",
        gcp_conn_id="google_cloud_derived_datasets",
        dag=dag,
    )

    query >> export
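# Illustrative sketch (assumed shape, not the actual mapping) of how LOGICAL_MAPPING
# drives the wiring above: a "logical" GLAM product is built from one or more per-channel
# apps, so its incremental query waits on each of their daily queries, while an unmapped
# product falls back to depending only on its own daily query.
LOGICAL_MAPPING_EXAMPLE = {
    "org_mozilla_fenix_glam_release": ["org_mozilla_firefox"],
}
# LOGICAL_MAPPING_EXAMPLE.get("org_mozilla_fenix_glam_release", [...]) returns
#     ["org_mozilla_firefox"], so mapping["org_mozilla_firefox"] >> query.
# For any other product, .get(product, [product]) returns [product], so the
#     daily query passes through unmodified.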
wait_for_copy_deduplicate = ExternalTaskSensor(
    task_id="wait_for_copy_deduplicate",
    external_dag_id="copy_deduplicate",
    external_task_id="copy_deduplicate_all",
    execution_delta=timedelta(hours=-1),
    check_existence=True,
    dag=dag,
)

run_sql = gke_command(
    task_id="run_sql",
    cmds=["bash"],
    env_vars={
        "DATASET": "glam_etl",
        "SUBMISSION_DATE": "{{ ds }}",
    },
    command=["script/glam/run_glam_sql"],
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    dag=dag,
)

export_csv = gke_command(
    task_id="export_csv",
    cmds=["bash"],
    env_vars={"DATASET": "glam_etl"},
    command=["script/glam/export_csv"],
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    dag=dag,
)
    image=docker_image,
    dag=dag,
)

wait_for_telemetry_derived__ssl_ratios__v1 = ExternalTaskSensor(
    task_id="wait_for_telemetry_derived__ssl_ratios__v1",
    external_dag_id="bqetl_ssl_ratios",
    external_task_id="telemetry_derived__ssl_ratios__v1",
    execution_delta=datetime.timedelta(seconds=7200),
    check_existence=True,
    mode="reschedule",
    pool="DATA_ENG_EXTERNALTASKSENSOR",
)

export_public_data_json_telemetry_derived__ssl_ratios__v1.set_upstream(
    wait_for_telemetry_derived__ssl_ratios__v1
)

public_data_gcs_metadata = gke_command(
    task_id="public_data_gcs_metadata",
    command=["script/publish_public_data_gcs_metadata"],
    docker_image=docker_image,
    dag=dag,
)

public_data_gcs_metadata.set_upstream(
    [
        export_public_data_json_telemetry_derived__ssl_ratios__v1,
    ]
)
task_id="wait_for_copy_deduplicate", external_dag_id="copy_deduplicate", external_task_id="copy_deduplicate_all", execution_delta=timedelta(hours=1), check_existence=True, dag=dag, ) for product in PRODUCTS: query = generate_and_run_glean_query( task_id=product, product=product, destination_project_id="glam-fenix-dev", dag=dag, ) export = gke_command( task_id="export_{}".format(product), cmds=["bash"], env_vars={ "DATASET": "glam_etl", "PRODUCT": product, }, command=["script/glam/export_csv"], docker_image="mozilla/bigquery-etl:latest", gcp_conn_id="google_cloud_derived_datasets", dag=dag, ) wait_for_copy_deduplicate >> query >> export