# NOTE(review): fragment begins mid-argument-list — the opening of the
# enclosing subdag/operator call is outside this view.
service_account=params.client_email,
artifact_bucket=params.artifact_bucket,
storage_bucket=params.storage_bucket,
default_args=subdag_args,
),
)

# Load the BGBB "active profiles" prediction output into BigQuery
# (telemetry_derived) via a GKE batch-load subdag.
bgbb_pred_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="bgbb_pred_bigquery_load",
        default_args=default_args,
        dataset="bgbb/active_profiles",
        dataset_version="v1",
        p2b_table_alias="active_profiles_v1",
        bigquery_dataset="telemetry_derived",
        ds_type="ds",
        gke_cluster_name="bq-load-gke-1",
        # Cluster the destination table by sample_id; the column is cast to
        # INT64 during the load (see `replace` below).
        cluster_by=["sample_id"],
        rename={"submission_date_s3": "submission_date"},
        replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
        env_vars={"GOOGLE_CLOUD_PROJECT": "{{ var.value.gcp_shared_prod_project }}"},
    ),
    task_id="bgbb_pred_bigquery_load",
    dag=dag)

# Daily per-client search aggregates. Fragment ends mid-call: more
# bigquery_etl_query kwargs follow outside this view.
search_clients_daily_bigquery = bigquery_etl_query(
    task_id="search_clients_daily_bigquery",
    destination_table="search_clients_daily_v8",
    dataset_id="search_derived",
    project_id="moz-fx-data-shared-prod",
# Body of a per-dataset loop (loop header outside this view): build one
# BigQuery-load subdag task per dataset, keyed by a sanitized task name.
task_name = dataset.replace('-', '_') + '_bigquery_load'
kwargs = {
    'parent_dag_name': dag.dag_id,
    'dag_name': task_name,
    'default_args': default_args,
    'dataset_s3_bucket': 'net-mozaws-prod-us-west-2-pipeline-data',
    'aws_conn_id': 'aws_prod_iam_s3',
    'dataset': dataset,
    'gke_cluster_name': 'bq-load-gke-1',
    'bigquery_dataset': 'telemetry_derived',
}
# Per-dataset overrides (from `values`, defined outside this view) take
# precedence over the defaults above.
kwargs.update(values)
tasks[task_name] = SubDagOperator(subdag=load_to_bigquery(**kwargs), task_id=task_name)

# Daily and last seen views on top of core pings.
core_clients_daily = bigquery_etl_query(
    task_id='core_clients_daily',
    destination_table='core_clients_daily_v1',
    dataset_id='telemetry',
)
# The daily view reads the core-ping table, so it waits on that load.
tasks['telemetry_core_parquet_bigquery_load'] >> core_clients_daily

# Fragment ends mid-call: more bigquery_etl_query kwargs follow outside this view.
core_clients_last_seen = bigquery_etl_query(
    task_id='core_clients_last_seen',
    destination_table='core_clients_last_seen_raw_v1',
# NOTE(review): fragment begins mid-argument-list — the opening of the
# enclosing operator call (the CrashSummaryView batch job) is outside this view.
ebs_volume_size=250,
execution_timeout=timedelta(hours=4),
env=tbv_envvar(
    "com.mozilla.telemetry.views.CrashSummaryView",
    {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "outputBucket": "{{ task.__class__.private_output_bucket }}"
    }),
uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
dag=dag)

# Load crash_summary v2 parquet from S3 into BigQuery (telemetry_derived)
# via a GKE batch-load subdag, partitioned on submission_date.
crash_summary_view_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="crash_summary_view_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        ds_type="ds",
        aws_conn_id="aws_dev_iam_s3",
        dataset="crash_summary",
        dataset_version="v2",
        date_submission_col="submission_date",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_derived",
    ),
    task_id="crash_summary_view_bigquery_load",
    dag=dag)

# The BigQuery load runs only after the view job succeeds.
crash_summary_view >> crash_summary_view_bigquery_load
# Report main_summary's run status to the external status dashboard.
register_status(main_summary, "Main Summary", "A summary view of main pings.")

# Email a notification when the main_summary output schema changes.
main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**", "*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)

# Load main_summary v4 parquet from S3 into BigQuery via a GKE batch-load subdag.
main_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="main_summary_bigquery_load",
    dag=dag)

# Fragment ends mid-call: the mozetl_envvar options dict (and remaining
# operator kwargs) continue outside this view.
engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
        options={
            "input_bucket": "{{ task.__class__.private_output_bucket }}",
# NOTE(review): fragment begins mid-argument-list — the opening of the
# enclosing crash_report_parquet operator call is outside this view.
env={"date": "{{ ds_nodash }}"},
uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/socorro_import/ImportCrashData.ipynb",
output_visibility="public",
dag=dag,
)

# Report the Socorro import job's run status to the status dashboard.
register_status(
    crash_report_parquet,
    crash_report_parquet.job_name,
    "Convert processed crash reports into parquet for analysis",
)

# Load socorro_crash v2 parquet into BigQuery (telemetry_raw); partitioned
# on crash_date rather than the usual submission date.
crash_report_parquet_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="crash_report_parquet_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="socorro_crash",
        dataset_version="v2",
        date_submission_col="crash_date",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_raw",
    ),
    task_id="crash_report_parquet_bigquery_load",
    dag=dag)

# The BigQuery load runs only after the parquet conversion succeeds.
crash_report_parquet >> crash_report_parquet_bigquery_load
# NOTE(review): fragment begins mid-argument-list — the opening of the
# enclosing crash_report_parquet operator call is outside this view.
execution_timeout=timedelta(hours=4),
instance_count=10,
env={"date": "{{ ds_nodash }}"},
uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/socorro_import/ImportCrashData.ipynb",
output_visibility="public",
dag=dag,
)

# Report the Socorro import job's run status to the status dashboard.
register_status(
    crash_report_parquet,
    crash_report_parquet.job_name,
    "Convert processed crash reports into parquet for analysis",
)

# Load socorro_crash v2 parquet into BigQuery via a GKE batch-load subdag;
# partitioned on crash_date. No bigquery_dataset is given here, so the
# subdag's default target dataset applies — TODO confirm against load_to_bigquery.
crash_report_parquet_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="crash_report_parquet_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="socorro_crash",
        dataset_version="v2",
        date_submission_col="crash_date",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="crash_report_parquet_bigquery_load",
    dag=dag)

# The BigQuery load runs only after the parquet conversion succeeds.
crash_report_parquet >> crash_report_parquet_bigquery_load
# NOTE(review): fragment begins mid-argument-list — the opening of the
# enclosing Dataproc main_summary call is outside this view.
job_name="main_summary_view_{{ds_nodash}}",
init_actions_uris=[],
gcp_conn_id="google_cloud_airflow_dataproc",
),
task_id="main_summary_dataproc",
dag=dag,
)

# Load the Dataproc-produced main_summary v4 output from GCS into the
# BigQuery "backfill" dataset.
main_summary_dataproc_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_dataproc_bigquery_load",
        default_args=default_args,
        # Strip the gs:// scheme — presumably load_to_bigquery expects a bare
        # bucket name here; verify against the subdag factory.
        dataset_gcs_bucket=main_summary_dataproc_bucket.replace("gs://", ""),
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="backfill",
        cluster_by=["sample_id"],
        # submission_date is dropped, then recreated by renaming submission_date_s3.
        drop=["submission_date"],
        rename={"submission_date_s3": "submission_date"},
        replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
    ),
    task_id="main_summary_dataproc_bigquery_load",
    dag=dag,
)

# Fragment ends mid-list: the gsutil command arguments continue outside this view.
main_ping_bigquery_export_delete = gke_command(
    task_id="main_ping_bigquery_export_delete",
    command=[
        "gsutil",
        "-m",
# NOTE(review): fragment begins mid-list — these tokens close a list and an
# enclosing call whose opening is outside this view.
],
gcp_conn_id=params.conn_id,
service_account=params.client_email,
artifact_bucket=params.artifact_bucket,
storage_bucket=params.storage_bucket,
default_args=subdag_args,
),
)

# Load the BGBB "active profiles" prediction output into BigQuery
# (telemetry_derived) via a GKE batch-load subdag.
bgbb_pred_bigquery_load = SubDagOperator(subdag=load_to_bigquery(
    parent_dag_name=dag.dag_id,
    dag_name="bgbb_pred_bigquery_load",
    default_args=default_args,
    dataset="bgbb/active_profiles",
    dataset_version="v1",
    p2b_table_alias="active_profiles_v1",
    bigquery_dataset="telemetry_derived",
    ds_type="ds",
    gke_cluster_name="bq-load-gke-1",
    # Cluster by sample_id; the column is cast to INT64 during the load.
    cluster_by=["sample_id"],
    rename={"submission_date_s3": "submission_date"},
    replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
    ),
    task_id="bgbb_pred_bigquery_load",
    dag=dag)

# Daily per-client search aggregates. Fragment ends mid-call: more
# bigquery_etl_query kwargs follow outside this view.
search_clients_daily_bigquery = bigquery_etl_query(
    task_id="search_clients_daily_bigquery",
    destination_table="search_clients_daily_v8",
    dataset_id="search_derived",
    project_id="moz-fx-data-shared-prod",
    owner="*****@*****.**",
# NOTE(review): fragment begins mid-argument-list — the opening of the
# enclosing churn job operator call is outside this view.
instance_count=10,
env=mozetl_envvar("churn", {
    "start_date": "{{ ds_nodash }}",
    "bucket": "{{ task.__class__.private_output_bucket }}"
}),
uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
output_visibility="public",
dag=dag)

# Load churn v3 parquet into BigQuery via a GKE batch-load subdag;
# partitioned on week_start rather than a daily submission date.
churn_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="churn_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="churn",
        dataset_version="v3",
        date_submission_col="week_start",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="churn_bigquery_load",
    dag=dag)

# Fragment ends mid-dict: the churn v2 env options continue outside this view.
churn_v2 = MozDatabricksSubmitRunOperator(
    task_id="churn_v2",
    job_name="churn 7-day v2",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    env=mozetl_envvar("churn", {
        "start_date": "{{ ds_nodash }}",
# NOTE(review): fragment begins mid-argument-list — the opening of the
# enclosing sync view operator call is outside this view.
env=tbv_envvar(
    "com.mozilla.telemetry.views.SyncView",
    {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
dag=dag)

# Load sync_summary v2 parquet into BigQuery (telemetry_derived) via a GKE
# batch-load subdag.
sync_view_bigquery_load = SubDagOperator(subdag=load_to_bigquery(
    parent_dag_name=dag.dag_id,
    dag_name="sync_view_bigquery_load",
    default_args=default_args,
    dataset_s3_bucket="telemetry-parquet",
    aws_conn_id="aws_dev_iam_s3",
    dataset="sync_summary",
    dataset_version="v2",
    gke_cluster_name="bq-load-gke-1",
    bigquery_dataset="telemetry_derived",
    ),
    task_id="sync_view_bigquery_load",
    dag=dag)

# Fragment ends mid-call: the tbv_envvar arguments continue outside this view.
sync_events_view = EMRSparkOperator(
    task_id="sync_events_view",
    job_name="Sync Events View",
    execution_timeout=timedelta(hours=10),
    instance_count=1,
    email=['*****@*****.**'],
    env=tbv_envvar(
# NOTE(review): fragment begins mid-dict — these are the tail of the
# tbv_envvar options for the first_shutdown_summary job; the call's
# opening is outside this view.
"to": "{{ ds_nodash }}",
"bucket": "{{ task.__class__.private_output_bucket }}",
"doc-type": "first_shutdown",
"read-mode": "aligned",
"input-partition-multiplier": "4"
}),
uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
dag=dag)

# Load first_shutdown_summary v4 parquet into BigQuery (telemetry_derived),
# clustered by sample_id.
first_shutdown_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="first_shutdown_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="first_shutdown_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_derived",
        cluster_by=["sample_id"],
        # submission_date is dropped, then recreated by renaming submission_date_s3.
        drop=["submission_date"],
        rename={"submission_date_s3": "submission_date"},
        replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
    ),
    task_id="first_shutdown_summary_bigquery_load",
    dag=dag)

# The BigQuery load runs only after the summary view job succeeds.
first_shutdown_summary >> first_shutdown_summary_bigquery_load
# Report main_summary's run status to the external status dashboard.
register_status(main_summary, "Main Summary", "A summary view of main pings.")

# Email a notification when the main_summary output schema changes.
main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**", "*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)

# Load main_summary v4 parquet from S3 into BigQuery via a GKE batch-load subdag.
main_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="main_summary_bigquery_load",
    dag=dag)

# Fragment ends mid-call: the mozetl_envvar options dict (and remaining
# operator kwargs) continue outside this view.
engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
        options={
            "input_bucket": "{{ task.__class__.private_output_bucket }}",
# NOTE(review): fragment begins mid-argument-list — the opening of the
# enclosing sync pings operator call is outside this view.
job_name="Sync Pings View",
execution_timeout=timedelta(hours=10),
instance_count=5,
env=tbv_envvar("com.mozilla.telemetry.views.SyncView", {
    "from": "{{ ds_nodash }}",
    "to": "{{ ds_nodash }}",
    "bucket": "{{ task.__class__.private_output_bucket }}"}),
uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
dag=dag)

# Load sync_summary v2 parquet into BigQuery via a GKE batch-load subdag.
sync_view_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="sync_view_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_summary",
        dataset_version="v2",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="sync_view_bigquery_load",
    dag=dag)

# Fragment ends mid-dict: the SyncEventView options continue outside this view.
sync_events_view = EMRSparkOperator(
    task_id="sync_events_view",
    job_name="Sync Events View",
    execution_timeout=timedelta(hours=10),
    instance_count=1,
    email=['*****@*****.**'],
    env=tbv_envvar("com.mozilla.telemetry.views.SyncEventView", {
        "from": "{{ ds_nodash }}",
# NOTE(review): fragment begins mid-argument-list — the opening of the
# enclosing operator call is outside this view. MainSummaryView is reused
# with doc-type=first_shutdown to build the first-shutdown summary.
task_id="first_shutdown_summary",
job_name="First Shutdown Summary View",
execution_timeout=timedelta(hours=1),
instance_count=1,
env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
    "from": "{{ ds_nodash }}",
    "to": "{{ ds_nodash }}",
    "bucket": "{{ task.__class__.private_output_bucket }}",
    "doc-type": "first_shutdown",
    "read-mode": "aligned",
    "input-partition-multiplier": "4"
}),
uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
dag=dag)

# Load first_shutdown_summary v4 parquet into BigQuery via a GKE batch-load subdag.
first_shutdown_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="first_shutdown_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="first_shutdown_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="first_shutdown_summary_bigquery_load",
    dag=dag)

# The BigQuery load runs only after the summary view job succeeds.
first_shutdown_summary >> first_shutdown_summary_bigquery_load
# NOTE(review): fragment begins mid-argument-list — the opening of the
# enclosing first_shutdown_summary operator call is outside this view.
execution_timeout=timedelta(hours=1),
instance_count=1,
env=tbv_envvar(
    "com.mozilla.telemetry.views.MainSummaryView",
    {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
        # MainSummaryView is reused with doc-type=first_shutdown for this job.
        "doc-type": "first_shutdown",
        "read-mode": "aligned",
        "input-partition-multiplier": "4"
    }),
uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
dag=dag)

# Load first_shutdown_summary v4 parquet into BigQuery via a GKE batch-load subdag.
first_shutdown_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="first_shutdown_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="first_shutdown_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="first_shutdown_summary_bigquery_load",
    dag=dag)

# The BigQuery load runs only after the summary view job succeeds.
first_shutdown_summary >> first_shutdown_summary_bigquery_load
# Body of a per-dataset loop (loop header and `task_name`/`dataset`/`values`
# bindings outside this view): build one BigQuery-load subdag per dataset.
# This variant targets the telemetry_raw dataset.
kwargs = {
    'parent_dag_name': dag.dag_id,
    'dag_name': task_name,
    'default_args': default_args,
    'dataset_s3_bucket': 'net-mozaws-prod-us-west-2-pipeline-data',
    'aws_conn_id': 'aws_prod_iam_s3',
    'dataset': dataset,
    'gke_cluster_name': 'bq-load-gke-1',
    'bigquery_dataset': 'telemetry_raw',
}
# Per-dataset overrides take precedence over the defaults above.
kwargs.update(values)
tasks[task_name] = SubDagOperator(
    subdag=load_to_bigquery(**kwargs),
    task_id=task_name)

# Daily and last seen views on top of core pings.
core_clients_daily = bigquery_etl_query(
    task_id='core_clients_daily',
    destination_table='core_clients_daily_v1',
)
# The daily view reads the core-ping table, so it waits on that load.
tasks['telemetry_core_parquet_bigquery_load'] >> core_clients_daily

# Fragment ends mid-call: more bigquery_etl_query kwargs follow outside this view.
core_clients_last_seen = bigquery_etl_query(
    task_id='core_clients_last_seen',
    destination_table='core_clients_last_seen_raw_v1',