# Fragment: tail of the default_args dict (its opening `default_args = {`
# lies outside this chunk) -- retry/notification policy shared by the tasks.
'email_on_failure': True,
'email_on_retry': True,
'retries': 3,
'retry_delay': timedelta(minutes=30),
}

# Daily DAG computing aggregates of mobile telemetry.
dag = DAG('mobile_aggregates', default_args=default_args, schedule_interval='@daily')

# Run the mobile aggregator script on EMR; output goes to the private bucket.
mobile_aggregate_view = EMRSparkOperator(
    task_id="mobile_aggregate_view",
    job_name="Mobile Aggregate View",
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    env={
        "date": "{{ ds_nodash }}",
        "channels": "nightly",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    },
    uri=("https://raw.githubusercontent.com/"
         "mozilla/telemetry-airflow/master/jobs/run_mobile_aggregator.sh"),
    dag=dag)

# register_status presumably reports this task to an external status
# dashboard -- defined elsewhere in the project.
register_status(mobile_aggregate_view, 'Mobile Aggregates',
                'Aggregates of metrics sent through the mobile-events pings.')
# BLP pipeline ordering: logs -> cluster sensor -> S3-to-GCS copy -> BQ load.
blp_logs >> blp_job_sensor >> s3_to_gcs >> load_blpadi_to_bq

# Daily (03:00 UTC) DAG that processes AMO logs via an EMR job flow.
amo_dag = DAG(
    'mango_log_processing_amo',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=6),
    schedule_interval='0 3 * * *',
)

# Spin up the EMR cluster that runs the AMO processing steps.
amo_logs = EmrCreateJobFlowOperator(
    task_id='amo_create_job_flow',
    job_flow_overrides={'Steps': AMO_STEPS},
    aws_conn_id='aws_data_iam',
    emr_conn_id='emr_data_iam_mango',
    dag=amo_dag,
)

register_status(amo_logs, 'AMO Logs', 'Mango Processed AMO Logs')


def _clear_amo_run(context):
    """On sensor retry, clear this execution date so the job flow re-runs."""
    return amo_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date'])


# Wait for the job flow created above to finish.
amo_job_sensor = EmrJobFlowSensor(
    task_id='amo_check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('amo_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
    dag=amo_dag,
    on_retry_callback=_clear_amo_run,
)

amo_logs >> amo_job_sensor

# For AMO Dev and Stage Environments
# Fragment: trailing arguments of a main_summary export subdag (the opening
# call lies outside this chunk); targets a date-partitioned BigQuery table.
table="moz-fx-data-shared-prod:telemetry_derived.main_summary_v4${{ds_nodash}}",
static_partitions=["submission_date_s3={{ds_nodash}}"],
arguments=[
    "--partition-by=sample_id",
    "--replace='{{ds_nodash}}' AS submission_date",
    "--maps-from-entries",
] + main_summary_bigint_columns,
parent_dag_name=dag.dag_id,
dag_name="main_summary_export",
default_args=default_args,
num_workers=40),
    task_id="main_summary_export",
    executor=get_default_executor(),
    dag=dag)

# register_status presumably reports this task to an external status
# dashboard -- defined elsewhere in the project.
register_status(main_summary, "Main Summary", "A summary view of main pings.")

# Daily addons rollup in the telemetry_derived dataset.
addons = bigquery_etl_query(
    task_id="addons",
    destination_table="addons_v2",
    project_id="moz-fx-data-shared-prod",
    dataset_id="telemetry_derived",
    dag=dag)

# Fragment: the addon_aggregates call is cut off at the end of this chunk.
addon_aggregates = bigquery_etl_query(
    task_id="addon_aggregates",
    destination_table="addon_aggregates_v2",
    project_id="moz-fx-data-shared-prod",
    dataset_id="telemetry_derived",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
# Weekly longitudinal rollup of client history, run on Databricks.
longitudinal = MozDatabricksSubmitRunOperator(
    task_id="longitudinal",
    job_name="Longitudinal View",
    execution_timeout=timedelta(hours=12),
    instance_count=16,
    instance_type="i3.8xlarge",
    env=tbv_envvar("com.mozilla.telemetry.views.LongitudinalView", {
        "bucket": "{{ task.__class__.private_output_bucket }}",
        "to": DS_WEEKLY
    },
        metastore_location="s3://telemetry-parquet/longitudinal"),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

# register_status presumably reports this task to an external status
# dashboard -- defined elsewhere in the project.
register_status(longitudinal, "Longitudinal",
                "A 6-month longitudinal view of client history.")

# Fragment: addon recommender training job; the env dict (and the rest of the
# call) is cut off at the end of this chunk.
addon_recommender = EMRSparkOperator(
    task_id="addon_recommender",
    job_name="Train the Addon Recommender",
    execution_timeout=timedelta(hours=10),
    instance_count=20,
    owner="*****@*****.**",
    email=[
        "*****@*****.**", "*****@*****.**",
        "*****@*****.**"
    ],
    env={
        "date": DS_WEEKLY,
        "privateBucket": "{{ task.__class__.private_output_bucket }}",
        "publicBucket": "{{ task.__class__.public_output_bucket }}"
# Fragment: trailing arguments of the main_summary operator (the opening call
# lies outside this chunk).
ebs_volume_size=250,
env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView",
    options={
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "schema-report-location":
            "s3://{{ task.__class__.private_output_bucket }}/schema/main_summary/submission_date_s3={{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
        "read-mode": "aligned",
        "input-partition-multiplier": "400"
    },
    dev_options={
        "channel": "nightly",  # run on smaller nightly data rather than release
    }),
dag=dag)

# register_status presumably reports this task to an external status
# dashboard -- defined elsewhere in the project.
register_status(main_summary, "Main Summary", "A summary view of main pings.")

# Email a notification when the main_summary schema under this S3 prefix
# changes.
main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**", "*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)

# Fragment: BigQuery load subdag; the load_to_bigquery(...) call is cut off
# at the end of this chunk.
main_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
# output: telemetry-parquet/socorro_crash/v2 crash_report_parquet = EMRSparkOperator( task_id="crash_report_parquet", job_name="Socorro Crash Reports Parquet", execution_timeout=timedelta(hours=4), instance_count=10, env={"date": "{{ ds_nodash }}"}, uri= "https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/socorro_import/ImportCrashData.ipynb", output_visibility="public", dag=dag, ) register_status( crash_report_parquet, crash_report_parquet.job_name, "Convert processed crash reports into parquet for analysis", ) crash_report_parquet_bigquery_load = SubDagOperator( subdag=load_to_bigquery( parent_dag_name=dag.dag_id, dag_name="crash_report_parquet_bigquery_load", default_args=default_args, dataset_s3_bucket="telemetry-parquet", aws_conn_id="aws_dev_iam_s3", dataset="socorro_crash", dataset_version="v2", date_submission_col="crash_date", gke_cluster_name="bq-load-gke-1", bigquery_dataset="telemetry_raw",
# Aggregate metrics sent through the mobile-events pings. The mozetl "mobile"
# entry point is pulled from the python_mozaggregator repo and writes v2
# aggregates to the private output bucket.
_mobile_env = mozetl_envvar(
    "mobile",
    {
        "date": "{{ ds_nodash }}",
        "channels": "nightly",
        "output": "s3://{{ task.__class__.private_output_bucket }}/mobile_metrics_aggregates/v2",
        "num-partitions": 5 * 32,
    },
    other={
        "MOZETL_GIT_PATH": "https://github.com/mozilla/python_mozaggregator.git",
        "MOZETL_EXTERNAL_MODULE": "mozaggregator",
    },
)

mobile_aggregate_view = MozDatabricksSubmitRunOperator(
    task_id="mobile_aggregate_view",
    job_name="Mobile Aggregate View",
    release_label="6.1.x-scala2.11",
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    env=_mobile_env,
    dag=dag,
)

register_status(
    mobile_aggregate_view,
    "Mobile Aggregates",
    "Aggregates of metrics sent through the mobile-events pings.",
)
# Daily (03:00 UTC) DAG that processes AMO logs via an EMR job flow.
amo_dag = DAG(
    'mango_log_processing_amo',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=6),
    schedule_interval='0 3 * * *'
)

# Spin up the EMR cluster that runs the AMO processing steps.
amo_logs = EmrCreateJobFlowOperator(
    task_id='amo_create_job_flow',
    job_flow_overrides={'Steps': AMO_STEPS},
    aws_conn_id='aws_data_iam',
    emr_conn_id='emr_data_iam_mango',
    dag=amo_dag
)

# register_status presumably reports this task to an external status
# dashboard -- defined elsewhere in the project.
register_status(amo_logs, 'AMO Logs', 'Mango Processed AMO Logs')

# Wait for the job flow above to finish; on retry, clear this execution date
# so the whole flow re-runs.
amo_job_sensor = EmrJobFlowSensor(
    task_id='amo_check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('amo_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
    dag=amo_dag,
    on_retry_callback=lambda context: amo_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date']),
)

amo_logs.set_downstream(amo_job_sensor)

# For AMO Dev and Stage Environments
# Fragment: the amo_dev_stage_dag DAG(...) call is cut off at the end of this
# chunk.
amo_dev_stage_dag = DAG(
# Deleting today's partition before loading makes re-runs of this DAG
# idempotent.
_partition_table = '{}.{}${{{{ds_nodash}}}}'.format(bq_dataset, bq_table_name)  # noqa

remove_bq_table_partition = BigQueryTableDeleteOperator(
    task_id='remove_bq_table_partition',
    bigquery_conn_id=bq_gcp_conn_id,
    deletion_dataset_table=_partition_table,
    ignore_if_missing=True,
    dag=dag,
)

# Load the parquet output into BigQuery from a pod on the GKE cluster.
bq_load = GKEPodOperator(
    task_id='bigquery_load',
    gcp_conn_id=bq_gcp_conn_id,
    project_id=bq_connection.project_id,
    name='load-socorro-crash-parquet-to-bq',
    image=docker_image,
    arguments=gke_args,
    env_vars={"GOOGLE_CLOUD_PROJECT": "{{ var.value.gcp_shared_prod_project }}"},
    dag=dag,
)

register_status(
    bq_load,
    "Socorro Crash Reports Parquet",
    "Convert processed crash reports into parquet for analysis",
)

# Ordering: copy to GCS, build parquet, drop today's partition, then load.
s3_to_gcs >> crash_report_parquet >> remove_bq_table_partition >> bq_load
# Fragment: trailing arguments of the fennec_ios_events_to_amplitude operator
# (the opening call lies outside this chunk).
job_name="Fennec iOS Events to Amplitude",
execution_timeout=timedelta(hours=8),
instance_count=FENNEC_IOS_INSTANCES,
email=['*****@*****.**', '*****@*****.**'],
env={
    "date": "{{ ds_nodash }}",
    "max_requests": FENNEC_IOS_INSTANCES * VCPUS_PER_INSTANCE,
    "key_file": key_file("fennec_ios"),
    "artifact": get_artifact_url(slug, branch="master"),
    "config_filename": "fennec_ios_events_schemas.json",
},
uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/events_to_amplitude.sh",
dag=dag
)

# register_status presumably reports this task to an external status
# dashboard -- defined elsewhere in the project.
register_status(fennec_ios_events_to_amplitude, "Firefox-iOS Amplitude events",
                "Daily job sending Firefox iOS events to Amplitude.")

# Fragment: DevTools release events job; the tbv_envvar options dict (and the
# rest of the call) is cut off at the end of this chunk.
devtools_release_events_to_amplitude = EMRSparkOperator(
    task_id="devtools_release_events_to_amplitude",
    job_name="DevTools Release Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=DEVTOOLS_INSTANCES,
    dev_instance_count=DEVTOOLS_INSTANCES,
    email=['*****@*****.**', '*****@*****.**'],
    owner='*****@*****.**',
    env=tbv_envvar(
        "com.mozilla.telemetry.streaming.EventsToAmplitude",
        {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "max_parallel_requests": str(DEVTOOLS_INSTANCES * VCPUS_PER_INSTANCE),
# input: crashstats-telemetry-crashes-prod-us-west-2/v1/crash_report # output: telemetry-parquet/socorro_crash/v2 crash_report_parquet = EMRSparkOperator( task_id="crash_report_parquet", job_name="Socorro Crash Reports Parquet", execution_timeout=timedelta(hours=4), instance_count=10, env={"date": "{{ ds_nodash }}"}, uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/socorro_import/ImportCrashData.ipynb", output_visibility="public", dag=dag, ) register_status( crash_report_parquet, crash_report_parquet.job_name, "Convert processed crash reports into parquet for analysis", ) crash_report_parquet_bigquery_load = SubDagOperator( subdag=load_to_bigquery( parent_dag_name=dag.dag_id, dag_name="crash_report_parquet_bigquery_load", default_args=default_args, dataset_s3_bucket="telemetry-parquet", aws_conn_id="aws_dev_iam_s3", dataset="socorro_crash", dataset_version="v2", date_submission_col="crash_date", gke_cluster_name="bq-load-gke-1", ),
# Fragment: trailing arguments of the longitudinal operator (the opening call
# lies outside this chunk). Weekly longitudinal rollup of client history.
task_id="longitudinal",
job_name="Longitudinal View",
execution_timeout=timedelta(hours=12),
instance_count=16,
instance_type="i3.8xlarge",
env=tbv_envvar(
    "com.mozilla.telemetry.views.LongitudinalView",
    {
        "bucket": "{{ task.__class__.private_output_bucket }}",
        "to": DS_WEEKLY
    },
    metastore_location="s3://telemetry-parquet/longitudinal"),
uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
dag=dag)

# register_status presumably reports this task to an external status
# dashboard -- defined elsewhere in the project.
register_status(longitudinal, "Longitudinal",
                "A 6-month longitudinal view of client history.")

# Weekly hardware survey report; publishes to the public bucket.
game_hw_survey = EMRSparkOperator(
    task_id="game_hw_survey",
    job_name="Firefox Hardware Report",
    execution_timeout=timedelta(hours=5),
    instance_count=15,
    owner="*****@*****.**",
    # depends_on_past: presumably each run builds on the prior day's output
    # -- TODO confirm
    depends_on_past=True,
    email=["*****@*****.**", "*****@*****.**",
           "*****@*****.**"],
    env={"date": "{{ ds_nodash }}",
         "bucket": "{{ task.__class__.public_output_bucket }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/hardware_report.sh",
    output_visibility="public",
    dag=dag)
# Fragment: tail of the default_args dict (its opening `default_args = {`
# lies outside this chunk) -- retry/notification policy shared by the tasks.
'email_on_retry': True,
'retries': 3,
'retry_delay': timedelta(minutes=30),
}

# Daily DAG computing aggregates of mobile telemetry.
dag = DAG(
    'mobile_aggregates',
    default_args=default_args,
    schedule_interval='@daily'
)

# Run the mobile aggregator script on EMR; output goes to the private bucket.
mobile_aggregate_view = EMRSparkOperator(
    task_id="mobile_aggregate_view",
    job_name="Mobile Aggregate View",
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    env={
        "date": "{{ ds_nodash }}",
        "channels": "nightly",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    },
    uri=("https://raw.githubusercontent.com/"
         "mozilla/telemetry-airflow/master/jobs/run_mobile_aggregator.sh"),
    dag=dag)

# register_status presumably reports this task to an external status
# dashboard -- defined elsewhere in the project.
register_status(
    mobile_aggregate_view,
    'Mobile Aggregates',
    'Aggregates of metrics sent through the mobile-events pings.'
)