    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}
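
# The dict above is truncated at the top of this example; a complete
# `default_args` for these DAGs might look roughly like the sketch below.
# The owner, start_date, and email values here are illustrative assumptions,
# not taken from the original source:
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow-owner@example.com',    # assumed placeholder
    'depends_on_past': False,                # assumed
    'start_date': datetime(2018, 1, 1),      # assumed placeholder
    'email': ['airflow-owner@example.com'],  # assumed placeholder
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}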

dag = DAG('mobile_aggregates',
          default_args=default_args,
          schedule_interval='@daily')

mobile_aggregate_view = EMRSparkOperator(
    task_id="mobile_aggregate_view",
    job_name="Mobile Aggregate View",
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    env={
        "date": "{{ ds_nodash }}",
        "channels": "nightly",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    },
    uri=("https://raw.githubusercontent.com/"
         "mozilla/telemetry-airflow/master/jobs/run_mobile_aggregator.sh"),
    dag=dag)

register_status(mobile_aggregate_view, 'Mobile Aggregates',
                'Aggregates of metrics sent through the mobile-events pings.')
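
# The `env` values above are Jinja templates that Airflow renders at run
# time: `{{ ds_nodash }}` expands to the run's logical date as YYYYMMDD.
# A minimal sketch of the same templating with a stock operator (the task
# below is illustrative, not part of the original DAG):
from airflow.operators.bash_operator import BashOperator

print_date = BashOperator(
    task_id="print_date",
    bash_command="echo 'aggregating {{ ds_nodash }}'",  # e.g. 20190101
    dag=dag)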
Example #2
blp_logs.set_downstream(blp_job_sensor)
blp_job_sensor.set_downstream(s3_to_gcs)
s3_to_gcs.set_downstream(load_blpadi_to_bq)
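
# `set_downstream` is equivalent to Airflow's bitshift dependency syntax,
# so the three lines above could also be written as a single chain:
#
#   blp_logs >> blp_job_sensor >> s3_to_gcs >> load_blpadi_to_bq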

amo_dag = DAG('mango_log_processing_amo',
              default_args=DEFAULT_ARGS,
              dagrun_timeout=timedelta(hours=6),
              schedule_interval='0 3 * * *')

amo_logs = EmrCreateJobFlowOperator(task_id='amo_create_job_flow',
                                    job_flow_overrides={'Steps': AMO_STEPS},
                                    aws_conn_id='aws_data_iam',
                                    emr_conn_id='emr_data_iam_mango',
                                    dag=amo_dag)

register_status(amo_logs, 'AMO Logs', 'Mango Processed AMO Logs')

amo_job_sensor = EmrJobFlowSensor(
    task_id='amo_check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('amo_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
    dag=amo_dag,
    on_retry_callback=lambda context: amo_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date']),
)
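
# EmrCreateJobFlowOperator returns the new cluster's job flow id, and Airflow
# stores any operator return value in XCom under the key `return_value`; the
# Jinja expression in `job_flow_id` above pulls it back out at run time.
# A minimal, purely illustrative sketch of the same push/pull pattern
# (the task ids and value here are made up):
from airflow.operators.python_operator import PythonOperator

def _pull_value(**context):
    # xcom_pull with no key defaults to 'return_value'
    print(context['ti'].xcom_pull(task_ids='push_value'))

push_value = PythonOperator(
    task_id='push_value',
    python_callable=lambda: 'j-EXAMPLE',  # return value lands in XCom
    dag=amo_dag)

pull_value = PythonOperator(
    task_id='pull_value',
    python_callable=_pull_value,
    provide_context=True,  # Airflow 1.x: pass context into the callable
    dag=amo_dag)

push_value.set_downstream(pull_value)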

amo_logs.set_downstream(amo_job_sensor)

# For AMO Dev and Stage Environments
Example #3
        table="moz-fx-data-shared-prod:telemetry_derived.main_summary_v4${{ds_nodash}}",
        static_partitions=["submission_date_s3={{ds_nodash}}"],
        arguments=[
            "--partition-by=sample_id",
            "--replace='{{ds_nodash}}' AS submission_date",
            "--maps-from-entries",
        ] + main_summary_bigint_columns,
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_export",
        default_args=default_args,
        num_workers=40),
    task_id="main_summary_export",
    executor=get_default_executor(),
    dag=dag)

register_status(main_summary, "Main Summary", "A summary view of main pings.")

addons = bigquery_etl_query(
    task_id="addons",
    destination_table="addons_v2",
    project_id="moz-fx-data-shared-prod",
    dataset_id="telemetry_derived",
    dag=dag)

addon_aggregates = bigquery_etl_query(
    task_id="addon_aggregates",
    destination_table="addon_aggregates_v2",
    project_id="moz-fx-data-shared-prod",
    dataset_id="telemetry_derived",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
Example #4
longitudinal = MozDatabricksSubmitRunOperator(
    task_id="longitudinal",
    job_name="Longitudinal View",
    execution_timeout=timedelta(hours=12),
    instance_count=16,
    instance_type="i3.8xlarge",
    env=tbv_envvar(
        "com.mozilla.telemetry.views.LongitudinalView",
        {
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "to": DS_WEEKLY
        },
        metastore_location="s3://telemetry-parquet/longitudinal"),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

register_status(longitudinal, "Longitudinal",
                "A 6-month longitudinal view of client history.")

addon_recommender = EMRSparkOperator(
    task_id="addon_recommender",
    job_name="Train the Addon Recommender",
    execution_timeout=timedelta(hours=10),
    instance_count=20,
    owner="*****@*****.**",
    email=[
        "*****@*****.**", "*****@*****.**",
        "*****@*****.**"
    ],
    env={
        "date": DS_WEEKLY,
        "privateBucket": "{{ task.__class__.private_output_bucket }}",
        "publicBucket": "{{ task.__class__.public_output_bucket }}"
Example #5
    ebs_volume_size=250,
    env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView",
        options={
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "schema-report-location": "s3://{{ task.__class__.private_output_bucket }}/schema/main_summary/submission_date_s3={{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "read-mode": "aligned",
            "input-partition-multiplier": "400"
        },
        dev_options={
            "channel": "nightly",   # run on smaller nightly data rather than release
        }),
    dag=dag)

register_status(main_summary, "Main Summary", "A summary view of main pings.")

main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**", "*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)

main_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
Example #6
# output: telemetry-parquet/socorro_crash/v2
crash_report_parquet = EMRSparkOperator(
    task_id="crash_report_parquet",
    job_name="Socorro Crash Reports Parquet",
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/socorro_import/ImportCrashData.ipynb",
    output_visibility="public",
    dag=dag,
)

register_status(
    crash_report_parquet,
    crash_report_parquet.job_name,
    "Convert processed crash reports into parquet for analysis",
)

crash_report_parquet_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="crash_report_parquet_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="socorro_crash",
        dataset_version="v2",
        date_submission_col="crash_date",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_raw",
Example #7
mobile_aggregate_view = MozDatabricksSubmitRunOperator(
    task_id="mobile_aggregate_view",
    job_name="Mobile Aggregate View",
    release_label="6.1.x-scala2.11",
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    env=mozetl_envvar(
        "mobile",
        {
            "date": "{{ ds_nodash }}",
            "channels": "nightly",
            "output":
            "s3://{{ task.__class__.private_output_bucket }}/mobile_metrics_aggregates/v2",
            "num-partitions": 5 * 32
        },
        other={
            "MOZETL_GIT_PATH":
            "https://github.com/mozilla/python_mozaggregator.git",
            "MOZETL_EXTERNAL_MODULE": "mozaggregator",
        },
    ),
    dag=dag,
)

register_status(
    mobile_aggregate_view,
    "Mobile Aggregates",
    "Aggregates of metrics sent through the mobile-events pings.",
)
Example #8
amo_dag = DAG(
    'mango_log_processing_amo',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=6),
    schedule_interval='0 3 * * *'
)

amo_logs = EmrCreateJobFlowOperator(
    task_id='amo_create_job_flow',
    job_flow_overrides={'Steps': AMO_STEPS},
    aws_conn_id='aws_data_iam',
    emr_conn_id='emr_data_iam_mango',
    dag=amo_dag
)

register_status(amo_logs, 'AMO Logs', 'Mango Processed AMO Logs')

amo_job_sensor = EmrJobFlowSensor(
    task_id='amo_check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('amo_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
    dag=amo_dag,
    on_retry_callback=lambda context: amo_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date']),
)
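
# Note: the `on_retry_callback` clears this DAG's task instances for the
# failed run's execution date, so a retry re-runs the create-job-flow task
# as well; retrying only the sensor would leave it polling the same failed
# cluster.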

amo_logs.set_downstream(amo_job_sensor)

# For AMO Dev and Stage Environments
amo_dev_stage_dag = DAG(
Example #9
# We remove the current date partition for idempotency.
remove_bq_table_partition = BigQueryTableDeleteOperator(
    task_id='remove_bq_table_partition',
    bigquery_conn_id=bq_gcp_conn_id,
    deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bq_dataset, bq_table_name), # noqa
    ignore_if_missing=True,
    dag=dag
)
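
# The `$` in `deletion_dataset_table` is BigQuery's partition decorator:
# `dataset.table$20190101` addresses a single day's partition. The quadruple
# braces survive str.format() as a literal `{{ds_nodash}}` that Airflow's
# Jinja engine then renders. With illustrative values:
#
#   '{}.{}${{{{ds_nodash}}}}'.format('telemetry', 'socorro_crash')
#   # -> 'telemetry.socorro_crash${{ds_nodash}}'  (after str.format)
#   # -> 'telemetry.socorro_crash$20190101'       (after Jinja, for 2019-01-01)
#
# Deleting the partition before reloading makes re-runs for a date idempotent.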

bq_load = GKEPodOperator(
    task_id='bigquery_load',
    gcp_conn_id=bq_gcp_conn_id,
    project_id=bq_connection.project_id,
    name='load-socorro-crash-parquet-to-bq',
    image=docker_image,
    arguments=gke_args,
    env_vars={"GOOGLE_CLOUD_PROJECT":
              "{{ var.value.gcp_shared_prod_project }}"},
    dag=dag,
)
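
# `{{ var.value.gcp_shared_prod_project }}` reads the Airflow Variable named
# `gcp_shared_prod_project` at render time; it can be seeded from the CLI,
# e.g. (Airflow 1.x syntax, with a placeholder project id):
#
#   airflow variables --set gcp_shared_prod_project my-project-id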

register_status(
    bq_load,
    "Socorro Crash Reports Parquet",
    "Convert processed crash reports into parquet for analysis",
)

s3_to_gcs >> crash_report_parquet
crash_report_parquet >> remove_bq_table_partition >> bq_load
Example #10
    job_name="Fennec iOS Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=FENNEC_IOS_INSTANCES,
    email=['*****@*****.**', '*****@*****.**'],
    env={
        "date": "{{ ds_nodash }}",
        "max_requests": FENNEC_IOS_INSTANCES * VCPUS_PER_INSTANCE,
        "key_file": key_file("fennec_ios"),
        "artifact": get_artifact_url(slug, branch="master"),
        "config_filename": "fennec_ios_events_schemas.json",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/events_to_amplitude.sh",
    dag=dag
)

register_status(fennec_ios_events_to_amplitude, "Firefox-iOS Amplitude events",
                "Daily job sending Firefox iOS events to Amplitude.")

devtools_release_events_to_amplitude = EMRSparkOperator(
    task_id="devtools_release_events_to_amplitude",
    job_name="DevTools Release Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=DEVTOOLS_INSTANCES,
    dev_instance_count=DEVTOOLS_INSTANCES,
    email=['*****@*****.**', '*****@*****.**'],
    owner='*****@*****.**',
    env=tbv_envvar(
        "com.mozilla.telemetry.streaming.EventsToAmplitude",
        {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "max_parallel_requests": str(DEVTOOLS_INSTANCES * VCPUS_PER_INSTANCE),
Example #11
# input: crashstats-telemetry-crashes-prod-us-west-2/v1/crash_report
# output: telemetry-parquet/socorro_crash/v2
crash_report_parquet = EMRSparkOperator(
    task_id="crash_report_parquet",
    job_name="Socorro Crash Reports Parquet",
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/socorro_import/ImportCrashData.ipynb",
    output_visibility="public",
    dag=dag,
)

register_status(
    crash_report_parquet,
    crash_report_parquet.job_name,
    "Convert processed crash reports into parquet for analysis",
)

crash_report_parquet_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="crash_report_parquet_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="socorro_crash",
        dataset_version="v2",
        date_submission_col="crash_date",
        gke_cluster_name="bq-load-gke-1",
        ),
Example #12
    task_id="longitudinal",
    job_name="Longitudinal View",
    execution_timeout=timedelta(hours=12),
    instance_count=16,
    instance_type="i3.8xlarge",
    env=tbv_envvar(
        "com.mozilla.telemetry.views.LongitudinalView",
        {
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "to": DS_WEEKLY
        },
        metastore_location="s3://telemetry-parquet/longitudinal"),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

register_status(longitudinal, "Longitudinal", "A 6-month longitudinal view of client history.")


game_hw_survey = EMRSparkOperator(
    task_id="game_hw_survey",
    job_name="Firefox Hardware Report",
    execution_timeout=timedelta(hours=5),
    instance_count=15,
    owner="*****@*****.**",
    depends_on_past=True,
    email=["*****@*****.**", "*****@*****.**",
           "*****@*****.**"],
    env={"date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.public_output_bucket }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/hardware_report.sh",
    output_visibility="public",
    dag=dag)