execution_delta=timedelta( days=-7, hours=-1 ), # main_summary waits one hour, execution date is beginning of the week dag=taar_weekly, ) taar_ensemble = MozDatabricksSubmitRunOperator( task_id="taar_ensemble", job_name="TAAR Ensemble Model", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], execution_timeout=timedelta(hours=11), instance_count=5, instance_type="i3.2xlarge", spot_bid_price_percent=100, max_instance_count=60, enable_autoscale=True, pypi_libs=[ "mozilla-taar3==0.4.5", "mozilla-srgutil==0.1.10", "python-decouple==3.1", ], env=mozetl_envvar("taar_ensemble", {"date": "{{ ds_nodash }}"}), start_date=datetime(2019, 7, 14), uri= "https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-databricks.py", output_visibility="private", ) taar_ensemble.set_upstream(wait_for_clients_daily)
"bucket": "{{ task.__class__.private_output_bucket }}", "prefix": "taar/locale/" }), uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", output_visibility="private", dag=dag) taar_similarity = MozDatabricksSubmitRunOperator( task_id="taar_similarity", job_name="Taar Similarity model", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**", "*****@*****.**"], execution_timeout=timedelta(hours=2), instance_count=11, instance_type="i3.8xlarge", driver_instance_type="i3.xlarge", env=mozetl_envvar("taar_similarity", options={ "date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}", "prefix": "taar/similarity/" }), uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", output_visibility="private", dag=dag) taar_collaborative_recommender = EMRSparkOperator( task_id="addon_recommender", job_name="Train the Collaborative Addon Recommender", execution_timeout=timedelta(hours=10), instance_count=20, owner="*****@*****.**",
schedule_interval="@daily", ) prerelease_telemetry_aggregate_view = MozDatabricksSubmitRunOperator( task_id="prerelease_telemetry_aggregate_view", job_name="Prerelease Telemetry Aggregate View", instance_count=10, dev_instance_count=10, execution_timeout=timedelta(hours=12), python_version=2, env=mozetl_envvar( "aggregator", { "date": "{{ ds_nodash }}", "channels": "nightly,aurora,beta", "credentials-bucket": "telemetry-spark-emr-2", "credentials-prefix": "aggregator_database_envvars.json", "num-partitions": 10 * 32, }, dev_options={ "credentials-prefix": "aggregator_dev_database_envvars.json" }, other={ "MOZETL_GIT_PATH": "https://github.com/mozilla/python_mozaggregator.git", "MOZETL_EXTERNAL_MODULE": "mozaggregator", }, ), dag=dag, )
# CLI options handed to the mozetl "parquet" job.
_PARQUET_OPTIONS = {
    "date": "{{ ds_nodash }}",
    "channels": "nightly",
    "output": "s3://{{ task.__class__.private_output_bucket }}/aggregates_poc/v1",
}

# Run python_mozaggregator from its own repository as an external mozetl module.
_PARQUET_EXTERNAL_MODULE = {
    "MOZETL_GIT_PATH": "https://github.com/mozilla/python_mozaggregator.git",
    "MOZETL_EXTERNAL_MODULE": "mozaggregator",
}

# Daily DAG producing the parquet form of the telemetry aggregates.
dag = DAG(
    "telemetry_aggregates_parquet",
    default_args=default_args,
    schedule_interval="@daily",
)

telemetry_aggregate_parquet_view = MozDatabricksSubmitRunOperator(
    task_id="telemetry_aggregate_parquet_view",
    job_name="Telemetry Aggregate Parquet View",
    python_version=2,  # this job still runs under Python 2
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    env=mozetl_envvar("parquet", _PARQUET_OPTIONS, other=_PARQUET_EXTERNAL_MODULE),
    dag=dag,
)
aws_conn_id="aws_dev_iam_s3", dag=dag, ) main_summary_all_histograms = MozDatabricksSubmitRunOperator( task_id="main_summary_all_histograms", job_name="Main Summary View - All Histograms", execution_timeout=timedelta(hours=12), instance_count=5, max_instance_count=50, enable_autoscale=True, instance_type="c4.4xlarge", spot_bid_price_percent=50, ebs_volume_count=1, ebs_volume_size=250, env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", options={ "from": "{{ ds_nodash }}", "to": "{{ ds_nodash }}", "bucket": "telemetry-backfill", "all_histograms": "", "read-mode": "aligned", "input-partition-multiplier": "400", }, dev_options={ "channel": "nightly", }), dag=dag) main_summary = MozDatabricksSubmitRunOperator( task_id="main_summary", job_name="Main Summary View",
dag = DAG("mobile_aggregates", default_args=default_args, schedule_interval="@daily") mobile_aggregate_view = MozDatabricksSubmitRunOperator( task_id="mobile_aggregate_view", job_name="Mobile Aggregate View", release_label="6.1.x-scala2.11", instance_count=5, execution_timeout=timedelta(hours=12), env=mozetl_envvar( "mobile", { "date": "{{ ds_nodash }}", "channels": "nightly", "output": "s3://{{ task.__class__.private_output_bucket }}/mobile_metrics_aggregates/v2", "num-partitions": 5 * 32 }, other={ "MOZETL_GIT_PATH": "https://github.com/mozilla/python_mozaggregator.git", "MOZETL_EXTERNAL_MODULE": "mozaggregator", }, ), dag=dag, ) register_status( mobile_aggregate_view, "Mobile Aggregates",
    # NOTE(review): tail of a task definition that starts above this chunk.
    dag=dag,
)

# Produce the daily addons report by running the addons_daily repository as
# an external mozetl module.
addons_daily = MozDatabricksSubmitRunOperator(
    task_id="addons_daily",
    job_name="Addons Daily",
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    owner="*****@*****.**",
    email=[
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
    ],
    env=mozetl_envvar(
        "addons_report",
        {
            "date": "{{ ds_nodash }}",
            "deploy_environment": "{{ task.__class__.deploy_environment }}",
        },
        other={
            "MOZETL_GIT_PATH": "https://github.com/mozilla/addons_daily.git",
            "MOZETL_EXTERNAL_MODULE": "addons_daily",
        },
    ),
    dag=dag,
)

# The report depends on search_clients_daily having landed.
addons_daily.set_upstream(wait_for_search_clients_daily)
# Make sure all the data for the given day has arrived before running. # Running at 1am should suffice. dag = DAG('first_shutdown_summary', default_args=default_args, schedule_interval='0 1 * * *') first_shutdown_summary = MozDatabricksSubmitRunOperator( task_id="first_shutdown_summary", job_name="First Shutdown Summary View", execution_timeout=timedelta(hours=4), instance_count=5, env=tbv_envvar( "com.mozilla.telemetry.views.MainSummaryView", { "from": "{{ ds_nodash }}", "to": "{{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}", "doc-type": "first_shutdown", "read-mode": "aligned", "input-partition-multiplier": "4" }), uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py", dag=dag) first_shutdown_summary_bigquery_load = SubDagOperator( subdag=load_to_bigquery( parent_dag_name=dag.dag_id, dag_name="first_shutdown_summary_bigquery_load", default_args=default_args, dataset_s3_bucket="telemetry-parquet",
# Settings inherited by every task in this DAG unless overridden.
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 9, 10),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Daily at 01:00.
dag = DAG('landfill', default_args=default_args, schedule_interval='0 1 * * *')

# CLI options handed to the mozetl landfill_sampler job.
_sampler_options = {
    "submission-date": "{{ ds_nodash }}",
    "bucket": "{{ task.__class__.private_output_bucket }}",
    "prefix": "sanitized-landfill-sample",
}

# Produce a sanitized sample of the landfill dataset.
landfill_sampler = MozDatabricksSubmitRunOperator(
    task_id="landfill_sampler",
    job_name="Landfill Sampler",
    instance_count=3,
    execution_timeout=timedelta(hours=2),
    # Dedicated EC2 instance profile for this job.
    iam_role="arn:aws:iam::144996185633:instance-profile/databricks-ec2-landfill",
    env=mozetl_envvar("landfill_sampler", _sampler_options),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)
# Make sure all the data for the given day has arrived before running. # Running at 1am should suffice. dag = DAG('main_summary', default_args=default_args, schedule_interval='0 1 * * *') main_summary_all_histograms = MozDatabricksSubmitRunOperator( task_id="main_summary_all_histograms", job_name="Main Summary View - All Histograms", execution_timeout=timedelta(hours=12), instance_count=5, max_instance_count=50, enable_autoscale=True, instance_type="c4.4xlarge", spot_bid_price_percent=50, ebs_volume_count=1, ebs_volume_size=250, env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", options={ "from": "{{ ds_nodash }}", "to": "{{ ds_nodash }}", "bucket": "telemetry-backfill", "all_histograms": "", "read-mode": "aligned", "input-partition-multiplier": "400", }, dev_options={ "channel": "nightly", }), dag=dag) main_summary = MozDatabricksSubmitRunOperator( task_id="main_summary", job_name="Main Summary View",
dataset="churn", dataset_version="v3", date_submission_col="week_start", gke_cluster_name="bq-load-gke-1", ), task_id="churn_bigquery_load", dag=dag) churn_v2 = MozDatabricksSubmitRunOperator( task_id="churn_v2", job_name="churn 7-day v2", execution_timeout=timedelta(hours=4), instance_count=5, env=mozetl_envvar("churn", { "start_date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}" }, other={ "MOZETL_GIT_BRANCH": "churn-v2" }), # the mozetl branch was forked before python 3 support python_version=2, uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", output_visibility="public", dag=dag) churn_v2_bigquery_load = SubDagOperator( subdag=load_to_bigquery( parent_dag_name=dag.dag_id, dag_name="churn_v2_bigquery_load", default_args=default_args, dataset_s3_bucket="telemetry-parquet", aws_conn_id="aws_dev_iam_s3",
    # NOTE(review): tail of a DAG(...) constructor that starts above this
    # chunk; the DAG runs daily at 01:00.
    schedule_interval='0 1 * * *')

# Main summary variant that keeps all histograms, writes a schema report,
# and outputs to the telemetry-backfill bucket.
main_summary_all_histograms = MozDatabricksSubmitRunOperator(
    task_id="main_summary_all_histograms",
    job_name="Main Summary View - All Histograms",
    execution_timeout=timedelta(hours=12),
    instance_count=5,
    max_instance_count=50,
    enable_autoscale=True,
    instance_type="c4.4xlarge",
    spot_bid_price_percent=50,
    ebs_volume_count=1,
    ebs_volume_size=250,
    env=tbv_envvar(
        "com.mozilla.telemetry.views.MainSummaryView",
        options={
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            # Where the job writes its schema report for this partition.
            "schema-report-location":
                "s3://{{ task.__class__.private_output_bucket }}/schema/main_summary/submission_date_s3={{ ds_nodash }}",
            "bucket": "telemetry-backfill",
            # Empty value — presumably a presence-only flag for the batch-view
            # job; confirm against MainSummaryView's option parsing.
            "all_histograms": "",
            "read-mode": "aligned",
            "input-partition-multiplier": "400",
        },
        dev_options={
            "channel": "nightly",
        }),
    dag=dag)

# Head of the main_summary task (an EMRSparkOperator in this file); its
# keyword arguments continue below this chunk.
main_summary = EMRSparkOperator(
        # NOTE(review): tail of a subdag/task definition that starts above
        # this chunk; it carries both AWS and GCP (Dataproc) connections.
        aws_conn_id=taar_aws_conn_id,
        gcp_conn_id=taar_gcpdataproc_conn_id,
    ),
    dag=dag,
)

# Train the TAAR similarity model on Databricks.
taar_similarity = MozDatabricksSubmitRunOperator(
    task_id="taar_similarity",
    job_name="Taar Similarity model",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=2),
    instance_count=11,
    instance_type="i3.8xlarge",
    driver_instance_type="i3.xlarge",  # smaller driver than the workers
    env=mozetl_envvar("taar_similarity",
        options={
            "date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "prefix": "taar/similarity/"
        }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag)

# Head of the collaborative recommender subdag (a Dataproc jar runner); its
# remaining arguments continue below this chunk.
taar_collaborative_recommender = SubDagOperator(
    task_id="addon_recommender",
    subdag=moz_dataproc_jar_runner(
        parent_dag_name=dag.dag_id,
        dag_name="addon_recommender",
dag = DAG('crash_summary', default_args=default_args, schedule_interval='@daily') # we deliberately do not autoscale this job, as it seems that the bottleneck is not # the CPU crash_summary_view = MozDatabricksSubmitRunOperator( task_id="crash_summary_view", job_name="Crash Summary View", dev_instance_count=1, instance_count=1, instance_type="c4.4xlarge", ebs_volume_count=1, ebs_volume_size=250, execution_timeout=timedelta(hours=4), env=tbv_envvar( "com.mozilla.telemetry.views.CrashSummaryView", { "from": "{{ ds_nodash }}", "to": "{{ ds_nodash }}", "outputBucket": "{{ task.__class__.private_output_bucket }}" }), uri= "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py", dag=dag) crash_summary_view_bigquery_load = SubDagOperator( subdag=load_to_bigquery( parent_dag_name=dag.dag_id, dag_name="crash_summary_view_bigquery_load", default_args=default_args, dataset_s3_bucket="telemetry-parquet",
job_name="A placeholder for the implicit clients daily dependency", dag=dag, ) bgbb_fit = MozDatabricksSubmitRunOperator( task_id="bgbb_fit", job_name="Fit parameters for a BGBB model to determine active profiles", execution_timeout=timedelta(hours=2), instance_count=3, env=mozetl_envvar( "bgbb_fit", { "submission-date": "{{ next_ds }}", "model-win": "120", "start-params": "[0.387, 0.912, 0.102, 1.504]", "sample-ids": "[42]", "sample-fraction": "1.0", "penalizer-coef": "0.01", "bucket": "{{ task.__class__.private_output_bucket }}", "prefix": "bgbb/params/v1", }, dev_options={"model-win": "30"}, other={ "MOZETL_GIT_PATH": "https://github.com/wcbeard/bgbb_airflow.git", "MOZETL_EXTERNAL_MODULE": "bgbb_airflow", }, ), dag=dag, ) clients_daily_v6_dummy >> bgbb_fit
    # NOTE(review): tail of the default_args dict whose opening lines are
    # above this chunk; the bare "2" is presumably the value of a 'retries'
    # key split across the chunk boundary.
    2,
    'retry_delay': timedelta(minutes=30),
}

# Weekly DAG producing the longitudinal view.
dag = DAG('longitudinal', default_args=default_args, schedule_interval='@weekly')

longitudinal = MozDatabricksSubmitRunOperator(
    task_id="longitudinal",
    job_name="Longitudinal View",
    execution_timeout=timedelta(hours=12),
    instance_count=16,
    instance_type="i3.8xlarge",
    env=tbv_envvar("com.mozilla.telemetry.views.LongitudinalView", {
        "bucket": "{{ task.__class__.private_output_bucket }}",
        "to": DS_WEEKLY
    }, metastore_location="s3://telemetry-parquet/longitudinal"),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

register_status(longitudinal, "Longitudinal",
                "A 6-month longitudinal view of client history.")

# Head of the addon recommender task; its remaining keyword arguments
# continue below this chunk.
addon_recommender = EMRSparkOperator(
    task_id="addon_recommender",
    job_name="Train the Addon Recommender",
    execution_timeout=timedelta(hours=10),
    instance_count=20,
from airflow import DAG
from airflow.operators.moz_databricks import MozDatabricksSubmitRunOperator
from datetime import datetime, timedelta
from utils.mozetl import mozetl_envvar

# Launcher script used to submit mozetl jobs to Databricks.
_MOZETL_SUBMIT_URI = "https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh"

# Settings inherited by every task in this DAG unless overridden.
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

# Daily DAG that recomputes tab spinner severity.
dag = DAG('tab_spinner_severity', default_args=default_args, schedule_interval='@daily')

update_tab_spinner_severity = MozDatabricksSubmitRunOperator(
    task_id="update_tab_spinner_severity",
    job_name="Tab Spinner Severity Job",
    instance_count=12,
    execution_timeout=timedelta(hours=12),
    env=mozetl_envvar("long_tab_spinners", {}),
    uri=_MOZETL_SUBMIT_URI,
    dag=dag)