def test_templated_sensor(self):
    dag = DAG(TEST_DAG_ID, self.args)

    with dag:
        sensor = ExternalTaskSensor(
            task_id='templated_task',
            external_dag_id='dag_{{ ds }}',
            external_task_id='task_{{ ds }}',
            start_date=DEFAULT_DATE
        )

    instance = TaskInstance(sensor, DEFAULT_DATE)
    instance.render_templates()

    self.assertEqual(sensor.external_dag_id,
                     "dag_{}".format(DEFAULT_DATE.date()))
    self.assertEqual(sensor.external_task_id,
                     "task_{}".format(DEFAULT_DATE.date()))
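# Note: the assertions above pass because external_dag_id and external_task_id
# are template fields of ExternalTaskSensor, so "{{ ds }}" is rendered at
# runtime. A minimal sketch of the same idea in a regular DAG (the DAG and
# task names here are hypothetical; import path per Airflow 1.10):
from datetime import datetime

from airflow import DAG
from airflow.sensors.external_task_sensor import ExternalTaskSensor

with DAG("templating_demo", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily") as demo_dag:
    # For the run of 2021-01-01, this waits on DAG "dag_2021-01-01",
    # task "task_2021-01-01".
    wait_for_dated_dag = ExternalTaskSensor(
        task_id="wait_for_dated_dag",
        external_dag_id="dag_{{ ds }}",
        external_task_id="task_{{ ds }}",
    )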
def buildSubDag(dag_id, filePath, dagIdToWait):
    with DAG(
        dag_id=dag_id,
        schedule_interval=None,
        start_date=datetime(2018, 11, 11)
    ) as dag:

        def takeTime(**context):
            # Pull the timestamp pushed to XCom by the external DAG.
            # `pushTimeTaskId` is expected to be defined at module level.
            time = context['ti'].xcom_pull(task_ids=pushTimeTaskId)
            print(f'Time taken: {time}')
            return time

        externalTaskSensor = ExternalTaskSensor(
            task_id="wait_for_other_dag",
            external_dag_id=dagIdToWait,
            execution_delta=timedelta(minutes=0),
            external_task_id=None,
        )

        printOperator = PythonOperator(
            task_id="print_time",
            python_callable=takeTime,
            provide_context=True
        )

        removeFileOp = BashOperator(
            task_id="remove_file",
            bash_command="rm {}".format(filePath)
        )

        finishedDateOp = BashOperator(
            task_id="_create_finish_date_file",
            bash_command="touch {}/finished_{}".format(placeToStoreResultFile, "{{ ds }}")
        )

        externalTaskSensor >> printOperator >> removeFileOp >> finishedDateOp

    return dag
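# In buildSubDag above, external_task_id=None makes the sensor wait on the
# external DAG run as a whole rather than on a single task, and
# execution_delta=timedelta(minutes=0) pins it to the run sharing this DAG's
# execution_date. A minimal sketch (hypothetical DAG ids; Airflow 1.10 import
# path):
from datetime import datetime, timedelta

from airflow import DAG
from airflow.sensors.external_task_sensor import ExternalTaskSensor

with DAG("whole_dag_wait_demo", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily") as demo_dag:
    wait_for_whole_dag = ExternalTaskSensor(
        task_id="wait_for_whole_dag",
        external_dag_id="upstream_dag",
        external_task_id=None,         # None => check the DagRun, not one task
        execution_delta=timedelta(0),  # same execution_date as this run
    )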
def test_external_task_sensor_fn_multiple_execution_dates(self):
    bash_command_code = """
{% set s=execution_date.time().second %}
echo "second is {{ s }}"
if [[ $(( {{ s }} % 60 )) == 1 ]]
then
    exit 1
fi
exit 0
"""
    dag_external_id = TEST_DAG_ID + '_external'
    dag_external = DAG(dag_external_id,
                       default_args=self.args,
                       schedule_interval=timedelta(seconds=1))
    task_external_with_failure = BashOperator(
        task_id="task_external_with_failure",
        bash_command=bash_command_code,
        retries=0,
        dag=dag_external)
    task_external_without_failure = DummyOperator(
        task_id="task_external_without_failure",
        retries=0,
        dag=dag_external)

    task_external_without_failure.run(
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE + timedelta(seconds=1),
        ignore_ti_state=True)

    session = settings.Session()
    TI = TaskInstance
    try:
        task_external_with_failure.run(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + timedelta(seconds=1),
            ignore_ti_state=True)
        # The task_external_with_failure task is expected to fail
        # once per minute (the run on the first second of
        # each minute).
    except Exception as e:
        failed_tis = session.query(TI).filter(
            TI.dag_id == dag_external_id,
            TI.state == State.FAILED,
            TI.execution_date == DEFAULT_DATE + timedelta(seconds=1)).all()
        if (len(failed_tis) == 1 and
                failed_tis[0].task_id == 'task_external_with_failure'):
            pass
        else:
            raise e

    dag_id = TEST_DAG_ID
    dag = DAG(dag_id,
              default_args=self.args,
              schedule_interval=timedelta(minutes=1))
    task_without_failure = ExternalTaskSensor(
        task_id='task_without_failure',
        external_dag_id=dag_external_id,
        external_task_id='task_external_without_failure',
        execution_date_fn=lambda dt: [dt + timedelta(seconds=i)
                                      for i in range(2)],
        allowed_states=['success'],
        retries=0,
        timeout=1,
        poke_interval=1,
        dag=dag)
    task_with_failure = ExternalTaskSensor(
        task_id='task_with_failure',
        external_dag_id=dag_external_id,
        external_task_id='task_external_with_failure',
        execution_date_fn=lambda dt: [dt + timedelta(seconds=i)
                                      for i in range(2)],
        allowed_states=['success'],
        retries=0,
        timeout=1,
        poke_interval=1,
        dag=dag)

    task_without_failure.run(
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE,
        ignore_ti_state=True)

    with self.assertRaises(AirflowSensorTimeout):
        task_with_failure.run(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_ti_state=True)
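# The lambdas above exercise a lesser-known feature: execution_date_fn may
# return a list, in which case the sensor only succeeds once *all* of those
# external execution dates have reached an allowed state (it is mutually
# exclusive with execution_delta). A minimal sketch of the mapping, with an
# illustrative date:
from datetime import datetime, timedelta

fn = lambda dt: [dt + timedelta(seconds=i) for i in range(2)]
assert fn(datetime(2015, 1, 1)) == [
    datetime(2015, 1, 1, 0, 0, 0),
    datetime(2015, 1, 1, 0, 0, 1),
]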
with DAG(
    "bhr_collection",
    default_args=default_args,
    schedule_interval="0 5 * * *",
) as dag:

    # Jobs read from/write to s3://telemetry-public-analysis-2/bhr/data/hang_aggregates/
    write_aws_conn_id = 'aws_dev_telemetry_public_analysis_2_rw'
    aws_access_key, aws_secret_key, _ = AwsHook(write_aws_conn_id).get_credentials()

    wait_for_bhr_ping = ExternalTaskSensor(
        task_id="wait_for_bhr_ping",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(hours=4),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
        dag=dag,
    )

    params = get_dataproc_parameters("google_cloud_airflow_dataproc")

    bhr_collection = SubDagOperator(
        task_id="bhr_collection",
        dag=dag,
        subdag=moz_dataproc_pyspark_runner(
            parent_dag_name=dag.dag_id,
            image_version="1.5",
            dag_name="bhr_collection",
            default_args=default_args,
task_id="internet_outages__global_outages__v1", destination_table="global_outages_v1", dataset_id="internet_outages", project_id="moz-fx-data-shared-prod", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], date_partition_parameter="submission_date", depends_on_past=False, dag=dag, ) wait_for_copy_deduplicate_all = ExternalTaskSensor( task_id="wait_for_copy_deduplicate_all", external_dag_id="copy_deduplicate", external_task_id="copy_deduplicate_all", execution_delta=datetime.timedelta(seconds=7200), check_existence=True, mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR", ) internet_outages__global_outages__v1.set_upstream( wait_for_copy_deduplicate_all) wait_for_copy_deduplicate_main_ping = ExternalTaskSensor( task_id="wait_for_copy_deduplicate_main_ping", external_dag_id="copy_deduplicate", external_task_id="copy_deduplicate_main_ping", execution_delta=datetime.timedelta(seconds=7200), check_existence=True, mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR",
task_id="telemetry_derived__addon_names__v1", destination_table="addon_names_v1", dataset_id="telemetry_derived", project_id="moz-fx-data-shared-prod", owner="*****@*****.**", email=["*****@*****.**"], date_partition_parameter=None, depends_on_past=False, parameters=["submission_date:DATE:{{ds}}"], dag=dag, ) wait_for_search_derived__search_clients_daily__v8 = ExternalTaskSensor( task_id="wait_for_search_derived__search_clients_daily__v8", external_dag_id="bqetl_search", external_task_id="search_derived__search_clients_daily__v8", check_existence=True, mode="reschedule", ) telemetry_derived__addons_daily__v1.set_upstream( wait_for_search_derived__search_clients_daily__v8) wait_for_telemetry_derived__clients_last_seen__v1 = ExternalTaskSensor( task_id="wait_for_telemetry_derived__clients_last_seen__v1", external_dag_id="bqetl_clients_daily", external_task_id="telemetry_derived__clients_last_seen__v1", check_existence=True, mode="reschedule", ) telemetry_derived__addons_daily__v1.set_upstream(
    'start_date': datetime(2019, 9, 30),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('fx_usage_report', default_args=default_args, schedule_interval='@weekly')

wait_for_main_summary = ExternalTaskSensor(
    task_id='wait_for_main_summary',
    external_dag_id='parquet_export',
    external_task_id='main_summary_export',
    # main_summary waits two hours, execution date is beginning of the week
    execution_delta=timedelta(days=-7, hours=-2),
    dag=dag)

cluster_name = 'fx-usage-report-dataproc-cluster'
gcp_conn_id = 'google_cloud_airflow_dataproc'

# AWS credentials to read/write from output bucket
aws_conn_id = 'aws_prod_fx_usage_report'
aws_access_key, aws_secret_key, session = AwsHook(aws_conn_id).get_credentials()

output_bucket = 'net-mozaws-prod-us-west-2-data-public'

usage_report = SubDagOperator(
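# The negative execution_delta above is deliberate: the sensor computes the
# external execution date to poke for as (own execution_date - execution_delta),
# so a negative delta targets a *later* external run. A worked example of the
# arithmetic (dates are illustrative only):
from datetime import datetime, timedelta

execution_date = datetime(2019, 9, 29)          # weekly run: start of the week
execution_delta = timedelta(days=-7, hours=-2)  # as in wait_for_main_summary

# The external run this sensor waits on:
target = execution_date - execution_delta
assert target == datetime(2019, 10, 6, 2)       # 7 days and 2 hours later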
task_id="telemetry_derived__smoot_usage_nondesktop_compressed__v2", destination_table="smoot_usage_nondesktop_compressed_v2", dataset_id="telemetry_derived", project_id="moz-fx-data-shared-prod", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], date_partition_parameter="submission_date", depends_on_past=False, dag=dag, ) wait_for_telemetry_derived__clients_last_seen__v1 = ExternalTaskSensor( task_id="wait_for_telemetry_derived__clients_last_seen__v1", external_dag_id="bqetl_main_summary", external_task_id="telemetry_derived__clients_last_seen__v1", execution_delta=datetime.timedelta(seconds=3600), check_existence=True, mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR", ) telemetry_derived__smoot_usage_desktop__v2.set_upstream( wait_for_telemetry_derived__clients_last_seen__v1) telemetry_derived__smoot_usage_desktop_compressed__v2.set_upstream( telemetry_derived__smoot_usage_desktop__v2) wait_for_firefox_accounts_derived__fxa_users_last_seen__v1 = ExternalTaskSensor( task_id="wait_for_firefox_accounts_derived__fxa_users_last_seen__v1", external_dag_id="bqetl_fxa_events", external_task_id="firefox_accounts_derived__fxa_users_last_seen__v1",
    # If a task fails, retry it once after waiting at least 5 minutes
    "retries": 1,
    "retry_delay": datetime.timedelta(minutes=5),
}

dag_name = "fission_experiment_monitoring"

with models.DAG(
        dag_name, schedule_interval="0 2 * * *", default_args=default_args) as dag:
    wait_for_copy_deduplicate_main_ping = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_main_ping",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_main_ping",
        execution_delta=datetime.timedelta(hours=1),
        dag=dag,
    )

    fission_monitoring_main_v1 = bigquery_etl_query(
        task_id="fission_monitoring_main_v1",
        project_id="moz-fx-data-shared-prod",
        destination_table="fission_monitoring_main_v1",
        dataset_id="telemetry_derived",
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
    )

    wait_for_copy_deduplicate_crash_ping = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_crash_ping",
        external_dag_id="copy_deduplicate",
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)

if params.is_dev:
    copy_to_dev = copy_artifacts_dev(
        dag, params.project_id, params.artifact_bucket, params.storage_bucket)
    copy_to_dev >> ltv_daily
else:
    wait_for_search_clients_last_seen = ExternalTaskSensor(
        task_id="wait_for_search_clients_last_seen",
        external_dag_id="bqetl_search",
        external_task_id="search_derived__search_clients_last_seen__v1",
        execution_delta=timedelta(hours=1),
        check_existence=True,
        dag=dag,
    )
    wait_for_search_clients_last_seen >> ltv_daily

response = urlopen('/'.join([
    'https://raw.githubusercontent.com/mozilla/bigquery-etl/master/sql',
    'revenue_derived',
    'client_ltv_v1',
    'query.sql'
]))

BigQueryOperator.template_fields += ('query_params',)

ltv_revenue_join = BigQueryOperator(
    task_id='ltv_revenue_join',
    sql=response.read().decode('utf-8'),
    query_params=[{
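# BigQueryOperator.template_fields += ('query_params',) above mutates the
# operator's class-level tuple so that Jinja templating is also applied to
# query_params. This is a global patch: every BigQueryOperator in the same
# process is affected. A minimal sketch of the mechanism using BashOperator
# (the added attribute name is hypothetical):
from airflow.operators.bash_operator import BashOperator

print(BashOperator.template_fields)  # ('bash_command', 'env') in Airflow 1.10

# After this, an 'extra_arg' value containing "{{ ds }}" would be rendered
# too, provided the operator instance actually stores such an attribute.
BashOperator.template_fields = BashOperator.template_fields + ('extra_arg',)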
core_clients_last_seen = bigquery_etl_query(
    task_id='core_clients_last_seen',
    destination_table='core_clients_last_seen_raw_v1',
    dataset_id='telemetry',
    depends_on_past=True,
)

core_clients_daily >> core_clients_last_seen

# Daily and last seen views on top of glean pings.
wait_for_copy_deduplicate = ExternalTaskSensor(
    task_id="wait_for_copy_deduplicate",
    external_dag_id="copy_deduplicate",
    external_task_id="copy_deduplicate_all",
    execution_delta=timedelta(hours=1),
    dag=dag,
)

fenix_clients_daily = bigquery_etl_query(
    task_id='fenix_clients_daily',
    destination_table='moz-fx-data-shared-prod:org_mozilla_fenix_derived.clients_daily_v1',
    sql_file_path='sql/org_mozilla_fenix_derived/clients_daily_v1/query.sql',
    dataset_id='org_mozilla_fenix_derived',
    start_date=datetime(2019, 9, 5),
)

fenix_clients_daily << wait_for_copy_deduplicate
# top_signatures_correlations uploads results to public analysis bucket
write_aws_conn_id = "aws_dev_telemetry_public_analysis_2_rw"
analysis_access_key, analysis_secret_key, _ = AwsHook(
    write_aws_conn_id).get_credentials()

# modules_with_missing_symbols sends results as email
ses_aws_conn_id = "aws_data_iam_ses"
ses_access_key, ses_secret_key, _ = AwsHook(ses_aws_conn_id).get_credentials()

wait_for_socorro_import = ExternalTaskSensor(
    task_id="wait_for_socorro_import",
    external_dag_id="socorro_import",
    external_task_id="bigquery_load",
    check_existence=True,
    execution_delta=datetime.timedelta(hours=5),
    mode="reschedule",
    pool="DATA_ENG_EXTERNALTASKSENSOR",
    email_on_retry=False,
    dag=dag,
)

params = get_dataproc_parameters("google_cloud_airflow_dataproc")

modules_with_missing_symbols = SubDagOperator(
    task_id="modules_with_missing_symbols",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        image_version="1.5",
        dag_name="modules_with_missing_symbols",
name="taar-update-whitelist", # This uses a circleci built docker image from github.com/mozilla/taar_gcp_etl image="gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.1", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**", "*****@*****.**"], arguments=["-m", "taar_etl.taar_update_whitelist", "--date", "{{ ds_nodash }}"], env_vars={ "AWS_ACCESS_KEY_ID": taar_aws_access_key, "AWS_SECRET_ACCESS_KEY": taar_aws_secret_key, }, dag=dag, ) wait_for_clients_daily_export = ExternalTaskSensor( task_id="wait_for_clients_daily_export", external_dag_id="main_summary", external_task_id="clients_daily_export", dag=dag) wait_for_main_summary_export = ExternalTaskSensor( task_id="wait_for_main_summary_export", external_dag_id="main_summary", external_task_id="main_summary_export", dag=dag) taar_locale = SubDagOperator( task_id="taar_locale", subdag=moz_dataproc_pyspark_runner( parent_dag_name=dag.dag_id, dag_name="taar_locale",
True, "retries": 2, } with DAG("bqetl_deviations", default_args=default_args, schedule_interval="0 1 * * *") as dag: telemetry_derived__deviations__v1 = bigquery_etl_query( task_id="telemetry_derived__deviations__v1", destination_table="deviations_v1", dataset_id="telemetry_derived", project_id="moz-fx-data-shared-prod", owner="*****@*****.**", email=["*****@*****.**"], date_partition_parameter="submission_date", depends_on_past=False, dag=dag, ) wait_for_anomdtct_anomdtct = ExternalTaskSensor( task_id="wait_for_anomdtct_anomdtct", external_dag_id="anomdtct", external_task_id="anomdtct", check_existence=True, dag=dag, ) telemetry_derived__deviations__v1.set_upstream(wait_for_anomdtct_anomdtct)
"retries": 2, } with DAG( "bqetl_asn_aggregates", default_args=default_args, schedule_interval="0 2 * * *" ) as dag: telemetry_derived__asn_aggregates__v1 = bigquery_etl_query( task_id="telemetry_derived__asn_aggregates__v1", destination_table="asn_aggregates_v1", dataset_id="telemetry_derived", project_id="moz-fx-data-shared-prod", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], date_partition_parameter="submission_date", depends_on_past=False, parameters=["n_clients:INT64:500"], dag=dag, ) wait_for_bq_main_events = ExternalTaskSensor( task_id="wait_for_bq_main_events", external_dag_id="copy_deduplicate", external_task_id="bq_main_events", execution_delta=datetime.timedelta(seconds=3600), check_existence=True, mode="reschedule", ) telemetry_derived__asn_aggregates__v1.set_upstream(wait_for_bq_main_events)
"retries": 2, "retry_delay": timedelta(minutes=10), } dag = DAG("public_data_hardware_report", default_args=default_args, schedule_interval="0 1 * * MON") # Required to write json output to s3://telemetry-public-analysis-2/public-data-report/hardware/ write_aws_conn_id='aws_dev_telemetry_public_analysis_2_rw' aws_access_key, aws_secret_key, session = AwsHook(write_aws_conn_id).get_credentials() # hardware_report's execution date will be {now}-7days. It will read last week's main pings, # therefore we need to wait for yesterday's Main Ping deduplication task to finish wait_for_main_ping = ExternalTaskSensor( task_id="wait_for_main_ping", external_dag_id="main_summary", external_task_id="copy_deduplicate_main_ping", execution_delta=timedelta(days=-6), check_existence=True, dag=dag, ) params = get_dataproc_parameters("google_cloud_airflow_dataproc") hardware_report = SubDagOperator( task_id="public_data_hardware_report", dag=dag, subdag = moz_dataproc_pyspark_runner( parent_dag_name=dag.dag_id, dag_name="public_data_hardware_report", default_args=default_args, cluster_name="public-data-hardware-report-{{ ds }}", job_name="Firefox_Public_Data_Hardware_Report-{{ ds }}",
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'email_discounts',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Email discounts to customers that have experienced order delays daily.'
)

# Wait for delivery_times_7_days DAG to complete
t1 = ExternalTaskSensor(
    task_id='wait_for_delivery_times_7_days',
    external_dag_id='delivery_times_7_days',
    mode='reschedule',
    dag=dag
)

t2 = PostgresOperator(
    task_id='insert',
    postgres_conn_id='food_delivery_db',
    sql='''
    SELECT * FROM discounts;
    ''',
    dag=dag
)

t1 >> t2
} GLAM_DAG = "glam" GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG = "clients_histogram_aggregates" PERCENT_RELEASE_WINDOWS_SAMPLING = "10" dag = DAG(GLAM_DAG, default_args=default_args, schedule_interval="0 2 * * *") gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc") # Make sure all the data for the given day has arrived before running. wait_for_main_ping = ExternalTaskSensor( task_id="wait_for_main_ping", project_id=project_id, external_dag_id="copy_deduplicate", external_task_id="copy_deduplicate_main_ping", execution_delta=timedelta(hours=1), check_existence=True, dag=dag, ) latest_versions = bigquery_etl_query( task_id="latest_versions", destination_table="latest_versions", dataset_id=dataset_id, project_id=project_id, owner="*****@*****.**", date_partition_parameter=None, arguments=("--replace", ), dag=dag, )
email=["*****@*****.**"], date_partition_parameter="submission_date", depends_on_past=False, dag=dag, ) search_derived__search_metric_contribution__v1.set_upstream( search_derived__search_clients_daily__v8 ) search_derived__search_aggregates__v8.set_upstream( search_derived__search_clients_daily__v8 ) search_derived__search_clients_last_seen__v1.set_upstream( search_derived__search_clients_daily__v8 ) wait_for_main_summary_main_summary = ExternalTaskSensor( task_id="wait_for_main_summary_main_summary", external_dag_id="main_summary", external_task_id="main_summary", check_existence=True, mode="reschedule", dag=dag, ) search_derived__search_clients_daily__v8.set_upstream( wait_for_main_summary_main_summary )
task_id="telemetry_derived__adm_engagements_daily__v1", destination_table="adm_engagements_daily_v1", dataset_id="telemetry_derived", project_id="moz-fx-data-shared-prod", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**", "*****@*****.**"], date_partition_parameter="submission_date", depends_on_past=False, dag=dag, ) wait_for_bq_main_events = ExternalTaskSensor( task_id="wait_for_bq_main_events", external_dag_id="copy_deduplicate", external_task_id="bq_main_events", execution_delta=datetime.timedelta(seconds=3600), check_existence=True, mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR", ) telemetry_derived__adm_engagements_daily__v1.set_upstream(wait_for_bq_main_events) wait_for_event_events = ExternalTaskSensor( task_id="wait_for_event_events", external_dag_id="copy_deduplicate", external_task_id="event_events", execution_delta=datetime.timedelta(seconds=3600), check_existence=True, mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR", )
project_id="moz-fx-data-shared-prod", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], date_partition_parameter="submission_date", depends_on_past=False, dag=dag, ) messaging_system_derived__cfr_exact_mau28_by_dimensions__v1.set_upstream( messaging_system_derived__cfr_users_last_seen__v1) wait_for_copy_deduplicate_all = ExternalTaskSensor( task_id="wait_for_copy_deduplicate_all", external_dag_id="copy_deduplicate", external_task_id="copy_deduplicate_all", execution_delta=datetime.timedelta(seconds=3600), check_existence=True, mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR", ) messaging_system_derived__cfr_users_daily__v1.set_upstream( wait_for_copy_deduplicate_all) messaging_system_derived__cfr_users_last_seen__v1.set_upstream( messaging_system_derived__cfr_users_daily__v1) messaging_system_derived__onboarding_users_daily__v1.set_upstream( wait_for_copy_deduplicate_all) messaging_system_derived__onboarding_users_last_seen__v1.set_upstream(
        default_args=rocket_args,
        project='moz-fx-data-shared-prod',
        dataset='telemetry',
        table_or_view='rocket_android_events_v1',
        s3_prefix='rocket_android',
    ),
    task_id=rocket_android_task_id
)

# DevTools view merges events from `telemetry.main` and `telemetry.event`.
# We need to make sure both tables are ready and deduplicated before proceeding.
wait_for_copy_deduplicate_all = ExternalTaskSensor(
    task_id="wait_for_copy_deduplicate_all",
    external_dag_id="copy_deduplicate",
    external_task_id="copy_deduplicate_all",
    execution_delta=datetime.timedelta(hours=1),
    mode="reschedule",
    pool="DATA_ENG_EXTERNALTASKSENSOR",
    email_on_retry=False,
    dag=dag)

wait_for_copy_deduplicate_main_ping = ExternalTaskSensor(
    task_id="wait_for_copy_deduplicate_main_ping",
    external_dag_id="copy_deduplicate",
    external_task_id="copy_deduplicate_main_ping",
    execution_delta=datetime.timedelta(hours=1),
    mode="reschedule",
    pool="DATA_ENG_EXTERNALTASKSENSOR",
    email_on_retry=False,
    dag=dag,
)
], "org_mozilla_fenix_glam_beta": ["org_mozilla_fenix", "org_mozilla_firefox_beta"], "org_mozilla_fenix_glam_release": ["org_mozilla_firefox"], } dag = DAG("glam_fenix", default_args=default_args, schedule_interval="0 2 * * *") wait_for_copy_deduplicate = ExternalTaskSensor( task_id="wait_for_copy_deduplicate", external_dag_id="copy_deduplicate", external_task_id="copy_deduplicate_all", execution_delta=timedelta(hours=1), check_existence=True, mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR", email_on_retry=False, dag=dag, ) mapping = {} for product in PRODUCTS: query = generate_and_run_glean_query( task_id=f"daily_{product}", product=product, destination_project_id=PROJECT, env_vars=dict(STAGE="daily"), dag=dag, )
"--model-input-table-id", "ltv_daily_model_perf", "--model-output-table-id", "ltv_daily", "--temporary-gcs-bucket", params.storage_bucket, ], gcp_conn_id=params.conn_id, service_account=params.client_email, artifact_bucket=params.artifact_bucket, storage_bucket=params.storage_bucket, default_args=subdag_args, ), ) if params.is_dev: copy_to_dev = copy_artifacts_dev(dag, params.project_id, params.artifact_bucket, params.storage_bucket) copy_to_dev >> ltv_daily else: wait_for_search_clients_last_seen = ExternalTaskSensor( task_id="wait_for_search_clients_last_seen", external_dag_id="main_summary", external_task_id="search_clients_last_seen", execution_delta=timedelta(hours=-1), check_existence=True, dag=dag, ) wait_for_search_clients_last_seen >> ltv_daily
task_id="event_events", project_id="moz-fx-data-shared-prod", destination_table="event_events_v1", dataset_id="telemetry_derived", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], arguments=('--schema_update_option=ALLOW_FIELD_ADDITION', ), ) copy_deduplicate_all >> event_events # Experiment enrollment aggregates chain (depends on events) wait_for_main_events = ExternalTaskSensor( task_id="wait_for_main_events", external_dag_id="main_summary", external_task_id="bq_main_events", dag=dag) experiment_enrollment_aggregates = bigquery_etl_query( task_id="experiment_enrollment_aggregates", project_id="moz-fx-data-shared-prod", destination_table="experiment_enrollment_aggregates_v1", dataset_id="telemetry_derived", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"]) gen_query_task_id = "experiment_enrollment_aggregates_live_generate_query" # setting xcom_push to True outputs this query to an xcom experiment_enrollment_aggregates_live_generate_query = gke_command(
    'retries': 3
}

dag_core_id = TEST_DAG_ID + '_core'
dag_core = DAG(dag_core_id, default_args=args,
               schedule_interval=timedelta(seconds=1))
task_core = DummyOperator(task_id='task_core', dag=dag_core)

dag_first_child_id = TEST_DAG_ID + '_first_child'
dag_first_child = DAG(dag_first_child_id, default_args=args,
                      schedule_interval=timedelta(seconds=1))
t1_first_child = ExternalTaskSensor(task_id='t1_first_child',
                                    external_dag_id=dag_core_id,
                                    external_task_id='task_core',
                                    poke_interval=1,
                                    dag=dag_first_child,
                                    depends_on_past=True)
t2_first_child = DummyOperator(task_id='t2_first_child',
                               dag=dag_first_child,
                               depends_on_past=True)
t2_first_child.set_upstream(t1_first_child)

dag_second_child_id = TEST_DAG_ID + '_second_child'
dag_second_child = DAG(dag_second_child_id, default_args=args,
                       schedule_interval=timedelta(seconds=1))
t1_second_child = ExternalTaskSensor(task_id='t1_second_child',
                                     external_dag_id=dag_first_child_id,
                                     external_task_id='t2_first_child',
                                     poke_interval=1,
CleanUpSKOPages = TriggerDagRunOperator(task_id='CleanUpSKOPages',
                                        trigger_dag_id="01_CleanupDag",
                                        python_callable=trigger_dag_run_pass_params,
                                        params={},
                                        dag=dag)

ImportComtelData = TriggerDagRunOperator(task_id='ImportComtelData',
                                         trigger_dag_id="02_ImportDataDag",
                                         python_callable=trigger_dag_run_pass_params,
                                         params={},
                                         dag=dag)

WaitForImportComtelData = ExternalTaskSensor(task_id='WaitForImportComtelData',
                                             external_dag_id='02_ImportDataDag',
                                             external_task_id='dag_complete',
                                             execution_delta=None,  # Same day as today
                                             dag=dag)

ImportProgramLogMinus14 = TriggerDagRunOperator(task_id='ImportProgramLogMinus14',
                                                trigger_dag_id="03_ImportProgramLog",
                                                python_callable=trigger_dag_run_pass_params,
                                                params={'daysDelta': 14},
                                                dag=dag)

WaitForImportProgramLogMinus14 = ExternalTaskSensor(task_id='WaitForImportProgramLogMinus14',
                                                    external_dag_id='03_ImportProgramLog',
                                                    external_task_id='dag_complete',
                                                    execution_delta=None,  # Same day as today
                                                    dag=dag)
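# execution_delta=None in the sensors above is simply the default: the sensor
# pokes for the external run whose execution_date exactly equals its own,
# hence the "Same day as today" comments. That only matches when the two runs
# share an execution_date; runs started via TriggerDagRunOperator may get a
# different one, in which case execution_date_fn is the usual escape hatch.
# A hypothetical sketch of that workaround:
wait_flexible = ExternalTaskSensor(
    task_id='WaitForImportComtelDataFlexible',
    external_dag_id='02_ImportDataDag',
    external_task_id='dag_complete',
    # Map this DAG's execution_date to the external DAG's, e.g. truncate
    # to midnight of the same day:
    execution_date_fn=lambda dt: dt.replace(hour=0, minute=0,
                                            second=0, microsecond=0),
    dag=dag,
)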
org_mozilla_vrbrowser_derived__clients_daily__v1 = bigquery_etl_query(
    task_id="org_mozilla_vrbrowser_derived__clients_daily__v1",
    destination_table="clients_daily_v1",
    dataset_id="org_mozilla_vrbrowser_derived",
    project_id="moz-fx-data-shared-prod",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    date_partition_parameter="submission_date",
    depends_on_past=False,
    dag=dag,
)

wait_for_copy_deduplicate_copy_deduplicate_all = ExternalTaskSensor(
    task_id="wait_for_copy_deduplicate_copy_deduplicate_all",
    external_dag_id="copy_deduplicate",
    external_task_id="copy_deduplicate_all",
    check_existence=True,
    dag=dag,
)

org_mozilla_vrbrowser_derived__baseline_daily__v1.set_upstream(
    wait_for_copy_deduplicate_copy_deduplicate_all)

org_mozilla_vrbrowser_derived__metrics_daily__v1.set_upstream(
    wait_for_copy_deduplicate_copy_deduplicate_all)

org_mozilla_vrbrowser_derived__clients_last_seen__v1.set_upstream(
    org_mozilla_vrbrowser_derived__clients_daily__v1)

org_mozilla_vrbrowser_derived__clients_daily__v1.set_upstream(
    org_mozilla_vrbrowser_derived__baseline_daily__v1)
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'etl_orders',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Loads newly placed orders daily.'
)

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(
    task_id='wait_for_new_food_deliveries',
    external_dag_id='new_food_deliveries',
    mode='reschedule',
    dag=dag
)

# Wait for etl_menu_items DAG to complete
t2 = ExternalTaskSensor(
    task_id='wait_for_etl_menu_items',
    external_dag_id='etl_menu_items',
    mode='reschedule',
    dag=dag
)

t3 = PostgresOperator(
    task_id='if_not_exists',
    postgres_conn_id='food_delivery_db',
    sql='''
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': True,
    'start_date': datetime(2017, 1, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('fx_usage_report', default_args=default_args, schedule_interval='@weekly')

wait_for_main_summary = ExternalTaskSensor(
    task_id='wait_for_main_summary',
    external_dag_id='main_summary',
    external_task_id='main_summary',
    # main_summary waits one hour, execution date is beginning of the week
    execution_delta=timedelta(days=-7, hours=-1),
    dag=dag)

usage_report = EMRSparkOperator(
    task_id="fx_usage_report",
    job_name="Fx Usage Report",
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    release_label="emr-5.11.0",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    env={"date": DS_WEEKLY,
         "bucket": "{{ task.__class__.public_output_bucket }}",
         "deploy_environment": "{{ task.__class__.deploy_environment }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/fx_usage_report.sh",