dag_name="taar_lite", default_args=default_args, cluster_name=taarlite_cluster_name, job_name="TAAR_Lite_GUID_GUID", python_driver_code= "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_lite_guidguid.py", # python_driver_code="gs://temp-hwoo-removemelater/taar_lite_guidguid.py", num_workers=8, py_args=[ "--date", "{{ ds_nodash }}", "--aws_access_key_id", aws_access_key, "--aws_secret_access_key", aws_secret_key, ], aws_conn_id=aws_conn_id, gcp_conn_id=gcpdataproc_conn_id, ), dag=dag, ) # Set a dependency on amodump from amowhitelist amowhitelist.set_upstream(amodump) # Set a dependency on amodump for the editorial reviewed whitelist of # addons editorial_whitelist.set_upstream(amodump) # Set a dependency on amowhitelist from taar_lite taar_lite.set_upstream(amowhitelist)
bash_command=a1, dag=dag)

t_to_hive = BashOperator(task_id='to_hive', bash_command=a1, dag=dag)

feature_analysis = SubDagOperator(
    task_id='feature_analysis',
    subdag=subdag(DAG_NAME, 'feature_analysis', default_args),
    dag=dag,
)

feature_slope = SubDagOperator(
    task_id='feature_slope',
    subdag=subdag(DAG_NAME, 'feature_slope', default_args),
    dag=dag,
)

kdj_rsi = SubDagOperator(
    task_id='kdj_rsi_stockstats',
    subdag=subdag(DAG_NAME, 'kdj_rsi_stockstats', default_args),
    dag=dag,
)

download_main = DummyOperator(task_id='run_main_PROJECTNAME')
to_hive = DummyOperator(task_id='to_hive')

#t_mv_daily_report.set_upstream(t_dazongjiaoyi)
t_clean_data.set_upstream(t_download_data)
t_to_hive.set_upstream(t_clean_data)
t_scrap_data.set_upstream(t_to_hive)
t_analysis.set_upstream(t_scrap_data)
feature_analysis.set_upstream(t_analysis)
feature_slope.set_upstream(t_analysis)
kdj_rsi.set_upstream(t_analysis)
#download_main.set_upstream(t_scrap_data)
t_scrap_data >> download_main >> to_hive
taar_collaborative_recommender = EMRSparkOperator(
    task_id="addon_recommender",
    job_name="Train the Collaborative Addon Recommender",
    execution_timeout=timedelta(hours=10),
    instance_count=20,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    env={
        "date": "{{ ds_nodash }}",
        "privateBucket": "{{ task.__class__.private_output_bucket }}",
        "publicBucket": "{{ task.__class__.public_output_bucket }}",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/addon_recommender.sh",
    dag=dag)

main_summary_schema.set_upstream(main_summary)
main_summary_bigquery_load.set_upstream(main_summary)
engagement_ratio.set_upstream(main_summary)
addons.set_upstream(main_summary)
addons_bigquery_load.set_upstream(addons)
addon_aggregates.set_upstream(addons)
addon_aggregates_bigquery_load.set_upstream(addon_aggregates)
main_events.set_upstream(main_summary)
main_events_bigquery_load.set_upstream(main_events)
main_summary_experiments.set_upstream(main_summary)
main_summary_experiments_bigquery_load.set_upstream(main_summary_experiments)
experiments_aggregates_import.set_upstream(main_summary_experiments)
subdag_id = 'copy_data_to_redshift'
copy_data_to_redshift = SubDagOperator(
    subdag=get_s3_to_redshift(
        parent_dag_name='udacity-dend-capstone',
        task_id=subdag_id,
        tables_definition=copy_definitions,
        redshift_conn_id='redshift',
        redshift_schema='public',
        s3_conn_id='aws_credentials',
        s3_bucket='udac-dend-capstone-dz',
        load_type='truncate',
        schema_location='Local',
        start_date=start_date),
    task_id=subdag_id,
    dag=dag,
    executor=LocalExecutor())
copy_data_to_redshift.set_upstream(start_operator)

process_dim_category = PostgresOperator(dag=dag,
                                        task_id='process_dim_category',
                                        sql='/sql/categories.sql',
                                        postgres_conn_id='redshift')
process_dim_category.set_upstream(copy_data_to_redshift)

process_dim_cities = PostgresOperator(dag=dag,
                                      task_id='process_dim_cities',
                                      sql='/sql/cities.sql',
                                      postgres_conn_id='redshift')
process_dim_cities.set_upstream(copy_data_to_redshift)

process_dim_business = PostgresOperator(dag=dag,
                                        task_id='process_dim_business',
owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], sql_file_path='sql/search/search_clients_daily_v8/query.sql', dag=dag) search_aggregates_bigquery = bigquery_etl_query( task_id="search_aggregates_bigquery", destination_table="search_aggregates_v8", dataset_id="search_staging", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], sql_file_path='sql/search/search_aggregates_v8/query.sql', dag=dag) main_summary_schema.set_upstream(main_summary) main_summary_bigquery_load.set_upstream(main_summary) main_summary_dataproc.set_upstream(main_ping_bigquery_export) main_ping_bigquery_export_delete.set_upstream(main_summary_dataproc) main_summary_dataproc_bigquery_load.set_upstream(main_summary_dataproc) main_summary_dataproc_s3_copy.set_upstream(main_summary_dataproc) engagement_ratio.set_upstream(main_summary) addons.set_upstream(main_summary) addons_bigquery_load.set_upstream(addons) addon_aggregates.set_upstream(addons) addon_aggregates_bigquery_load.set_upstream(addon_aggregates) main_events.set_upstream(main_summary) main_events_bigquery_load.set_upstream(main_events)
'depends_on_past': False,
    'start_date': datetime(2020, 6, 4),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Step 2 - Create a DAG object
dag = DAG('sub_dag_demo', schedule_interval='0 0 * * *', default_args=default_args)

# Step 3 - Create tasks
some_task = BashOperator(task_id='someTask', bash_command="echo I am some task", dag=dag)
sub_dag = SubDagOperator(subdag=my_sub_dag('sub_dag_demo'), task_id='my_sub_dag', dag=dag)
final_task = BashOperator(task_id='finalTask', bash_command="echo I am the final task", dag=dag)

# Step 4 - Define the sequence of tasks.
sub_dag.set_upstream(some_task)
final_task.set_upstream(sub_dag)
dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_flat_summary",
        dataset_version="v1",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="sync_flat_view_bigquery_load",
    dag=dag)

sync_bookmark_validation = EMRSparkOperator(
    task_id="sync_bookmark_validation",
    job_name="Sync Bookmark Validation",
    execution_timeout=timedelta(hours=2),
    instance_count=1,
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("sync_bookmark_validation", {
        "start_date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)

sync_bookmark_validation.set_upstream(sync_view)
sync_view_bigquery_load.set_upstream(sync_view)
sync_events_view_bigquery_load.set_upstream(sync_events_view)
sync_flat_view_bigquery_load.set_upstream(sync_flat_view)
task_id=task_id,
    subdag=subdag(task_id, 'daily_items_indicator_info',
                  data_worker.worker.collect_some_daily_info),
    executor=CeleryExecutor(),
    dag=dag,
)

task_id = 'collect_daily_items_info_all'
items_all_task = SubDagOperator(
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_info',
                  data_worker.worker.collect_some_daily_info_all),
    executor=CeleryExecutor(),
    dag=dag,
)

task_id = 'collect_daily_items_indicator_info_all'
items_indicator_all_task = SubDagOperator(
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_indicator_info',
                  data_worker.worker.collect_some_daily_info_all),
    executor=CeleryExecutor(),
    dag=dag,
)

items_task.set_upstream(
    [itemcodes_task, item_naver_sectors_task, item_themes_task])
items_task.set_downstream(items_all_task)
items_all_task.set_downstream(items_indicator_task)
items_indicator_task.set_downstream(items_indicator_all_task)
},
    dag=dag)

deliver_res_op = """
cp -r {{ params.project_dir }}/{{ params.dag_id }}_results/* {{ params.project_dir }}/results
"""

deliver_res = BashOperator(task_id='Deliver_result',
                           bash_command=deliver_res_op,
                           params={
                               'project_dir': project_directory,
                               'dag_id': dag_id
                           },
                           dag=dag)

preprosessing.set_upstream(clean_up)
ms_concatenation.set_upstream(preprosessing)
clustering_or.set_upstream(ms_concatenation)
taxo_assignation.set_upstream(clustering_or)
biom_generation.set_upstream(clustering_or)
biom_generation.set_upstream(taxo_assignation)
tree_generation.set_upstream(clustering_or)
filter_weak_otus.set_upstream(biom_generation)
biom_conversion.set_upstream(filter_weak_otus)
raw_matrix_generation.set_upstream(biom_conversion)
matrix_normalization.set_upstream(raw_matrix_generation)
matrix_consolidation.set_upstream(matrix_normalization)
output_res.set_upstream(ms_concatenation)
output_res.set_upstream(tree_generation)
output_res.set_upstream(biom_conversion)
output_res.set_upstream(raw_matrix_generation)
task_id="terminate_backfill_cluster", aws_conn_id='aws_default', execution_timeout=timedelta(minutes=10), job_flow_id=job_flow_id_template, trigger_rule=TriggerRule.ALL_DONE, dag=dag ) job_flow_termination_sensor_task = MozEmrClusterEndSensor( task_id="cluster_termination_sensor", timeout=timedelta(hours=1).total_seconds(), job_flow_id=job_flow_id_template, dag=dag ) cluster_start_sensor_task.set_upstream(create_job_flow_task) for day in range(7): task_id = "main_summary_day_{}".format(day) subdag_task = SubDagOperator( task_id=task_id, subdag=main_summary_subdag_factory(dag, task_id, day), on_retry_callback=clear_subdag_callback, dag=dag ) subdag_task.set_upstream(cluster_start_sensor_task) terminate_job_flow_task.set_upstream(subdag_task) job_flow_termination_sensor_task.set_upstream(terminate_job_flow_task)
terminate_job_flow_task = EmrTerminateJobFlowOperator(
    task_id="terminate_backfill_cluster",
    aws_conn_id='aws_default',
    execution_timeout=timedelta(minutes=10),
    job_flow_id=job_flow_id_template,
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)

job_flow_termination_sensor_task = MozEmrClusterEndSensor(
    task_id="cluster_termination_sensor",
    timeout=timedelta(hours=1).total_seconds(),
    job_flow_id=job_flow_id_template,
    dag=dag)

cluster_start_sensor_task.set_upstream(create_job_flow_task)

upstream = cluster_start_sensor_task
for day in range(7):
    task_id = "main_summary_day_{}".format(day)
    subdag_task = SubDagOperator(task_id=task_id,
                                 subdag=main_summary_subdag_factory(
                                     dag, task_id, day),
                                 on_retry_callback=clear_subdag_callback,
                                 dag=dag)
    subdag_task.set_upstream(upstream)
    terminate_job_flow_task.set_upstream(subdag_task)
    upstream = subdag_task

job_flow_termination_sensor_task.set_upstream(terminate_job_flow_task)
py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            taar_aws_access_key,
            "--aws_secret_access_key",
            taar_aws_secret_key,
        ],
        aws_conn_id=taar_aws_conn_id,
        gcp_conn_id=taar_gcpdataproc_conn_id,
    ),
    dag=dag,
)

main_summary_schema.set_upstream(main_summary)
main_summary_bigquery_load.set_upstream(main_summary)
sql_main_summary.set_upstream(copy_deduplicate_main_ping)
sql_main_summary_export.set_upstream(sql_main_summary)
clients_daily.set_upstream(sql_main_summary)
clients_daily_export.set_upstream(clients_daily)
addons.set_upstream(main_summary)
addons_bigquery_load.set_upstream(addons)
addon_aggregates.set_upstream(addons)
addon_aggregates_bigquery_load.set_upstream(addon_aggregates)
main_events.set_upstream(main_summary)
main_events_bigquery_load.set_upstream(main_events)
main_summary_experiments.set_upstream(main_summary)
for model in SOURCE_DATASET_TABLE_NAMES:
    subdag_preprocess_op = SubDagOperator(
        task_id="subdag_preprocess_{}_task".format(model.replace(".", "_")),
        subdag=preprocess.preprocess_tasks(
            model, DAG_NAME,
            "subdag_preprocess_{}_task".format(model.replace(".", "_")),
            default_args, PROJECT_ID, BUCKET, DATA_DIR),
        dag=dag)

    subdag_training_op = SubDagOperator(
        task_id="subdag_training_{}_task".format(model.replace(".", "_")),
        subdag=training.training_tasks(
            model, DAG_NAME,
            "subdag_training_{}_task".format(model.replace(".", "_")),
            default_args, PROJECT_ID, BUCKET, DATA_DIR, MODEL_NAME,
            MODEL_VERSION, MODEL_LOCATION),
        dag=dag)

    subdag_deploy_op = SubDagOperator(
        task_id="subdag_deploy_{}_task".format(model.replace(".", "_")),
        subdag=deploy.deploy_tasks(
            model, DAG_NAME,
            "subdag_deploy_{}_task".format(model.replace(".", "_")),
            default_args, PROJECT_ID, MODEL_NAME, MODEL_VERSION,
            MODEL_LOCATION),
        dag=dag)

    # Build dependency graph, set_upstream dependencies for all tasks
    subdag_training_op.set_upstream(subdag_preprocess_op)
    subdag_deploy_op.set_upstream(subdag_training_op)
task_id=VALIDATE_SITE_DESIGN_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

drydock_build = SubDagOperator(
    subdag=deploy_site_drydock(PARENT_DAG_NAME,
                               DRYDOCK_BUILD_DAG_NAME,
                               args=default_args),
    task_id=DRYDOCK_BUILD_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

query_node_status = PlaceholderOperator(
    task_id='deployed_node_status',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

armada_build = PlaceholderOperator(
    task_id='armada_build',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

# DAG Wiring
concurrency_check.set_upstream(action_xcom)
preflight.set_upstream(concurrency_check)
get_design_version.set_upstream(preflight)
validate_site_design.set_upstream(get_design_version)
drydock_build.set_upstream(validate_site_design)
query_node_status.set_upstream(drydock_build)
armada_build.set_upstream(query_node_status)
"pred-prefix": "bgbb/active_profiles/v1", }, dev_options={ "model-win": "30", "sample-ids": "[1]", }, other={ "MOZETL_GIT_PATH": "https://github.com/wcbeard/bgbb_airflow.git", "MOZETL_EXTERNAL_MODULE": "bgbb_airflow", }, ), dag=dag ) main_summary_schema.set_upstream(main_summary) main_summary_bigquery_load.set_upstream(main_summary) engagement_ratio.set_upstream(main_summary) addons.set_upstream(main_summary) addons_bigquery_load.set_upstream(addons) addon_aggregates.set_upstream(addons) addon_aggregates_bigquery_load.set_upstream(addon_aggregates) main_events.set_upstream(main_summary) main_events_bigquery_load.set_upstream(main_events) main_summary_experiments.set_upstream(main_summary) main_summary_experiments_bigquery_load.set_upstream(main_summary_experiments) experiments_aggregates_import.set_upstream(main_summary_experiments)
remove_updated_dim_grade = PythonOperator(
    task_id='remove_updated_dim_grade',
    python_callable=remove_updated_dim_grade,
    op_kwargs={'db': db},
    dag=dag)

insert_updated_dim_grade = PythonOperator(
    task_id='insert_updated_dim_grade',
    python_callable=insert_updated_dim_grade,
    op_kwargs={'db': db},
    dag=dag)

#########################################################################
########################## DAG flow #####################################

download_files.set_upstream(begin_task)
download_files.set_downstream(end_task)
end_task.set_downstream(initialize_transaction_table_dim_business)
initialize_transaction_table_dim_business.set_downstream(
    prepare_and_load_transaction_table_dim_business)
prepare_and_load_transaction_table_dim_business.set_downstream(
    update_transaction_table_dim_business_boro)
update_transaction_table_dim_business_boro.set_downstream(update_dim_business)
update_dim_business.set_downstream(remove_updated_dim_business)
remove_updated_dim_business.set_downstream(insert_updated_dim_business)
initialize_transaction_table_dim_grade.set_upstream(
    remove_updated_dim_business)
initialize_transaction_table_dim_grade.set_downstream(
    prepare_and_load_transaction_table_dim_grade)
"https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh", dag=dag) sync_bookmark_validation_total_per_day_bigquery_load = SubDagOperator( subdag=load_to_bigquery( parent_dag_name=dag.dag_id, dag_name="sync_bookmark_validation_total_per_day_bigquery_load", default_args=default_args, dataset_s3_bucket="telemetry-parquet", aws_conn_id="aws_dev_iam_s3", p2b_table_alias="sync_bmk_total_per_day_v1", dataset="sync/bmk_total_per_day", dataset_version="v1", date_submission_col="start_date", gke_cluster_name="bq-load-gke-1", bigquery_dataset="telemetry_derived", ), task_id="sync_bookmark_validation_total_per_day_bigquery_load", dag=dag) sync_bookmark_validation.set_upstream(sync_view) sync_view_bigquery_load.set_upstream(sync_view) sync_events_view_bigquery_load.set_upstream(sync_events_view) sync_flat_view_bigquery_load.set_upstream(sync_flat_view) sync_bookmark_validation_total_per_day_bigquery_load.set_upstream( sync_bookmark_validation)
"INT(sandbox_effective_content_process_level) AS sandbox_effective_content_process_level", "INT(scalar_parent_browser_engagement_max_concurrent_tab_count_max) AS scalar_parent_browser_engagement_max_concurrent_tab_count_max", "INT(scalar_parent_browser_engagement_max_concurrent_window_count_max) AS scalar_parent_browser_engagement_max_concurrent_window_count_max", "INT(scalar_parent_browser_engagement_unique_domains_count_max) AS scalar_parent_browser_engagement_unique_domains_count_max", "INT(timezone_offset) AS timezone_offset", ], parent_dag_name=dag.dag_id, dag_name="clients_daily_export", default_args=default_args, num_preemptible_workers=10), task_id="clients_daily_export", executor=get_default_executor(), dag=dag) wait_for_clients_daily = ExternalTaskSensor( task_id="wait_for_clients_daily", external_dag_id="bqetl_main_summary", external_task_id="telemetry_derived__clients_daily__v6", execution_delta=timedelta(hours=1), dag=dag) wait_for_main_summary = ExternalTaskSensor( task_id="wait_for_main_summary", external_dag_id="bqetl_main_summary", external_task_id="telemetry_derived__main_summary__v4", execution_delta=timedelta(hours=1), dag=dag) main_summary_export.set_upstream(wait_for_main_summary) clients_daily_export.set_upstream(wait_for_clients_daily)
depends_on_past=True,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    dag=dag)

experiments_daily_active_clients = bigquery_etl_query(
    task_id="experiments_daily_active_clients",
    destination_table="experiments_daily_active_clients_v1",
    dataset_id="telemetry_derived",
    project_id="moz-fx-data-shared-prod",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    dag=dag)

main_summary.set_upstream(copy_deduplicate_main_ping)
main_summary_export.set_upstream(main_summary)
clients_daily.set_upstream(main_summary)
clients_daily_export.set_upstream(clients_daily)
addons.set_upstream(copy_deduplicate_main_ping)
addon_aggregates.set_upstream(copy_deduplicate_main_ping)
addon_names.set_upstream(copy_deduplicate_main_ping)
clients_first_seen.set_upstream(clients_daily)
clients_last_seen.set_upstream(clients_daily)
exact_mau_by_dimensions.set_upstream(clients_last_seen)
exact_mau_by_client_count_dimensions.set_upstream(clients_last_seen)
smoot_usage_desktop_v2.set_upstream(clients_last_seen)
smoot_usage_desktop_compressed_v2.set_upstream(smoot_usage_desktop_v2)
simpleprophet_forecasts_desktop.set_upstream(exact_mau_by_dimensions)
devtools_panel_usage.set_upstream(clients_daily)
dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=[
        fact_queries['create_ft_complaints'],
        procon_queries['insert_ft_complaints']
    ])

subdag_task_id = 'data_quality_check'
data_quality_check = SubDagOperator(task_id=subdag_task_id,
                                    dag=dag,
                                    subdag=data_quality_check_subdag(
                                        DAG_NAME, subdag_task_id, start_date,
                                        REDSHIFT_CONN, STAGING_TABLE))

drop_procon_stage_table = PostgresOperator(
    task_id='drop_procon_stage_table',
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=procon_queries['drop_stage_table'])

end_operator = DummyOperator(task_id='finish_execution', dag=dag)

chain(start_operator, has_file_to_process, create_procon_stage_table,
      load_procon_stage_data, [
          load_dm_date_data, load_dm_region_data, load_dm_consumer_data,
          load_dm_company_data
      ], load_ft_complaints_data, data_quality_check, drop_procon_stage_table,
      end_operator)

delete_s3_key_files.set_upstream(load_procon_stage_data)
channel=slack_channel,
    username='******',
    text='Cluster has been *restarted!*\n'
         'It\'s all fine move forward with your ETLs and Crawlers!\n'
         'Message datetime: {{params.curr_date}}',
    params={'curr_date': str(datetime.now(pytz.timezone('America/Sao_Paulo')))},
    dag=dag
)

run_etl_crawler_cluster_up = SubDagOperator(
    subdag=sub_dag('check_cluster_slack', 'crawler_dag_cluster_up', dag.schedule_interval),
    task_id='crawler_dag_cluster_up',
    dag=dag,
)

run_etl_crawler_cluster_restarted = SubDagOperator(
    subdag=sub_dag('check_cluster_slack', 'crawler_dag_cluster_restarted', dag.schedule_interval),
    task_id='crawler_dag_cluster_restarted',
    dag=dag,
)

branch1.set_upstream(check_cluster)
send_slack_cluster_ok.set_upstream(branch1)
send_slack_cluster_start.set_upstream(branch1)
start_cluster.set_upstream(send_slack_cluster_start)
branch2.set_upstream(start_cluster)
send_slack_cluster_down.set_upstream(branch2)
send_slack_cluster_restarted_ok.set_upstream(branch2)
run_etl_crawler_cluster_up.set_upstream(send_slack_cluster_ok)
run_etl_crawler_cluster_restarted.set_upstream(send_slack_cluster_restarted_ok)
site_evacuation = PlaceholderOperator(
    task_id='site_evacuation',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

drydock_rebuild = PlaceholderOperator(
    task_id='drydock_rebuild',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

query_node_status = PlaceholderOperator(
    task_id='redeployed_node_status',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

armada_rebuild = PlaceholderOperator(
    task_id='armada_rebuild',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

# DAG Wiring
concurrency_check.set_upstream(action_xcom)
preflight.set_upstream(concurrency_check)
get_design_version.set_upstream(preflight)
validate_site_design.set_upstream(get_design_version)
site_evacuation.set_upstream(validate_site_design)
drydock_rebuild.set_upstream(site_evacuation)
query_node_status.set_upstream(drydock_rebuild)
armada_rebuild.set_upstream(query_node_status)
gcp_conn_id=taar_gcpdataproc_conn_id,
    ),
    dag=dag,
)

experiments_daily_active_clients = bigquery_etl_query(
    task_id="experiments_daily_active_clients",
    destination_table="experiments_daily_active_clients_v1",
    dataset_id="telemetry_derived",
    project_id="moz-fx-data-shared-prod",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    dag=dag)

main_summary.set_upstream(copy_deduplicate_main_ping)
main_summary_export.set_upstream(main_summary)
clients_daily.set_upstream(main_summary)
clients_daily_export.set_upstream(clients_daily)
addons.set_upstream(copy_deduplicate_main_ping)
addons_export.set_upstream(addons)
addon_aggregates.set_upstream(copy_deduplicate_main_ping)
addon_aggregates_export.set_upstream(addon_aggregates)
main_summary_experiments.set_upstream(main_summary)
main_summary_experiments.set_upstream(
    main_summary_experiments_get_experiment_list)
main_summary_experiments_export.set_upstream(main_summary_experiments)
taar_dynamo.set_upstream(main_summary_export)
taar_similarity.set_upstream(clients_daily_export)
owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], dag=dag) experiments_daily_active_clients = bigquery_etl_query( task_id="experiments_daily_active_clients", destination_table="experiments_daily_active_clients_v1", dataset_id="telemetry_derived", project_id="moz-fx-data-shared-prod", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"], dag=dag) main_summary.set_upstream(copy_deduplicate_main_ping) main_summary_export.set_upstream(main_summary) clients_daily.set_upstream(main_summary) clients_daily_export.set_upstream(clients_daily) addons.set_upstream(copy_deduplicate_main_ping) addon_aggregates.set_upstream(copy_deduplicate_main_ping) clients_last_seen.set_upstream(clients_daily) exact_mau_by_dimensions.set_upstream(clients_last_seen) exact_mau_by_client_count_dimensions.set_upstream(clients_last_seen) smoot_usage_desktop_v2.set_upstream(clients_last_seen) smoot_usage_desktop_compressed_v2.set_upstream(smoot_usage_desktop_v2) simpleprophet_forecasts_desktop.set_upstream(exact_mau_by_dimensions) devtools_panel_usage.set_upstream(clients_daily) bgbb_pred_dataproc.set_upstream(clients_daily)
PARENT_DAG_NAME, VALIDATE_SITE_DESIGN_DAG_NAME, args=default_args),
    task_id=VALIDATE_SITE_DESIGN_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

drydock_build = SubDagOperator(
    subdag=deploy_site_drydock(
        PARENT_DAG_NAME, DRYDOCK_BUILD_DAG_NAME, args=default_args),
    task_id=DRYDOCK_BUILD_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

query_node_status = PlaceholderOperator(
    task_id='deployed_node_status',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

armada_build = PlaceholderOperator(
    task_id='armada_build',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

# DAG Wiring
concurrency_check.set_upstream(action_xcom)
preflight.set_upstream(concurrency_check)
get_design_version.set_upstream(preflight)
validate_site_design.set_upstream(get_design_version)
drydock_build.set_upstream(validate_site_design)
query_node_status.set_upstream(drydock_build)
armada_build.set_upstream(query_node_status)
fda_linker_task = SubDagOperator(
    dag=dag,
    subdag=fda_dap(parent_dag_name='fda',
                   child_dag_name='linker',
                   start_date=dag.start_date,
                   schedule_interval=dag.schedule_interval),
    task_id='linker',
)

remove_unknown_documentcloud_docs_task = DockerOperator(
    task_id='remove_unknown_documentcloud_docs',
    dag=dag,
    image='opentrials/processors:latest',
    force_pull=True,
    api_version='1.23',
    environment={
        'WAREHOUSE_URL': helpers.get_postgres_uri('warehouse_db'),
        'DATABASE_URL': helpers.get_postgres_uri('api_db'),
        'EXPLORERDB_URL': helpers.get_postgres_uri('explorer_db'),
        'LOGGING_URL': Variable.get('LOGGING_URL'),
        'DOCUMENTCLOUD_USERNAME': Variable.get('DOCUMENTCLOUD_USERNAME'),
        'DOCUMENTCLOUD_PASSWORD': Variable.get('DOCUMENTCLOUD_PASSWORD'),
        'DOCUMENTCLOUD_PROJECT': Variable.get('DOCUMENTCLOUD_PROJECT'),
        'FERNET_KEY': os.environ['FERNET_KEY'],
    },
    command='make start remove_unknown_documentcloud_docs')

remove_unknown_documentcloud_docs_task.set_upstream(fda_linker_task)
fda_linker_task.set_upstream(fda_dap_task)
dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=[
        fact_queries['create_ft_complaints'],
        consumidorgovbr_queries['insert_ft_complaints']
    ])

subdag_task_id = 'data_quality_check'
data_quality_check = SubDagOperator(task_id=subdag_task_id,
                                    dag=dag,
                                    subdag=data_quality_check_subdag(
                                        DAG_NAME, subdag_task_id, start_date,
                                        REDSHIFT_CONN, STAGING_TABLE))

drop_consumidorgov_stage_table = PostgresOperator(
    task_id='drop_consumidorgov_stage_table',
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=[consumidorgovbr_queries['drop_stage_table']])

end_operator = DummyOperator(task_id='finish_execution', dag=dag)

chain(start_operator, has_file_to_process, convert_file_encoding,
      create_consumidorgov_stage_table, load_consumidorgovbr_stage_data, [
          load_dm_date_data, load_dm_region_data, load_dm_consumer_data,
          load_dm_company_data
      ], load_ft_complaints_data, data_quality_check,
      drop_consumidorgov_stage_table, end_operator)

delete_s3_key_files.set_upstream(load_consumidorgovbr_stage_data)
parent_dag_name=dag.dag_id,
        dag_name='fx_usage_report',
        default_args=default_args,
        cluster_name=cluster_name,
        service_account='*****@*****.**',
        job_name="Fx_Usage_Report",
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/fx_usage_report.sh",
        env={
            "date": DS_WEEKLY,
            "bucket": output_bucket,
            "PYTHONPATH": "/usr/lib/spark/python/lib/pyspark.zip",
            "deploy_environment": "prod",
            # These env variables are needed in addition to the s3a configs,
            # since some code uses boto to list bucket objects
            "AWS_ACCESS_KEY_ID": aws_access_key,
            "AWS_SECRET_ACCESS_KEY": aws_secret_key
        },
        gcp_conn_id=gcp_conn_id,
        # This is used to set the s3a configs for read/write to s3 for non-boto calls
        aws_conn_id=aws_conn_id,
        num_workers=9,
        worker_machine_type='n1-standard-16',
        image_version='1.3',
        init_actions_uris=[
            'gs://moz-fx-data-prod-airflow-dataproc-artifacts/bootstrap/fx_usage_init.sh'
        ],
    ))

usage_report.set_upstream(wait_for_main_summary)
dag=dag)

churn_v2_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="churn_v2_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="churn",
        dataset_version="v2",
        date_submission_col="week_start",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="churn_v2_bigquery_load",
    dag=dag)

churn_to_csv = EMRSparkOperator(
    task_id="churn_to_csv",
    job_name="Convert Churn v2 to csv",
    execution_timeout=timedelta(hours=4),
    instance_count=1,
    env=mozetl_envvar("churn_to_csv", {"start_date": "{{ ds_nodash }}"}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)

churn_bigquery_load.set_upstream(churn)
churn_to_csv.set_upstream(churn_v2)
churn_v2_bigquery_load.set_upstream(churn_v2)
validate_site_design = SubDagOperator(
    subdag=validate_site_design(PARENT_DAG_NAME,
                                VALIDATE_SITE_DESIGN_DAG_NAME,
                                args=default_args),
    task_id=VALIDATE_SITE_DESIGN_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

drydock_build = SubDagOperator(
    subdag=deploy_site_drydock(PARENT_DAG_NAME,
                               DRYDOCK_BUILD_DAG_NAME,
                               args=default_args),
    task_id=DRYDOCK_BUILD_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

armada_build = SubDagOperator(
    subdag=deploy_site_armada(PARENT_DAG_NAME,
                              ARMADA_BUILD_DAG_NAME,
                              args=default_args),
    task_id=ARMADA_BUILD_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

# DAG Wiring
concurrency_check.set_upstream(action_xcom)
get_design_version.set_upstream(concurrency_check)
validate_site_design.set_upstream(get_design_version)
drydock_build.set_upstream(validate_site_design)
armada_build.set_upstream(drydock_build)
)

skill_tag = SubDagOperator(
    subdag=skill_tag_dag,
    task_id='skill_tag',
    priority_weight=1,
    queue='subdag',
    dag=dag,
)

tabular_upload = SubDagOperator(subdag=tabular_upload_dag,
                                task_id='tabular_upload',
                                priority_weight=1,
                                queue='subdag',
                                dag=dag)

partner_etl.set_upstream(partner_quarterly)
api_sync.set_upstream(title_count)
api_sync.set_upstream(onet_extract)
api_sync.set_upstream(normalizer_index)
normalizer_index.set_upstream(partner_etl)
normalizer_index.set_upstream(onet_extract)
geocode.set_upstream(partner_etl)
title_count.set_upstream(geocode)
soc_count.set_upstream(geocode)
job_label.set_upstream(partner_etl)
job_vectorize.set_upstream(partner_etl)
skill_tag.set_upstream(partner_etl)
skill_tag.set_upstream(onet_extract)
tabular_upload.set_upstream(title_count)