Example 1
        dag_name="taar_lite",
        default_args=default_args,
        cluster_name=taarlite_cluster_name,
        job_name="TAAR_Lite_GUID_GUID",
        python_driver_code=
        "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_lite_guidguid.py",
        # python_driver_code="gs://temp-hwoo-removemelater/taar_lite_guidguid.py",
        num_workers=8,
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            aws_access_key,
            "--aws_secret_access_key",
            aws_secret_key,
        ],
        aws_conn_id=aws_conn_id,
        gcp_conn_id=gcpdataproc_conn_id,
    ),
    dag=dag,
)
# Set a dependency on amodump from amowhitelist
amowhitelist.set_upstream(amodump)

# Set a dependency on amodump for the editorial reviewed whitelist of
# addons
editorial_whitelist.set_upstream(amodump)

# Set a dependency on amowhitelist from taar_lite
taar_lite.set_upstream(amowhitelist)
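The wiring above uses set_upstream throughout. For reference, Airflow expresses the same relationships with the bitshift operators; a minimal, self-contained sketch (the dag_id and task names here are illustrative, and Airflow 1.10-style import paths are assumed):

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

with DAG("upstream_demo", start_date=datetime(2020, 1, 1), schedule_interval=None) as demo_dag:
    a = DummyOperator(task_id="a")
    b = DummyOperator(task_id="b")
    c = DummyOperator(task_id="c")

    b.set_upstream(a)   # a runs before b ...
    a >> c              # ... and this is equivalent to c.set_upstream(a)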
Example 2
                               bash_command=a1,
                               dag=dag)
t_to_hive = BashOperator(task_id='to_hive', bash_command=a1, dag=dag)
feature_analysis = SubDagOperator(
    task_id='feature_analysis',
    subdag=subdag(DAG_NAME, 'feature_analysis', default_args),
    dag=dag,
)
feature_slope = SubDagOperator(
    task_id='feature_slope',
    subdag=subdag(DAG_NAME, 'feature_slope', default_args),
    dag=dag,
)
kdj_rsi = SubDagOperator(
    task_id='kdj_rsi_stockstats',
    subdag=subdag(DAG_NAME, 'kdj_rsi_stockstats', default_args),
    dag=dag,
)
download_main = DummyOperator(task_id='run_main_PROJECTNAME')
to_hive = DummyOperator(task_id='to_hive')
#t_mv_daily_report.set_upstream(t_dazongjiaoyi)
t_clean_data.set_upstream(t_download_data)
t_to_hive.set_upstream(t_clean_data)
t_scrap_data.set_upstream(t_to_hive)
t_analysis.set_upstream(t_scrap_data)
feature_analysis.set_upstream(t_analysis)
feature_slope.set_upstream(t_analysis)
kdj_rsi.set_upstream(t_analysis)
#download_main.set_upstream(t_scrap_data)
t_scrap_data >> download_main >> to_hive
Example 3
taar_collaborative_recommender = EMRSparkOperator(
    task_id="addon_recommender",
    job_name="Train the Collaborative Addon Recommender",
    execution_timeout=timedelta(hours=10),
    instance_count=20,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    env={"date": "{{ ds_nodash }}",
         "privateBucket": "{{ task.__class__.private_output_bucket }}",
         "publicBucket": "{{ task.__class__.public_output_bucket }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/addon_recommender.sh",
    dag=dag)


main_summary_schema.set_upstream(main_summary)
main_summary_bigquery_load.set_upstream(main_summary)

engagement_ratio.set_upstream(main_summary)

addons.set_upstream(main_summary)
addons_bigquery_load.set_upstream(addons)
addon_aggregates.set_upstream(addons)
addon_aggregates_bigquery_load.set_upstream(addon_aggregates)

main_events.set_upstream(main_summary)
main_events_bigquery_load.set_upstream(main_events)

main_summary_experiments.set_upstream(main_summary)
main_summary_experiments_bigquery_load.set_upstream(main_summary_experiments)

experiments_aggregates_import.set_upstream(main_summary_experiments)
Example 4
    subdag_id = 'copy_data_to_redshift'
    copy_data_to_redshift = SubDagOperator(
        subdag=get_s3_to_redshift(
            parent_dag_name='udacity-dend-capstone',
            task_id=subdag_id,
            tables_definition=copy_definitions,
            redshift_conn_id='redshift',
            redshift_schema='public',
            s3_conn_id='aws_credentials',
            s3_bucket='udac-dend-capstone-dz',
            load_type='truncate',
            schema_location='Local',
            start_date=start_date),
        task_id=subdag_id,
        dag=dag,
        executor=LocalExecutor())
    copy_data_to_redshift.set_upstream(start_operator)

process_dim_category = PostgresOperator(dag=dag,
                                        task_id='process_dim_category',
                                        sql='/sql/categories.sql',
                                        postgres_conn_id='redshift')
process_dim_category.set_upstream(copy_data_to_redshift)

process_dim_cities = PostgresOperator(dag=dag,
                                      task_id='process_dim_cities',
                                      sql='/sql/cities.sql',
                                      postgres_conn_id='redshift')
process_dim_cities.set_upstream(copy_data_to_redshift)

process_dim_business = PostgresOperator(dag=dag,
                                        task_id='process_dim_business',
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    sql_file_path='sql/search/search_clients_daily_v8/query.sql',
    dag=dag)

search_aggregates_bigquery = bigquery_etl_query(
    task_id="search_aggregates_bigquery",
    destination_table="search_aggregates_v8",
    dataset_id="search_staging",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    sql_file_path='sql/search/search_aggregates_v8/query.sql',
    dag=dag)

main_summary_schema.set_upstream(main_summary)
main_summary_bigquery_load.set_upstream(main_summary)

main_summary_dataproc.set_upstream(main_ping_bigquery_export)
main_ping_bigquery_export_delete.set_upstream(main_summary_dataproc)
main_summary_dataproc_bigquery_load.set_upstream(main_summary_dataproc)
main_summary_dataproc_s3_copy.set_upstream(main_summary_dataproc)

engagement_ratio.set_upstream(main_summary)

addons.set_upstream(main_summary)
addons_bigquery_load.set_upstream(addons)
addon_aggregates.set_upstream(addons)
addon_aggregates_bigquery_load.set_upstream(addon_aggregates)

main_events.set_upstream(main_summary)
main_events_bigquery_load.set_upstream(main_events)
Example 6
    'depends_on_past': False,
    'start_date': datetime(2020, 6, 4),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Step 2 - Create a DAG object
dag = DAG('sub_dag_demo',
          schedule_interval='0 0 * * *',
          default_args=default_args)

# Step 3 - Create tasks
some_task = BashOperator(task_id='someTask',
                         bash_command="echo I am some task",
                         dag=dag)

sub_dag = SubDagOperator(subdag=my_sub_dag('sub_dag_demo'),
                         task_id='my_sub_dag',
                         dag=dag)

final_task = BashOperator(task_id='finalTask',
                          bash_command="echo I am the final task",
                          dag=dag)

# Step 4 - Define the sequence of tasks.
sub_dag.set_upstream(some_task)
final_task.set_upstream(sub_dag)
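The my_sub_dag factory called in Step 3 is not part of this excerpt. A minimal sketch of what such a factory could look like, assuming Airflow 1.10-style imports and BashOperator tasks (the task names inside are hypothetical); note that a SubDAG's dag_id must be "<parent_dag_id>.<subdag_task_id>":

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator


def my_sub_dag(parent_dag_name):
    # SubDagOperator requires the child dag_id to be
    # "<parent_dag_id>.<subdag_task_id>", here "sub_dag_demo.my_sub_dag".
    sub_dag = DAG('{}.{}'.format(parent_dag_name, 'my_sub_dag'),
                  schedule_interval='0 0 * * *',
                  start_date=datetime(2020, 6, 4),
                  default_args={'retries': 1,
                                'retry_delay': timedelta(minutes=5)})

    step_one = BashOperator(task_id='step_one',
                            bash_command='echo step one',
                            dag=sub_dag)
    step_two = BashOperator(task_id='step_two',
                            bash_command='echo step two',
                            dag=sub_dag)
    step_two.set_upstream(step_one)

    return sub_dag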
Example 7
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_flat_summary",
        dataset_version="v1",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="sync_flat_view_bigquery_load",
    dag=dag)

sync_bookmark_validation = EMRSparkOperator(
    task_id="sync_bookmark_validation",
    job_name="Sync Bookmark Validation",
    execution_timeout=timedelta(hours=2),
    instance_count=1,
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("sync_bookmark_validation", {
        "start_date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)


sync_bookmark_validation.set_upstream(sync_view)

sync_view_bigquery_load.set_upstream(sync_view)

sync_events_view_bigquery_load.set_upstream(sync_events_view)

sync_flat_view_bigquery_load.set_upstream(sync_flat_view)
Example 8
File: dag.py Project: gsroot/storm
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_indicator_info',
                  data_worker.worker.collect_some_daily_info),
    executor=CeleryExecutor(),
    dag=dag,
)

task_id = 'collect_daily_items_info_all'
items_all_task = SubDagOperator(
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_info',
                  data_worker.worker.collect_some_daily_info_all),
    executor=CeleryExecutor(),
    dag=dag,
)

task_id = 'collect_daily_items_indicator_info_all'
items_indicator_all_task = SubDagOperator(
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_indicator_info',
                  data_worker.worker.collect_some_daily_info_all),
    executor=CeleryExecutor(),
    dag=dag,
)

items_task.set_upstream(
    [itemcodes_task, item_naver_sectors_task, item_themes_task])
items_task.set_downstream(items_all_task)
items_all_task.set_downstream(items_indicator_task)
items_indicator_task.set_downstream(items_indicator_all_task)
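The subdag(...) factory and the data_worker module used above come from the gsroot/storm project and are not shown in this excerpt. A rough sketch of a factory with a matching call signature (the parent dag_id 'storm' and the PythonOperator task naming are assumptions, not taken from the source):

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def subdag(parent_task_id, job_name, python_callable):
    # The SubDAG's dag_id must be "<parent_dag_id>.<subdag_task_id>";
    # "storm" stands in for the real parent dag_id.
    sub = DAG('{}.{}'.format('storm', parent_task_id),
              start_date=datetime(2020, 1, 1),
              schedule_interval=None)
    PythonOperator(task_id=job_name,
                   python_callable=python_callable,
                   dag=sub)
    return sub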
Example 9
                            },
                            dag=dag)

deliver_res_op = """
cp -r {{ params.project_dir }}/{{ params.dag_id }}_results/* {{ params.project_dir }}/results
"""

deliver_res = BashOperator(task_id='Deliver_result',
                           bash_command=deliver_res_op,
                           params={
                               'project_dir': project_directory,
                               'dag_id': dag_id
                           },
                           dag=dag)

preprosessing.set_upstream(clean_up)
ms_concatenation.set_upstream(preprosessing)
clustering_or.set_upstream(ms_concatenation)
taxo_assignation.set_upstream(clustering_or)
biom_generation.set_upstream(clustering_or)
biom_generation.set_upstream(taxo_assignation)
tree_generation.set_upstream(clustering_or)
filter_weak_otus.set_upstream(biom_generation)
biom_conversion.set_upstream(filter_weak_otus)
raw_matrix_generation.set_upstream(biom_conversion)
matrix_normalization.set_upstream(raw_matrix_generation)
matrix_consolidation.set_upstream(matrix_normalization)
output_res.set_upstream(ms_concatenation)
output_res.set_upstream(tree_generation)
output_res.set_upstream(biom_conversion)
output_res.set_upstream(raw_matrix_generation)
    task_id="terminate_backfill_cluster",
    aws_conn_id='aws_default',
    execution_timeout=timedelta(minutes=10),
    job_flow_id=job_flow_id_template,
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag
)

job_flow_termination_sensor_task = MozEmrClusterEndSensor(
    task_id="cluster_termination_sensor",
    timeout=timedelta(hours=1).total_seconds(),
    job_flow_id=job_flow_id_template,
    dag=dag
)


cluster_start_sensor_task.set_upstream(create_job_flow_task)

for day in range(7):
    task_id = "main_summary_day_{}".format(day)
    subdag_task = SubDagOperator(
        task_id=task_id,
        subdag=main_summary_subdag_factory(dag, task_id, day),
        on_retry_callback=clear_subdag_callback,
        dag=dag
    )
    subdag_task.set_upstream(cluster_start_sensor_task)
    terminate_job_flow_task.set_upstream(subdag_task)

job_flow_termination_sensor_task.set_upstream(terminate_job_flow_task)
Example 11
terminate_job_flow_task = EmrTerminateJobFlowOperator(
    task_id="terminate_backfill_cluster",
    aws_conn_id='aws_default',
    execution_timeout=timedelta(minutes=10),
    job_flow_id=job_flow_id_template,
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)

job_flow_termination_sensor_task = MozEmrClusterEndSensor(
    task_id="cluster_termination_sensor",
    timeout=timedelta(hours=1).total_seconds(),
    job_flow_id=job_flow_id_template,
    dag=dag)

cluster_start_sensor_task.set_upstream(create_job_flow_task)

upstream = cluster_start_sensor_task
for day in range(7):
    task_id = "main_summary_day_{}".format(day)
    subdag_task = SubDagOperator(task_id=task_id,
                                 subdag=main_summary_subdag_factory(
                                     dag, task_id, day),
                                 on_retry_callback=clear_subdag_callback,
                                 dag=dag)
    subdag_task.set_upstream(upstream)
    terminate_job_flow_task.set_upstream(subdag_task)
    upstream = subdag_task

job_flow_termination_sensor_task.set_upstream(terminate_job_flow_task)
Example 12
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            taar_aws_access_key,
            "--aws_secret_access_key",
            taar_aws_secret_key,
        ],
        aws_conn_id=taar_aws_conn_id,
        gcp_conn_id=taar_gcpdataproc_conn_id,
    ),
    dag=dag,
)

main_summary_schema.set_upstream(main_summary)
main_summary_bigquery_load.set_upstream(main_summary)

sql_main_summary.set_upstream(copy_deduplicate_main_ping)
sql_main_summary_export.set_upstream(sql_main_summary)
clients_daily.set_upstream(sql_main_summary)
clients_daily_export.set_upstream(clients_daily)

addons.set_upstream(main_summary)
addons_bigquery_load.set_upstream(addons)
addon_aggregates.set_upstream(addons)
addon_aggregates_bigquery_load.set_upstream(addon_aggregates)

main_events.set_upstream(main_summary)
main_events_bigquery_load.set_upstream(main_events)

main_summary_experiments.set_upstream(main_summary)
Example 13
for model in SOURCE_DATASET_TABLE_NAMES:
    subdag_preprocess_op = SubDagOperator(
        task_id="subdag_preprocess_{}_task".format(model.replace(".", "_")),
        subdag=preprocess.preprocess_tasks(
            model, DAG_NAME,
            "subdag_preprocess_{}_task".format(model.replace(".", "_")),
            default_args, PROJECT_ID, BUCKET, DATA_DIR),
        dag=dag)

    subdag_training_op = SubDagOperator(
        task_id="subdag_training_{}_task".format(model.replace(".", "_")),
        subdag=training.training_tasks(
            model, DAG_NAME,
            "subdag_training_{}_task".format(model.replace(".", "_")),
            default_args, PROJECT_ID, BUCKET, DATA_DIR, MODEL_NAME,
            MODEL_VERSION, MODEL_LOCATION),
        dag=dag)

    subdag_deploy_op = SubDagOperator(
        task_id="subdag_deploy_{}_task".format(model.replace(".", "_")),
        subdag=deploy.deploy_tasks(
            model, DAG_NAME,
            "subdag_deploy_{}_task".format(model.replace(".", "_")),
            default_args, PROJECT_ID, MODEL_NAME, MODEL_VERSION, MODEL_LOCATION),
        dag=dag)

    # Build dependency graph, set_upstream dependencies for all tasks
    subdag_training_op.set_upstream(subdag_preprocess_op)
    subdag_deploy_op.set_upstream(subdag_training_op)
Example 14
    task_id=VALIDATE_SITE_DESIGN_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

drydock_build = SubDagOperator(
    subdag=deploy_site_drydock(PARENT_DAG_NAME,
                               DRYDOCK_BUILD_DAG_NAME,
                               args=default_args),
    task_id=DRYDOCK_BUILD_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

query_node_status = PlaceholderOperator(
    task_id='deployed_node_status',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

armada_build = PlaceholderOperator(
    task_id='armada_build',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

# DAG Wiring
concurrency_check.set_upstream(action_xcom)
preflight.set_upstream(concurrency_check)
get_design_version.set_upstream(preflight)
validate_site_design.set_upstream(get_design_version)
drydock_build.set_upstream(validate_site_design)
query_node_status.set_upstream(drydock_build)
armada_build.set_upstream(query_node_status)
            "pred-prefix": "bgbb/active_profiles/v1",
        },
        dev_options={
            "model-win": "30",
            "sample-ids": "[1]",
        },
        other={
            "MOZETL_GIT_PATH": "https://github.com/wcbeard/bgbb_airflow.git",
            "MOZETL_EXTERNAL_MODULE": "bgbb_airflow",
        },
    ),
    dag=dag
)

main_summary_schema.set_upstream(main_summary)
main_summary_bigquery_load.set_upstream(main_summary)

engagement_ratio.set_upstream(main_summary)

addons.set_upstream(main_summary)
addons_bigquery_load.set_upstream(addons)
addon_aggregates.set_upstream(addons)
addon_aggregates_bigquery_load.set_upstream(addon_aggregates)

main_events.set_upstream(main_summary)
main_events_bigquery_load.set_upstream(main_events)

main_summary_experiments.set_upstream(main_summary)
main_summary_experiments_bigquery_load.set_upstream(main_summary_experiments)

experiments_aggregates_import.set_upstream(main_summary_experiments)
Example 16
remove_updated_dim_grade = PythonOperator(
    task_id='remove_updated_dim_grade',
    python_callable=remove_updated_dim_grade,
    op_kwargs={'db': db},
    dag=dag)

insert_updated_dim_grade = PythonOperator(
    task_id='insert_updated_dim_grade',
    python_callable=insert_updated_dim_grade,
    op_kwargs={'db': db},
    dag=dag)

#########################################################################
########################## DAG flow #####################################

download_files.set_upstream(begin_task)
download_files.set_downstream(end_task)
end_task.set_downstream(initialize_transaction_table_dim_business)

initialize_transaction_table_dim_business.set_downstream(
    prepare_and_load_transaction_table_dim_business)
prepare_and_load_transaction_table_dim_business.set_downstream(
    update_transaction_table_dim_business_boro)
update_transaction_table_dim_business_boro.set_downstream(update_dim_business)
update_dim_business.set_downstream(remove_updated_dim_business)
remove_updated_dim_business.set_downstream(insert_updated_dim_business)

initialize_transaction_table_dim_grade.set_upstream(
    remove_updated_dim_business)
initialize_transaction_table_dim_grade.set_downstream(
    prepare_and_load_transaction_table_dim_grade)
Example 17
    "https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)

sync_bookmark_validation_total_per_day_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="sync_bookmark_validation_total_per_day_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        p2b_table_alias="sync_bmk_total_per_day_v1",
        dataset="sync/bmk_total_per_day",
        dataset_version="v1",
        date_submission_col="start_date",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_derived",
    ),
    task_id="sync_bookmark_validation_total_per_day_bigquery_load",
    dag=dag)

sync_bookmark_validation.set_upstream(sync_view)

sync_view_bigquery_load.set_upstream(sync_view)

sync_events_view_bigquery_load.set_upstream(sync_events_view)

sync_flat_view_bigquery_load.set_upstream(sync_flat_view)

sync_bookmark_validation_total_per_day_bigquery_load.set_upstream(
    sync_bookmark_validation)
Example 18
            "INT(sandbox_effective_content_process_level) AS sandbox_effective_content_process_level",
            "INT(scalar_parent_browser_engagement_max_concurrent_tab_count_max) AS scalar_parent_browser_engagement_max_concurrent_tab_count_max",
            "INT(scalar_parent_browser_engagement_max_concurrent_window_count_max) AS scalar_parent_browser_engagement_max_concurrent_window_count_max",
            "INT(scalar_parent_browser_engagement_unique_domains_count_max) AS scalar_parent_browser_engagement_unique_domains_count_max",
            "INT(timezone_offset) AS timezone_offset",
        ],
        parent_dag_name=dag.dag_id,
        dag_name="clients_daily_export",
        default_args=default_args,
        num_preemptible_workers=10),
    task_id="clients_daily_export",
    executor=get_default_executor(),
    dag=dag)

wait_for_clients_daily = ExternalTaskSensor(
    task_id="wait_for_clients_daily",
    external_dag_id="bqetl_main_summary",
    external_task_id="telemetry_derived__clients_daily__v6",
    execution_delta=timedelta(hours=1),
    dag=dag)

wait_for_main_summary = ExternalTaskSensor(
    task_id="wait_for_main_summary",
    external_dag_id="bqetl_main_summary",
    external_task_id="telemetry_derived__main_summary__v4",
    execution_delta=timedelta(hours=1),
    dag=dag)

main_summary_export.set_upstream(wait_for_main_summary)
clients_daily_export.set_upstream(wait_for_clients_daily)
Example 19
    depends_on_past=True,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    dag=dag)

experiments_daily_active_clients = bigquery_etl_query(
    task_id="experiments_daily_active_clients",
    destination_table="experiments_daily_active_clients_v1",
    dataset_id="telemetry_derived",
    project_id="moz-fx-data-shared-prod",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    dag=dag)

main_summary.set_upstream(copy_deduplicate_main_ping)
main_summary_export.set_upstream(main_summary)
clients_daily.set_upstream(main_summary)
clients_daily_export.set_upstream(clients_daily)

addons.set_upstream(copy_deduplicate_main_ping)
addon_aggregates.set_upstream(copy_deduplicate_main_ping)
addon_names.set_upstream(copy_deduplicate_main_ping)

clients_first_seen.set_upstream(clients_daily)
clients_last_seen.set_upstream(clients_daily)
exact_mau_by_dimensions.set_upstream(clients_last_seen)
exact_mau_by_client_count_dimensions.set_upstream(clients_last_seen)
smoot_usage_desktop_v2.set_upstream(clients_last_seen)
smoot_usage_desktop_compressed_v2.set_upstream(smoot_usage_desktop_v2)
simpleprophet_forecasts_desktop.set_upstream(exact_mau_by_dimensions)
devtools_panel_usage.set_upstream(clients_daily)
Example 20
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=[
        fact_queries['create_ft_complaints'],
        procon_queries['insert_ft_complaints']
    ])

subdag_task_id = 'data_quality_check'
data_quality_check = SubDagOperator(task_id=subdag_task_id,
                                    dag=dag,
                                    subdag=data_quality_check_subdag(
                                        DAG_NAME, subdag_task_id, start_date,
                                        REDSHIFT_CONN, STAGING_TABLE))

drop_procon_stage_table = PostgresOperator(
    task_id='drop_procon_stage_table',
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=procon_queries['drop_stage_table'])

end_operator = DummyOperator(task_id='finish_execution', dag=dag)

chain(start_operator, has_file_to_process, create_procon_stage_table,
      load_procon_stage_data, [
          load_dm_date_data, load_dm_region_data, load_dm_consumer_data,
          load_dm_company_data
      ], load_ft_complaints_data, data_quality_check, drop_procon_stage_table,
      end_operator)

delete_s3_key_files.set_upstream(load_procon_stage_data)
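The chain() helper used above wires a linear sequence of tasks in one call; a list element fans out from the previous stage and fans back in to the next one. A minimal, self-contained sketch, assuming an Airflow 1.10 release where chain() lives in airflow.utils.helpers and accepts lists (newer releases also expose it as airflow.models.baseoperator.chain):

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.helpers import chain

with DAG('chain_demo', start_date=datetime(2020, 1, 1), schedule_interval=None) as demo:
    start = DummyOperator(task_id='start')
    load_a = DummyOperator(task_id='load_a')
    load_b = DummyOperator(task_id='load_b')
    end = DummyOperator(task_id='end')

    # Equivalent to: start >> load_a >> end and start >> load_b >> end
    chain(start, [load_a, load_b], end)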
Example 21
    channel=slack_channel,
    username='******',
    text='Cluster has been *restarted!*\n'
         'It\'s all fine move forward with your ETLs and Crawlers!\n'
         'Message datetime: {{params.curr_date}}',
    params={'curr_date': str(datetime.now(pytz.timezone('America/Sao_Paulo')))},
    dag=dag
)

run_etl_crawler_cluster_up = SubDagOperator(
  subdag=sub_dag('check_cluster_slack', 'crawler_dag_cluster_up', dag.schedule_interval),
  task_id='crawler_dag_cluster_up',
  dag=dag,
)

run_etl_crawler_cluster_restarted = SubDagOperator(
  subdag=sub_dag('check_cluster_slack', 'crawler_dag_cluster_restarted', dag.schedule_interval),
  task_id='crawler_dag_cluster_restarted',
  dag=dag,
)
    
branch1.set_upstream(check_cluster)
send_slack_cluster_ok.set_upstream(branch1)
send_slack_cluster_start.set_upstream(branch1)
start_cluster.set_upstream(send_slack_cluster_start)
branch2.set_upstream(start_cluster)
send_slack_cluster_down.set_upstream(branch2)
send_slack_cluster_restarted_ok.set_upstream(branch2)
run_etl_crawler_cluster_up.set_upstream(send_slack_cluster_ok)
run_etl_crawler_cluster_restarted.set_upstream(send_slack_cluster_restarted_ok)
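branch1 and branch2 are not defined in this excerpt; given the wiring (each one feeds two mutually exclusive downstream tasks), they are presumably branching operators. A minimal sketch of a BranchPythonOperator that picks one of two downstream task_ids, assuming Airflow 1.10-style imports (the callable and the health check are hypothetical):

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator


def choose_path():
    # Return the task_id to follow; the other branch is skipped.
    cluster_is_up = True  # placeholder for a real cluster health check
    return 'send_slack_cluster_ok' if cluster_is_up else 'send_slack_cluster_start'


with DAG('branch_demo', start_date=datetime(2020, 1, 1), schedule_interval=None) as demo:
    branch1 = BranchPythonOperator(task_id='branch1',
                                   python_callable=choose_path)
    send_slack_cluster_ok = DummyOperator(task_id='send_slack_cluster_ok')
    send_slack_cluster_start = DummyOperator(task_id='send_slack_cluster_start')

    send_slack_cluster_ok.set_upstream(branch1)
    send_slack_cluster_start.set_upstream(branch1)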
Example 22
site_evacuation = PlaceholderOperator(
    task_id='site_evacuation',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

drydock_rebuild = PlaceholderOperator(
    task_id='drydock_rebuild',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

query_node_status = PlaceholderOperator(
    task_id='redeployed_node_status',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

armada_rebuild = PlaceholderOperator(
    task_id='armada_rebuild',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

# DAG Wiring
concurrency_check.set_upstream(action_xcom)
preflight.set_upstream(concurrency_check)
get_design_version.set_upstream(preflight)
validate_site_design.set_upstream(get_design_version)
site_evacuation.set_upstream(validate_site_design)
drydock_rebuild.set_upstream(site_evacuation)
query_node_status.set_upstream(drydock_rebuild)
armada_rebuild.set_upstream(query_node_status)
Example 23
        gcp_conn_id=taar_gcpdataproc_conn_id,
    ),
    dag=dag,
)

experiments_daily_active_clients = bigquery_etl_query(
    task_id="experiments_daily_active_clients",
    destination_table="experiments_daily_active_clients_v1",
    dataset_id="telemetry_derived",
    project_id="moz-fx-data-shared-prod",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    dag=dag)

main_summary.set_upstream(copy_deduplicate_main_ping)
main_summary_export.set_upstream(main_summary)
clients_daily.set_upstream(main_summary)
clients_daily_export.set_upstream(clients_daily)

addons.set_upstream(copy_deduplicate_main_ping)
addons_export.set_upstream(addons)
addon_aggregates.set_upstream(copy_deduplicate_main_ping)
addon_aggregates_export.set_upstream(addon_aggregates)

main_summary_experiments.set_upstream(main_summary)
main_summary_experiments.set_upstream(
    main_summary_experiments_get_experiment_list)
main_summary_experiments_export.set_upstream(main_summary_experiments)

taar_dynamo.set_upstream(main_summary_export)
taar_similarity.set_upstream(clients_daily_export)
Example 24
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    dag=dag)

experiments_daily_active_clients = bigquery_etl_query(
    task_id="experiments_daily_active_clients",
    destination_table="experiments_daily_active_clients_v1",
    dataset_id="telemetry_derived",
    project_id="moz-fx-data-shared-prod",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    dag=dag)


main_summary.set_upstream(copy_deduplicate_main_ping)
main_summary_export.set_upstream(main_summary)
clients_daily.set_upstream(main_summary)
clients_daily_export.set_upstream(clients_daily)

addons.set_upstream(copy_deduplicate_main_ping)
addon_aggregates.set_upstream(copy_deduplicate_main_ping)

clients_last_seen.set_upstream(clients_daily)
exact_mau_by_dimensions.set_upstream(clients_last_seen)
exact_mau_by_client_count_dimensions.set_upstream(clients_last_seen)
smoot_usage_desktop_v2.set_upstream(clients_last_seen)
smoot_usage_desktop_compressed_v2.set_upstream(smoot_usage_desktop_v2)
simpleprophet_forecasts_desktop.set_upstream(exact_mau_by_dimensions)
devtools_panel_usage.set_upstream(clients_daily)

bgbb_pred_dataproc.set_upstream(clients_daily)
Example 25
        PARENT_DAG_NAME, VALIDATE_SITE_DESIGN_DAG_NAME, args=default_args),
    task_id=VALIDATE_SITE_DESIGN_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

drydock_build = SubDagOperator(
    subdag=deploy_site_drydock(
        PARENT_DAG_NAME, DRYDOCK_BUILD_DAG_NAME, args=default_args),
    task_id=DRYDOCK_BUILD_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

query_node_status = PlaceholderOperator(
    task_id='deployed_node_status',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

armada_build = PlaceholderOperator(
    task_id='armada_build',
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

# DAG Wiring
concurrency_check.set_upstream(action_xcom)
preflight.set_upstream(concurrency_check)
get_design_version.set_upstream(preflight)
validate_site_design.set_upstream(get_design_version)
drydock_build.set_upstream(validate_site_design)
query_node_status.set_upstream(drydock_build)
armada_build.set_upstream(query_node_status)
Example 26
fda_linker_task = SubDagOperator(
    dag=dag,
    subdag=fda_dap(parent_dag_name='fda',
                   child_dag_name='linker',
                   start_date=dag.start_date,
                   schedule_interval=dag.schedule_interval),
    task_id='linker',
)

remove_unknown_documentcloud_docs_task = DockerOperator(
    task_id='remove_unknown_documentcloud_docs',
    dag=dag,
    image='opentrials/processors:latest',
    force_pull=True,
    api_version='1.23',
    environment={
        'WAREHOUSE_URL': helpers.get_postgres_uri('warehouse_db'),
        'DATABASE_URL': helpers.get_postgres_uri('api_db'),
        'EXPLORERDB_URL': helpers.get_postgres_uri('explorer_db'),
        'LOGGING_URL': Variable.get('LOGGING_URL'),
        'DOCUMENTCLOUD_USERNAME': Variable.get('DOCUMENTCLOUD_USERNAME'),
        'DOCUMENTCLOUD_PASSWORD': Variable.get('DOCUMENTCLOUD_PASSWORD'),
        'DOCUMENTCLOUD_PROJECT': Variable.get('DOCUMENTCLOUD_PROJECT'),
        'FERNET_KEY': os.environ['FERNET_KEY'],
    },
    command='make start remove_unknown_documentcloud_docs')

remove_unknown_documentcloud_docs_task.set_upstream(fda_linker_task)
fda_linker_task.set_upstream(fda_dap_task)
Example 27
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=[
        fact_queries['create_ft_complaints'],
        consumidorgovbr_queries['insert_ft_complaints']
    ])

subdag_task_id = 'data_quality_check'
data_quality_check = SubDagOperator(task_id=subdag_task_id,
                                    dag=dag,
                                    subdag=data_quality_check_subdag(
                                        DAG_NAME, subdag_task_id, start_date,
                                        REDSHIFT_CONN, STAGING_TABLE))

drop_consumidorgov_stage_table = PostgresOperator(
    task_id='drop_consumidorgov_stage_table',
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=[consumidorgovbr_queries['drop_stage_table']])

end_operator = DummyOperator(task_id='finish_execution', dag=dag)

chain(start_operator, has_file_to_process, convert_file_encoding,
      create_consumidorgov_stage_table, load_consumidorgovbr_stage_data, [
          load_dm_date_data, load_dm_region_data, load_dm_consumer_data,
          load_dm_company_data
      ], load_ft_complaints_data, data_quality_check,
      drop_consumidorgov_stage_table, end_operator)

delete_s3_key_files.set_upstream(load_consumidorgovbr_stage_data)
Example 28
        parent_dag_name=dag.dag_id,
        dag_name='fx_usage_report',
        default_args=default_args,
        cluster_name=cluster_name,
        service_account=
        '*****@*****.**',
        job_name="Fx_Usage_Report",
        uri=
        "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/fx_usage_report.sh",
        env={
            "date": DS_WEEKLY,
            "bucket": output_bucket,
            "PYTHONPATH": "/usr/lib/spark/python/lib/pyspark.zip",
            "deploy_environment": "prod",
            # These env variables are needed in addition to the s3a configs, since some code uses boto to list bucket objects
            "AWS_ACCESS_KEY_ID": aws_access_key,
            "AWS_SECRET_ACCESS_KEY": aws_secret_key
        },
        gcp_conn_id=gcp_conn_id,
        # This is used to set the s3a configs for read/write to s3 for non-boto calls
        aws_conn_id=aws_conn_id,
        num_workers=9,
        worker_machine_type='n1-standard-16',
        image_version='1.3',
        init_actions_uris=[
            'gs://moz-fx-data-prod-airflow-dataproc-artifacts/bootstrap/fx_usage_init.sh'
        ],
    ))

usage_report.set_upstream(wait_for_main_summary)
Example 29
    dag=dag)

churn_v2_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="churn_v2_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="churn",
        dataset_version="v2",
        date_submission_col="week_start",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="churn_v2_bigquery_load",
    dag=dag)

churn_to_csv = EMRSparkOperator(
    task_id="churn_to_csv",
    job_name="Convert Churn v2 to csv",
    execution_timeout=timedelta(hours=4),
    instance_count=1,
    env=mozetl_envvar("churn_to_csv", {"start_date": "{{ ds_nodash }}"}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)

churn_bigquery_load.set_upstream(churn)

churn_to_csv.set_upstream(churn_v2)
churn_v2_bigquery_load.set_upstream(churn_v2)
Example 30
validate_site_design = SubDagOperator(
    subdag=validate_site_design(PARENT_DAG_NAME,
                                VALIDATE_SITE_DESIGN_DAG_NAME,
                                args=default_args),
    task_id=VALIDATE_SITE_DESIGN_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

drydock_build = SubDagOperator(
    subdag=deploy_site_drydock(PARENT_DAG_NAME,
                               DRYDOCK_BUILD_DAG_NAME,
                               args=default_args),
    task_id=DRYDOCK_BUILD_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

armada_build = SubDagOperator(
    subdag=deploy_site_armada(PARENT_DAG_NAME,
                              ARMADA_BUILD_DAG_NAME,
                              args=default_args),
    task_id=ARMADA_BUILD_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

# DAG Wiring
concurrency_check.set_upstream(action_xcom)
get_design_version.set_upstream(concurrency_check)
validate_site_design.set_upstream(get_design_version)
drydock_build.set_upstream(validate_site_design)
armada_build.set_upstream(drydock_build)
Example 31
)

skill_tag = SubDagOperator(
    subdag=skill_tag_dag,
    task_id='skill_tag',
    priority_weight=1,
    queue='subdag',
    dag=dag,
)

tabular_upload = SubDagOperator(subdag=tabular_upload_dag,
                                task_id='tabular_upload',
                                priority_weight=1,
                                queue='subdag',
                                dag=dag)

partner_etl.set_upstream(partner_quarterly)
api_sync.set_upstream(title_count)
api_sync.set_upstream(onet_extract)
api_sync.set_upstream(normalizer_index)
normalizer_index.set_upstream(partner_etl)
normalizer_index.set_upstream(onet_extract)
geocode.set_upstream(partner_etl)
title_count.set_upstream(geocode)
soc_count.set_upstream(geocode)
job_label.set_upstream(partner_etl)
job_vectorize.set_upstream(partner_etl)
skill_tag.set_upstream(partner_etl)
skill_tag.set_upstream(onet_extract)
tabular_upload.set_upstream(title_count)