def make_dag(name, default_args):
    """Build a daily-scheduled DAG and wire its three tasks into a linear chain.

    Args:
        name: Identifier passed to ``DAG`` as its first positional argument.
        default_args: Passed through unchanged as ``DAG``'s ``default_args``.

    Returns:
        The constructed ``DAG`` object, after the ``Task.add_*`` helpers have
        been called on it and the dependencies between stages have been set.
    """
    pipeline = DAG(name, schedule_interval='@daily', default_args=default_args)

    # Attach the tasks. NOTE(review): these helpers presumably register tasks
    # whose ids match the stage names below -- confirm against Task.
    Task.add_run_cralwer(pipeline)
    Task.add_run_model_generator(pipeline)
    Task.add_stop_start_flask_api(pipeline)

    # Chain each stage to the one that follows it:
    # run_cralwer -> run_model_generator -> stop_start_flask_api
    stages = ('run_cralwer', 'run_model_generator', 'stop_start_flask_api')
    for earlier, later in zip(stages, stages[1:]):
        pipeline.set_dependency(earlier, later)

    return pipeline
bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='reports_raw_cleanup', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='autoclaved_tarlz4_s3_sync', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='autoclaved_jsonl_s3_sync', bash_command='shovel_jump.sh', dag=dag) dag.set_dependency('reports_raw_sensor', 'canning') dag.set_dependency('reports_raw_sensor', 'tar_reports_raw') dag.set_dependency('canning', 'tar_reports_raw') dag.set_dependency('tar_reports_raw', 'reports_tgz_s3_sync') dag.set_dependency('reports_tgz_s3_sync', 'reports_tgz_s3_ls') # reports_raw_cleanup -> reports_tgz_cleanup is NOT a dependency as reports_raw_cleanup uses only index file dag.set_dependency('reports_tgz_s3_sync', 'reports_tgz_cleanup') # can't cleanup unless synced dag.set_dependency('reports_tgz_s3_ls', 'reports_tgz_cleanup') # data dependency dag.set_dependency('canning', 'canned_s3_sync')
dag=dag) BashOperator(pool='datacollector_disk_io', task_id='reports_raw_cleanup', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='sanitised_s3_ls', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='sanitised_check', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='sanitised_cleanup', bash_command='shovel_jump.sh', dag=dag) dag.set_dependency('canning', 'autoclaving') dag.set_dependency('autoclaving', 'simhash_text') dag.set_dependency('autoclaving', 'meta_pg') dag.set_dependency('reports_raw_s3_ls', 'reports_raw_cleanup') dag.set_dependency('canning', 'reports_raw_cleanup') dag.set_dependency('autoclaving', 'sanitised_check') dag.set_dependency('autoclaving', 'sanitised_cleanup') dag.set_dependency('sanitised_s3_ls', 'sanitised_cleanup') dag.set_dependency('sanitised_check', 'sanitised_cleanup')
# Every task below is a BashOperator with the same pool and the same command;
# only the task_id differs. Operators are created in the listed order.
for _task_id in (
    'canning',
    'tar_reports_raw',
    'reports_tgz_s3_sync',
    'reports_tgz_s3_ls',
    'reports_tgz_cleanup',
    'canned_s3_sync',
    'canned_s3_ls',
    'canned_cleanup',
    'autoclaving',
    'meta_pg',
    'meta_wal_flush',
    'reports_raw_cleanup',
    'autoclaved_tarlz4_s3_sync',
    'autoclaved_jsonl_s3_sync',
):
    BashOperator(pool='datacollector_disk_io', task_id=_task_id, bash_command='shovel_jump.sh', dag=dag)

# Task-id pairs handed to dag.set_dependency(first, second), in this order.
# NOTE(review): 'reports_raw_sensor' is not created in this excerpt; it is
# presumably defined earlier in the file -- confirm.
for _first, _second in (
    ('reports_raw_sensor', 'canning'),
    ('reports_raw_sensor', 'tar_reports_raw'),
    ('canning', 'tar_reports_raw'),
    ('tar_reports_raw', 'reports_tgz_s3_sync'),
    ('reports_tgz_s3_sync', 'reports_tgz_s3_ls'),
    # reports_raw_cleanup -> reports_tgz_cleanup is NOT a dependency as
    # reports_raw_cleanup uses only index file
    ('reports_tgz_s3_sync', 'reports_tgz_cleanup'),  # can't cleanup unless synced
    ('reports_tgz_s3_ls', 'reports_tgz_cleanup'),  # data dependency
    ('canning', 'canned_s3_sync'),
    ('canned_s3_sync', 'canned_s3_ls'),
):
    dag.set_dependency(_first, _second)