from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'yourself',
    'depends_on_past': False,
    'start_date': datetime(2019, 6, 24),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('ssh_tutorial', default_args=default_args,
          schedule_interval=timedelta(days=1))

# master_ip, num_executors and spark_filepath are defined elsewhere.
SPARK_CMD = """spark-submit --jars \
$SPARK_HOME/jars/aws-java-sdk-1.7.4.jar,\
$SPARK_HOME/jars/RedshiftJDBC42-no-awssdk-1.2.27.1051.jar,\
$SPARK_HOME/jars/spark-redshift_2.10-3.0.0-preview1.jar,\
$SPARK_HOME/jars/spark-avro_2.11-4.0.0.jar \
--master spark://{master_ip}:7077 \
--num-executors {num_executors} \
{spark_filepath}""".format(master_ip=master_ip,
                           num_executors=num_executors,
                           spark_filepath=spark_filepath)

t1 = SSHOperator(task_id='Download_data_to_s3',
                 ssh_conn_id="ssh_spark_master",
                 command="python3 /home/ubuntu/downloaddata.py",
                 dag=dag)

t2 = SSHOperator(task_id='Read_transform_data_in_spark_and_store_data_in_redshift',
                 ssh_conn_id="ssh_spark_master",
                 command=SPARK_CMD,
                 dag=dag)

# Download the data before running the Spark job.
t2.set_upstream(t1)
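# The operator above resolves its credentials from the 'ssh_spark_master'
# connection stored in Airflow. If you prefer to keep connection details in
# code, SSHOperator also accepts a prebuilt SSHHook (the style used in some
# of the later snippets). A minimal sketch; the host, user and key path
# below are placeholders, not values from the original DAG:
from airflow.contrib.hooks.ssh_hook import SSHHook

spark_master_hook = SSHHook(
    remote_host='10.0.0.5',                 # placeholder Spark master address
    username='ubuntu',                      # placeholder login user
    key_file='/home/airflow/.ssh/id_rsa',   # placeholder private key
    port=22,
)

# Usage: SSHOperator(task_id=..., ssh_hook=spark_master_hook,
#                    command=SPARK_CMD, dag=dag)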
    max_active_runs=1,
    default_args=default_args)

## TO DO

ssh_hook = SSHHook(ssh_conn_id='cx1_ssh_conn')

check_hpc_queue = SSHOperator(
    task_id='check_hpc_queue',
    dag=dag,
    ssh_hook=ssh_hook,
    do_xcom_push=True,
    command='source /etc/bashrc;qstat')

run_demultiplexing_pipeline = SSHOperator(
    task_id='run_demultiplexing_pipeline',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/run_demultiplexing_pipeline.sh ')

run_demultiplexing_pipeline.set_upstream(check_hpc_queue)

run_primary_analysis_pipeline = SSHOperator(
    task_id='run_primary_analysis_pipeline',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/run_primary_analysis_pipeline.sh ')

run_primary_analysis_pipeline.set_upstream(run_demultiplexing_pipeline)
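# check_hpc_queue sets do_xcom_push=True, so the stdout of qstat is pushed
# to XCom. A minimal sketch of a downstream task that reads it back; the
# inspect_hpc_queue task is an illustration, not part of the original
# pipeline. Note that when enable_xcom_pickling is off, Airflow 1.10's
# SSHOperator pushes the output base64-encoded.
from base64 import b64decode
from airflow.operators.python_operator import PythonOperator


def inspect_queue(**context):
    raw = context['ti'].xcom_pull(task_ids='check_hpc_queue')
    print(b64decode(raw).decode('utf-8'))


inspect_hpc_queue = PythonOperator(
    task_id='inspect_hpc_queue',
    python_callable=inspect_queue,
    provide_context=True,
    dag=dag)

inspect_hpc_queue.set_upstream(check_hpc_queue)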
t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7) }}"
    echo "{{ params.my_param }}"
{% endfor %}
"""

t3 = BashOperator(
    task_id="templated",
    bash_command=templated_command,
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)

day_file_check = SSHOperator(
    task_id='day_file_check',
    ssh_conn_id="ssh_ec2",
    command="pwd",
    do_xcom_push=True,
    dag=dag
)

t2.set_upstream(t1)
t3.set_upstream(t1)
day_file_check.set_upstream(t1)
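# For a run with execution date 2019-06-24, each of the five iterations of
# templated_command renders to the lines below (dates are illustrative):
#
#     echo "2019-06-24"             # {{ ds }} -- the execution date
#     echo "2019-07-01"             # {{ macros.ds_add(ds, 7) }} -- ds + 7 days
#     echo "Parameter I passed in"  # {{ params.my_param }}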
command4 = "echo '{0}' > /usr/share/zabgres/repl_state".format( query_func(ds, query4)) t1 = SSHOperator(task_id="sshtask1", command=command1, ssh_hook=sshHook, remote_host="10.127.33.41", dag=dag) t2 = SSHOperator(task_id="sshtask2", command=command2, ssh_hook=sshHook, remote_host="10.127.33.41", dag=dag) t3 = SSHOperator(task_id="sshtask3", command=command3, ssh_hook=sshHook, remote_host="10.127.33.41", dag=dag) t4 = SSHOperator(task_id="sshtask4", command=command4, ssh_hook=sshHook, remote_host="10.127.33.41", dag=dag) t2.set_upstream(t1) t3.set_upstream(t2) t4.set_upstream(t3)
    dag=dag)

t3 >> t2
t3 << t1

if e == 'LOAD':
    script_loc = etl_task_type_df['SCRIPT_LOC'][1]
    script_name = etl_task_type_df['SCRIPT_NAME'][1]
    complete_script_path = script_loc + script_name
    t3 = SSHOperator(
        ssh_conn_id=ssh_conn_id,
        task_id=str(table_name) + '_' + str(e),
        command='spark-submit --num-executors ' + str(num_executor) +
                ' --executor-cores ' + str(executor_cores) +
                ' --executor-memory ' + executor_mem +
                ' --driver-memory ' + driver_mem +
                ' --driver-cores ' + str(driver_cores) +
                ' ' + complete_script_path + ' ' + table_name,
        dag=dag)
    TML_dependencies = [t for t in dependencies if t.startswith('TML_')]
    if len(TML_dependencies) == 0:
        t3.set_upstream(t2)
        continue
    else:
        for d in TML_dependencies:
            t4 = SSHOperator(
                ssh_conn_id=ssh_conn_id,
                task_id=d + '_' + str(e),
                command='spark-submit --num-executors ' + str(num_executor) +
                        ' --executor-cores ' + str(executor_cores) +
                        ' --executor-memory ' + executor_mem +
                        ' --driver-memory ' + driver_mem +
                        ' --driver-cores ' + str(driver_cores) +
                        ' ' + complete_script_path + ' ' + table_name,
                dag=dag)
            t3.set_upstream(t4)
            t4.set_upstream(t2)
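# The spark-submit string above is assembled twice with identical flags.
# A helper (a sketch, not in the original code; assumes num_executor,
# executor_cores, executor_mem, driver_mem and driver_cores are in scope)
# keeps both operators in sync:
def build_spark_submit(script_path, table):
    """Assemble the spark-submit command shared by t3 and t4."""
    return ('spark-submit'
            ' --num-executors ' + str(num_executor) +
            ' --executor-cores ' + str(executor_cores) +
            ' --executor-memory ' + executor_mem +
            ' --driver-memory ' + driver_mem +
            ' --driver-cores ' + str(driver_cores) +
            ' ' + script_path + ' ' + table)

# Usage: command=build_spark_submit(complete_script_path, table_name)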
    ssh_hook=sshHook,
    remote_host="10.127.33.41",
    dag=dag
)

t9 = SSHOperator(
    task_id="sshtask9",
    command=command9,
    ssh_hook=sshHook,
    remote_host="10.127.33.41",
    dag=dag
)

t10 = SSHOperator(
    task_id="sshtask10",
    command=command10,
    ssh_hook=sshHook,
    remote_host="10.127.33.41",
    dag=dag
)

t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
t6.set_upstream(t5)
t7.set_upstream(t6)
t8.set_upstream(t7)
t9.set_upstream(t8)
t10.set_upstream(t9)
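# The nine set_upstream calls above collapse to a single chain() call
# (airflow.utils.helpers in Airflow 1.10; the helper moved to
# airflow.models.baseoperator in Airflow 2):
from airflow.utils.helpers import chain

chain(t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)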
    # 'email_on_failure': False,
    # 'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=20),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2018, 12, 31),
}

# Create the DAG object (workflow); it runs every 2 minutes.
dag = DAG('airflow_demo_ssh', default_args=default_args,
          schedule_interval=timedelta(minutes=2))

# Task to print the date.
t1 = BashOperator(task_id='Start', bash_command='date', dag=dag)

# Task to execute a remote command (via the SSH connection).
t2 = SSHOperator(
    ssh_conn_id='ssh_daza1',
    task_id="remote_task1",
    command="touch zzz.txt",
    # do_xcom_push=True,
    dag=dag)

# Wire up the DAG by declaring upstream tasks.
t2.set_upstream(t1)
#ZEND
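# Note: schedule_interval also accepts a cron expression. '*/2 * * * *' is
# close to timedelta(minutes=2) but not identical: a timedelta schedules
# runs relative to start_date, while cron pins runs to wall-clock minute
# boundaries. A sketch with a hypothetical dag_id:
dag_cron = DAG('airflow_demo_ssh_cron', default_args=default_args,
               schedule_interval='*/2 * * * *')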
ssh_hook = SSHHook(ssh_conn_id='cx1_ssh_conn')
orwell_ssh_hook = SSHHook(ssh_conn_id='orwell_ssh_conn')

update_exp_metadata = SSHOperator(
    task_id='update_exp_metadata',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/update_exp_metadata.sh ')

find_new_exp_for_analysis = SSHOperator(
    task_id='find_new_exp_for_analysis',
    dag=dag,
    ssh_hook=orwell_ssh_hook,
    command='bash /home/igf/igf_code/IGF-cron-scripts/orwell/find_new_exp_for_analysis.sh ')

find_new_exp_for_analysis.set_upstream(update_exp_metadata)

seed_analysis_pipeline = SSHOperator(
    task_id='seed_analysis_pipeline',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/seed_analysis_pipeline.sh ')

seed_analysis_pipeline.set_upstream(find_new_exp_for_analysis)
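# These IGF snippets repeat dag/ssh_hook/command on every task. A small
# factory (a sketch, not part of the original repo; igf_ssh_task is a
# hypothetical name) trims the boilerplate:
def igf_ssh_task(task_id, hook, script):
    """Build an SSHOperator that runs one cron-wrapper script via bash."""
    return SSHOperator(
        task_id=task_id,
        dag=dag,
        ssh_hook=hook,
        command='bash ' + script)

# Usage:
# update_exp_metadata = igf_ssh_task(
#     'update_exp_metadata', ssh_hook,
#     '/rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/update_exp_metadata.sh')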
switch_off_project_barcode = SSHOperator(
    task_id='switch_off_project_barcode',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /home/igf/igf_code/IGF-cron-scripts/orwell/switch_off_project_barcode_check.sh ')

change_samplesheet_for_run = SSHOperator(
    task_id='change_samplesheet_for_run',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /home/igf/igf_code/IGF-cron-scripts/orwell/change_samplesheet_for_seqrun.sh ')

change_samplesheet_for_run.set_upstream(switch_off_project_barcode)

restart_seqrun_processing = SSHOperator(
    task_id='restart_seqrun_processing',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /home/igf/igf_code/IGF-cron-scripts/orwell/restart_seqrun_processing.sh ')

restart_seqrun_processing.set_upstream(change_samplesheet_for_run)

register_project_metadata = SSHOperator(
    task_id='register_project_metadata',
    dag=dag,
    ssh_hook=ssh_hook,
config_generation_task = SSHOperator(
    ssh_conn_id='spark_master_conn',
    task_id='config_generation',
    command='cd ~/InnSight/batch_processing; ./s3_urls_generation.sh all',
    dag=dag)

data_cleaning_task = SSHOperator(
    ssh_conn_id='spark_master_conn',
    task_id='data_cleaning',
    command='source ~/.profile; '
            'cd ~/InnSight/batch_processing; '
            '~/.local/bin/spark-submit '
            '--executor-memory 4G --master spark://ip-10-0-0-11.us-west-2.compute.internal:7077 '
            'data_cleaning_to_parquet_batch.py all',
    dag=dag)

stats_aggregation_task = SSHOperator(
    ssh_conn_id='spark_master_conn',
    task_id='stats_aggregation',
    command='source ~/.profile; '
            'cd ~/InnSight/batch_processing; '
            '~/.local/bin/spark-submit '
            '--executor-memory 4G --master spark://ip-10-0-0-11.us-west-2.compute.internal:7077 '
            'metrics_calculation_batch.py all',
    dag=dag)

# data_fetch_task is defined earlier in this DAG.
config_generation_task.set_upstream(data_fetch_task)
data_cleaning_task.set_upstream(config_generation_task)
stats_aggregation_task.set_upstream(data_cleaning_task)
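# SSHOperator runs each command in a fresh non-interactive shell, so
# ~/.profile is not loaded automatically; that is why every command above
# sources it before calling spark-submit. A hypothetical helper for the
# same pattern:
def with_profile(cmd):
    """Prefix a remote command so the user's profile is loaded first."""
    return 'source ~/.profile; ' + cmd

# Usage: command=with_profile('cd ~/InnSight/batch_processing; ...')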
check_orwell_disk = SSHOperator(
    task_id='check_orwell_disk',
    dag=dag,
    ssh_hook=orwell_ssh_hook,
    command='bash /home/igf/igf_code/IGF-cron-scripts/orwell/orwell_disk_usage.sh ')

check_eliot_disk = SSHOperator(
    task_id='check_eliot_disk',
    dag=dag,
    ssh_hook=eliot_ssh_hook,
    command='bash /home/igf/git_repos/IGF-cron-scripts/eliot/eliot_disk_usage.sh ')

check_eliot_disk.set_upstream(check_orwell_disk)

check_woolf_disk = SSHOperator(
    task_id='check_woolf_disk',
    dag=dag,
    ssh_hook=woolf_ssh_hook,
    command='bash /home/igf/git_repos/IGF-cron-scripts/woolf/woolf_disk_usage.sh ')

check_woolf_disk.set_upstream(check_eliot_disk)

check_igf_lims_disk = SSHOperator(
    task_id='check_igf_lims_disk',
    dag=dag,
    ssh_hook=igf_lims_ssh_hook,
    command=