from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator  # on Airflow 2.x: from airflow.providers.ssh.operators.ssh import SSHOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'yourself',
    'depends_on_past': False,
    'start_date': datetime(2019, 6, 24),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
}
dag = DAG('ssh_tutorial', default_args=default_args, schedule_interval=timedelta(days=1))

SPARK_CMD = """spark-submit --jars \
	$SPARK_HOME/jars/aws-java-sdk-1.7.4.jar,\
        $SPARK_HOME/jars/RedshiftJDBC42-no-awssdk-1.2.27.1051.jar,\
        $SPARK_HOME/jars/spark-redshift_2.10-3.0.0-preview1.jar,\
        $SPARK_HOME/jars/spark-avro_2.11-4.0.0.jar \
        --master spark://{master_ip}:7077 \
        --num-executors {num_executors} \
        {spark_filepath}"""\
        .format(master_ip=master_ip, num_executors=num_executors, spark_filepath=spark_filepath)


t1 = SSHOperator(
    task_id='Download_data_to_s3',
    ssh_conn_id="ssh_spark_master",
    command="python3 /home/ubuntu/downloaddata.py",
    dag=dag)

t2 = SSHOperator(
    task_id='Read_transform_data_in_spark_and_store_data_in_redshift',
    ssh_conn_id="ssh_spark_master",
    command=SPARK_CMD,
    dag=dag)

t2.set_upstream(t1)  # download the raw data to S3 before the Spark/Redshift load
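# The two tasks above assume an Airflow connection named 'ssh_spark_master'
# already exists. A hedged sketch of one way to supply it: Airflow also resolves
# connections from environment variables named AIRFLOW_CONN_<CONN_ID>, e.g.
#
#   export AIRFLOW_CONN_SSH_SPARK_MASTER='ssh://ubuntu@<spark-master-host>:22'
#
# (user and host above are placeholders, not taken from the original).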
Example #2
          max_active_runs=1,
          default_args=default_args)  ## TO DO

ssh_hook = SSHHook(ssh_conn_id='cx1_ssh_conn')

check_hpc_queue=SSHOperator(
    task_id='check_hpc_queue',
    dag=dag,
    ssh_hook=ssh_hook,
    do_xcom_push=True,
    command='source /etc/bashrc;qstat'
)
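
# A hedged sketch (not part of the original DAG) showing how the qstat output
# pushed by 'check_hpc_queue' (do_xcom_push=True) could be consumed downstream.
# Depending on the Airflow version and the XCom pickling setting, SSHOperator
# may store the captured stdout base64-encoded, hence the defensive decode.
from base64 import b64decode
from airflow.operators.python_operator import PythonOperator  # Airflow 1.x import path

def _log_hpc_queue(**context):
    raw = context['ti'].xcom_pull(task_ids='check_hpc_queue')
    try:
        print(b64decode(raw).decode('utf-8'))
    except Exception:
        print(raw)

log_hpc_queue = PythonOperator(
    task_id='log_hpc_queue',      # illustrative task, not in the original pipeline
    python_callable=_log_hpc_queue,
    provide_context=True,         # required on Airflow 1.x; handled automatically on 2.x
    dag=dag,
)

log_hpc_queue.set_upstream(check_hpc_queue)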

run_demultiplexing_pipeline = SSHOperator(
    task_id='run_demultiplexing_pipeline',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/run_demultiplexing_pipeline.sh '
)

run_demultiplexing_pipeline.set_upstream(check_hpc_queue)

run_primary_analysis_pipeline = SSHOperator(
    task_id='run_primary_analysis_pipeline',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/run_primary_analysis_pipeline.sh '
)

run_primary_analysis_pipeline.set_upstream(run_demultiplexing_pipeline)
Example #3
t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

templated_command = """
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

t3 = BashOperator(
    task_id="templated",
    bash_command=templated_command,
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)
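
# SSHOperator templates its 'command' field with Jinja just like BashOperator,
# so remote commands can use macros such as {{ ds }}. A minimal, hedged sketch
# (the task id and remote path are illustrative, not from the original):
templated_ssh = SSHOperator(
    task_id="templated_ssh_example",
    ssh_conn_id="ssh_ec2",
    command='echo "run date: {{ ds }}" >> /tmp/airflow_run_dates.txt',
    dag=dag,
)

templated_ssh.set_upstream(t1)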

day_file_check = SSHOperator(
    task_id='day_file_check',
    ssh_conn_id="ssh_ec2",
    command="pwd",
    do_xcom_push=True,
    dag=dag
)

t2.set_upstream(t1)
t3.set_upstream(t1)
day_file_check.set_upstream(t1)
Example #4
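# This excerpt assumes an SSHHook named sshHook plus command1..command3 and a
# query_func() helper defined above the truncation. A hedged sketch of the hook
# (the connection id is illustrative):
#
#   sshHook = SSHHook(ssh_conn_id='zabgres_ssh')
#
# Note that passing remote_host to SSHOperator overrides whatever host is stored
# in that connection, which is how every task below targets 10.127.33.41.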
command4 = "echo '{0}' > /usr/share/zabgres/repl_state".format(
    query_func(ds, query4))

t1 = SSHOperator(task_id="sshtask1",
                 command=command1,
                 ssh_hook=sshHook,
                 remote_host="10.127.33.41",
                 dag=dag)

t2 = SSHOperator(task_id="sshtask2",
                 command=command2,
                 ssh_hook=sshHook,
                 remote_host="10.127.33.41",
                 dag=dag)

t3 = SSHOperator(task_id="sshtask3",
                 command=command3,
                 ssh_hook=sshHook,
                 remote_host="10.127.33.41",
                 dag=dag)

t4 = SSHOperator(task_id="sshtask4",
                 command=command4,
                 ssh_hook=sshHook,
                 remote_host="10.127.33.41",
                 dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t3)
Example #5
					dag=dag)
					
			t3 >> t2
			t3 << t1
		
		if e == 'LOAD':
			script_loc = etl_task_type_df['SCRIPT_LOC'][1]
			script_name = etl_task_type_df['SCRIPT_NAME'][1]
			complete_script_path = script_loc + script_name

			spark_submit_cmd = (
				'spark-submit'
				' --num-executors ' + str(num_executor) +
				' --executor-cores ' + str(executor_cores) +
				' --executor-memory ' + executor_mem +
				' --driver-memory ' + driver_mem +
				' --driver-cores ' + str(driver_cores) +
				' ' + complete_script_path + ' ' + table_name
			)

			t3 = SSHOperator(
					ssh_conn_id=ssh_conn_id,
					task_id=str(table_name) + '_' + str(e),
					command=spark_submit_cmd,
					dag=dag)

			TML_dependencies = [t for t in dependencies if t.startswith('TML_')]

			if len(TML_dependencies) == 0:
				t3.set_upstream(t2)
				continue
			else:
				for d in TML_dependencies:
					t4 = SSHOperator(
							ssh_conn_id=ssh_conn_id,
							task_id=d + '_' + str(e),
							command=spark_submit_cmd,
							dag=dag)
					t3.set_upstream(t4)
					t4.set_upstream(t2)
Example #6
t8 = SSHOperator(
    task_id="sshtask8",
    command=command8,
    ssh_hook=sshHook,
    remote_host="10.127.33.41",
    dag=dag
)

t9 = SSHOperator(
    task_id="sshtask9",
    command=command9,
    ssh_hook=sshHook,
    remote_host="10.127.33.41",
    dag=dag
 )

t10 = SSHOperator(
    task_id="sshtask10",
    command=command10,
    ssh_hook=sshHook,
    remote_host="10.127.33.41",
    dag=dag
 )

t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
t6.set_upstream(t5)
t7.set_upstream(t6)
t8.set_upstream(t7)
t9.set_upstream(t8)
t10.set_upstream(t9)
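
# The same linear chain can be written in one call with Airflow's chain() helper
# (airflow.utils.helpers.chain on 1.10.x, airflow.models.baseoperator.chain on 2.x).
# Shown commented out because the set_upstream() calls above already register
# these dependencies:
#
#   chain(t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)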
Example #7
    #'email_on_failure': False,
    #'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=20),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2018, 12, 31),
}

# Create DAG object (workflow) that runs every 2 minutes.
dag = DAG('airflow_demo_ssh',
          default_args=default_args,
          schedule_interval=timedelta(minutes=2))

# Task to print date
t1 = BashOperator(task_id='Start', bash_command='date', dag=dag)

# Task to execute a remote command over the 'ssh_daza1' SSH connection.
t2 = SSHOperator(
    ssh_conn_id='ssh_daza1',
    task_id="remote_task1",
    command="touch zzz.txt",
    #do_xcom_push=True,
    dag=dag)

# Define the task ordering by setting upstream dependencies.
t2.set_upstream(t1)

#ZEND
ssh_hook = SSHHook(ssh_conn_id='cx1_ssh_conn')
orwell_ssh_hook = SSHHook(ssh_conn_id='orwell_ssh_conn')

update_exp_metadata = SSHOperator(
    task_id='update_exp_metadata',
    dag=dag,
    ssh_hook=ssh_hook,
    command=
    'bash /rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/update_exp_metadata.sh '
)

find_new_exp_for_analysis = SSHOperator(
    task_id='find_new_exp_for_analysis',
    dag=dag,
    ssh_hook=orwell_ssh_hook,
    command=
    'bash /home/igf/igf_code/IGF-cron-scripts/orwell/find_new_exp_for_analysis.sh '
)

find_new_exp_for_analysis.set_upstream(update_exp_metadata)

seed_analysis_pipeline = SSHOperator(
    task_id='seed_analysis_pipeline',
    dag=dag,
    ssh_hook=ssh_hook,
    command=
    'bash /rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/seed_analysis_pipeline.sh '
)

seed_analysis_pipeline.set_upstream(find_new_exp_for_analysis)

switch_off_project_barcode = SSHOperator(
    task_id='switch_off_project_barcode',
    dag=dag,
    ssh_hook=ssh_hook,
    command=
    'bash /home/igf/igf_code/IGF-cron-scripts/orwell/switch_off_project_barcode_check.sh '
)

change_samplesheet_for_run = SSHOperator(
    task_id='change_samplesheet_for_run',
    dag=dag,
    ssh_hook=ssh_hook,
    command=
    'bash /home/igf/igf_code/IGF-cron-scripts/orwell/change_samplesheet_for_seqrun.sh '
)

change_samplesheet_for_run.set_upstream(switch_off_project_barcode)

restart_seqrun_processing = SSHOperator(
    task_id='restart_seqrun_processing',
    dag=dag,
    ssh_hook=ssh_hook,
    command=
    'bash /home/igf/igf_code/IGF-cron-scripts/orwell/restart_seqrun_processing.sh '
)

restart_seqrun_processing.set_upstream(change_samplesheet_for_run)

register_project_metadata = SSHOperator(
    task_id='register_project_metadata',
    dag=dag,
    ssh_hook=ssh_hook,
config_generation_task = SSHOperator(
    ssh_conn_id='spark_master_conn',
    task_id='config_generation',
    command='cd ~/InnSight/batch_processing; ./s3_urls_generation.sh all',
    dag=dag)
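
# The two Spark jobs below prefix their commands with 'source ~/.profile' because
# SSHOperator runs each command in a non-interactive shell, so PATH and other
# environment settings from the login profile are not loaded automatically.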

data_cleaning_task = SSHOperator(
    ssh_conn_id='spark_master_conn',
    task_id='data_cleaning',
    command='source ~/.profile; '
    'cd ~/InnSight/batch_processing; '
    '~/.local/bin/spark-submit '
    '--executor-memory 4G --master spark://ip-10-0-0-11.us-west-2.compute.internal:7077 '
    'data_cleaning_to_parquet_batch.py all',
    dag=dag)

stats_aggregation_task = SSHOperator(
    ssh_conn_id='spark_master_conn',
    task_id='stats_aggregation',
    command='source ~/.profile; '
    'cd ~/InnSight/batch_processing; '
    '~/.local/bin/spark-submit '
    '--executor-memory 4G --master spark://ip-10-0-0-11.us-west-2.compute.internal:7077 '
    'metrics_calculation_batch.py all',
    dag=dag)

config_generation_task.set_upstream(data_fetch_task)
data_cleaning_task.set_upstream(config_generation_task)
stats_aggregation_task.set_upstream(data_cleaning_task)
Example #11
check_orwell_disk = SSHOperator(
    task_id='check_orwell_disk',
    dag=dag,
    ssh_hook=orwell_ssh_hook,
    command=
    'bash /home/igf/igf_code/IGF-cron-scripts/orwell/orwell_disk_usage.sh ')

check_eliot_disk = SSHOperator(
    task_id='check_eliot_disk',
    dag=dag,
    ssh_hook=eliot_ssh_hook,
    command=
    'bash /home/igf/git_repos/IGF-cron-scripts/eliot/eliot_disk_usage.sh ')

check_eliot_disk.set_upstream(check_orwell_disk)

check_woolf_disk = SSHOperator(
    task_id='check_woolf_disk',
    dag=dag,
    ssh_hook=woolf_ssh_hook,
    command=
    'bash /home/igf/git_repos/IGF-cron-scripts/woolf/woolf_disk_usage.sh ')

check_woolf_disk.set_upstream(check_eliot_disk)

check_igf_lims_disk = SSHOperator(
    task_id='check_igf_lims_disk',
    dag=dag,
    ssh_hook=igf_lims_ssh_hook,
    command=