def create_sub_dag(parent_dag, report_name):
    sub_dag = DAG(dag_id=parent_dag.dag_id + '.hive_' + report_name,
                  default_args=parent_dag.default_args)

    # Use the SSH operator to execute a Hive script on our always-on ETL cluster
    hive_task = SSHExecuteOperator(task_id='hive_transformation',
                                   ssh_hook=SSHHook(SSH_HOOK),
                                   bash_command=parse_hive_command(report_name),
                                   dag=sub_dag)

    return SubDagOperator(task_id='hive_' + report_name,
                          subdag=sub_dag,
                          default_args=parent_dag.default_args,
                          dag=parent_dag)
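# NOTE: not part of the original snippet. A minimal sketch of how the helper
# above might be wired into a parent DAG. The dag_id, schedule and report names
# are placeholders; create_sub_dag(), SSH_HOOK and parse_hive_command() are
# assumed to be defined in the module shown above.
from datetime import datetime, timedelta
from airflow import DAG

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2019, 1, 1),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

parent_dag = DAG('hive_reports',               # placeholder dag_id
                 default_args=default_args,
                 schedule_interval='@daily')

# One SubDagOperator per report; each wraps a single SSH-driven Hive task.
for report_name in ['daily_sales', 'daily_users']:   # placeholder report names
    create_sub_dag(parent_dag, report_name)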
    bash_command=templated_command_distribute_labels,
    dag=dag)

templated_command_Node_1 = """
    sh ~/Deep_Images_Hub/src/producer/auto_upload.sh ~/sample_labels_validation_aa "validation"
"""

Upload_images_From_Node_1 = SSHExecuteOperator(
    task_id="Upload_images_From_Node_1",
    bash_command=templated_command_Node_1,
    dag=dag)

sshHook_node2 = SSHHook(conn_id='Node_2')

templated_command_Node_2 = """
    sh ~/Deep_Images_Hub/src/producer/auto_upload.sh ~/sample_labels_validation_ab "validation"
"""

Upload_images_From_Node_2 = SSHExecuteOperator(
    task_id="Upload_images_From_Node_2",
    bash_command=templated_command_Node_2,
    ssh_hook=sshHook_node2,
    dag=dag)

sshHook_node3 = SSHHook(conn_id='Node_3')
###
# define the workflow
###
with DAG(
        'tem1_pre-processing',
        description="Pre-processing of TEM1 CryoEM data",
        schedule_interval=None,
        default_args=args,
        catchup=False,
        max_active_runs=6,
        concurrency=32,
        dagrun_timeout=3600,
) as dag:

    # hook to container host for lsf commands
    hook = SSHHook(conn_id=args['ssh_connection_id'])
    # lsftest_hook = SSHHook(conn_id='ssh_lsf_test')

    ###
    # parse the epu xml metadata file
    ###
    parameter_file = FileGlobSensor(
        task_id='parameter_file',
        filepath="{{ dag_run.conf['directory'] }}/**/{{ dag_run.conf['base'] }}.xml",
        recursive=True,
        poke_interval=1,
    )

    parse_parameters = FeiEpuOperator(
        task_id='parse_parameters',
        filepath="{{ ti.xcom_pull( task_ids='parameter_file' )[0] }}",
    )

    # upload to the logbook
    logbook_parameters = PythonOperator(task_id='logbook_parameters',
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'domain_checker', default_args=default_args, schedule_interval=timedelta(1))

# define the 'ssh01' connection under Admin > Connections in the Airflow UI
ssh_hook_01 = SSHHook(conn_id='ssh01')

t0 = BashOperator(
    task_id='transfer_files_to_remote',
    bash_command=AIRFLOW_HOME + '/dags/transfer_files.sh',
    params={},
    retries=1,
    dag=dag)

t1 = SSHExecuteOperator(
    task_id='verify_transfer_to_remote',
    ssh_hook=ssh_hook_01,
    bash_command=AIRFLOW_HOME + '/dags/echo_date.sh',
    params={},
    retries=1,
    dag=dag)
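# NOTE: not part of the original snippet. The ordering between the two tasks is
# not shown above, so this is an assumed dependency: transfer the files first,
# then verify over SSH.
t0 >> t1
# Equivalent older-style syntax:
# t1.set_upstream(t0)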
    day = '0' + str(day)

# Date values pinned for testing
year = '2019'
month = '07'
day = '10'

now = "{}-{}-{}".format(year, month, day)
'''--------------------------------------------
'''

ip_address = '172.31.20.78'
ip_domain = 'ip-172-31-20-78.ap-northeast-2.compute.internal'
project_home = '~/Project/02_Data_Batch_Processing'

# Define the SSHHook used for the SSH connection
sshHook = SSHHook(ssh_conn_id='emr_cluster_conn',
                  remote_host='{}'.format(ip_domain),
                  username='******',
                  password='******')

# Define the DAG
dag = DAG('DAG_data_refine',
          schedule_interval='*/30 * * * *',
          start_date=datetime(2019, 8, 1),
          catchup=False)

# Task that creates the EMR cluster
t1 = BashOperator(
    task_id="emr_cluster_create",
    xcom_push=True,
    bash_command="""bash {}/shell_script/emr_cluster_create.sh {}""".format(project_home, ip_address),
    dag=dag
)

# Preparation for running commands remotely on the EMR cluster
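# NOTE: not part of the original snippet. A hedged sketch of the remote step
# hinted at by the comment above, assuming Airflow 1.10's contrib SSHOperator
# is available. The task_id and the data_refine.sh script path are hypothetical;
# sshHook, project_home, now, t1 and dag come from the snippet above.
from airflow.contrib.operators.ssh_operator import SSHOperator

t2 = SSHOperator(
    task_id='emr_remote_refine',                     # hypothetical task name
    ssh_hook=sshHook,                                # hook defined above
    command='bash {}/shell_script/data_refine.sh {}'.format(project_home, now),  # assumed script
    dag=dag,
)

t1 >> t2  # create the cluster first, then run the remote command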
from datetime import timedelta

import airflow
from airflow import DAG
from airflow.contrib.operators import SSHOperator
from airflow.contrib.hooks import SSHHook
import os
import sys

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id='process_gdelt_files',
    default_args=args,
    schedule_interval='*/15 * * * *',
    dagrun_timeout=timedelta(minutes=60),
)

command = 'source ~/.bashrc && source ~/.profile && sh ~/Documents/Insight/InsightDataEngineer/data-processing/run_spark.sh '

sshHook = SSHHook(ssh_conn_id='spark_server')

task = SSHOperator(
    task_id="run_gdelt_process",
    command=command,
    ssh_hook=sshHook,
    dag=dag)
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.ssh_execute_operator import SSHExecuteOperator
from airflow.contrib.hooks import SSHHook
from datetime import datetime, timedelta

sshHook = SSHHook(conn_id='server_ssh')

default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2)
}

# schedule_interval belongs on the DAG itself, not in default_args
dag = DAG('bash_ssh', default_args=default_args, schedule_interval='@once')

t1 = SSHExecuteOperator(task_id="task1",
                        bash_command='echo hello >> /tmp/hello.txt',
                        ssh_hook=sshHook,
                        dag=dag)
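# NOTE: not part of the original snippet. A hedged follow-up showing how a
# second SSHExecuteOperator could reuse the same hook and be chained after t1.
# The task_id and command are illustrative only.
t2 = SSHExecuteOperator(task_id="task2",
                        bash_command='cat /tmp/hello.txt',
                        ssh_hook=sshHook,
                        dag=dag)

t1 >> t2  # write the file on the remote host, then read it back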
    'provide_context': True,
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'clix_dashboard_backend_dag', default_args=default_args, schedule_interval=timedelta(hours=5))

# --------------------------------------------------------------------------------
# Each state is synced independently. We have four states, and the syncthing data
# folders corresponding to those states are synced through sync_school_data.
# --------------------------------------------------------------------------------
sshHook = SSHHook(conn_id='<YOUR CONNECTION ID FROM THE UI>')

for each_state in clix_config.states:
    src = clix_config.remote_src + each_state
    dst = clix_config.local_dst + each_state

    # task_ids must be unique, so include the state in the id
    sync_state_data = SSHExecuteOperator(
        task_id="sync_state_data_ssh_" + each_state,
        bash_command="rsync -avzhe ssh {0}@{1}:{2} {3}".format(user, ip, src, dst),
        ssh_hook=sshHook,
        dag=dag)

    sync_state_data = PythonOperator(
        task_id='sync_state_data_' + each_state,
        python_callable=sync_school_data.rsync_data_ssh,
        op_kwargs={'state': each_state, 'src': src, 'dst': dst},
default_args = {
    'owner': "avkabay1",
    'depends_on_past': False,
    'start_date': datetime(2020, 4, 15),
    'catchup': False,
    'max_active_runs': 1,
    'concurrency': 1,
    'wait_for_downstream': False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=1)
}

sshHook = SSHHook(ssh_conn_id="avkabay1@test-en-0002")

dag = DAG('test_gobike_tripdata',
          schedule_interval='0 8 * * 6',
          default_args=default_args)

# Obtain a Kerberos ticket on the remote host before running the ETL scripts
kinit = SSHOperator(task_id='kinit',
                    command="echo '{1}' | kinit {0}".format(user, password),
                    ssh_hook=sshHook,
                    dag=dag)

python_path = '/data/venv/python2/dq-venv/bin/python2.7'
script_dir = '/home/avkabay1/spdevices/etl/test_gobike_tripdata/'
task_raw_script = 'test_raw_gobike_tripdata.py'
task_ods_script = 'test_ods_gobike_tripdata.py'
task_dm_station_script = 'test_dm_station_gobike_tripdata.py'
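# NOTE: not part of the original snippet. A hedged sketch of how the script
# variables above might be turned into tasks. The task_ids and ordering are
# assumptions; only python_path, script_dir, the script names, sshHook, kinit
# and dag come from the snippet above.
task_raw = SSHOperator(task_id='load_raw',
                       command='{} {}{}'.format(python_path, script_dir, task_raw_script),
                       ssh_hook=sshHook,
                       dag=dag)

task_ods = SSHOperator(task_id='load_ods',
                       command='{} {}{}'.format(python_path, script_dir, task_ods_script),
                       ssh_hook=sshHook,
                       dag=dag)

task_dm_station = SSHOperator(task_id='load_dm_station',
                              command='{} {}{}'.format(python_path, script_dir, task_dm_station_script),
                              ssh_hook=sshHook,
                              dag=dag)

kinit >> task_raw >> task_ods >> task_dm_station  # assumed raw -> ods -> dm order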
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'adhoc':False,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}

dag = DAG(dag_id='ssh_airflow_test',
          default_args=args,
          schedule_interval='*/1 * * * *',
          catchup=False)

bash_command = """python /Users/zaferdurkut/test/dizin1/ssh_test.py"""

# Build the hook from environment variables instead of an Airflow connection
ssh_hook = SSHHook(username=os.getenv('SSH_USER'),
                   password=os.getenv('SSH_PASSWORD'),
                   remote_host=os.getenv('SSH_HOST'))

ssh_task = SSHOperator(task_id='ssh_airflow_test_task',
                       ssh_hook=ssh_hook,
                       command=bash_command,
                       dag=dag)

ssh_task
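# NOTE: not part of the original snippet. An alternative sketch that reads the
# SSH credentials from an Airflow connection instead of environment variables.
# It assumes a connection named 'ssh_default' exists in the UI and that this
# Airflow version's SSHOperator accepts ssh_conn_id; bash_command and dag come
# from the snippet above.
ssh_task_via_conn = SSHOperator(task_id='ssh_airflow_test_task_conn',
                                ssh_conn_id='ssh_default',   # hypothetical connection id
                                command=bash_command,
                                dag=dag)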