def test_simple(self):
    task = SSHExecuteOperator(
        task_id="test",
        bash_command="echo airflow",
        ssh_hook=self.hook,
        dag=self.dag,
    )
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_simple(self):
    task = SSHExecuteOperator(
        task_id="test",
        bash_command="echo airflow",
        ssh_hook=self.hook,
        dag=self.dag,
    )
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, force=True)
def test_with_env(self):
    task = SSHExecuteOperator(
        task_id="test",
        bash_command="echo $AIRFLOW_HOME",
        ssh_hook=self.hook,
        env={"AIRFLOW_test": "test"},
        dag=self.dag,
    )
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, force=True)
def test_simple(self, temp_file):
    temp_file.return_value.__enter__ = lambda x: 'filepath'
    task = SSHExecuteOperator(
        task_id="test",
        bash_command="echo airflow",
        ssh_hook=self.hook,
        dag=self.dag,
    )
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_with_env(self):
    test_env = os.environ.copy()
    test_env['AIRFLOW_test'] = "test"
    task = SSHExecuteOperator(
        task_id="test",
        bash_command="echo $AIRFLOW_HOME",
        ssh_hook=self.hook,
        env=test_env,
        dag=self.dag,
    )
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_with_env(self, temp_file):
    temp_file.return_value.__enter__ = lambda x: 'filepath'
    test_env = os.environ.copy()
    test_env['AIRFLOW_test'] = "test"
    task = SSHExecuteOperator(
        task_id="test",
        bash_command="echo $AIRFLOW_HOME",
        ssh_hook=self.hook,
        env=test_env,  # pass the full environment dict, not a single value
        dag=self.dag,
    )
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
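The test snippets above reference self.hook, self.dag, and DEFAULT_DATE without showing where they come from. A minimal sketch of the scaffolding they assume is below; the connection id, DAG id, and date are illustrative assumptions, not taken from the original suite, and the temp_file argument in the mocked variants would come from a @mock.patch decorator on the test method.

import datetime
import os
import unittest
from unittest import mock  # only needed for the temp_file-mocking variants

from airflow import DAG
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.contrib.operators.ssh_execute_operator import SSHExecuteOperator

DEFAULT_DATE = datetime.datetime(2017, 1, 1)  # illustrative date


class SSHExecuteOperatorTest(unittest.TestCase):
    def setUp(self):
        # Hypothetical connection id; the real suite may configure it differently.
        self.hook = SSHHook(conn_id='ssh_default')
        self.dag = DAG(
            'test_ssh_execute_operator',
            default_args={'owner': 'airflow', 'start_date': DEFAULT_DATE},
        )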
def get_sub_ssh_cmds_dag(parent_dag, task_id, args):
    ssh_dag = DAG(
        '%s.%s' % (parent_dag.dag_id, task_id),
        default_args=args,
        start_date=args['start_date'],
        schedule_interval=parent_dag.schedule_interval,
    )
    start = DummyOperator(task_id='ssh_start', dag=ssh_dag)
    end = DummyOperator(task_id='ssh_end', dag=ssh_dag)

    # Generate the tasks to submit dynamically, depending on the number of
    # Hive scripts that need to be run.
    response = s3_client.list_objects_v2(
        Bucket=wk_conf.get('s3_bucket'),
        Prefix=wk_conf.get('s3_hive_script_location'),
    )
    hive_scripts = [c.get('Key') for c in response.get('Contents')]
    if len(hive_scripts) > 0:
        ssh_emr_hook = SSHHook(conn_id='ssh_emr_default')
        ssh_tasks = [
            SSHExecuteOperator(
                task_id=str(key.replace(':', '_').replace('/', '_')),
                ssh_hook=ssh_emr_hook,
                bash_command='hive -f "s3://' + wk_conf.get('s3_bucket') + '/' + str(key) + '"',
                dag=ssh_dag)
            for key in hive_scripts if key.endswith('hql')
        ]
        start.set_downstream(ssh_tasks)
        end.set_upstream(ssh_tasks)
    # If no Hive scripts are generated, a short-circuit step at the beginning
    # of the main DAG skips this sub-DAG.
    return ssh_dag
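A factory like this is typically attached to the parent DAG through a SubDagOperator. A short usage sketch follows; the task id 'run_hive_scripts', the parent dag object, and the default_args variable are illustrative assumptions.

from airflow.operators.subdag_operator import SubDagOperator

# 'run_hive_scripts' is an assumed task id for this illustration; default_args
# would be the same dict passed to the parent DAG.
run_hive_scripts = SubDagOperator(
    task_id='run_hive_scripts',
    subdag=get_sub_ssh_cmds_dag(dag, 'run_hive_scripts', default_args),
    dag=dag,
)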
def create_sub_dag(parent_dag, report_name):
    sub_dag = DAG(
        dag_id=parent_dag.dag_id + '.hive_' + report_name,
        default_args=parent_dag.default_args,
    )
    # Use the SSH operator to execute a Hive script on our always-on ETL cluster.
    hive_task = SSHExecuteOperator(
        task_id='hive_transformation',
        ssh_hook=SSHHook(SSH_HOOK),
        bash_command=parse_hive_command(report_name),
        dag=sub_dag,
    )
    return SubDagOperator(
        task_id='hive_' + report_name,
        subdag=sub_dag,
        default_args=parent_dag.default_args,
        dag=parent_dag,
    )
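From the parent DAG this factory would be called once per report. A minimal usage sketch, assuming illustrative report names and a parent dag object:

# Illustrative report names; the real list would come from configuration.
reports = ['daily_sales', 'weekly_inventory']
hive_report_tasks = [create_sub_dag(dag, report_name) for report_name in reports]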
def get_lastday():
    lastday = (today + datetime.timedelta(days=-1)).strftime('%Y%m%d')
    return lastday


def get_last_update_date():
    try:
        result = os.popen(check_partition_cmd, "r").readline()
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


spark = SSHExecuteOperator(
    task_id="itemsearch_spark",
    bash_command='(bash {path}/itemsearch_parse.sh {lastday})'.format(
        path=path, lastday=get_lastday()),
    ssh_hook=sshHook,
    dag=dag)

hive_distinct = SSHExecuteOperator(
    task_id="itemsearch_hive_dis",
    bash_command='(bash {path}/distinct.sh)'.format(path=path),
    ssh_hook=sshHook,
    dag=dag)

hive = SSHExecuteOperator(
    task_id="itemsearch_hive_import",
    bash_command='(bash {path}/itemsearch_import.sh {lastday} {last_update_day})'.format(
        path=path, lastday=get_lastday(), last_update_day=get_last_update_date()),
    ssh_hook=sshHook,
    return 'update' if '1' in result else 'pass'


def get_last_update_date():
    try:
        result = os.popen(check_partition_cmd, "r").readline()
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


spark = SSHExecuteOperator(
    task_id="shopitem_b_parse",
    bash_command='(bash {path}/shopitem_b_parse.sh {lastday} {latest_partition})'.format(
        path=path, lastday=get_lastday(), latest_partition=get_last_update_date()),
    ssh_hook=sshHook,
    dag=dag)

hive = SSHExecuteOperator(
    task_id="shopitem_b_import",
    bash_command='(bash {path}/shopitem_b_import.sh {lastday})'.format(
        path=path, lastday=get_lastday()),
    ssh_hook=sshHook,
    dag=dag)

email_update = EmailOperator(
    task_id='shopitem_b_updated_email',
    to=['*****@*****.**'],
    subject='ec shopitem b workflow',
    html_content='[ ec shopitem b data updated!!! ]',
    dag=dag)
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.ssh_execute_operator import SSHExecuteOperator
from airflow.contrib.hooks import SSHHook
from datetime import datetime, timedelta

sshHook = SSHHook(conn_id='server_ssh')

default_args = {
    'owner': 'airflow',
    'schedule_interval': '@once',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG('bash_ssh', default_args=default_args)

t1 = SSHExecuteOperator(
    task_id="task1",
    bash_command='echo hello >> /tmp/hello.txt',
    ssh_hook=sshHook,
    dag=dag)
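Further remote commands can reuse the same hook and be chained with set_downstream. A short sketch; the second command is an illustrative assumption, not part of the original DAG.

# Hypothetical follow-up command, reusing the same SSH hook and DAG.
t2 = SSHExecuteOperator(
    task_id="task2",
    bash_command='cat /tmp/hello.txt',
    ssh_hook=sshHook,
    dag=dag)

t1.set_downstream(t2)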
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


def get_last_update_c_date():
    try:
        result = os.popen(check_partition_c_cmd, "r").readline()
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


b = SSHExecuteOperator(
    task_id="update_b",
    bash_command='(bash {path}/record_bc_feed/record_b_feed.sh {latest_partition})'.format(
        path=path, latest_partition=get_last_update_b_date()),
    ssh_hook=sshHook,
    dag=dag)

c = SSHExecuteOperator(
    task_id="update_c",
    bash_command='(bash {path}/record_bc_feed/record_c_feed.sh {latest_partition})'.format(
        path=path, latest_partition=get_last_update_c_date()),
    ssh_hook=sshHook,
    dag=dag)

chain(b, c)
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=4),
}

dag = DAG('data_pipeline', default_args=default_args, schedule_interval=timedelta(days=1))

connection = "hadoop_connection"
sshHook = SSHHook(conn_id=connection)

task0 = SSHExecuteOperator(
    task_id="sync_jars",
    bash_command=sync_jars,
    ssh_hook=sshHook,
    dag=dag)

task1 = SSHExecuteOperator(
    task_id="import_all_tables",
    bash_command=import_all_tables,
    ssh_hook=sshHook,
    dag=dag)

task2 = SSHExecuteOperator(
    task_id="spark_process_dimension",
    bash_command=spark_process_dimension,
    ssh_hook=sshHook,
    dag=dag)

task3 = SSHExecuteOperator(
    task_id="spark_process_facts",
    bash_command=spark_process_facts,
    ssh_hook=sshHook,
    else:
        return 'update' if '1' in result else 'pass'


def get_last_update_date():
    try:
        result = os.popen(check_partition_cmd, "r").readline()
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


spark = SSHExecuteOperator(
    task_id="comment_parse",
    bash_command='(bash {path}/xianyu_itemcomment_parse.sh {lastday})'.format(
        path=path, lastday=get_lastday()),
    ssh_hook=sshHook,
    dag=dag)

hive = SSHExecuteOperator(
    task_id="comment_import",
    bash_command='(bash {path}/xianyu_itemcomment_import.sh {lastday} {last_update_date})'.format(
        path=path, lastday=get_lastday(), last_update_date=get_last_update_date()),
    ssh_hook=sshHook,
    dag=dag)

email_update = EmailOperator(
    task_id='xianyu_itemcomment_update_email',
    to=['*****@*****.**'],
""" Distribute_labels_to_nodes = BashOperator( task_id='Distribute_labels', bash_command=templated_command_distribute_labels, dag=dag) templated_command_Node_1 = """ sh ~/Deep_Images_Hub/src/producer/auto_upload.sh ~/sample_labels_validation_aa "validation" """ Upload_images_From_Node_1 = SSHExecuteOperator( task_id="Upload_images_From_Node_1", bash_command=templated_command_Node_1, dag=dag) sshHook_node2 = SSHHook(conn_id='Node_2') templated_command_Node_2 = """ sh ~/Deep_Images_Hub/src/producer/auto_upload.sh ~/sample_labels_validation_ab "validation" """ Upload_images_From_Node_2 = SSHExecuteOperator( task_id="Upload_images_From_Node_2", bash_command=templated_command_Node_2, ssh_hook=sshHook_node2,
check_partition_cmd = "ssh -p 22 wrt@cs220 bash {path}/get_latest_partition.sh".format(path=path) def get_last_update_date(): try: result = os.popen(check_partition_cmd, "r").readline() print result except: raise Exception("ssh operation failed!") else: return str(eval(result)) spark_sale = SSHExecuteOperator( task_id="itemsold_sale_parse", bash_command='(bash {path}/ec_itemsold_sale_parse.sh {last_2_days} {lastday} {last_update})' .format(path=path, last_2_days=get_date()[1], lastday=get_date()[0], last_update=get_last_update_date()), ssh_hook=sshHook, dag=dag) hive_sale = SSHExecuteOperator( task_id="itemsold_sale_import", bash_command='(bash {path}/ec_itemsold_sale_import.sh {lastday})' .format(path=path, lastday=get_date()[0]), ssh_hook=sshHook, dag=dag) spark_daysale = SSHExecuteOperator( task_id="itemsold_daysale_parse", bash_command='(bash {path}/ec_itemsold_daysale_parse.sh {last_2_days} {lastday})' .format(path=path, last_2_days=get_date()[1], lastday=get_date()[0]), ssh_hook=sshHook,
        return str(eval(result))


def shopinfo_last_update():
    try:
        result = os.popen(check_partition_shopinfo, "r").readline()
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


spark_item = SSHExecuteOperator(
    task_id="ec_iteminfo_spark",
    bash_command='(bash {path}/ec_iteminfo_parse.sh)'.format(path=path),
    ssh_hook=sshHook,
    dag=dag)

hive_item = SSHExecuteOperator(
    task_id="ec_iteminfo_hive",
    bash_command='(bash {path}/ec_iteminfo_import.sh {last_par})'.format(
        path=path, last_par=iteminfo_last_update()),
    ssh_hook=sshHook,
    dag=dag)

spark_shop = SSHExecuteOperator(
    task_id="ec_shopinfo_spark",
    bash_command='(bash {path}/ec_shopinfo_parse.sh)'.format(path=path),
    ssh_hook=sshHook,
    dag=dag)
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=4),
}

dag = DAG('qubole_data_pipeline', default_args=default_args, schedule_interval=timedelta(days=1))

connection = "qubole_connection"
sshHook = SSHHook(conn_id=connection)

task_import_nation = SSHExecuteOperator(
    task_id="import_nation",
    bash_command=import_nation,
    ssh_hook=sshHook,
    dag=dag)

task_import_region = SSHExecuteOperator(
    task_id="import_region",
    bash_command=import_region,
    ssh_hook=sshHook,
    dag=dag)

task_import_customer = SSHExecuteOperator(
    task_id="import_customer",
    bash_command=import_customer,
    ssh_hook=sshHook,
    dag=dag)

task_import_supplier = SSHExecuteOperator(
    task_id="import_supplier",
    bash_command=import_supplier,
    ssh_hook=sshHook,
    dag=dag)

task_import_orders = SSHExecuteOperator(
    task_id="import_orders",
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=4),
}

dag = DAG('data_pipeline', default_args=default_args, schedule_interval=timedelta(days=1))

connection = "hadoop_connection"
sshHook = SSHHook(conn_id=connection)

task0 = SSHExecuteOperator(
    task_id="sync_jars",
    bash_command=sync_jars,
    ssh_hook=sshHook,
    dag=dag)

task1 = SSHExecuteOperator(
    task_id="import_nation",
    bash_command=import_nation,
    ssh_hook=sshHook,
    dag=dag)

task2 = SSHExecuteOperator(
    task_id="import_region",
    bash_command=import_region,
    ssh_hook=sshHook,
    dag=dag)

task3 = SSHExecuteOperator(
    task_id="spark_dimNation",
    bash_command=spark_dimNation,
    ssh_hook=sshHook,