def test_simple(self):
     task = SSHExecuteOperator(
         task_id="test",
         bash_command="echo airflow",
         ssh_hook=self.hook,
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #2
 def test_simple(self):
     task = SSHExecuteOperator(
         task_id="test",
         bash_command="echo airflow",
         ssh_hook=self.hook,
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, force=True)
 def test_with_env(self):
     task = SSHExecuteOperator(
         task_id="test",
         bash_command="echo $AIRFLOW_HOME",
         ssh_hook=self.hook,
         env={"AIRFLOW_test": "test"},
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, force=True)
 def test_simple(self, temp_file):
     temp_file.return_value.__enter__ = lambda x: 'filepath'
     task = SSHExecuteOperator(
         task_id="test",
         bash_command="echo airflow",
         ssh_hook=self.hook,
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #5
 def test_with_env(self):
     task = SSHExecuteOperator(
         task_id="test",
         bash_command="echo $AIRFLOW_HOME",
         ssh_hook=self.hook,
         env={"AIRFLOW_test": "test"},
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, force=True)
 def test_with_env(self):
     test_env = os.environ.copy()
     test_env['AIRFLOW_test'] = "test"
     task = SSHExecuteOperator(
         task_id="test",
         bash_command="echo $AIRFLOW_HOME",
         ssh_hook=self.hook,
         env=test_env,
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
 def test_simple(self, temp_file):
     temp_file.return_value.__enter__ = lambda x: 'filepath'
     task = SSHExecuteOperator(
         task_id="test",
         bash_command="echo airflow",
         ssh_hook=self.hook,
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE,
              end_date=DEFAULT_DATE,
              ignore_ti_state=True)
 def test_with_env(self, temp_file):
     temp_file.return_value.__enter__ = lambda x: 'filepath'
     test_env = os.environ.copy()
     test_env['AIRFLOW_test'] = "test"
     task = SSHExecuteOperator(
         task_id="test",
         bash_command="echo $AIRFLOW_HOME",
         ssh_hook=self.hook,
         env=test_env,
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #9
 def test_with_env(self):
     test_env = os.environ.copy()
     test_env['AIRFLOW_test'] = "test"
     task = SSHExecuteOperator(
         task_id="test",
         bash_command="echo $AIRFLOW_HOME",
         ssh_hook=self.hook,
         env=test_env,
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE,
              end_date=DEFAULT_DATE,
              ignore_ti_state=True)
 def test_with_env(self, temp_file):
     temp_file.return_value.__enter__ = lambda x: 'filepath'
     test_env = os.environ.copy()
     test_env['AIRFLOW_test'] = "test"
     task = SSHExecuteOperator(
         task_id="test",
         bash_command="echo $AIRFLOW_HOME",
         ssh_hook=self.hook,
         env=test_env,
         dag=self.dag,
     )
     task.run(start_date=DEFAULT_DATE,
              end_date=DEFAULT_DATE,
              ignore_ti_state=True)
Example #11
def get_sub_ssh_cmds_dag(parent_dag, task_id, args):
    ssh_dag = DAG(
        '%s.%s' % (parent_dag.dag_id, task_id),
        default_args=args,
        start_date=args['start_date'],
        schedule_interval=parent_dag.schedule_interval,
    )
    start = DummyOperator(
        task_id='ssh_start',
        dag=ssh_dag)
    end = DummyOperator(
        task_id='ssh_end',
        dag=ssh_dag)
    # Generate the SSH tasks dynamically, one per Hive script that needs to be run
    response = s3_client.list_objects_v2(Bucket=wk_conf.get('s3_bucket'),
                                         Prefix=wk_conf.get('s3_hive_script_location'))
    hive_scripts = [c.get('Key') for c in response.get('Contents', [])]
    ssh_tasks = []
    if hive_scripts:
        ssh_emr_hook = SSHHook(conn_id='ssh_emr_default')
        ssh_tasks = [SSHExecuteOperator(
            task_id=str(key.replace(':', '_').replace('/', '_')),
            ssh_hook=ssh_emr_hook,
            bash_command='hive -f "s3://' + wk_conf.get('s3_bucket') + '/' + str(key) + '"',
            dag=ssh_dag) for key in hive_scripts if key.endswith('hql')]
    start.set_downstream(ssh_tasks)
    end.set_upstream(ssh_tasks)
    # If no Hive scripts were generated, a short-circuit step at the beginning of the
    # main DAG skips this sub-DAG (sketched below)
    return ssh_dag
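The comment above refers to a short-circuit step in the main DAG. A minimal sketch of that guard, assuming the same s3_client and wk_conf objects plus a parent_dag defined in the main DAG file (these names are assumptions, not part of the snippet above):

# Sketch of the short-circuit guard (assumed names: parent_dag, wk_conf, s3_client):
# skip the SSH sub-DAG entirely when no Hive scripts are found in S3.
from airflow.operators.python_operator import ShortCircuitOperator
from airflow.operators.subdag_operator import SubDagOperator


def hive_scripts_exist():
    # Returning False short-circuits every downstream task, including the sub-DAG.
    response = s3_client.list_objects_v2(Bucket=wk_conf.get('s3_bucket'),
                                         Prefix=wk_conf.get('s3_hive_script_location'))
    return any(c.get('Key').endswith('hql') for c in response.get('Contents', []))


check_hive_scripts = ShortCircuitOperator(
    task_id='check_hive_scripts',
    python_callable=hive_scripts_exist,
    dag=parent_dag)

run_hive_scripts = SubDagOperator(
    task_id='run_hive_scripts',
    subdag=get_sub_ssh_cmds_dag(parent_dag, 'run_hive_scripts', parent_dag.default_args),
    dag=parent_dag)

check_hive_scripts.set_downstream(run_hive_scripts)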
def create_sub_dag(parent_dag, report_name):
    sub_dag = DAG(dag_id=parent_dag.dag_id + '.hive_' + report_name, default_args=parent_dag.default_args)

    # Use the SSH operator to execute a Hive script on our always-on ETL cluster
    hive_task = SSHExecuteOperator(task_id='hive_transformation',
                            ssh_hook=SSHHook(SSH_HOOK),
                            bash_command=parse_hive_command(report_name),
                            dag=sub_dag)

    return SubDagOperator(task_id='hive_' + report_name,
                          subdag=sub_dag,
                          default_args=parent_dag.default_args,
                          dag=parent_dag)
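create_sub_dag returns a SubDagOperator already attached to the parent DAG, so a caller only needs to wire it to its upstream tasks. A hypothetical usage (the report list, dag, and start task below are assumptions for illustration):

# Hypothetical usage of create_sub_dag: one Hive sub-DAG per report name.
from airflow.operators.dummy_operator import DummyOperator

report_names = ['daily_sales', 'inventory']      # assumed report list
start = DummyOperator(task_id='start', dag=dag)  # assumed parent DAG and entry task

for report_name in report_names:
    start.set_downstream(create_sub_dag(dag, report_name))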
Example #13
    lastday = (today + datetime.timedelta(days=-1)).strftime('%Y%m%d')
    return lastday


def get_last_update_date():
    try:
        result = os.popen(check_partition_cmd, "r").readline()
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


spark = SSHExecuteOperator(
    task_id="itemsearch_spark",
    bash_command='(bash {path}/itemsearch_parse.sh {lastday})'.format(path=path, lastday=get_lastday()),
    ssh_hook=sshHook,
    dag=dag)

hive_distinct = SSHExecuteOperator(
    task_id="itemsearch_hive_dis",
    bash_command='(bash {path}/distinct.sh)'.format(path=path),
    ssh_hook=sshHook,
    dag=dag)

hive = SSHExecuteOperator(
    task_id="itemsearch_hive_import",
    bash_command='(bash {path}/itemsearch_import.sh {lastday} {last_update_day})'.format(path=path,
                                                                                         lastday=get_lastday(),
                                                                                         last_update_day=get_last_update_date()),
    ssh_hook=sshHook,
    dag=dag)
Example #14
        return 'update' if '1' in result else 'pass'


def get_last_update_date():
    try:
        result = os.popen(check_partition_cmd, "r").readline()
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


spark = SSHExecuteOperator(
    task_id="shopitem_b_parse",
    bash_command='(bash {path}/shopitem_b_parse.sh {lastday} {latest_partition})'.format(path=path,
                                                                                         lastday=get_lastday(),
                                                                                         latest_partition=get_last_update_date()),
    ssh_hook=sshHook,
    dag=dag)

hive = SSHExecuteOperator(
    task_id="shopitem_b_import",
    bash_command='(bash {path}/shopitem_b_import.sh {lastday})'.format(path=path, lastday=get_lastday()),
    ssh_hook=sshHook,
    dag=dag)

email_update = EmailOperator(task_id='shopitem_b_updated_email',
                             to=['*****@*****.**'],
                             subject='ec shopitem b workflow',
                             html_content='[ ec shopitem b data updated!!! ]',
                             dag=dag)
Example #15
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.ssh_execute_operator import SSHExecuteOperator
from datetime import datetime, timedelta
from airflow.contrib.hooks import SSHHook
import airflow

sshHook = SSHHook(conn_id='server_ssh')

default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2)
}

# schedule_interval is a DAG argument, not a default_args key, so set it on the DAG
dag = DAG('bash_ssh', default_args=default_args, schedule_interval='@once')

t1 = SSHExecuteOperator(task_id="task1",
                        bash_command='echo hello >> /tmp/hello.txt',
                        ssh_hook=sshHook,
                        dag=dag)
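Because this DAG file is self-contained, the task can be exercised outside the scheduler in the same style as the unit tests shown at the top of this page; a minimal sketch (the run date is arbitrary):

# Run the task once for a single date, bypassing scheduler state
# (mirrors the task.run(...) calls in the test examples above).
run_date = airflow.utils.dates.days_ago(1)
t1.run(start_date=run_date, end_date=run_date, ignore_ti_state=True)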
Example #16
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


def get_last_update_c_date():
    try:
        result = os.popen(check_partition_c_cmd, "r").readline()
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


b = SSHExecuteOperator(
    task_id="update_b",
    bash_command='(bash {path}/record_bc_feed/record_b_feed.sh {latest_partition})'.format(path=path,
                                                                            latest_partition=get_last_update_b_date()),
    ssh_hook=sshHook,
    dag=dag)

c = SSHExecuteOperator(
    task_id="update_c",
    bash_command='(bash {path}/record_bc_feed/record_c_feed.sh {latest_partition})'.format(path=path,
                                                                            latest_partition=get_last_update_c_date()),
    ssh_hook=sshHook,
    dag=dag)

chain(b, c)
Example #17
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=4),
}

dag = DAG('data_pipeline',
          default_args=default_args,
          schedule_interval=timedelta(days=1))

connection = "hadoop_connection"
sshHook = SSHHook(conn_id=connection)

task0 = SSHExecuteOperator(task_id="sync_jars",
                           bash_command=sync_jars,
                           ssh_hook=sshHook,
                           dag=dag)

task1 = SSHExecuteOperator(task_id="import_all_tables",
                           bash_command=import_all_tables,
                           ssh_hook=sshHook,
                           dag=dag)

task2 = SSHExecuteOperator(task_id="spark_process_dimension",
                           bash_command=spark_process_dimension,
                           ssh_hook=sshHook,
                           dag=dag)

task3 = SSHExecuteOperator(task_id="spark_process_facts",
                           bash_command=spark_process_facts,
                           ssh_hook=sshHook,
                           dag=dag)
    else:
        return 'update' if '1' in result else 'pass'


def get_last_update_date():
    try:
        result = os.popen(check_partition_cmd, "r").readline()
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


spark = SSHExecuteOperator(
    task_id="comment_parse",
    bash_command='(bash {path}/xianyu_itemcomment_parse.sh {lastday})'.format(
        path=path, lastday=get_lastday()),
    ssh_hook=sshHook,
    dag=dag)

hive = SSHExecuteOperator(
    task_id="comment_import",
    bash_command=
    '(bash {path}/xianyu_itemcomment_import.sh {lastday} {last_update_date})'.
    format(path=path,
           lastday=get_lastday(),
           last_update_date=get_last_update_date()),
    ssh_hook=sshHook,
    dag=dag)

email_update = EmailOperator(task_id='xianyu_itemcomment_update_email',
                             to=['*****@*****.**'],
"""

Distribute_labels_to_nodes = BashOperator(
    task_id='Distribute_labels',
    bash_command=templated_command_distribute_labels,
    dag=dag)

templated_command_Node_1 = """

sh ~/Deep_Images_Hub/src/producer/auto_upload.sh ~/sample_labels_validation_aa "validation"


"""

Upload_images_From_Node_1 = SSHExecuteOperator(
    task_id="Upload_images_From_Node_1",
    bash_command=templated_command_Node_1,
    ssh_hook=sshHook_node1,  # assumed: Node_1 hook defined above the snippet cut-off
    dag=dag)

sshHook_node2 = SSHHook(conn_id='Node_2')

templated_command_Node_2 = """

sh ~/Deep_Images_Hub/src/producer/auto_upload.sh ~/sample_labels_validation_ab "validation"


"""

Upload_images_From_Node_2 = SSHExecuteOperator(
    task_id="Upload_images_From_Node_2",
    bash_command=templated_command_Node_2,
    ssh_hook=sshHook_node2,
    dag=dag)
check_partition_cmd = "ssh -p 22 wrt@cs220 bash {path}/get_latest_partition.sh".format(path=path)


def get_last_update_date():
    try:
        result = os.popen(check_partition_cmd, "r").readline()
        print(result)
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))


spark_sale = SSHExecuteOperator(
    task_id="itemsold_sale_parse",
    bash_command='(bash {path}/ec_itemsold_sale_parse.sh {last_2_days} {lastday} {last_update})'
        .format(path=path, last_2_days=get_date()[1], lastday=get_date()[0], last_update=get_last_update_date()),
    ssh_hook=sshHook,
    dag=dag)

hive_sale = SSHExecuteOperator(
    task_id="itemsold_sale_import",
    bash_command='(bash {path}/ec_itemsold_sale_import.sh {lastday})'
        .format(path=path, lastday=get_date()[0]),
    ssh_hook=sshHook,
    dag=dag)

spark_daysale = SSHExecuteOperator(
    task_id="itemsold_daysale_parse",
    bash_command='(bash {path}/ec_itemsold_daysale_parse.sh {last_2_days} {lastday})'
        .format(path=path, last_2_days=get_date()[1], lastday=get_date()[0]),
    ssh_hook=sshHook,
    dag=dag)
        return str(eval(result))

def shopinfo_last_update():
    try:
        result = os.popen(check_partition_shopinfo, "r").readline()
    except:
        raise Exception("ssh operation failed!")
    else:
        return str(eval(result))




spark_item = SSHExecuteOperator(
    task_id="ec_iteminfo_spark",
    bash_command='(bash {path}/ec_iteminfo_parse.sh)'.format(path=path),
    ssh_hook=sshHook,
    dag=dag)

hive_item = SSHExecuteOperator(
    task_id="ec_iteminfo_hive",
    bash_command='(bash {path}/ec_iteminfo_import.sh {last_par})'.format(path=path,last_par=iteminfo_last_update()),
    ssh_hook=sshHook,
    dag=dag)

spark_shop = SSHExecuteOperator(
    task_id="ec_shopinfo_spark",
    bash_command='(bash {path}/ec_shopinfo_parse.sh)'.format(path=path),
    ssh_hook=sshHook,
    dag=dag)
Example #22
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=4),
}

dag = DAG('qubole_data_pipeline',
          default_args=default_args,
          schedule_interval=timedelta(days=1))

connection = "qubole_connection"
sshHook = SSHHook(conn_id=connection)

task_import_nation = SSHExecuteOperator(task_id="import_nation",
                                        bash_command=import_nation,
                                        ssh_hook=sshHook,
                                        dag=dag)

task_import_region = SSHExecuteOperator(task_id="import_region",
                                        bash_command=import_region,
                                        ssh_hook=sshHook,
                                        dag=dag)
task_import_customer = SSHExecuteOperator(task_id="import_customer",
                                          bash_command=import_customer,
                                          ssh_hook=sshHook,
                                          dag=dag)
task_import_supplier = SSHExecuteOperator(task_id="import_supplier",
                                          bash_command=import_supplier,
                                          ssh_hook=sshHook,
                                          dag=dag)
task_import_orders = SSHExecuteOperator(task_id="import_orders",
Example #23
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=4),
}

dag = DAG('data_pipeline',
          default_args=default_args,
          schedule_interval=timedelta(days=1))

connection = "hadoop_connection"
sshHook = SSHHook(conn_id=connection)

task0 = SSHExecuteOperator(task_id="sync_jars",
                           bash_command=sync_jars,
                           ssh_hook=sshHook,
                           dag=dag)

task1 = SSHExecuteOperator(task_id="import_nation",
                           bash_command=import_nation,
                           ssh_hook=sshHook,
                           dag=dag)

task2 = SSHExecuteOperator(task_id="import_region",
                           bash_command=import_region,
                           ssh_hook=sshHook,
                           dag=dag)

task3 = SSHExecuteOperator(task_id="spark_dimNation",
                           bash_command=spark_dimNation,
                           ssh_hook=sshHook,