def create_sub_dag(parent_dag, report_name):
    sub_dag = DAG(dag_id=parent_dag.dag_id + '.hive_' + report_name, default_args=parent_dag.default_args)

    # Use an SSH operator that executes a Hive script on our always-on ETL cluster
    hive_task = SSHExecuteOperator(task_id='hive_transformation',
                                   ssh_hook=SSHHook(conn_id=SSH_HOOK),
                                   bash_command=parse_hive_command(report_name),
                                   dag=sub_dag)

    return SubDagOperator(task_id='hive_' + report_name,
                          subdag=sub_dag,
                          default_args=parent_dag.default_args,
                          dag=parent_dag)
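
# --- Usage sketch (not part of the original snippet) ---
# Shows how the helper above could be wired into a hypothetical parent DAG,
# one sub-DAG per report; the DAG id, schedule and report names are made up.
from datetime import datetime
from airflow import DAG

parent_dag = DAG(dag_id='hive_reports',
                 default_args={'owner': 'airflow', 'start_date': datetime(2019, 1, 1)},
                 schedule_interval='@daily')

for report_name in ('daily_sales', 'daily_users'):
    create_sub_dag(parent_dag, report_name)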

templated_command_Node_1 = """

sh ~/Deep_Images_Hub/src/producer/auto_upload.sh ~/sample_labels_validation_aa "validation"


"""

# SSHExecuteOperator needs an ssh_hook; the Node_1 hook is assumed here, mirroring Node_2/Node_3
sshHook_node1 = SSHHook(conn_id='Node_1')

Upload_images_From_Node_1 = SSHExecuteOperator(
    task_id="Upload_images_From_Node_1",
    bash_command=templated_command_Node_1,
    ssh_hook=sshHook_node1,
    dag=dag)

sshHook_node2 = SSHHook(conn_id='Node_2')

templated_command_Node_2 = """

sh ~/Deep_Images_Hub/src/producer/auto_upload.sh ~/sample_labels_validation_ab "validation"


"""

Upload_images_From_Node_2 = SSHExecuteOperator(
    task_id="Upload_images_From_Node_2",
    bash_command=templated_command_Node_2,
    ssh_hook=sshHook_node2,
    dag=dag)

sshHook_node3 = SSHHook(conn_id='Node_3')
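
# The original snippet stops after defining sshHook_node3. A plausible continuation,
# mirroring the Node_2 task above (the split-file suffix and task name are assumptions):
templated_command_Node_3 = """

sh ~/Deep_Images_Hub/src/producer/auto_upload.sh ~/sample_labels_validation_ac "validation"

"""

Upload_images_From_Node_3 = SSHExecuteOperator(
    task_id="Upload_images_From_Node_3",
    bash_command=templated_command_Node_3,
    ssh_hook=sshHook_node3,
    dag=dag)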
Example #3
###
# define the workflow
###
with DAG( 'tem1_pre-processing',
        description="Pre-processing of TEM1 CryoEM data",
        schedule_interval=None,
        default_args=args,
        catchup=False,
        max_active_runs=6,
        concurrency=32,
        dagrun_timeout=timedelta(seconds=3600),
    ) as dag:

    # hook to container host for lsf commands
    hook = SSHHook(conn_id=args['ssh_connection_id'])
    # lsftest_hook = SSHHook(conn_id='ssh_lsf_test')
    
    ###
    # parse the epu xml metadata file
    ###
    parameter_file = FileGlobSensor( task_id='parameter_file',
        filepath="{{ dag_run.conf['directory'] }}/**/{{ dag_run.conf['base'] }}.xml",
        recursive=True,
        poke_interval=1,
    )
    parse_parameters = FeiEpuOperator(task_id='parse_parameters',
        filepath="{{ ti.xcom_pull( task_ids='parameter_file' )[0] }}",
    )
    # upload to the logbook
    logbook_parameters = PythonOperator(task_id='logbook_parameters',
        python_callable=lambda **context: None)  # placeholder; the real callable is cut off in the original snippet
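
    # Assumed ordering of the tasks shown above (the rest of this DAG is not included
    # in the snippet): sense the EPU XML file, parse it, then push the parameters.
    parameter_file >> parse_parameters >> logbook_parameters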
Example #4

default_args = {
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'domain_checker', default_args=default_args, schedule_interval=timedelta(1))

# define the 'ssh01' connection in the Airflow UI under Admin > Connections
ssh_hook_01 = SSHHook(conn_id='ssh01')

t0 = BashOperator(
    task_id='transfer_files_to_remote',
    bash_command= AIRFLOW_HOME + '/dags/transfer_files.sh',
    params={},
    retries=1,
    dag=dag)

t1 = SSHExecuteOperator(
    task_id='verify_transfer_to_remote',
    ssh_hook=ssh_hook_01,
    bash_command= AIRFLOW_HOME + '/dags/echo_date.sh',
    params={},
    retries=1,
    dag=dag)
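
# Transfer first, then verify on the remote host; this ordering is implied by the
# task names but not shown in the original snippet.
t0 >> t1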
Example #5
# zero-pad single-digit days (assumed guard; the code deriving year/month/day from
# the current date is cut off in the original snippet)
if day < 10:
    day = '0' + str(day)

# Values fixed for testing
year = '2019'
month = '07'
day = '10'

now =  "{}-{}-{}".format(year, month, day)
'''-------------------------------------------- '''

ip_address = '172.31.20.78'
ip_domain = 'ip-172-31-20-78.ap-northeast-2.compute.internal'
project_home = '~/Project/02_Data_Batch_Processing'

# Define the SSHHook for the SSH connection to the cluster
sshHook = SSHHook(ssh_conn_id='emr_cluster_conn', remote_host='{}'.format(ip_domain),
                  username='******', password='******')

# Define the DAG
dag = DAG('DAG_data_refine',
                        schedule_interval = '*/30 * * * *',
                        start_date=datetime(2019, 8, 1), catchup=False)

# Task that creates the EMR cluster
t1 = BashOperator(
  task_id = "emr_cluster_create",
  xcom_push=True,
  bash_command = """bash {}/shell_script/emr_cluster_create.sh {}""".format(project_home,ip_address),
  dag=dag
)

# Preliminary setup for running commands remotely on the EMR cluster
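
# A plausible continuation (not part of the original snippet): once the cluster is up,
# the SSHHook above can drive an SSHOperator that runs the refine job on the cluster.
# The operator import, task id and script name below are assumptions.
from airflow.contrib.operators.ssh_operator import SSHOperator

t2 = SSHOperator(
    task_id='emr_remote_refine',
    ssh_hook=sshHook,
    command='bash {}/shell_script/data_refine.sh '.format(project_home),
    dag=dag)

t1 >> t2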
Example #6
from datetime import timedelta
import airflow
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow import DAG
from airflow.contrib.hooks.ssh_hook import SSHHook
import os
import sys

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id='process_gdelt_files',
    default_args=args,
    schedule_interval='*/15 * * * *',
    dagrun_timeout=timedelta(minutes=60),
)

# note the trailing space: it stops Jinja from treating the '.sh' path as a template file
command = 'source ~/.bashrc && source ~/.profile && sh ~/Documents/Insight/InsightDataEngineer/data-processing/run_spark.sh '
sshHook = SSHHook(ssh_conn_id='spark_server')

task = SSHOperator(
    task_id="run_gdelt_process",
    command=command,
    ssh_hook=sshHook,
    dag=dag)
Example #7
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.ssh_execute_operator import SSHExecuteOperator
from airflow.contrib.hooks.ssh_hook import SSHHook
from datetime import datetime, timedelta

sshHook = SSHHook(conn_id='server_ssh')

default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2)
}

# schedule_interval belongs on the DAG itself, not in default_args
dag = DAG('bash_ssh', default_args=default_args, schedule_interval='@once')

t1 = SSHExecuteOperator(task_id="task1",
                        bash_command='echo hello >> /tmp/hello.txt',
                        ssh_hook=sshHook,
                        dag=dag)
Example #8
default_args = {
    'provide_context': True,
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'clix_dashboard_backend_dag', default_args=default_args,
    schedule_interval=timedelta(hours=5))

# --------------------------------------------------------------------------------
# Each state is synced independently. We have four states, and the syncthing data
# folders for those states are synced through sync_school_data.
# --------------------------------------------------------------------------------
sshHook = SSHHook(conn_id='<YOUR CONNECTION ID FROM THE UI>')

for each_state in clix_config.states:

    src = clix_config.remote_src + each_state
    dst = clix_config.local_dst + each_state

    # rsync the state's folder over SSH; the task_id must be unique per state
    sync_state_data_ssh = SSHExecuteOperator(
        task_id='sync_state_data_ssh_' + each_state,
        bash_command="rsync -avzhe ssh {0}@{1}:{2} {3}".format(user, ip, src, dst),
        ssh_hook=sshHook,
        dag=dag)

    sync_state_data = PythonOperator(
        task_id='sync_state_data_' + each_state,
        python_callable=sync_school_data.rsync_data_ssh,
        op_kwargs={'state': each_state, 'src': src, 'dst': dst},
        dag=dag)
Example #9

default_args = {
    'owner': "avkabay1",
    'depends_on_past': False,
    'start_date': datetime(2020, 4, 15),
    'wait_for_downstream': False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=1)
}

sshHook = SSHHook(ssh_conn_id="avkabay1@test-en-0002")

# catchup, max_active_runs and concurrency are DAG-level settings, so they are
# passed to DAG() rather than through default_args
dag = DAG('test_gobike_tripdata',
          schedule_interval='0 8 * * 6',
          default_args=default_args,
          catchup=False,
          max_active_runs=1,
          concurrency=1)

kinit = SSHOperator(task_id='kinit',
                    command="echo '{1}' | kinit {0}".format(user, password),
                    ssh_hook=sshHook,
                    dag=dag)

python_path = '/data/venv/python2/dq-venv/bin/python2.7'
script_dir = '/home/avkabay1/spdevices/etl/test_gobike_tripdata/'
task_raw_script = 'test_raw_gobike_tripdata.py'
task_ods_script = 'test_ods_gobike_tripdata.py'
task_dm_station_script = 'test_dm_station_gobike_tripdata.py'
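
# The original snippet ends after these variables; a plausible continuation runs the
# three scripts over SSH once kinit has obtained a ticket. Task ids and the linear
# ordering below are assumptions.
task_raw = SSHOperator(task_id='load_raw_gobike_tripdata',
                       command='{0} {1}{2}'.format(python_path, script_dir, task_raw_script),
                       ssh_hook=sshHook,
                       dag=dag)

task_ods = SSHOperator(task_id='load_ods_gobike_tripdata',
                       command='{0} {1}{2}'.format(python_path, script_dir, task_ods_script),
                       ssh_hook=sshHook,
                       dag=dag)

task_dm_station = SSHOperator(task_id='load_dm_station_gobike_tripdata',
                              command='{0} {1}{2}'.format(python_path, script_dir, task_dm_station_script),
                              ssh_hook=sshHook,
                              dag=dag)

kinit >> task_raw >> task_ods >> task_dm_station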
Example #10

args = {
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'adhoc':False,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}

dag = DAG(dag_id='ssh_airflow_test',
          default_args=args,
          schedule_interval='*/1 * * * *',
          catchup=False)

bash_command = """python /Users/zaferdurkut/test/dizin1/ssh_test.py"""
ssh_hook = SSHHook(username=os.getenv('SSH_USER'),
                   password=os.getenv('SSH_PASSWORD'),
                   remote_host=os.getenv('SSH_HOST'))

ssh_task = SSHOperator(task_id='ssh_airflow_test_task',
                       ssh_hook=ssh_hook,
                       command=bash_command,
                       dag=dag)

ssh_task