def test_hook_created_correctly(self):
    TIMEOUT = 20
    SSH_ID = "ssh_default"
    task = SSHOperator(
        task_id="test",
        command="echo -n airflow",
        dag=self.dag,
        timeout=TIMEOUT,
        ssh_conn_id="ssh_default"
    )
    self.assertIsNotNone(task)

    task.execute(None)

    self.assertEqual(TIMEOUT, task.ssh_hook.timeout)
    self.assertEqual(SSH_ID, task.ssh_hook.ssh_conn_id)
def test_arg_checking(self):
    import os

    import six
    from airflow.exceptions import AirflowException

    conn_id = "conn_id_for_testing"
    TIMEOUT = 5
    os.environ['AIRFLOW_CONN_' + conn_id.upper()] = "ssh://test_id@localhost"

    # Exception should be raised if neither ssh_hook nor ssh_conn_id is provided
    if six.PY2:
        self.assertRaisesRegex = self.assertRaisesRegexp
    with self.assertRaisesRegex(AirflowException,
                                "Cannot operate without ssh_hook or ssh_conn_id."):
        task_0 = SSHOperator(task_id="test",
                             command="echo -n airflow",
                             timeout=TIMEOUT,
                             dag=self.dag)
        task_0.execute(None)

    # if ssh_hook is invalid/not provided, use ssh_conn_id to create SSHHook
    task_1 = SSHOperator(
        task_id="test_1",
        ssh_hook="string_rather_than_SSHHook",  # invalid ssh_hook
        ssh_conn_id=conn_id,
        command="echo -n airflow",
        timeout=TIMEOUT,
        dag=self.dag
    )
    try:
        task_1.execute(None)
    except Exception:
        pass
    self.assertEqual(task_1.ssh_hook.ssh_conn_id, conn_id)

    task_2 = SSHOperator(
        task_id="test_2",
        ssh_conn_id=conn_id,  # no ssh_hook provided
        command="echo -n airflow",
        timeout=TIMEOUT,
        dag=self.dag
    )
    try:
        task_2.execute(None)
    except Exception:
        pass
    self.assertEqual(task_2.ssh_hook.ssh_conn_id, conn_id)

    # if both valid ssh_hook and ssh_conn_id are provided, ignore ssh_conn_id
    task_3 = SSHOperator(
        task_id="test_3",
        ssh_hook=self.hook,
        ssh_conn_id=conn_id,
        command="echo -n airflow",
        timeout=TIMEOUT,
        dag=self.dag
    )
    try:
        task_3.execute(None)
    except Exception:
        pass
    self.assertEqual(task_3.ssh_hook.ssh_conn_id, self.hook.ssh_conn_id)
default_args = {
    'owner': 'Damien Ayers',
    'depends_on_past': False,  # Very important, will cause a single failure to propagate forever
    'start_date': datetime(2020, 3, 11),
    'retries': 3,
    'retry_delay': timedelta(minutes=1),
    'timeout': 3600,  # For running SSH Commands
    'params': {
        'project': 'v10',
        'queue': 'normal',
        'module': 'dea/unstable',
        'year': '2019'
    }
}

dag = DAG(
    'nci_database_backup',
    default_args=default_args,
    catchup=False,
    schedule_interval=None,
)

with dag:
    run_backup = SSHOperator(
        task_id='execute_daily_backup',
        ssh_conn_id='lpgs_gadi',
        command="""
            cd /g/data/v10/agdc/backup;
            ./trigger-daily-db-backup.sh &>> "/data/logs/nc-db-backup_$(date -d${1:-today} +%Y%m%d_%s).log"
        """)
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='primary_analysis_and_qc_processing',
          catchup=False,
          schedule_interval="@hourly",
          max_active_runs=1,
          default_args=default_args)

## TO DO
ssh_hook = SSHHook(ssh_conn_id='cx1_ssh_conn')
orwell_ssh_hook = SSHHook(ssh_conn_id='orwell_ssh_conn')

update_exp_metadata = SSHOperator(
    task_id='update_exp_metadata',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/update_exp_metadata.sh ')

find_new_exp_for_analysis = SSHOperator(
    task_id='find_new_exp_for_analysis',
    dag=dag,
    ssh_hook=orwell_ssh_hook,
    command='bash /home/igf/igf_code/IGF-cron-scripts/orwell/find_new_exp_for_analysis.sh ')

find_new_exp_for_analysis.set_upstream(update_exp_metadata)

seed_analysis_pipeline = SSHOperator(
    'depends_on_past': False,
    'start_date': datetime(2020, 2, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
}

dag = DAG('testdag',
          default_args=default_args,
          catchup=False,
          schedule_interval=None,
          template_searchpath='/home/omad/airflow/dags/templates/')

with dag:
    foo = SSHOperator(ssh_conn_id='lpgs_gadi',
                      task_id='foo',
                      remote_host='gadi-dm.nci.org.au',
                      command='env')

    failing_task = ShortCircuitSSHOperator(task_id='failing_task',
                                           ssh_conn_id='lpgs_gadi',
                                           command='false')
    should_be_skipped = DummyOperator(task_id='should_be_skipped')

    passing_task = ShortCircuitSSHOperator(task_id='passing_task',
                                           ssh_conn_id='lpgs_gadi',
                                           command='true')
    should_be_run = DummyOperator(task_id='should_be_run')

    failing_task >> should_be_skipped
    passing_task >> should_be_run

    send_email = EmailOperator(
        task_id='send_email',
dag = DAG(
    'nci_build_dea_module',
    default_args=default_args,
    schedule_interval=None,
    tags=['nci'],
)

with dag:
    build_env_task = SSHOperator(
        task_id=f'build_dea_module',
        ssh_conn_id='lpgs_gadi',
        command="""
            cd ~/dea-orchestration/
            git reset --hard
            git pull
            cd ~/dea-orchestration/nci_environment
            git status
            module load python3/3.7.4
            pip3 install --user pyyaml jinja2

            ./build_environment_module.py dea/modulespec.yaml
        """,
    )

    test_env_task = SSHOperator(
        task_id='test_dea_module',
        ssh_conn_id='lpgs_gadi',
        command="""
            cd $TMPDIR
            git clone --depth 1 https://github.com/GeoscienceAustralia/dea-notebooks
            cd dea-notebooks/Frequently_used_code/
            module load dea/$(date +%Y%m%d)  # TODO, this will fail if run over midnight...
default_args = {
    'start_date': datetime(2018, 1, 1),
    'retries': 1,
}

dag = DAG(
    'example_dag',
    default_args=default_args,
    schedule_interval='0 1 * * * ',
    catchup=False)

task_one = SSHOperator(
    task_id='task_one',
    ssh_conn_id='private_key',
    remote_host=ETL_HOST,
    command='command for task one {{ params.task_one_param }}',
    params={'task_one_param': 1},
    dag=dag)

subdag = SubDagOperator(
    subdag=example_subdag(
        'example_dag',
        'example_subdag',
        default_args=default_args,
        schedule_interval=dag.schedule_interval,
        catchup=dag.catchup),
    task_id='subdag',
    dag=dag)

task_one >> subdag
from datetime import timedelta, datetime

from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'start_date': datetime(2019, 7, 7),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='load_property_sale_fact',
          default_args=default_args,
          schedule_interval='0 23 * * *')

t1_bash = """
/usr/local/bin/dp/database_jobs/run_py.sh "load_property_sale_fact.py"
"""

t1 = SSHOperator(ssh_conn_id='ssh_aws_ec2',
                 task_id='load_property_sale_fact',
                 command=t1_bash,
                 dag=dag)
Ingestion = BashOperator(
    task_id='Ingestion',
    bash_command="echo {{ dag_run.conf['ingestion_status'] }}",
    dag=dag,
)

VirusCheck = BashOperator(
    task_id='VirusCheck',
    bash_command="echo {{ dag_run.conf['quarantine_bucket'] }}",
    dag=dag,
)

MoveToPrecurated = SSHOperator(
    task_id='MoveToPrecurated',
    ssh_conn_id='flywheel_usw2',
    command="aws s3 cp s3://{{ dag_run.conf['quarantine_bucket'] }}/ s3://{{ dag_run.conf['precurated_bucket'] }}/ --recursive",
    dag=dag,
)

FlywheelUpload = SSHOperator(
    task_id='FlywheelUpload',
    ssh_conn_id='flywheel_usw2',
    command="fw ingest template s3://{{ dag_run.conf['precurated_bucket'] }}/{{ dag_run.conf['precurated_bucket_key'] }} --config-file {{ dag_run.conf['fw_template'] }} --group {{ dag_run.conf['fw_group'] }} --project {{ dag_run.conf['fw_project'] }} --cluster https://flywheel-us-sbx.science.roche.com/ingest -f",
    dag=dag,
)

SuccessNotification = BashOperator(
    task_id="SuccessNotification",
    bash_command="echo {{ dag_run.conf['email_list'] }}",
# Aims to download the data at 10am Pacific
default_args = {
    "owner": "airflow",
    "depends_on_past": True,
    "wait_for_downstream": True,
    "start_date": datetime(2019, 9, 25, 17, tzinfo=utc_tz),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 3,
    "retry_delay": timedelta(hours=1),
    "task_concurrency": 1,
    # "execution_timeout": timedelta(minutes=2)
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG("water_supply_scraper",
          default_args=default_args,
          schedule_interval=timedelta(days=1))

# Yes, that is a space at the end, do not remove
# https://cwiki.apache.org/confluence/display/AIRFLOW/Common+Pitfalls
command = 'docker-compose exec -T web flask wss '

t1 = SSHOperator(ssh_conn_id='ssh_wss',
                 task_id='run_wss',
                 command=command,
                 dag=dag)
default_args = {
    'owner': 'price-insight',
    'depends_on_past': False,
    'start_date': datetime(2019, 2, 2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False
}

dag = DAG('listing_stats_batch',
          default_args=default_args,
          schedule_interval='@once')

data_fetch_task = SSHOperator(
    ssh_conn_id='data_fetch_conn',
    task_id='data_fetch',
    command='cd ~/InnSight/data_fetch; ./data_fetch.sh all',
    dag=dag)

config_generation_task = SSHOperator(
    ssh_conn_id='spark_master_conn',
    task_id='config_generation',
    command='cd ~/InnSight/batch_processing; ./s3_urls_generation.sh all',
    dag=dag)

data_cleaning_task = SSHOperator(
    ssh_conn_id='spark_master_conn',
    task_id='data_cleaning',
    command='source ~/.profile; '
            'cd ~/InnSight/batch_processing; '
            '~/.local/bin/spark-submit '
}

dag = DAG(
    dag_id='testSSH_zhrui',
    default_args=args,
    schedule_interval=timedelta(days=1),
    dagrun_timeout=timedelta(minutes=60),
)

sshHook = SSHHook(
    remote_host='dltsprod-worker-rsagxh.eastus.cloudapp.azure.com',
    username='******',
    key_file='/home/bitnami/.ssh/id_rsa_zhrui',
    port=31624,
    timeout=10,
    keepalive_interval=30)

t1 = SSHOperator(task_id="connectionDLTS",
                 command='mkdir fromAirflow',
                 ssh_hook=sshHook,
                 dag=dag)

t2 = SSHOperator(
    ssh_hook=sshHook,
    task_id='writeToRemote',
    command='touch /tmp/test_ssh_in_airflow.txt',  # create a file at remote machine
    dag=dag)

t1 >> t2
    'timeout': 1200,  # For running SSH Commands
    'email_on_failure': True,
    'email': '*****@*****.**',
}

dag = DAG(
    'nci_build_env_module',
    default_args=default_args,
    schedule_interval=None,
    tags=['nci'],
)

with dag:
    build_env_task = SSHOperator(
        task_id=f'build_dea_env_module',
        ssh_conn_id='lpgs_gadi',
        command="""
            set -eux
            cd ~/dea-orchestration/
            git reset --hard
            git pull
            cd ~/dea-orchestration/nci_environment
            git status
            module load python3/3.7.4
            pip3 install --user pyyaml jinja2

            rm -rf /g/data/v10/public/modules/dea-env/$(date +%Y%m%d)/ /g/data/v10/public/modules/modulefiles/dea-env/$(date +%Y%m%d)

            ./build_environment_module.py dea-env/modulespec.yaml
        """,
    )
    schedule_interval=MANIFOLD_GENERATE_SITEMAP_INTERVAL)

#
# CREATE TASKS
#
# Tasks with all logic contained in a single operator can be declared here.
# Tasks with custom logic are relegated to individual Python files.
#

generate_sitemap_bash = """
sudo su - manifold bash -c \
"cd /var/www/manifold &&\
RAILS_ENV=production bundle exec rake sitemap:create"
"""

generate_sitemap = SSHOperator(
    task_id='generate_sitemap',
    command=generate_sitemap_bash,
    dag=MANIFOLD_GENERATE_SITEMAP_DAG,
    ssh_conn_id='AIRFLOW_CONN_MANIFOLD_SSH_INSTANCE')

post_slack = PythonOperator(task_id='slack_post_succ',
                            python_callable=slackpostonsuccess,
                            provide_context=True,
                            dag=MANIFOLD_GENERATE_SITEMAP_DAG)

#
# SET UP TASK DEPENDENCIES
#
post_slack.set_upstream(generate_sitemap)
from datetime import timedelta, datetime

from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'start_date': datetime(2019, 7, 7),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='nightly_mat_view_refresh',
          default_args=default_args,
          schedule_interval='0 11 * * *')

t1_bash = """
/usr/local/bin/dp/database_jobs/run_py.sh "refresh_mat_views_nightly.py"
"""

t1 = SSHOperator(ssh_conn_id='ssh_aws_ec2',
                 task_id='nightly_mat_view_refresh',
                 command=t1_bash,
                 dag=dag)
from datetime import timedelta, datetime

from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'start_date': datetime(2019, 7, 7),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='load_rew_properties',
          default_args=default_args,
          schedule_interval='*/30 * * * *')

t1_bash = """
/usr/local/bin/dp/database_jobs/run_py.sh "execute_sql_file.py --sql_file load_rew_properties.sql --job_code load_rew_properties"
"""

t1 = SSHOperator(
    ssh_conn_id='ssh_aws_ec2',
    task_id='load_rew_properties',
    command=t1_bash,
    dag=dag)
)

# language="Shell Script"
generate_list = SSHOperator(
    task_id='generate_list_of_s2_to_upload',
    # language="Shell Script"
    command=COMMON + dedent("""
        rm -f s3_paths_list.txt  # In case we've been run before

        for product_name in s2a_ard_granule s2b_ard_granule; do
            echo Searching for $product_name datasets.
            psql --variable=ON_ERROR_STOP=1 --csv --quiet --tuples-only --no-psqlrc \
                -h dea-db.nci.org.au datacube <<EOF >> s3_paths_list.txt
        SELECT 's3://dea-public-data/L2/sentinel-2-nbar/S2MSIARD_NBAR/'
               || substring(ds.metadata#>>'{extent,center_dt}' for 10) || '/'
               || replace(ds.metadata#>>'{tile_id}', 'L1C', 'ARD') || '/ARD-METADATA.yaml'
        FROM agdc.dataset ds
        INNER JOIN agdc.dataset_type dst ON ds.dataset_type_ref = dst.id
        INNER JOIN agdc.dataset_location dsl ON ds.id = dsl.dataset_ref
        WHERE dst.name='$product_name'
        AND ds.added BETWEEN '{{ prev_execution_date }}' AND '{{ execution_date }}';
        EOF
        done

        echo -n Num Datasets to upload:
        wc -l s3_paths_list.txt
    """),
    remote_host='gadi-dm.nci.org.au',
    timeout=20 * MINUTES,
)

# Execute script to upload sentinel-2 data to s3 bucket
# # For month-type jobs the dag_id needs to be changed
# dag = DAG(
#     dag_id='airflow_pyspark_template_week',
#     default_args=args,
#     schedule_interval='50 2 1 * *',
#     dagrun_timeout=timedelta(minutes=60),
# )

# The task_id also needs to be changed to describe the corresponding job
day_partition = SSHOperator(
    ssh_conn_id='ws@hdp-0',
    task_id='device_filmora_log_day_partition',
    command=" cd /usr/local/bigdata/jobtaskh0/pythonjob/pyspark_template/ && spark-submit \
        --num-executors 4 \
        --executor-memory 4G \
        --executor-cores 4 \
        --driver-memory 4G \
        --driver-cores 4 \
        --jars /usr/hdp/3.0.1.0-187/spark2/jars/mysql-connector-java-5.1.47.jar \
        --driver-class-path /usr/hdp/3.0.1.0-187/spark2/jars/mysql-connector-java-5.1.47.jar \
        /usr/local/bigdata/jobtaskh0/pythonjob/uos/uid_label/device_filmora_log.py \
        day \
        {{ ds_nodash }} ",
    dag=dag)

# --num-executors 50 \
# --executor-memory 4G \
# --executor-cores 4 \
# --driver-memory 1G \
# --driver-cores 4 \
# --conf spark.default.parallelism=1000\
# --conf spark.storage.memoryFraction=0.5\
from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow.utils.dates import days_ago
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults

import halo_variables

environment_default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    # "start_date": date(2020, 7, 24),
    "start_date": days_ago(1),
    "retries": 0,
    "schedule_interval": "* */2 * * *",
}

dag = DAG("customerfacingservice_spark_submit",
          default_args=environment_default_args,
          concurrency=5,
          max_active_runs=1)

create_command = "/home/airflow/wifi_uc/cfs.sh "

task1 = SSHOperator(ssh_conn_id="ssh_dev_conn",
                    command=create_command,
                    task_id="spark_ssh",
                    dag=dag)
executor_cores = row['EXECUTOR_CORES']
executor_mem = row['EXECUTOR_MEM']
num_executor = row['NUM_EXECUTOR']
additional_param = row['ADDITIONAL_PARAM']
dependencies = row['DEPENDENCIES'].split('|')
partitions = row['PARTITIONS']

for e in etl_task_type:
    if e == 'EXTRACT':
        script_loc = etl_task_type_df.loc[etl_task_type_df['ETL_TASK_TYPE'].str.contains('EXTRACT'), 'SCRIPT_LOC'][0]
        script_name = etl_task_type_df.loc[etl_task_type_df['ETL_TASK_TYPE'].str.contains('EXTRACT'), 'SCRIPT_NAME'][0]
        complete_script_path = script_loc + script_name
        t3 = SSHOperator(
            ssh_conn_id=ssh_conn_id,
            task_id=str(table_name) + '_' + str(e),
            command='spark-submit --num-executors ' + str(partitions) + ' ' + complete_script_path + ' ' + table_name + ' ' + str(partitions),
            dag=dag)
        t3 >> t1

    if e == 'MERGE':
        script_loc = etl_task_type_df['SCRIPT_LOC'][1]
        script_name = etl_task_type_df['SCRIPT_NAME'][1]
        complete_script_path = script_loc + script_name
        t3 = SSHOperator(
            ssh_conn_id=ssh_conn_id,
            task_id=str(table_name) + '_' + str(e),
            command='spark-submit --num-executors ' + str(num_executor) + ' --executor-cores ' + str(executor_cores) + ' --executor-memory ' + executor_mem + ' --driver-memory ' + driver_mem + ' --driver-cores ' + str(driver_cores) + ' ' + complete_script_path + ' ' + table_name,
            dag=dag)
docker_compose_file = '/home/ubuntu/config/docker-compose-hrrr.yml'
config_file = '/code/config/hrrr_dates.ini'

# run_weather_forecast_retrieval to get data from NOAA
hrrr = HRRR(docker_compose_file, config_file)
command = hrrr.get_compose_command()

# convert_grib2nc command for conversion
grib2nc = hrrr.get_compose_grib2nc()

# upload to swift
swift = HRRR('/home/ubuntu/config/docker-compose-swift.yml', config_file)
upload_swift = swift.get_compose_upload_swift()

t1 = SSHOperator(ssh_conn_id='ssh_hrrr',
                 task_id='run_hrrr_retrieval_dates',
                 command=command,
                 dag=dag)

t2 = SSHOperator(ssh_conn_id='ssh_hrrr',
                 task_id='convert_grib2nc',
                 command=grib2nc,
                 dag=dag)

t3 = SSHOperator(ssh_conn_id='ssh_hrrr',
                 task_id='upload_swift',
                 command=upload_swift,
                 dag=dag)

t1.set_downstream(t2)
t2.set_downstream(t3)
    datestring=$(date +%Y%m%d)
    file_prefix="${host}-${datestring}"
''')

run_backup = SSHOperator(
    task_id='run_backup',
    command=COMMON + dedent("""
        args="-U agdc_backup -h ${host} -p 5432"
        set -x

        # Cleanup previous failures
        rm -rf "${file_prefix}"*-datacube-partial.pgdump

        # Dump
        pg_dump ${args} guest > "${file_prefix}-guest.sql"
        pg_dump ${args} datacube -n agdc -T 'agdc.dv_*' -F c -f "${file_prefix}-datacube-partial.pgdump"
        mv -v "${file_prefix}-datacube-partial.pgdump" "${file_prefix}-datacube.pgdump"

        # The globals technically contain (weakly) hashed pg user passwords, so we'll
        # tighten permissions. (This shouldn't really matter, as users don't choose
        # their own passwords and they're long random strings, but anyway)
        umask 066
        pg_dumpall ${args} --globals-only > "${file_prefix}-globals.sql"
    """),
)

aws_conn = AwsHook(aws_conn_id='aws_nci_db_backup')

upload_to_s3 = SSHOperator(
    task_id='upload_to_s3',
    params=dict(aws_conn=aws_conn),
    dag=dag)

# Preliminary step for running commands remotely on the EMR cluster.
# Not entirely sure why, but an initial SSH connection had to be made once
# before the Airflow SSHOperator could connect.
t2 = BashOperator(
    task_id="emr_ssh_connect",
    bash_command="""bash {}/shell_script/emr_ssh_connect.sh {} {}""".format(
        project_home, ip_address, ip_domain),
    dag=dag)

# To reach Hadoop's HDFS you must use the main IP rather than the secondary IP,
# so forward the EMR cluster's secondary IP to the main IP.
t3 = SSHOperator(
    task_id="ip_forwarding",
    command="""(echo $(sudo ifconfig eth0 | grep 'inet addr' | cut -d: -f2 | awk '{{ print $1 }}') " {}") | sudo tee -a /etc/hosts""".format(ip_domain),
    ssh_hook=sshHook,
    dag=dag)

# Move the training data from S3 into the EMR cluster's HDFS.
t4 = SSHOperator(
    task_id="traindata_s3_to_hdfs",
    command="""s3-dist-cp --src s3://jhw620/RefineData/ --dest hdfs://{}:8020/data/ --srcPattern .*[^_\$folder\$]$""".format(ip_domain),
    ssh_hook=sshHook,
    dag=dag)

# Train the model
t5 = BashOperator(task_id="training_model",
#
# CREATE TASKS
#
# Tasks with all logic contained in a single operator can be declared here.
# Tasks with custom logic are relegated to individual Python files.
#

sync_hours_bash = """
sudo su - manifold bash -c \
"cd /var/www/manifold &&\
RAILS_ENV=production bundle exec rake sync:hours"
"""

sync_hours = SSHOperator(
    task_id='sync_hours',
    command=sync_hours_bash,
    dag=MANIFOLD_HOURS_SYNC_DAG,
    ssh_conn_id='AIRFLOW_CONN_MANIFOLD_SSH_INSTANCE'
)

post_slack = PythonOperator(
    task_id='slack_post_succ',
    python_callable=slackpostonsuccess,
    provide_context=True,
    dag=MANIFOLD_HOURS_SYNC_DAG
)

#
# SET UP TASK DEPENDENCIES
#
post_slack.set_upstream(sync_hours)
submit_ard = SSHOperator(
    task_id=submit_task_id,
    command=COMMON + """
        mkdir -p {{ params.base_dir }}{{ work_ext }}
        mkdir -p {{ params.base_dir }}{{ log_ext }}
        qsub -N ard_scene_select \
            -q {{ params.queue }} \
            -W umask=33 \
            -l wd,walltime=0:30:00,mem=15GB,ncpus=1 -m abe \
            -l storage=gdata/v10+scratch/v10+gdata/if87+gdata/fj7+scratch/fj7 \
            -P {{ params.project }} -o {{ params.base_dir }}{{ log_ext }} -e {{ params.base_dir }}{{ log_ext }} \
            -- /bin/bash -l -c \
                "module use /g/data/v10/public/modules/modulefiles/; \
                module use /g/data/v10/private/modules/modulefiles/; \
                module load {{ params.module_ass }}; \
                ard-scene-select \
                {{ params.products_arg }} \
                {{ params.config_arg }} \
                --workdir {{ params.base_dir }}{{ work_ext }} \
                --pkgdir {{ params.pkgdir_arg }} \
                --logdir {{ params.base_dir }}{{ log_ext }} \
                --env {{ params.wagl_env }} \
                --project {{ params.project }} \
                --walltime 02:30:00 \
                {{ params.index_arg }} \
                {{ params.scene_limit }} \
                {{ params.interim_days_wait }} \
                {{ params.days_to_exclude_arg }} \
                {{ params.run_ard_arg }} "
    """,
    timeout=60 * 20,
    do_xcom_push=True,
)
# argument file variables paths
spark_script = 'main.py'
json = 'review_and_evaluation_config.json'
logFile = 'csdr_status.log'

scriptFile = '/opt/scripts/mig/csdr/csdr_xml_validation/app/{}'.format(spark_script)
xmlFilePath = 'hdfs://migration/data/raw/csdr/settlement_internalisation/to_process/'
jsonPath = '/opt/scripts/csdr/mig/conf/{}'.format(json)
logPath = 'hdfs://migration/data/raw/csdr/stage_status/{}'.format(logFile)

xml_validation = 'sudo python {} -f {} -j {} -l {}'.format(scriptFile, xmlFilePath, jsonPath, logPath)

args = {
    'owner': 'Airflow',
    'start_date': days_ago(1)
}

with DAG(dag_id='mig_csdr_re_xml_validation_spark_xml_process',
         description='CSDR xml re validation spark process',
         default_args=args,
         schedule_interval='* * */1 * *') as dag:

    start = DummyOperator(task_id='start')

    post_ingestion = SSHOperator(
        ssh_conn_id='zaloni',
        task_id='esml_feedback_process',
        command=xml_validation)

    start >> post_ingestion  # >> re_validate_xml
# Default DAG parameters
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dt(2020, 3, 23),
    'retries': 0
}

dag = DAG('variable_example',
          default_args=default_args,
          schedule_interval='30 07 * * *')

url_awscli = Variable.get("url_awscli")
directory_dest = Variable.get("directory_dest")

# Install the AWS CLI on the remote host over SSH
cmd = """
mkdir -p {} && \
curl "{}" -o "/tmp/awscli.zip" && \
unzip /tmp/awscli.zip -d {} && \
sudo {}aws/install && \
rm /tmp/awscli.zip && \
aws emr create-default-roles
""".format(directory_dest, url_awscli, directory_dest, directory_dest)

install_aws = SSHOperator(ssh_conn_id='adaltas_ssh',
                          task_id='install_aws',
                          command=cmd,
                          dag=dag)
else:
    external_dag_id = 'nci_fractional_cover'
    external_task_id = f'wait_for_{product}'

processing_completed = ExternalTaskSensor(
    task_id=f'processing_completed_{product}',
    external_dag_id=external_dag_id,
    external_task_id=external_task_id,
    mode='reschedule',
    timeout=1 * DAYS,
)

download_s3_inventory = SSHOperator(
    task_id=f'download_s3_inventory_{product}',
    command=COMMON + dedent('''
        mkdir -p {{work_dir}}

        dea-cogger save-s3-inventory --product-name "{{ params.product }}" --output-dir "{{work_dir}}"
    '''),
    params={'product': product},
)

generate_work_list = SSHOperator(
    task_id=f'generate_work_list_{product}',
    command=COMMON + dedent("""
        cd {{work_dir}}

        dea-cogger generate-work-list --product-name "{{params.product}}" \\
            --output-dir "{{work_dir}}" --s3-list "{{params.product}}_s3_inv_list.txt" \\
            --time-range "time in [2019-01-01, 2025-12-31]"
    """),
    # --time-range "time in [{{prev_ds}}, {{ds}}]"
    timeout=2 * HOURS,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='seqrun_processing',
          catchup=False,
          schedule_interval="@hourly",
          max_active_runs=1,
          default_args=default_args)

## TO DO
ssh_hook = SSHHook(ssh_conn_id='orwell_ssh_conn')
cx1_ssh_hook = SSHHook(ssh_conn_id='cx1_ssh_conn')

switch_off_project_barcode = SSHOperator(
    task_id='switch_off_project_barcode',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /home/igf/igf_code/IGF-cron-scripts/orwell/switch_off_project_barcode_check.sh ')

change_samplesheet_for_run = SSHOperator(
    task_id='change_samplesheet_for_run',
    dag=dag,
    ssh_hook=ssh_hook,
    command='bash /home/igf/igf_code/IGF-cron-scripts/orwell/change_samplesheet_for_seqrun.sh ')

change_samplesheet_for_run.set_upstream(switch_off_project_barcode)

restart_seqrun_processing = SSHOperator(
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'adhoc': False,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}

dag = DAG(dag_id='ssh_airflow_test',
          default_args=args,
          schedule_interval='*/1 * * * *',
          catchup=False)

bash_command = """python /Users/zaferdurkut/test/dizin1/ssh_test.py"""

ssh_hook = SSHHook(username=os.getenv('SSH_USER'),
                   password=os.getenv('SSH_PASSWORD'),
                   remote_host=os.getenv('SSH_HOST'))

ssh_task = SSHOperator(task_id='ssh_airflow_test_task',
                       ssh_hook=ssh_hook,
                       command=bash_command,
                       dag=dag)

ssh_task
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.operators.ssh_operator import SSHOperator

dag = DAG(dag_id='test_run_hadoop',
          start_date=datetime(2021, 1, 1),
          schedule_interval=None)

cmd_hdfs_ls = """/usr/local/bin/hdfs dfs -ls"""
# cmd_hdfs_ls = """pwd && ls"""

start = DummyOperator(task_id='start', dag=dag)
end = DummyOperator(task_id='end', dag=dag)

hdfs_ls = SSHOperator(task_id='hdfs_ls',
                      command=cmd_hdfs_ls,
                      ssh_conn_id='local_ssh_default',
                      retries=1,
                      dag=dag)

start >> hdfs_ls >> end