    def test_hook_created_correctly(self):
        TIMEOUT = 20
        SSH_ID = "ssh_default"
        task = SSHOperator(
            task_id="test",
            command="echo -n airflow",
            dag=self.dag,
            timeout=TIMEOUT,
            ssh_conn_id="ssh_default"
        )
        self.assertIsNotNone(task)

        task.execute(None)

        self.assertEqual(TIMEOUT, task.ssh_hook.timeout)
        self.assertEqual(SSH_ID, task.ssh_hook.ssh_conn_id)

    def test_arg_checking(self):
        import os
        import six
        from airflow.exceptions import AirflowException
        conn_id = "conn_id_for_testing"
        TIMEOUT = 5
        os.environ['AIRFLOW_CONN_' + conn_id.upper()] = "ssh://test_id@localhost"
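        # Airflow resolves connections from AIRFLOW_CONN_<CONN_ID> environment
        # variables, so the line above registers a temporary SSH connection
        # named "conn_id_for_testing" for the duration of this test.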

        # Exception should be raised if neither ssh_hook nor ssh_conn_id is provided
        if six.PY2:
            self.assertRaisesRegex = self.assertRaisesRegexp
        with self.assertRaisesRegex(AirflowException,
                                    "Cannot operate without ssh_hook or ssh_conn_id."):
            task_0 = SSHOperator(task_id="test", command="echo -n airflow",
                                 timeout=TIMEOUT, dag=self.dag)
            task_0.execute(None)

        # if ssh_hook is invalid/not provided, use ssh_conn_id to create SSHHook
        task_1 = SSHOperator(
            task_id="test_1",
            ssh_hook="string_rather_than_SSHHook",  # invalid ssh_hook
            ssh_conn_id=conn_id,
            command="echo -n airflow",
            timeout=TIMEOUT,
            dag=self.dag
        )
        try:
            task_1.execute(None)
        except Exception:
            pass
        self.assertEqual(task_1.ssh_hook.ssh_conn_id, conn_id)

        task_2 = SSHOperator(
            task_id="test_2",
            ssh_conn_id=conn_id,  # no ssh_hook provided
            command="echo -n airflow",
            timeout=TIMEOUT,
            dag=self.dag
        )
        try:
            task_2.execute(None)
        except Exception:
            pass
        self.assertEqual(task_2.ssh_hook.ssh_conn_id, conn_id)

        # if both valid ssh_hook and ssh_conn_id are provided, ignore ssh_conn_id
        task_3 = SSHOperator(
            task_id="test_3",
            ssh_hook=self.hook,
            ssh_conn_id=conn_id,
            command="echo -n airflow",
            timeout=TIMEOUT,
            dag=self.dag
        )
        try:
            task_3.execute(None)
        except Exception:
            pass
        self.assertEqual(task_3.ssh_hook.ssh_conn_id, self.hook.ssh_conn_id)
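
    # NOTE: these tests reference self.dag and self.hook fixtures that are not
    # shown in this snippet. A minimal sketch of the assumed setUp follows; the
    # DAG id, owner and start_date below are illustrative placeholders, not
    # values taken from the original test module.
    def setUp(self):
        from datetime import datetime
        from airflow.models import DAG
        from airflow.contrib.hooks.ssh_hook import SSHHook

        args = {'owner': 'airflow', 'start_date': datetime(2017, 1, 1)}
        self.dag = DAG('unit_test_ssh_operator', default_args=args)
        self.hook = SSHHook(ssh_conn_id='ssh_default', timeout=10)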
Example n. 4
default_args = {
    'owner': 'Damien Ayers',
    'depends_on_past': False,  # Very important, will cause a single failure to propagate forever
    'start_date': datetime(2020, 3, 11),
    'retries': 3,
    'retry_delay': timedelta(minutes=1),
    'timeout': 3600,  # For running SSH Commands
    'params': {
        'project': 'v10',
        'queue': 'normal',
        'module': 'dea/unstable',
        'year': '2019'
    }
}

dag = DAG(
    'nci_database_backup',
    default_args=default_args,
    catchup=False,
    schedule_interval=None,
)

with dag:
    run_backup = SSHOperator(task_id='execute_daily_backup',
                             ssh_conn_id='lpgs_gadi',
                             command="""
        cd /g/data/v10/agdc/backup;
        ./trigger-daily-db-backup.sh &>> "/data/logs/nc-db-backup_$(date -d${1:-today} +%Y%m%d_%s).log"
        """)
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='primary_analysis_and_qc_processing',
          catchup=False,
          schedule_interval="@hourly",
          max_active_runs=1,
          default_args=default_args)  ## TO DO

ssh_hook = SSHHook(ssh_conn_id='cx1_ssh_conn')
orwell_ssh_hook = SSHHook(ssh_conn_id='orwell_ssh_conn')

update_exp_metadata = SSHOperator(
    task_id='update_exp_metadata',
    dag=dag,
    ssh_hook=ssh_hook,
    command=
    'bash /rds/general/user/igf/home/git_repo/IGF-cron-scripts/hpc/update_exp_metadata.sh '
)
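# The trailing space after '.sh' in the command above (and in similar commands
# below) appears deliberate: SSHOperator's command field is templated and, with
# the operator's template_ext including '.sh', a value ending in '.sh' would be
# resolved as a Jinja template file path. The trailing space is the common
# workaround described in the Airflow Common Pitfalls wiki.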

find_new_exp_for_analysis = SSHOperator(
    task_id='find_new_exp_for_analysis',
    dag=dag,
    ssh_hook=orwell_ssh_hook,
    command=
    'bash /home/igf/igf_code/IGF-cron-scripts/orwell/find_new_exp_for_analysis.sh '
)

find_new_exp_for_analysis.set_upstream(update_exp_metadata)

seed_analysis_pipeline = SSHOperator(
Example n. 6
    'depends_on_past': False,
    'start_date': datetime(2020, 2, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
}

dag = DAG('testdag',
          default_args=default_args,
          catchup=False,
          schedule_interval=None,
          template_searchpath='/home/omad/airflow/dags/templates/')

with dag:

    foo = SSHOperator(ssh_conn_id='lpgs_gadi',
                      task_id='foo',
                      remote_host='gadi-dm.nci.org.au',
                      command='env')

    failing_task = ShortCircuitSSHOperator(task_id='failing_task',
                                           ssh_conn_id='lpgs_gadi',
                                           command='false')
    should_be_skipped = DummyOperator(task_id='should_be_skipped')
    passing_task = ShortCircuitSSHOperator(task_id='passing_task',
                                           ssh_conn_id='lpgs_gadi',
                                           command='true')
    should_be_run = DummyOperator(task_id='should_be_run')
    failing_task >> should_be_skipped
    passing_task >> should_be_run

    send_email = EmailOperator(
        task_id='send_email',
dag = DAG(
    'nci_build_dea_module',
    default_args=default_args,
    schedule_interval=None,
    tags=['nci'],
)

with dag:
    build_env_task = SSHOperator(
        task_id='build_dea_module',
        ssh_conn_id='lpgs_gadi',
        command="""
        cd ~/dea-orchestration/
        git reset --hard
        git pull
        cd ~/dea-orchestration/nci_environment
        git status
        module load python3/3.7.4
        pip3 install --user pyyaml jinja2
        
        ./build_environment_module.py dea/modulespec.yaml
        """,
    )

    test_env_task = SSHOperator(task_id='test_dea_module',
                                ssh_conn_id='lpgs_gadi',
                                command="""
        cd $TMPDIR
        git clone --depth 1 https://github.com/GeoscienceAustralia/dea-notebooks
        cd dea-notebooks/Frequently_used_code/
        module load dea/$(date +%Y%m%d)  # TODO, this will fail if run over midnight...
Example n. 8
default_args = {
    'start_date': datetime(2018, 1, 1),
    'retries': 1,
}

dag = DAG(
    'example_dag',
    default_args=default_args,
    schedule_interval='0 1 * * *',
    catchup=False)

task_one = SSHOperator(
    task_id='task_one',
    ssh_conn_id='private_key',
    remote_host=ETL_HOST,
    command='command for task one {{ params.task_one_param }}',
    params={
        'task_one_param': 1
    },
    dag=dag)

subdag = SubDagOperator(
    subdag=example_subdag(
        'example_dag', 'example_subdag', default_args=default_args,
        schedule_interval=dag.schedule_interval, catchup=dag.catchup),
    task_id='subdag',
    dag=dag)

task_one >> subdag
from datetime import timedelta, datetime
from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'start_date': datetime(2019, 7, 7),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='load_property_sale_fact',
          default_args=default_args,
          schedule_interval='0 23 * * *')

t1_bash = """
/usr/local/bin/dp/database_jobs/run_py.sh "load_property_sale_fact.py"
"""

t1 = SSHOperator(ssh_conn_id='ssh_aws_ec2',
                 task_id='load_property_sale_fact',
                 command=t1_bash,
                 dag=dag)
Ingestion = BashOperator(
    task_id='Ingestion',
    bash_command="echo {{ dag_run.conf['ingestion_status'] }}",
    dag=dag,
)

VirusCheck = BashOperator(
    task_id='VirusCheck',
    bash_command="echo {{ dag_run.conf['quarantine_bucket'] }}",
    dag=dag,
)

MoveToPrecurated = SSHOperator(
    task_id='MoveToPrecurated',
    ssh_conn_id='flywheel_usw2',
    command=
    "aws s3 cp s3://{{ dag_run.conf['quarantine_bucket'] }}/ s3://{{ dag_run.conf['precurated_bucket'] }}/ --recursive",
    dag=dag,
)

FlywheelUpload = SSHOperator(
    task_id='FlywheelUpload',
    ssh_conn_id='flywheel_usw2',
    command=
    "fw ingest template s3://{{ dag_run.conf['precurated_bucket'] }}/{{ dag_run.conf['precurated_bucket_key'] }} --config-file {{ dag_run.conf['fw_template'] }} --group {{ dag_run.conf['fw_group'] }} --project {{ dag_run.conf['fw_project'] }} --cluster https://flywheel-us-sbx.science.roche.com/ingest -f",
    dag=dag,
)

SuccessNotification = BashOperator(
    task_id="SuccessNotification",
    bash_command="echo {{ dag_run.conf['email_list'] }}",
Example n. 11
# Aims to download the data at 10am Pacific
default_args = {
    "owner": "airflow",
    "depends_on_past": True,
    "wait_for_downstream": True,
    "start_date": datetime(2019, 9, 25, 17, tzinfo=utc_tz),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 3,
    "retry_delay": timedelta(hours=1),
    "task_concurrency": 1,
    # "execution_timeout": timedelta(minutes=2)
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG("water_supply_scraper",
          default_args=default_args,
          schedule_interval=timedelta(days=1))

# Yes, that is a space at the end, do not remove
# https://cwiki.apache.org/confluence/display/AIRFLOW/Common+Pitfalls
command = 'docker-compose exec -T web flask wss '

t1 = SSHOperator(ssh_conn_id='ssh_wss',
                 task_id='run_wss',
                 command=command,
                 dag=dag)
Example n. 12
default_args = {
    'owner': 'price-insight',
    'depends_on_past': False,
    'start_date': datetime(2019, 2, 2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False
}

dag = DAG('listing_stats_batch',
          default_args=default_args,
          schedule_interval='@once')

data_fetch_task = SSHOperator(
    ssh_conn_id='data_fetch_conn',
    task_id='data_fetch',
    command='cd ~/InnSight/data_fetch; ./data_fetch.sh all',
    dag=dag)

config_generation_task = SSHOperator(
    ssh_conn_id='spark_master_conn',
    task_id='config_generation',
    command='cd ~/InnSight/batch_processing; ./s3_urls_generation.sh all',
    dag=dag)

data_cleaning_task = SSHOperator(
    ssh_conn_id='spark_master_conn',
    task_id='data_cleaning',
    command='source ~/.profile; '
    'cd ~/InnSight/batch_processing; '
    '~/.local/bin/spark-submit '
Example n. 13
}

dag = DAG(
    dag_id='testSSH_zhrui',
    default_args=args,
    schedule_interval=timedelta(days=1),
    dagrun_timeout=timedelta(minutes=60),
)

sshHook = SSHHook(
    remote_host='dltsprod-worker-rsagxh.eastus.cloudapp.azure.com',
    username='******',
    key_file='/home/bitnami/.ssh/id_rsa_zhrui',
    port=31624,
    timeout=10,
    keepalive_interval=30)

t1 = SSHOperator(task_id="connectionDLTS",
                 command='mkdir fromAirflow',
                 ssh_hook=sshHook,
                 dag=dag)

t2 = SSHOperator(
    ssh_hook=sshHook,
    task_id='writeToRemote',
    command=
    'touch /tmp/test_ssh_in_airflow.txt',  # create a file at remote machine
    dag=dag)

t1 >> t2
    'timeout': 1200,  # For running SSH Commands
    'email_on_failure': True,
    'email': '*****@*****.**',
}

dag = DAG(
    'nci_build_env_module',
    default_args=default_args,
    schedule_interval=None,
    tags=['nci'],
)

with dag:
    build_env_task = SSHOperator(
        task_id='build_dea_env_module',
        ssh_conn_id='lpgs_gadi',
        command="""
        set -eux
        cd ~/dea-orchestration/
        git reset --hard
        git pull
        cd ~/dea-orchestration/nci_environment
        git status
        module load python3/3.7.4
        pip3 install --user pyyaml jinja2
        
        rm -rf /g/data/v10/public/modules/dea-env/$(date +%Y%m%d)/ /g/data/v10/public/modules/modulefiles/dea-env/$(date +%Y%m%d)
        ./build_environment_module.py dea-env/modulespec.yaml
        """,
    )
    schedule_interval=MANIFOLD_GENERATE_SITEMAP_INTERVAL)

#
# CREATE TASKS
#
# Tasks with all logic contained in a single operator can be declared here.
# Tasks with custom logic are relegated to individual Python files.
#

generate_sitemap_bash = """
sudo su - manifold bash -c \
 "cd /var/www/manifold &&\
 RAILS_ENV=production bundle exec rake sitemap:create"
"""

generate_sitemap = SSHOperator(
    task_id='generate_sitemap',
    command=generate_sitemap_bash,
    dag=MANIFOLD_GENERATE_SITEMAP_DAG,
    ssh_conn_id='AIRFLOW_CONN_MANIFOLD_SSH_INSTANCE')

post_slack = PythonOperator(task_id='slack_post_succ',
                            python_callable=slackpostonsuccess,
                            provide_context=True,
                            dag=MANIFOLD_GENERATE_SITEMAP_DAG)

#
# SET UP TASK DEPENDENCIES
#
post_slack.set_upstream(generate_sitemap)
from datetime import timedelta, datetime
from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'start_date': datetime(2019, 7, 7),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='nightly_mat_view_refresh',
          default_args=default_args,
          schedule_interval='0 11 * * *')

t1_bash = """
/usr/local/bin/dp/database_jobs/run_py.sh "refresh_mat_views_nightly.py"
"""

t1 = SSHOperator(ssh_conn_id='ssh_aws_ec2',
                 task_id='nightly_mat_view_refresh',
                 command=t1_bash,
                 dag=dag)
Example n. 17
from datetime import timedelta, datetime
from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator


default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'start_date': datetime(2019, 7, 7),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='load_rew_properties',
          default_args=default_args,
          schedule_interval='*/30 * * * *')

t1_bash = """
/usr/local/bin/dp/database_jobs/run_py.sh "execute_sql_file.py --sql_file load_rew_properties.sql --job_code load_rew_properties"
"""

t1 = SSHOperator(
    ssh_conn_id='ssh_aws_ec2',
    task_id='load_rew_properties',
    command=t1_bash,
    dag=dag)
    )
    # language="Shell Script"
    generate_list = SSHOperator(
        task_id='generate_list_of_s2_to_upload',
        # language="Shell Script"
        command=COMMON + dedent("""
        
            rm -f s3_paths_list.txt  # In case we've been run before
            for product_name in s2a_ard_granule s2b_ard_granule; do
                echo Searching for $product_name datasets.
            psql --variable=ON_ERROR_STOP=1 --csv --quiet --tuples-only --no-psqlrc \
                 -h dea-db.nci.org.au datacube <<EOF >> s3_paths_list.txt
            SELECT 's3://dea-public-data/L2/sentinel-2-nbar/S2MSIARD_NBAR/' 
                    || substring(ds.metadata#>>'{extent,center_dt}' for 10) || '/' 
                    || replace(ds.metadata#>>'{tile_id}', 'L1C', 'ARD') || '/ARD-METADATA.yaml'
                FROM agdc.dataset ds
                INNER JOIN agdc.dataset_type dst ON ds.dataset_type_ref = dst.id
                INNER JOIN agdc.dataset_location dsl ON ds.id = dsl.dataset_ref
                WHERE dst.name='$product_name'
                  AND ds.added BETWEEN '{{ prev_execution_date }}' AND '{{ execution_date }}';
            EOF
            done

            echo -n Num Datasets to upload: 
            wc -l s3_paths_list.txt
        
        """),
        remote_host='gadi-dm.nci.org.au',
        timeout=20 * MINUTES,
    )

    # Execute script to upload sentinel-2 data to s3 bucket
Example n. 19
# # For month-type tasks, the dag_id needs to be changed accordingly
# dag = DAG(
#     dag_id='airflow_pyspark_template_week',
#     default_args=args,
#     schedule_interval='50 2 1 * *',
#     dagrun_timeout=timedelta(minutes=60),
# )
# The task_id also needs to be changed to describe the corresponding task
day_partition = SSHOperator(
    ssh_conn_id='ws@hdp-0',
    task_id='device_filmora_log_day_partition',
    command=
    " cd /usr/local/bigdata/jobtaskh0/pythonjob/pyspark_template/ && spark-submit \
                --num-executors 4 \
                --executor-memory 4G \
                --executor-cores 4 \
                --driver-memory 4G \
                --driver-cores 4 \
                --jars /usr/hdp/3.0.1.0-187/spark2/jars/mysql-connector-java-5.1.47.jar \
                --driver-class-path /usr/hdp/3.0.1.0-187/spark2/jars/mysql-connector-java-5.1.47.jar \
                /usr/local/bigdata/jobtaskh0/pythonjob/uos/uid_label/device_filmora_log.py  \
                day \
                {{ ds_nodash }} ",
    dag=dag)

# --num-executors 50 \
# --executor-memory 4G \
# --executor-cores 4 \
# --driver-memory 1G \
# --driver-cores 4 \
# --conf spark.default.parallelism=1000\
# --conf spark.storage.memoryFraction=0.5\
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow.utils.dates import days_ago
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
import halo_variables

environment_default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    #"start_date": date(2020, 7, 24),
    "start_date": days_ago(1),
    "retries": 0,
    "schedule_interval": "* */2 * * *",
}

dag = DAG("customerfacingservice_spark_submit",
          default_args=environment_default_args,
          concurrency=5,
          max_active_runs=1)

create_command = "/home/airflow/wifi_uc/cfs.sh "

task1 = SSHOperator(ssh_conn_id="ssh_dev_conn",
                    command=create_command,
                    task_id="spark_ssh",
                    dag=dag)
Example n. 21
	executor_cores = row['EXECUTOR_CORES']
	executor_mem = row['EXECUTOR_MEM']
	num_executor = row['NUM_EXECUTOR']
	additional_param = row['ADDITIONAL_PARAM']
	dependencies = row['DEPENDENCIES'].split('|')
	partitions = row['PARTITIONS']
	
	for e in etl_task_type:
		if e == 'EXTRACT':
			script_loc = etl_task_type_df.loc[etl_task_type_df['ETL_TASK_TYPE'].str.contains('EXTRACT'), 'SCRIPT_LOC'][0]
			script_name = etl_task_type_df.loc[etl_task_type_df['ETL_TASK_TYPE'].str.contains('EXTRACT'), 'SCRIPT_NAME'][0]
			complete_script_path = script_loc+script_name
			
			t3 = SSHOperator(
					ssh_conn_id=ssh_conn_id,
					task_id=str(table_name)+'_'+str(e),					
					command= 'spark-submit --num-executors '+str(partitions)+' '+complete_script_path+' '+table_name+' '+str(partitions), 	
					dag=dag)
			t3 >> t1
		
		if e == 'MERGE':
			script_loc = etl_task_type_df['SCRIPT_LOC'][1]
			script_name = etl_task_type_df['SCRIPT_NAME'][1]
			complete_script_path = script_loc+script_name
			
			t3 = SSHOperator(
					ssh_conn_id=ssh_conn_id,
					task_id=str(table_name)+'_'+str(e),					
					command= 'spark-submit --num-executors '+str(num_executor)+' --executor-cores '+str(executor_cores)+' --executor-memory '+executor_mem+' --driver-memory '+driver_mem+' --driver-cores '+str(driver_cores)+' '+complete_script_path+' '+table_name  , 	
					dag=dag)
					
docker_compose_file = '/home/ubuntu/config/docker-compose-hrrr.yml'
config_file = '/code/config/hrrr_dates.ini'

# run_weather_forecast_retrieval to get data from NOAA
hrrr = HRRR(docker_compose_file, config_file)
command = hrrr.get_compose_command()

# convert_grib2nc command for conversion
grib2nc = hrrr.get_compose_grib2nc()

# upload to swift
swift = HRRR('/home/ubuntu/config/docker-compose-swift.yml', config_file)
upload_swift = swift.get_compose_upload_swift()

t1 = SSHOperator(ssh_conn_id='ssh_hrrr',
                 task_id='run_hrrr_retrieval_dates',
                 command=command,
                 dag=dag)

t2 = SSHOperator(ssh_conn_id='ssh_hrrr',
                 task_id='convert_grib2nc',
                 command=grib2nc,
                 dag=dag)

t3 = SSHOperator(ssh_conn_id='ssh_hrrr',
                 task_id='upload_swift',
                 command=upload_swift,
                 dag=dag)

t1.set_downstream(t2)
t2.set_downstream(t3)
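# The set_downstream calls above are equivalent to the bitshift form:
#     t1 >> t2 >> t3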
        datestring=$(date +%Y%m%d)
        file_prefix="${host}-${datestring}"
    ''')

    run_backup = SSHOperator(
        task_id='run_backup',
        command=COMMON + dedent("""
            args="-U agdc_backup -h ${host} -p 5432"

            set -x

            # Cleanup previous failures
            rm -rf "${file_prefix}"*-datacube-partial.pgdump

            # Dump
            pg_dump ${args} guest > "${file_prefix}-guest.sql"
            pg_dump ${args} datacube -n agdc -T 'agdc.dv_*' -F c -f "${file_prefix}-datacube-partial.pgdump"
            mv -v "${file_prefix}-datacube-partial.pgdump" "${file_prefix}-datacube.pgdump"

            # The globals technically contain (weakly) hashed pg user passwords, so we'll
            # tighten permissions.  (This shouldn't really matter, as users don't choose
            # their own passwords and they're long random strings, but anyway)
            umask 066
            pg_dumpall ${args} --globals-only > "${file_prefix}-globals.sql"

        """),
    )

    aws_conn = AwsHook(aws_conn_id='aws_nci_db_backup')
    upload_to_s3 = SSHOperator(task_id='upload_to_s3',
                               params=dict(aws_conn=aws_conn),
Example n. 24
    dag=dag)

# Preliminary step for running commands remotely on the EMR cluster.
# Not entirely sure why, but an initial SSH connection had to be made once before the Airflow SSHOperator could connect.
t2 = BashOperator(
    task_id="emr_ssh_connect",
    bash_command="""bash {}/shell_script/emr_ssh_connect.sh {} {}""".format(
        project_home, ip_address, ip_domain),
    dag=dag)

# To reach Hadoop's HDFS, the main IP must be used rather than the secondary IP, so the secondary IP is forwarded to the main IP.
# Forward the EMR cluster's secondary IP to the main IP
t3 = SSHOperator(
    task_id="ip_forwarding",
    command=
    """(echo $(sudo ifconfig eth0 | grep 'inet addr' | cut -d: -f2 | awk '{{ print $1 }}') " {}") | sudo tee -a /etc/hosts"""
    .format(ip_domain),
    ssh_hook=sshHook,
    dag=dag)

# Move the training data from S3 to the EMR cluster's HDFS.
t4 = SSHOperator(
    task_id="traindata_s3_to_hdfs",
    command=
    """s3-dist-cp --src s3://jhw620/RefineData/ --dest hdfs://{}:8020/data/ --srcPattern .*[^_\$folder\$]$"""
    .format(ip_domain),
    ssh_hook=sshHook,
    dag=dag)

# Train the model
t5 = BashOperator(task_id="training_model",
Example n. 25
#
# CREATE TASKS
#
# Tasks with all logic contained in a single operator can be declared here.
# Tasks with custom logic are relegated to individual Python files.
#

sync_hours_bash = """
sudo su - manifold bash -c \
 "cd /var/www/manifold &&\
 RAILS_ENV=production bundle exec rake sync:hours"
"""

sync_hours = SSHOperator(
    task_id='sync_hours',
    command=sync_hours_bash,
    dag=MANIFOLD_HOURS_SYNC_DAG,
    ssh_conn_id='AIRFLOW_CONN_MANIFOLD_SSH_INSTANCE'
)

post_slack = PythonOperator(
    task_id='slack_post_succ',
    python_callable=slackpostonsuccess,
    provide_context=True,
    dag=MANIFOLD_HOURS_SYNC_DAG
)

#
# SET UP TASK DEPENDENCIES
#
post_slack.set_upstream(sync_hours)
Example n. 26
 submit_ard = SSHOperator(
     task_id=submit_task_id,
     command=COMMON + """
     mkdir -p {{ params.base_dir }}{{ work_ext }}
     mkdir -p {{ params.base_dir }}{{ log_ext }}
     qsub -N ard_scene_select \
           -q  {{ params.queue }}  \
           -W umask=33 \
           -l wd,walltime=0:30:00,mem=15GB,ncpus=1 -m abe \
           -l storage=gdata/v10+scratch/v10+gdata/if87+gdata/fj7+scratch/fj7 \
           -P  {{ params.project }} -o {{ params.base_dir }}{{ log_ext }} -e {{ params.base_dir }}{{ log_ext }}  \
           -- /bin/bash -l -c \
               "module use /g/data/v10/public/modules/modulefiles/; \
               module use /g/data/v10/private/modules/modulefiles/; \
               module load {{ params.module_ass }}; \
               ard-scene-select \
             {{ params.products_arg }} \
             {{ params.config_arg }} \
               --workdir {{ params.base_dir }}{{ work_ext }} \
               --pkgdir {{ params.pkgdir_arg }} \
               --logdir {{ params.base_dir }}{{ log_ext }} \
               --env {{ params.wagl_env }}  \
               --project {{ params.project }} \
               --walltime 02:30:00 \
               {{ params.index_arg }} \
               {{ params.scene_limit }} \
               {{ params.interim_days_wait }} \
               {{ params.days_to_exclude_arg }} \
               {{ params.run_ard_arg }} "
     """,
     timeout=60 * 20,
     do_xcom_push=True,
 )
Example n. 27
# argument file variables paths
spark_script = 'main.py'
json = 'review_and_evaluation_config.json'
logFile = 'csdr_status.log'

scriptFile = '/opt/scripts/mig/csdr/csdr_xml_validation/app/{}'.format(spark_script)
xmlFilePath = 'hdfs://migration/data/raw/csdr/settlement_internalisation/to_process/'
jsonPath = '/opt/scripts/csdr/mig/conf/{}'.format(json)
logPath = 'hdfs://migration/data/raw/csdr/stage_status/{}'.format(logFile)

xml_validation = 'sudo python {} -f {} -j {} -l {}'.format(scriptFile, xmlFilePath, jsonPath, logPath)

args = {
    'owner':'Airflow',
    'start_date': days_ago(1)
}

with DAG(dag_id='mig_csdr_re_xml_validation_spark_xml_process', description='CSDR xml re validation spark process', default_args=args, schedule_interval='* * */1 * *') as dag:

    start = DummyOperator(
        task_id='start'
    )

    post_ingestion = SSHOperator(
        ssh_conn_id='zaloni',
        task_id='esml_feedback_process',
        command=xml_validation
    )

    start >> post_ingestion #>> re_validate_xml
# Default DAG parameters
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dt(2020, 3, 23),
    'retries': 0
}

dag = DAG('variable_example',
          default_args=default_args,
          schedule_interval='30 07 * * *')

url_awscli = Variable.get("url_awscli")
directory_dest = Variable.get("directory_dest")

# Install aws CLI in ssh
cmd = """
mkdir -p {} && \
curl "{}"  -o "/tmp/awscli.zip" && \
unzip /tmp/awscli.zip -d {} && \
sudo {}aws/install && \
rm /tmp/awscli.zip && \
aws emr create-default-roles
""".format(directory_dest, url_awscli, directory_dest, directory_dest)

install_aws = SSHOperator(ssh_conn_id='adaltas_ssh',
                          task_id='install_aws',
                          command=cmd,
                          dag=dag)
        else:
            external_dag_id = 'nci_fractional_cover'
            external_task_id = f'wait_for_{product}'
        processing_completed = ExternalTaskSensor(
            task_id=f'processing_completed_{product}',
            external_dag_id=external_dag_id,
            external_task_id=external_task_id,
            mode='reschedule',
            timeout=1 * DAYS,
        )

        download_s3_inventory = SSHOperator(
            task_id=f'download_s3_inventory_{product}',
            command=COMMON + dedent('''
                mkdir -p {{work_dir}}

                dea-cogger save-s3-inventory --product-name "{{ params.product }}" --output-dir "{{work_dir}}"
            '''),
            params={'product': product},
        )
        generate_work_list = SSHOperator(
            task_id=f'generate_work_list_{product}',
            command=COMMON + dedent("""
                cd {{work_dir}}

                dea-cogger generate-work-list --product-name "{{params.product}}" \\
                 --output-dir "{{work_dir}}" --s3-list  "{{params.product}}_s3_inv_list.txt" \\
                 --time-range "time in [2019-01-01, 2025-12-31]"
            """),
            # --time-range "time in [{{prev_ds}}, {{ds}}]"
            timeout=2 * HOURS,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='seqrun_processing',
          catchup=False,
          schedule_interval="@hourly",
          max_active_runs=1,
          default_args=default_args)  ## TO DO

ssh_hook = SSHHook(ssh_conn_id='orwell_ssh_conn')
cx1_ssh_hook = SSHHook(ssh_conn_id='cx1_ssh_conn')

switch_off_project_barcode = SSHOperator(
    task_id='switch_off_project_barcode',
    dag=dag,
    ssh_hook=ssh_hook,
    command=
    'bash /home/igf/igf_code/IGF-cron-scripts/orwell/switch_off_project_barcode_check.sh '
)

change_samplesheet_for_run = SSHOperator(
    task_id='change_samplesheet_for_run',
    dag=dag,
    ssh_hook=ssh_hook,
    command=
    'bash /home/igf/igf_code/IGF-cron-scripts/orwell/change_samplesheet_for_seqrun.sh '
)

change_samplesheet_for_run.set_upstream(switch_off_project_barcode)

restart_seqrun_processing = SSHOperator(
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'adhoc':False,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}

dag = DAG(dag_id='ssh_airflow_test',
          default_args=args,
          schedule_interval='*/1 * * * *',
          catchup=False)

bash_command = """python /Users/zaferdurkut/test/dizin1/ssh_test.py"""
ssh_hook = SSHHook(username=os.getenv('SSH_USER'),
                   password=os.getenv('SSH_PASSWORD'),
                   remote_host=os.getenv('SSH_HOST'))

ssh_task = SSHOperator(task_id='ssh_airflow_test_task',
                       ssh_hook=ssh_hook,
                       command=bash_command,
                       dag=dag)

ssh_task
Example n. 32
from datetime import datetime
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.operators.ssh_operator import SSHOperator

dag = DAG(dag_id='test_run_hadoop',
          start_date=datetime(2021, 1, 1),
          schedule_interval=None)

cmd_hdfs_ls = """/usr/local/bin/hdfs dfs -ls"""

#cmd_hdfs_ls="""pwd && ls"""

start = DummyOperator(task_id='start', dag=dag)

end = DummyOperator(task_id='end', dag=dag)

hdfs_ls = SSHOperator(task_id='hdfs_ls',
                      command=cmd_hdfs_ls,
                      ssh_conn_id='local_ssh_default',
                      retries=1,
                      dag=dag)

start >> hdfs_ls >> end