Example #1
    def test_file_transfer_no_intermediate_dir_error_get(self):
        test_remote_file_content = \
            "This is remote file content \n which is also multiline " \
            "another line here \n this is last line. EOF"

        # create a test file remotely
        create_file_task = SSHOperator(task_id="test_create_file",
                                       ssh_hook=self.hook,
                                       command="echo '{0}' > {1}".format(
                                           test_remote_file_content,
                                           self.test_remote_filepath),
                                       do_xcom_push=True,
                                       dag=self.dag)
        self.assertIsNotNone(create_file_task)
        ti1 = TaskInstance(task=create_file_task,
                           execution_date=timezone.utcnow())
        ti1.run()

        # Try to GET test file from remote
        # This should raise an error with "No such file" as the directory
        # does not exist
        with self.assertRaises(Exception) as error:
            get_test_task = SFTPOperator(
                task_id="test_sftp",
                ssh_hook=self.hook,
                local_filepath=self.test_local_filepath_int_dir,
                remote_filepath=self.test_remote_filepath,
                operation=SFTPOperation.GET,
                dag=self.dag)
            self.assertIsNotNone(get_test_task)
            ti2 = TaskInstance(task=get_test_task,
                               execution_date=timezone.utcnow())
            ti2.run()
        self.assertIn('No such file', str(error.exception))
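For contrast, a minimal sketch (not part of the original test; it reuses the same fixture attributes, and the task_id is made up) of the same GET succeeding when the operator is allowed to create the missing local directories:

        # Hypothetical follow-up: with create_intermediate_dirs=True the operator
        # creates the missing local directory tree before writing the file.
        get_with_dirs_task = SFTPOperator(
            task_id="test_sftp_with_dirs",
            ssh_hook=self.hook,
            local_filepath=self.test_local_filepath_int_dir,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.GET,
            create_intermediate_dirs=True,
            dag=self.dag)
        TaskInstance(task=get_with_dirs_task,
                     execution_date=timezone.utcnow()).run()

The PUT case in Example #2 is symmetrical: with create_intermediate_dirs=True the missing directories are created on the remote host instead.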
Example #2
    def test_file_transfer_no_intermediate_dir_error_put(self):
        test_local_file_content = \
            b"This is local file content \n which is multiline " \
            b"continuing....with other character\nanother line here \n this is last line"
        # create a test file locally
        with open(self.test_local_filepath, 'wb') as f:
            f.write(test_local_file_content)

        # Try to put test file to remote
        # This should raise an error with "No such file" as the directory
        # does not exist
        with self.assertRaises(Exception) as error:
            put_test_task = SFTPOperator(
                task_id="test_sftp",
                ssh_hook=self.hook,
                local_filepath=self.test_local_filepath,
                remote_filepath=self.test_remote_filepath_int_dir,
                operation=SFTPOperation.PUT,
                create_intermediate_dirs=False,
                dag=self.dag)
            self.assertIsNotNone(put_test_task)
            ti2 = TaskInstance(task=put_test_task,
                               execution_date=timezone.utcnow())
            ti2.run()
        self.assertIn('No such file', str(error.exception))
Example #3
    def test_pickle_file_transfer_get(self):
        test_remote_file_content = \
            "This is remote file content \n which is also multiline " \
            "another line here \n this is last line. EOF"

        # create a test file remotely
        create_file_task = SSHOperator(task_id="test_create_file",
                                       ssh_hook=self.hook,
                                       command="echo '{0}' > {1}".format(
                                           test_remote_file_content,
                                           self.test_remote_filepath),
                                       do_xcom_push=True,
                                       dag=self.dag)
        self.assertIsNotNone(create_file_task)
        ti1 = TaskInstance(task=create_file_task,
                           execution_date=timezone.utcnow())
        ti1.run()

        # get remote file to local
        get_test_task = SFTPOperator(task_id="test_sftp",
                                     ssh_hook=self.hook,
                                     local_filepath=self.test_local_filepath,
                                     remote_filepath=self.test_remote_filepath,
                                     operation=SFTPOperation.GET,
                                     dag=self.dag)
        self.assertIsNotNone(get_test_task)
        ti2 = TaskInstance(task=get_test_task,
                           execution_date=timezone.utcnow())
        ti2.run()

        # test the received content
        content_received = None
        with open(self.test_local_filepath, 'r') as f:
            content_received = f.read()
        self.assertEqual(content_received.strip(), test_remote_file_content)
Example #4
    def test_pickle_file_transfer_put(self):
        test_local_file_content = \
            b"This is local file content \n which is multiline " \
            b"continuing....with other character\nanother line here \n this is last line"
        # create a test file locally
        with open(self.test_local_filepath, 'wb') as f:
            f.write(test_local_file_content)

        # put test file to remote
        put_test_task = SFTPOperator(task_id="test_sftp",
                                     ssh_hook=self.hook,
                                     local_filepath=self.test_local_filepath,
                                     remote_filepath=self.test_remote_filepath,
                                     operation=SFTPOperation.PUT,
                                     create_intermediate_dirs=True,
                                     dag=self.dag)
        self.assertIsNotNone(put_test_task)
        ti2 = TaskInstance(task=put_test_task,
                           execution_date=timezone.utcnow())
        ti2.run()

        # check the remote file content
        check_file_task = SSHOperator(task_id="test_check_file",
                                      ssh_hook=self.hook,
                                      command="cat {0}".format(
                                          self.test_remote_filepath),
                                      do_xcom_push=True,
                                      dag=self.dag)
        self.assertIsNotNone(check_file_task)
        ti3 = TaskInstance(task=check_file_task,
                           execution_date=timezone.utcnow())
        ti3.run()
        self.assertEqual(
            ti3.xcom_pull(task_ids='test_check_file',
                          key='return_value').strip(), test_local_file_content)
Example #5
COMMON = dedent("""
        {% set work_dir = '/g/data/v10/work/c3_upload_s3/' + params.product  +'/' + ts_nodash -%}
        """)
# List all the scenes to be uploaded to S3 bucket
list_scenes = SSHOperator(
    task_id=f"list_{product}_scenes",
    ssh_conn_id="lpgs_gadi",
    command=COMMON + LIST_SCENES_COMMAND,
    params={"product": product},
    do_xcom_push=False,
)
# Uploading c3_to_s3_rolling.py script to NCI
sftp_c3_to_s3_script = SFTPOperator(
    task_id=f"sftp_c3_to_s3_script_{product}",
    local_filepath=str(Path(configuration.get("core", "dags_folder")).parent / "scripts/c3_to_s3_rolling.py"),
    remote_filepath=f"{WORK_DIR}/c3_to_s3_rolling.py",
    operation=SFTPOperation.PUT,
    create_intermediate_dirs=True,
)
# Execute script to upload Landsat collection 3 data to s3 bucket
aws_hook = AwsHook(aws_conn_id=dag.default_args["aws_conn_id"])
execute_c3_to_s3_script = SSHOperator(
    task_id=f"execute_c3_to_s3_script_{product}",
    command=COMMON + RUN_UPLOAD_SCRIPT,
    remote_host="gadi-dm.nci.org.au",
    params={
        "aws_hook": aws_hook,
        "product": product,
        "nci_dir": "/g/data/xu18/ga/",
    },
)
Example #6
    def test_arg_checking(self):
        # Exception should be raised if neither ssh_hook nor ssh_conn_id is provided
        with self.assertRaisesRegex(
                AirflowException,
                "Cannot operate without ssh_hook or ssh_conn_id."):
            task_0 = SFTPOperator(task_id="test_sftp",
                                  local_filepath=self.test_local_filepath,
                                  remote_filepath=self.test_remote_filepath,
                                  operation=SFTPOperation.PUT,
                                  dag=self.dag)
            task_0.execute(None)

        # if ssh_hook is invalid/not provided, use ssh_conn_id to create SSHHook
        task_1 = SFTPOperator(
            task_id="test_sftp",
            ssh_hook="string_rather_than_SSHHook",  # invalid ssh_hook
            ssh_conn_id=TEST_CONN_ID,
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.PUT,
            dag=self.dag)
        try:
            task_1.execute(None)
        except Exception:
            pass
        self.assertEqual(task_1.ssh_hook.ssh_conn_id, TEST_CONN_ID)

        task_2 = SFTPOperator(
            task_id="test_sftp",
            ssh_conn_id=TEST_CONN_ID,  # no ssh_hook provided
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.PUT,
            dag=self.dag)
        try:
            task_2.execute(None)
        except Exception:
            pass
        self.assertEqual(task_2.ssh_hook.ssh_conn_id, TEST_CONN_ID)

        # if both valid ssh_hook and ssh_conn_id are provided, ignore ssh_conn_id
        task_3 = SFTPOperator(task_id="test_sftp",
                              ssh_hook=self.hook,
                              ssh_conn_id=TEST_CONN_ID,
                              local_filepath=self.test_local_filepath,
                              remote_filepath=self.test_remote_filepath,
                              operation=SFTPOperation.PUT,
                              dag=self.dag)
        try:
            task_3.execute(None)
        except Exception:
            pass
        self.assertEqual(task_3.ssh_hook.ssh_conn_id, self.hook.ssh_conn_id)
Example #7
    'nbart_s2b_fix_metadata_v2',
    doc_md=__doc__,
    default_args=default_args,
    catchup=True,
    schedule_interval='@daily',
    max_active_runs=4,
    default_view='tree',
    tags=['nci', 'sentinel_2'],
)

with dag:
    # Uploading s2_to_s3_rolling.py script to NCI
    upload_uploader_script = SFTPOperator(
        task_id="upload_uploader_script",
        local_filepath=str(
            Path(configuration.get('core', 'dags_folder')).parent /
            "scripts/s2b_fix_metadata.py"),
        remote_filepath=WORK_DIR + "/{{ds}}/s2b_fix_metadata.py",
        operation=SFTPOperation.PUT,
        create_intermediate_dirs=True)

    upload_utils = SFTPOperator(
        task_id="upload_utils",
        local_filepath=str(
            Path(configuration.get('core', 'dags_folder')).parent /
            "scripts/c3_to_s3_rolling.py"),
        remote_filepath=WORK_DIR + "/{{ds}}/c3_to_s3_rolling.py",
        operation=SFTPOperation.PUT,
        create_intermediate_dirs=True)

    # Execute script to upload sentinel-2 data to s3 bucket
    aws_hook = AwsHook(aws_conn_id=dag.default_args['aws_conn_id'])
Example #8
    def test_arg_checking(self):
        from airflow.exceptions import AirflowException
        conn_id = "conn_id_for_testing"
        os.environ['AIRFLOW_CONN_' + conn_id.upper()] = "ssh://test_id@localhost"

        # Exception should be raised if neither ssh_hook nor ssh_conn_id is provided
        if six.PY2:
            self.assertRaisesRegex = self.assertRaisesRegexp
        with self.assertRaisesRegex(AirflowException,
                                    "Cannot operate without ssh_hook or ssh_conn_id."):
            task_0 = SFTPOperator(
                task_id="test_sftp",
                local_filepath=self.test_local_filepath,
                remote_filepath=self.test_remote_filepath,
                operation=SFTPOperation.PUT,
                dag=self.dag
            )
            task_0.execute(None)

        # if ssh_hook is invalid/not provided, use ssh_conn_id to create SSHHook
        task_1 = SFTPOperator(
            task_id="test_sftp",
            ssh_hook="string_rather_than_SSHHook",  # invalid ssh_hook
            ssh_conn_id=conn_id,
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.PUT,
            dag=self.dag
        )
        try:
            task_1.execute(None)
        except Exception:
            pass
        self.assertEqual(task_1.ssh_hook.ssh_conn_id, conn_id)

        task_2 = SFTPOperator(
            task_id="test_sftp",
            ssh_conn_id=conn_id,  # no ssh_hook provided
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.PUT,
            dag=self.dag
        )
        try:
            task_2.execute(None)
        except Exception:
            pass
        self.assertEqual(task_2.ssh_hook.ssh_conn_id, conn_id)

        # if both valid ssh_hook and ssh_conn_id are provided, ignore ssh_conn_id
        task_3 = SFTPOperator(
            task_id="test_sftp",
            ssh_hook=self.hook,
            ssh_conn_id=conn_id,
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.PUT,
            dag=self.dag
        )
        try:
            task_3.execute(None)
        except Exception:
            pass
        self.assertEqual(task_3.ssh_hook.ssh_conn_id, self.hook.ssh_conn_id)
Example #9
        default_args=default_args,
        catchup=False) as dag:
    
    query_task = PythonOperator(
                task_id='make_query',
                python_callable=make_query,
                retries=2,
                retry_delay=datetime.timedelta(minutes=1),
                provide_context=True,
                )

    put_task = SFTPOperator(
                task_id='put_sftp_nifi',
                ssh_conn_id='ssh_nifi_prod',
                local_filepath=f'/home/airflow/gcs/data/{dag.dag_id}/make_query.json',
                remote_filepath='/tmp/make_query.json',
                retries=2,
                retry_delay=datetime.timedelta(minutes=10),
                operation='put',
                create_intermediate_dirs=True
                )
    
    verify_task = ReverseSFTPSensor(
                task_id='is_processed',
                path='/tmp/make_query.json',
                sftp_conn_id='ssh_nifi_prod',
                poke_interval=60*5,
                mode='reschedule'
                )

    query_task >> put_task >> verify_task
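ReverseSFTPSensor is a custom sensor rather than a stock Airflow class, and its implementation is not shown in this example. A minimal sketch of how it could be written, assuming it simply waits for NiFi to consume (and delete) the uploaded file:

from airflow.contrib.sensors.sftp_sensor import SFTPSensor


class ReverseSFTPSensor(SFTPSensor):
    """Hypothetical sketch: succeeds once the remote path no longer exists."""

    def poke(self, context):
        # SFTPSensor.poke() returns True while the file is still present;
        # invert it so the sensor completes when the file has been removed.
        return not super(ReverseSFTPSensor, self).poke(context)

With mode='reschedule', as in the DAG above, the worker slot is released between pokes while the sensor waits for the file to disappear.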
Example #10
            'securityApp.log',
            'mainApp.log',
            'extApp.log',
            'timeApp.log',
            'tokenApp.log',
            'bridgeApp.log',
            'daemonApp.log',
            'notificationApp.log',
            'messageApp.log']

dl_tasks = []
for file in log_list:
    op = SFTPOperator(task_id=f"download_{file}",
                ssh_conn_id="log_server",
                local_filepath=f"{base_folder}/{file}",
                remote_filepath=f"{remote_path}/{file}",
                operation=SFTPOperation.GET,
                create_intermediate_dirs=True,
                dag=dag)
    dl_tasks.append(op)


bash_command = """
    grep -E 'Exception' --include=\\*.log -rnw '{{ params.base_folder }}' > {{ params.base_folder }}/errors.txt
    ls -l {{ params.base_folder }}/errors.txt && cat {{ params.base_folder }}/errors.txt
"""
grep_exception = BashOperator(task_id="grep_exception",
                        bash_command=bash_command,
                        params={'base_folder': base_folder},
                        dag=dag)
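The snippet leaves the dependency between the download tasks and grep_exception implicit; one plausible wiring (an assumption, not shown in the source) is:

# Assumed ordering: grep the logs only after every download task has finished.
dl_tasks >> grep_exception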
Example #11
sshHook = SSHHook(ssh_conn_id="ssh_default")

# zip all necessary python code to send to EMR
zip_executables = BashOperator(
    task_id="prepare_executable_files",
    bash_command=
    "zip -r /home/workspace/executables.zip /home/workspace/awsc/ /home/workspace/config.cfg /home/workspace/main.py",
    dag=dag,
)

# # Copy files from Docker container folder to the EMR Spark directory
deploy_job = SFTPOperator(
    task_id="deploy",
    ssh_hook=sshHook,
    local_filepath="/home/workspace/executables.zip",
    remote_filepath="/home/hadoop/executables.zip",
    operation="put",
    create_intermediate_dirs=True,
    dag=dag,
    confirm=False,
)

#
ETL_jobs_Operator = SSHOperator(
    task_id="ETLjob",
    command="""
    unzip -o executables.zip &&
    cd home/workspace/ &&
    zip -r awsc.zip awsc  &&
    /usr/bin/spark-submit --py-files awsc.zip --master yarn main.py;
    """,
    ssh_hook=sshHook,
Example #12
File: dag.py  Project: xyzlat/whirl
    df.to_sql(
        name="users",
        if_exists='replace',
        con=hook.get_sqlalchemy_engine()
    )


dag = DAG(dag_id='sftp-mock-file-to-mysql',
          default_args=default_args,
          schedule_interval='@daily',
          dagrun_timeout=timedelta(seconds=120))

sftp = SFTPOperator(
    task_id='fetch_csv_from_sftp',
    ssh_conn_id='ftp_server',
    local_filepath='/tmp/latest_users.csv',
    remote_filepath='/working-dir/{file}'.format(file=FILE),
    operation=SFTPOperation.GET,
    dag=dag
)

# the latest released version of Airflow does not do templating
# in the op_args. This has recently been fixed though:
# https://github.com/apache/airflow/pull/4691
# For now, instead of depending on the templated 'file' variable
# we'll use a static name for the file to load into mysql
# (a templated variant is sketched after this example)
csv_to_mysql = PythonOperator(
    task_id='ingest_csv_into_mysql',
    python_callable=ingest_csv_into_mysql,
    op_args=['/tmp/latest_users.csv'],
    dag=dag
)
Example #13
}

dag_name = 'dga_time_series'
file_names = ['sample_submission', 'test', 'train']

dag = DAG(dag_id=dag_name, default_args=default_args, schedule_interval=None)

start = DummyOperator(task_id="start", provide_context=True, dag=dag)
hito_files_hdfs = DummyOperator(task_id="hito_files_hdfs", dag=dag)

for file_name in file_names:
    print("Filename: " + file_name)
    sftp_file_to_container_hdfs = SFTPOperator(
        task_id='pass_' + file_name + '_to_docker_hdfs',
        ssh_conn_id="ssh_default",
        local_filepath="/usr/local/spark/resources/{0}.csv".format(file_name),
        remote_filepath="/hadoop/data/{0}.csv".format(file_name),
        operation="put",
        dag=dag)

    put_file_in_hdfs = SSHOperator(
        task_id='put_' + file_name + '_in_hdfs',
        ssh_conn_id="ssh_default",
        command=
        " cd /hadoop/bin && ./hdfs dfs -test -e /{0}.csv; if [ `echo $?` -gt 0 ]; then ./hdfs dfs -put /hadoop/data/{0}.csv /; fi"
        .format(file_name),
        dag=dag)

    start >> sftp_file_to_container_hdfs >> put_file_in_hdfs >> hito_files_hdfs

entrenamiento_modelo = SparkSubmitOperator(
Example #14
    'nci_s2_upload_s3_v2',
    doc_md=__doc__,
    default_args=default_args,
    catchup=True,
    schedule_interval='@daily',
    max_active_runs=4,
    default_view='tree',
    tags=['nci', 'sentinel_2'],
)

with dag:
    # Uploading s2_to_s3_rolling.py script to NCI
    upload_uploader_script = SFTPOperator(
        task_id="upload_uploader_script",
        local_filepath=str(Path(configuration.get('core', 'dags_folder')).parent / "scripts/upload_s2.py"),
        remote_filepath="/g/data/v10/work/s2_nbar_rolling_archive/{{ds}}/upload_s2.py",
        operation=SFTPOperation.PUT,
        create_intermediate_dirs=True
    )
    # language="Shell Script"
    generate_list = SSHOperator(
        task_id='generate_list_of_s2_to_upload',
        # language="Shell Script"
        command=COMMON + dedent("""
        
            rm -f s3_paths_list.txt  # In case we've been run before
            for product_name in s2a_ard_granule s2b_ard_granule; do
                echo Searching for $product_name datasets.
            psql --variable=ON_ERROR_STOP=1 --csv --quiet --tuples-only --no-psqlrc \
                 -h dea-db.nci.org.au datacube <<EOF >> s3_paths_list.txt
            SELECT 's3://dea-public-data/L2/sentinel-2-nbar/S2MSIARD_NBAR/' 
Example #15
    False,
    'email_on_retry':
    False,
    'retries':
    0,
}
dag = DAG(DAG_NAME,
          schedule_interval=None,
          default_args=default_args,
          catchup=False)

sftp = SFTPOperator(
    task_id='sftp',
    # a connection with the remote host details must be registered in Airflow beforehand
    # (see the sketch after this example)
    ssh_conn_id='test_connection',
    # remote_host='',
    local_filepath='/tmp/test/test.txt',
    remote_filepath='/home/ubuntu/test.txt',
    operation='GET',
    # confirm=,
    create_intermediate_dirs=True,
    dag=dag,
)

python_sftp = PythonOperator(
    task_id='python_sftp',
    python_callable=utils.sftp,
    provide_context=True,
    dag=dag,
)
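The comment on the sftp task above notes that the 'test_connection' connection must already be registered in Airflow. A minimal sketch of supplying it through an environment variable (host and credentials are placeholders), the same mechanism the test_arg_checking examples in this collection use:

import os

# Airflow resolves AIRFLOW_CONN_<CONN_ID> (the connection id upper-cased)
# from the environment as a connection URI.
os.environ['AIRFLOW_CONN_TEST_CONNECTION'] = 'ssh://ubuntu:my_password@my-remote-host:22'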
Example #16
    def test_arg_checking(self):
        from airflow.exceptions import AirflowException
        conn_id = "conn_id_for_testing"
        os.environ['AIRFLOW_CONN_' +
                   conn_id.upper()] = "ssh://test_id@localhost"

        # Exception should be raised if neither ssh_hook nor ssh_conn_id is provided
        if six.PY2:
            self.assertRaisesRegex = self.assertRaisesRegexp
        with self.assertRaisesRegex(
                AirflowException,
                "Cannot operate without ssh_hook or ssh_conn_id."):
            task_0 = SFTPOperator(task_id="test_sftp",
                                  local_filepath=self.test_local_filepath,
                                  remote_filepath=self.test_remote_filepath,
                                  operation=SFTPOperation.PUT,
                                  dag=self.dag)
            task_0.execute(None)

        # if ssh_hook is invalid/not provided, use ssh_conn_id to create SSHHook
        task_1 = SFTPOperator(
            task_id="test_sftp",
            ssh_hook="string_rather_than_SSHHook",  # invalid ssh_hook
            ssh_conn_id=conn_id,
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.PUT,
            dag=self.dag)
        try:
            task_1.execute(None)
        except Exception:
            pass
        self.assertEqual(task_1.ssh_hook.ssh_conn_id, conn_id)

        task_2 = SFTPOperator(
            task_id="test_sftp",
            ssh_conn_id=conn_id,  # no ssh_hook provided
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.PUT,
            dag=self.dag)
        try:
            task_2.execute(None)
        except Exception:
            pass
        self.assertEqual(task_2.ssh_hook.ssh_conn_id, conn_id)

        # if both valid ssh_hook and ssh_conn_id are provided, ignore ssh_conn_id
        task_3 = SFTPOperator(task_id="test_sftp",
                              ssh_hook=self.hook,
                              ssh_conn_id=conn_id,
                              local_filepath=self.test_local_filepath,
                              remote_filepath=self.test_remote_filepath,
                              operation=SFTPOperation.PUT,
                              dag=self.dag)
        try:
            task_3.execute(None)
        except Exception:
            pass
        self.assertEqual(task_3.ssh_hook.ssh_conn_id, self.hook.ssh_conn_id)
Example #17
    max_active_runs=4,
    default_view='graph',
    tags=['nci', 'sentinel_2'],
)

with dag:
    WORK_DIR = "/g/data/v10/work/s2_nbar_rolling_archive/{{ ds }}_{{ var.json.nci_s2_upload_s3_config.numdays }}"
    COMMON = """
            {% set work_dir = '/g/data/v10/work/s2_nbar_rolling_archive/' 
            + ds +'_' + var.json.nci_s2_upload_s3_config.numdays -%}
            """
    # Uploading s2_to_s3_rolling.py script to NCI
    sftp_s2_to_s3_script = SFTPOperator(
        task_id="sftp_s2_to_s3_script",
        local_filepath=Path(
            Path(configuration.get('core', 'dags_folder')).parent).joinpath(
                "scripts/s2_to_s3_rolling.py").as_posix(),
        remote_filepath="{}/s2_to_s3_rolling.py".format(WORK_DIR),
        operation=SFTPOperation.PUT,
        create_intermediate_dirs=True)
    # Execute script to upload sentinel-2 data to s3 bucket
    aws_hook = AwsHook(aws_conn_id=dag.default_args['aws_conn_id'])
    execute_s2_to_s3_script = SSHOperator(task_id='execute_s2_to_s3_script',
                                          command=dedent(COMMON + """
            {% set aws_creds = params.aws_hook.get_credentials() -%}
            cd {{ work_dir }}

            # echo on and exit on fail
            set -eu

            # Load the latest stable DEA module
            module use /g/data/v10/public/modules/modulefiles
Example #18
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'wait_for_downstream': True
}

with DAG('Beeline_PO2DB',
         default_args=default_args,
         description='Test flow. XLS -> PostgreSQL',
         schedule_interval=timedelta(hours=6)) as dag:

    dag.doc_md = __doc__

    # Fetch the file via SFTP
    fetch_file = SFTPOperator(task_id="sftp_get_file",
                              ssh_conn_id="ssh_local",
                              remote_filepath=rfp,
                              local_filepath=lfp,
                              operation="get",
                              create_intermediate_dirs=True)

    def process_xls_file(ds, **kwargs):
        file = pandas.read_excel(Path(lfp))
        file.columns = file.columns.map(lambda x: x.replace('(', '').replace(
            ')', ''))  # strip parentheses from column names
        engine = PostgresHook(
            postgres_conn_id='postgres_local').get_sqlalchemy_engine()
        file.to_sql('airflow_stg_mining_po',
                    con=engine,
                    index=True,
                    if_exists='replace',
                    schema='beeline')