def test_arg_checking(self):
    from airflow.exceptions import AirflowException

    conn_id = "conn_id_for_testing"
    os.environ['AIRFLOW_CONN_' + conn_id.upper()] = "ssh://test_id@localhost"

    # Exception should be raised if neither ssh_hook nor ssh_conn_id is provided
    if six.PY2:
        self.assertRaisesRegex = self.assertRaisesRegexp
    with self.assertRaisesRegex(
            AirflowException,
            "Cannot operate without ssh_hook or ssh_conn_id."):
        task_0 = SFTPOperator(
            task_id="test_sftp",
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.PUT,
            dag=self.dag)
        task_0.execute(None)

    # if ssh_hook is invalid/not provided, use ssh_conn_id to create SSHHook
    task_1 = SFTPOperator(
        task_id="test_sftp",
        ssh_hook="string_rather_than_SSHHook",  # invalid ssh_hook
        ssh_conn_id=conn_id,
        local_filepath=self.test_local_filepath,
        remote_filepath=self.test_remote_filepath,
        operation=SFTPOperation.PUT,
        dag=self.dag)
    try:
        task_1.execute(None)
    except Exception:
        pass
    self.assertEqual(task_1.ssh_hook.ssh_conn_id, conn_id)

    task_2 = SFTPOperator(
        task_id="test_sftp",
        ssh_conn_id=conn_id,  # no ssh_hook provided
        local_filepath=self.test_local_filepath,
        remote_filepath=self.test_remote_filepath,
        operation=SFTPOperation.PUT,
        dag=self.dag)
    try:
        task_2.execute(None)
    except Exception:
        pass
    self.assertEqual(task_2.ssh_hook.ssh_conn_id, conn_id)

    # if both valid ssh_hook and ssh_conn_id are provided, ignore ssh_conn_id
    task_3 = SFTPOperator(
        task_id="test_sftp",
        ssh_hook=self.hook,
        ssh_conn_id=conn_id,
        local_filepath=self.test_local_filepath,
        remote_filepath=self.test_remote_filepath,
        operation=SFTPOperation.PUT,
        dag=self.dag)
    try:
        task_3.execute(None)
    except Exception:
        pass
    self.assertEqual(task_3.ssh_hook.ssh_conn_id, self.hook.ssh_conn_id)
def test_arg_checking(self):
    # Exception should be raised if neither ssh_hook nor ssh_conn_id is provided
    with self.assertRaisesRegex(
            AirflowException,
            "Cannot operate without ssh_hook or ssh_conn_id."):
        task_0 = SFTPOperator(
            task_id="test_sftp",
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.PUT,
            dag=self.dag
        )
        task_0.execute(None)

    # if ssh_hook is invalid/not provided, use ssh_conn_id to create SSHHook
    task_1 = SFTPOperator(
        task_id="test_sftp",
        ssh_hook="string_rather_than_SSHHook",  # invalid ssh_hook
        ssh_conn_id=TEST_CONN_ID,
        local_filepath=self.test_local_filepath,
        remote_filepath=self.test_remote_filepath,
        operation=SFTPOperation.PUT,
        dag=self.dag
    )
    try:
        task_1.execute(None)
    except Exception:
        pass
    self.assertEqual(task_1.ssh_hook.ssh_conn_id, TEST_CONN_ID)

    task_2 = SFTPOperator(
        task_id="test_sftp",
        ssh_conn_id=TEST_CONN_ID,  # no ssh_hook provided
        local_filepath=self.test_local_filepath,
        remote_filepath=self.test_remote_filepath,
        operation=SFTPOperation.PUT,
        dag=self.dag
    )
    try:
        task_2.execute(None)
    except Exception:
        pass
    self.assertEqual(task_2.ssh_hook.ssh_conn_id, TEST_CONN_ID)

    # if both valid ssh_hook and ssh_conn_id are provided, ignore ssh_conn_id
    task_3 = SFTPOperator(
        task_id="test_sftp",
        ssh_hook=self.hook,
        ssh_conn_id=TEST_CONN_ID,
        local_filepath=self.test_local_filepath,
        remote_filepath=self.test_remote_filepath,
        operation=SFTPOperation.PUT,
        dag=self.dag
    )
    try:
        task_3.execute(None)
    except Exception:
        pass
    self.assertEqual(task_3.ssh_hook.ssh_conn_id, self.hook.ssh_conn_id)
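For context, the fallback these assertions exercise lives in the operator's execute(). The following is a paraphrased sketch of that resolution logic, not the verbatim Airflow source:

    # Sketch of the hook resolution the tests above verify (paraphrased):
    if self.ssh_conn_id:
        if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
            self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
        else:
            # ssh_hook missing or not an SSHHook: fall back to ssh_conn_id
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)
    if not self.ssh_hook:
        raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")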
def test_file_transfer_no_intermediate_dir_error_get(self):
    test_remote_file_content = \
        "This is remote file content \n which is also multiline " \
        "another line here \n this is last line. EOF"

    # create a test file remotely
    create_file_task = SSHOperator(
        task_id="test_create_file",
        ssh_hook=self.hook,
        command="echo '{0}' > {1}".format(test_remote_file_content,
                                          self.test_remote_filepath),
        do_xcom_push=True,
        dag=self.dag)
    self.assertIsNotNone(create_file_task)
    ti1 = TaskInstance(task=create_file_task, execution_date=timezone.utcnow())
    ti1.run()

    # Try to GET the test file from remote.
    # This should raise an error with "No such file" as the directory
    # does not exist.
    with self.assertRaises(Exception) as error:
        get_test_task = SFTPOperator(
            task_id="test_sftp",
            ssh_hook=self.hook,
            local_filepath=self.test_local_filepath_int_dir,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.GET,
            dag=self.dag)
        self.assertIsNotNone(get_test_task)
        ti2 = TaskInstance(task=get_test_task,
                           execution_date=timezone.utcnow())
        ti2.run()
    self.assertIn('No such file', str(error.exception))
def test_pickle_file_transfer_get(self):
    test_remote_file_content = \
        "This is remote file content \n which is also multiline " \
        "another line here \n this is last line. EOF"

    # create a test file remotely
    create_file_task = SSHOperator(
        task_id="test_create_file",
        ssh_hook=self.hook,
        command="echo '{0}' > {1}".format(test_remote_file_content,
                                          self.test_remote_filepath),
        do_xcom_push=True,
        dag=self.dag)
    self.assertIsNotNone(create_file_task)
    ti1 = TaskInstance(task=create_file_task, execution_date=timezone.utcnow())
    ti1.run()

    # get remote file to local
    get_test_task = SFTPOperator(
        task_id="test_sftp",
        ssh_hook=self.hook,
        local_filepath=self.test_local_filepath,
        remote_filepath=self.test_remote_filepath,
        operation=SFTPOperation.GET,
        dag=self.dag)
    self.assertIsNotNone(get_test_task)
    ti2 = TaskInstance(task=get_test_task, execution_date=timezone.utcnow())
    ti2.run()

    # test the received content
    with open(self.test_local_filepath, 'r') as f:
        content_received = f.read()
    self.assertEqual(content_received.strip(), test_remote_file_content)
def test_file_transfer_no_intermediate_dir_error_put(self):
    test_local_file_content = \
        b"This is local file content \n which is multiline " \
        b"continuing....with other character\nanother line here \n this is last line"

    # create a test file locally
    with open(self.test_local_filepath, 'wb') as f:
        f.write(test_local_file_content)

    # Try to PUT the test file to remote.
    # This should raise an error with "No such file" as the directory
    # does not exist.
    with self.assertRaises(Exception) as error:
        put_test_task = SFTPOperator(
            task_id="test_sftp",
            ssh_hook=self.hook,
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath_int_dir,
            operation=SFTPOperation.PUT,
            create_intermediate_dirs=False,
            dag=self.dag)
        self.assertIsNotNone(put_test_task)
        ti2 = TaskInstance(task=put_test_task,
                           execution_date=timezone.utcnow())
        ti2.run()
    self.assertIn('No such file', str(error.exception))
def test_pickle_file_transfer_put(self):
    test_local_file_content = \
        b"This is local file content \n which is multiline " \
        b"continuing....with other character\nanother line here \n this is last line"

    # create a test file locally
    with open(self.test_local_filepath, 'wb') as f:
        f.write(test_local_file_content)

    # put test file to remote
    put_test_task = SFTPOperator(
        task_id="test_sftp",
        ssh_hook=self.hook,
        local_filepath=self.test_local_filepath,
        remote_filepath=self.test_remote_filepath,
        operation=SFTPOperation.PUT,
        create_intermediate_dirs=True,
        dag=self.dag)
    self.assertIsNotNone(put_test_task)
    ti2 = TaskInstance(task=put_test_task, execution_date=timezone.utcnow())
    ti2.run()

    # check the remote file content
    check_file_task = SSHOperator(
        task_id="test_check_file",
        ssh_hook=self.hook,
        command="cat {0}".format(self.test_remote_filepath),
        do_xcom_push=True,
        dag=self.dag)
    self.assertIsNotNone(check_file_task)
    ti3 = TaskInstance(task=check_file_task, execution_date=timezone.utcnow())
    ti3.run()
    self.assertEqual(
        ti3.xcom_pull(task_ids='test_check_file', key='return_value').strip(),
        test_local_file_content)
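The two "pickle" transfer tests above push raw command output (bytes) through XCom, which only round-trips when XCom pickling is enabled. A minimal sketch of forcing that in a test setUp, assuming the standard [core] enable_xcom_pickling option:

from airflow import configuration

# Assumption: the pickle-based tests run with XCom pickling switched on;
# this overrides the standard Airflow config option in-process.
configuration.conf.set("core", "enable_xcom_pickling", "True")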
COMMON = dedent("""
    {% set work_dir = '/g/data/v10/work/c3_upload_s3/' + params.product + '/' + ts_nodash -%}
""")

# List all the scenes to be uploaded to the S3 bucket
list_scenes = SSHOperator(
    task_id=f"list_{product}_scenes",
    ssh_conn_id="lpgs_gadi",
    command=COMMON + LIST_SCENES_COMMAND,
    params={"product": product},
    do_xcom_push=False,
)

# Upload the c3_to_s3_rolling.py script to NCI
sftp_c3_to_s3_script = SFTPOperator(
    task_id=f"sftp_c3_to_s3_script_{product}",
    local_filepath=str(Path(configuration.get("core", "dags_folder")).parent
                       / "scripts/c3_to_s3_rolling.py"),
    remote_filepath=f"{WORK_DIR}/c3_to_s3_rolling.py",
    operation=SFTPOperation.PUT,
    create_intermediate_dirs=True,
)

# Execute the script to upload Landsat Collection 3 data to the S3 bucket
aws_hook = AwsHook(aws_conn_id=dag.default_args["aws_conn_id"])
execute_c3_to_s3_script = SSHOperator(
    task_id=f"execute_c3_to_s3_script_{product}",
    command=COMMON + RUN_UPLOAD_SCRIPT,
    remote_host="gadi-dm.nci.org.au",
    params={
        "aws_hook": aws_hook,
        "product": product,
        "nci_dir": "/g/data/xu18/ga/",
    },
)
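RUN_UPLOAD_SCRIPT itself is not shown here. Since the AwsHook is handed over via params, its credentials can be pulled out at template-render time (the Sentinel-2 variant further down does exactly this). A hedged sketch of the shape such a command template could take; the exports and the script invocation are assumptions, not the DAG's actual script:

# Illustrative only: a command template reading temporary credentials from
# the AwsHook passed in params. Not the DAG's real RUN_UPLOAD_SCRIPT.
RUN_UPLOAD_SCRIPT_SKETCH = dedent("""
    {% set aws_creds = params.aws_hook.get_credentials() -%}
    export AWS_ACCESS_KEY_ID={{ aws_creds.access_key }}
    export AWS_SECRET_ACCESS_KEY={{ aws_creds.secret_key }}
    cd {{ work_dir }}
    python3 c3_to_s3_rolling.py --product {{ params.product }}
""")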
    'nbart_s2b_fix_metadata_v2',
    doc_md=__doc__,
    default_args=default_args,
    catchup=True,
    schedule_interval='@daily',
    max_active_runs=4,
    default_view='tree',
    tags=['nci', 'sentinel_2'],
)

with dag:
    # Upload the s2b_fix_metadata.py script to NCI
    upload_uploader_script = SFTPOperator(
        task_id="upload_uploader_script",
        local_filepath=str(
            Path(configuration.get('core', 'dags_folder')).parent /
            "scripts/s2b_fix_metadata.py"),
        remote_filepath=WORK_DIR + "/{{ds}}/s2b_fix_metadata.py",
        operation=SFTPOperation.PUT,
        create_intermediate_dirs=True)

    upload_utils = SFTPOperator(
        task_id="upload_utils",
        local_filepath=str(
            Path(configuration.get('core', 'dags_folder')).parent /
            "scripts/c3_to_s3_rolling.py"),
        remote_filepath=WORK_DIR + "/{{ds}}/c3_to_s3_rolling.py",
        operation=SFTPOperation.PUT,
        create_intermediate_dirs=True)

    # Execute the script to upload Sentinel-2 data to the S3 bucket
    aws_hook = AwsHook(aws_conn_id=dag.default_args['aws_conn_id'])
    max_active_runs=4,
    default_view='graph',
    tags=['nci', 'sentinel_2'],
)

with dag:
    WORK_DIR = "/g/data/v10/work/s2_nbar_rolling_archive/{{ ds }}_{{ var.json.nci_s2_upload_s3_config.numdays }}"
    COMMON = """
    {% set work_dir = '/g/data/v10/work/s2_nbar_rolling_archive/' + ds + '_' + var.json.nci_s2_upload_s3_config.numdays -%}
    """

    # Upload the s2_to_s3_rolling.py script to NCI
    sftp_s2_to_s3_script = SFTPOperator(
        task_id="sftp_s2_to_s3_script",
        local_filepath=Path(
            Path(configuration.get('core', 'dags_folder')).parent).joinpath(
                "scripts/s2_to_s3_rolling.py").as_posix(),
        remote_filepath="{}/s2_to_s3_rolling.py".format(WORK_DIR),
        operation=SFTPOperation.PUT,
        create_intermediate_dirs=True)

    # Execute the script to upload Sentinel-2 data to the S3 bucket
    aws_hook = AwsHook(aws_conn_id=dag.default_args['aws_conn_id'])
    execute_s2_to_s3_script = SSHOperator(
        task_id='execute_s2_to_s3_script',
        command=dedent(COMMON + """
            {% set aws_creds = params.aws_hook.get_credentials() -%}
            cd {{ work_dir }}

            # exit on failure or on use of an unset variable
            set -eu

            # Load the latest stable DEA module
            module use /g/data/v10/public/modules/modulefiles
            'securityApp.log', 'mainApp.log', 'extApp.log', 'timeApp.log',
            'tokenApp.log', 'bridgeApp.log', 'daemonApp.log',
            'notificationApp.log', 'messageApp.log']

dl_tasks = []
for file in log_list:
    op = SFTPOperator(task_id=f"download_{file}",
                      ssh_conn_id="log_server",
                      local_filepath=f"{base_folder}/{file}",
                      remote_filepath=f"{remote_path}/{file}",
                      operation=SFTPOperation.GET,
                      create_intermediate_dirs=True,
                      dag=dag)
    dl_tasks.append(op)

bash_command = """
grep -E 'Exception' --include=\\*.log -rnw '{{ params.base_folder }}' > {{ params.base_folder }}/errors.txt
ls -l {{ params.base_folder }}/errors.txt && cat {{ params.base_folder }}/errors.txt
"""

grep_exception = BashOperator(task_id="grep_exception",
                              bash_command=bash_command,
                              params={'base_folder': base_folder},
                              dag=dag)
sshHook = SSHHook(ssh_conn_id="ssh_default")

# Zip all necessary Python code to send to EMR
zip_executables = BashOperator(
    task_id="prepare_executable_files",
    bash_command="zip -r /home/workspace/executables.zip "
                 "/home/workspace/awsc/ /home/workspace/config.cfg "
                 "/home/workspace/main.py",
    dag=dag,
)

# Copy files from the Docker container folder to the EMR Spark directory
deploy_job = SFTPOperator(
    task_id="deploy",
    ssh_hook=sshHook,
    local_filepath="/home/workspace/executables.zip",
    remote_filepath="/home/hadoop/executables.zip",
    operation="put",
    create_intermediate_dirs=True,
    dag=dag,
    confirm=False,
)

ETL_jobs_Operator = SSHOperator(
    task_id="ETLjob",
    command="""
        unzip -o executables.zip &&
        cd home/workspace/ &&
        zip -r awsc.zip awsc &&
        /usr/bin/spark-submit --py-files awsc.zip --master yarn main.py;
    """,
    ssh_hook=sshHook,
df.to_sql(
    name="users",
    if_exists='replace',
    con=hook.get_sqlalchemy_engine()
)

dag = DAG(dag_id='sftp-mock-file-to-mysql',
          default_args=default_args,
          schedule_interval='@daily',
          dagrun_timeout=timedelta(seconds=120))

sftp = SFTPOperator(
    task_id='fetch_csv_from_sftp',
    ssh_conn_id='ftp_server',
    local_filepath='/tmp/latest_users.csv',
    remote_filepath='/working-dir/{file}'.format(file=FILE),
    operation=SFTPOperation.GET,
    dag=dag
)

# The latest released version of Airflow does not do templating in
# op_args. This has recently been fixed though:
# https://github.com/apache/airflow/pull/4691
# For now, instead of depending on the templated 'file' variable,
# we'll use a static name for the file to load into MySQL.
csv_to_mysql = PythonOperator(
    task_id='ingest_csv_into_mysql',
    python_callable=ingest_csv_into_mysql,
    op_args=['/tmp/latest_users.csv'],
    dag=dag
)
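Once a release containing that fix is in use, the static path can become a template. A minimal sketch, assuming apache/airflow#4691 is available; the task_id and the templated filename below are illustrative, not part of the original DAG:

csv_to_mysql_templated = PythonOperator(
    task_id='ingest_csv_into_mysql_templated',
    python_callable=ingest_csv_into_mysql,
    # op_args is a templated field once apache/airflow#4691 is included;
    # the filename pattern is an assumption for illustration.
    op_args=['/tmp/users_{{ ds_nodash }}.csv'],
    dag=dag
)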
}

dag_name = 'dga_time_series'
file_names = ['sample_submission', 'test', 'train']

dag = DAG(dag_id=dag_name, default_args=default_args, schedule_interval=None)

start = DummyOperator(task_id="start", provide_context=True, dag=dag)
hito_files_hdfs = DummyOperator(task_id="hito_files_hdfs", dag=dag)

for file_name in file_names:
    print("Filename: " + file_name)

    sftp_file_to_container_hdfs = SFTPOperator(
        task_id='pass_' + file_name + '_to_docker_hdfs',
        ssh_conn_id="ssh_default",
        local_filepath="/usr/local/spark/resources/{0}.csv".format(file_name),
        remote_filepath="/hadoop/data/{0}.csv".format(file_name),
        operation="put",
        dag=dag)

    put_file_in_hdfs = SSHOperator(
        task_id='put_' + file_name + '_in_hdfs',
        ssh_conn_id="ssh_default",
        command=("cd /hadoop/bin && ./hdfs dfs -test -e /{0}.csv; "
                 "if [ `echo $?` -gt 0 ]; then "
                 "./hdfs dfs -put /hadoop/data/{0}.csv /; fi").format(file_name),
        dag=dag)

    start >> sftp_file_to_container_hdfs >> put_file_in_hdfs >> hito_files_hdfs

entrenamiento_modelo = SparkSubmitOperator(
    'nci_s2_upload_s3_v2',
    doc_md=__doc__,
    default_args=default_args,
    catchup=True,
    schedule_interval='@daily',
    max_active_runs=4,
    default_view='tree',
    tags=['nci', 'sentinel_2'],
)

with dag:
    # Upload the upload_s2.py script to NCI
    upload_uploader_script = SFTPOperator(
        task_id="upload_uploader_script",
        local_filepath=str(Path(configuration.get('core', 'dags_folder')).parent /
                           "scripts/upload_s2.py"),
        remote_filepath="/g/data/v10/work/s2_nbar_rolling_archive/{{ds}}/upload_s2.py",
        operation=SFTPOperation.PUT,
        create_intermediate_dirs=True
    )

    generate_list = SSHOperator(
        task_id='generate_list_of_s2_to_upload',
        # language="Shell Script"
        command=COMMON + dedent("""
            rm -f s3_paths_list.txt  # In case we've been run before

            for product_name in s2a_ard_granule s2b_ard_granule; do
                echo Searching for $product_name datasets.
                psql --variable=ON_ERROR_STOP=1 --csv --quiet --tuples-only --no-psqlrc \
                    -h dea-db.nci.org.au datacube <<EOF >> s3_paths_list.txt
            SELECT 's3://dea-public-data/L2/sentinel-2-nbar/S2MSIARD_NBAR/'
    False,
    'email_on_retry': False,
    'retries': 0,
}

dag = DAG(DAG_NAME, schedule_interval=None, default_args=default_args, catchup=False)

sftp = SFTPOperator(
    task_id='sftp',
    # a connection describing the target host etc. must be registered in Airflow beforehand
    ssh_conn_id='test_connection',
    # remote_host='',
    local_filepath='/tmp/test/test.txt',
    remote_filepath='/home/ubuntu/test.txt',
    operation='GET',
    # confirm=,
    create_intermediate_dirs=True,
    dag=dag,
)

python_sftp = PythonOperator(
    task_id='python_sftp',
    python_callable=utils.sftp,
    provide_context=True,
    dag=dag,
)
         default_args=default_args,
         catchup=False) as dag:

    query_task = PythonOperator(
        task_id='make_query',
        python_callable=make_query,
        retries=2,
        retry_delay=datetime.timedelta(minutes=1),
        provide_context=True,
    )

    put_task = SFTPOperator(
        task_id='put_sftp_nifi',
        ssh_conn_id='ssh_nifi_prod',
        local_filepath=f'/home/airflow/gcs/data/{dag.dag_id}/make_query.json',
        remote_filepath='/tmp/make_query.json',
        retries=2,
        retry_delay=datetime.timedelta(minutes=10),
        operation='put',
        create_intermediate_dirs=True
    )

    verify_task = ReverseSFTPSensor(
        task_id='is_processed',
        path='/tmp/make_query.json',
        sftp_conn_id='ssh_nifi_prod',
        poke_interval=60 * 5,
        mode='reschedule'
    )

    query_task >> put_task >> verify_task
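ReverseSFTPSensor is project-specific code not shown here; presumably it succeeds once the uploaded file disappears, i.e. once NiFi has consumed it. A minimal sketch of such a sensor, assuming Airflow 1.10-style imports (an assumed implementation, not the project's actual class):

from airflow.contrib.hooks.sftp_hook import SFTPHook
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.utils.decorators import apply_defaults


class ReverseSFTPSensor(BaseSensorOperator):
    """Assumed implementation: succeeds when the remote file is gone."""

    template_fields = ('path',)

    @apply_defaults
    def __init__(self, path, sftp_conn_id='sftp_default', *args, **kwargs):
        super(ReverseSFTPSensor, self).__init__(*args, **kwargs)
        self.path = path
        self.sftp_conn_id = sftp_conn_id

    def poke(self, context):
        hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
        # done once the file no longer exists on the remote host
        return not hook.path_exists(self.path)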
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'wait_for_downstream': True
}

with DAG('Beeline_PO2DB',
         default_args=default_args,
         description='Test pipeline: XLS -> PostgreSQL',
         schedule_interval=timedelta(hours=6)) as dag:
    dag.doc_md = __doc__

    # Fetch the file over SFTP
    fetch_file = SFTPOperator(task_id="sftp_get_file",
                              ssh_conn_id="ssh_local",
                              remote_filepath=rfp,
                              local_filepath=lfp,
                              operation="get",
                              create_intermediate_dirs=True)

    def process_xls_file(ds, **kwargs):
        file = pandas.read_excel(Path(lfp))
        # strip parenthesis characters from the column names
        file.columns = file.columns.map(
            lambda x: x.replace('(', '').replace(')', ''))
        engine = PostgresHook(
            postgres_conn_id='postgres_local').get_sqlalchemy_engine()
        file.to_sql('airflow_stg_mining_po',
                    con=engine,
                    index=True,
                    if_exists='replace',
                    schema='beeline')