def test_execute_with_transform_script(self, mock_log, mock_Popen):
    process_output = [b"Foo", b"Bar", b"Baz"]
    process = mock_Popen.return_value
    process.stdout.readline.side_effect = process_output
    process.wait.return_value = None
    process.returncode = 0

    bucket = "bucket"
    input_key = "foo"
    output_key = "bar"
    bio = io.BytesIO(b"input")

    conn = boto3.client('s3')
    conn.create_bucket(Bucket=bucket)
    conn.upload_fileobj(Bucket=bucket, Key=input_key, Fileobj=bio)

    s3_url = "s3://{0}/{1}"
    t = S3FileTransformOperator(
        source_s3_key=s3_url.format(bucket, input_key),
        dest_s3_key=s3_url.format(bucket, output_key),
        transform_script=self.transform_script,
        replace=True,
        task_id="task_id")
    t.execute(None)

    mock_log.info.assert_has_calls([
        mock.call(line.decode(sys.getdefaultencoding()))
        for line in process_output
    ])
def test_execute_with_failing_transform_script(self, mock_Popen):
    process = mock_Popen.return_value
    process.stdout.readline.side_effect = []
    process.wait.return_value = None
    process.returncode = 42

    bucket = "bucket"
    input_key = "foo"
    output_key = "bar"
    bio = io.BytesIO(b"input")

    conn = boto3.client('s3')
    conn.create_bucket(Bucket=bucket)
    conn.upload_fileobj(Bucket=bucket, Key=input_key, Fileobj=bio)

    s3_url = "s3://{0}/{1}"
    t = S3FileTransformOperator(
        source_s3_key=s3_url.format(bucket, input_key),
        dest_s3_key=s3_url.format(bucket, output_key),
        transform_script=self.transform_script,
        replace=True,
        task_id="task_id")

    with self.assertRaises(AirflowException) as e:
        t.execute(None)

    self.assertEqual('Transform script failed: 42', str(e.exception))
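# The two tests above receive `mock_log` and `mock_Popen` as arguments, which implies the
# methods sit in a test class decorated with a mocked S3 backend plus patches for the
# operator's logger and for subprocess.Popen. A minimal sketch of that wiring follows; the
# patch targets, the moto decorator, and the `transform_script` fixture are assumptions
# (matching the older Airflow 1.10 / moto import layout used elsewhere in these examples),
# not taken from the source. Stacked `mock.patch` decorators inject mocks bottom-up, which
# is why the log patch sits closest to the method and maps to the first mock argument.
import unittest
from unittest import mock

from moto import mock_s3

from airflow.operators.s3_file_transform_operator import S3FileTransformOperator


class TestS3FileTransformOperator(unittest.TestCase):
    def setUp(self):
        self.transform_script = "transform.py"  # hypothetical fixture path

    @mock.patch('airflow.operators.s3_file_transform_operator.subprocess.Popen')
    @mock.patch.object(S3FileTransformOperator, 'log')
    @mock_s3
    def test_execute_with_transform_script(self, mock_log, mock_Popen):
        ...  # body as shown above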
def test_execute(self, mock_Popen):
    transform_script_process = mock_Popen.return_value
    transform_script_process.communicate.return_value = [None, None]
    transform_script_process.returncode = 0

    bucket = "bucket"
    input_key = "foo"
    output_key = "bar"
    bio = io.BytesIO(b"input")

    conn = boto3.client('s3')
    conn.create_bucket(Bucket=bucket)
    conn.upload_fileobj(Bucket=bucket, Key=input_key, Fileobj=bio)

    s3_url = "s3://{0}/{1}"
    t = S3FileTransformOperator(
        source_s3_key=s3_url.format(bucket, input_key),
        dest_s3_key=s3_url.format(bucket, output_key),
        transform_script=self.transform_script,
        task_id="task_id")
    t.execute(None)
def test_execute_with_select_expression(self, mock_select_key):
    bucket = "bucket"
    input_key = "foo"
    output_key = "bar"
    bio = io.BytesIO(b"input")

    conn = boto3.client('s3')
    conn.create_bucket(Bucket=bucket)
    conn.upload_fileobj(Bucket=bucket, Key=input_key, Fileobj=bio)

    s3_url = "s3://{0}/{1}"
    select_expression = "SELECT * FROM S3Object s"
    t = S3FileTransformOperator(
        source_s3_key=s3_url.format(bucket, input_key),
        dest_s3_key=s3_url.format(bucket, output_key),
        select_expression=select_expression,
        replace=True,
        task_id="task_id")
    t.execute(None)

    mock_select_key.assert_called_once_with(
        key=s3_url.format(bucket, input_key),
        expression=select_expression)
def execute(self, context):
    super(XComEnabledAWSAthenaOperator, self).execute(context)
    # this gets `xcom_push`(ed)
    return self.query_execution_id


# Database query DAG, scheduled daily as per requirements
with DAG(dag_id='partitioned_athena_and_S3move',
         schedule_interval='@daily',
         start_date=datetime.now()) as partit_dag:
    run_query = XComEnabledAWSAthenaOperator(
        task_id='run_query',
        query=query.create_patit_table,
        output_location=config['S3']['OUTPUT_LOCATION'],
        database=config['S3']['DATABASE']
    )
    # S3FileTransformOperator downloads the source key to a temp file and invokes the
    # transform script as `script <source_file> <dest_file>`, so '/bin/cp' acts as a
    # pass-through copy from source_s3_key to dest_s3_key.
    move_results = S3FileTransformOperator(
        task_id='move_results',
        source_s3_key=config['S3']['SOURCE_S3_KEY'],
        dest_s3_key=config['S3']['DEST_S3_KEY'],
        transform_script='/bin/cp'
    )

    # Set workflow stream
    run_query >> move_results
user = os.environ['WORKSHOP_USER']
bucket = f'pydata-eindhoven-2019-airflow-{user}'
dag_folder = os.path.dirname(os.path.abspath(__file__))

with DAG(
    dag_id='custom_operator',
    schedule_interval='@daily',
    start_date=datetime.datetime(2019, 11, 27)
) as dag:
    preprocess_train_operator = S3FileTransformOperator(
        task_id='preprocess_train',
        transform_script=f'{dag_folder}/transform_scripts/preprocess.py',
        source_s3_key=f's3://{bucket}/raw_training_data.csv',
        dest_s3_key=f's3://{bucket}/preprocessed_training_data.csv',
        source_aws_conn_id='s3',
        dest_aws_conn_id='s3',
        replace=True
    )

    train_operator = S3FileTransformOperator(
        task_id='train',
        transform_script=f'{dag_folder}/transform_scripts/train.py',
        source_s3_key=f's3://{bucket}/preprocessed_training_data.csv',
        dest_s3_key=f's3://{bucket}/trained_model.pkl',
        source_aws_conn_id='s3',
        dest_aws_conn_id='s3',
        replace=True
    )
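# A minimal sketch of what a transform script such as train.py above could look like.
# The operator downloads source_s3_key to a temporary file, runs the script as
# `script <source_file> <dest_file>`, then uploads the dest file to dest_s3_key, so the
# script only deals with local paths. The "model" below (a dict of column means) is a
# stand-in assumption for illustration, not the workshop's actual training code.
import pickle
import sys

import pandas as pd

if __name__ == "__main__":
    source_path, dest_path = sys.argv[1], sys.argv[2]
    df = pd.read_csv(source_path)  # local copy of preprocessed_training_data.csv
    model = {"column_means": df.mean(numeric_only=True).to_dict()}
    with open(dest_path, "wb") as f:  # uploaded by the operator as trained_model.pkl
        pickle.dump(model, f)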
    s3_conn_id='dev1_s3',
    depends_on_past=False,
    poke_interval=2,
    timeout=15,
    soft_fail=False,
    bucket_key='{}input/{}'.format(Variable.get('s3_buckey'), Variable.get('s3_filename')),
    bucket_name=None,
    wildcard_match=False,
    dag=dag)

s3_transform = S3FileTransformOperator(
    task_id='s3_transform',
    depends_on_past=False,
    # trigger_rule='all_success',
    source_s3_key="{}input/{}".format(Variable.get('s3_buckey'), Variable.get('s3_filename')),
    dest_s3_key="{}output/{}{}".format(
        Variable.get('s3_buckey'),
        datetime.today().strftime('%Y%m%d%H%M'),
        Variable.get('s3_filename')),
    transform_script='/home/ubuntu/airflow/dag_scripts/transform.sh',
    source_s3_conn_id='dev1_s3',
    dest_s3_conn_id='dev1_s3',
    replace=True,
    dag=dag)

s3_remove = BashOperator(
    task_id='s3_remove',
    bash_command='aws s3 rm {}input/{}'.format(Variable.get('s3_buckey'), Variable.get('s3_filename')),
    trigger_rule='all_success',
    dag=dag)

no_run = DummyOperator(task_id='no_run')

end = DummyOperator(
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2020, 9, 7),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5)
}

with DAG("s3_transformer", default_args=default_args, schedule_interval='@once') as dag:
    t1 = BashOperator(
        task_id='bash_test',
        bash_command='echo "hello, it should work" > s3_conn_test.txt')

    transformer = S3FileTransformOperator(
        task_id='ETL_records',
        description='cleans ETL_medical_records',
        source_s3_key='s3://XXX/YYY/ZZZ.xml',
        dest_s3_key='s3://XXX/YYY/WWW.xml',
        replace=False,
        transform_script='/opt/airflow/dags/scripts/transform.py',
        source_aws_conn_id='s3_connection',
        dest_aws_conn_id='s3_connection')

    t1.set_upstream(transformer)
    description='Pulls tweets about a given topic from twitter for analysis',
    # schedule_interval=timedelta(days=1),
    catchup=False) as dag:

    tweets_to_s3 = TweetsToS3Operator(
        task_id='tweets_to_s3',
        topic='{{ dag_run.conf["topic"] }}',
        description='Writes tweets about a certain topic to S3',
        max_tweets=100,
        s3_key='tweet_data.' + timestamp)

    etl_tweets = S3FileTransformOperator(
        task_id='etl_tweets',
        description='cleans the tweet jsons pulled',
        source_s3_key='s3://j17devbucket/tweet_data.' + timestamp,
        dest_s3_key='s3://j17devbucket/cleaned_tweet_data.' + timestamp,
        replace=True,
        transform_script='scripts/etl/clean_tweets_pipeline.py')

    get_sentiment = S3FileTransformOperator(
        task_id='get_sentiment',
        description='Get sentiment of tweets',
        source_s3_key='s3://j17devbucket/cleaned_tweet_data.' + timestamp,
        dest_s3_key='s3://j17devbucket/analyzed_tweet_data_' + timestamp + '.json',
        replace=True,
        transform_script='scripts/nlp/sentiment_analysis.py')

    summarize_sentiment = S3FileTransformOperator(
        task_id='summarize_sentiment',
from airflow.models import DAG
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.operators.s3_file_transform_operator import S3FileTransformOperator
from datetime import datetime


class XComEnabledAWSAthenaOperator(AWSAthenaOperator):
    def execute(self, context):
        super(XComEnabledAWSAthenaOperator, self).execute(context)
        # just so that this gets `xcom_push`(ed)
        return self.query_execution_id


with DAG(dag_id='athena_query_and_move',
         schedule_interval=None,
         start_date=datetime(2019, 6, 7)) as dag:
    run_query = XComEnabledAWSAthenaOperator(
        task_id='run_query',
        query='SELECT * FROM UNNEST(SEQUENCE(0, 100))',
        output_location='s3://my-bucket/my-path/',
        database='my_database')

    move_results = S3FileTransformOperator(
        task_id='move_results',
        source_s3_key='s3://mybucket/mypath/{{ task_instance.xcom_pull(task_ids="run_query") }}.csv',
        dest_s3_key='s3://mybucket/otherpath/myresults.parquet',
        transform_script='csv_to_parquet.py')

    move_results.set_upstream(run_query)
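# A possible csv_to_parquet.py for the move_results task above (a sketch, not taken from
# the source). The operator invokes it as `csv_to_parquet.py <source_file> <dest_file>`,
# where the source file holds the Athena CSV result pulled via XCom templating and the
# dest file is uploaded to dest_s3_key. Assumes pandas plus a parquet engine (pyarrow or
# fastparquet) is installed on the worker.
import sys

import pandas as pd

if __name__ == "__main__":
    source_path, dest_path = sys.argv[1], sys.argv[2]
    df = pd.read_csv(source_path)
    df.to_parquet(dest_path, index=False)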