    def test_execute_with_transform_script(self, mock_log, mock_Popen):
        process_output = [b"Foo", b"Bar", b"Baz"]

        process = mock_Popen.return_value
        process.stdout.readline.side_effect = process_output
        process.wait.return_value = None
        process.returncode = 0

        bucket = "bucket"
        input_key = "foo"
        output_key = "bar"
        bio = io.BytesIO(b"input")

        conn = boto3.client('s3')
        conn.create_bucket(Bucket=bucket)
        conn.upload_fileobj(Bucket=bucket, Key=input_key, Fileobj=bio)

        s3_url = "s3://{0}/{1}"
        t = S3FileTransformOperator(
            source_s3_key=s3_url.format(bucket, input_key),
            dest_s3_key=s3_url.format(bucket, output_key),
            transform_script=self.transform_script,
            replace=True,
            task_id="task_id")
        t.execute(None)

        mock_log.info.assert_has_calls([
            mock.call(line.decode(sys.getdefaultencoding())) for line in process_output
        ])

    def test_execute_with_failing_transform_script(self, mock_Popen):
        process = mock_Popen.return_value
        process.stdout.readline.side_effect = []
        process.wait.return_value = None
        process.returncode = 42

        bucket = "bucket"
        input_key = "foo"
        output_key = "bar"
        bio = io.BytesIO(b"input")

        conn = boto3.client('s3')
        conn.create_bucket(Bucket=bucket)
        conn.upload_fileobj(Bucket=bucket, Key=input_key, Fileobj=bio)

        s3_url = "s3://{0}/{1}"
        t = S3FileTransformOperator(
            source_s3_key=s3_url.format(bucket, input_key),
            dest_s3_key=s3_url.format(bucket, output_key),
            transform_script=self.transform_script,
            replace=True,
            task_id="task_id")

        with self.assertRaises(AirflowException) as e:
            t.execute(None)

        self.assertEqual('Transform script failed: 42', str(e.exception))

    def test_execute(self, mock_Popen):
        transform_script_process = mock_Popen.return_value
        transform_script_process.communicate.return_value = [None, None]
        transform_script_process.returncode = 0

        bucket = "bucket"
        input_key = "foo"
        output_key = "bar"
        bio = io.BytesIO(b"input")

        conn = boto3.client('s3')
        conn.create_bucket(Bucket=bucket)
        conn.upload_fileobj(Bucket=bucket, Key=input_key, Fileobj=bio)

        s3_url = "s3://{0}/{1}"
        t = S3FileTransformOperator(
            source_s3_key=s3_url.format(bucket, input_key),
            dest_s3_key=s3_url.format(bucket, output_key),
            transform_script=self.transform_script,
            task_id="task_id")
        t.execute(None)

    def test_execute_with_select_expression(self, mock_select_key):
        bucket = "bucket"
        input_key = "foo"
        output_key = "bar"
        bio = io.BytesIO(b"input")

        conn = boto3.client('s3')
        conn.create_bucket(Bucket=bucket)
        conn.upload_fileobj(Bucket=bucket, Key=input_key, Fileobj=bio)

        s3_url = "s3://{0}/{1}"
        select_expression = "SELECT * FROM S3Object s"
        t = S3FileTransformOperator(
            source_s3_key=s3_url.format(bucket, input_key),
            dest_s3_key=s3_url.format(bucket, output_key),
            select_expression=select_expression,
            replace=True,
            task_id="task_id")
        t.execute(None)

        mock_select_key.assert_called_once_with(
            key=s3_url.format(bucket, input_key),
            expression=select_expression)
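
The four test methods above are shown without their surrounding class, decorators, or imports. A minimal sketch of the scaffolding they appear to rely on, assuming moto's mock_s3 for the boto3 calls and mock.patch for Popen and the operator's logger (the patch target, class name, and transform_script value are assumptions, not taken from the snippet):

import io
import sys
import unittest
from unittest import mock

import boto3
from moto import mock_s3

from airflow.exceptions import AirflowException
from airflow.operators.s3_file_transform_operator import S3FileTransformOperator


class TestS3FileTransformOperator(unittest.TestCase):
    def setUp(self):
        # hypothetical transform script used by the tests
        self.transform_script = "transformer.py"

    @mock.patch('airflow.operators.s3_file_transform_operator.subprocess.Popen')  # assumed patch target
    @mock.patch.object(S3FileTransformOperator, 'log')
    @mock_s3  # moto intercepts the boto3 S3 calls, so no real bucket is needed
    def test_execute_with_transform_script(self, mock_log, mock_Popen):
        ...  # body as shown above

Note that mock.patch decorators are applied bottom-up, which is why mock_log precedes mock_Popen in the method signature.
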
Example #5

class XComEnabledAWSAthenaOperator(AWSAthenaOperator):
    def execute(self, context):
        super(XComEnabledAWSAthenaOperator, self).execute(context)
        # the return value gets xcom_push()ed so downstream tasks can read it
        return self.query_execution_id

# database query DAG, set to run daily as per requirements
with DAG(dag_id='partitioned_athena_and_S3move',
         schedule_interval='@daily',
         start_date=datetime.now()) as partit_dag:

    run_query = XComEnabledAWSAthenaOperator(
        task_id='run_query',
        query=query.create_patit_table,
        output_location=config['S3']['OUTPUT_LOCATION'],
        database=config['S3']['DATABASE']
    )
    
    move_results = S3FileTransformOperator(
        task_id='move_results',
        source_s3_key=config['S3']['SOURCE_S3_KEY'],
        dest_s3_key=config['S3']['DEST_S3_KEY'],
        transform_script='/bin/cp'
    )
    
# Set the workflow stream: run_query must finish before move_results
run_query >> move_results
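
A note on transform_script='/bin/cp': S3FileTransformOperator downloads the source object to a temporary file, runs the script as `<script> <source_tmp> <dest_tmp>` (plus any script_args), and then uploads the destination temp file, so a plain cp acts as a straight copy between the two S3 locations. A hypothetical Python equivalent of that pass-through script:

#!/usr/bin/env python3
# passthrough.py (hypothetical): does the same job as /bin/cp when used as transform_script.
# The operator passes the downloaded source file and the destination file to upload as arguments.
import shutil
import sys

shutil.copyfile(sys.argv[1], sys.argv[2])
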
Example #6

# imports assumed for this snippet (Airflow 1.10-style path, matching the other examples)
import datetime
import os

from airflow import DAG
from airflow.operators.s3_file_transform_operator import S3FileTransformOperator

user = os.environ['WORKSHOP_USER']
bucket = f'pydata-eindhoven-2019-airflow-{user}'
dag_folder = os.path.dirname(os.path.abspath(__file__))

with DAG(
        dag_id='custom_operator',
        schedule_interval='@daily',
        start_date=datetime.datetime(2019, 11, 27)
) as dag:
    preprocess_train_operator = S3FileTransformOperator(
        task_id='preprocess_train',
        transform_script=f'{dag_folder}/transform_scripts/preprocess.py',
        source_s3_key=f's3://{bucket}/raw_training_data.csv',
        dest_s3_key=f's3://{bucket}/preprocessed_training_data.csv',
        source_aws_conn_id='s3',
        dest_aws_conn_id='s3',
        replace=True
    )

    train_operator = S3FileTransformOperator(
        task_id='train',
        transform_script=f'{dag_folder}/transform_scripts/train.py',
        source_s3_key=f's3://{bucket}/preprocessed_training_data.csv',
        dest_s3_key=f's3://{bucket}/trained_model.pkl',
        source_aws_conn_id='s3',
        dest_aws_conn_id='s3',
        replace=True
    )
Example #7

        # (snippet truncated above; these keyword arguments appear to belong to an S3 key sensor task)
        s3_conn_id='dev1_s3',
        depends_on_past=False,
        poke_interval=2,
        timeout=15,
        soft_fail=False,
        bucket_key='{}input/{}'.format(Variable.get('s3_buckey'), Variable.get('s3_filename')),
        bucket_name=None,
        wildcard_match=False,
        dag=dag)
    
    s3_transform = S3FileTransformOperator(
        task_id='s3_transform',
        depends_on_past=False,
        # trigger_rule='all_success',
        source_s3_key="{}input/{}".format(Variable.get('s3_buckey'),Variable.get('s3_filename')),
        dest_s3_key="{}output/{}{}".format(Variable.get('s3_buckey'),datetime.today().strftime('%Y%m%d%H%M'),Variable.get('s3_filename')),
        transform_script='/home/ubuntu/airflow/dag_scripts/transform.sh',
        source_s3_conn_id='dev1_s3',
        dest_s3_conn_id='dev1_s3',
        replace=True,
        dag=dag)
    
    s3_remove = BashOperator(
        task_id='s3_remove',
        bash_command='aws s3 rm {}input/{}'.format(Variable.get('s3_buckey'), Variable.get('s3_filename')),
        trigger_rule='all_success',
        dag=dag)

    no_run = DummyOperator(task_id='no_run')

    end = DummyOperator(task_id='end', dag=dag)  # snippet truncated here; task_id is assumed

Example #8

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2020, 9, 7),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5)
}

with DAG("s3_transformer",
         default_args=default_args,
         schedule_interval='@once') as dag:

    t1 = BashOperator(
        task_id='bash_test',
        bash_command='echo "hello, it should work" > s3_conn_test.txt')

    transformer = S3FileTransformOperator(
        task_id='ETL_records',
        description='cleans ETL_medical_records',
        source_s3_key='s3://XXX/YYY/ZZZ.xml',
        dest_s3_key='s3://XXX/YYY/WWW.xml',
        replace=False,
        transform_script='/opt/airflow/dags/scripts/transform.py',
        source_aws_conn_id='s3_connection',
        dest_aws_conn_id='s3_connection')

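    # note: set_upstream makes transformer run before t1 (t1 depends on transformer)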
    t1.set_upstream(transformer)
Example #9

        # (snippet truncated above; the opening `with DAG(...)` call is not shown in full)
        description='Pulls tweets about a given topic from Twitter for analysis',
        #schedule_interval=timedelta(days=1),
        catchup=False) as dag:

    tweets_to_s3 = TweetsToS3Operator(
        task_id='tweets_to_s3',
        topic='{{ dag_run.conf["topic"] }}',
        description='Writes tweets about a certain topic to S3',
        max_tweets=100,
        s3_key='tweet_data.' + timestamp)

    etl_tweets = S3FileTransformOperator(
        task_id='etl_tweets',
        description='cleans the tweet jsons pulled',
        source_s3_key='s3://j17devbucket/tweet_data.' + timestamp,
        dest_s3_key='s3://j17devbucket/cleaned_tweet_data.' + timestamp,
        replace=True,
        transform_script='scripts/etl/clean_tweets_pipeline.py')

    get_sentiment = S3FileTransformOperator(
        task_id='get_sentiment',
        description='Get sentiment of tweets',
        source_s3_key='s3://j17devbucket/cleaned_tweet_data.' + timestamp,
        dest_s3_key='s3://j17devbucket/analyzed_tweet_data_' + timestamp + '.json',
        replace=True,
        transform_script='scripts/nlp/sentiment_analysis.py')

    summarize_sentiment = S3FileTransformOperator(
        task_id='summarize_sentiment',
        # (snippet truncated here; the remaining operator arguments are not shown)
Example #10

from airflow.models import DAG
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.operators.s3_file_transform_operator import S3FileTransformOperator
from datetime import datetime


class XComEnabledAWSAthenaOperator(AWSAthenaOperator):
    def execute(self, context):
        super(XComEnabledAWSAthenaOperator, self).execute(context)
        # the return value gets xcom_push()ed so downstream tasks can read it
        return self.query_execution_id


with DAG(dag_id='athena_query_and_move',
         schedule_interval=None,
         start_date=datetime(2019, 6, 7)) as dag:

    run_query = XComEnabledAWSAthenaOperator(
        task_id='run_query',
        query='SELECT * FROM UNNEST(SEQUENCE(0, 100))',
        output_location='s3://my-bucket/my-path/',
        database='my_database')

    move_results = S3FileTransformOperator(
        task_id='move_results',
        source_s3_key='s3://mybucket/mypath/'
                      '{{ task_instance.xcom_pull(task_ids="run_query") }}.csv',
        transform_script='csv_to_parquet.py')

move_results.set_upstream(run_query)
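
csv_to_parquet.py itself is not shown in the snippet. A minimal sketch of what it might look like, assuming pandas and pyarrow are available on the worker (the exact conversion logic is purely illustrative):

#!/usr/bin/env python3
# csv_to_parquet.py (illustrative sketch): S3FileTransformOperator invokes it as
# `csv_to_parquet.py <local_source_csv> <local_dest_parquet>`.
import sys

import pandas as pd  # assumes pandas + pyarrow are installed on the worker

source_path, dest_path = sys.argv[1], sys.argv[2]
df = pd.read_csv(source_path)
df.to_parquet(dest_path, engine="pyarrow", index=False)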