def test_execute(self, mock_get_conn, mock_run, mock_Session):
    column_name = "col"
    cur = mock.MagicMock()
    cur.fetchall.return_value = [(column_name, )]
    mock_get_conn.return_value.cursor.return_value = cur

    access_key = "aws_access_key_id"
    secret_key = "aws_secret_access_key"
    mock_Session.return_value = Session(access_key, secret_key)

    schema = "schema"
    table = "table"
    s3_bucket = "bucket"
    s3_key = "key"
    unload_options = ""

    t = RedshiftToS3Transfer(
        schema=schema,
        table=table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        unload_options=unload_options,
        redshift_conn_id="redshift_conn_id",
        aws_conn_id="aws_conn_id",
        task_id="task_id",
        dag=None)
    t.execute(None)

    columns_query = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = '{0}'
        AND table_name = '{1}'
        ORDER BY ordinal_position
        """.format(schema, table)

    unload_query = """
        UNLOAD ('SELECT \\'{0}\\'
                 UNION ALL
                 SELECT CAST({0} AS text) AS {0}
                 FROM {1}.{2}
                 ORDER BY 1 DESC')
        TO 's3://{3}/{4}/{2}_'
        with credentials
        'aws_access_key_id={5};aws_secret_access_key={6}'
        {7};
        """.format(column_name, schema, table, s3_bucket, s3_key,
                   access_key, secret_key, unload_options)

    def _trim(s):
        return re.sub(r"\s+", " ", s.strip())

    self.assertEqual(_trim(cur.execute.call_args[0][0]), _trim(columns_query))
    cur.execute.assert_called_once()
    self.assertEqual(_trim(mock_run.call_args[0][0]), _trim(unload_query))
    mock_run.assert_called_once()

def unload_from_staging(**context):
    """
    Unload the staging table into s3 into the required folder

    Args:
        **context: Airflow context
    """
    s3 = S3Hook(aws_conn_id='s3_etl')
    days_computed = context['task_instance'].xcom_pull(
        task_ids='compute_next_gather', key='days_to_compute')
    schema = context['task_instance'].xcom_pull(task_ids='load_definition',
                                                key='schema')
    table_name = context['task_instance'].xcom_pull(task_ids='load_definition',
                                                    key='table_name')

    for batch_date in sorted(days_computed)[:-1]:
        s3_keys = s3.list_keys(
            bucket_name=Variable.get(
                'sanitization_s3_sanit_def_files_folder').split('/')[0],
            prefix='___SCHEMA___' + '/' + table_name +
                   '/batch_date=' + batch_date + '/')
        if s3_keys:
            s3.delete_objects(
                bucket=Variable.get(
                    'sanitization_s3_sanit_def_files_folder').split('/')[0],
                keys=s3_keys)

        unload_task = RedshiftToS3Transfer(
            task_id='unload',
            redshift_conn_id='snowplow_redshift',
            aws_conn_id='s3_etl',
            schema=schema,
            table='sta_' + table_name,
            s3_key='___SCHEMA___',
            s3_unload_path='s3://' + Variable.get(
                'sanitization_s3_sanit_def_files_folder').split('/')[0] +
                '/' + '___SCHEMA___' + '/' + table_name +
                '/batch_date=' + batch_date + '/',
            s3_bucket=Variable.get(
                'sanitization_s3_sanit_def_files_folder').split('/')[0],
            where_clause='batch_date = \'' + batch_date + '\'',
            # where_clause='batch_date >= \'' + batch_date + ' 00:00:00\' and batch_date <= \'' +
            #              (pd.to_datetime(batch_date, format='%Y-%m-%d') + timedelta(
            #                  days=1)).strftime('%Y-%m-%d') + ' 00:00:00\'',
            unload_options=dag_config["module_conversion_unload_options"],
            retries=3)
        unload_task.execute(context=context)

    drop_staging = ABCRedshiftOperator(
        task_id='drop_staging',
        source_name='___SCHEMA___',
        redshift_conn_id='snowplow_redshift',
        sql='drop table if exists {staging_table};'.format(
            staging_table=schema + '.sta_' + table_name),
        retries=3)
    drop_staging.execute(context=context)

def test_execute(
    self,
    table_as_file_name,
    expected_s3_key,
    mock_run,
    mock_session,
):
    access_key = "aws_access_key_id"
    secret_key = "aws_secret_access_key"
    mock_session.return_value = Session(access_key, secret_key)

    schema = "schema"
    table = "table"
    s3_bucket = "bucket"
    s3_key = "key"
    unload_options = [
        'HEADER',
    ]

    RedshiftToS3Transfer(schema=schema,
                         table=table,
                         s3_bucket=s3_bucket,
                         s3_key=s3_key,
                         unload_options=unload_options,
                         include_header=True,
                         redshift_conn_id="redshift_conn_id",
                         aws_conn_id="aws_conn_id",
                         task_id="task_id",
                         table_as_file_name=table_as_file_name,
                         dag=None).execute(None)

    unload_options = '\n\t\t\t'.join(unload_options)
    select_query = "SELECT * FROM {schema}.{table}".format(schema=schema,
                                                           table=table)
    unload_query = """
        UNLOAD ('{select_query}')
        TO 's3://{s3_bucket}/{s3_key}'
        with credentials
        'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
        {unload_options};
        """.format(select_query=select_query,
                   s3_bucket=s3_bucket,
                   s3_key=expected_s3_key,
                   access_key=access_key,
                   secret_key=secret_key,
                   unload_options=unload_options)

    assert mock_run.call_count == 1
    assert_equal_ignore_multiple_spaces(self, mock_run.call_args[0][0], unload_query)

def test_execute(self, mock_get_conn, mock_run, mock_Session):
    column_name = "col"
    cur = mock.MagicMock()
    cur.fetchall.return_value = [(column_name, )]
    mock_get_conn.return_value.cursor.return_value = cur

    access_key = "aws_access_key_id"
    secret_key = "aws_secret_access_key"
    mock_Session.return_value = Session(access_key, secret_key)

    schema = "schema"
    table = "table"
    s3_bucket = "bucket"
    s3_key = "key"
    unload_options = ('PARALLEL OFF', )

    t = RedshiftToS3Transfer(schema=schema,
                             table=table,
                             s3_bucket=s3_bucket,
                             s3_key=s3_key,
                             unload_options=unload_options,
                             include_header=True,
                             redshift_conn_id="redshift_conn_id",
                             aws_conn_id="aws_conn_id",
                             task_id="task_id",
                             dag=None)
    t.execute(None)

    unload_options = '\n\t\t\t'.join(unload_options)

    columns_query = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = '{schema}'
        AND table_name = '{table}'
        ORDER BY ordinal_position
        """.format(schema=schema, table=table)

    unload_query = """
        UNLOAD ('SELECT {column_name}
                 FROM (SELECT 2 sort_order,
                       CAST({column_name} AS text) AS {column_name}
                       FROM {schema}.{table}
                       UNION ALL
                       SELECT 1 sort_order, \\'{column_name}\\')
                 ORDER BY sort_order')
        TO 's3://{s3_bucket}/{s3_key}/{table}_'
        with credentials
        'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
        {unload_options};
        """.format(column_name=column_name,
                   schema=schema,
                   table=table,
                   s3_bucket=s3_bucket,
                   s3_key=s3_key,
                   access_key=access_key,
                   secret_key=secret_key,
                   unload_options=unload_options)

    assert cur.execute.call_count == 1
    assertEqualIgnoreMultipleSpaces(self, cur.execute.call_args[0][0], columns_query)
    assert mock_run.call_count == 1
    assertEqualIgnoreMultipleSpaces(self, mock_run.call_args[0][0], unload_query)

from airflow.operators.redshift_to_s3_operator import RedshiftToS3Transfer
from datetime import datetime, timedelta
from airflow import DAG

default_args = {
    'owner': 'me',
    'start_date': datetime.today(),
    'max_active_runs': 1,
}

dag = DAG(dag_id='redshift_S3',
          default_args=default_args,
          schedule_interval='0 12 * * *',
          catchup=False
          )

unload_to_S3 = RedshiftToS3Transfer(
    task_id='unload_to_S3',
    schema='schema_name',
    table='table_name',
    s3_bucket='bucket_name',
    s3_key='s3_key',
    redshift_conn_id='redshift',
    aws_conn_id='my_s3_conn',
    dag=dag
)

# - aws_conn_id is used to define the aws credentials you will use.
#   You can use a newly created iam role; make sure it has the right access levels.
# - iterate with no access (default) until it works, to keep least privilege

s3_bucket_name = Variable.get("s3_bucket_name", default_var="undefined")
s3_key = Variable.get("s3_key", default_var="undefined")
redshift_table_name = Variable.get("redshift_table_name", default_var="undefined")
redshift_airflow_connection = Variable.get("redshift_airflow_connection", default_var="undefined")
aws_connection = Variable.get("aws_connection", default_var="undefined")

with DAG(
    dag_id=DAG_ID,
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    start_date=days_ago(2),
    schedule_interval=None,
    tags=['s3', 'redshift'],
) as dag:

    unload_to_S3 = RedshiftToS3Transfer(
        task_id='unload_to_S3',
        schema='public',
        table=redshift_table_name,
        s3_bucket=s3_bucket_name,
        s3_key=s3_key,
        redshift_conn_id=redshift_airflow_connection,
        unload_options=['CSV'],
        aws_conn_id=aws_connection
    )

    unload_to_S3