def test_execute(self, mock_get_conn, mock_run, mock_Session):
        column_name = "col"
        cur = mock.MagicMock()
        cur.fetchall.return_value = [(column_name, )]
        mock_get_conn.return_value.cursor.return_value = cur

        access_key = "aws_access_key_id"
        secret_key = "aws_secret_access_key"
        mock_Session.return_value = Session(access_key, secret_key)

        schema = "schema"
        table = "table"
        s3_bucket = "bucket"
        s3_key = "key"
        unload_options = ""

        t = RedshiftToS3Transfer(
            schema=schema,
            table=table,
            s3_bucket=s3_bucket,
            s3_key=s3_key,
            unload_options=unload_options,
            redshift_conn_id="redshift_conn_id",
            aws_conn_id="aws_conn_id",
            task_id="task_id",
            dag=None)
        t.execute(None)

        columns_query = """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_schema = '{0}'
            AND   table_name = '{1}'
            ORDER BY ordinal_position
            """.format(schema, table)

        unload_query = """
            UNLOAD ('SELECT \\'{0}\\'
                     UNION ALL
                     SELECT CAST({0} AS text) AS {0}
                     FROM {1}.{2}
                     ORDER BY 1 DESC')
            TO 's3://{3}/{4}/{2}_'
            with credentials
            'aws_access_key_id={5};aws_secret_access_key={6}'
            {7};
            """.format(column_name, schema, table,
                       s3_bucket, s3_key, access_key,
                       secret_key, unload_options)

        def _trim(s):
            return re.sub(r"\s+", " ", s.strip())

        self.assertEqual(_trim(cur.execute.call_args[0][0]),
                         _trim(columns_query))
        cur.execute.assert_called_once()

        self.assertEqual(_trim(mock_run.call_args[0][0]),
                         _trim(unload_query))
        mock_run.assert_called_once()
def unload_from_staging(**context):
    """
    Unload the staging table into s3 into the required folder
    Args:
        **context: Airflow context
    """
    s3 = S3Hook(aws_conn_id='s3_etl')
    days_computed = context['task_instance'].xcom_pull(
        task_ids='compute_next_gather', key='days_to_compute')
    schema = context['task_instance'].xcom_pull(task_ids='load_definition',
                                                key='schema')
    table_name = context['task_instance'].xcom_pull(task_ids='load_definition',
                                                    key='table_name')

    # The target bucket is the first path segment of the configured S3 folder.
    bucket = Variable.get('sanitization_s3_sanit_def_files_folder').split('/')[0]

    for batch_date in sorted(days_computed)[:-1]:
        prefix = '___SCHEMA___' + '/' + table_name + '/batch_date=' + batch_date + '/'

        # Clear any output left over from a previous run for this batch date.
        s3_keys = s3.list_keys(bucket_name=bucket, prefix=prefix)
        if s3_keys:
            s3.delete_objects(bucket=bucket, keys=s3_keys)

        unload_task = RedshiftToS3Transfer(
            task_id='unload',
            redshift_conn_id='snowplow_redshift',
            aws_conn_id='s3_etl',
            schema=schema,
            table='sta_' + table_name,
            s3_key='___SCHEMA___',
            s3_unload_path='s3://' + bucket + '/' + prefix,
            s3_bucket=bucket,
            where_clause="batch_date = '" + batch_date + "'",
            # where_clause="batch_date >= '" + batch_date + " 00:00:00' and batch_date <= '" +
            #              (pd.to_datetime(batch_date, format='%Y-%m-%d') + timedelta(
            #                  days=1)).strftime('%Y-%m-%d') + " 00:00:00'",
            unload_options=dag_config["module_conversion_unload_options"],
            retries=3)

        unload_task.execute(context=context)

    drop_staging = ABCRedshiftOperator(
        task_id='drop_staging',
        source_name='___SCHEMA___',
        redshift_conn_id='snowplow_redshift',
        sql='drop table if exists {staging_table};'.format(
            staging_table=schema + '.sta_' + table_name),
        retries=3)

    drop_staging.execute(context=context)
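# A minimal wiring sketch (not from the original DAG): unload_from_staging pulls XComs
# pushed by the 'compute_next_gather' and 'load_definition' tasks, so it is assumed to
# run as a PythonOperator downstream of both. The 'dag' object and the upstream task
# variables are assumptions; only the task_ids are taken from the code above.
from airflow.operators.python_operator import PythonOperator

unload_from_staging_task = PythonOperator(
    task_id='unload_from_staging',
    python_callable=unload_from_staging,
    provide_context=True,  # Airflow 1.x style, matching the import paths used in these examples
    dag=dag,               # assumed to be defined alongside the other tasks
)
# compute_next_gather >> unload_from_staging_task
# load_definition >> unload_from_staging_task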
    def test_execute(
        self,
        table_as_file_name,
        expected_s3_key,
        mock_run,
        mock_session,
    ):
        access_key = "aws_access_key_id"
        secret_key = "aws_secret_access_key"
        mock_session.return_value = Session(access_key, secret_key)
        schema = "schema"
        table = "table"
        s3_bucket = "bucket"
        s3_key = "key"
        unload_options = [
            'HEADER',
        ]

        RedshiftToS3Transfer(schema=schema,
                             table=table,
                             s3_bucket=s3_bucket,
                             s3_key=s3_key,
                             unload_options=unload_options,
                             include_header=True,
                             redshift_conn_id="redshift_conn_id",
                             aws_conn_id="aws_conn_id",
                             task_id="task_id",
                             table_as_file_name=table_as_file_name,
                             dag=None).execute(None)

        unload_options = '\n\t\t\t'.join(unload_options)
        select_query = "SELECT * FROM {schema}.{table}".format(schema=schema,
                                                               table=table)
        unload_query = """
                    UNLOAD ('{select_query}')
                    TO 's3://{s3_bucket}/{s3_key}'
                    with credentials
                    'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                    {unload_options};
                    """.format(select_query=select_query,
                               s3_bucket=s3_bucket,
                               s3_key=expected_s3_key,
                               access_key=access_key,
                               secret_key=secret_key,
                               unload_options=unload_options)

        assert mock_run.call_count == 1
        assert_equal_ignore_multiple_spaces(self, mock_run.call_args[0][0],
                                            unload_query)
    def test_execute(self, mock_get_conn, mock_run, mock_Session):
        column_name = "col"
        cur = mock.MagicMock()
        cur.fetchall.return_value = [(column_name, )]
        mock_get_conn.return_value.cursor.return_value = cur

        access_key = "aws_access_key_id"
        secret_key = "aws_secret_access_key"
        mock_Session.return_value = Session(access_key, secret_key)

        schema = "schema"
        table = "table"
        s3_bucket = "bucket"
        s3_key = "key"
        unload_options = ('PARALLEL OFF', )

        t = RedshiftToS3Transfer(schema=schema,
                                 table=table,
                                 s3_bucket=s3_bucket,
                                 s3_key=s3_key,
                                 unload_options=unload_options,
                                 include_header=True,
                                 redshift_conn_id="redshift_conn_id",
                                 aws_conn_id="aws_conn_id",
                                 task_id="task_id",
                                 dag=None)
        t.execute(None)

        unload_options = '\n\t\t\t'.join(unload_options)

        columns_query = """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_schema = '{schema}'
            AND   table_name = '{table}'
            ORDER BY ordinal_position
            """.format(schema=schema, table=table)

        unload_query = """
                UNLOAD ('SELECT {column_name} FROM
                            (SELECT 2 sort_order,
                             CAST({column_name} AS text) AS {column_name}
                            FROM {schema}.{table}
                            UNION ALL
                            SELECT 1 sort_order, \\'{column_name}\\')
                         ORDER BY sort_order')
                TO 's3://{s3_bucket}/{s3_key}/{table}_'
                with credentials
                'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                {unload_options};
                """.format(column_name=column_name,
                           schema=schema,
                           table=table,
                           s3_bucket=s3_bucket,
                           s3_key=s3_key,
                           access_key=access_key,
                           secret_key=secret_key,
                           unload_options=unload_options)

        assert cur.execute.call_count == 1
        assertEqualIgnoreMultipleSpaces(self, cur.execute.call_args[0][0],
                                        columns_query)

        assert mock_run.call_count == 1
        assertEqualIgnoreMultipleSpaces(self, mock_run.call_args[0][0],
                                        unload_query)
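# A minimal sketch of the whitespace-insensitive assertion helper the tests above call
# (assert_equal_ignore_multiple_spaces / assertEqualIgnoreMultipleSpaces). The real
# helper lives in Airflow's test utilities; this illustration only collapses runs of
# whitespace before comparing, mirroring the _trim helper in the first test.
import re


def assert_equal_ignore_multiple_spaces(case, first, second, msg=None):
    """Assert two SQL strings are equal once runs of whitespace are collapsed."""
    def _trim(s):
        return re.sub(r"\s+", " ", s.strip())
    return case.assertEqual(_trim(first), _trim(second), msg)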
from airflow.operators.redshift_to_s3_operator import RedshiftToS3Transfer
from datetime import datetime, timedelta
from airflow import DAG

default_args = {
  'owner': 'me',
  'start_date': datetime.today(),
  'max_active_runs': 1,
}

dag = DAG(dag_id='redshift_S3',
  default_args=default_args,
  schedule_interval='0 12 * * *',
  catchup=False
)

unload_to_S3 = RedshiftToS3Transfer(
  task_id='unload_to_S3',
  schema='schema_name',
  table='table_name',
  s3_bucket='bucket_name',
  s3_key='s3_key',
  redshift_conn_id='redshift',
  aws_conn_id='my_s3_conn',
  dag=dag
)
# - aws_conn_id defines the AWS credentials the operator uses for the UNLOAD.
# - You can use a newly created IAM role; make sure it has the right access levels.
#   Iterate from no access (the default) until the unload works, to keep least privilege.
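# A minimal sketch of one way to register the 'my_s3_conn' connection referenced above,
# assuming Airflow's AIRFLOW_CONN_<CONN_ID> environment-variable mechanism. The key
# values are placeholders (URL-encode the secret if it contains special characters).
# With an empty URI ('aws://'), boto3 falls back to the default credential chain,
# e.g. an instance profile / IAM role attached to the worker.
import os

os.environ["AIRFLOW_CONN_MY_S3_CONN"] = "aws://AKIAIOSFODNN7EXAMPLE:secret-key-placeholder@"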

from airflow.models import Variable
from airflow.utils.dates import days_ago

# DAG_ID and DEFAULT_ARGS are assumed to be defined earlier in the file.
s3_bucket_name = Variable.get("s3_bucket_name", default_var="undefined")
s3_key = Variable.get("s3_key", default_var="undefined")
redshift_table_name = Variable.get("redshift_table_name", default_var="undefined")
redshift_airflow_connection = Variable.get("redshift_airflow_connection", default_var="undefined")
aws_connection = Variable.get("aws_connection", default_var="undefined")


with DAG(
    dag_id=DAG_ID,
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    start_date=days_ago(2),
    schedule_interval=None,
    tags=['s3', 'redshift'],
) as dag:
    unload_to_S3 = RedshiftToS3Transfer(
        task_id='unload_to_S3',
        schema='public',
        table=redshift_table_name,
        s3_bucket=s3_bucket_name,
        s3_key=s3_key,
        redshift_conn_id=redshift_airflow_connection,
        unload_options=['CSV'],
        aws_conn_id=aws_connection,
    )

    # Single-task DAG, so no dependencies to declare.
    unload_to_S3
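# The Variables read above must exist before the DAG parses; a quick way to seed them
# for local testing is Variable.set (values here are illustrative placeholders only).
# In practice they would normally be created through the Airflow UI or CLI.
from airflow.models import Variable

Variable.set("s3_bucket_name", "my-unload-bucket")
Variable.set("s3_key", "exports/table_name")
Variable.set("redshift_table_name", "table_name")
Variable.set("redshift_airflow_connection", "redshift")
Variable.set("aws_connection", "my_s3_conn")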