def s3_to_final(**context):
    """
    Copy statement from s3 into final table
    Args:
        **context: Airflow context
    """
    list_s3_for_table = context['task_instance'].xcom_pull(
        task_ids='list_s3_for_table')
    table_name = context['task_instance'].xcom_pull(task_ids='load_definition',
                                                    key='table_name')
    table_columns = context['task_instance'].xcom_pull(
        task_ids='load_definition', key='table_columns')
    schema = context['task_instance'].xcom_pull(task_ids='load_definition',
                                                key='schema')
    last_batch = context['task_instance'].xcom_pull(
        task_ids='extract_last_batch_date', key='last_batch_for_table_check')
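    # Keep only the keys whose partition folder date ('...=<date>/') is newer
    # than the last batch already loaded, then reduce them to unique prefixes.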
    list_s3_for_table = [
        s3_key for s3_key in list_s3_for_table if
        s3_key.split('/')[-2].split('=')[1] > str(pd.to_datetime(last_batch))
    ]
    list_s3_for_table = sorted({
        '/'.join(s3_key.split('/')[:-1]) + '/'
        for s3_key in list_s3_for_table
    })

    if list_s3_for_table:
        logger.info('Number of S3 prefixes to push to Redshift: {count}'.format(
            count=len(list_s3_for_table)))

        to_final_task = S3ToRedshiftTransfer(
            task_id='to_final',
            schema=schema,
            table=table_name,
            s3_bucket=Variable.get(
                'sanitization_s3_sanit_def_files_folder').split('/')[0],
            s3_key=list_s3_for_table,
            aws_conn_id='s3_etl',
            cols=table_columns,
            redshift_conn_id='snowplow_redshift',
            is_truncate=False,
            copy_options=dag_config["module_conversion_copy_options"])
        to_final_task.execute(context=context)
    else:
        logger.info('Table is up to date; no new data to push.')
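The XCom values pulled above have to be pushed by an upstream load_definition task that is not part of this snippet. A minimal sketch of such a callable, with hypothetical schema, table, and column values:

def load_definition(**context):
    """Push the table metadata consumed by s3_to_final (hypothetical values)."""
    ti = context['task_instance']
    ti.xcom_push(key='schema', value='analytics')
    ti.xcom_push(key='table_name', value='events')
    ti.xcom_push(key='table_columns',
                 value=['event_id', 'collector_tstamp', 'event_name'])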
Example #2
    def test_execute(self, mock_run, mock_Session):
        access_key = "aws_access_key_id"
        secret_key = "aws_secret_access_key"
        mock_Session.return_value = Session(access_key, secret_key)

        schema = "schema"
        table = "table"
        s3_bucket = "bucket"
        s3_key = "key"
        copy_options = ""

        t = S3ToRedshiftTransfer(schema=schema,
                                 table=table,
                                 s3_bucket=s3_bucket,
                                 s3_key=s3_key,
                                 copy_options=copy_options,
                                 redshift_conn_id="redshift_conn_id",
                                 aws_conn_id="aws_conn_id",
                                 task_id="task_id",
                                 dag=None)
        t.execute(None)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=schema,
                   table=table,
                   s3_bucket=s3_bucket,
                   s3_key=s3_key,
                   access_key=access_key,
                   secret_key=secret_key,
                   copy_options=copy_options)

        def _trim(s):
            return re.sub("\s+", " ", s.strip())

        self.assertEqual(_trim(mock_run.call_args[0][0]), _trim(copy_query))
        mock_run.assert_called_once()
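The test above receives mock_run and mock_Session as arguments, which implies two @mock.patch decorators that the snippet does not show. A plausible wiring, assuming Airflow 1.10-era module paths and a hypothetical test class name:

import unittest
from unittest import mock


class TestS3ToRedshiftTransfer(unittest.TestCase):
    # Patches apply bottom-up: the innermost patch (PostgresHook.run) supplies
    # mock_run, and the outer patch (boto3 Session) supplies mock_Session.
    @mock.patch("boto3.session.Session")
    @mock.patch("airflow.hooks.postgres_hook.PostgresHook.run")
    def test_execute(self, mock_run, mock_Session):
        ...  # body as shown in the example above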
Example #3
    def test_execute(self, mock_run, mock_Session):
        access_key = "aws_access_key_id"
        secret_key = "aws_secret_access_key"
        mock_Session.return_value = Session(access_key, secret_key)

        schema = "schema"
        table = "table"
        s3_bucket = "bucket"
        s3_key = "key"
        copy_options = ""

        t = S3ToRedshiftTransfer(
            schema=schema,
            table=table,
            s3_bucket=s3_bucket,
            s3_key=s3_key,
            copy_options=copy_options,
            redshift_conn_id="redshift_conn_id",
            aws_conn_id="aws_conn_id",
            task_id="task_id",
            dag=None)
        t.execute(None)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=schema,
                   table=table,
                   s3_bucket=s3_bucket,
                   s3_key=s3_key,
                   access_key=access_key,
                   secret_key=secret_key,
                   copy_options=copy_options)

        assert mock_run.call_count == 1
        assertEqualIgnoreMultipleSpaces(self, mock_run.call_args[0][0], copy_query)
Example #4
    start = DummyOperator(task_id='start_execution')

    upload_raw_data = BashOperator(
        task_id='upload_raw_data_to_s3',
        bash_command='python ../upload_to_s3.py'
    )

    create_tables = PostgresOperator(
        task_id='create_tables',
        sql='create_tables.sql'
    )

    stage_tweets = S3ToRedshiftTransfer(
        task_id='stage_tweets_to_redshift',
        schema='{{ params.redshift_schema }}',
        table='staging_tweets',
        s3_bucket='{{ params.s3_bucket }}',
        s3_key='twitter_feed',
        copy_options=['COMPUPDATE OFF', 'STATUPDATE OFF', 'TRUNCATECOLUMNS']
    )

    stage_happiness = S3ToRedshiftTransfer(
        task_id='stage_happiness_to_redshift',
        schema='{{ params.redshift_schema }}',
        table='staging_happiness',
        s3_bucket='{{ params.s3_bucket }}',
        s3_key='happiness'
    )

    stage_temperature = S3ToRedshiftTransfer(
        task_id='stage_temperature_to_redshift',
        schema='{{ params.redshift_schema }}',
Example #5
from datetime import datetime

from airflow.sensors.s3_key_sensor import S3KeySensor
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
from airflow.models import DAG

args = {
    'owner': 'Adil',
    'start_date': datetime(2019, 6, 20),
    'retries': 1,
}

with DAG(dag_id='nyc_taxi_to_redshift',
         default_args=args,
         schedule_interval=None) as dag:
    wait_for_s3_file = S3KeySensor(task_id='wait_for_s3_file',
                                   bucket_name='mktg-redshift-exchange',
                                   bucket_key='nyc-taxi/temp-taxi-data',
                                   wildcard_match=False,
                                   dag=dag)
    upload_to_redshift = S3ToRedshiftTransfer(
        task_id='upload_to_redshift',
        schema='public',
        table='temp-taxi-data',
        s3_bucket='mktg-redshift-exchange',
        s3_key='nyc-taxi',
        copy_options=['CSV', 'IGNOREHEADER 2'])

    wait_for_s3_file >> upload_to_redshift
Example #6
    'provide_context': True
}

# Initialize the DAG
# Concurrency --> Number of tasks allowed to run concurrently
dag = DAG('test_dag1',
          concurrency=3,
          schedule_interval=None,
          default_args=default_args)

# Copy the results from S3 into Redshift
s3_to_redshift = S3ToRedshiftTransfer(task_id="s3_to_redshift",
                                      redshift_conn_id="my_redshift",
                                      aws_conn_id="my_conn_s3",
                                      table="output",
                                      s3_bucket="bsjun-test1",
                                      schema="public",
                                      s3_key="results",
                                      copy_options=["delimiter ','"],
                                      verify=True,
                                      dag=dag)
# Define the cleanup task

remove_files = BashOperator(
    task_id='remove_files',
    bash_command='echo 1',
    dag=dag,
)

# construct the DAG by setting the dependencies
s3_to_redshift >> remove_files
Example #7
step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_TEST_STEPS,
    dag=dag)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

copy_agg_to_redshift = S3ToRedshiftTransfer(task_id='copy_to_redshift',
                                            schema='nyc',
                                            table='agg_green_rides',
                                            s3_bucket=S3_BUCKET_NAME,
                                            s3_key='aggregate/agg-green-rides',
                                            dag=dag)

# construct the DAG by setting the dependencies
s3_sensor >> aws_glue_task >> cluster_creator >> step_adder >> step_checker >> cluster_remover >> copy_agg_to_redshift
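SPARK_TEST_STEPS, passed to the add_steps task above, is defined elsewhere in that DAG file. A minimal sketch of such a step list, using the standard EMR step structure with a hypothetical step name, bucket, and script path:

SPARK_TEST_STEPS = [
    {
        'Name': 'aggregate_green_rides',
        'ActionOnFailure': 'CANCEL_AND_WAIT',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'spark-submit',
                's3://my-bucket/scripts/aggregate_green_rides.py',  # hypothetical
            ],
        },
    },
]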
Example #8
transform_weather_data = BashOperator(
    task_id='transform_weather_data',
    bash_command=
    'python3 "${AIRFLOW_HOME}/dags/scripts/transform_weather_data.py"',
    dag=dag)

transform_complaint_data = BashOperator(
    task_id='transform_complaint_data',
    bash_command=
    'python3 "${AIRFLOW_HOME}/dags/scripts/transform_complaint_data.py"',
    dag=dag)

load_weather_data = S3ToRedshiftTransfer(
    task_id='load_weather_data',
    schema='public',
    table='dim_weather',
    s3_bucket=Variable.get('aws_bucket'),
    s3_key='processed',
    redshift_conn_id='redshift_conn',
    aws_conn_id='aws_credentials',
    copy_options=['csv', "IGNOREHEADER 1"],
    dag=dag)

load_demo_data = S3ToRedshiftTransfer(task_id='load_demo_data',
                                      schema='public',
                                      table='dim_demographics',
                                      s3_bucket=Variable.get('aws_bucket'),
                                      s3_key='processed',
                                      redshift_conn_id='redshift_conn',
                                      aws_conn_id='aws_credentials',
                                      copy_options=['csv', 'IGNOREHEADER 1'],
                                      dag=dag)
Example #9
                                     'standardize_countries.py',
                                     dag=dag)
upload_raw_data = BashOperator(task_id='upload_raw_data_to_s3',
                               bash_command=pybash + 'push_to_s3.py',
                               dag=dag)

create_tables = PostgresOperator(task_id='create_tables',
                                 sql=SqlQueries.create_tables,
                                 dag=dag)

stage_tweets = S3ToRedshiftTransfer(task_id='stage_tweets_to_redshift',
                                    schema='public',
                                    table='staging_tweets',
                                    s3_bucket='udacity-capstone-cg',
                                    s3_key='staging',
                                    copy_options=[
                                        'CSV', 'IGNOREHEADER 1', 'FILLRECORD',
                                        'COMPUPDATE OFF', 'STATUPDATE OFF',
                                        'TRUNCATECOLUMNS'
                                    ],
                                    dag=dag)

stage_sentiment = S3ToRedshiftTransfer(task_id='stage_sentiment_to_redshift',
                                       schema='public',
                                       table='sentiment',
                                       s3_bucket='udacity-capstone-cg',
                                       s3_key='staging',
                                       copy_options=[
                                           'CSV', 'IGNOREHEADER 1',
                                           'FILLRECORD', 'COMPUPDATE OFF',
                                           'STATUPDATE OFF', 'TRUNCATECOLUMNS'
Example #10
    sql='sql/ddl/create_tbl_daily_exchange_rates_pre.sql',
    dag=dag)

# create empty staging table to load columnar formatted data into
create_postgres_staging = PostgresOperator(
    task_id='create_postgres_staging',
    postgres_conn_id=POSTGRES_CONN_ID,
    sql='sql/ddl/create_tbl_daily_exchange_rates_stg.sql',
    dag=dag)

# populate first staging table with new data
s3_to_postgres_pre_staging = S3ToRedshiftTransfer(
    task_id='s3_to_postgres_pre_staging',
    aws_conn_id='s3_conn_id',
    s3_bucket=S3_BUCKET_NAME,
    s3_key=s3_key,
    redshift_conn_id=POSTGRES_CONN_ID,
    schema='alphavantage',
    table='daily_exchange_rates_pre_staging',
    dag=dag)

# transform json data using psql to correct table structure
load_to_postgres_staging = PostgresOperator(
    task_id='load_to_postgres_staging',
    postgres_conn_id=POSTGRES_CONN_ID,
    sql='sql/load_to_daily_exchange_rates_stg.sql',
    dag=dag)

# load only incremental data from staging into main table
load_to_postgres = PostgresOperator(task_id='load_to_postgres',
                                    postgres_conn_id=POSTGRES_CONN_ID,