def s3_to_final(**context):
    """
    Copy data from S3 into the final table.

    Args:
        **context: Airflow context
    """
    list_s3_for_table = context['task_instance'].xcom_pull(
        task_ids='list_s3_for_table')
    table_name = context['task_instance'].xcom_pull(
        task_ids='load_definition', key='table_name')
    table_columns = context['task_instance'].xcom_pull(
        task_ids='load_definition', key='table_columns')
    schema = context['task_instance'].xcom_pull(
        task_ids='load_definition', key='schema')
    last_batch = context['task_instance'].xcom_pull(
        task_ids='extract_last_batch_date', key='last_batch_for_table_check')

    # Keep only keys whose partition folder (e.g. batch=<date>) is newer than
    # the last batch already loaded into the table.
    list_s3_for_table = [
        s3_key for s3_key in list_s3_for_table
        if s3_key.split('/')[-2].split('=')[1] > str(pd.to_datetime(last_batch))
    ]
    # Deduplicate down to the prefix (folder) level and sort for a stable order.
    list_s3_for_table = sorted(
        set('/'.join(s3_key.split('/')[:-1]) + '/'
            for s3_key in list_s3_for_table))

    if list_s3_for_table:
        logger.info('Number of keys to push to Redshift: {count}'.format(
            count=len(list_s3_for_table)))
        to_final_task = S3ToRedshiftTransfer(
            task_id='to_final',
            schema=schema,
            table=table_name,
            s3_bucket=Variable.get(
                'sanitization_s3_sanit_def_files_folder').split('/')[0],
            s3_key=list_s3_for_table,
            aws_conn_id='s3_etl',
            cols=table_columns,
            redshift_conn_id='snowplow_redshift',
            is_truncate=False,
            copy_options=dag_config["module_conversion_copy_options"])
        to_final_task.execute(context=context)
    else:
        logger.info('Table is up to date! No need to push data to it.')

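# The callable above builds and runs the transfer inside a Python task instead of
# declaring it in the DAG file. A minimal sketch of how it could be wired into a
# DAG (the task id is hypothetical, and `dag` is assumed to be the surrounding
# DAG object; provide_context=True is what delivers **context on Airflow 1.x):
from airflow.operators.python_operator import PythonOperator

s3_to_final_task = PythonOperator(
    task_id='s3_to_final',        # hypothetical task id
    python_callable=s3_to_final,
    provide_context=True,         # Airflow 1.x: pass the execution context
    dag=dag)                      # assumes the enclosing DAG
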
def test_execute(self, mock_run, mock_Session):
    access_key = "aws_access_key_id"
    secret_key = "aws_secret_access_key"
    mock_Session.return_value = Session(access_key, secret_key)

    schema = "schema"
    table = "table"
    s3_bucket = "bucket"
    s3_key = "key"
    copy_options = ""

    t = S3ToRedshiftTransfer(schema=schema,
                             table=table,
                             s3_bucket=s3_bucket,
                             s3_key=s3_key,
                             copy_options=copy_options,
                             redshift_conn_id="redshift_conn_id",
                             aws_conn_id="aws_conn_id",
                             task_id="task_id",
                             dag=None)
    t.execute(None)

    copy_query = """
        COPY {schema}.{table}
        FROM 's3://{s3_bucket}/{s3_key}/{table}'
        with credentials
        'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
        {copy_options};
    """.format(schema=schema,
               table=table,
               s3_bucket=s3_bucket,
               s3_key=s3_key,
               access_key=access_key,
               secret_key=secret_key,
               copy_options=copy_options)

    def _trim(s):
        return re.sub(r"\s+", " ", s.strip())

    self.assertEqual(_trim(mock_run.call_args[0][0]), _trim(copy_query))
    mock_run.assert_called_once()

def test_execute(self, mock_run, mock_Session):
    access_key = "aws_access_key_id"
    secret_key = "aws_secret_access_key"
    mock_Session.return_value = Session(access_key, secret_key)

    schema = "schema"
    table = "table"
    s3_bucket = "bucket"
    s3_key = "key"
    copy_options = ""

    t = S3ToRedshiftTransfer(
        schema=schema,
        table=table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        copy_options=copy_options,
        redshift_conn_id="redshift_conn_id",
        aws_conn_id="aws_conn_id",
        task_id="task_id",
        dag=None)
    t.execute(None)

    copy_query = """
        COPY {schema}.{table}
        FROM 's3://{s3_bucket}/{s3_key}/{table}'
        with credentials
        'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
        {copy_options};
    """.format(schema=schema,
               table=table,
               s3_bucket=s3_bucket,
               s3_key=s3_key,
               access_key=access_key,
               secret_key=secret_key,
               copy_options=copy_options)

    assert mock_run.call_count == 1
    assertEqualIgnoreMultipleSpaces(self, mock_run.call_args[0][0], copy_query)

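# Both test_execute variants above expect mock_run and mock_Session to be injected
# by mock.patch decorators on the test method. A minimal sketch of that scaffolding;
# the patch targets are assumptions based on S3ToRedshiftTransfer.execute building
# credentials from a boto3 Session and issuing the COPY through PostgresHook.run,
# so adjust the dotted paths to your Airflow version:
import re
import unittest
from unittest import mock

from boto3.session import Session
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer


class TestS3ToRedshiftTransfer(unittest.TestCase):

    @mock.patch("boto3.session.Session")                          # -> mock_Session
    @mock.patch("airflow.hooks.postgres_hook.PostgresHook.run")   # -> mock_run
    def test_execute(self, mock_run, mock_Session):
        # Body as in the examples above: build the operator, call execute(None),
        # then compare mock_run.call_args[0][0] against the expected COPY query.
        ...
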
start = DummyOperator(task_id='start_execution')

upload_raw_data = BashOperator(
    task_id='upload_raw_data_to_s3',
    bash_command='python ../upload_to_s3.py'
)

create_tables = PostgresOperator(
    task_id='create_tables',
    sql='create_tables.sql'
)

stage_tweets = S3ToRedshiftTransfer(
    task_id='stage_tweets_to_redshift',
    schema='{{ params.redshift_schema }}',
    table='staging_tweets',
    s3_bucket='{{ params.s3_bucket }}',
    s3_key='twitter_feed',
    copy_options=['COMPUPDATE OFF', 'STATUPDATE OFF', 'TRUNCATECOLUMNS']
)

stage_happiness = S3ToRedshiftTransfer(
    task_id='stage_happiness_to_redshift',
    schema='{{ params.redshift_schema }}',
    table='staging_happiness',
    s3_bucket='{{ params.s3_bucket }}',
    s3_key='happiness'
)

stage_temperature = S3ToRedshiftTransfer(
    task_id='stage_temperature_to_redshift',
    schema='{{ params.redshift_schema }}',

from datetime import datetime

from airflow.models import DAG
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
from airflow.sensors.s3_key_sensor import S3KeySensor

args = {
    'owner': 'Adil',
    'start_date': datetime(2019, 6, 20),
    'retries': 1,
}

with DAG(dag_id='nyc_taxi_to_redshift',
         default_args=args,
         schedule_interval=None) as dag:

    wait_for_s3_file = S3KeySensor(task_id='wait_for_s3_file',
                                   bucket_name='mktg-redshift-exchange',
                                   bucket_key='nyc-taxi/temp-taxi-data',
                                   wildcard_match=False,
                                   dag=dag)

    upload_to_redshift = S3ToRedshiftTransfer(task_id='upload_to_redshift',
                                              schema='public',
                                              table='temp-taxi-data',
                                              s3_bucket='mktg-redshift-exchange',
                                              s3_key='nyc-taxi',
                                              copy_options=['CSV', 'IGNOREHEADER 2'])

    wait_for_s3_file >> upload_to_redshift

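    # Note: the transfer above omits redshift_conn_id / aws_conn_id, so it falls
    # back to the operator defaults ('redshift_default' and 'aws_default' in the
    # Airflow 1.x S3ToRedshiftTransfer). To target specific connections, pass
    # them explicitly, e.g. (hypothetical connection ids):
    #     redshift_conn_id='my_redshift',
    #     aws_conn_id='my_aws',
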
    'provide_context': True
}

# Initialize the DAG
# concurrency --> number of tasks allowed to run concurrently
dag = DAG('test_dag1',
          concurrency=3,
          schedule_interval=None,
          default_args=default_args)

# Copy the results from S3 into Redshift
s3_to_redshift = S3ToRedshiftTransfer(task_id="s3_to_redshift",
                                      redshift_conn_id="my_redshift",
                                      aws_conn_id="my_conn_s3",
                                      table="output",
                                      s3_bucket="bsjun-test1",
                                      schema="public",
                                      s3_key="results",
                                      copy_options=["delimiter ','"],
                                      verify=True,
                                      dag=dag)

# Define the remaining tasks (this BashOperator is a placeholder command)
remove_files = BashOperator(
    task_id='remove_files',
    bash_command='echo 1',
    dag=dag,
)

# Construct the DAG by setting the dependencies
s3_to_redshift >> remove_files

    job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_TEST_STEPS,
    dag=dag)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

copy_agg_to_redshift = S3ToRedshiftTransfer(
    task_id='copy_to_redshift',
    schema='nyc',
    table='agg_green_rides',
    s3_bucket=S3_BUCKET_NAME,
    s3_key='aggregate/agg-green-rides',
    dag=dag)

# construct the DAG by setting the dependencies
s3_sensor >> aws_glue_task >> cluster_creator >> step_adder >> step_checker \
    >> cluster_remover >> copy_agg_to_redshift

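# SPARK_TEST_STEPS, referenced by the step-adding task above, is defined outside
# this snippet. A minimal sketch of the shape EmrAddStepsOperator forwards to EMR
# (a list of step definitions); the step name, script path, and arguments below
# are assumptions for illustration only:
SPARK_TEST_STEPS = [
    {
        'Name': 'aggregate_green_rides',        # hypothetical step name
        'ActionOnFailure': 'TERMINATE_CLUSTER',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'spark-submit',
                # hypothetical script location in the same bucket
                's3://{}/scripts/aggregate_green_rides.py'.format(S3_BUCKET_NAME),
            ],
        },
    },
]
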
    task_id='transform_weather_data',
    bash_command='python3 "${AIRFLOW_HOME}/dags/scripts/transform_weather_data.py"',
    dag=dag)

transform_complaint_data = BashOperator(
    task_id='transform_complaint_data',
    bash_command='python3 "${AIRFLOW_HOME}/dags/scripts/transform_complaint_data.py"',
    dag=dag)

load_weather_data = S3ToRedshiftTransfer(
    task_id='load_weather_data',
    schema='public',
    table='dim_weather',
    s3_bucket=Variable.get('aws_bucket'),
    s3_key='processed',
    redshift_conn_id='redshift_conn',
    aws_conn_id='aws_credentials',
    copy_options=['csv', 'IGNOREHEADER 1'],
    dag=dag)

load_demo_data = S3ToRedshiftTransfer(
    task_id='load_demo_data',
    schema='public',
    table='dim_demographics',
    s3_bucket=Variable.get('aws_bucket'),
    s3_key='processed',
    redshift_conn_id='redshift_conn',
    aws_conn_id='aws_credentials',
    copy_options=['csv', 'IGNOREHEADER 1'],
    dag=dag)

    'standardize_countries.py',
    dag=dag)

upload_raw_data = BashOperator(task_id='upload_raw_data_to_s3',
                               bash_command=pybash + 'push_to_s3.py',
                               dag=dag)

create_tables = PostgresOperator(task_id='create_tables',
                                 sql=SqlQueries.create_tables,
                                 dag=dag)

stage_tweets = S3ToRedshiftTransfer(task_id='stage_tweets_to_redshift',
                                    schema='public',
                                    table='staging_tweets',
                                    s3_bucket='udacity-capstone-cg',
                                    s3_key='staging',
                                    copy_options=[
                                        'CSV', 'IGNOREHEADER 1', 'FILLRECORD',
                                        'COMPUPDATE OFF', 'STATUPDATE OFF',
                                        'TRUNCATECOLUMNS'
                                    ],
                                    dag=dag)

stage_sentiment = S3ToRedshiftTransfer(task_id='stage_sentiment_to_redshift',
                                       schema='public',
                                       table='sentiment',
                                       s3_bucket='udacity-capstone-cg',
                                       s3_key='staging',
                                       copy_options=[
                                           'CSV', 'IGNOREHEADER 1', 'FILLRECORD',
                                           'COMPUPDATE OFF', 'STATUPDATE OFF',
                                           'TRUNCATECOLUMNS'

    sql='sql/ddl/create_tbl_daily_exchange_rates_pre.sql',
    dag=dag)

# create empty staging table to load columnar formatted data into
create_postgres_staging = PostgresOperator(
    task_id='create_postgres_staging',
    postgres_conn_id=POSTGRES_CONN_ID,
    sql='sql/ddl/create_tbl_daily_exchange_rates_stg.sql',
    dag=dag)

# populate first staging table with new data
s3_to_postgres_pre_staging = S3ToRedshiftTransfer(
    task_id='s3_to_postgres_pre_staging',
    aws_conn_id='s3_conn_id',
    s3_bucket=S3_BUCKET_NAME,
    s3_key=s3_key,
    redshift_conn_id=POSTGRES_CONN_ID,
    schema='alphavantage',
    table='daily_exchange_rates_pre_staging',
    dag=dag)

# transform json data using psql to correct table structure
load_to_postgres_staging = PostgresOperator(
    task_id='load_to_postgres_staging',
    postgres_conn_id=POSTGRES_CONN_ID,
    sql='sql/load_to_daily_exchange_rates_stg.sql',
    dag=dag)

# load only incremental data from staging into main table
load_to_postgres = PostgresOperator(task_id='load_to_postgres',
                                    postgres_conn_id=POSTGRES_CONN_ID,