def get_fact_table_dag(parent_dag_name, task_id, redshift_conn_id, table, columns, append=True, *args, **kwargs):
    """Build a sub-DAG that creates, loads, and quality-checks one fact table.

    The sub-DAG runs linearly:
    begin -> create table -> load fact -> data quality check -> end.
    The CREATE and LOAD SQL are looked up in
    ``SQLQueries.create_load_test_queries[table]``.

    :param parent_dag_name: name of the parent DAG (namespaces the sub-DAG id)
    :param task_id: task id of the operator hosting this sub-DAG
    :param redshift_conn_id: Airflow connection id for Redshift
    :param table: fact table to create and load
    :param columns: columns to load into the fact table
    :param append: when True, append rows instead of reloading the table
    :return: the assembled sub-DAG
    """
    sub_dag = DAG(f"{parent_dag_name}.{task_id}", *args, **kwargs)
    # Both statements for this table come from the shared query registry.
    table_queries = SQLQueries.create_load_test_queries[table]

    begin = DummyOperator(task_id='Begin_execution', dag=sub_dag)
    create = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=sub_dag,
        postgres_conn_id=redshift_conn_id,
        sql=table_queries["CREATE"],
    )
    load = LoadFactOperator(
        task_id=f"load_{table}_fact_table",
        dag=sub_dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        columns=columns,
        sql_stmt=table_queries["LOAD"],
        append=append,
    )
    quality_check = DataQualityOperator(
        task_id=f"data_quality_check_{table}_fact_table",
        dag=sub_dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
    )
    end = DummyOperator(task_id='End_execution', dag=sub_dag)

    # Strictly linear pipeline.
    begin >> create >> load >> quality_check >> end
    return sub_dag
def run_data_quality_subdag(parent_dag_name, child_dag_name, args, checks):
    """Build a sub-DAG with one DataQualityOperator per configured check.

    Each element of *checks* is a mapping that provides a ``table`` name and
    the ``tests`` to run against it.  No dependencies are declared between
    the generated tasks, so every check runs in parallel.

    :param parent_dag_name: name of the parent DAG (namespaces the sub-DAG id)
    :param child_dag_name: name of this sub-DAG
    :param args: default_args forwarded to the DAG and every task
    :param checks: iterable of check specifications
    :return: the assembled sub-DAG
    """
    subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
    )
    with subdag:
        for spec in checks:
            target_table = spec.get('table')
            DataQualityOperator(
                task_id=f'Run_{target_table}_data_quality_check',
                default_args=args,
                dag=subdag,
                redshift_conn_id='redshift',
                table=target_table,
                queries=SqlQueries,
                tests=spec.get('tests'),
            )
    return subdag
def get_load_check_dag(
        parent_dag_name="",
        task_id="",
        conn_id="",
        table="",
        sql_select="",
        load_type="",
        start_date="",
        *args,
        **kwargs,
):
    """Build a sub-DAG that loads one table and then quality-checks it.

    ``load_type`` selects the load operator: ``"fact"`` uses
    LoadFactOperator, ``"dimension"`` uses LoadDimensionOperator, and any
    other value raises ValueError.  The quality check runs a row count
    against the loaded table and validates it with ``test_contains_rows``.

    :raises ValueError: when ``load_type`` is neither "fact" nor "dimension"
    :return: the assembled sub-DAG
    """
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        start_date=start_date,
        **kwargs,
    )

    # Map the load type onto its operator class; unknown types are rejected.
    operator_by_type = {
        "fact": LoadFactOperator,
        "dimension": LoadDimensionOperator,
    }
    operator_cls = operator_by_type.get(load_type)
    if operator_cls is None:
        raise ValueError(f"Unknown load_type {load_type}")

    load_task = operator_cls(
        task_id=f"load_{table}_{load_type}_table",
        dag=dag,
        conn_id=conn_id,
        table=table,
        sql_select=sql_select,
    )
    check_task = DataQualityOperator(
        task_id=f'run_data_quality_check_{table}',
        dag=dag,
        sql_quality=f"SELECT COUNT(*) FROM {table}",
        condition=test_contains_rows,
    )

    load_task >> check_task
    return dag
def get_data_quality_dag(parent_dag_name, task_id, conn_id, tests, *args, **kwargs):
    """Build a sub-DAG that runs every configured data-quality test.

    Each element of *tests* is a mapping with ``id``, ``test_sql`` and
    ``expected_result``; one DataQualityOperator is created per element.
    No dependencies are declared, so all tests run in parallel.

    :param parent_dag_name: name of the parent DAG (namespaces the sub-DAG id)
    :param task_id: task id of the operator hosting this sub-DAG
    :param conn_id: Airflow connection id used by every test task
    :param tests: iterable of test specifications
    :return: the assembled sub-DAG
    """
    # DAG-level parameters are inherited from the parent via **kwargs.
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    logging.info("Run all Tests")
    for spec in tests:
        DataQualityOperator(
            task_id=spec['id'],
            dag=dag,
            conn_id=conn_id,
            test_sql=spec['test_sql'],
            expected_result=spec['expected_result'],
            test_callable=test_func,
            test_name=spec['id'],
        )
        # Intentionally no task dependencies: tests run in parallel.

    return dag
def load_facts(parent_dag_name, child_dag_name, start_date, redshift_conn_id):
    """Build a sub-DAG that loads the bookings fact table and checks it.

    The load task runs ``load_statements.LOAD_BOOKING_FACTS`` and the
    follow-up quality check counts the rows of ``fact_bookings``.

    :param parent_dag_name: name of the parent DAG (namespaces the sub-DAG id)
    :param child_dag_name: name of this sub-DAG
    :param start_date: start date for the sub-DAG
    :param redshift_conn_id: Airflow connection id for Redshift
    :return: the assembled sub-DAG
    """
    dag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        start_date=start_date,
    )

    load_bookings = LoadFactOperator(
        task_id='load_bookings',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        sql=load_statements.LOAD_BOOKING_FACTS,
        table='fact_bookings',
    )
    quality_checks = DataQualityOperator(
        task_id='data_quality_checks_facts',
        dag=dag,
        tables='fact_bookings',
        redshift_conn_id=redshift_conn_id,
        sql='SELECT COUNT(*) FROM {}',
    )

    # Check quality only after the fact table has been loaded.
    load_bookings >> quality_checks
    return dag
redshift_conn_id='redshift', sql_statement=SqlQueries.artist_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, table_name='time', redshift_conn_id='redshift', sql_statement=SqlQueries.time_table_insert) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', dq_checks=[{ 'check_sql': "SELECT COUNT(*) FROM users WHERE userid is null", 'expected_results': 0 }, { 'check_sql': "SELECT COUNT(*) FROM songs WHERE songid is null", 'expected_results': 0 }]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift >> load_songplays_table start_operator >> stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table load_user_dimension_table >> run_quality_checks
dag=dag redshift_conn_id="redshift", table='artists', sql_statement=SqlQueries.artist_table_insert ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag redshift_conn_id="redshift", table='time', sql_statement=SqlQueries.time_table_insert ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag redshift_conn_id="redshift", tables=['songplays', 'users', 'songs', 'artists', 'time'] ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table
task_id='Load_artist_dim_table', dag=dag, redshift_conn_id="redshift", table="artists", sql_query=SqlQueries.artist_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", table="time", sql_query=SqlQueries.time_table_insert) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", table_list=["songplay", "users", "songs", "artists", "time"]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> creates_tables creates_tables >> stage_songs_to_redshift creates_tables >> stage_events_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table
target_table='artists', truncate_before_load=True) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', load_query=SqlQueries.time_table_insert, target_table='time', truncate_before_load=True) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", queries_and_results=[ SqlQueries.songplays_validation, SqlQueries.songs_validation, SqlQueries.artists_validation, SqlQueries.users_validation, SqlQueries.time_validation ]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_user_dimension_table
# Default arguments applied to every task in the DAG.
default_args = {
    'owner': 'sparkify',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 1),
    'retries': 0,
    'retry_delay': timedelta(seconds=15),
    'email_on_retry': False
}

dag = DAG('etl_dag_quality',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          # FIX: 'catchup_by_default' (previously placed in default_args) is
          # an airflow.cfg setting, not a task argument; the DAG-level
          # 'catchup' flag is the supported way to disable backfill runs.
          catchup=False,
          max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Run the configured data-quality checks against the tables listed in the
# externally supplied dag_config mapping.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    target_table=dag_config['data_quality_check_tables'],
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> run_quality_checks
run_quality_checks >> end_operator
table_to='state', options=["FORMAT AS PARQUET"], dag=dag) date_to_redshift = StageToRedshiftOperator(task_id='Date_Dimension_Table', aws_conn_id='aws_credentials', redshift_conn_id="redshift", s3_from='data-engineer-capstone', s3_prefix='date.parquet', schema_to='public', table_to='date', options=["FORMAT AS PARQUET"], dag=dag) run_quality_checks = DataQualityOperator( task_id='Data_Quality_Checks', redshift_conn_id="redshift", tables=['immigration', 'country', 'state', 'date'], dag=dag) end_operator = DummyOperator(task_id='End', dag=dag) start_operator >> immigration_to_redshift immigration_to_redshift >> country_to_redshift immigration_to_redshift >> state_to_redshift immigration_to_redshift >> date_to_redshift country_to_redshift >> run_quality_checks state_to_redshift >> run_quality_checks date_to_redshift >> run_quality_checks run_quality_checks >> end_operator
dimension_name='artists' ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', dimension_name='time' ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', tests=[ {"test": count_query.format("songplays"), "expected_result": [6820] } , {"test": count_query.format("songs"), "expected_result": [14896] }, {"test": count_query.format("users"), "expected_result": [104] }, {"test": count_query.format("artists"), "expected_result": [10025] }, {"test": count_query.format("time"), "expected_result": [6813] } ] ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> drop_and_create_tables_task drop_and_create_tables_task >> stage_events_to_redshift drop_and_create_tables_task >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table
table="Staging_aggregations", conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket="social-system-test/", s3_key="temp/post_agg_{run_id}.csv", region="us-east-1", file_type="csv") load_history_table = LoadFactOperator( task_id='Load_history_fact_table', dag=dag, provide_context=True, conn_id='redshift', table='history', query=SqlQueries.get_profile_history, truncate=True, ) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, provide_context=True, conn_id='redshift', tables=["history"]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_users_to_redshift start_operator >> get_ES_data >> stage_aggregations_to_redshift [stage_users_to_redshift, stage_aggregations_to_redshift] >> load_history_table load_history_table >> run_quality_checks >> end_operator
sql='/sql/fact_reviews.sql', postgres_conn_id='redshift') process_fact_reviews.set_upstream( [process_dim_times, process_dim_users, process_dim_business]) process_fk = PostgresOperator(dag=dag, task_id='process_foreign_keys', sql='/sql/dim_fk.sql', postgres_conn_id='redshift') process_fk.set_upstream([process_fact_tips, process_fact_reviews]) run_quality_checks = DataQualityOperator(task_id='run_data_quality_checks', dag=dag, redshift_conn_id='redshift', queries=({ "table": "dim_times", "where": "day IS NULL", "result": 0 }, { "table": "fact_review", "where": "user_id IS NULL", "result": 0 }, { "table": "fact_review", "result": 6685900 })) run_quality_checks.set_upstream(process_fk) end_operator = DummyOperator(dag=dag, task_id='end_operator') end_operator.set_upstream(run_quality_checks)
insert_mode='truncate') load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, provide_context=True, redshift_conn_id=redshift_credentials_id, target_table='time', sql=SqlQueries.time_table_insert, insert_mode='truncate') run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, provide_context=True, retries=3, redshift_conn_id=redshift_credentials_id, test_sql='', test_tbl='', expcted_results=0) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) ################### # add ordering ################### start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    table='time',
    select_sql_stmt=SqlQueries.time_table_insert,
    dag=dag)

# Template for the per-table row-count probes.
count_template = 'SELECT COUNT(*) FROM {}'


def _single_positive_count(records):
    # One row, one column, count greater than zero.
    return len(records) == 1 and len(records[0]) == 1 and records[0][0] > 0


def _single_zero_count(records):
    # One row, one column, count of exactly zero.
    return len(records) == 1 and len(records[0]) == 1 and records[0][0] == 0


# Every table must contain rows; additionally, users.first_name must never
# be NULL (that probe expects a zero count).
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    postgres_conn_id='redshift',
    sql_stmts=(count_template.format('songplays'),
               count_template.format('users'),
               count_template.format('songs'),
               count_template.format('artists'),
               count_template.format('time'),
               'SELECT COUNT(*) FROM users WHERE first_name IS NULL'),
    result_checkers=(_single_positive_count,
                     _single_positive_count,
                     _single_positive_count,
                     _single_positive_count,
                     _single_positive_count,
                     _single_zero_count),
    dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Define dependencies
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_song_dimension_table
parent_dag_name=dag_name, task_id=load_artist_dimension_table_task_id, redshift_conn_id="redshift", aws_credentials_id="aws_credentials", table="users", start_date=datetime(2018, 5, 1), sql_query=SqlQueries.artist_table_insert, ), task_id=load_artist_dimension_table_task_id, dag=dag, ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, provide_context=True, aws_credentials_id="aws_credentials", redshift_conn_id='redshift', tables=["songplay", "users", "song", "artist", "time"]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Setting tasks dependencies start_operator >> create_redshift_tables >> [ stage_songs_to_redshift, stage_events_to_redshift ] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [
append_only=False) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", table="time", sql=getattr(SqlQueries, "time_table_insert"), append_only=False) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", tables=["songplays", "users", "songs", "artists", "time"], checks=[{ 'check_sql': "SELECT COUNT(*) FROM {}", 'expected_result': True }, { 'check_sql': "SELECT COUNT(*) FROM {} WHERE {} IS NULL", 'expected_result': False }]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [ load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table
dag=dag, sql=SqlQueries.artist_table_insert, postgres_conn_id='redshift', mode='append', table='artists') load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, sql=SqlQueries.time_table_insert, postgres_conn_id='redshift', mode='append', table='time') run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, table=['songplays', 'songs', 'artists', 'users', 'time']) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> create_table create_table >> stage_events_to_redshift create_table >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table load_song_dimension_table >> run_quality_checks load_user_dimension_table >> run_quality_checks
redshift_conn_id="redshift", table="time", truncate_table=True, select_query=SqlQueries.time_table_insert ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", tables=[ "songplays", "users", "songs", "artists", "time" ], dq_checks=[ {'check_sql':'SELECT COUNT(*) FROM songplays' , 'expected_result': 47740}, {'check_sql':'SELECT COUNT(*) FROM users' , 'expected_result': 104}, {'check_sql':'SELECT COUNT(*) FROM songs' , 'expected_result': 14896}, {'check_sql':'SELECT COUNT(*) FROM artists' , 'expected_result': 10025}, {'check_sql':'SELECT COUNT(*) FROM time' , 'expected_result': 47740} ] ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Setting up the dependencies and order start_operator >> [stage_events_to_redshift,stage_songs_to_redshift] [stage_events_to_redshift,stage_songs_to_redshift] >> load_songplays_table
table='artists', redshift_conn_id='redshift', query=SqlQueries.artist_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, switch='insert-delete', table='time', redshift_conn_id='redshift', query=SqlQueries.time_table_insert) #task to check data quality run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', provide_context=True, params={'table': ['artists', 'songplays', 'songs', 'users']}) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) #configure task dependencies start_operator >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [ load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table ] [ load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table
#load data to time table from stage table load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", table="time", sql=SqlQueries.time_table_insert, loding_mode="delete-load") #Data Quality Check run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, params={ #Check name: {sql for check: result} "sonig_null_check": { SqlQueries.songid_null_check: 0 }, "artistid_null_check": { SqlQueries.artistid_null_check: 0 } }) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_artist_dimension_table
aws_credentials_id='aws_credentials', sqlWrite=SqlQueries.artist_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, table='time', redshift_conn_id='redshift_conn_id', aws_credentials_id='aws_credentials', sqlWrite=SqlQueries.time_table_insert) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift_conn_id', aws_credentials_id='aws_credentials', table=[ "artists", "songplays", "songs", "staging_events", "staging_songs", "time", "users" ], dq_checks=dq_checks) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) #start_operator >> task_create_tables #task_create_tables >> stage_events_to_redshift #task_create_tables >> stage_songs_to_redshift start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift
dag=dag, conn_id="redshift", table="public.artists", sql_statement=SqlQueries.artist_table_insert) # load the time redshift table using the user defined operator load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, conn_id="redshift", table="public.time", sql_statement=SqlQueries.time_table_insert) # CHeck the data quality of time table to check if the records loaded properly run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, conn_id="redshift", table_name="public.time") # End operator end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Task Dependency # this has been designed in an such way that all staging will run parallely # All Dimension will run parallely start_operator >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [ load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table ] [
truncate=True ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', sql_query=SqlQueries.time_table_insert, table='time' ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', dq_checks=[ {'check_sql': 'SELECT COUNT(*) FROM users WHERE userid IS NULL', 'result': 0}, {'check_sql': 'SELECT COUNT(*) FROM artists WHERE artistid IS NULL', 'result': 0 }, {'check_sql': 'SELECT COUNT(*) FROM songs WHERE songid IS NULL', 'result': 0}, {'check_sql': 'SELECT COUNT(*) FROM time WHERE start_time IS NULL', 'result': 0 }, {'check_sql': 'SELECT COUNT(*) FROM songplays WHERE playid IS NULL', 'result': 0} ] ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> create_tables create_tables >> [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [load_song_dimension_table, load_user_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks >> end_operator
table='artists', replace=True, task_id='Load_artist_dim_table', dag=dag) load_time_dimension_table = LoadDimensionOperator( redshift_conn_id='redshift', table='time', replace=True, task_id='Load_time_dim_table', dag=dag) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', tables=[ 'staging_events', 'staging_songs', 'users', 'songs', 'artists', 'time' ]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator \ >> [stage_events_to_redshift, stage_songs_to_redshift] \ >> load_songplays_table \ >> [ load_song_dimension_table, load_user_dimension_table, load_artist_dimension_table, load_time_dimension_table ] >> run_quality_checks >> end_operator
create_table_sql=SqlQueries.create_artists_table, insert_table_sql=SqlQueries.insert_artist_table, mode="overwrite", target_table="artists") load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', redshift_conn_id="redshift", aws_credentials_id="aws_credentials", create_table_sql=SqlQueries.create_times_table, insert_table_sql=SqlQueries.insert_time_table, mode="overwrite", target_table="time") run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', redshift_conn_id="redshift", table_name="songplays") end_operator = DummyOperator(task_id='Stop_execution') # Make graph start_operator >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [ load_artist_dimension_table, load_song_dimension_table, load_time_dimension_table, load_user_dimension_table ] [ load_artist_dimension_table, load_song_dimension_table, load_time_dimension_table, load_user_dimension_table ] >> run_quality_checks
redshift_conn_id='redshift', table='artists', selection=SqlQueries.artist_table_insert ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', table='time', selection=SqlQueries.time_table_insert ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift' ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # ordering start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table
dag=dag, table='songs', redshift_conn_id="redshift", load_sql_stmt=SqlQueries.song_table_insert ) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag, table='artists', redshift_conn_id="redshift", load_sql_stmt=SqlQueries.artist_table_insert ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, table='time', redshift_conn_id="redshift", load_sql_stmt=SqlQueries.time_table_insert ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, tables=['songplays', 'users', 'songs', 'artists', 'time'], redshift_conn_id="redshift" ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag)
table=fact_table_name_and_query[0], conn_id=REDSHIFT_CONN_ID, sql=fact_table_name_and_query[1], ) dim_operators = [ LoadDimensionOperator( task_id=f'Load_{dim_table_name}_dim_table', dag=dag, table=dim_table_name, conn_id=REDSHIFT_CONN_ID, sql=dim_query, ) for dim_table_name, dim_query in dim_tables_name_to_query.items() ] run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, conn_id=REDSHIFT_CONN_ID, tables=list(dim_tables_name_to_query) + [fact_table_name_and_query[0]], ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> create_tables create_tables >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> dim_operators dim_operators + [load_songplays_table] >> run_quality_checks run_quality_checks >> end_operator