table='artists', select_sql=SqlQueries.artist_table_insert, append_data=True, dag=dag) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', redshift_conn_id='redshift', table='time', select_sql=SqlQueries.time_table_insert, append_data=True, dag=dag) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', redshift_conn_id='redshift', dq_checks=dq_checks, expected_result=0, dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # start_operator >> create_tables start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table load_user_dimension_table >> run_quality_checks
table_name='artists' ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_id='redshift', sql=SqlQueries.time_table_insert, mode='insert', table_name='time' ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_id='redshift', table_name='songplays' ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> create_tables create_tables >> stage_events_to_redshift create_tables >> stage_songs_to_redshift stage_songs_to_redshift >> load_songplays_table stage_events_to_redshift >> load_songplays_table load_songplays_table >> load_time_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_user_dimension_table load_time_dimension_table >> run_quality_checks
# Task that the covid-cases staging branch feeds into (see dependencies below).
create_tables__operator = DummyOperator(
    task_id='create_tables__operator',
    dag=dag,
)

# Create-table placeholders, one per target table.
create_tables__covidcases = DummyOperator(
    task_id='create_tables__covidcases',
    dag=dag,
)
create_tables__hospital = DummyOperator(
    task_id='create_tables__hospital',
    dag=dag,
)
create_tables__masternode = DummyOperator(
    task_id='create_tables__masternode',
    dag=dag,
)

# Placeholder marking the quality-check stage.
quality_checks__operator = DummyOperator(
    task_id='quality_checks__operator',
    dag=dag,
)

# Quality check configured with the three table names below.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    tableNames=['covidcases', 'masternode', 'hospital'],
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Dependencies: start -> staging entry point -> the three staging tasks.
start_operator >> stage_to_redshift_operator >> [
    stage_covid_cases_to_redshift,
    stage_hospital_to_redshift,
    stage_masternode_to_redshift,
]
stage_covid_cases_to_redshift >> create_tables__operator
# Dimension loads: each operator is configured with a target table and the
# matching insert statement from SqlQueries.
load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="artists",
    sql_source=SqlQueries.artist_table_insert,
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    sql_source=SqlQueries.time_table_insert,
)

# Quality check, configured here against the "time" table only.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Dependencies: start -> both staging tasks -> fact load -> dimension loads.
start_operator.set_downstream([
    stage_events_to_redshift,
    stage_songs_to_redshift,
])
load_songplays_table.set_upstream([
    stage_events_to_redshift,
    stage_songs_to_redshift,
])
load_songplays_table.set_downstream([
    load_user_dimension_table,
    load_song_dimension_table,
    load_artist_dimension_table,
    load_time_dimension_table,
])

# NOTE(review): only the user, song and artist loads are wired to the quality
# check here; the time load and end_operator are not connected in this part
# of the file -- confirm they are wired further down.
run_quality_checks.set_upstream([
    load_user_dimension_table,
    load_song_dimension_table,
    load_artist_dimension_table,
])
redshift_conn_id="redshift", sql_query=SqlQueries.artist_table_insert, target_table="artist") load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", sql_query=SqlQueries.time_table_insert, target_table="time") run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, reshift_conn_id="redshift", sql_queries=[(SqlQuery.count_of_nulls_in_songs_table, 0), (SqlQuery.count_of_nulls_in_users_table, 0), (SqlQuery.count_of_nulls_in_artists_table, 0), (SqlQuery.count_of_nulls_in_time_table, 0), (SqlQuery.count_of_nulls_in_songplays_table, 0)]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_songs_to_redshift start_operator >> stage_events_to_redshift stage_songs_to_redshift >> load_songplays_table stage_events_to_redshift >> load_songplays_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table load_songplays_table >> load_user_dimension_table load_song_dimension_table >> run_quality_checks
aws_credentials_id="aws_credentials", sql=SqlQueries.artist_table_insert, table="artists") load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", aws_credentials_id="aws_credentials", sql=SqlQueries.time_table_insert, table="Time") run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", table_array="artists", aws_credentials_id="aws_credentials", tables=["artists", "users", "songs", "time", "songplays"]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table load_song_dimension_table >> run_quality_checks
options=["FORMAT AS PARQUET"]) load_demographics_dimension_table = StageToRedshiftOperator( task_id='Load_demo_dim_table', redshift_conn_id='redshift', table='dim_us_demographics', aws_conn_id='aws_credentials', s3_bucket='capstone-v01', s3_key='source/demo.parquet', schema='public', options=["FORMAT AS PARQUET"]) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', redshift_conn_id='redshift', tables=[ 'dim_us_demographics', 'fact_us_immigration', 'dim_countries', 'dim_us_states', 'dim_arrival_mode', 'dim_visa', 'dim_orig_port' ]) end_operator = DummyOperator(task_id='Stop_execution') start_operator >> load_immigration_table load_immigration_table >> [ load_origin_dimension_table, load_arrival_dimension_table, load_visa_dimension_table, load_countries_dimension_table, load_states_dimension_table, load_demographics_dimension_table ] >> run_quality_checks run_quality_checks >> end_operator # create_tables >>
dag=dag, query=SqlQueries.calendars_table_insert, redshift_conn_id="redshift", aws_credentials_id="aws_credentials", operation="insert", table="DIM_CALENDARS") load_dim_calendars_table.set_upstream(create_dim_calendars_table) create_load_fact_airbnb_amst_table = LoadFactOperator( task_id='Create_Load_FACT_AIRBNB_AMST_Table', dag=dag, query=SqlQueries.CREATE_LOAD_FACT_AIRBNB_AMST, redshift_conn_id="redshift", aws_credentials_id="aws_credentials") create_load_fact_airbnb_amst_table.set_upstream(load_dim_hosts_table) create_load_fact_airbnb_amst_table.set_upstream(load_dim_reviews_table) create_load_fact_airbnb_amst_table.set_upstream(load_dim_properties_table) create_load_fact_airbnb_amst_table.set_upstream(load_dim_calendars_table) ##RUN DATA QULAITY CHECKS TO ENSURE THAT RECORDS HAD BEEN MOVED CORRECTLY THROUGH PLATFORMS WITHOUT ANY ERRORS run_quality_checks = DataQualityOperator(task_id='Run_DATA_QUALITY_Checks', dag=dag, redshift_conn_id="redshift") run_quality_checks.set_upstream(create_load_fact_airbnb_amst_table) ##DUMMY OPERATOR to indicate that the DAG has run successfully - DAG end_operator = DummyOperator(task_id='END_OPERATOR', dag=dag) end_operator.set_upstream(run_quality_checks)
redshift_conn_id="redshift", table="artists", specified_sql=SqlQueries.artist_table_insert, append_insert_type=True, primary_key="artistid") load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", table="time", specified_sql=SqlQueries.time_table_insert) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", query="SELECT COUNT(*) FROM songplays WHERE songplay_id is NULL", ideal_result=0) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [ load_song_dimension_table, load_user_dimension_table, load_artist_dimension_table, load_time_dimension_table ] [ load_song_dimension_table, load_user_dimension_table, load_artist_dimension_table, load_time_dimension_table ] >> run_quality_checks
aws_credentials_id='aws_credentials', redshift_conn_id='redshift', s3_key='song_data/', s3_bucket='dend', table='staging_songs', file_extension='json') load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, postgres_conn_id='redshift', table='songplays', sql_query=SqlQueries.songplay_table_insert) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, postgres_conn_id='redshift', sql_queries=[ 'select count(1) from public."artists" where artistid is null ', 'select count(1) from public."songplays" where userid is null' ], expected_result=[0, 0]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [stage_events_to_redshift, stage_songs_to_redshift ] >> load_songplays_table load_songplays_table >> dimension_subdag_task >> run_quality_checks run_quality_checks >> end_operator
aws_credentials_id="aws_credentials", table="time", sql=SqlQueries.time_table_insert, append=False ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", aws_credentials_id="aws_credentials", quality_checks=[ {'sql': "SELECT COUNT(*) FROM songplays WHERE playid is null", 'expected_result': 0}, {'sql': "SELECT COUNT(*) FROM users WHERE userid is null", 'expected_result': 0}, {'sql': "SELECT COUNT(*) FROM songs WHERE songid is null", 'expected_result': 0}, {'sql': "SELECT COUNT(*) FROM artists WHERE artistid is null", 'expected_result': 0}, {'sql': "SELECT COUNT(*) FROM time WHERE start_time is null", 'expected_result': 0}, {'sql': "SELECT COUNT(*) FROM songplays WHERE playid is NOT null", 'expected_result': -1}, {'sql': "SELECT COUNT(*) FROM users WHERE userid is NOT null", 'expected_result': -1}, {'sql': "SELECT COUNT(*) FROM songs WHERE songid is NOT null", 'expected_result': -1}, {'sql': "SELECT COUNT(*) FROM artists WHERE artistid is NOT null", 'expected_result': -1}, {'sql': "SELECT COUNT(*) FROM time WHERE start_time is NOT null", 'expected_result': -1} ] ) end_operator = DummyOperator(task_id='Stop_execution') # task dependencies
delete_load=True) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', table='time', sql_statement=SqlQueries.time_table_insert, delete_load=True) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', tests=[{ 'query': 'SELECT count(*) FROM songplays', 'result': 6820 }, { 'query': 'SELECT count(*) FROM users', 'result': 104 }]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [ load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table ] [ load_user_dimension_table, load_song_dimension_table,
append_insert=True, primary_key="artistid", ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', table='time', select_sql=SqlQueries.time_table_insert, ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id='redshift', sample_query='SELECT COUNT(*) FROM songs WHERE songid IS NULL;', expected_result=0, ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table >> run_quality_checks load_songplays_table >> load_song_dimension_table >> run_quality_checks load_songplays_table >> load_artist_dimension_table >> run_quality_checks
dag=dag, redshift_conn_id="redshift", table='time', load_query=SqlQueries.time_table_insert, append_only=False) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", tables=[{ 'name': 'songplays', 'expected_result': 1 }, { 'name': 'users', 'expected_result': 1 }, { 'name': 'songs', 'expected_result': 1 }, { 'name': 'artists', 'expected_result': 1 }, { 'name': 'time', 'expected_result': 1 }]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
append_only=False, dag=dag) load_time_dimension_table = LoadDimensionOperator( task_id='load_time_dim_table', redshift_conn_id="redshift", load_sql=SqlQueries.time_table_insert, table_name="public.time", append_only=False, dag=dag) run_quality_checks = DataQualityOperator(task_id='run_data_quality_checks', redshift_conn_id="redshift", tables=[ "public.artists", "public.songplays", "public.songs", "public.time", "public.users" ], dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) #Order of Dags start_operator >> create_tables create_tables >> [stage_events_to_redshift, stage_songs_to_redshift ] >> load_songplays_table load_songplays_table >> [ load_user_dimension_table, load_song_dimension_table,
table='artists', query=SqlQueries.artist_table_insert, insert_mode='truncate-insert') load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, provide_context=True, conn_id='redshift', table='time', query=SqlQueries.time_table_insert, insert_mode='truncate-insert') run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, provide_context=True, conn_id='redshift', tables=['songplays', 'users', 'songs', 'artists', 'time']) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [stage_events_to_redshift, stage_songs_to_redshift ] >> load_songplays_table load_songplays_table >> [ load_user_dimension_table, load_artist_dimension_table, load_song_dimension_table, load_time_dimension_table ] >> run_quality_checks run_quality_checks >> end_operator
trunc_insert=True, sql_stmt=SqlQueries.artist_table_insert ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", table="time", trunc_insert=True, sql_stmt=SqlQueries.time_table_insert ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", tables=['songplays', 'users', 'songs', 'artists', 'time'] ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> create_tables create_tables >> stage_events_to_redshift create_tables >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table
update_mode="insert", ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", table="time", sql_stmt=SqlQueries.time_table_insert, update_mode="insert", ) # check songplays have rows run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", table="songplays", ) # check that songsplays doesnot have Null values in playid column run_case_quality_checks = TestDataOperator( task_id='Run_data_case_quality_checks', dag=dag, redshift_conn_id="redshift", sql="select COUNT(*) from songplays where playid is NULL ", expected=0) # just a dummy operator execute nothing end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Dag Tasks Dependecies
redshift_conn_id="redshift", table="artists", select_sql=SqlQueries.artist_table_insert, append_data=True) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", table="time", select_sql=SqlQueries.time_table_insert, append_data=True) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", table="songplays", column_check="start_time", expected_result=0) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table load_user_dimension_table >> run_quality_checks load_song_dimension_table >> run_quality_checks
table_name='artists', truncate_table=True, sql_query=SqlQueries.artist_table_insert ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift', table_name='time', truncate_table=True, sql_query=SqlQueries.time_table_insert ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) ## Dependencies #start_operator >> stage_events_to_redshift #start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table
dag=dag, redshift_conn_id="redshift", table="artist", sql="artist_table_insert", append_only=False) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", table="time", sql="time_table_insert", append_only=False) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", tables=["songplay", "users", "song", "artist", "time"]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Task dependencies start_operator >> stage_events_to_redshift >> load_songplays_table start_operator >> stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table >> run_quality_checks load_songplays_table >> load_song_dimension_table >> run_quality_checks load_songplays_table >> load_artist_dimension_table >> run_quality_checks load_songplays_table >> load_time_dimension_table >> run_quality_checks run_quality_checks >> end_operator
sql_list = [SqlQueries.dim_arrdate_time_insert,SqlQueries.dim_depdate_time_insert], remove_old_data = False ) load_fact_visit = LoadFactOperator( task_id='Load_fact_visit_event_table', dag = dag, provider_context=True, redshift_conn_id='redshift', table='fact_visit_event', sql = SqlQueries.fact_visit_insert ) run_quality_checks = DataQualityOperator( task_id='Run_Data_quality_checks', dag=dag, redshift_conn_id="redshift", tables=['fact_visit_event','dim_time','dim_visitor'] ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # DAG definition dim_tables_tasks = [ load_dim_visitor, load_dim_time ] start_operator >> stage_i94immi_operator >> dim_tables_tasks >> load_fact_visit >> run_quality_checks >> end_operator
# Data-quality task. Every check is a dict holding the SQL to run
# ("test_sql") and the result it is expected to produce ("expected_result",
# written as a one-row, one-column list of tuples).
# NOTE(review): presumably the operator compares the fetched rows against
# "expected_result" verbatim -- confirm against DataQualityOperator.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    data_quality_checks=[
        {"test_sql": query, "expected_result": [(expected, )]}
        for query, expected in (
            # Exact row counts expected per table.
            ("SELECT COUNT(*) FROM staging_events", 8056),
            ("SELECT COUNT(*) FROM staging_songs", 385252),
            ("SELECT COUNT(*) FROM songplays", 7039),
            ("SELECT COUNT(*) FROM artists", 45266),
            ("SELECT COUNT(*) FROM songs", 384824),
            ("SELECT COUNT(*) FROM users", 104),
            ("SELECT COUNT(*) FROM time", 7039),
            # Listed columns must contain no NULLs.
            ("SELECT COUNT(*) FROM songplays WHERE playid IS NULL", 0),
            ("SELECT COUNT(*) FROM songplays WHERE start_time IS NULL", 0),
            ("SELECT COUNT(*) FROM artists WHERE artistid IS NULL", 0),
            ("SELECT COUNT(*) FROM songs WHERE songid IS NULL", 0),
            ("SELECT COUNT(*) FROM users WHERE userid IS NULL", 0),
            ("SELECT COUNT(*) FROM time WHERE start_time IS NULL", 0),
        )
    ],
)
table_name="artists") load_artist_dimension_table = SubDagOperator(subdag=artist_subdag, task_id='Load_artist_dim_table', dag=dag) # time table time_subdag = dimension_SubDAG(parent_dag=root_dag_name, task_id='Load_time_dim_table', conn_id="redshift", start_date=default_args['start_date'], query=SqlQueries.time_table_insert, table_name="time") load_time_dimension_table = SubDagOperator(subdag=time_subdag, task_id='Load_time_dim_table', dag=dag) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', conn_id="redshift", tables=["time", "users", "artists", "songplays", "songs"], dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # order start_operator >> create_tables_in_redshift >> [ stage_songs_to_redshift, stage_events_to_redshift ] >> load_songplays_table >> [ load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table ] >> run_quality_checks >> end_operator
sql_query=SqlQueries.artist_table_insert ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, provide_context=True, aws_credentials_id="aws_credentials", redshift_conn_id='redshift', sql_query=SqlQueries.time_table_insert ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, provide_context=True, aws_credentials_id="aws_credentials", redshift_conn_id='redshift', ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [load_song_dimension_table, load_user_dimension_table, load_artist_dimension_table, load_time_dimension_table] [load_song_dimension_table, load_user_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks run_quality_checks >> end_operator
# Dimension loads for the artists and time tables; both pass
# ApporTrun="Trun" to the operator.
# NOTE(review): "Trun" presumably selects a truncate-then-insert mode (as
# opposed to append) -- confirm against LoadDimensionOperator.
load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="artists",
    ApporTrun="Trun",
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    ApporTrun="Trun",
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Dependencies: start -> both staging tasks -> fact load -> dimension loads.
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
load_songplays_table.set_upstream([
    stage_events_to_redshift,
    stage_songs_to_redshift,
])
load_songplays_table >> [
    load_user_dimension_table,
    load_song_dimension_table,
    load_artist_dimension_table,
    load_time_dimension_table,
]

# NOTE(review): the time load and end_operator are not connected to the
# quality check in this part of the file -- confirm they are wired further
# down.
run_quality_checks.set_upstream([
    load_user_dimension_table,
    load_song_dimension_table,
    load_artist_dimension_table,
])
) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, select_data_statement=SqlQueries.time_table_insert, table_name='time', truncate=True ) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, empty_check=['artists', 'songplays', 'songs', 'time', 'users'], null_check={ 'artists': ['artist_id'], 'songplays': ['songplay_id', 'artist_id', 'song_id', 'userid'], 'songs': ['song_id'], 'users': ['userid'] } ) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_artist_dimension_table
task_id='Load_artist_dim_table', dag=dag, redshift_conn_id="redshift", table='artists', truncate=True, sql_stmt=SqlQueries.artist_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, table='time', truncate=True, sql_stmt=SqlQueries.time_table_insert) data_quality_check = DataQualityOperator( task_id=f"data_quality_checks", redshift_conn_id="redshift", tables=["songplays", "users", "songs", "artists", "time"], dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift >> load_songplays_table start_operator >> stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table >> data_quality_check load_songplays_table >> load_song_dimension_table >> data_quality_check load_songplays_table >> load_artist_dimension_table >> data_quality_check load_songplays_table >> load_time_dimension_table >> data_quality_check data_quality_check >> end_operator
for table in copy_s3_bucket_keys: copy_table_from_s3 = CopyToRedshiftOperator( task_id=f'copy_{table["name"]}_from_s3', dag=dag, aws_credentials_id='aws_credentials', redshift_conn_id='redshift', role_arn=REDSHIFT_ARN, table=table['name'], s3_bucket=S3_BUCKET, s3_key=table['key'], file_format=table['file_format'], delimiter=table['sep']) quality_check_table = DataQualityOperator( task_id=f'quality_check_{table["name"]}_table', dag=dag, redshift_conn_id='redshift', table=table['name'], dq_checks=table['dq_checks']) start_operator >> copy_table_from_s3 copy_table_from_s3 >> quality_check_table quality_check_table >> end_operator for table in sas_source_code_tables_data: load_table_from_sas_source_code = SASValueToRedshiftOperator( task_id=f'load_{table["name"]}_from_sas_source_code', dag=dag, aws_credentials_id='aws_credentials', redshift_conn_id='redshift', table=table['name'], s3_bucket=S3_BUCKET,
'table': 'times', 'sql': SqlQueries.time_table_insert }), start_date=default_args['start_date'], ), task_id="load_dimensions_subdag", dag=dag) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, conn_id='redshift', queries=({ "table": "times", "where": "start_time IS NULL", "result": 0 }, { "table": "songs", "where": "songid IS NULL", "result": 0 }, { "table": "songplays", "result": 20460 })) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> table_creation table_creation >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> load_dim_subdag load_dim_subdag >> run_quality_checks