def check_table(parent_dag_name, task_id, redshift_conn_id, *args, **kwargs): dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs) # SONGPLAYS quality_check_songplays = DataQualityOperator( task_id='Run_data_quality_checks_songplays', dag=dag, redshift_conn_id=redshift_conn_id, table="public.songplays", ) # ARTISTS quality_check_artist = DataQualityOperator( task_id='Run_data_quality_checks_artists', dag=dag, redshift_conn_id=redshift_conn_id, table="public.artists", ) # SONGS quality_check_song = DataQualityOperator( task_id='Run_data_quality_checks_songs', dag=dag, redshift_conn_id=redshift_conn_id, table="public.songs", ) # USERS quality_check_users = DataQualityOperator( task_id='Run_data_quality_checks_users', dag=dag, redshift_conn_id=redshift_conn_id, table="public.users", ) # TIME quality_check_time = DataQualityOperator( task_id='Run_data_quality_check_time', dag=dag, redshift_conn_id=redshift_conn_id, table="public.time", ) return dag
def get_dimTables_to_Redshift_subdag(parent_dag_name, task_id, redshift_conn_id, create_tbl, target_table, source_table, sql_row, sql_quality, append_data, pk, *args, **kwargs): dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs) templated_command = """ echo "**************** The execution date : {{ ds }}" echo "**************** {{ task.owner }}, {{ task.task_id }}" echo "**************** {{ task_instance_key_str }} is running" """ info_task = BashOperator( task_id=f"Info_about_{parent_dag_name}.{task_id}_running", dag=dag, depends_on_past=False, bash_command=templated_command, ) load_dimension_table = LoadDimensionOperator( task_id=f"load_{target_table}_dim_table", dag=dag, redshift_conn_id=redshift_conn_id, create_tbl=create_tbl, target_table=target_table, source_table=source_table, append_data=append_data, pk=pk) check_dimension_quality = DataQualityOperator( task_id=f"{target_table}_quality", dag=dag, redshift_conn_id=redshift_conn_id, target_table=target_table, sql_row=sql_row, sql_quality=sql_quality, pk=pk, ) info_task >> load_dimension_table load_dimension_table >> check_dimension_quality return dag
load_dim_user = LoadDimensionOperator(task_id="Load_user_dim_table", dag=dag, table="users", subquery=SqlQueries.user_table_insert, redshift_conn_id="redshift") load_dim_time = LoadDimensionOperator(task_id="Load_time_dim_table", dag=dag, table="time", subquery=SqlQueries.time_table_insert, redshift_conn_id="redshift") data_quality = DataQualityOperator( task_id="Run_data_quality_checks", dag=dag, query="SELECT count(1) FROM songplays WHERE userId IS NULL", condition="len(records) > 0", redshift_conn_id="redshift") start_operator >> create_tables create_tables >> stage_logs create_tables >> stage_songs stage_logs >> load_fact stage_songs >> load_fact load_fact >> load_dim_song load_fact >> load_dim_artist load_fact >> load_dim_user load_fact >> load_dim_time
def function_quality_check( parent_dag_name, task_id, redshift_conn_id, table1, query1, table2, query2, table3, query3, table4, query4, table5, query5, *args, **kwargs): dag = DAG( f"{parent_dag_name}.{task_id}", **kwargs ) data_quality_songs_task = DataQualityOperator( task_id=f"{table1}_quality_check", dag=dag, table=table1, redshift_conn_id=redshift_conn_id, check_query= query1 ) data_quality_artist_task = DataQualityOperator( task_id=f"{table2}_quality_checkt", dag=dag, table=table2, redshift_conn_id=redshift_conn_id, check_query= query2 ) data_quality_users_task = DataQualityOperator( task_id=f"{table3}_quality_checkt", dag=dag, table=table3, redshift_conn_id=redshift_conn_id, check_query= query3 ) data_quality_time_task = DataQualityOperator( task_id=f"{table4}_quality_check", dag=dag, table=table4, redshift_conn_id=redshift_conn_id, check_query= query4 ) data_quality_songplay_task = DataQualityOperator( task_id=f"{table5}_quality_check", dag=dag, table=table5, redshift_conn_id=redshift_conn_id, check_query= query5 ) return dag
dag=dag, redshift_conn_id="redshift", database_name="public", table_name="artists", select_sql=SqlQueries.artist_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", database_name="public", table_name="time", select_sql=SqlQueries.time_table_insert) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift") end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table
load_zone_table = CopyCsvRedshiftOperator( task_id='Load_zone_table', dag=dag, redshift_conn_id='redshift', table='zones', s3_bucket=Variable.get('s3_out_aq_bucket'), s3_prefix=Variable.get('s3_out_aq_prefix'), data_type='air_quality', arn=Variable.get('iam_role_arn'), ignore_header=1) quality_checks_zone_table = DataQualityOperator( task_id='Quality_checks_zone_table', dag=dag, redshift_conn_id='redshift', table='zone', test_query=SqlQueries.zone_table_quality_check, expected_res=0) quality_checks_air_quality_table = DataQualityOperator( task_id='Quality_checks_air_quality_table', dag=dag, redshift_conn_id='redshift', table='air_quality', test_query=SqlQueries.air_quality_table_quality_check, expected_res=0) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # Set task dependencies
run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", check_statements=[{ "sql": "SELECT COUNT(*) FROM songplays;", "operator": "ne", "value": 0 }, { "sql": "SELECT COUNT(*) FROM users;", "operator": "ne", "value": 0 }, { "sql": "SELECT COUNT(*) FROM songs;", "operator": "ne", "value": 0 }, { "sql": "SELECT COUNT(*) FROM artists;", "operator": "ne", "value": 0 }, { "sql": "SELECT COUNT(*) FROM time;", "operator": "ne", "value": 0 }])
task_id='Load_artist_dim_table', dag=dag, redshift_conn_id='redshift_warehouse', table='artists', sql_query=SqlQueries.artist_table_insert) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='redshift_warehouse', table='"time"', sql_query=SqlQueries.time_table_insert) run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', redshift_conn_id='redshift_warehouse', tables=['songplays', 'songs', 'artists', 'time', 'users'], dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_song_dimension_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_artist_dimension_table load_songplays_table >> load_time_dimension_table
# FACTS TABLE TASKs #################################################################################### # Task to create and insert data from staging table to fact table the check load_songplays_table = LoadFactOperator( task_id='load_songplays_fact_table', dag=dag, redshift_conn_id="redshift", target_table="songplays", create_tbl=CreateTables.songplay_table_create, source=SqlQueries.songplay_table_insert, ) check_songplays_quality = DataQualityOperator( task_id='songplays_quality', redshift_conn_id='redshift', target_table="songplays", pk="playid", sql_row=SqlQueries.has_rows_songplays, sql_quality=SqlQueries.has_null_songplays, dag=dag) #################################################################################### # DIMENSION TABLES TASKS SUBDAG #################################################################################### # Task to create, insert data and run data quality on table artists load_artists_dimension_table_id = "load_artists_dimension_table_subdag" load_artists_dimension_table_task = SubDagOperator( subdag=get_dimTables_to_Redshift_subdag( "ETL_Sparkify_0", #name parent dag load_artists_dimension_table_id, #task_id "redshift", #redshift_conn_id create_tbl=CreateTables.artist_table_create, # create_tbl target_table="artists", #t
load_artist_dimension_table = LoadDimensionOperator( task_id='load_artist_dim_table', schema='public', table='artists', sql=SqlQueries.artist_table_insert, ) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', schema='public', table='time', sql=SqlQueries.time_table_insert, ) run_quality_checks = DataQualityOperator( task_id='run_data_quality_checks', schema='public', tables=['artists', 'songs', 'time', 'users', 'songplays']) # Task lists create_tables = [ create_artists_table, create_songplays_table, create_songs_table, create_staging_events_table, create_staging_songs_table, create_time_table, create_users_table, ] stage_data = [ stage_songs_to_redshift,
redshift_conn_id='aws_redshift', table_name='artists', sql_load_query=SqlQueries.artist_table_insert, load_mode='truncate') load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id='aws_redshift', table_name='time', sql_load_query=SqlQueries.time_table_insert, load_mode='truncate') run_quality_checks = DataQualityOperator( task_id='Run_data_quality_checks', dag=dag, table_list=['songplays', 'users', 'songs', 'artists', 'time'], redshift_conn_id='aws_redshift') end_operator = DummyOperator(task_id='Stop_execution', dag=dag) # configuring the Task Dependencies start_operator >> stage_events_to_redshift start_operator >> stage_songs_to_redshift stage_events_to_redshift >> load_songplays_table stage_songs_to_redshift >> load_songplays_table load_songplays_table >> load_user_dimension_table load_songplays_table >> load_song_dimension_table
load_sql_query=SqlQueries.artist_table_insert, table="artists", insert_append=True) load_time_dimension_table = LoadDimensionOperator( task_id='Load_time_dim_table', dag=dag, redshift_conn_id="redshift", load_sql_query=SqlQueries.time_table_insert, table="time", insert_append=True) run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks', dag=dag, redshift_conn_id="redshift", list_tables=[ "staging_events", "staging_songs", "songplays", "users", "songs", "artists", "time" ]) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [stage_events_to_redshift, stage_songs_to_redshift] [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table load_songplays_table >> [ load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table ] [ load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table