Ejemplo n.º 1
0
def check_table(parent_dag_name, task_id, redshift_conn_id, *args, **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    # SONGPLAYS
    quality_check_songplays = DataQualityOperator(
        task_id='Run_data_quality_checks_songplays',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table="public.songplays",
    )

    # ARTISTS
    quality_check_artist = DataQualityOperator(
        task_id='Run_data_quality_checks_artists',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table="public.artists",
    )

    # SONGS
    quality_check_song = DataQualityOperator(
        task_id='Run_data_quality_checks_songs',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table="public.songs",
    )

    # USERS
    quality_check_users = DataQualityOperator(
        task_id='Run_data_quality_checks_users',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table="public.users",
    )

    # TIME
    quality_check_time = DataQualityOperator(
        task_id='Run_data_quality_check_time',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table="public.time",
    )

    return dag
Ejemplo n.º 2
0
def get_dimTables_to_Redshift_subdag(parent_dag_name, task_id,
                                     redshift_conn_id, create_tbl,
                                     target_table, source_table, sql_row,
                                     sql_quality, append_data, pk, *args,
                                     **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    templated_command = """
    echo "**************** The execution date : {{ ds }}"
    echo "**************** {{ task.owner }}, {{ task.task_id }}"
    echo "**************** {{ task_instance_key_str }} is running"
    """

    info_task = BashOperator(
        task_id=f"Info_about_{parent_dag_name}.{task_id}_running",
        dag=dag,
        depends_on_past=False,
        bash_command=templated_command,
    )

    load_dimension_table = LoadDimensionOperator(
        task_id=f"load_{target_table}_dim_table",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        create_tbl=create_tbl,
        target_table=target_table,
        source_table=source_table,
        append_data=append_data,
        pk=pk)

    check_dimension_quality = DataQualityOperator(
        task_id=f"{target_table}_quality",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        target_table=target_table,
        sql_row=sql_row,
        sql_quality=sql_quality,
        pk=pk,
    )

    info_task >> load_dimension_table
    load_dimension_table >> check_dimension_quality
    return dag
Ejemplo n.º 3
0
load_dim_user = LoadDimensionOperator(task_id="Load_user_dim_table",
                                      dag=dag,
                                      table="users",
                                      subquery=SqlQueries.user_table_insert,
                                      redshift_conn_id="redshift")

load_dim_time = LoadDimensionOperator(task_id="Load_time_dim_table",
                                      dag=dag,
                                      table="time",
                                      subquery=SqlQueries.time_table_insert,
                                      redshift_conn_id="redshift")

data_quality = DataQualityOperator(
    task_id="Run_data_quality_checks",
    dag=dag,
    query="SELECT count(1) FROM songplays WHERE userId IS NULL",
    condition="len(records) > 0",
    redshift_conn_id="redshift")

start_operator >> create_tables

create_tables >> stage_logs
create_tables >> stage_songs

stage_logs >> load_fact
stage_songs >> load_fact

load_fact >> load_dim_song
load_fact >> load_dim_artist
load_fact >> load_dim_user
load_fact >> load_dim_time
Ejemplo n.º 4
0
def function_quality_check(
        
        parent_dag_name,
        task_id,
        redshift_conn_id,
    
        table1,
        query1,
    
        table2,
        query2,
    
        table3,
        query3,
        
        table4,
        query4,
    
        table5,
        query5,
    
        *args, **kwargs):
    
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        **kwargs
    )
    
    data_quality_songs_task = DataQualityOperator(
        task_id=f"{table1}_quality_check",
        dag=dag,
        table=table1,
        redshift_conn_id=redshift_conn_id,
        check_query= query1
        
    )
    data_quality_artist_task = DataQualityOperator(
        task_id=f"{table2}_quality_checkt",
        dag=dag,
        table=table2,
        redshift_conn_id=redshift_conn_id,
        check_query= query2
        
    )
    
    data_quality_users_task = DataQualityOperator(
        task_id=f"{table3}_quality_checkt",
        dag=dag,
        table=table3,
        redshift_conn_id=redshift_conn_id,
        check_query= query3
        
    )
    data_quality_time_task = DataQualityOperator(
        task_id=f"{table4}_quality_check",
        dag=dag,
        table=table4,
        redshift_conn_id=redshift_conn_id,
        check_query= query4
        
    )
    
    data_quality_songplay_task = DataQualityOperator(
        task_id=f"{table5}_quality_check",
        dag=dag,
        table=table5,
        redshift_conn_id=redshift_conn_id,
        check_query= query5
        
    )
    
    return dag
    dag=dag,
    redshift_conn_id="redshift",
    database_name="public",
    table_name="artists",
    select_sql=SqlQueries.artist_table_insert)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    database_name="public",
    table_name="time",
    select_sql=SqlQueries.time_table_insert)

run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id="redshift")

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
Ejemplo n.º 6
0
load_zone_table = CopyCsvRedshiftOperator(
    task_id='Load_zone_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='zones',
    s3_bucket=Variable.get('s3_out_aq_bucket'),
    s3_prefix=Variable.get('s3_out_aq_prefix'),
    data_type='air_quality',
    arn=Variable.get('iam_role_arn'),
    ignore_header=1)

quality_checks_zone_table = DataQualityOperator(
    task_id='Quality_checks_zone_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='zone',
    test_query=SqlQueries.zone_table_quality_check,
    expected_res=0)

quality_checks_air_quality_table = DataQualityOperator(
    task_id='Quality_checks_air_quality_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='air_quality',
    test_query=SqlQueries.air_quality_table_quality_check,
    expected_res=0)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Set task dependencies
Ejemplo n.º 7
0
run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id="redshift",
                                         check_statements=[{
                                             "sql":
                                             "SELECT COUNT(*) FROM songplays;",
                                             "operator": "ne",
                                             "value": 0
                                         }, {
                                             "sql":
                                             "SELECT COUNT(*) FROM users;",
                                             "operator": "ne",
                                             "value": 0
                                         }, {
                                             "sql":
                                             "SELECT COUNT(*) FROM songs;",
                                             "operator": "ne",
                                             "value": 0
                                         }, {
                                             "sql":
                                             "SELECT COUNT(*) FROM artists;",
                                             "operator": "ne",
                                             "value": 0
                                         }, {
                                             "sql":
                                             "SELECT COUNT(*) FROM time;",
                                             "operator": "ne",
                                             "value": 0
                                         }])
Ejemplo n.º 8
0
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id='redshift_warehouse',
    table='artists',
    sql_query=SqlQueries.artist_table_insert)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift_warehouse',
    table='"time"',
    sql_query=SqlQueries.time_table_insert)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id='redshift_warehouse',
    tables=['songplays', 'songs', 'artists', 'time', 'users'],
    dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
Ejemplo n.º 9
0
# FACTS TABLE TASKs
####################################################################################
# Task to create and insert data from staging table to fact table the check
load_songplays_table = LoadFactOperator(
    task_id='load_songplays_fact_table',
    dag=dag,
    redshift_conn_id="redshift",
    target_table="songplays",
    create_tbl=CreateTables.songplay_table_create,
    source=SqlQueries.songplay_table_insert,
)

check_songplays_quality = DataQualityOperator(
    task_id='songplays_quality',
    redshift_conn_id='redshift',
    target_table="songplays",
    pk="playid",
    sql_row=SqlQueries.has_rows_songplays,
    sql_quality=SqlQueries.has_null_songplays,
    dag=dag)
####################################################################################
# DIMENSION TABLES TASKS SUBDAG
####################################################################################
# Task to create, insert data and run data quality on table artists
load_artists_dimension_table_id = "load_artists_dimension_table_subdag"
load_artists_dimension_table_task = SubDagOperator(
    subdag=get_dimTables_to_Redshift_subdag(
        "ETL_Sparkify_0",  #name parent dag
        load_artists_dimension_table_id,  #task_id
        "redshift",  #redshift_conn_id
        create_tbl=CreateTables.artist_table_create,  # create_tbl
        target_table="artists",  #t
    load_artist_dimension_table = LoadDimensionOperator(
        task_id='load_artist_dim_table',
        schema='public',
        table='artists',
        sql=SqlQueries.artist_table_insert,
    )

    load_time_dimension_table = LoadDimensionOperator(
        task_id='Load_time_dim_table',
        schema='public',
        table='time',
        sql=SqlQueries.time_table_insert,
    )

    run_quality_checks = DataQualityOperator(
        task_id='run_data_quality_checks',
        schema='public',
        tables=['artists', 'songs', 'time', 'users', 'songplays'])

# Task lists
create_tables = [
    create_artists_table,
    create_songplays_table,
    create_songs_table,
    create_staging_events_table,
    create_staging_songs_table,
    create_time_table,
    create_users_table,
]

stage_data = [
    stage_songs_to_redshift,
Ejemplo n.º 11
0
    redshift_conn_id='aws_redshift',
    table_name='artists',
    sql_load_query=SqlQueries.artist_table_insert,
    load_mode='truncate')

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='aws_redshift',
    table_name='time',
    sql_load_query=SqlQueries.time_table_insert,
    load_mode='truncate')

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    table_list=['songplays', 'users', 'songs', 'artists', 'time'],
    redshift_conn_id='aws_redshift')

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# configuring the Task Dependencies

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
Ejemplo n.º 12
0
    load_sql_query=SqlQueries.artist_table_insert,
    table="artists",
    insert_append=True)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    load_sql_query=SqlQueries.time_table_insert,
    table="time",
    insert_append=True)

run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id="redshift",
                                         list_tables=[
                                             "staging_events", "staging_songs",
                                             "songplays", "users", "songs",
                                             "artists", "time"
                                         ])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table
]
[
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table