table='artists',
    select_sql=SqlQueries.artist_table_insert,
    append_data=True,
    dag=dag)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id='redshift',
    table='time',
    select_sql=SqlQueries.time_table_insert,
    append_data=True,
    dag=dag)

# Data quality task: hands the `dq_checks` collection and the expected result
# (0) to DataQualityOperator, using the 'redshift' connection id.
run_quality_checks = DataQualityOperator(
    dag=dag,
    task_id='Run_data_quality_checks',
    redshift_conn_id='redshift',
    dq_checks=dq_checks,
    expected_result=0,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# start_operator >> create_tables
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
load_user_dimension_table >> run_quality_checks
Example #2
0
    table_name='artists'
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_id='redshift',
    sql=SqlQueries.time_table_insert,
    mode='insert',
    table_name='time'
    
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_id='redshift',
    table_name='songplays'
)
end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)


start_operator >> create_tables
create_tables >> stage_events_to_redshift
create_tables >> stage_songs_to_redshift
stage_songs_to_redshift >> load_songplays_table
stage_events_to_redshift >> load_songplays_table
load_songplays_table >> load_time_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_user_dimension_table
load_time_dimension_table >> run_quality_checks
Example #3
0

create_tables__operator  = DummyOperator(task_id='create_tables__operator',  dag=dag)

# Create table place holder
create_tables__covidcases = DummyOperator(task_id='create_tables__covidcases',  dag=dag)
create_tables__hospital  = DummyOperator(task_id='create_tables__hospital',  dag=dag)
create_tables__masternode  = DummyOperator(task_id='create_tables__masternode',  dag=dag)



quality_checks__operator  = DummyOperator(task_id='quality_checks__operator',  dag=dag)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    tableNames=['covidcases','masternode', 'hospital']
)




end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

start_operator >> stage_to_redshift_operator                

stage_to_redshift_operator >> stage_covid_cases_to_redshift
stage_to_redshift_operator >> stage_hospital_to_redshift
stage_to_redshift_operator >> stage_masternode_to_redshift

stage_covid_cases_to_redshift >> create_tables__operator
load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    redshift_conn_id="redshift",
    table="artists",
    sql_source=SqlQueries.artist_table_insert,
    dag=dag)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id="redshift",
    table="time",
    sql_source=SqlQueries.time_table_insert,
    dag=dag)

run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         redshift_conn_id="redshift",
                                         table="time",
                                         dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
load_user_dimension_table >> run_quality_checks
load_song_dimension_table >> run_quality_checks
load_artist_dimension_table >> run_quality_checks
Example #5
0
    redshift_conn_id="redshift",
    sql_query=SqlQueries.artist_table_insert,
    target_table="artist")

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    sql_query=SqlQueries.time_table_insert,
    target_table="time")

# Data quality task. `sql_queries` is a list of 2-tuples; presumably each is
# (query, expected result) and the operator compares them — confirm against
# DataQualityOperator.
# NOTE(review): the queries are read from `SqlQuery`, while the neighbouring
# load tasks read from `SqlQueries` — confirm `SqlQuery` is really in scope.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    # Was misspelled `reshift_conn_id`; the sibling tasks all pass
    # `redshift_conn_id`, so the connection id was likely never received.
    redshift_conn_id="redshift",
    sql_queries=[(SqlQuery.count_of_nulls_in_songs_table, 0),
                 (SqlQuery.count_of_nulls_in_users_table, 0),
                 (SqlQuery.count_of_nulls_in_artists_table, 0),
                 (SqlQuery.count_of_nulls_in_time_table, 0),
                 (SqlQuery.count_of_nulls_in_songplays_table, 0)])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)
start_operator >> stage_songs_to_redshift
start_operator >> stage_events_to_redshift
stage_songs_to_redshift >> load_songplays_table
stage_events_to_redshift >> load_songplays_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
load_songplays_table >> load_user_dimension_table
load_song_dimension_table >> run_quality_checks
Example #6
0
    aws_credentials_id="aws_credentials",
    sql=SqlQueries.artist_table_insert,
    table="artists")

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    sql=SqlQueries.time_table_insert,
    table="Time")

# Data quality task covering the five listed tables.
# NOTE(review): both `table_array` (a plain string) and `tables` (a list) are
# passed here — confirm which keyword DataQualityOperator actually reads; the
# other is probably a leftover.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    table_array="artists",
    aws_credentials_id="aws_credentials",
    tables=["artists", "users", "songs", "time", "songplays"])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
load_song_dimension_table >> run_quality_checks
Example #7
0
        options=["FORMAT AS PARQUET"])

    load_demographics_dimension_table = StageToRedshiftOperator(
        task_id='Load_demo_dim_table',
        redshift_conn_id='redshift',
        table='dim_us_demographics',
        aws_conn_id='aws_credentials',
        s3_bucket='capstone-v01',
        s3_key='source/demo.parquet',
        schema='public',
        options=["FORMAT AS PARQUET"])

    run_quality_checks = DataQualityOperator(
        task_id='Run_data_quality_checks',
        redshift_conn_id='redshift',
        tables=[
            'dim_us_demographics', 'fact_us_immigration', 'dim_countries',
            'dim_us_states', 'dim_arrival_mode', 'dim_visa', 'dim_orig_port'
        ])

    end_operator = DummyOperator(task_id='Stop_execution')

start_operator >> load_immigration_table
load_immigration_table >> [
    load_origin_dimension_table, load_arrival_dimension_table,
    load_visa_dimension_table, load_countries_dimension_table,
    load_states_dimension_table, load_demographics_dimension_table
] >> run_quality_checks
run_quality_checks >> end_operator

# create_tables >>
Example #8
0
    dag=dag,
    query=SqlQueries.calendars_table_insert,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    operation="insert",
    table="DIM_CALENDARS")
load_dim_calendars_table.set_upstream(create_dim_calendars_table)

create_load_fact_airbnb_amst_table = LoadFactOperator(
    task_id='Create_Load_FACT_AIRBNB_AMST_Table',
    dag=dag,
    query=SqlQueries.CREATE_LOAD_FACT_AIRBNB_AMST,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials")
create_load_fact_airbnb_amst_table.set_upstream(load_dim_hosts_table)
create_load_fact_airbnb_amst_table.set_upstream(load_dim_reviews_table)
create_load_fact_airbnb_amst_table.set_upstream(load_dim_properties_table)
create_load_fact_airbnb_amst_table.set_upstream(load_dim_calendars_table)

## RUN DATA QUALITY CHECKS TO ENSURE THAT RECORDS HAVE BEEN MOVED CORRECTLY ACROSS PLATFORMS WITHOUT ANY ERRORS
run_quality_checks = DataQualityOperator(task_id='Run_DATA_QUALITY_Checks',
                                         dag=dag,
                                         redshift_conn_id="redshift")
run_quality_checks.set_upstream(create_load_fact_airbnb_amst_table)

## DUMMY OPERATOR to indicate that the DAG has run successfully

end_operator = DummyOperator(task_id='END_OPERATOR', dag=dag)

end_operator.set_upstream(run_quality_checks)
    redshift_conn_id="redshift",
    table="artists",
    specified_sql=SqlQueries.artist_table_insert,
    append_insert_type=True,
    primary_key="artistid")

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    specified_sql=SqlQueries.time_table_insert)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    query="SELECT COUNT(*) FROM songplays WHERE songplay_id is NULL",
    ideal_result=0)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Task ordering, expressed as a single chain:
#   start -> both staging tasks -> songplays fact load
#         -> the four dimension loads -> quality checks
(
    start_operator
    >> [stage_events_to_redshift, stage_songs_to_redshift]
    >> load_songplays_table
    >> [
        load_song_dimension_table,
        load_user_dimension_table,
        load_artist_dimension_table,
        load_time_dimension_table,
    ]
    >> run_quality_checks
)
Example #10
0
    aws_credentials_id='aws_credentials',
    redshift_conn_id='redshift',
    s3_key='song_data/',
    s3_bucket='dend',
    table='staging_songs',
    file_extension='json')

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    postgres_conn_id='redshift',
    table='songplays',
    sql_query=SqlQueries.songplay_table_insert)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    postgres_conn_id='redshift',
    sql_queries=[
        'select count(1) from public."artists" where artistid is null ',
        'select count(1) from public."songplays" where userid is null'
    ],
    expected_result=[0, 0])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift
                   ] >> load_songplays_table
load_songplays_table >> dimension_subdag_task >> run_quality_checks
run_quality_checks >> end_operator
Example #11
0
        aws_credentials_id="aws_credentials",
        table="time",
        sql=SqlQueries.time_table_insert,
        append=False
    )

    # Data quality task: each entry of `quality_checks` pairs a COUNT(*) query
    # with an `expected_result`.
    #  - The first five queries count rows whose key column IS NULL and
    #    expect 0.
    #  - The last five count rows whose key column IS NOT NULL and carry
    #    expected_result -1. COUNT(*) is never negative, so -1 is presumably a
    #    sentinel that DataQualityOperator interprets specially (e.g. "any
    #    non-zero count") — TODO confirm against the operator implementation.
    run_quality_checks = DataQualityOperator(
        task_id='Run_data_quality_checks',
        dag=dag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        quality_checks=[
            {'sql': "SELECT COUNT(*) FROM songplays WHERE playid is null", 'expected_result': 0},
            {'sql': "SELECT COUNT(*) FROM users WHERE userid is null", 'expected_result': 0},
            {'sql': "SELECT COUNT(*) FROM songs WHERE songid is null", 'expected_result': 0},
            {'sql': "SELECT COUNT(*) FROM artists WHERE artistid is null", 'expected_result': 0},
            {'sql': "SELECT COUNT(*) FROM time WHERE start_time is null", 'expected_result': 0},

            {'sql': "SELECT COUNT(*) FROM songplays WHERE playid is NOT null", 'expected_result': -1},
            {'sql': "SELECT COUNT(*) FROM users WHERE userid is NOT null", 'expected_result': -1},
            {'sql': "SELECT COUNT(*) FROM songs WHERE songid is NOT null", 'expected_result': -1},
            {'sql': "SELECT COUNT(*) FROM artists WHERE artistid is NOT null", 'expected_result': -1},
            {'sql': "SELECT COUNT(*) FROM time WHERE start_time is NOT null", 'expected_result': -1}

        ]
    )

    end_operator = DummyOperator(task_id='Stop_execution')


# task dependencies
Example #12
0
    delete_load=True)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='time',
    sql_statement=SqlQueries.time_table_insert,
    delete_load=True)

# Data quality task: one row-count query per table, each paired with the
# hard-coded count stored under 'result'.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    tests=[
        {'query': 'SELECT count(*) FROM ' + table_name, 'result': row_count}
        for table_name, row_count in (('songplays', 6820), ('users', 104))
    ],
)
end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table
]
[
    load_user_dimension_table, load_song_dimension_table,
Example #13
0
    append_insert=True,
    primary_key="artistid",
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='time',
    select_sql=SqlQueries.time_table_insert,
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    sample_query='SELECT COUNT(*) FROM songs WHERE songid IS NULL;',
    expected_result=0,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_user_dimension_table >> run_quality_checks
load_songplays_table >> load_song_dimension_table >> run_quality_checks
load_songplays_table >> load_artist_dimension_table >> run_quality_checks
    dag=dag,
    redshift_conn_id="redshift",
    table='time',
    load_query=SqlQueries.time_table_insert,
    append_only=False)

# Data quality task: every listed table gets the same 'expected_result' of 1.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    tables=[
        {'name': table_name, 'expected_result': 1}
        for table_name in ('songplays', 'users', 'songs', 'artists', 'time')
    ],
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]

[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
Example #15
0
    append_only=False,
    dag=dag)

load_time_dimension_table = LoadDimensionOperator(
    task_id='load_time_dim_table',
    redshift_conn_id="redshift",
    load_sql=SqlQueries.time_table_insert,
    table_name="public.time",
    append_only=False,
    dag=dag)

run_quality_checks = DataQualityOperator(task_id='run_data_quality_checks',
                                         redshift_conn_id="redshift",
                                         tables=[
                                             "public.artists",
                                             "public.songplays",
                                             "public.songs", "public.time",
                                             "public.users"
                                         ],
                                         dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Order of tasks in the DAG
start_operator >> create_tables

create_tables >> [stage_events_to_redshift, stage_songs_to_redshift
                  ] >> load_songplays_table

load_songplays_table >> [
    load_user_dimension_table, load_song_dimension_table,
Example #16
0
    table='artists',
    query=SqlQueries.artist_table_insert,
    insert_mode='truncate-insert')

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    provide_context=True,
    conn_id='redshift',
    table='time',
    query=SqlQueries.time_table_insert,
    insert_mode='truncate-insert')

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    provide_context=True,
    conn_id='redshift',
    tables=['songplays', 'users', 'songs', 'artists', 'time'])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift
                   ] >> load_songplays_table

load_songplays_table >> [
    load_user_dimension_table, load_artist_dimension_table,
    load_song_dimension_table, load_time_dimension_table
] >> run_quality_checks

run_quality_checks >> end_operator
    trunc_insert=True,
    sql_stmt=SqlQueries.artist_table_insert
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    trunc_insert=True,
    sql_stmt=SqlQueries.time_table_insert
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    tables=['songplays', 'users', 'songs', 'artists', 'time']
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)


start_operator >> create_tables

create_tables >> stage_events_to_redshift
create_tables >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_user_dimension_table
    update_mode="insert",
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    sql_stmt=SqlQueries.time_table_insert,
    update_mode="insert",
)

# check that songplays has rows
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    table="songplays",
)

# check that songplays does not have NULL values in the playid column
run_case_quality_checks = TestDataOperator(
    task_id='Run_data_case_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    sql="select COUNT(*) from songplays where playid is NULL ",
    expected=0)

# just a dummy operator; it executes nothing
end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# DAG task dependencies
Example #19
0
    redshift_conn_id="redshift",
    table="artists",
    select_sql=SqlQueries.artist_table_insert,
    append_data=True)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    select_sql=SqlQueries.time_table_insert,
    append_data=True)

run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id="redshift",
                                         table="songplays",
                                         column_check="start_time",
                                         expected_result=0)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
load_user_dimension_table >> run_quality_checks
load_song_dimension_table >> run_quality_checks
Example #20
0
    table_name='artists',
    truncate_table=True,
    sql_query=SqlQueries.artist_table_insert
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table_name='time',
    truncate_table=True,
    sql_query=SqlQueries.time_table_insert
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)


## Dependencies

#start_operator >> stage_events_to_redshift
#start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
    dag=dag,
    redshift_conn_id="redshift",
    table="artist",
    sql="artist_table_insert",
    append_only=False)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    sql="time_table_insert",
    append_only=False)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    tables=["songplay", "users", "song", "artist", "time"])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Task dependencies
start_operator >> stage_events_to_redshift >> load_songplays_table
start_operator >> stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table >> run_quality_checks
load_songplays_table >> load_song_dimension_table >> run_quality_checks
load_songplays_table >> load_artist_dimension_table >> run_quality_checks
load_songplays_table >> load_time_dimension_table >> run_quality_checks
run_quality_checks >> end_operator
    sql_list = [SqlQueries.dim_arrdate_time_insert,SqlQueries.dim_depdate_time_insert],
    remove_old_data = False
)

# Fact load task: targets the `fact_visit_event` table with the statement
# SqlQueries.fact_visit_insert over the 'redshift' connection id.
# NOTE(review): `provider_context` looks like a misspelling of Airflow's
# `provide_context` — confirm which keyword (if any) LoadFactOperator accepts;
# an unrecognised keyword is otherwise forwarded to the base operator.
load_fact_visit = LoadFactOperator(
    task_id='Load_fact_visit_event_table',
    dag = dag,
    provider_context=True,
    redshift_conn_id='redshift',
    table='fact_visit_event',
    sql = SqlQueries.fact_visit_insert
)

run_quality_checks = DataQualityOperator(
    task_id='Run_Data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    tables=['fact_visit_event','dim_time','dim_visitor']
)


end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)


# DAG definition

dim_tables_tasks = [
    load_dim_visitor,
    load_dim_time
]

start_operator >> stage_i94immi_operator >> dim_tables_tasks >> load_fact_visit >> run_quality_checks >> end_operator
Example #23
0
# Data quality task. Every check pairs a COUNT(*) query ("test_sql") with the
# result set stored under "expected_result" (a one-row, one-column list of
# tuples); presumably the operator compares the two — confirm against
# DataQualityOperator.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    data_quality_checks=[
        # Hard-coded row counts, one check per table.
        {
            "test_sql": "SELECT COUNT(*) FROM {}".format(table_name),
            "expected_result": [(row_count, )],
        }
        for table_name, row_count in (
            ("staging_events", 8056),
            ("staging_songs", 385252),
            ("songplays", 7039),
            ("artists", 45266),
            ("songs", 384824),
            ("users", 104),
            ("time", 7039),
        )
    ] + [
        # Columns whose NULL count is expected to be zero.
        {
            "test_sql": "SELECT COUNT(*) FROM {} WHERE {} IS NULL".format(
                table_name, column_name),
            "expected_result": [(0, )],
        }
        for table_name, column_name in (
            ("songplays", "playid"),
            ("songplays", "start_time"),
            ("artists", "artistid"),
            ("songs", "songid"),
            ("users", "userid"),
            ("time", "start_time"),
        )
    ])
Example #24
0
                                 table_name="artists")
load_artist_dimension_table = SubDagOperator(subdag=artist_subdag,
                                             task_id='Load_artist_dim_table',
                                             dag=dag)
# time table
time_subdag = dimension_SubDAG(parent_dag=root_dag_name,
                               task_id='Load_time_dim_table',
                               conn_id="redshift",
                               start_date=default_args['start_date'],
                               query=SqlQueries.time_table_insert,
                               table_name="time")
load_time_dimension_table = SubDagOperator(subdag=time_subdag,
                                           task_id='Load_time_dim_table',
                                           dag=dag)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    conn_id="redshift",
    tables=["time", "users", "artists", "songplays", "songs"],
    dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# order
start_operator >> create_tables_in_redshift >> [
    stage_songs_to_redshift, stage_events_to_redshift
] >> load_songplays_table >> [
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table
] >> run_quality_checks >> end_operator
Example #25
0
    sql_query=SqlQueries.artist_table_insert
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    sql_query=SqlQueries.time_table_insert
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]

[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table

load_songplays_table >> [load_song_dimension_table, load_user_dimension_table, load_artist_dimension_table, load_time_dimension_table]

[load_song_dimension_table, load_user_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks

run_quality_checks >> end_operator
Example #26
0
load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    redshift_conn_id="redshift",
    table="artists",
    dag=dag,
    ApporTrun="Trun")

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id="redshift",
    table="time",
    dag=dag,
    ApporTrun="Trun")

run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id="redshift")

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
load_user_dimension_table >> run_quality_checks
load_song_dimension_table >> run_quality_checks
load_artist_dimension_table >> run_quality_checks
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    select_data_statement=SqlQueries.time_table_insert,
    table_name='time',
    truncate=True
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    empty_check=['artists', 'songplays', 'songs', 'time', 'users'],
    null_check={
        'artists': ['artist_id'],
        'songplays': ['songplay_id', 'artist_id', 'song_id', 'userid'],
        'songs': ['song_id'],
        'users': ['userid']
    }
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_artist_dimension_table
Example #28
0
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table='artists',
    truncate=True,
    sql_stmt=SqlQueries.artist_table_insert)

# Dimension load task for the `time` table, driven by
# SqlQueries.time_table_insert with truncate=True.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    # Passed explicitly for consistency with the sibling artist load, instead
    # of silently relying on whatever default the operator declares.
    redshift_conn_id="redshift",
    table='time',
    truncate=True,
    sql_stmt=SqlQueries.time_table_insert)

# Data quality task covering the fact table and the four dimension tables.
data_quality_check = DataQualityOperator(
    # Plain string: the original f-string had no placeholders.
    task_id="data_quality_checks",
    redshift_conn_id="redshift",
    tables=["songplays", "users", "songs", "artists", "time"],
    dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift >> load_songplays_table
start_operator >> stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table >> data_quality_check
load_songplays_table >> load_song_dimension_table >> data_quality_check
load_songplays_table >> load_artist_dimension_table >> data_quality_check
load_songplays_table >> load_time_dimension_table >> data_quality_check

data_quality_check >> end_operator
# For every entry of copy_s3_bucket_keys, create a copy task followed by a
# quality-check task, and hang the pair between start_operator and end_operator.
# Each entry is read via the keys 'name', 'key', 'file_format', 'sep' and
# 'dq_checks'.
for table in copy_s3_bucket_keys:
    copy_table_from_s3 = CopyToRedshiftOperator(
        task_id='copy_{}_from_s3'.format(table["name"]),
        table=table['name'],
        s3_bucket=S3_BUCKET,
        s3_key=table['key'],
        file_format=table['file_format'],
        delimiter=table['sep'],
        role_arn=REDSHIFT_ARN,
        aws_credentials_id='aws_credentials',
        redshift_conn_id='redshift',
        dag=dag,
    )

    quality_check_table = DataQualityOperator(
        task_id='quality_check_{}_table'.format(table["name"]),
        table=table['name'],
        dq_checks=table['dq_checks'],
        redshift_conn_id='redshift',
        dag=dag,
    )

    # start -> copy -> quality check -> end
    (
        start_operator
        >> copy_table_from_s3
        >> quality_check_table
        >> end_operator
    )

for table in sas_source_code_tables_data:
    load_table_from_sas_source_code = SASValueToRedshiftOperator(
        task_id=f'load_{table["name"]}_from_sas_source_code',
        dag=dag,
        aws_credentials_id='aws_credentials',
        redshift_conn_id='redshift',
        table=table['name'],
        s3_bucket=S3_BUCKET,
Example #30
0
        'table': 'times',
        'sql': SqlQueries.time_table_insert
    }),
    start_date=default_args['start_date'],
),
                                 task_id="load_dimensions_subdag",
                                 dag=dag)

run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         conn_id='redshift',
                                         queries=({
                                             "table": "times",
                                             "where": "start_time IS NULL",
                                             "result": 0
                                         }, {
                                             "table": "songs",
                                             "where": "songid IS NULL",
                                             "result": 0
                                         }, {
                                             "table": "songplays",
                                             "result": 20460
                                         }))

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> table_creation
table_creation >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> load_dim_subdag
load_dim_subdag >> run_quality_checks