Example #1
0
def get_fact_table_dag(parent_dag_name,
                       task_id,
                       redshift_conn_id,
                       table,
                       columns,
                       append=True,
                       *args,
                       **kwargs):
    """Build a sub-DAG that creates, loads, and quality-checks one fact table.

    The sub-DAG id follows Airflow's ``<parent>.<child>`` convention so it can
    be attached via a SubDagOperator with the same ``task_id``.
    """
    subdag = DAG(f"{parent_dag_name}.{task_id}", *args, **kwargs)

    start = DummyOperator(task_id='Begin_execution', dag=subdag)

    # DDL for the target table comes from the shared query registry.
    create = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=subdag,
        postgres_conn_id=redshift_conn_id,
        sql=SQLQueries.create_load_test_queries[table]["CREATE"])

    load = LoadFactOperator(
        task_id=f"load_{table}_fact_table",
        dag=subdag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        columns=columns,
        sql_stmt=SQLQueries.create_load_test_queries[table]["LOAD"],
        append=append)

    quality = DataQualityOperator(
        task_id=f"data_quality_check_{table}_fact_table",
        dag=subdag,
        redshift_conn_id=redshift_conn_id,
        table=table,
    )

    end = DummyOperator(task_id='End_execution', dag=subdag)

    # Strictly linear pipeline: create -> load -> check.
    start >> create >> load >> quality >> end

    return subdag
Example #2
0
def run_data_quality_subdag(parent_dag_name, child_dag_name, args, checks):
    """Return a sub-DAG with one data-quality task per entry in *checks*.

    Each item of *checks* is a mapping providing a ``table`` name and the
    ``tests`` to run against it.
    """
    subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
    )
    # Every task names the sub-DAG explicitly, so no ``with`` block is needed.
    for check in checks:
        table_name = check.get('table')
        DataQualityOperator(
            task_id=f'Run_{table_name}_data_quality_check',
            default_args=args,
            dag=subdag,
            redshift_conn_id='redshift',
            table=table_name,
            queries=SqlQueries,
            tests=check.get('tests'))

    return subdag
def get_load_check_dag(
    parent_dag_name="",
    task_id="",
    conn_id="",
    table="",
    sql_select="",
    load_type="",
    start_date="",
    *args,
    **kwargs,
):
    """Build a sub-DAG that loads a fact or dimension table and checks it.

    Parameters
    ----------
    parent_dag_name : str
        Parent DAG name; the sub-DAG id becomes ``<parent>.<task_id>``.
    task_id : str
        Task id of the SubDagOperator wrapping this DAG.
    conn_id : str
        Airflow connection id used by the load operator.
    table : str
        Target table name.
    sql_select : str
        SELECT statement providing the rows to load.
    load_type : str
        ``"fact"`` or ``"dimension"``; selects the load operator class.
    start_date : datetime
        Start date forwarded to the DAG.
    **kwargs
        Extra keyword arguments forwarded to the DAG constructor.
        Positional ``*args`` are accepted for call compatibility but are
        not forwarded.

    Returns
    -------
    DAG
        The configured sub-DAG.

    Raises
    ------
    ValueError
        If *load_type* is neither ``"fact"`` nor ``"dimension"``.
    """
    # Validate up front so a bad load_type fails fast, before any DAG or
    # operator objects are constructed (previously validation ran after
    # the DAG was already created).
    if load_type == "fact":
        load_operator = LoadFactOperator
    elif load_type == "dimension":
        load_operator = LoadDimensionOperator
    else:
        raise ValueError(f"Unknown load_type {load_type}")

    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        start_date=start_date,
        **kwargs,
    )

    load_task = load_operator(task_id=f"load_{table}_{load_type}_table",
                              dag=dag,
                              conn_id=conn_id,
                              table=table,
                              sql_select=sql_select)

    # Simple non-empty check on the freshly loaded table.
    run_quality_check_task = DataQualityOperator(
        task_id=f'run_data_quality_check_{table}',
        dag=dag,
        sql_quality=f"SELECT COUNT(*) FROM {table}",
        condition=test_contains_rows,
    )

    load_task >> run_quality_check_task

    return dag
def get_data_quality_dag(parent_dag_name, task_id, conn_id, tests, *args,
                         **kwargs):
    """Build a sub-DAG running one independent data-quality task per spec."""
    # DAG parameters are inherited from the parent via **kwargs.
    quality_dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    logging.info("Run all Tests")

    # One task per test spec; no dependencies are declared between them,
    # so all checks may run in parallel.
    for spec in tests:
        DataQualityOperator(
            task_id=spec['id'],
            dag=quality_dag,
            conn_id=conn_id,
            test_sql=spec['test_sql'],
            expected_result=spec['expected_result'],
            test_callable=test_func,
            test_name=spec['id'])

    return quality_dag
Example #5
0
def load_facts(parent_dag_name, child_dag_name, start_date, redshift_conn_id):
    """Return a sub-DAG that loads the bookings fact table and validates it."""
    subdag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        start_date=start_date,
    )

    load_bookings = LoadFactOperator(
        task_id='load_bookings',
        dag=subdag,
        redshift_conn_id=redshift_conn_id,
        sql=load_statements.LOAD_BOOKING_FACTS,
        table='fact_bookings')

    check_bookings = DataQualityOperator(
        task_id='data_quality_checks_facts',
        dag=subdag,
        tables='fact_bookings',
        redshift_conn_id=redshift_conn_id,
        sql='SELECT COUNT(*) FROM {}')

    # Quality check runs only after the fact load completes.
    load_bookings >> check_bookings

    return subdag
    redshift_conn_id='redshift',
    sql_statement=SqlQueries.artist_table_insert)

# Load the time dimension table from the staged data.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table_name='time',
    redshift_conn_id='redshift',
    sql_statement=SqlQueries.time_table_insert)

# Fail the run if any primary-key column in users/songs contains NULLs.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    dq_checks=[{
        'check_sql': "SELECT COUNT(*) FROM users WHERE userid is null",
        'expected_results': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM songs WHERE songid is null",
        'expected_results': 0
    }])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Staging runs in parallel, then the fact load, then the dimension loads.
start_operator >> stage_events_to_redshift >> load_songplays_table
start_operator >> stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
# NOTE(review): only the user dimension gates the quality check — confirm
# whether the other dimension loads were meant to gate it as well.
load_user_dimension_table >> run_quality_checks
    dag=dag
    redshift_conn_id="redshift",
    table='artists',
    sql_statement=SqlQueries.artist_table_insert
)

# Load the time dimension from the staging data.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,  # fixed: missing comma after dag=dag was a syntax error
    redshift_conn_id="redshift",
    table='time',
    sql_statement=SqlQueries.time_table_insert
)

# Validate row contents in all loaded tables.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,  # fixed: missing comma after dag=dag was a syntax error
    redshift_conn_id="redshift",
    tables=['songplays', 'users', 'songs', 'artists', 'time']
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)


# Both staging tasks start the pipeline in parallel.
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift


# The fact-table load waits on both staging tasks.
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
Example #8
0
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="artists",
    sql_query=SqlQueries.artist_table_insert)

# Load the time dimension table.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    sql_query=SqlQueries.time_table_insert)

# Validate every warehouse table after loading.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    table_list=["songplay", "users", "songs", "artists", "time"])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Create tables first, then stage in parallel, then load fact + dimensions.
start_operator >> creates_tables

creates_tables >> stage_songs_to_redshift
creates_tables >> stage_events_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
Example #9
0
    target_table='artists',
    truncate_before_load=True)

# Load the time dimension, truncating any previous contents first.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    load_query=SqlQueries.time_table_insert,
    target_table='time',
    truncate_before_load=True)

# One validation query per warehouse table.
run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id="redshift",
                                         queries_and_results=[
                                             SqlQueries.songplays_validation,
                                             SqlQueries.songs_validation,
                                             SqlQueries.artists_validation,
                                             SqlQueries.users_validation,
                                             SqlQueries.time_validation
                                         ])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Stage in parallel, then load the fact table, then the dimensions.
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_user_dimension_table
Example #10
0
# Default arguments applied to every task in the DAG.
default_args = {
    'owner': 'sparkify',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 1),
    'retries': 0,
    'retry_delay': timedelta(seconds=15),
    'email_on_retry': False
}

# Hourly quality-check pipeline, one active run at a time.
# Fixed: 'catchup_by_default' is an airflow.cfg setting name, not a task
# argument — placing it in default_args had no effect. Backfill is disabled
# with the DAG-level `catchup` parameter instead.
dag = DAG('etl_dag_quality',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          catchup=False,
          max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Quality-check the tables listed in the external dag_config mapping.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    target_table=dag_config['data_quality_check_tables'],
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> run_quality_checks
run_quality_checks >> end_operator
                                            table_to='state',
                                            options=["FORMAT AS PARQUET"],
                                            dag=dag)

# Stage the date-dimension parquet files from S3 into Redshift.
date_to_redshift = StageToRedshiftOperator(task_id='Date_Dimension_Table',
                                           aws_conn_id='aws_credentials',
                                           redshift_conn_id="redshift",
                                           s3_from='data-engineer-capstone',
                                           s3_prefix='date.parquet',
                                           schema_to='public',
                                           table_to='date',
                                           options=["FORMAT AS PARQUET"],
                                           dag=dag)

# Validate all staged tables in a single task.
run_quality_checks = DataQualityOperator(
    task_id='Data_Quality_Checks',
    redshift_conn_id="redshift",
    tables=['immigration', 'country', 'state', 'date'],
    dag=dag)

end_operator = DummyOperator(task_id='End', dag=dag)

# Immigration loads first; the three dimensions fan out from it, and all
# three must finish before the quality checks run.
start_operator >> immigration_to_redshift
immigration_to_redshift >> country_to_redshift
immigration_to_redshift >> state_to_redshift
immigration_to_redshift >> date_to_redshift
country_to_redshift >> run_quality_checks
state_to_redshift >> run_quality_checks
date_to_redshift >> run_quality_checks
run_quality_checks >> end_operator
    dimension_name='artists'
)

# Load the time dimension.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    dimension_name='time'
)

# Compare exact row counts against expected values.
# NOTE(review): the expected counts are snapshots of one specific dataset —
# these checks will fail if the source data changes; confirm that is intended.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    tests=[
        {"test": count_query.format("songplays"), "expected_result": [6820] } , 
        {"test": count_query.format("songs"), "expected_result": [14896] },
        {"test": count_query.format("users"), "expected_result": [104] },
        {"test": count_query.format("artists"), "expected_result": [10025] },
        {"test": count_query.format("time"), "expected_result": [6813] }
    ]
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

# Recreate tables, then stage in parallel, then load the fact table.
start_operator >> drop_and_create_tables_task

drop_and_create_tables_task >> stage_events_to_redshift
drop_and_create_tables_task >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
    table="Staging_aggregations",
    conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="social-system-test/",
    s3_key="temp/post_agg_{run_id}.csv",
    region="us-east-1",
    file_type="csv")

# Load the history fact table, truncating it before each run.
load_history_table = LoadFactOperator(
    task_id='Load_history_fact_table',
    dag=dag,
    provide_context=True,
    conn_id='redshift',
    table='history',
    query=SqlQueries.get_profile_history,
    truncate=True,
)

# Verify the history table after loading.
run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         provide_context=True,
                                         conn_id='redshift',
                                         tables=["history"])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Users stage directly; aggregations are fetched (get_ES_data) before staging.
# The fact load waits on both staging tasks.
start_operator >> stage_users_to_redshift
start_operator >> get_ES_data >> stage_aggregations_to_redshift
[stage_users_to_redshift, stage_aggregations_to_redshift] >> load_history_table
load_history_table >> run_quality_checks >> end_operator
Example #14
0
                                        sql='/sql/fact_reviews.sql',
                                        postgres_conn_id='redshift')
# The reviews fact table requires all three dimensions to be in place.
process_fact_reviews.set_upstream(
    [process_dim_times, process_dim_users, process_dim_business])

# Add foreign keys once both fact tables exist.
process_fk = PostgresOperator(dag=dag,
                              task_id='process_foreign_keys',
                              sql='/sql/dim_fk.sql',
                              postgres_conn_id='redshift')
process_fk.set_upstream([process_fact_tips, process_fact_reviews])

# Each check: a table, an optional WHERE filter, and the expected count.
run_quality_checks = DataQualityOperator(task_id='run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id='redshift',
                                         queries=({
                                             "table": "dim_times",
                                             "where": "day IS NULL",
                                             "result": 0
                                         }, {
                                             "table": "fact_review",
                                             "where": "user_id IS NULL",
                                             "result": 0
                                         }, {
                                             "table": "fact_review",
                                             "result": 6685900
                                         }))
run_quality_checks.set_upstream(process_fk)

end_operator = DummyOperator(dag=dag, task_id='end_operator')
end_operator.set_upstream(run_quality_checks)
    insert_mode='truncate')

# Load the time dimension, replacing existing rows (truncate mode).
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    provide_context=True,
    redshift_conn_id=redshift_credentials_id,
    target_table='time',
    sql=SqlQueries.time_table_insert,
    insert_mode='truncate')

# NOTE(review): test_sql/test_tbl are empty strings and 'expcted_results'
# looks misspelled — confirm against DataQualityOperator's parameter names.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    provide_context=True,
    retries=3,
    redshift_conn_id=redshift_credentials_id,
    test_sql='',
    test_tbl='',
    expcted_results=0)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

###################
# add ordering
###################
# Stage in parallel, then load the fact table.
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
# Load the time dimension from the songplay staging data.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    table='time',
    select_sql_stmt=SqlQueries.time_table_insert,
    dag=dag)

# SQL template for simple row-count checks.
sql_count = 'SELECT COUNT(*) FROM {}'


# PEP 8 (E731): named functions instead of lambdas assigned to names —
# same call behavior, but readable and properly documented.
def has_rows_checker(records):
    """Return True when *records* is one single-column row with count > 0."""
    return len(records) == 1 and len(records[0]) == 1 and records[0][0] > 0


def has_no_rows_checker(records):
    """Return True when *records* is one single-column row with count == 0."""
    return len(records) == 1 and len(records[0]) == 1 and records[0][0] == 0
# One count query per table plus a NULL check on users.first_name;
# checker i validates the result of SQL statement i.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    postgres_conn_id='redshift',
    sql_stmts=(sql_count.format('songplays'), sql_count.format('users'),
               sql_count.format('songs'), sql_count.format('artists'),
               sql_count.format('time'),
               'SELECT COUNT(*) FROM users WHERE first_name IS NULL'),
    result_checkers=(has_rows_checker, has_rows_checker, has_rows_checker,
                     has_rows_checker, has_rows_checker, has_no_rows_checker),
    dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Define dependencies
# Stage in parallel, then load the fact table once both finish.
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_song_dimension_table
        parent_dag_name=dag_name,
        task_id=load_artist_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="users",
        start_date=datetime(2018, 5, 1),
        sql_query=SqlQueries.artist_table_insert,
    ),
    task_id=load_artist_dimension_table_task_id,
    dag=dag,
)

# Check all fact and dimension tables after loading.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    tables=["songplay", "users", "song", "artist", "time"])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Setting tasks dependencies

# Create tables first, then run both staging tasks in parallel.
start_operator >> create_redshift_tables >> [
    stage_songs_to_redshift, stage_events_to_redshift
]

[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table

load_songplays_table >> [
Example #18
0
    append_only=False)

# Fully reload the time dimension (append_only=False replaces contents).
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    sql=getattr(SqlQueries, "time_table_insert"),
    append_only=False)

# Each '{}' check template is rendered per table by the operator.
# NOTE(review): expected_result True/False for COUNT(*) queries implies the
# operator treats results as truthy — confirm its check semantics.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    tables=["songplays", "users", "songs", "artists", "time"],
    checks=[{
        'check_sql': "SELECT COUNT(*) FROM {}",
        'expected_result': True
    }, {
        'check_sql': "SELECT COUNT(*) FROM {} WHERE {} IS NULL",
        'expected_result': False
    }])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Stage in parallel, then load the fact table.
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]

[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table

load_songplays_table >> [
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table
    dag=dag,
    sql=SqlQueries.artist_table_insert,
    postgres_conn_id='redshift',
    mode='append',
    table='artists')

# Append new rows to the time dimension.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    sql=SqlQueries.time_table_insert,
    postgres_conn_id='redshift',
    mode='append',
    table='time')

# Validate every loaded table.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    table=['songplays', 'songs', 'artists', 'users', 'time'])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Create tables, stage, load the fact, then fan out to the dimensions;
# quality checks wait on the song and user dimension loads.
start_operator >> create_table
create_table >> stage_events_to_redshift
create_table >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
load_song_dimension_table >> run_quality_checks
load_user_dimension_table >> run_quality_checks
    redshift_conn_id="redshift",
    table="time",
    truncate_table=True,
    select_query=SqlQueries.time_table_insert
)

# Count checks: each table must hold exactly the expected number of rows.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    tables=["songplays", "users", "songs", "artists", "time"],
    dq_checks=[
        {'check_sql': 'SELECT COUNT(*) FROM songplays', 'expected_result': 47740},
        {'check_sql': 'SELECT COUNT(*) FROM users', 'expected_result': 104},
        {'check_sql': 'SELECT COUNT(*) FROM songs', 'expected_result': 14896},
        {'check_sql': 'SELECT COUNT(*) FROM artists', 'expected_result': 10025},
        {'check_sql': 'SELECT COUNT(*) FROM time', 'expected_result': 47740},
    ],
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Setting up the dependencies and order
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
Example #21
0
    table='artists',
    redshift_conn_id='redshift',
    query=SqlQueries.artist_table_insert)

# Reload the time dimension using insert-delete semantics.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    switch='insert-delete',
    table='time',
    redshift_conn_id='redshift',
    query=SqlQueries.time_table_insert)

#task to check data quality
# Tables to check are passed through the task's params dict.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    provide_context=True,
    params={'table': ['artists', 'songplays', 'songs', 'users']})

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

#configure task dependencies
# Stage in parallel, load the fact, then fan out to all four dimensions.
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table
]
[
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table
# Load data into the time table from the staging table.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    sql=SqlQueries.time_table_insert,
    # NOTE(review): 'loding_mode' looks misspelled ('loading_mode'?) —
    # confirm against the operator's actual parameter name.
    loding_mode="delete-load")

# Data quality check: each named check maps its SQL to the expected result.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    params={
        #Check name: {sql for check: result}
        "sonig_null_check": {
            SqlQueries.songid_null_check: 0
        },
        "artistid_null_check": {
            SqlQueries.artistid_null_check: 0
        }
    })

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Stage in parallel, load the fact, then the dimensions.
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
    aws_credentials_id='aws_credentials',
    sqlWrite=SqlQueries.artist_table_insert)

# Load the time dimension.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table='time',
    redshift_conn_id='redshift_conn_id',
    aws_credentials_id='aws_credentials',
    sqlWrite=SqlQueries.time_table_insert)

# Run the dq_checks suite over every staging and warehouse table.
run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id='redshift_conn_id',
                                         aws_credentials_id='aws_credentials',
                                         table=[
                                             "artists", "songplays", "songs",
                                             "staging_events", "staging_songs",
                                             "time", "users"
                                         ],
                                         dq_checks=dq_checks)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Table-creation wiring is intentionally disabled; staging starts directly.
#start_operator >> task_create_tables

#task_create_tables >> stage_events_to_redshift
#task_create_tables >> stage_songs_to_redshift

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
    dag=dag,
    conn_id="redshift",
    table="public.artists",
    sql_statement=SqlQueries.artist_table_insert)

# Load the time Redshift table using the user-defined operator.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    conn_id="redshift",
    table="public.time",
    sql_statement=SqlQueries.time_table_insert)

# Check the data quality of the time table to verify the records loaded.
run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         conn_id="redshift",
                                         table_name="public.time")

# End operator
end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Task dependencies.
# Designed so that both staging tasks run in parallel,
# and all dimension loads run in parallel after the fact load.
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table
]
[
    truncate=True
)

# Load the time dimension.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    sql_query=SqlQueries.time_table_insert,
    table='time'
)

# Fail if any primary-key column contains NULLs.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    dq_checks=[
        {'check_sql': 'SELECT COUNT(*) FROM users WHERE userid IS NULL', 'result': 0},
        {'check_sql': 'SELECT COUNT(*) FROM artists WHERE artistid IS NULL', 'result': 0 },
        {'check_sql': 'SELECT COUNT(*) FROM songs WHERE songid IS NULL', 'result': 0},
        {'check_sql': 'SELECT COUNT(*) FROM time WHERE start_time IS NULL', 'result': 0 },
        {'check_sql': 'SELECT COUNT(*) FROM songplays WHERE playid IS NULL', 'result': 0}
    ]
)
end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

# Create tables, stage in parallel, load the fact table, load all
# dimensions in parallel, then quality-check and finish.
start_operator >> create_tables
create_tables >> [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [load_song_dimension_table, load_user_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks >> end_operator


    table='artists',
    replace=True,
    task_id='Load_artist_dim_table',
    dag=dag)

# Rebuild the time dimension on each run (replace=True).
load_time_dimension_table = LoadDimensionOperator(
    redshift_conn_id='redshift',
    table='time',
    replace=True,
    task_id='Load_time_dim_table',
    dag=dag)

# Validate the staging and warehouse tables.
run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id='redshift',
                                         tables=[
                                             'staging_events', 'staging_songs',
                                             'users', 'songs', 'artists',
                                             'time'
                                         ])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# One chained expression: stage -> fact -> dimensions -> checks -> end.
start_operator \
    >> [stage_events_to_redshift, stage_songs_to_redshift] \
    >> load_songplays_table \
    >> [
        load_song_dimension_table,
        load_user_dimension_table,
        load_artist_dimension_table,
        load_time_dimension_table
    ] >> run_quality_checks >> end_operator
Example #27
0
        create_table_sql=SqlQueries.create_artists_table,
        insert_table_sql=SqlQueries.insert_artist_table,
        mode="overwrite",
        target_table="artists")

    # Overwrite the time dimension on each run.
    load_time_dimension_table = LoadDimensionOperator(
        task_id='Load_time_dim_table',
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        create_table_sql=SqlQueries.create_times_table,
        insert_table_sql=SqlQueries.insert_time_table,
        mode="overwrite",
        target_table="time")

    # Validate the fact table after all loads complete.
    run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                             redshift_conn_id="redshift",
                                             table_name="songplays")

    end_operator = DummyOperator(task_id='Stop_execution')

# Make graph
# Stage in parallel, load the fact table, fan out to the four dimensions,
# and gate the quality checks on all of them.
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_artist_dimension_table, load_song_dimension_table,
    load_time_dimension_table, load_user_dimension_table
]
[
    load_artist_dimension_table, load_song_dimension_table,
    load_time_dimension_table, load_user_dimension_table
] >> run_quality_checks
Example #28
0
    redshift_conn_id='redshift',
    table='artists',
    selection=SqlQueries.artist_table_insert
)

# Load the time dimension.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='time',
    selection=SqlQueries.time_table_insert
)

# NOTE(review): no tables/checks are passed here — confirm the operator
# provides sensible defaults for what it validates.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift'
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)


# ordering
# Stage in parallel, then load the fact table, then the dimensions.
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
    dag=dag,
    table='songs',
    redshift_conn_id="redshift",
    load_sql_stmt=SqlQueries.song_table_insert
)

# Load the artist dimension.
load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
    table='artists',
    redshift_conn_id="redshift",
    load_sql_stmt=SqlQueries.artist_table_insert
)

# Load the time dimension.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table='time',
    redshift_conn_id="redshift",
    load_sql_stmt=SqlQueries.time_table_insert
)

# Validate every loaded table.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    tables=['songplays', 'users', 'songs', 'artists', 'time'],
    redshift_conn_id="redshift"
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)
    table=fact_table_name_and_query[0],
    conn_id=REDSHIFT_CONN_ID,
    sql=fact_table_name_and_query[1],
)

# One load task per dimension table, generated from the name -> query mapping.
dim_operators = [
    LoadDimensionOperator(
        task_id=f'Load_{dim_table_name}_dim_table',
        dag=dag,
        table=dim_table_name,
        conn_id=REDSHIFT_CONN_ID,
        sql=dim_query,
    ) for dim_table_name, dim_query in dim_tables_name_to_query.items()
]

# Validate every dimension table plus the fact table.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    conn_id=REDSHIFT_CONN_ID,
    tables=list(dim_tables_name_to_query) + [fact_table_name_and_query[0]],
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Create tables, stage in parallel, load the fact table, then all dimensions;
# the quality checks wait on the fact load and every dimension load.
start_operator >> create_tables
create_tables >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> dim_operators
dim_operators + [load_songplays_table] >> run_quality_checks
run_quality_checks >> end_operator