Ejemplo n.º 1
0
def sub_dag(parent_dag_name, child_dag_name, start_date, schedule_interval,
            *args, **kwargs):
    """Build a sub-DAG that runs NULL-column data-quality checks per table.

    Args:
        parent_dag_name: Name of the parent DAG; Airflow requires sub-DAG ids
            to follow the ``parent.child`` naming convention.
        child_dag_name: Name of this sub-DAG.
        start_date: First execution date for the sub-DAG.
        schedule_interval: Schedule forwarded verbatim to the DAG constructor.
        *args, **kwargs: Accepted for caller compatibility; unused.

    Returns:
        The configured DAG containing one DataQualityOperator per table.
    """
    dag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        start_date=start_date,
        schedule_interval=schedule_interval,
    )

    # (task_id, table, key column, columns that must be NOT NULL).
    # '"time"' keeps its quotes: TIME is a reserved word in Redshift.
    check_specs = [
        ('Run_data_check_songplays', 'songplays', 'playid',
         ['playid', 'start_time', 'userid']),
        ('Run_data_check_user', 'users', 'userid', ['userid']),
        ('Run_data_check_song', 'songs', 'songid', ['songid']),
        ('Run_data_check_artist', 'artists', 'artistid', ['artistid']),
        ('Run_data_check_time', '"time"', 'start_time', ['start_time']),
    ]

    # One quality-check task per spec; passing dag=dag registers each task,
    # so no local variable per operator is needed (the original set none as
    # dependencies either).
    for task_id, table, column, not_null_cols in check_specs:
        DataQualityOperator(
            task_id=task_id,
            dag=dag,
            table=table,
            column=column,
            is_null=not_null_cols,
            redshift_conn_id='redshift_conn_id')

    return dag
Ejemplo n.º 2
0
def stage_s3_to_redshift_dag(
    parent_dag_name: str,
    task_id: str,
    redshift_conn_id: str = "",
    aws_credentials_id: str = "",
    target_table: str = "",
    s3_bucket: str = None,
    s3_key: str = None,
    json_path: Optional[str] = None,
    ignore_headers: Optional[int] = None,
    delimiter: Optional[str] = None,
    default_args: Optional[Dict[str, Any]] = None,
    *args,
    **kwargs,
):
    """Build a sub-DAG that stages S3 data into Redshift and validates the load.

    Args:
        parent_dag_name: Parent DAG name; the sub-DAG id is
            ``parent_dag_name.task_id`` as Airflow requires.
        task_id: Sub-DAG task id (second half of the DAG id).
        redshift_conn_id: Airflow connection id for Redshift.
        aws_credentials_id: Airflow connection id for AWS credentials.
        target_table: Destination table for the COPY.
        s3_bucket / s3_key: Source location of the staged data.
        json_path / ignore_headers / delimiter: Optional COPY format options.
        default_args: Default args forwarded to the DAG constructor;
            falls back to an empty dict.

    Returns:
        The configured DAG with a staging task wired into a quality check.
    """
    # BUG FIX: the previous default was the ``dict`` *type* itself
    # (``default_args=dict``), so omitting the argument handed the class
    # object to DAG(). Use None as the sentinel instead.
    if default_args is None:
        default_args = {}

    dag = DAG(dag_id=f"{parent_dag_name}.{task_id}",
              default_args=default_args,
              **kwargs)

    stage_events_to_redshift = StageToRedshiftOperator(
        task_id=f"{parent_dag_name}.Stage_events",
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        target_table=target_table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        json_path=json_path,
        ignore_headers=ignore_headers,
        delimiter=delimiter,
        dag=dag,
        *args,
        **kwargs)

    # Non-equality assertion: the staged table must contain at least one row.
    validation_songplays = DataQualityValidator(
        sql_statement=f"SELECT COUNT(*) FROM {target_table}",
        result_to_assert=0,
        should_assert_for_equality=False,
    )

    check_data_task = DataQualityOperator(
        task_id=f"{parent_dag_name}.Data_Quality_Check",
        redshift_conn_id=redshift_conn_id,
        data_quality_validations=[validation_songplays],
        dag=dag,
    )

    stage_events_to_redshift >> check_data_task

    return dag
Ejemplo n.º 3
0
)

# One staging task per dimension table: copy <name>.csv/*.csv from the input
# bucket into the Redshift table of the same name.
dim_operators = [
    StageToRedshiftOperator(
        task_id=f"Copy_{dim_table_name}_dim_table",
        dag=dag,
        table=dim_table_name,
        conn_id=REDSHIFT_CONN_ID,
        aws_credentials_id=AWS_CREDENTIALS_ID,
        s3_bucket=INPUT_BUCKET,
        s3_key=f"{dim_table_name}.csv/*.csv",
        file_format="CSV",
        provide_context=True,
    ) for dim_table_name in dim_table_names
]

# Validate every dimension table plus the fact table after loading.
run_quality_checks = DataQualityOperator(
    task_id="Run_data_quality_checks",
    dag=dag,
    conn_id=REDSHIFT_CONN_ID,
    tables=dim_table_names + [fact_table_name],
)

end_operator = DummyOperator(task_id="Stop_execution", dag=dag)

# Wiring: the sale pipeline feeds all dimension loads but the last; the
# postcode pipeline feeds the last one; everything converges on the checks.
# (``+`` binds tighter than ``>>``, so the list concatenations happen first.)
start_operator >> [process_sale, process_postcode]
process_sale >> create_sale_tables >> dim_operators[:-1] + [stage_sale_table]
process_postcode >> create_postcode_tables >> dim_operators[-1]
dim_operators + [stage_sale_table] >> run_quality_checks
run_quality_checks >> end_operator
# Stage the pre-computed date dimension (Parquet output of the Spark stage)
# from S3 into Redshift.
load_dim_date = StageToRedshiftOperator(
    task_id='load_dim_date',
    dag=dag,
    table="dim_date",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="i94project",
    s3_key="stage/output/date.parquet",
    # copy_options=("CSV", "REGION 'us-west-2'", "IGNOREHEADER 1")
    copy_options=('FORMAT AS PARQUET',)
)

# Data-quality checks are defined centrally in SqlChecks and loaded here.
checks = SqlChecks()
run_quality_checks = DataQualityOperator(
    task_id='run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    dq_checks=checks.load_dq_checks()
)

end_operator = DummyOperator(task_id='end_execution', dag=dag)

# task dependencies
start_operator >> [load_dim_arrival_type,
                   load_dim_visa_mode,
                   load_dim_visa_type,
                   process_fact_immigration,
                   process_dim_us_state,
                   process_dim_us_airport,
                   process_dim_country]
process_fact_immigration >> [load_dim_date,
Ejemplo n.º 5
0
    table="songplays",
    data_source=SqlQueries.songplay_table_insert,
    dag=dag)

# (table, insert-select source query) pairs for every dimension load.
dim_tables_and_sources = [
    ("users", SqlQueries.user_table_insert),
    ("songs", SqlQueries.song_table_insert),
    ("artists", SqlQueries.artist_table_insert),
    ("time", SqlQueries.time_table_insert),
]

# A single operator loads all four dimension tables.
load_dimension_tables = LoadDimensionOperator(task_id='load_dim_tables',
                                              redshift_conn_id="redshift",
                                              tables=dim_tables_and_sources,
                                              dag=dag)

# NOTE(review): the checker receives the same (table, query) tuples as the
# loader — presumably it only uses the table names; confirm in the operator.
run_quality_checks = DataQualityOperator(task_id='run_data_quality_checks',
                                         redshift_conn_id="redshift",
                                         tables=dim_tables_and_sources,
                                         dag=dag)

end_operator = DummyOperator(task_id='stop_execution', dag=dag)

# Staging fans in to the fact load, then dimensions, then quality checks.
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_dimension_tables
load_dimension_tables >> run_quality_checks
run_quality_checks >> end_operator
    select_sql=SqlQueries.insert_artists_table,
    mode='truncate')

# Truncate-and-reload the time dimension from the insert query.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table='time',
    select_sql=SqlQueries.insert_time_table,
    mode='truncate')

# Quality gates: songplays must be non-empty ('gt' 0) and must contain no
# rows with a NULL songid ('eq' 0).
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    check_stmts=[{
        'sql': 'SELECT COUNT(*) FROM songplays;',
        'op': 'gt',
        'val': 0
    }, {
        'sql': 'SELECT COUNT(*) FROM songplays WHERE songid IS NULL;',
        'op': 'eq',
        'val': 0
    }])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# DAG dependencies
# All table-creation tasks can run in parallel right after the start marker.
start_operator >> create_table_staging_songs
start_operator >> create_table_staging_events
start_operator >> create_table_songplays
start_operator >> create_table_artists
start_operator >> create_table_songs
start_operator >> create_table_users
Ejemplo n.º 7
0
    redshift_conn_id='redshift',
    table='artists',
    sql=SqlQueries.artist_table_insert,
    append=False)

# Overwrite (append=False) the time dimension from the insert query.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='time',
    sql=SqlQueries.time_table_insert,
    append=False)

# Post-load validation of every fact and dimension table.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    tables=['songplays', 'users', 'songs', 'artists', 'time'])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

#Task dependencies
start_operator >> create_tables_task

create_tables_task >> stage_events_to_redshift >> load_songplays_table
create_tables_task >> stage_songs_to_redshift >> load_songplays_table

# Each dimension load runs in parallel between the fact load and the checks.
load_songplays_table >> load_artist_dimension_table >> run_quality_checks
load_songplays_table >> load_user_dimension_table >> run_quality_checks
load_songplays_table >> load_song_dimension_table >> run_quality_checks
load_songplays_table >> load_time_dimension_table >> run_quality_checks
    redshift_conn_id='redshift',
    query_list=dim_tables_insert_queries,
    query_type='insert',
    dag=dag,
)

# Run all fact-table INSERT queries as a batch.
insert_facts_tables = RedshifQueriesOperator(
    task_id='insert_facts_tables',
    redshift_conn_id='redshift',
    query_list=facts_tables_insert_queries,
    query_type='insert',
    dag=dag,
)

run_quality_checks = DataQualityOperator(task_id='run_quality_checks',
                                         redshift_conn_id='redshift',
                                         tables_list=tables_list,
                                         dag=dag)

# Each cleaning task feeds its own S3 upload; uploads then converge.
clean_demography_csv >> copy_demography_to_S3

clean_airports_csv >> copy_airports_to_S3

clean_immigration_parquet >> copy_immigration_to_S3

# Full pipeline: S3 copies -> drop/create tables -> staging -> dims -> facts
# -> quality checks.
[copy_demography_to_S3, copy_airports_to_S3, copy_immigration_to_S3
 ] >> drop_redshift_tables >> create_redshift_tables >> [
     stage_demography_to_redshift, stage_airports_to_redshift,
     stage_immigration_to_redshift
 ] >> insert_dim_tables >> insert_facts_tables >> run_quality_checks
Ejemplo n.º 9
0
                                         , dag=dag
                                         , task_id='Load_Special_Dimension_Table'
                                         )
# Load the city dimension from its insert query.
city_dim_table = LoadDimensionOperator(table='city_dim'
                                      , sql_query=SqlQueries.city_dim_insert
                                      , redshift_conn_id=redshift_conn_id
                                      , dag=dag
                                      , task_id='Load_City_Dimension_Table'
                                      )

# Check Data Quality
run_quality_checks = DataQualityOperator(tables=['candidate_fact'
                                                , 'city_fact'
                                                , 'candidate_dim'
                                                , 'student_dim'
                                                , 'special_dim'
                                                , 'city_dim']
                                        , redshift_conn_id=redshift_conn_id
                                        , dag=dag
                                        , task_id='Checking_Data_Quality'
                                        )
# Ending Operator
# NOTE(review): no dag=dag here — this operator relies on an active DAG
# context (or will not attach to the DAG); confirm a `with DAG(...)` block.
end_operator = DummyOperator(task_id='Stop_Execution')

#################################

# DAG: Start loading data into S3
start_operator >> brazil_to_s3
start_operator >> enem_to_s3

# DAG: Load data from S3 into Redshift stage tables
Ejemplo n.º 10
0
# Truncate-and-reload the COVID-19 fact table.
load_covid19_fact_table = LoadTableOperator(
    task_id='load_covid19_fact_table',
    dag=dag,
    table='fact.us_covid_19',
    select_sql=SqlLoad.insert_covid_table,
    mode='truncate')

# Truncate-and-reload the accidents fact table.
load_accident_fact_table = LoadTableOperator(
    task_id='load_accident_fact_table',
    dag=dag,
    table='fact.us_accidents',
    select_sql=SqlLoad.insert_accident_table,
    mode='truncate')

run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         check_stmts=data_quality_args)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# DAG dependencies

# Create the three schemas in parallel, then join on a marker task.
start_operator >> [
    create_staging_schema, create_dim_schema, create_fact_schema
]

[create_staging_schema, create_dim_schema, create_fact_schema
 ] >> schema_created

schema_created >> [
    create_table_staging_covid_19, create_table_staging_us_demographics,
Ejemplo n.º 11
0
    task_id='Load_i94visitors_fact',
    dag=dag,
    redshift_conn_id="redshift",
    table_query=SqlQueries.visitors_fact_insert)

# Load the dates dimension; truncate_flag='Y' empties the table first.
load_dates_dim = LoadDimensionOperator(task_id='Load_dates_dim',
                                       dag=dag,
                                       redshift_conn_id="redshift",
                                       table_query=SqlQueries.dates_dim_insert,
                                       table="dates_dim",
                                       truncate_flag='Y')

# Row-count check: fact rows with a NULL reasonforvisit, compared against
# expected_count (threshold semantics live in the operator).
run_quality_check = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    check_query=
    "select count(1) from public.i94visitors_fact where reasonforvisit is null",
    expected_count=10000)

# Referential-integrity check: fact airport ids must exist in airports_dim.
run_intg_check = IntegrityCheckOperator(
    task_id='Run_data_integrity_check',
    dag=dag,
    redshift_conn_id="redshift",
    check_query=
    "select count(airportid) from public.i94visitors_fact where airportid not in (select airportid from airports_dim)",
    table_name="i94visitors_fact")

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_i94vistors
Ejemplo n.º 12
0
    table='artists',
    redshift_conn_id='redshift',
    sql_create=SqlQueries.artist_table_create,
    sql_select=SqlQueries.artist_table_insert)

# Create (if needed) and populate the time dimension.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table='time',
    redshift_conn_id='redshift',
    sql_create=SqlQueries.time_table_create,
    sql_select=SqlQueries.time_table_insert)

# Run the shared check statement against each listed table.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    tables=['songplays', 'users', 'artists', 'songs'],
    sql_check=SqlQueries.check)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

#---#---# Define Task Dependencies #---#---#

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
# All four dimension loads branch off the fact load in parallel.
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_time_dimension_table
load_songplays_table >> load_user_dimension_table
    dag=dag)

# Truncate-and-load the foreign dimension straight from S3.
load_dim_foreign_table = LoadToRedshiftOperator(
    task_id='Load_Dim_Foreign_Table',
    aws_credentials_id='aws_credentials',
    redshift_conn_id='redshift',
    s3_bucket='my-bucket',
    s3_key='dim_foreign_table',
    table='dim_foreign_table',
    mode='truncate',
    dag=dag)

# Validate the fact table plus all three dimensions.
run_quality_checks = DataQualityOperator(task_id='Run_Data_Quality_Checks',
                                         redshift_conn_id='redshift',
                                         tables=[
                                             'fact_table', 'dim_state_table',
                                             'dim_visa_table',
                                             'dim_foreign_table'
                                         ],
                                         dag=dag)

end_operator = DummyOperator(task_id='Stop_Execution', dag=dag)

# Fact first, then dimensions in parallel, then the quality gate.
start_operator >> load_fact_table

load_fact_table >> [
    load_dim_state_table, load_dim_visa_table, load_dim_foreign_table
]

[load_dim_state_table, load_dim_visa_table, load_dim_foreign_table
 ] >> run_quality_checks
    target_table="artists",
    sql=SqlQueries.artist_table_insert,
    append = True
)

# Load the time dimension (no append flag: operator default applies).
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    target_table="time",
    sql=SqlQueries.time_table_insert
)

# Validate every fact and dimension table after the loads complete.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    tables=[ 'songplays' ,'songs','artists', 'users','time']
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

# Dependencies expressed with set_downstream/set_upstream rather than >>.
start_operator.set_downstream([stage_events_to_redshift, stage_songs_to_redshift])

load_songplays_table.set_upstream(
    [stage_events_to_redshift, stage_songs_to_redshift])

load_songplays_table.set_downstream([load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table,load_time_dimension_table])

run_quality_checks.set_upstream(
    [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table,load_time_dimension_table])
    
    sql_load_data=SqlQueries.artist_table_insert
)

# Populate the time dimension from the shared insert query.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table="time",
    redshift_conn_id="redshift",
    sql_load_data=SqlQueries.time_table_insert
)


# Single-query check: no songplays row may have a NULL userid.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    dq_query = 'SELECT COUNT(*) FROM public.songplays WHERE userid IS NULL',
    expected_result = 0
)


end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)


# Table creation precedes both staging branches.
start_operator >> create_tables

create_tables >> stage_events_to_redshift
create_tables >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
Ejemplo n.º 16
0
# Load the time dimension; task ids come from module-level constants.
load_time_dimension_table = LoadDimensionOperator(
    task_id=load_time_dimension_table_task_id,
    redshift_conn_id="redshift",
    table="time",
    aws_credentials_id="aws_credentials",
    start_date= datetime(2018, 5, 1),
    sql_source=SqlQueries.time_table_insert,
    dag=dag
)

# Validate staging, fact, and dimension tables in one task.
# NOTE(review): both ``table`` and ``tables`` are passed — one is likely
# ignored by the operator; confirm which parameter it actually reads.
run_quality_checks = DataQualityOperator(
    task_id=run_data_quality_checks_task_id,
    redshift_conn_id="redshift",
    table="time",
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    tables=["staging_events", "users", 'staging_songs', "songs", "artists", "time"]
)

end_operator = DummyOperator(task_id=stop_execution_task_id,  dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
Ejemplo n.º 17
0
    redshift_conn_id="redshift",
    sql_query=SqlQueries.time_table_insert,
    mode="truncate-insert")

# Task to perform quality checks on the data uploaded in Redshift.
# Each entry pairs a COUNT(*) probe with its expected result; the hard-coded
# 104 pins the users row count for this specific dataset.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    qa_check_list=[{
        'check_sql': "SELECT COUNT(*) FROM users WHERE userid is null",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM users",
        'expected_result': 104
    }, {
        'check_sql': "SELECT COUNT(*) FROM artists WHERE artistid is null",
        'expected_result': 0
    }, {
        'check_sql':
        "SELECT COUNT(*) FROM songplays WHERE playid is null AND userid IS NULL",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM songs WHERE songid is null",
        'expected_result': 0
    }],
    redshift_conn_id="redshift")

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# order of execution for the dag
start_operator >> create_table
Ejemplo n.º 18
0
        "bc",
        "temperature",
        "presure",
        "humidity",
        "clouds",
        "wind_speed",
        "wind_deg",
        "measure_date",
        "weather_id",
    ],
)

# Postgres (not Redshift) quality checks for the weather pipeline.
run_quality_checks = DataQualityOperator(
    task_id="Run_data_quality_checks",
    dag=dag,
    conn_id="postgres",
    table=
    None,  # If table is none, means that quality checks are done to all tables
)

end_operator = DummyOperator(task_id="Stop_execution", dag=dag)

# Level 1
# All table-creation tasks fan out from the start marker.
start_operator >> create_location_table
start_operator >> create_time_table
start_operator >> create_weather_table
start_operator >> create_staging_table

# Level 2
create_location_table >> create_measures_table
create_location_table >> populate_location_table
Ejemplo n.º 19
0
        parent_dag_name=dag_name,
        task_id=load_time_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="users",
        start_date= datetime(2018, 5, 1),
        sql_query=SqlQueries.time_table_insert,
    ),
    task_id=load_time_dimension_table_task_id,
    dag=dag
)

# Validate fact and dimension tables after all dimension loads finish.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    tables=["songplay", "users", "song", "artist", "time"]
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

# Setting tasks dependencies

start_operator >> create_redshift_tables >> [stage_songs_to_redshift, stage_events_to_redshift]

[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table

# Parallel dimension loads converge on the quality checks.
load_songplays_table >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table,
                           load_time_dimension_table] >> run_quality_checks
Ejemplo n.º 20
0
)

# Stage the joined business_reviews JSON output from S3 into Redshift;
# the bucket name is resolved from an Airflow Variable at parse time.
copy_business_reviews_toRedshift = StageToRedshiftOperator(
    task_id='copy_business_reviews_toRedshift',
    dag=dag,
    table="business_reviews",
    s3_bucket=Variable.get("yelp_bucket"),
    s3_key='output/business_reviews.json',
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    is_json=True
)

data_quality_task = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    table="business_reviews",
)

end_operator = DummyOperator(task_id='end_of_execution', dag=dag)

copy_start_operator = DummyOperator(task_id='copy_start_operator', dag=dag)

# Every dimension build feeds the fact-table build.
start_operator >> state_dim_task >> business_reviews_fact_task
start_operator >> category_dim_task >> business_reviews_fact_task
start_operator >> ambience_dim_task >> business_reviews_fact_task
start_operator >> user_dim_task >> business_reviews_fact_task

# business_reviews_fact_task >> delete_from_s3_task
business_reviews_fact_task >> create_tables_task
Ejemplo n.º 21
0
# DDL tasks driven by SQL files; should_run gates execution globally.
drop_tables = SQLFileOperator(dag=dag,
                              task_id='drop_tables',
                              query_file='drop_tables.sql',
                              message='Dropping tables',
                              should_run=should_run)
create_tables = SQLFileOperator(dag=dag,
                                task_id='create_tables',
                                query_file='create_tables.sql',
                                message='Creating tables',
                                should_run=should_run)

# One CSV-import task per source table.
copy_immigration_table = CSVToTableOperator(dag=dag,
                                            task_id='copy_immigration_table',
                                            should_run=should_run)
copy_airport_codes_table = CSVToTableOperator(
    dag=dag, task_id='copy_airport_codes_table', should_run=should_run)
copy_global_temperatures_table = CSVToTableOperator(
    dag=dag, task_id='copy_global_temperatures_table', should_run=should_run)
copy_us_cities_table = CSVToTableOperator(dag=dag,
                                          task_id='copy_us_cities_table',
                                          should_run=should_run)
quality_checks_task = DataQualityOperator(dag=dag,
                                          task_id='data_quality_checks',
                                          quality_checks=quality_checks,
                                          should_run=should_run)

# Linear pipeline with the four copy tasks fanned out in the middle.
(start_operator >> drop_tables >> create_tables >> [
    copy_immigration_table, copy_airport_codes_table,
    copy_global_temperatures_table, copy_us_cities_table
] >> quality_checks_task >> end_operator)
Ejemplo n.º 22
0
          catchup=False
          )


# Phase markers for the staging and load stages of the pipeline.
start_data_to_redshift_operation = DummyOperator(
    task_id='Begin_Migrating_Data_To_Staging_Tables',  dag=dag)

end_data_to_redshift_operation = DummyOperator(
    task_id='Begin_Loading_Data_To_Fact_Dimension_Tables',  dag=dag)

end_of_tasks = DummyOperator(
    task_id='End_Of_Execution',  dag=dag)

# Validate every dimension table plus the fact table.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    conn_id='redshift',
    target_tables=["dim_airport_table", "dim_demographic_table", "dim_visitor_table", "fact_city_data_table"],
)

for data_type, table_name in staging_tables.items():
    stage_data_to_redshift = S3ToRedshiftOperator(
        task_id='Stage_' + data_type,
        dag=dag,
        table=table_name,
        drop_table=True,
        s3_bucket='udend-data',
        s3_folder=data_type,
        aws_connection_id='aws_credentials',
        redshift_connection_id='redshift',
        create_query=globals()[table_name],
        copy_options="json 'auto'"
Ejemplo n.º 23
0
# Load the artists dimension from its insert-select query.
load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    redshift_conn_id="redshift",
    destination_table="artists",
    sql_query=SqlQueries.artist_table_insert,
    dag=dag)

# Load the time dimension from its insert-select query.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id="redshift",
    destination_table="time",
    sql_query=SqlQueries.time_table_insert,
    dag=dag)

# NOTE(review): only the "time" table is checked here even though four
# dimensions are loaded; confirm that is intentional.
run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         redshift_conn_id="redshift",
                                         target_table="time",
                                         dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
load_songplays_table >> load_time_dimension_table
load_time_dimension_table >> run_quality_checks
load_artist_dimension_table >> run_quality_checks
load_song_dimension_table >> run_quality_checks
Ejemplo n.º 24
0
    table='lounges',
    redshift_conn_id='redshift',
    sql=SqlQueries.lounges_table_insert)

# Load the ratings fact table.
load_fact_ratings_table = LoadFactOperator(
    task_id='Load_fact_ratings_fact_table',
    dag=dag,
    redshift_conn_id='redshift',
    sql=SqlQueries.fact_ratings_table_insert)

# Each test case is (count query, comparison operator, expected bound).
# NOTE(review): aircrafts_count_test appears three times while lounges/
# airports tests are missing or duplicated — likely copy-paste; confirm.
ensure_data_load_in_dims = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    test_cases=[
        (SqlQueries.airlines_count_test, operator.gt, 0),
        (SqlQueries.aircrafts_count_test, operator.gt, 0),
        (SqlQueries.lounges_count_test, operator.gt, 0),
        (SqlQueries.aircrafts_count_test, operator.gt, 0),
        (SqlQueries.aircrafts_count_test, operator.gt, 0),
    ])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_airlines_to_redshift
start_operator >> stage_airports_to_redshift
start_operator >> stage_lounges_to_redshift
start_operator >> stage_seats_to_redshift

# Two dimension chains converge on the passengers dimension.
stage_airlines_to_redshift >> load_airlines_dimension_table >> load_aircrafts_dimension_table >> load_passengers_dimension_table
stage_airports_to_redshift >> load_airports_dimension_table >> load_lounges_dimension_table >> load_passengers_dimension_table
Ejemplo n.º 25
0
    insert_query=SqlQueries.rides_table_insert,
    task_id='Load_rides_facts_table',
    dag=dag)

# Create-if-needed and populate the stations dimension.
load_stations_dimension_table = LoadDimensionOperator(
    redshift_conn_id="redshift",
    table="stations",
    create_query=SqlQueries.create_stations,
    insert_query=SqlQueries.stations_table_insert,
    task_id='Load_stations_dim_table',
    dag=dag)

# Create-if-needed and populate the weather dimension.
load_weather_dimension_table = LoadDimensionOperator(
    redshift_conn_id="redshift",
    table="weather",
    create_query=SqlQueries.create_weather,
    insert_query=SqlQueries.weather_table_insert,
    task_id='Load_weather_dim_table',
    dag=dag)

# NOTE(review): no tables/checks are passed here — the operator presumably
# carries built-in defaults; confirm against its implementation.
run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         redshift_conn_id="redshift",
                                         dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Staging -> wait barrier -> parallel loads -> checks -> end.
start_operator >> [stage_weather_raw_to_redshift, stage_bikes_raw_to_redshift
                   ] >> wait_operator >> [
                       load_rides_facts_table, load_stations_dimension_table,
                       load_weather_dimension_table
                   ] >> run_quality_checks >> end_operator
    final_table=dim_artists_table_name,
    dql_sql=SqlQueries.artist_table_insert,
    dag=dag)

# Load the time dimension; table name and connection come from module
# constants.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID,
    final_table=dim_time_table_name,
    dql_sql=SqlQueries.time_table_insert,
    dag=dag)

# One validator object per staging/fact/dimension table, all run by a
# single quality-check task.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID,
    data_quality_validations=[
        validator_stage_events, validator_stage_songs, validator_songplays,
        validator_songs, validator_artists, validator_events, validator_time,
        validator_users
    ],
    dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# DAG dependency setup
start_operator >> db_setup_task

db_setup_task >> [
    stage_events_s3_to_redshift_and_validate_task,
    stage_songs_s3_to_redshift_and_validate_task,
] >> load_songplays_table_task
Ejemplo n.º 27
0
    redshift_conn_id="redshift",
    table="artists",
    sql=SqlQueries.artist_table_insert,
    update_strategy="overwrite")

# Overwrite the time dimension on every run.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time",
    sql=SqlQueries.time_table_insert,
    update_strategy="overwrite")

# Validate all fact and dimension tables.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    tables=["songplays", "artists", "songs", "users", "time"])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Tasks dependencies
start_operator >> create_tables >> [
    stage_events_to_redshift, stage_songs_to_redshift
]

[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table

load_songplays_table >> [
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table
    dag=dag,
    table="d_port",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=bucket,
    s3_key='immigration_processed_files/port/I94_Port.csv',
    file_format='csv',
    create_stmt=SqlQueries.create_table_d_port)

# Check loaded data not null

# Staging-table checks: each verifies the natural key column has no NULLs.
run_quality_checks_airports = DataQualityOperator(
    task_id="run_quality_checks_airports",
    dag=dag,
    redshift_conn_id="redshift",
    dq_checks=[{
        'check_sql':
        "SELECT COUNT(*) FROM staging_airport WHERE ident is null",
        'expected_result': 0
    }])

run_quality_checks_us_cities_demo = DataQualityOperator(
    task_id="run_quality_checks_us_cities_demo",
    dag=dag,
    redshift_conn_id="redshift",
    dq_checks=[{
        'check_sql':
        "SELECT COUNT(*) FROM staging_us_cities_demographics WHERE city is null",
        'expected_result': 0
    }])
Ejemplo n.º 29
0
    s3_path='s3://capstone-project-janga/equipments.json',
    table='equipments'
)

# Stage the notifications JSON dataset from S3 into Redshift.
stage_notifications_to_redshift = StageToRedshiftOperator(
    task_id='Stage_notifications',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_path='s3://capstone-project-janga/notifications.json/',
    table='notifications'
)

# Validate both staged tables in one task.
test_equipments_and_notifications_table = DataQualityOperator(
    task_id='test_equipments_and_notifications_table',
    dag=dag,
    redshift_conn_id='redshift',
    tables=['equipments', 'notifications', ]
)

# Python callables build the derived tables from the staged data.
insert_locations = PythonOperator(
    task_id='Insert_locations',
    dag=dag,
    python_callable=create_location_table
)

insert_equipment_notifications = PythonOperator(
    task_id='Insert_equipment_notifications',
    dag=dag,
    python_callable=create_equipment_notification_table
)
Ejemplo n.º 30
0
    insert_mode='truncate'
)

# Truncate-and-insert the time dimension on every run.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift', 
    table='time',
    sql_query=SqlQueries.time_table_insert,
    insert_mode='truncate'
)

# Single check: no songplays row may have a NULL userid.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',     
    sql_query='select count(*) from songplays where userid is null;',
    expected_result=0
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

#
# Task ordering for the DAG tasks 
# 
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table