コード例 #1
0
    sql_statement=f"SELECT COUNT(*) FROM {table_name_staging_game_match}",
    result_to_assert=0,
    should_assert_for_equality=False,
)

# Validator for the fact table: presumably asserts the row count differs
# from 0 (should_assert_for_equality=False) — i.e. the table is non-empty.
validator_fact_game_match = DataQualityValidator(
    should_assert_for_equality=False,
    result_to_assert=0,
    sql_statement=f"SELECT COUNT(*) FROM {table_name_fact_game_match}",
)

# Run both row-count validators against Redshift in a single task.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    data_quality_validations=[
        validator_staging_game_match,
        validator_fact_game_match,
    ],
    redshift_conn_id=AWS_REDSHIFT_CONN_ID,
    dag=dag,
)

# OPERATORS
start_operator = DummyOperator(task_id="Begin_Execution", dag=dag)
# fetch_external_summoner_data_to_s3_task = DummyOperator(
#     task_id="Fetch_External_Summoner_To_S3_Data_Task",
#     dag=dag,
# )
fetch_external_champion_to_s3_data_task = FetchAndStageChampionsExternalData(
コード例 #2
0
                                         , dag=dag
                                         , task_id='Load_Special_Dimension_Table'
                                         )
# Load the city dimension table from the staged data.
city_dim_table = LoadDimensionOperator(
    table='city_dim',
    sql_query=SqlQueries.city_dim_insert,
    redshift_conn_id=redshift_conn_id,
    dag=dag,
    task_id='Load_City_Dimension_Table',
)

# Check Data Quality
run_quality_checks = DataQualityOperator(
    tables=[
        'candidate_fact',
        'city_fact',
        'candidate_dim',
        'student_dim',
        'special_dim',
        'city_dim',
    ],
    redshift_conn_id=redshift_conn_id,
    dag=dag,
    task_id='Checking_Data_Quality',
)

# Ending Operator
# FIX: bind the terminal task to the DAG explicitly; every sibling task
# passes dag=dag, and without it this task was not attached to the DAG.
end_operator = DummyOperator(task_id='Stop_Execution', dag=dag)

#################################

# DAG: Start loading data into S3 (both uploads fan out from the start marker)
start_operator >> [brazil_to_s3, enem_to_s3]

# DAG: Load data from S3 into Redshift stage tables
    select_sql=SqlQueries.insert_artists_table,
    mode='truncate')

# Populate the time dimension, truncating before each load.
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    table='time',
    select_sql=SqlQueries.insert_time_table,
    mode='truncate',
    dag=dag,
)

# Quality gates: songplays must be non-empty and have no NULL song ids.
_check_statements = [
    {'sql': 'SELECT COUNT(*) FROM songplays;', 'op': 'gt', 'val': 0},
    {'sql': 'SELECT COUNT(*) FROM songplays WHERE songid IS NULL;',
     'op': 'eq', 'val': 0},
]
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    check_stmts=_check_statements,
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# DAG dependencies: every CREATE TABLE task fans out from the start marker.
start_operator >> [
    create_table_staging_songs,
    create_table_staging_events,
    create_table_songplays,
    create_table_artists,
    create_table_songs,
    create_table_users,
]
コード例 #4
0
    redshift_conn_id='redshift',
    query_list=dim_tables_insert_queries,
    query_type='insert',
    dag=dag,
)

# Insert into the fact tables once the dimensions are loaded.
# NOTE(review): "RedshifQueriesOperator" looks like a typo for
# "RedshiftQueriesOperator" — the class is defined elsewhere, keep as-is.
insert_facts_tables = RedshifQueriesOperator(
    task_id='insert_facts_tables',
    query_list=facts_tables_insert_queries,
    query_type='insert',
    redshift_conn_id='redshift',
    dag=dag,
)

run_quality_checks = DataQualityOperator(
    task_id='run_quality_checks',
    tables_list=tables_list,
    redshift_conn_id='redshift',
    dag=dag,
)

# Each local clean step feeds its own S3 upload.
clean_demography_csv >> copy_demography_to_S3
clean_airports_csv >> copy_airports_to_S3
clean_immigration_parquet >> copy_immigration_to_S3

# Once everything is in S3: recreate tables, stage, load, then validate.
_s3_copies = [copy_demography_to_S3, copy_airports_to_S3, copy_immigration_to_S3]
_staging_tasks = [
    stage_demography_to_redshift,
    stage_airports_to_redshift,
    stage_immigration_to_redshift,
]
(_s3_copies >> drop_redshift_tables >> create_redshift_tables >> _staging_tasks
 >> insert_dim_tables >> insert_facts_tables >> run_quality_checks)
コード例 #5
0
File: etl_dag.py — Project: gcbeltramini/etl-project
# DDL tasks (both honour the should_run flag).
drop_tables = SQLFileOperator(
    dag=dag,
    task_id='drop_tables',
    query_file='drop_tables.sql',
    message='Dropping tables',
    should_run=should_run,
)
create_tables = SQLFileOperator(
    dag=dag,
    task_id='create_tables',
    query_file='create_tables.sql',
    message='Creating tables',
    should_run=should_run,
)

# One CSV-to-table copy task per source dataset.
copy_immigration_table = CSVToTableOperator(
    dag=dag, task_id='copy_immigration_table', should_run=should_run)
copy_airport_codes_table = CSVToTableOperator(
    dag=dag, task_id='copy_airport_codes_table', should_run=should_run)
copy_global_temperatures_table = CSVToTableOperator(
    dag=dag, task_id='copy_global_temperatures_table', should_run=should_run)
copy_us_cities_table = CSVToTableOperator(
    dag=dag, task_id='copy_us_cities_table', should_run=should_run)

quality_checks_task = DataQualityOperator(
    dag=dag,
    task_id='data_quality_checks',
    quality_checks=quality_checks,
    should_run=should_run,
)

# drop -> create -> copy all four tables in parallel -> validate -> end.
_copy_tasks = [
    copy_immigration_table,
    copy_airport_codes_table,
    copy_global_temperatures_table,
    copy_us_cities_table,
]
(start_operator >> drop_tables >> create_tables >> _copy_tasks
 >> quality_checks_task >> end_operator)
コード例 #6
0
    dag=dag,
    table="d_port",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=bucket,
    s3_key='immigration_processed_files/port/I94_Port.csv',
    file_format='csv',
    create_stmt=SqlQueries.create_table_d_port)

# Check loaded data not null

# Fail if any staging_airport row is missing its identifier.
run_quality_checks_airports = DataQualityOperator(
    task_id="run_quality_checks_airports",
    redshift_conn_id="redshift",
    dq_checks=[{
        'check_sql': "SELECT COUNT(*) FROM staging_airport WHERE ident is null",
        'expected_result': 0,
    }],
    dag=dag,
)

# Fail if any demographics row is missing its city.
run_quality_checks_us_cities_demo = DataQualityOperator(
    task_id="run_quality_checks_us_cities_demo",
    redshift_conn_id="redshift",
    dq_checks=[{
        'check_sql': "SELECT COUNT(*) FROM staging_us_cities_demographics WHERE city is null",
        'expected_result': 0,
    }],
    dag=dag,
)
コード例 #7
0
    redshift_conn_id="redshift",
    sql_query=SqlQueries.time_table_insert,
    mode="truncate-insert")

# Task to perform quality checks on the data uploaded in Redshift.
# Each entry pairs a COUNT(*) query with the value it must return.
_qa_checks = [
    {'check_sql': "SELECT COUNT(*) FROM users WHERE userid is null",
     'expected_result': 0},
    # NOTE(review): a hard-coded row count only holds for this exact input
    # data set — confirm the expectation of 104 users is intentional.
    {'check_sql': "SELECT COUNT(*) FROM users",
     'expected_result': 104},
    {'check_sql': "SELECT COUNT(*) FROM artists WHERE artistid is null",
     'expected_result': 0},
    {'check_sql': "SELECT COUNT(*) FROM songplays WHERE playid is null AND userid IS NULL",
     'expected_result': 0},
    {'check_sql': "SELECT COUNT(*) FROM songs WHERE songid is null",
     'expected_result': 0},
]
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    qa_check_list=_qa_checks,
    redshift_conn_id="redshift",
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# order of execution for the dag
start_operator >> create_table
コード例 #8
0
# Load the time dimension from the staged data.
load_time_dimension_table = LoadDimensionOperator(
    task_id=load_time_dimension_table_task_id,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="time",
    start_date=datetime(2018, 5, 1),
    sql_source=SqlQueries.time_table_insert,
    dag=dag,
)

# Validate every staging, dimension and fact table after loading.
run_quality_checks = DataQualityOperator(
    task_id=run_data_quality_checks_task_id,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="time",
    tables=["staging_events", "users", 'staging_songs', "songs", "artists", "time"],
    provide_context=True,
    dag=dag,
)

end_operator = DummyOperator(task_id=stop_execution_task_id, dag=dag)

# Stage both sources, build the fact table, then fan out to the dimensions.
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_user_dimension_table,
    load_song_dimension_table,
    load_artist_dimension_table,
    load_time_dimension_table,
]
コード例 #9
0
    sql_load_data=SqlQueries.artist_table_insert
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    table="time",
    redshift_conn_id="redshift",
    sql_load_data=SqlQueries.time_table_insert,
    dag=dag,
)


# Single data-quality gate: no songplay row may lack a user id.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id="redshift",
    dq_query='SELECT COUNT(*) FROM public.songplays WHERE userid IS NULL',
    expected_result=0,
    dag=dag,
)


end_operator = DummyOperator(task_id='Stop_execution', dag=dag)


# Create tables first, stage both inputs, then build the fact table.
start_operator >> create_tables
create_tables >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
コード例 #10
0
    insert_mode='truncate'
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id='redshift',
    table='time',
    sql_query=SqlQueries.time_table_insert,
    insert_mode='truncate',
    dag=dag,
)

# Quality gate: every songplay row must carry a user id.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id='redshift',
    sql_query='select count(*) from songplays where userid is null;',
    expected_result=0,
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

#
# Task ordering for the DAG tasks
#
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [load_user_dimension_table, load_song_dimension_table]
コード例 #11
0
File: etl.py — Project: Sambeth/Airflow-Data-Pipeline
    table="songplays",
    data_source=SqlQueries.songplay_table_insert,
    dag=dag)

# (table name, insert-select query) pairs for every dimension table.
dim_tables_and_sources = [
    ("users", SqlQueries.user_table_insert),
    ("songs", SqlQueries.song_table_insert),
    ("artists", SqlQueries.artist_table_insert),
    ("time", SqlQueries.time_table_insert),
]

# A single task loads all four dimensions.
load_dimension_tables = LoadDimensionOperator(
    task_id='load_dim_tables',
    redshift_conn_id="redshift",
    tables=dim_tables_and_sources,
    dag=dag,
)

# NOTE(review): this receives (name, query) tuples rather than bare table
# names — confirm DataQualityOperator expects the same structure.
run_quality_checks = DataQualityOperator(
    task_id='run_data_quality_checks',
    redshift_conn_id="redshift",
    tables=dim_tables_and_sources,
    dag=dag,
)

end_operator = DummyOperator(task_id='stop_execution', dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> load_dimension_tables >> run_quality_checks >> end_operator
コード例 #12
0
    final_table=dim_artists_table_name,
    dql_sql=SqlQueries.artist_table_insert,
    dag=dag)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID,
    final_table=dim_time_table_name,
    dql_sql=SqlQueries.time_table_insert,
    dag=dag,
)

# One validator per staging, fact and dimension table.
_all_validators = [
    validator_stage_events, validator_stage_songs, validator_songplays,
    validator_songs, validator_artists, validator_events, validator_time,
    validator_users,
]
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID,
    data_quality_validations=_all_validators,
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# DAG dependency setup
start_operator >> db_setup_task
db_setup_task >> [
    stage_events_s3_to_redshift_and_validate_task,
    stage_songs_s3_to_redshift_and_validate_task,
] >> load_songplays_table_task
コード例 #13
0
    table='artists',
    select_sql=SqlQueries.artist_table_insert,
    append_insert=True,
    dag=dag)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id='redshift',
    table='time',
    select_sql=SqlQueries.time_table_insert,
    append_insert=True,
    dag=dag,
)

# Shared row-count check; the result must equal zero.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id='redshift',
    sql=SqlQueries.check_count,
    expected_result=0,
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Operator Dependencies
start_operator >> create_tables
create_tables >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_user_dimension_table,
    load_song_dimension_table,
    load_artist_dimension_table,
    load_time_dimension_table,
] >> run_quality_checks
run_quality_checks >> end_operator
コード例 #14
0
# Build the visitor-arrival fact table (truncate_insert reloads from scratch).
load_visitor_arrival_fact_table = LoadFactOperator(
    task_id='Load_fact_visitor_arrival_table',
    redshift_conn_id='redshift',
    table='fact_visitor_arrival',
    select_query=SqlQueries.visitor_arrival_table_insert,
    truncate_insert=True,
    dag=dag,
)

# Every dimension and the fact table must end up non-empty.
_non_empty_tables = [
    "dim_port", "dim_us_city", "dim_us_state",
    "dim_country", "dim_date", "fact_visitor_arrival",
]
run_has_rows_quality_checks = DataQualityOperator(
    task_id='Run_has_rows_data_quality_checks',
    redshift_conn_id='redshift',
    checks=[{"test": "SELECT COUNT(*) FROM " + table, "not_equals": "0"}
            for table in _non_empty_tables],
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Execution order
start_operator >> [
    stage_airport_codes_to_redshift,
    stage_countries_mapping_to_redshift,
    stage_travel_mode_mapping_to_redshift,
    stage_us_cities_to_redshift,
コード例 #15
0
          catchup=False
          )


# Marker tasks that delimit the staging and load phases of the DAG.
start_data_to_redshift_operation = DummyOperator(
    task_id='Begin_Migrating_Data_To_Staging_Tables', dag=dag)

end_data_to_redshift_operation = DummyOperator(
    task_id='Begin_Loading_Data_To_Fact_Dimension_Tables', dag=dag)

end_of_tasks = DummyOperator(
    task_id='End_Of_Execution', dag=dag)

# Validate all final fact/dimension tables in one task.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    conn_id='redshift',
    target_tables=[
        "dim_airport_table",
        "dim_demographic_table",
        "dim_visitor_table",
        "fact_city_data_table",
    ],
    dag=dag,
)

for data_type, table_name in staging_tables.items():
    stage_data_to_redshift = S3ToRedshiftOperator(
        task_id='Stage_' + data_type,
        dag=dag,
        table=table_name,
        drop_table=True,
        s3_bucket='udend-data',
        s3_folder=data_type,
        aws_connection_id='aws_credentials',
        redshift_connection_id='redshift',
        create_query=globals()[table_name],
        copy_options="json 'auto'"
コード例 #16
0
File: sparkify_dag.py — Project: jomavera/dataPipeline
    table='artists',
    redshift_conn_id='redshift',
    sql_create=SqlQueries.artist_table_create,
    sql_select=SqlQueries.artist_table_insert)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    table='time',
    redshift_conn_id='redshift',
    sql_create=SqlQueries.time_table_create,
    sql_select=SqlQueries.time_table_insert,
    dag=dag,
)

# Run the shared check query against each loaded table.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id='redshift',
    tables=['songplays', 'users', 'artists', 'songs'],
    sql_check=SqlQueries.check,
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

#---#---# Define Task Dependencies #---#---#

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_artist_dimension_table,
    load_song_dimension_table,
    load_time_dimension_table,
    load_user_dimension_table,
]
コード例 #17
0
    table='lounges',
    redshift_conn_id='redshift',
    sql=SqlQueries.lounges_table_insert)

load_fact_ratings_table = LoadFactOperator(
    task_id='Load_fact_ratings_fact_table',
    redshift_conn_id='redshift',
    sql=SqlQueries.fact_ratings_table_insert,
    dag=dag,
)

# Each test case: (count query, comparison, threshold) — all counts must be > 0.
# NOTE(review): aircrafts_count_test appears three times while airports and
# seats checks look absent — confirm the duplication is intentional.
ensure_data_load_in_dims = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id='redshift',
    test_cases=[
        (SqlQueries.airlines_count_test, operator.gt, 0),
        (SqlQueries.aircrafts_count_test, operator.gt, 0),
        (SqlQueries.lounges_count_test, operator.gt, 0),
        (SqlQueries.aircrafts_count_test, operator.gt, 0),
        (SqlQueries.aircrafts_count_test, operator.gt, 0),
    ],
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# All four staging tasks fan out from the start marker.
start_operator >> [
    stage_airlines_to_redshift,
    stage_airports_to_redshift,
    stage_lounges_to_redshift,
    stage_seats_to_redshift,
]

stage_airlines_to_redshift >> load_airlines_dimension_table >> load_aircrafts_dimension_table >> load_passengers_dimension_table
stage_airports_to_redshift >> load_airports_dimension_table >> load_lounges_dimension_table >> load_passengers_dimension_table
    dag=dag)

# Copy the foreign dimension from S3, truncating the table first.
load_dim_foreign_table = LoadToRedshiftOperator(
    task_id='Load_Dim_Foreign_Table',
    aws_credentials_id='aws_credentials',
    redshift_conn_id='redshift',
    s3_bucket='my-bucket',
    s3_key='dim_foreign_table',
    table='dim_foreign_table',
    mode='truncate',
    dag=dag,
)

run_quality_checks = DataQualityOperator(
    task_id='Run_Data_Quality_Checks',
    redshift_conn_id='redshift',
    tables=['fact_table', 'dim_state_table', 'dim_visa_table', 'dim_foreign_table'],
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_Execution', dag=dag)

# Fact first, then the three dimensions in parallel, then the checks.
start_operator >> load_fact_table
load_fact_table >> [
    load_dim_state_table, load_dim_visa_table, load_dim_foreign_table
] >> run_quality_checks
コード例 #19
0
)

# One staging task per dimension CSV export in the input bucket.
dim_operators = [
    StageToRedshiftOperator(
        task_id=f"Copy_{dim_table_name}_dim_table",
        table=dim_table_name,
        conn_id=REDSHIFT_CONN_ID,
        aws_credentials_id=AWS_CREDENTIALS_ID,
        s3_bucket=INPUT_BUCKET,
        s3_key=f"{dim_table_name}.csv/*.csv",
        file_format="CSV",
        provide_context=True,
        dag=dag,
    )
    for dim_table_name in dim_table_names
]

# Validate every dimension table plus the fact table.
run_quality_checks = DataQualityOperator(
    task_id="Run_data_quality_checks",
    conn_id=REDSHIFT_CONN_ID,
    tables=dim_table_names + [fact_table_name],
    dag=dag,
)

end_operator = DummyOperator(task_id="Stop_execution", dag=dag)

start_operator >> [process_sale, process_postcode]
# The last dim operator comes from postcode data; the others from sale data.
process_sale >> create_sale_tables >> dim_operators[:-1] + [stage_sale_table]
process_postcode >> create_postcode_tables >> dim_operators[-1]
dim_operators + [stage_sale_table] >> run_quality_checks
run_quality_checks >> end_operator
コード例 #20
0
# Truncate-and-reload both fact tables.
load_covid19_fact_table = LoadTableOperator(
    task_id='load_covid19_fact_table',
    table='fact.us_covid_19',
    select_sql=SqlLoad.insert_covid_table,
    mode='truncate',
    dag=dag,
)

load_accident_fact_table = LoadTableOperator(
    task_id='load_accident_fact_table',
    table='fact.us_accidents',
    select_sql=SqlLoad.insert_accident_table,
    mode='truncate',
    dag=dag,
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    check_stmts=data_quality_args,
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# DAG dependencies

# All three schemas are created in parallel before the barrier task.
_schema_tasks = [create_staging_schema, create_dim_schema, create_fact_schema]
start_operator >> _schema_tasks
_schema_tasks >> schema_created

schema_created >> [
    create_table_staging_covid_19, create_table_staging_us_demographics,
コード例 #21
0
        parent_dag_name=dag_name,
        task_id=load_time_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="users",
        start_date= datetime(2018, 5, 1),
        sql_query=SqlQueries.time_table_insert,
    ),
    task_id=load_time_dimension_table_task_id,
    dag=dag
)

# Verify every warehouse table after all dimension loads complete.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    tables=["songplay", "users", "song", "artist", "time"],
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Setting tasks dependencies

start_operator >> create_redshift_tables >> [stage_songs_to_redshift, stage_events_to_redshift]

[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table

load_songplays_table >> [
    load_user_dimension_table,
    load_song_dimension_table,
    load_artist_dimension_table,
    load_time_dimension_table,
] >> run_quality_checks
コード例 #22
0
    task_id='Load_i94visitors_fact',
    dag=dag,
    redshift_conn_id="redshift",
    table_query=SqlQueries.visitors_fact_insert)

# Load the dates dimension; truncate_flag='Y' clears the table first.
load_dates_dim = LoadDimensionOperator(
    task_id='Load_dates_dim',
    redshift_conn_id="redshift",
    table_query=SqlQueries.dates_dim_insert,
    table="dates_dim",
    truncate_flag='Y',
    dag=dag,
)

# NOTE(review): expected_count=10000 against a null-count query is surprising —
# confirm whether the operator treats it as an upper bound or an exact match.
run_quality_check = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id="redshift",
    check_query=
    "select count(1) from public.i94visitors_fact where reasonforvisit is null",
    expected_count=10000,
    dag=dag,
)

# Referential check: every fact airportid must exist in airports_dim.
run_intg_check = IntegrityCheckOperator(
    task_id='Run_data_integrity_check',
    redshift_conn_id="redshift",
    check_query=
    "select count(airportid) from public.i94visitors_fact where airportid not in (select airportid from airports_dim)",
    table_name="i94visitors_fact",
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_i94vistors
コード例 #23
0
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='artists',
    sql=SqlQueries.artists_table_insert)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id='redshift',
    table='time',
    sql=SqlQueries.time_table_insert,
    dag=dag,
)

# Run the quality checks over every warehouse table.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id='redshift',
    table_list=['songplays', 'artists', 'songs', 'users', 'time'],
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> table_created_check

table_created_check >> [stage_events_to_redshift, stage_songs_to_redshift]

stage_events_to_redshift >> [
    load_songplays_fact_table, load_users_dimension_table
]
stage_songs_to_redshift >> [
    load_songplays_fact_table, load_artists_dimension_table,
    load_songs_dimension_table
コード例 #24
0
        "bc",
        "temperature",
        "presure",
        "humidity",
        "clouds",
        "wind_speed",
        "wind_deg",
        "measure_date",
        "weather_id",
    ],
)

# table=None makes the operator validate every table it knows about.
run_quality_checks = DataQualityOperator(
    task_id="Run_data_quality_checks",
    conn_id="postgres",
    table=None,  # If table is none, means that quality checks are done to all tables
    dag=dag,
)

end_operator = DummyOperator(task_id="Stop_execution", dag=dag)

# Level 1: create every base table straight from the start marker.
start_operator >> [
    create_location_table,
    create_time_table,
    create_weather_table,
    create_staging_table,
]

# Level 2: measures and location data both depend on the location table.
create_location_table >> [create_measures_table, populate_location_table]
コード例 #25
0
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    sql_query=SqlQueries.artist_table_insert,
    table_name="artists",
    append_only=False)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id='redshift',
    sql_query=SqlQueries.time_table_insert,
    table_name="time",
    append_only=False,  # presumably a full refresh rather than append — confirm operator semantics
    dag=dag,
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id='redshift',
    tables=["artists", "songplays", "songs", "time", "users"],
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Task dependencies
start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_user_dimension_table,
    load_song_dimension_table,
    load_artist_dimension_table,
    load_time_dimension_table,
] >> run_quality_checks
run_quality_checks >> end_operator
コード例 #26
0
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="artists",
    sql_columns=SqlQueries.artist_table_insert)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id="redshift",
    table="time",
    sql_columns=SqlQueries.time_table_insert,
    dag=dag,
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id="redshift",
    table_collection=["songplays", "users", "songs", "artists", "time"],
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Every CREATE TABLE task must finish before the transient barrier.
for _create_task in (
    create_staging_songs_table,
    create_staging_events_table,
    create_songplays_table,
    create_artists_table,
    create_songs_table,
    create_users_table,
    create_time_table,
):
    start_operator >> _create_task >> transient_operator

transient_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
コード例 #27
0
load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    redshift_conn_id="redshift",
    destination_table="artists",
    sql_query=SqlQueries.artist_table_insert,
    dag=dag,
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id="redshift",
    destination_table="time",
    sql_query=SqlQueries.time_table_insert,
    dag=dag,
)

# NOTE(review): only "time" is named although three dimension loads feed this
# task — confirm the operator checks more than the one target table.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id="redshift",
    target_table="time",
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_user_dimension_table,
    load_song_dimension_table,
    load_artist_dimension_table,
    load_time_dimension_table,
]
[load_time_dimension_table, load_artist_dimension_table,
 load_song_dimension_table] >> run_quality_checks
コード例 #28
0
)

# Stage the joined business-reviews JSON from S3 into Redshift.
copy_business_reviews_toRedshift = StageToRedshiftOperator(
    task_id='copy_business_reviews_toRedshift',
    table="business_reviews",
    s3_bucket=Variable.get("yelp_bucket"),
    s3_key='output/business_reviews.json',
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    is_json=True,
    dag=dag,
)

data_quality_task = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id="redshift",
    table="business_reviews",
    dag=dag,
)

end_operator = DummyOperator(task_id='end_of_execution', dag=dag)

copy_start_operator = DummyOperator(task_id='copy_start_operator', dag=dag)

# Every dimension must be ready before the fact table is built.
start_operator >> [
    state_dim_task,
    category_dim_task,
    ambience_dim_task,
    user_dim_task,
] >> business_reviews_fact_task

# business_reviews_fact_task >> delete_from_s3_task
business_reviews_fact_task >> create_tables_task
コード例 #29
0
    insert_query=SqlQueries.rides_table_insert,
    task_id='Load_rides_facts_table',
    dag=dag)

# Dimension loads: each creates its table (create_query) then inserts rows.
load_stations_dimension_table = LoadDimensionOperator(
    task_id='Load_stations_dim_table',
    redshift_conn_id="redshift",
    table="stations",
    create_query=SqlQueries.create_stations,
    insert_query=SqlQueries.stations_table_insert,
    dag=dag,
)

load_weather_dimension_table = LoadDimensionOperator(
    task_id='Load_weather_dim_table',
    redshift_conn_id="redshift",
    table="weather",
    create_query=SqlQueries.create_weather,
    insert_query=SqlQueries.weather_table_insert,
    dag=dag,
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_conn_id="redshift",
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Stage both raw feeds -> wait barrier -> parallel loads -> checks -> end.
_raw_stage_tasks = [stage_weather_raw_to_redshift, stage_bikes_raw_to_redshift]
_load_tasks = [
    load_rides_facts_table,
    load_stations_dimension_table,
    load_weather_dimension_table,
]
(start_operator >> _raw_stage_tasks >> wait_operator >> _load_tasks
 >> run_quality_checks >> end_operator)
コード例 #30
0
    task_id='Load_time_dim_table',
    dag=dag,
    table='time',
    redshift_conn_id='redshift',
    dimension_query=SqlQueries.time_table_insert,
    delect_or_append=
    'delete'  # can either be 'append' to append data or 'delete' to truncate table and then add data. 
)

# Quality checks: the users and songs primary keys must never be NULL.
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    table='songplays',
    column='userid',
    redshift_conn_id='redshift',
    dq_checks=[
        {'check_sql': "SELECT COUNT(*) FROM users WHERE userid is null",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM songs WHERE songid is null",
         'expected_result': 0},
    ],
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [load_user_dimension_table, load_song_dimension_table]