s3_bucket="udacity-dend", s3_key="song_data/", aws_region="us-west-2", json="auto") load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, redshift_conn_id="redshift", sql_insert=SqlQueries.songplay_table_insert, destination_table="public.songplays") load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, redshift_conn_id="redshift", sql_insert=SqlQueries.user_table_insert, destination_table="public.users", delete=True) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, redshift_conn_id="redshift", sql_insert=SqlQueries.song_table_insert, destination_table="public.songs", delete=True) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag,
json_path='auto') #4. Use staging tables to populate fact table load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, redshift_conn_id="redshift", table="songplays", sql_query=SqlQueries.songplay_table_insert, delete_first=True) #5. Use staging tables to populate user table load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, redshift_conn_id="redshift", table="users", sql_query=SqlQueries.user_table_insert, delete_first=True) #6. Use staging tables to populate song_table table load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, redshift_conn_id="redshift", table="songs", sql_query=SqlQueries.song_table_insert, delete_first=True) #7. Use staging tables to populate artist table load_artist_dimension_table = LoadDimensionOperator(
redshift_conn_id = 'redshift', table="songplays", sql_query = SqlQueries.songplay_table_insert, dag=dag, append_only=False ) """ connecting to redshift running the LoadDimensionOperator operator with sql_queries.py """ load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', redshift_conn_id = 'redshift', table="users", sql_query = SqlQueries.user_table_insert, dag=dag, append_only=False ) """ connecting to redshift running the LoadDimensionOperator operator with sql_queries.py """ load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', redshift_conn_id = 'redshift', table="songs", sql_query = SqlQueries.song_table_insert,
append_data=True, aws_credentials={ 'key': AWS_KEY, 'secret': AWS_SECRET }, region='us-west-2', sql_statement=SqlQueries.songplays_table_insert, provide_context=True) load_users_dimension_table = LoadDimensionOperator( task_id='Load_users_dim_table', dag=dag, target_table='users', redshift_conn_id='redshift', append_data=False, aws_credentials={ 'key': AWS_KEY, 'secret': AWS_SECRET }, region='us-west-2', sql_statement=SqlQueries.users_table_insert, provide_context=True) load_songs_dimension_table = LoadDimensionOperator( task_id='Load_songs_dim_table', dag=dag, target_table='songs', redshift_conn_id='redshift', append_data=False, aws_credentials={ 'key': AWS_KEY,
aws_credentials_id='aws_credentials', s3_bucket='udacity-dend', s3_key= 'song_data', # load a small portion of song data with 'song_data/A/A/A' json_path='auto') load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table', dag=dag, conn_id='redshift', sql=SqlQueries.songplay_table_insert, target_table='songplays') load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, conn_id='redshift', sql=SqlQueries.user_table_insert, target_table='users', delete_first=True) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, conn_id='redshift', sql=SqlQueries.song_table_insert, target_table='songs', delete_first=True) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag,
s3_key="song_data", aws_credentials_id='aws_credentials' ) load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, redshift_conn_id='redshift', table='songplays', sql_query=SqlQueries.songplay_table_insert ) load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, redshift_conn_id='redshift', table='users', sql_query=SqlQueries.user_table_insert ) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, redshift_conn_id='redshift', table='songs', sql_query=SqlQueries.song_table_insert ) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag,
}, { 'item': "devise", "query": SqlQueries.devise_table_insert }, { 'item': "cours", "query": SqlQueries.cours_table_insert }, { 'item': "magasin", "query": SqlQueries.magasin_table_insert }] for dimension_item in dimension_items: load_dimension_table = LoadDimensionOperator( task_id=f"load_{dimension_item['item']}_dimension_table", dag=dag, redshift_conn_id="redshift", table=dimension_item["item"], query=dimension_item["query"], append=False) load_dimension_tables.append(load_dimension_table) ### Build fact table milestone_2 = DummyOperator(task_id='milestone_2', dag=dag) Load_sales_fact_table = LoadFactOperator(task_id='Load_sales_fact_table', dag=dag, redshift_conn_id="redshift", table="sales", query=SqlQueries.sales_table_insert) ### Quality checks
s3_bucket="udacity-dend", s3_key="song_data/A/A/A", table="staging_songs", json="auto") load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, redshift_conn_id="redshift", destination_table="songplays", facts_sql=SqlQueries.songplay_table_insert) load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, redshift_conn_id="redshift", destination_table="users", dim_sql=SqlQueries.user_table_insert, append_mode=False) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, redshift_conn_id="redshift", destination_table="songs", dim_sql=SqlQueries.song_table_insert, append_mode=False) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag,
) # Calling LoadFactOperator to load the data into songplays fact table load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', sql_statement=SqlQueries.songplay_table_insert, target_table='songplays', redshift_connection_id='redshift', dag=dag ) # Calling LoadDimensionOperator to load the data into users dimension table load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', target_table='users', redshift_connection_id='redshift', sql_statement=SqlQueries.user_table_insert, truncate=False, dag=dag ) # Calling LoadDimensionOperator to load the data into songs dimension table load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', target_table='songs', redshift_connection_id='redshift', sql_statement=SqlQueries.song_table_insert, truncate=False, dag=dag ) # Calling LoadDimensionOperator to load the data into artists dimension table
task_id='Stage_songs', dag=dag, aws_credentials_id="aws_credentials", iam_role="Redshift_Read_S3", redshift_conn_id="redshift", s3_json_structure_path="s3://udacity-redshift/song_paths.json", s3_data_path="s3://udacity-dend/song_data", table='staging_songs') load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table', dag=dag, redshift_conn_id="redshift") load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, redshift_conn_id="redshift", table="users") load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, redshift_conn_id="redshift", table="songs") load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag, redshift_conn_id="redshift", table="artists")
dag=dag) stage_reviews_to_redshift = StageToRedshiftOperator( task_id='stage_reviews', redshift_conn_id='redshift', table='staging_reviews', s3_bucket='podcast-project', s3_key='reviews.csv', dag=dag) load_podcast_stats_table = LoadDimensionOperator( task_id='load_podcast_agg_table', redshift_conn_id='redshift', destination_table='podcast_agg_reviews', query_dimension=""" SELECT sp.podcast_id, sp.title ,avg(sr.rating) ,count(r.*) FROM staging_podcast AS sp LEFT JOIN staging_reviews AS sr ON sp.podcast_id = sr.podcast_id WHERE sr.rating IS NOT NULL GROUP BY sp.podcast_id, sp.title;""", dag=dag) load_categories_stats_table = LoadDimensionOperator( task_id='load_categories_agg_table', redshift_conn_id='redshift', destination_table='categories_agg_reviews', query_dimension="""(category, total_podcast, category_avg_rating) SELECT sc.category,count(distinct sc.podcast_id),avg(sr.rating) FROM staging_category AS sc LEFT JOIN staging_reviews AS sr ON sc.podcast_id = sr.podcast_id WHERE sr.rating IS NOT NULL
json_file="auto", region="us-west-2") load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, target_table="songplays", sql_table_create=SqlQueries.songplay_table_create, sql_table_insert=SqlQueries.songplay_table_insert, redshift_conn_id="redshift", mode="") load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, target_table="users", sql_table_create=SqlQueries.user_table_create, sql_table_insert=SqlQueries.user_table_insert, redshift_conn_id="redshift") load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, target_table="songs", sql_table_create=SqlQueries.song_table_create, sql_table_insert=SqlQueries.song_table_insert, redshift_conn_id="redshift") load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag,
task_id='Stage_songs', dag=dag, table="staging_songs", redshift_conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket="udacity-dend", s3_key="song_data/A/B/C/TRABCEI128F424C983.json") load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table', dag=dag, redshift_conn_id="redshift", destination_table="songplays") load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, redshift_conn_id="redshift", sql_statement=sql_queries.user_table_insert, table_name='users') load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, redshift_conn_id="redshift", sql_statement=sql_queries.song_table_insert, table_name='songs') load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag, redshift_conn_id="redshift", sql_statement=sql_queries.artist_table_insert,
aws_credentials_id='aws_credentials', json='s3://udacity-dend/song_data', s3_bucket='udacity-dend', s3_key='song_data') load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table', dag=dag, aws_credentials='aws_credentials', table='public.songplays', truncate_table=True, query=SqlQueries.songplay_table_insert) load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, aws_credentials='aws_credentials', table='public.users', truncate_table=True, query=SqlQueries.user_table_insert) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, aws_credentials='aws_credentials', table='public.songs', truncate_table=True, query=SqlQueries.song_table_insert) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag,
email_on_retry=False) load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table', dag=dag, table="songplays", redshift_conn_id='redshift', depends_on_past=False, retries=3, retry_delay=timedelta(minutes=5), email_on_retry=False) load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, table="users", redshift_conn_id='redshift', depends_on_past=False, retries=3, retry_delay=timedelta(minutes=5), email_on_retry=False) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, table="songs", redshift_conn_id='redshift', depends_on_past=False, retries=3, retry_delay=timedelta(minutes=5), email_on_retry=False)
from operators.load_dimension import LoadDimensionOperator from helpers import SqlQueries def subdag( parent_dag_name, task_id, redshift_conn_id, aws_credentials_id, table, sql_query, *args, **kwargs): dag = DAG( f"{parent_dag_name}.{task_id}", **kwargs ) """ Inserts Data into a dimensional redshift table from staging tables. """ load_dimension_table = LoadDimensionOperator( task_id=f"load_{table}_dim_table", dag=dag, table=table, redshift_conn_id=redshift_conn_id, aws_credentials_id=aws_credentials_id, sql_query=sql_query ) load_dimension_table
s3_bucket='udacity-dend', s3_key='song_data', aws_region='us-west-2') load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, redshift_conn_id='redshift', append_data=append_data, query=SqlQueries.songplay_table_insert, ) load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, redshift_conn_id='redshift', append_data=append_data, table='users', query=SqlQueries.user_table_insert, ) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, redshift_conn_id='redshift', append_data=append_data, table='songs', query=SqlQueries.song_table_insert, ) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table',
task_id='create_dwh_tables', dag=dag, create_or_delete='create', staging_or_dwh='dwh', redshift_conn_id='redshift_conn_id', ) load_artists_table = LoadFactOperator(task_id='load_artists_fact_table', redshift_conn_id="redshift_conn_id", table='artists', append=True, dag=dag) load_concerts_table = LoadDimensionOperator( task_id='load_concerts_dimension_table', redshift_conn_id="redshift_conn_id", table='concerts', append=True, dag=dag) load_songs_table = LoadDimensionOperator(task_id='load_songs_dimension_table', redshift_conn_id="redshift_conn_id", table='songs', append=True, dag=dag) check_data_quality = DataQualityOperator( task_id='check_data_quality', dag=dag, redshift_conn_id='redshift_conn_id', dq_checks=[ {
json_format="'auto'", ) load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, table='songplays', redshift_conn_id="redshift", truncate_table=False, select_sql=SqlQueries.songplay_table_insert, ) load_user_dimension_table = LoadDimensionOperator( task_id='Load_users_dim_table', dag=dag, table='users', redshift_conn_id="redshift", truncate_table=True, select_sql=SqlQueries.user_table_insert, ) load_song_dimension_table = LoadDimensionOperator( task_id='Load_songs_dim_table', dag=dag, table='songs', redshift_conn_id="redshift", truncate_table=True, select_sql=SqlQueries.song_table_insert, ) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artists_dim_table',
aws_credentials_id="aws_credentials", s3_bucket="udacity-dend", s3_key="song_data/", format_json=Variable.get('json_song_format', default_var=default_json_song_format)) load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table', redshift_conn_id="redshift", table='songplays', dag=dag) load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', redshift_conn_id="redshift", table='users', params={ 'append_flag': Variable.get('append_flag', default_var=default_append_flag) }, dag=dag) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', redshift_conn_id="redshift", table='songs', params={ 'append_flag': Variable.get('append_flag', default_var=default_append_flag) }, dag=dag)
s3_data_path="s3://udacity-dend/song_data", json_schema="auto", ) load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, redshift_conn_id="redshift", sql_query=SqlQueries.songplay_table_insert, filter_expr="WHERE page='NextSong'" ) load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, redshift_conn_id="redshift", sql_query=SqlQueries.user_table_insert, filter_expr="WHERE page='NextSong'" ) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, redshift_conn_id="redshift", sql_query=SqlQueries.song_table_insert, filter_expr="" ) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag,
provide_context=True, ) load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, redshift_conn='redshift', table='songplays', sql=SqlQueries.songplay_table_insert, provide_context=False, ) load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, redshift_conn='redshift', table='users', sql=SqlQueries.user_table_insert, provide_context=False, ) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, redshift_conn='redshift', table='songs', sql=SqlQueries.song_table_insert, provide_context=False, ) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table',
ignore_header=1, delimiter=';', ) load_immigration_facts_table = LoadFactOperator( task_id='Load_immigration_facts_fact_table', dag=dag, redshift_conn_id="redshift", sql_query=insert_queries['immigration_facts'], filter_expr="WHERE cicid is not null", ) load_states_dimension_table = LoadDimensionOperator( task_id='Load_states_dim_table', dag=dag, redshift_conn_id="redshift", sql_query=insert_queries['states'], filter_expr="", mode='append') load_cities_dimension_table = LoadDimensionOperator( task_id='Load_cities_dim_table', dag=dag, redshift_conn_id="redshift", sql_query=insert_queries['cities'], filter_expr="", mode='append') load_times_dimension_table = LoadDimensionOperator( task_id='Load_times_dim_table', dag=dag,
end_staging = DummyOperator(task_id='end_staging', dag=dag) # Load Fact table load_fact_table = LoadFactOperator( task_id='load_fact_table', dag=dag, redshift_conn_id="redshift", table_name="fact_temperature", sql_insert_stmt=SqlQueries.fact_table_insert, truncate=False) # Load Dimension table load_time_dimension_table = LoadDimensionOperator( task_id='load_time_dimension_table', dag=dag, redshift_conn_id="redshift", table_name="time", sql_insert_stmt=SqlQueries.time_table_insert, truncate=False) # Load Dimension table load_airport_dimension_table = LoadDimensionOperator( task_id='load_airport_dimension_table', dag=dag, redshift_conn_id="redshift", table_name="airport", sql_insert_stmt=SqlQueries.airport_table_insert, truncate=False) # Load Dimension table load_demographic_dimension_table = LoadDimensionOperator(
schema='public', table='staging_songs', redshift_conn_id='redshift', aws_conn_id='aws_credentials', copy_options=["JSON 'auto ignorecase'"]) load_songplays = LoadFactOperator( task_id='load_fact_songplays', dag=dag, insert_table_sql=insert_tables.songplays, redshift_conn_id='redshift') load_users = LoadDimensionOperator( task_id='load_dim_users', dag=dag, insert_table_sql=insert_tables.users, schema='public', table='users', truncate=False, redshift_conn_id='redshift') load_songs = LoadDimensionOperator( task_id='load_dim_songs', dag=dag, insert_table_sql=insert_tables.songs, schema='public', table='songs', truncate=False, redshift_conn_id='redshift') load_artists = LoadDimensionOperator( task_id='load_dim_artists',
append_data=True, aws_credentials={ 'key': AWS_KEY, 'secret': AWS_SECRET }, region='us-east-1', sql_statement=SqlQueries.pleasurevisits_table_insert, provide_context=True) load_flights_dimension_table = LoadDimensionOperator( task_id='Load_flights_dim_table', dag=dag, target_table='flights', redshift_conn_id='redshift', append_data=False, aws_credentials={ 'key': AWS_KEY, 'secret': AWS_SECRET }, region='us-east-1', sql_statement=SqlQueries.flights_table_insert, provide_context=True) load_cities_dimension_table = LoadDimensionOperator( task_id='Load_cities_dim_table', dag=dag, target_table='cities', redshift_conn_id='redshift', append_data=False, aws_credentials={ 'key': AWS_KEY,
s3_bucket="udacity-dend", s3_key="song_data/A/A/A", ) load_songplays_table = LoadFactOperator( task_id='Load_songplays_fact_table', dag=dag, table='songplays', redshift_conn_id='redshift', aws_conn_id='aws_credentials', insert_sql_qry=SqlQueries.songplay_table_insert) load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, table='users', redshift_conn_id='redshift', aws_conn_id='aws_credentials', insert_sql_qry=SqlQueries.user_table_insert) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, table='songs', redshift_conn_id='redshift', aws_conn_id='aws_credentials', insert_sql_qry=SqlQueries.song_table_insert) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag,
append=True, dag=dag ) load_carpark_availability_table = LoadFactOperator( task_id='load_carpark_availability_fact_table', redshift_conn_id="redshift", table='carpark_availability', append=True, dag=dag ) load_carpark_table = LoadDimensionOperator( task_id='load_carpark_info_dimension_table', redshift_conn_id="redshift", table='carpark', append=False, dag=dag ) load_weather_station_table = LoadDimensionOperator( task_id='load_weather_stations_info_dimension_table', redshift_conn_id="redshift", table='weather_stations', append=False, dag=dag ) load_time_table = LoadDimensionOperator( task_id='load_time_dimension_table', redshift_conn_id='redshift',
table="i94country_code", s3_bucket=S3_BUCKET, s3_key="raw/i94_immigration_labels_description/country_code.csv") load_usa_travelers_info = LoadFactOperator( task_id="load_usa_travelers_info", dag=dag, redshift_conn_id=REDSHIFT_CONN_ID, schema=SCHEMA_NAME, table="city_state_travelers_entry", insert_sql=SqlQueries.city_state_travelers_entry_insert) load_arrival_date = LoadDimensionOperator( task_id="load_arrival_date", dag=dag, redshift_conn_id=REDSHIFT_CONN_ID, schema=SCHEMA_NAME, table="arrival_date", insert_sql=SqlQueries.arrival_date_insert) # Data Quality # get the dq_checks_settings for data quality # file: [airflow_file]/plugins/helpers/dq_check_settings.json airflow_file = pathlib.Path(__file__).parent.parent.absolute() dq_check_settings = os.path.join(airflow_file, "plugins", "helpers", "dq_check_settings.json") with open(dq_check_settings) as json_file: dq_checks = json.load(json_file) dq_checks = dq_checks['dq_checks'] run_quality_checks = DataQualityOperator(task_id="run_data_quality_checks",
table="staging_songs", s3_bucket="udacity-dend", json_path="auto", region="us-west-2", overwrite=True) load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table', dag=dag, sql=SqlQueries.songplay_table_insert, redshift_conn_id="redshift", target_table="public.songplays") load_user_dimension_table = LoadDimensionOperator( task_id='Load_user_dim_table', dag=dag, sql=SqlQueries.user_table_insert, redshift_conn_id="redshift", target_table="public.users", overwrite=True) load_song_dimension_table = LoadDimensionOperator( task_id='Load_song_dim_table', dag=dag, sql=SqlQueries.song_table_insert, redshift_conn_id="redshift", target_table="public.songs", overwrite=True) load_artist_dimension_table = LoadDimensionOperator( task_id='Load_artist_dim_table', dag=dag,