def stage_s3_to_redshift_dag(
    parent_dag_name: str,
    task_id: str,
    redshift_conn_id: str = "",
    aws_credentials_id: str = "",
    target_table: str = "",
    s3_bucket: Optional[str] = None,
    s3_key: Optional[str] = None,
    json_path: Optional[str] = None,
    ignore_headers: Optional[int] = None,
    delimiter: Optional[str] = None,
    default_args: Optional[Dict[str, Any]] = None,
    *args,
    **kwargs,
):
    dag = DAG(
        dag_id=f"{parent_dag_name}.{task_id}",
        default_args=default_args,
        **kwargs,
    )

    stage_events_to_redshift = StageToRedshiftOperator(
        task_id=f"{parent_dag_name}.Stage_events",
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        target_table=target_table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        json_path=json_path,
        ignore_headers=ignore_headers,
        delimiter=delimiter,
        dag=dag,
        *args,
        **kwargs,
    )

    validation_songplays = DataQualityValidator(
        sql_statement=f"SELECT COUNT(*) FROM {target_table}",
        result_to_assert=0,
        should_assert_for_equality=False,
    )

    check_data_task = DataQualityOperator(
        task_id=f"{parent_dag_name}.Data_Quality_Check",
        redshift_conn_id=redshift_conn_id,
        data_quality_validations=[validation_songplays],
        dag=dag,
    )

    stage_events_to_redshift >> check_data_task

    return dag
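# A hedged sketch of how the factory above might be wired into a parent DAG
# via SubDagOperator; the "sparkify_etl" name, the argument values, and the
# surrounding `dag`/`default_args` objects are illustrative assumptions,
# not taken from the snippet.
from airflow.operators.subdag_operator import SubDagOperator

stage_events_subdag = SubDagOperator(
    subdag=stage_s3_to_redshift_dag(
        parent_dag_name="sparkify_etl",          # must match the parent dag_id
        task_id="stage_events_subdag",           # must match the SubDagOperator task_id
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        target_table="staging_events",
        s3_bucket="udacity-dend",
        s3_key="log_data",
        default_args=default_args,
    ),
    task_id="stage_events_subdag",
    dag=dag,
)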
    truncate=False,
    dag=dag
)

# Calling LoadDimensionOperator to load the data into the time dimension table
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    target_table='time',
    redshift_connection_id='redshift',
    sql_statement=SqlQueries.time_table_insert,
    truncate=False,
    dag=dag
)

# Runs the quality check via DataQualityOperator on all fact and dimension
# tables once the data is loaded
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_connection_id='redshift',
    target_table=("songplays", "users", "songs", "artists", "time"),
    validate_column=("playid", "userid", "song_id", "artist_id", "start_time"),
    dag=dag
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Task order
start_operator >> [stage_log_events_to_redshift, stage_songs_to_redshift]
[stage_log_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [load_user_dimension_table, load_song_dimension_table,
                         load_artist_dimension_table, load_time_dimension_table]
[load_user_dimension_table, load_song_dimension_table,
 load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks
run_quality_checks >> end_operator
    task_id='Load_time_dim_table',
    redshift_conn_id='redshift',
    table="time",
    sql_query=SqlQueries.time_table_insert,
    dag=dag,
    append_only=False
)

# Connect to Redshift and run the DataQualityOperator
run_quality_checks = DataQualityOperator(
    task_id='run_data_quality_checks',
    redshift_conn_id="redshift",
    dag=dag,
    tables=["songplay", "users", "song", "artist", "time"]
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Order of tasks, starting with start_operator and ending with end_operator
start_operator >> create_tables_in_redshift
create_tables_in_redshift >> [stage_songs_to_redshift, stage_events_to_redshift] >> load_songplays_table
load_songplays_table >> [load_user_dimension_table, load_song_dimension_table,
                         load_artist_dimension_table, load_time_dimension_table] \
    >> run_quality_checks >> end_operator
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    conn_id='redshift',
    sql=SqlQueries.time_table_insert,
    target_table='time',
    truncate_first=True)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    conn_id='redshift',
    dq_checks=[
        {'check_sql': "SELECT COUNT(*) FROM songplays WHERE start_time IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM users WHERE userid IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM songs WHERE songid IS NULL",
         'expected_result': 0},
    ])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> create_tables
create_tables >> stage_events_to_redshift
create_tables >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
load_cities_dimension_table = LoadDimensionOperator(
    task_id='Load_cities_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    sql_query=insert_queries['cities'],
    filter_expr="",
    mode='append')

load_times_dimension_table = LoadDimensionOperator(
    task_id='Load_times_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    sql_query=insert_queries['times'],
    filter_expr="",
    mode='append')

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    table_list=['immigration_facts', 'states', 'cities', 'times'])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Use lists (not tuples) for fan-out/fan-in so Airflow's >> overloads apply
start_operator >> [stage_immigration_to_redshift,
                   stage_demography_to_redshift] >> load_immigration_facts_table
load_immigration_facts_table >> [load_states_dimension_table,
                                 load_cities_dimension_table,
                                 load_times_dimension_table] >> run_quality_checks
run_quality_checks >> end_operator
    sql_statement=SqlQueries.time_table_insert,
    provide_context=True)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials={'key': AWS_KEY, 'secret': AWS_SECRET},
    region='us-west-2',
    provide_context=True,
    dq_checks=[
        {'check_sql': "SELECT COUNT(*) FROM songplays WHERE playid IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM users WHERE userid IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM artists WHERE artistid IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM songs WHERE songid IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM time WHERE start_time IS NULL",
         'expected_result': 0},
    ])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)
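# Several snippets in this section pass a list of {'check_sql',
# 'expected_result'} dicts to a custom DataQualityOperator. A minimal sketch
# of such an operator, assuming Airflow 1.10-style imports and a
# PostgresHook-backed Redshift connection; this is an illustration of the
# pattern, not any one project's implementation.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class DataQualityOperator(BaseOperator):

    @apply_defaults
    def __init__(self, redshift_conn_id="", dq_checks=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.dq_checks = dq_checks or []

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        for check in self.dq_checks:
            # Run each check query and compare the first cell of the result
            records = redshift.get_records(check['check_sql'])
            if not records or records[0][0] != check['expected_result']:
                raise ValueError(
                    f"Data quality check failed: {check['check_sql']} "
                    f"returned {records[0][0] if records else 'no rows'}, "
                    f"expected {check['expected_result']}")
            self.log.info("Data quality check passed: %s", check['check_sql'])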
    sql=sql_queries_cloud.pres_staging_table_create)

pres_staging_table_populate = StageToRedshiftOperator(
    task_id="pres_staging_table_populate",
    dag=dag,
    provide_context=True,
    redshift_conn_id="my_redshift_conn",
    aws_credentials_id="my_aws_conn",
    table="pres_staging_table",
    s3_bucket="prescribing-data",
    s3_key="{{ execution_date.year }}_{{ ds[5:7] }}/T{{ execution_date.year }}{{ ds[5:7] }}PDPI_BNFT",
    header=True)

pres_fact_table_insert = PostgresOperator(
    task_id="pres_fact_table_insert",
    dag=dag,
    postgres_conn_id="my_redshift_conn",
    sql=sql_queries_cloud.pres_fact_table_insert)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    conn_id="my_redshift_conn",
    quality_checks=sql_queries_cloud.quality_tests)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> drop_staging_tables >> pres_staging_table_create
pres_staging_table_create >> pres_staging_table_populate >> pres_fact_table_insert
pres_fact_table_insert >> run_quality_checks >> end_operator
load_time_table = LoadFactOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    conn_id='redshift',
    sql=SqlQueries.time_table_insert,
    target_table='time')

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    conn_id='redshift',
    dq_checks=[
        {'check_sql': "SELECT COUNT(*) FROM arrivals WHERE cicid IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM admissions WHERE admnum IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM time WHERE arrdate IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM states WHERE state_code IS NULL",
         'expected_result': 0},
    ])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> create_tables
create_tables >> [
    truncate=False)

# Load dimension table
load_demographic_dimension_table = LoadDimensionOperator(
    task_id='load_demographic_dimension_table',
    dag=dag,
    redshift_conn_id="redshift",
    table_name="demographic",
    sql_insert_stmt=SqlQueries.demographic_table_insert,
    truncate=False)

end_loading = DummyOperator(task_id='end_loading', dag=dag)

run_quality_checks_on_fact_table = DataQualityOperator(
    task_id='run_quality_checks_on_fact_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="fact_temperature")

run_quality_checks_on_time_table = DataQualityOperator(
    task_id='run_quality_checks_on_time_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time")

run_quality_checks_on_airport_table = DataQualityOperator(
    task_id='run_quality_checks_on_airport_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="airport")
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn='redshift',
    table='time',
    sql=SqlQueries.time_table_insert,
    provide_context=False,
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn='redshift',
    quality_checks=[
        {'check_sql': "SELECT COUNT(*) FROM users",
         'expected_result': 140},
    ],
    provide_context=False,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table
]
[
    load_user_dimension_table,
    load_artist_dimension_table,
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id="redshift",
    table='time',
    params={
        'append_flag': Variable.get('append_flag', default_var=default_append_flag)
    },
    dag=dag)

# TODO: implement the checks as a SubDag.
run_quality_checks = DummyOperator(task_id='Run_checks_execution', dag=dag)

run_empty_tables = DataQualityOperator(
    task_id='Empty_tables_test',
    redshift_conn_id="redshift",
    sql_commands=SqlQueries.cardinality_queries,
    check_function=lambda x: x > 0,
    dag=dag,
)

run_dangling_keys = DataQualityOperator(
    task_id='Foreign_Key_violation',
    redshift_conn_id="redshift",
    sql_commands=SqlQueries.foreign_key_queries,
    check_function=lambda x: x < 1,
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

loading_phase_operator = DummyOperator(task_id='Loading', dag=dag)
redshift_conn_id="redshift_conn_id", table='songs', append=True, dag=dag) check_data_quality = DataQualityOperator( task_id='check_data_quality', dag=dag, redshift_conn_id='redshift_conn_id', dq_checks=[ { 'check_sql': "SELECT COUNT(*) FROM public.songs_dwh WHERE artist is null", 'expected_result': 0 }, { 'check_sql': "SELECT COUNT(*) FROM public.artists_dwh WHERE artists is null", 'expected_result': 0 }, { 'check_sql': "SELECT COUNT(*) FROM public.concerts_dwh WHERE artist is null", 'expected_result': 0 }, ]) end_operator = DummyOperator(task_id='stop_execution', dag=dag) start_operator >> export_events_to_s3 start_operator >> export_songs_to_s3
load_complaint_data = S3ToRedshiftTransfer(
    task_id='load_complaint_data',
    schema='public',
    table='fact_complaints',
    s3_bucket=Variable.get('aws_bucket'),
    s3_key='processed',
    redshift_conn_id='redshift_conn',
    aws_conn_id='aws_credentials',
    copy_options=['csv', "IGNOREHEADER 1"],
    dag=dag)

data_quality_query_one = "SELECT COUNT(*) FROM fact_complaints WHERE created_date IS NULL"
data_quality_check_one = DataQualityOperator(
    task_id='data_quality_check_one',
    dag=dag,
    redshift_conn_id="redshift_conn",  # match the connection ID used elsewhere in this DAG
    data_quality_check=data_quality_query_one)

data_quality_query_two = "SELECT COUNT(*) FROM fact_complaints WHERE community_board IS NULL"
data_quality_check_two = DataQualityOperator(
    task_id='data_quality_check_two',
    dag=dag,
    redshift_conn_id="redshift_conn",
    data_quality_check=data_quality_query_two)

end_operator = DummyOperator(task_id='execution_complete', dag=dag)

start_operator >> [
    get_weather_data, upload_demographics_data, upload_complaint_data
    dag=dag)

load_demographics_dim_to_redshift = LoadParquetToRedshift(
    task_id='load_demographics_dim',
    redshift_conn_id='redshift',
    table='demographics_dim',
    aws_credentials_id='aws_credentials',
    s3_bucket='data-engineering-nd',
    s3_key='capstone-project/output/demographics_dim.parquet/',
    dag=dag)

run_quality_checks = DataQualityOperator(
    task_id='run_quality_checks',
    redshift_connection_id='redshift',
    target_table=("immig_fact", "visa_type_dim", "travel_mode_dim", "time_dim",
                  "airport_dim", "country_temperature_dim", "demographics_dim"),
    validate_column=("cicid", "visa_category_id", "travel_mode_id", "arrival_sas",
                     "airport_code", "country_code", "stateCode"),
    dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> copy_files_task
copy_files_task >> [
    immigration_etl_task, airport_etl_task, temperature_etl_task,
    demographics_etl_task
]
[
    immigration_etl_task, airport_etl_task, temperature_etl_task,
    demographics_etl_task
# COPY SELECTED DATA FROM STAGING TABLE TO FACT TABLE
load_gdelt_fact_table = stage2table(
    task_id='load_gdelt_fact_table',
    dag=dag,
    redshift_conn_id="redshift",
    target_table="gdelt_events",
    target_columns=sql_statements.gdelt_fact_columns,
    insert_mode="append",  # delete_load/append
    query=sql_statements.gdelt_events_table_insert)

# RUN DATA QUALITY CHECKS
run_quality_checks = DataQualityOperator(
    task_id='run_gdelt_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    tests=[
        # there are no NULL values in significant fields
        (sql_statements.gdelt_check_nulls, "{}[0][0] == 0"),
        # it would be unusual to see fewer than 100 events
        (sql_statements.gdelt_num_records, "{}[0][0] >= 100"),
    ])

# CLEAR STAGING TABLE for a selected day only, as there might be multiple
# concurrent processes running
clearing_staging_gdelt_events = PythonOperator(
    task_id='clearing_staging_gdelt_events',
    dag=dag,
    python_callable=redshift_date_delete_rows,
    provide_context=True,
    op_kwargs={
        'redshift_conn_id': 'redshift',
        'target_table': 'staging_gdelt_events'
    },
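# The `tests` pairs above couple a SQL statement with a Python expression
# template. A guess at how the operator might evaluate them; the function
# name and eval-based approach are assumptions, not shown in the snippet.
def run_expression_tests(hook, tests):
    for sql, expression in tests:
        records = hook.get_records(sql)
        # Substitute the fetched result set into the template, e.g.
        # "{}[0][0] == 0" becomes "[(0,)][0][0] == 0", then evaluate it.
        if not eval(expression.format(records)):
            raise ValueError(f"Quality test failed: {sql!r} violated {expression!r}")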
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table='time',
    redshift_conn_id="redshift",
    truncate_table=True,
    select_sql=SqlQueries.time_table_insert,
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    tables=['songplays', 'users', 'songs', 'artists', 'time'],
    columns={
        'songplays': ['userid', 'playid'],
        'users': ['first_name', 'last_name'],
        'songs': ['songid', 'title', 'duration'],
        'artists': ['artistid', 'name'],
        'time': ['hour', 'day', 'week', 'month'],
    },
    redshift_conn_id="redshift",
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator \
    >> create_tables \
    >> [stage_events_to_redshift, stage_songs_to_redshift] \
    >> load_songplays_table \
    >> [load_user_dimension_table, load_song_dimension_table,
        load_artist_dimension_table, load_time_dimension_table] \
    >> run_quality_checks \
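# Many snippets in this section pass plain `tables` (and here, per-table
# `columns`) instead of explicit check dicts. A minimal sketch of what such
# an operator's execute body could do; the helper name and PostgresHook-style
# `hook` argument are assumptions.
def run_table_checks(hook, tables, columns=None):
    for table in tables:
        # Every table must contain at least one row
        records = hook.get_records(f"SELECT COUNT(*) FROM {table}")
        if not records or not records[0] or records[0][0] < 1:
            raise ValueError(f"Data quality check failed: {table} returned no rows")
        # Optionally, the listed columns must contain no NULLs
        for column in (columns or {}).get(table, []):
            nulls = hook.get_records(
                f"SELECT COUNT(*) FROM {table} WHERE {column} IS NULL")
            if nulls[0][0] > 0:
                raise ValueError(
                    f"Data quality check failed: {table}.{column} has "
                    f"{nulls[0][0]} NULL values")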
def stage_dim_s3_to_redshift(
        parent_dag_name,
        child_dag_name,
        start_date,
        end_date,
        schedule_interval,
        redshift_conn_id,
        s3_data,
        create_sql,
        table,
        s3_bucket,
        s3_key,
        iam_role,
        region,
        file_format,
        *args, **kwargs):
    """
    Subdag used to create a dimension table, copy data from S3 into the
    Redshift dimension table, and lastly perform a data quality check.

    Keyword Arguments:
    parent_dag_name   -- Parent DAG name defined in the `main_dag.py` dag object
    child_dag_name    -- Child DAG name used to define the subdag ID
    start_date        -- DAG start date
    end_date          -- DAG end date
    schedule_interval -- DAG schedule interval (e.g. '@monthly')
    redshift_conn_id  -- Redshift connection ID (str)
    s3_data           -- S3 data name (str)
    create_sql        -- Create table query (str)
    table             -- Dimension table name (str)
    s3_bucket         -- AWS S3 bucket name (str)
    s3_key            -- AWS S3 bucket data directory/file (str)
    iam_role          -- IAM role used for the Redshift COPY (str)
    region            -- Redshift cluster configured region (str)
    file_format       -- File format for AWS S3 files (currently only: 'JSON' or 'CSV') (str)
    """
    dag = DAG(
        dag_id=f"{parent_dag_name}.{child_dag_name}",
        start_date=start_date,
        end_date=end_date,
        schedule_interval=schedule_interval,
        **kwargs
    )

    start_task = DummyOperator(task_id=f'{table}', dag=dag)

    create_task = CreatedTableOperator(
        task_id=f'create_{table}_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        create_sql=create_sql.format(table),
        table=table,
        provide_context=True
    )

    copy_task = StageToRedshiftOperator(
        task_id=f'staging_{table}_table',
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        iam_role=iam_role,
        s3_data=s3_data,
        region=region,
        file_format=file_format,
        provide_context=True
    )

    check_task = DataQualityOperator(
        task_id=f'data_quality_check_{table}',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        provide_context=True
    )

    start_task >> create_task
    create_task >> copy_task
    copy_task >> check_task

    return dag
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials={'key': AWS_KEY, 'secret': AWS_SECRET},
    region='us-west-2',
    provide_context=True,
    dq_checks=[
        {'check_sql': "SELECT COUNT(*) FROM pleasurevisits WHERE pleasurevisit_id IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM flights WHERE flight_num IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM cities WHERE city IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM visitors WHERE adm_num IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM arrival WHERE arrival_date IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM departure WHERE dep_date IS NULL",
         'expected_result': 0},
    ])
    dag=dag)

load_airport_table = StageToRedshiftOperator(
    task_id='load_airport_table',
    redshift_conn_id='redshift',
    aws_credentials='aws_credentials',
    s3_bucket='amr-transformed',
    s3_key='airport.parquet',
    table='airports',
    copy_options='FORMAT AS PARQUET',
    provide_context=True,
    dag=dag)

# Data quality task
data_quality_task = DataQualityOperator(
    task_id='data_quality_task',
    redshift_conn_id='redshift',
    tables=['immigration', 'weather', 'states', 'airports', 'dates'],
    provide_context=True,
    dag=dag)

# Defining the dependencies
start_task >> run_emr_task
run_emr_task >> create_table_operator
create_table_operator >> load_dates_table
create_table_operator >> load_immigration_table
create_table_operator >> load_weather_table
create_table_operator >> load_state_table
create_table_operator >> load_airport_table
load_dates_table >> data_quality_task
load_immigration_table >> data_quality_task
load_weather_table >> data_quality_task
load_state_table >> data_quality_task
table="weather", file_format='CSV') s3_stations_to_redshift = StageToRedshiftOperator( task_id='s3_stations_to_redshift', dag=dag, conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket='ud-covid-citibike', s3_key='stations', table="stations", file_format='CSV') run_data_quality_checks = DataQualityOperator( task_id='run_data_quality_checks', dag=dag, provide_context=True, redshift_conn_id="redshift", tables=['stations', 'weather', 'covid', 'bike', 'dates']) middle_operator = DummyOperator(task_id='middle_dummy_operator', dag=dag) end_operator = DummyOperator(task_id='Stop_execution', dag=dag) start_operator >> [transform_citibike, transform_weather, transform_covid] [transform_citibike, transform_weather, transform_covid] >> middle_operator middle_operator >> [ upload_dates_to_s3, upload_covid_to_s3, upload_stations_to_s3, upload_weather_to_s3, upload_bike_to_s3 ]
load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    sql_query=SqlQueries.artist_table_insert,
    filter_expr=""
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    sql_query=SqlQueries.time_table_insert,
    filter_expr=""
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    table_list=['times', 'artists', 'songs', 'users', 'songplays']
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Use lists (not tuples) so Airflow's >> dependency overloads apply
start_operator >> [stage_songs_to_redshift, stage_events_to_redshift] >> load_songplays_table
load_songplays_table >> [load_user_dimension_table, load_song_dimension_table,
                         load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks
run_quality_checks >> end_operator
    redshift_conn_id='redshift',
    table='dim_time',
    sql_list=[SqlQueries.dim_arrdate_time_insert, SqlQueries.dim_depdate_time_insert],
    update_mode="overwrite",
    provide_context=True,
    dag=dag,
)

load_fact_visit = LoadFactOperator(
    task_id='Load_fact_visit_event_table',
    redshift_conn_id='redshift',
    table='fact_visit_event',
    sql=SqlQueries.fact_visit_insert,
    update_mode="update",
    provide_context=True,
    dag=dag,
)

data_quality_operator = DataQualityOperator(
    task_id='Run_data_quality_tables',
    redshift_conn_id='redshift',
    table_list=['dim_visitor', 'dim_time', 'dim_mode', 'dim_location', 'dim_port',
                'dim_address', 'dim_visa', 'dim_airline', 'fact_visit_event'],
    provide_context=True,
    dag=dag,
)

dim_list = [dim_mode_lookup_operator, dim_location_lookup_operator, dim_port_lookup_operator,
            dim_address_lookup_operator, dim_visa_lookup_operator, dim_airline_lookup_operator,
            load_dim_visitor, load_dim_time]

start_operator >> create_tables_operator >> stage_i94immi_operator >> dim_list \
    >> load_fact_visit >> data_quality_operator >> end_operator
        copy_stmt = sql_queries.SQLQueries.COPY_CSV_TABLE.format(**PARAMS)
    elif table in sql_queries.General.PARQUET_TABLES:
        PARAMS['s3_uri'] = 's3://{base_bucket}/i94_parquet_data'.format(**PARAMS)
        copy_stmt = sql_queries.SQLQueries.COPY_PARQUET_TABLE.format(**PARAMS)
    else:
        logging.info(f"WARNING: Unable to COPY {table}")
        continue

    # COPY task
    task_copy_table = PostgresOperator(
        task_id=f"copy_{table}",
        postgres_conn_id="redshift",
        sql=copy_stmt,
        dag=dag
    )
    # Note: this logs at DAG-parse time, when the task is defined,
    # not when the COPY actually succeeds.
    logging.info(f"Successfully Copied {table}")

    # Data quality check task
    task_data_quality = DataQualityOperator(
        task_id=f"data_quality_check_on_{table}",
        redshift_conn_id="redshift",
        table=table,
        dag=dag
    )

    task_write_sas_codes_to_s3 >> task_create_table
    task_create_table >> task_copy_table
    task_copy_table >> task_data_quality
    task_data_quality >> etl_success

etl_begin >> task_write_sas_codes_to_s3
def stage_fact_s3_to_redshift(
        parent_dag_name,
        child_dag_name,
        start_date,
        end_date,
        schedule_interval,
        redshift_conn_id,
        degree_list,
        s3_data,
        create_sql,
        s3_bucket,
        s3_key,
        iam_role,
        region,
        file_format,
        extra_copy_parameters='',
        *args, **kwargs):
    """
    Subdag used to create a staging table, copy data from S3 to the staging
    table in Redshift, and lastly perform a data quality check.

    Keyword Arguments:
    parent_dag_name       -- Parent DAG name defined in the `main_dag.py` dag object
    child_dag_name        -- Child DAG name used to define the subdag ID
    start_date            -- DAG start date
    end_date              -- DAG end date
    schedule_interval     -- (e.g. '@monthly', '@weekly', etc.)
    redshift_conn_id      -- Redshift connection ID (str)
    degree_list           -- List of degree names (list)
    s3_data               -- S3 data name used to format the staging table name (str)
    create_sql            -- SQL used to create the staging table (str)
    s3_bucket             -- AWS S3 bucket name (str)
    s3_key                -- AWS S3 bucket data directory/file (str)
    iam_role              -- IAM role used for the Redshift COPY (str)
    region                -- Redshift cluster configured region (str)
    file_format           -- File format for AWS S3 files (currently only: 'JSON' or 'CSV') (str)
    extra_copy_parameters -- Extra parameters appended to the COPY statement (str)
    """
    dag = DAG(
        dag_id=f"{parent_dag_name}.{child_dag_name}",
        start_date=start_date,
        end_date=end_date,
        schedule_interval=schedule_interval,
        **kwargs
    )

    for degree in degree_list:
        table = f'{degree}_{s3_data}'
        error_table = f'{table}_errors'

        start_task = DummyOperator(task_id=f'{degree}', dag=dag)

        create_task = CreatedTableOperator(
            task_id=f'create_{table}_table',
            dag=dag,
            redshift_conn_id=redshift_conn_id,
            create_sql=create_sql.format(table),
            table=table,
            provide_context=True
        )

        copy_task = StageToRedshiftOperator(
            task_id=f'staging_{table}_table',
            dag=dag,
            table=table,
            redshift_conn_id=redshift_conn_id,
            s3_bucket=s3_bucket,
            s3_key=s3_key,
            iam_role=iam_role,
            s3_data=s3_data,
            degree=degree,
            region=region,
            file_format=file_format,
            extra_copy_parameters=extra_copy_parameters,
            provide_context=True
        )

        # Push count to XCom for the STL count comparison
        count_check_task = DataQualityOperator(
            task_id=f'data_quality_check_{table}',
            dag=dag,
            redshift_conn_id=redshift_conn_id,
            table=table,
            provide_context=True
        )

        check_stl_branch = STLCheckOperator(
            task_id=f'stl_check_{table}',
            dag=dag,
            table=table,
            error_table=error_table,
            redshift_conn_id=redshift_conn_id
        )

        staging_success_task = PythonOperator(
            task_id=f'staging_success_check_{table}',
            python_callable=staging_success_check,
            op_kwargs={'redshift_conn_id': redshift_conn_id,
                       'table': table,
                       'error_table': error_table},
            dag=dag,
            provide_context=True
        )

        start_task >> create_task
        create_task >> copy_task
        copy_task >> [check_stl_branch, count_check_task]
        check_stl_branch >> staging_success_task

    return dag
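# The staging_success_check callable referenced above isn't shown. A plausible
# sketch, assuming it compares the staged row count against the error table;
# the function body and PostgresHook usage are assumptions, not the author's code.
from airflow.hooks.postgres_hook import PostgresHook


def staging_success_check(redshift_conn_id, table, error_table, **context):
    redshift = PostgresHook(postgres_conn_id=redshift_conn_id)
    loaded = redshift.get_records(f"SELECT COUNT(*) FROM {table}")[0][0]
    errored = redshift.get_records(f"SELECT COUNT(*) FROM {error_table}")[0][0]
    if errored > 0:
        raise ValueError(f"{errored} load errors recorded for {table}")
    if loaded < 1:
        raise ValueError(f"Staging produced no rows in {table}")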
postgres_conn_id = "redshift_dend", sql = SqlQueries.load_table['iso_country_region'], dag = dag ) load_us_cities_demographics = PostgresOperator( task_id = "load_us_cities_demographics", postgres_conn_id = "redshift_dend", sql = SqlQueries.load_table['us_cities_demographics'], dag = dag ) data_quality_check = DataQualityOperator( task_id = "data_quality_check", redshift_conn_id = "redshift_dend", tables = [key for key in SqlQueries.create_table if 'stage' not in key], trigger_rule = "all_done", dag = dag ) local_file_archive = PythonOperator( task_id = 'local_file_archive', python_callable = local_file_archive, dag = dag ) s3_file_archive = BashOperator( task_id = 's3_file_archive', bash_command = "aws s3 mv s3://bucket-vincent/ s3://bucket-vincent-archive/{{ ds_nodash }}/ --recursive", dag = dag )
    append=False,
    dag=dag
)

load_time_table = LoadDimensionOperator(
    task_id='load_time_dimension_table',
    redshift_conn_id='redshift',
    table='time',
    append=True,
    dag=dag
)

run_quality_checks_fact = DataQualityOperator(
    task_id='Run_data_quality_checks_fact',
    redshift_conn_id="redshift",
    execution_time='{{ ts_nodash }}',
    table_to_check='fact',
    dag=dag
)

run_quality_checks_dimension = DataQualityOperator(
    task_id='Run_data_quality_checks_dimension',
    redshift_conn_id="redshift",
    execution_time='{{ ts_nodash }}',
    table_to_check='dimension',
    dag=dag
)

calculate_daily_carpark_stats = DailyFactsCalculatorOperator(
    task_id="calculate_and_create_daily_carpark_availability_table",
    dag=dag,
    table='artists',
    redshift_conn_id='redshift',
    aws_conn_id='aws_credentials',
    insert_sql_qry=SqlQueries.artist_table_insert)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table='time',
    redshift_conn_id='redshift',
    aws_conn_id='aws_credentials',
    insert_sql_qry=SqlQueries.time_table_insert)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    tables=['songplays', 'users', 'songs', 'artists', 'time'],
    redshift_conn_id='redshift')

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Task ordering:
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table="time",
    redshift_conn_id='redshift',
    depends_on_past=False,
    retries=3,
    retry_delay=timedelta(minutes=5),
    email_on_retry=False)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    table='songplays',
    fields=['user_id', 'song_id', 'artist_id', 'start_time'],
    redshift_conn_id='redshift',
    depends_on_past=False,
    retries=3,
    retry_delay=timedelta(minutes=5),
    email_on_retry=False)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_user_dimension_table
load_arrival_date = LoadDimensionOperator(
    task_id="load_arrival_date",
    dag=dag,
    redshift_conn_id=REDSHIFT_CONN_ID,
    schema=SCHEMA_NAME,
    table="arrival_date",
    insert_sql=SqlQueries.arrival_date_insert)

# Data quality: get the dq_checks settings from
# [airflow_file]/plugins/helpers/dq_check_settings.json
airflow_file = pathlib.Path(__file__).parent.parent.absolute()
dq_check_settings = os.path.join(airflow_file, "plugins", "helpers",
                                 "dq_check_settings.json")
with open(dq_check_settings) as json_file:
    dq_checks = json.load(json_file)
dq_checks = dq_checks['dq_checks']

run_quality_checks = DataQualityOperator(
    task_id="run_data_quality_checks",
    dag=dag,
    redshift_conn_id=REDSHIFT_CONN_ID,
    dq_checks=dq_checks)

end_operator = DummyOperator(task_id="end_execution", dag=dag)

# Task dependencies
start_operator >> [
    copy_immigration_data, copy_usa_port, copy_travel_way, copy_visa_code,
    copy_country_code
] >> load_usa_travelers_info
load_usa_travelers_info >> load_arrival_date >> run_quality_checks >> end_operator
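# The contents of dq_check_settings.json aren't shown. Since its 'dq_checks'
# key feeds the same operator parameter used elsewhere in this section, it
# plausibly holds check_sql/expected_result pairs; the example below is an
# assumption, not the project's actual file:
#
# {
#   "dq_checks": [
#     {"check_sql": "SELECT COUNT(*) FROM arrival_date WHERE arrival_date IS NULL",
#      "expected_result": 0}
#   ]
# }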
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='artists',
    sql_query=SqlQueries.artist_table_insert,
    append_insert=True,
    primary_key="artistid"
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='time',
    sql_query=SqlQueries.time_table_insert
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    dq_checks=[
        {'check_sql': "SELECT COUNT(*) FROM users WHERE userid IS NULL",
         'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM songs WHERE songid IS NULL",
         'expected_result': 0}
    ]
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> create_tables >> [stage_events_to_redshift, stage_songs_to_redshift] \
    >> load_songplays_table >> [load_user_dimension_table, load_song_dimension_table,
                                load_artist_dimension_table, load_time_dimension_table] \
    >> run_quality_checks >> end_operator
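# The snippets above spell truncate-vs-append behavior many ways (truncate,
# truncate_first, append_only, mode='append', append_insert + primary_key).
# A minimal sketch of the common pattern, assuming Airflow 1.10-style imports;
# every name here is illustrative, not taken from any one project.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class LoadDimensionOperator(BaseOperator):

    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", sql_query="",
                 truncate=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table
        self.sql_query = sql_query
        self.truncate = truncate

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        if self.truncate:
            # Truncate-insert: wipe the dimension before reloading it
            redshift.run(f"TRUNCATE TABLE {self.table}")
        # Append the SELECT results into the dimension table
        redshift.run(f"INSERT INTO {self.table} {self.sql_query}")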