def stage_s3_to_redshift_dag(
    parent_dag_name: str,
    task_id: str,
    redshift_conn_id: str = "",
    aws_credentials_id: str = "",
    target_table: str = "",
    s3_bucket: Optional[str] = None,
    s3_key: Optional[str] = None,
    json_path: Optional[str] = None,
    ignore_headers: Optional[int] = None,
    delimiter: Optional[str] = None,
    default_args: Optional[Dict[str, Any]] = None,
    *args,
    **kwargs,
):
    dag = DAG(dag_id=f"{parent_dag_name}.{task_id}",
              default_args=default_args,
              **kwargs)

    stage_events_to_redshift = StageToRedshiftOperator(
        task_id=f"{parent_dag_name}.Stage_events",
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        target_table=target_table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        json_path=json_path,
        ignore_headers=ignore_headers,
        delimiter=delimiter,
        dag=dag,
        *args,
        **kwargs)

    validation_songplays = DataQualityValidator(
        sql_statement=f"SELECT COUNT(*) FROM {target_table}",
        result_to_assert=0,
        should_assert_for_equality=False,
    )

    check_data_task = DataQualityOperator(
        task_id=f"{parent_dag_name}.Data_Quality_Check",
        redshift_conn_id=redshift_conn_id,
        data_quality_validations=[validation_songplays],
        dag=dag,
    )

    stage_events_to_redshift >> check_data_task

    return dag
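
# A hypothetical usage sketch (not part of the original example): the factory
# above returns a DAG whose dag_id is "<parent>.<task_id>", which is the naming
# convention SubDagOperator expects. The names `parent_dag`, `default_args` and
# the S3/Redshift values below are illustrative assumptions.
from airflow.operators.subdag_operator import SubDagOperator

stage_events_subdag = SubDagOperator(
    task_id="stage_events_subdag",
    subdag=stage_s3_to_redshift_dag(
        parent_dag_name=parent_dag.dag_id,
        task_id="stage_events_subdag",
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        target_table="staging_events",
        s3_bucket="udacity-dend",
        s3_key="log_data",
        json_path="s3://udacity-dend/log_json_path.json",
        default_args=default_args,
    ),
    dag=parent_dag,
)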
Example #2
    truncate=False,
    dag=dag
)

# Calling LoadDimensionOperator to load the data into time dimension table
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    target_table='time',
    redshift_connection_id='redshift',
    sql_statement=SqlQueries.time_table_insert,
    truncate=False,
    dag=dag
)

# Runs the QualityCheck via DataQualityOperator on all fact and dimension tables once the data is loaded
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    redshift_connection_id='redshift',
    target_table=("songplays", "users", "songs", "artists", "time"),
    validate_column=("playid", "userid", "song_id", "artist_id", "start_time"),
    dag=dag
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

# Task order
start_operator >> [stage_log_events_to_redshift, stage_songs_to_redshift]
[stage_log_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table]
[load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks
run_quality_checks >> end_operator
    task_id='Load_time_dim_table',
    redshift_conn_id = 'redshift',
    table="time",
    sql_query = SqlQueries.time_table_insert,
    dag=dag,
    append_only=False
)

"""
connecting to redshift
running the DataQualityOperator operator
"""
run_quality_checks = DataQualityOperator(

    task_id='run_data_quality_checks',
    redshift_conn_id="redshift",
    dag=dag,
    tables=["songplay", "users", "song", "artist", "time"]
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

"""
Order of dags; starting with the start_operator and ending with the end_operator
"""
start_operator >> create_tables_in_redshift
create_tables_in_redshift >> [stage_songs_to_redshift, stage_events_to_redshift] >> load_songplays_table

load_songplays_table >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks >> end_operator
Example #4
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    conn_id='redshift',
    sql=SqlQueries.time_table_insert,
    target_table='time',
    truncate_first=True)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    conn_id='redshift',
    dq_checks=[{
        'check_sql': "SELECT COUNT(*) FROM songplays WHERE start_time IS NULL",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM users WHERE userid IS NULL",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM songs WHERE songid IS NULL",
        'expected_result': 0
    }])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> create_tables

create_tables >> stage_events_to_redshift
create_tables >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
load_cities_dimension_table = LoadDimensionOperator(
    task_id='Load_cities_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    sql_query=insert_queries['cities'],
    filter_expr="",
    mode='append')

load_times_dimension_table = LoadDimensionOperator(
    task_id='Load_times_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    sql_query=insert_queries['times'],
    filter_expr="",
    mode='append')

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    table_list=['immigration_facts', 'states', 'cities', 'times'])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> (stage_immigration_to_redshift, stage_demography_to_redshift
                   ) >> load_immigration_facts_table
load_immigration_facts_table >> (
    load_states_dimension_table, load_cities_dimension_table,
    load_times_dimension_table) >> run_quality_checks
run_quality_checks >> end_operator
    sql_statement=SqlQueries.time_table_insert,
    provide_context=True)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials={
        'key': AWS_KEY,
        'secret': AWS_SECRET
    },
    region='us-west-2',
    provide_context=True,
    dq_checks=[{
        'check_sql': "SELECT COUNT(*) FROM songplays WHERE playid is null",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM users WHERE userid is null",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM artists WHERE artistid is null",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM songs WHERE songid is null",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM time WHERE start_time is null",
        'expected_result': 0
    }])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)
Example #7
    sql=sql_queries_cloud.pres_staging_table_create)

pres_staging_table_populate = StageToRedshiftOperator(
    task_id="pres_staging_table_populate",
    dag=dag,
    provide_context=True,
    redshift_conn_id="my_redshift_conn",
    aws_credentials_id="my_aws_conn",
    table="pres_staging_table",
    s3_bucket="prescribing-data",
    s3_key="{{ execution_date.year }}_{{ ds[5:7] }}/T{{ execution_date.year }}{{ ds[5:7] }}PDPI_BNFT",
    header=True)

pres_fact_table_insert = PostgresOperator(
    task_id="pres_fact_table_insert",
    dag=dag,
    postgres_conn_id="my_redshift_conn",
    sql=sql_queries_cloud.pres_fact_table_insert)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    conn_id="my_redshift_conn",
    quality_checks=sql_queries_cloud.quality_tests)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> drop_staging_tables >> pres_staging_table_create
pres_staging_table_create >> pres_staging_table_populate >> pres_fact_table_insert
pres_fact_table_insert >> run_quality_checks >> end_operator
load_time_table = LoadFactOperator(task_id='Load_time_dim_table',
                                   dag=dag,
                                   conn_id='redshift',
                                   sql=SqlQueries.time_table_insert,
                                   target_table='time')

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    conn_id='redshift',
    dq_checks=[{
        'check_sql': "SELECT COUNT(*) FROM arrivals WHERE cicid IS NULL",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM admissions WHERE admnum IS NULL",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM time WHERE arrdate IS NULL",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM states WHERE state_code IS NULL",
        'expected_result': 0
    }])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> create_tables

create_tables >> [
Example #9
    truncate=False)

# Load Dimension table
load_demographic_dimension_table = LoadDimensionOperator(
    task_id='load_demographic_dimension_table',
    dag=dag,
    redshift_conn_id="redshift",
    table_name="demographic",
    sql_insert_stmt=SqlQueries.demographic_table_insert,
    truncate=False)

end_loading = DummyOperator(task_id='end_loading', dag=dag)

run_quality_checks_on_fact_table = DataQualityOperator(
    task_id='run_quality_checks_on_fact_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="fact_temperature")

run_quality_checks_on_time_table = DataQualityOperator(
    task_id='run_quality_checks_on_time_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="time")

run_quality_checks_on_airport_table = DataQualityOperator(
    task_id='run_quality_checks_on_airport_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="airport")
Example #10
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn='redshift',
    table='time',
    sql=SqlQueries.time_table_insert,
    provide_context=False,
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn='redshift',
    quality_checks=[{
        'check_sql': "SELECT COUNT(*) FROM users",
        'expected_result': 140
    }],
    provide_context=False,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> [stage_events_to_redshift, stage_songs_to_redshift]
[stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
load_songplays_table >> [
    load_user_dimension_table, load_song_dimension_table,
    load_artist_dimension_table, load_time_dimension_table
]
[
    load_user_dimension_table, load_artist_dimension_table,
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    redshift_conn_id="redshift",
    table='time',
    params={
        'append_flag': Variable.get('append_flag',
                                    default_var=default_append_flag)
    },
    dag=dag)
# TODO: Implement the checks as a SubDag.
run_quality_checks = DummyOperator(task_id='Run_checks_execution', dag=dag)

run_empty_tables = DataQualityOperator(
    task_id='Empty_tables_test',
    redshift_conn_id="redshift",
    sql_commands=SqlQueries.cardinality_queries,
    check_function=lambda x: x > 0,
    dag=dag,
)

run_dangling_keys = DataQualityOperator(
    task_id='Foreign_Key_violation',
    redshift_conn_id="redshift",
    sql_commands=SqlQueries.foreign_key_queries,
    check_function=lambda x: x < 1,
    dag=dag,
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

loading_phase_operator = DummyOperator(task_id='Loading', dag=dag)
                                         redshift_conn_id="redshift_conn_id",
                                         table='songs',
                                         append=True,
                                         dag=dag)

check_data_quality = DataQualityOperator(
    task_id='check_data_quality',
    dag=dag,
    redshift_conn_id='redshift_conn_id',
    dq_checks=[
        {
            'check_sql':
            "SELECT COUNT(*) FROM public.songs_dwh WHERE artist is null",
            'expected_result': 0
        },
        {
            'check_sql':
            "SELECT COUNT(*) FROM public.artists_dwh WHERE artists is null",
            'expected_result': 0
        },
        {
            'check_sql':
            "SELECT COUNT(*) FROM public.concerts_dwh WHERE artist is null",
            'expected_result': 0
        },
    ])

end_operator = DummyOperator(task_id='stop_execution', dag=dag)

start_operator >> export_events_to_s3
start_operator >> export_songs_to_s3
load_complaint_data = S3ToRedshiftTransfer(
    task_id='load_complaint_data',
    schema='public',
    table='fact_complaints',
    s3_bucket=Variable.get('aws_bucket'),
    s3_key='processed',
    redshift_conn_id='redshift_conn',
    aws_conn_id='aws_credentials',
    copy_options=['csv', "IGNOREHEADER 1"],
    dag=dag)

data_quality_query_one = "SELECT COUNT(*) FROM fact_complaints WHERE created_date is null"

data_quality_check_one = DataQualityOperator(
    task_id='data_quality_check_one',
    dag=dag,
    redshift_conn_id="redshift",
    data_quality_check=data_quality_query_one)

data_quality_query_two = "SELECT COUNT(*) FROM fact_complaints WHERE community_board is null"

data_quality_check_two = DataQualityOperator(
    task_id='data_quality_check_two',
    dag=dag,
    redshift_conn_id="redshift_conn",
    data_quality_check=data_quality_query_two)

end_operator = DummyOperator(task_id='execution_complete', dag=dag)

start_operator >> [
    get_weather_data, upload_demographics_data, upload_complaint_data
    dag=dag)

load_demographics_dim_to_redshift = LoadParquetToRedshift(
    task_id='load_demographics_dim',
    redshift_conn_id='redshift',
    table='demographics_dim',
    aws_credentials_id='aws_credentials',
    s3_bucket='data-engineering-nd',
    s3_key='capstone-project/output/demographics_dim.parquet/',
    dag=dag)

run_quality_checks = DataQualityOperator(
    task_id='run_quality_checks',
    redshift_connection_id='redshift',
    target_table=("immig_fact", "visa_type_dim", "travel_mode_dim", "time_dim",
                  "airport_dim", "country_temperature_dim",
                  "demographics_dim"),
    validate_column=("cicid", "visa_category_id", "travel_mode_id",
                     "arrival_sas", "airport_code", "country_code",
                     "stateCode"),
    dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> copy_files_task
copy_files_task >> [
    immigration_etl_task, airport_etl_task, temperature_etl_task,
    demographics_etl_task
]
[
    immigration_etl_task, airport_etl_task, temperature_etl_task,
    demographics_etl_task
Example #15
# COPY SELECTED DATA FROM STAGING TABLE TO FACT TABLE
load_gdelt_fact_table = stage2table(
    task_id='load_gdelt_fact_table',
    dag=dag,
    redshift_conn_id="redshift",
    target_table="gdelt_events",
    target_columns=sql_statements.gdelt_fact_columns,
    insert_mode="append",  # delete_load/append
    query=sql_statements.gdelt_events_table_insert)

# RUN DATA QUALITY CHECKS
run_quality_checks = DataQualityOperator(
    task_id='run_gdelt_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    tests=[
        (sql_statements.gdelt_check_nulls,
         "{}[0][0] == 0"),  # there are no NULL values in significant fields
        (sql_statements.gdelt_num_records,
         "{}[0][0] >= 100"),  # it would be unusual to see fewer than 100 events
    ])
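
# NOTE (an assumption, not the custom operator's actual source): given the
# "{}[0][0] == 0" convention above, DataQualityOperator presumably runs each
# SQL statement, formats the fetched records into the expression, and
# evaluates it, roughly:
#
#     records = redshift_hook.get_records(check_sql)
#     if not eval(expression.format(records)):
#         raise ValueError(f"Data quality check failed: {check_sql}")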

# CLEAR STAGING TABLE for a selected day only, as there might be multiple concurrent processes running
clearing_staging_gdelt_events = PythonOperator(
    task_id='clearing_staging_gdelt_events',
    dag=dag,
    python_callable=redshift_date_delete_rows,
    provide_context=True,
    op_kwargs={
        'redshift_conn_id': 'redshift',
        'target_table': 'staging_gdelt_events'
    },
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table='time',
    redshift_conn_id="redshift",
    truncate_table=True,
    select_sql=SqlQueries.time_table_insert,
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    tables=['songplays', 'users', 'songs', 'artists', 'time'],
    columns={
        'songplays': ['userid', 'playid'],
        'users': ['first_name', 'last_name'],
        'songs': ['songid', 'title', 'duration'],
        'artists': ['artistid', 'name'],
        'time': ['hour', 'day', 'week', 'month'],
    },
    redshift_conn_id="redshift",
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator \
    >> create_tables \
    >> [stage_events_to_redshift, stage_songs_to_redshift] \
    >> load_songplays_table \
    >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table] \
    >> run_quality_checks \
    >> end_operator
def stage_dim_s3_to_redshift(
    parent_dag_name,
    child_dag_name,
    start_date,
    end_date,
    schedule_interval,
    redshift_conn_id,
    s3_data,
    create_sql,
    table,
    s3_bucket,
    s3_key,
    iam_role,
    region,
    file_format,
    *args, **kwargs):

    """
    Subdag used to create dimension table, copy data from s3 to Redshift dimension table and lastly perform a data quality check.

    Keyword Arguments:
    parent_dag_name -- Parent DAG name defined in `main_dag.py` dag object
    child_dag_name -- Child DAG name used to define subdag ID
    redshift_conn_id   -- Redshift connection ID (str)
    aws_credentials_id -- AWS connection ID (str)
    table -- Staging table name (str)
    create_sql -- Create staging table query (str)
    s3_bucket -- AWS S3 bucket name (str)
    s3_key -- AWS S3 bucket data directory/file (str)
    region -- Redshift cluster configured region (str)
    file_format -- File format for AWS S3 files  (currently only: 'JSON' or 'CSV') (str)
    """

    dag = DAG(
        dag_id=f"{parent_dag_name}.{child_dag_name}",
        start_date=start_date,
        end_date=end_date,
        schedule_interval=schedule_interval,
        **kwargs
    )

    start_task = DummyOperator(task_id=f'{table}',  dag=dag)
    
    create_task = CreatedTableOperator(
        task_id=f'create_{table}_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        create_sql=create_sql.format(table),
        table=table,
        provide_context=True
    )

    copy_task = StageToRedshiftOperator(
        task_id=f'staging_{table}_table',
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        iam_role=iam_role,
        s3_data=s3_data, 
        region=region,
        file_format=file_format,
        provide_context=True
    )

    check_task = DataQualityOperator(
        task_id=f'data_quality_check_{table}',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        provide_context=True
    )

    start_task >> create_task
    create_task >> copy_task
    copy_task >> check_task

    return dag
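
# A sketch of how this subdag factory could be wired into the parent DAG with
# SubDagOperator (illustrative assumptions only: `main_dag`,
# `SqlQueries.create_airport_table`, `IAM_ROLE_ARN` and the S3 values are not
# part of the original example).
from airflow.operators.subdag_operator import SubDagOperator

stage_airport_dim = SubDagOperator(
    task_id="stage_airport_dim",
    subdag=stage_dim_s3_to_redshift(
        parent_dag_name=main_dag.dag_id,
        child_dag_name="stage_airport_dim",
        start_date=main_dag.start_date,
        end_date=main_dag.end_date,
        schedule_interval=main_dag.schedule_interval,
        redshift_conn_id="redshift",
        s3_data="airport_codes",
        create_sql=SqlQueries.create_airport_table,
        table="airport",
        s3_bucket="my-capstone-bucket",
        s3_key="airport_codes.csv",
        iam_role=IAM_ROLE_ARN,
        region="us-west-2",
        file_format="CSV",
    ),
    dag=main_dag,
)
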
run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials={
        'key': AWS_KEY,
        'secret': AWS_SECRET
    },
    region='us-west-2',
    provide_context=True,
    dq_checks=[{
        'check_sql':
        "SELECT COUNT(*) FROM pleasurevisits WHERE pleasurevisit_id is null",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM flights WHERE flight_num is null",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM cities WHERE city is null",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM visitors WHERE adm_num is null",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM arrival WHERE arrival_date is null",
        'expected_result': 0
    }, {
        'check_sql': "SELECT COUNT(*) FROM departure WHERE dep_date is null",
        'expected_result': 0
    }])
                                           dag=dag)

load_airport_table = StageToRedshiftOperator(task_id='load_airport_table',
                                             redshift_conn_id='redshift',
                                             aws_credentials='aws_credentials',
                                             s3_bucket='amr-transformed',
                                             s3_key='airport.parquet',
                                             table='airports',
                                             copy_options='FORMAT AS PARQUET',
                                             provide_context=True,
                                             dag=dag)

#Data quality task
data_quality_task = DataQualityOperator(
    task_id='data_quality_task',
    redshift_conn_id='redshift',
    tables=['immigration', 'weather', 'states', 'airports', 'dates'],
    provide_context=True,
    dag=dag)

#defining the dependencies
start_task >> run_emr_task
run_emr_task >> create_table_operator
create_table_operator >> load_dates_table
create_table_operator >> load_immigration_table
create_table_operator >> load_weather_table
create_table_operator >> load_state_table
create_table_operator >> load_airport_table
load_dates_table >> data_quality_task
load_immigration_table >> data_quality_task
load_weather_table >> data_quality_task
load_state_table >> data_quality_task
Example #20
    table="weather",
    file_format='CSV')

s3_stations_to_redshift = StageToRedshiftOperator(
    task_id='s3_stations_to_redshift',
    dag=dag,
    conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket='ud-covid-citibike',
    s3_key='stations',
    table="stations",
    file_format='CSV')

run_data_quality_checks = DataQualityOperator(
    task_id='run_data_quality_checks',
    dag=dag,
    provide_context=True,
    redshift_conn_id="redshift",
    tables=['stations', 'weather', 'covid', 'bike', 'dates'])

middle_operator = DummyOperator(task_id='middle_dummy_operator', dag=dag)
end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> [transform_citibike, transform_weather, transform_covid]

[transform_citibike, transform_weather, transform_covid] >> middle_operator

middle_operator >> [
    upload_dates_to_s3, upload_covid_to_s3, upload_stations_to_s3,
    upload_weather_to_s3, upload_bike_to_s3
]
Example #21
load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    sql_query=SqlQueries.artist_table_insert,
    filter_expr=""
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    sql_query=SqlQueries.time_table_insert,
    filter_expr=""
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    table_list = ['times', 'artists', 'songs', 'users', 'songplays']
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

start_operator >> (stage_songs_to_redshift, stage_events_to_redshift) >> load_songplays_table
load_songplays_table >> (load_user_dimension_table,
                         load_song_dimension_table,
                         load_artist_dimension_table,
                         load_time_dimension_table) >> run_quality_checks
run_quality_checks >> end_operator
    redshift_conn_id='redshift',
    table='dim_time',
    sql_list = [SqlQueries.dim_arrdate_time_insert,SqlQueries.dim_depdate_time_insert],
    update_mode="overwrite",
    provide_context=True,
    dag = dag,
)

load_fact_visit = LoadFactOperator(
    task_id='Load_fact_visit_event_table',
    redshift_conn_id='redshift',
    table='fact_visit_event',
    sql = SqlQueries.fact_visit_insert,
    update_mode="update",
    provide_context=True,
    dag = dag,
)

data_quality_operator = DataQualityOperator(
    task_id='Run_data_quality_tables',
    redshift_conn_id='redshift',
    table_list=['dim_visitor','dim_time','dim_mode','dim_location','dim_port','dim_address','dim_visa','dim_airline','fact_visit_event'],
    provide_context=True,
    dag = dag,
)


dim_list = [
    dim_mode_lookup_operator, dim_location_lookup_operator,
    dim_port_lookup_operator, dim_address_lookup_operator,
    dim_visa_lookup_operator, dim_airline_lookup_operator,
    load_dim_visitor, load_dim_time,
]


start_operator >> create_tables_operator >> stage_i94immi_operator >> dim_list >> load_fact_visit >> data_quality_operator >> end_operator
        copy_stmt = sql_queries.SQLQueries.COPY_CSV_TABLE.format(**PARAMS)
    elif table in sql_queries.General.PARQUET_TABLES:
        PARAMS['s3_uri'] = ('s3://{base_bucket}/i94_parquet_data'.format(**PARAMS))
        copy_stmt = sql_queries.SQLQueries.COPY_PARQUET_TABLE.format(**PARAMS)
    else:
        logging.info(f"WARNING: Unable to COPY {table}")
        continue

    # COPY task
    task_copy_table = PostgresOperator(
        task_id=f"copy_{table}",
        postgres_conn_id="redshift",
        sql=copy_stmt,
        dag=dag
    )
    logging.info(f"Successfully Copied {table}")

    # Data Quality Check Task
    task_data_quality = DataQualityOperator(
        task_id=f"data_quality_check_on_{table}",
        redshift_conn_id="redshift",
        table=table,
        dag=dag
    )

    task_write_sas_codes_to_s3 >> task_create_table
    task_create_table >> task_copy_table
    task_copy_table >> task_data_quality
    task_data_quality >> etl_success

etl_begin >> task_write_sas_codes_to_s3
Example #24
def stage_fact_s3_to_redshift(
    parent_dag_name,
    child_dag_name,
    start_date,
    end_date,
    schedule_interval,
    redshift_conn_id,
    degree_list,
    s3_data,
    create_sql,
    s3_bucket,
    s3_key,
    iam_role,
    region,
    file_format,
    extra_copy_parameters='',
    *args, **kwargs):

    """
    Subdag used to create staging table, copy data from s3 to staging table in redshift and lastly perform a data quality check.

    Keyword Arguments:
    parent_dag_name -- Parent DAG name defined in `main_dag.py` dag object
    child_dag_name -- Child DAG name used to define subdag ID
    start_date -- DAG start date
    end_date -- DAG end date
    schedule_interval -- (e.g. '@monthly', '@weekly', etc.)
    redshift_conn_id   -- Redshift connection ID (str)
    degree_list -- List of degree names (list)
    aws_credentials_id -- AWS connection ID (str)
    s3_bucket -- AWS S3 bucket name (str)
    s3_date -- S3 data name used to format staging table name
    create_sql -- SQL used to create staging table 
    s3_key -- AWS S3 bucket data directory/file (str)
    region -- Redshift cluster configured region (str)
    file_format -- File format for AWS S3 files  (currently only: 'JSON' or 'CSV') (str)
    """

    dag = DAG(
        dag_id=f"{parent_dag_name}.{child_dag_name}",
        start_date=start_date,
        end_date=end_date,
        schedule_interval=schedule_interval,
        **kwargs
    )

    for degree in degree_list:
        table = f'{degree}_{s3_data}'
        error_table = f'{table}_errors'

        start_task = DummyOperator(task_id=f'{degree}',  dag=dag)

        create_task = CreatedTableOperator(
            task_id=f'create_{table}_table',
            dag=dag,
            redshift_conn_id=redshift_conn_id,
            create_sql=create_sql.format(table),
            table=table,
            provide_context=True
        )

        copy_task = StageToRedshiftOperator(
            task_id=f'staging_{table}_table',
            dag=dag,
            table=table,
            redshift_conn_id=redshift_conn_id,
            s3_bucket=s3_bucket,
            s3_key=s3_key,
            iam_role=iam_role,
            s3_data=s3_data, 
            degree=degree,
            region=region,
            file_format=file_format,
            extra_copy_parameters=extra_copy_parameters,
            provide_context=True
            )

        #push count to xcom for stl count comparison
        count_check_task = DataQualityOperator(
            task_id=f'data_quality_check_{table}',
            dag=dag,
            redshift_conn_id=redshift_conn_id,
            table=table,
            provide_context=True
        )

        check_stl_branch = STLCheckOperator(
            task_id=f'stl_check_{table}',
            dag=dag,
            table=table,
            error_table=error_table,
            redshift_conn_id=redshift_conn_id
        )

        staging_success_task = PythonOperator(
            task_id=f'staging_success_check_{table}',
            python_callable=staging_success_check,
            op_kwargs={'redshift_conn_id': redshift_conn_id, 'table': table, 'error_table': error_table},
            dag=dag,
            provide_context=True
        )

        start_task >> create_task
        create_task >> copy_task
        copy_task >> [check_stl_branch, count_check_task]
        check_stl_branch >> staging_success_task

    return dag
    postgres_conn_id = "redshift_dend",
    sql = SqlQueries.load_table['iso_country_region'],
    dag = dag
)

load_us_cities_demographics = PostgresOperator(
    task_id = "load_us_cities_demographics",
    postgres_conn_id = "redshift_dend",
    sql = SqlQueries.load_table['us_cities_demographics'],
    dag = dag
)

data_quality_check = DataQualityOperator(
    task_id = "data_quality_check",
    redshift_conn_id = "redshift_dend",
    tables = [key for key in SqlQueries.create_table if 'stage' not in key],
    trigger_rule = "all_done",
    dag = dag
)

local_file_archive = PythonOperator(
    task_id = 'local_file_archive',
    python_callable = local_file_archive,
    dag = dag
)

s3_file_archive = BashOperator(
    task_id = 's3_file_archive',
    bash_command = "aws s3 mv s3://bucket-vincent/ s3://bucket-vincent-archive/{{ ds_nodash }}/ --recursive",
    dag = dag
)
Example #26
        append=False,
        dag=dag        
)

load_time_table = LoadDimensionOperator(
        task_id='load_time_dimension_table',
        redshift_conn_id='redshift',
        table='time',
        append=True,
        dag=dag
)

run_quality_checks_fact = DataQualityOperator(
        task_id='Run_data_quality_checks_fact',
        redshift_conn_id="redshift",
        execution_time='{{ts_nodash}}',
        table_to_check='fact',
        dag=dag
)

run_quality_checks_dimension = DataQualityOperator(
        task_id='Run_data_quality_checks_dimension',
        redshift_conn_id="redshift",
        execution_time='{{ts_nodash}}',
        table_to_check='dimension',
        dag=dag
)

calculate_daily_carpark_stats = DailyFactsCalculatorOperator(
        task_id = "calculate_and_create_daily_carpark_availability_table",
        dag = dag,
    table='artists',
    redshift_conn_id='redshift',
    aws_conn_id='aws_credentials',
    insert_sql_qry=SqlQueries.artist_table_insert)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table='time',
    redshift_conn_id='redshift',
    aws_conn_id='aws_credentials',
    insert_sql_qry=SqlQueries.time_table_insert)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    tables=['songplays', 'users', 'songs', 'artists', 'time'],
    redshift_conn_id='redshift')

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# task ordering:
start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_user_dimension_table
load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_artist_dimension_table
load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    table="time",
    redshift_conn_id='redshift',
    depends_on_past=False,
    retries=3,
    retry_delay=timedelta(minutes=5),
    email_on_retry=False)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    table='songplays',
    fields=['user_id', 'song_id', 'artist_id', 'start_time'],
    redshift_conn_id='redshift',
    depends_on_past=False,
    retries=3,
    retry_delay=timedelta(minutes=5),
    email_on_retry=False)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift

stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table

load_songplays_table >> load_song_dimension_table
load_songplays_table >> load_user_dimension_table
Example #29
load_arrival_date = LoadDimensionOperator(
    task_id="load_arrival_date",
    dag=dag,
    redshift_conn_id=REDSHIFT_CONN_ID,
    schema=SCHEMA_NAME,
    table="arrival_date",
    insert_sql=SqlQueries.arrival_date_insert)

# Data Quality
# get the dq_checks_settings for data quality
# file: [airflow_file]/plugins/helpers/dq_check_settings.json
airflow_file = pathlib.Path(__file__).parent.parent.absolute()
dq_check_settings = os.path.join(airflow_file, "plugins", "helpers",
                                 "dq_check_settings.json")
with open(dq_check_settings) as json_file:
    dq_checks = json.load(json_file)
    dq_checks = dq_checks['dq_checks']
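
# The contents of dq_check_settings.json are not included in this example; based
# on how the file is consumed above and on the check_sql/expected_result
# convention used elsewhere, it presumably looks roughly like this (an assumed
# layout, not the original file):
#
# {
#     "dq_checks": [
#         {"check_sql": "SELECT COUNT(*) FROM arrival_date WHERE arrival_date IS NULL",
#          "expected_result": 0}
#     ]
# }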

run_quality_checks = DataQualityOperator(task_id="run_data_quality_checks",
                                         dag=dag,
                                         redshift_conn_id=REDSHIFT_CONN_ID,
                                         dq_checks=dq_checks)

end_operator = DummyOperator(task_id="end_execution", dag=dag)

# Task Dependencies
start_operator >> [
    copy_immigration_data, copy_usa_port, copy_travel_way, copy_visa_code,
    copy_country_code
] >> load_usa_travelers_info
load_usa_travelers_info >> load_arrival_date >> run_quality_checks >> end_operator
Example #30
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='artists',
    sql_query=SqlQueries.artist_table_insert,
    append_insert=True,
    primary_key="artistid"
)

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='time',
    sql_query=SqlQueries.time_table_insert
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    dq_checks=[
        {'check_sql': "SELECT COUNT(*) FROM users WHERE userid is null", 'expected_result': 0},
        {'check_sql': "SELECT COUNT(*) FROM songs WHERE songid is null", 'expected_result': 0}
    ]
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

start_operator >> create_tables >> [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks >> end_operator