def get_staging_to_dim(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        dest_table,
        truncate,
        insert_sql_query,
        validate_sql_query,
        equals=None,
        at_least=1,
        source_table="",
        *args, **kwargs):
    """Return a sub-DAG that loads one dimension table and then validates it.

    The sub-DAG is named ``<parent_dag_name>.<task_id>`` and wires two tasks
    in sequence: a LoadDimensionOperator (insert from staging, optionally
    truncating first) followed by a DataQualityOperator. ``equals`` and
    ``at_least`` are forwarded to the quality check; any remaining
    ``**kwargs`` are passed straight through to the DAG constructor.
    """
    sub_dag = DAG(
        f"{parent_dag_name}.{task_id}",
        **kwargs
    )

    # Insert staging rows into the target dimension table.
    load_dim = LoadDimensionOperator(
        dag=sub_dag,
        task_id=f"load_{dest_table}_dim_table",
        redshift_conn_id=redshift_conn_id,
        dest_table=dest_table,
        source_table=source_table,
        sql_query=insert_sql_query,
        truncate=truncate
    )
    load_dim.doc_md = """\
    ### Load Dimension Table
    This task inserts staging data to \
    dimension table.
    """

    # Run the validation query against the freshly loaded table.
    check_dim = DataQualityOperator(
        dag=sub_dag,
        task_id=f"check_{dest_table}_data",
        redshift_conn_id=redshift_conn_id,
        table=dest_table,
        sql_query=validate_sql_query,
        equals=equals,
        at_least=at_least,
    )
    check_dim.doc_md = """\
    ### Check Dimension Table Data
    This task runs validation checks on \
    the newly loaded dimension table.
    """

    # Load first, then validate.
    load_dim >> check_dim

    return sub_dag
# --- Example 2 ---
# Web-UI documentation for the fact-loading task defined above.
fact_weather_task.doc_md = """\
### Load weather facts
This task populates the `fact_weather` table with \
numerical data.
"""

# Add task to check that we have data in the facts table
run_quality_checks = DataQualityOperator(
    task_id='run_data_quality_checks',
    dag=dag,
    redshift_conn_id=db_conn_name,
    table="fact_weather",
    sql_query=SqlQueries.row_count,
    equals=366 * 24,  # we expect to find hourly data for one year
)
run_quality_checks.doc_md = """\
### Validate the `fact_weather` table.
This task validates that the correct amount of rows \
have been inserted. We run data validation that expects to find a total of \
24 rows for 366 days (2016 was a leap year).


Any deviation raises an Exception and ends up in task \
execution failure.
"""

# Add drop staging tables task for all weather data.
# The triple-quoted string below was unterminated in the original; the
# closing quotes are restored here. `tables` is expected to be available in
# the Jinja template context when this SQL is rendered -- TODO confirm caller.
sql_query = """
    {% for table, _ in tables.items() %}
    DROP TABLE staging_weather_{{ table }};
    {% endfor %}
"""
# Staged data quality checks

# Each entry pairs a validation query with the callable that judges its result.
metadata_validation_queries = [
    {
        "query":
        helpers.RedshiftStagedValidationQueries.MetadataFirstRowsQuery,
        "expected_result_function":
        helpers.DataValidationChecks.ValidateNoEmptyColumnsInResult
    },
]

# Validate the first rows of the staged metadata table.
quality_check_staged_metadata = DataQualityOperator(
    dag=dag,
    task_id='quality_check_staged_metadata',
    redshift_conn_id='redshift',
    provide_context=True,
    queries=metadata_validation_queries)
quality_check_staged_metadata.doc_md = """
# Runs quality checks and validation scripts on data as described in queries
"""

quality_check_staged_authors = DataQualityOperator(
    task_id='quality_check_staged_authors',
    dag=dag,
    provide_context=True,
    redshift_conn_id='redshift',
    queries=[
        {
            "query":
            helpers.RedshiftStagedValidationQueries.AuthorsFirstRowsQuery,
            "expected_result_function":
            helpers.DataValidationChecks.ValidateNoEmptyColumnsInResult
        },