def get_staging_to_dim(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        dest_table,
        truncate,
        insert_sql_query,
        validate_sql_query,
        equals=None,
        at_least=1,
        source_table="",
        *args,
        **kwargs):
    """Factory for a sub-DAG that loads one dimension table and validates it.

    The returned DAG contains two chained tasks:
    1. a LoadDimensionOperator that runs ``insert_sql_query`` against
       ``dest_table`` (optionally truncating first), and
    2. a DataQualityOperator that runs ``validate_sql_query`` and compares
       the result against ``equals`` / ``at_least``.

    Args:
        parent_dag_name: dag_id of the parent DAG; the sub-DAG id becomes
            ``"{parent_dag_name}.{task_id}"`` (Airflow's required naming).
        task_id: task id of the SubDagOperator in the parent DAG.
        redshift_conn_id: Airflow connection id for the Redshift cluster.
        dest_table: dimension table to load and then validate.
        truncate: whether the load operator truncates the table first.
        insert_sql_query: SQL used to populate ``dest_table``.
        validate_sql_query: SQL whose result is checked by the quality task.
        equals: exact expected value for the validation query (optional).
        at_least: minimum accepted value for the validation query.
        source_table: optional staging table the insert reads from.
        *args: accepted for signature compatibility; not used here.
        **kwargs: forwarded verbatim to the DAG constructor.

    Returns:
        The configured sub-DAG.
    """
    subdag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    # Step 1: populate the dimension table from staging data.
    load_dim = LoadDimensionOperator(
        task_id=f"load_{dest_table}_dim_table",
        dag=subdag,
        redshift_conn_id=redshift_conn_id,
        dest_table=dest_table,
        source_table=source_table,
        sql_query=insert_sql_query,
        truncate=truncate,
    )
    load_dim.doc_md = """\
### Load Dimension Table
This task inserts staging data to \
dimension table.
"""

    # Step 2: sanity-check the freshly loaded table.
    check_dim = DataQualityOperator(
        task_id=f"check_{dest_table}_data",
        dag=subdag,
        redshift_conn_id=redshift_conn_id,
        table=dest_table,
        sql_query=validate_sql_query,
        equals=equals,
        at_least=at_least,
    )
    check_dim.doc_md = """\
### Check Dimension Table Data
This task runs validation checks on \
the newly loaded dimension table.
"""

    # Quality check only runs after the load has finished.
    load_dim >> check_dim

    return subdag
fact_weather_task.doc_md = """\ ### Load weather facts This task populates the `fact_weather` table with \ numerical data. """ # Add task to check that we have data in the facts table run_quality_checks = DataQualityOperator( task_id='run_data_quality_checks', dag=dag, redshift_conn_id=db_conn_name, table="fact_weather", sql_query=SqlQueries.row_count, equals=366 * 24, # we expect to find hourly data for one year ) run_quality_checks.doc_md = """\ ### Validate the `fact_weather` table. This task validates that the correct amount of rows \ have been inserted. We run data validation that expects to find a total of \ 24 rows for 366 days (2016 was a leap year). Any deviation raises and Exception and ends up in task \ execution failure. """ # Add drop staging tables task for all weather data sql_query = """ {% for table, _ in tables.items() %} DROP TABLE staging_weather_{{ table }}; {% endfor %}
# Staged data quality checks quality_check_staged_metadata = DataQualityOperator( task_id='quality_check_staged_metadata', dag=dag, provide_context=True, redshift_conn_id='redshift', queries=[ { "query": helpers.RedshiftStagedValidationQueries.MetadataFirstRowsQuery, "expected_result_function": helpers.DataValidationChecks.ValidateNoEmptyColumnsInResult }, ]) quality_check_staged_metadata.doc_md = """ # Runs quality checks and validation scripts on data as described in queries """ quality_check_staged_authors = DataQualityOperator( task_id='quality_check_staged_authors', dag=dag, provide_context=True, redshift_conn_id='redshift', queries=[ { "query": helpers.RedshiftStagedValidationQueries.AuthorsFirstRowsQuery, "expected_result_function": helpers.DataValidationChecks.ValidateNoEmptyColumnsInResult },