コード例 #1
0
class TestDataTransfer(TestCase):
    """End-to-end check of the ``products_sales_pipeline`` DAG.

    Source tables are set up through the ``oltp`` hook and the pipeline
    output is read back through the ``olap`` hook.
    """

    def setUp(self):
        """Build the two Postgres hooks used by the test."""
        self.oltp_hook = PostgresHook('oltp')
        self.olap_hook = PostgresHook('olap')

    def test_validate_sales_pipeline(self):
        """Run the DAG for one date and compare each output table with its expected frame."""
        date = '2020-01-01'

        # Source side: each table is created and then seeded before the next.
        for source_table in ('purchases', 'products'):
            create_table(source_table, self.oltp_hook)
            insert_initial_data(source_table, self.oltp_hook)

        # Target side: only the tables are created; the DAG fills them.
        for target_table in ('stg_purchases', 'stg_products', 'products_sales'):
            create_table(target_table, self.olap_hook)

        execute_dag('products_sales_pipeline', date)

        # (query, name of the expected-output fixture, expected row count)
        # NOTE(review): agg_sales_category is not created above; presumably
        # the DAG creates it itself -- confirm.
        expectations = (
            ('select * from stg_purchases', f'stg_purchases_{date}', 3),
            ('select * from stg_products', 'stg_products', 5),
            ('select * from products_sales', 'products_sales', 3),
            ('select * from agg_sales_category', 'agg_sales_category', 2),
        )
        for query, fixture_name, row_count in expectations:
            actual = self.olap_hook.get_pandas_df(query)
            expected = output_expected_as_df(fixture_name)
            assert_frame_equal(actual, expected)
            assert len(actual) == row_count
コード例 #2
0
    def execute(self, context):
        """Run every data quality query and fail on the first check that does not pass.

        Each query in ``self.sql_queries`` is paired by position with a
        callable in ``self.test_results``. The callable receives whatever
        ``get_pandas_df`` returns for that query and must return a truthy
        value for the check to pass.

        Args:
            context: Execution context passed in by the caller; not used here.

        Raises:
            ValueError: If the number of queries and the number of checks differ.
            AssertionError: If any check returns a falsy value.
        """
        self.log.info("Start Data Quality checks")

        # Materialise both sides so their lengths can be compared: a bare
        # zip() would silently drop any query or check without a partner,
        # letting a misconfigured task skip checks unnoticed.
        queries = list(self.sql_queries)
        checks = list(self.test_results)
        if len(queries) != len(checks):
            raise ValueError(
                f"Data quality configuration mismatch: {len(queries)} queries "
                f"but {len(checks)} checks"
            )

        redshift_conn = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.log.info("Redshift Connection has been successfully established")

        for query, test in zip(queries, checks):
            # Lazy %-style args avoid rendering the result when debug is off.
            self.log.debug("Run data quality query : %s", query)
            result = redshift_conn.get_pandas_df(query)
            self.log.debug("Result from test query : %s", result)
            if not test(result):
                # Failures are logged at error level and name the query.
                self.log.error("Data quality check failed for query : %s", query)
                raise AssertionError('Data quality check failed')
            self.log.info("Data quality check passed.")