Example 1
@pytest.fixture
def conn():
    h = Connector(TEST_DB_URI)

    schemas = ["tmp", "temporary", "data_quality", "raw"]
    create_queries = [f"create schema if not exists {s}" for s in schemas]
    drop_queries = [f"drop schema if exists {s} cascade" for s in schemas]

    for c in create_queries:
        h.execute(c)

    yield h

    for d in drop_queries:
        h.execute(d)
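The fixture above (and the Connector used throughout the remaining examples) needs a reachable database; TEST_DB_URI itself is not shown. A hypothetical value, assuming a local PostgreSQL instance (the SQL in the later examples is PostgreSQL-flavoured):

# hypothetical connection string; adjust host, credentials and database name as needed
TEST_DB_URI = "postgresql://postgres:postgres@localhost:5432/test_db"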
Example 2
def test_set_medians(conn: Connector, monkeypatch):
    DQBase.metadata.clear()
    qc = create_default_quality_check_class(
        ResultTable(schema_name="data_quality", table_name="t"))
    qc.__table__.create(conn.engine)
    instance = qc()

    conn.execute("""
        insert into data_quality.quality_check_t(failed, passed, task_ts)
        values
          (10, 200, '2018-09-11T13:00:00'),
          (3, 22, '2018-09-10T13:00:00'),
          (11, 110, '2018-09-09T13:00:00'),
          (55, 476, '2018-09-08T13:00:00'),
          (77, 309, '2018-07-12T13:00:00') -- should not be taken
    """)

    monkeypatch.setattr("contessa.models.datetime", FakedDatetime)
    instance.set_medians(conn)

    assert instance.median_30_day_failed == 10.5
    assert instance.median_30_day_passed == 155
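This test (and several below) monkeypatches contessa.models.datetime (later also contessa.executor.datetime) with a FakedDatetime helper so that "now" is deterministic. The helper itself is not shown here; a minimal sketch of what it could look like, assuming it simply pins now()/today() to a date in mid-September 2018 so that the 2018-07-12 row falls outside the 30-day window:

from datetime import datetime


class FakedDatetime(datetime):
    """Stand-in for datetime with a frozen clock (illustrative sketch only)."""

    @classmethod
    def now(cls, tz=None):
        # frozen at an assumed point in time consistent with the assertions above
        return cls(2018, 9, 12, 12, 0, 0)

    @classmethod
    def today(cls):
        return cls(2018, 9, 12, 12, 0, 0)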
Example 3
def test_set_medians(conn: Connector, monkeypatch):
    DQBase.metadata.clear()
    qc = create_default_check_class(
        ResultTable(schema_name="data_quality",
                    table_name="t",
                    model_cls=QualityCheck))
    qc.__table__.create(conn.engine)
    instance = qc()

    conn.execute("""
        insert into data_quality.quality_check_t(attribute, rule_name, rule_type, failed, passed, task_ts, time_filter)
        values
          ('a', 'b', 'not_null', 10, 200, '2018-09-11T13:00:00', 'not_set'),
          ('a', 'b', 'not_null', 3, 22, '2018-09-10T13:00:00', 'not_set'),
          ('a', 'b', 'not_null', 11, 110, '2018-09-09T13:00:00', 'not_set'),
          ('a', 'b', 'not_null', 55, 476, '2018-09-08T13:00:00', 'not_set'),
          ('a', 'b', 'not_null', 77, 309, '2018-07-12T13:00:00', 'not_set') -- should not be taken
    """)

    monkeypatch.setattr("contessa.models.datetime", FakedDatetime)
    instance.set_medians(conn)

    assert instance.median_30_day_failed == 10.5
    assert instance.median_30_day_passed == 155
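The expected medians follow directly from the four rows inside the 30-day window (the 2018-07-12 row is excluded, as the SQL comment notes):

from statistics import median

# failed values of the four in-window rows
assert median([10, 3, 11, 55]) == 10.5
# passed values of the same rows
assert median([200, 22, 110, 476]) == 155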
Example 4
class TestMigrationsResolverInit(unittest.TestCase):
    def setUp(self):
        """
        Init a temporary table with some data.
        """

        sql = [
            f"DROP SCHEMA IF EXISTS {DATA_QUALITY_SCHEMA} CASCADE;"
            f"CREATE SCHEMA IF NOT EXISTS {DATA_QUALITY_SCHEMA};",
            get_quality_table_creation_script(DATA_QUALITY_SCHEMA,
                                              DATA_QUALITY_TABLE_1),
            get_quality_table_creation_script(DATA_QUALITY_SCHEMA,
                                              DATA_QUALITY_TABLE_2),
        ]
        self.conn = Connector(TEST_DB_URI)
        for s in sql:
            self.conn.execute(s)

    def tearDown(self):
        """
        Drop all created tables.
        """
        self.conn.execute(f"DROP schema {DATA_QUALITY_SCHEMA} CASCADE;")
        DQBase.metadata.clear()

    def test_migration_table_exists_init(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.4", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        migration_table_exists = m.migrations_table_exists()

        assert migration_table_exists is False

    def test_get_current_migration_init(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.4", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        current = m.get_applied_migration()

        assert current is None

    def test_is_on_head_init(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.4", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        is_on_head = m.is_on_head()

        assert is_on_head is False

    def test_get_migrations_to_head__is_before_init(self):
        versions_migrations = {
            "0.1.2": "0.1.2-hash",
            "0.1.3": "0.1.3-hash",
            "0.1.4": "0.1.4-hash",
            "0.1.5": "0.1.5-hash",
            "0.1.6": "0.1.6-hash",
            "0.1.7": "0.1.7-hash",
        }

        m = MigrationsResolver(versions_migrations, "0.1.7", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        migrations = m.get_migration_to_head()
        assert migrations[0] == "upgrade"
        assert migrations[1] == "0.1.7-hash"
Example 5
class TestMigrationsResolver(unittest.TestCase):
    def setUp(self):
        """
        Init a temporary table with some data.
        """

        sql = [
            f"DROP SCHEMA IF EXISTS {DATA_QUALITY_SCHEMA} CASCADE;"
            f"CREATE SCHEMA IF NOT EXISTS {DATA_QUALITY_SCHEMA};",
            get_quality_table_creation_script(DATA_QUALITY_SCHEMA,
                                              DATA_QUALITY_TABLE_1),
            get_quality_table_creation_script(DATA_QUALITY_SCHEMA,
                                              DATA_QUALITY_TABLE_2),
            f"""
                    create table {DATA_QUALITY_SCHEMA}.{MIGRATION_TABLE}
                        (
                            version_num varchar(32) not null
                                constraint alembic_version_pkc
                                    primary key
                        );
                        INSERT INTO {DATA_QUALITY_SCHEMA}.{MIGRATION_TABLE} (version_num) VALUES ('0.1.4-hash');
                    """,
        ]
        self.conn = Connector(TEST_DB_URI)
        for s in sql:
            self.conn.execute(s)

    def tearDown(self):
        """
        Drop all created tables.
        """
        # self.conn.execute(f"DROP schema data_quality_test CASCADE;")
        DQBase.metadata.clear()

    def test_schema_exists(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.4", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        schema_exists = m.schema_exists()

        assert schema_exists

        m = MigrationsResolver(versions_migrations, "0.1.4", SQLALCHEMY_URL,
                               "not_existing_schema")
        schema_exists = m.schema_exists()

        assert schema_exists is False

    def test_migration_table_exists(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.4", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        migration_table_exists = m.migrations_table_exists()

        assert migration_table_exists

    def test_get_current_migration(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.4", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        current = m.get_applied_migration()

        assert current == "0.1.4-hash"

    def test_is_on_head(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.4", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        is_on_head = m.is_on_head()

        assert is_on_head

    def test_is_on_head_no_on_head(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.5", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        is_on_head = m.is_on_head()

        assert is_on_head is False

    def test_is_on_head_with_fallback(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.6": "0.1.6-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.5", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        is_on_head = m.is_on_head()

        assert is_on_head

    def test_get_migrations_to_head__already_on_head(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.4", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        migrations = m.get_migration_to_head()
        assert migrations is None

    def test_get_migrations_to_head__package_greater_than_map_max(self):
        versions_migrations = {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}

        m = MigrationsResolver(versions_migrations, "0.1.6", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        migrations = m.get_migration_to_head()
        assert migrations[0] == "upgrade"
        assert migrations[1] == "0.1.5-hash"

    def test_get_migrations_to_head__is_down_from_head(self):
        versions_migrations = {
            "0.1.2": "0.1.2-hash",
            "0.1.3": "0.1.3-hash",
            "0.1.4": "0.1.4-hash",
            "0.1.5": "0.1.5-hash",
            "0.1.6": "0.1.6-hash",
            "0.1.7": "0.1.7-hash",
        }

        m = MigrationsResolver(versions_migrations, "0.1.7", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        migrations = m.get_migration_to_head()
        assert migrations[0] == "upgrade"
        assert migrations[1] == "0.1.7-hash"

    def test_get_migrations_to_head__is_down_from_head_with_fallback(self):
        versions_migrations = {
            "0.1.2": "0.1.2-hash",
            "0.1.3": "0.1.3-hash",
            "0.1.4": "0.1.4-hash",
            "0.1.5": "0.1.5-hash",
            "0.1.8": "0.1.8-hash",
            "0.1.9": "0.1.9-hash",
        }

        m = MigrationsResolver(versions_migrations, "0.1.7", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        migrations = m.get_migration_to_head()
        assert migrations[0] == "upgrade"
        assert migrations[1] == "0.1.5-hash"

    def test_get_migrations_to_head__is_up_from_head(self):
        versions_migrations = {
            "0.1.2": "0.1.2-hash",
            "0.1.3": "0.1.3-hash",
            "0.1.4": "0.1.4-hash",
            "0.1.5": "0.1.5-hash",
            "0.1.6": "0.1.6-hash",
            "0.1.7": "0.1.7-hash",
        }

        m = MigrationsResolver(versions_migrations, "0.1.2", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        migrations = m.get_migration_to_head()
        assert migrations[0] == "downgrade"
        assert migrations[1] == "0.1.2-hash"

    def test_get_migrations_to_head__is_up_from_head_with_fallback(self):
        versions_migrations = {
            "0.1.1": "0.1.1-hash",
            "0.1.3": "0.1.3-hash",
            "0.1.4": "0.1.4-hash",
            "0.1.5": "0.1.5-hash",
            "0.1.6": "0.1.6-hash",
            "0.1.7": "0.1.7-hash",
        }

        m = MigrationsResolver(versions_migrations, "0.1.2", SQLALCHEMY_URL,
                               DATA_QUALITY_SCHEMA)
        migrations = m.get_migration_to_head()
        assert migrations[0] == "downgrade"
        assert migrations[1] == "0.1.1-hash"
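Taken together, the assertions above pin down the resolution behaviour: the target is the hash of the greatest mapped version that is not newer than the package version (the "fallback"), None is returned when that hash is already applied, and the direction depends on whether the target is ahead of or behind the applied version. A minimal re-implementation inferred from these tests (not the library's actual code):

def _parse(version: str):
    # naive parser, sufficient for "0.1.4"-style version strings
    return tuple(int(part) for part in version.split("."))


def resolve_migration(versions_migrations: dict, package_version: str, applied_hash: str):
    # fallback: greatest mapped version that is not newer than the package version
    candidates = [v for v in versions_migrations if _parse(v) <= _parse(package_version)]
    target_version = max(candidates, key=_parse)
    target_hash = versions_migrations[target_version]
    if applied_hash == target_hash:
        return None  # already on head
    applied_version = next(v for v, h in versions_migrations.items() if h == applied_hash)
    direction = "upgrade" if _parse(target_version) > _parse(applied_version) else "downgrade"
    return direction, target_hash


# mirrors test_get_migrations_to_head__package_greater_than_map_max above
assert resolve_migration(
    {"0.1.4": "0.1.4-hash", "0.1.5": "0.1.5-hash"}, "0.1.6", "0.1.4-hash"
) == ("upgrade", "0.1.5-hash")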
Example 6
class TestDataQualityOperator(unittest.TestCase):
    def setUp(self):
        """
        Init a temporary table with some data.
        """
        self.table_name = "booking_all_v2"
        self.ts_nodash = (
            FakedDatetime.now().isoformat().replace("-", "").replace(":", "")
        )
        self.tmp_table_name = f"{self.table_name}_{self.ts_nodash}"
        self.now = FakedDatetime.now()

        sql = [
            "DROP SCHEMA if exists tmp CASCADE;",
            "DROP SCHEMA if exists data_quality CASCADE;",
            "CREATE SCHEMA IF NOT EXISTS tmp;",
            "CREATE SCHEMA IF NOT EXISTS data_quality;",
            "CREATE SCHEMA IF NOT EXISTS hello;",
            f"""
                    CREATE TABLE IF NOT EXISTS tmp.{self.table_name}(
                      id SERIAL PRIMARY KEY,
                      src text,
                      dst text,
                      price int,
                      turnover_after_refunds double precision,
                      initial_price double precision,
                      created_at timestamptz
                    )
                """,
            f"""
                    CREATE TABLE IF NOT EXISTS tmp.{self.tmp_table_name}(
                      id SERIAL PRIMARY KEY,
                      src text,
                      dst text,
                      price int,
                      turnover_after_refunds double precision,
                      initial_price double precision,
                      created_at timestamptz
                    )
                """,
            f"""
                    INSERT INTO tmp.{self.tmp_table_name}
                        (src, dst, price, turnover_after_refunds, initial_price, created_at)
                    VALUES
                        ('BTS', NULL, 1, 100, 11, '2018-09-12T11:50:00'),
                        -- this is older than 30 days.
                        -- not in stats when time_filter = `created_at`
                        (NULL, 'PEK', 33, 1.1, 13, '2018-01-12T15:50:00'),
                        ('VIE', 'JFK', 4, 5.5, 23.4, '2018-09-11T11:50:00'),
                        ('VIE', 'VIE', 4, 0.0, 0.0, '2018-09-11T11:50:00')
                """,
            f"""
                INSERT INTO tmp.{self.table_name}
                    (src, dst, price, turnover_after_refunds, initial_price, created_at)
                VALUES
                    ('BTS', NULL, 1, 100, 11, '2018-09-12T13:00:00'),
                    -- this is older than 30 days.
                    -- not in stats when time_filter = `created_at`
                    (NULL, 'PEK', 33, 1.1, 13, '2018-01-12T13:00:00'),
                    ('VIE', 'JFK', 4, 5.5, 23.4, '2018-09-11T13:00:00'),
                    ('VIE', 'VIE', 4, 0.0, 0.0, '2018-09-11T13:00:00')
            """,
        ]
        self.conn = Connector(TEST_DB_URI)
        for s in sql:
            self.conn.execute(s)

        self.contessa_runner = ContessaRunner(TEST_DB_URI)

    def tearDown(self):
        """
        Drop all created tables.
        """
        self.conn.execute(f"DROP schema tmp CASCADE;")
        self.conn.execute(f"DROP schema data_quality CASCADE;")
        self.conn.execute(f"DROP schema hello CASCADE;")
        DQBase.metadata.clear()

    @mock.patch("contessa.executor.datetime", FakedDatetime)
    def test_execute_tmp(self):
        sql = """
            SELECT
              CASE WHEN src = 'BTS' and dst is null THEN false ELSE true END as res
            from {{ table_fullname }}
        """
        rules = [
            {
                "name": "not_null_name",
                "type": "not_null",
                "column": "dst",
                "time_filter": "created_at",
            },
            {
                "name": "gt_name",
                "type": "gt",
                "column": "price",
                "value": 10,
                "time_filter": "created_at",
            },
            {
                "name": "sql_name",
                "type": "sql",
                "sql": sql,
                "column": "src_dst",
                "description": "test sql rule",
            },
            {
                "name": "not_name",
                "type": "not",
                "column": "src",
                "value": "dst"
            },
        ]
        self.contessa_runner.run(
            check_table={
                "schema_name": "tmp",
                "table_name": self.tmp_table_name
            },
            result_table={
                "schema_name": "data_quality",
                "table_name": self.table_name
            },
            raw_rules=rules,
            context={"task_ts": self.now},
        )

        rows = self.conn.get_records(f"""
            SELECT * from data_quality.quality_check_{self.table_name}
            order by created_at
        """).fetchall()
        self.assertEqual(len(rows), 4)

        notnull_rule = rows[0]
        self.assertEqual(notnull_rule["failed"], 1)
        self.assertEqual(notnull_rule["passed"], 2)
        self.assertEqual(notnull_rule["attribute"], "dst")
        self.assertEqual(notnull_rule["task_ts"].timestamp(),
                         self.now.timestamp())

        gt_rule = rows[1]
        self.assertEqual(gt_rule["failed"], 3)
        self.assertEqual(gt_rule["passed"], 0)
        self.assertEqual(gt_rule["attribute"], "price")

        sql_rule = rows[2]
        self.assertEqual(sql_rule["failed"], 1)
        self.assertEqual(sql_rule["passed"], 3)
        self.assertEqual(sql_rule["attribute"], "src_dst")

        not_column_rule = rows[3]
        self.assertEqual(not_column_rule["failed"], 1)
        self.assertEqual(not_column_rule["passed"], 3)
        self.assertEqual(not_column_rule["attribute"], "src")

    @mock.patch("contessa.executor.datetime", FakedDatetime)
    def test_execute_dst(self):
        sql = """
            SELECT
              CASE WHEN src = 'BTS' AND dst is null THEN false ELSE true END as res
            FROM {{ table_fullname }}
            WHERE created_at BETWEEN
                timestamptz '{{task_ts}}' - INTERVAL '1 hour'
                AND
                timestamptz '{{task_ts}}' + INTERVAL '1 hour'
        """
        rules = [
            {
                "name": "not_null_name",
                "type": "not_null",
                "column": "dst",
                "time_filter": "created_at",
            },
            {
                "name": "sql_name",
                "type": "sql",
                "sql": sql,
                "column": "src_dst",
                "description": "test sql rule",
            },
        ]
        self.contessa_runner.run(
            check_table={
                "schema_name": "tmp",
                "table_name": self.tmp_table_name
            },
            result_table={
                "schema_name": "data_quality",
                "table_name": self.table_name
            },
            raw_rules=rules,
            context={"task_ts": self.now},
        )

        rows = self.conn.get_records(f"""
            SELECT * from data_quality.quality_check_{self.table_name}
            order by created_at
        """).fetchall()
        self.assertEqual(len(rows), 2)

        notnull_rule = rows[0]
        self.assertEqual(notnull_rule["failed"], 1)
        self.assertEqual(notnull_rule["passed"], 2)
        self.assertEqual(notnull_rule["attribute"], "dst")
        self.assertEqual(notnull_rule["task_ts"].timestamp(),
                         self.now.timestamp())

        sql_rule = rows[1]
        self.assertEqual(sql_rule["failed"], 1)
        self.assertEqual(sql_rule["passed"], 0)
        self.assertEqual(sql_rule["attribute"], "src_dst")

    def test_different_schema(self):
        rules = [{
            "name": "not_nul_name",
            "type": "not_null",
            "column": "dst",
            "time_filter": "created_at",
        }]
        self.contessa_runner.run(
            check_table={
                "schema_name": "tmp",
                "table_name": self.tmp_table_name
            },
            result_table={
                "schema_name": "hello",
                "table_name": "abcde"
            },
            raw_rules=rules,
            context={"task_ts": self.now},
        )
        rows = self.conn.get_records(f"""
                SELECT 1 from hello.quality_check_abcde
            """).fetchall()
        self.assertEqual(len(rows), 1)
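The sql-type rules above embed {{ table_fullname }} and {{ task_ts }} placeholders. A minimal sketch of how such a template can be rendered, assuming Jinja2-style substitution (the runner builds the real context itself; the table name below is made up for illustration):

from jinja2 import Template

sql = """
    SELECT
      CASE WHEN src = 'BTS' and dst is null THEN false ELSE true END as res
    from {{ table_fullname }}
"""

# hypothetical value for illustration only
rendered = Template(sql).render(table_fullname="tmp.booking_all_v2_20180912T130000")
print(rendered)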
Example 7
class TestConsistencyChecker(unittest.TestCase):
    def setUp(self):
        """
        Init a temporary table with some data.
        """
        self.left_table_name = "raw_booking"
        self.right_table_name = "booking"
        self.result_table_name = "booking"
        self.ts_nodash = (
            FakedDatetime.now().isoformat().replace("-", "").replace(":", "")
        )
        self.now = FakedDatetime.now()

        sql = [
            "DROP SCHEMA if exists tmp CASCADE;",
            "DROP SCHEMA if exists data_quality CASCADE;",
            "CREATE SCHEMA IF NOT EXISTS tmp;",
            "CREATE SCHEMA IF NOT EXISTS data_quality;",
            "CREATE SCHEMA IF NOT EXISTS hello;",
            f"""
                    CREATE TABLE IF NOT EXISTS tmp.{self.left_table_name}(
                      id SERIAL PRIMARY KEY,
                      src text,
                      dst text,
                      price int,
                      turnover_after_refunds double precision,
                      initial_price double precision,
                      created_at timestamptz
                    )
                """,
            f"""
                    CREATE TABLE IF NOT EXISTS hello.{self.right_table_name}(
                      id SERIAL PRIMARY KEY,
                      src text,
                      dst text,
                      price int,
                      turnover_after_refunds double precision,
                      initial_price double precision,
                      created_at timestamptz
                    )
                """,
            f"""
                INSERT INTO tmp.{self.left_table_name}
                    (src, dst, price, turnover_after_refunds, initial_price, created_at)
                VALUES
                    ('BTS', NULL, 1, 100, 11, '2018-09-12T11:50:00'),
                    (NULL, 'PEK', 33, 1.1, 13, '2018-01-12T15:50:00'),
                    ('VIE', 'JFK', 4, 5.5, 23.4, '2018-09-11T11:50:00'),
                    ('VIE', 'VIE', 4, 0.0, 0.0, '2018-09-11T11:50:00')
                """,
            f"""
                INSERT INTO hello.{self.right_table_name}
                    (src, dst, price, turnover_after_refunds, initial_price, created_at)
                VALUES
                    ('BTS', NULL, 1, 100, 11, '2018-09-12T11:50:00'),
                    (NULL, 'PEK', 33, 1.1, 13, '2018-01-12T15:50:00'),
                    ('VIE', 'JFK', 4, 5.5, 23.4, '2018-09-11T11:50:00')
            """,
        ]
        self.conn = Connector(TEST_DB_URI)
        for s in sql:
            self.conn.execute(s)

        self.consistency_checker = ConsistencyChecker(TEST_DB_URI)

    def tearDown(self):
        """
        Drop all created tables.
        """
        self.conn.execute(f"DROP schema tmp CASCADE;")
        self.conn.execute(f"DROP schema data_quality CASCADE;")
        self.conn.execute(f"DROP schema hello CASCADE;")
        DQBase.metadata.clear()

    @mock.patch("contessa.executor.datetime", FakedDatetime)
    def test_execute_consistency_false(self):
        self.consistency_checker.run(
            self.consistency_checker.COUNT,
            left_check_table={"schema_name": "tmp", "table_name": self.left_table_name},
            right_check_table={
                "schema_name": "hello",
                "table_name": self.right_table_name,
            },
            result_table={
                "schema_name": "data_quality",
                "table_name": self.result_table_name,
            },
            context={"task_ts": self.now},
        )
        rows = self.conn.get_records(
            f"""
            SELECT * from data_quality.consistency_check_{self.result_table_name}
            order by created_at
        """
        )
        self.assertEqual(rows.fetchone()["status"], "invalid")

    @mock.patch("contessa.executor.datetime", FakedDatetime)
    def test_execute_consistency_true(self):
        # add missing record to the right table
        self.conn.execute(
            f"""
            INSERT INTO hello.{self.right_table_name}
                (src, dst, price, turnover_after_refunds, initial_price, created_at)
            VALUES
                ('VIE', 'VIE', 4, 0.0, 0.0, '2018-09-11T11:50:00')
        """
        )

        self.consistency_checker.run(
            self.consistency_checker.COUNT,
            left_check_table={"schema_name": "tmp", "table_name": self.left_table_name},
            right_check_table={
                "schema_name": "hello",
                "table_name": self.right_table_name,
            },
            result_table={
                "schema_name": "data_quality",
                "table_name": self.result_table_name,
            },
            context={"task_ts": self.now},
        )

        rows = self.conn.get_records(
            f"""
            SELECT * from data_quality.consistency_check_{self.result_table_name}
            order by created_at
        """
        )
        self.assertEqual(rows.fetchone()["status"], "valid")

    @mock.patch("contessa.executor.datetime", FakedDatetime)
    def test_execute_consistency_diff(self):
        """ Test scenario where table diff is done on tables with different column order."""
        self.inconsistent_colums_left = "user"
        self.inconsistent_colums_right = "user_inconsistent"
        self.conn = Connector(TEST_DB_URI)
        self.consistency_checker = ConsistencyChecker(TEST_DB_URI)

        sql = [
            f"""
                CREATE TABLE IF NOT EXISTS tmp.{self.inconsistent_colums_left}(
                  id SERIAL PRIMARY KEY,
                  name text
                )
            """,
            f"""
                CREATE TABLE IF NOT EXISTS tmp.{self.inconsistent_colums_right}(
                  name text,
                  id SERIAL PRIMARY KEY
                )
            """,
            f"""
                INSERT INTO tmp.{self.inconsistent_colums_left}
                VALUES
                    (1, 'John Doe')
                """,
            f"""
                INSERT INTO tmp.{self.inconsistent_colums_right}
                VALUES
                    ('John Doe', 1)
            """,
        ]

        for s in sql:
            self.conn.execute(s)

        self.consistency_checker.run(
            self.consistency_checker.DIFF,
            left_check_table={
                "schema_name": "tmp",
                "table_name": self.inconsistent_colums_left,
            },
            right_check_table={
                "schema_name": "tmp",
                "table_name": self.inconsistent_colums_right,
            },
            result_table={
                "schema_name": "data_quality",
                "table_name": self.result_table_name,
            },
            context={"task_ts": self.now},
        )

        rows = self.conn.get_records(
            f"""
                SELECT * from data_quality.consistency_check_{self.result_table_name}
                order by created_at
            """
        )

        self.assertEqual(rows.fetchone()["status"], "valid")

    @mock.patch("contessa.executor.datetime", FakedDatetime)
    def test_execute_consistency_sqls(self):
        result = self.consistency_checker.run(
            self.consistency_checker.DIFF,
            left_check_table={"schema_name": "tmp", "table_name": self.left_table_name},
            right_check_table={
                "schema_name": "hello",
                "table_name": self.right_table_name,
            },
            left_custom_sql="SELECT 16349;",
            right_custom_sql="SELECT 16349;",
            context={"task_ts": self.now},
        )

        self.assertEqual("valid", result.status)

        result = self.consistency_checker.run(
            self.consistency_checker.DIFF,
            left_check_table={"schema_name": "tmp", "table_name": self.left_table_name},
            right_check_table={
                "schema_name": "hello",
                "table_name": self.right_table_name,
            },
            left_custom_sql="SELECT 42;",
            right_custom_sql="SELECT 16349;",
            context={"task_ts": self.now},
        )

        self.assertEqual("invalid", result.status)
Example 8
class TestReturnResults(unittest.TestCase):
    def setUp(self):
        """
        Init a temporary table with some data.
        """
        self.left_table_name = "raw_booking"
        self.right_table_name = "booking"
        self.ts_nodash = (
            FakedDatetime.now().isoformat().replace("-", "").replace(":", "")
        )
        self.now = FakedDatetime.now()

        sql = [
            "DROP SCHEMA if exists tmp CASCADE;",
            "CREATE SCHEMA IF NOT EXISTS tmp;",
            "CREATE SCHEMA IF NOT EXISTS hello;",
            f"""
                    CREATE TABLE IF NOT EXISTS tmp.{self.left_table_name}(
                      id SERIAL PRIMARY KEY,
                      src text,
                      dst text,
                      price int,
                      turnover_after_refunds double precision,
                      initial_price double precision,
                      created_at timestamptz
                    )
                """,
            f"""
                    CREATE TABLE IF NOT EXISTS hello.{self.right_table_name}(
                      id SERIAL PRIMARY KEY,
                      src text,
                      dst text,
                      price int,
                      turnover_after_refunds double precision,
                      initial_price double precision,
                      created_at timestamptz
                    )
                """,
            f"""
                INSERT INTO tmp.{self.left_table_name}
                    (src, dst, price, turnover_after_refunds, initial_price, created_at)
                VALUES
                    ('BTS', NULL, 1, 100, 11, '2018-09-12T11:50:00'),
                    (NULL, 'PEK', 33, 1.1, 13, '2018-01-12T15:50:00'),
                    ('VIE', 'JFK', 4, 5.5, 23.4, '2018-09-11T11:50:00'),
                    ('VIE', 'VIE', 4, 0.0, 0.0, '2018-09-11T11:50:00')
                """,
            f"""
                INSERT INTO hello.{self.right_table_name}
                    (src, dst, price, turnover_after_refunds, initial_price, created_at)
                VALUES
                    ('BTS', NULL, 1, 100, 11, '2018-09-12T11:50:00'),
                    (NULL, 'PEK', 33, 1.1, 13, '2018-01-12T15:50:00'),
                    ('VIE', 'JFK', 4, 5.5, 23.4, '2018-09-11T11:50:00')
            """,
        ]
        self.conn = Connector(TEST_DB_URI)
        for s in sql:
            self.conn.execute(s)

        self.consistency_checker = ConsistencyChecker(TEST_DB_URI)

    def tearDown(self):
        """
        Drop all created tables.
        """
        self.conn.execute(f"DROP schema tmp CASCADE;")
        self.conn.execute(f"DROP schema hello CASCADE;")
        DQBase.metadata.clear()

    @mock.patch("contessa.executor.datetime", FakedDatetime)
    def test_execute_consistency(self):
        result = self.consistency_checker.run(
            self.consistency_checker.COUNT,
            left_check_table={
                "schema_name": "tmp",
                "table_name": self.left_table_name
            },
            right_check_table={
                "schema_name": "hello",
                "table_name": self.right_table_name,
            },
            context={"task_ts": self.now},
        )

        self.assertEqual(result.status, "invalid")
        self.assertEqual(result.context["left_table_name"], "tmp.raw_booking")
        self.assertEqual(result.context["right_table_name"], "hello.booking")