def setUp(self):
    """
    Init a temporary table with some data.

    Drops and recreates the ``DATA_QUALITY_SCHEMA`` schema, creates one
    quality-check and one consistency-check result table through the
    ``ddl_*_0_2_4`` helpers and inserts a single row into each of them.
    Every statement is sent to ``self.conn`` on its own.
    """
    self.QUALITY_TABLE_1 = ResultTable(DATA_QUALITY_SCHEMA, "table_1", QualityCheck)
    self.CONSISTENCY_TABLE_1 = ResultTable(
        DATA_QUALITY_SCHEMA, "table_2", ConsistencyCheck
    )

    sql = [
        f"DROP SCHEMA IF EXISTS {DATA_QUALITY_SCHEMA} CASCADE;",
        f"CREATE SCHEMA IF NOT EXISTS {DATA_QUALITY_SCHEMA};",
        self.ddl_quality_check_0_2_4(self.QUALITY_TABLE_1),
        self.ddl_consistency_check_0_2_4(self.CONSISTENCY_TABLE_1),
        # Seed row for the quality-check table. The parentheses plus the
        # trailing comma keep this a separate list element: without the
        # comma Python silently glues it to the next literal.
        (
            f" INSERT INTO {self.QUALITY_TABLE_1.fullname}("
            " attribute, rule_name, rule_type, rule_description,"
            " total_records, time_filter, task_ts)"
            " VALUES('a', 'stuff', 'not_null', 'This is the rule.', 10, NULL,"
            f" '{FakedDatetime.now().isoformat()}'); "
        ),
        # Seed row for the consistency-check table.
        (
            f" INSERT INTO {self.CONSISTENCY_TABLE_1.fullname}("
            " type, name, description, left_table, right_table, status,"
            " time_filter, task_ts )"
            f" VALUES( '{ConsistencyChecker.COUNT}', 'hello', 'aa', 'tmp.a', 'tmp.b',"
            f" 'hello', NULL, '{FakedDatetime.now().isoformat()}' ); "
        ),
    ]
    for s in sql:
        self.conn.execute(s)
def test_quality_check_init_row(rule, results, conn: Connector):
    """
    Check that ``init_row`` populates a freshly created quality-check row.

    The check class is generated for ``data_quality.booking``; its table is
    created on ``conn.engine`` before a row is initialised from the ``rule``
    and ``results`` fixtures.
    """
    DQBase.metadata.clear()
    booking_table = ResultTable(schema_name="data_quality", table_name="booking")
    check_cls = create_default_quality_check_class(booking_table)

    # Naming of the generated model.
    assert check_cls.__tablename__ == "quality_check_booking"
    assert check_cls.__name__ == "DataQualityQualityCheckBooking"

    task_ts = datetime.datetime(2019, 8, 10, 10, 0, 0)
    check_cls.__table__.create(conn.engine)

    row = check_cls()
    row.init_row(rule, results, conn, context={"task_ts": task_ts})

    # Fields compared by equality, in the order they are checked.
    expected_values = (
        ("task_ts", task_ts),
        ("attribute", "src"),
        ("rule_name", "not_null"),
        ("rule_description", "True when data is null."),
        ("total_records", 5),
        ("failed", 2),
        ("passed", 3),
        ("failed_percentage", 40),
        ("passed_percentage", 60),
    )
    for field, expected in expected_values:
        assert getattr(row, field) == expected

    # Fields that must stay unset (identity check against None).
    for field in ("median_30_day_failed", "median_30_day_passed", "time_filter"):
        assert getattr(row, field) is None

    assert row.status == "invalid"
def run(
    self,
    raw_rules: List[Dict[str, str]],
    check_table: Dict,
    result_table: Optional[Dict] = None,  # todo - docs for quality name, maybe defaults..
    context: Optional[Dict] = None,
) -> Union[CheckResult, QualityCheck]:
    """
    Run the quality checks described by ``raw_rules`` against ``check_table``.

    :param raw_rules: rule definitions; passed through ``normalize_rules``
        and then ``build_rules``
    :param check_table: keyword arguments used to build a ``Table``
    :param result_table: keyword arguments used to build a ``ResultTable``.
        When truthy, the matching check class table is ensured on
        ``self.conn`` and the results are upserted. Otherwise results are
        produced with ``CheckResult`` and neither of those calls is made.
    :param context: handed to ``get_context`` together with the checked table
    :return: the value returned by ``do_quality_checks``
    """
    table = Table(**check_table)
    run_context = self.get_context(table, context)
    normalized = self.normalize_rules(raw_rules)
    refresh_executors(table, self.conn, run_context)

    # ``target`` starts as the raw dict and becomes the ResultTable when one
    # was requested; the same name is consulted again before upserting.
    target = result_table
    if not target:
        check_cls = CheckResult
    else:
        target = ResultTable(**target, model_cls=self.model_cls)
        check_cls = self.get_quality_check_class(target)
        self.conn.ensure_table(check_cls.__table__)

    checks = self.do_quality_checks(
        check_cls, self.build_rules(normalized), run_context
    )
    if target:
        self.conn.upsert(checks)
    return checks
def setUp(self):
    """
    Prepare a fresh schema with two quality tables and three seed rows.

    The schema named by ``DATA_QUALITY_SCHEMA`` is dropped and recreated, both
    tables are created from ``get_quality_table_creation_script_0_1_4`` and
    three rows are inserted into the first table.
    """
    self.DATA_QUALITY_TABLE_1 = ResultTable(
        DATA_QUALITY_SCHEMA, "example_table", QualityCheck
    )
    self.DATA_QUALITY_TABLE_2 = ResultTable(
        DATA_QUALITY_SCHEMA, "another_table", QualityCheck
    )

    statements = [
        f"DROP SCHEMA IF EXISTS {DATA_QUALITY_SCHEMA} CASCADE;",
        f"CREATE SCHEMA IF NOT EXISTS {DATA_QUALITY_SCHEMA};",
    ]
    statements.extend(
        get_quality_table_creation_script_0_1_4(table)
        for table in (self.DATA_QUALITY_TABLE_1, self.DATA_QUALITY_TABLE_2)
    )

    # The seed rows differ only in the attribute name and the id.
    seed_rows = (("src", 75597), ("dst", 75598), ("departure_time", 75599))
    inserts = [
        (
            f"INSERT INTO {self.DATA_QUALITY_TABLE_1.fullname}"
            " (attribute, rule_name, rule_description, total_records, failed,"
            " median_30_day_failed, failed_percentage, passed, median_30_day_passed,"
            " passed_percentage, status, time_filter, task_ts, created_at, id)"
            f" VALUES ('{attribute}', 'not_null', 'True when data is null.', 41136, 0,"
            " null, 0, 41136, null, 100, 'valid', null, '2019-11-09 00:00:00.000000',"
            f" '2019-11-12 12:41:28.391365', {row_id});"
        )
        for attribute, row_id in seed_rows
    ]
    # All three inserts travel as one string, blank-padded on both ends.
    statements.append(" " + " ".join(inserts) + " ")

    for statement in statements:
        self.conn.execute(statement)
def run(
    self,
    method: str,
    left_check_table: Dict,
    right_check_table: Dict,
    result_table: Optional[Dict] = None,
    columns: Optional[List[str]] = None,
    time_filter: Optional[Union[str, List[Dict], TimeFilter]] = None,
    left_custom_sql: str = None,
    right_custom_sql: str = None,
    context: Optional[Dict] = None,
    example_selector: ExampleSelector = default_example_selector,
) -> Union[CheckResult, ConsistencyCheck]:
    """
    Run a consistency check between a left and a right table.

    :param method: forwarded to ``do_consistency_check``
    :param left_check_table: keyword arguments used to build the left ``Table``
    :param right_check_table: keyword arguments used to build the right ``Table``
    :param result_table: keyword arguments used to build a ``ResultTable``.
        When truthy, the check class table is ensured on ``self.right_conn``
        and the result is upserted.
    :param columns: forwarded to ``do_consistency_check``
    :param time_filter: parsed with ``parse_time_filter`` before use
    :param left_custom_sql: forwarded to ``do_consistency_check``
    :param right_custom_sql: forwarded to ``do_consistency_check``
    :param context: handed to ``get_context`` together with both tables
    :param example_selector: forwarded to ``do_consistency_check``
    :return: the raw result of ``do_consistency_check`` when ``result_table``
        is given, otherwise a ``CheckResult`` initialised from that result
    :raises ValueError: when both custom SQLs are supplied together with
        ``columns`` or ``time_filter``
    """
    both_custom = left_custom_sql and right_custom_sql
    if both_custom and (columns or time_filter):
        raise ValueError(
            "When using custom sqls you cannot change 'columns' or 'time_filter' attribute"
        )

    parsed_filter = parse_time_filter(time_filter)
    left = Table(**left_check_table)
    right = Table(**right_check_table)
    run_context = self.get_context(left, right, context)

    outcome = self.do_consistency_check(
        method,
        columns,
        parsed_filter,
        left,
        right,
        left_custom_sql,
        right_custom_sql,
        run_context,
        example_selector,
    )

    # No result table requested: wrap the outcome in a transient CheckResult.
    if not result_table:
        row = CheckResult()
        row.init_row_consistency(**outcome)
        return row

    target = ResultTable(**result_table, model_cls=self.model_cls)
    check_cls = create_default_check_class(target)
    self.right_conn.ensure_table(check_cls.__table__)
    self.upsert(check_cls, outcome)
    return outcome
def run(
    self,
    raw_rules: List[Dict[str, str]],
    check_table: Dict,
    result_table: Dict,  # todo - docs for quality name, maybe defaults..
    context: Optional[Dict] = None,
):
    """
    Run the quality checks in ``raw_rules`` and insert their results.

    :param raw_rules: rule definitions; passed through ``normalize_rules``
        and then ``build_rules``
    :param check_table: keyword arguments used to build a ``Table``
    :param result_table: keyword arguments used to build a ``ResultTable``
    :param context: handed to ``get_context`` together with the checked table
    """
    table = Table(**check_table)
    target = ResultTable(**result_table)

    run_context = self.get_context(table, context)
    normalized = self.normalize_rules(raw_rules)
    refresh_executors(table, self.conn, run_context)

    # The result table has to exist before any check object is produced.
    check_cls = self.get_quality_check_class(target)
    self.ensure_table(check_cls)

    checks = self.do_quality_checks(
        check_cls, self.build_rules(normalized), run_context
    )
    self.insert(checks)
def test_set_medians(conn: Connector, monkeypatch):
    """
    Check the medians that ``set_medians`` stores on a quality-check row.

    Five rows are inserted; the expected medians (10.5 failed, 155 passed)
    are those of the first four, the fifth being the one the SQL comment
    marks as "should not be taken".
    """
    DQBase.metadata.clear()
    result_table = ResultTable(schema_name="data_quality", table_name="t")
    check_cls = create_default_quality_check_class(result_table)
    check_cls.__table__.create(conn.engine)
    row = check_cls()

    seed_sql = (
        " insert into data_quality.quality_check_t(failed, passed, task_ts)"
        " values (10, 200, '2018-09-11T13:00:00'),"
        " (3, 22, '2018-09-10T13:00:00'),"
        " (11, 110, '2018-09-09T13:00:00'),"
        " (55, 476, '2018-09-08T13:00:00'),"
        " (77, 309, '2018-07-12T13:00:00') -- should not be taken "
    )
    conn.execute(seed_sql)

    # Swap the datetime used by contessa.models for the faked one.
    monkeypatch.setattr("contessa.models.datetime", FakedDatetime)
    row.set_medians(conn)

    expected_medians = {"median_30_day_failed": 10.5, "median_30_day_passed": 155}
    for field, expected in expected_medians.items():
        assert getattr(row, field) == expected
def test_set_medians(conn: Connector, monkeypatch):
    """
    Check the medians that ``set_medians`` stores on a quality-check row.

    Five rows sharing the same attribute, rule and time filter are inserted;
    the expected medians (10.5 failed, 155 passed) are those of the first
    four, the fifth being the one the SQL comment marks as
    "should not be taken".
    """
    DQBase.metadata.clear()
    result_table = ResultTable(
        schema_name="data_quality", table_name="t", model_cls=QualityCheck
    )
    check_cls = create_default_check_class(result_table)
    check_cls.__table__.create(conn.engine)
    row = check_cls()

    seed_sql = (
        " insert into data_quality.quality_check_t(attribute, rule_name, rule_type,"
        " failed, passed, task_ts, time_filter)"
        " values ('a', 'b', 'not_null', 10, 200, '2018-09-11T13:00:00', 'not_set'),"
        " ('a', 'b', 'not_null', 3, 22, '2018-09-10T13:00:00', 'not_set'),"
        " ('a', 'b', 'not_null', 11, 110, '2018-09-09T13:00:00', 'not_set'),"
        " ('a', 'b', 'not_null', 55, 476, '2018-09-08T13:00:00', 'not_set'),"
        " ('a', 'b', 'not_null', 77, 309, '2018-07-12T13:00:00', 'not_set')"
        " -- should not be taken "
    )
    conn.execute(seed_sql)

    # Swap the datetime used by contessa.models for the faked one.
    monkeypatch.setattr("contessa.models.datetime", FakedDatetime)
    row.set_medians(conn)

    expected_medians = {"median_30_day_failed": 10.5, "median_30_day_passed": 155}
    for field, expected in expected_medians.items():
        assert getattr(row, field) == expected
def test_generic_typ_qc_class_no_prefix(dummy_contessa):
    """Check the class name generated for the ``tmp.mytable`` result table."""
    result_table = ResultTable("tmp", "mytable", QualityCheck)
    check_cls = dummy_contessa.get_quality_check_class(result_table)
    assert check_cls.__name__ == "TmpQualityCheckMytable"