def validate_with_json_schema(self) -> None:
    """Validate the raw source items against the JSON schema and show the result.

    The check tries to report every error, although that is not guaranteed.
    Slower than `check_with_json_schema()`.
    """
    # NOTE(review): `check_with_json_schema()` is the name used by the original
    # docstring; confirm that a method with this name still exists.
    validation_result = schema_rules.validate(self.schema, self.source_items.raw)
    self.save_result(validation_result)
    validation_result.show()
def validate_with_json_schema(self):
    """Run the full (``fast=False``) JSON schema validation and write it to the report.

    The check tries to report every error, although that is not guaranteed.
    Slower than `check_with_json_schema()`.
    """
    # NOTE(review): `check_with_json_schema()` is the name used by the original
    # docstring; confirm that a method with this name still exists.
    full_result = schema_rules.validate(
        self.schema,
        items_dicts=self.source_items.dicts,
        fast=False,
    )
    self.save_result(full_result)
    # Written in the long form (`short=False`).
    self.report.write_result(full_result, short=False)
def glance(self):
    """Run the fast (``fast=True``) JSON schema validation and show the result.

    In most cases validation stops after the first error of each item. Suited
    for big jobs, being about 100x faster than `validate_with_json_schema()`.
    """
    quick_result = schema_rules.validate(
        self.schema,
        items_dicts=self.source_items.dicts,
        fast=True,
    )
    self.save_result(quick_result)
    quick_result.show()
def data_quality_report(self, bucket: Optional[str] = None):
    """Create a `DataQualityReport` for the source items.

    When the report holds no results yet, a full (``fast=False``) JSON schema
    validation is run and saved before the report is created.

    Args:
        bucket: passed through to `DataQualityReport` unchanged.

    Raises:
        ValueError: if the source is a collection key or the schema is empty.
    """
    if helpers.is_collection_key(self.source):
        raise ValueError("Collections are not supported")
    if not self.schema:
        raise ValueError("Schema is empty")

    report_is_empty = not self.report.results
    if report_is_empty:
        validation_result = schema_rules.validate(
            self.schema,
            items_dicts=self.source_items.dicts,
            fast=False,
        )
        self.save_result(validation_result)

    # The constructed object is intentionally not kept, as in the original.
    DataQualityReport(self.source_items, self.schema, self.report, bucket)
def glance(self) -> None:
    """Run the fast (``fast=True``) JSON schema validation and show the result.

    In most cases only the first error of each item is returned. Suited for
    big jobs, being about 100x faster than `validate_with_json_schema()`.
    """
    raw_schema = self.schema.raw
    raw_items = self.source_items.raw
    item_keys = self.source_items.df.index
    quick_result = schema_rules.validate(raw_schema, raw_items, item_keys, fast=True)
    self.save_result(quick_result)
    quick_result.show()
def create_figures(self, items: JobItems):
    """Compute the quality figures for `items` and build the report tables.

    Results already saved in `self.report.results` ("Duplicates",
    "JSON Schema Validation", "Garbage Symbols") are reused. A rule is run
    here only when the report has no saved result for it.

    Args:
        items: job items; `items.df`, `items.raw` and `items.job` are read.
    """
    saved_results = self.report.results
    tags = self.schema.tags

    # `dict.get(key, default)` evaluates `default` before the lookup, so the
    # fallbacks below are computed explicitly and only when actually missing.
    # This avoids re-running the slow full validation for cached results.
    dups = saved_results.get("Duplicates")
    if dups is None:
        dups = duplicate_rules.find_by_tags(items.df, tags)

    price_was_now_result = price_rules.compare_was_now(items.df, tags)
    no_of_price_warns = price_was_now_result.err_items_count
    no_of_checked_price_items = price_was_now_result.items_count

    crawlera_user = api.get_crawlera_user(items.job)

    validation_result = saved_results.get("JSON Schema Validation")
    if validation_result is None:
        validation_result = schema_rules.validate(
            self.schema.raw, raw_items=items.raw, keys=items.df.index, fast=False
        )
    validation_errors = validation_result.get_errors_count()

    garbage_symbols_result = saved_results.get("Garbage Symbols")
    if garbage_symbols_result is None:
        garbage_symbols_result = garbage_symbols(items.df)

    quality_estimation, field_accuracy = generate_quality_estimation(
        items.job,
        crawlera_user,
        validation_errors,
        dups.err_items_count,
        dups.items_count,
        no_of_price_warns,
        no_of_checked_price_items,
        tested=True,
        garbage_symbols=garbage_symbols_result,
    )

    self.score_table(quality_estimation, field_accuracy)
    self.job_summary_table(items.job)
    self.rules_summary_table(
        items.df,
        validation_errors,
        tags.get("name_field", ""),
        tags.get("product_url_field", ""),
        dups.err_items_count,
        dups.items_count,
        tags.get("product_price_field", ""),
        tags.get("product_price_was_field", ""),
        no_of_checked_price_items,
        no_of_price_warns,
        garbage_symbols=garbage_symbols_result,
    )
    self.scraped_fields_coverage(items.df)
    self.coverage_by_categories(items.df, tags)
def run_schema_rules(self) -> None:
    """Run the schema-based rules and save their results.

    Does nothing when the schema is empty. Otherwise saves the JSON schema
    validation result and the tag check result. The customized rules run only
    when the tag check reports no errors.
    """
    if not self.schema:
        return

    validation_result = schema_rules.validate(self.schema, self.source_items.raw)
    self.save_result(validation_result)

    tagged_fields = sr.Tags().get(self.schema)
    if self.target_items:
        target_columns = self.target_items.df.columns.values
    else:
        target_columns = None

    check_tags_result = schema_rules.check_tags(
        self.source_items.df.columns.values,
        target_columns,
        tagged_fields,
    )
    self.save_result(check_tags_result)

    if not check_tags_result.errors:
        self.run_customized_rules(self.source_items, tagged_fields)
        self.compare_with_customized_rules(
            self.source_items, self.target_items, tagged_fields
        )
def run_schema_rules(self):
    """Run the schema-based rules and save their results.

    Does nothing when the schema is empty. Otherwise saves the JSON schema
    validation result and the tag check result. If the tag check reports
    errors, the customized rules are skipped.
    """
    if not self.schema:
        return

    validation_result = schema_rules.validate(self.schema, self.source_items.dicts)
    self.save_result(validation_result)

    json_fields = schema_tools.JsonFields(self.schema)
    # Without target items an empty array stands in for the target columns.
    if self.target_items:
        target_columns = self.target_items.df.columns.values
    else:
        target_columns = np.array([])

    source_columns = self.source_items.df.columns.values
    check_tags_result = schema_rules.check_tags(
        source_columns, target_columns, json_fields.tagged
    )
    self.save_result(check_tags_result)
    if check_tags_result.errors:
        return

    self.run_customized_rules(self.source_items, json_fields)
    self.compare_with_customized_rules(
        self.source_items, self.target_items, json_fields.tagged
    )
def create_figures(self, items):
    """Compute the quality figures for `items` and build the report tables.

    Results already saved in `self.report.results` ("JSON Schema Validation",
    "Garbage Symbols") are reused. Those rules are run here only when the
    report has no saved result for them.

    Args:
        items: job items; `items.df`, `items.raw` and `items.job` are read.
    """
    tagged_fields = Tags().get(self.schema)

    dup_items_result = duplicate_rules.check_items(items.df, tagged_fields)
    no_of_checked_duplicated_items = dup_items_result.items_count
    no_of_duplicated_items = dup_items_result.err_items_count

    dup_skus_result = duplicate_rules.check_uniqueness(items.df, tagged_fields)
    no_of_checked_skus_items = dup_skus_result.items_count
    no_of_duplicated_skus = dup_skus_result.err_items_count

    price_was_now_result = price_rules.compare_was_now(items.df, tagged_fields)
    no_of_price_warns = price_was_now_result.err_items_count
    no_of_checked_price_items = price_was_now_result.items_count

    crawlera_user = api.get_crawlera_user(items.job)

    # `dict.get(key, default)` evaluates `default` before the lookup, so the
    # fallbacks below are computed explicitly and only when actually missing.
    # This avoids re-running the slow full validation for cached results.
    saved_results = self.report.results
    validation_result = saved_results.get("JSON Schema Validation")
    if validation_result is None:
        validation_result = schema_rules.validate(
            self.schema, raw_items=items.raw, fast=False
        )
    validation_errors = validation_result.get_errors_count()

    garbage_symbols_result = saved_results.get("Garbage Symbols")
    if garbage_symbols_result is None:
        garbage_symbols_result = garbage_symbols(items)

    quality_estimation, field_accuracy = generate_quality_estimation(
        items.job,
        crawlera_user,
        validation_errors,
        no_of_duplicated_items,
        no_of_checked_duplicated_items,
        no_of_duplicated_skus,
        no_of_checked_skus_items,
        no_of_price_warns,
        no_of_checked_price_items,
        tested=True,
        garbage_symbols=garbage_symbols_result,
    )

    self.score_table(quality_estimation, field_accuracy)
    self.job_summary_table(items.job)
    self.rules_summary_table(
        items.df,
        validation_errors,
        tagged_fields.get("name_field", ""),
        tagged_fields.get("product_url_field", ""),
        no_of_checked_duplicated_items,
        no_of_duplicated_items,
        tagged_fields.get("unique", []),
        no_of_checked_skus_items,
        no_of_duplicated_skus,
        tagged_fields.get("product_price_field", ""),
        tagged_fields.get("product_price_was_field", ""),
        no_of_checked_price_items,
        no_of_price_warns,
        garbage_symbols=garbage_symbols_result,
    )
    self.scraped_fields_coverage(items.df)
    self.coverage_by_categories(items.df, tagged_fields)
def test_validate_passed(get_schema, get_raw_items):
    """`validate` on the fixture schema and items returns a result with no messages."""
    item_keys = range(len(get_raw_items))
    actual = validate(get_schema, get_raw_items, item_keys)
    expected = create_result("JSON Schema Validation", {})
    assert actual == expected
def test_validate(get_raw_items, schema, expected_messages):
    """`validate` with `schema` returns a result carrying `expected_messages`."""
    item_keys = range(len(get_raw_items))
    actual = validate(schema, get_raw_items, item_keys)
    expected = create_result("JSON Schema Validation", expected_messages)
    assert actual == expected
def test_validate_passed(get_schema, get_raw_items):
    """`validate` on the fixture schema and items returns a result with no messages."""
    item_keys = range(len(get_raw_items))
    actual = validate(get_schema, get_raw_items, item_keys)
    expected = create_result("JSON Schema Validation", {})
    assert_results_equal(actual, expected)
def test_validate(get_raw_items, schema, expected_messages):
    """`validate` with `schema` returns a result carrying `expected_messages`."""
    item_keys = range(len(get_raw_items))
    actual = validate(schema, get_raw_items, item_keys)
    expected = create_result("JSON Schema Validation", expected_messages)
    assert_results_equal(actual, expected)