def test_find_by_name_url(data, tagged_fields, expected_messages):
    """Check `duplicates.find_by_name_url` against an expected result.

    A DataFrame is built from `data`, the rule is run on it with
    `tagged_fields`, and the outcome is compared with a result created from
    `expected_messages` whose `items_count` is the number of DataFrame rows.
    """
    frame = pd.DataFrame(data)
    actual = duplicates.find_by_name_url(frame, tagged_fields)
    expected = create_result(
        "Duplicates By **name_field, product_url_field** Tags",
        expected_messages,
        items_count=len(frame),
    )
    assert actual == expected
def run_customized_rules(self, items, tagged_fields):
    """Run the tag-driven rules on `items.df` and save every result.

    The price, unique-tag and name/url duplicate rules all receive the
    DataFrame together with `tagged_fields`. The category coverage rule
    receives the DataFrame and the "category" tagged fields extended with
    `self.schema.enums`.
    """
    # Rules sharing the (df, tagged_fields) signature, in execution order.
    tagged_rules = (
        price_rules.compare_was_now,
        duplicate_rules.find_by_unique,
        duplicate_rules.find_by_name_url,
    )
    for rule in tagged_rules:
        self.save_result(rule(items.df, tagged_fields))

    category_columns = tagged_fields.get("category", []) + self.schema.enums
    coverage = category_rules.get_coverage_per_category(
        items.df, category_columns)
    self.save_result(coverage)
def create_figures(self, items: CloudItems):
    """Build the score, summary and coverage figures for `items`.

    Results already present in `self.report.results` are reused. A rule is
    executed only when its result is missing there, so stored results no
    longer trigger a second run of the same rule. The price comparison rule
    is always executed on `items.df`.

    Args:
        items: Items whose `df`, `raw` and `job` attributes feed the rules
            and the generated tables.
    """
    stored_results = self.report.results
    df = items.df
    tags = self.schema.tags

    def stored_or_run(name, rule):
        """Return the stored result `name`, calling `rule` only on a miss."""
        # `results.get(name, rule())` would evaluate the rule eagerly even
        # when the result is already stored, hence the lazy callable.
        result = stored_results.get(name)
        return rule() if result is None else result

    name_url_dups = stored_or_run(
        "Duplicates By **name_field, product_url_field** Tags",
        lambda: duplicate_rules.find_by_name_url(df, tags),
    )
    uniques = stored_or_run(
        "Duplicates By **unique** Tag",
        lambda: duplicate_rules.find_by_unique(df, tags),
    )

    price_was_now_result = price_rules.compare_was_now(df, tags)
    no_of_price_warns = price_was_now_result.err_items_count
    no_of_checked_price_items = price_was_now_result.items_count

    crawlera_user = api.get_crawlera_user(items.job)

    validation_errors = stored_or_run(
        "JSON Schema Validation",
        lambda: schema_rules.validate(
            self.schema.raw, raw_items=items.raw, keys=df.index, fast=False
        ),
    ).get_errors_count()
    garbage_symbols_result = stored_or_run(
        "Garbage Symbols", lambda: garbage_symbols(df)
    )

    quality_estimation, field_accuracy = generate_quality_estimation(
        items.job,
        crawlera_user,
        validation_errors,
        name_url_dups.err_items_count,
        name_url_dups.items_count,
        uniques.err_items_count,
        uniques.items_count,
        no_of_price_warns,
        no_of_checked_price_items,
        tested=True,
        garbage_symbols=garbage_symbols_result,
    )

    self.score_table(quality_estimation, field_accuracy)
    self.job_summary_table(items.job)
    self.rules_summary_table(
        df,
        validation_errors,
        tags.get("name_field", ""),
        tags.get("product_url_field", ""),
        name_url_dups.items_count,
        name_url_dups.err_items_count,
        tags.get("unique", []),
        uniques.items_count,
        uniques.err_items_count,
        tags.get("product_price_field", ""),
        tags.get("product_price_was_field", ""),
        no_of_checked_price_items,
        no_of_price_warns,
        garbage_symbols=garbage_symbols_result,
    )
    self.scraped_fields_coverage(df)
    self.coverage_by_categories(df, tags)