def run_general_rules(self):
    """Run the garbage-symbols and field-coverage rules and save each result.

    The garbage-symbols rule receives ``self.source_items`` as a whole, while
    the coverage rule receives its DataFrame without the ``_type`` and
    ``_key`` columns.
    """
    garbage_result = garbage_symbols(self.source_items)
    self.save_result(garbage_result)

    # These two columns are excluded from the coverage check.
    excluded_columns = ["_type", "_key"]
    data_fields = self.source_items.df.drop(columns=excluded_columns)
    coverage_result = coverage_rules.check_fields_coverage(data_fields)
    self.save_result(coverage_result)
def run_general_rules(self):
    """Run the garbage-symbols, field-coverage and category rules.

    Each rule result is handed to ``self.save_result`` in that order. The
    coverage rule only sees columns whose names do not start with ``_``.
    """
    garbage_result = garbage_symbols(self.source_items.df)
    self.save_result(garbage_result)

    df = self.source_items.df

    # Columns prefixed with an underscore are left out of the coverage check.
    underscore_mask = df.columns.str.startswith("_")
    underscore_columns = df.columns[underscore_mask]
    coverage_result = coverage_rules.check_fields_coverage(
        df.drop(columns=underscore_columns)
    )
    self.save_result(coverage_result)

    categories_result = category_rules.get_categories(df)
    self.save_result(categories_result)
def test_garbage_symbols(raw_items, expected_messages, expected_items_count):
    """Compare ``garbage_symbols`` output with the expected "Garbage Symbols" result.

    The arguments are presumably supplied by pytest parametrization, which is
    not visible in this block.
    """
    actual = garbage_symbols(pd.DataFrame(raw_items))
    expected = create_result(
        "Garbage Symbols",
        expected_messages,
        items_count=expected_items_count,
    )
    assert_results_equal(actual, expected)
def create_figures(self, items, items_dicts):
    """Run the duplicate, price and garbage-symbol rules and build the report tables.

    Args:
        items: object exposing ``df`` and ``job``; it is also passed as a
            whole to ``garbage_symbols``.
        items_dicts: not referenced anywhere in this method body.
    """
    tags = Tags().get(self.schema)
    validated_count = len(items.df.index)

    # Duplicated items.
    item_dups = duplicate_rules.check_items(items.df, tags)
    checked_item_dups = item_dups.items_count
    found_item_dups = item_dups.err_items_count

    # Duplicated SKUs (uniqueness check).
    sku_dups = duplicate_rules.check_uniqueness(items.df, tags)
    checked_skus = sku_dups.items_count
    found_sku_dups = sku_dups.err_items_count

    # Was/now price comparison.
    price_check = price_rules.compare_was_now(items.df, tags)
    price_warnings = price_check.err_items_count
    checked_prices = price_check.items_count

    garbage_result = garbage_symbols(items)
    crawlera_user = api.get_crawlera_user(items.job)

    # NOTE(review): no default is given to .get(); if results behaves like a
    # dict and the key is absent, the next line raises AttributeError on
    # None - confirm the schema validation result is always saved first.
    schema_result = self.report.results.get("JSON Schema Validation")
    validation_warnings = schema_result.get_errors_count()

    quality_estimation, field_accuracy = generate_quality_estimation(
        items.job,
        crawlera_user,
        validation_warnings,
        found_item_dups,
        checked_item_dups,
        found_sku_dups,
        checked_skus,
        price_warnings,
        validated_count,
        tested=True,
        garbage_symbols=garbage_result,
    )

    cleaned = self.drop_service_columns(items.df)

    self.score_table(quality_estimation, field_accuracy)
    self.job_summary_table(items.job)

    # Tagged field names shown in the rules summary.
    name_field = tags.get("name_field", "")
    url_field = tags.get("product_url_field", "")
    unique_fields = tags.get("unique", [])
    price_field = tags.get("product_price_field", "")
    price_was_field = tags.get("product_price_was_field", "")
    self.rules_summary_table(
        cleaned,
        validation_warnings,
        name_field,
        url_field,
        checked_item_dups,
        found_item_dups,
        unique_fields,
        checked_skus,
        found_sku_dups,
        price_field,
        price_was_field,
        checked_prices,
        price_warnings,
        garbage_symbols=garbage_result,
    )

    self.scraped_fields_coverage(items.job.key, cleaned)
    self.coverage_by_categories(cleaned, tags)
def test_garbage_symbols(
    get_job_items, expected_messages, expected_items_count, expected_err_items_count
):
    """Check that ``garbage_symbols`` equals the expected "Garbage Symbols" result.

    ``get_job_items`` is passed straight to ``garbage_symbols``; it is
    presumably a pytest fixture defined elsewhere (not visible here).
    """
    actual = garbage_symbols(get_job_items)
    expected = create_result(
        "Garbage Symbols",
        expected_messages,
        items_count=expected_items_count,
        err_items_count=expected_err_items_count,
    )
    assert actual == expected
def test_garbage_symbols(
    raw_items, expected_messages, expected_items_count, expected_err_items_count
):
    """Check ``garbage_symbols`` on a DataFrame built from ``raw_items``.

    The outcome must equal a "Garbage Symbols" result created from the
    expected messages and the two expected counters.
    """
    frame = pd.DataFrame(raw_items)
    actual = garbage_symbols(frame)
    expected = create_result(
        "Garbage Symbols",
        expected_messages,
        items_count=expected_items_count,
        err_items_count=expected_err_items_count,
    )
    assert actual == expected
def run_general_rules(self):
    """Run the garbage-symbols, field-coverage and category rules.

    Each result is handed to ``self.save_result``. When the instance has a
    truthy ``uniques`` attribute, ``duplicate_rules.find_by`` is run on the
    source DataFrame as well and its result is saved last.
    """
    garbage_result = garbage_symbols(self.source_items.df)
    self.save_result(garbage_result)

    df = self.source_items.df

    # Columns prefixed with an underscore are left out of the coverage check.
    underscore_mask = df.columns.str.startswith("_")
    underscore_columns = df.columns[underscore_mask]
    coverage_result = coverage_rules.check_fields_coverage(
        df.drop(columns=underscore_columns)
    )
    self.save_result(coverage_result)

    categories_result = category_rules.get_categories(df)
    self.save_result(categories_result)

    # The duplicate check is optional: skip it when `uniques` is missing or empty.
    if not getattr(self, "uniques", None):
        return
    duplicates_result = duplicate_rules.find_by(self.source_items.df, self.uniques)
    self.save_result(duplicates_result)
def create_figures(self, items: JobItems):
    """Collect rule results for ``items`` and build the report tables.

    Results already stored in ``self.report.results`` under "Duplicates",
    "JSON Schema Validation" and "Garbage Symbols" are reused. A rule is
    executed only when its result is absent.

    Args:
        items: job items exposing ``df``, ``raw`` and ``job``.
    """
    not_saved = object()

    def saved_or_computed(rule_name, compute):
        """Return the saved result for ``rule_name``; call ``compute`` only if absent."""
        # `results.get(name, compute())` would evaluate the fallback before
        # the lookup and so re-run the rule on every call; the sentinel keeps
        # the fallback lazy.
        saved = self.report.results.get(rule_name, not_saved)
        return compute() if saved is not_saved else saved

    dups = saved_or_computed(
        "Duplicates",
        lambda: duplicate_rules.find_by_tags(items.df, self.schema.tags),
    )

    price_was_now_result = price_rules.compare_was_now(items.df, self.schema.tags)
    no_of_price_warns = price_was_now_result.err_items_count
    no_of_checked_price_items = price_was_now_result.items_count

    crawlera_user = api.get_crawlera_user(items.job)

    validation_errors = saved_or_computed(
        "JSON Schema Validation",
        lambda: schema_rules.validate(
            self.schema.raw, raw_items=items.raw, keys=items.df.index, fast=False
        ),
    ).get_errors_count()

    garbage_symbols_result = saved_or_computed(
        "Garbage Symbols", lambda: garbage_symbols(items.df)
    )

    quality_estimation, field_accuracy = generate_quality_estimation(
        items.job,
        crawlera_user,
        validation_errors,
        dups.err_items_count,
        dups.items_count,
        no_of_price_warns,
        no_of_checked_price_items,
        tested=True,
        garbage_symbols=garbage_symbols_result,
    )

    self.score_table(quality_estimation, field_accuracy)
    self.job_summary_table(items.job)
    self.rules_summary_table(
        items.df,
        validation_errors,
        self.schema.tags.get("name_field", ""),
        self.schema.tags.get("product_url_field", ""),
        dups.err_items_count,
        dups.items_count,
        self.schema.tags.get("product_price_field", ""),
        self.schema.tags.get("product_price_was_field", ""),
        no_of_checked_price_items,
        no_of_price_warns,
        garbage_symbols=garbage_symbols_result,
    )
    self.scraped_fields_coverage(items.df)
    self.coverage_by_categories(items.df, self.schema.tags)
def create_figures(self, items):
    """Collect rule results for ``items`` and build the report tables.

    Results already stored in ``self.report.results`` under
    "JSON Schema Validation" and "Garbage Symbols" are reused. Those rules
    are executed only when their result is absent. The duplicate and price
    rules are always run.

    Args:
        items: object exposing ``df``, ``raw`` and ``job``; it is also passed
            as a whole to ``garbage_symbols``.
    """
    not_saved = object()

    def saved_or_computed(rule_name, compute):
        """Return the saved result for ``rule_name``; call ``compute`` only if absent."""
        # `results.get(name, compute())` would evaluate the fallback before
        # the lookup and so re-run the rule on every call; the sentinel keeps
        # the fallback lazy.
        saved = self.report.results.get(rule_name, not_saved)
        return compute() if saved is not_saved else saved

    tagged_fields = Tags().get(self.schema)

    dup_items_result = duplicate_rules.check_items(items.df, tagged_fields)
    no_of_checked_duplicated_items = dup_items_result.items_count
    no_of_duplicated_items = dup_items_result.err_items_count

    dup_skus_result = duplicate_rules.check_uniqueness(items.df, tagged_fields)
    no_of_checked_skus_items = dup_skus_result.items_count
    no_of_duplicated_skus = dup_skus_result.err_items_count

    price_was_now_result = price_rules.compare_was_now(items.df, tagged_fields)
    no_of_price_warns = price_was_now_result.err_items_count
    no_of_checked_price_items = price_was_now_result.items_count

    crawlera_user = api.get_crawlera_user(items.job)

    validation_errors = saved_or_computed(
        "JSON Schema Validation",
        lambda: schema_rules.validate(self.schema, raw_items=items.raw, fast=False),
    ).get_errors_count()

    garbage_symbols_result = saved_or_computed(
        "Garbage Symbols", lambda: garbage_symbols(items)
    )

    quality_estimation, field_accuracy = generate_quality_estimation(
        items.job,
        crawlera_user,
        validation_errors,
        no_of_duplicated_items,
        no_of_checked_duplicated_items,
        no_of_duplicated_skus,
        no_of_checked_skus_items,
        no_of_price_warns,
        no_of_checked_price_items,
        tested=True,
        garbage_symbols=garbage_symbols_result,
    )

    self.score_table(quality_estimation, field_accuracy)
    self.job_summary_table(items.job)
    self.rules_summary_table(
        items.df,
        validation_errors,
        tagged_fields.get("name_field", ""),
        tagged_fields.get("product_url_field", ""),
        no_of_checked_duplicated_items,
        no_of_duplicated_items,
        tagged_fields.get("unique", []),
        no_of_checked_skus_items,
        no_of_duplicated_skus,
        tagged_fields.get("product_price_field", ""),
        tagged_fields.get("product_price_was_field", ""),
        no_of_checked_price_items,
        no_of_price_warns,
        garbage_symbols=garbage_symbols_result,
    )
    self.scraped_fields_coverage(items.df)
    self.coverage_by_categories(items.df, tagged_fields)