Ejemplo n.º 1
0
 def validate_with_json_schema(self) -> None:
     """Validate the raw source items against the JSON schema and show the outcome.

     The check attempts to collect every error, although completeness is not
     guaranteed. It is slower than `check_with_json_schema()`. The result is
     stored through `save_result()` before being displayed.
     """
     schema = self.schema
     raw_items = self.source_items.raw
     result = schema_rules.validate(schema, raw_items)
     self.save_result(result)
     result.show()
Ejemplo n.º 2
0
 def validate_with_json_schema(self):
     """Validate the source item dicts against the JSON schema and report the outcome.

     Runs with ``fast=False``: the check attempts to collect every error,
     although completeness is not guaranteed. It is slower than
     `check_with_json_schema()`. The result is stored through `save_result()`
     and then written to the report in its non-short form.
     """
     schema = self.schema
     items = self.source_items.dicts
     result = schema_rules.validate(schema, items_dicts=items, fast=False)
     self.save_result(result)
     self.report.write_result(result, short=False)
Ejemplo n.º 3
0
 def glance(self):
     """Quickly check the source item dicts against the JSON schema and show the outcome.

     Runs with ``fast=True``, which in most cases stops after the first error
     found for an item. Suited to big jobs: about 100x faster than
     `validate_with_json_schema()`. The result is stored through
     `save_result()` before being displayed.
     """
     schema = self.schema
     items = self.source_items.dicts
     result = schema_rules.validate(schema, items_dicts=items, fast=True)
     self.save_result(result)
     result.show()
Ejemplo n.º 4
0
    def data_quality_report(self, bucket: Optional[str] = None):
        """Construct a ``DataQualityReport`` for the source items.

        When the report holds no results yet, a full (``fast=False``) JSON
        schema validation is run and saved first. The constructed
        ``DataQualityReport`` object is not returned.

        Args:
            bucket: Forwarded unchanged to ``DataQualityReport``.

        Raises:
            ValueError: If ``helpers.is_collection_key(self.source)`` is true,
                or if the schema is empty.
        """
        source_is_collection = helpers.is_collection_key(self.source)
        if source_is_collection:
            raise ValueError("Collections are not supported")
        if not self.schema:
            raise ValueError("Schema is empty")

        if not self.report.results:
            # Nothing saved yet: run the full validation before building the report.
            validation = schema_rules.validate(
                self.schema, items_dicts=self.source_items.dicts, fast=False
            )
            self.save_result(validation)

        DataQualityReport(self.source_items, self.schema, self.report, bucket)
Ejemplo n.º 5
0
 def glance(self) -> None:
     """Quickly check the raw source items against the raw JSON schema and show the outcome.

     Runs with ``fast=True``, which in most cases returns only the first error
     found for an item. Suited to big jobs: about 100x faster than
     `validate_with_json_schema()`. The index of the source items dataframe is
     passed along as the third argument. The result is stored through
     `save_result()` before being displayed.
     """
     raw_schema = self.schema.raw
     raw_items = self.source_items.raw
     item_keys = self.source_items.df.index
     result = schema_rules.validate(raw_schema, raw_items, item_keys, fast=True)
     self.save_result(result)
     result.show()
Ejemplo n.º 6
0
    def create_figures(self, items: JobItems):
        """Collect rule results for ``items`` and pass them to the figure methods.

        Results named "Duplicates", "JSON Schema Validation" and
        "Garbage Symbols" are taken from ``self.report.results`` when they are
        already stored there; the corresponding rule is executed only when its
        result is missing. (Supplying the rule call directly as the
        ``dict.get`` default would execute it on every call, because the
        default argument is evaluated before the lookup happens.)

        Args:
            items: Job items; ``items.df``, ``items.raw`` and ``items.job``
                are read.
        """
        # Sentinel separating "nothing stored" from any stored value.
        missing = object()

        def stored_or_run(name, run_rule):
            # Return the saved result for `name`; call `run_rule` only on a miss.
            stored = self.report.results.get(name, missing)
            return run_rule() if stored is missing else stored

        dups = stored_or_run(
            "Duplicates",
            lambda: duplicate_rules.find_by_tags(items.df, self.schema.tags),
        )

        price_was_now_result = price_rules.compare_was_now(items.df, self.schema.tags)
        no_of_price_warns = price_was_now_result.err_items_count
        no_of_checked_price_items = price_was_now_result.items_count

        crawlera_user = api.get_crawlera_user(items.job)

        validation_errors = stored_or_run(
            "JSON Schema Validation",
            lambda: schema_rules.validate(
                self.schema.raw, raw_items=items.raw, keys=items.df.index, fast=False
            ),
        ).get_errors_count()

        garbage_symbols_result = stored_or_run(
            "Garbage Symbols", lambda: garbage_symbols(items.df)
        )

        quality_estimation, field_accuracy = generate_quality_estimation(
            items.job,
            crawlera_user,
            validation_errors,
            dups.err_items_count,
            dups.items_count,
            no_of_price_warns,
            no_of_checked_price_items,
            tested=True,
            garbage_symbols=garbage_symbols_result,
        )

        self.score_table(quality_estimation, field_accuracy)
        self.job_summary_table(items.job)
        self.rules_summary_table(
            items.df,
            validation_errors,
            self.schema.tags.get("name_field", ""),
            self.schema.tags.get("product_url_field", ""),
            dups.err_items_count,
            dups.items_count,
            self.schema.tags.get("product_price_field", ""),
            self.schema.tags.get("product_price_was_field", ""),
            no_of_checked_price_items,
            no_of_price_warns,
            garbage_symbols=garbage_symbols_result,
        )
        self.scraped_fields_coverage(items.df)
        self.coverage_by_categories(items.df, self.schema.tags)
Ejemplo n.º 7
0
    def run_schema_rules(self) -> None:
        """Run the schema-based rules; do nothing when no schema is set.

        Steps, in order:
          1. Validate the raw source items against the schema and save the result.
          2. Check the schema tags against the source columns (and the target
             columns, or ``None`` when there are no target items) and save
             that result.
          3. Only if the tag check reported no errors, run the customized
             rules and the customized comparison rules.
        """
        if not self.schema:
            return

        validation = schema_rules.validate(self.schema, self.source_items.raw)
        self.save_result(validation)

        tagged_fields = sr.Tags().get(self.schema)
        if self.target_items:
            target_columns = self.target_items.df.columns.values
        else:
            target_columns = None

        source_columns = self.source_items.df.columns.values
        tags_check = schema_rules.check_tags(
            source_columns, target_columns, tagged_fields
        )
        self.save_result(tags_check)

        # Tag errors make the tag-driven rules below pointless, so skip them.
        if not tags_check.errors:
            self.run_customized_rules(self.source_items, tagged_fields)
            self.compare_with_customized_rules(
                self.source_items, self.target_items, tagged_fields
            )
Ejemplo n.º 8
0
    def run_schema_rules(self):
        """Run the schema-based rules; do nothing when no schema is set.

        Steps, in order:
          1. Validate the source item dicts against the schema and save the result.
          2. Check the tagged schema fields against the source columns (and
             the target columns, or an empty array when there are no target
             items) and save that result.
          3. Only if the tag check reported no errors, run the customized
             rules and the customized comparison rules.
        """
        if not self.schema:
            return

        validation = schema_rules.validate(self.schema, self.source_items.dicts)
        self.save_result(validation)

        json_fields = schema_tools.JsonFields(self.schema)
        if self.target_items:
            target_columns = self.target_items.df.columns.values
        else:
            target_columns = np.array([])

        source_columns = self.source_items.df.columns.values
        tags_check = schema_rules.check_tags(
            source_columns, target_columns, json_fields.tagged
        )
        self.save_result(tags_check)

        # Tag errors make the tag-driven rules below pointless, so skip them.
        if not tags_check.errors:
            self.run_customized_rules(self.source_items, json_fields)
            self.compare_with_customized_rules(
                self.source_items, self.target_items, json_fields.tagged
            )
Ejemplo n.º 9
0
    def create_figures(self, items):
        """Collect rule results for ``items`` and pass them to the figure methods.

        The duplicate, uniqueness and price rules are always executed. Results
        named "JSON Schema Validation" and "Garbage Symbols" are taken from
        ``self.report.results`` when they are already stored there; the
        corresponding rule is executed only when its result is missing.
        (Supplying the rule call directly as the ``dict.get`` default would
        execute it on every call, because the default argument is evaluated
        before the lookup happens.)

        Args:
            items: Job items; ``items.df``, ``items.raw`` and ``items.job``
                are read, and ``items`` itself is passed to ``garbage_symbols``.
        """
        # Sentinel separating "nothing stored" from any stored value.
        missing = object()

        def stored_or_run(name, run_rule):
            # Return the saved result for `name`; call `run_rule` only on a miss.
            stored = self.report.results.get(name, missing)
            return run_rule() if stored is missing else stored

        tagged_fields = Tags().get(self.schema)

        dup_items_result = duplicate_rules.check_items(items.df, tagged_fields)
        no_of_checked_duplicated_items = dup_items_result.items_count
        no_of_duplicated_items = dup_items_result.err_items_count

        dup_skus_result = duplicate_rules.check_uniqueness(
            items.df, tagged_fields)
        no_of_checked_skus_items = dup_skus_result.items_count
        no_of_duplicated_skus = dup_skus_result.err_items_count

        price_was_now_result = price_rules.compare_was_now(
            items.df, tagged_fields)
        no_of_price_warns = price_was_now_result.err_items_count
        no_of_checked_price_items = price_was_now_result.items_count

        crawlera_user = api.get_crawlera_user(items.job)

        validation_errors = stored_or_run(
            "JSON Schema Validation",
            lambda: schema_rules.validate(self.schema, raw_items=items.raw,
                                          fast=False),
        ).get_errors_count()

        garbage_symbols_result = stored_or_run(
            "Garbage Symbols", lambda: garbage_symbols(items))

        quality_estimation, field_accuracy = generate_quality_estimation(
            items.job,
            crawlera_user,
            validation_errors,
            no_of_duplicated_items,
            no_of_checked_duplicated_items,
            no_of_duplicated_skus,
            no_of_checked_skus_items,
            no_of_price_warns,
            no_of_checked_price_items,
            tested=True,
            garbage_symbols=garbage_symbols_result,
        )

        self.score_table(quality_estimation, field_accuracy)
        self.job_summary_table(items.job)
        self.rules_summary_table(
            items.df,
            validation_errors,
            tagged_fields.get("name_field", ""),
            tagged_fields.get("product_url_field", ""),
            no_of_checked_duplicated_items,
            no_of_duplicated_items,
            tagged_fields.get("unique", []),
            no_of_checked_skus_items,
            no_of_duplicated_skus,
            tagged_fields.get("product_price_field", ""),
            tagged_fields.get("product_price_was_field", ""),
            no_of_checked_price_items,
            no_of_price_warns,
            garbage_symbols=garbage_symbols_result,
        )
        self.scraped_fields_coverage(items.df)
        self.coverage_by_categories(items.df, tagged_fields)
Ejemplo n.º 10
0
def test_validate_passed(get_schema, get_raw_items):
    """Validating ``get_raw_items`` against ``get_schema`` gives a result with no messages.

    Items are keyed by their position (``0 .. len - 1``).
    """
    item_keys = range(len(get_raw_items))
    actual = validate(get_schema, get_raw_items, item_keys)
    expected = create_result("JSON Schema Validation", {})
    assert actual == expected
Ejemplo n.º 11
0
def test_validate(get_raw_items, schema, expected_messages):
    """Validating ``get_raw_items`` against ``schema`` gives ``expected_messages``.

    Items are keyed by their position (``0 .. len - 1``).
    """
    item_keys = range(len(get_raw_items))
    actual = validate(schema, get_raw_items, item_keys)
    expected = create_result("JSON Schema Validation", expected_messages)
    assert actual == expected
Ejemplo n.º 12
0
def test_validate_passed(get_schema, get_raw_items):
    """Validating ``get_raw_items`` against ``get_schema`` gives a result with no messages.

    Items are keyed by their position (``0 .. len - 1``); the comparison is
    delegated to ``assert_results_equal``.
    """
    item_keys = range(len(get_raw_items))
    actual = validate(get_schema, get_raw_items, item_keys)
    expected = create_result("JSON Schema Validation", {})
    assert_results_equal(actual, expected)
Ejemplo n.º 13
0
def test_validate(get_raw_items, schema, expected_messages):
    """Validating ``get_raw_items`` against ``schema`` gives ``expected_messages``.

    Items are keyed by their position (``0 .. len - 1``); the comparison is
    delegated to ``assert_results_equal``.
    """
    item_keys = range(len(get_raw_items))
    actual = validate(schema, get_raw_items, item_keys)
    expected = create_result("JSON Schema Validation", expected_messages)
    assert_results_equal(actual, expected)