예제 #1
0
    def show(self, short: bool = False, keys_limit: int = 10):
        """Write the summary and the rule details, then display each figure.

        Args:
            short: forwarded to `Report.write_rule_details`
            keys_limit: forwarded to `Report.write_rule_details`
        """
        # Function-scope import kept from the original code (presumably to
        # avoid a circular import - not verifiable from this snippet).
        from arche.report import Report

        Report.write_summary(self)
        Report.write_rule_details(self, short=short, keys_limit=keys_limit)
        for figure in self.figures:
            pio.show(figure)
예제 #2
0
    def show(self, short: bool = False, keys_limit: int = 10):
        """Clear the output, write summary and rule details, show each figure.

        Args:
            short: forwarded to `Report.write_rule_details`
            keys_limit: forwarded to `Report.write_rule_details`
        """
        # Function-scope import kept from the original code (presumably to
        # avoid a circular import - not verifiable from this snippet).
        from arche.report import Report

        # Drop whatever was displayed before so the report starts clean.
        IPython.display.clear_output()
        Report.write_summary(self)
        Report.write_rule_details(self, short=short, keys_limit=keys_limit)
        for figure in self.figures:
            figure.show()
예제 #3
0
class Arche:
    def __init__(
        self,
        source: str,
        schema: Optional[SchemaSource] = None,
        target: Optional[str] = None,
        start: int = 0,
        count: Optional[int] = None,
        filters: Optional[api.Filters] = None,
        expand: bool = True,
    ):
        """Store the validation settings and create an empty report.

        No items are read here; the item caches start out as None.

        Args:
            source: a data source to validate. Supports job or collection keys
            schema: a JSON schema source used to run validation
            target: a data source to compare with; when it equals `source` a
                warning is logged and the target is dropped
            start: an item number to start reading from
            count: the amount of items to read from start
            filters: Scrapinghub filtering
            expand: if enabled, use flattened data in garbage rules, affects
                performance, see flatten_df
        """
        self.source = source
        self.target = target
        if target == source:
            # Comparing a source with itself is pointless, so drop the target.
            logger.warning(
                "'target' is the same as 'source', and will be ignored")
            self.target = None

        self.start, self.count = start, count
        self.filters, self.expand = filters, expand

        self.schema_source = schema
        # The schema is resolved eagerly only when a source was provided.
        self._schema = get_schema(schema) if schema else None

        self._source_items = None
        self._target_items = None

        self.report = Report()

    @property
    def source_items(self):
        if not self._source_items:
            self._source_items = self.get_items(self.source, self.start,
                                                self.count, self.filters,
                                                self.expand)
        return self._source_items

    @property
    def target_items(self):
        if not self.target:
            return None
        if not self._target_items:
            self._target_items = self.get_items(self.target, self.start,
                                                self.count, self.filters,
                                                self.expand)
        return self._target_items

    @property
    def schema(self):
        if not self._schema and self.schema_source:
            self._schema = get_schema(self.schema_source)
        return self._schema

    @schema.setter
    def schema(self, schema_source):
        self.schema_source = schema_source
        self._schema = get_schema(schema_source)

    @staticmethod
    def get_items(
        source: str,
        start: int,
        count: Optional[int],
        filters: Optional[api.Filters],
        expand: bool,
    ) -> Union[JobItems, CollectionItems]:
        """Build the items object matching the kind of key in `source`.

        Args:
            source: a job or collection key
            start: item number to start from; only passed on for job keys
            count: passed on as `count`
            filters: passed on as `filters`
            expand: passed on as `expand`

        Returns:
            `JobItems` for a job key, `CollectionItems` for a collection key.

        Raises:
            ValueError: if `source` is neither kind of key, or if a truthy
                `start` is given together with a collection key.
        """
        if helpers.is_job_key(source):
            return JobItems(
                key=source,
                start=start,
                count=count,
                filters=filters,
                expand=expand,
            )

        if not helpers.is_collection_key(source):
            raise ValueError(
                f"'{source}' is not a valid job or collection key")

        if start:
            raise ValueError(
                "Collections API does not support 'start' parameter")

        return CollectionItems(
            key=source,
            count=count,
            filters=filters,
            expand=expand,
        )

    def save_result(self, rule_result):
        self.report.save(rule_result)

    def basic_json_schema(self, items_numbers: Optional[List[int]] = None):
        """Call the module-level `basic_json_schema` helper for `self.source`.

        Previously `items_numbers` was accepted but never used; it is now
        forwarded to the helper when the caller provides it.

        Args:
            items_numbers: item numbers passed to the helper as its second
                positional argument when given. NOTE(review): the helper's
                signature is not visible here - confirm it accepts this
                argument.
        """
        # Inside a method the bare name resolves to the module-level function,
        # not to this method, so this is not a recursive call.
        if items_numbers is None:
            # Keep the exact call made before for callers passing nothing.
            basic_json_schema(self.source)
        else:
            basic_json_schema(self.source, items_numbers)

    def report_all(self):
        """Run every rule, then write the summary followed by short details."""
        self.run_all_rules()

        report = self.report
        report.write_summary()
        # Two newlines between the summary and the details.
        report.write("\n" * 2)
        report.write_details(short=True)

    def run_all_rules(self):
        """Run metadata rules (job sources only), then all other rule groups.

        Metadata is compared against the target only when the source is a
        job key and target items are available.
        """
        source_is_job = helpers.is_job_key(self.source_items.key)
        if source_is_job:
            self.check_metadata(self.source_items.job)
        if source_is_job and self.target_items:
            self.compare_metadata(self.source_items.job,
                                  self.target_items.job)

        rule_groups = (
            self.run_general_rules,
            self.run_comparison_rules,
            self.run_schema_rules,
        )
        for run_group in rule_groups:
            run_group()

    def data_quality_report(self, bucket: Optional[str] = None):
        """Create a `DataQualityReport` for the source items.

        When the report holds no results yet, a full (`fast=False`) schema
        validation is run and saved first.

        Args:
            bucket: passed through to `DataQualityReport`

        Raises:
            ValueError: if the source is a collection key, or if there is no
                schema.
        """
        if helpers.is_collection_key(self.source):
            raise ValueError("Collections are not supported")
        if not self.schema:
            raise ValueError("Schema is empty")

        if not self.report.results:
            validation = schema_rules.validate(
                self.schema,
                items_dicts=self.source_items.dicts,
                fast=False,
            )
            self.save_result(validation)

        DataQualityReport(self.source_items, self.schema, self.report, bucket)

    # NOTE: lru_cache on a method keys the cache on `self`, so while the entry
    # is cached a repeated call on the same instance does not run or save the
    # rules again. The cache also keeps a reference to the instance.
    @lru_cache(maxsize=32)
    def run_general_rules(self):
        """Save the garbage-symbols and field-coverage results for the source."""
        garbage_result = garbage_symbols(self.source_items)
        self.save_result(garbage_result)

        coverage_result = coverage_rules.check_fields_coverage(
            self.source_items.df)
        self.save_result(coverage_result)

    def validate_with_json_schema(self):
        """Run JSON schema check and output results. It will try to find all errors, but
        there are no guarantees. Slower than `check_with_json_schema()`
        """
        res = schema_rules.validate(self.schema,
                                    items_dicts=self.source_items.dicts,
                                    fast=False)
        self.save_result(res)
        self.report.write_result(res, short=False)

    def glance(self):
        """Run JSON schema check and output results. In most cases it will stop after
        the first error per item. Usable for big jobs as it's about 100x faster than
        `validate_with_json_schema()`.
        """
        res = schema_rules.validate(self.schema,
                                    items_dicts=self.source_items.dicts,
                                    fast=True)
        self.save_result(res)
        self.report.write_result(res, short=False)

    def run_schema_rules(self):
        if not self.schema:
            return

        self.save_result(
            schema_rules.validate(self.schema, self.source_items.dicts))

        json_fields = schema_tools.JsonFields(self.schema)
        target_columns = (self.target_items.df.columns.values
                          if self.target_items else np.array([]))

        check_tags_result = schema_rules.check_tags(
            self.source_items.df.columns.values, target_columns,
            json_fields.tagged)
        self.save_result(check_tags_result)
        if check_tags_result.errors:
            return

        self.run_customized_rules(self.source_items, json_fields)
        self.compare_with_customized_rules(self.source_items,
                                           self.target_items,
                                           json_fields.tagged)

    # NOTE: lru_cache keys on (self, items, fields), so all three must be
    # hashable; a repeated call with the same arguments is served from the
    # cache and does not run or save the rules again while the entry is cached.
    @lru_cache(maxsize=32)
    def run_customized_rules(self, items, fields):
        """Save the results of the rules driven by the schema's tagged fields.

        Args:
            items: object whose `df` attribute is passed to every rule
            fields: object whose `tagged` attribute is passed to every rule
        """
        tagged_rules = (
            price_rules.compare_was_now,
            duplicate_rules.check_uniqueness,
            duplicate_rules.check_items,
            category_coverage.get_coverage_per_category,
        )
        for rule in tagged_rules:
            self.save_result(rule(items.df, fields.tagged))

    # NOTE: lru_cache keys on (self, job), so both must be hashable; a repeated
    # call with the same arguments is served from the cache and does not run
    # or save the rules again while the entry is cached.
    @lru_cache(maxsize=32)
    def check_metadata(self, job):
        """Save the outcome, errors and response-ratio checks for `job`."""
        job_rules = (
            metadata_rules.check_outcome,
            metadata_rules.check_errors,
            metadata_rules.check_response_ratio,
        )
        for rule in job_rules:
            self.save_result(rule(job))

    # NOTE: lru_cache keys on (self, source_job, target_job), so all three must
    # be hashable; a repeated call with the same arguments is served from the
    # cache and does not run or save the rules again while the entry is cached.
    @lru_cache(maxsize=32)
    def compare_metadata(self, source_job, target_job):
        """Save the results of every job-to-job comparison rule, in order.

        Args:
            source_job: first argument of each comparison rule
            target_job: second argument of each comparison rule
        """
        comparison_rules = (
            metadata_rules.compare_spider_names,
            metadata_rules.compare_errors,
            metadata_rules.compare_number_of_scraped_items,
            coverage_rules.compare_fields_counts,
            metadata_rules.compare_response_ratio,
            metadata_rules.compare_runtime,
            metadata_rules.compare_finish_time,
        )
        for rule in comparison_rules:
            self.save_result(rule(source_job, target_job))

    @lru_cache(maxsize=32)
    def run_comparison_rules(self):
        if not self.target_items:
            return
        self.save_result(
            coverage_rules.compare_scraped_fields(self.source_items.df,
                                                  self.target_items.df))
        self.save_result(
            compare_boolean_fields(self.source_items.df, self.target_items.df))

    def compare_with_customized_rules(self, source_items, target_items,
                                      tagged_fields):
        if not target_items:
            return
        self.save_result(
            category_coverage.compare_coverage_per_category(
                source_items.key,
                target_items.key,
                source_items.df,
                target_items.df,
                tagged_fields,
            ))
        self.save_result(
            price_rules.compare_prices_for_same_urls(source_items.df,
                                                     target_items.df,
                                                     tagged_fields))
        self.save_result(
            price_rules.compare_names_for_same_urls(source_items.df,
                                                    target_items.df,
                                                    tagged_fields))
        self.save_result(
            price_rules.compare_prices_for_same_names(source_items.df,
                                                      target_items.df,
                                                      tagged_fields))