def test_write_details(mocker, get_df, capsys, messages, expected_details):
    mock_pio_show = mocker.patch("plotly.io.show", autospec=True)
    r = Report()
    for m in messages:
        result = create_result(*m, stats=[get_df])
        r.save(result)
    r.write_details()
    mock_pio_show.assert_called_with(result.figures[0])
    assert capsys.readouterr().out == expected_details
def test_write_details(mocker, get_df, capsys, messages, expected_details):
    mock_pio_show = mocker.patch("plotly.io.show", autospec=True)
    md_mock = mocker.patch("arche.report.display_markdown", autospec=True)
    r = Report()
    for m in messages:
        result = create_result(*m, stats=[get_df])
        r.save(result)
    r.write_details()
    mock_pio_show.assert_called_with(result.figures[0])
    calls = [mocker.call(e) for e in expected_details]
    md_mock.assert_has_calls(calls, any_order=True)
def test_report_call(mocker, get_df, capsys, messages, expected_details):
    mocked_display = mocker.patch("arche.report.display_html", autospec=True)
    r = Report()
    for m in messages:
        result = create_result(*m, stats=[get_df])
        r.save(result)
    r()
    report_html = get_report_from_iframe(mocked_display.mock_calls[0][1][0])
    assert report_html.count("Plotly.newPlot") == 2
    assert report_html.count("rule name here - INFO") == 2
    assert report_html.count("other result there - INFO") == 2
class Arche:
    def __init__(
        self,
        source: Union[str, pd.DataFrame, RawItems],
        schema: Optional[SchemaSource] = None,
        target: Optional[Union[str, pd.DataFrame]] = None,
        count: Optional[int] = None,
        start: Union[str, int] = None,
        filters: Optional[api.Filters] = None,
        expand: bool = None,
    ):
        """
        Args:
            source: a data source to validate, accepts job keys, pandas df, lists
            schema: a JSON schema source used to run validation
            target: a data source to compare with
            count: the amount of items to read from start
            start: an item key to start reading from
            filters: Scrapinghub filtering, see
                https://python-scrapinghub.readthedocs.io/en/latest/client/apidocs.html#scrapinghub.client.items.Items  # noqa
        """
        if expand:
            maintenance.deprecate(
                "'expand' parameter is deprecated and will be removed in the next 0.3.7"
                " release. See CHANGES.md for more details.",
                gone_in="0.3.7",
            )
        if isinstance(source, str) and target == source:
            raise ValueError(
                "'target' is equal to 'source'. Data to compare should have different sources."
            )
        if isinstance(source, pd.DataFrame):
            logger.warning(
                "Pandas stores `NA` (missing) data differently, "
                "which might affect schema validation. "
                "Should you care, consider passing raw data in array-like types.\n"
                "For more details, see https://pandas.pydata.org/pandas-docs/"
                "stable/user_guide/gotchas.html#nan-integer-na-values-and-na-type-promotions"
            )
        self.source = source
        self._schema = None
        self.schema_source = None
        if schema:
            self.schema = schema
        self.target = target
        self.start = start
        self.count = count
        self.filters = filters
        self._source_items = None
        self._target_items = None
        self.report = Report()

    @property
    def source_items(self):
        if not self._source_items:
            self._source_items = self.get_items(
                self.source, self.count, self.start, self.filters
            )
        return self._source_items

    @property
    def target_items(self):
        if self.target is None:
            return None
        if not self._target_items:
            self._target_items = self.get_items(
                self.target, self.count, self.start, self.filters
            )
        return self._target_items

    @property
    def schema(self):
        if not self._schema and self.schema_source:
            self._schema = Schema(self.schema_source)
        return self._schema

    @schema.setter
    def schema(self, schema_source):
        self.schema_source = schema_source
        self._schema = Schema(schema_source)

    @staticmethod
    def get_items(
        source: Union[str, pd.DataFrame, RawItems],
        count: Optional[int],
        start: Optional[str],
        filters: Optional[api.Filters],
    ) -> Items:
        if isinstance(source, pd.DataFrame):
            return Items.from_df(source)
        elif isinstance(source, Iterable) and not isinstance(source, str):
            return Items.from_array(cast(RawItems, source))
        elif helpers.is_job_key(source):
            return JobItems(source, count, int(start or 0), filters)
        elif helpers.is_collection_key(source):
            return CollectionItems(source, count, start, filters)
        else:
            raise ValueError(f"'{source}' is not a valid job or collection key")

    def save_result(self, rule_result):
        self.report.save(rule_result)

    def report_all(
        self, short: bool = False, uniques: List[Union[str, List[str]]] = None
    ) -> None:
        """Report on all included rules.

        Args:
            uniques: see `arche.rules.duplicates.find_by`
        """
        if uniques:
            self.uniques = uniques
        self.run_all_rules()
        IPython.display.clear_output()
        self.report(keys_limit=10 if short else None)

    def run_all_rules(self) -> None:
        if isinstance(self.source_items, JobItems):
            self.check_metadata(self.source_items.job)
            if self.target_items:
                self.compare_metadata(self.source_items.job, self.target_items.job)
        self.run_general_rules()
        self.run_comparison_rules()
        self.run_schema_rules()

    def data_quality_report(self, bucket: Optional[str] = None):
        if helpers.is_collection_key(str(self.source)):
            raise ValueError("Collections are not supported")
        if not self.schema:
            raise ValueError("Schema is empty")
        IPython.display.clear_output()
        DataQualityReport(self.source_items, self.schema, self.report, bucket)

    def run_general_rules(self):
        self.save_result(garbage_symbols(self.source_items.df))
        df = self.source_items.df
        self.save_result(
            coverage_rules.check_fields_coverage(
                df.drop(columns=df.columns[df.columns.str.startswith("_")])
            )
        )
        self.save_result(category_rules.get_categories(df))
        if getattr(self, "uniques", None):
            self.save_result(
                duplicate_rules.find_by(self.source_items.df, self.uniques)
            )

    def validate_with_json_schema(self) -> None:
        """Run JSON schema check and output results.

        It will try to find all errors, but there are no guarantees.
        Slower than `check_with_json_schema()`
        """
        res = schema_rules.validate(
            self.schema.raw, self.source_items.raw, self.source_items.df.index
        )
        self.save_result(res)
        self.report(res)

    def glance(self) -> None:
        """Run JSON schema check and output results.

        In most cases it will return only the first error per item. Usable for
        big jobs as it's about 100x faster than `validate_with_json_schema()`.
        """
        res = schema_rules.validate(
            self.schema.raw,
            self.source_items.raw,
            self.source_items.df.index,
            fast=True,
        )
        self.save_result(res)
        res.show()

    def run_schema_rules(self) -> None:
        if not self.schema:
            return
        self.save_result(
            schema_rules.validate(
                self.schema.raw, self.source_items.raw, self.source_items.df.index
            )
        )
        target_columns = (
            self.target_items.df.columns.values if self.target_items else None
        )
        check_tags_result = schema_rules.check_tags(
            self.source_items.df.columns.values, target_columns, self.schema.tags
        )
        self.save_result(check_tags_result)
        if check_tags_result.errors:
            return
        self.run_customized_rules(self.source_items, self.schema.tags)
        self.compare_with_customized_rules(
            self.source_items, self.target_items, self.schema.tags
        )

    def run_customized_rules(self, items, tagged_fields):
        self.save_result(price_rules.compare_was_now(items.df, tagged_fields))
        self.save_result(duplicate_rules.find_by_tags(items.df, tagged_fields))
        self.save_result(
            category_rules.get_coverage_per_category(
                items.df, tagged_fields.get("category", []) + self.schema.enums
            )
        )

    @lru_cache(maxsize=32)
    def check_metadata(self, job):
        self.save_result(metadata_rules.check_outcome(job))
        self.save_result(metadata_rules.check_errors(job))

    @lru_cache(maxsize=32)
    def compare_metadata(self, source_job, target_job):
        self.save_result(metadata_rules.compare_spider_names(source_job, target_job))
        self.save_result(
            metadata_rules.compare_number_of_scraped_items(source_job, target_job)
        )
        self.save_result(coverage_rules.get_difference(source_job, target_job))
        self.save_result(metadata_rules.compare_response_ratio(source_job, target_job))
        self.save_result(metadata_rules.compare_runtime(source_job, target_job))
        self.save_result(metadata_rules.compare_finish_time(source_job, target_job))

    @lru_cache(maxsize=32)
    def run_comparison_rules(self):
        if not self.target_items:
            return
        for r in [coverage_rules.compare_scraped_fields, compare_boolean_fields]:
            self.save_result(r(self.source_items.df, self.target_items.df))

    def compare_with_customized_rules(self, source_items, target_items, tagged_fields):
        if not target_items:
            return
        self.save_result(
            category_rules.get_difference(
                source_items.df,
                target_items.df,
                tagged_fields.get("category", []) + self.schema.enums,
            )
        )
        for r in [
            price_rules.compare_prices_for_same_urls,
            price_rules.compare_names_for_same_urls,
            price_rules.compare_prices_for_same_names,
        ]:
            self.save_result(r(source_items.df, target_items.df, tagged_fields))
        self.save_result(
            compare.tagged_fields(
                source_items.df,
                target_items.df,
                tagged_fields,
                ["product_url_field", "name_field"],
            )
        )
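

# Usage sketch (not part of the library source): a minimal, hedged example of driving
# this Arche variant with an in-memory pandas DataFrame and an inline JSON schema.
# The sample data, the schema dict, and the `from arche import Arche` import path are
# assumptions made for illustration; report_all() and glance() are the entry points
# defined on the class above.
if __name__ == "__main__":
    import pandas as pd
    from arche import Arche

    items = pd.DataFrame({"name": ["foo", "bar"], "price": [9.99, 10.99]})
    schema = {
        "type": "object",
        "properties": {"name": {"type": "string"}, "price": {"type": "number"}},
        "required": ["name", "price"],
    }
    a = Arche(source=items, schema=schema)
    a.report_all()  # run every rule and render the combined report
    a.glance()      # fast JSON schema pass, roughly one error per item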
class Arche:
    def __init__(
        self,
        source: str,
        schema: Optional[sr.SchemaSource] = None,
        target: Optional[str] = None,
        start: int = 0,
        count: Optional[int] = None,
        filters: Optional[api.Filters] = None,
        expand: bool = True,
    ):
        """
        Args:
            source: a data source to validate. Supports job or collection keys
            schema: a JSON schema source used to run validation
            target: a data source to compare with
            start: an item number to start reading from
            count: the amount of items to read from start
            filters: Scrapinghub filtering
            expand: if enabled, use flattened data in garbage rules,
                affects performance, see flatten_df  # noqa
        """
        self.source = source
        if target == self.source:
            logger.warning("'target' is the same as 'source', and will be ignored")
            self.target = None
        else:
            self.target = target
        self.start = start
        self.count = count
        self.filters = filters
        self.expand = expand
        self.schema_source = None
        self._schema = None
        if schema:
            self.schema = sr.get_schema(schema)
        self._source_items = None
        self._target_items = None
        self.report = Report()

    @property
    def source_items(self):
        if not self._source_items:
            self._source_items = self.get_items(
                self.source, self.start, self.count, self.filters, self.expand
            )
        return self._source_items

    @property
    def target_items(self):
        if not self.target:
            return None
        if not self._target_items:
            self._target_items = self.get_items(
                self.target, self.start, self.count, self.filters, self.expand
            )
        return self._target_items

    @property
    def schema(self):
        if not self._schema and self.schema_source:
            self._schema = sr.get_schema(self.schema_source)
        return self._schema

    @schema.setter
    def schema(self, schema_source):
        self.schema_source = schema_source
        self._schema = sr.get_schema(schema_source)

    @staticmethod
    def get_items(
        source: str,
        start: int,
        count: Optional[int],
        filters: Optional[api.Filters],
        expand: bool,
    ) -> Union[JobItems, CollectionItems]:
        if helpers.is_job_key(source):
            return JobItems(
                key=source, start=start, count=count, filters=filters, expand=expand
            )
        elif helpers.is_collection_key(source):
            if start:
                raise ValueError("Collections API does not support 'start' parameter")
            return CollectionItems(
                key=source, count=count, filters=filters, expand=expand
            )
        else:
            raise ValueError(f"'{source}' is not a valid job or collection key")

    def save_result(self, rule_result):
        self.report.save(rule_result)

    def basic_json_schema(self, items_numbers: List[int] = None):
        """Prints a json schema based on data from `self.source`

        Args:
            items_numbers: array of item numbers to create a schema from
        """
        maintenance.deprecate(
            "'Arche.basic_json_schema()' was deprecated in 2019.03.25 and "
            "will be removed in 2019.04.22.",
            replacement="Use 'basic_json_schema()' instead",
            gone_in="0.4.0",
        )
        schema.basic_json_schema(self.source, items_numbers)

    def report_all(self):
        self.run_all_rules()
        self.report.write_summaries()
        self.report.write("\n" * 2)
        self.report.write_details(short=True)

    def run_all_rules(self):
        if helpers.is_job_key(self.source_items.key):
            self.check_metadata(self.source_items.job)
            if self.target_items:
                self.compare_metadata(self.source_items.job, self.target_items.job)
        self.run_general_rules()
        self.run_comparison_rules()
        self.run_schema_rules()

    def data_quality_report(self, bucket: Optional[str] = None):
        if helpers.is_collection_key(self.source):
            raise ValueError("Collections are not supported")
        if not self.schema:
            raise ValueError("Schema is empty")
        if not self.report.results:
            self.save_result(
                schema_rules.validate(
                    self.schema, items_dicts=self.source_items.dicts, fast=False
                )
            )
        DataQualityReport(self.source_items, self.schema, self.report, bucket)

    @lru_cache(maxsize=32)
    def run_general_rules(self):
        self.save_result(garbage_symbols(self.source_items))
        self.save_result(
            coverage_rules.check_fields_coverage(
                self.source_items.df.drop(columns=["_type", "_key"])
            )
        )

    def validate_with_json_schema(self):
        """Run JSON schema check and output results.

        It will try to find all errors, but there are no guarantees.
        Slower than `check_with_json_schema()`
        """
        res = schema_rules.validate(
            self.schema, items_dicts=self.source_items.dicts, fast=False
        )
        self.save_result(res)
        res.show()

    def glance(self):
        """Run JSON schema check and output results.

        In most cases it will stop after the first error per item. Usable for
        big jobs as it's about 100x faster than `validate_with_json_schema()`.
        """
        res = schema_rules.validate(
            self.schema, items_dicts=self.source_items.dicts, fast=True
        )
        self.save_result(res)
        res.show()

    def run_schema_rules(self):
        if not self.schema:
            return
        self.save_result(schema_rules.validate(self.schema, self.source_items.dicts))
        tagged_fields = sr.Tags().get(self.schema)
        target_columns = (
            self.target_items.df.columns.values if self.target_items else None
        )
        check_tags_result = schema_rules.check_tags(
            self.source_items.df.columns.values, target_columns, tagged_fields
        )
        self.save_result(check_tags_result)
        if check_tags_result.errors:
            return
        self.run_customized_rules(self.source_items, tagged_fields)
        self.compare_with_customized_rules(
            self.source_items, self.target_items, tagged_fields
        )

    def run_customized_rules(self, items, tagged_fields):
        self.save_result(price_rules.compare_was_now(items.df, tagged_fields))
        self.save_result(duplicate_rules.check_uniqueness(items.df, tagged_fields))
        self.save_result(duplicate_rules.check_items(items.df, tagged_fields))
        self.save_result(
            category_rules.get_coverage_per_category(
                items.df, tagged_fields.get("category", [])
            )
        )

    @lru_cache(maxsize=32)
    def check_metadata(self, job):
        self.save_result(metadata_rules.check_outcome(job))
        self.save_result(metadata_rules.check_errors(job))
        self.save_result(metadata_rules.check_response_ratio(job))

    @lru_cache(maxsize=32)
    def compare_metadata(self, source_job, target_job):
        self.save_result(metadata_rules.compare_spider_names(source_job, target_job))
        self.save_result(metadata_rules.compare_errors(source_job, target_job))
        self.save_result(
            metadata_rules.compare_number_of_scraped_items(source_job, target_job)
        )
        self.save_result(coverage_rules.get_difference(source_job, target_job))
        self.save_result(metadata_rules.compare_response_ratio(source_job, target_job))
        self.save_result(metadata_rules.compare_runtime(source_job, target_job))
        self.save_result(metadata_rules.compare_finish_time(source_job, target_job))

    @lru_cache(maxsize=32)
    def run_comparison_rules(self):
        if not self.target_items:
            return
        self.save_result(
            coverage_rules.compare_scraped_fields(
                self.source_items.df, self.target_items.df
            )
        )
        self.save_result(
            compare_boolean_fields(self.source_items.df, self.target_items.df)
        )

    def compare_with_customized_rules(self, source_items, target_items, tagged_fields):
        if not target_items:
            return
        self.save_result(
            category_rules.get_difference(
                source_items.key,
                target_items.key,
                source_items.df,
                target_items.df,
                tagged_fields.get("category", []),
            )
        )
        self.save_result(
            price_rules.compare_prices_for_same_urls(
                source_items.df, target_items.df, tagged_fields
            )
        )
        self.save_result(
            price_rules.compare_names_for_same_urls(
                source_items.df, target_items.df, tagged_fields
            )
        )
        self.save_result(
            price_rules.compare_prices_for_same_names(
                source_items.df, target_items.df, tagged_fields
            )
        )
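

# Usage sketch (illustrative only): this older Arche variant accepts Scrapinghub job
# or collection keys rather than DataFrames. The job keys, schema location, and count
# below are made-up placeholders; running this for real requires Scrapinghub
# credentials and network access.
if __name__ == "__main__":
    from arche import Arche

    a = Arche(
        source="123/1/1",  # hypothetical job key to validate
        target="123/1/2",  # hypothetical job key to compare against
        schema="s3://bucket/schema.json",  # hypothetical schema source
        count=1000,
    )
    a.report_all()  # summaries first, then detailed per-rule output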
def test_save():
    r = Report()
    dummy_result = create_result("dummy", {Level.INFO: [("outcome",)]})
    r.save(dummy_result)
    assert r.results == {dummy_result.name: dummy_result}
def test_write_details(capsys, messages, expected_details):
    r = Report()
    for m in messages:
        r.save(create_result(*m))
    r.write_details()
    assert capsys.readouterr().out == expected_details
def test_wipe():
    r = Report()
    r.save(create_result("dummy", {Level.INFO: [("outcome",)]}))
    r.wipe()
    assert r.results == {}