def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result:
    """Find category columns.

    A category column is the column which holds a limited number of possible
    values, including `NAN`.

    Args:
        df: data
        max_uniques: only columns whose number of unique values is less than
            or equal to this bound are treated as category columns.

    Returns:
        A result with stats containing value counts of categorical columns.
    """
    result = Result("Categories")
    candidates = find_likely_cats(df, max_uniques)
    counted = tqdm(
        (df[col].value_counts(dropna=False) for col in candidates),
        desc="Finding categories",
        total=len(candidates),
    )
    result.stats = [vc for vc in counted if len(vc) <= max_uniques]
    if not result.stats:
        result.add_info("Categories were not found")
        return result
    result.add_info(f"{len(result.stats)} category field(s)")
    result.outcome = Outcome.INFO
    return result
def check_uniqueness(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non unique items
    """
    result = Result("Uniqueness")
    fields = tagged_fields.get("unique", [])
    if not fields:
        result.add_info("'unique' tag was not found in schema")
        return result

    all_err_keys: set = set()
    for column in fields:
        # NOTE(review): items_count is overwritten on every iteration, so it
        # reflects only the last field - confirm this is intended.
        result.items_count = df[column].count()
        dup_rows = df[df[column].duplicated(keep=False)][[column, "_key"]]
        messages = {}
        for _, group in dup_rows.groupby([column]):
            group_keys = list(group["_key"])
            messages[f"same '{group[column].iloc[0]}' {column}"] = group_keys
            all_err_keys |= set(group_keys)
        if not dup_rows.empty:
            result.add_error(
                f"'{column}' contains {len(dup_rows[column].unique())} duplicated value(s)",
                errors=messages,
            )
    result.err_items_count = len(all_err_keys)
    return result
def check_outcome(job: Job) -> Result:
    """Verify the job ended with a 'finished' state and close reason."""
    result = Result("Job Outcome")
    state = api.get_job_state(job)
    reason = api.get_job_close_reason(job)
    if not (state == "finished" and reason == "finished"):
        result.add_error(f"Job has '{state}' state, '{reason}' close reason")
    return result
def find_by(df: pd.DataFrame, uniques: List[Union[str, List[str]]]) -> Result:
    """Find equal items rows in `df` by `uniques`.

    I.e. if two items have the same uniques's element value, they are
    considered duplicates.

    Args:
        uniques: list containing columns and list of columns to identify
            duplicates. A list of columns means that all of those columns'
            values should be equal.

    Returns:
        Any duplicates
    """
    result = Result("Duplicates")
    result.items_count = len(df)
    # Rows where every checked column is NaN cannot form meaningful duplicates.
    df = df.dropna(subset=list(set(flatten(uniques))), how="all")
    for spec in uniques:
        cols = spec if isinstance(spec, list) else [spec]
        dupes = df[df.duplicated(spec, keep=False)][cols]
        if dupes.empty:
            continue
        grouped = dupes.groupby(spec)
        messages = {}
        for _, group in grouped:
            parts = [f"'{group[c].iloc[0]}' `{c}`" for c in cols]
            messages[f"same {', '.join(parts)}"] = list(group.index)
        result.add_error(
            f"{', '.join(cols)} contains {len(grouped)} duplicated value(s)",
            errors=messages,
        )
    return result
def check_tags(source_columns: np.ndarray, target_columns: np.ndarray, tags: TaggedFields) -> Result:
    """Check tag usage and that tagged fields exist in the data columns.

    Args:
        source_columns: columns of the source data
        target_columns: columns of the target data, may be None
        tags: mapping of tag name to the fields tagged with it

    Returns:
        A result listing used and unused tags, with errors for tagged
        fields missing from the source or target columns.
    """
    result = Result("Tags")
    found_tags = sorted(tags)
    if found_tags:
        result.add_info(f"Used - {', '.join(found_tags)}")
    # Tag.__members__ iterates member names directly - no need to unpack items.
    all_tags = set(Tag.__members__)
    not_used_tags = sorted(all_tags - set(tags))
    if not_used_tags:
        result.add_info(f"Not used - {', '.join(not_used_tags)}")

    tagged_fields = [field for tag in tags for field in tags[tag]]
    missing_in_source = sorted(set(tagged_fields) - set(source_columns))
    if missing_in_source:
        result.add_error(
            f"{str(missing_in_source)[1:-1]} field(s) was not found in "
            "source, but specified in schema")
    if target_columns is not None:
        missing_in_target = sorted(set(tagged_fields) - set(target_columns))
        if missing_in_target:
            result.add_error(
                f"{str(missing_in_target)[1:-1]} field(s) was not found "
                "in target, but specified in schema")
    if result.errors:
        result.add_error("Skipping tag rules")
    return result
def compare_runtime(source_job: Job, target_job: Job) -> Result:
    """Compare the runtimes of two jobs and grade the difference."""
    result = Result("Compare Runtime")
    source_runtime = api.get_runtime(source_job)
    target_runtime = api.get_runtime(target_job)
    if not source_runtime or not target_runtime:
        result.add_warning("Jobs are not finished")
        return result
    if source_runtime <= target_runtime:
        result.add_info(
            f"Similar or better runtime - {helpers.ms_to_time(source_runtime)} and "
            f"{helpers.ms_to_time(target_runtime)}"
        )
        return result
    runtime_ratio_diff = helpers.ratio_diff(source_runtime, target_runtime)
    # NOTE(review): the ratio is compared against fractions (0.1, 0.2) yet
    # rendered with a literal "%" sign - confirm ratio_diff's scale.
    msg = (
        f"Sources differ on {runtime_ratio_diff}% - "
        f"{helpers.ms_to_time(source_runtime)} and "
        f"{helpers.ms_to_time(target_runtime)}"
    )
    if runtime_ratio_diff > 0.2:
        result.add_error(msg)
    elif runtime_ratio_diff > 0.1:
        result.add_warning(msg)
    else:
        result.add_info(msg)
    return result
def anomalies(target: str, sample: List[str]) -> Result:
    """Find fields with significant deviation. Significant means `dev > 2 * std()`

    Args:
        target: where to look for anomalies
        sample: a list of jobs keys to infer metadata from

    Returns:
        A Result with a dataframe of significant deviations
    """
    result = Result("Anomalies")
    raw_stats = [job.items.stats() for job in api.get_jobs(sample + [target])]
    counts = pd.DataFrame(rs.get("counts") for rs in raw_stats)
    counts = counts.fillna(0).drop(columns="_type")
    # Normalize field counts by each job's item total to get coverage.
    totals = [rs["totals"]["input_values"] for rs in raw_stats]
    coverage = counts.apply(lambda col: col / totals)
    coverage.index = sample + [target]
    coverage.rename(index={target: "target"}, inplace=True)
    coverage.loc["mean"] = coverage.loc[sample].mean()
    coverage.loc["std"] = coverage.loc[sample].std()
    coverage = coverage.T
    coverage["target deviation"] = coverage["target"] - coverage["mean"]
    devs = coverage[coverage["target deviation"].abs() > 2 * coverage["std"]]
    devs.name = "Anomalies"
    if not devs.empty:
        result.add_error(
            f"{len(devs.index)} field(s) with significant coverage deviation")
    result.stats = [devs]
    return result
def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non unique items
    """
    result = Result("Duplicates By **unique** Tag")
    fields = tagged_fields.get("unique", [])
    if not fields:
        result.add_info(Outcome.SKIPPED)
        return result

    bad_keys: set = set()
    for column in fields:
        # NOTE(review): items_count reflects only the last processed field -
        # confirm this is intended.
        result.items_count = df[column].count()
        dup_rows = df[df.duplicated(column, keep=False)][[column]]
        messages = {}
        for _, group in dup_rows.groupby([column]):
            group_keys = list(group.index)
            messages[f"same '{group[column].iloc[0]}' `{column}`"] = group_keys
            bad_keys |= set(group_keys)
        if not dup_rows.empty:
            result.add_error(
                f"{column} contains {len(dup_rows[column].unique())} duplicated value(s)",
                errors=messages,
            )
    result.err_items_count = len(bad_keys)
    return result
def check_tags(source_columns, target_columns, tags):
    """Validate that tagged fields are present in source/target columns."""
    result = Result("Tags")
    tag_names = list(tags)
    if tag_names:
        result.add_info(", ".join(tag_names))

    tagged_fields = [field for tag in tags for field in tags[tag]]
    missing_in_source = sorted(set(tagged_fields) - set(source_columns))
    if missing_in_source:
        result.add_error(
            f"{str(missing_in_source)[1:-1]} field(s) was not found in "
            "source, but specified in schema")
    if target_columns.size > 0:
        missing_in_target = sorted(set(tagged_fields) - set(target_columns))
        if missing_in_target:
            result.add_error(
                f"{str(missing_in_target)[1:-1]} field(s) was not found "
                "in target, but specified in schema")
    if result.errors:
        result.add_error("Skipping tag rules")
    return result
def compare_scraped_fields(source_df, target_df):
    """Compare scraped field sets and their coverage between two dataframes."""
    result = Result("Scraped Fields")
    src_coverage = dict(source_df.count().sort_values(ascending=False))
    tgt_coverage = dict(target_df.count().sort_values(ascending=False))
    src_cols = set(source_df.columns.values)
    tgt_cols = set(target_df.columns.values)

    missing_fields = tgt_cols - src_cols
    if missing_fields:
        details = ["Missing Fields"]
        for field in missing_fields:
            pct = tgt_coverage[field] / len(target_df) * 100
            details.append(
                f"{field} - coverage - {int(pct)}% - "
                f"{tgt_coverage[field]} items")
        result.add_error(f"{len(missing_fields)} field(s) are missing",
                         "\n".join(details))

    new_fields = src_cols - tgt_cols
    if new_fields:
        details = ["New Fields"]
        for field in new_fields:
            pct = src_coverage[field] / len(source_df) * 100
            details.append(
                f"{field} - coverage - {int(pct)}% - "
                f"{src_coverage[field]} items")
        result.add_info(f"{len(new_fields)} field(s) are new",
                        "\n".join(details))
    return result
def compare_names_for_same_urls(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    tagged_fields: Dict[str, List[str]],
):
    """For each pair of items that have the same `product_url_field` tagged
    field, compare `name_field` field

    Returns:
        A result with an error listing same-url items whose names differ,
        or an info message when nothing differs.
    """
    result = Result("Compare Names Per Url")
    url_field = tagged_fields.get("product_url_field")
    if not url_field:
        result.add_info("product_url_field tag is not set")
        return result
    url_field = url_field[0]

    name_field = tagged_fields.get("name_field")
    diff_names_count = 0
    if not name_field:
        result.add_info("name_field tag is not set")
        return result
    name_field = name_field[0]
    if any([
            name_field not in source_df.columns.values,
            name_field not in target_df.columns.values,
    ]):
        # Was a bare `return`, which leaked None to callers expecting a Result.
        return result

    same_urls = source_df[(source_df[url_field].isin(
        target_df[url_field].values))][url_field]

    detailed_messages = []
    for url in same_urls:
        if url.strip() != "nan":
            source_name = source_df[source_df[url_field] ==
                                    url][name_field].iloc[0]
            target_name = target_df[target_df[url_field] ==
                                    url][name_field].iloc[0]
            if (source_name != target_name and source_name.strip() != "nan"
                    and target_name.strip() != "nan"):
                diff_names_count += 1
                source_key = source_df[source_df[url_field] ==
                                       url]["_key"].iloc[0]
                target_key = target_df[target_df[url_field] ==
                                       url]["_key"].iloc[0]
                msg = (
                    f"different names for url: {url}\nsource name is {source_name} "
                    f"for {source_key}\ntarget name is {target_name} for {target_key}"
                )
                detailed_messages.append(msg)

    res = f"{len(same_urls)} checked, {diff_names_count} errors"
    if detailed_messages:
        result.add_error(res, detailed="\n".join(detailed_messages))
    else:
        result.add_info(res)
    return result
def check_items(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]) -> Result:
    """Check for items with the same name and url"""
    result = Result("Duplicated Items")
    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    if not name_fields or not url_fields:
        result.add_info(
            "'name_field' and 'product_url_field' tags were not found in schema"
        )
        return result

    result.items_count = len(df)
    name_col = name_fields[0]
    url_col = url_fields[0]
    subset = df[[name_col, url_col, "_key"]]
    dupes = subset[subset[[name_col, url_col]].duplicated(keep=False)]
    if dupes.empty:
        return result

    result.err_items_count = len(dupes)
    messages = {}
    for _, group in dupes.groupby([name_col, url_col]):
        key = (
            f"same '{group[name_col].iloc[0]}' name and '{group[url_col].iloc[0]}' url"
        )
        messages[key] = list(group["_key"])
    result.add_error(
        f"{len(dupes)} duplicate(s) with same name and url", errors=messages)
    return result
def check_response_ratio(job: Job) -> Result:
    """Report the ratio of responses to scraped items for a job.

    Returns:
        A result with the rounded responses/items ratio, or a warning when
        the job scraped no items.
    """
    requests_number = api.get_requests_count(job)
    items_count = api.get_items_count(job)
    result = Result("Responses Per Item Ratio")
    if not items_count:
        # Guard against ZeroDivisionError for jobs with no scraped items.
        result.add_warning("No items scraped, can't compute the ratio")
        return result
    result.add_info(
        f"Number of responses / Number of scraped items - "
        f"{round(requests_number / items_count, 2)}"
    )
    return result
def compare_prices_for_same_urls(source_df: pd.DataFrame,
                                 target_df: pd.DataFrame,
                                 tagged_fields: TaggedFields) -> Result:
    """For each pair of items that have the same `product_url_field` tagged
    field, compare `product_price_field` field

    Returns:
        A result containing pairs of items from `source_df` and `target_df`
        which `product_price_field` differ.
    """
    result = Result("Compare Prices For Same Urls")
    url_fields: Optional[List[str]] = tagged_fields.get("product_url_field")
    if not url_fields:
        result.outcome = Outcome.SKIPPED
        return result
    url_col = url_fields[0]
    source_df = source_df.dropna(subset=[url_col])
    target_df = target_df.dropna(subset=[url_col])
    shared_urls = source_df[(source_df[url_col].isin(
        target_df[url_col].values))][url_col]

    price_fields = tagged_fields.get("product_price_field")
    if not price_fields:
        result.add_info("product_price_field tag is not set")
        return result

    price_col = price_fields[0]
    detailed_messages = []
    for url in shared_urls:
        if url.strip() == "nan":
            continue
        src_price = source_df[source_df[url_col] == url][price_col].iloc[0]
        tgt_price = target_df[target_df[url_col] == url][price_col].iloc[0]
        if (is_number(src_price) and is_number(tgt_price)
                and ratio_diff(src_price, tgt_price) > 0.1):
            src_key = source_df[source_df[url_col] == url].index[0]
            tgt_key = target_df[target_df[url_col] == url].index[0]
            detailed_messages.append(
                f"different prices for url: {url}\nsource price is {src_price} "
                f"for {src_key}\ntarget price is {tgt_price} for {tgt_key}")

    summary = f"{len(shared_urls)} checked, {len(detailed_messages)} errors"
    if detailed_messages:
        result.add_error(summary, detailed="\n".join(detailed_messages))
    else:
        result.add_info(summary)
    return result
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
    """Compare price_was and price_now tagged fields"""
    result = Result("Compare Price Was And Now")
    was_fields = tagged_fields.get("product_price_was_field")
    now_fields = tagged_fields.get("product_price_field")
    total_items = len(df.index)
    if not was_fields or not now_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    now_col = now_fields[0]
    was_col = was_fields[0]
    prices = df.copy()
    prices[was_col] = prices[was_col].astype(float)
    prices[now_col] = prices[now_col].astype(float)

    less = pd.DataFrame(
        prices[prices[was_col] < prices[now_col]],
        columns=[was_col, now_col],
    )
    less_pct = "{:.2%}".format(len(less) / total_items)
    if not less.empty:
        error = f"Past price is less than current for {len(less)} items"
        result.add_error(
            f"{less_pct} ({len(less)}) of "
            f"items with {was_col} < {now_col}",
            detailed=f"{error}:\n{list(less.index)}",
        )

    equal = pd.DataFrame(
        prices[prices[was_col] == prices[now_col]],
        columns=[was_col, now_col],
    )
    equal_pct = "{:.2%}".format(len(equal) / total_items)
    if not equal.empty:
        result.add_warning(
            (
                f"{equal_pct} ({len(equal)}) "
                f"of items with {was_col} = {now_col}"
            ),
            detailed=(
                f"Prices equal for {len(equal)} items:\n"
                f"{list(equal.index)}"
            ),
        )

    result.err_items_count = len(equal) + len(less)
    result.items_count = len(df.index)
    return result
def compare_was_now(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Compare price_was and price_now tagged fields"""
    result = Result("Compare Price Was And Now")
    was_fields = tagged_fields.get("product_price_was_field")
    now_fields = tagged_fields.get("product_price_field")
    total_items = len(df.index)
    if (not was_fields or was_fields[0] not in df.columns
            or not now_fields or now_fields[0] not in df.columns):
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema")
        return result

    now_col = now_fields[0]
    was_col = was_fields[0]
    prices = df.copy()
    prices[was_col] = prices[was_col].astype(float)
    prices[now_col] = prices[now_col].astype(float)

    less = pd.DataFrame(
        prices[prices[was_col] < prices[now_col]],
        columns=["_key", was_col, now_col],
    )
    less_pct = "{:.2%}".format(len(less) / total_items)
    if not less.empty:
        error = f"Past price is less than current for {len(less)} items"
        result.add_error(
            f"{less_pct} ({len(less)}) of "
            f"items with {was_col} < {now_col}",
            detailed=f"{error}:\n{list(less['_key'])}",
        )

    equal = pd.DataFrame(
        prices[prices[was_col] == prices[now_col]],
        columns=["_key", was_col, now_col],
    )
    equal_pct = "{:.2%}".format(len(equal) / total_items)
    if not equal.empty:
        result.add_warning(
            (f"{equal_pct} ({len(equal)}) "
             f"of items with {was_col} = {now_col}"),
            detailed=(f"Prices equal for {len(equal)} items:\n"
                      f"{list(equal['_key'])}"),
        )

    result.err_items_count = len(equal) + len(less)
    result.items_count = len(df.index)
    return result
def compare_spider_names(source_job: Job, target_job: Job) -> Result:
    """Warn when two jobs were produced by different spiders."""
    result = Result("Spider Names")
    source_spider = source_job.metadata.get("spider")
    target_spider = target_job.metadata.get("spider")
    if source_spider != target_spider:
        result.add_warning(
            f"{source_job.key} spider is {source_spider}, {target_job.key} spider is {target_spider}"
        )
    return result
def get_coverage_per_category(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Report value counts for every column tagged as `category`."""
    result = Result("Coverage For Scraped Categories")
    for column in tagged_fields.get("category", []):
        counts = df[column].value_counts()
        result.add_info(f"{len(counts)} categories in '{column}'",
                        stats=counts)
    return result
def compare_spider_names(source_job: Job, target_job: Job) -> Result:
    """Warn if the two jobs come from different spiders."""
    result = Result("Spider Names")
    name1 = source_job.metadata.get("spider")
    name2 = target_job.metadata.get("spider")
    if name1 == name2:
        return result
    result.add_warning(
        f"{source_job.key} spider is {name1}, {target_job.key} spider is {name2}"
    )
    return result
def check_errors(job: Job) -> Result:
    """Check the job's error count and link to its error log.

    Returns:
        A result with an error (and log URL) when the job logged errors,
        otherwise an info message.
    """
    errors_count = api.get_errors_count(job)
    result = Result("Job Errors")
    if errors_count:
        url = f"{SH_URL}/{job.key}/log?filterType=error&filterAndHigher"
        result.add_error(
            f"{errors_count} error(s)", detailed=f"Errors for {job.key} - {url}"
        )
    else:
        # Was an f-string with no placeholders (flake8 F541); value unchanged.
        result.add_info("No errors")
    return result
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare booleans distribution between two dataframes

    Args:
        source_df: source data
        target_df: target data
        err_thr: frequency difference above which an error is reported
        warn_thr: frequency difference above which a warning is reported

    Returns:
        A result containing dataframe with distributions and messages if
        differences are in thresholds
    """
    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_bool, target_bool):
        result.outcome = Outcome.SKIPPED
        return result

    # Ensures both True and False columns exist even for constant fields.
    dummy = pd.DataFrame(columns=[True, False])
    # Series.value_counts replaces the deprecated top-level pd.value_counts
    # (removed in pandas 3.0); results are identical.
    source_counts = pd.concat(
        [dummy, source_bool.apply(lambda s: s.value_counts(normalize=True)).T],
        sort=False,
    ).fillna(0.0)
    target_counts = pd.concat(
        [dummy, target_bool.apply(lambda s: s.value_counts(normalize=True)).T],
        sort=False,
    ).fillna(0.0)
    difs = (source_counts - target_counts)[True]

    bool_covs = pd.concat(
        [
            source_counts.rename("{}_source".format),
            target_counts.rename("{}_target".format),
        ]
    ).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    err_diffs = difs[difs.abs() > err_thr]
    if not err_diffs.empty:
        result.add_error(
            f"{', '.join(err_diffs.index)} relative frequencies differ "
            f"by more than {err_thr:.0%}"
        )
    warn_diffs = difs[(difs.abs() > warn_thr) & (difs.abs() <= err_thr)]
    if not warn_diffs.empty:
        result.add_warning(
            f"{', '.join(warn_diffs.index)} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}"
        )
    return result
def check_errors(source_job: Job) -> Result:
    """Report the error count of a job with a link to its error log."""
    result = Result("Job Errors")
    errors_count = api.get_errors_count(source_job)
    if not errors_count:
        return result
    log_url_tmpl = "{}/{}/log?filterType=error&filterAndHigher"
    result.add_error(
        f"{errors_count} error(s) - {log_url_tmpl.format(SH_URL, source_job.key)}"
    )
    return result
def get_difference(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    category_names: List[str],
    source_key: str = "source",
    target_key: str = "target",
) -> Result:
    """Find and show differences between categories coverage, including
    nan values. Coverage means value counts divided on total size.

    Args:
        source_df: a data you want to compare
        target_df: a data you want to compare with
        category_names: list of columns which values to compare
        source_key: label for `source_df`
        target_key: label for `target_df`

    Returns:
        A result instance with messages containing significant difference
        defined by thresholds, a dataframe showing all normalized value
        counts in percents, a series containing significant difference.
    """
    result = Result("Category Coverage Difference")
    warn_thr = 0.10
    err_thr = 0.20

    if not category_names:
        result.add_info(Outcome.SKIPPED)
        return result

    for column in category_names:
        coverage = pd.DataFrame(
            {
                source_key: source_df[column].value_counts(
                    dropna=False, normalize=True),
                target_key: target_df[column].value_counts(
                    dropna=False, normalize=True),
            }
        )
        coverage = coverage.fillna(0).sort_values(
            by=[source_key, target_key], kind="mergesort")
        coverage.name = f"Coverage for {column}"
        result.stats.append(coverage)

        diffs = (coverage[source_key] - coverage[target_key]).abs()
        diffs = diffs[diffs > warn_thr]
        diffs.name = f"Coverage difference more than {warn_thr:.0%} for {column}"
        if not diffs.empty:
            result.stats.append(diffs)
        over_err = diffs[diffs > err_thr]
        if not over_err.empty:
            # NOTE(review): the message names err_thr but is added as a
            # warning, not an error - confirm the intended severity.
            result.add_warning(
                f"The difference is greater than {err_thr:.0%} for {len(over_err)} value(s) of {column}"
            )
    return result
def compare_finish_time(source_job: Job, target_job: Job) -> Result:
    """Report how far apart in days two jobs finished."""
    result = Result("Finish Time")
    days_apart = api.get_finish_time_difference_in_days(source_job, target_job)
    if days_apart is None:
        result.add_warning("Jobs are not finished")
    elif days_apart == 0:
        result.add_info("Less than 1 day difference")
    else:
        result.add_warning(f"{days_apart} day(s) difference between 2 jobs")
    return result
def compare_scraped_fields(source_df: pd.DataFrame, target_df: pd.DataFrame) -> Result:
    """Find new or missing columns between source_df and target_df

    Returns:
        A result with an error listing columns present only in the target
        and an info listing columns present only in the source.
    """
    result = Result("Scraped Fields")
    missing_fields = target_df.columns.difference(source_df.columns)
    # `.empty` replaces the previous `if index.array:` check - truthiness of
    # a multi-element array is ambiguous and fragile across pandas versions.
    if not missing_fields.empty:
        result.add_error(f"Missing - {', '.join(missing_fields)}")
    new_fields = source_df.columns.difference(target_df.columns)
    if not new_fields.empty:
        result.add_info(f"New - {', '.join(new_fields)}")
    return result
def create_result(
    rule_name, messages, stats=None, err_items_count=None, items_count=None
):
    """Build a Result from pre-collected messages and optional stats/counts.

    Args:
        rule_name: name for the Result
        messages: mapping of message level to a list of message tuples,
            each unpacked into ``result.add_message``
        stats: optional stats to attach
        err_items_count: optional number of erroneous items
        items_count: optional total number of items

    Returns:
        The populated Result
    """
    result = Result(rule_name)
    # The loop variable previously shadowed the `messages` parameter.
    for level, level_messages in messages.items():
        for message in level_messages:
            result.add_message(level, *message)
    if stats:
        result.stats = stats
    if err_items_count:
        result.err_items_count = err_items_count
    if items_count:
        result.items_count = items_count
    return result
def find_by_name_url(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Check for items with the same name and url

    Returns:
        A result with duplicates sharing both the name and the url.
    """
    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    name = "Duplicates By **name_field, product_url_field** Tags"
    result = Result(name)
    if not name_fields or not url_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    name_field = name_fields[0]
    url_field = url_fields[0]
    # `find_by` treats a nested list as "all of these columns must match";
    # passing the fields separately found duplicates by name OR url alone
    # instead of by the (name, url) pair.
    result = find_by(df, [[name_field, url_field]])
    result.name = name
    return result
def compare_errors(source_job: Job, target_job: Job) -> Result:
    """Compare the error counts of two jobs.

    NOTE(review): nothing is reported when only the target job has errors -
    confirm that is intended.
    """
    result = Result("Compare Job Errors")
    source_errors = api.get_errors_count(source_job)
    target_errors = api.get_errors_count(target_job)
    if not source_errors:
        return result
    log_url = "{}/{}/log?filterType=error&filterAndHigher"
    details = (
        f"{source_errors} error(s) for {source_job.key} - "
        f"{log_url.format(SH_URL, source_job.key)}\n"
        f"{target_errors} error(s) for {target_job.key} - "
        f"{log_url.format(SH_URL, target_job.key)}"
    )
    result.add_error(f"{source_errors} and {target_errors} errors", details)
    return result
def find_by_tags(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Check for duplicates based on schema tags.
    In particular, look for items with the same `name_field`
    and `product_url_field`, and for uniqueness among `unique` field"""
    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    # Copy so extending below does not mutate the caller's
    # `tagged_fields["unique"]` list in place.
    columns_to_check: List = list(tagged_fields.get("unique", []))
    if (not name_fields or not url_fields) and not columns_to_check:
        result = Result("Duplicates")
        result.add_info(Outcome.SKIPPED)
        return result
    if name_fields and url_fields:
        # Nested list: both columns must match for a row to be a duplicate.
        columns_to_check.append([name_fields[0], url_fields[0]])
    return find_by(df, columns_to_check)
def get_difference(source_job: Job,
                   target_job: Job,
                   err_thr: float = 0.10,
                   warn_thr: float = 0.05) -> Result:
    """Get difference between jobs coverages. The coverage is job fields
    counts divided on the job size.

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare
        err_thr: a threshold for errors
        warn_thr: a threshold for warnings

    Returns:
        A Result instance with huge dif and stats with fields counts
        coverage and dif
    """
    result = Result("Coverage Difference")
    f_counts = (pd.DataFrame({
        source_job.key: api.get_counts(source_job),
        target_job.key: api.get_counts(target_job),
    }).drop(index=["_type"]).fillna(0).sort_values(by=[source_job.key],
                                                   kind="mergesort"))
    f_counts[source_job.key] = f_counts[source_job.key].divide(
        api.get_items_count(source_job))
    f_counts[target_job.key] = f_counts[target_job.key].divide(
        api.get_items_count(target_job))
    f_counts.name = "Coverage from job stats fields counts"
    result.stats.append(f_counts)

    coverage_difs = f_counts[source_job.key] - f_counts[target_job.key]
    # "mergesoft" was not a valid sort kind and raised ValueError.
    coverage_difs = coverage_difs[coverage_difs.abs() > warn_thr].sort_values(
        kind="mergesort")
    coverage_difs.name = f"Coverage difference more than {warn_thr:.0%}"
    if not coverage_difs.empty:
        result.stats.append(coverage_difs)
    errs = coverage_difs[coverage_difs.abs() > err_thr]
    if not errs.empty:
        result.add_error(
            f"The difference is greater than {err_thr:.0%} for {len(errs)} field(s)"
        )
    # Use .abs() so negative differences in (warn_thr, err_thr] are warned
    # too, matching the error filter above and compare_boolean_fields.
    warns = coverage_difs[(coverage_difs.abs() > warn_thr)
                          & (coverage_difs.abs() <= err_thr)]
    if not warns.empty:
        result.add_warning(
            f"The difference is between {warn_thr:.0%} and {err_thr:.0%} "
            f"for {len(warns)} field(s)")
    return result