def compare_scraped_fields(source_df, target_df):
    """Report columns present in only one of the two dataframes.

    Columns present in `target_df` but absent from `source_df` are reported
    as errors ("Missing Fields"); columns present only in `source_df` are
    reported as info ("New Fields"). Each report includes per-field coverage
    from the dataframe that actually has the column.
    """
    result = Result("Scraped Fields")

    def coverage_report(fields, header, counts, total):
        # One line per field: percent coverage and absolute value count.
        lines = [header]
        for name in fields:
            pct = int(counts[name] / total * 100)
            lines.append(f"{name} - coverage - {pct}% - "
                         f"{counts[name]} items")
        return "\n".join(lines)

    source_counts = dict(source_df.count().sort_values(ascending=False))
    target_counts = dict(target_df.count().sort_values(ascending=False))
    source_cols = set(source_df.columns.values)
    target_cols = set(target_df.columns.values)

    missing_fields = target_cols - source_cols
    if missing_fields:
        result.add_error(
            f"{len(missing_fields)} field(s) are missing",
            coverage_report(missing_fields, "Missing Fields", target_counts,
                            len(target_df)))

    new_fields = source_cols - target_cols
    if new_fields:
        result.add_info(
            f"{len(new_fields)} field(s) are new",
            coverage_report(new_fields, "New Fields", source_counts,
                            len(source_df)))
    return result
def find_by(df: pd.DataFrame, uniques: List[Union[str, List[str]]]) -> Result:
    """Find rows of `df` duplicated on `uniques`.

    Two rows count as duplicates when they share the value of a `uniques`
    element; an element that is a list of columns requires all of those
    column values to match.

    Args:
        uniques: list containing columns and list of columns to identify
            duplicates.

    Returns:
        Any duplicates
    """
    result = Result("Duplicates")
    result.items_count = len(df)
    # Rows empty across every referenced column cannot be duplicates.
    df = df.dropna(subset=list(set(flatten(uniques))), how="all")
    for unique in uniques:
        cols = unique if isinstance(unique, list) else [unique]
        dupes = df[df.duplicated(unique, keep=False)][cols]
        if dupes.empty:
            continue
        grouped = dupes.groupby(unique)
        errors = {}
        for _, group in grouped:
            parts = [f"'{group[c].iloc[0]}' `{c}`" for c in cols]
            errors[f"same {', '.join(parts)}"] = list(group.index)
        result.add_error(
            f"{', '.join(cols)} contains {len(grouped)} duplicated value(s)",
            errors=errors,
        )
    return result
def check_uniqueness(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]) -> Result:
    """Verify that every field tagged with `unique` holds unique values.

    Returns:
        A result containing field names and keys for non unique items
    """
    result = Result("Uniqueness")
    unique_fields = tagged_fields.get("unique", [])
    if not unique_fields:
        result.add_info("'unique' tag was not found in schema")
        return result

    err_keys: set = set()
    for field in unique_fields:
        result.items_count = df[field].count()
        dupes = df[df[field].duplicated(keep=False)][[field, "_key"]]
        errors = {}
        for _, group in dupes.groupby([field]):
            keys = list(group["_key"])
            errors[f"same '{group[field].iloc[0]}' {field}"] = keys
            err_keys.update(keys)
        if not dupes.empty:
            result.add_error(
                f"'{field}' contains {len(dupes[field].unique())} duplicated value(s)",
                errors=errors,
            )
    result.err_items_count = len(err_keys)
    return result
def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Verify that every field tagged with `unique` holds unique values.

    Returns:
        A result containing field names and keys for non unique items
    """
    result = Result("Duplicates By **unique** Tag")
    unique_fields = tagged_fields.get("unique", [])
    if not unique_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    err_keys: set = set()
    for field in unique_fields:
        result.items_count = df[field].count()
        dupes = df[df.duplicated(field, keep=False)][[field]]
        errors = {}
        for _, group in dupes.groupby([field]):
            keys = list(group.index)
            errors[f"same '{group[field].iloc[0]}' `{field}`"] = keys
            err_keys.update(keys)
        if not dupes.empty:
            result.add_error(
                f"{field} contains {len(dupes[field].unique())} duplicated value(s)",
                errors=errors,
            )
    result.err_items_count = len(err_keys)
    return result
def anomalies(target: str, sample: List[str]) -> Result:
    """Find fields with significant deviation. Significant means `dev > 2 * std()`

    Args:
        target: where to look for anomalies
        sample: a list of jobs keys to infer metadata from

    Returns:
        A Result with a dataframe of significant deviations
    """
    result = Result("Anomalies")
    raw_stats = [job.items.stats() for job in api.get_jobs(sample + [target])]
    items_len = [rs["totals"]["input_values"] for rs in raw_stats]
    counts = pd.DataFrame(rs.get("counts") for rs in raw_stats)
    counts = counts.fillna(0).drop(columns="_type")
    # Normalize raw field counts to per-job coverage ratios.
    stats = counts.apply(lambda col: col / items_len)
    stats.index = sample + [target]
    stats = stats.rename(index={target: "target"})
    # mean/std are computed over the sample jobs only, not the target.
    stats.loc["mean"] = stats.loc[sample].mean()
    stats.loc["std"] = stats.loc[sample].std()
    stats = stats.T
    stats["target deviation"] = stats["target"] - stats["mean"]
    devs = stats[stats["target deviation"].abs() > 2 * stats["std"]]
    devs.name = "Anomalies"
    if not devs.empty:
        result.add_error(
            f"{len(devs.index)} field(s) with significant coverage deviation")
    result.stats = [devs]
    return result
def compare_names_for_same_urls(
        source_df: pd.DataFrame,
        target_df: pd.DataFrame,
        tagged_fields: Dict[str, List[str]],
):
    """For each pair of items that have the same `product_url_field` tagged
    field, compare `name_field` field.

    Returns:
        A Result with an error listing urls whose names differ between the
        two dataframes, or an info message when nothing differs / tags are
        not set.
    """
    result = Result("Compare Names Per Url")
    url_field = tagged_fields.get("product_url_field")
    if not url_field:
        result.add_info("product_url_field tag is not set")
        return result
    url_field = url_field[0]
    name_field = tagged_fields.get("name_field")
    diff_names_count = 0
    if not name_field:
        result.add_info("name_field tag is not set")
        return result
    name_field = name_field[0]
    if any([
            name_field not in source_df.columns.values,
            name_field not in target_df.columns.values,
    ]):
        # BUG FIX: this branch previously did a bare `return`, handing
        # callers `None` instead of a Result; return the empty result.
        return result
    same_urls = source_df[(source_df[url_field].isin(
        target_df[url_field].values))][url_field]
    detailed_messages = []
    for url in same_urls:
        if url.strip() != "nan":
            source_name = source_df[source_df[url_field] ==
                                    url][name_field].iloc[0]
            target_name = target_df[target_df[url_field] ==
                                    url][name_field].iloc[0]
            # "nan" strings mark missing names and are skipped.
            if (source_name != target_name and source_name.strip() != "nan"
                    and target_name.strip() != "nan"):
                diff_names_count += 1
                source_key = source_df[source_df[url_field] ==
                                       url]["_key"].iloc[0]
                target_key = target_df[target_df[url_field] ==
                                       url]["_key"].iloc[0]
                msg = (
                    f"different names for url: {url}\nsource name is {source_name} "
                    f"for {source_key}\ntarget name is {target_name} for {target_key}"
                )
                detailed_messages.append(msg)
    res = f"{len(same_urls)} checked, {diff_names_count} errors"
    if detailed_messages:
        result.add_error(res, detailed="\n".join(detailed_messages))
    else:
        result.add_info(res)
    return result
def check_items(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]) -> Result:
    """Check for items with the same name and url"""
    result = Result("Duplicated Items")
    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    if not name_fields or not url_fields:
        result.add_info(
            "'name_field' and 'product_url_field' tags were not found in schema"
        )
        return result

    result.items_count = len(df)
    name_field, url_field = name_fields[0], url_fields[0]
    df = df[[name_field, url_field, "_key"]]
    dupes = df[df[[name_field, url_field]].duplicated(keep=False)]
    if dupes.empty:
        return result

    result.err_items_count = len(dupes)
    errors = {}
    for _, group in dupes.groupby([name_field, url_field]):
        msg = (
            f"same '{group[name_field].iloc[0]}' name and '{group[url_field].iloc[0]}' url"
        )
        errors[msg] = list(group["_key"])
    result.add_error(
        f"{len(dupes)} duplicate(s) with same name and url", errors=errors)
    return result
def compare_runtime(source_job: Job, target_job: Job) -> Result:
    """Compare two jobs' runtimes, escalating when the source job is slower."""
    result = Result("Compare Runtime")
    source_runtime = api.get_runtime(source_job)
    target_runtime = api.get_runtime(target_job)
    if not source_runtime or not target_runtime:
        result.add_warning("Jobs are not finished")
        return result
    if source_runtime <= target_runtime:
        result.add_info(
            f"Similar or better runtime - {helpers.ms_to_time(source_runtime)} and "
            f"{helpers.ms_to_time(target_runtime)}"
        )
        return result
    runtime_ratio_diff = helpers.ratio_diff(source_runtime, target_runtime)
    msg = (
        f"Sources differ on {runtime_ratio_diff}% - "
        f"{helpers.ms_to_time(source_runtime)} and "
        f"{helpers.ms_to_time(target_runtime)}"
    )
    if runtime_ratio_diff > 0.2:
        result.add_error(msg)
    elif runtime_ratio_diff > 0.1:
        result.add_warning(msg)
    else:
        result.add_info(msg)
    return result
def check_outcome(job: Job) -> Result:
    """Flag a job that did not end in a 'finished' state and close reason."""
    result = Result("Job Outcome")
    state = api.get_job_state(job)
    reason = api.get_job_close_reason(job)
    if not (state == "finished" and reason == "finished"):
        result.add_error(f"Job has '{state}' state, '{reason}' close reason")
    return result
def compare_prices_for_same_urls(source_df: pd.DataFrame,
                                 target_df: pd.DataFrame,
                                 tagged_fields: TaggedFields) -> Result:
    """For each pair of items that have the same `product_url_field` tagged
    field, compare `product_price_field` field

    Returns:
        A result containing pairs of items from `source_df` and `target_df`
        which `product_price_field` differ.
    """
    result = Result("Compare Prices For Same Urls")
    url_field_list: Optional[List[str]] = tagged_fields.get(
        "product_url_field")
    if not url_field_list:
        result.outcome = Outcome.SKIPPED
        return result
    url_field = url_field_list[0]

    source_df = source_df.dropna(subset=[url_field])
    target_df = target_df.dropna(subset=[url_field])
    same_urls = source_df[source_df[url_field].isin(
        target_df[url_field].values)][url_field]

    price_fields = tagged_fields.get("product_price_field")
    if not price_fields:
        result.add_info("product_price_field tag is not set")
        return result
    price_field = price_fields[0]

    detailed_messages = []
    for url in same_urls:
        if url.strip() == "nan":
            continue
        source_price = source_df[source_df[url_field] ==
                                 url][price_field].iloc[0]
        target_price = target_df[target_df[url_field] ==
                                 url][price_field].iloc[0]
        # Only comparable, numeric prices differing by more than 10% count.
        if (is_number(source_price) and is_number(target_price)
                and ratio_diff(source_price, target_price) > 0.1):
            source_key = source_df[source_df[url_field] == url].index[0]
            target_key = target_df[target_df[url_field] == url].index[0]
            detailed_messages.append(
                f"different prices for url: {url}\nsource price is {source_price} "
                f"for {source_key}\ntarget price is {target_price} for {target_key}"
            )
    res = f"{len(same_urls)} checked, {len(detailed_messages)} errors"
    if detailed_messages:
        result.add_error(res, detailed="\n".join(detailed_messages))
    else:
        result.add_info(res)
    return result
def compare_was_now(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Compare price_was and price_now tagged fields"""
    result = Result("Compare Price Was And Now")
    items_number = len(df.index)
    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    if not (price_was_fields and price_was_fields[0] in df.columns
            and price_fields and price_fields[0] in df.columns):
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema")
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    # A past price below the current price is an error.
    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=["_key", price_was_field, price_field],
    )
    price_less_percent = f"{len(df_prices_less) / items_number:.2%}"
    if not df_prices_less.empty:
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less['_key'])}",
        )

    # Identical past and current prices are merely suspicious - warn.
    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=["_key", price_was_field, price_field],
    )
    price_equal_percent = f"{len(df_prices_equals) / items_number:.2%}"
    if not df_prices_equals.empty:
        result.add_warning(
            (f"{price_equal_percent} ({len(df_prices_equals)}) "
             f"of items with {price_was_field} = {price_field}"),
            detailed=(f"Prices equal for {len(df_prices_equals)} items:\n"
                      f"{list(df_prices_equals['_key'])}"),
        )

    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = len(df.index)
    return result
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
    """Compare price_was and price_now tagged fields"""
    result = Result("Compare Price Was And Now")
    total = len(df.index)
    was_tags = tagged_fields.get("product_price_was_field")
    now_tags = tagged_fields.get("product_price_field")
    if not was_tags or not now_tags:
        result.add_info(Outcome.SKIPPED)
        return result

    now_col, was_col = now_tags[0], was_tags[0]
    prices = df.copy()
    prices[was_col] = prices[was_col].astype(float)
    prices[now_col] = prices[now_col].astype(float)

    # A past price below the current price is an error.
    less = pd.DataFrame(prices[prices[was_col] < prices[now_col]],
                        columns=[was_col, now_col])
    less_pct = "{:.2%}".format(len(less) / total)
    if not less.empty:
        error = f"Past price is less than current for {len(less)} items"
        result.add_error(
            f"{less_pct} ({len(less)}) of "
            f"items with {was_col} < {now_col}",
            detailed=f"{error}:\n{list(less.index)}",
        )

    # Identical past and current prices are merely suspicious - warn.
    equal = pd.DataFrame(prices[prices[was_col] == prices[now_col]],
                         columns=[was_col, now_col])
    equal_pct = "{:.2%}".format(len(equal) / total)
    if not equal.empty:
        result.add_warning(
            (f"{equal_pct} ({len(equal)}) "
             f"of items with {was_col} = {now_col}"),
            detailed=(f"Prices equal for {len(equal)} items:\n"
                      f"{list(equal.index)}"),
        )

    result.err_items_count = len(equal) + len(less)
    result.items_count = len(df.index)
    return result
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare booleans distribution between two dataframes

    Returns:
        A result containing dataframe with distributions and messages
        if differences are in thresholds
    """
    result = Result("Boolean Fields")
    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")
    if not fields_to_compare(source_bool, target_bool):
        result.outcome = Outcome.SKIPPED
        return result

    # Concatenating with this empty frame guarantees both a True and a
    # False column even when a field holds a single boolean value.
    dummy = pd.DataFrame(columns=[True, False])

    def frequencies(frame):
        return pd.concat(
            [dummy, frame.apply(pd.value_counts, normalize=True).T], sort=False
        ).fillna(0.0)

    source_counts = frequencies(source_bool)
    target_counts = frequencies(target_bool)
    difs = (source_counts - target_counts)[True]

    bool_covs = pd.concat([
        source_counts.rename("{}_source".format),
        target_counts.rename("{}_target".format),
    ]).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    err_diffs = difs[difs.abs() > err_thr]
    if not err_diffs.empty:
        result.add_error(
            f"{', '.join(err_diffs.index)} relative frequencies differ "
            f"by more than {err_thr:.0%}"
        )
    warn_diffs = difs[(difs.abs() > warn_thr) & (difs.abs() <= err_thr)]
    if not warn_diffs.empty:
        result.add_warning(
            f"{', '.join(warn_diffs.index)} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}"
        )
    return result
def check_errors(job: Job) -> Result:
    """Report the job's error count with a link to its filtered error log.

    Returns:
        A Result with an error (count and log url) when the job logged any
        errors, otherwise an informational "No errors" message.
    """
    errors_count = api.get_errors_count(job)
    result = Result("Job Errors")
    if errors_count:
        url = f"{SH_URL}/{job.key}/log?filterType=error&filterAndHigher"
        result.add_error(
            f"{errors_count} error(s)", detailed=f"Errors for {job.key} - {url}"
        )
    else:
        # Fixed: was an f-string with no placeholders (ruff F541).
        result.add_info("No errors")
    return result
def check_errors(source_job: Job) -> Result:
    """Flag the job when its log contains any errors, linking to the log."""
    result = Result("Job Errors")
    source_errs = api.get_errors_count(source_job)
    if source_errs:
        errors_url = "{}/{}/log?filterType=error&filterAndHigher"
        result.add_error(
            f"{source_errs} error(s) - {errors_url.format(SH_URL, source_job.key)}"
        )
    return result
def compare_scraped_fields(source_df: pd.DataFrame,
                           target_df: pd.DataFrame) -> Result:
    """Find new or missing columns between source_df and target_df.

    Returns:
        A Result with an error naming columns present only in `target_df`
        and an info message naming columns present only in `source_df`.
    """
    result = Result("Scraped Fields")
    missing_fields = target_df.columns.difference(source_df.columns)
    # Fixed: previously tested `missing_fields.array` for truthiness, which
    # leans on an ExtensionArray's implicit __len__ fallback; `.empty` is
    # the documented emptiness check for an Index.
    if not missing_fields.empty:
        result.add_error(f"Missing - {', '.join(missing_fields)}")
    new_fields = source_df.columns.difference(target_df.columns)
    if not new_fields.empty:
        result.add_info(f"New - {', '.join(new_fields)}")
    return result
def compare_errors(source_job: Job, target_job: Job) -> Result:
    """Report both jobs' error counts when the source job logged errors."""
    result = Result("Compare Job Errors")
    source_errors = api.get_errors_count(source_job)
    target_errors = api.get_errors_count(target_job)
    # Only a source job with errors triggers the report.
    if not source_errors:
        return result
    errors_url = "{}/{}/log?filterType=error&filterAndHigher"
    detailed_msg = (
        f"{source_errors} error(s) for {source_job.key} - "
        f"{errors_url.format(SH_URL, source_job.key)}\n"
        f"{target_errors} error(s) for {target_job.key} - "
        f"{errors_url.format(SH_URL, target_job.key)}"
    )
    result.add_error(f"{source_errors} and {target_errors} errors", detailed_msg)
    return result
def compare_prices_for_same_names(source_df: pd.DataFrame,
                                  target_df: pd.DataFrame,
                                  tagged_fields: TaggedFields):
    """Compare price tagged fields for items sharing the same tagged name."""
    result = Result("Compare Prices For Same Names")
    name_field_tag = tagged_fields.get("name_field")
    if not name_field_tag:
        result.outcome = Outcome.SKIPPED
        return result
    name_field = name_field_tag[0]

    source_df = source_df[source_df[name_field].notnull()]
    target_df = target_df[target_df[name_field].notnull()]
    same_names = source_df[source_df[name_field].isin(
        target_df[name_field].values)][name_field]

    price_fields = tagged_fields.get("product_price_field")
    if not price_fields:
        result.add_info("product_price_field tag is not set")
        return result
    price_field = price_fields[0]

    detailed_messages = []
    for name in same_names:
        if name.strip() == "nan":
            continue
        source_price = source_df[source_df[name_field] ==
                                 name][price_field].iloc[0]
        target_price = target_df[target_df[name_field] ==
                                 name][price_field].iloc[0]
        # Only comparable, numeric prices differing by more than 10% count.
        if (is_number(source_price) and is_number(target_price)
                and ratio_diff(source_price, target_price) > 0.1):
            source_key = source_df[source_df[name_field] == name].index[0]
            target_key = target_df[target_df[name_field] == name].index[0]
            detailed_messages.append(
                f"different price for {name}\nsource price is {source_price} "
                f"for {source_key}\ntarget price is {target_price} for {target_key}"
            )

    result_msg = f"{len(same_names)} checked, {len(detailed_messages)} errors"
    if detailed_messages:
        result.add_error(result_msg, detailed="\n".join(detailed_messages))
    else:
        result.add_info(result_msg)
    return result
def get_difference(source_job: Job,
                   target_job: Job,
                   err_thr: float = 0.10,
                   warn_thr: float = 0.05) -> Result:
    """Get difference between jobs coverages. The coverage is job fields
    counts divided on the job size.

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare
        err_thr: a threshold for errors
        warn_thr: a threshold for warnings

    Returns:
        A Result instance with huge dif and stats with fields counts
        coverage and dif
    """
    result = Result("Coverage Difference")
    f_counts = (pd.DataFrame({
        source_job.key: api.get_counts(source_job),
        target_job.key: api.get_counts(target_job),
    }).drop(index=["_type"]).fillna(0).sort_values(by=[source_job.key],
                                                   kind="mergesort"))
    # Convert raw field counts to per-item coverage ratios.
    f_counts[source_job.key] = f_counts[source_job.key].divide(
        api.get_items_count(source_job))
    f_counts[target_job.key] = f_counts[target_job.key].divide(
        api.get_items_count(target_job))
    f_counts.name = "Coverage from job stats fields counts"
    result.stats.append(f_counts)

    coverage_difs = f_counts[source_job.key] - f_counts[target_job.key]
    # BUG FIX: `kind` was misspelled "mergesoft"; numpy rejects unknown
    # sort kinds with a ValueError, so this sort crashed whenever it ran.
    coverage_difs = coverage_difs[coverage_difs.abs() > warn_thr].sort_values(
        kind="mergesort")
    coverage_difs.name = f"Coverage difference more than {warn_thr:.0%}"
    if not coverage_difs.empty:
        result.stats.append(coverage_difs)
    errs = coverage_difs[coverage_difs.abs() > err_thr]
    if not errs.empty:
        result.add_error(
            f"The difference is greater than {err_thr:.0%} for {len(errs)} field(s)"
        )
    warns = coverage_difs[(coverage_difs > warn_thr)
                          & (coverage_difs <= err_thr)]
    if not warns.empty:
        result.add_warning(
            f"The difference is between {warn_thr:.0%} and {err_thr:.0%} "
            f"for {len(warns)} field(s)")
    return result
def compare_names_for_same_urls(source_df: pd.DataFrame,
                                target_df: pd.DataFrame,
                                tagged_fields: TaggedFields):
    """For each pair of items that have the same `product_url_field` tagged
    field, compare `name_field` field"""
    result = Result("Compare Names Per Url")
    url_fields: Optional[List[str]] = tagged_fields.get("product_url_field")
    name_fields: Optional[List[str]] = tagged_fields.get("name_field")
    if not url_fields or not name_fields:
        result.outcome = Outcome.SKIPPED
        return result
    url_field: str = url_fields[0]
    name_field: str = name_fields[0]

    same_urls = source_df[source_df[url_field].isin(
        target_df[url_field].values)][url_field]

    diff_names_count = 0
    detailed_messages = []
    for url in same_urls:
        if url.strip() == "nan":
            continue
        source_name = source_df[source_df[url_field] ==
                                url][name_field].iloc[0]
        target_name = target_df[target_df[url_field] ==
                                url][name_field].iloc[0]
        # "nan" strings mark missing names and are skipped.
        if (source_name != target_name and source_name.strip() != "nan"
                and target_name.strip() != "nan"):
            diff_names_count += 1
            source_key = source_df[source_df[url_field] == url].index[0]
            target_key = target_df[target_df[url_field] == url].index[0]
            detailed_messages.append(
                f"different names for url: {url}\nsource name is {source_name} "
                f"for {source_key}\ntarget name is {target_name} for {target_key}"
            )

    res = f"{len(same_urls)} checked, {diff_names_count} errors"
    if detailed_messages:
        result.add_error(res, detailed="\n".join(detailed_messages))
    else:
        result.add_info(res)
    return result
def compare_fields_counts(source_job, target_job):
    """Compare the relative difference between field counts to items count

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare

    Returns:
        A Result instance
    """
    source_items_count = get_items_count(source_job)
    target_items_count = get_items_count(target_job)
    result = Result("Fields Counts")
    source_fields = pd.DataFrame(
        {"Count1": source_job.items.stats().get("counts", None)})
    target_fields = pd.DataFrame(
        {"Count2": target_job.items.stats().get("counts", None)})
    fields = pd.concat([source_fields, target_fields], axis=1,
                       sort=True).fillna(0)
    # Relative coverage difference per field, as an integer percentage.
    fields["Difference, %"] = fields.apply(
        lambda row: ratio_diff(row["Count1"] / source_items_count,
                               row["Count2"] / target_items_count) * 100,
        axis=1,
    )
    fields["Difference, %"] = fields["Difference, %"].astype(int)
    # BUG FIX: sort_values() returns a new frame; the result was discarded
    # before (no assignment, no inplace), so the report was never sorted.
    fields = fields.sort_values(by=["Difference, %"], ascending=False)
    err_diffs = fields[fields["Difference, %"] > 10]
    if not err_diffs.empty:
        result.add_error(
            f"Coverage difference is greater than 10% for "
            f"{len(err_diffs)} field(s)",
            err_diffs.to_string(columns=["Difference, %"]),
        )
    warn_diffs = fields[(fields["Difference, %"] > 5)
                        & (fields["Difference, %"] <= 10)]
    if not warn_diffs.empty:
        outcome_msg = (f"Coverage difference is between 5% and 10% for "
                       f"{len(warn_diffs)} field(s)")
        result.add_warning(outcome_msg,
                           warn_diffs.to_string(columns=["Difference, %"]))
    return result
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare request with response per item ratio"""
    result = Result("Compare Responses Per Item Ratio")
    source_ratio = round(
        api.get_requests_count(source_job) / api.get_items_count(source_job), 2)
    target_ratio = round(
        api.get_requests_count(target_job) / api.get_items_count(target_job), 2)
    diff = helpers.ratio_diff(source_ratio, target_ratio)
    msg = f"Difference is {diff:.2%} - {source_ratio} and {target_ratio}"
    if diff > 0.2:
        result.add_error(msg)
    elif diff > 0.1:
        result.add_warning(msg)
    return result
def check_fields_coverage(df):
    """Build a per-field coverage table and flag totally empty fields."""
    coverage = pd.DataFrame(df.count(), columns=["Values Count"])
    coverage.index.name = "Field"
    coverage["Percent"] = coverage.apply(
        lambda row: int(row["Values Count"] / len(df) * 100), axis=1)
    detailed_msg = coverage.sort_values(by=["Percent", "Field"]).to_string()
    empty_fields = coverage[coverage["Values Count"] == 0]
    result_msg = f"{len(empty_fields)} totally empty field(s)"
    result = Result("Fields Coverage")
    # Info when nothing is empty, error otherwise - same detail either way.
    report = result.add_info if empty_fields.empty else result.add_error
    report(result_msg, detailed_msg)
    return result
def compare_number_of_scraped_items(source_job: Job, target_job: Job) -> Result:
    """Compare item counts of two jobs, escalating with the relative gap.

    Returns:
        A Result: info when the difference is under 5%, warning for 5-10%,
        error for 10% or more.
    """
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)
    diff = helpers.ratio_diff(items_count1, items_count2)
    result = Result("Total Scraped Items")
    if 0 <= diff < 0.05:
        if diff == 0:
            msg = "Same number of items"
        else:
            msg = f"Almost the same number of items - {items_count1} and {items_count2}"
        result.add_info(msg)
    else:
        # Fixed: `{diff * 100}%` printed the raw float (artifacts like
        # `7.000000000000001%`); use `:.2%` consistent with the sibling
        # implementation of this rule.
        msg = f"{items_count1} differs from {items_count2} on {diff:.2%}"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
def garbage_symbols(df: pd.DataFrame) -> Result:
    """Find unwanted symbols in object (string) columns.

    Scans every object-dtype column for leading/trailing whitespace, HTML
    entities, inline CSS rules and HTML tags/comments.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
        r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*?/??>|<!--|-->)")

    errors = {}
    row_keys = set()
    rule_result = Result("Garbage Symbols", items_count=len(df))
    # Fixed: `np.object` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `object` selects the same columns.
    for column in tqdm_notebook(df.select_dtypes([object]).columns,
                                desc="Garbage Symbols"):
        matches = df[column].apply(str).str.extractall(garbage,
                                                       flags=re.IGNORECASE)
        if not matches.empty:
            error_keys = df.loc[matches.unstack().index.values].index
            bad_texts = matches.stack().value_counts().index.sort_values(
            ).tolist()
            # escape backslashes for markdown repr, `\n > \\n`
            bad_texts = [
                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
                for bx in bad_texts
            ]
            error = (f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
                     f"values contain `{', '.join(bad_texts)}`")
            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    if errors:
        rule_result.add_error(
            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
        rule_result.err_items_count = len(row_keys)
    return rule_result
def compare_number_of_scraped_items(source_job: Job, target_job: Job) -> Result:
    """Compare item counts of two jobs, escalating with the relative gap."""
    result = Result("Total Scraped Items")
    s_count = api.get_items_count(source_job)
    t_count = api.get_items_count(target_job)
    diff = helpers.ratio_diff(s_count, t_count)
    if 0 <= diff < 0.05:
        if diff == 0:
            result.add_info("Same number of items")
        else:
            result.add_info(
                f"Almost the same number of items - {s_count} and {t_count}")
    else:
        msg = f"{s_count} differs from {t_count} on {diff:.2%}"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
def validate(schema: Schema, items_dicts: Items, fast: bool = False) -> Result:
    """Run JSON schema validation against Items.

    Args:
        fast: defines if we use fastjsonschema or jsonschema validation
    """
    result = Result("JSON Schema Validation")
    validator = JsonSchemaValidator(schema)
    validator.run(items_dicts, fast)
    errors = validator.errors
    message = f"{len(items_dicts)} items were checked, {len(errors)} error(s)"
    if errors:
        result.add_error(message, errors=errors)
    else:
        result.add_info(message)
    return result
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare request with response per item ratio"""
    result = Result("Compare Responses Per Item Ratio")
    source_items = api.get_items_count(source_job)
    target_items = api.get_items_count(target_job)
    source_ratio = round(api.get_requests_count(source_job) / source_items, 2)
    target_ratio = round(api.get_requests_count(target_job) / target_items, 2)
    diff = helpers.ratio_diff(source_ratio, target_ratio)
    msg = "Difference is {}% - {} and {}".format(diff * 100, source_ratio,
                                                 target_ratio)
    if diff > 0.2:
        result.add_error(msg)
    elif diff > 0.1:
        result.add_warning(msg)
    return result
def garbage_symbols(items: Items) -> Result:
    """Find unwanted symbols in object (string) columns.

    Scans every object-dtype column of the flattened items frame for
    leading/trailing whitespace, HTML entities, inline CSS and HTML tags.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&amp|&reg)"
        r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
        r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*/?>|<!--|-->)")

    errors = {}
    row_keys = set()
    rule_result = Result("Garbage Symbols", items_count=items.size)
    # Fixed: `np.object` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `object` selects the same columns.
    for column in items.flat_df.select_dtypes([object]):
        matches = items.flat_df[column].str.extractall(garbage,
                                                       flags=re.IGNORECASE)
        matches = matches[["spaces", "html_entities", "css", "html_tags"]]
        if not matches.empty:
            error_keys = items.flat_df.iloc[
                matches.unstack().index.values]["_key"]
            original_column = items.get_origin_column_name(column)
            bad_texts = matches.stack().value_counts().index.sort_values(
            ).tolist()
            error = (
                f"{len(error_keys)/items.size*100:.1f}% of '{original_column}' "
                f"values contain {[t[:20] for t in bad_texts]}")
            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    if errors:
        rule_result.add_error(
            f"{len(row_keys)/items.size * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
        rule_result.err_items_count = len(row_keys)
    return rule_result
def check_fields_coverage(df: pd.DataFrame) -> Result:
    """Get fields coverage from df. Coverage reflects the percentage of
    real values (excluding `nan`) per column.

    Args:
        df: a data to count the coverage

    Returns:
        A result with coverage for all columns in provided df. If column
        contains only `nan`, treat it as an error.
    """
    result = Result("Fields Coverage")
    coverage = df.count().sort_values(ascending=False)
    coverage.name = f"Fields coverage for {len(df):_} items"
    result.stats = [coverage]
    empty_fields = coverage[coverage == 0]
    if not empty_fields.empty:
        result.add_error(f"{len(empty_fields)} empty field(s)")
    return result