def compare_runtime(source_job: Job, target_job: Job) -> Result:
    """Compare the runtime of the source job against the target job.

    Args:
        source_job: a job whose runtime is checked
        target_job: a job to compare with

    Returns:
        A Result with a warning when either runtime is falsy, an info message
        when the source runtime is not greater than the target one, otherwise
        an error (ratio difference > 0.2), a warning (> 0.1) or an info message.
    """
    source_runtime = api.get_runtime(source_job)
    target_runtime = api.get_runtime(target_job)

    result = Result("Compare Runtime")
    if not source_runtime or not target_runtime:
        result.add_warning("Jobs are not finished")
    elif source_runtime > target_runtime:
        runtime_ratio_diff = helpers.ratio_diff(source_runtime, target_runtime)
        # The ratio is compared with the fractional thresholds 0.2 / 0.1 below,
        # so it is rendered with the percent format instead of a literal "%".
        msg = (
            f"Sources differ on {runtime_ratio_diff:.2%} - "
            f"{helpers.ms_to_time(source_runtime)} and "
            f"{helpers.ms_to_time(target_runtime)}"
        )
        if runtime_ratio_diff > 0.2:
            result.add_error(msg)
        elif runtime_ratio_diff > 0.1:
            result.add_warning(msg)
        else:
            result.add_info(msg)
    else:
        result.add_info(
            f"Similar or better runtime - {helpers.ms_to_time(source_runtime)} and "
            f"{helpers.ms_to_time(target_runtime)}"
        )
    return result
def compare_was_now(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Compare price_was and price_now tagged fields

    Only the first field of each tag is used. Both columns are cast to float
    on a copy of `df`, so the input dataframe is not modified.

    Args:
        df: items data; the `_key` column is used to list offending items
        tagged_fields: a mapping from tag name to field names

    Returns:
        A Result with an error for rows where the past price is lower than
        the current one, a warning for rows where both prices are equal, or
        an info message when either tag is missing or its first field is not
        a column of `df`.
    """
    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)
    result = Result("Compare Price Was And Now")

    if not (
        price_was_fields
        and price_was_fields[0] in df.columns
        and price_fields
        and price_fields[0] in df.columns
    ):
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema"
        )
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=["_key", price_was_field, price_field],
    )
    if not df_prices_less.empty:
        # The share is computed only here: a non-empty selection guarantees
        # items_number > 0, so an empty `df` no longer divides by zero.
        price_less_percent = "{:.2%}".format(len(df_prices_less) / items_number)
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less['_key'])}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=["_key", price_was_field, price_field],
    )
    if not df_prices_equals.empty:
        price_equal_percent = "{:.2%}".format(len(df_prices_equals) / items_number)
        result.add_warning(
            (
                f"{price_equal_percent} ({len(df_prices_equals)}) "
                f"of items with {price_was_field} = {price_field}"
            ),
            detailed=(
                f"Prices equal for {len(df_prices_equals)} items:\n"
                f"{list(df_prices_equals['_key'])}"
            ),
        )

    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = items_number
    return result
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
    """Compare price_was and price_now tagged fields

    Only the first field of each tag is used. Both columns are cast to float
    on a copy of `df`, so the input dataframe is not modified. Offending
    items are listed by their index labels.

    Args:
        df: items data
        tagged_fields: a mapping from tag name to field names

    Returns:
        A Result with an error for rows where the past price is lower than
        the current one, a warning for rows where both prices are equal, or
        a skipped outcome when either tag has no fields.
    """
    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)
    result = Result("Compare Price Was And Now")

    if not price_was_fields or not price_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=[price_was_field, price_field],
    )
    if not df_prices_less.empty:
        # The share is computed only here: a non-empty selection guarantees
        # items_number > 0, so an empty `df` no longer divides by zero.
        price_less_percent = "{:.2%}".format(len(df_prices_less) / items_number)
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less.index)}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=[price_was_field, price_field],
    )
    if not df_prices_equals.empty:
        price_equal_percent = "{:.2%}".format(len(df_prices_equals) / items_number)
        result.add_warning(
            (
                f"{price_equal_percent} ({len(df_prices_equals)}) "
                f"of items with {price_was_field} = {price_field}"
            ),
            detailed=(
                f"Prices equal for {len(df_prices_equals)} items:\n"
                f"{list(df_prices_equals.index)}"
            ),
        )

    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = items_number
    return result
def compare_spider_names(source_job: Job, target_job: Job) -> Result:
    """Warn when the two jobs were produced by different spiders.

    The spider name is read from the "spider" entry of each job's metadata.

    Returns:
        A Result that carries a warning only if the names differ.
    """
    source_spider = source_job.metadata.get("spider")
    target_spider = target_job.metadata.get("spider")

    result = Result("Spider Names")
    if source_spider != target_spider:
        message = (
            f"{source_job.key} spider is {source_spider}, "
            f"{target_job.key} spider is {target_spider}"
        )
        result.add_warning(message)
    return result
def compare_spider_names(source_job: Job, target_job: Job) -> Result:
    """Check that both jobs report the same "spider" value in their metadata.

    Returns:
        A Result with a single warning naming both spiders when they differ,
        otherwise a Result without messages.
    """
    # Source first, then target - the same lookup order as the comparison.
    spiders = [job.metadata.get("spider") for job in (source_job, target_job)]
    source_spider, target_spider = spiders

    result = Result("Spider Names")
    if source_spider != target_spider:
        result.add_warning(
            f"{source_job.key} spider is {source_spider}, "
            f"{target_job.key} spider is {target_spider}"
        )
    return result
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare the distribution of boolean columns of two dataframes.

    Only columns of `bool` dtype are considered. For every such column the
    normalized counts of True and False are computed, and the difference of
    the True shares between source and target is checked against thresholds.

    Args:
        source_df: a data you want to compare
        target_df: a data you want to compare with
        err_thr: a difference above this value is reported as an error
        warn_thr: a difference above this value (up to `err_thr`) is a warning

    Returns:
        A result whose stats hold the combined distributions dataframe, with
        messages for the fields whose difference exceeds the thresholds.
        The outcome is set to skipped when `fields_to_compare` is falsy.
    """
    source_flags = source_df.select_dtypes(include="bool")
    target_flags = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_flags, target_flags):
        result.outcome = Outcome.SKIPPED
        return result

    # Empty frame that forces both True and False columns into the output,
    # even when one of the values never occurs.
    template = pd.DataFrame(columns=[True, False])

    def relative_counts(flags: pd.DataFrame) -> pd.DataFrame:
        per_field = flags.apply(pd.value_counts, normalize=True).T
        return pd.concat([template, per_field], sort=False).fillna(0.0)

    source_counts = relative_counts(source_flags)
    target_counts = relative_counts(target_flags)
    difs = (source_counts - target_counts)[True]

    labelled_source = source_counts.rename(index="{}_source".format)
    labelled_target = target_counts.rename(index="{}_target".format)
    bool_covs = pd.concat([labelled_source, labelled_target]).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    magnitudes = difs.abs()

    err_diffs = difs[magnitudes > err_thr]
    if not err_diffs.empty:
        failed_fields = ", ".join(err_diffs.index)
        result.add_error(
            f"{failed_fields} relative frequencies differ "
            f"by more than {err_thr:.0%}"
        )

    warn_diffs = difs[(magnitudes > warn_thr) & (magnitudes <= err_thr)]
    if not warn_diffs.empty:
        suspicious_fields = ", ".join(warn_diffs.index)
        result.add_warning(
            f"{suspicious_fields} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}"
        )

    return result
def get_difference(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    category_names: List[str],
    source_key: str = "source",
    target_key: str = "target",
    warn_thr: float = 0.10,
    err_thr: float = 0.20,
) -> Result:
    """Find and show differences between categories coverage, including nan values.
    Coverage means value counts divided on total size.

    Args:
        source_df: a data you want to compare
        target_df: a data you want to compare with
        category_names: list of columns which values to compare
        source_key: label for `source_df`
        target_key: label for `target_df`
        warn_thr: differences above this value are added to the stats
        err_thr: differences above this value produce a warning message

    Returns:
        A result instance with messages containing significant difference defined by
        thresholds, a dataframe showing all normalized value counts in percents,
        a series containing significant difference. With no categories given,
        the result only carries the skipped outcome as an info message.
    """
    result = Result("Category Coverage Difference")
    if not category_names:
        result.add_info(Outcome.SKIPPED)
        return result

    for c in category_names:
        cats = (
            pd.DataFrame(
                {
                    source_key: source_df[c].value_counts(
                        dropna=False, normalize=True
                    ),
                    target_key: target_df[c].value_counts(
                        dropna=False, normalize=True
                    ),
                }
            )
            # A value missing on one side has zero coverage there.
            .fillna(0)
            .sort_values(by=[source_key, target_key], kind="mergesort")
        )
        cats.name = f"Coverage for {c}"
        result.stats.append(cats)

        cat_difs = (cats[source_key] - cats[target_key]).abs()
        cat_difs = cat_difs[cat_difs > warn_thr]
        cat_difs.name = f"Coverage difference more than {warn_thr:.0%} for {c}"
        if not cat_difs.empty:
            result.stats.append(cat_difs)

        errs = cat_difs[cat_difs > err_thr]
        if not errs.empty:
            result.add_warning(
                f"The difference is greater than {err_thr:.0%} for {len(errs)} value(s) of {c}"
            )

    return result
def compare_finish_time(source_job: Job, target_job: Job) -> Result:
    """Report how many days apart the two jobs finished.

    Returns:
        A Result with an info message for a zero-day difference, a warning
        when the difference is None (jobs are not finished), or a warning
        stating the number of days otherwise.
    """
    days_apart = api.get_finish_time_difference_in_days(source_job, target_job)

    result = Result("Finish Time")
    if days_apart == 0:
        result.add_info("Less than 1 day difference")
    elif days_apart is None:
        result.add_warning("Jobs are not finished")
    else:
        result.add_warning(f"{days_apart} day(s) difference between 2 jobs")
    return result
def get_difference(
    source_job: Job, target_job: Job, err_thr: float = 0.10, warn_thr: float = 0.05
) -> Result:
    """Get difference between jobs coverages. The coverage is job fields counts
    divided on the job size.

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare
        err_thr: a threshold for errors
        warn_thr: a threshold for warnings

    Returns:
        A Result instance with huge dif and stats with fields counts coverage and dif
    """
    result = Result("Coverage Difference")
    f_counts = (
        pd.DataFrame(
            {
                source_job.key: api.get_counts(source_job),
                target_job.key: api.get_counts(target_job),
            }
        )
        .drop(index=["_type"])
        # A field missing in one of the jobs has zero count there.
        .fillna(0)
        .sort_values(by=[source_job.key], kind="mergesort")
    )
    f_counts[source_job.key] = f_counts[source_job.key].divide(
        api.get_items_count(source_job)
    )
    f_counts[target_job.key] = f_counts[target_job.key].divide(
        api.get_items_count(target_job)
    )
    f_counts.name = "Coverage from job stats fields counts"
    result.stats.append(f_counts)

    coverage_difs = f_counts[source_job.key] - f_counts[target_job.key]
    # Keep the sign in the stats, but filter and classify by magnitude.
    coverage_difs = coverage_difs[coverage_difs.abs() > warn_thr].sort_values(
        kind="mergesort"
    )
    coverage_difs.name = f"Coverage difference more than {warn_thr:.0%}"
    if not coverage_difs.empty:
        result.stats.append(coverage_difs)

    magnitudes = coverage_difs.abs()
    errs = coverage_difs[magnitudes > err_thr]
    if not errs.empty:
        result.add_error(
            f"The difference is greater than {err_thr:.0%} for {len(errs)} field(s)"
        )
    # Every remaining entry is already above warn_thr by magnitude, so the
    # warning band is whatever does not exceed err_thr - in either direction.
    warns = coverage_difs[magnitudes <= err_thr]
    if not warns.empty:
        result.add_warning(
            f"The difference is between {warn_thr:.0%} and {err_thr:.0%} "
            f"for {len(warns)} field(s)"
        )
    return result
def compare_fields_counts(source_job, target_job):
    """Compare the relative difference between field counts to items count

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare

    Returns:
        A Result instance with an error for fields whose coverage difference
        is greater than 10% and a warning for those between 5% and 10%;
        the detailed messages list the fields by descending difference.
    """
    source_items_count = get_items_count(source_job)
    target_items_count = get_items_count(target_job)
    result = Result("Fields Counts")

    source_fields = pd.DataFrame(
        {"Count1": source_job.items.stats().get("counts", None)}
    )
    target_fields = pd.DataFrame(
        {"Count2": target_job.items.stats().get("counts", None)}
    )
    # A field missing in one of the jobs has zero count there.
    fields = pd.concat([source_fields, target_fields], axis=1, sort=True).fillna(0)
    fields["Difference, %"] = fields.apply(
        lambda row: ratio_diff(
            row["Count1"] / source_items_count, row["Count2"] / target_items_count
        )
        * 100,
        axis=1,
    )
    fields["Difference, %"] = fields["Difference, %"].astype(int)
    # sort_values returns a new frame, so the result has to be kept for the
    # ordering to reach the detailed messages below.
    fields = fields.sort_values(by=["Difference, %"], ascending=False)

    err_diffs = fields[fields["Difference, %"] > 10]
    if not err_diffs.empty:
        result.add_error(
            f"Coverage difference is greater than 10% for "
            f"{len(err_diffs)} field(s)",
            err_diffs.to_string(columns=["Difference, %"]),
        )

    warn_diffs = fields[
        (fields["Difference, %"] > 5) & (fields["Difference, %"] <= 10)
    ]
    if not warn_diffs.empty:
        outcome_msg = (
            f"Coverage difference is between 5% and 10% for "
            f"{len(warn_diffs)} field(s)"
        )
        result.add_warning(outcome_msg, warn_diffs.to_string(columns=["Difference, %"]))

    return result
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare the requests-per-item ratio of two jobs.

    Each ratio is the job's requests count divided by its items count,
    rounded to two decimals.

    Returns:
        A Result with an error when the ratio difference is above 0.2,
        a warning when it is above 0.1, and no message otherwise.
    """
    # Source is fully evaluated before target, one job at a time.
    ratios = [
        round(api.get_requests_count(job) / api.get_items_count(job), 2)
        for job in (source_job, target_job)
    ]
    source_ratio, target_ratio = ratios

    ratio_gap = helpers.ratio_diff(source_ratio, target_ratio)
    msg = f"Difference is {ratio_gap:.2%} - {source_ratio} and {target_ratio}"

    result = Result("Compare Responses Per Item Ratio")
    if ratio_gap > 0.2:
        result.add_error(msg)
    elif ratio_gap > 0.1:
        result.add_warning(msg)
    return result
def compare_number_of_scraped_items(source_job: Job, target_job: Job) -> Result:
    """Compare the total number of items scraped by two jobs.

    Args:
        source_job: a base job
        target_job: a job to compare with

    Returns:
        A Result with an info message when the ratio difference is below 5%,
        a warning when it is in [5%, 10%) and an error from 10% upwards.
    """
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)
    diff = helpers.ratio_diff(items_count1, items_count2)
    result = Result("Total Scraped Items")

    if 0 <= diff < 0.05:
        if diff == 0:
            msg = "Same number of items"
        else:
            msg = f"Almost the same number of items - {items_count1} and {items_count2}"
        result.add_info(msg)
    else:
        # The percent format avoids float noise from `diff * 100`
        # (e.g. 7.000000000000001).
        msg = f"{items_count1} differs from {items_count2} on {diff:.2%}"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
def compare_number_of_scraped_items(source_job: Job, target_job: Job) -> Result:
    """Check how close the scraped items totals of two jobs are.

    Returns:
        A Result with an info message for a ratio difference in [0, 5%),
        a warning for [5%, 10%) and an error for 10% and above.
    """
    source_total, target_total = (
        api.get_items_count(job) for job in (source_job, target_job)
    )
    diff = helpers.ratio_diff(source_total, target_total)
    result = Result("Total Scraped Items")

    if 0 <= diff < 0.05:
        msg = (
            "Same number of items"
            if diff == 0
            else f"Almost the same number of items - {source_total} and {target_total}"
        )
        result.add_info(msg)
        return result

    msg = f"{source_total} differs from {target_total} on {diff:.2%}"
    if 0.05 <= diff < 0.10:
        result.add_warning(msg)
    elif diff >= 0.10:
        result.add_error(msg)
    return result
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare request with response per item ratio

    Each ratio is the job's requests count divided by its items count,
    rounded to two decimals.

    Args:
        source_job: a base job
        target_job: a job to compare with

    Returns:
        A Result with an error when the ratio difference is above 0.2,
        a warning when it is above 0.1, and no message otherwise.
    """
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)

    source_ratio = round(api.get_requests_count(source_job) / items_count1, 2)
    target_ratio = round(api.get_requests_count(target_job) / items_count2, 2)

    response_ratio_diff = helpers.ratio_diff(source_ratio, target_ratio)
    # The percent format avoids float noise from `response_ratio_diff * 100`
    # (e.g. 14.000000000000002).
    msg = f"Difference is {response_ratio_diff:.2%} - {source_ratio} and {target_ratio}"

    result = Result("Compare Responses Per Item Ratio")
    if response_ratio_diff > 0.2:
        result.add_error(msg)
    elif response_ratio_diff > 0.1:
        result.add_warning(msg)
    return result
def compare_boolean_fields(source_df, target_df):
    """Compare relative frequencies of boolean columns of two dataframes.

    Only columns of `bool` dtype are considered. The absolute difference of
    the relative frequencies is scaled by 100 and checked row by row: a row
    is an error when all its values exceed 10, a warning when all its values
    are in (5, 10].

    Returns:
        A Result with error and/or warning messages, or an info message when
        there is nothing to compare or no row reached either threshold.
    """
    source_flags = source_df.select_dtypes(include="bool")
    target_flags = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_flags, target_flags):
        result.add_info("No fields to compare")
        return result

    # NOTE(review): presumably one row per field with False/True columns,
    # judging by the headers used in the info message below - confirm.
    source_freq = get_bool_relative_frequency(source_flags)
    target_freq = get_bool_relative_frequency(target_flags)
    relative_diffs = abs(source_freq - target_freq) * 100

    above_error = (relative_diffs > 10).all(axis=1)
    err_diffs = relative_diffs[above_error]
    if not err_diffs.empty:
        error_msg = (
            f"{err_diffs.index.values} relative frequencies differ "
            "by more than 10%"
        )
        result.add_error(error_msg, err_diffs.to_string())

    in_warning_band = ((relative_diffs <= 10) & (relative_diffs > 5)).all(axis=1)
    warn_diffs = relative_diffs[in_warning_band]
    if not warn_diffs.empty:
        warning_msg = f"{warn_diffs.index.values} relative frequencies differ by 5-10%"
        result.add_warning(warning_msg, warn_diffs.to_string())

    if err_diffs.empty and warn_diffs.empty:
        info_msg = (
            f"{relative_diffs.index.values} relative frequencies are equal "
            "or differ by less than 5%"
        )
        details = relative_diffs.to_string(
            header=["Difference in False, %", "Difference in True, %"]
        )
        result.add_info(info_msg, details)

    return result