def get_difference(source_job: Job,
                   target_job: Job,
                   err_thr: float = 0.10,
                   warn_thr: float = 0.05) -> Result:
    """Get the difference between the jobs' field coverages. Coverage is
    a job's field counts divided by the job size (number of items).

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare
        err_thr: a threshold for errors
        warn_thr: a threshold for warnings

    Returns:
        A Result instance holding error and warning outcomes, with stats
        containing the field counts coverage and its differences
    """
    result = Result("Coverage Difference")
    f_counts = (pd.DataFrame({
        source_job.key: api.get_counts(source_job),
        target_job.key: api.get_counts(target_job),
    }).drop(index=["_type"]).fillna(0).sort_values(by=[source_job.key],
                                                   kind="mergesort"))
    f_counts[source_job.key] = f_counts[source_job.key].divide(
        api.get_items_count(source_job))
    f_counts[target_job.key] = f_counts[target_job.key].divide(
        api.get_items_count(target_job))
    f_counts.name = "Coverage from job stats fields counts"
    result.stats.append(f_counts)

    coverage_difs = f_counts[source_job.key] - f_counts[target_job.key]
    coverage_difs = coverage_difs[coverage_difs.abs() > warn_thr].sort_values(
        kind="mergesort")
    coverage_difs.name = f"Coverage difference more than {warn_thr:.0%}"
    if not coverage_difs.empty:
        result.stats.append(coverage_difs)

    errs = coverage_difs[coverage_difs.abs() > err_thr]
    if not errs.empty:
        result.add_error(
            f"The difference is greater than {err_thr:.0%} for {len(errs)} field(s)")
    # negative differences count too, hence abs(), as in the error check
    warns = coverage_difs[(coverage_difs.abs() > warn_thr)
                          & (coverage_difs.abs() <= err_thr)]
    if not warns.empty:
        result.add_warning(
            f"The difference is between {warn_thr:.0%} and {err_thr:.0%} "
            f"for {len(warns)} field(s)")
    return result
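# A minimal, standalone illustration of the coverage arithmetic above, using
# hypothetical job keys and field counts (everything below is made up for the
# example).
import pandas as pd

f_counts = pd.DataFrame({
    "112358/13/21": {"title": 1000, "price": 850},    # job of 1000 items
    "112358/13/34": {"price": 900, "category": 450},  # job of 900 items
}).fillna(0)
f_counts["112358/13/21"] /= 1000  # coverage = field count / items count
f_counts["112358/13/34"] /= 900

diffs = f_counts["112358/13/21"] - f_counts["112358/13/34"]
print(diffs[diffs.abs() > 0.05])  # fields whose coverage drifted beyond 5%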
def create_json_schema(source_key: str,
                       item_numbers: Optional[List[int]] = None
                       ) -> Optional[dict]:
    client = ScrapinghubClient()
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        job = client.get_job(source_key)
        items_count = api.get_items_count(job)
        store = job.items
    else:
        logger.error(f"{source_key} is not a job or collection key")
        return None
    if items_count == 0:
        logger.error(f"{source_key} does not have any items")
        return None

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        item_numbers.sort()
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            logger.error(item_n_err.format(item_numbers[-1], items_count - 1))
            return None
    else:
        item_numbers = set_item_no(items_count)

    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1)
        samples.append(items[0])
    return infer_schema(samples)
def create_json_schema(source_key: str,
                       item_numbers: Optional[List[int]] = None) -> dict:
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        job = ScrapinghubClient().get_job(source_key)
        items_count = api.get_items_count(job)
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")
    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        item_numbers.sort()
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            raise ValueError(
                item_n_err.format(item_numbers[-1], items_count - 1))
    else:
        item_numbers = set_item_no(items_count)

    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1, p_bar=None)
        samples.append(items[0])
    return infer_schema(samples)
def create_json_schema(source_key: str,
                       items_numbers: Optional[List[int]] = None
                       ) -> RawSchema:
    """Create a schema based on sampled `source_key` items."""
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
        start_mask = ""
    elif helpers.is_job_key(source_key):
        items_count = api.get_items_count(api.get_job(source_key))
        start_mask = f"{source_key}/"
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")
    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")

    items_numbers = items_numbers or set_item_no(items_count)
    if max(items_numbers) >= items_count or min(items_numbers) < 0:
        raise ValueError(
            f"Expected values between 0 and {items_count - 1}, "
            f"got '{items_numbers}'")

    samples = []
    for n in items_numbers:
        item = api.get_items(source_key,
                             count=1,
                             start_index=n,
                             start=f"{start_mask}{n}",
                             p_bar=None)[0]
        item.pop("_type", None)
        item.pop("_key", None)
        samples.append(item)
    return infer_schema(samples)
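# `set_item_no` is referenced above but not shown. A minimal sketch of what
# such a sampler could look like, assuming it picks a handful of random item
# indices; the cap of 4 samples and the sampling strategy are assumptions for
# illustration, not the library's documented behavior.
import random
from typing import List


def set_item_no(items_count: int, max_samples: int = 4) -> List[int]:
    """Pick up to `max_samples` distinct item indices for schema inference.

    Small sources are read in full; larger ones are sampled at random.
    """
    if items_count <= max_samples:
        return list(range(items_count))
    return sorted(random.sample(range(items_count), max_samples))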
def compare_fields_counts(source_job, target_job):
    """Compare field coverage (field counts relative to items count)
    between two jobs.

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare

    Returns:
        A Result instance
    """
    source_items_count = get_items_count(source_job)
    target_items_count = get_items_count(target_job)

    result = Result("Fields Counts")
    source_fields = pd.DataFrame(
        {"Count1": source_job.items.stats().get("counts", None)})
    target_fields = pd.DataFrame(
        {"Count2": target_job.items.stats().get("counts", None)})
    fields = pd.concat([source_fields, target_fields], axis=1,
                       sort=True).fillna(0)
    fields["Difference, %"] = fields.apply(
        lambda row: ratio_diff(row["Count1"] / source_items_count,
                               row["Count2"] / target_items_count) * 100,
        axis=1,
    )
    fields["Difference, %"] = fields["Difference, %"].astype(int)
    # sort_values() returns a new frame; the result must be assigned back
    fields = fields.sort_values(by=["Difference, %"], ascending=False)

    err_diffs = fields[fields["Difference, %"] > 10]
    if not err_diffs.empty:
        result.add_error(
            f"Coverage difference is greater than 10% for "
            f"{len(err_diffs)} field(s)",
            err_diffs.to_string(columns=["Difference, %"]),
        )
    warn_diffs = fields[(fields["Difference, %"] > 5)
                        & (fields["Difference, %"] <= 10)]
    if not warn_diffs.empty:
        outcome_msg = (f"Coverage difference is between 5% and 10% for "
                       f"{len(warn_diffs)} field(s)")
        result.add_warning(outcome_msg,
                           warn_diffs.to_string(columns=["Difference, %"]))
    return result
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare the requests-per-item ratios of two jobs."""
    s_ratio = round(
        api.get_requests_count(source_job) / api.get_items_count(source_job),
        2)
    t_ratio = round(
        api.get_requests_count(target_job) / api.get_items_count(target_job),
        2)
    response_ratio_diff = helpers.ratio_diff(s_ratio, t_ratio)
    msg = f"Difference is {response_ratio_diff:.2%} - {s_ratio} and {t_ratio}"
    result = Result("Compare Responses Per Item Ratio")
    if response_ratio_diff > 0.2:
        result.add_error(msg)
    elif response_ratio_diff > 0.1:
        result.add_warning(msg)
    return result
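# `helpers.ratio_diff` is used by the checks above and below but not shown.
# A plausible sketch, assuming it returns the absolute difference normalized
# by the larger of the two values; the exact normalization is an assumption
# for illustration, not the helper's confirmed definition.
def ratio_diff(source: float, target: float) -> float:
    """Relative difference between two non-negative values as a fraction of
    the larger one, e.g. ratio_diff(100, 90) == 0.1."""
    if source == target:
        return 0.0
    return round(abs(source - target) / max(source, target), 2)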
def compare_number_of_scraped_items(source_job: Job,
                                    target_job: Job) -> Result:
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)
    diff = helpers.ratio_diff(items_count1, items_count2)
    result = Result("Total Scraped Items")
    if 0 <= diff < 0.05:
        if diff == 0:
            msg = "Same number of items"
        else:
            msg = (f"Almost the same number of items - "
                   f"{items_count1} and {items_count2}")
        result.add_info(msg)
    else:
        msg = f"{items_count1} differs from {items_count2} by {diff:.2%}"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
def check_response_ratio(job: Job) -> Result:
    requests_number = api.get_requests_count(job)
    items_count = api.get_items_count(job)
    result = Result("Responses Per Item Ratio")
    result.add_info(
        f"Number of responses / Number of scraped items - "
        f"{round(requests_number / items_count, 2)}")
    return result
def compare_number_of_scraped_items(source_job: Job,
                                    target_job: Job) -> Result:
    s_count = api.get_items_count(source_job)
    t_count = api.get_items_count(target_job)
    diff = helpers.ratio_diff(s_count, t_count)
    result = Result("Total Scraped Items")
    if 0 <= diff < 0.05:
        if diff == 0:
            msg = "Same number of items"
        else:
            msg = f"Almost the same number of items - {s_count} and {t_count}"
        result.add_info(msg)
    else:
        msg = f"{s_count} differs from {t_count} by {diff:.2%}"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare the requests-per-item ratios of two jobs."""
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)
    source_ratio = round(api.get_requests_count(source_job) / items_count1, 2)
    target_ratio = round(api.get_requests_count(target_job) / items_count2, 2)
    response_ratio_diff = helpers.ratio_diff(source_ratio, target_ratio)
    msg = (f"Difference is {response_ratio_diff:.2%} - "
           f"{source_ratio} and {target_ratio}")
    result = Result("Compare Responses Per Item Ratio")
    if response_ratio_diff > 0.2:
        result.add_error(msg)
    elif response_ratio_diff > 0.1:
        result.add_warning(msg)
    return result
def test_get_items_count(metadata, stats, expected_count):
    assert api.get_items_count(
        Job(metadata=metadata, stats=stats)) == expected_count
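# The test above expects `metadata`, `stats`, and `expected_count` to be
# injected by the test framework; a sketch of wiring them up with
# pytest.mark.parametrize. The stat payload shapes below are hypothetical
# examples, not the platform's documented format; `Job` and `api` are the
# same names the test already uses.
import pytest


@pytest.mark.parametrize(
    "metadata, stats, expected_count",
    [
        ({"state": "finished"}, {"itemsCount": 100}, 100),  # hypothetical
        ({"state": "finished"}, {}, 0),                     # hypothetical
    ],
)
def test_get_items_count(metadata, stats, expected_count):
    assert api.get_items_count(
        Job(metadata=metadata, stats=stats)) == expected_count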
def generate_quality_estimation(job, crawlera_user, no_of_validation_warnings,
                                no_of_duplicated_items,
                                checked_dup_items_count,
                                no_of_duplicated_skus,
                                no_of_checked_skus_items, no_of_price_warns,
                                no_of_checked_price_items, tested, **kwargs):
    no_of_scraped_items = api.get_items_count(job)
    no_of_errors = api.get_errors_count(job)
    job_state = api.get_job_state(job)
    job_close_reason = api.get_job_close_reason(job)
    response_status_count = api.get_response_status_count(job)
    adherence_to_schema_percent = float(
        get_adherence_to_schema_percent(no_of_validation_warnings,
                                        no_of_scraped_items))
    duplicated_items_percent = float(
        get_duplicated_items_percent(no_of_duplicated_items,
                                     no_of_scraped_items))
    duplicated_skus_percent = float(
        get_duplicated_skus_percent(no_of_duplicated_skus,
                                    no_of_scraped_items))
    crawlera_incapsula_percent = float(
        get_crawlera_incapsula_percent(crawlera_user))
    no_of_errors_percent = float(get_errors_count_percent(no_of_errors))
    price_was_price_now_comparison_percent = float(
        get_price_was_price_now_comparison_percent(no_of_price_warns,
                                                   no_of_scraped_items))
    outcome_percent = float(get_outcome_percent(job_state, job_close_reason))
    response_status_count_percent = float(
        get_response_status_count_percent(response_status_count))
    tested_percent = float(get_tested_percent(tested))

    # Weighted average of the component scores. Each optional check
    # (duplicated items: 10, duplicated SKUs: 5, price comparison: 5)
    # contributes its weight only if it was actually run; the schema
    # adherence weight absorbs the rest, so the weights always sum to 100.
    if all([
            checked_dup_items_count == 0,
            no_of_checked_skus_items == 0,
            no_of_checked_price_items == 0,
    ]):
        quality_estimation = (adherence_to_schema_percent * 60 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif checked_dup_items_count == 0 and no_of_checked_skus_items == 0:
        quality_estimation = (adherence_to_schema_percent * 55 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              price_was_price_now_comparison_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif checked_dup_items_count == 0 and no_of_checked_price_items == 0:
        quality_estimation = (adherence_to_schema_percent * 55 / 100 +
                              duplicated_skus_percent * 5 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif no_of_checked_skus_items == 0 and no_of_checked_price_items == 0:
        quality_estimation = (adherence_to_schema_percent * 50 / 100 +
                              duplicated_items_percent * 10 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif checked_dup_items_count == 0:
        quality_estimation = (adherence_to_schema_percent * 50 / 100 +
                              duplicated_skus_percent * 5 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              price_was_price_now_comparison_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif no_of_checked_skus_items == 0:
        quality_estimation = (adherence_to_schema_percent * 45 / 100 +
                              duplicated_items_percent * 10 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              price_was_price_now_comparison_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif no_of_checked_price_items == 0:
        quality_estimation = (adherence_to_schema_percent * 45 / 100 +
                              duplicated_items_percent * 10 / 100 +
                              duplicated_skus_percent * 5 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    else:
        quality_estimation = (adherence_to_schema_percent * 40 / 100 +
                              duplicated_items_percent * 10 / 100 +
                              duplicated_skus_percent * 5 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              price_was_price_now_comparison_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)

    field_accuracy = adherence_to_schema_percent
    # Each extra rule result penalizes the estimate: -5% when fewer than a
    # tenth of its items erred, -10% otherwise.
    for rule_result in kwargs.values():
        if rule_result.err_items_count / rule_result.items_count < 0.1:
            quality_estimation = quality_estimation * 0.95
        else:
            quality_estimation = quality_estimation * 0.90
    return int(quality_estimation), int(field_accuracy)
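# The eight branches above follow one pattern: each optional check contributes
# a fixed weight only when it ran, and the schema-adherence weight absorbs
# whatever is left so the total stays at 100. A hedged sketch of an
# equivalent, table-driven formulation; the function and key names are
# illustrative, not from the codebase.
def weighted_quality(scores: dict, ran: dict) -> float:
    """Combine component scores (0-100 each) into one weighted score.

    `scores` maps component names to their percentages; `ran` flags which
    optional checks actually executed.
    """
    fixed = {"crawlera": 8, "errors": 5, "outcome": 5, "response": 7,
             "tested": 15}
    optional = {"duplicated_items": 10, "duplicated_skus": 5, "price": 5}
    total = sum(scores[name] * weight for name, weight in fixed.items())
    adherence_weight = 60  # shrinks as optional checks claim their share
    for name, weight in optional.items():
        if ran[name]:
            total += scores[name] * weight
            adherence_weight -= weight
    total += scores["adherence"] * adherence_weight
    return total / 100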
def limit(self) -> int:
    if not self._limit:
        self._limit = api.get_items_count(self.job)
    return self._limit
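# The lazy initialization above (compute once, cache in `self._limit`) is
# what `functools.cached_property` (Python 3.8+) provides directly; a minimal
# sketch, assuming a hypothetical wrapper class with a `job` attribute. It
# also avoids the quirk where a cached count of 0 is falsy and would be
# re-fetched on every access.
from functools import cached_property


class ItemsSource:
    """Hypothetical class for illustration only."""

    def __init__(self, job):
        self.job = job

    @cached_property
    def limit(self) -> int:
        # computed on first access, then stored on the instance
        return api.get_items_count(self.job)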
def job_summary_table(job) -> go.FigureWidget:
    job_url = f"{SH_URL}/{job.key}"
    job_state = api.get_job_state(job)
    job_close_reason = api.get_job_close_reason(job)
    no_of_scraped_items = api.get_items_count(job)
    no_of_errors = api.get_errors_count(job)
    job_runtime = api.get_runtime_s(job) / 1000
    run_time = helpers.ms_to_time(job_runtime)
    # items per minute, matching the "Crawling Speed [items/min]" label below
    crawling_speed = round(no_of_scraped_items / (job_runtime / 60), 3)
    request_success_ratio = round(
        api.get_requests_count(job) / float(no_of_scraped_items), 2)
    max_memusage = api.get_max_memusage(job)
    response_status_count = api.get_response_status_count(job)
    crawlera_stat_value = api.get_crawlera_user(job)
    if not crawlera_stat_value:
        crawlera_stat_value = "Not Used"

    job_stats_values = [
        "Job URL",
        "Spider State",
        "Spider Close Reason",
        "Number of Scraped Items",
        "Number of Errors",
        "Runtime",
        "Request Success Ratio [requests/scraped items]",
        "Crawling Speed [items/min]",
        "Crawlera user",
        "Max Memory Usage [Bytes]",
        "Response Status Count",
    ]
    stats_values = [
        f'<a href="{job_url}">{job_url}</a>',
        job_state,
        job_close_reason,
        no_of_scraped_items,
        no_of_errors,
        run_time,
        request_success_ratio,
        crawling_speed,
        crawlera_stat_value,
        max_memusage,
        "200: " + str(response_status_count[0]) + "<br>" +
        "301: " + str(response_status_count[1]) + "<br>" +
        "404: " + str(response_status_count[2]) + "<br>" +
        "503: " + str(response_status_count[3]) + "<br>",
    ]

    trace = go.Table(
        columnorder=[1, 2],
        columnwidth=[300, 200],
        header=dict(
            values=["<b>Job Stat</b>", "<b>Stat Value</b>"],
            fill=dict(color="gray"),
            align=["left"] * 5,
            font=dict(color="black", size=14),
            height=30,
        ),
        cells=dict(
            values=[job_stats_values, stats_values],
            fill=dict(color="lightgrey"),
            font=dict(color="black", size=12),
            height=25,
            align=["left"] * 5,
        ),
    )
    spider = job.metadata.get("spider")
    layout = go.Layout(
        title=f"Summary for spider {spider}",
        autosize=True,
        margin=dict(t=40, b=25, l=0, r=0),
        height=445,
    )
    return go.FigureWidget(data=[trace], layout=layout)