import plotly.graph_objs as go

# Imports below assume arche's module layout (SH_URL in the package root,
# Result in arche.rules.result, api/helpers in arche.tools).
from arche import SH_URL
from arche.rules.result import Result
from arche.tools import api, helpers
from scrapinghub.client.jobs import Job


def check_outcome(job: Job) -> Result:
    """Report an error unless the job finished cleanly."""
    state = api.get_job_state(job)
    reason = api.get_job_close_reason(job)
    result = Result("Job Outcome")
    if state != "finished" or reason != "finished":
        result.add_error(f"Job has '{state}' state, '{reason}' close reason")
    return result
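# Usage sketch (illustrative, not part of the module): assumes the SH_APIKEY
# environment variable holds a Scrapy Cloud API key and that "123/1/1" is a
# valid job key.
#
#     from scrapinghub import ScrapinghubClient
#     job = ScrapinghubClient().get_job("123/1/1")
#     result = check_outcome(job)
#     # A cancelled job typically ends with state "finished" and a close
#     # reason other than "finished", which is reported as e.g.:
#     #     Job has 'finished' state, 'cancelled' close reason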
def generate_quality_estimation(job, crawlera_user, no_of_validation_warnings,
                                no_of_duplicated_items, checked_dup_items_count,
                                no_of_duplicated_skus, no_of_checked_skus_items,
                                no_of_price_warns, no_of_checked_price_items,
                                tested, **kwargs):
    no_of_scraped_items = api.get_items_count(job)
    no_of_errors = api.get_errors_count(job)
    job_state = api.get_job_state(job)
    job_close_reason = api.get_job_close_reason(job)
    response_status_count = api.get_response_status_count(job)
    adherence_to_schema_percent = float(get_adherence_to_schema_percent(
        no_of_validation_warnings, no_of_scraped_items))
    duplicated_items_percent = float(get_duplicated_items_percent(
        no_of_duplicated_items, no_of_scraped_items))
    duplicated_skus_percent = float(get_duplicated_skus_percent(
        no_of_duplicated_skus, no_of_scraped_items))
    crawlera_incapsula_percent = float(get_crawlera_incapsula_percent(crawlera_user))
    no_of_errors_percent = float(get_errors_count_percent(no_of_errors))
    price_was_price_now_comparison_percent = float(
        get_price_was_price_now_comparison_percent(
            no_of_price_warns, no_of_scraped_items))
    outcome_percent = float(get_outcome_percent(job_state, job_close_reason))
    response_status_count_percent = float(
        get_response_status_count_percent(response_status_count))
    tested_percent = float(get_tested_percent(tested))

    if all([
        checked_dup_items_count == 0,
        no_of_checked_skus_items == 0,
        no_of_checked_price_items == 0,
    ]):
        quality_estimation = (adherence_to_schema_percent * 60 / 100
                              + crawlera_incapsula_percent * 8 / 100
                              + no_of_errors_percent * 5 / 100
                              + outcome_percent * 5 / 100
                              + response_status_count_percent * 7 / 100
                              + tested_percent * 15 / 100)
    elif checked_dup_items_count == 0 and no_of_checked_skus_items == 0:
        quality_estimation = (adherence_to_schema_percent * 55 / 100
                              + crawlera_incapsula_percent * 8 / 100
                              + no_of_errors_percent * 5 / 100
                              + price_was_price_now_comparison_percent * 5 / 100
                              + outcome_percent * 5 / 100
                              + response_status_count_percent * 7 / 100
                              + tested_percent * 15 / 100)
    elif checked_dup_items_count == 0 and no_of_checked_price_items == 0:
        quality_estimation = (adherence_to_schema_percent * 55 / 100
                              + duplicated_skus_percent * 5 / 100
                              + crawlera_incapsula_percent * 8 / 100
                              + no_of_errors_percent * 5 / 100
                              + outcome_percent * 5 / 100
                              + response_status_count_percent * 7 / 100
                              + tested_percent * 15 / 100)
    elif no_of_checked_skus_items == 0 and no_of_checked_price_items == 0:
        quality_estimation = (adherence_to_schema_percent * 50 / 100
                              + duplicated_items_percent * 10 / 100
                              + crawlera_incapsula_percent * 8 / 100
                              + no_of_errors_percent * 5 / 100
                              + outcome_percent * 5 / 100
                              + response_status_count_percent * 7 / 100
                              + tested_percent * 15 / 100)
    elif checked_dup_items_count == 0:
        quality_estimation = (adherence_to_schema_percent * 50 / 100
                              + duplicated_skus_percent * 5 / 100
                              + crawlera_incapsula_percent * 8 / 100
                              + no_of_errors_percent * 5 / 100
                              + price_was_price_now_comparison_percent * 5 / 100
                              + outcome_percent * 5 / 100
                              + response_status_count_percent * 7 / 100
                              + tested_percent * 15 / 100)
    elif no_of_checked_skus_items == 0:
        quality_estimation = (adherence_to_schema_percent * 45 / 100
                              + duplicated_items_percent * 10 / 100
                              + crawlera_incapsula_percent * 8 / 100
                              + no_of_errors_percent * 5 / 100
                              + price_was_price_now_comparison_percent * 5 / 100
                              + outcome_percent * 5 / 100
                              + response_status_count_percent * 7 / 100
                              + tested_percent * 15 / 100)
    elif no_of_checked_price_items == 0:
        quality_estimation = (adherence_to_schema_percent * 45 / 100
                              + duplicated_items_percent * 10 / 100
                              + duplicated_skus_percent * 5 / 100
                              + crawlera_incapsula_percent * 8 / 100
                              + no_of_errors_percent * 5 / 100
                              + outcome_percent * 5 / 100
                              + response_status_count_percent * 7 / 100
                              + tested_percent * 15 / 100)
    else:
        quality_estimation = (adherence_to_schema_percent * 40 / 100
                              + duplicated_items_percent * 10 / 100
                              + duplicated_skus_percent * 5 / 100
                              + crawlera_incapsula_percent * 8 / 100
                              + no_of_errors_percent * 5 / 100
                              + price_was_price_now_comparison_percent * 5 / 100
                              + outcome_percent * 5 / 100
                              + response_status_count_percent * 7 / 100
                              + tested_percent * 15 / 100)

    field_accuracy = adherence_to_schema_percent
    for rule_result in kwargs.values():
        if rule_result.err_items_count / rule_result.items_count < 0.1:
            quality_estimation = quality_estimation * 0.95
        else:
            quality_estimation = quality_estimation * 0.90
    return int(quality_estimation), int(field_accuracy)
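# Sanity-check sketch for the weighting scheme above (illustrative, not part
# of the original code): each branch redistributes the weight of checks that
# did not run, and every weight vector sums to 100, so quality_estimation
# stays on a 0-100 scale before the per-rule 0.95/0.90 penalty factors apply.
_WEIGHT_SETS = [
    [60, 8, 5, 5, 7, 15],            # no duplicate/SKU/price checks ran
    [55, 8, 5, 5, 5, 7, 15],         # only the price check ran
    [55, 5, 8, 5, 5, 7, 15],         # only the SKU check ran
    [50, 10, 8, 5, 5, 7, 15],        # only the duplicate-items check ran
    [50, 5, 8, 5, 5, 5, 7, 15],      # SKU and price checks ran
    [45, 10, 8, 5, 5, 5, 7, 15],     # duplicate-items and price checks ran
    [45, 10, 5, 8, 5, 5, 7, 15],     # duplicate-items and SKU checks ran
    [40, 10, 5, 8, 5, 5, 5, 7, 15],  # all checks ran
]
assert all(sum(weights) == 100 for weights in _WEIGHT_SETS)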
def job_summary_table(job) -> go.FigureWidget:
    job_url = f"{SH_URL}/{job.key}"
    job_state = api.get_job_state(job)
    job_close_reason = api.get_job_close_reason(job)
    no_of_scraped_items = api.get_items_count(job)
    no_of_errors = api.get_errors_count(job)
    job_runtime = api.get_runtime_s(job) / 1000
    run_time = helpers.ms_to_time(job_runtime)
    # Items per minute, to match the "Crawling Speed [items/min]" label below
    # (assumes job_runtime is in seconds).
    crawling_speed = round(no_of_scraped_items / (job_runtime / 60), 3)
    request_success_ratio = round(
        api.get_requests_count(job) / float(no_of_scraped_items), 2)
    max_memusage = api.get_max_memusage(job)
    response_status_count = api.get_response_status_count(job)
    crawlera_stat_value = api.get_crawlera_user(job) or "Not Used"
    job_stats_names = [
        "Job URL",
        "Spider State",
        "Spider Close Reason",
        "Number of Scraped Items",
        "Number of Errors",
        "Runtime",
        "Request Success Ratio [requests/scraped items]",
        "Crawling Speed [items/min]",
        "Crawlera user",
        "Max Memory Usage [Bytes]",
        "Response Status Count",
    ]
    stats_values = [
        f'<a href="{job_url}">{job_url}</a>',
        job_state,
        job_close_reason,
        no_of_scraped_items,
        no_of_errors,
        run_time,
        request_success_ratio,
        crawling_speed,
        crawlera_stat_value,
        max_memusage,
        (f"200: {response_status_count[0]}<br>"
         f"301: {response_status_count[1]}<br>"
         f"404: {response_status_count[2]}<br>"
         f"503: {response_status_count[3]}<br>"),
    ]
    trace = go.Table(
        columnorder=[1, 2],
        columnwidth=[300, 200],
        header=dict(
            values=["<b>Job Stat</b>", "<b>Stat Value</b>"],
            fill=dict(color="gray"),
            align=["left"] * 5,
            font=dict(color="black", size=14),
            height=30,
        ),
        cells=dict(
            values=[job_stats_names, stats_values],
            fill=dict(color="lightgrey"),
            font=dict(color="black", size=12),
            height=25,
            align=["left"] * 5,
        ),
    )
    spider = job.metadata.get("spider")
    layout = go.Layout(
        title=f"Summary for spider {spider}",
        autosize=True,
        margin=dict(t=40, b=25, l=0, r=0),
        height=445,
    )
    return go.FigureWidget(data=[trace], layout=layout)
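# End-to-end usage sketch (illustrative; assumes the SH_APIKEY environment
# variable is set, "123/1/1" is a placeholder Scrapy Cloud job key, and a
# Jupyter notebook is available so the FigureWidget renders inline):
if __name__ == "__main__":
    from scrapinghub import ScrapinghubClient

    job = ScrapinghubClient().get_job("123/1/1")  # hypothetical job key
    outcome = check_outcome(job)  # collects outcome errors, if any
    job_summary_table(job).show()  # renders the Plotly summary table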