Example #1
def compare_errors(source_job: Job, target_job: Job) -> Result:
    errors_count1 = api.get_errors_count(source_job)
    errors_count2 = api.get_errors_count(target_job)

    result = Result("Compare Job Errors")
    # Fires only when the source job has errors; the target job's count
    # rides along in the message for side-by-side comparison.
    if errors_count1:
        errors_url = "{}/{}/log?filterType=error&filterAndHigher"
        detailed_msg = (
            f"{errors_count1} error(s) for {source_job.key} - "
            f"{errors_url.format(SH_URL, source_job.key)}\n"
            f"{errors_count2} error(s) for {target_job.key} - "
            f"{errors_url.format(SH_URL, target_job.key)}"
        )
        result.add_error(f"{errors_count1} and {errors_count2} errors", detailed_msg)
    return result
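These examples come from projects that supply Job, Result, api, and SH_URL. A minimal sketch of how Example #1 could be exercised, with hypothetical stand-ins for all four names (run the stand-ins first, then the function definition above, since its annotations reference Job and Result; the later error-rule examples reuse these stand-ins too):

from dataclasses import dataclass, field

SH_URL = "https://app.scrapinghub.com/p"  # hypothetical value for the sketch

@dataclass
class Job:  # stand-in for the project's Job type
    key: str

@dataclass
class Result:  # stand-in that just collects rule messages
    name: str
    errors: list = field(default_factory=list)
    info: list = field(default_factory=list)

    def add_error(self, short, detailed=None):
        self.errors.append((short, detailed))

    def add_info(self, short, detailed=None):
        self.info.append((short, detailed))

class api:  # stand-in API with canned error counts per job key
    counts = {"123/1/1": 2, "123/1/2": 0}

    @staticmethod
    def get_errors_count(job):
        return api.counts[job.key]

result = compare_errors(Job("123/1/1"), Job("123/1/2"))
print(result.errors[0][0])  # -> "2 and 0 errors"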
Example #2
from typing import Optional

def check_errors(source_job: Job, target_job: Optional[Job] = None) -> Result:
    source_errs = api.get_errors_count(source_job)
    result = Result("Job Errors")
    if not source_errs:
        return result

    errors_url = "{}/{}/log?filterType=error&filterAndHigher"
    result.add_error(
        f"{source_errs} error(s) - {errors_url.format(SH_URL, source_job.key)}"
    )
    if target_job:
        target_errs = api.get_errors_count(target_job)
        result.add_error(
            f"{target_errs} error(s) - {errors_url.format(SH_URL, target_job.key)}"
        )
    return result
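With the same stand-ins as above, the optional target_job simply appends a second message; a quick check:

only_source = check_errors(Job("123/1/1"))
both = check_errors(Job("123/1/1"), Job("123/1/2"))
print(len(only_source.errors), len(both.errors))  # -> 1 2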
Example #3
def check_errors(job: Job) -> Result:
    errors_count = api.get_errors_count(job)
    result = Result("Job Errors")
    if errors_count:
        url = f"{SH_URL}/{job.key}/log?filterType=error&filterAndHigher"
        result.add_error(
            f"{errors_count} error(s)", detailed=f"Errors for {job.key} - {url}"
        )
    else:
        result.add_info("No errors")
    return result
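This single-job variant shares its name with Example #2, so only one of the two can live in a session at a time. With the same stand-ins, both branches are easy to see:

noisy = check_errors(Job("123/1/1"))  # this key has 2 canned errors
clean = check_errors(Job("123/1/2"))  # this one has none
print(noisy.errors[0][0])  # -> "2 error(s)"
print(clean.info[0][0])    # -> "No errors"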
Example #4
def generate_quality_estimation(job, crawlera_user, no_of_validation_warnings,
                                no_of_duplicated_items,
                                checked_dup_items_count, no_of_duplicated_skus,
                                no_of_checked_skus_items, no_of_price_warns,
                                no_of_checked_price_items, tested, **kwargs):
    no_of_scraped_items = api.get_items_count(job)
    no_of_errors = api.get_errors_count(job)

    job_state = api.get_job_state(job)
    job_close_reason = api.get_job_close_reason(job)
    response_status_count = api.get_response_status_count(job)

    adherence_to_schema_percent = float(
        get_adherence_to_schema_percent(no_of_validation_warnings,
                                        no_of_scraped_items))
    duplicated_items_percent = float(
        get_duplicated_items_percent(no_of_duplicated_items,
                                     no_of_scraped_items))
    duplicated_skus_percent = float(
        get_duplicated_skus_percent(no_of_duplicated_skus,
                                    no_of_scraped_items))

    crawlera_incapsula_percent = float(
        get_crawlera_incapsula_percent(crawlera_user))

    no_of_errors_percent = float(get_errors_count_percent(no_of_errors))
    price_was_price_now_comparison_percent = float(
        get_price_was_price_now_comparison_percent(no_of_price_warns,
                                                   no_of_scraped_items))
    outcome_percent = float(get_outcome_percent(job_state, job_close_reason))
    response_status_count_percent = float(
        get_response_status_count_percent(response_status_count))
    tested_percent = float(get_tested_percent(tested))

    # Every metric owns a fixed share of the final score, and the shares
    # always sum to 100.  A check that had nothing to verify contributes
    # no share; its weight is folded into schema adherence instead.
    duplicated_items_weight = 10 if checked_dup_items_count else 0
    duplicated_skus_weight = 5 if no_of_checked_skus_items else 0
    price_comparison_weight = 5 if no_of_checked_price_items else 0
    adherence_weight = (60 - duplicated_items_weight - duplicated_skus_weight
                        - price_comparison_weight)

    quality_estimation = (
        adherence_to_schema_percent * adherence_weight / 100 +
        duplicated_items_percent * duplicated_items_weight / 100 +
        duplicated_skus_percent * duplicated_skus_weight / 100 +
        crawlera_incapsula_percent * 8 / 100 +
        no_of_errors_percent * 5 / 100 +
        price_was_price_now_comparison_percent * price_comparison_weight / 100 +
        outcome_percent * 5 / 100 +
        response_status_count_percent * 7 / 100 +
        tested_percent * 15 / 100)

    # Field accuracy is simply the schema-adherence percentage.
    field_accuracy = adherence_to_schema_percent

    for rule_result in kwargs.values():
        # Skip rules that checked no items to avoid dividing by zero.
        if not rule_result.items_count:
            continue
        if rule_result.err_items_count / rule_result.items_count < 0.1:
            quality_estimation *= 0.95
        else:
            quality_estimation *= 0.90

    return int(quality_estimation), int(field_accuracy)
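Whichever combination of checks ran, the weights sum to 100, so the estimation reads as a percentage. A quick sanity check of the full-weight case with made-up percentages (all numbers below are hypothetical):

weights = {"schema": 40, "dup_items": 10, "dup_skus": 5, "crawlera": 8,
           "errors": 5, "price": 5, "outcome": 5, "response": 7, "tested": 15}
percents = {"schema": 90.0, "dup_items": 98.0, "dup_skus": 99.0,
            "crawlera": 100.0, "errors": 95.0, "price": 97.0,
            "outcome": 100.0, "response": 92.0, "tested": 100.0}
assert sum(weights.values()) == 100  # the shares always cover the full score

score = sum(percents[k] * weights[k] / 100 for k in weights)
print(round(score, 2))  # 94.79

# Each extra rule passed through **kwargs then multiplies the score down:
print(int(score * 0.95))  # 90, for one rule that failed on <10% of its items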
Example #5
import plotly.graph_objs as go

def job_summary_table(job) -> go.FigureWidget:
    job_url = f"{SH_URL}/{job.key}"
    job_state = api.get_job_state(job)
    job_close_reason = api.get_job_close_reason(job)
    no_of_scraped_items = api.get_items_count(job)
    no_of_errors = api.get_errors_count(job)

    # The API reports runtime in milliseconds (hence the conversion below);
    # the raw value feeds the human-readable display.
    job_runtime_ms = api.get_runtime_s(job)
    run_time = helpers.ms_to_time(job_runtime_ms)
    # Items scraped per minute, matching the "Crawling Speed" label below.
    crawling_speed = round(no_of_scraped_items / (job_runtime_ms / 1000 / 60), 3)

    request_success_ratio = round(
        api.get_requests_count(job) / float(no_of_scraped_items), 2)

    max_memusage = api.get_max_memusage(job)
    response_status_count = api.get_response_status_count(job)

    crawlera_stat_value = api.get_crawlera_user(job) or "Not Used"

    job_stats_values = [
        "Job URL",
        "Spider State",
        "Spider Close Reason",
        "Number of Scraped Items",
        "Number of Errors",
        "Runtime",
        "Request Success Ratio [requests/scraped items]",
        "Crawling Speed [items/min]",
        "Crawlera user",
        "Max Memory Usage [Bytes]",
        "Response Status Count",
    ]
    stats_values = [
        f'<a href="{job_url}">{job_url}</a>',
        job_state,
        job_close_reason,
        no_of_scraped_items,
        no_of_errors,
        run_time,
        request_success_ratio,
        crawling_speed,
        crawlera_stat_value,
        max_memusage,
        "".join(
            f"{code}: {count}<br>"
            for code, count in zip((200, 301, 404, 503), response_status_count)
        ),
    ]

    trace = go.Table(
        columnorder=[1, 2],
        columnwidth=[300, 200],
        header=dict(
            values=["<b>Job Stat</b>", "<b>Stat Value</b>"],
            fill=dict(color="gray"),
            align=["left"] * 5,
            font=dict(color="black", size=14),
            height=30,
        ),
        cells=dict(
            values=[job_stats_values, stats_values],
            fill=dict(color="lightgrey"),
            font=dict(color="black", size=12),
            height=25,
            align=["left"] * 5,
        ),
    )
    spider = job.metadata.get("spider")
    layout = go.Layout(
        title=f"Summary for spider {spider}",
        autosize=True,
        margin=dict(t=40, b=25, l=0, r=0),
        height=445,
    )

    return go.FigureWidget(data=[trace], layout=layout)
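For readers without a live job at hand, a stripped-down, self-contained version of the same go.Table pattern with hard-coded sample stats (the two rows are invented; the styling mirrors the function above):

import plotly.graph_objs as go

trace = go.Table(
    columnwidth=[300, 200],
    header=dict(values=["<b>Job Stat</b>", "<b>Stat Value</b>"],
                fill=dict(color="gray")),
    cells=dict(values=[["Spider State", "Number of Errors"],
                       ["finished", 0]],
               fill=dict(color="lightgrey")),
)
fig = go.FigureWidget(data=[trace],
                      layout=go.Layout(title="Summary for spider sample"))
fig  # renders inline in a notebook; use fig.show() elsewhere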