    def create_figures(self, items):
        # Tagged fields (name, product URL, price, unique keys, ...) are
        # resolved from the schema and drive the rule checks below.
        jf = JsonFields(self.schema)
        tagged_fields = jf.tagged
        no_of_validated_items = len(items.df.index)

        # Duplicate checks: whole-item duplicates and uniqueness of tagged
        # fields such as SKUs.
        dup_items_result = duplicate_rules.check_items(items.df, tagged_fields)
        no_of_checked_duplicated_items = dup_items_result.items_count
        no_of_duplicated_items = dup_items_result.err_items_count

        dup_skus_result = duplicate_rules.check_uniqueness(
            items.df, tagged_fields)
        no_of_checked_skus_items = dup_skus_result.items_count
        no_of_duplicated_skus = dup_skus_result.err_items_count

        # Compare current and previous prices for the tagged price fields.
        price_was_now_result = price_rules.compare_was_now(
            items.df, tagged_fields)
        no_of_price_warns = price_was_now_result.err_items_count
        no_of_checked_price_items = price_was_now_result.items_count

        garbage_symbols_result = garbage_symbols(items)

        crawlera_user = api.get_crawlera_user(items.job)
        # A "JSON Schema Validation" result is assumed to be in the report
        # already; get() would return None otherwise and the call would fail.
        no_of_validation_warnings = self.report.results.get(
            "JSON Schema Validation").get_errors_count()
        quality_estimation, field_accuracy = generate_quality_estimation(
            items.job,
            crawlera_user,
            no_of_validation_warnings,
            no_of_duplicated_items,
            no_of_checked_duplicated_items,
            no_of_duplicated_skus,
            no_of_checked_skus_items,
            no_of_price_warns,
            no_of_validated_items,
            tested=True,
            garbage_symbols=garbage_symbols_result,
        )

        cleaned_df = self.drop_service_columns(items.df)

        self.score_table(quality_estimation, field_accuracy)
        self.job_summary_table(items.job)
        self.rules_summary_table(
            cleaned_df,
            no_of_validation_warnings,
            tagged_fields.get("name_field", ""),
            tagged_fields.get("product_url_field", ""),
            no_of_checked_duplicated_items,
            no_of_duplicated_items,
            tagged_fields.get("unique", []),
            no_of_checked_skus_items,
            no_of_duplicated_skus,
            tagged_fields.get("product_price_field", ""),
            tagged_fields.get("product_price_was_field", ""),
            no_of_checked_price_items,
            no_of_price_warns,
            garbage_symbols=garbage_symbols_result,
        )
        self.scraped_fields_coverage(items.job.key, cleaned_df)
        self.coverage_by_categories(cleaned_df, tagged_fields)
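
The rule calls above (and in the examples that follow) all hand back a small result object exposing items_count, err_items_count, and, for validation results, get_errors_count(). A minimal sketch of such an object, with fields inferred from the usage in these snippets; the real class in arche may differ:

from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class RuleResult:
    # Hypothetical stand-in for the rule result consumed by the snippets above.
    name: str = ""
    items_count: int = 0       # how many items the rule inspected
    err_items_count: int = 0   # how many of those items failed the rule
    errors: Dict[str, List[str]] = field(default_factory=dict)  # message -> item keys

    def get_errors_count(self) -> int:
        # Matches the get_errors_count() calls on validation results.
        return sum(len(keys) for keys in self.errors.values())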
Example #2
    def create_figures(self, items: JobItems):
        # Reuse the cached rule result when the report has already run it.
        # Note that dict.get evaluates its fallback eagerly, so the rule runs
        # even on a cache hit; see the lazy variant sketched after this example.
        dups = self.report.results.get(
            "Duplicates", duplicate_rules.find_by_tags(items.df, self.schema.tags)
        )

        price_was_now_result = price_rules.compare_was_now(items.df, self.schema.tags)
        no_of_price_warns = price_was_now_result.err_items_count
        no_of_checked_price_items = price_was_now_result.items_count

        crawlera_user = api.get_crawlera_user(items.job)

        validation_errors = self.report.results.get(
            "JSON Schema Validation",
            schema_rules.validate(
                self.schema.raw, raw_items=items.raw, keys=items.df.index, fast=False
            ),
        ).get_errors_count()

        garbage_symbols_result = self.report.results.get(
            "Garbage Symbols", garbage_symbols(items.df)
        )

        quality_estimation, field_accuracy = generate_quality_estimation(
            items.job,
            crawlera_user,
            validation_errors,
            dups.err_items_count,
            dups.items_count,
            no_of_price_warns,
            no_of_checked_price_items,
            tested=True,
            garbage_symbols=garbage_symbols_result,
        )

        self.score_table(quality_estimation, field_accuracy)
        self.job_summary_table(items.job)
        self.rules_summary_table(
            items.df,
            validation_errors,
            self.schema.tags.get("name_field", ""),
            self.schema.tags.get("product_url_field", ""),
            dups.err_items_count,
            dups.items_count,
            self.schema.tags.get("product_price_field", ""),
            self.schema.tags.get("product_price_was_field", ""),
            no_of_checked_price_items,
            no_of_price_warns,
            garbage_symbols=garbage_symbols_result,
        )
        self.scraped_fields_coverage(items.df)
        self.coverage_by_categories(items.df, self.schema.tags)
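
Because dict.get evaluates its default argument before the lookup, the caching pattern in this example still executes the fallback rule on every call. A lazy variant is easy to sketch; cached_result below is a hypothetical helper, not part of the library:

def cached_result(results, name, compute):
    # Run the rule only on a cache miss, then memoize the result.
    if name not in results:
        results[name] = compute()
    return results[name]

# Usage inside create_figures, assuming report.results is a plain dict:
# dups = cached_result(
#     self.report.results,
#     "Duplicates",
#     lambda: duplicate_rules.find_by_tags(items.df, self.schema.tags),
# )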
Example #3
    def create_figures(self, items):
        tagged_fields = Tags().get(self.schema)

        dup_items_result = duplicate_rules.check_items(items.df, tagged_fields)
        no_of_checked_duplicated_items = dup_items_result.items_count
        no_of_duplicated_items = dup_items_result.err_items_count

        dup_skus_result = duplicate_rules.check_uniqueness(
            items.df, tagged_fields)
        no_of_checked_skus_items = dup_skus_result.items_count
        no_of_duplicated_skus = dup_skus_result.err_items_count

        price_was_now_result = price_rules.compare_was_now(
            items.df, tagged_fields)
        no_of_price_warns = price_was_now_result.err_items_count
        no_of_checked_price_items = price_was_now_result.items_count

        crawlera_user = api.get_crawlera_user(items.job)

        # Fall back to running schema validation when the report holds no
        # cached result for it.
        validation_errors = self.report.results.get(
            "JSON Schema Validation",
            schema_rules.validate(self.schema, raw_items=items.raw,
                                  fast=False),
        ).get_errors_count()

        garbage_symbols_result = self.report.results.get(
            "Garbage Symbols", garbage_symbols(items))

        quality_estimation, field_accuracy = generate_quality_estimation(
            items.job,
            crawlera_user,
            validation_errors,
            no_of_duplicated_items,
            no_of_checked_duplicated_items,
            no_of_duplicated_skus,
            no_of_checked_skus_items,
            no_of_price_warns,
            no_of_checked_price_items,
            tested=True,
            garbage_symbols=garbage_symbols_result,
        )

        self.score_table(quality_estimation, field_accuracy)
        self.job_summary_table(items.job)
        self.rules_summary_table(
            items.df,
            validation_errors,
            tagged_fields.get("name_field", ""),
            tagged_fields.get("product_url_field", ""),
            no_of_checked_duplicated_items,
            no_of_duplicated_items,
            tagged_fields.get("unique", []),
            no_of_checked_skus_items,
            no_of_duplicated_skus,
            tagged_fields.get("product_price_field", ""),
            tagged_fields.get("product_price_was_field", ""),
            no_of_checked_price_items,
            no_of_price_warns,
            garbage_symbols=garbage_symbols_result,
        )
        self.scraped_fields_coverage(items.df)
        self.coverage_by_categories(items.df, tagged_fields)
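
The scraped_fields_coverage figures built in these examples boil down to per-column fill rates of the items DataFrame. A minimal sketch of that computation, assuming pandas; the actual implementation may differ:

import pandas as pd


def fields_coverage(df: pd.DataFrame) -> pd.Series:
    # Share of non-null values per column, sorted so sparse fields stand out.
    return (df.count() / len(df)).sort_values()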
Example #4
def job_summary_table(job) -> go.FigureWidget:
    job_url = f"{SH_URL}/{job.key}"
    job_state = api.get_job_state(job)
    job_close_reason = api.get_job_close_reason(job)
    no_of_scraped_items = api.get_items_count(job)
    no_of_errors = api.get_errors_count(job)

    # Despite its name, api.get_runtime_s(job) is treated here as returning
    # milliseconds: the original divided it by 1000 yet formatted the quotient
    # with ms_to_time, which expects milliseconds.
    job_runtime_ms = api.get_runtime_s(job)
    run_time = helpers.ms_to_time(job_runtime_ms)
    # Items per minute, matching the "Crawling Speed [items/min]" label below;
    # the original formula computed minutes per item instead.
    crawling_speed = round(no_of_scraped_items / (job_runtime_ms / 1000 / 60), 3)

    request_success_ratio = round(
        api.get_requests_count(job) / float(no_of_scraped_items), 2)

    max_memusage = api.get_max_memusage(job)
    response_status_count = api.get_response_status_count(job)

    crawlera_stat_value = api.get_crawlera_user(job) or "Not Used"

    job_stats_labels = [
        "Job URL",
        "Spider State",
        "Spider Close Reason",
        "Number of Scraped Items",
        "Number of Errors",
        "Runtime",
        "Request Success Ratio [requests/scraped items]",
        "Crawling Speed [items/min]",
        "Crawlera user",
        "Max Memory Usage [Bytes]",
        "Response Status Count",
    ]
    stats_values = [
        f'<a href="{job_url}">{job_url}</a>',
        job_state,
        job_close_reason,
        no_of_scraped_items,
        no_of_errors,
        run_time,
        request_success_ratio,
        crawling_speed,
        crawlera_stat_value,
        max_memusage,
        # response_status_count is indexed in the order 200, 301, 404, 503.
        f"200: {response_status_count[0]}<br>"
        f"301: {response_status_count[1]}<br>"
        f"404: {response_status_count[2]}<br>"
        f"503: {response_status_count[3]}<br>",
    ]

    trace = go.Table(
        columnorder=[1, 2],
        columnwidth=[300, 200],
        header=dict(
            values=["<b>Job Stat</b>", "<b>Stat Value</b>"],
            fill=dict(color="gray"),
            align=["left"] * 5,
            font=dict(color="black", size=14),
            height=30,
        ),
        cells=dict(
            values=[job_stats_labels, stats_values],
            fill=dict(color="lightgrey"),
            font=dict(color="black", size=12),
            height=25,
            align=["left"] * 5,
        ),
    )
    spider = job.metadata.get("spider")
    layout = go.Layout(
        title=f"Summary for spider {spider}",
        autosize=True,
        margin=dict(t=40, b=25, l=0, r=0),
        height=445,
    )

    return go.FigureWidget(data=[trace], layout=layout)
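
go.FigureWidget renders inline in Jupyter, which is the natural home for these summary tables; outside a notebook the regular plotly figure API still applies. A short usage sketch, assuming job is a Scrapinghub client job object:

fig = job_summary_table(job)
fig  # as the last expression of a notebook cell, renders the widget inline
# Outside a notebook:
# fig.show()
# fig.write_html("job_summary.html")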