Example #1
def compare_scraped_fields(source_df, target_df):
    source_field_coverage = dict(
        source_df.count().sort_values(ascending=False))
    target_field_coverage = dict(
        target_df.count().sort_values(ascending=False))

    result = Result("Scraped Fields")
    missing_fields = set(target_df.columns.values) - set(
        source_df.columns.values)
    if missing_fields:
        detailed_messages = ["Missing Fields"]
        for field in missing_fields:
            target_coverage = target_field_coverage[field] / len(
                target_df) * 100
            detailed_messages.append(
                f"{field} - coverage - {int(target_coverage)}% - "
                f"{target_field_coverage[field]} items")
        result.add_error(f"{len(missing_fields)} field(s) are missing",
                         "\n".join(detailed_messages))

    new_fields = set(source_df.columns.values) - set(target_df.columns.values)
    if new_fields:
        detailed_messages = ["New Fields"]
        for field in new_fields:
            source_coverage = source_field_coverage[field] / len(
                source_df) * 100
            detailed_messages.append(
                f"{field} - coverage - {int(source_coverage)}% - "
                f"{source_field_coverage[field]} items")
        result.add_info(f"{len(new_fields)} field(s) are new",
                        "\n".join(detailed_messages))

    return result
Example #2
def find_by(df: pd.DataFrame, uniques: List[Union[str, List[str]]]) -> Result:
    """Find equal items rows in `df` by `uniques`. I.e. if two items have the same
    uniques's element value, they are considered duplicates.

    Args:
        uniques: list containing columns and list of columns to identify duplicates.
        List of columns means that all list columns values should be equal.

    Returns:
        Any duplicates
    """
    result = Result("Duplicates")
    result.items_count = len(df)

    df = df.dropna(subset=list(set(flatten(uniques))), how="all")
    for columns in uniques:
        mask = columns if isinstance(columns, list) else [columns]
        duplicates = df[df.duplicated(columns, keep=False)][mask]
        if duplicates.empty:
            continue

        errors = {}
        grouped = duplicates.groupby(columns)
        for _, d in grouped:
            msgs = [f"'{d[c].iloc[0]}' `{c}`" for c in mask]
            errors[f"same {', '.join(msgs)}"] = list(d.index)
        result.add_error(
            f"{', '.join(mask)} contains {len(grouped)} duplicated value(s)",
            errors=errors,
        )
    return result
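
The duplicate detection above is plain pandas; a minimal, self-contained sketch of the same idiom (a toy DataFrame, with no `Result` or `flatten` needed):

import pandas as pd

# Toy data: two rows share the same "sku", so they count as duplicates.
df = pd.DataFrame({"sku": ["a1", "a1", "b2"], "name": ["x", "y", "z"]})

# keep=False marks every member of a duplicated group, not only the later ones.
duplicates = df[df.duplicated(["sku"], keep=False)][["sku"]]
for value, group in duplicates.groupby("sku"):
    print(f"same '{value}' `sku`:", list(group.index))  # same 'a1' `sku`: [0, 1]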
Example #3
def check_uniqueness(df: pd.DataFrame,
                     tagged_fields: Dict[str, List[str]]) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non-unique items
    """
    unique_fields = tagged_fields.get("unique", [])
    result = Result("Uniqueness")

    if not unique_fields:
        result.add_info("'unique' tag was not found in schema")
        return result

    err_keys = set()
    for field in unique_fields:
        result.items_count = df[field].count()
        duplicates = df[df[field].duplicated(keep=False)][[field, "_key"]]
        errors = {}
        for _, d in duplicates.groupby([field]):
            keys = list(d["_key"])
            msg = f"same '{d[field].iloc[0]}' {field}"
            errors[msg] = keys
            err_keys = err_keys.union(keys)
        if not duplicates.empty:
            result.add_error(
                f"'{field}' contains {len(duplicates[field].unique())} duplicated value(s)",
                errors=errors,
            )

    result.err_items_count = len(err_keys)
    return result
Example #4
def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non-unique items
    """
    unique_fields = tagged_fields.get("unique", [])
    result = Result("Duplicates By **unique** Tag")

    if not unique_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    err_keys = set()
    for field in unique_fields:
        result.items_count = df[field].count()
        duplicates = df[df.duplicated(field, keep=False)][[field]]
        errors = {}
        for _, d in duplicates.groupby([field]):
            keys = list(d.index)
            msg = f"same '{d[field].iloc[0]}' `{field}`"
            errors[msg] = keys
            err_keys = err_keys.union(keys)
        if not duplicates.empty:
            result.add_error(
                f"{field} contains {len(duplicates[field].unique())} duplicated value(s)",
                errors=errors,
            )

    result.err_items_count = len(err_keys)
    return result
Example #5
def anomalies(target: str, sample: List[str]) -> Result:
    """Find fields with significant deviation. Significant means `dev > 2 * std()`

    Args:
        target: a job key to check for anomalies
        sample: a list of job keys to infer metadata from

    Returns:
        A Result with a dataframe of significant deviations
    """
    result = Result("Anomalies")
    raw_stats = [job.items.stats() for job in api.get_jobs(sample + [target])]

    counts = (pd.DataFrame(
        rs.get("counts") for rs in raw_stats).fillna(0).drop(columns="_type"))
    items_len = [rs["totals"]["input_values"] for rs in raw_stats]
    stats = counts.apply(lambda x: x / items_len)
    stats.index = sample + [target]
    stats.rename(index={target: "target"}, inplace=True)
    stats.loc["mean"] = stats.loc[sample].mean()
    stats.loc["std"] = stats.loc[sample].std()
    stats = stats.T
    stats["target deviation"] = stats["target"] - stats["mean"]
    devs = stats[(stats["target deviation"].abs() > 2 * stats["std"])]
    devs.name = "Anomalies"
    if not devs.empty:
        result.add_error(
            f"{len(devs.index)} field(s) with significant coverage deviation")
        result.stats = [devs]

    return result
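
The anomaly test itself is a mean/standard-deviation comparison over per-job coverage rows; a self-contained sketch with made-up coverage numbers (no Scrapy Cloud API calls):

import pandas as pd

# Rows are jobs, columns are field coverages (fraction of items with a value).
stats = pd.DataFrame(
    {"price": [0.95, 0.97, 0.96, 0.50], "name": [1.0, 0.99, 1.0, 0.99]},
    index=["job1", "job2", "job3", "target"],
)
sample = ["job1", "job2", "job3"]

stats.loc["mean"] = stats.loc[sample].mean()
stats.loc["std"] = stats.loc[sample].std()
stats = stats.T
stats["target deviation"] = stats["target"] - stats["mean"]

# "price" deviates by ~0.46 while its sample std is ~0.01, so it gets flagged.
devs = stats[stats["target deviation"].abs() > 2 * stats["std"]]
print(devs.index.tolist())  # ['price']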
Example #6
def compare_names_for_same_urls(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    tagged_fields: Dict[str, List[str]],
):
    """For each pair of items that have the same `product_url_field` tagged field,
    compare `name_field` field"""

    result = Result("Compare Names Per Url")
    url_field = tagged_fields.get("product_url_field")
    if not url_field:
        result.add_info("product_url_field tag is not set")
        return result

    url_field = url_field[0]
    name_field = tagged_fields.get("name_field")

    diff_names_count = 0
    if not name_field:
        result.add_info("name_field tag is not set")
        return result

    name_field = name_field[0]
    if any([
            name_field not in source_df.columns.values,
            name_field not in target_df.columns.values,
    ]):
        return result

    same_urls = source_df[(source_df[url_field].isin(
        target_df[url_field].values))][url_field]

    detailed_messages = []
    for url in same_urls:
        if url.strip() != "nan":
            source_name = source_df[source_df[url_field] ==
                                    url][name_field].iloc[0]
            target_name = target_df[target_df[url_field] ==
                                    url][name_field].iloc[0]

            if (source_name != target_name and source_name.strip() != "nan"
                    and target_name.strip() != "nan"):
                diff_names_count += 1
                source_key = source_df[source_df[url_field] ==
                                       url]["_key"].iloc[0]
                target_key = target_df[target_df[url_field] ==
                                       url]["_key"].iloc[0]
                msg = (
                    f"different names for url: {url}\nsource name is {source_name} "
                    f"for {source_key}\ntarget name is {target_name} for {target_key}"
                )
                detailed_messages.append(msg)

    res = f"{len(same_urls)} checked, {diff_names_count} errors"
    if detailed_messages:
        result.add_error(res, detailed="\n".join(detailed_messages))
    else:
        result.add_info(res)

    return result
Example #7
def check_items(df: pd.DataFrame, tagged_fields: Dict[str,
                                                      List[str]]) -> Result:
    """Check for items with the same name and url"""

    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    result = Result("Duplicated Items")
    if not name_fields or not url_fields:
        result.add_info(
            "'name_field' and 'product_url_field' tags were not found in schema"
        )
    else:
        result.items_count = len(df)
        errors = {}
        name_field = name_fields[0]
        url_field = url_fields[0]
        df = df[[name_field, url_field, "_key"]]
        duplicates = df[df[[name_field, url_field]].duplicated(keep=False)]
        if duplicates.empty:
            return result

        result.err_items_count = len(duplicates)
        for _, d in duplicates.groupby([name_field, url_field]):
            msg = (
                f"same '{d[name_field].iloc[0]}' name and '{d[url_field].iloc[0]}' url"
            )
            errors[msg] = list(d["_key"])
        result.add_error(
            f"{len(duplicates)} duplicate(s) with same name and url",
            errors=errors)
    return result
Example #8
def compare_runtime(source_job: Job, target_job: Job) -> Result:
    source_runtime = api.get_runtime(source_job)
    target_runtime = api.get_runtime(target_job)

    result = Result("Compare Runtime")
    if not source_runtime or not target_runtime:
        result.add_warning("Jobs are not finished")
    elif source_runtime > target_runtime:
        runtime_ratio_diff = helpers.ratio_diff(source_runtime, target_runtime)
        msg = (
            f"Sources differ on {runtime_ratio_diff}% - "
            f"{helpers.ms_to_time(source_runtime)} and "
            f"{helpers.ms_to_time(target_runtime)}"
        )
        if runtime_ratio_diff > 0.2:
            result.add_error(msg)
        elif runtime_ratio_diff > 0.1:
            result.add_warning(msg)
        else:
            result.add_info(msg)
    else:
        result.add_info(
            f"Similar or better runtime - {helpers.ms_to_time(source_runtime)} and "
            f"{helpers.ms_to_time(target_runtime)}"
        )
    return result
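
`helpers.ratio_diff` is not shown here; a hypothetical stand-in (an assumption, not the library's actual definition) is enough to experiment with the 10%/20% thresholds:

def ratio_diff(source: float, target: float) -> float:
    """Hypothetical stand-in: relative difference between two non-negative numbers."""
    if max(source, target) == 0:
        return 0.0
    return abs(source - target) / max(source, target)

# Runtimes in milliseconds: 72 000 000 ms vs 60 000 000 ms.
print(round(ratio_diff(72_000_000, 60_000_000), 2))  # 0.17 -> warning range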
Example #9
def check_outcome(job: Job) -> Result:
    state = api.get_job_state(job)
    reason = api.get_job_close_reason(job)
    result = Result("Job Outcome")
    if state != "finished" or reason != "finished":
        result.add_error(f"Job has '{state}' state, '{reason}' close reason")
    return result
Example #10
def compare_prices_for_same_urls(source_df: pd.DataFrame,
                                 target_df: pd.DataFrame,
                                 tagged_fields: TaggedFields) -> Result:
    """For each pair of items that have the same `product_url_field` tagged field,
    compare `product_price_field` field

    Returns:
        A result containing pairs of items from `source_df` and `target_df`
        whose `product_price_field` values differ.
    """
    result = Result("Compare Prices For Same Urls")
    url_field_list: Optional[List[str]] = tagged_fields.get(
        "product_url_field")
    if not url_field_list:
        result.outcome = Outcome.SKIPPED
        return result

    url_field = url_field_list[0]

    source_df = source_df.dropna(subset=[url_field])
    target_df = target_df.dropna(subset=[url_field])

    same_urls = source_df[(source_df[url_field].isin(
        target_df[url_field].values))][url_field]

    price_fields = tagged_fields.get("product_price_field")
    if not price_fields:
        result.add_info("product_price_field tag is not set")
    else:
        price_field = price_fields[0]
        detailed_messages = []
        for url in same_urls:
            if url.strip() != "nan":
                source_price = source_df[source_df[url_field] ==
                                         url][price_field].iloc[0]
                target_price = target_df[target_df[url_field] ==
                                         url][price_field].iloc[0]

                if (is_number(source_price) and is_number(target_price)
                        and ratio_diff(source_price, target_price) > 0.1):
                    source_key = source_df[source_df[url_field] ==
                                           url].index[0]
                    target_key = target_df[target_df[url_field] ==
                                           url].index[0]
                    msg = (
                        f"different prices for url: {url}\nsource price is {source_price} "
                        f"for {source_key}\ntarget price is {target_price} for {target_key}"
                    )
                    detailed_messages.append(msg)

        res = f"{len(same_urls)} checked, {len(detailed_messages)} errors"
        if detailed_messages:
            result.add_error(res, detailed="\n".join(detailed_messages))
        else:
            result.add_info(res)

    return result
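
The URL matching reduces to `Series.isin` plus boolean-indexed lookups; a toy sketch with hypothetical frames (no tags or `Result` involved):

import pandas as pd

source_df = pd.DataFrame({"url": ["u1", "u2"], "price": [10.0, 5.0]})
target_df = pd.DataFrame({"url": ["u1", "u3"], "price": [12.0, 7.0]})

# URLs present in both crawls.
same_urls = source_df[source_df["url"].isin(target_df["url"].values)]["url"]

for url in same_urls:
    source_price = source_df[source_df["url"] == url]["price"].iloc[0]
    target_price = target_df[target_df["url"] == url]["price"].iloc[0]
    print(url, source_price, target_price)  # u1 10.0 12.0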
Example #11
def compare_was_now(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Compare price_was and price_now tagged fields"""

    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if (price_was_fields and price_was_fields[0] in df.columns and price_fields
            and price_fields[0] in df.columns):
        price_field = price_fields[0]
        price_was_field = price_was_fields[0]
        prices = df.copy()
        prices[price_was_field] = prices[price_was_field].astype(float)
        prices[price_field] = prices[price_field].astype(float)

        df_prices_less = pd.DataFrame(
            prices[prices[price_was_field] < prices[price_field]],
            columns=["_key", price_was_field, price_field],
        )

        price_less_percent = "{:.2%}".format(
            len(df_prices_less) / items_number)

        if not df_prices_less.empty:
            error = f"Past price is less than current for {len(df_prices_less)} items"
            result.add_error(
                f"{price_less_percent} ({len(df_prices_less)}) of "
                f"items with {price_was_field} < {price_field}",
                detailed=f"{error}:\n{list(df_prices_less['_key'])}",
            )

        df_prices_equals = pd.DataFrame(
            prices[prices[price_was_field] == prices[price_field]],
            columns=["_key", price_was_field, price_field],
        )
        price_equal_percent = "{:.2%}".format(
            len(df_prices_equals) / items_number)

        if not df_prices_equals.empty:
            result.add_warning(
                (f"{price_equal_percent} ({len(df_prices_equals)}) "
                 f"of items with {price_was_field} = {price_field}"),
                detailed=(f"Prices equal for {len(df_prices_equals)} items:\n"
                          f"{list(df_prices_equals['_key'])}"),
            )

        result.err_items_count = len(df_prices_equals) + len(df_prices_less)
        result.items_count = len(df.index)

    else:
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema")
    return result
Example #12
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
    """Compare price_was and price_now tagged fields"""

    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if not price_was_fields or not price_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=[price_was_field, price_field],
    )

    price_less_percent = "{:.2%}".format(len(df_prices_less) / items_number)

    if not df_prices_less.empty:
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less.index)}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=[price_was_field, price_field],
    )
    price_equal_percent = "{:.2%}".format(len(df_prices_equals) / items_number)

    if not df_prices_equals.empty:
        result.add_warning(
            (
                f"{price_equal_percent} ({len(df_prices_equals)}) "
                f"of items with {price_was_field} = {price_field}"
            ),
            detailed=(
                f"Prices equal for {len(df_prices_equals)} items:\n"
                f"{list(df_prices_equals.index)}"
            ),
        )

    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = len(df.index)

    return result
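
The was/now check boils down to two boolean masks over float-cast price columns; a minimal sketch with made-up data:

import pandas as pd

prices = pd.DataFrame(
    {"price_was": ["20", "10", "15"], "price": ["25", "10", "12"]},
    index=["k1", "k2", "k3"],
).astype(float)

# A "was" price below the current price is an error; equal prices are a warning.
less = prices[prices["price_was"] < prices["price"]]
equal = prices[prices["price_was"] == prices["price"]]
print(list(less.index), list(equal.index))  # ['k1'] ['k2']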
Example #13
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare booleans distribution between two dataframes

    Returns:
        A result containing dataframe with distributions and messages if differences
        are in thresholds
    """

    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_bool, target_bool):
        result.outcome = Outcome.SKIPPED
        return result

    dummy = pd.DataFrame(columns=[True, False])
    source_counts = pd.concat(
        [dummy, source_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    target_counts = pd.concat(
        [dummy, target_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    difs = (source_counts - target_counts)[True]

    bool_covs = pd.concat(
        [
            source_counts.rename("{}_source".format),
            target_counts.rename("{}_target".format),
        ]
    ).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    err_diffs = difs[difs.abs() > err_thr]
    if not err_diffs.empty:
        result.add_error(
            f"{', '.join(err_diffs.index)} relative frequencies differ "
            f"by more than {err_thr:.0%}"
        )

    warn_diffs = difs[(difs.abs() > warn_thr) & (difs.abs() <= err_thr)]
    if not warn_diffs.empty:
        result.add_warning(
            f"{', '.join(warn_diffs.index)} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}"
        )

    return result
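
The distributions come from normalized value counts per boolean column; a standalone sketch of that step:

import pandas as pd

source_bool = pd.DataFrame({"in_stock": [True, True, False, True]})
target_bool = pd.DataFrame({"in_stock": [True, False, False, False]})

# Relative frequency of True/False per column; .T puts the fields on the index.
source_counts = source_bool.apply(pd.value_counts, normalize=True).T.fillna(0.0)
target_counts = target_bool.apply(pd.value_counts, normalize=True).T.fillna(0.0)

difs = (source_counts - target_counts)[True]
print(difs["in_stock"])  # 0.5 -> would exceed a 10% error threshold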
Example #14
def check_errors(job: Job) -> Result:
    errors_count = api.get_errors_count(job)
    result = Result("Job Errors")
    if errors_count:
        url = f"{SH_URL}/{job.key}/log?filterType=error&filterAndHigher"
        result.add_error(
            f"{errors_count} error(s)", detailed=f"Errors for {job.key} - {url}"
        )
    else:
        result.add_info(f"No errors")
    return result
Example #15
def check_errors(source_job: Job) -> Result:
    source_errs = api.get_errors_count(source_job)
    result = Result("Job Errors")
    if not source_errs:
        return result

    errors_url = "{}/{}/log?filterType=error&filterAndHigher"
    result.add_error(
        f"{source_errs} error(s) - {errors_url.format(SH_URL, source_job.key)}"
    )
    return result
Example #16
def compare_scraped_fields(source_df: pd.DataFrame,
                           target_df: pd.DataFrame) -> Result:
    """Find new or missing columns between source_df and target_df"""
    result = Result("Scraped Fields")
    missing_fields = target_df.columns.difference(source_df.columns)

    if missing_fields.array:
        result.add_error(f"Missing - {', '.join(missing_fields)}")

    new_fields = source_df.columns.difference(target_df.columns)
    if new_fields.array:
        result.add_info(f"New - {', '.join(new_fields)}")

    return result
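
`Index.difference` does the set arithmetic directly on the column indexes; a quick illustration:

import pandas as pd

source_df = pd.DataFrame(columns=["name", "price", "rating"])
target_df = pd.DataFrame(columns=["name", "price", "url"])

print(list(target_df.columns.difference(source_df.columns)))  # ['url'] - missing
print(list(source_df.columns.difference(target_df.columns)))  # ['rating'] - new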
Example #17
def compare_errors(source_job: Job, target_job: Job) -> Result:
    errors_count1 = api.get_errors_count(source_job)
    errors_count2 = api.get_errors_count(target_job)

    result = Result("Compare Job Errors")
    if errors_count1:
        errors_url = "{}/{}/log?filterType=error&filterAndHigher"
        detailed_msg = (
            f"{errors_count1} error(s) for {source_job.key} - "
            f"{errors_url.format(SH_URL, source_job.key)}\n"
            f"{errors_count2} error(s) for {target_job.key} - "
            f"{errors_url.format(SH_URL, target_job.key)}"
        )
        result.add_error(f"{errors_count1} and {errors_count2} errors", detailed_msg)
    return result
Example #18
def compare_prices_for_same_names(source_df: pd.DataFrame,
                                  target_df: pd.DataFrame,
                                  tagged_fields: TaggedFields):
    result = Result("Compare Prices For Same Names")
    name_field_tag = tagged_fields.get("name_field")
    if not name_field_tag:
        result.outcome = Outcome.SKIPPED
        return result

    name_field = name_field_tag[0]
    source_df = source_df[source_df[name_field].notnull()]
    target_df = target_df[target_df[name_field].notnull()]

    same_names = source_df[(source_df[name_field].isin(
        target_df[name_field].values))][name_field]

    price_fields = tagged_fields.get("product_price_field")
    if not price_fields:
        result.add_info("product_price_field tag is not set")
        return result
    price_field = price_fields[0]

    detailed_messages = []
    for name in same_names:
        if name.strip() != "nan":
            source_price = source_df[source_df[name_field] ==
                                     name][price_field].iloc[0]
            target_price = target_df[target_df[name_field] ==
                                     name][price_field].iloc[0]
            if is_number(source_price) and is_number(target_price):
                if ratio_diff(source_price, target_price) > 0.1:
                    source_key = source_df[source_df[name_field] ==
                                           name].index[0]
                    target_key = target_df[target_df[name_field] ==
                                           name].index[0]
                    msg = (
                        f"different price for {name}\nsource price is {source_price} "
                        f"for {source_key}\ntarget price is {target_price} for {target_key}"
                    )
                    detailed_messages.append(msg)

    result_msg = f"{len(same_names)} checked, {len(detailed_messages)} errors"
    if detailed_messages:
        result.add_error(result_msg, detailed="\n".join(detailed_messages))
    else:
        result.add_info(result_msg)

    return result
Example #19
def get_difference(source_job: Job,
                   target_job: Job,
                   err_thr: float = 0.10,
                   warn_thr: float = 0.05) -> Result:
    """Get difference between jobs coverages. The coverage is job fields counts
    divided on the job size.

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare
        err_thr: a threshold for errors
        warn_thr: a threshold for warnings

    Returns:
        A Result instance with significant differences, and stats with field count coverages and differences
    """
    result = Result("Coverage Difference")
    f_counts = (pd.DataFrame({
        source_job.key: api.get_counts(source_job),
        target_job.key: api.get_counts(target_job),
    }).drop(index=["_type"]).fillna(0).sort_values(by=[source_job.key],
                                                   kind="mergesort"))
    f_counts[source_job.key] = f_counts[source_job.key].divide(
        api.get_items_count(source_job))
    f_counts[target_job.key] = f_counts[target_job.key].divide(
        api.get_items_count(target_job))
    f_counts.name = "Coverage from job stats fields counts"
    result.stats.append(f_counts)

    coverage_difs = f_counts[source_job.key] - f_counts[target_job.key]
    coverage_difs = coverage_difs[coverage_difs.abs() > warn_thr].sort_values(
        kind="mergesort")
    coverage_difs.name = f"Coverage difference more than {warn_thr:.0%}"
    if not coverage_difs.empty:
        result.stats.append(coverage_difs)

    errs = coverage_difs[coverage_difs.abs() > err_thr]
    if not errs.empty:
        result.add_error(
            f"The difference is greater than {err_thr:.0%} for {len(errs)} field(s)"
        )
    warns = coverage_difs[(coverage_difs > warn_thr)
                          & (coverage_difs <= err_thr)]
    if not warns.empty:
        result.add_warning(
            f"The difference is between {warn_thr:.0%} and {err_thr:.0%} "
            f"for {len(warns)} field(s)")
    return result
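
Stripped of the API calls, the rule divides per-field counts by item counts and compares the two coverage columns; a sketch with hypothetical job stats:

import pandas as pd

# Hypothetical stats: field counts per job; each job scraped 1000 items.
f_counts = pd.DataFrame({
    "source/1/1": {"name": 1000, "price": 700},
    "target/2/2": {"name": 990, "price": 810},
}).fillna(0)
f_counts["source/1/1"] /= 1000
f_counts["target/2/2"] /= 1000

coverage_difs = f_counts["source/1/1"] - f_counts["target/2/2"]
print(coverage_difs[coverage_difs.abs() > 0.05])  # price: -0.11 -> error at 10%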
Example #20
def compare_names_for_same_urls(source_df: pd.DataFrame,
                                target_df: pd.DataFrame,
                                tagged_fields: TaggedFields):
    """For each pair of items that have the same `product_url_field` tagged field,
    compare `name_field` field"""

    result = Result("Compare Names Per Url")
    url_field_list: Optional[List[str]] = tagged_fields.get(
        "product_url_field")
    name_field_list: Optional[List[str]] = tagged_fields.get("name_field")
    if not url_field_list or not name_field_list:
        result.outcome = Outcome.SKIPPED
        return result

    name_field: str = name_field_list[0]
    url_field: str = url_field_list[0]
    diff_names_count = 0

    same_urls = source_df[(source_df[url_field].isin(
        target_df[url_field].values))][url_field]

    detailed_messages = []
    for url in same_urls:
        if url.strip() != "nan":
            source_name = source_df[source_df[url_field] ==
                                    url][name_field].iloc[0]
            target_name = target_df[target_df[url_field] ==
                                    url][name_field].iloc[0]

            if (source_name != target_name and source_name.strip() != "nan"
                    and target_name.strip() != "nan"):
                diff_names_count += 1
                source_key = source_df[source_df[url_field] == url].index[0]
                target_key = target_df[target_df[url_field] == url].index[0]
                msg = (
                    f"different names for url: {url}\nsource name is {source_name} "
                    f"for {source_key}\ntarget name is {target_name} for {target_key}"
                )
                detailed_messages.append(msg)

    res = f"{len(same_urls)} checked, {diff_names_count} errors"
    if detailed_messages:
        result.add_error(res, detailed="\n".join(detailed_messages))
    else:
        result.add_info(res)

    return result
Example #21
def compare_fields_counts(source_job, target_job):
    """Compare the relative difference between field counts to items count

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare

    Returns:
        A Result instance
    """
    source_items_count = get_items_count(source_job)
    target_items_count = get_items_count(target_job)
    result = Result("Fields Counts")

    source_fields = pd.DataFrame(
        {"Count1": source_job.items.stats().get("counts", None)})
    target_fields = pd.DataFrame(
        {"Count2": target_job.items.stats().get("counts", None)})
    fields = pd.concat([source_fields, target_fields], axis=1,
                       sort=True).fillna(0)
    fields["Difference, %"] = fields.apply(
        lambda row: ratio_diff(row["Count1"] / source_items_count, row[
            "Count2"] / target_items_count) * 100,
        axis=1,
    )
    fields["Difference, %"] = fields["Difference, %"].astype(int)
    fields = fields.sort_values(by=["Difference, %"], ascending=False)

    err_diffs = fields[fields["Difference, %"] > 10]
    if not err_diffs.empty:
        result.add_error(
            f"Coverage difference is greater than 10% for "
            f"{len(err_diffs)} field(s)",
            err_diffs.to_string(columns=["Difference, %"]),
        )

    warn_diffs = fields[(fields["Difference, %"] > 5)
                        & (fields["Difference, %"] <= 10)]
    if not warn_diffs.empty:
        outcome_msg = (f"Coverage difference is between 5% and 10% for "
                       f"{len(warn_diffs)} field(s)")
        result.add_warning(outcome_msg,
                           warn_diffs.to_string(columns=["Difference, %"]))

    return result
Example #22
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare request with response per item ratio"""
    s_ratio = round(
        api.get_requests_count(source_job) / api.get_items_count(source_job),
        2)
    t_ratio = round(
        api.get_requests_count(target_job) / api.get_items_count(target_job),
        2)

    response_ratio_diff = helpers.ratio_diff(s_ratio, t_ratio)
    msg = f"Difference is {response_ratio_diff:.2%} - {s_ratio} and {t_ratio}"

    result = Result("Compare Responses Per Item Ratio")
    if response_ratio_diff > 0.2:
        result.add_error(msg)
    elif response_ratio_diff > 0.1:
        result.add_warning(msg)
    return result
Example #23
def check_fields_coverage(df):
    fields_coverage = pd.DataFrame(df.count(), columns=["Values Count"])
    fields_coverage.index.name = "Field"
    fields_coverage["Percent"] = fields_coverage.apply(
        lambda row: int(row["Values Count"] / len(df) * 100), axis=1)

    detailed_msg = fields_coverage.sort_values(
        by=["Percent", "Field"]).to_string()

    empty_fields = fields_coverage[fields_coverage["Values Count"] == 0]
    result_msg = f"{len(empty_fields)} totally empty field(s)"

    result = Result("Fields Coverage")
    if empty_fields.empty:
        result.add_info(result_msg, detailed_msg)
    else:
        result.add_error(result_msg, detailed_msg)
    return result
Example #24
def compare_number_of_scraped_items(source_job: Job, target_job: Job) -> Result:
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)
    diff = helpers.ratio_diff(items_count1, items_count2)
    result = Result("Total Scraped Items")
    if 0 <= diff < 0.05:
        if diff == 0:
            msg = "Same number of items"
        else:
            msg = f"Almost the same number of items - {items_count1} and {items_count2}"
        result.add_info(msg)
    else:
        msg = f"{items_count1} differs from {items_count2} on {diff * 100}%"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
Example #25
def garbage_symbols(df: pd.DataFrame) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
        r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*?/??>|<!--|-->)")

    errors = {}
    row_keys = set()
    rule_result = Result("Garbage Symbols", items_count=len(df))

    for column in tqdm_notebook(df.select_dtypes([np.object]).columns,
                                desc="Garbage Symbols"):
        matches = df[column].apply(str).str.extractall(garbage,
                                                       flags=re.IGNORECASE)
        if not matches.empty:
            error_keys = df.loc[matches.unstack().index.values].index
            bad_texts = matches.stack().value_counts().index.sort_values(
            ).tolist()
            # escape backslashes for markdown repr, `\n > \\n`
            bad_texts = [
                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
                for bx in bad_texts
            ]
            error = (f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
                     f"values contain `{', '.join(bad_texts)}`")

            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    if errors:
        rule_result.add_error(
            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
        rule_result.err_items_count = len(row_keys)

    return rule_result
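
`Series.str.extractall` with named groups drives the rule; a trimmed-down variant of the pattern, checked against a handful of strings (pandas only):

import pandas as pd

garbage = (
    r"(?P<spaces>^\s|\s$)"
    r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
    r"|(?P<html_tags></??(?:b|i|br|p)\s*?/??>)"
)

values = pd.Series([" padded", "fine value", "5 &amp; 10", "<br>line"])
matches = values.astype(str).str.extractall(garbage)

# Level 0 of the MultiIndex tells which rows carried garbage.
print(matches.index.get_level_values(0).unique().tolist())  # [0, 2, 3]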
Example #26
def compare_number_of_scraped_items(source_job: Job,
                                    target_job: Job) -> Result:
    s_count = api.get_items_count(source_job)
    t_count = api.get_items_count(target_job)
    diff = helpers.ratio_diff(s_count, t_count)
    result = Result("Total Scraped Items")
    if 0 <= diff < 0.05:
        if diff == 0:
            msg = "Same number of items"
        else:
            msg = f"Almost the same number of items - {s_count} and {t_count}"
        result.add_info(msg)
    else:
        msg = f"{s_count} differs from {t_count} on {diff:.2%}"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
Example #27
def validate(schema: Schema, items_dicts: Items, fast: bool = False) -> Result:
    """Run JSON schema validation against Items.

    Args:
        fast: whether to use fastjsonschema instead of jsonschema for validation
    """
    validator = JsonSchemaValidator(schema)
    validator.run(items_dicts, fast)
    result = Result("JSON Schema Validation")

    errors = validator.errors
    schema_result_message = (
        f"{len(items_dicts)} items were checked, {len(errors)} error(s)")

    if errors:
        result.add_error(schema_result_message, errors=errors)
    else:
        result.add_info(schema_result_message)
    return result
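
`JsonSchemaValidator` wraps a third-party validator; the equivalent check with plain `jsonschema` (an illustration of the idea, not the wrapper's actual internals) looks roughly like:

from jsonschema import Draft7Validator

schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "price": {"type": "number"}},
    "required": ["name"],
}
items = [{"name": "A", "price": 9.99}, {"price": "free"}]

validator = Draft7Validator(schema)
for i, item in enumerate(items):
    for error in validator.iter_errors(item):
        print(i, error.message)
# -> two errors for item 1: 'name' is a required property; 'free' is not of type 'number'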
Example #28
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare request with response per item ratio"""
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)

    source_ratio = round(api.get_requests_count(source_job) / items_count1, 2)
    target_ratio = round(api.get_requests_count(target_job) / items_count2, 2)

    response_ratio_diff = helpers.ratio_diff(source_ratio, target_ratio)
    msg = "Difference is {}% - {} and {}".format(
        response_ratio_diff * 100, source_ratio, target_ratio
    )

    result = Result("Compare Responses Per Item Ratio")
    if response_ratio_diff > 0.2:
        result.add_error(msg)
    elif response_ratio_diff > 0.1:
        result.add_warning(msg)
    return result
Example #29
def garbage_symbols(items: Items) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&amp|&reg)"
        r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
        r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*/?>|<!--|-->)")

    errors = {}
    row_keys = set()
    rule_result = Result("Garbage Symbols", items_count=items.size)

    for column in items.flat_df.select_dtypes([np.object]):
        matches = items.flat_df[column].str.extractall(garbage,
                                                       flags=re.IGNORECASE)
        matches = matches[["spaces", "html_entities", "css", "html_tags"]]
        if not matches.empty:
            error_keys = items.flat_df.iloc[
                matches.unstack().index.values]["_key"]
            original_column = items.get_origin_column_name(column)
            bad_texts = matches.stack().value_counts().index.sort_values(
            ).tolist()
            error = (
                f"{len(error_keys)/items.size*100:.1f}% of '{original_column}' "
                f"values contain {[t[:20] for t in bad_texts]}")
            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)

    if errors:
        rule_result.add_error(
            f"{len(row_keys)/items.size * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
        rule_result.err_items_count = len(row_keys)

    return rule_result
Example #30
def check_fields_coverage(df: pd.DataFrame) -> Result:
    """Get fields coverage from df. Coverage reflects the percentage of real values
    (exluding `nan`) per column.

    Args:
        df: the data to count coverage for

    Returns:
        A result with coverage for all columns in the provided df. If a column contains
        only `nan`, it is treated as an error.
    """
    fields_coverage = df.count().sort_values(ascending=False)
    fields_coverage.name = f"Fields coverage for {len(df):_} items"

    empty_fields = fields_coverage[fields_coverage == 0]

    result = Result("Fields Coverage")
    result.stats = [fields_coverage]
    if not empty_fields.empty:
        result.add_error(f"{len(empty_fields)} empty field(s)")
    return result
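
`DataFrame.count` skips `NaN` values, which is exactly what the coverage stat needs; a quick sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "name": ["a", "b", "c", "d"],
    "price": [1.0, np.nan, 3.0, np.nan],
    "rating": [np.nan] * 4,
})

fields_coverage = df.count().sort_values(ascending=False)
print(fields_coverage.to_dict())  # {'name': 4, 'price': 2, 'rating': 0}
print(list(fields_coverage[fields_coverage == 0].index))  # ['rating'] -> error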