Example 1
def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result:
    """Find category columns. A category column is the column which holds a limited number
    of possible values, including `NAN`.

    Args:
        df: data
        max_uniques: filter which determines which columns to use. Only columns with
        the number of unique values less than or equal to `max_uniques` are category columns.

    Returns:
        A result with stats containing value counts of categorical columns.
    """
    result = Result("Categories")

    columns = find_likely_cats(df, max_uniques)
    result.stats = [
        value_counts
        for value_counts in tqdm(
            map(lambda c: df[c].value_counts(dropna=False), columns),
            desc="Finding categories",
            total=len(columns),
        )
        if len(value_counts) <= max_uniques
    ]
    if not result.stats:
        result.add_info("Categories were not found")
        return result
    result.add_info(f"{len(result.stats)} category field(s)")
    result.outcome = Outcome.INFO
    return result
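For illustration, the same filtering idea in plain pandas; the toy DataFrame and the inline `nunique` filter below are assumptions, not part of the library:

import pandas as pd

df = pd.DataFrame({
    "color": ["red", "blue", "red", None],  # 3 uniques incl. NaN -> category
    "id": [1, 2, 3, 4],                     # all unique -> not a category
})
max_uniques = 3
stats = [
    df[c].value_counts(dropna=False)  # dropna=False keeps NaN as a category
    for c in df.columns
    if df[c].nunique(dropna=False) <= max_uniques
]
print(stats)  # only 'color' qualifies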
Example 2
def check_uniqueness(df: pd.DataFrame,
                     tagged_fields: Dict[str, List[str]]) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non-unique items
    """
    unique_fields = tagged_fields.get("unique", [])
    result = Result("Uniqueness")

    if not unique_fields:
        result.add_info("'unique' tag was not found in schema")
        return result

    err_keys = set()
    for field in unique_fields:
        result.items_count = df[field].count()
        duplicates = df[df[field].duplicated(keep=False)][[field, "_key"]]
        errors = {}
        for _, d in duplicates.groupby([field]):
            keys = list(d["_key"])
            msg = f"same '{d[field].iloc[0]}' {field}"
            errors[msg] = keys
            err_keys = err_keys.union(keys)
        if not duplicates.empty:
            result.add_error(
                f"'{field}' contains {len(duplicates[field].unique())} duplicated value(s)",
                errors=errors,
            )

    result.err_items_count = len(err_keys)
    return result
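The duplicate-detection core can be exercised without the `Result` plumbing; the toy frame and its `_key` column below are illustrative assumptions:

import pandas as pd

df = pd.DataFrame({"sku": ["a", "b", "a"], "_key": ["0/1", "0/2", "0/3"]})
# keep=False marks every member of a duplicated group, not just the repeats
duplicates = df[df["sku"].duplicated(keep=False)][["sku", "_key"]]
for value, d in duplicates.groupby("sku"):
    print(f"same '{value}' sku -> {list(d['_key'])}")
# same 'a' sku -> ['0/1', '0/3']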
Example 3
def check_outcome(job: Job) -> Result:
    state = api.get_job_state(job)
    reason = api.get_job_close_reason(job)
    result = Result("Job Outcome")
    if state != "finished" or reason != "finished":
        result.add_error(f"Job has '{state}' state, '{reason}' close reason")
    return result
Example 4
def find_by(df: pd.DataFrame, uniques: List[Union[str, List[str]]]) -> Result:
    """Find equal items rows in `df` by `uniques`. I.e. if two items have the same
    uniques's element value, they are considered duplicates.

    Args:
        uniques: list containing columns and list of columns to identify duplicates.
        List of columns means that all list columns values should be equal.

    Returns:
        Any duplicates
    """
    result = Result("Duplicates")
    result.items_count = len(df)

    df = df.dropna(subset=list(set(flatten(uniques))), how="all")
    for columns in uniques:
        mask = columns if isinstance(columns, list) else [columns]
        duplicates = df[df.duplicated(columns, keep=False)][mask]
        if duplicates.empty:
            continue

        errors = {}
        grouped = duplicates.groupby(columns)
        for _, d in grouped:
            msgs = [f"'{d[c].iloc[0]}' `{c}`" for c in mask]
            errors[f"same {', '.join(msgs)}"] = list(d.index)
        result.add_error(
            f"{', '.join(mask)} contains {len(grouped)} duplicated value(s)",
            errors=errors,
        )
    return result
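`DataFrame.duplicated` accepts a single label or a list of labels, which is what lets `uniques` mix both forms; a self-contained sketch on made-up data:

import pandas as pd

df = pd.DataFrame({
    "name": ["pen", "pen", "cup"],
    "url": ["/p1", "/p1", "/c1"],
})
# single column: rows sharing a name
print(df[df.duplicated("name", keep=False)])
# list of columns: rows sharing both name and url
print(df[df.duplicated(["name", "url"], keep=False)])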
Example 5
def check_tags(source_columns: np.ndarray, target_columns: np.ndarray,
               tags: TaggedFields) -> Result:
    result = Result("Tags")

    found_tags = sorted(tags)
    if found_tags:
        result.add_info(f"Used - {', '.join(found_tags)}")

    all_tags = set(Tag.__members__)
    not_used_tags = sorted(all_tags - set(tags))
    if not_used_tags:
        result.add_info(f"Not used - {', '.join(not_used_tags)}")

    tagged_fields = []
    for tag in tags:
        tagged_fields.extend(tags[tag])

    missing_in_source = sorted(set(tagged_fields) - set(source_columns))
    if missing_in_source:
        result.add_error(
            f"{', '.join(missing_in_source)} field(s) were not found in "
            "source, but specified in schema")

    if target_columns is not None:
        missing_in_target = sorted(set(tagged_fields) - set(target_columns))
        if missing_in_target:
            result.add_error(
                f"{', '.join(missing_in_target)} field(s) were not found "
                "in target, but specified in schema")

    if result.errors:
        result.add_error("Skipping tag rules")

    return result
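The used/unused split relies on `Tag.__members__`; a minimal sketch with a hypothetical `Tag` enum (the real library defines its own members):

from enum import Enum

class Tag(Enum):  # hypothetical members, for illustration only
    unique = "unique"
    category = "category"
    name_field = "name_field"

tags = {"unique": ["sku"]}
all_tags = set(Tag.__members__)  # member names as strings
print(sorted(all_tags - set(tags)))  # ['category', 'name_field']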
Example 6
def compare_runtime(source_job: Job, target_job: Job) -> Result:
    source_runtime = api.get_runtime(source_job)
    target_runtime = api.get_runtime(target_job)

    result = Result("Compare Runtime")
    if not source_runtime or not target_runtime:
        result.add_warning("Jobs are not finished")
    elif source_runtime > target_runtime:
        runtime_ratio_diff = helpers.ratio_diff(source_runtime, target_runtime)
        msg = (
            f"Sources differ on {runtime_ratio_diff}% - "
            f"{helpers.ms_to_time(source_runtime)} and "
            f"{helpers.ms_to_time(target_runtime)}"
        )
        if runtime_ratio_diff > 0.2:
            result.add_error(msg)
        elif runtime_ratio_diff > 0.1:
            result.add_warning(msg)
        else:
            result.add_info(msg)
    else:
        result.add_info(
            f"Similar or better runtime - {helpers.ms_to_time(source_runtime)} and "
            f"{helpers.ms_to_time(target_runtime)}"
        )
    return result
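For reference, a plausible `helpers.ratio_diff` that returns a fraction, consistent with the 0.1/0.2 thresholds above; this definition is an assumption, not the library's actual helper:

def ratio_diff(source: float, target: float) -> float:
    """Relative difference as a fraction of the larger value."""
    return round(abs(source - target) / max(source, target), 2)

assert ratio_diff(120_000, 100_000) == 0.17  # 2 min vs ~1.7 min -> 17%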
Example 7
def anomalies(target: str, sample: List[str]) -> Result:
    """Find fields with significant deviation. Significant means `dev > 2 * std()`

    Args:
        target: where to look for anomalies
        sample: a list of jobs keys to infer metadata from

    Returns:
        A Result with a dataframe of significant deviations
    """
    result = Result("Anomalies")
    raw_stats = [job.items.stats() for job in api.get_jobs(sample + [target])]

    counts = (pd.DataFrame(
        rs.get("counts") for rs in raw_stats).fillna(0).drop(columns="_type"))
    items_len = [rs["totals"]["input_values"] for rs in raw_stats]
    stats = counts.apply(lambda x: x / items_len)
    stats.index = sample + [target]
    stats.rename(index={target: "target"}, inplace=True)
    stats.loc["mean"] = stats.loc[sample].mean()
    stats.loc["std"] = stats.loc[sample].std()
    stats = stats.T
    stats["target deviation"] = stats["target"] - stats["mean"]
    devs = stats[(stats["target deviation"].abs() > 2 * stats["std"])]
    devs.name = "Anomalies"
    if not devs.empty:
        result.add_error(
            f"{len(devs.index)} field(s) with significant coverage deviation")
        result.stats = [devs]

    return result
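The `2 * std` rule itself is plain pandas; a self-contained sketch on made-up coverage numbers:

import pandas as pd

# rows: jobs, columns: field coverage (fraction of items with the field)
stats = pd.DataFrame(
    {"price": [0.95, 0.97, 0.96, 0.50], "name": [1.0, 1.0, 1.0, 1.0]},
    index=["job1", "job2", "job3", "target"],
)
sample = ["job1", "job2", "job3"]
mean, std = stats.loc[sample].mean(), stats.loc[sample].std()
deviation = stats.loc["target"] - mean
print(deviation[deviation.abs() > 2 * std])  # only 'price' is flagged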
Example 8
def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non-unique items
    """
    unique_fields = tagged_fields.get("unique", [])
    result = Result("Duplicates By **unique** Tag")

    if not unique_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    err_keys = set()
    for field in unique_fields:
        result.items_count = df[field].count()
        duplicates = df[df.duplicated(field, keep=False)][[field]]
        errors = {}
        for _, d in duplicates.groupby([field]):
            keys = list(d.index)
            msg = f"same '{d[field].iloc[0]}' `{field}`"
            errors[msg] = keys
            err_keys = err_keys.union(keys)
        if not duplicates.empty:
            result.add_error(
                f"{field} contains {len(duplicates[field].unique())} duplicated value(s)",
                errors=errors,
            )

    result.err_items_count = len(err_keys)
    return result
Example 9
def check_tags(source_columns, target_columns, tags):
    result = Result("Tags")

    found_tags = list(tags)
    if found_tags:
        result.add_info(", ".join(found_tags))

    tagged_fields = []
    for tag in tags:
        tagged_fields.extend(tags[tag])

    missing_in_source = sorted(set(tagged_fields) - set(source_columns))
    if missing_in_source:
        result.add_error(
            f"{', '.join(missing_in_source)} field(s) were not found in "
            "source, but specified in schema")

    if target_columns.size > 0:
        missing_in_target = sorted(set(tagged_fields) - set(target_columns))
        if missing_in_target:
            result.add_error(
                f"{', '.join(missing_in_target)} field(s) were not found "
                "in target, but specified in schema")

    if result.errors:
        result.add_error("Skipping tag rules")

    return result
Example 10
def compare_scraped_fields(source_df, target_df):
    source_field_coverage = dict(
        source_df.count().sort_values(ascending=False))
    target_field_coverage = dict(
        target_df.count().sort_values(ascending=False))

    result = Result("Scraped Fields")
    missing_fields = set(target_df.columns.values) - set(
        source_df.columns.values)
    if missing_fields:
        detailed_messages = ["Missing Fields"]
        for field in missing_fields:
            target_coverage = target_field_coverage[field] / len(
                target_df) * 100
            detailed_messages.append(
                f"{field} - coverage - {int(target_coverage)}% - "
                f"{target_field_coverage[field]} items")
        result.add_error(f"{len(missing_fields)} field(s) are missing",
                         "\n".join(detailed_messages))

    new_fields = set(source_df.columns.values) - set(target_df.columns.values)
    if new_fields:
        detailed_messages = ["New Fields"]
        for field in new_fields:
            source_coverage = source_field_coverage[field] / len(
                source_df) * 100
            detailed_messages.append(
                f"{field} - coverage - {int(source_coverage)}% - "
                f"{source_field_coverage[field]} items")
        result.add_info(f"{len(new_fields)} field(s) are new",
                        "\n".join(detailed_messages))

    return result
Example 11
def compare_names_for_same_urls(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    tagged_fields: Dict[str, List[str]],
):
    """For each pair of items that have the same `product_url_field` tagged field,
    compare `name_field` field"""

    result = Result("Compare Names Per Url")
    url_field = tagged_fields.get("product_url_field")
    if not url_field:
        result.add_info("product_url_field tag is not set")
        return result

    url_field = url_field[0]
    name_field = tagged_fields.get("name_field")

    diff_names_count = 0
    if not name_field:
        result.add_info("name_field tag is not set")
        return result

    name_field = name_field[0]
    if any([
            name_field not in source_df.columns.values,
            name_field not in target_df.columns.values,
    ]):
        return

    same_urls = source_df[(source_df[url_field].isin(
        target_df[url_field].values))][url_field]

    detailed_messages = []
    for url in same_urls:
        if url.strip() != "nan":
            source_name = source_df[source_df[url_field] ==
                                    url][name_field].iloc[0]
            target_name = target_df[target_df[url_field] ==
                                    url][name_field].iloc[0]

            if (source_name != target_name and source_name.strip() != "nan"
                    and target_name.strip() != "nan"):
                diff_names_count += 1
                source_key = source_df[source_df[url_field] ==
                                       url]["_key"].iloc[0]
                target_key = target_df[target_df[url_field] ==
                                       url]["_key"].iloc[0]
                msg = (
                    f"different names for url: {url}\nsource name is {source_name} "
                    f"for {source_key}\ntarget name is {target_name} for {target_key}"
                )
                detailed_messages.append(msg)

    res = f"{len(same_urls)} checked, {diff_names_count} errors"
    if detailed_messages:
        result.add_error(res, detailed="\n".join(detailed_messages))
    else:
        result.add_info(res)

    return result
Example 12
def check_items(df: pd.DataFrame, tagged_fields: Dict[str,
                                                      List[str]]) -> Result:
    """Check for items with the same name and url"""

    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    result = Result("Duplicated Items")
    if not name_fields or not url_fields:
        result.add_info(
            "'name_field' and 'product_url_field' tags were not found in schema"
        )
    else:
        result.items_count = len(df)
        errors = {}
        name_field = name_fields[0]
        url_field = url_fields[0]
        df = df[[name_field, url_field, "_key"]]
        duplicates = df[df[[name_field, url_field]].duplicated(keep=False)]
        if duplicates.empty:
            return result

        result.err_items_count = len(duplicates)
        for _, d in duplicates.groupby([name_field, url_field]):
            msg = (
                f"same '{d[name_field].iloc[0]}' name and '{d[url_field].iloc[0]}' url"
            )
            errors[msg] = list(d["_key"])
        result.add_error(
            f"{len(duplicates)} duplicate(s) with same name and url",
            errors=errors)
    return result
Example 13
def check_response_ratio(job: Job) -> Result:
    requests_number = api.get_requests_count(job)
    items_count = api.get_items_count(job)
    result = Result("Responses Per Item Ratio")
    result.add_info(
        f"Number of responses / Number of scraped items - "
        f"{round(requests_number / items_count, 2)}"
    )
    return result
Example 14
def compare_prices_for_same_urls(source_df: pd.DataFrame,
                                 target_df: pd.DataFrame,
                                 tagged_fields: TaggedFields) -> Result:
    """For each pair of items that have the same `product_url_field` tagged field,
    compare `product_price_field` field

    Returns:
        A result containing pairs of items from `source_df` and `target_df`
        whose `product_price_field` values differ.
    """
    result = Result("Compare Prices For Same Urls")
    url_field_list: Optional[List[str]] = tagged_fields.get(
        "product_url_field")
    if not url_field_list:
        result.outcome = Outcome.SKIPPED
        return result

    url_field = url_field_list[0]

    source_df = source_df.dropna(subset=[url_field])
    target_df = target_df.dropna(subset=[url_field])

    same_urls = source_df[(source_df[url_field].isin(
        target_df[url_field].values))][url_field]

    price_fields = tagged_fields.get("product_price_field")
    if not price_fields:
        result.add_info("product_price_field tag is not set")
    else:
        price_field = price_fields[0]
        detailed_messages = []
        for url in same_urls:
            if url.strip() != "nan":
                source_price = source_df[source_df[url_field] ==
                                         url][price_field].iloc[0]
                target_price = target_df[target_df[url_field] ==
                                         url][price_field].iloc[0]

                if (is_number(source_price) and is_number(target_price)
                        and ratio_diff(source_price, target_price) > 0.1):
                    source_key = source_df[source_df[url_field] ==
                                           url].index[0]
                    target_key = target_df[target_df[url_field] ==
                                           url].index[0]
                    msg = (
                        f"different prices for url: {url}\nsource price is {source_price} "
                        f"for {source_key}\ntarget price is {target_price} for {target_key}"
                    )
                    detailed_messages.append(msg)

        res = f"{len(same_urls)} checked, {len(detailed_messages)} errors"
        if detailed_messages:
            result.add_error(res, detailed="\n".join(detailed_messages))
        else:
            result.add_info(res)

    return result
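The URL-matching core, stripped of the `Result` plumbing; the toy frames and the inline 10% check below are illustrative assumptions:

import pandas as pd

source = pd.DataFrame({"url": ["/a", "/b"], "price": [100.0, 50.0]})
target = pd.DataFrame({"url": ["/a", "/c"], "price": [120.0, 55.0]})
same_urls = source[source["url"].isin(target["url"].values)]["url"]
for url in same_urls:
    s = source.loc[source["url"] == url, "price"].iloc[0]
    t = target.loc[target["url"] == url, "price"].iloc[0]
    if abs(s - t) / max(s, t) > 0.1:  # inline stand-in for ratio_diff
        print(f"different prices for url: {url} - {s} vs {t}")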
Example 15
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
    """Compare price_was and price_now tagged fields"""

    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if not price_was_fields or not price_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=[price_was_field, price_field],
    )

    price_less_percent = "{:.2%}".format(len(df_prices_less) / items_number)

    if not df_prices_less.empty:
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less.index)}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=[price_was_field, price_field],
    )
    price_equal_percent = "{:.2%}".format(len(df_prices_equals) / items_number)

    if not df_prices_equals.empty:
        result.add_warning(
            (
                f"{price_equal_percent} ({len(df_prices_equals)}) "
                f"of items with {price_was_field} = {price_field}"
            ),
            detailed=(
                f"Prices equal for {len(df_prices_equals)} items:\n"
                f"{list(df_prices_equals.index)}"
            ),
        )

    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = len(df.index)

    return result
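The two price masks in isolation, on made-up data:

import pandas as pd

prices = pd.DataFrame({"was": [10.0, 8.0, 9.0], "now": [8.0, 8.0, 12.0]})
print(prices[prices["was"] < prices["now"]])   # error case: past price lower
print(prices[prices["was"] == prices["now"]])  # warning case: prices equal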
Example 16
def compare_was_now(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Compare price_was and price_now tagged fields"""

    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if (price_was_fields and price_was_fields[0] in df.columns and price_fields
            and price_fields[0] in df.columns):
        price_field = price_fields[0]
        price_was_field = price_was_fields[0]
        prices = df.copy()
        prices[price_was_field] = prices[price_was_field].astype(float)
        prices[price_field] = prices[price_field].astype(float)

        df_prices_less = pd.DataFrame(
            prices[prices[price_was_field] < prices[price_field]],
            columns=["_key", price_was_field, price_field],
        )

        price_less_percent = "{:.2%}".format(
            len(df_prices_less) / items_number)

        if not df_prices_less.empty:
            error = f"Past price is less than current for {len(df_prices_less)} items"
            result.add_error(
                f"{price_less_percent} ({len(df_prices_less)}) of "
                f"items with {price_was_field} < {price_field}",
                detailed=f"{error}:\n{list(df_prices_less['_key'])}",
            )

        df_prices_equals = pd.DataFrame(
            prices[prices[price_was_field] == prices[price_field]],
            columns=["_key", price_was_field, price_field],
        )
        price_equal_percent = "{:.2%}".format(
            len(df_prices_equals) / items_number)

        if not df_prices_equals.empty:
            result.add_warning(
                (f"{price_equal_percent} ({len(df_prices_equals)}) "
                 f"of items with {price_was_field} = {price_field}"),
                detailed=(f"Prices equal for {len(df_prices_equals)} items:\n"
                          f"{list(df_prices_equals['_key'])}"),
            )

        result.err_items_count = len(df_prices_equals) + len(df_prices_less)
        result.items_count = len(df.index)

    else:
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema")
    return result
Example 17
def compare_spider_names(source_job: Job, target_job: Job) -> Result:
    s_name = source_job.metadata.get("spider")
    t_name = target_job.metadata.get("spider")

    result = Result("Spider Names")
    if s_name != t_name:
        result.add_warning(
            f"{source_job.key} spider is {s_name}, {target_job.key} spider is {t_name}"
        )
    return result
Example 18
def get_coverage_per_category(df: pd.DataFrame,
                              tagged_fields: Dict[str, List[str]]):
    result = Result("Coverage For Scraped Categories")

    category_fields = tagged_fields.get("category", [])
    for f in category_fields:
        value_counts = df[f].value_counts()
        result.add_info(f"{len(value_counts)} categories in '{f}'",
                        stats=value_counts)
    return result
Example 19
def compare_spider_names(source_job: Job, target_job: Job) -> Result:
    name1 = source_job.metadata.get("spider")
    name2 = target_job.metadata.get("spider")

    result = Result("Spider Names")
    if name1 != name2:
        result.add_warning(
            f"{source_job.key} spider is {name1}, {target_job.key} spider is {name2}"
        )
    return result
Example 20
def check_errors(job: Job) -> Result:
    errors_count = api.get_errors_count(job)
    result = Result("Job Errors")
    if errors_count:
        url = f"{SH_URL}/{job.key}/log?filterType=error&filterAndHigher"
        result.add_error(
            f"{errors_count} error(s)", detailed=f"Errors for {job.key} - {url}"
        )
    else:
        result.add_info(f"No errors")
    return result
Example 21
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare booleans distribution between two dataframes

    Returns:
        A result containing dataframe with distributions and messages if differences
        are in thresholds
    """

    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_bool, target_bool):
        result.outcome = Outcome.SKIPPED
        return result

    dummy = pd.DataFrame(columns=[True, False])
    source_counts = pd.concat(
        [dummy, source_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    target_counts = pd.concat(
        [dummy, target_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    difs = (source_counts - target_counts)[True]

    bool_covs = pd.concat(
        [
            source_counts.rename("{}_source".format),
            target_counts.rename("{}_target".format),
        ]
    ).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    err_diffs = difs[difs.abs() > err_thr]
    if not err_diffs.empty:
        result.add_error(
            f"{', '.join(err_diffs.index)} relative frequencies differ "
            f"by more than {err_thr:.0%}"
        )

    warn_diffs = difs[(difs.abs() > warn_thr) & (difs.abs() <= err_thr)]
    if not warn_diffs.empty:
        result.add_warning(
            f"{', '.join(warn_diffs.index)} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}"
        )

    return result
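The distribution comparison reduces to normalized value counts per column; a self-contained sketch (using `pd.Series.value_counts` in place of the top-level `pd.value_counts`):

import pandas as pd

source = pd.DataFrame({"in_stock": [True] * 8 + [False] * 2})
target = pd.DataFrame({"in_stock": [True] * 6 + [False] * 4})
# per-column relative frequencies; .T puts fields on rows, True/False on columns
source_counts = source.apply(pd.Series.value_counts, normalize=True).T
target_counts = target.apply(pd.Series.value_counts, normalize=True).T
difs = (source_counts - target_counts)[True]
print(difs[difs.abs() > 0.10])  # in_stock differs by 0.2 -> error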
Example 22
def check_errors(source_job: Job) -> Result:
    source_errs = api.get_errors_count(source_job)
    result = Result("Job Errors")
    if not source_errs:
        return result

    errors_url = "{}/{}/log?filterType=error&filterAndHigher"
    result.add_error(
        f"{source_errs} error(s) - {errors_url.format(SH_URL, source_job.key)}"
    )
    return result
Example 23
def get_difference(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    category_names: List[str],
    source_key: str = "source",
    target_key: str = "target",
) -> Result:
    """Find and show differences between categories coverage, including nan values.
    Coverage means value counts divided on total size.

    Args:
        source_df: a data you want to compare
        target_df: a data you want to compare with
        category_names: list of columns which values to compare
        source_key: label for `source_df`
        target_key: label for `target_df`

    Returns:
        A result instance with messages containing significant difference defined by
        thresholds, a dataframe showing all normalized value counts in percents,
        a series containing significant difference.
    """
    result = Result("Category Coverage Difference")
    warn_thr = 0.10
    err_thr = 0.20

    for c in category_names:
        cats = (
            pd.DataFrame(
                {
                    source_key: source_df[c].value_counts(dropna=False, normalize=True),
                    target_key: target_df[c].value_counts(dropna=False, normalize=True),
                }
            )
            .fillna(0)
            .sort_values(by=[source_key, target_key], kind="mergesort")
        )
        cats.name = f"Coverage for {c}"
        result.stats.append(cats)
        cat_difs = (cats[source_key] - cats[target_key]).abs()
        cat_difs = cat_difs[cat_difs > warn_thr]
        cat_difs.name = f"Coverage difference more than {warn_thr:.0%} for {c}"
        if not cat_difs.empty:
            result.stats.append(cat_difs)
        errs = cat_difs[cat_difs > err_thr]
        if not errs.empty:
            result.add_error(
                f"The difference is greater than {err_thr:.0%} for {len(errs)} value(s) of {c}"
            )

    if not category_names:
        result.add_info(Outcome.SKIPPED)
    return result
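The per-category coverage difference in isolation, NaN included; toy data assumed:

import pandas as pd

source = pd.Series(["red", "red", "blue", None])
target = pd.Series(["red", "blue", "blue", "blue"])
cats = pd.DataFrame({
    "source": source.value_counts(dropna=False, normalize=True),
    "target": target.value_counts(dropna=False, normalize=True),
}).fillna(0)
# blue differs by 0.50, red and NaN by 0.25
print((cats["source"] - cats["target"]).abs())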
Example 24
def compare_finish_time(source_job: Job, target_job: Job) -> Result:
    diff_in_days = api.get_finish_time_difference_in_days(source_job, target_job)

    result = Result("Finish Time")
    if diff_in_days is None:
        result.add_warning("Jobs are not finished")
    elif diff_in_days == 0:
        result.add_info("Less than 1 day difference")
    else:
        result.add_warning(f"{diff_in_days} day(s) difference between 2 jobs")

    return result
Example 25
def compare_scraped_fields(source_df: pd.DataFrame,
                           target_df: pd.DataFrame) -> Result:
    """Find new or missing columns between source_df and target_df"""
    result = Result("Scraped Fields")
    missing_fields = target_df.columns.difference(source_df.columns)

    if not missing_fields.empty:
        result.add_error(f"Missing - {', '.join(missing_fields)}")

    new_fields = source_df.columns.difference(target_df.columns)
    if not new_fields.empty:
        result.add_info(f"New - {', '.join(new_fields)}")

    return result
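`Index.difference` does the set arithmetic here, e.g.:

import pandas as pd

source_df = pd.DataFrame(columns=["name", "price", "rating"])
target_df = pd.DataFrame(columns=["name", "price", "url"])
print(target_df.columns.difference(source_df.columns))  # Index(['url'], ...)
print(source_df.columns.difference(target_df.columns))  # Index(['rating'], ...)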
Example 26
def create_result(
    rule_name, messages, stats=None, err_items_count=None, items_count=None
):
    result = Result(rule_name)
    for level, level_messages in messages.items():
        for message in level_messages:
            result.add_message(level, *message)

    if stats:
        result.stats = stats
    if err_items_count:
        result.err_items_count = err_items_count
    if items_count:
        result.items_count = items_count
    return result
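The `messages` argument maps a level to a list of argument tuples for `add_message`; a hedged usage sketch (the string levels and tuple shapes below are assumptions — the library may use an enum for levels):

# hypothetical levels and message tuples, for illustration only
messages = {
    "error": [("2 duplicate(s) found", "details here")],
    "info": [("100 items checked",)],
}
result = create_result(
    "My Rule", messages, items_count=100, err_items_count=2)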
Example 27
def find_by_name_url(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Check for items with the same name and url"""

    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    name = "Duplicates By **name_field, product_url_field** Tags"
    result = Result(name)
    if not name_fields or not url_fields:
        result.add_info(Outcome.SKIPPED)
        return result
    name_field = name_fields[0]
    url_field = url_fields[0]
    result = find_by(df, [name_field, url_field])
    result.name = name
    return result
Example 28
def compare_errors(source_job: Job, target_job: Job) -> Result:
    errors_count1 = api.get_errors_count(source_job)
    errors_count2 = api.get_errors_count(target_job)

    result = Result("Compare Job Errors")
    if errors_count1:
        errors_url = "{}/{}/log?filterType=error&filterAndHigher"
        detailed_msg = (
            f"{errors_count1} error(s) for {source_job.key} - "
            f"{errors_url.format(SH_URL, source_job.key)}\n"
            f"{errors_count2} error(s) for {target_job.key} - "
            f"{errors_url.format(SH_URL, target_job.key)}"
        )
        result.add_error(f"{errors_count1} and {errors_count2} errors", detailed_msg)
    return result
Example 29
def find_by_tags(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Check for duplicates based on schema tags. In particular, look for items with
    the same `name_field` and `product_url_field`, and for uniqueness among `unique` field"""

    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    # copy so that extending below does not mutate the caller's tagged_fields
    columns_to_check: List = list(tagged_fields.get("unique", []))
    if (not name_fields or not url_fields) and not columns_to_check:
        result = Result("Duplicates")
        result.add_info(Outcome.SKIPPED)
        return result
    if name_fields and url_fields:
        columns_to_check.append([name_fields[0], url_fields[0]])

    return find_by(df, columns_to_check)
Example 30
def get_difference(source_job: Job,
                   target_job: Job,
                   err_thr: float = 0.10,
                   warn_thr: float = 0.05) -> Result:
    """Get difference between jobs coverages. The coverage is job fields counts
    divided on the job size.

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare
        err_thr: a threshold for errors
        warn_thr: a threshold for warnings

    Returns:
        A Result instance with huge dif and stats with fields counts coverage and dif
    """
    result = Result("Coverage Difference")
    f_counts = (pd.DataFrame({
        source_job.key: api.get_counts(source_job),
        target_job.key: api.get_counts(target_job),
    }).drop(index=["_type"]).fillna(0).sort_values(by=[source_job.key],
                                                   kind="mergesort"))
    f_counts[source_job.key] = f_counts[source_job.key].divide(
        api.get_items_count(source_job))
    f_counts[target_job.key] = f_counts[target_job.key].divide(
        api.get_items_count(target_job))
    f_counts.name = "Coverage from job stats fields counts"
    result.stats.append(f_counts)

    coverage_difs = f_counts[source_job.key] - f_counts[target_job.key]
    coverage_difs = coverage_difs[coverage_difs.abs() > warn_thr].sort_values(
        kind="mergesort")
    coverage_difs.name = f"Coverage difference more than {warn_thr:.0%}"
    if not coverage_difs.empty:
        result.stats.append(coverage_difs)

    errs = coverage_difs[coverage_difs.abs() > err_thr]
    if not errs.empty:
        result.add_error(
            f"The difference is greater than {err_thr:.0%} for {len(errs)} field(s)"
        )
    warns = coverage_difs[(coverage_difs.abs() > warn_thr)
                          & (coverage_difs.abs() <= err_thr)]
    if not warns.empty:
        result.add_warning(
            f"The difference is between {warn_thr:.0%} and {err_thr:.0%} "
            f"for {len(warns)} field(s)")
    return result