Example #1
def find_by(df: pd.DataFrame, uniques: List[Union[str, List[str]]]) -> Result:
    """Find equal items rows in `df` by `uniques`. I.e. if two items have the same
    uniques's element value, they are considered duplicates.

    Args:
        uniques: list containing columns and list of columns to identify duplicates.
        List of columns means that all list columns values should be equal.

    Returns:
        Any duplicates
    """
    result = Result("Duplicates")
    result.items_count = len(df)

    df = df.dropna(subset=list(set(flatten(uniques))), how="all")
    for columns in uniques:
        mask = columns if isinstance(columns, list) else [columns]
        duplicates = df[df.duplicated(columns, keep=False)][mask]
        if duplicates.empty:
            continue

        errors = {}
        grouped = duplicates.groupby(columns)
        for _, d in grouped:
            msgs = [f"'{d[c].iloc[0]}' `{c}`" for c in mask]
            errors[f"same {', '.join(msgs)}"] = list(d.index)
        result.add_error(
            f"{', '.join(mask)} contains {len(grouped)} duplicated value(s)",
            errors=errors,
        )
    return result
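
A minimal usage sketch for `find_by` (the toy data and column names are
hypothetical; `Result` and `flatten` are assumed to come from the surrounding
library):

import pandas as pd

df = pd.DataFrame({
    "name": ["a", "a", "b"],
    "url": ["u1", "u1", "u2"],
    "sku": ["s1", "s2", "s2"],
})
# "sku" must be unique on its own; ("name", "url") must be unique as a pair
result = find_by(df, ["sku", ["name", "url"]])
# the result reports one duplicated "sku" value and one duplicated name/url pair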
Example #2
def compare_names_for_same_urls(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    tagged_fields: Dict[str, List[str]],
) -> Result:
    """For each pair of items that share the same `product_url_field` tagged
    field, compare their `name_field` values."""

    result = Result("Compare Names Per Url")
    url_field = tagged_fields.get("product_url_field")
    if not url_field:
        result.add_info("product_url_field tag is not set")
        return result

    url_field = url_field[0]
    name_field = tagged_fields.get("name_field")

    if not name_field:
        result.add_info("name_field tag is not set")
        return result

    name_field = name_field[0]
    diff_names_count = 0
    if any([
            name_field not in source_df.columns.values,
            name_field not in target_df.columns.values,
    ]):
        return result

    same_urls = source_df[(source_df[url_field].isin(
        target_df[url_field].values))][url_field]

    detailed_messages = []
    for url in same_urls:
        if url.strip() != "nan":
            source_name = source_df[source_df[url_field] ==
                                    url][name_field].iloc[0]
            target_name = target_df[target_df[url_field] ==
                                    url][name_field].iloc[0]

            if (source_name != target_name and source_name.strip() != "nan"
                    and target_name.strip() != "nan"):
                diff_names_count += 1
                source_key = source_df[source_df[url_field] ==
                                       url]["_key"].iloc[0]
                target_key = target_df[target_df[url_field] ==
                                       url]["_key"].iloc[0]
                msg = (
                    f"different names for url: {url}\nsource name is {source_name} "
                    f"for {source_key}\ntarget name is {target_name} for {target_key}"
                )
                detailed_messages.append(msg)

    res = f"{len(same_urls)} checked, {diff_names_count} errors"
    if detailed_messages:
        result.add_error(res, detailed="\n".join(detailed_messages))
    else:
        result.add_info(res)

    return result
Example #3
def check_outcome(job: Job) -> Result:
    state = api.get_job_state(job)
    reason = api.get_job_close_reason(job)
    result = Result("Job Outcome")
    if state != "finished" or reason != "finished":
        result.add_error(f"Job has '{state}' state, '{reason}' close reason")
    return result
Example #4
def anomalies(target: str, sample: List[str]) -> Result:
    """Find fields with significant deviation. Significant means `dev > 2 * std()`

    Args:
        target: where to look for anomalies
        sample: a list of jobs keys to infer metadata from

    Returns:
        A Result with a dataframe of significant deviations
    """
    result = Result("Anomalies")
    raw_stats = [job.items.stats() for job in api.get_jobs(sample + [target])]

    counts = (pd.DataFrame(
        rs.get("counts") for rs in raw_stats).fillna(0).drop(columns="_type"))
    items_len = [rs["totals"]["input_values"] for rs in raw_stats]
    stats = counts.apply(lambda x: x / items_len)
    stats.index = sample + [target]
    stats.rename(index={target: "target"}, inplace=True)
    stats.loc["mean"] = stats.loc[sample].mean()
    stats.loc["std"] = stats.loc[sample].std()
    stats = stats.T
    stats["target deviation"] = stats["target"] - stats["mean"]
    devs = stats[(stats["target deviation"].abs() > 2 * stats["std"])]
    devs.name = "Anomalies"
    if not devs.empty:
        result.add_error(
            f"{len(devs.index)} field(s) with significant coverage deviation")
        result.stats = [devs]

    return result
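
A toy demonstration of the deviation logic alone, without the `api` calls (the
job keys and coverage numbers below are made up):

import pandas as pd

sample = ["job/1", "job/2"]
stats = pd.DataFrame(
    {"price": [0.95, 0.97, 0.10], "name": [1.0, 0.98, 0.99]},
    index=sample + ["target"],
)
stats.loc["mean"] = stats.loc[sample].mean()
stats.loc["std"] = stats.loc[sample].std()
stats = stats.T
stats["target deviation"] = stats["target"] - stats["mean"]
devs = stats[stats["target deviation"].abs() > 2 * stats["std"]]
# `price` coverage drops from ~0.96 to 0.10, so only `price` is flagged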
Example #5
def check_tags(source_columns, target_columns, tags):
    result = Result("Tags")

    found_tags = list(tags)
    if found_tags:
        result.add_info(", ".join(found_tags))

    tagged_fields = []
    for tag in tags:
        tagged_fields.extend(tags[tag])

    missing_in_source = sorted(set(tagged_fields) - set(source_columns))
    if missing_in_source:
    if missing_in_source:
        result.add_error(
            f"{str(missing_in_source)[1:-1]} field(s) were not found in "
            "source, but specified in schema")

    if target_columns.size > 0:
        missing_in_target = sorted(set(tagged_fields) - set(target_columns))
        if missing_in_target:
            result.add_error(
                f"{str(missing_in_target)[1:-1]} field(s) were not found "
                "in target, but specified in schema")

    if result.errors:
        result.add_error("Skipping tag rules")

    return result
Example #6
def check_response_ratio(job: Job) -> Result:
    requests_number = api.get_requests_count(job)
    items_count = api.get_items_count(job)
    result = Result("Responses Per Item Ratio")
    result.add_info(
        f"Number of responses / Number of scraped items - "
        f"{round(requests_number / items_count, 2)}"
    )
    return result
Example #7
def compare_was_now(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Compare price_was and price_now tagged fields"""

    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if (price_was_fields and price_was_fields[0] in df.columns and price_fields
            and price_fields[0] in df.columns):
        price_field = price_fields[0]
        price_was_field = price_was_fields[0]
        prices = df.copy()
        prices[price_was_field] = prices[price_was_field].astype(float)
        prices[price_field] = prices[price_field].astype(float)

        df_prices_less = pd.DataFrame(
            prices[prices[price_was_field] < prices[price_field]],
            columns=["_key", price_was_field, price_field],
        )

        price_less_percent = "{:.2%}".format(
            len(df_prices_less) / items_number)

        if not df_prices_less.empty:
            error = f"Past price is less than current for {len(df_prices_less)} items"
            result.add_error(
                f"{price_less_percent} ({len(df_prices_less)}) of "
                f"items with {price_was_field} < {price_field}",
                detailed=f"{error}:\n{list(df_prices_less['_key'])}",
            )

        df_prices_equals = pd.DataFrame(
            prices[prices[price_was_field] == prices[price_field]],
            columns=["_key", price_was_field, price_field],
        )
        price_equal_percent = "{:.2%}".format(
            len(df_prices_equals) / items_number)

        if not df_prices_equals.empty:
            result.add_warning(
                (f"{price_equal_percent} ({len(df_prices_equals)}) "
                 f"of items with {price_was_field} = {price_field}"),
                detailed=(f"Prices equal for {len(df_prices_equals)} items:\n"
                          f"{list(df_prices_equals['_key'])}"),
            )

        result.err_items_count = len(df_prices_equals) + len(df_prices_less)
        result.items_count = len(df.index)

    else:
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema")
    return result
Example #8
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
    """Compare price_was and price_now tagged fields"""

    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if not price_was_fields or not price_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=[price_was_field, price_field],
    )

    price_less_percent = "{:.2%}".format(len(df_prices_less) / items_number)

    if not df_prices_less.empty:
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less.index)}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=[price_was_field, price_field],
    )
    price_equal_percent = "{:.2%}".format(len(df_prices_equals) / items_number)

    if not df_prices_equals.empty:
        result.add_warning(
            (
                f"{price_equal_percent} ({len(df_prices_equals)}) "
                f"of items with {price_was_field} = {price_field}"
            ),
            detailed=(
                f"Prices equal for {len(df_prices_equals)} items:\n"
                f"{list(df_prices_equals.index)}"
            ),
        )

    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = len(df.index)

    return result
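
A toy run of the core comparison with hypothetical data (columns named as the
tags would resolve them):

import pandas as pd

df = pd.DataFrame({
    "price_was": ["10.0", "5.0", "7.0"],
    "price": ["12.0", "5.0", "6.0"],
})
prices = df.astype(float)
less = prices[prices["price_was"] < prices["price"]]    # 10.0 < 12.0
equal = prices[prices["price_was"] == prices["price"]]  # 5.0 == 5.0
print(f"{len(less) / len(df):.2%} less, {len(equal) / len(df):.2%} equal")
# -> 33.33% less, 33.33% equal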
Example #9
def compare_spider_names(source_job: Job, target_job: Job) -> Result:
    s_name = source_job.metadata.get("spider")
    t_name = target_job.metadata.get("spider")

    result = Result("Spider Names")
    if s_name != t_name:
        result.add_warning(
            f"{source_job.key} spider is {s_name}, {target_job.key} spider is {t_name}"
        )
    return result
Example #10
def compare_spider_names(source_job: Job, target_job: Job) -> Result:
    name1 = source_job.metadata.get("spider")
    name2 = target_job.metadata.get("spider")

    result = Result("Spider Names")
    if name1 != name2:
        result.add_warning(
            f"{source_job.key} spider is {name1}, {target_job.key} spider is {name2}"
        )
    return result
Example #11
def get_coverage_per_category(df: pd.DataFrame,
                              tagged_fields: Dict[str, List[str]]):
    result = Result("Coverage For Scraped Categories")

    category_fields = tagged_fields.get("category", [])
    for f in category_fields:
        value_counts = df[f].value_counts()
        result.add_info(f"{len(value_counts)} categories in '{f}'",
                        stats=value_counts)
    return result
Example #12
def check_errors(source_job: Job) -> Result:
    source_errs = api.get_errors_count(source_job)
    result = Result("Job Errors")
    if not source_errs:
        return result

    errors_url = "{}/{}/log?filterType=error&filterAndHigher"
    result.add_error(
        f"{source_errs} error(s) - {errors_url.format(SH_URL, source_job.key)}"
    )
    return result
Example #13
def compare_prices_for_same_urls(source_df: pd.DataFrame,
                                 target_df: pd.DataFrame,
                                 tagged_fields: TaggedFields) -> Result:
    """For each pair of items that have the same `product_url_field` tagged field,
    compare `product_price_field` field

    Returns:
        A result containing pairs of items from `source_df` and `target_df`
        which `product_price_field` differ.
    """
    result = Result("Compare Prices For Same Urls")
    url_field_list: Optional[List[str]] = tagged_fields.get(
        "product_url_field")
    if not url_field_list:
        result.outcome = Outcome.SKIPPED
        return result

    url_field = url_field_list[0]

    source_df = source_df.dropna(subset=[url_field])
    target_df = target_df.dropna(subset=[url_field])

    same_urls = source_df[(source_df[url_field].isin(
        target_df[url_field].values))][url_field]

    price_fields = tagged_fields.get("product_price_field")
    if not price_fields:
        result.add_info("product_price_field tag is not set")
    else:
        price_field = price_fields[0]
        detailed_messages = []
        for url in same_urls:
            if url.strip() != "nan":
                source_price = source_df[source_df[url_field] ==
                                         url][price_field].iloc[0]
                target_price = target_df[target_df[url_field] ==
                                         url][price_field].iloc[0]

                if (is_number(source_price) and is_number(target_price)
                        and ratio_diff(source_price, target_price) > 0.1):
                    source_key = source_df[source_df[url_field] ==
                                           url].index[0]
                    target_key = target_df[target_df[url_field] ==
                                           url].index[0]
                    msg = (
                        f"different prices for url: {url}\nsource price is {source_price} "
                        f"for {source_key}\ntarget price is {target_price} for {target_key}"
                    )
                    detailed_messages.append(msg)

        res = f"{len(same_urls)} checked, {len(detailed_messages)} errors"
        if detailed_messages:
            result.add_error(res, detailed="\n".join(detailed_messages))
        else:
            result.add_info(res)

    return result
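
`is_number` and `ratio_diff` are not defined in these examples; a plausible
minimal sketch of what they might look like (hypothetical implementations, not
necessarily the library's own):

def is_number(value) -> bool:
    # True if `value` can be parsed as a float
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False


def ratio_diff(source, target) -> float:
    # relative difference between two numeric values, in [0, 1]
    source, target = float(source), float(target)
    if source == target:
        return 0.0
    return abs(source - target) / max(abs(source), abs(target))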
Example #14
def get_difference(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    category_names: List[str],
    err_thr: float = 0.2,
    warn_thr: float = 0.1,
) -> Result:
    """Find and show differences between categories coverage, including `nan` values.
    Coverage means value counts divided on total size.

    Args:
        source_df: a data you want to compare
        target_df: a data you want to compare with
        category_names: list of columns which values to compare
        err_thr: sets error threshold
        warn_thr: warning threshold

    Returns:
        A result instance with messages containing significant difference defined by
        thresholds, a dataframe showing all normalized value counts in percents and
        a series containing significant difference.
    """
    source_key = "source"
    target_key = "target"
    result = Result("Category Coverage Difference")

    for c in category_names:
        cats = (
            pd.DataFrame(
                {
                    source_key: source_df[c].value_counts(dropna=False, normalize=True),
                    target_key: target_df[c].value_counts(dropna=False, normalize=True),
                }
            )
            .fillna(0)
            .sort_values(by=[source_key, target_key], kind="mergesort")
        )
        cats.name = f"Coverage for {c}"
        result.stats.append(cats)
        cat_difs = cats[source_key] - cats[target_key]
        cat_difs = cat_difs[cat_difs.abs() > warn_thr]
        cat_difs.name = f"Coverage difference more than {warn_thr:.0%} for {c}"
        if not cat_difs.empty:
            result.stats.append(cat_difs)
        errs = cat_difs[cat_difs.abs() > err_thr]
        if not errs.empty:
            result.add_warning(
                f"The difference is greater than {err_thr:.0%} for {len(errs)} value(s) of {c}"
            )
    if not category_names:
        result.outcome = Outcome.SKIPPED
    return result
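
A toy call with hypothetical category data, using the default thresholds:

import pandas as pd

source = pd.DataFrame({"color": ["red"] * 8 + ["blue"] * 2})
target = pd.DataFrame({"color": ["red"] * 5 + ["blue"] * 5})
result = get_difference(source, target, ["color"])
# red coverage is 0.8 vs 0.5 and blue is 0.2 vs 0.5: both differ by 0.3,
# which exceeds err_thr (0.2), so the result carries a warning and two
# stats entries for "color"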
Example #15
def check_items(df: pd.DataFrame, tagged_fields: Dict[str,
                                                      List[str]]) -> Result:
    """Check for items with the same name and url"""

    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    result = Result("Duplicated Items")
    if not name_fields or not url_fields:
        result.add_info(
            "'name_field' and 'product_url_field' tags were not found in schema"
        )
    else:
        result.items_count = len(df)
        errors = {}
        name_field = name_fields[0]
        url_field = url_fields[0]
        df = df[[name_field, url_field, "_key"]]
        duplicates = df[df[[name_field, url_field]].duplicated(keep=False)]
        if duplicates.empty:
            return result

        result.err_items_count = len(duplicates)
        for _, d in duplicates.groupby([name_field, url_field]):
            msg = (
                f"same '{d[name_field].iloc[0]}' name and '{d[url_field].iloc[0]}' url"
            )
            errors[msg] = list(d["_key"])
        result.add_error(
            f"{len(duplicates)} duplicate(s) with same name and url",
            errors=errors)
    return result
Example #16
def check_uniqueness(df: pd.DataFrame,
                     tagged_fields: Dict[str, List[str]]) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non-unique items
    """
    unique_fields = tagged_fields.get("unique", [])
    result = Result("Uniqueness")

    if not unique_fields:
        result.add_info("'unique' tag was not found in schema")
        return result

    err_keys = set()
    for field in unique_fields:
        result.items_count = df[field].count()
        duplicates = df[df[field].duplicated(keep=False)][[field, "_key"]]
        errors = {}
        for _, d in duplicates.groupby([field]):
            keys = list(d["_key"])
            msg = f"same '{d[field].iloc[0]}' {field}"
            errors[msg] = keys
            err_keys = err_keys.union(keys)
        if not duplicates.empty:
            result.add_error(
                f"'{field}' contains {len(duplicates[field].unique())} duplicated value(s)",
                errors=errors,
            )

    result.err_items_count = len(err_keys)
    return result
Example #17
def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result:
    """Find category columns. A category column is the column which holds a limited number
    of possible values, including `NAN`.

    Args:
        df: data
        max_uniques: filter which determines which columns to use. Only columns with
        the number of unique values less than or equal to `max_uniques` are category columns.

    Returns:
        A result with stats containing value counts of categorical columns.
    """
    result = Result("Categories")

    columns = find_likely_cats(df, max_uniques)
    result.stats = [
        value_counts
        for value_counts in tqdm(
            map(lambda c: df[c].value_counts(dropna=False), columns),
            desc="Finding categories",
            total=len(columns),
        )
        if len(value_counts) <= max_uniques
    ]
    if not result.stats:
        result.add_info("Categories were not found")
        return result
    result.add_info(f"{len(result.stats)} category field(s)")
    result.outcome = Outcome.INFO
    return result
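
`find_likely_cats` is not shown here; a minimal sketch of what it might do
(hypothetical, not the library's implementation), pre-filtering columns whose
number of unique values is small enough to be categorical:

from typing import List

import pandas as pd

def find_likely_cats(df: pd.DataFrame, max_uniques: int) -> List[str]:
    # the real helper is likely a cheaper heuristic, since `get_categories`
    # re-checks the exact count of each column's value_counts anyway
    return [c for c in df.columns if df[c].nunique(dropna=False) <= max_uniques]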
Example #18
def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non-unique items
    """
    unique_fields = tagged_fields.get("unique", [])
    result = Result("Duplicates By **unique** Tag")

    if not unique_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    err_keys = set()
    for field in unique_fields:
        result.items_count = df[field].count()
        duplicates = df[df.duplicated(field, keep=False)][[field]]
        errors = {}
        for _, d in duplicates.groupby([field]):
            keys = list(d.index)
            msg = f"same '{d[field].iloc[0]}' `{field}`"
            errors[msg] = keys
            err_keys = err_keys.union(keys)
        if not duplicates.empty:
            result.add_error(
                f"{field} contains {len(duplicates[field].unique())} duplicated value(s)",
                errors=errors,
            )

    result.err_items_count = len(err_keys)
    return result
Example #19
def compare_boolean_fields(source_df: pd.DataFrame,
                           target_df: pd.DataFrame) -> Result:
    """Compare booleans distribution between two dataframes

    Returns:
        A result containing dataframe with distributions and messages if differences
        are in thresholds
    """
    warn_thr = 0.05
    err_thr = 0.10
    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_bool, target_bool):
        result.add_info("SKIPPED")
        return result

    dummy = pd.DataFrame(columns=[True, False])
    source_counts = pd.concat(
        [dummy, source_bool.apply(pd.value_counts, normalize=True).T],
        sort=False).fillna(0.0)
    target_counts = pd.concat(
        [dummy, target_bool.apply(pd.value_counts, normalize=True).T],
        sort=False).fillna(0.0)
    difs = (source_counts - target_counts).abs()[True]

    bool_covs = pd.concat([
        source_counts.rename("{}_source".format),
        target_counts.rename("{}_target".format),
    ]).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    err_diffs = difs[difs > err_thr]
    if not err_diffs.empty:
        result.add_error(
            f"{', '.join(err_diffs.index)} relative frequencies differ "
            f"by more than {err_thr:.0%}")

    warn_diffs = difs[(difs > warn_thr) & (difs <= err_thr)]
    if not warn_diffs.empty:
        result.add_warning(
            f"{', '.join(warn_diffs.index)} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}")
    if err_diffs.empty and warn_diffs.empty:
        result.add_info("PASSED")

    return result
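
A toy illustration of the frequency comparison, with made-up data:

import pandas as pd

source = pd.DataFrame({"in_stock": [True] * 9 + [False]})
target = pd.DataFrame({"in_stock": [True] * 7 + [False] * 3})
src = source["in_stock"].value_counts(normalize=True)
tgt = target["in_stock"].value_counts(normalize=True)
diff = abs(src[True] - tgt[True])  # 0.9 - 0.7 = 0.2 > err_thr (0.10)
# so `in_stock` would be reported as an error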
Example #20
def compare_errors(source_job: Job, target_job: Job) -> Result:
    errors_count1 = api.get_errors_count(source_job)
    errors_count2 = api.get_errors_count(target_job)

    result = Result("Compare Job Errors")
    if errors_count1:
        errors_url = "{}/{}/log?filterType=error&filterAndHigher"
        detailed_msg = (
            f"{errors_count1} error(s) for {source_job.key} - "
            f"{errors_url.format(SH_URL, source_job.key)}\n"
            f"{errors_count2} error(s) for {target_job.key} - "
            f"{errors_url.format(SH_URL, target_job.key)}"
        )
        result.add_error(f"{errors_count1} and {errors_count2} errors", detailed_msg)
    return result
Example #21
def find_by_name_url(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Check for items with the same name and url"""

    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    name = "Duplicates By **name_field, product_url_field** Tags"
    result = Result(name)
    if not name_fields or not url_fields:
        result.add_info(Outcome.SKIPPED)
        return result
    name_field = name_fields[0]
    url_field = url_fields[0]
    result = find_by(df, [name_field, url_field])
    result.name = name
    return result
Example #22
def find_by_tags(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Check for duplicates based on schema tags. In particular, look for items with
    the same `name_field` and `product_url_field`, and for uniqueness among `unique` field"""

    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    columns_to_check: List = tagged_fields.get("unique", [])
    if (not name_fields or not url_fields) and not columns_to_check:
        result = Result("Duplicates")
        result.add_info(Outcome.SKIPPED)
        return result
    if name_fields and url_fields:
        columns_to_check.append([name_fields[0], url_fields[0]])

    return find_by(df, columns_to_check)
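
A usage sketch with hypothetical tags, showing how the columns to check are
assembled:

import pandas as pd

df = pd.DataFrame({"name": ["a", "a"], "url": ["u", "u"], "sku": ["s", "s"]})
tagged_fields = {"unique": ["sku"], "name_field": ["name"],
                 "product_url_field": ["url"]}
# columns_to_check becomes ["sku", ["name", "url"]] - the `uniques`
# structure that `find_by` expects
result = find_by_tags(df, tagged_fields)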
Example #23
def get_coverage_per_category(df: pd.DataFrame,
                              category_names: List) -> Result:
    """Get value counts per column, excluding nan.

    Args:
        df: a source data to assess
        category_names: list of columns whose value counts to report

    Returns:
        Number of categories per field, value counts series for each field.
    """
    result = Result("Coverage For Scraped Categories")

    for c in category_names:
        value_counts = df[c].value_counts(ascending=True)
        result.add_info(f"{len(value_counts)} categories in '{c}'")
        result.stats.append(value_counts)
    return result
Example #24
def compare_prices_for_same_names(source_df: pd.DataFrame,
                                  target_df: pd.DataFrame,
                                  tagged_fields: TaggedFields) -> Result:
    result = Result("Compare Prices For Same Names")
    name_field_tag = tagged_fields.get("name_field")
    if not name_field_tag:
        result.outcome = Outcome.SKIPPED
        return result

    name_field = name_field_tag[0]
    source_df = source_df[source_df[name_field].notnull()]
    target_df = target_df[target_df[name_field].notnull()]

    same_names = source_df[(source_df[name_field].isin(
        target_df[name_field].values))][name_field]

    price_fields = tagged_fields.get("product_price_field")
    if not price_fields:
        result.add_info("product_price_field tag is not set")
        return result
    price_field = price_fields[0]

    detailed_messages = []
    for name in same_names:
        if name.strip() != "nan":
            source_price = source_df[source_df[name_field] ==
                                     name][price_field].iloc[0]
            target_price = target_df[target_df[name_field] ==
                                     name][price_field].iloc[0]
            if is_number(source_price) and is_number(target_price):
                if ratio_diff(source_price, target_price) > 0.1:
                    source_key = source_df[source_df[name_field] ==
                                           name].index[0]
                    target_key = target_df[target_df[name_field] ==
                                           name].index[0]
                    msg = (
                        f"different price for {name}\nsource price is {source_price} "
                        f"for {source_key}\ntarget price is {target_price} for {target_key}"
                    )
                    detailed_messages.append(msg)

    result_msg = f"{len(same_names)} checked, {len(detailed_messages)} errors"
    if detailed_messages:
        result.add_error(result_msg, detailed="\n".join(detailed_messages))
    else:
        result.add_info(result_msg)

    return result
Example #25
def garbage_symbols(df: pd.DataFrame) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
        r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*?/??>|<!--|-->)")

    errors = {}
    row_keys = set()
    rule_result = Result("Garbage Symbols", items_count=len(df))

    for column in tqdm_notebook(df.select_dtypes(["object"]).columns,
                                desc="Garbage Symbols"):
        matches = df[column].apply(str).str.extractall(garbage,
                                                       flags=re.IGNORECASE)
        if not matches.empty:
            error_keys = df.loc[matches.unstack().index.values].index
            bad_texts = matches.stack().value_counts().index.sort_values(
            ).tolist()
            # escape backslashes for markdown repr, `\n > \\n`
            bad_texts = [
                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
                for bx in bad_texts
            ]
            error = (f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
                     f"values contain `{', '.join(bad_texts)}`")

            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    if errors:
        rule_result.add_error(
            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
        rule_result.err_items_count = len(row_keys)

    return rule_result
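
A quick check of (a trimmed copy of) the `garbage` pattern against sample
strings, outside of any DataFrame; the inputs are hypothetical:

import re

pattern = (
    r"(?P<spaces>^\s|\s$)"
    r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
    r"|(?P<html_tags></??(?:h\d|b|u|i|div|p|a|br)\s*?/??>|<!--|-->)"
)
for text in [" leading space", "5 &amp; 6", "<br/>plain"]:
    match = re.search(pattern, text, flags=re.IGNORECASE)
    print(text, "->", match.lastgroup if match else None)
# " leading space" -> spaces, "5 &amp; 6" -> html_entities,
# "<br/>plain" -> html_tags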
Example #26
def compare_finish_time(source_job: Job, target_job: Job) -> Result:
    diff_in_days = api.get_finish_time_difference_in_days(source_job, target_job)

    result = Result("Finish Time")
    if diff_in_days is None:
        result.add_warning("Jobs are not finished")
    elif diff_in_days == 0:
        result.add_info("Less than 1 day difference")
    else:
        result.add_warning(f"{diff_in_days} day(s) difference between 2 jobs")

    return result
Example #27
def garbage_symbols(items: Items) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&amp|&reg)"
        r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
        r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*/?>|<!--|-->)")

    errors = {}
    row_keys = set()
    rule_result = Result("Garbage Symbols", items_count=items.size)

    for column in items.flat_df.select_dtypes(["object"]):
        matches = items.flat_df[column].str.extractall(garbage,
                                                       flags=re.IGNORECASE)
        matches = matches[["spaces", "html_entities", "css", "html_tags"]]
        if not matches.empty:
            error_keys = items.flat_df.iloc[
                matches.unstack().index.values]["_key"]
            original_column = items.get_origin_column_name(column)
            bad_texts = matches.stack().value_counts().index.sort_values(
            ).tolist()
            error = (
                f"{len(error_keys)/items.size*100:.1f}% of '{original_column}' "
                f"values contain {[t[:20] for t in bad_texts]}")
            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)

    if errors:
        rule_result.add_error(
            f"{len(row_keys)/items.size * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
        rule_result.err_items_count = len(row_keys)

    return rule_result
Example #28
def check_fields_coverage(df: pd.DataFrame) -> Result:
    """Get fields coverage from df. Coverage reflects the percentage of real values
    (exluding `nan`) per column.

    Args:
        df: a data to count the coverage

    Returns:
        A result with coverage for all columns in provided df. If column contains only `nan`,
        treat it as an error.
    """
    fields_coverage = df.count().sort_values(ascending=False)
    fields_coverage.name = f"Fields coverage for {len(df):_} items"

    empty_fields = fields_coverage[fields_coverage == 0]

    result = Result("Fields Coverage")
    result.stats = [fields_coverage]
    if not empty_fields.empty:
        result.add_error(f"{len(empty_fields)} empty field(s)")
    return result
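
A toy call with a column that is entirely `nan` (hypothetical data):

import numpy as np
import pandas as pd

df = pd.DataFrame({"name": ["a", "b", None], "price": [np.nan] * 3})
result = check_fields_coverage(df)
# coverage: name -> 2, price -> 0, so `price` is reported as an empty field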
Example #29
def validate(
    schema: RawSchema, raw_items: RawItems, keys: pd.Index, fast: bool = False
) -> Result:
    """Run JSON schema validation against data.

    Args:
        fast: whether to use fastjsonschema or jsonschema for validation

    Returns:
        Schema errors if any
    """
    validate_func = fast_validate if fast else full_validate
    errors = validate_func(schema, raw_items, keys)
    result = Result("JSON Schema Validation")
    err_items = len(set(itertools.chain.from_iterable(errors.values())))
    if errors:
        result.add_error(
            f"{err_items} ({err_items/len(raw_items):.0%}) items have {len(errors)} errors",
            errors=errors,
        )
    return result
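
`fast_validate` and `full_validate` are not shown; a minimal sketch of the slow
path built on the `jsonschema` package (a hypothetical implementation, the
library's own may differ):

from collections import defaultdict
from typing import DefaultDict, List

from jsonschema import validators

def full_validate(schema, raw_items, keys) -> DefaultDict[str, List]:
    # map each error message to the keys of the items that raised it
    errors: DefaultDict[str, List] = defaultdict(list)
    validator = validators.validator_for(schema)(schema)
    for i, item in enumerate(raw_items):
        for error in validator.iter_errors(item):
            errors[error.message].append(keys[i])
    return errors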
Example #30
def compare_scraped_fields(source_df: pd.DataFrame,
                           target_df: pd.DataFrame) -> Result:
    source_field_coverage = dict(
        source_df.count().sort_values(ascending=False))
    target_field_coverage = dict(
        target_df.count().sort_values(ascending=False))

    result = Result("Scraped Fields")
    missing_fields = set(target_df.columns.values) - set(
        source_df.columns.values)
    if missing_fields:
        detailed_messages = ["Missing Fields"]
        for field in missing_fields:
            target_coverage = target_field_coverage[field] / len(
                target_df) * 100
            detailed_messages.append(
                f"{field} - coverage - {int(target_coverage)}% - "
                f"{target_field_coverage[field]} items")
        result.add_error(f"{len(missing_fields)} field(s) are missing",
                         "\n".join(detailed_messages))

    new_fields = set(source_df.columns.values) - set(target_df.columns.values)
    if new_fields:
        detailed_messages = ["New Fields"]
        for field in new_fields:
            source_coverage = source_field_coverage[field] / len(
                source_df) * 100
            detailed_messages.append(
                f"{field} - coverage - {int(source_coverage)}% - "
                f"{source_field_coverage[field]} items")
        result.add_info(f"{len(new_fields)} field(s) are new",
                        "\n".join(detailed_messages))

    return result
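
A toy run with hypothetical dataframes:

import pandas as pd

source = pd.DataFrame({"name": ["a", "b"], "brand": ["x", "y"]})
target = pd.DataFrame({"name": ["a", "b"], "color": ["red", None]})
result = compare_scraped_fields(source, target)
# "color" is missing from source (coverage 50% in target, 1 item),
# while "brand" is new in source (coverage 100%, 2 items)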