Example #1
def check_items(df: pd.DataFrame, tagged_fields: Dict[str,
                                                      List[str]]) -> Result:
    """Check for items with the same name and url"""

    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    result = Result("Duplicated Items")
    if not name_fields or not url_fields:
        result.add_info(
            "'name_field' and 'product_url_field' tags were not found in schema"
        )
    else:
        result.items_count = len(df)
        errors = {}
        name_field = name_fields[0]
        url_field = url_fields[0]
        df = df[[name_field, url_field, "_key"]]
        duplicates = df[df[[name_field, url_field]].duplicated(keep=False)]
        if duplicates.empty:
            return result

        result.err_items_count = len(duplicates)
        for _, d in duplicates.groupby([name_field, url_field]):
            msg = (
                f"same '{d[name_field].iloc[0]}' name and '{d[url_field].iloc[0]}' url"
            )
            errors[msg] = list(d["_key"])
        result.add_error(
            f"{len(duplicates)} duplicate(s) with same name and url",
            errors=errors)
    return result
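
A minimal usage sketch for `check_items`, assuming `pandas` is imported and the `Result` class used above is importable; the data and tag values below are invented for illustration:

import pandas as pd

df = pd.DataFrame({
    "name": ["Shoe", "Shoe", "Hat"],
    "url": ["/shoe", "/shoe", "/hat"],
    "_key": ["0", "1", "2"],
})
tags = {"name_field": ["name"], "product_url_field": ["url"]}
result = check_items(df, tags)
# Items "0" and "1" share a name and url, so err_items_count == 2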
Example #2
def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non-unique items
    """
    unique_fields = tagged_fields.get("unique", [])
    result = Result("Duplicates By **unique** Tag")

    if not unique_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    err_keys = set()
    for field in unique_fields:
        result.items_count = df[field].count()
        duplicates = df[df.duplicated(field, keep=False)][[field]]
        errors = {}
        for _, d in duplicates.groupby([field]):
            keys = list(d.index)
            msg = f"same '{d[field].iloc[0]}' `{field}`"
            errors[msg] = keys
            err_keys = err_keys.union(keys)
        if not duplicates.empty:
            result.add_error(
                f"{field} contains {len(duplicates[field].unique())} duplicated value(s)",
                errors=errors,
            )

    result.err_items_count = len(err_keys)
    return result
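
A hedged sketch of how `find_by_unique` might be called; a plain dict stands in for the `TaggedFields` type, and this variant reports item keys from the DataFrame index (sample values are invented):

import pandas as pd

df = pd.DataFrame({"id": [1, 2, 2, 3]}, index=["k0", "k1", "k2", "k3"])
result = find_by_unique(df, {"unique": ["id"]})
# "id" has one duplicated value (2), so items "k1" and "k2" are flagged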
Example #3
def check_uniqueness(df: pd.DataFrame,
                     tagged_fields: Dict[str, List[str]]) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non-unique items
    """
    unique_fields = tagged_fields.get("unique", [])
    result = Result("Uniqueness")

    if not unique_fields:
        result.add_info("'unique' tag was not found in schema")
        return result

    err_keys = set()
    for field in unique_fields:
        result.items_count = df[field].count()
        duplicates = df[df[field].duplicated(keep=False)][[field, "_key"]]
        errors = {}
        for _, d in duplicates.groupby([field]):
            keys = list(d["_key"])
            msg = f"same '{d[field].iloc[0]}' {field}"
            errors[msg] = keys
            err_keys = err_keys.union(keys)
        if not duplicates.empty:
            result.add_error(
                f"'{field}' contains {len(duplicates[field].unique())} duplicated value(s)",
                errors=errors,
            )

    result.err_items_count = len(err_keys)
    return result
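
Unlike the previous variant, `check_uniqueness` reads item keys from an explicit `_key` column rather than the index. A small invented example:

import pandas as pd

df = pd.DataFrame({
    "sku": ["A1", "A1", "B2"],
    "_key": ["item/0", "item/1", "item/2"],
})
result = check_uniqueness(df, {"unique": ["sku"]})
# 'sku' contains 1 duplicated value ("A1"); err_items_count == 2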
Example #4
def compare_was_now(df: pd.DataFrame,
                    tagged_fields: Dict[str, List[str]]) -> Result:
    """Compare price_was and price_now tagged fields"""

    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if (price_was_fields and price_was_fields[0] in df.columns and price_fields
            and price_fields[0] in df.columns):
        price_field = price_fields[0]
        price_was_field = price_was_fields[0]
        prices = df.copy()
        prices[price_was_field] = prices[price_was_field].astype(float)
        prices[price_field] = prices[price_field].astype(float)

        df_prices_less = pd.DataFrame(
            prices[prices[price_was_field] < prices[price_field]],
            columns=["_key", price_was_field, price_field],
        )

        price_less_percent = "{:.2%}".format(
            len(df_prices_less) / items_number)

        if not df_prices_less.empty:
            error = f"Past price is less than current for {len(df_prices_less)} items"
            result.add_error(
                f"{price_less_percent} ({len(df_prices_less)}) of "
                f"items with {price_was_field} < {price_field}",
                detailed=f"{error}:\n{list(df_prices_less['_key'])}",
            )

        df_prices_equals = pd.DataFrame(
            prices[prices[price_was_field] == prices[price_field]],
            columns=["_key", price_was_field, price_field],
        )
        price_equal_percent = "{:.2%}".format(
            len(df_prices_equals) / items_number)

        if not df_prices_equals.empty:
            result.add_warning(
                (f"{price_equal_percent} ({len(df_prices_equals)}) "
                 f"of items with {price_was_field} = {price_field}"),
                detailed=(f"Prices equal for {len(df_prices_equals)} items:\n"
                          f"{list(df_prices_equals['_key'])}"),
            )

        result.err_items_count = len(df_prices_equals) + len(df_prices_less)
        result.items_count = len(df.index)

    else:
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema")
    return result
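
A usage sketch for `compare_was_now` with invented prices, assuming `Result` is in scope:

import pandas as pd

df = pd.DataFrame({
    "_key": ["0", "1", "2"],
    "price": [10.0, 15.0, 20.0],
    "was_price": [12.0, 15.0, 18.0],
})
tags = {"product_price_field": ["price"],
        "product_price_was_field": ["was_price"]}
result = compare_was_now(df, tags)
# Item "2" triggers the error (was < now); item "1" the warning (was == now)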
Example #5
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Compare price_was and price_now tagged fields"""

    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if not price_was_fields or not price_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=[price_was_field, price_field],
    )

    price_less_percent = "{:.2%}".format(len(df_prices_less) / items_number)

    if not df_prices_less.empty:
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less.index)}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=[price_was_field, price_field],
    )
    price_equal_percent = "{:.2%}".format(len(df_prices_equals) / items_number)

    if not df_prices_equals.empty:
        result.add_warning(
            (
                f"{price_equal_percent} ({len(df_prices_equals)}) "
                f"of items with {price_was_field} = {price_field}"
            ),
            detailed=(
                f"Prices equal for {len(df_prices_equals)} items:\n"
                f"{list(df_prices_equals.index)}"
            ),
        )

    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = len(df.index)

    return result
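
This variant returns early when either tag is missing. A sketch of that skip path, assuming `Outcome` comes from the same library as `Result`:

import pandas as pd

df = pd.DataFrame({"price": [10.0]}, index=["0"])
result = compare_was_now(df, {"product_price_field": ["price"]})
# No "product_price_was_field" tag, so the result carries Outcome.SKIPPED info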
Example #6
def create_result(
    rule_name, messages, stats=None, err_items_count=None, items_count=None
):
    result = Result(rule_name)
    # `messages` maps each severity level to a list of message argument tuples
    for level, level_messages in messages.items():
        for message in level_messages:
            result.add_message(level, *message)

    if stats:
        result.stats = stats
    if err_items_count:
        result.err_items_count = err_items_count
    if items_count:
        result.items_count = items_count
    return result
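
The `messages` argument maps a severity level to a list of argument tuples, each unpacked into `result.add_message`. The exact level type and `add_message` signature are assumptions here; plain strings stand in for what is likely an enum in the real library:

messages = {
    "error": [("5 items missing 'price'",)],
    "warning": [("2 items have an empty 'name'",)],
}
result = create_result("Price Checks", messages,
                       err_items_count=5, items_count=100)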
Example #7
def create_result(rule_name,
                  messages,
                  err_items_count=None,
                  checked_fields=None,
                  items_count=None):
    result = Result(rule_name)
    # `messages` maps each severity level to a list of message argument tuples
    for level, level_messages in messages.items():
        for message in level_messages:
            result.add_message(level, *message)

    if err_items_count:
        result.err_items_count = err_items_count
    if checked_fields:
        result.checked_fields = checked_fields
    if items_count:
        result.items_count = items_count
    return result
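
The same kind of sketch for this variant, which additionally records which fields were inspected via `checked_fields` (values invented):

messages = {"info": [("All URLs are absolute",)]}
result = create_result("URL Checks", messages,
                       checked_fields=["url"], items_count=50)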
Example #8
def garbage_symbols(df: pd.DataFrame) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
        r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*?/??>|<!--|-->)")

    errors = {}
    row_keys = set()
    rule_result = Result("Garbage Symbols", items_count=len(df))

    for column in tqdm_notebook(df.select_dtypes(["object"]).columns,
                                desc="Garbage Symbols"):
        matches = df[column].apply(str).str.extractall(garbage,
                                                       flags=re.IGNORECASE)
        if not matches.empty:
            error_keys = df.loc[matches.unstack().index.values].index
            bad_texts = (
                matches.stack().value_counts().index.sort_values().tolist())
            # escape backslashes for markdown repr, `\n > \\n`
            bad_texts = [
                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
                for bx in bad_texts
            ]
            error = (f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
                     f"values contain `{', '.join(bad_texts)}`")

            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    if errors:
        rule_result.add_error(
            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
        rule_result.err_items_count = len(row_keys)

    return rule_result
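
A hedged example for `garbage_symbols`; it assumes the snippet's module-level imports (numpy, re, codecs, tqdm's tqdm_notebook) and `Result` are in scope, and uses invented rows:

import pandas as pd

df = pd.DataFrame({
    "title": [" leading space", "clean", "&amp; entity"],
    "price": [1, 2, 3],  # numeric column, ignored by select_dtypes
})
result = garbage_symbols(df)
# Row 0 matches the "spaces" group and row 2 the "html_entities" group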
Example #9
def garbage_symbols(items: Items) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&amp|&reg)"
        r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
        r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*/?>|<!--|-->)")

    errors = {}
    row_keys = set()
    rule_result = Result("Garbage Symbols", items_count=items.size)

    for column in items.flat_df.select_dtypes(["object"]):
        matches = items.flat_df[column].str.extractall(garbage,
                                                       flags=re.IGNORECASE)
        matches = matches[["spaces", "html_entities", "css", "html_tags"]]
        if not matches.empty:
            error_keys = items.flat_df.iloc[
                matches.unstack().index.values]["_key"]
            original_column = items.get_origin_column_name(column)
            bad_texts = (
                matches.stack().value_counts().index.sort_values().tolist())
            error = (
                f"{len(error_keys)/items.size*100:.1f}% of '{original_column}' "
                f"values contain {[t[:20] for t in bad_texts]}")
            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)

    if errors:
        rule_result.add_error(
            f"{len(row_keys)/items.size * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
        rule_result.err_items_count = len(row_keys)

    return rule_result
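
This variant depends on an `Items` wrapper exposing `flat_df`, `size`, and `get_origin_column_name`, so rather than mock that interface, here is a standalone check of its `garbage` pattern itself:

import re

garbage = (
    r"(?P<spaces>^\s|\s$)"
    r"|(?P<html_entities>&amp|&reg)"
    r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
    r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
    r"blockquote)\s*/?>|<!--|-->)")

for text in ["plain", " padded ", "<div>html</div>", "&amp;"]:
    hits = [m.lastgroup for m in re.finditer(garbage, text, flags=re.IGNORECASE)]
    print(text, "->", hits or "clean")
# "plain" is clean; the others hit the spaces, html_tags and html_entities groups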
Example #10
def find_by(df: pd.DataFrame, columns: List[str]) -> Result:
    """Compare items rows in `df` by `columns`

    Returns:
        Any duplicates
    """
    result = Result(f"Duplicates")
    result.items_count = len(df)
    df = df.dropna(subset=columns, how="all")
    duplicates = df[df.duplicated(columns, keep=False)][columns]
    if duplicates.empty:
        return result

    result.err_items_count = len(duplicates)
    errors = {}
    for _, d in duplicates.groupby(columns):
        msgs = [f"'{d[c].iloc[0]}' `{c}`" for c in columns]
        errors[f"same {', '.join(msgs)}"] = list(d.index)

    result.add_error(
        f"{len(duplicates)} duplicate(s) with same {', '.join(columns)}",
        errors=errors)
    return result
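
A closing sketch for the generic `find_by`, which keys duplicates off the index (sample data invented):

import pandas as pd

df = pd.DataFrame(
    {"name": ["A", "A", "B"], "url": ["/a", "/a", "/b"]},
    index=["k0", "k1", "k2"],
)
result = find_by(df, ["name", "url"])
# "k0" and "k1" are reported: same 'A' `name`, '/a' `url`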