コード例 #1
0
ファイル: json_schema.py プロジェクト: gitter-badger/arche
def check_tags(source_columns, target_columns, tags):
    """Verify that every field tagged in the schema exists in the source
    columns and, when target columns are present, in the target columns."""
    result = Result("Tags")

    used_tags = list(tags)
    if used_tags:
        result.add_info(", ".join(used_tags))

    # Flatten all tagged field names across every tag.
    fields = [field for tag_name in tags for field in tags[tag_name]]

    absent_in_source = sorted(set(fields) - set(source_columns))
    if absent_in_source:
        result.add_error(
            f"{str(absent_in_source)[1:-1]} field(s) was not found in "
            "source, but specified in schema")

    if target_columns.size > 0:
        absent_in_target = sorted(set(fields) - set(target_columns))
        if absent_in_target:
            result.add_error(
                f"{str(absent_in_target)[1:-1]} field(s) was not found "
                "in target, but specified in schema")

    if result.errors:
        result.add_error("Skipping tag rules")

    return result
コード例 #2
0
ファイル: coverage.py プロジェクト: gitter-badger/arche
def compare_scraped_fields(source_df, target_df):
    """Report columns missing from the source and columns new in the source,
    together with per-column coverage figures from the respective dataframe."""
    source_coverage = dict(
        source_df.count().sort_values(ascending=False))
    target_coverage = dict(
        target_df.count().sort_values(ascending=False))

    result = Result("Scraped Fields")
    source_cols = set(source_df.columns.values)
    target_cols = set(target_df.columns.values)

    missing_fields = target_cols - source_cols
    if missing_fields:
        details = ["Missing Fields"]
        for column in missing_fields:
            pct = target_coverage[column] / len(target_df) * 100
            details.append(
                f"{column} - coverage - {int(pct)}% - "
                f"{target_coverage[column]} items")
        result.add_error(f"{len(missing_fields)} field(s) are missing",
                         "\n".join(details))

    new_fields = source_cols - target_cols
    if new_fields:
        details = ["New Fields"]
        for column in new_fields:
            pct = source_coverage[column] / len(source_df) * 100
            details.append(
                f"{column} - coverage - {int(pct)}% - "
                f"{source_coverage[column]} items")
        result.add_info(f"{len(new_fields)} field(s) are new",
                        "\n".join(details))

    return result
コード例 #3
0
def compare_runtime(source_job: Job, target_job: Job) -> Result:
    """Compare the runtimes of two jobs; escalate from info to warning to
    error as the source job gets slower relative to the target job."""
    result = Result("Compare Runtime")
    src_ms = api.get_runtime(source_job)
    tgt_ms = api.get_runtime(target_job)

    if not src_ms or not tgt_ms:
        result.add_warning("Jobs are not finished")
        return result

    if src_ms <= tgt_ms:
        result.add_info(
            f"Similar or better runtime - {helpers.ms_to_time(src_ms)} and "
            f"{helpers.ms_to_time(tgt_ms)}"
        )
        return result

    diff = helpers.ratio_diff(src_ms, tgt_ms)
    msg = (
        f"Sources differ on {diff}% - "
        f"{helpers.ms_to_time(src_ms)} and "
        f"{helpers.ms_to_time(tgt_ms)}"
    )
    # Thresholds: >20% is an error, >10% a warning, otherwise just info.
    if diff > 0.2:
        result.add_error(msg)
    elif diff > 0.1:
        result.add_warning(msg)
    else:
        result.add_info(msg)
    return result
コード例 #4
0
def check_uniqueness(df: pd.DataFrame,
                     tagged_fields: Dict[str, List[str]]) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non unique items
    """
    result = Result("Uniqueness")
    unique_fields = tagged_fields.get("unique", [])
    if not unique_fields:
        result.add_info("'unique' tag was not found in schema")
        return result

    bad_keys = set()
    for column in unique_fields:
        # NOTE: items_count reflects the last checked column only.
        result.items_count = df[column].count()
        dupes = df[df[column].duplicated(keep=False)][[column, "_key"]]
        errors = {}
        for _, group in dupes.groupby([column]):
            keys = list(group["_key"])
            errors[f"same '{group[column].iloc[0]}' {column}"] = keys
            bad_keys |= set(keys)
        if not dupes.empty:
            result.add_error(
                f"'{column}' contains {len(dupes[column].unique())} duplicated value(s)",
                errors=errors,
            )

    result.err_items_count = len(bad_keys)
    return result
コード例 #5
0
def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non unique items
    """
    result = Result("Duplicates By **unique** Tag")
    unique_fields = tagged_fields.get("unique", [])
    if not unique_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    bad_keys = set()
    for column in unique_fields:
        # NOTE: items_count reflects the last checked column only.
        result.items_count = df[column].count()
        dupes = df[df.duplicated(column, keep=False)][[column]]
        errors = {}
        for _, group in dupes.groupby([column]):
            keys = list(group.index)
            errors[f"same '{group[column].iloc[0]}' `{column}`"] = keys
            bad_keys |= set(keys)
        if not dupes.empty:
            result.add_error(
                f"{column} contains {len(dupes[column].unique())} duplicated value(s)",
                errors=errors,
            )

    result.err_items_count = len(bad_keys)
    return result
コード例 #6
0
ファイル: json_schema.py プロジェクト: vipulgupta2048/arche
def check_tags(source_columns: np.ndarray, target_columns: np.ndarray,
               tags: TaggedFields) -> Result:
    """Report used/unused schema tags and verify that every tagged field
    exists in the source columns and, when given, in the target columns.

    Args:
        source_columns: column names present in the source data
        target_columns: column names present in the target data, or None
        tags: mapping of tag name to the list of fields carrying that tag

    Returns:
        A result with info about tag usage and errors for tagged fields
        absent from source or target.
    """
    result = Result("Tags")

    found_tags = sorted(tags)  # sorted() accepts any iterable - no list() needed
    if found_tags:
        result.add_info(f"Used - {', '.join(found_tags)}")

    # Tag.__members__ is a name -> member mapping; iterating it yields the
    # names directly, so the former items() comprehension was redundant.
    not_used_tags = sorted(set(Tag.__members__) - set(tags))
    if not_used_tags:
        result.add_info(f"Not used - {', '.join(not_used_tags)}")

    tagged_fields = []
    for tag in tags:
        tagged_fields.extend(tags[tag])

    missing_in_source = sorted(set(tagged_fields) - set(source_columns))
    if missing_in_source:
        result.add_error(
            f"{str(missing_in_source)[1:-1]} field(s) was not found in "
            "source, but specified in schema")

    if target_columns is not None:
        missing_in_target = sorted(set(tagged_fields) - set(target_columns))
        if missing_in_target:
            result.add_error(
                f"{str(missing_in_target)[1:-1]} field(s) was not found "
                "in target, but specified in schema")

    if result.errors:
        result.add_error("Skipping tag rules")

    return result
コード例 #7
0
def check_items(df: pd.DataFrame, tagged_fields: Dict[str,
                                                      List[str]]) -> Result:
    """Check for items with the same name and url"""

    result = Result("Duplicated Items")
    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    if not name_fields or not url_fields:
        result.add_info(
            "'name_field' and 'product_url_field' tags were not found in schema"
        )
        return result

    result.items_count = len(df)
    name_col, url_col = name_fields[0], url_fields[0]
    subset = df[[name_col, url_col, "_key"]]
    dupes = subset[subset[[name_col, url_col]].duplicated(keep=False)]
    if dupes.empty:
        return result

    result.err_items_count = len(dupes)
    errors = {
        f"same '{group[name_col].iloc[0]}' name and '{group[url_col].iloc[0]}' url":
        list(group["_key"])
        for _, group in dupes.groupby([name_col, url_col])
    }
    result.add_error(
        f"{len(dupes)} duplicate(s) with same name and url",
        errors=errors)
    return result
コード例 #8
0
ファイル: category.py プロジェクト: zanachka/arche
def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result:
    """Find category columns. A category column is the column which holds a limited number
    of possible values, including `NAN`.

    Args:
        df: data
        max_uniques: filter which determines which columns to use. Only columns with
        the number of unique values less than or equal to `max_uniques` are category columns.

    Returns:
        A result with stats containing value counts of categorical columns.
    """
    result = Result("Categories")

    candidates = find_likely_cats(df, max_uniques)
    stats = []
    for column in tqdm(candidates, desc="Finding categories",
                       total=len(candidates)):
        counts = df[column].value_counts(dropna=False)
        # find_likely_cats is a cheap pre-filter; re-check the real cardinality.
        if len(counts) <= max_uniques:
            stats.append(counts)
    result.stats = stats

    if not result.stats:
        result.add_info("Categories were not found")
        return result
    result.add_info(f"{len(result.stats)} category field(s)")
    result.outcome = Outcome.INFO
    return result
コード例 #9
0
def compare_prices_for_same_urls(source_df: pd.DataFrame,
                                 target_df: pd.DataFrame,
                                 tagged_fields: TaggedFields) -> Result:
    """For each pair of items that have the same `product_url_field` tagged field,
    compare `product_price_field` field

    Returns:
        A result containing pairs of items from `source_df` and `target_df`
        which `product_price_field` differ.
    """
    result = Result("Compare Prices For Same Urls")
    url_fields: Optional[List[str]] = tagged_fields.get("product_url_field")
    if not url_fields:
        result.outcome = Outcome.SKIPPED
        return result
    url_field = url_fields[0]

    source_df = source_df.dropna(subset=[url_field])
    target_df = target_df.dropna(subset=[url_field])

    same_urls = source_df[
        source_df[url_field].isin(target_df[url_field].values)][url_field]

    price_fields = tagged_fields.get("product_price_field")
    if not price_fields:
        result.add_info("product_price_field tag is not set")
        return result
    price_field = price_fields[0]

    detailed_messages = []
    for url in same_urls:
        if url.strip() == "nan":
            continue
        src_rows = source_df[source_df[url_field] == url]
        tgt_rows = target_df[target_df[url_field] == url]
        source_price = src_rows[price_field].iloc[0]
        target_price = tgt_rows[price_field].iloc[0]

        # Only flag numeric prices that differ by more than 10%.
        if (is_number(source_price) and is_number(target_price)
                and ratio_diff(source_price, target_price) > 0.1):
            detailed_messages.append(
                f"different prices for url: {url}\nsource price is {source_price} "
                f"for {src_rows.index[0]}\ntarget price is {target_price} for {tgt_rows.index[0]}"
            )

    res = f"{len(same_urls)} checked, {len(detailed_messages)} errors"
    if detailed_messages:
        result.add_error(res, detailed="\n".join(detailed_messages))
    else:
        result.add_info(res)

    return result
コード例 #10
0
def check_outcome(job: Job) -> Result:
    """Verify that the job reached the 'finished' state with a 'finished'
    close reason; anything else is reported as an error."""
    result = Result("Job Outcome")
    state = api.get_job_state(job)
    reason = api.get_job_close_reason(job)
    if state == "finished" and reason == "finished":
        result.add_info("Finished")
    else:
        result.add_error(f"Job has '{state}' state, '{reason}' close reason")
    return result
コード例 #11
0
def check_response_ratio(job: Job) -> Result:
    """Report the ratio of responses to scraped items for a job.

    Args:
        job: the job to inspect

    Returns:
        A result with the rounded responses-per-item ratio, or a warning
        when the job scraped no items (avoiding a ZeroDivisionError).
    """
    requests_number = api.get_requests_count(job)
    items_count = api.get_items_count(job)
    result = Result("Responses Per Item Ratio")
    if not items_count:
        # A job with zero items would crash the division below.
        result.add_warning("Job has no scraped items")
        return result
    result.add_info(
        f"Number of responses / Number of scraped items - "
        f"{round(requests_number / items_count, 2)}"
    )
    return result
コード例 #12
0
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
    """Compare price_was and price_now tagged fields"""

    result = Result("Compare Price Was And Now")
    was_fields = tagged_fields.get("product_price_was_field")
    now_fields = tagged_fields.get("product_price_field")
    if not was_fields or not now_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    total = len(df.index)
    was_col = was_fields[0]
    now_col = now_fields[0]
    # Work on a float copy so comparisons are numeric.
    prices = df.copy()
    prices[was_col] = prices[was_col].astype(float)
    prices[now_col] = prices[now_col].astype(float)

    less = pd.DataFrame(
        prices[prices[was_col] < prices[now_col]],
        columns=[was_col, now_col],
    )
    less_pct = "{:.2%}".format(len(less) / total)
    if not less.empty:
        error = f"Past price is less than current for {len(less)} items"
        result.add_error(
            f"{less_pct} ({len(less)}) of "
            f"items with {was_col} < {now_col}",
            detailed=f"{error}:\n{list(less.index)}",
        )

    equal = pd.DataFrame(
        prices[prices[was_col] == prices[now_col]],
        columns=[was_col, now_col],
    )
    equal_pct = "{:.2%}".format(len(equal) / total)
    if not equal.empty:
        result.add_warning(
            (
                f"{equal_pct} ({len(equal)}) "
                f"of items with {was_col} = {now_col}"
            ),
            detailed=(
                f"Prices equal for {len(equal)} items:\n"
                f"{list(equal.index)}"
            ),
        )

    result.err_items_count = len(equal) + len(less)
    result.items_count = len(df.index)

    return result
コード例 #13
0
ファイル: price.py プロジェクト: gitter-badger/arche
def compare_was_now(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Compare price_was and price_now tagged fields"""

    result = Result("Compare Price Was And Now")
    was_fields = tagged_fields.get("product_price_was_field")
    now_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    # Both tags must be present AND point at real columns.
    has_both = (was_fields and was_fields[0] in df.columns
                and now_fields and now_fields[0] in df.columns)
    if not has_both:
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema")
        return result

    now_col = now_fields[0]
    was_col = was_fields[0]
    prices = df.copy()
    prices[was_col] = prices[was_col].astype(float)
    prices[now_col] = prices[now_col].astype(float)

    less = pd.DataFrame(
        prices[prices[was_col] < prices[now_col]],
        columns=["_key", was_col, now_col],
    )
    less_pct = "{:.2%}".format(len(less) / items_number)
    if not less.empty:
        error = f"Past price is less than current for {len(less)} items"
        result.add_error(
            f"{less_pct} ({len(less)}) of "
            f"items with {was_col} < {now_col}",
            detailed=f"{error}:\n{list(less['_key'])}",
        )

    equal = pd.DataFrame(
        prices[prices[was_col] == prices[now_col]],
        columns=["_key", was_col, now_col],
    )
    equal_pct = "{:.2%}".format(len(equal) / items_number)
    if not equal.empty:
        result.add_warning(
            (f"{equal_pct} ({len(equal)}) "
             f"of items with {was_col} = {now_col}"),
            detailed=(f"Prices equal for {len(equal)} items:\n"
                      f"{list(equal['_key'])}"),
        )

    result.err_items_count = len(equal) + len(less)
    result.items_count = len(df.index)
    return result
コード例 #14
0
def get_coverage_per_category(df: pd.DataFrame,
                              tagged_fields: Dict[str, List[str]]):
    """Report value counts for every column tagged as `category`."""
    result = Result("Coverage For Scraped Categories")

    for column in tagged_fields.get("category", []):
        counts = df[column].value_counts()
        result.add_info(f"{len(counts)} categories in '{column}'",
                        stats=counts)
    return result
コード例 #15
0
def check_errors(job: Job) -> Result:
    """Report the job's log error count, with a link to the filtered error
    log when any errors exist.

    Returns:
        An error result with the log URL if errors occurred, info otherwise.
    """
    errors_count = api.get_errors_count(job)
    result = Result("Job Errors")
    if errors_count:
        url = f"{SH_URL}/{job.key}/log?filterType=error&filterAndHigher"
        result.add_error(
            f"{errors_count} error(s)", detailed=f"Errors for {job.key} - {url}"
        )
    else:
        # Plain literal: the previous f-string had no placeholders (lint F541).
        result.add_info("No errors")
    return result
コード例 #16
0
def get_difference(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    category_names: List[str],
    source_key: str = "source",
    target_key: str = "target",
) -> Result:
    """Find and show differences between categories coverage, including nan values.
    Coverage means value counts divided on total size.

    Args:
        source_df: a data you want to compare
        target_df: a data you want to compare with
        category_names: list of columns which values to compare
        source_key: label for `source_df`
        target_key: label for `target_df`

    Returns:
        A result instance with messages containing significant difference defined by
        thresholds, a dataframe showing all normalized value counts in percents,
        a series containing significant difference.
    """
    result = Result("Category Coverage Difference")
    warn_thr, err_thr = 0.10, 0.20

    for column in category_names:
        coverage = (
            pd.DataFrame(
                {
                    source_key: source_df[column].value_counts(dropna=False, normalize=True),
                    target_key: target_df[column].value_counts(dropna=False, normalize=True),
                }
            )
            .fillna(0)
            .sort_values(by=[source_key, target_key], kind="mergesort")
        )
        coverage.name = f"Coverage for {column}"
        result.stats.append(coverage)

        deltas = (coverage[source_key] - coverage[target_key]).abs()
        deltas = deltas[deltas > warn_thr]
        deltas.name = f"Coverage difference more than {warn_thr:.0%} for {column}"
        if not deltas.empty:
            result.stats.append(deltas)
        over_err = deltas[deltas > err_thr]
        if not over_err.empty:
            result.add_warning(
                f"The difference is greater than {err_thr:.0%} for {len(over_err)} value(s) of {column}"
            )

    if not category_names:
        result.add_info(Outcome.SKIPPED)
    return result
コード例 #17
0
def compare_finish_time(source_job: Job, target_job: Job) -> Result:
    """Warn when the two jobs finished on different days, or when either
    job has not finished at all."""
    diff_in_days = api.get_finish_time_difference_in_days(source_job, target_job)

    result = Result("Finish Time")
    # None means at least one job never finished; check it before the == 0
    # comparison (None == 0 is False, so ordering does not change behavior).
    if diff_in_days is None:
        result.add_warning("Jobs are not finished")
    elif diff_in_days == 0:
        result.add_info("Less than 1 day difference")
    else:
        result.add_warning(f"{diff_in_days} day(s) difference between 2 jobs")

    return result
コード例 #18
0
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare booleans distribution between two dataframes

    Args:
        source_df: data to compare
        target_df: data to compare with
        err_thr: difference in True-frequency treated as an error
        warn_thr: difference in True-frequency treated as a warning

    Returns:
        A result containing dataframe with distributions and messages if differences
        are in thresholds
    """

    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_bool, target_bool):
        result.add_info(Outcome.SKIPPED)
        return result

    # Dummy frame guarantees both True and False columns exist even when a
    # field holds only one of the two values.
    dummy = pd.DataFrame(columns=[True, False])
    # Per-Series value_counts: the top-level pd.value_counts is deprecated
    # since pandas 2.1.
    source_counts = pd.concat(
        [dummy, source_bool.apply(lambda s: s.value_counts(normalize=True)).T],
        sort=False).fillna(0.0)
    target_counts = pd.concat(
        [dummy, target_bool.apply(lambda s: s.value_counts(normalize=True)).T],
        sort=False).fillna(0.0)
    difs = (source_counts - target_counts)[True]

    bool_covs = pd.concat([
        source_counts.rename("{}_source".format),
        target_counts.rename("{}_target".format),
    ]).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    err_diffs = difs[difs.abs() > err_thr]
    if not err_diffs.empty:
        result.add_error(
            f"{', '.join(err_diffs.index)} relative frequencies differ "
            f"by more than {err_thr:.0%}")

    warn_diffs = difs[(difs.abs() > warn_thr) & (difs.abs() <= err_thr)]
    if not warn_diffs.empty:
        result.add_warning(
            f"{', '.join(warn_diffs.index)} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}")

    return result
コード例 #19
0
def compare_scraped_fields(source_df: pd.DataFrame,
                           target_df: pd.DataFrame) -> Result:
    """Find new or missing columns between source_df and target_df

    Returns:
        A result with an error listing columns only in target and info
        listing columns only in source.
    """
    result = Result("Scraped Fields")

    missing_fields = target_df.columns.difference(source_df.columns)
    # Use Index.empty: truth-testing `.array` only works because
    # ExtensionArray falls back to __len__, which is fragile and unclear.
    if not missing_fields.empty:
        result.add_error(f"Missing - {', '.join(missing_fields)}")

    new_fields = source_df.columns.difference(target_df.columns)
    if not new_fields.empty:
        result.add_info(f"New - {', '.join(new_fields)}")

    return result
コード例 #20
0
def find_by_name_url(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Check for items with the same name and url"""

    title = "Duplicates By **name_field, product_url_field** Tags"
    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    if not name_fields or not url_fields:
        skipped = Result(title)
        skipped.add_info(Outcome.SKIPPED)
        return skipped

    # Delegate the actual duplicate search, then restore our result name.
    result = find_by(df, [name_fields[0], url_fields[0]])
    result.name = title
    return result
コード例 #21
0
def find_by_tags(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Check for duplicates based on schema tags. In particular, look for items with
    the same `name_field` and `product_url_field`, and for uniqueness among `unique` field

    Args:
        df: data to check
        tagged_fields: mapping of tag name to tagged field names

    Returns:
        A result from `find_by`, or a SKIPPED result when no relevant tags exist.
    """

    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    # Copy so we never mutate the list stored in tagged_fields["unique"]
    # (the old code extended it in place, corrupting the caller's mapping).
    columns_to_check: List = list(tagged_fields.get("unique", []))
    if (not name_fields or not url_fields) and not columns_to_check:
        result = Result("Duplicates")
        result.add_info(Outcome.SKIPPED)
        return result
    if name_fields and url_fields:
        # append the single [name, url] pair; extend([[...]]) was a
        # roundabout way to do the same thing.
        columns_to_check.append([name_fields[0], url_fields[0]])

    return find_by(df, columns_to_check)
コード例 #22
0
def compare_prices_for_same_names(source_df: pd.DataFrame,
                                  target_df: pd.DataFrame,
                                  tagged_fields: TaggedFields):
    """Compare `product_price_field` values of items sharing the same
    `name_field` value between the two dataframes."""
    result = Result("Compare Prices For Same Names")
    name_fields = tagged_fields.get("name_field")
    if not name_fields:
        result.outcome = Outcome.SKIPPED
        return result
    name_field = name_fields[0]

    source_df = source_df[source_df[name_field].notnull()]
    target_df = target_df[target_df[name_field].notnull()]

    same_names = source_df[
        source_df[name_field].isin(target_df[name_field].values)][name_field]

    price_fields = tagged_fields.get("product_price_field")
    if not price_fields:
        result.add_info("product_price_field tag is not set")
        return result
    price_field = price_fields[0]

    detailed_messages = []
    for name in same_names:
        if name.strip() == "nan":
            continue
        src_rows = source_df[source_df[name_field] == name]
        tgt_rows = target_df[target_df[name_field] == name]
        source_price = src_rows[price_field].iloc[0]
        target_price = tgt_rows[price_field].iloc[0]
        # Only flag numeric prices that differ by more than 10%.
        if (is_number(source_price) and is_number(target_price)
                and ratio_diff(source_price, target_price) > 0.1):
            detailed_messages.append(
                f"different price for {name}\nsource price is {source_price} "
                f"for {src_rows.index[0]}\ntarget price is {target_price} for {tgt_rows.index[0]}"
            )

    result_msg = f"{len(same_names)} checked, {len(detailed_messages)} errors"
    if detailed_messages:
        result.add_error(result_msg, detailed="\n".join(detailed_messages))
    else:
        result.add_info(result_msg)

    return result
コード例 #23
0
def compare_names_for_same_urls(
    source_df: pd.DataFrame, target_df: pd.DataFrame, tagged_fields: TaggedFields
):
    """For each pair of items that have the same `product_url_field` tagged field,
    compare `name_field` field"""

    result = Result("Compare Names Per Url")
    url_fields = tagged_fields.get("product_url_field")
    name_fields = tagged_fields.get("name_field")
    if not url_fields or not name_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    url_field = url_fields[0]
    name_field = name_fields[0]

    same_urls = source_df[
        source_df[url_field].isin(target_df[url_field].values)][url_field]

    detailed_messages = []
    for url in same_urls:
        if url.strip() == "nan":
            continue
        src_rows = source_df[source_df[url_field] == url]
        tgt_rows = target_df[target_df[url_field] == url]
        source_name = src_rows[name_field].iloc[0]
        target_name = tgt_rows[name_field].iloc[0]

        # Different non-"nan" names for the same url is an error.
        if (source_name != target_name
                and source_name.strip() != "nan"
                and target_name.strip() != "nan"):
            detailed_messages.append(
                f"different names for url: {url}\nsource name is {source_name} "
                f"for {src_rows.index[0]}\ntarget name is {target_name} for {tgt_rows.index[0]}"
            )

    res = f"{len(same_urls)} checked, {len(detailed_messages)} errors"
    if detailed_messages:
        result.add_error(res, detailed="\n".join(detailed_messages))
    else:
        result.add_info(res)

    return result
コード例 #24
0
ファイル: coverage.py プロジェクト: gitter-badger/arche
def check_fields_coverage(df):
    """Report per-field value counts and coverage percent; treat any field
    with zero values as an error.

    Args:
        df: data to assess

    Returns:
        A result with a per-field coverage table, an error when completely
        empty fields exist, info otherwise.
    """
    fields_coverage = pd.DataFrame(df.count(), columns=["Values Count"])
    fields_coverage.index.name = "Field"
    # Vectorized percentage instead of a Python-level per-row apply; same
    # truncation semantics for the non-negative counts involved.
    fields_coverage["Percent"] = (
        fields_coverage["Values Count"] / len(df) * 100).astype(int)

    detailed_msg = fields_coverage.sort_values(
        by=["Percent", "Field"]).to_string()

    empty_fields = fields_coverage[fields_coverage["Values Count"] == 0]
    result_msg = f"{len(empty_fields)} totally empty field(s)"

    result = Result("Fields Coverage")
    if empty_fields.empty:
        result.add_info(result_msg, detailed_msg)
    else:
        result.add_error(result_msg, detailed_msg)
    return result
コード例 #25
0
ファイル: category.py プロジェクト: vipulgupta2048/arche
def get_coverage_per_category(df: pd.DataFrame,
                              category_names: List) -> Result:
    """Get value counts per column, excluding nan.

    Args:
        df: a source data to assess
        category_names: list of columns which values counts to see

    Returns:
        Number of categories per field, value counts series for each field.
    """
    result = Result("Coverage For Scraped Categories")

    for column in category_names:
        counts = df[column].value_counts(ascending=True)
        result.add_info(f"{len(counts)} categories in '{column}'")
        result.stats.append(counts)
    return result
コード例 #26
0
def compare_number_of_scraped_items(source_job: Job, target_job: Job) -> Result:
    """Compare total scraped item counts of two jobs.

    Thresholds: <5% difference is info, 5-10% a warning, >=10% an error.
    """
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)
    diff = helpers.ratio_diff(items_count1, items_count2)
    result = Result("Total Scraped Items")
    if 0 <= diff < 0.05:
        if diff == 0:
            msg = "Same number of items"
        else:
            msg = f"Almost the same number of items - {items_count1} and {items_count2}"
        result.add_info(msg)
    else:
        # ``:.2%`` formats the ratio cleanly; the old ``{diff * 100}%``
        # leaked float artifacts like "5.000000000000001%". This also
        # matches the sibling implementation of this check.
        msg = f"{items_count1} differs from {items_count2} on {diff:.2%}"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
コード例 #27
0
ファイル: json_schema.py プロジェクト: gitter-badger/arche
def validate(schema: Schema, items_dicts: Items, fast: bool = False) -> Result:
    """Run JSON schema validation against Items.

    Args:
        fast: defines if we use fastjsonschema or jsonschema validation
    """
    result = Result("JSON Schema Validation")
    validator = JsonSchemaValidator(schema)
    validator.run(items_dicts, fast)

    found_errors = validator.errors
    message = (
        f"{len(items_dicts)} items were checked, {len(found_errors)} error(s)")
    if found_errors:
        result.add_error(message, errors=found_errors)
    else:
        result.add_info(message)
    return result
コード例 #28
0
ファイル: metadata.py プロジェクト: zanachka/arche
def compare_number_of_scraped_items(source_job: Job,
                                    target_job: Job) -> Result:
    """Compare total scraped item counts of two jobs; <5% difference is
    info, 5-10% a warning, >=10% an error."""
    source_count = api.get_items_count(source_job)
    target_count = api.get_items_count(target_job)
    diff = helpers.ratio_diff(source_count, target_count)
    result = Result("Total Scraped Items")
    if diff == 0:
        result.add_info("Same number of items")
    elif 0 < diff < 0.05:
        result.add_info(
            f"Almost the same number of items - {source_count} and {target_count}")
    else:
        msg = f"{source_count} differs from {target_count} on {diff:.2%}"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
コード例 #29
0
def validate(schema: RawSchema,
             raw_items: RawItems,
             keys: pd.Index,
             fast: bool = False) -> Result:
    """Run JSON schema validation against data.

    Args:
        fast: defines if we use fastjsonschema or jsonschema validation

    Returns:
        Schema errors if any
    """
    result = Result("JSON Schema Validation")
    # Pick the validator implementation, then run it once.
    validate_func = fast_validate if fast else full_validate
    errors = validate_func(schema, raw_items, keys)

    message = (
        f"{len(raw_items)} items were checked, {len(errors)} error(s)")
    if errors:
        result.add_error(message, errors=errors)
    else:
        result.add_info(message)
    return result
コード例 #30
0
ファイル: coverage.py プロジェクト: vipulgupta2048/arche
def check_fields_coverage(df: pd.DataFrame) -> Result:
    """Get fields coverage from df. Coverage reflects the percentage of real values
    (exluding `nan`) per column.

    Args:
        df: a data to count the coverage

    Returns:
        A result with coverage for all columns in provided df. If column contains only `nan`,
        treat it as an error.
    """
    coverage = df.count().sort_values(ascending=False)
    coverage.name = f"Fields coverage for {len(df):_} items"

    result = Result("Fields Coverage")
    result.stats = [coverage]

    empty_fields = coverage[coverage == 0]
    if empty_fields.empty:
        result.add_info("PASSED")
    else:
        result.add_error(f"{len(empty_fields)} empty field(s)")
    return result