Beispiel #1
0
 def get_items(
     source: Union[str, pd.DataFrame, RawItems],
     start: int,
     count: Optional[int],
     filters: Optional[api.Filters],
     expand: bool,
 ) -> Union[JobItems, CollectionItems]:
     """Build an items object from a dataframe, an iterable or a key.

     Dispatch order:
       1. a `pd.DataFrame` goes to `Items.from_df`;
       2. any other non-string iterable goes to `Items.from_array`;
       3. a job key goes to `JobItems`;
       4. a collection key goes to `CollectionItems`.

     Args:
         source: a dataframe, an iterable of raw items, or a key string
         start: forwarded to `JobItems`; must be falsy for collection keys
         count: forwarded to `JobItems` / `CollectionItems`
         filters: forwarded to `JobItems` / `CollectionItems`
         expand: forwarded to whichever constructor is selected

     Raises:
         ValueError: if `start` is truthy for a collection key, or if
             `source` matches none of the supported kinds
     """
     if isinstance(source, pd.DataFrame):
         return Items.from_df(source, expand=expand)

     # Strings are iterable too, so they are excluded here and treated as keys.
     if not isinstance(source, str) and isinstance(source, Iterable):
         return Items.from_array(source, expand=expand)

     if helpers.is_job_key(source):
         return JobItems(
             key=source,
             start=start,
             count=count,
             filters=filters,
             expand=expand,
         )

     if helpers.is_collection_key(source):
         if start:
             raise ValueError(
                 "Collections API does not support 'start' parameter")
         return CollectionItems(
             key=source,
             count=count,
             filters=filters,
             expand=expand,
         )

     raise ValueError(f"'{source}' is not a valid job or collection key")
Beispiel #2
0
def test_flat_df(data, expected_data, expected_map):
    """Expanded items expose the expected `flat_df` and column-name mapping."""
    items = Items.from_array(data, expand=True)

    actual_df = items.flat_df
    expected_df = pd.DataFrame(expected_data)
    pd.testing.assert_frame_equal(actual_df, expected_df, check_like=False)

    # Every flattened column name must resolve back to its original name.
    for flat_name, origin_name in expected_map.items():
        assert items.origin_column_name(flat_name) == origin_name
Beispiel #3
0
def test_process_df():
    """`Items.process_df` is expected to turn empty dict/list cells into NaN."""
    columns = ["a", "b", "_type"]
    raw_df = pd.DataFrame([[{}, [], "NameItem"]], columns=columns)

    processed = Items.process_df(raw_df)

    expected = pd.DataFrame([[np.nan, np.nan, "NameItem"]], columns=columns)
    pd.testing.assert_frame_equal(processed, expected)
Beispiel #4
0
 def get_items(
     source: Union[str, pd.DataFrame, RawItems],
     count: Optional[int],
     start: Optional[str],
     filters: Optional[api.Filters],
 ) -> Items:
     """Build an items object from a dataframe, an iterable or a key.

     Args:
         source: a dataframe, an iterable of raw items, or a key string
         count: forwarded to `JobItems` / `CollectionItems`
         start: converted with `int(start or 0)` for job keys; passed through
             unchanged for collection keys
         filters: forwarded to `JobItems` / `CollectionItems`

     Raises:
         ValueError: if `source` is not a dataframe, a non-string iterable,
             a job key or a collection key
     """
     if isinstance(source, pd.DataFrame):
         return Items.from_df(source)

     # Strings are iterable too, so they are excluded here and treated as keys.
     if not isinstance(source, str) and isinstance(source, Iterable):
         return Items.from_array(cast(RawItems, source))

     if helpers.is_job_key(source):
         # A missing/empty start means "from the first item".
         start_index = int(start or 0)
         return JobItems(source, count, start_index, filters)

     if helpers.is_collection_key(source):
         return CollectionItems(source, count, start, filters)

     raise ValueError(f"'{source}' is not a valid job or collection key")
Beispiel #5
0
def garbage_symbols(items: Items) -> Result:
    """Find unwanted symbols in `object` dtype columns.

    Every object column of `items.flat_df` is searched, ignoring case, for
    leading/trailing whitespace, `&amp` / `&reg` entities, CSS rules and a
    fixed list of HTML tags and comment markers.

    Args:
        items: the items to check; `flat_df`, `size` and
            `get_origin_column_name` are used

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&amp|&reg)"
        r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
        r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*/?>|<!--|-->)")

    errors = {}
    row_keys = set()
    rule_result = Result("Garbage Symbols", items_count=items.size)

    flat_df = items.flat_df
    # The builtin `object` is used instead of the `np.object` alias, which was
    # deprecated in NumPy 1.20 and removed in 1.24.
    for column in flat_df.select_dtypes([object]):
        matches = flat_df[column].str.extractall(garbage, flags=re.IGNORECASE)
        # extractall yields one column per capture group; keep only the named
        # ones and drop the unnamed group nested inside `html_tags`.
        matches = matches[["spaces", "html_entities", "css", "html_tags"]]
        if matches.empty:
            continue

        # NOTE(review): row labels from the match index are used positionally,
        # which assumes `flat_df` has a default integer index - confirm.
        error_keys = flat_df.iloc[matches.unstack().index.values]["_key"]
        original_column = items.get_origin_column_name(column)
        bad_texts = matches.stack().value_counts().index.sort_values(
        ).tolist()
        error = (
            f"{len(error_keys)/items.size*100:.1f}% of '{original_column}' "
            f"values contain {[t[:20] for t in bad_texts]}")
        errors[error] = list(error_keys)
        row_keys.update(error_keys)

    if errors:
        rule_result.add_error(
            f"{len(row_keys)/items.size * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
        rule_result.err_items_count = len(row_keys)

    return rule_result
Beispiel #6
0
def test_items_from_array(raw):
    """`Items.from_array` keeps the raw array and builds a matching dataframe."""
    items = Items.from_array(raw)

    expected_raw = np.array(raw)
    np.testing.assert_array_equal(items.raw, expected_raw)

    expected_df = pd.DataFrame(list(raw))
    pd.testing.assert_frame_equal(items.df, expected_df)
Beispiel #7
0
def test_items_from_df(df, expected_raw, expected_df):
    """`Items.from_df` exposes the expected `raw` array and `df` frame."""
    built = Items.from_df(df)

    actual_raw = built.raw
    np.testing.assert_array_equal(actual_raw, expected_raw)

    actual_df = built.df
    pd.testing.assert_frame_equal(actual_df, expected_df)
Beispiel #8
0
def test_no_categorize():
    """A column of 99 distinct integers must not become categorical.

    The frame is inspected after the call, so `Items.categorize` is expected
    to work on the passed dataframe in place.
    """
    frame = pd.DataFrame({"a": list(range(99))})
    Items.categorize(frame)

    categorical_columns = frame.select_dtypes(["category"])
    assert categorical_columns.empty
Beispiel #9
0
def test_categorize(data, expected_cats):
    """After `Items.categorize`, the category columns match `expected_cats`."""
    frame = pd.DataFrame(data)
    Items.categorize(frame)

    category_names = frame.select_dtypes(["category"]).columns.values
    np.testing.assert_array_equal(category_names, expected_cats)
Beispiel #10
0
def test_origin_column_name(get_cloud_items, name, expected_name):
    """`origin_column_name` resolves `name` to `expected_name`."""
    source_df = pd.DataFrame(get_cloud_items)
    items = Items.from_df(source_df)

    resolved = items.origin_column_name(name)
    assert resolved == expected_name