def get_items(
    source: Union[str, pd.DataFrame, RawItems],
    start: int,
    count: Optional[int],
    filters: Optional[api.Filters],
    expand: bool,
) -> Items:
    """Dispatch on the source type: DataFrame, raw iterable, job key or collection key."""
    if isinstance(source, pd.DataFrame):
        return Items.from_df(source, expand=expand)
    elif isinstance(source, Iterable) and not isinstance(source, str):
        return Items.from_array(source, expand=expand)
    elif helpers.is_job_key(source):
        return JobItems(key=source, start=start, count=count, filters=filters, expand=expand)
    elif helpers.is_collection_key(source):
        if start:
            raise ValueError("Collections API does not support 'start' parameter")
        return CollectionItems(key=source, count=count, filters=filters, expand=expand)
    else:
        raise ValueError(f"'{source}' is not a valid job or collection key")

def test_flat_df(data, expected_data, expected_map):
    i = Items.from_array(data, expand=True)
    pd.testing.assert_frame_equal(i.flat_df, pd.DataFrame(expected_data), check_like=False)
    for new, old in expected_map.items():
        assert i.origin_column_name(new) == old

def test_process_df():
    df = Items.process_df(
        pd.DataFrame([[dict(), list(), "NameItem"]], columns=["a", "b", "_type"])
    )
    exp_df = pd.DataFrame([[np.nan, np.nan, "NameItem"]], columns=["a", "b", "_type"])
    pd.testing.assert_frame_equal(df, exp_df)

def get_items(
    source: Union[str, pd.DataFrame, RawItems],
    count: Optional[int],
    start: Optional[str],
    filters: Optional[api.Filters],
) -> Items:
    """Build an Items instance from a DataFrame, a raw iterable, a job key or a collection key."""
    if isinstance(source, pd.DataFrame):
        return Items.from_df(source)
    elif isinstance(source, Iterable) and not isinstance(source, str):
        return Items.from_array(cast(RawItems, source))
    elif helpers.is_job_key(source):
        return JobItems(source, count, int(start or 0), filters)
    elif helpers.is_collection_key(source):
        return CollectionItems(source, count, start, filters)
    else:
        raise ValueError(f"'{source}' is not a valid job or collection key")

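# Usage sketch (not from the library itself): the keys below are hypothetical,
# and the calls assume the get_items signature directly above. It only
# illustrates the dispatch order: DataFrame -> Items, iterable -> Items,
# job key -> JobItems, collection key -> CollectionItems.
#
#     items = get_items(pd.DataFrame({"price": [1, 2]}), count=None, start=None, filters=None)
#     items = get_items([{"price": 1}, {"price": 2}], count=None, start=None, filters=None)
#     job_items = get_items("123/1/1", count=1000, start="0", filters=None)
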
def garbage_symbols(items: Items) -> Result:
    """Find unwanted symbols in `object` (string) columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&amp|&reg)"
        r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
        r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*/?>|<!--|-->)"
    )

    errors = {}
    row_keys = set()
    rule_result = Result("Garbage Symbols", items_count=items.size)
    for column in items.flat_df.select_dtypes([object]):
        matches = items.flat_df[column].str.extractall(garbage, flags=re.IGNORECASE)
        # Keep only the named groups; the unnamed helper group inside
        # `html_tags` is dropped.
        matches = matches[["spaces", "html_entities", "css", "html_tags"]]
        if not matches.empty:
            error_keys = items.flat_df.iloc[matches.unstack().index.values]["_key"]
            original_column = items.origin_column_name(column)
            bad_texts = matches.stack().value_counts().index.sort_values().tolist()
            error = (
                f"{len(error_keys)/items.size*100:.1f}% of '{original_column}' "
                f"values contain {[t[:20] for t in bad_texts]}"
            )
            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)

    if errors:
        rule_result.add_error(
            f"{len(row_keys)/items.size * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
        rule_result.err_items_count = len(row_keys)
    return rule_result

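# Minimal standalone demonstration of the garbage pattern (copied from
# garbage_symbols above; the sample strings are made up). Each problematic
# sample trips one of the named groups; selecting only the named columns
# mirrors the column selection in the rule itself.
import re

import pandas as pd

GARBAGE = (
    r"(?P<spaces>^\s|\s$)"
    r"|(?P<html_entities>&amp|&reg)"
    r"|(?P<css>(?:(?:\.|#)[^#. ]+\s*){.+})"
    r"|(?P<html_tags></?(h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
    r"blockquote)\s*/?>|<!--|-->)"
)

samples = pd.Series([" leading space", "5 &amp 6", "<br>broken<br>", "clean"])
matches = samples.str.extractall(GARBAGE, flags=re.IGNORECASE)
print(matches[["spaces", "html_entities", "html_tags"]])
# "clean" yields no rows; the tag sample matches once per <br> occurrence.
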
def test_items_from_array(raw):
    items = Items.from_array(raw)
    np.testing.assert_array_equal(items.raw, np.array(raw))
    pd.testing.assert_frame_equal(items.df, pd.DataFrame(list(raw)))

def test_items_from_df(df, expected_raw, expected_df):
    items = Items.from_df(df)
    np.testing.assert_array_equal(items.raw, expected_raw)
    pd.testing.assert_frame_equal(items.df, expected_df)

def test_no_categorize():
    df = pd.DataFrame({"a": list(range(99))})
    Items.categorize(df)
    assert df.select_dtypes(["category"]).empty

def test_categorize(data, expected_cats):
    df = pd.DataFrame(data)
    Items.categorize(df)
    np.testing.assert_array_equal(
        df.select_dtypes(["category"]).columns.values, expected_cats
    )

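# The two tests above pin down a cardinality heuristic: categorize leaves a
# column of 99 distinct values alone but converts columns whose values repeat.
# A minimal sketch of one such rule follows; the helper name and the 0.5
# threshold are assumptions for illustration, not the library's actual code.
import pandas as pd


def categorize_sketch(df: pd.DataFrame, max_unique_ratio: float = 0.5) -> None:
    """Convert low-cardinality columns to the `category` dtype, in place."""
    if df.empty:
        return
    for column in df.columns:
        try:
            if df[column].nunique() / len(df) <= max_unique_ratio:
                df[column] = df[column].astype("category")
        except TypeError:  # unhashable cells (dicts, lists) cannot be categorized
            continue
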
def test_origin_column_name(get_cloud_items, name, expected_name):
    items = Items.from_df(pd.DataFrame(get_cloud_items))
    assert items.origin_column_name(name) == expected_name