def test_dataframe_scatter_by_map(map_size, nelem, keep):
    """Exercise ``DataFrame.scatter_by_map`` across several map-column types.

    A frame with a string column ("a"), a float column ("b"), an integer
    column ("c") and a categorical copy of "a" ("d") is scattered on each
    column in turn.  The result is also checked after moving a single
    column and then two columns into the index.

    Parameters
    ----------
    map_size
        Number of partitions requested from ``scatter_by_map``; also bounds
        the values generated for the map columns.
    nelem
        Number of rows in the generated frame.
    keep
        Forwarded as ``keep_index``; when truthy, the index type of every
        partition is compared against the index type of the input frame.
    """
    strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"]
    # Fixed seed so the generated data is reproducible between runs.
    np.random.seed(0)
    df = DataFrame()
    df["a"] = np.random.choice(strlist[:map_size], nelem)
    df["b"] = np.random.uniform(low=0, high=map_size, size=nelem)
    df["c"] = np.random.randint(map_size, size=nelem)
    df["d"] = df["a"].astype("category")

    def _check_scatter_by_map(dfs, col):
        """Validate the partitions ``dfs`` produced by scattering on ``col``."""
        assert len(dfs) == map_size
        nrows = 0
        name = col.name
        for i, part in enumerate(dfs):
            nrows += len(part)
            if len(part) > 0:
                # Make sure the column types were preserved
                assert isinstance(part[name]._column, type(col._column))
            try:
                sr = part[name].astype(np.int32)
            except ValueError:
                # The column cannot be cast to int32; fall back to checking
                # the raw values instead.
                sr = part[name]
            # Every partition may hold at most one distinct map value.
            assert sr.nunique() <= 1
            if sr.nunique() == 1 and isinstance(
                part[name]._column, NumericalColumn
            ):
                # For numerical map columns the (int32-cast) value must
                # equal the position of the partition in the result.
                assert sr.iloc[0] == i
        # No rows may be lost or duplicated by the scatter.
        assert nrows == nelem

    def _check_index_type(frames, expected_index):
        """Assert every partition's index has the type of ``expected_index``."""
        for frame in frames:
            assert isinstance(frame.index, type(expected_index))

    _check_scatter_by_map(
        df.scatter_by_map("a", map_size, keep_index=keep), df["a"]
    )
    _check_scatter_by_map(
        df.scatter_by_map("b", map_size, keep_index=keep), df["b"]
    )
    _check_scatter_by_map(
        df.scatter_by_map("c", map_size, keep_index=keep), df["c"]
    )
    _check_scatter_by_map(
        df.scatter_by_map("d", map_size, keep_index=keep), df["d"]
    )

    if map_size == 2 and nelem == 100:
        df.scatter_by_map("a")  # Auto-detect map_size
        # NOTE(review): kept inside the guard on the assumption that a
        # map_size of 1 is only "bad" when the data holds more than one
        # distinct key -- confirm against scatter_by_map's validation.
        with pytest.raises(ValueError):
            df.scatter_by_map("a", map_size=1, debug=True)  # Bad map_size

    # Test GenericIndex
    df2 = df.set_index("c")
    generic_result = df2.scatter_by_map("b", map_size, keep_index=keep)
    _check_scatter_by_map(generic_result, df2["b"])
    if keep:
        _check_index_type(generic_result, df2.index)

    # Test MultiIndex
    df2 = df.set_index(["a", "c"])
    multiindex_result = df2.scatter_by_map("b", map_size, keep_index=keep)
    _check_scatter_by_map(multiindex_result, df2["b"])
    if keep:
        _check_index_type(multiindex_result, df2.index)
def test_dataframe_scatter_by_map(map_size, nelem):
    """Scatter a generated frame on each of its columns and verify the parts.

    The frame holds a string column ("a"), a float column ("b"), an integer
    column ("c") and a categorical version of "a" ("d").  For every column
    the scatter must yield ``map_size`` partitions whose row counts add up
    to ``nelem`` and whose map column (cast to int32) has at most one
    distinct value per partition.
    """
    animals = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"]
    # Seed first: the order of the random draws below determines the data.
    np.random.seed(0)
    df = DataFrame()
    df["a"] = np.random.choice(animals[:map_size], nelem)
    df["b"] = np.random.uniform(low=0, high=map_size, size=nelem)
    df["c"] = np.random.randint(map_size, size=nelem)
    df["d"] = df["a"]._column.as_categorical_column(np.int32)

    def _check_scatter_by_map(partitions, key):
        """Check partition count, per-partition uniqueness and total rows."""
        assert len(partitions) == map_size
        total_rows = 0
        for part in partitions:
            total_rows += len(part)
            distinct = part[key].astype(np.int32).nunique()
            assert distinct <= 1
        assert total_rows == nelem

    for key in ("a", "b", "c", "d"):
        _check_scatter_by_map(df.scatter_by_map(key, map_size), key)

    # The remaining edge-case calls only run for one parameter combination.
    if map_size != 2 or nelem != 100:
        return

    # Auto-detect map_size
    df.scatter_by_map("a")
    # Bad map_size
    with pytest.raises(ValueError):
        df.scatter_by_map("a", 1)