def test_drop_singletons_slow():
    """Cross-check ``in_2core_graph_slow`` on two columns of random categories.

    Three properties of the returned row mask are asserted:

    * no retained row holds a value that occurs exactly once in its own
      column of the original data;
    * the retained rows equal those left after repeatedly applying
      ``in_2core_graph_slow`` to each column on its own until the row count
      stops changing;
    * the ``dummy_matrix`` rows of the dropped observations sum to exactly
      two per dropped row.
    """
    nobs = 40000
    rs = np.random.RandomState(0)
    c1 = rs.randint(0, 10000, (nobs, 1))
    c2 = rs.randint(0, 20000, (nobs, 1))
    cats = np.concatenate([c1, c2], axis=1)
    retain = in_2core_graph_slow(cats)
    nonsingletons = cats[retain]

    # Values seen exactly once in a column must not survive in that column.
    for column in (c1, c2):
        values, freq = np.unique(column, return_counts=True)
        seen_once = values[freq == 1]
        assert not np.isin(column[retain], seen_once).any()

    # Reference result: prune one column at a time, applying each mask to both
    # columns and to the running row index, until a full pass removes nothing.
    idx = np.arange(nobs)
    cols = {"c1": c1.copy(), "c2": c2.copy()}
    names = tuple(cols)
    for _ in range(nobs):
        rows_before = cols["c1"].shape[0]
        for name in names:
            keep = in_2core_graph_slow(cols[name])
            for other in names:
                cols[other] = cols[other][keep]
            idx = idx[keep]
        if cols["c1"].shape[0] == rows_before:
            break

    # The surviving index and the pruned columns must both reproduce the
    # rows selected by the two-column call.
    from_index = np.concatenate([c1[idx], c2[idx]], axis=1)
    assert_array_equal(nonsingletons, from_index)
    from_columns = np.concatenate([cols["c1"], cols["c2"]], axis=1)
    assert_array_equal(nonsingletons, from_columns)

    dummies, _ = dummy_matrix(cats, output_format="csr", precondition=False)
    dropped = ~retain
    assert dummies[dropped].sum() == 2 * dropped.sum()
def test_drop_singletons():
    """Check that ``in_2core_graph`` matches ``in_2core_graph_slow``.

    Both functions are called on the same 40000 x 2 array of random integer
    categories and their results must be element-wise equal.
    """
    rng = np.random.RandomState(0)
    first = rng.randint(0, 10000, (40000, 1))
    second = rng.randint(0, 20000, (40000, 1))
    cats = np.concatenate([first, second], axis=1)

    actual = in_2core_graph(cats)
    reference = in_2core_graph_slow(cats)
    assert_array_equal(actual, reference)
def test_drop_singletons_large():
    """Compare ``in_2core_graph`` with ``in_2core_graph_slow`` on a large input.

    Two million rows are drawn with two integer category columns; the first
    column draws from ``m // 3`` distinct values and the second from
    ``m // 20``.
    """
    m = 2000000
    rng = np.random.RandomState(1234)
    sparse_col = rng.randint(0, m // 3, m)
    dense_col = rng.randint(0, m // 20, m)
    cats = np.column_stack([sparse_col, dense_col])

    actual = in_2core_graph(cats)
    reference = in_2core_graph_slow(cats)
    assert_array_equal(actual, reference)
def test_drop_singletons_single():
    """Check ``in_2core_graph_slow`` on a single column against value counts.

    With only one category column, the rows kept by the mask must be exactly
    those whose value occurs more than once according to
    ``pandas.Series.value_counts``.
    """
    nobs = 40000
    rs = np.random.RandomState(0)
    cats = rs.randint(0, 10000, (nobs, 1))
    retain = in_2core_graph_slow(cats)
    nonsingletons = cats[retain]

    # Independent tally of how often each value occurs in the column.
    vc = pd.Series(cats[:, 0]).value_counts()
    repeated = vc > 1
    once = vc == 1

    # The retained values are exactly the values that occur more than once.
    expected = np.sort(np.asarray(vc.index[repeated]))
    assert_array_equal(np.unique(nonsingletons), expected)

    # Every occurrence of a repeated value is kept.
    assert vc[repeated].sum() == nonsingletons.shape[0]

    # One row is dropped per value that occurs once, and none of those
    # values remain.
    singletons = np.asarray(vc.index[once])
    assert nonsingletons.shape[0] == (nobs - singletons.shape[0])
    assert not np.isin(nonsingletons, singletons).any()
def test_drop_singletons_pandas():
    """Check ``in_2core_graph`` against ``in_2core_graph_slow`` on a DataFrame.

    The input is a two-column DataFrame of categorical string labels
    ("A<int>" and "B<int>") built from random integers; both functions must
    return element-wise equal results for it.
    """
    rs = np.random.RandomState(0)
    c1 = rs.randint(0, 10000, (40000, 1))
    c2 = rs.randint(0, 20000, (40000, 1))

    # One categorical Series per column, each prefixed with its own letter.
    series = [
        pd.Series(["{0}{1}".format(let, c) for c in cat.ravel()], dtype="category")
        for let, cat in zip("AB", (c1, c2))
    ]
    # ``axis`` must be passed by keyword: pandas 2.0 made every argument of
    # ``concat`` after ``objs`` keyword-only, so ``pd.concat(objs, 1)`` raises
    # a TypeError there.
    cats = pd.concat(series, axis=1)
    cats.columns = ["cat1", "cat2"]

    remain = in_2core_graph(cats)
    expected = in_2core_graph_slow(cats)
    assert_array_equal(remain, expected)