def test_hash_bucket(col, num_buckets=5):
    """Check that ``_hash_bucket`` assigns buckets consistently.

    Two properties are verified:

    * every distinct value of ``col`` ends up in exactly one bucket, and
    * hashing only a subset of the rows yields the same bucket per row as
      hashing the complete frame.

    Parameters
    ----------
    col
        Name of the column to hash on; it is also used as the groupby key,
        so it has to be one of the columns of the frame built below.
    num_buckets
        Number of buckets handed to ``_hash_bucket``.
    """
    unique_values = np.arange(10)
    repeated_values = np.repeat(np.arange(2), 5)
    random_values = np.random.randint(0, 100, 10)
    frame = pd.DataFrame(
        {
            "range": unique_values,
            "range_duplicated": repeated_values,
            "random": random_values,
        }
    )

    bucketed = _hash_bucket(frame, [col], num_buckets)

    # Equal values of ``col`` must never be spread over several buckets.
    buckets_per_value = bucketed.groupby(col).agg({_KTK_HASH_BUCKET: "nunique"})
    assert (buckets_per_value == 1).all().all()

    # The assignment must also hold for tiny frames, i.e. when the number of
    # distinct values in the column is smaller than ``num_buckets``.
    subset = frame.iloc[[0, 7]]
    bucketed_subset = _hash_bucket(subset, [col], num_buckets)
    reference = bucketed.loc[subset.index]
    pdt.assert_frame_equal(reference, bucketed_subset)
def test_hashing_determinism():
    """Pin the bucket assignment for a fixed input frame.

    The expected buckets are hard-coded so that the test fails whenever the
    hashing underneath ``_hash_bucket`` starts to depend on anything other
    than the data itself (e.g. some context variable).
    """
    frame = pd.DataFrame({"range": np.arange(10)})

    actual = _hash_bucket(frame, ["range"], 5)

    # Reference buckets for the values 0..9 when hashed into 5 buckets.
    reference_buckets = np.array([0, 0, 1, 2, 0, 3, 2, 0, 1, 4], dtype=np.uint8)
    reference = pd.DataFrame(
        {
            "range": list(range(10)),
            _KTK_HASH_BUCKET: reference_buckets,
        }
    )
    pdt.assert_frame_equal(actual, reference)