Beispiel #1
0
def test_hash_bucket(col, num_buckets=5):
    """Check that ``_hash_bucket`` assigns buckets consistently for ``col``.

    Two properties are verified:

    * every distinct value of ``col`` ends up in exactly one bucket, and
    * hashing a two-row subset of the frame yields the same rows as
      selecting those rows from the hashed full frame.

    ``col`` names one of the columns of the fixture frame built below;
    ``num_buckets`` is forwarded unchanged to ``_hash_bucket``.
    """
    frame = pd.DataFrame(
        {
            "range": np.arange(10),
            "range_duplicated": np.repeat(np.arange(2), 5),
            "random": np.random.randint(0, 100, 10),
        }
    )
    bucketed = _hash_bucket(frame, [col], num_buckets)

    # Group by the hashed column: each group must contain a single bucket id.
    buckets_per_value = bucketed.groupby(col).agg({_KTK_HASH_BUCKET: "nunique"})
    assert (buckets_per_value == 1).all().all()

    # A small subset (fewer rows than buckets) must hash exactly like the
    # corresponding rows of the full frame.
    subset = frame.iloc[[0, 7]]
    bucketed_subset = _hash_bucket(subset, [col], num_buckets)
    pdt.assert_frame_equal(bucketed.loc[subset.index], bucketed_subset)
Beispiel #2
0
def test_hashing_determinism():
    """Pin the bucket ids produced for a fixed input.

    Hashes the integers 0..9 into 5 buckets and compares the result with a
    hard-coded assignment, so any change in the hashing outcome (for example
    a dependency on context variables) makes this test fail.
    """
    source = pd.DataFrame({"range": np.arange(10)})
    actual = _hash_bucket(source, ["range"], 5)

    # Known-good bucket id for each of the values 0..9, stored as uint8.
    expected_buckets = np.array([0, 0, 1, 2, 0, 3, 2, 0, 1, 4], dtype=np.uint8)
    expected = pd.DataFrame(
        {
            "range": list(range(10)),
            _KTK_HASH_BUCKET: expected_buckets,
        }
    )
    pdt.assert_frame_equal(actual, expected)