def test_group_by(df):
    """Group by column "b" and aggregate column "a" with a Python UDAF.

    The aggregated values are read from column 1 of every collected batch,
    concatenated, and compared against ``[3.0, 3.0]``.

    NOTE(review): ``Accumulator`` and the ``df`` fixture are defined outside
    this block; the expected values suggest a summing accumulator -- confirm.
    """
    accumulate = f.udaf(
        Accumulator, pa.float64(), pa.float64(), [pa.float64()]
    )
    grouped = df.aggregate([f.col("b")], [accumulate(f.col("a"))])

    # The result may span several record batches, so stitch the aggregate
    # column of each batch together before comparing.
    combined = pa.concat_arrays(
        [record_batch.column(1) for record_batch in grouped.collect()]
    )

    assert combined == pa.array([1.0 + 2.0, 3.0])
def test_string_functions(df):
    """Check ``md5`` and ``lower`` applied to string column "a".

    Expects a single result batch whose first column holds the lowercase hex
    MD5 digests and whose second column holds the lowercased strings.
    """
    col_a = f.col("a")
    batches = df.select(f.md5(col_a), f.lower(col_a)).collect()

    assert len(batches) == 1
    batch = batches[0]

    expected_md5 = pa.array(
        [
            "8b1a9953c4611296a827abf8c47804d7",
            "f5a7924e621e84c9280a9a27e1bcb7f6",
            "9033e0e305f247c0c3c80d0c7848c8b3",
        ]
    )
    expected_lower = pa.array(["hello", "world", "!"])

    assert batch.column(0) == expected_md5
    assert batch.column(1) == expected_lower
def test_filter(df):
    """Project ``a + b`` and ``a - b``, then keep only rows where ``a > 2``.

    Only the first collected batch is inspected; it is expected to contain a
    single row with sum 9 and difference -3.
    """
    col_a = f.col("a")
    col_b = f.col("b")

    projected = df.select(col_a + col_b, col_a - col_b)
    filtered = projected.filter(col_a > f.lit(2))

    # Run the plan; the test relies on everything landing in one batch.
    first_batch = filtered.collect()[0]

    assert first_batch.column(0) == pa.array([9])
    assert first_batch.column(1) == pa.array([-3])
def test_select(df):
    """Project the sum and difference of columns "a" and "b".

    Only the first collected batch is inspected; it is expected to hold
    ``[5, 7, 9]`` for ``a + b`` and ``[-3, -3, -3]`` for ``a - b``.
    """
    col_a = f.col("a")
    col_b = f.col("b")
    projected = df.select(col_a + col_b, col_a - col_b)

    # Run the plan; the test relies on everything landing in one batch.
    first_batch = projected.collect()[0]

    assert first_batch.column(0) == pa.array([5, 7, 9])
    assert first_batch.column(1) == pa.array([-3, -3, -3])
def test_built_in_aggregation(df):
    """Run max/min/count/approx_distinct without any grouping keys.

    ``max``, ``min`` and ``count`` are taken over column "a" and
    ``approx_distinct`` over column "b". The count-style aggregates are
    compared as unsigned 64-bit arrays.
    """
    col_a = f.col("a")
    col_b = f.col("b")
    aggregates = [
        f.max(col_a),
        f.min(col_a),
        f.count(col_a),
        f.approx_distinct(col_b),
    ]

    batch = df.aggregate([], aggregates).collect()[0]

    # One expected single-row array per aggregate, in projection order.
    expected_columns = [
        pa.array([3]),
        pa.array([1]),
        pa.array([3], type=pa.uint64()),
        pa.array([2], type=pa.uint64()),
    ]
    for index, expected in enumerate(expected_columns):
        assert batch.column(index) == expected
def test_math_functions(df):
    """Compare the built-in math functions against their NumPy equivalents.

    The logarithms are evaluated on ``value + 1``. The last projected column
    is ``random()``, which is only checked to be strictly below 1.

    NOTE(review): assumes the ``df`` fixture's "value" column holds the same
    numbers as ``values`` below -- the fixture is not visible here.
    """
    values = np.array([0.1, -0.7, 0.55])
    col_v = f.col("value")
    shifted_col = col_v + f.lit(1)

    batches = df.select(
        f.abs(col_v),
        f.sin(col_v),
        f.cos(col_v),
        f.tan(col_v),
        f.asin(col_v),
        f.acos(col_v),
        f.exp(col_v),
        f.ln(shifted_col),
        f.log2(shifted_col),
        f.log10(shifted_col),
        f.random(),
    ).collect()

    assert len(batches) == 1
    batch = batches[0]

    # NumPy reference results, listed in the same order as the projection.
    shifted = values + 1.0
    references = [
        np.abs(values),
        np.sin(values),
        np.cos(values),
        np.tan(values),
        np.arcsin(values),
        np.arccos(values),
        np.exp(values),
        np.log(shifted),
        np.log2(shifted),
        np.log10(shifted),
    ]
    for index, reference in enumerate(references):
        np.testing.assert_array_almost_equal(batch.column(index), reference)

    # random() has no fixed expected value; only its upper bound is checked.
    np.testing.assert_array_less(batch.column(10), np.ones_like(values))
def test_sort(df):
    """Sort by column "b" in descending order and check both columns."""
    descending_b = f.col("b").sort(ascending=False)
    sorted_batches = df.sort([descending_b]).collect()

    # Merge all batches into one table so the comparison ignores batching.
    as_dict = pa.Table.from_batches(sorted_batches).to_pydict()

    assert as_dict == {"a": [3, 2, 1], "b": [6, 5, 4]}
def test_udf(df):
    """Apply a scalar Python UDF wrapping ``is_null`` to column "a".

    The UDF is declared as taking one int64 argument and returning booleans;
    the first result batch is expected to be all ``False``.
    """
    input_types = [pa.int64()]
    return_type = pa.bool_()

    # The callable receives an array and uses its own is_null method.
    is_null_udf = f.udf(lambda x: x.is_null(), input_types, return_type)

    first_batch = df.select(is_null_udf(f.col("a"))).collect()[0]

    assert first_batch.column(0) == pa.array([False, False, False])
def test_aggregate(df):
    """Aggregate column "a" with a Python UDAF and no grouping keys.

    Only the first collected batch is inspected; it is expected to hold the
    single value ``1.0 + 2.0 + 3.0``.

    NOTE(review): ``Accumulator`` is defined outside this block; the expected
    value suggests it sums its input -- confirm.
    """
    accumulate = f.udaf(
        Accumulator, pa.float64(), pa.float64(), [pa.float64()]
    )
    aggregated = df.aggregate([], [accumulate(f.col("a"))])

    # Run the plan; the test relies on everything landing in one batch.
    first_batch = aggregated.collect()[0]

    assert first_batch.column(0) == pa.array([1.0 + 2.0 + 3.0])
def test_join():
    """Inner-join two in-memory dataframes on column "a".

    The left side has keys 1, 2, 3 and the right side has keys 1, 2, so only
    two rows are expected. The result is sorted by "a" before comparison to
    make the row order deterministic.
    """
    ctx = ExecutionContext()

    left_batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
        names=["a", "b"],
    )
    left = ctx.create_dataframe([[left_batch]])

    right_batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2]), pa.array([8, 10])],
        names=["a", "c"],
    )
    right = ctx.create_dataframe([[right_batch]])

    joined = left.join(right, join_keys=(["a"], ["a"]), how="inner")
    ordered = joined.sort([f.col("a").sort(ascending=True)])

    as_dict = pa.Table.from_batches(ordered.collect()).to_pydict()

    assert as_dict == {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
def test_hash_functions(df):
    """Check ``digest`` on column "a" for each supported algorithm name.

    One output column is produced per algorithm, in the order listed in
    ``algorithms``. Each column is compared against known digests given as
    hex strings and converted to raw bytes.
    """
    algorithms = ("md5", "sha256", "sha512", "blake2s", "blake3")
    digest_exprs = [
        f.digest(f.col("a"), f.lit(algorithm)) for algorithm in algorithms
    ]

    batches = df.select(*digest_exprs).collect()
    assert len(batches) == 1
    batch = batches[0]

    # Expected digests per output column, in the same order as `algorithms`.
    expected_hex_by_column = [
        # md5
        [
            "8B1A9953C4611296A827ABF8C47804D7",
            "F5A7924E621E84C9280A9A27E1BCB7F6",
            "9033E0E305F247C0C3C80D0C7848C8B3",
        ],
        # sha256
        [
            "185F8DB32271FE25F561A6FC938B2E264306EC304EDA518007D1764826381969",
            "78AE647DC5544D227130A0682A51E30BC7777FBB6D8A8F17007463A3ECD1D524",
            "BB7208BC9B5D7C04F1236A82A0093A5E33F40423D5BA8D4266F7092C3BA43B62",
        ],
        # sha512
        [
            "3615F80C9D293ED7402687F94B22D58E529B8CC7916F8FAC7FDDF7FBD5AF4CF777D3D795A7A00A16BF7E7F3FB9561EE9BAAE480DA9FE7A18769E71886B03F315",
            "8EA77393A42AB8FA92500FB077A9509CC32BC95E72712EFA116EDAF2EDFAE34FBB682EFDD6C5DD13C117E08BD4AAEF71291D8AACE2F890273081D0677C16DF0F",
            "3831A6A6155E509DEE59A7F451EB35324D8F8F2DF6E3708894740F98FDEE23889F4DE5ADB0C5010DFB555CDA77C8AB5DC902094C52DE3278F35A75EBC25F093A",
        ],
        # blake2s
        [
            "F73A5FBF881F89B814871F46E26AD3FA37CB2921C5E8561618639015B3CCBB71",
            "B792A0383FB9E7A189EC150686579532854E44B71AC394831DAED169BA85CCC5",
            "27988A0E51812297C77A433F635233346AEE29A829DCF4F46E0F58F402C6CFCB",
        ],
        # blake3
        [
            "FBC2B0516EE8744D293B980779178A3508850FDCFE965985782C39601B65794F",
            "BF73D18575A736E4037D45F9E316085B86C19BE6363DE6AA789E13DEAACC1C4E",
            "C8D11B9F7237E4034ADBCD2005735F9BC4C597C75AD89F4492BEC8F77D15F7EB",
        ],
    ]

    for index, hex_digests in enumerate(expected_hex_by_column):
        expected = pa.array(
            [bytearray.fromhex(hex_digest) for hex_digest in hex_digests]
        )
        assert batch.column(index) == expected