Esempio n. 1
0
def test_group_by(df):
    """Group by column ``b`` and run the ``Accumulator`` UDAF over column ``a``.

    Column 1 of every collected batch is concatenated into a single array
    and compared against the expected aggregate values. The ``df`` fixture
    and ``Accumulator`` are defined outside this block.
    """
    accumulate = f.udaf(
        Accumulator, pa.float64(), pa.float64(), [pa.float64()]
    )
    grouped = df.aggregate([f.col("b")], [accumulate(f.col("a"))])

    # Results may be split over several batches, so stitch column 1 together.
    per_batch = []
    for record_batch in grouped.collect():
        per_batch.append(record_batch.column(1))
    combined = pa.concat_arrays(per_batch)

    expected = pa.array([1.0 + 2.0, 3.0])
    assert combined == expected
def test_string_functions(df):
    """Project ``md5(a)`` and ``lower(a)`` and check both output columns.

    Exactly one record batch is expected back; column 0 must hold the
    lower-case hex MD5 strings and column 1 the lower-cased text.
    """
    projected = df.select(f.md5(f.col("a")), f.lower(f.col("a")))
    batches = projected.collect()
    assert len(batches) == 1
    batch = batches[0]

    expected_md5 = pa.array(
        [
            "8b1a9953c4611296a827abf8c47804d7",
            "f5a7924e621e84c9280a9a27e1bcb7f6",
            "9033e0e305f247c0c3c80d0c7848c8b3",
        ]
    )
    expected_lower = pa.array(["hello", "world", "!"])

    assert batch.column(0) == expected_md5
    assert batch.column(1) == expected_lower
Esempio n. 3
0
def test_filter(df):
    """Project ``a + b`` and ``a - b``, then keep rows where ``a > 2``.

    Only the first collected batch is inspected; it is expected to contain
    a single row with the values 9 and -3.
    """
    total = f.col("a") + f.col("b")
    difference = f.col("a") - f.col("b")
    predicate = f.col("a") > f.lit(2)

    filtered = df.select(total, difference).filter(predicate)

    # Run the plan and look at the first batch only.
    batches = filtered.collect()
    first = batches[0]

    assert first.column(0) == pa.array([9])
    assert first.column(1) == pa.array([-3])
Esempio n. 4
0
def test_select(df):
    """Project ``a + b`` and ``a - b`` and verify both resulting columns.

    Only the first collected batch is inspected.
    """
    total = f.col("a") + f.col("b")
    difference = f.col("a") - f.col("b")

    # Run the plan and look at the first batch only.
    batches = df.select(total, difference).collect()
    first = batches[0]

    assert first.column(0) == pa.array([5, 7, 9])
    assert first.column(1) == pa.array([-3, -3, -3])
def test_built_in_aggregation(df):
    """Run max/min/count over ``a`` and approx_distinct over ``b``.

    The aggregation uses an empty group-by list. Each output column of the
    first collected batch is compared, in order, with its expected array.
    """
    col_a = f.col("a")
    col_b = f.col("b")
    aggregates = [
        f.max(col_a),
        f.min(col_a),
        f.count(col_a),
        f.approx_distinct(col_b),
    ]
    batch = df.aggregate([], aggregates).collect()[0]

    # The last two columns are compared as unsigned 64-bit arrays.
    expected_columns = [
        pa.array([3]),
        pa.array([1]),
        pa.array([3], type=pa.uint64()),
        pa.array([2], type=pa.uint64()),
    ]
    for position, expected in enumerate(expected_columns):
        assert batch.column(position) == expected
Esempio n. 6
0
def test_math_functions(df):
    """Compare engine math functions on column ``value`` with NumPy.

    The reference values are computed with NumPy from a local array; the
    ``df`` fixture is presumably built from the same numbers (it is defined
    outside this block). The final projected column comes from
    ``f.random()`` and is only checked to be strictly below 1.
    """
    values = np.array([0.1, -0.7, 0.55])
    col_v = f.col("value")

    # Functions applied directly to the column.
    projections = [
        fn(col_v)
        for fn in (f.abs, f.sin, f.cos, f.tan, f.asin, f.acos, f.exp)
    ]
    # Logarithms are evaluated on ``value + 1``.
    projections += [
        log_fn(col_v + f.lit(1)) for log_fn in (f.ln, f.log2, f.log10)
    ]
    projections.append(f.random())

    batches = df.select(*projections).collect()
    assert len(batches) == 1
    batch = batches[0]

    # Reference results, listed in the same order as the projections above.
    references = [
        np.abs(values),
        np.sin(values),
        np.cos(values),
        np.tan(values),
        np.arcsin(values),
        np.arccos(values),
        np.exp(values),
        np.log(values + 1.0),
        np.log2(values + 1.0),
        np.log10(values + 1.0),
    ]
    for position, reference in enumerate(references):
        np.testing.assert_array_almost_equal(
            batch.column(position), reference
        )

    np.testing.assert_array_less(batch.column(10), np.ones_like(values))
Esempio n. 7
0
def test_sort(df):
    """Sort by column ``b`` descending and compare the full table contents."""
    descending_b = f.col("b").sort(ascending=False)
    batches = df.sort([descending_b]).collect()

    actual = pa.Table.from_batches(batches).to_pydict()

    assert actual == {"a": [3, 2, 1], "b": [6, 5, 4]}
Esempio n. 8
0
def test_udf(df):
    """Apply a scalar UDF wrapping ``is_null`` to column ``a``.

    The UDF is declared with an int64 input and a boolean output. Column 0
    of the first collected batch must be all ``False``.
    """
    # is_null is a pa function over arrays
    null_check = f.udf(lambda x: x.is_null(), [pa.int64()], pa.bool_())

    batches = df.select(null_check(f.col("a"))).collect()
    flags = batches[0].column(0)

    expected = pa.array([False, False, False])
    assert flags == expected
Esempio n. 9
0
def test_aggregate(df):
    """Run the ``Accumulator`` UDAF over column ``a`` with no grouping.

    Only the first collected batch is inspected; its first column must hold
    the single value ``1.0 + 2.0 + 3.0``. ``Accumulator`` is defined
    outside this block.
    """
    accumulate = f.udaf(
        Accumulator, pa.float64(), pa.float64(), [pa.float64()]
    )
    aggregated = df.aggregate([], [accumulate(f.col("a"))])

    # Run the plan and look at the first batch only.
    batches = aggregated.collect()
    first = batches[0]

    expected = pa.array([1.0 + 2.0 + 3.0])
    assert first.column(0) == expected
Esempio n. 10
0
def test_join():
    """Inner-join two in-memory dataframes on ``a`` and check the rows.

    The left side has columns ``a``/``b`` and the right side ``a``/``c``.
    The joined result is sorted by ``a`` ascending before being compared
    as a dict of column name to values.
    """
    ctx = ExecutionContext()

    left_batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
        names=["a", "b"],
    )
    left = ctx.create_dataframe([[left_batch]])

    right_batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2]), pa.array([8, 10])],
        names=["a", "c"],
    )
    right = ctx.create_dataframe([[right_batch]])

    joined = left.join(right, join_keys=(["a"], ["a"]), how="inner")
    ordered = joined.sort([f.col("a").sort(ascending=True)])
    actual = pa.Table.from_batches(ordered.collect()).to_pydict()

    assert actual == {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
def test_hash_functions(df):
    """Check ``f.digest`` on column ``a`` for several hash algorithms.

    One output column is projected per algorithm, in the order listed in
    the table below. Exactly one record batch is expected, and each column
    is compared with a binary array built from the expected hex digests.
    """
    # Algorithm name -> expected digests (hex), one entry per input row.
    expected_hex = {
        "md5": [
            "8B1A9953C4611296A827ABF8C47804D7",
            "F5A7924E621E84C9280A9A27E1BCB7F6",
            "9033E0E305F247C0C3C80D0C7848C8B3",
        ],
        "sha256": [
            "185F8DB32271FE25F561A6FC938B2E264306EC304EDA518007D1764826381969",
            "78AE647DC5544D227130A0682A51E30BC7777FBB6D8A8F17007463A3ECD1D524",
            "BB7208BC9B5D7C04F1236A82A0093A5E33F40423D5BA8D4266F7092C3BA43B62",
        ],
        "sha512": [
            "3615F80C9D293ED7402687F94B22D58E529B8CC7916F8FAC7FDDF7FBD5AF4CF777D3D795A7A00A16BF7E7F3FB9561EE9BAAE480DA9FE7A18769E71886B03F315",
            "8EA77393A42AB8FA92500FB077A9509CC32BC95E72712EFA116EDAF2EDFAE34FBB682EFDD6C5DD13C117E08BD4AAEF71291D8AACE2F890273081D0677C16DF0F",
            "3831A6A6155E509DEE59A7F451EB35324D8F8F2DF6E3708894740F98FDEE23889F4DE5ADB0C5010DFB555CDA77C8AB5DC902094C52DE3278F35A75EBC25F093A",
        ],
        "blake2s": [
            "F73A5FBF881F89B814871F46E26AD3FA37CB2921C5E8561618639015B3CCBB71",
            "B792A0383FB9E7A189EC150686579532854E44B71AC394831DAED169BA85CCC5",
            "27988A0E51812297C77A433F635233346AEE29A829DCF4F46E0F58F402C6CFCB",
        ],
        "blake3": [
            "FBC2B0516EE8744D293B980779178A3508850FDCFE965985782C39601B65794F",
            "BF73D18575A736E4037D45F9E316085B86C19BE6363DE6AA789E13DEAACC1C4E",
            "C8D11B9F7237E4034ADBCD2005735F9BC4C597C75AD89F4492BEC8F77D15F7EB",
        ],
    }

    digests = [
        f.digest(f.col("a"), f.lit(algorithm)) for algorithm in expected_hex
    ]
    batches = df.select(*digests).collect()
    assert len(batches) == 1
    batch = batches[0]

    # Dict order matches projection order, so positions line up with columns.
    for position, hex_values in enumerate(expected_hex.values()):
        expected = pa.array(
            [bytearray.fromhex(hex_value) for hex_value in hex_values]
        )
        assert batch.column(position) == expected