Example #1
 def test_dropna(self):
     e = self.engine
     a = e.to_df([[4, None, 6], [1, 2, 3], [4, None, None]],
                 "a:double,b:double,c:double")
     c = e.dropna(a, metadata=dict(a=1))  # default: how="any"
     d = e.dropna(a, how="all")  # drop a row only if all its values are null
     f = e.dropna(a, how="any", thresh=2)  # keep rows with at least 2 non-null values
     g = e.dropna(a, how="any", subset=["a", "c"])  # only check columns a and c
     h = e.dropna(a, how="any", thresh=1, subset=["a", "c"])  # thresh applies within the subset
     df_eq(
         c,
         [[1, 2, 3]],
         "a:double,b:double,c:double",
         metadata=dict(a=1),
         throw=True,
     )
     df_eq(
         d,
         [[4, None, 6], [1, 2, 3], [4, None, None]],
         "a:double,b:double,c:double",
         throw=True,
     )
     df_eq(f, [[4, None, 6], [1, 2, 3]],
           "a:double,b:double,c:double",
           throw=True)
     df_eq(g, [[4, None, 6], [1, 2, 3]],
           "a:double,b:double,c:double",
           throw=True)
     df_eq(
         h,
         [[4, None, 6], [1, 2, 3], [4, None, None]],
         "a:double,b:double,c:double",
         throw=True,
     )
Example #2
    def test_map_in_pandas(self):
        if not hasattr(ps.DataFrame, "mapInPandas"):
            return

        def add(cursor, data):
            assert isinstance(data, LocalDataFrameIterableDataFrame)

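            # stream the chunks lazily rather than materializing the whole partition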
            def get_dfs():
                for df in data.native:
                    pdf = df.as_pandas()
                    pdf["zz"] = pdf["xx"] + pdf["yy"]
                    yield PandasDataFrame(pdf)

            return LocalDataFrameIterableDataFrame(get_dfs())

        e = self.engine
        np.random.seed(0)
        df = pd.DataFrame(np.random.randint(0, 5, (100000, 2)),
                          columns=["xx", "yy"])
        expected = PandasDataFrame(df.assign(zz=df.xx + df.yy),
                                   "xx:int,yy:int,zz:int")
        a = e.to_df(df)
        # no partition keys; just split into 16 partitions
        c = e.map(a, add, "xx:int,yy:int,zz:int", PartitionSpec(num=16))
        df_eq(c, expected, throw=True)
Example #3
 def test_map_with_dict_col(self):
     e = self.engine
     dt = datetime.now()
     # test dict
     o = ArrayDataFrame([[dt, dict(a=1)]], "a:datetime,b:{a:int}")
     c = e.map(o, select_top, o.schema, PartitionSpec(by=["a"]))
     df_eq(c, o, no_pandas=True, check_order=True, throw=True)
Example #4
def test_workflow():
    builder = FugueWorkflow()
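    # the workflow only builds a DAG here; nothing executes until run() is called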

    a = builder.create_data([[0], [0], [1]], "a:int")
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()

    raises(FugueWorkflowCompileError, lambda: builder.df(123))

    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()

    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int")
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]],
          "a:int,b:int")
Example #5
 def test_save_and_load_parquet(self):
     e = self.engine
     b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     e.save_df(b, path, format_hint="parquet")
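     # columns=["a", "c"] selects and reorders the columns at load time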
     c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
     df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True)
Example #6
 def test_load_csv_folder(self):
     e = self.engine
     native = NativeExecutionEngine()
     a = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double")
     b = ArrayDataFrame([[2.1, 7.1], [4.1, 8.1]], "c:double,a:double")
     path = os.path.join(self.tmpdir, "a", "b")
     native.save_df(a,
                    os.path.join(path, "a.csv"),
                    format_hint="csv",
                    header=True)
     native.save_df(b,
                    os.path.join(path, "b.csv"),
                    format_hint="csv",
                    header=True)
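     # a Hadoop-style marker file; the loader should ignore non-data files in the folder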
     FileSystem().touch(os.path.join(path, "_SUCCESS"))
     c = e.load_df(
         path,
         format_hint="csv",
         header=True,
         infer_schema=True,
         columns=["a", "c"],
     )
     df_eq(c, [[1.1, 6.1], [7.1, 2.1], [8.1, 4.1]],
           "a:double,c:double",
           throw=True)
Example #7
        def test_workflows(self):
            a = FugueWorkflow().df([[0]], "a:int")
            df_eq(a.compute(self.engine), [[0]], "a:int")

            a = _FugueInteractiveWorkflow(self.engine).df([[0]],
                                                          "a:int").persist()
            df_eq(a.result, [[0]], "a:int")
Example #8
        def test_to_df_general(self):
            e = self.engine
            o = ArrayDataFrame(
                [[1.1, 2.2], [3.3, 4.4]],
                "a:double,b:double",
                dict(a=1),
            )
            # all engines should accept these types of inputs
            # should take fugue.DataFrame
            df_eq(o, e.to_df(o), throw=True)
            # should take array, schema and metadata
            df_eq(
                o,
                e.to_df([[1.1, 2.2], [3.3, 4.4]], "a:double,b:double",
                        dict(a=1)),
                throw=True,
            )
            # should take pandas dataframe
            pdf = pd.DataFrame([[1.1, 2.2], [3.3, 4.4]], columns=["a", "b"])
            df_eq(o, e.to_df(pdf, metadata=dict(a=1)), throw=True)

            # should convert string to datetime in to_df
            df_eq(
                e.to_df([["2020-01-01"]], "a:datetime"),
                [[datetime(2020, 1, 1)]],
                "a:datetime",
                throw=True,
            )

            # should handle empty pandas dataframe
            o = ArrayDataFrame([], "a:double,b:str")
            pdf = pd.DataFrame([[0.1, "a"]], columns=["a", "b"])
            pdf = pdf[pdf.a < 0]
            df_eq(o, e.to_df(pdf), throw=True)
Example #9
 def test__join_with_null_keys(self):
     # SQL will not match null values
     e = self.engine
     a = e.to_df([[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int")
     b = e.to_df([[1, 2, 33], [4, None, 63]], "a:double,b:double,d:int")
     c = e.join(a, b, how="INNER")
     df_eq(c, [[1, 2, 3, 33]], "a:double,b:double,c:int,d:int", throw=True)
Example #10
        def test_assign(self):
            e = self.engine
            o = ArrayDataFrame(
                [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                "a:double,b:int",
                dict(a=1),
            )
            a = e.to_df(o)

            b = e.assign(
                a,
                [
                    lit(1, "x"),
                    col("b").cast(str),
                    (col("b") + 1).alias("c").cast(int),
                ],
            )
            df_eq(
                b,
                [
                    [1, "2", 1, 3],
                    [None, "2", 1, 3],
                    [None, "1", 1, 2],
                    [3, "4", 1, 5],
                    [None, "4", 1, 5],
                ],
                "a:double,b:str,x:long,c:long",
                throw=True,
            )
Example #11
 def test_save_and_load_avro(self):
     # TODO: switch to c:int,a:long when we can preserve schema to avro
     e = self.engine
     b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     e.save_df(b, path, format_hint="avro")
     c = e.load_df(path, format_hint="avro", columns=["a", "c"])
     df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True)
Example #12
 def test_to_df(self):
     e = self.engine
     a = e.to_df([[1, 2], [3, 4]], "a:int,b:int", dict(a=1))
     df_eq(a, [[1, 2], [3, 4]], "a:int,b:int", dict(a=1), throw=True)
     a = e.to_df(PandasDataFrame([[1, 2], [3, 4]], "a:int,b:int",
                                 dict(a=1)))
     df_eq(a, [[1, 2], [3, 4]], "a:int,b:int", dict(a=1), throw=True)
     assert a is e.to_df(a)
Example #13
 def assert_eq(df, df_expected=None, raw=False):
     if df_expected is None:
         df_expected = df
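     # round trip: serializing then deserializing must reproduce the original frame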
     df_actual = deserialize_df(serialize_df(df))
     if raw:
         assert df_expected.native == df_actual.native
     else:
         df_eq(df_expected, df_actual, throw=True)
Example #14
 def test_load_parquet_folder(self):
     e = self.engine
     native = NativeExecutionEngine()
     a = ArrayDataFrame([[6, 1]], "c:int,a:long")
     b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     native.save_df(a, os.path.join(path, "a.parquet"))
     native.save_df(b, os.path.join(path, "b.parquet"))
     FileSystem().touch(os.path.join(path, "_SUCCESS"))
     c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
     df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)
Example #15
 def test_subtract(self):
     e = self.engine
     a = e.to_df([[1, 2, 3], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int")
     b = e.to_df([[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int")
     c = e.subtract(a, b, metadata=dict(a=1))
     df_eq(
         c,
         [[1, 2, 3]],
         "a:double,b:double,c:int",
         metadata=dict(a=1),
         throw=True,
     )
Example #16
 def test_distinct(self):
     e = self.engine
     a = e.to_df([[4, None, 6], [1, 2, 3], [4, None, 6]],
                 "a:double,b:double,c:int")
     c = e.distinct(a, metadata=dict(a=1))
     df_eq(
         c,
         [[4, None, 6], [1, 2, 3]],
         "a:double,b:double,c:int",
         metadata=dict(a=1),
         throw=True,
     )
Example #17
 def test_to_df(self):
     e = self.engine
     o = ArrayDataFrame(
         [[1, 2]],
         "a:int,b:int",
         dict(a=1),
     )
     a = e.to_df(o)
     assert a is not o
     df_eq(a, o, throw=True)
     a = e.to_df([[1, None]], "a:int,b:int", dict(a=1))
     df_eq(a, [[1, None]], "a:int,b:int", dict(a=1), throw=True)
Example #18
        def test_map(self):
            def noop(cursor, data):
                return data

            def on_init(partition_no, data):
                # TODO: this test is not sufficient
                assert partition_no >= 0
                data.peek_array()

            e = self.engine
            o = ArrayDataFrame(
                [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                "a:double,b:int",
                dict(a=1),
            )
            a = e.to_df(o)
            # no partition
            c = e.map(a, noop, a.schema, PartitionSpec(), dict(a=1))
            df_eq(c, o, throw=True)
            # with key partition
            c = e.map(a, noop, a.schema, PartitionSpec(by=["a"], presort="b"),
                      dict(a=1))
            df_eq(c, o, throw=True)
            # select top
            c = e.map(a, select_top, a.schema,
                      PartitionSpec(by=["a"], presort="b"))
            df_eq(c, [[None, 1], [1, 2], [3, 4]], "a:double,b:int", throw=True)
            # select top with another order
            c = e.map(
                a,
                select_top,
                a.schema,
                PartitionSpec(partition_by=["a"], presort="b DESC"),
                metadata=dict(a=1),
            )
            df_eq(
                c,
                [[None, 4], [1, 2], [3, 4]],
                "a:double,b:int",
                metadata=dict(a=1),
                throw=True,
            )
            # with num_partitions, on_init should not affect the result
            c = e.map(
                a,
                select_top,
                a.schema,
                PartitionSpec(partition_by=["a"],
                              presort="b DESC",
                              num_partitions=3),
                on_init=on_init,
            )
            df_eq(c, [[None, 4], [1, 2], [3, 4]], "a:double,b:int", throw=True)
Example #19
 def test_load_avro_folder(self):
     # TODO: switch to c:int,a:long when we can preserve schema to avro
     e = self.engine
     native = NativeExecutionEngine()
     a = ArrayDataFrame([[6, 1]], "c:long,a:long")
     b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     native.save_df(a, os.path.join(path, "a.avro"))
     native.save_df(b, os.path.join(path, "b.avro"))
     FileSystem().touch(os.path.join(path, "_SUCCESS"))
     c = e.load_df(path, format_hint="avro", columns=["a", "c"])
     df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True)
Example #20
        def test_io(self):
            e = self.engine
            b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
            path = os.path.join(self.tmpdir, "a")
            e.save_df(b, path, format_hint="parquet", force_single=True)
            assert e.fs.isfile(path)
            c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
            df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True)

            path = os.path.join(self.tmpdir, "b.csv")
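            # no format_hint here; the format is inferred from the .csv extension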
            e.save_df(b, path, header=True)
            c = e.load_df(path, header=True, columns="c:int,a:long")
            df_eq(c, b, throw=True)
Example #21
 def test_save_and_load_csv(self):
     e = self.engine
     b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
     path = os.path.join(self.tmpdir, "a", "b")
     e.save_df(b, path, format_hint="csv", header=True)
     c = e.load_df(
         path,
         format_hint="csv",
         header=True,
         infer_schema=True,
         columns=["a", "c"],
     )
     df_eq(c, [[1.1, 6.1], [7.1, 2.1]], "a:double,c:double", throw=True)
Example #22
        def test_select(self):
            df = self.df([], "a:str,b:int")[["b"]]
            assert df.schema == "b:int"
            raises(FugueDataFrameOperationError, lambda: df[["a"]])  # does not exist
            raises(FugueDataFrameOperationError, lambda: df[[]])  # empty selection

            df = self.df([["a", 1]], "a:str,b:int")[["b"]]
            assert df.schema == "b:int"
            raises(FugueDataFrameOperationError, lambda: df[["a"]])  # does not exist
            raises(FugueDataFrameOperationError, lambda: df[[]])  # empty selection
            assert [[1]] == df.as_array(type_safe=True)

            df = self.df([["a", 1, 2]], "a:str,b:int,c:int")
            df_eq(df[["c", "a"]], [[2, "a"]], "c:int,a:str")
Example #23
 def test_fillna(self):
     e = self.engine
     a = e.to_df(
         [[4, None, 6], [1, 2, 3], [4, None, None]], "a:double,b:double,c:double"
     )
     c = e.fillna(a, value=1, metadata=dict(a=1))
     d = e.fillna(a, {"b": 99, "c": -99})  # per-column fill values
     f = e.fillna(a, value=-99, subset=["c"])  # fill only column c
     g = e.fillna(a, {"b": 99, "c": -99}, subset=["c"])  # subset is ignored when a dict is given
     df_eq(
         c,
         [[4, 1, 6], [1, 2, 3], [4, 1, 1]],
         "a:double,b:double,c:double",
         metadata=dict(a=1),
         throw=True,
     )
     df_eq(
         d,
         [[4, 99, 6], [1, 2, 3], [4, 99, -99]],
         "a:double,b:double,c:double",
         throw=True,
     )
     df_eq(
         f,
         [[4, None, 6], [1, 2, 3], [4, None, -99]],
         "a:double,b:double,c:double",
         throw=True,
     )
     df_eq(g, d, throw=True)
     raises(ValueError, lambda: e.fillna(a, {"b": None, "c": "99"}))  # None is an invalid fill value
     raises(ValueError, lambda: e.fillna(a, None))
     raises(ValueError, lambda: e.fillna(a, ["b"]))
Example #24
        def test__join_outer_pandas_incompatible(self):
            e = self.engine

            a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str")
            b = e.to_df([["6", 1], ["2", 7]], "c:int,a:int")
            c = e.join(a, b, how="left_OUTER", on=["a"], metadata=dict(a=1))
            df_eq(
                c,
                [[1, "2", 6], [3, "4", None]],
                "a:int,b:str,c:int",
                metadata=dict(a=1),
                throw=True,
            )
            c = e.join(b, a, how="left_outer", on=["a"])
            df_eq(c, [[6, 1, "2"], [2, 7, None]],
                  "c:int,a:int,b:str",
                  throw=True)

            a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str")
            b = e.to_df([[True, 1], [False, 7]], "c:bool,a:int")
            c = e.join(a, b, how="left_OUTER", on=["a"])
            df_eq(c, [[1, "2", True], [3, "4", None]],
                  "a:int,b:str,c:bool",
                  throw=True)
            c = e.join(b, a, how="left_outer", on=["a"])
            df_eq(c, [[True, 1, "2"], [False, 7, None]],
                  "c:bool,a:int,b:str",
                  throw=True)
Example #25
        def test_sample_n(self):
            engine = self.engine
            a = engine.to_df([[x] for x in range(100)], "a:int")

            b = engine.sample(a, n=90, replace=False, metadata=dict(a=1))
            c = engine.sample(a, n=90, replace=True, metadata=dict(a=1))
            d = engine.sample(a, n=90, seed=1, metadata=dict(a=1))
            d2 = engine.sample(a, n=90, seed=1, metadata=dict(a=1))
            e = engine.sample(a, n=90, seed=2, metadata=dict(a=1))
            assert not df_eq(b, c, throw=False)
            df_eq(d, d2, throw=True)
            assert not df_eq(d, e, throw=False)
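            # sampling can be approximate on some engines, so allow a tolerance of 1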
            assert abs(len(e.as_array()) - 90) < 2
            assert e.metadata == dict(a=1)
Example #26
def test_function_wrapper():
    for f in [f20, f21, f212, f22, f23, f24, f25, f26, f30, f31, f32, f35]:
        df = ArrayDataFrame([[0]], "a:int")
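        # the patterns restrict the annotations: two [ldsp]-typed params, a [ldspq]-typed output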
        w = FunctionWrapper(f, "^[ldsp][ldsp]$", "[ldspq]")
        res = w.run([df],
                    dict(a=df),
                    ignore_unknown=False,
                    output_schema="a:int")
        df_eq(res, [[0], [0]], "a:int", throw=True)
        w.run([df], dict(a=df), ignore_unknown=False, output=False)

    # test other data types, simple operations
    w = FunctionWrapper(f27)
    assert 3 == w(1, 2)
    assert 3 == w.run([1, 2], dict(), ignore_unknown=False)
    assert 3 == w.run([5], dict(a=1, b=2),
                      ignore_unknown=True)  # dict values override positional args
    assert 3 == w.run([], dict(a=1, b=2, c=4), ignore_unknown=True)
    raises(ValueError,
           lambda: w.run([], dict(a=1, b=2, c=4), ignore_unknown=False))

    # test default and required
    w = FunctionWrapper(f28)
    assert 3 == w.run([], dict(a=1, b=2), ignore_unknown=False)
    assert 2 == w.run([], dict(a=1), ignore_unknown=False)
    assert 3 == w.run([], dict(a=1, b=2), ignore_unknown=True)
    assert 3 == w.run([], dict(a=1, b=2, c=4), ignore_unknown=True)
    raises(ValueError,
           lambda: w.run([], dict(a=1, b=2, c=4), ignore_unknown=False))
    raises(ValueError, lambda: w.run([], dict(b=2), ignore_unknown=True))

    # test kwargs
    w = FunctionWrapper(f29)
    assert 3 == w.run([], dict(a=1, b=2), ignore_unknown=False)
    assert 1 == w.run([], dict(a=1), ignore_unknown=False)
    assert 3 == w.run([], dict(a=1, b=2), ignore_unknown=True)
    assert 7 == w.run([], dict(a=1, b=2, c=4), ignore_unknown=True)
    assert 7 == w.run([], dict(a=1, b=2, c=4), ignore_unknown=False)

    # test method inside class
    class Test(object):
        def t(self, a=1, b=2) -> int:
            return a + b

    test = Test()
    # instance method test
    w = FunctionWrapper(test.t, "^0?.*", ".*")
    assert 4 == w.run([], kwargs={"b": 3}, ignore_unknown=True)
    assert 5 == w.run([2], kwargs={"b": 3}, ignore_unknown=True)
Example #27
 def test_map_with_binary(self):
     e = self.engine
     o = ArrayDataFrame(
         [[pickle.dumps(BinaryObject("a"))], [pickle.dumps(BinaryObject("b"))]],
         "a:bytes",
     )
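     # binary_map (defined elsewhere) evidently appends "x" to each unpickled BinaryObject and re-pickles it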
     c = e.map(o, binary_map, o.schema, PartitionSpec())
     expected = ArrayDataFrame(
         [
             [pickle.dumps(BinaryObject("ax"))],
             [pickle.dumps(BinaryObject("bx"))],
         ],
         "a:bytes",
     )
     df_eq(expected, c, no_pandas=True, check_order=True, throw=True)
Example #28
        def test_map_with_special_values(self):
            def with_nat(cursor, data):
                df = data.as_pandas()
                df["nat"] = pd.NaT
                schema = data.schema + "nat:datetime"
                return PandasDataFrame(df, schema)

            e = self.engine
            # test multiple partition keys containing null values
            o = ArrayDataFrame(
                [[1, None, 1], [1, None, 0], [None, None, 1]],
                "a:double,b:double,c:int",
                dict(a=1),
            )
            c = e.map(
                o, select_top, o.schema, PartitionSpec(by=["a", "b"], presort="c")
            )
            df_eq(
                c,
                [[1, None, 0], [None, None, 1]],
                "a:double,b:double,c:int",
                throw=True,
            )
            # test datetime with nat
            dt = datetime.now()
            o = ArrayDataFrame(
                [
                    [dt, 2, 1],
                    [None, 2, None],
                    [None, 1, None],
                    [dt, 5, 1],
                    [None, 4, None],
                ],
                "a:datetime,b:int,c:double",
                dict(a=1),
            )
            c = e.map(
                o, select_top, o.schema, PartitionSpec(by=["a", "c"], presort="b DESC")
            )
            df_eq(
                c,
                [[None, 4, None], [dt, 5, 1]],
                "a:datetime,b:int,c:double",
                throw=True,
            )
            d = e.map(
                c, with_nat, "a:datetime,b:int,c:double,nat:datetime", PartitionSpec()
            )
            df_eq(
                d,
                [[None, 4, None, None], [dt, 5, 1, None]],
                "a:datetime,b:int,c:double,nat:datetime",
                throw=True,
            )
            # test list
            o = ArrayDataFrame([[dt, [1, 2]]], "a:datetime,b:[int]")
            c = e.map(o, select_top, o.schema, PartitionSpec(by=["a"]))
            df_eq(c, o, check_order=True, throw=True)
Example #29
        def test_save_single_and_load_parquet(self):
            e = self.engine
            b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
            path = os.path.join(self.tmpdir, "a", "b")
            e.fs.makedirs(path, recreate=True)
            # overwrite the folder with a single file
            e.save_df(b, path, format_hint="parquet", force_single=True)
            assert e.fs.isfile(path)
            c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
            df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True)

            # overwrite the single file with a folder (if applicable)
            b = ArrayDataFrame([[60, 1], [20, 7]], "c:int,a:long")
            e.save_df(b, path, format_hint="parquet", mode="overwrite")
            c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
            df_eq(c, [[1, 60], [7, 20]], "a:long,c:int", throw=True)
Example #30
        def test_aggregate(self):
            e = self.engine
            o = ArrayDataFrame(
                [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                "a:double,b:int",
                dict(a=1),
            )
            a = e.to_df(o)

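            # global aggregation: no partition spec, so a single output row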
            b = e.aggregate(
                df=a,
                partition_spec=None,
                agg_cols=[
                    ff.max(col("b")),
                    (ff.max(col("b")) * 2).cast("int32").alias("c"),
                ],
            )
            df_eq(b, [[4, 8]], "b:int,c:int", throw=True)

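            # grouped aggregation: one output row per distinct value of a (None forms its own group)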
            b = e.aggregate(
                df=a,
                partition_spec=PartitionSpec(by=["a"]),
                agg_cols=[
                    ff.max(col("b")),
                    (ff.max(col("b")) * 2).cast("int32").alias("c"),
                ],
            )
            df_eq(
                b,
                [[None, 4, 8], [1, 2, 4], [3, 4, 8]],
                "a:double,b:int,c:int",
                throw=True,
            )

            with raises(ValueError):
                e.aggregate(
                    df=a,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[ff.max(col("b")), lit(1)],
                )

            with raises(ValueError):
                e.aggregate(
                    df=a,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[],
                )