def test_dropna(self):
    """Check ``dropna`` with default arguments and with ``how``/``thresh``/``subset``."""
    engine = self.engine
    schema = "a:double,b:double,c:double"
    source = engine.to_df([[4, None, 6], [1, 2, 3], [4, None, None]], schema)

    # Every variant is computed first; the comparisons follow below.
    by_default = engine.dropna(source, metadata={"a": 1})  # default
    how_all = engine.dropna(source, how="all")
    with_thresh = engine.dropna(source, how="any", thresh=2)
    with_subset = engine.dropna(source, how="any", subset=["a", "c"])
    thresh_and_subset = engine.dropna(
        source, how="any", thresh=1, subset=["a", "c"]
    )

    df_eq(by_default, [[1, 2, 3]], schema, metadata={"a": 1}, throw=True)
    df_eq(
        how_all,
        [[4, None, 6], [1, 2, 3], [4, None, None]],
        schema,
        throw=True,
    )
    df_eq(with_thresh, [[4, None, 6], [1, 2, 3]], schema, throw=True)
    df_eq(with_subset, [[4, None, 6], [1, 2, 3]], schema, throw=True)
    df_eq(
        thresh_and_subset,
        [[4, None, 6], [1, 2, 3], [4, None, None]],
        schema,
        throw=True,
    )
def test_map_in_pandas(self):
    """Map a function consuming an iterable dataframe over a 100000-row frame."""
    # When ``ps.DataFrame`` has no ``mapInPandas`` attribute this test is a no-op.
    if not hasattr(ps.DataFrame, "mapInPandas"):
        return

    def add(cursor, data):
        assert isinstance(data, LocalDataFrameIterableDataFrame)

        def summed_chunks():
            # Add the ``zz`` column to every chunk, one chunk at a time.
            for chunk in data.native:
                frame = chunk.as_pandas()
                frame["zz"] = frame["xx"] + frame["yy"]
                yield PandasDataFrame(frame)

        return LocalDataFrameIterableDataFrame(summed_chunks())

    engine = self.engine
    out_schema = "xx:int,yy:int,zz:int"
    np.random.seed(0)  # fixed seed keeps the generated input reproducible
    raw = pd.DataFrame(np.random.randint(0, 5, (100000, 2)), columns=["xx", "yy"])
    expected = PandasDataFrame(raw.assign(zz=raw.xx + raw.yy), out_schema)
    source = engine.to_df(raw)
    # Map with an explicit partition count of 16.
    mapped = engine.map(source, add, out_schema, PartitionSpec(num=16))
    df_eq(mapped, expected, throw=True)
def test_map_with_dict_col(self):
    """A dict-typed column must come back unchanged from a partitioned ``map``."""
    engine = self.engine
    now = datetime.now()
    # test dict
    source = ArrayDataFrame([[now, {"a": 1}]], "a:datetime,b:{a:int}")
    mapped = engine.map(
        source,
        select_top,
        source.schema,
        PartitionSpec(by=["a"]),
    )
    df_eq(mapped, source, no_pandas=True, check_order=True, throw=True)
def test_workflow():
    """Exercise building, showing and running a ``FugueWorkflow``.

    Every ``df_eq`` check passes ``throw=True``. Without it ``df_eq`` only
    reports a mismatch through its return value, so an unasserted call can
    never fail the test.
    """
    builder = FugueWorkflow()

    a = builder.create_data([[0], [0], [1]], "a:int")
    # The underlying task must reject every way of copying it.
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()

    # An object that is not dataframe-like cannot be turned into a workflow df.
    raises(FugueWorkflowCompileError, lambda: builder.df(123))

    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()

    # Run without arguments.
    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int", throw=True)
    # An arbitrary string is not accepted by ``run``.
    raises(TypeError, lambda: builder.run("abc"))
    # Run with an explicit workflow context.
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int", throw=True)
    # Run with the engine given by name.
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int", throw=True)
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int", throw=True)
    df_eq(
        b.compute(NativeExecutionEngine),
        [[0, 2], [0, 2], [1, 1]],
        "a:int,b:int",
        throw=True,
    )
def test_save_and_load_parquet(self):
    """Save a dataframe as parquet and load it back with the columns reordered."""
    engine = self.engine
    original = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
    target = os.path.join(self.tmpdir, "a", "b")
    engine.save_df(original, target, format_hint="parquet")
    loaded = engine.load_df(target, format_hint="parquet", columns=["a", "c"])
    df_eq(loaded, [[1, 6], [7, 2]], "a:long,c:int", throw=True)
def test_load_csv_folder(self):
    """Load a folder holding two CSV files and a ``_SUCCESS`` file as one dataframe."""
    engine = self.engine
    writer = NativeExecutionEngine()
    first = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double")
    second = ArrayDataFrame([[2.1, 7.1], [4.1, 8.1]], "c:double,a:double")
    folder = os.path.join(self.tmpdir, "a", "b")

    # The fixture files are written with the native engine, not the engine under test.
    for file_name, part in (("a.csv", first), ("b.csv", second)):
        writer.save_df(
            part, os.path.join(folder, file_name), format_hint="csv", header=True
        )
    FileSystem().touch(os.path.join(folder, "_SUCCESS"))

    loaded = engine.load_df(
        folder,
        format_hint="csv",
        header=True,
        infer_schema=True,
        columns=["a", "c"],
    )
    df_eq(
        loaded,
        [[1.1, 6.1], [7.1, 2.1], [8.1, 4.1]],
        "a:double,c:double",
        throw=True,
    )
def test_workflows(self):
    """Run trivial workflows on ``self.engine`` through both workflow classes.

    ``df_eq`` is called with ``throw=True`` so that a mismatch fails the test;
    without it the boolean result would be silently discarded.
    """
    # Regular workflow, computed on the engine under test.
    a = FugueWorkflow().df([[0]], "a:int")
    df_eq(a.compute(self.engine), [[0]], "a:int", throw=True)

    # Interactive workflow bound to the engine under test.
    a = _FugueInteractiveWorkflow(self.engine).df([[0]], "a:int").persist()
    df_eq(a.result, [[0]], "a:int", throw=True)
def test_to_df_general(self):
    """``to_df`` must accept the input kinds that every engine should support."""
    engine = self.engine
    reference = ArrayDataFrame(
        [[1.1, 2.2], [3.3, 4.4]],
        "a:double,b:double",
        {"a": 1},
    )

    # all engines should accept these types of inputs
    # a fugue DataFrame
    from_fugue = engine.to_df(reference)
    df_eq(reference, from_fugue, throw=True)

    # an array together with schema and metadata
    from_array = engine.to_df(
        [[1.1, 2.2], [3.3, 4.4]], "a:double,b:double", {"a": 1}
    )
    df_eq(reference, from_array, throw=True)

    # a pandas dataframe
    pandas_input = pd.DataFrame([[1.1, 2.2], [3.3, 4.4]], columns=["a", "b"])
    from_pandas = engine.to_df(pandas_input, metadata={"a": 1})
    df_eq(reference, from_pandas, throw=True)

    # strings are converted to datetime by to_df
    from_strings = engine.to_df([["2020-01-01"]], "a:datetime")
    df_eq(from_strings, [[datetime(2020, 1, 1)]], "a:datetime", throw=True)

    # an empty pandas dataframe is handled
    empty_reference = ArrayDataFrame([], "a:double,b:str")
    one_row = pd.DataFrame([[0.1, "a"]], columns=["a", "b"])
    no_rows = one_row[one_row.a < 0]
    df_eq(empty_reference, engine.to_df(no_rows), throw=True)
def test__join_with_null_keys(self):
    """Inner join must not pair up rows whose join key contains null."""
    # SQL will not match null values
    engine = self.engine
    left = engine.to_df([[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int")
    right = engine.to_df([[1, 2, 33], [4, None, 63]], "a:double,b:double,d:int")
    joined = engine.join(left, right, how="INNER")
    df_eq(
        joined,
        [[1, 2, 3, 33]],
        "a:double,b:double,c:int,d:int",
        throw=True,
    )
def test_assign(self):
    """``assign`` adds a literal column, casts a column and adds a derived column."""
    engine = self.engine
    source = ArrayDataFrame(
        [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
        "a:double,b:int",
        {"a": 1},
    )
    frame = engine.to_df(source)

    assignments = [
        lit(1, "x"),
        col("b").cast(str),
        (col("b") + 1).alias("c").cast(int),
    ]
    assigned = engine.assign(frame, assignments)

    expected_rows = [
        [1, "2", 1, 3],
        [None, "2", 1, 3],
        [None, "1", 1, 2],
        [3, "4", 1, 5],
        [None, "4", 1, 5],
    ]
    df_eq(assigned, expected_rows, "a:double,b:str,x:long,c:long", throw=True)
def test_save_and_load_avro(self):
    """Save a dataframe as avro and load it back with the columns reordered."""
    # TODO: switch to c:int,a:long when we can preserve schema to avro
    engine = self.engine
    original = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long")
    target = os.path.join(self.tmpdir, "a", "b")
    engine.save_df(original, target, format_hint="avro")
    loaded = engine.load_df(target, format_hint="avro", columns=["a", "c"])
    df_eq(loaded, [[1, 6], [7, 2]], "a:long,c:long", throw=True)
def test_to_df(self):
    """``to_df`` accepts arrays and PandasDataFrame, and returns its own output as is."""
    engine = self.engine
    schema = "a:int,b:int"

    from_array = engine.to_df([[1, 2], [3, 4]], schema, {"a": 1})
    df_eq(from_array, [[1, 2], [3, 4]], schema, {"a": 1}, throw=True)

    pandas_frame = PandasDataFrame([[1, 2], [3, 4]], schema, {"a": 1})
    from_pandas = engine.to_df(pandas_frame)
    df_eq(from_pandas, [[1, 2], [3, 4]], schema, {"a": 1}, throw=True)

    # Converting an already converted dataframe must return the same object.
    assert from_pandas is engine.to_df(from_pandas)
def assert_eq(df, df_expected=None, raw=False):
    """Round-trip ``df`` through ``serialize_df``/``deserialize_df`` and compare.

    :param df: dataframe to serialize and then deserialize
    :param df_expected: dataframe the round-tripped result is compared with;
        when None, ``df`` itself is used
    :param raw: when True, compare the ``native`` attributes with ``==``;
        otherwise compare through ``df_eq`` with ``throw=True``
    """
    expected = df if df_expected is None else df_expected
    actual = deserialize_df(serialize_df(df))
    if not raw:
        df_eq(expected, actual, throw=True)
        return
    assert expected.native == actual.native
def test_load_parquet_folder(self):
    """Load a folder holding two parquet files and a ``_SUCCESS`` file."""
    engine = self.engine
    writer = NativeExecutionEngine()
    first = ArrayDataFrame([[6, 1]], "c:int,a:long")
    second = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long")
    folder = os.path.join(self.tmpdir, "a", "b")

    # The fixture files are written with the native engine, not the engine under test.
    for file_name, part in (("a.parquet", first), ("b.parquet", second)):
        writer.save_df(part, os.path.join(folder, file_name))
    FileSystem().touch(os.path.join(folder, "_SUCCESS"))

    loaded = engine.load_df(folder, format_hint="parquet", columns=["a", "c"])
    df_eq(loaded, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)
def test_subtract(self):
    """``subtract`` drops rows found in the second frame; the result is de-duplicated."""
    engine = self.engine
    schema = "a:double,b:double,c:int"
    minuend = engine.to_df([[1, 2, 3], [1, 2, 3], [4, None, 6]], schema)
    subtrahend = engine.to_df([[1, 2, 33], [4, None, 6]], schema)
    remaining = engine.subtract(minuend, subtrahend, metadata={"a": 1})
    df_eq(remaining, [[1, 2, 3]], schema, metadata={"a": 1}, throw=True)
def test_distinct(self):
    """``distinct`` removes duplicated rows, including rows that contain null."""
    engine = self.engine
    schema = "a:double,b:double,c:int"
    source = engine.to_df([[4, None, 6], [1, 2, 3], [4, None, 6]], schema)
    unique_rows = engine.distinct(source, metadata={"a": 1})
    df_eq(
        unique_rows,
        [[4, None, 6], [1, 2, 3]],
        schema,
        metadata={"a": 1},
        throw=True,
    )
def test_to_df(self):
    """``to_df`` returns a new object for an ArrayDataFrame and keeps null values."""
    engine = self.engine
    schema = "a:int,b:int"

    original = ArrayDataFrame([[1, 2]], schema, {"a": 1})
    converted = engine.to_df(original)
    # The converted frame is a different object with equal content.
    assert converted is not original
    df_eq(converted, original, throw=True)

    with_null = engine.to_df([[1, None]], schema, {"a": 1})
    df_eq(with_null, [[1, None]], schema, {"a": 1}, throw=True)
def test_map(self):
    """``map`` with no partitioning, key partitioning, presort and ``on_init``."""

    def passthrough(cursor, data):
        return data

    def check_partition(partition_no, data):
        # TODO: this test is not sufficient
        assert partition_no >= 0
        data.peek_array()

    engine = self.engine
    source = ArrayDataFrame(
        [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
        "a:double,b:int",
        {"a": 1},
    )
    frame = engine.to_df(source)

    # no partition
    unpartitioned = engine.map(
        frame, passthrough, frame.schema, PartitionSpec(), {"a": 1}
    )
    df_eq(unpartitioned, source, throw=True)

    # with key partition
    keyed = engine.map(
        frame,
        passthrough,
        frame.schema,
        PartitionSpec(by=["a"], presort="b"),
        {"a": 1},
    )
    df_eq(keyed, source, throw=True)

    # select top
    top_ascending = engine.map(
        frame, select_top, frame.schema, PartitionSpec(by=["a"], presort="b")
    )
    df_eq(
        top_ascending,
        [[None, 1], [1, 2], [3, 4]],
        "a:double,b:int",
        throw=True,
    )

    # select top with another order
    top_descending = engine.map(
        frame,
        select_top,
        frame.schema,
        PartitionSpec(partition_by=["a"], presort="b DESC"),
        metadata={"a": 1},
    )
    df_eq(
        top_descending,
        [[None, 4], [1, 2], [3, 4]],
        "a:double,b:int",
        metadata={"a": 1},
        throw=True,
    )

    # add num_partitions, on_init should not matter
    top_with_init = engine.map(
        frame,
        select_top,
        frame.schema,
        PartitionSpec(partition_by=["a"], presort="b DESC", num_partitions=3),
        on_init=check_partition,
    )
    df_eq(
        top_with_init,
        [[None, 4], [1, 2], [3, 4]],
        "a:double,b:int",
        throw=True,
    )
def test_load_avro_folder(self):
    """Load a folder holding two avro files and a ``_SUCCESS`` file."""
    # TODO: switch to c:int,a:long when we can preserve schema to avro
    engine = self.engine
    writer = NativeExecutionEngine()
    first = ArrayDataFrame([[6, 1]], "c:long,a:long")
    second = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long")
    folder = os.path.join(self.tmpdir, "a", "b")

    # The fixture files are written with the native engine, not the engine under test.
    for file_name, part in (("a.avro", first), ("b.avro", second)):
        writer.save_df(part, os.path.join(folder, file_name))
    FileSystem().touch(os.path.join(folder, "_SUCCESS"))

    loaded = engine.load_df(folder, format_hint="avro", columns=["a", "c"])
    df_eq(loaded, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True)
def test_io(self):
    """Round-trip one dataframe through a single parquet file and through CSV."""
    engine = self.engine
    original = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")

    # Parquet: force_single must leave a plain file at the target path.
    parquet_path = os.path.join(self.tmpdir, "a")
    engine.save_df(original, parquet_path, format_hint="parquet", force_single=True)
    assert engine.fs.isfile(parquet_path)
    from_parquet = engine.load_df(
        parquet_path, format_hint="parquet", columns=["a", "c"]
    )
    df_eq(from_parquet, [[1, 6], [7, 2]], "a:long,c:int", throw=True)

    # CSV: no format hint is passed; the columns are given as a schema string.
    csv_path = os.path.join(self.tmpdir, "b.csv")
    engine.save_df(original, csv_path, header=True)
    from_csv = engine.load_df(csv_path, header=True, columns="c:int,a:long")
    df_eq(from_csv, original, throw=True)
def test_save_and_load_csv(self):
    """Save a dataframe as CSV with a header and load it back with inferred types."""
    engine = self.engine
    original = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
    target = os.path.join(self.tmpdir, "a", "b")
    engine.save_df(original, target, format_hint="csv", header=True)
    loaded = engine.load_df(
        target,
        format_hint="csv",
        header=True,
        infer_schema=True,
        columns=["a", "c"],
    )
    df_eq(loaded, [[1.1, 6.1], [7.1, 2.1]], "a:double,c:double", throw=True)
def test_select(self):
    """Column selection via ``df[[...]]`` on empty and non-empty dataframes.

    The final check passes ``throw=True`` so that a mismatch fails the test;
    without it the result of ``df_eq`` would be silently discarded. Its
    expected schema lists the columns in the selected order (``c`` then
    ``a``), which is also the order of the values in the expected row.
    """
    # Empty dataframe.
    df = self.df([], "a:str,b:int")[["b"]]
    assert df.schema == "b:int"
    raises(FugueDataFrameOperationError, lambda: df[["a"]])  # column no longer exists
    raises(FugueDataFrameOperationError, lambda: df[[]])  # empty selection

    # Dataframe with one row.
    df = self.df([["a", 1]], "a:str,b:int")[["b"]]
    assert df.schema == "b:int"
    raises(FugueDataFrameOperationError, lambda: df[["a"]])  # column no longer exists
    raises(FugueDataFrameOperationError, lambda: df[[]])  # empty selection
    assert [[1]] == df.as_array(type_safe=True)

    # Selecting in a different order than the original schema.
    df = self.df([["a", 1, 2]], "a:str,b:int,c:int")
    df_eq(df[["c", "a"]], [[2, "a"]], "c:int,a:str", throw=True)
def test_fillna(self):
    """``fillna`` with a scalar, a column-to-value dict, a subset, and invalid values."""
    e = self.engine
    a = e.to_df(
        [[4, None, 6], [1, 2, 3], [4, None, None]], "a:double,b:double,c:double"
    )
    c = e.fillna(a, value=1, metadata=dict(a=1))
    d = e.fillna(a, {"b": 99, "c": -99})
    f = e.fillna(a, value=-99, subset=["c"])
    g = e.fillna(a, {"b": 99, "c": -99}, subset=["c"])  # subset ignored
    df_eq(
        c,
        [[4, 1, 6], [1, 2, 3], [4, 1, 1]],
        "a:double,b:double,c:double",
        metadata=dict(a=1),
        throw=True,
    )
    df_eq(
        d,
        [[4, 99, 6], [1, 2, 3], [4, 99, -99]],
        "a:double,b:double,c:double",
        throw=True,
    )
    df_eq(
        f,
        [[4, None, 6], [1, 2, 3], [4, None, -99]],
        "a:double,b:double,c:double",
        throw=True,
    )
    # With a dict value the subset has no effect, so g must equal d.
    df_eq(g, d, throw=True)
    # The key must be the column name "c"; the bare name ``c`` would put the
    # dataframe computed above into the dict instead.
    raises(ValueError, lambda: e.fillna(a, {"b": None, "c": "99"}))
    raises(ValueError, lambda: e.fillna(a, None))
    raises(ValueError, lambda: e.fillna(a, ["b"]))
def test__join_outer_pandas_incompatible(self):
    """Left outer joins whose unmatched side is an int or bool column."""
    engine = self.engine

    # int column on the right side
    strings = engine.to_df([[1, "2"], [3, "4"]], "a:int,b:str")
    ints = engine.to_df([["6", 1], ["2", 7]], "c:int,a:int")
    joined = engine.join(
        strings, ints, how="left_OUTER", on=["a"], metadata={"a": 1}
    )
    df_eq(
        joined,
        [[1, "2", 6], [3, "4", None]],
        "a:int,b:str,c:int",
        metadata={"a": 1},
        throw=True,
    )
    joined = engine.join(ints, strings, how="left_outer", on=["a"])
    df_eq(
        joined,
        [[6, 1, "2"], [2, 7, None]],
        "c:int,a:int,b:str",
        throw=True,
    )

    # bool column on the right side
    strings = engine.to_df([[1, "2"], [3, "4"]], "a:int,b:str")
    bools = engine.to_df([[True, 1], [False, 7]], "c:bool,a:int")
    joined = engine.join(strings, bools, how="left_OUTER", on=["a"])
    df_eq(
        joined,
        [[1, "2", True], [3, "4", None]],
        "a:int,b:str,c:bool",
        throw=True,
    )
    joined = engine.join(bools, strings, how="left_outer", on=["a"])
    df_eq(
        joined,
        [[True, 1, "2"], [False, 7, None]],
        "c:bool,a:int,b:str",
        throw=True,
    )
def test_sample_n(self):
    """``sample`` with ``n``: replacement, seed determinism, size and metadata."""
    engine = self.engine
    meta = {"a": 1}
    population = engine.to_df([[value] for value in range(100)], "a:int")

    no_replace = engine.sample(population, n=90, replace=False, metadata=meta)
    with_replace = engine.sample(population, n=90, replace=True, metadata=meta)
    seed_one = engine.sample(population, n=90, seed=1, metadata=meta)
    seed_one_again = engine.sample(population, n=90, seed=1, metadata=meta)
    seed_two = engine.sample(population, n=90, seed=2, metadata=meta)

    assert not df_eq(no_replace, with_replace, throw=False)
    # The same seed must reproduce the sample; a different seed must not.
    df_eq(seed_one, seed_one_again, throw=True)
    assert not df_eq(seed_one, seed_two, throw=False)
    # The sample size is allowed to be off by one.
    assert abs(len(seed_two.as_array()) - 90) < 2
    assert seed_two.metadata == {"a": 1}
def test_function_wrapper():
    """``FunctionWrapper`` with dataframe functions, scalars, defaults, kwargs, methods."""
    dataframe_funcs = [f20, f21, f212, f22, f23, f24, f25, f26, f30, f31, f32, f35]
    for func in dataframe_funcs:
        frame = ArrayDataFrame([[0]], "a:int")
        wrapped = FunctionWrapper(func, "^[ldsp][ldsp]$", "[ldspq]")
        result = wrapped.run(
            [frame], {"a": frame}, ignore_unknown=False, output_schema="a:int"
        )
        df_eq(result, [[0], [0]], "a:int", throw=True)
        wrapped.run([frame], {"a": frame}, ignore_unknown=False, output=False)

    # test other data types, simple operations
    simple = FunctionWrapper(f27)
    assert 3 == simple(1, 2)
    assert 3 == simple.run([1, 2], {}, ignore_unknown=False)
    # dict will overwrite
    assert 3 == simple.run([5], {"a": 1, "b": 2}, ignore_unknown=True)
    assert 3 == simple.run([], {"a": 1, "b": 2, "c": 4}, ignore_unknown=True)
    raises(
        ValueError,
        lambda: simple.run([], {"a": 1, "b": 2, "c": 4}, ignore_unknown=False),
    )

    # test default and required
    with_default = FunctionWrapper(f28)
    assert 3 == with_default.run([], {"a": 1, "b": 2}, ignore_unknown=False)
    assert 2 == with_default.run([], {"a": 1}, ignore_unknown=False)
    assert 3 == with_default.run([], {"a": 1, "b": 2}, ignore_unknown=True)
    assert 3 == with_default.run([], {"a": 1, "b": 2, "c": 4}, ignore_unknown=True)
    raises(
        ValueError,
        lambda: with_default.run([], {"a": 1, "b": 2, "c": 4}, ignore_unknown=False),
    )
    raises(ValueError, lambda: with_default.run([], {"b": 2}, ignore_unknown=True))

    # test kwargs
    with_kwargs = FunctionWrapper(f29)
    assert 3 == with_kwargs.run([], {"a": 1, "b": 2}, ignore_unknown=False)
    assert 1 == with_kwargs.run([], {"a": 1}, ignore_unknown=False)
    assert 3 == with_kwargs.run([], {"a": 1, "b": 2}, ignore_unknown=True)
    assert 7 == with_kwargs.run([], {"a": 1, "b": 2, "c": 4}, ignore_unknown=True)
    assert 7 == with_kwargs.run([], {"a": 1, "b": 2, "c": 4}, ignore_unknown=False)

    # test method inside class
    class Test(object):
        def t(self, a=1, b=2) -> int:
            return a + b

    instance = Test()
    # instance method test
    bound = FunctionWrapper(instance.t, "^0?.*", ".*")
    assert 4 == bound.run([], kwargs={"b": 3}, ignore_unknown=True)
    assert 5 == bound.run([2], kwargs={"b": 3}, ignore_unknown=True)
def test_map_with_binary(self):
    """``map`` over a bytes column that holds pickled ``BinaryObject`` values."""

    def pickled_rows(*values):
        # One single-column row per value, each a pickled BinaryObject.
        return [[pickle.dumps(BinaryObject(value))] for value in values]

    engine = self.engine
    source = ArrayDataFrame(pickled_rows("a", "b"), "a:bytes")
    mapped = engine.map(source, binary_map, source.schema, PartitionSpec())
    expected = ArrayDataFrame(pickled_rows("ax", "bx"), "a:bytes")
    df_eq(expected, mapped, no_pandas=True, check_order=True, throw=True)
def test_map_with_special_values(self):
    """``map`` with null partition keys, null datetimes, ``NaT`` and list columns."""

    def with_nat(cursor, data):
        frame = data.as_pandas()
        frame["nat"] = pd.NaT
        return PandasDataFrame(frame, data.schema + "nat:datetime")

    engine = self.engine

    # test with multiple key with null values
    source = ArrayDataFrame(
        [[1, None, 1], [1, None, 0], [None, None, 1]],
        "a:double,b:double,c:int",
        {"a": 1},
    )
    top = engine.map(
        source,
        select_top,
        source.schema,
        PartitionSpec(by=["a", "b"], presort="c"),
    )
    df_eq(
        top,
        [[1, None, 0], [None, None, 1]],
        "a:double,b:double,c:int",
        throw=True,
    )

    # test datetime with nat
    now = datetime.now()
    source = ArrayDataFrame(
        [
            [now, 2, 1],
            [None, 2, None],
            [None, 1, None],
            [now, 5, 1],
            [None, 4, None],
        ],
        "a:datetime,b:int,c:double",
        {"a": 1},
    )
    top = engine.map(
        source,
        select_top,
        source.schema,
        PartitionSpec(by=["a", "c"], presort="b DESC"),
    )
    df_eq(
        top,
        [[None, 4, None], [now, 5, 1]],
        "a:datetime,b:int,c:double",
        throw=True,
    )
    nat_added = engine.map(
        top,
        with_nat,
        "a:datetime,b:int,c:double,nat:datetime",
        PartitionSpec(),
    )
    df_eq(
        nat_added,
        [[None, 4, None, None], [now, 5, 1, None]],
        "a:datetime,b:int,c:double,nat:datetime",
        throw=True,
    )

    # test list
    source = ArrayDataFrame([[now, [1, 2]]], "a:datetime,b:[int]")
    top = engine.map(source, select_top, source.schema, PartitionSpec(by=["a"]))
    df_eq(top, source, check_order=True, throw=True)
def test_save_single_and_load_parquet(self):
    """Save a single parquet file over a folder, then overwrite that file again."""
    engine = self.engine
    target = os.path.join(self.tmpdir, "a", "b")
    first = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
    engine.fs.makedirs(target, recreate=True)

    # overwrite the folder with a single file
    engine.save_df(first, target, format_hint="parquet", force_single=True)
    assert engine.fs.isfile(target)
    loaded = engine.load_df(target, format_hint="parquet", columns=["a", "c"])
    df_eq(loaded, [[1, 6], [7, 2]], "a:long,c:int", throw=True)

    # overwrite the single file with a folder (if applicable)
    second = ArrayDataFrame([[60, 1], [20, 7]], "c:int,a:long")
    engine.save_df(second, target, format_hint="parquet", mode="overwrite")
    loaded = engine.load_df(target, format_hint="parquet", columns=["a", "c"])
    df_eq(loaded, [[1, 60], [7, 20]], "a:long,c:int", throw=True)
def test_aggregate(self):
    """``aggregate`` without and with partition keys, plus invalid ``agg_cols``."""

    def max_and_doubled_max():
        # Fresh expressions per call: max(b), and max(b) * 2 cast to int32 as "c".
        return [
            ff.max(col("b")),
            (ff.max(col("b")) * 2).cast("int32").alias("c"),
        ]

    engine = self.engine
    source = ArrayDataFrame(
        [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
        "a:double,b:int",
        {"a": 1},
    )
    frame = engine.to_df(source)

    # Aggregate over the whole frame.
    overall = engine.aggregate(
        df=frame, partition_spec=None, agg_cols=max_and_doubled_max()
    )
    df_eq(overall, [[4, 8]], "b:int,c:int", throw=True)

    # Aggregate per value of column a.
    per_key = engine.aggregate(
        df=frame,
        partition_spec=PartitionSpec(by=["a"]),
        agg_cols=max_and_doubled_max(),
    )
    df_eq(
        per_key,
        [[None, 4, 8], [1, 2, 4], [3, 4, 8]],
        "a:double,b:int,c:int",
        throw=True,
    )

    # A literal next to an aggregation is rejected.
    with raises(ValueError):
        engine.aggregate(
            df=frame,
            partition_spec=PartitionSpec(by=["a"]),
            agg_cols=[ff.max(col("b")), lit(1)],
        )

    # An empty list of aggregation columns is rejected.
    with raises(ValueError):
        engine.aggregate(
            df=frame,
            partition_spec=PartitionSpec(by=["a"]),
            agg_cols=[],
        )