def test_basic():
    """Check the SQL text generated for plain columns, literals, aliases and casts."""
    sql_gen = SQLExpressionGenerator()

    # Named column, with and without an alias.
    assert sql_gen.generate(col("a")) == "a"
    assert sql_gen.generate(col("a").alias("bc")) == "a AS bc"

    # String literal, with and without an alias.
    assert sql_gen.generate(lit("a")) == "'a'"
    assert sql_gen.generate(lit("a").alias("bc")) == "'a' AS bc"

    # A cast on an un-aliased column is expected to be emitted with `AS a`.
    assert sql_gen.generate(col("a").cast(int)) == "CAST(a AS long) AS a"
def test_functions():
    """Check SQL generation for function expressions and custom function handlers."""
    # NOTE(review): another `test_functions` is visible later in this source; if
    # both live in the same module the later definition replaces this one --
    # confirm they belong to separate files.
    sql_gen = SQLExpressionGenerator()

    # Nested arithmetic inside COALESCE, followed by an IS NULL check.
    coalesce_is_null = f.coalesce(
        col("a"),
        col("b") + col("c"),
        col("d") + col("e") - 1,
        null(),
    ).is_null()
    assert sql_gen.generate(coalesce_is_null) == "COALESCE(a,b+c,(d+e)-1,NULL) IS NULL"

    # Arbitrary function mixing positional and keyword arguments.
    custom_call = function(
        "MY",
        f.min(col("x")),
        f.max(col("y") + 1),
        f.avg(col("z")),
        2,
        aa=f.first(col("a")),
        bb=f.last(lit("b")),
        cc=f.count_distinct(col("*")),
    ).alias("x")
    assert sql_gen.generate(custom_call) == (
        "MY(MIN(x),MAX(y+1),AVG(z),2,aa=FIRST(a),bb=LAST('b'),cc=COUNT(DISTINCT *)) AS x"
    )

    def dummy(expr):
        # Handler yields the SQL fragments for the function expression.
        yield "DUMMY"
        if expr.is_distinct:
            yield " D"

    # Once a handler is registered for "MY", its fragments are used instead.
    sql_gen.add_func_handler("MY", dummy)
    distinct_call = function("MY", 2, 3, arg_distinct=True).alias("x")
    assert sql_gen.generate(distinct_call) == "DUMMY D AS x"
def test_assign(self):
    """Check that `assign` adds a literal column, re-types `b` and adds a derived column."""
    engine = self.engine
    rows = [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]]
    source = ArrayDataFrame(rows, "a:double,b:int", {"a": 1})
    frame = engine.to_df(source)

    new_columns = [
        lit(1, "x"),
        col("b").cast(str),
        (col("b") + 1).alias("c").cast(int),
    ]
    assigned = engine.assign(frame, new_columns)

    expected_rows = [
        [1, "2", 1, 3],
        [None, "2", 1, 3],
        [None, "1", 1, 2],
        [3, "4", 1, 5],
        [None, "4", 1, 5],
    ]
    df_eq(assigned, expected_rows, "a:double,b:str,x:long,c:long", throw=True)
def test_binary_op():
    """Check the string form of binary operator expressions.

    Each case pairs the expected text with the expression that should
    render to it.
    """
    cases = [
        # Arithmetic: operand order is kept as written.
        ("+(ab,1)", col("ab") + 1),
        ("+(ab,x)", col("ab") + col("x")),
        ("+('x',a)", "x" + col("a")),
        ("+('x','a')", "x" + lit("a")),
        ("-(a,1)", col("a") - 1),
        ("-(1.1,a)", 1.1 - col("a")),
        ("*(a,1)", col("a") * 1),
        ("*(1.1,a)", 1.1 * col("a")),
        ("/(a,1)", col("a") / 1),
        ("/(1.1,a)", 1.1 / col("a")),
        # Parenthesised and aliased forms (the alias case is checked twice,
        # as in the original sequence of assertions).
        ("+(ab,1)", (col("ab") + 1)),
        ("+(ab,1) AS xx", (col("ab") + 1).alias("xx")),
        ("+(ab,1) AS xx", (col("ab") + 1).alias("xx")),
        # Boolean operators with Python bools on either side.
        ("&(a,TRUE)", col("a") & True),
        ("&(TRUE,a)", True & col("a")),
        ("&(a,FALSE)", col("a") & False),
        ("&(FALSE,a)", False & col("a")),
        ("|(a,TRUE)", col("a") | True),
        ("|(TRUE,a)", True | col("a")),
        ("|(a,FALSE)", col("a") | False),
        ("|(FALSE,a)", False | col("a")),
        # Comparisons: with a plain float on the left the expected text has
        # the column first and the operator mirrored; with `lit(...)` on the
        # left the written order is kept.
        ("<(a,1)", col("a") < 1),
        ("<(a,b)", col("a") < col("b")),
        (">(a,1.1)", 1.1 < col("a")),
        ("<(1.1,a)", lit(1.1) < col("a")),
        ("<=(a,1)", col("a") <= 1),
        (">=(a,1.1)", 1.1 <= col("a")),
        (">(a,1)", col("a") > 1),
        ("<(a,1.1)", 1.1 > col("a")),
        (">=(a,1)", col("a") >= 1),
        ("<=(a,1.1)", 1.1 >= col("a")),
        ("==(a,1)", col("a") == 1),
        ("==(a,1.1)", 1.1 == col("a")),
        ("!=(a,1)", col("a") != 1),
        ("!=(a,1.1)", 1.1 != col("a")),
    ]
    for expected, expr in cases:
        assert expected == str(expr)
def test_is_agg():
    """Check which expressions `f.is_agg` reports as aggregations."""
    # Expressions containing an aggregation anywhere in the tree.
    aggregations = [
        f.first(col("a")),
        f.count_distinct(col("a")).alias("x"),
        f.first(col("a") + 1),
        f.first(col("a")) + 1,
        (f.first(col("a")) < 1).alias("x"),
        col("a") * f.first(col("a")) + 1,
    ]
    for expr in aggregations:
        assert f.is_agg(expr)

    # Plain columns, literals, arithmetic on columns and NULL are not.
    non_aggregations = [
        col("a"),
        lit("a"),
        col("a") + col("b"),
        null(),
    ]
    for expr in non_aggregations:
        assert not f.is_agg(expr)
def test_schema_inference():
    """Check `infer_type` results for columns, operators, literals and functions."""
    schema = Schema("a:int,b:str,c:bool,d:double")

    def inferred(expr):
        # Type inferred for `expr` against the fixed test schema.
        return expr.infer_type(schema)

    # int column: negation keeps the type, an explicit cast overrides it.
    assert pa.int32() == inferred(col("a"))
    assert pa.int32() == inferred(-col("a"))
    assert pa.int64() == inferred((-col("a")).cast(int))
    assert pa.int64() == inferred(-col("a").cast(int))

    # str column: unary operators give no inferred type.
    assert pa.string() == inferred(col("b"))
    assert inferred(-col("b")) is None
    assert inferred(~col("b")) is None

    # bool and double columns.
    assert pa.bool_() == inferred(col("c"))
    assert pa.bool_() == inferred((~col("c")).alias("x"))
    assert pa.float64() == inferred(col("d"))
    assert pa.float64() == inferred(-col("d").alias("x"))

    # Column absent from the schema, and the wildcard.
    assert inferred(col("x")) is None
    assert pa.string() == inferred(col("x").cast(str))
    assert inferred(col("*")) is None

    # Comparisons are boolean.
    assert pa.bool_() == inferred(col("a") < col("d"))
    assert pa.bool_() == inferred(col("a") > col("d"))
    assert pa.bool_() == inferred(col("a") <= col("d"))
    assert pa.bool_() == inferred(col("a") >= col("d"))
    assert pa.bool_() == inferred(col("a") == col("d"))
    assert pa.bool_() == inferred(col("a") != col("d"))
    assert pa.bool_() == inferred(~(col("a") != col("d")))
    assert pa.int64() == inferred((~(col("a") != col("d"))).cast(int))

    # Arithmetic between an int and a double column gives no inferred type.
    assert inferred(col("a") - col("d")) is None

    # Literals.
    assert pa.int64() == inferred(lit(1))
    assert pa.string() == inferred(lit("a"))
    assert pa.bool_() == inferred(lit(False))
    assert pa.string() == inferred(lit(False).cast(str))
    assert pa.float64() == inferred(lit(2.2))

    # NULL and arbitrary functions only have a type once cast.
    assert inferred(null()) is None
    assert pa.string() == inferred(null().cast(str))
    assert inferred(function("a", col("a").cast("int"))) is None
    assert pa.string() == inferred(function("a", col("a").cast("int")).cast(str))
def test_aggregate(self):
    """Check `aggregate` with and without partitioning, plus its invalid inputs."""
    engine = self.engine
    rows = [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]]
    source = ArrayDataFrame(rows, "a:double,b:int", {"a": 1})
    frame = engine.to_df(source)

    def max_b_columns():
        # Fresh expression objects for every call, as in the original test.
        return [
            ff.max(col("b")),
            (ff.max(col("b")) * 2).cast("int32").alias("c"),
        ]

    # No partitioning: a single output row.
    overall = engine.aggregate(
        df=frame,
        partition_spec=None,
        agg_cols=max_b_columns(),
    )
    df_eq(overall, [[4, 8]], "b:int,c:int", throw=True)

    # Partitioned by `a`: one row per key, including the None key.
    per_key = engine.aggregate(
        df=frame,
        partition_spec=PartitionSpec(by=["a"]),
        agg_cols=max_b_columns(),
    )
    df_eq(
        per_key,
        [[None, 4, 8], [1, 2, 4], [3, 4, 8]],
        "a:double,b:int,c:int",
        throw=True,
    )

    # A literal among the aggregation columns is rejected.
    with raises(ValueError):
        engine.aggregate(
            df=frame,
            partition_spec=PartitionSpec(by=["a"]),
            agg_cols=[ff.max(col("b")), lit(1)],
        )

    # An empty aggregation column list is rejected.
    with raises(ValueError):
        engine.aggregate(
            df=frame,
            partition_spec=PartitionSpec(by=["a"]),
            agg_cols=[],
        )
def test_correct_select_schema():
    """Check `correct_select_schema` against matching and mismatching output schemas."""
    input_schema = Schema("a:double,b:str")
    sql_gen = SQLExpressionGenerator()

    def corrected(select_cols, observed):
        # Correction computed for `select_cols` given the observed output schema.
        return sql_gen.correct_select_schema(
            input_schema, select_cols, Schema(observed)
        )

    # Wildcard plus a column that is not in the input schema.
    wildcard_cols = SelectColumns(col("*"), col("c"))
    assert corrected(wildcard_cols, "a:double,b:str,c:str") is None
    assert corrected(wildcard_cols, "a:int,b:int,c:str") == "a:double,b:str"

    # Aggregation and an aliased column absent from the input schema.
    assert corrected(
        SelectColumns(f.count(col("*")).alias("t"), col("c").alias("a")),
        "t:int,a:str",
    ) is None

    # An explicit cast and a literal are both expected in the correction.
    assert corrected(
        SelectColumns((col("a") + col("b")).cast(str).alias("a"), lit(1, "c")),
        "a:int,c:str",
    ) == "a:str,c:long"
def test_lit_col():
    """Check string rendering, null checks and uuid identity of literal expressions."""
    # NULL literal and its null checks.
    assert str(lit(None)) == "NULL"
    assert str(null().is_null()) == "TRUE"
    assert str(null().not_null()) == "FALSE"

    # String literals, including quote and backslash escaping.
    assert str(lit("a")) == "'a'"
    assert str(lit("a\"'\\")) == "'a\"\\'\\\\'"
    assert str(lit("a", "x")) == "'a' AS x"
    assert str(lit("a").not_null()) == "TRUE"
    assert str(lit("a").is_null()) == "FALSE"

    # Numeric and boolean literals.
    assert str(lit(1.1)) == "1.1"
    assert str(lit(11)) == "11"
    assert str(lit(True)) == "TRUE"
    assert str(lit(False)) == "FALSE"

    # Aliased literals.
    assert str(lit(1).alias("xx")) == "1 AS xx"
    assert str(lit("ab").alias("xx")) == "'ab' AS xx"

    # A list is not accepted as a literal value.
    raises(NotImplementedError, lambda: lit([1, 2]))

    # uuid distinguishes kind, value, alias and cast; alias/cast order does not matter.
    assert to_uuid(lit("a")) != to_uuid(col("a"))
    assert to_uuid(lit(1)) != to_uuid(lit("1"))
    assert to_uuid(null()) == to_uuid(null())
    assert to_uuid(null()) != to_uuid(lit(1))
    assert to_uuid(lit("a")) != to_uuid(lit("a").alias("v"))
    assert to_uuid(lit("a")) != to_uuid(lit("a").cast(int))
    cast_then_alias = lit("a").cast(int).alias("v")
    alias_then_cast = lit("a").alias("v").cast(int)
    assert to_uuid(cast_then_alias) == to_uuid(alias_then_cast)
def test_get_column_mentions():
    """Check that column mentions are collected from operands and function arguments."""
    left = col("a") + col("b")
    right = function("x", col("b"), a=col("c"), b=lit(1))
    mentioned = set(_get_column_mentions(left * right))
    # `b` appears twice and the literal contributes nothing.
    assert {"a", "b", "c"} == mentioned
def test_select_columns():
    """Check validation and classification behaviour of `SelectColumns`."""
    # not all with names
    unnamed = SelectColumns(
        col("a"), lit(1, "b"), col("bb") + col("cc"), f.first(col("c"))
    )
    assert to_uuid(unnamed) == to_uuid(unnamed)
    raises(ValueError, unnamed.assert_all_with_names)

    # distinct
    unnamed_distinct = SelectColumns(
        col("a"),
        lit(1, "b"),
        col("bb") + col("cc"),
        f.first(col("c")),
        arg_distinct=True,
    )
    assert to_uuid(unnamed) != to_uuid(unnamed_distinct)

    # duplicated names
    duplicated = SelectColumns(col("a").alias("b"), lit(1, "b"))
    not_duplicated = SelectColumns(col("a").alias("b"), lit(1, "c"))
    assert to_uuid(duplicated) != to_uuid(not_duplicated)
    raises(ValueError, duplicated.assert_all_with_names)

    # with *, all cols must have alias
    with_wildcard = SelectColumns(col("*"), col("a")).assert_no_agg()
    raises(ValueError, with_wildcard.assert_all_with_names)

    # * can be used at most once
    raises(
        ValueError,
        lambda: SelectColumns(col("*"), col("*"), col("a").alias("p")),
    )

    # * can't be used with aggregation
    raises(
        ValueError,
        lambda: SelectColumns(col("*"), f.first(col("a")).alias("x")),
    )

    # One column of each category: simple, literal, non-agg function, agg function.
    mixed = SelectColumns(
        col("aa").alias("a").cast(int),
        lit(1, "b"),
        (col("bb") + col("cc")).alias("c"),
        f.first(col("c")).alias("d"),
    ).assert_all_with_names()
    raises(AssertionError, mixed.assert_no_agg)
    assert not mixed.simple

    assert len(mixed.simple_cols) == 1
    assert str(mixed.simple_cols[0]) == "CAST(aa AS long) AS a"

    assert mixed.has_literals
    assert len(mixed.literals) == 1
    assert str(mixed.literals[0]) == "1 AS b"

    assert mixed.has_agg
    assert len(mixed.non_agg_funcs) == 1
    assert str(mixed.non_agg_funcs[0]) == "+(bb,cc) AS c"
    assert len(mixed.agg_funcs) == 1
    assert str(mixed.agg_funcs[0]) == "FIRST(c) AS d"

    assert len(mixed.group_keys) == 2  # a, c
    assert mixed.group_keys[0].output_name == "aa"
    assert mixed.group_keys[1].output_name == ""
    assert isinstance(mixed.group_keys[1], _BinaryOpExpr)

    # A single plain column is "simple" and has no literals or aggregations.
    single = SelectColumns(col("a")).assert_no_wildcard()
    assert single.simple
    assert not single.has_literals
    assert not single.has_agg

    # After wildcard replacement the first column is still `x`.
    expanded = SelectColumns(col("x"), col("*"), col("y") + col("z"))
    expanded = expanded.replace_wildcard(Schema("a:int,b:int"))
    assert str(expanded.all_cols[0]) == "x"
def test_select():
    """Check the full SELECT statements produced by `SQLExpressionGenerator.select`."""
    sql_gen = SQLExpressionGenerator()

    # no aggregation
    sql = sql_gen.select(SelectColumns(col("*")), "x")
    assert sql == "SELECT * FROM x"

    plain_cols = SelectColumns(
        col("a"), lit(1).alias("b"), (col("b") + col("c")).alias("x")
    )
    aliased_where = (col("a") > 5).alias("aa")
    sql = sql_gen.select(plain_cols, "t", where=aliased_where)
    # The alias on the WHERE expression does not appear in the SQL.
    assert sql == "SELECT a, 1 AS b, b+c AS x FROM t WHERE a>5"

    # aggregation without literals
    agg_cols = SelectColumns(f.max(col("c")).alias("c"), col("a", "aa"), col("b"))
    sql = sql_gen.select(agg_cols, "t")
    assert sql == "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b"

    where_expr = col("a") < 10
    having_expr = (f.max(col("a")) > 5).alias("aaa")
    sql = sql_gen.select(agg_cols, "t", where=where_expr, having=having_expr)
    assert sql == (
        "SELECT MAX(c) AS c, a AS aa, b FROM t WHERE a<10 GROUP BY a, b HAVING MAX(a)>5"
    )

    only_agg_cols = SelectColumns(
        f.min(col("c") + 1).alias("c"),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    sql = sql_gen.select(only_agg_cols, "t")
    assert sql == "SELECT MIN(c+1) AS c, CAST(AVG(d+e) AS long) AS d FROM t"

    # aggregation with literals
    literal_agg_cols = SelectColumns(
        lit(1, "k"),
        f.max(col("c")).alias("c"),
        lit(2, "j"),
        col("a", "aa"),
        col("b"),
    )
    sql = sql_gen.select(literal_agg_cols, "t")
    assert sql == (
        "SELECT 1 AS k, c, 2 AS j, aa, b FROM (SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b)"
    )

    literal_agg_cols = SelectColumns(
        lit(1, "k"), f.max(col("c")).alias("c"), lit(2, "j")
    )
    sql = sql_gen.select(literal_agg_cols, "t")
    assert sql == "SELECT 1 AS k, c, 2 AS j FROM (SELECT MAX(c) AS c FROM t)"

    literal_agg_cols = SelectColumns(
        lit(1, "k"), col("a"), f.max(col("c")).alias("c"), lit(2, "j")
    )
    sql = sql_gen.select(literal_agg_cols, "t")
    assert sql == (
        "SELECT 1 AS k, a, c, 2 AS j FROM (SELECT a, MAX(c) AS c FROM t GROUP BY a)"
    )

    # cast
    cast_cols = SelectColumns(
        col("c").cast(float),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    sql = sql_gen.select(cast_cols, "t")
    assert sql == (
        "SELECT CAST(c AS double) AS c, CAST(AVG(d+e) AS long) AS d FROM t GROUP BY c"
    )

    # infer alias
    inferred_alias_cols = SelectColumns(
        (-col("c")).cast(float),
        f.max(col("e")).cast(int),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    sql = sql_gen.select(inferred_alias_cols, "t")
    assert sql == (
        "SELECT CAST(-c AS double) AS c, CAST(MAX(e) AS long) AS e, "
        "CAST(AVG(d+e) AS long) AS d FROM t GROUP BY -c"
    )
def test_functions():
    """Check string form, inferred type and inferred alias of built-in functions."""
    # NOTE(review): another `test_functions` is visible earlier in this source; if
    # both live in the same module this definition replaces the earlier one --
    # confirm they belong to separate files.
    schema = Schema("a:int,b:str,c:bool,d:double")

    def check(expr, text, dtype):
        # Assert the rendered text, then the inferred type (None means "no type").
        assert text == str(expr)
        inferred = expr.infer_type(schema)
        if dtype is None:
            assert inferred is None
        else:
            assert dtype == inferred

    # COALESCE over mixed arguments has no inferred type.
    check(
        f.coalesce(col("a"), 1, None, col("b") + col("c")),
        "COALESCE(a,1,NULL,+(b,c))",
        None,
    )

    # MIN keeps the argument type; alias inference follows the inner column.
    min_a = f.min(col("a"))
    check(min_a, "MIN(a)", pa.int32())
    assert "MIN(a) AS a" == str(min_a.infer_alias())
    assert "CAST(MIN(a) AS long) AS a" == str(min_a.cast(int).infer_alias())
    assert "MIN(a) AS b" == str(min_a.alias("b").infer_alias())
    assert "MIN(-(a)) AS a" == str(f.min(-col("a")).infer_alias())
    check(f.min(lit(1.1)), "MIN(1.1)", pa.float64())

    # MAX / FIRST / LAST keep the argument type.
    check(f.max(col("a")), "MAX(a)", pa.int32())
    check(f.max(lit(1.1)), "MAX(1.1)", pa.float64())
    check(f.first(col("a")), "FIRST(a)", pa.int32())
    check(f.first(lit(1.1)), "FIRST(1.1)", pa.float64())
    check(f.last(col("a")), "LAST(a)", pa.int32())
    check(f.last(lit(1.1)), "LAST(1.1)", pa.float64())

    # AVG / SUM / COUNT have no inferred type.
    check(f.avg(col("a")), "AVG(a)", None)
    check(f.sum(col("a")), "SUM(a)", None)
    check(f.count(col("a")), "COUNT(a)", None)

    # COUNT(DISTINCT ...): an alias is inferred for a named column only.
    distinct_a = f.count_distinct(col("a"))
    check(distinct_a, "COUNT(DISTINCT a)", None)
    assert "COUNT(DISTINCT a) AS a" == str(distinct_a.infer_alias())

    distinct_all = f.count_distinct(col("*"))
    check(distinct_all, "COUNT(DISTINCT *)", None)
    assert "COUNT(DISTINCT *)" == str(distinct_all.infer_alias())
def test_select(self):
    """Check engine `select`: plain, distinct, wildcard, aggregation and having."""
    engine = self.engine
    rows = [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]]
    source = ArrayDataFrame(rows, "a:double,b:int", {"a": 1})
    frame = engine.to_df(source)

    # simple
    simple_cols = SelectColumns(col("b"), (col("b") + 1).alias("c").cast(str))
    simple = engine.select(frame, simple_cols)
    df_eq(
        simple,
        [[2, "3"], [2, "3"], [1, "2"], [4, "5"], [4, "5"]],
        "b:int,c:str",
        throw=True,
    )

    # with distinct
    distinct_cols = SelectColumns(
        col("b"),
        (col("b") + 1).alias("c").cast(str),
        arg_distinct=True,
    )
    distinct = engine.select(frame, distinct_cols)
    df_eq(
        distinct,
        [[2, "3"], [1, "2"], [4, "5"]],
        "b:int,c:str",
        throw=True,
    )

    # wildcard
    filtered = engine.select(
        frame,
        SelectColumns(col("*")),
        where=(col("a") + col("b")) == 3,
    )
    df_eq(filtered, [[1, 2]], "a:double,b:int", throw=True)

    # aggregation
    summed_cols = SelectColumns(col("a"), ff.sum(col("b")).cast(float).alias("b"))
    summed = engine.select(frame, summed_cols)
    df_eq(
        summed,
        [[1, 2], [3, 4], [None, 7]],
        "a:double,b:double",
        throw=True,
    )

    # having
    # https://github.com/fugue-project/fugue/issues/222
    sum_b = ff.sum(col("b"))
    having_cols = SelectColumns(col("a"), sum_b.cast(float).alias("b"))
    having_filtered = engine.select(
        frame,
        having_cols,
        having=(sum_b >= 7) | (col("a") == 1),
    )
    df_eq(
        having_filtered,
        [[1, 2], [None, 7]],
        "a:double,b:double",
        throw=True,
    )

    # literal + alias inference
    # https://github.com/fugue-project/fugue/issues/222
    sum_b = ff.sum(col("b"))
    literal_cols = SelectColumns(
        col("a"), lit(1, "o").cast(str), sum_b.cast(float)
    )
    with_literal = engine.select(
        frame,
        literal_cols,
        having=(sum_b >= 7) | (col("a") == 1),
    )
    df_eq(
        with_literal,
        [[1, "1", 2], [None, "1", 7]],
        "a:double,o:str,b:double",
        throw=True,
    )