def test_table_schema(c, df):
    """Exercise schema creation, switching, cross-schema access and removal.

    The ``df`` table is read unqualified and as ``root.df``; a schema ``foo``
    is created and selected, a table is copied into it, and lookups are
    checked both from inside ``foo`` and from ``root``.
    """
    baseline = c.sql("SELECT * FROM df")
    assert_eq(baseline, c.sql("SELECT * FROM root.df"))

    # Creating a schema alone must not change how ``df`` resolves.
    c.sql("CREATE SCHEMA foo")
    assert_eq(baseline, c.sql("SELECT * FROM df"))

    # After switching, the qualified name still reaches the original table.
    c.sql('USE SCHEMA "foo"')
    assert_eq(baseline, c.sql("SELECT * FROM root.df"))

    c.sql("CREATE TABLE bar AS TABLE root.df")
    assert_eq(baseline, c.sql("SELECT * FROM bar"))

    # ``other`` was never created, so this is expected to fail.
    with pytest.raises(KeyError):
        c.sql("CREATE TABLE other.bar AS TABLE df")

    # Back in ``root``: ``bar`` needs its schema prefix to be found.
    c.sql('USE SCHEMA "root"')
    assert_eq(baseline, c.sql("SELECT * FROM foo.bar"))

    with pytest.raises(ParsingException):
        c.sql("SELECT * FROM bar")

    # Once the schema is dropped, the qualified lookup has to fail as well.
    c.sql("DROP SCHEMA foo")

    with pytest.raises(ParsingException):
        c.sql("SELECT * FROM foo.bar")
def test_create_from_query(c, df):
    """Create ``new_table`` from a query, first as TABLE and then as VIEW.

    After each CREATE OR REPLACE statement, selecting everything from
    ``new_table`` must give back the contents of ``df``.
    """
    create_statements = (
        """ CREATE OR REPLACE TABLE new_table AS ( SELECT * FROM df ) """,
        """ CREATE OR REPLACE VIEW new_table AS ( SELECT * FROM df ) """,
    )

    for statement in create_statements:
        c.sql(statement)
        return_df = c.sql(""" SELECT * FROM new_table """)
        assert_eq(df, return_df)
def test_null(c):
    """Check ``IS NULL`` / ``IS NOT NULL`` on ``user_table_nan`` and ``string_table``."""

    def build_expected(not_null_flags, null_flags):
        # "nn" is converted to the nullable "boolean" dtype, "n" is left as
        # assigned, exactly as the comparison frame is expected to look.
        frame = pd.DataFrame(index=[0, 1, 2])
        frame["nn"] = not_null_flags
        frame["nn"] = frame["nn"].astype("boolean")
        frame["n"] = null_flags
        return frame

    nan_result = c.sql(
        """ SELECT c IS NOT NULL AS nn, c IS NULL AS n FROM user_table_nan """
    )
    assert_eq(
        nan_result,
        build_expected([True, False, True], [False, True, False]),
    )

    string_result = c.sql(
        """ SELECT a IS NOT NULL AS nn, a IS NULL AS n FROM string_table """
    )
    assert_eq(
        string_result,
        build_expected([True, True, True], [False, False, False]),
    )
def test_over_with_windows(c):
    """Check ``SUM(a) OVER (... ROWS BETWEEN ...)`` for ten different frames.

    A five-row table ``tmp`` with ``a = 0..4`` is registered and every frame
    variant is compared against hand-computed sums.
    """
    c.create_table("tmp", pd.DataFrame({"a": range(5)}))

    # One output column per window frame; the pieces concatenate into a
    # single SQL statement.
    query = (
        " SELECT a,"
        " SUM(a) OVER (ORDER BY a ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS O1,"
        " SUM(a) OVER (ORDER BY a ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) AS O2,"
        " SUM(a) OVER (ORDER BY a ROWS BETWEEN 2 PRECEDING AND UNBOUNDED FOLLOWING) AS O3,"
        " SUM(a) OVER (ORDER BY a ROWS BETWEEN CURRENT ROW AND 3 FOLLOWING) AS O4,"
        " SUM(a) OVER (ORDER BY a ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS O5,"
        " SUM(a) OVER (ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS O6,"
        " SUM(a) OVER (ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING) AS O7,"
        " SUM(a) OVER (ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS O8,"
        " SUM(a) OVER (ORDER BY a ROWS BETWEEN 3 FOLLOWING AND 3 FOLLOWING) AS O9,"
        " SUM(a) OVER (ORDER BY a ROWS BETWEEN 3 PRECEDING AND 1 PRECEDING) AS O10"
        " FROM tmp "
    )
    return_df = c.sql(query)

    # Frames that contain no rows are expected to yield a missing value.
    expected_columns = {
        "a": return_df.a,
        "O1": [0, 1, 3, 6, 9],
        "O2": [6, 10, 10, 10, 9],
        "O3": [10, 10, 10, 10, 9],
        "O4": [6, 10, 9, 7, 4],
        "O5": [10, 10, 9, 7, 4],
        "O6": [0, 1, 3, 6, 10],
        "O7": [6, 10, 10, 10, 10],
        "O8": [10, 10, 10, 10, 10],
        "O9": [3, 4, None, None, None],
        "O10": [None, 0, 1, 3, 6],
    }
    expected_df = pd.DataFrame(expected_columns)

    assert_eq(return_df, expected_df, check_dtype=False, check_index=False)
def test_string_filter(c, string_table):
    """Filter ``string_table`` on an exact string match in column ``a``.

    The result is compared against the first row of the fixture frame.
    """
    query = "SELECT * FROM string_table WHERE a = 'a normal string'"
    filtered = c.sql(query)

    assert_eq(filtered, string_table.head(1))
def test_describe_model(c, training_df):
    """Check that ``DESCRIBE MODEL`` lists the model parameters.

    ``training_df`` is requested but not referenced in the body; presumably
    it registers the ``timeseries`` table used by the query -- verify against
    the fixture definition.
    """
    c.sql(
        " CREATE MODEL ex_describe_model WITH ("
        " model_class = 'sklearn.ensemble.GradientBoostingClassifier',"
        " wrap_predict = True,"
        " target_column = 'target'"
        " ) AS ("
        " SELECT x, y, x*y > 0 AS target"
        " FROM timeseries"
        " LIMIT 100"
        " ) "
    )

    # Rebuild the expected description straight from the stored model.
    model, training_columns = c.schema[c.schema_name].models["ex_describe_model"]
    params = model.get_params()
    params["training_columns"] = training_columns.tolist()

    # hack for converting model class into string: every value is
    # stringified so both sides compare as text.
    params_frame = pd.DataFrame.from_dict(params, orient="index", columns=["Params"])
    expected_series = params_frame["Params"].apply(str).sort_index()

    described = c.sql("DESCRIBE MODEL ex_describe_model")
    result = described["Params"].apply(str)

    assert_eq(expected_series, result)

    # Describing a model that was never created must raise.
    with pytest.raises(RuntimeError):
        c.sql("DESCRIBE MODEL undefined_model")
def test_boolean_operations(c):
    """Check ``IS [NOT] TRUE / FALSE / UNKNOWN`` on a column holding True, False and NA."""
    input_ddf = dd.from_pandas(pd.DataFrame({"b": [1, 0, -1]}), npartitions=1)
    # turn into a bool column: negative values become NA, the rest map to x > 0
    input_ddf["b"] = input_ddf["b"].apply(
        lambda x: pd.NA if x < 0 else x > 0, meta=("b", "bool")
    )
    c.create_table("df", input_ddf)

    result_df = c.sql(
        " SELECT b IS TRUE AS t,"
        " b IS FALSE AS f,"
        " b IS NOT TRUE AS nt,"
        " b IS NOT FALSE AS nf,"
        " b IS UNKNOWN AS u,"
        " b IS NOT UNKNOWN AS nu"
        " FROM df"
    )

    expected_df = pd.DataFrame(
        {
            "t": [True, False, False],
            "f": [False, True, False],
            "nt": [False, True, True],
            "nf": [True, False, True],
            "u": [False, False, True],
            "nu": [True, True, False],
        },
        dtype="bool",
    )
    # These three columns are compared as the nullable "boolean" dtype.
    for column in ("nt", "nf", "nu"):
        expected_df[column] = expected_df[column].astype("boolean")

    assert_eq(result_df, expected_df)
def test_filtered_csv(tmpdir, c):
    """Run a WHERE query against a CSV-backed table and check the result.

    Predicate pushdown is NOT supported for CSV data. This test just checks
    that the "attempted" predicate-pushdown logic does not lead to any
    unexpected errors.
    """
    # Write simple csv dataset
    df = pd.DataFrame(
        {
            "a": [1, 2, 3] * 5,
            "b": range(15),
            "c": ["A"] * 15,
        },
    )
    dd.from_pandas(df, npartitions=3).to_csv(tmpdir + "/*.csv", index=False)

    # Read back with dask and apply WHERE query
    csv_ddf = dd.read_csv(tmpdir + "/*.csv")

    # Register the table *before* entering the try block: if registration
    # itself fails there is nothing to clean up, and running drop_table in
    # ``finally`` for a table that was never created could raise a second
    # error that hides the real one.
    c.create_table("my_csv_table", csv_ddf)
    try:
        return_df = c.sql("SELECT * FROM my_csv_table WHERE b < 10")
    finally:
        c.drop_table("my_csv_table")

    # Check computed result is correct
    expected_df = csv_ddf[csv_ddf["b"] < 10]
    assert_eq(return_df, expected_df)
def test_group_by_nan(c):
    """Group by a column containing NaN, then by a column containing infinity."""

    def ordered(frame):
        # Sort by the grouped column and renumber the index.
        return frame.sort_values("c").reset_index(drop=True)

    nan_groups = c.sql(""" SELECT c FROM user_table_nan GROUP BY c """)
    expected_nan = pd.DataFrame({"c": [3, float("nan"), 1]})
    # we return nullable int dtype instead of float
    assert_eq(nan_groups, expected_nan, check_dtype=False)

    inf_groups = c.sql(""" SELECT c FROM user_table_inf GROUP BY c """)
    expected_inf = pd.DataFrame({"c": [3, 1, float("inf")]})
    expected_inf["c"] = expected_inf["c"].astype("float64")

    # Both sides are sorted first, so group order does not matter here.
    assert_eq(ordered(inf_groups), ordered(expected_inf))
def test_over_calls(c, user_table_1):
    """Check several window functions partitioned by ``user_id`` and ordered by ``b``."""
    # One output column per window function; the pieces concatenate into a
    # single SQL statement.
    query = (
        " SELECT user_id, b,"
        " ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY b) AS O1,"
        " FIRST_VALUE(user_id*10 - b) OVER (PARTITION BY user_id ORDER BY b) AS O2,"
        " SINGLE_VALUE(user_id*10 - b) OVER (PARTITION BY user_id ORDER BY b) AS O3,"
        " LAST_VALUE(user_id*10 - b) OVER (PARTITION BY user_id ORDER BY b"
        " ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS O4,"
        " SUM(user_id) OVER (PARTITION BY user_id ORDER BY b) AS O5,"
        " AVG(user_id) OVER (PARTITION BY user_id ORDER BY b) AS O6,"
        " COUNT(*) OVER (PARTITION BY user_id ORDER BY b) AS O7,"
        " COUNT(b) OVER (PARTITION BY user_id ORDER BY b) AS O7b,"
        " MAX(b) OVER (PARTITION BY user_id ORDER BY b) AS O8,"
        " MIN(b) OVER (PARTITION BY user_id ORDER BY b) AS O9"
        " FROM user_table_1 "
    )
    return_df = c.sql(query)

    expected_columns = {
        "user_id": user_table_1.user_id,
        "b": user_table_1.b,
        "O1": [2, 1, 1, 1],
        "O2": [19, 7, 19, 27],
        "O3": [19, 7, 19, 27],
        "O4": [17, 7, 17, 27],
        "O5": [4, 1, 2, 3],
        "O6": [2, 1, 2, 3],
        "O7": [2, 1, 1, 1],
        "O7b": [2, 1, 1, 1],
        "O8": [3, 3, 1, 3],
        "O9": [1, 3, 1, 3],
    }
    expected_df = pd.DataFrame(expected_columns)

    assert_eq(return_df, expected_df, check_dtype=False, check_index=False)
def test_groupby_split_every(c, gpu, split_every, expected_keys):
    """Check that ``sql.groupby.split_every`` is applied to a GROUP BY query.

    The test compares the number of keys in the result's task graph with
    ``expected_keys`` and the values with an equivalent dask groupby using
    the same ``split_every``.
    """
    # Need an input with multiple partitions to demonstrate split_every
    input_ddf = dd.from_pandas(
        pd.DataFrame({"user_id": [1, 2, 3, 4] * 16, "b": [5, 6, 7, 8] * 16}),
        npartitions=16,
    )

    c.create_table("split_every_input", input_ddf, gpu=gpu)
    # try/finally so the table is dropped even when an assertion below
    # fails; otherwise the cleanup would be skipped on failure.
    try:
        return_df = c.sql(
            """ SELECT user_id, SUM(b) AS "S" FROM split_every_input GROUP BY user_id """,
            config_options={"sql.groupby.split_every": split_every},
        )
        expected_df = (
            input_ddf.groupby(by="user_id")
            .agg({"b": "sum"}, split_every=split_every)
            .reset_index(drop=False)
            .rename(columns={"b": "S"})
            .sort_values("user_id")
        )

        assert len(return_df.dask.keys()) == expected_keys
        assert_eq(return_df, expected_df, check_index=False)
    finally:
        c.drop_table("split_every_input")
def test_sort_with_nan_many_partitions(gpu):
    """Sort NaN-containing data that is spread over ten partitions.

    The first query places NaN first explicitly (``NULLS FIRST``); the
    second uses the default ordering and expects NaN last.
    """
    c = Context()
    nan = float("nan")

    two_columns = pd.DataFrame(
        {
            "a": [nan, 1] * 30,
            "b": [1, 2, 3] * 20,
        }
    )
    c.create_table("df", dd.from_pandas(two_columns, npartitions=10), gpu=gpu)

    nulls_first = c.sql("SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST")
    expected_nulls_first = pd.DataFrame(
        {
            "a": [nan] * 30 + [1] * 30,
            # the b = 1, 2, 3 run (ten rows each) repeats for each value of a
            "b": ([1] * 10 + [2] * 10 + [3] * 10) * 2,
        }
    )
    assert_eq(nulls_first, expected_nulls_first, check_index=False)

    one_column = pd.DataFrame({"a": [nan, 1] * 30})
    # NOTE(review): this second registration does not pass ``gpu=gpu``,
    # unlike the first one -- confirm whether that is intentional.
    c.create_table("df", dd.from_pandas(one_column, npartitions=10))

    default_order = c.sql("SELECT * FROM df ORDER BY a")
    expected_default = pd.DataFrame({"a": [1] * 30 + [nan] * 30})
    assert_eq(default_order, expected_default, check_index=False)
def test_join_literal(c):
    """Join on the literal conditions ``True`` and ``False``.

    ``ON True`` is expected to pair every left row with every right row;
    ``ON False`` is expected to return no rows.
    """
    all_pairs = c.sql(
        " SELECT lhs.user_id, lhs.b, rhs.user_id, rhs.c"
        " FROM user_table_1 AS lhs"
        " JOIN user_table_2 AS rhs"
        " ON True "
    )
    # The duplicated ``user_id`` output column is expected as ``user_id0``.
    expected_pairs = pd.DataFrame(
        {
            "user_id": [2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
            "b": [1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
            "user_id0": [1, 1, 2, 4, 1, 1, 2, 4, 1, 1, 2, 4, 1, 1, 2, 4],
            "c": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4],
        }
    )
    assert_eq(all_pairs, expected_pairs, check_index=False)

    no_pairs = c.sql(
        " SELECT lhs.user_id, lhs.b, rhs.user_id, rhs.c"
        " FROM user_table_1 AS lhs"
        " JOIN user_table_2 AS rhs"
        " ON False "
    )
    expected_empty = pd.DataFrame(
        {column: [] for column in ("user_id", "b", "user_id0", "c")}
    )
    assert_eq(no_pairs, expected_empty, check_dtype=False, check_index=False)
def test_operators(c, df):
    """Compare SQL arithmetic and comparison operators with their pandas counterparts."""
    result_df = c.sql(
        " SELECT"
        " a * b AS m,"
        " -a AS u,"
        " a / b AS q,"
        " a + b AS s,"
        " a - b AS d,"
        " a = b AS e,"
        " a > b AS g,"
        " a >= b AS ge,"
        " a < b AS l,"
        " a <= b AS le,"
        " a <> b AS n"
        " FROM df "
    )

    a, b = df["a"], df["b"]
    # Keyed by output alias, in the same order as the SELECT list.
    expected_columns = {
        "m": a * b,
        "u": -a,
        "q": a / b,
        "s": a + b,
        "d": a - b,
        "e": a == b,
        "g": a > b,
        "ge": a >= b,
        "l": a < b,
        "le": a <= b,
        "n": a != b,
    }

    expected_df = pd.DataFrame(index=df.index)
    for alias, values in expected_columns.items():
        expected_df[alias] = values

    assert_eq(result_df, expected_df)
def test_multiple_definitions(c, df_simple):
    """Register ``f`` for two signatures, then replace it with a new body.

    The function is first registered for float64 and int64 inputs and
    called as both ``F`` and ``f``. It is then redefined and registered
    again, the first time with ``replace=True``.
    """
    query = """ SELECT F(a) AS a, f(b) AS b FROM df_simple """
    selected = df_simple[["a", "b"]]

    def f(x):
        return x**2

    c.register_function(f, "f", [("x", np.float64)], np.float64)
    c.register_function(f, "f", [("x", np.int64)], np.int64)

    squared = c.sql(query)
    assert_eq(squared, selected**2)

    def f(x):
        return x**3

    # Only the first re-registration passes ``replace=True``.
    c.register_function(f, "f", [("x", np.float64)], np.float64, replace=True)
    c.register_function(f, "f", [("x", np.int64)], np.int64)

    cubed = c.sql(query)
    assert_eq(cubed, selected**3)
def test_case(c, df):
    """Check ``CASE WHEN`` expressions against equivalent Python conditionals on ``df.a``."""
    result_df = c.sql(
        ' SELECT'
        ' (CASE WHEN a = 3 THEN 1 END) AS "S1",'
        ' (CASE WHEN a > 0 THEN a ELSE 1 END) AS "S2",'
        ' (CASE WHEN a = 4 THEN 3 ELSE a + 1 END) AS "S3",'
        ' (CASE WHEN a = 3 THEN 1 WHEN a > 0 THEN 2 ELSE a END) AS "S4",'
        " CASE WHEN (a >= 1 AND a < 2) OR (a > 2) THEN CAST('in-between' AS VARCHAR)"
        " ELSE CAST('out-of-range' AS VARCHAR) END AS"
        ' "S5",'
        ' CASE WHEN (a < 2) OR (3 < a AND a < 4) THEN 42 ELSE 47 END AS "S6",'
        ' CASE WHEN (1 < a AND a <= 4) THEN 1 ELSE 0 END AS "S7"'
        ' FROM df '
    )

    # Python equivalent of each CASE expression, keyed by output alias and
    # listed in SELECT order.
    python_cases = {
        "S1": lambda a: 1 if a == 3 else pd.NA,
        "S2": lambda a: a if a > 0 else 1,
        "S3": lambda a: 3 if a == 4 else a + 1,
        "S4": lambda a: 1 if a == 3 else 2 if a > 0 else a,
        "S5": lambda a: "in-between" if ((1 <= a < 2) or (a > 2)) else "out-of-range",
        "S6": lambda a: 42 if ((a < 2) or (3 < a < 4)) else 47,
        "S7": lambda a: 1 if (1 < a <= 4) else 0,
    }

    expected_df = pd.DataFrame(index=df.index)
    for alias, case_function in python_cases.items():
        expected_df[alias] = df.a.apply(case_function)

    # Do not check dtypes, as pandas versions are inconsistent here
    assert_eq(result_df, expected_df, check_dtype=False)
def test_filter_complicated(c, df):
    """Filter with a nested ``AND`` condition and compare with boolean masks."""
    return_df = c.sql("SELECT * FROM df WHERE a < 3 AND (b > 1 AND b < 3)")

    # Same grouping as the SQL: a < 3 AND (b > 1 AND b < 3).
    a_below_three = df["a"] < 3
    b_in_range = (df["b"] > 1) & (df["b"] < 3)
    expected_df = df[a_below_three & b_in_range]

    assert_eq(return_df, expected_df)
def test_timezones(c, datetime_table):
    """Selecting everything from ``datetime_table`` must return the fixture unchanged."""
    query = """ SELECT * FROM datetime_table """
    selected = c.sql(query)

    assert_eq(selected, datetime_table)
def test_select(hive_cursor):
    """Register a table from ``hive_cursor`` and read it back through SQL."""
    context = Context()
    context.create_table("df", hive_cursor)

    selected = context.sql("SELECT * FROM df")

    # The expected data is compared as int32.
    expected = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")
    assert_eq(selected, expected, check_index=False)
def test_select_alias(c, df):
    """Swap columns ``a`` and ``b`` through aliases and check the values follow."""
    swapped = c.sql("SELECT a as b, b as a FROM df")

    expected = pd.DataFrame(index=df.index)
    expected["b"] = df.a
    expected["a"] = df.b

    # Both frames are compared in the same explicit column order.
    column_order = ["a", "b"]
    assert_eq(swapped[column_order], expected[column_order])
def test_tables(gpu):
    """``SHOW TABLES`` on the context's own schema lists the one registered table."""
    context = Context()
    context.create_table("table", pd.DataFrame(), gpu=gpu)

    listing = context.sql(f'SHOW TABLES FROM "{context.schema_name}"')

    expected = pd.DataFrame({"Table": ["table"]})
    assert_eq(listing, expected, check_index=False)
def test_math_operations(c, df):
    """Compare the SQL math functions with their NumPy / pandas equivalents."""
    result_df = c.sql(
        ' SELECT ABS(b) AS "abs"'
        ' , ACOS(b) AS "acos"'
        ' , ASIN(b) AS "asin"'
        ' , ATAN(b) AS "atan"'
        ' , ATAN2(a, b) AS "atan2"'
        ' , CBRT(b) AS "cbrt"'
        ' , CEIL(b) AS "ceil"'
        ' , COS(b) AS "cos"'
        ' , COT(b) AS "cot"'
        ' , DEGREES(b) AS "degrees"'
        ' , EXP(b) AS "exp"'
        ' , FLOOR(b) AS "floor"'
        ' , LOG10(b) AS "log10"'
        ' , LN(b) AS "ln"'
        ' , MOD(b, 4) AS "mod"'
        ' , POWER(b, 2) AS "power"'
        ' , POWER(b, a) AS "power2"'
        ' , RADIANS(b) AS "radians"'
        ' , ROUND(b) AS "round"'
        ' , ROUND(b, 3) AS "round2"'
        ' , SIGN(b) AS "sign"'
        ' , SIN(b) AS "sin"'
        ' , TAN(b) AS "tan"'
        ' , TRUNCATE(b) AS "truncate"'
        ' FROM df '
    )

    # Reference values keyed by output alias, in SELECT order.
    reference = {
        "abs": df.b.abs(),
        "acos": np.arccos(df.b),
        "asin": np.arcsin(df.b),
        "atan": np.arctan(df.b),
        "atan2": np.arctan2(df.a, df.b),
        "cbrt": np.cbrt(df.b),
        "ceil": np.ceil(df.b),
        "cos": np.cos(df.b),
        "cot": 1 / np.tan(df.b),
        "degrees": df.b / np.pi * 180,
        "exp": np.exp(df.b),
        "floor": np.floor(df.b),
        "log10": np.log10(df.b),
        "ln": np.log(df.b),
        "mod": np.mod(df.b, 4),
        "power": np.power(df.b, 2),
        "power2": np.power(df.b, df.a),
        "radians": df.b / 180 * np.pi,
        "round": np.round(df.b),
        "round2": np.round(df.b, 3),
        "sign": np.sign(df.b),
        "sin": np.sin(df.b),
        "tan": np.tan(df.b),
        "truncate": np.trunc(df.b),
    }

    expected_df = pd.DataFrame(index=df.index)
    for alias, values in reference.items():
        expected_df[alias] = values

    assert_eq(result_df, expected_df)
def test_show_tables_no_schema(c):
    """``show tables`` without a schema name lists the registered table."""
    # The fixture-provided ``c`` is deliberately replaced by a fresh context
    # here, so only the table registered below exists in it.
    c = Context()
    c.create_table("test", pd.DataFrame({"id": [0, 1]}))

    listing = c.sql("show tables").compute()

    assert_eq(listing, pd.DataFrame({"Table": ["test"]}))
def test_limit(c, input_table, limit, offset, request):
    """Check ``LIMIT`` / ``OFFSET`` against positional slicing of the fixture.

    ``input_table`` names the fixture to load; a falsy ``limit`` means the
    query is issued with ``OFFSET`` only.
    """
    source_table = request.getfixturevalue(input_table)

    if limit:
        query = f"SELECT * FROM long_table LIMIT {limit} OFFSET {offset}"
    else:
        query = f"SELECT * FROM long_table OFFSET {offset}"

    # Without a limit the slice is open-ended.
    stop = offset + limit if limit else None

    assert_eq(c.sql(query), source_table.iloc[offset:stop])
def test_custom_function(c, df):
    """Register a squaring function as ``f`` and call it from SQL as ``F``."""

    def f(x):
        return x**2

    c.register_function(f, "f", [("x", np.float64)], np.float64)

    squared = c.sql("SELECT F(a) AS a FROM df")
    expected = df[["a"]] ** 2

    assert_eq(squared, expected)
def test_custom_function_row(c, df):
    """Register a squaring function with ``row_udf=True`` and call it from SQL."""

    def f(row):
        # With ``row_udf=True`` the argument is read by its declared name.
        return row["x"]**2

    c.register_function(f, "f", [("x", np.float64)], np.float64, row_udf=True)

    squared = c.sql("SELECT F(a) AS a FROM df")
    expected = df[["a"]] ** 2

    assert_eq(squared, expected)
def test_nan():
    """Check that ``IsNullOperation`` treats ``None``, NaN and ``pd.NA`` as null.

    Scalars are checked directly; Series inputs must give an element-wise
    boolean result.
    """
    op = call.IsNullOperation()

    # ``np.nan`` is used instead of the ``np.NaN`` alias, which was removed
    # in NumPy 2.0; the lowercase spelling works on every NumPy version.
    assert op(None)
    assert op(np.nan)
    assert op(pd.NA)

    assert_eq(op(pd.Series(["a", None, "c"])), pd.Series([False, True, False]))
    assert_eq(
        op(pd.Series([3, 2, np.nan, pd.NA])),
        pd.Series([False, False, True, True]),
    )
def test_filter_cast_timestamp(c, input_table, request):
    """Filter on ``CAST(timezone AS TIMESTAMP)`` compared with a timestamp literal.

    ``input_table`` is both the name of the fixture to load and the table
    name used in the query.
    """
    datetime_table = request.getfixturevalue(input_table)

    return_df = c.sql(
        f" SELECT * FROM {input_table}"
        " WHERE CAST(timezone AS TIMESTAMP) >= TIMESTAMP '2014-08-01 23:00:00' "
    )

    # Mirror the SQL cast before comparing against the same cut-off.
    cutoff = pd.Timestamp("2014-08-01 23:00:00")
    as_timestamp = datetime_table["timezone"].astype("<M8[ns]")
    expected_df = datetime_table[as_timestamp >= cutoff]

    assert_eq(return_df, expected_df)
def test_literal_null(c):
    """Select a bare ``NULL`` and an arithmetic expression involving ``NULL``."""
    null_row = c.sql(""" SELECT NULL AS "N", 1 + NULL AS "I" """)

    expected = pd.DataFrame({"N": [pd.NA], "I": [pd.NA]})
    # Only the arithmetic column is expected as nullable Int32.
    expected["I"] = expected["I"].astype("Int32")

    assert_eq(null_row, expected)
def test_filter_year(c):
    """Filter on ``year(dt)`` and compare with filtering the ``year`` column directly."""
    dates = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
    # ``dt`` is built from the year/month/day columns of the same frame.
    dates["dt"] = pd.to_datetime(dates)
    c.create_table("datetime_test", dates)

    filtered = c.sql("select * from datetime_test where year(dt) < 2016")
    expected = dates[dates["year"] < 2016]

    assert_eq(expected, filtered)