def test_explain(gpu):
    """EXPLAIN output must reflect table statistics: the default row-count
    estimate for plain tables, explicit Statistics when given, and the
    default again for ad-hoc dataframes."""
    ctx = Context()
    frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    ctx.create_table("df", frame, gpu=gpu)

    plan = ctx.explain("SELECT * FROM df")
    assert plan.startswith(
        "DaskTableScan(table=[[root, df]]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = "
    )

    # Re-registering with explicit statistics must change the estimated row count.
    ctx.create_table("df", frame, statistics=Statistics(row_count=1337), gpu=gpu)
    plan = ctx.explain("SELECT * FROM df")
    assert plan.startswith(
        "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = "
    )

    # A dataframe passed ad-hoc via `dataframes=` gets the default statistics.
    ctx = Context()
    frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    plan = ctx.explain(
        "SELECT * FROM other_df", dataframes={"other_df": frame}, gpu=gpu
    )
    assert plan.startswith(
        "DaskTableScan(table=[[root, other_df]]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = "
    )
class TimeDaskSQLJoins:
    """ASV-style benchmark: time dask-sql on chains of multi-column equi-joins.

    `params`/`param_names` follow the airspeed-velocity convention: the
    benchmark is run once per combination of rows, columns, join-column count,
    chained-join count and distinct-value ratio.
    """

    params = [[10**4, 10**5], [10], [3], [6], [0.75]]
    # NOTE(review): "dictinct" typo kept as-is — ASV reports these labels verbatim.
    param_names = [
        "rows",
        "cols",
        "number of join columns",
        "number of chained joins",
        "ratio of dictinct elements",
    ]

    def setup(self, N, ncols, njoin_columns, njoins, distinct_r):
        # _generate_dataframes is defined elsewhere in this module; presumably
        # it returns njoins+1 pandas frames sharing njoin_columns key columns
        # — TODO confirm against its definition.
        self.dfs = _generate_dataframes(N, ncols, njoin_columns, njoins, distinct_r)
        self.dfs = [dd.from_pandas(d, npartitions=1) for d in self.dfs]
        # Columns common to the first two frames are used as the join keys.
        self.join_cols = [
            c for c in self.dfs[0].columns if c in self.dfs[1].columns
        ]
        self.ctx = Context()
        self._create_tables()
        self._create_sql_query()

    def _create_tables(self):
        # Register each frame with dask-sql and mirror it as a lightweight
        # SQLAlchemy table object so the SQL text can be built programmatically.
        self.tables = []
        for i, df in enumerate(self.dfs):
            _table_name = f"table_{i:03}"
            self.ctx.create_table(_table_name, df)
            _table = table(_table_name, *[column(c) for c in df.columns])
            self.tables.append(_table)

    def _create_sql_query(self):
        # Build SELECT ... FROM t0 JOIN t1 ON ... JOIN t2 ON ... where each ON
        # clause equates all join columns; key columns are selected only once.
        left = self.tables[0]
        joinq = left
        select_cols = list(left.c)
        for right in self.tables[1:]:
            on = and_(
                left.c.get(col) == right.c.get(col) for col in self.join_cols)
            joinq = joinq.join(right, on)
            select_cols += [c for c in right.c if c.name not in self.join_cols]
        query = select(*select_cols).select_from(joinq)
        # Render to a literal (no bind parameters) PostgreSQL-dialect SQL
        # string, which ctx.sql() can consume directly.
        self.sql_query = str(
            query.compile(
                dialect=postgresql.dialect(),
                compile_kwargs={"literal_binds": True},
            ))

    def time_joins(self, N, ncols, njoin_columns, njoins, distinct_r):
        # Times plan creation (ctx.sql) and execution (compute) separately,
        # printing both durations; ASV measures the whole method.
        start = time.perf_counter()
        print(f"Processing SQL query: {self.sql_query}")
        res = self.ctx.sql(self.sql_query)
        stop = time.perf_counter()
        print(f"Processing SQL query took {stop-start:0.4f} s.")

        start = time.perf_counter()
        print("Computing dask dataframe")
        res.compute()
        stop = time.perf_counter()
        print(f"Computing dask dataframe took {stop-start:0.4f} s.")

        # Visualize task graph
        # res.visualize('taskgraph.png')
        return res
def test_tables(gpu):
    """SHOW TABLES must list a freshly registered (empty) table."""
    ctx = Context()
    ctx.create_table("table", pd.DataFrame(), gpu=gpu)

    result_df = ctx.sql(f'SHOW TABLES FROM "{ctx.schema_name}"')
    assert_eq(result_df, pd.DataFrame({"Table": ["table"]}), check_index=False)
def test_show_tables_no_schema(c):
    """`show tables` without an explicit schema lists tables of the default schema."""
    # NOTE(review): the injected fixture `c` is immediately shadowed by a
    # fresh Context; the parameter is kept only so the fixture still runs.
    c = Context()
    c.create_table("test", pd.DataFrame({"id": [0, 1]}))

    actual_df = c.sql("show tables").compute()
    assert_eq(actual_df, pd.DataFrame({"Table": ["test"]}))
def test_dask_sql_sg_logistic_regression(
    datatype, nrows, ncols, n_parts, wrap_predict
):
    """Train a cuML logistic regression through dask-sql's CREATE MODEL and
    check its accuracy against a scikit-learn baseline (with tolerance)."""
    cuml.set_global_output_type("input" if wrap_predict else "cudf")

    X, y = make_classification(
        n_samples=nrows, n_features=ncols, n_informative=5, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    feature_names = [chr(i) for i in range(ncols)]

    train_df = cudf.DataFrame(X_train, dtype=datatype, columns=feature_names)
    train_df["target"] = y_train
    train_ddf = dask_cudf.from_cudf(train_df, npartitions=n_parts)

    c = Context()
    c.create_table("train_df", train_ddf)

    # Train the model via SQL (wrap_predict is interpolated as a literal).
    c.sql(
        f"""
        CREATE MODEL model WITH (
            model_class = 'cuml.linear_model.LogisticRegression',
            wrap_predict = {wrap_predict},
            target_column = 'target'
        ) AS (
            SELECT * FROM train_df
        )
        """
    )

    # Reference model trained directly with scikit-learn.
    skmodel = LogisticRegression().fit(X_train, y_train)

    test_df = cudf.DataFrame(X_test, dtype=datatype, columns=feature_names)
    c.create_table("test_df", dask_cudf.from_cudf(test_df, npartitions=n_parts))

    preds = c.sql(
        """
        SELECT * FROM PREDICT(
            MODEL model,
            SELECT * FROM test_df
        )
        """
    ).compute()

    score = cuml.metrics.accuracy_score(y_test, preds["target"].to_numpy())
    # Allow a small tolerance below the scikit-learn baseline accuracy.
    assert score >= skmodel.score(X_test, y_test) - 0.022
def main():  # pragma: no cover
    """
    CLI version of the :func:`run_server` function.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="The host interface to listen on (defaults to all interfaces)",
    )
    parser.add_argument(
        "--port", default=8080, help="The port to listen on (defaults to 8080)"
    )
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=uvicorn.config.LOG_LEVELS,
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )
    arguments = parser.parse_args()

    # Only connect to a distributed scheduler when an address was given.
    dask_client = (
        dask.distributed.Client(arguments.scheduler_address)
        if arguments.scheduler_address
        else None
    )

    ctx = Context()
    if arguments.load_test_data:
        timeseries = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        ctx.create_table("timeseries", timeseries.persist())

    run_server(
        context=ctx,
        client=dask_client,
        host=arguments.host,
        port=arguments.port,
        startup=arguments.startup,
        log_level=arguments.log_level,
    )
def select(
    self, dfs: fugue.dataframe.DataFrames, statement: str
) -> fugue.dataframe.DataFrame:
    """Send the SQL command to the dask-sql context and register all temporary dataframes"""
    ctx = Context()
    # Register every incoming fugue dataframe under its name, using the
    # underlying (native) dask frame of the execution engine's representation.
    for name, frame in dfs.items():
        ctx.create_table(name, self.execution_engine.to_df(frame).native)
    return fugue_dask.dataframe.DaskDataFrame(ctx.sql(statement))
def test_sql():
    """c.sql returns a lazy dask frame by default and a concrete pandas
    frame when return_futures=False."""
    ctx = Context()
    ctx.create_table(
        "df", dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    )

    lazy = ctx.sql("SELECT * FROM df")
    assert isinstance(lazy, dd.DataFrame)

    eager = ctx.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(eager, pd.DataFrame)
def c():
    """Pytest fixture: a Context with one single-row table registered under
    the module-level `schema`/`table` names; the schema is dropped on teardown.

    Yields:
        Context: the prepared dask-sql context.
    """
    c = Context()
    c.create_schema(schema)

    row = create_table_row()
    # pd.DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # build the one-row frame directly from the row instead.
    tables = pd.DataFrame([row])
    tables = tables.astype({"AN_INT": "int64"})

    c.create_table(table, tables, schema_name=schema)

    yield c

    c.drop_schema(schema)
def test_explain():
    """The EXPLAIN output for a trivial projection is a two-line logical plan."""
    ctx = Context()
    ctx.create_table(
        "df", dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    )

    plan = ctx.explain("SELECT * FROM df")
    assert (
        plan
        == f"LogicalProject(a=[$0]){os.linesep} LogicalTableScan(table=[[schema, df]]){os.linesep}"
    )
def test_query_case_sensitivity():
    """With sql.identifier.case_sensitive=False, an upper-cased identifier
    must resolve against the lower-cased column without a parse error."""
    ctx = Context()
    ctx.create_table("test", pd.DataFrame({"id": [0, 1]}))

    try:
        ctx.sql(
            "select ID from test",
            config_options={"sql.identifier.case_sensitive": False},
        )
    except ParsingException as pe:
        assert False, f"Queries should be case insensitve but raised exception {pe}"
def eq_sqlite(sql, **dfs):
    """Run ``sql`` through both dask-sql and sqlite and assert equal results.

    Each keyword argument is registered as a table in the dask-sql context
    and written to an in-memory sqlite database under the same name.

    Args:
        sql: the query to execute on both engines.
        **dfs: name -> pandas DataFrame pairs to register as tables.
    """
    c = Context()
    engine = sqlite3.connect(":memory:")

    for name, df in dfs.items():
        c.create_table(name, df)
        df.to_sql(name, engine, index=False)

    dask_result = c.sql(sql).compute().reset_index(drop=True)
    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

    # Make sure SQL and Dask use the same "NULL" value.
    # np.NaN was removed in NumPy 2.0; np.nan is the canonical spelling.
    dask_result = dask_result.fillna(np.nan)
    sqlite_result = sqlite_result.fillna(np.nan)

    assert_frame_equal(dask_result, sqlite_result, check_dtype=False)
def test_add_remove_tables():
    """Tables can be registered, dropped, and re-registered; dropping a
    missing table raises KeyError."""
    ctx = Context()
    frame = dd.from_pandas(pd.DataFrame(), npartitions=1)

    ctx.create_table("table", frame)
    assert "table" in ctx.tables

    ctx.drop_table("table")
    assert "table" not in ctx.tables

    # Dropping again must fail loudly.
    with pytest.raises(KeyError):
        ctx.drop_table("table")

    # A one-element list of dataframes is also accepted as table input.
    ctx.create_table("table", [frame])
    assert "table" in ctx.tables
def test_sql():
    """c.sql yields the original data as a lazy dask frame, an eager pandas
    frame (return_futures=False), and for ad-hoc dataframes."""
    ctx = Context()
    frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    ctx.create_table("df", frame)
    expected = frame.compute()

    lazy = ctx.sql("SELECT * FROM df")
    assert isinstance(lazy, dd.DataFrame)
    assert_frame_equal(lazy.compute(), expected)

    eager = ctx.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(eager, pd.DataFrame)
    assert_frame_equal(eager, expected)

    ad_hoc = ctx.sql("SELECT * FROM other_df", dataframes={"other_df": frame})
    assert isinstance(ad_hoc, dd.DataFrame)
    assert_frame_equal(ad_hoc.compute(), expected)
def test_sql(gpu):
    """c.sql round-trips the data for registered tables, eager results, and
    ad-hoc dataframes (CPU and GPU variants)."""
    ctx = Context()
    frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    ctx.create_table("df", frame, gpu=gpu)

    out = ctx.sql("SELECT * FROM df")
    assert isinstance(out, dd.DataFrame)
    assert_eq(out, frame)

    # With return_futures=False the result is concrete, not a dask frame.
    out = ctx.sql("SELECT * FROM df", return_futures=False)
    assert not isinstance(out, dd.DataFrame)
    assert_eq(out, frame)

    out = ctx.sql(
        "SELECT * FROM other_df", dataframes={"other_df": frame}, gpu=gpu
    )
    assert isinstance(out, dd.DataFrame)
    assert_eq(out, frame)
def eq_sqlite(sql, **dfs):
    """Run ``sql`` through both dask-sql and sqlite and assert equal results.

    Each keyword argument is registered as a table in the dask-sql context
    and written to an in-memory sqlite database under the same name.

    Args:
        sql: the query to execute on both engines.
        **dfs: name -> pandas DataFrame pairs to register as tables.
    """
    c = Context()
    engine = sqlite3.connect(":memory:")

    for name, df in dfs.items():
        c.create_table(name, df)
        df.to_sql(name, engine, index=False)

    dask_result = c.sql(sql).reset_index(drop=True)
    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

    # casting to object to ensure equality with sql-lite
    # which returns object dtype for datetime inputs
    dask_result = cast_datetime_to_string(dask_result)

    # Make sure SQL and Dask use the same "NULL" value.
    # np.NaN was removed in NumPy 2.0; np.nan is the canonical spelling.
    dask_result = dask_result.fillna(np.nan)
    sqlite_result = sqlite_result.fillna(np.nan)

    assert_eq(dask_result, sqlite_result, check_dtype=False)
def main():  # pragma: no cover
    """CLI entry point: build a Context (optionally preloading test data and
    connecting a distributed client) and start the interactive SQL loop."""
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )
    arguments = parser.parse_args()

    # Only connect to a distributed scheduler when an address was given.
    dask_client = (
        dask.distributed.Client(arguments.scheduler_address)
        if arguments.scheduler_address
        else None
    )

    ctx = Context()
    if arguments.load_test_data:
        timeseries = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        ctx.create_table("timeseries", timeseries.persist())

    cmd_loop(
        context=ctx,
        client=dask_client,
        startup=arguments.startup,
        log_level=arguments.log_level,
    )
def test_join_case_projection_subquery():
    """Regression test: CASE projections over an aggregated subquery with a
    chain of joins must compile and execute (on empty tables)."""
    ctx = Context()

    # Minimal empty tables carrying exactly the columns the query touches.
    for name, cols in [
        ("demos", ["demo_sku", "hd_dep_count"]),
        ("site_page", ["site_page_sk", "site_char_count"]),
        ("sales", ["sales_hdemo_sk", "sales_page_sk", "sold_time_sk"]),
        ("t_dim", ["t_time_sk", "t_hour"]),
    ]:
        ctx.create_table(name, pd.DataFrame({c: [] for c in cols}), persist=False)

    ctx.sql(
        """
        SELECT CASE WHEN pmc > 0.0 THEN CAST (amc AS DOUBLE) / CAST (pmc AS DOUBLE) ELSE -1.0 END AS am_pm_ratio
        FROM
        (
            SELECT SUM(amc1) AS amc, SUM(pmc1) AS pmc
            FROM
            (
                SELECT
                CASE WHEN t_hour BETWEEN 7 AND 8 THEN COUNT(1) ELSE 0 END AS amc1,
                CASE WHEN t_hour BETWEEN 19 AND 20 THEN COUNT(1) ELSE 0 END AS pmc1
                FROM sales ws
                JOIN demos hd ON (hd.demo_sku = ws.sales_hdemo_sk and hd.hd_dep_count = 5)
                JOIN site_page sp ON (sp.site_page_sk = ws.sales_page_sk and sp.site_char_count BETWEEN 5000 AND 6000)
                JOIN t_dim td ON (td.t_time_sk = ws.sold_time_sk and td.t_hour IN (7,8,19,20))
                GROUP BY t_hour
            ) cnt_am_pm
        ) sum_am_pm
        """
    ).compute()
def test_fsql():
    """End-to-end tests for running Fugue SQL (`fsql`) on top of a dask-sql
    Context: standalone SQL, Context tables, registering results back, and
    the TRANSFORM extension.

    NOTE(review): Fugue SQL is newline-sensitive; the line breaks inside the
    triple-quoted statements below were reconstructed from a whitespace-
    mangled source — confirm against the upstream test file.
    """

    def assert_eq(df: pd.DataFrame) -> None:
        # Used as a Fugue OUTPUT extension: the query result must be [[1]].
        assert_frame_equal(df, pd.DataFrame({"a": [1]}))

    # the simplest case: the SQL does not use any input and does not generate output
    fsql(
        """
    CREATE [[0],[1]] SCHEMA a:long
    SELECT * WHERE a>0
    OUTPUT USING assert_eq
    """
    )

    # it can directly use the dataframes inside dask-sql Context
    c = Context()
    c.create_table(
        "df",
        dd.from_pandas(pd.DataFrame([[0], [1]], columns=["a"]), npartitions=2))
    fsql(
        """
    SELECT * FROM df WHERE a>0
    OUTPUT USING assert_eq
    """,
        c,
    )

    # for dataframes with name, they can register back to the Context (register=True)
    # the return of fsql is the dict of all dask dataframes with explicit names
    result = fsql(
        """
    x=SELECT * FROM df WHERE a>0
    OUTPUT USING assert_eq
    """,
        c,
        register=True,
    )
    assert isinstance(result["x"], dd.DataFrame)
    assert "x" in c.tables

    # integration test with fugue transformer extension
    c = Context()
    c.create_table(
        "df1",
        dd.from_pandas(pd.DataFrame([[0, 1], [1, 2]], columns=["a", "b"]),
                       npartitions=2),
    )
    c.create_table(
        "df2",
        dd.from_pandas(pd.DataFrame([[1, 2], [3, 4], [-4, 5]], columns=["a", "b"]),
                       npartitions=2),
    )

    # schema: *
    def cumsum(df: pd.DataFrame) -> pd.DataFrame:
        # Fugue transformer: the "# schema: *" hint above declares the output
        # schema equals the input schema.
        return df.cumsum()

    fsql(
        """
    data = SELECT * FROM df1 WHERE a>0
        UNION ALL SELECT * FROM df2 WHERE a>0
    PERSIST
    result1 = TRANSFORM data PREPARTITION BY a PRESORT b USING cumsum
    result2 = TRANSFORM data PREPARTITION BY b PRESORT a USING cumsum
    PRINT result1, result2
    """,
        c,
        register=True,
    )
    assert "result1" in c.tables
    assert "result2" in c.tables
    # Tail of a function defined above (outside this excerpt): scores X on
    # the cluster with the trained xgboost booster.
    return dask_xgboost.predict(client, bst, X)


# Create a context
from dask_sql import Context, run_server

c = Context()
# Expose the Python UDF to SQL as predict_price(total_amount, trip_distance,
# passenger_count) -> DOUBLE.
c.register_function(
    predict_price,
    "predict_price",
    [
        ("total_amount", np.float64),
        ("trip_distance", np.float64),
        ("passenger_count", np.float64),
    ],
    np.float64,
)

# Load the data from S3
df = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv",
    dtype={
        "payment_type": "UInt8",
        "VendorID": "UInt8",
        "passenger_count": "UInt8",
        # NOTE(review): "RatecodeIDq" looks like a typo for "RatecodeID" —
        # confirm against the CSV header (kept as-is here).
        "RatecodeIDq": "UInt8",
    },
    storage_options={"anon": True},
).persist()
wait(df)  # block until the persisted partitions are materialized on workers
c.create_table("nyc-taxi", df)

# Warm-up query so the SQL machinery is fully loaded before serving requests.
c.sql("SELECT 1 + 1").compute()

# Finally, spin up the dask-sql server
run_server(context=c, client=client)
def test_input_types(temporary_data_file):
    """create_table accepts pandas/dask frames and csv/parquet file paths,
    with and without an explicit format; unknown formats and arbitrary
    objects raise."""
    ctx = Context()
    df = pd.DataFrame({"a": [1, 2, 3]})

    def assert_correct_output():
        out = ctx.sql("SELECT * FROM df")
        assert isinstance(out, dd.DataFrame)
        assert_frame_equal(out.compute(), df)

    ctx.create_table("df", df)
    assert_correct_output()

    ctx.create_table("df", dd.from_pandas(df, npartitions=1))
    assert_correct_output()

    # File-based inputs: format inferred from the path ...
    df.to_csv(temporary_data_file, index=False)
    ctx.create_table("df", temporary_data_file)
    assert_correct_output()

    # ... or given explicitly.
    df.to_csv(temporary_data_file, index=False)
    ctx.create_table("df", temporary_data_file, format="csv")
    assert_correct_output()

    df.to_parquet(temporary_data_file, index=False)
    ctx.create_table("df", temporary_data_file, format="parquet")
    assert_correct_output()

    with pytest.raises(AttributeError):
        ctx.create_table("df", temporary_data_file, format="unknown")

    with pytest.raises(ValueError):
        ctx.create_table("df", object())
def query(self, dataframe: DataFrame) -> Result[DataFrame, InvalidJob]:
    """Register `dataframe` under self.table_name in a fresh dask-sql
    Context, run self.query_string against it, and wrap the result."""
    context = Context()
    context.create_table(self.table_name, dataframe)
    return Success(context.sql(self.query_string))
import dask.dataframe as dd
import pandas as pd

from dask_sql import Context

c = Context()

# 300 rows cycling through three names; x runs 0..299.
# (Removed an unused leftover `data` CSV string literal that was never read.)
df = pd.DataFrame({"name": ["Alice", "Bob", "Chris"] * 100, "x": list(range(300))})
ddf = dd.from_pandas(df, npartitions=10)
c.create_table("my_data", ddf)

# Group-by aggregation through SQL; the quoted alias keeps the column name "S".
got = c.sql(
    """
SELECT my_data.name, SUM(my_data.x) AS "S"
FROM my_data
GROUP BY my_data.name
"""
)

expect = pd.DataFrame({"name": ["Alice", "Bob", "Chris"], "S": [14850, 14950, 15050]})

dd.assert_eq(got, expect)