class TimeDaskSQLJoins:
    params = [[10**4, 10**5], [10], [3], [6], [0.75]]
    param_names = [
        "rows",
        "cols",
        "number of join columns",
        "number of chained joins",
        "ratio of distinct elements",
    ]

    def setup(self, N, ncols, njoin_columns, njoins, distinct_r):
        self.dfs = _generate_dataframes(N, ncols, njoin_columns, njoins, distinct_r)
        self.dfs = [dd.from_pandas(d, npartitions=1) for d in self.dfs]
        self.join_cols = [c for c in self.dfs[0].columns if c in self.dfs[1].columns]
        self.ctx = Context()
        self._create_tables()
        self._create_sql_query()

    def _create_tables(self):
        self.tables = []
        for i, df in enumerate(self.dfs):
            _table_name = f"table_{i:03}"
            self.ctx.create_table(_table_name, df)
            _table = table(_table_name, *[column(c) for c in df.columns])
            self.tables.append(_table)

    def _create_sql_query(self):
        left = self.tables[0]
        joinq = left
        select_cols = list(left.c)
        for right in self.tables[1:]:
            # and_() expects individual clauses, so unpack the generator
            on = and_(*(left.c.get(col) == right.c.get(col) for col in self.join_cols))
            joinq = joinq.join(right, on)
            select_cols += [c for c in right.c if c.name not in self.join_cols]
        query = select(*select_cols).select_from(joinq)
        self.sql_query = str(
            query.compile(
                dialect=postgresql.dialect(),
                compile_kwargs={"literal_binds": True},
            )
        )

    def time_joins(self, N, ncols, njoin_columns, njoins, distinct_r):
        start = time.perf_counter()
        print(f"Processing SQL query: {self.sql_query}")
        res = self.ctx.sql(self.sql_query)
        stop = time.perf_counter()
        print(f"Processing SQL query took {stop-start:0.4f} s.")

        start = time.perf_counter()
        print("Computing dask dataframe")
        res.compute()
        stop = time.perf_counter()
        print(f"Computing dask dataframe took {stop-start:0.4f} s.")

        # Visualize task graph
        # res.visualize('taskgraph.png')

        return res
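# The benchmark above depends on a _generate_dataframes helper that is not part of
# this collection. The implementation below is only a plausible sketch of what it
# might look like (column names, RNG seed, and data layout are assumptions): it
# builds njoins + 1 frames that share njoin_columns key columns with roughly
# distinct_r * N distinct key values, plus payload columns up to ncols per frame.
import numpy as np
import pandas as pd


def _generate_dataframes(N, ncols, njoin_columns, njoins, distinct_r):
    rng = np.random.default_rng(42)
    n_distinct = max(1, int(N * distinct_r))
    # shared join keys so every frame can be chained on the same columns
    keys = {f"key_{k}": rng.integers(0, n_distinct, size=N) for k in range(njoin_columns)}
    dfs = []
    for i in range(njoins + 1):
        # per-frame payload columns with unique names so only the keys overlap
        payload = {f"payload_{i}_{j}": rng.random(N) for j in range(ncols - njoin_columns)}
        dfs.append(pd.DataFrame({**keys, **payload}))
    return dfs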
def test_tables(gpu):
    c = Context()
    c.create_table("table", pd.DataFrame(), gpu=gpu)

    result_df = c.sql(f'SHOW TABLES FROM "{c.schema_name}"')
    expected_df = pd.DataFrame({"Table": ["table"]})

    assert_eq(result_df, expected_df, check_index=False)
def test_show_tables_no_schema(c):
    c = Context()
    df = pd.DataFrame({"id": [0, 1]})
    c.create_table("test", df)

    actual_df = c.sql("show tables").compute()
    expected_df = pd.DataFrame({"Table": ["test"]})

    assert_eq(actual_df, expected_df)
def main():  # pragma: no cover
    """
    CLI version of the :func:`run_server` function.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="The host interface to listen on (defaults to all interfaces)",
    )
    parser.add_argument(
        "--port", default=8080, help="The port to listen on (defaults to 8080)"
    )
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=uvicorn.config.LOG_LEVELS,
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite has been properly loaded",
    )
    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = dask.distributed.Client(args.scheduler_address)

    context = Context()

    if args.load_test_data:
        df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    run_server(
        context=context,
        client=client,
        host=args.host,
        port=args.port,
        startup=args.startup,
        log_level=args.log_level,
    )
def select(
    self, dfs: fugue.dataframe.DataFrames, statement: str
) -> fugue.dataframe.DataFrame:
    """Send the SQL command to the dask-sql context and register all temporary dataframes"""
    c = Context()
    for k, v in dfs.items():
        c.create_table(k, self.execution_engine.to_df(v).native)
    df = c.sql(statement)
    return fugue_dask.dataframe.DaskDataFrame(df)
def test_explain():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    sql_string = c.explain("SELECT * FROM df")

    assert (
        sql_string
        == f"LogicalProject(a=[$0]){os.linesep} LogicalTableScan(table=[[schema, df]]){os.linesep}"
    )
def test_dask_sql_sg_logistic_regression(
    datatype, nrows, ncols, n_parts, wrap_predict
):
    if wrap_predict:
        cuml.set_global_output_type("input")
    else:
        cuml.set_global_output_type("cudf")

    X, y = make_classification(
        n_samples=nrows, n_features=ncols, n_informative=5, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    train_df = cudf.DataFrame(
        X_train, dtype=datatype, columns=[chr(i) for i in range(ncols)]
    )
    train_df["target"] = y_train
    train_ddf = dask_cudf.from_cudf(train_df, npartitions=n_parts)

    c = Context()
    c.create_table("train_df", train_ddf)

    train_query = f"""
        CREATE MODEL model WITH (
            model_class = 'cuml.linear_model.LogisticRegression',
            wrap_predict = {wrap_predict},
            target_column = 'target'
        ) AS (
            SELECT * FROM train_df
        )
    """
    c.sql(train_query)

    skmodel = LogisticRegression().fit(X_train, y_train)

    test_df = cudf.DataFrame(
        X_test, dtype=datatype, columns=[chr(i) for i in range(ncols)]
    )
    test_ddf = dask_cudf.from_cudf(test_df, npartitions=n_parts)
    c.create_table("test_df", test_ddf)

    inference_query = """
        SELECT * FROM PREDICT(
            MODEL model,
            SELECT * FROM test_df
        )
    """
    preds = c.sql(inference_query).compute()
    score = cuml.metrics.accuracy_score(y_test, preds["target"].to_numpy())

    assert score >= skmodel.score(X_test, y_test) - 0.022
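# A CPU-only sketch of the same CREATE MODEL / PREDICT flow as the cuML test above,
# assuming scikit-learn is installed; the table name, model name, and feature
# columns are illustrative and not taken from the original snippet.
import pandas as pd
from dask_sql import Context
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=4, random_state=0)
train_df = pd.DataFrame(X, columns=[f"f{i}" for i in range(4)])
train_df["target"] = y

c = Context()
c.create_table("train_df", train_df)

# train a scikit-learn model through SQL, wrapping predict for lazy evaluation
c.sql(
    """
    CREATE MODEL my_model WITH (
        model_class = 'sklearn.linear_model.LogisticRegression',
        wrap_predict = True,
        target_column = 'target'
    ) AS (
        SELECT * FROM train_df
    )
    """
)

# run inference with the trained model; the result carries a "target" column
predictions = c.sql(
    "SELECT * FROM PREDICT(MODEL my_model, SELECT * FROM train_df)"
).compute()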
def app_client():
    c = Context()
    c.sql("SELECT 1 + 1").compute()

    _init_app(app, c)

    # late import for the importskip
    from fastapi.testclient import TestClient

    yield TestClient(app)

    # don't disconnect the client if using an independent cluster
    if os.getenv("DASK_SQL_TEST_SCHEDULER", None) is None:
        app.client.close()
def test_add_remove_tables():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame(), npartitions=1)

    c.create_table("table", data_frame)
    assert "table" in c.tables

    c.drop_table("table")
    assert "table" not in c.tables

    with pytest.raises(KeyError):
        c.drop_table("table")

    c.create_table("table", [data_frame])
    assert "table" in c.tables
def create_context_distributed(sched):
    from dask_sql import Context
    from dask.distributed import Client

    # Need dev version 1.18+ of reticulate
    # Error: C stack usage is too close to the limit
    # devtools::install_github('rstudio/reticulate')
    if sched:
        client = Client(sched)
    else:
        client = Client()

    ctx = Context()
    # FIX: develop better client handling on the R side
    ctx.dors_client = client

    return ctx
def setUp(self):
    super().setUp()

    app.c = Context()
    self.client = TestClient(app)

    self.f = os.path.join(tempfile.gettempdir(), os.urandom(24).hex())
def eq_sqlite(sql, **dfs):
    c = Context()
    engine = sqlite3.connect(":memory:")

    for name, df in dfs.items():
        c.create_table(name, df)
        df.to_sql(name, engine, index=False)

    dask_result = c.sql(sql).compute().reset_index(drop=True)
    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

    # Make sure SQL and Dask use the same "NULL" value
    dask_result = dask_result.fillna(np.NaN)
    sqlite_result = sqlite_result.fillna(np.NaN)

    assert_frame_equal(dask_result, sqlite_result, check_dtype=False)
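# A minimal usage sketch for the eq_sqlite helper above (not part of the original
# snippets): the table name "t" and the query are hypothetical; pandas is assumed
# to be imported as pd in the same module as eq_sqlite.
def test_groupby_count_matches_sqlite():
    eq_sqlite(
        "SELECT a, COUNT(*) AS n FROM t GROUP BY a ORDER BY a",
        t=pd.DataFrame({"a": [1, 1, 2], "b": [0.5, 1.5, 2.5]}),
    )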
def _init_app(
    app: FastAPI,
    context: dask_sql.Context = None,
    client: dask.distributed.Client = None,
):
    app.c = context or Context()
    app.future_list = {}
    app.client = client or dask.distributed.Client()
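# A small sketch of wiring _init_app into a standalone server process. It assumes
# the FastAPI `app` object and `_init_app` are importable from dask_sql.server.app
# (as in the test fixtures in this collection); adjust the import path if needed.
import uvicorn
from dask_sql import Context
from dask_sql.server.app import _init_app, app

c = Context()
_init_app(app, context=c)  # attaches the context and a fresh dask client to the app
uvicorn.run(app, host="127.0.0.1", port=8080)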
def run_server(
    context: Context = None, host: str = "0.0.0.0", port: int = 8080
):  # pragma: no cover
    """
    Run an HTTP server for answering SQL queries using ``dask-sql``.
    It uses the `Presto Wire Protocol <https://github.com/prestodb/presto/wiki/HTTP-Protocol>`_
    for communication.
    This means it has a single POST endpoint ``v1/statement``, which answers
    SQL queries (as string in the body) with the output as a JSON
    (in the format described in the documentation above).
    Every SQL expression that ``dask-sql`` understands can be used here.

    Note:
        The presto protocol also includes some statistics on the query
        in the response.
        These statistics are currently only filled with placeholder variables.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        host (:obj:`str`): The host interface to listen on (defaults to all interfaces)
        port (:obj:`int`): The port to listen on (defaults to 8080)

    Example:
        It is possible to run an SQL server by using the CLI script in
        ``dask_sql.server.app`` or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import run_server

            # Create your pre-filled context
            c = Context()
            ...

            run_server(context=c)

        After starting the server, it is possible to send queries to it, e.g. with the
        `presto CLI <https://prestosql.io/docs/current/installation/cli.html>`_ or
        via sqlalchemy (e.g. using the `PyHive <https://github.com/dropbox/PyHive#sqlalchemy>`_ package):

        .. code-block:: python

            from sqlalchemy.engine import create_engine
            engine = create_engine('presto://localhost:8080/')

            import pandas as pd
            pd.read_sql_query("SELECT 1 + 1", con=engine)

        Of course, it is also possible to call the usual ``CREATE TABLE`` commands.
    """
    if context is None:
        context = Context()

    app.c = context

    uvicorn.run(app, host=host, port=port)
def c():
    c = Context()
    c.create_schema(schema)

    row = create_table_row()
    tables = pd.DataFrame().append(row, ignore_index=True)
    tables = tables.astype({"AN_INT": "int64"})

    c.create_table(table, tables, schema_name=schema)

    yield c

    c.drop_schema(schema)
def eq_sqlite(sql, **dfs):
    c = Context()
    engine = sqlite3.connect(":memory:")

    for name, df in dfs.items():
        c.create_table(name, df)
        df.to_sql(name, engine, index=False)

    dask_result = c.sql(sql).reset_index(drop=True)
    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

    # casting to object to ensure equality with sqlite,
    # which returns object dtype for datetime inputs
    dask_result = cast_datetime_to_string(dask_result)

    # Make sure SQL and Dask use the same "NULL" value
    dask_result = dask_result.fillna(np.NaN)
    sqlite_result = sqlite_result.fillna(np.NaN)

    assert_eq(dask_result, sqlite_result, check_dtype=False)
def test_aggregation_adding():
    c = Context()

    assert not c.schema[c.schema_name].function_lists
    assert not c.schema[c.schema_name].functions

    f = lambda x: x
    c.register_aggregation(f, "f", [("x", int)], float)

    assert "f" in c.schema[c.schema_name].functions
    assert c.schema[c.schema_name].functions["f"] == f
    assert len(c.schema[c.schema_name].function_lists) == 2
    assert c.schema[c.schema_name].function_lists[0].name == "F"
    assert c.schema[c.schema_name].function_lists[0].parameters == [("x", int)]
    assert c.schema[c.schema_name].function_lists[0].return_type == float
    assert c.schema[c.schema_name].function_lists[0].aggregation
    assert c.schema[c.schema_name].function_lists[1].name == "f"
    assert c.schema[c.schema_name].function_lists[1].parameters == [("x", int)]
    assert c.schema[c.schema_name].function_lists[1].return_type == float
    assert c.schema[c.schema_name].function_lists[1].aggregation

    # Without replacement
    c.register_aggregation(f, "f", [("x", float)], int, replace=False)

    assert "f" in c.schema[c.schema_name].functions
    assert c.schema[c.schema_name].functions["f"] == f
    assert len(c.schema[c.schema_name].function_lists) == 4
    assert c.schema[c.schema_name].function_lists[2].name == "F"
    assert c.schema[c.schema_name].function_lists[2].parameters == [("x", float)]
    assert c.schema[c.schema_name].function_lists[2].return_type == int
    assert c.schema[c.schema_name].function_lists[2].aggregation
    assert c.schema[c.schema_name].function_lists[3].name == "f"
    assert c.schema[c.schema_name].function_lists[3].parameters == [("x", float)]
    assert c.schema[c.schema_name].function_lists[3].return_type == int
    assert c.schema[c.schema_name].function_lists[3].aggregation

    # With replacement
    f = lambda x: x + 1
    c.register_aggregation(f, "f", [("x", str)], str, replace=True)

    assert "f" in c.schema[c.schema_name].functions
    assert c.schema[c.schema_name].functions["f"] == f
    assert len(c.schema[c.schema_name].function_lists) == 2
    assert c.schema[c.schema_name].function_lists[0].name == "F"
    assert c.schema[c.schema_name].function_lists[0].parameters == [("x", str)]
    assert c.schema[c.schema_name].function_lists[0].return_type == str
    assert c.schema[c.schema_name].function_lists[0].aggregation
    assert c.schema[c.schema_name].function_lists[1].name == "f"
    assert c.schema[c.schema_name].function_lists[1].parameters == [("x", str)]
    assert c.schema[c.schema_name].function_lists[1].return_type == str
    assert c.schema[c.schema_name].function_lists[1].aggregation
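# An end-to-end sketch of actually calling a registered aggregation from SQL
# (not taken from the snippets above). It assumes a dask-sql version where custom
# aggregations are passed as dask dd.Aggregation objects; the function name
# "my_sum" and the column names are illustrative.
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask_sql import Context

c = Context()
c.create_table("df", pd.DataFrame({"g": [0, 0, 1], "x": [1.0, 2.0, 3.0]}))

# chunk and aggregate steps both sum, so the result is a plain per-group sum
my_sum = dd.Aggregation("my_sum", lambda s: s.sum(), lambda s: s.sum())
c.register_aggregation(my_sum, "my_sum", [("x", np.float64)], np.float64)

result = c.sql("SELECT g, my_sum(x) AS total FROM df GROUP BY g").compute()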
def main():  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite has been properly loaded",
    )
    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = dask.distributed.Client(args.scheduler_address)

    context = Context()

    if args.load_test_data:
        df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(
        context=context, client=client, startup=args.startup, log_level=args.log_level
    )
def test_tables_from_stack():
    c = Context()

    assert not c._get_tables_from_stack()

    df = pd.DataFrame()

    assert "df" in c._get_tables_from_stack()

    def f():
        df2 = pd.DataFrame()

        assert "df" in c._get_tables_from_stack()
        assert "df2" in c._get_tables_from_stack()

    f()

    def g():
        df = pd.DataFrame({"a": [1]})

        assert "df" in c._get_tables_from_stack()
        assert c._get_tables_from_stack()["df"].columns == ["a"]

    # call g() so its assertions actually run (mirrors the gpu variant below)
    g()
def test_join_case_projection_subquery():
    c = Context()

    # Tables for query
    demo = pd.DataFrame({"demo_sku": [], "hd_dep_count": []})
    site_page = pd.DataFrame({"site_page_sk": [], "site_char_count": []})
    sales = pd.DataFrame(
        {"sales_hdemo_sk": [], "sales_page_sk": [], "sold_time_sk": []}
    )
    t_dim = pd.DataFrame({"t_time_sk": [], "t_hour": []})

    c.create_table("demos", demo, persist=False)
    c.create_table("site_page", site_page, persist=False)
    c.create_table("sales", sales, persist=False)
    c.create_table("t_dim", t_dim, persist=False)

    c.sql(
        """
        SELECT CASE WHEN pmc > 0.0 THEN CAST (amc AS DOUBLE) / CAST (pmc AS DOUBLE) ELSE -1.0 END AS am_pm_ratio
        FROM
        (
            SELECT SUM(amc1) AS amc, SUM(pmc1) AS pmc
            FROM
            (
                SELECT
                    CASE WHEN t_hour BETWEEN 7 AND 8 THEN COUNT(1) ELSE 0 END AS amc1,
                    CASE WHEN t_hour BETWEEN 19 AND 20 THEN COUNT(1) ELSE 0 END AS pmc1
                FROM sales ws
                JOIN demos hd ON (hd.demo_sku = ws.sales_hdemo_sk and hd.hd_dep_count = 5)
                JOIN site_page sp ON (sp.site_page_sk = ws.sales_page_sk and sp.site_char_count BETWEEN 5000 AND 6000)
                JOIN t_dim td ON (td.t_time_sk = ws.sold_time_sk and td.t_hour IN (7,8,19,20))
                GROUP BY t_hour
            ) cnt_am_pm
        ) sum_am_pm
        """
    ).compute()
def test_tables_from_stack(gpu):
    c = Context()

    assert not c._get_tables_from_stack()

    df = pd.DataFrame() if not gpu else cudf.DataFrame()

    assert "df" in c._get_tables_from_stack()

    def f(gpu):
        df2 = pd.DataFrame() if not gpu else cudf.DataFrame()

        assert "df" in c._get_tables_from_stack()
        assert "df2" in c._get_tables_from_stack()

    f(gpu=gpu)

    def g(gpu=gpu):
        df = pd.DataFrame({"a": [1]}) if not gpu else cudf.DataFrame({"a": [1]})

        assert "df" in c._get_tables_from_stack()
        assert c._get_tables_from_stack()["df"].columns == ["a"]

    g(gpu=gpu)
def get_context(cls, new=False):
    if cls._context is None or new:
        if not config["RAS"].getboolean("synchronous", False):
            cls._create_client()
        cls._context = Context()
        # We register an aggregate function called len which applies to string columns
        # Used for example in `test_probabilistic_frontend:test_postprob_conjunct_with_wlq_result`
        cls._context.register_aggregation(
            len, "len", [("x", pd.StringDtype())], np.int32
        )
        # We also register a sum which applies to objects (i.e `Symbol` or sets)
        # since by default sum applies only to numbers in SQL and Calcite will
        # try to cast objects to float before applying the default sum op.
        cls._context.register_aggregation(
            sum, "sum", [("x", np.object_)], np.object_
        )
    return cls._context
def test_sql():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(result, pd.DataFrame)
def test_query_case_sensitivity():
    c = Context()
    df = pd.DataFrame({"id": [0, 1]})

    c.create_table("test", df)

    try:
        c.sql(
            "select ID from test",
            config_options={"sql.identifier.case_sensitive": False},
        )
    except ParsingException as pe:
        assert False, f"Queries should be case insensitive but raised exception {pe}"
def test_sql():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)
    assert_frame_equal(result.compute(), data_frame.compute())

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(result, pd.DataFrame)
    assert_frame_equal(result, data_frame.compute())

    result = c.sql("SELECT * FROM other_df", dataframes={"other_df": data_frame})
    assert isinstance(result, dd.DataFrame)
    assert_frame_equal(result.compute(), data_frame.compute())
def test_sql(gpu):
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame, gpu=gpu)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert not isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)

    result = c.sql(
        "SELECT * FROM other_df", dataframes={"other_df": data_frame}, gpu=gpu
    )
    assert isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)
def test_deprecation_warning():
    c = Context()
    data_frame = dd.from_pandas(pd.DataFrame(), npartitions=1)

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        c.register_dask_table(data_frame, "table")

        assert len(w) == 1
        assert issubclass(w[-1].category, DeprecationWarning)

    assert "table" in c.tables

    c.drop_table("table")
    assert "table" not in c.tables
def test_fsql():
    def assert_eq(df: pd.DataFrame) -> None:
        assert_frame_equal(df, pd.DataFrame({"a": [1]}))

    # the simplest case: the SQL does not use any input and does not generate output
    fsql(
        """
        CREATE [[0],[1]] SCHEMA a:long
        SELECT * WHERE a>0
        OUTPUT USING assert_eq
        """
    )

    # it can directly use the dataframes inside the dask-sql Context
    c = Context()
    c.create_table(
        "df",
        dd.from_pandas(pd.DataFrame([[0], [1]], columns=["a"]), npartitions=2),
    )

    fsql(
        """
        SELECT * FROM df WHERE a>0
        OUTPUT USING assert_eq
        """,
        c,
    )

    # dataframes with a name can be registered back to the Context (register=True);
    # the return value of fsql is a dict of all dask dataframes with explicit names
    result = fsql(
        """
        x=SELECT * FROM df WHERE a>0
        OUTPUT USING assert_eq
        """,
        c,
        register=True,
    )
    assert isinstance(result["x"], dd.DataFrame)
    assert "x" in c.tables

    # integration test with the fugue transformer extension
    c = Context()
    c.create_table(
        "df1",
        dd.from_pandas(
            pd.DataFrame([[0, 1], [1, 2]], columns=["a", "b"]), npartitions=2
        ),
    )
    c.create_table(
        "df2",
        dd.from_pandas(
            pd.DataFrame([[1, 2], [3, 4], [-4, 5]], columns=["a", "b"]), npartitions=2
        ),
    )

    # schema: *
    def cumsum(df: pd.DataFrame) -> pd.DataFrame:
        return df.cumsum()

    fsql(
        """
        data = SELECT * FROM df1 WHERE a>0
            UNION ALL SELECT * FROM df2 WHERE a>0
            PERSIST
        result1 = TRANSFORM data PREPARTITION BY a PRESORT b USING cumsum
        result2 = TRANSFORM data PREPARTITION BY b PRESORT a USING cumsum
        PRINT result1, result2
        """,
        c,
        register=True,
    )
    assert "result1" in c.tables
    assert "result2" in c.tables
def cmd_loop(
    context: Context = None,
    client: dask.distributed.Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite was loaded
        log_level (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE`` commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or dask.distributed.Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = PromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        try:
            df = context.sql(text, return_futures=False)
            print(df)
        except Exception as e:
            print(e)