def test_dask_sql_sg_logistic_regression(
    datatype, nrows, ncols, n_parts, wrap_predict
):
    """Train a single-GPU cuml LogisticRegression via dask-sql's
    CREATE MODEL / PREDICT statements and check its accuracy is close to a
    directly-fitted reference model (within a 0.022 tolerance)."""
    cuml.set_global_output_type("input" if wrap_predict else "cudf")

    X, y = make_classification(
        n_samples=nrows, n_features=ncols, n_informative=5, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Single-character column names, shared by the train and test frames
    feature_names = [chr(i) for i in range(ncols)]

    train_df = cudf.DataFrame(X_train, dtype=datatype, columns=feature_names)
    train_df["target"] = y_train
    train_ddf = dask_cudf.from_cudf(train_df, npartitions=n_parts)

    ctx = Context()
    ctx.create_table("train_df", train_ddf)

    train_query = f"""
        CREATE MODEL model WITH (
            model_class = 'cuml.linear_model.LogisticRegression',
            wrap_predict = {wrap_predict},
            target_column = 'target'
        ) AS (
            SELECT * FROM train_df
        )
    """
    ctx.sql(train_query)

    # Reference model fitted outside of dask-sql on the same split
    skmodel = LogisticRegression().fit(X_train, y_train)

    test_df = cudf.DataFrame(X_test, dtype=datatype, columns=feature_names)
    test_ddf = dask_cudf.from_cudf(test_df, npartitions=n_parts)
    ctx.create_table("test_df", test_ddf)

    inference_query = """
        SELECT * FROM PREDICT(
            MODEL model,
            SELECT * FROM test_df
        )
    """
    preds = ctx.sql(inference_query).compute()

    score = cuml.metrics.accuracy_score(y_test, preds["target"].to_numpy())
    assert score >= skmodel.score(X_test, y_test) - 0.022
def test_sql():
    """A plain SELECT returns a lazy dask frame by default and an eager
    pandas frame with ``return_futures=False``."""
    ctx = Context()
    frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    ctx.create_table("df", frame)

    lazy = ctx.sql("SELECT * FROM df")
    assert isinstance(lazy, dd.DataFrame)

    eager = ctx.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(eager, pd.DataFrame)
def app_client():
    """Yield a FastAPI test client wired to a warmed-up dask-sql context."""
    c = Context()
    # Warm-up query: makes sure the SQL machinery is fully loaded before use
    c.sql("SELECT 1 + 1").compute()
    _init_app(app, c)

    # late import for the importskip
    from fastapi.testclient import TestClient

    yield TestClient(app)

    # don't disconnect the client if using an independent cluster
    if os.getenv("DASK_SQL_TEST_SCHEDULER") is None:
        app.client.close()
def test_query_case_sensitivity():
    """With ``sql.identifier.case_sensitive`` disabled, ``ID`` must resolve
    to the ``id`` column instead of raising a parse error."""
    c = Context()
    df = pd.DataFrame({"id": [0, 1]})
    c.create_table("test", df)

    try:
        c.sql(
            "select ID from test",
            config_options={"sql.identifier.case_sensitive": False},
        )
    except ParsingException as pe:
        # fixed typo in the failure message: "insensitve" -> "insensitive"
        assert False, f"Queries should be case insensitive but raised exception {pe}"
class TimeDaskSQLJoins:
    """ASV benchmark: chained equi-joins executed through dask-sql.

    ``setup`` builds ``njoins`` dataframes sharing ``njoin_columns`` join
    keys, registers them with a dask-sql ``Context``, and renders the
    corresponding multi-join SELECT via SQLAlchemy; ``time_joins`` then
    times both query planning and computation.
    """

    params = [[10**4, 10**5], [10], [3], [6], [0.75]]
    param_names = [
        "rows",
        "cols",
        "number of join columns",
        "number of chained joins",
        # fixed typo: was "ratio of dictinct elements"
        "ratio of distinct elements",
    ]

    def setup(self, N, ncols, njoin_columns, njoins, distinct_r):
        self.dfs = _generate_dataframes(N, ncols, njoin_columns, njoins, distinct_r)
        self.dfs = [dd.from_pandas(d, npartitions=1) for d in self.dfs]
        # Columns shared by the generated frames are the join keys
        self.join_cols = [c for c in self.dfs[0].columns if c in self.dfs[1].columns]
        self.ctx = Context()
        self._create_tables()
        self._create_sql_query()

    def _create_tables(self):
        # Register every dataframe with the context and mirror it as a
        # SQLAlchemy table object for query construction.
        self.tables = []
        for i, df in enumerate(self.dfs):
            _table_name = f"table_{i:03}"
            self.ctx.create_table(_table_name, df)
            _table = table(_table_name, *[column(c) for c in df.columns])
            self.tables.append(_table)

    def _create_sql_query(self):
        # Chain all tables onto the first one via the shared join columns and
        # render the statement as a literal (bind-free) PostgreSQL string.
        left = self.tables[0]
        joinq = left
        select_cols = list(left.c)
        for right in self.tables[1:]:
            on = and_(left.c.get(col) == right.c.get(col) for col in self.join_cols)
            joinq = joinq.join(right, on)
            select_cols += [c for c in right.c if c.name not in self.join_cols]
        query = select(*select_cols).select_from(joinq)
        self.sql_query = str(
            query.compile(
                dialect=postgresql.dialect(),
                compile_kwargs={"literal_binds": True},
            )
        )

    def time_joins(self, N, ncols, njoin_columns, njoins, distinct_r):
        start = time.perf_counter()
        print(f"Processing SQL query: {self.sql_query}")
        res = self.ctx.sql(self.sql_query)
        stop = time.perf_counter()
        print(f"Processing SQL query took {stop-start:0.4f} s.")

        start = time.perf_counter()
        print("Computing dask dataframe")
        res.compute()
        stop = time.perf_counter()
        print(f"Computing dask dataframe took {stop-start:0.4f} s.")

        # Visualize task graph
        # res.visualize('taskgraph.png')
        return res
def test_tables(gpu):
    """SHOW TABLES on the context's schema lists the registered table."""
    ctx = Context()
    ctx.create_table("table", pd.DataFrame(), gpu=gpu)

    actual = ctx.sql(f'SHOW TABLES FROM "{ctx.schema_name}"')
    expected = pd.DataFrame({"Table": ["table"]})
    assert_eq(actual, expected, check_index=False)
def test_sql():
    """Cover the three ways of running a query: lazy result, eager result
    via ``return_futures=False``, and ad-hoc tables via ``dataframes=``."""
    ctx = Context()
    frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    ctx.create_table("df", frame)

    lazy = ctx.sql("SELECT * FROM df")
    assert isinstance(lazy, dd.DataFrame)
    assert_frame_equal(lazy.compute(), frame.compute())

    eager = ctx.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(eager, pd.DataFrame)
    assert_frame_equal(eager, frame.compute())

    # Tables can also be supplied per-call without registering them first
    ad_hoc = ctx.sql("SELECT * FROM other_df", dataframes={"other_df": frame})
    assert isinstance(ad_hoc, dd.DataFrame)
    assert_frame_equal(ad_hoc.compute(), frame.compute())
def test_show_tables_no_schema(c):
    """``show tables`` without a schema qualifier lists the current schema."""
    # NOTE(review): the fixture argument ``c`` is immediately shadowed by a
    # fresh Context, so the fixture itself is effectively unused here.
    c = Context()
    c.create_table("test", pd.DataFrame({"id": [0, 1]}))

    actual = c.sql("show tables").compute()
    expected = pd.DataFrame({"Table": ["test"]})
    assert_eq(actual, expected)
def select(self, dfs: fugue.dataframe.DataFrames, statement: str) -> fugue.dataframe.DataFrame:
    """Send the SQL command to the dask-sql context and register all temporary dataframes"""
    ctx = Context()
    for name, frame in dfs.items():
        # unwrap the fugue dataframe into the engine's native dask frame
        ctx.create_table(name, self.execution_engine.to_df(frame).native)
    result = ctx.sql(statement)
    return fugue_dask.dataframe.DaskDataFrame(result)
def test_sql(gpu):
    """Lazy vs. eager results and per-call ``dataframes=`` tables, on both
    CPU and GPU backends."""
    ctx = Context()
    frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    ctx.create_table("df", frame, gpu=gpu)

    lazy = ctx.sql("SELECT * FROM df")
    assert isinstance(lazy, dd.DataFrame)
    assert_eq(lazy, frame)

    eager = ctx.sql("SELECT * FROM df", return_futures=False)
    assert not isinstance(eager, dd.DataFrame)
    assert_eq(eager, frame)

    ad_hoc = ctx.sql(
        "SELECT * FROM other_df", dataframes={"other_df": frame}, gpu=gpu
    )
    assert isinstance(ad_hoc, dd.DataFrame)
    assert_eq(ad_hoc, frame)
def test_join_case_projection_subquery():
    """Regression test: multi-way joins with CASE projections inside nested
    subqueries plan and execute without error (tables are empty — only the
    schema matters)."""
    c = Context()

    c.create_table(
        "demos",
        pd.DataFrame({"demo_sku": [], "hd_dep_count": []}),
        persist=False,
    )
    c.create_table(
        "site_page",
        pd.DataFrame({"site_page_sk": [], "site_char_count": []}),
        persist=False,
    )
    c.create_table(
        "sales",
        pd.DataFrame(
            {"sales_hdemo_sk": [], "sales_page_sk": [], "sold_time_sk": []}
        ),
        persist=False,
    )
    c.create_table(
        "t_dim",
        pd.DataFrame({"t_time_sk": [], "t_hour": []}),
        persist=False,
    )

    c.sql(
        """
        SELECT CASE WHEN pmc > 0.0 THEN CAST (amc AS DOUBLE) / CAST (pmc AS DOUBLE) ELSE -1.0 END AS am_pm_ratio
        FROM
        (
            SELECT SUM(amc1) AS amc, SUM(pmc1) AS pmc
            FROM
            (
                SELECT
                CASE WHEN t_hour BETWEEN 7 AND 8 THEN COUNT(1) ELSE 0 END AS amc1,
                CASE WHEN t_hour BETWEEN 19 AND 20 THEN COUNT(1) ELSE 0 END AS pmc1
                FROM sales ws
                JOIN demos hd ON (hd.demo_sku = ws.sales_hdemo_sk and hd.hd_dep_count = 5)
                JOIN site_page sp ON (sp.site_page_sk = ws.sales_page_sk and sp.site_char_count BETWEEN 5000 AND 6000)
                JOIN t_dim td ON (td.t_time_sk = ws.sold_time_sk and td.t_hour IN (7,8,19,20))
                GROUP BY t_hour
            ) cnt_am_pm
        ) sum_am_pm
        """
    ).compute()
def eq_sqlite(sql, **dfs):
    """Assert dask-sql and SQLite agree on the result of ``sql``.

    Each keyword argument is registered both as a dask-sql table and as a
    SQLite table (keyed by the argument name); the query is executed on
    both engines and the frames are compared with ``check_dtype=False``.
    """
    c = Context()
    engine = sqlite3.connect(":memory:")
    try:
        for name, df in dfs.items():
            c.create_table(name, df)
            df.to_sql(name, engine, index=False)

        dask_result = c.sql(sql).compute().reset_index(drop=True)
        sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

        # Make sure SQL and Dask use the same "NULL" value
        # (np.nan — the np.NaN alias was removed in NumPy 2.0)
        dask_result = dask_result.fillna(np.nan)
        sqlite_result = sqlite_result.fillna(np.nan)

        assert_frame_equal(dask_result, sqlite_result, check_dtype=False)
    finally:
        # the connection was leaked before: close it even when asserts fail
        engine.close()
def eq_sqlite(sql, **dfs):
    """Assert dask-sql and SQLite agree on the result of ``sql``.

    Each keyword argument is registered both as a dask-sql table and as a
    SQLite table (keyed by the argument name); the query is executed on
    both engines and the frames are compared with ``check_dtype=False``.
    """
    c = Context()
    engine = sqlite3.connect(":memory:")
    try:
        for name, df in dfs.items():
            c.create_table(name, df)
            df.to_sql(name, engine, index=False)

        dask_result = c.sql(sql).reset_index(drop=True)
        sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

        # casting to object to ensure equality with sql-lite
        # which returns object dtype for datetime inputs
        dask_result = cast_datetime_to_string(dask_result)

        # Make sure SQL and Dask use the same "NULL" value
        # (np.nan — the np.NaN alias was removed in NumPy 2.0)
        dask_result = dask_result.fillna(np.nan)
        sqlite_result = sqlite_result.fillna(np.nan)

        assert_eq(dask_result, sqlite_result, check_dtype=False)
    finally:
        # the connection was leaked before: close it even when asserts fail
        engine.close()
def cmd_loop(
    context: Context = None,
    client: dask.distributed.Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite was loaded
        log_level: (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    # Show full frames in the REPL output instead of truncated previews
    for option in (
        "display.max_rows",
        "display.max_columns",
        "display.width",
        "display.max_colwidth",
    ):
        pd.set_option(option, None)

    logging.basicConfig(level=log_level)

    client = client or dask.distributed.Client()
    context = context or Context()

    if startup:
        # Issue a trivial query so Apache Calcite is loaded before first use
        context.sql("SELECT 1 + 1").compute()

    session = PromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue  # Ctrl-C clears the current input
        except EOFError:
            break  # Ctrl-D leaves the REPL

        text = text.rstrip(";").strip()
        if not text:
            continue

        try:
            print(context.sql(text, return_futures=False))
        except Exception as e:
            print(e)
# NOTE(review): this line is the tail of a UDF whose ``def`` is above this
# chunk — ``client``, ``bst`` and ``X`` must be in scope there; confirm.
    return dask_xgboost.predict(client, bst, X)


# Create a context
from dask_sql import Context, run_server

c = Context()
# Register the UDF under the SQL name "predict_price"; it takes three
# float64 columns and returns a float64 column.
c.register_function(predict_price, "predict_price",
                    [("total_amount", np.float64),
                     ("trip_distance", np.float64),
                     ("passenger_count", np.float64)],
                    np.float64)

# Load the data from S3 (anonymous access) and keep it in cluster memory
df = dd.read_csv("s3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv",
                 dtype={
                     "payment_type": "UInt8",
                     "VendorID": "UInt8",
                     "passenger_count": "UInt8",
                     # NOTE(review): "RatecodeIDq" looks like a typo for
                     # "RatecodeID" — verify against the CSV header
                     "RatecodeIDq": "UInt8",
                 },
                 storage_options={"anon": True}).persist()
wait(df)
c.create_table("nyc-taxi", df)
# Warm-up query so the SQL machinery is loaded before serving requests
c.sql("SELECT 1 + 1").compute()

# Finally, spin up the dask-sql server
run_server(context=c, client=client)
import pandas as pd

from dask_sql import Context

c = Context()

# Removed an unused ``data = \"\"\"name,x ...\"\"\"`` CSV literal that was dead code.
# 300 rows cycling through three names with x = 0..299, so each name's sum is
# deterministic: Alice -> 14850, Bob -> 14950, Chris -> 15050.
df = pd.DataFrame({"name": ["Alice", "Bob", "Chris"] * 100, "x": list(range(300))})
ddf = dd.from_pandas(df, npartitions=10)
c.create_table("my_data", ddf)

got = c.sql(
    """
SELECT
    my_data.name,
    SUM(my_data.x) AS "S"
FROM
    my_data
GROUP BY
    my_data.name
"""
)
expect = pd.DataFrame({"name": ["Alice", "Bob", "Chris"], "S": [14850, 14950, 15050]})
dd.assert_eq(got, expect)
def query(self, dataframe: DataFrame) -> Result[DataFrame, InvalidJob]:
    """Register ``dataframe`` under the configured table name, execute the
    stored SQL string against it, and wrap the lazy result in ``Success``."""
    context = Context()
    context.create_table(self.table_name, dataframe)
    return Success(context.sql(self.query_string))