def test_sort_with_nan_many_partitions(gpu):
    c = Context()
    df = pd.DataFrame(
        {
            "a": [float("nan"), 1] * 30,
            "b": [1, 2, 3] * 20,
        }
    )
    c.create_table("df", dd.from_pandas(df, npartitions=10), gpu=gpu)

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST")

    assert_eq(
        df_result,
        pd.DataFrame(
            {
                "a": [float("nan")] * 30 + [1] * 30,
                "b": [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
            }
        ),
        check_index=False,
    )

    df = pd.DataFrame({"a": [float("nan"), 1] * 30})
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = c.sql("SELECT * FROM df ORDER BY a")

    assert_eq(
        df_result,
        pd.DataFrame(
            {
                "a": [1] * 30 + [float("nan")] * 30,
            }
        ),
        check_index=False,
    )
def c(
    df_simple,
    df,
    user_table_1,
    user_table_2,
    long_table,
    user_table_inf,
    user_table_nan,
    string_table,
    datetime_table,
):
    dfs = {
        "df_simple": df_simple,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
        "long_table": long_table,
        "user_table_inf": user_table_inf,
        "user_table_nan": user_table_nan,
        "string_table": string_table,
        "datetime_table": datetime_table,
    }

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    for df_name, df in dfs.items():
        dask_df = dd.from_pandas(df, npartitions=3)
        c.create_table(df_name, dask_df)

    yield c
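# A minimal sketch of how a test would consume the generator above, assuming it
# is registered as a pytest fixture named `c` (the fixture decorator is not shown
# here). "df" is one of the tables the fixture registers; COUNT(*) always yields
# exactly one row, so the assertion does not depend on the table's contents.
def test_count_rows(c):
    result = c.sql("SELECT COUNT(*) AS cnt FROM df").compute()
    assert len(result) == 1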
def test_intake_location(intake_catalog_location):
    c = Context()
    c.create_table(
        "df",
        intake_catalog_location,
        format="intake",
        intake_table_name="intake_table",
    )

    check_read_table(c)
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df").compute().reset_index(drop=True)
    df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_frame_equal(df, result_df)
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df")
    expected_df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_eq(result_df, expected_df, check_index=False)
def main():  # pragma: no cover
    """
    CLI version of the :func:`run_server` function.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="The host interface to listen on (defaults to all interfaces)",
    )
    parser.add_argument(
        "--port", default=8080, help="The port to listen on (defaults to 8080)"
    )
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=uvicorn.config.LOG_LEVELS,
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = dask.distributed.Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    run_server(
        context=context,
        client=client,
        host=args.host,
        port=args.port,
        startup=args.startup,
        log_level=args.log_level,
    )
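# The server can also be started programmatically instead of via this CLI entry
# point. A hedged sketch, assuming `Context` and `run_server` are importable from
# `dask_sql` as used by `main()` above; the table name and data path are purely
# illustrative placeholders.
from dask_sql import Context, run_server

c = Context()
c.create_table("my_table", "/path/to/data.parquet")  # hypothetical data location
run_server(context=c, host="0.0.0.0", port=8080)  # same defaults as the CLI flags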
def select(
    self, dfs: fugue.dataframe.DataFrames, statement: str
) -> fugue.dataframe.DataFrame:
    """Send the SQL command to the dask-sql context and register all temporary dataframes"""
    c = Context()

    for k, v in dfs.items():
        c.create_table(k, self.execution_engine.to_df(v).native)

    df = c.sql(statement)
    return fugue_dask.dataframe.DaskDataFrame(df)
def test_intake_sql(intake_catalog_location):
    c = Context()
    c.sql(
        f"""
        CREATE TABLE df WITH (
            location = '{intake_catalog_location}',
            format = 'intake',
            intake_table_name = 'intake_table'
        )
        """
    )

    check_read_table(c)
def _init_app(
    app: FastAPI,
    context: Context = None,
    client: dask.distributed.Client = None,
):
    app.c = context or Context()
    app.future_list = {}

    try:
        client = client or dask.distributed.Client.current()
    except ValueError:
        client = dask.distributed.Client()
    app.client = client
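# A minimal sketch of what `_init_app` does to a FastAPI application: it attaches
# the SQL context and a Dask client so request handlers can reach them via `app.c`
# and `app.client`. The `app` object built here is an assumption for illustration;
# in dask-sql the server module constructs its own app.
import dask.distributed
from fastapi import FastAPI

from dask_sql import Context

app = FastAPI()
context = Context()

_init_app(app, context=context, client=dask.distributed.Client(processes=False))

assert app.c is context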
def c(
    df_simple,
    df_wide,
    df,
    user_table_1,
    user_table_2,
    long_table,
    user_table_inf,
    user_table_nan,
    string_table,
    datetime_table,
    parquet_ddf,
    gpu_user_table_1,
    gpu_df,
    gpu_long_table,
    gpu_string_table,
    gpu_datetime_table,
):
    dfs = {
        "df_simple": df_simple,
        "df_wide": df_wide,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
        "long_table": long_table,
        "user_table_inf": user_table_inf,
        "user_table_nan": user_table_nan,
        "string_table": string_table,
        "datetime_table": datetime_table,
        "parquet_ddf": parquet_ddf,
        "gpu_user_table_1": gpu_user_table_1,
        "gpu_df": gpu_df,
        "gpu_long_table": gpu_long_table,
        "gpu_string_table": gpu_string_table,
        "gpu_datetime_table": gpu_datetime_table,
    }

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    for df_name, df in dfs.items():
        if df is None:
            continue

        if hasattr(df, "npartitions"):
            # df is already a dask collection
            dask_df = df
        else:
            dask_df = dd.from_pandas(df, npartitions=3)
        c.create_table(df_name, dask_df)

    yield c
def test_sort_with_nan_more_columns(gpu):
    c = Context()
    df = pd.DataFrame(
        {
            "a": [1, 1, 2, 2, float("nan"), float("nan")],
            "b": [1, 1, 2, float("nan"), float("inf"), 5],
            "c": [1, float("nan"), 3, 4, 5, 6],
        }
    )
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
    )
    assert_eq(
        df_result,
        pd.DataFrame(
            {
                "a": [float("nan"), float("nan"), 1, 1, 2, 2],
                "b": [float("inf"), 5, 1, 1, 2, float("nan")],
                "c": [5, 6, float("nan"), 1, 3, 4],
            }
        ),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame(
            {
                "a": [1, 1, 2, 2, float("nan"), float("nan")],
                "b": [1, 1, float("nan"), 2, float("inf"), 5],
                "c": [1, float("nan"), 4, 3, 5, 6],
            }
        ),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame(
            {
                "a": [float("nan"), float("nan"), 1, 1, 2, 2],
                "b": [float("inf"), 5, 1, 1, 2, float("nan")],
                "c": [5, 6, 1, float("nan"), 3, 4],
            }
        ),
        check_index=False,
    )
def main():  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(
        context=context, client=client, startup=args.startup, log_level=args.log_level
    )
def test_sort_with_nan_many_partitions():
    c = Context()
    df = pd.DataFrame(
        {
            "a": [float("nan"), 1] * 30,
            "b": [1, 2, 3] * 20,
        }
    )
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST")
        .compute()
        .reset_index(drop=True)
    )

    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {
                "a": [float("nan")] * 30 + [1] * 30,
                "b": [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
            }
        ),
        check_names=False,
    )
def test_sort_with_nan_more_columns():
    c = Context()
    df = pd.DataFrame(
        {
            "a": [1, 1, 2, 2, float("nan"), float("nan")],
            "b": [1, 1, 2, float("nan"), float("inf"), 5],
            "c": [1, float("nan"), 3, 4, 5, 6],
        }
    )
    c.create_table("df", df)

    # only the "c" column (accessed via the `.c` attribute) is compared
    df_result = (
        c.sql(
            "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
        )
        .c.compute()
        .reset_index(drop=True)
    )
    assert_series_equal(
        df_result, pd.Series([5, 6, float("nan"), 1, 3, 4]), check_names=False
    )

    df_result = (
        c.sql(
            "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
        )
        .c.compute()
        .reset_index(drop=True)
    )
    assert_series_equal(
        df_result, pd.Series([1, float("nan"), 4, 3, 5, 6]), check_names=False
    )
def test_sort_with_nan():
    c = Context()
    df = pd.DataFrame(
        {
            "a": [1, 2, float("nan"), 2],
            "b": [4, float("nan"), 5, float("inf")],
        }
    )
    c.create_table("df", df)

    df_result = c.sql("SELECT * FROM df ORDER BY a").compute().reset_index(drop=True)
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS LAST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a ASC").compute().reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a DESC").compute().reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 2, 2, 1], "b": [5, float("inf"), float("nan"), 4]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 2, 2, 1], "b": [5, float("inf"), float("nan"), 4]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [2, 2, 1, float("nan")], "b": [float("inf"), float("nan"), 4, 5]}
        ),
    )
def test_sort_with_nan(gpu):
    c = Context()
    df = pd.DataFrame(
        {
            "a": [1, 2, float("nan"), 2],
            "b": [4, float("nan"), 5, float("inf")],
        }
    )
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql("SELECT * FROM df ORDER BY a")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 2, 2, 1], "b": [5, float("nan"), float("inf"), 4]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 2, 2, 1], "b": [5, float("nan"), float("inf"), 4]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [2, 2, 1, float("nan")], "b": [float("nan"), float("inf"), 4, 5]}
        ),
        check_index=False,
    )
def cmd_loop(
    context: Context = None,
    client: Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite was loaded
        log_level: (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        try:
            df = context.sql(text, return_futures=False)
            print(df)
        except Exception:
            traceback.print_exc()
def test_intake_catalog(intake_catalog_location):
    catalog = intake.open_catalog(intake_catalog_location)
    c = Context()
    c.create_table("df", catalog, intake_table_name="intake_table")

    check_read_table(c)
def cmd_loop(
    context: Context = None,
    client: Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite was loaded
        log_level: (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        meta_command_detected = _meta_commands(text, context=context, client=client)
        if isinstance(meta_command_detected, Client):
            client = meta_command_detected

        if not meta_command_detected:
            try:
                df = context.sql(text, return_futures=True)
                if df is not None:  # some sql commands return None
                    df = df.persist()

                    # Now turn it into a list of futures
                    futures = client.futures_of(df)
                    with ProgressBar() as pb:
                        for _ in pb(
                            as_completed(futures),
                            total=len(futures),
                            label="Executing",
                        ):
                            continue

                    df = df.compute()
                    print(df.to_markdown(tablefmt="fancy_grid"))
            except Exception:
                traceback.print_exc()
def assert_query_gives_same_result(engine):
    np.random.seed(42)

    df1 = dd.from_pandas(
        pd.DataFrame(
            {
                "user_id": np.random.choice([1, 2, 3, 4, pd.NA], 100),
                "a": np.random.rand(100),
                "b": np.random.randint(-10, 10, 100),
            }
        ),
        npartitions=3,
    )
    df1["user_id"] = df1["user_id"].astype("Int64")

    df2 = dd.from_pandas(
        pd.DataFrame(
            {
                "user_id": np.random.choice([1, 2, 3, 4], 100),
                "c": np.random.randint(20, 30, 100),
                "d": np.random.choice(["a", "b", "c", None], 100),
            }
        ),
        npartitions=3,
    )

    df3 = dd.from_pandas(
        pd.DataFrame(
            {
                "s": [
                    "".join(np.random.choice(["a", "B", "c", "D"], 10))
                    for _ in range(100)
                ]
                + [None]
            }
        ),
        npartitions=3,
    )

    # the other column is an Int64 as well, which makes joining simpler
    df2["user_id"] = df2["user_id"].astype("Int64")

    # add some NaNs
    df1["a"] = df1["a"].apply(
        lambda a: float("nan") if a > 0.8 else a, meta=("a", "float")
    )
    df1["b_bool"] = df1["b"].apply(
        lambda b: pd.NA if b > 5 else b < 0, meta=("a", "bool")
    )

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    c.create_table("df1", df1)
    c.create_table("df2", df2)
    c.create_table("df3", df3)

    df1.compute().to_sql("df1", engine, index=False, if_exists="replace")
    df2.compute().to_sql("df2", engine, index=False, if_exists="replace")
    df3.compute().to_sql("df3", engine, index=False, if_exists="replace")

    def _assert_query_gives_same_result(query, sort_columns=None, **kwargs):
        sql_result = pd.read_sql_query(query, engine)
        dask_result = c.sql(query).compute()

        # allow that the names are different
        # as expressions are handled differently
        dask_result.columns = sql_result.columns

        if sort_columns:
            sql_result = sql_result.sort_values(sort_columns)
            dask_result = dask_result.sort_values(sort_columns)

        sql_result = sql_result.reset_index(drop=True)
        dask_result = dask_result.reset_index(drop=True)

        assert_frame_equal(sql_result, dask_result, check_dtype=False, **kwargs)

    return _assert_query_gives_same_result
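# A hedged usage sketch of the helper returned above, assuming it is exposed as a
# pytest fixture named `assert_query_gives_same_result`: a test passes a query and,
# since row order is not guaranteed, the columns to sort by before comparing the
# database result against the dask-sql result. The specific query is illustrative.
def test_join_matches_database(assert_query_gives_same_result):
    assert_query_gives_same_result(
        "SELECT df1.user_id, df2.c FROM df1 JOIN df2 ON df1.user_id = df2.user_id",
        sort_columns=["user_id", "c"],
    )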