def test_sort_with_nan_many_partitions(gpu): c = Context() df = pd.DataFrame({ "a": [float("nan"), 1] * 30, "b": [1, 2, 3] * 20, }) c.create_table("df", dd.from_pandas(df, npartitions=10), gpu=gpu) df_result = c.sql( "SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST") assert_eq( df_result, pd.DataFrame({ "a": [float("nan")] * 30 + [1] * 30, "b": [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10, }), check_index=False, ) df = pd.DataFrame({"a": [float("nan"), 1] * 30}) c.create_table("df", dd.from_pandas(df, npartitions=10)) df_result = c.sql("SELECT * FROM df ORDER BY a") assert_eq( df_result, pd.DataFrame({ "a": [1] * 30 + [float("nan")] * 30, }), check_index=False, )
def test_intake_sql(intake_catalog_location): c = Context() c.sql(f""" CREATE TABLE df WITH ( location = '{intake_catalog_location}', format = 'intake', intake_table_name = 'intake_table' ) """) check_read_table(c)
def test_sort_with_nan_more_columns(gpu): c = Context() df = pd.DataFrame({ "a": [1, 1, 2, 2, float("nan"), float("nan")], "b": [1, 1, 2, float("nan"), float("inf"), 5], "c": [1, float("nan"), 3, 4, 5, 6], }) c.create_table("df", df, gpu=gpu) df_result = c.sql( "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST" ) assert_eq( df_result, pd.DataFrame({ "a": [float("nan"), float("nan"), 1, 1, 2, 2], "b": [float("inf"), 5, 1, 1, 2, float("nan")], "c": [5, 6, float("nan"), 1, 3, 4], }), check_index=False, ) df_result = c.sql( "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST" ) assert_eq( df_result, pd.DataFrame({ "a": [1, 1, 2, 2, float("nan"), float("nan")], "b": [1, 1, float("nan"), 2, float("inf"), 5], "c": [1, float("nan"), 4, 3, 5, 6], }), check_index=False, ) df_result = c.sql( "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c DESC NULLS LAST" ) assert_eq( df_result, pd.DataFrame({ "a": [float("nan"), float("nan"), 1, 1, 2, 2], "b": [float("inf"), 5, 1, 1, 2, float("nan")], "c": [5, 6, 1, float("nan"), 3, 4], }), check_index=False, )
def test_select(hive_cursor): c = Context() c.create_table("df", hive_cursor) result_df = c.sql("SELECT * FROM df").compute().reset_index(drop=True) df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32") assert_frame_equal(df, result_df)
def test_select(hive_cursor): c = Context() c.create_table("df", hive_cursor) result_df = c.sql("SELECT * FROM df") expected_df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32") assert_eq(result_df, expected_df, check_index=False)
def select(self, dfs: fugue.dataframe.DataFrames, statement: str) -> fugue.dataframe.DataFrame: """Send the SQL command to the dask-sql context and register all temporary dataframes""" c = Context() for k, v in dfs.items(): c.create_table(k, self.execution_engine.to_df(v).native) df = c.sql(statement) return fugue_dask.dataframe.DaskDataFrame(df)
def test_sort_with_nan_more_columns(): c = Context() df = pd.DataFrame({ "a": [1, 1, 2, 2, float("nan"), float("nan")], "b": [1, 1, 2, float("nan"), float("inf"), 5], "c": [1, float("nan"), 3, 4, 5, 6], }) c.create_table("df", df) df_result = (c.sql( "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST" ).c.compute().reset_index(drop=True)) assert_series_equal(df_result, pd.Series([5, 6, float("nan"), 1, 3, 4]), check_names=False) df_result = (c.sql( "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST" ).c.compute().reset_index(drop=True)) assert_series_equal(df_result, pd.Series([1, float("nan"), 4, 3, 5, 6]), check_names=False)
def test_sort_with_nan_many_partitions(): c = Context() df = pd.DataFrame({ "a": [float("nan"), 1] * 30, "b": [1, 2, 3] * 20, }) c.create_table("df", dd.from_pandas(df, npartitions=10)) df_result = ( c.sql("SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST" ).compute().reset_index(drop=True)) assert_frame_equal( df_result, pd.DataFrame({ "a": [float("nan")] * 30 + [1] * 30, "b": [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10, }), check_names=False, )
def test_sort_with_nan(): c = Context() df = pd.DataFrame({ "a": [1, 2, float("nan"), 2], "b": [4, float("nan"), 5, float("inf")] }) c.create_table("df", df) df_result = c.sql("SELECT * FROM df ORDER BY a").compute().reset_index( drop=True) assert_frame_equal( df_result, pd.DataFrame({ "a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5] }), ) df_result = ( c.sql("SELECT * FROM df ORDER BY a NULLS FIRST").compute().reset_index( drop=True)) assert_frame_equal( df_result, pd.DataFrame({ "a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")] }), ) df_result = ( c.sql("SELECT * FROM df ORDER BY a NULLS LAST").compute().reset_index( drop=True)) assert_frame_equal( df_result, pd.DataFrame({ "a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5] }), ) df_result = ( c.sql("SELECT * FROM df ORDER BY a ASC").compute().reset_index( drop=True)) assert_frame_equal( df_result, pd.DataFrame({ "a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5] }), ) df_result = (c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST").compute( ).reset_index(drop=True)) assert_frame_equal( df_result, pd.DataFrame({ "a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")] }), ) df_result = (c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST").compute(). reset_index(drop=True)) assert_frame_equal( df_result, pd.DataFrame({ "a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5] }), ) df_result = ( c.sql("SELECT * FROM df ORDER BY a DESC").compute().reset_index( drop=True)) assert_frame_equal( df_result, pd.DataFrame({ "a": [float("nan"), 2, 2, 1], "b": [5, float("inf"), float("nan"), 4] }), ) df_result = (c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST").compute( ).reset_index(drop=True)) assert_frame_equal( df_result, pd.DataFrame({ "a": [float("nan"), 2, 2, 1], "b": [5, float("inf"), float("nan"), 4] }), ) df_result = (c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST").compute( ).reset_index(drop=True)) assert_frame_equal( df_result, pd.DataFrame({ "a": [2, 2, 1, float("nan")], "b": [float("inf"), float("nan"), 4, 5] }), )
def cmd_loop( context: Context = None, client: Client = None, startup=False, log_level=None, ): # pragma: no cover """ Run a REPL for answering SQL queries using ``dask-sql``. Every SQL expression that ``dask-sql`` understands can be used here. Args: context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one. client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one. startup (:obj:`bool`): Whether to wait until Apache Calcite was loaded log_level: (:obj:`str`): The log level of the server and dask-sql Example: It is possible to run a REPL by using the CLI script in ``dask-sql`` or by calling this function directly in your user code: .. code-block:: python from dask_sql import cmd_loop # Create your pre-filled context c = Context() ... cmd_loop(context=c) Of course, it is also possible to call the usual ``CREATE TABLE`` commands. """ pd.set_option("display.max_rows", None) pd.set_option("display.max_columns", None) pd.set_option("display.width", None) pd.set_option("display.max_colwidth", None) logging.basicConfig(level=log_level) client = client or Client() context = context or Context() if startup: context.sql("SELECT 1 + 1").compute() session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer)) while True: try: text = session.prompt("(dask-sql) > ") except KeyboardInterrupt: continue except EOFError: break text = text.rstrip(";").strip() if not text: continue meta_command_detected = _meta_commands(text, context=context, client=client) if isinstance(meta_command_detected, Client): client = meta_command_detected if not meta_command_detected: try: df = context.sql(text, return_futures=True) if df is not None: # some sql commands returns None df = df.persist() # Now turn it into a list of futures futures = client.futures_of(df) with ProgressBar() as pb: for _ in pb( as_completed(futures), total=len(futures), label="Executing" ): continue df = df.compute() print(df.to_markdown(tablefmt="fancy_grid")) except Exception: traceback.print_exc()
def test_sort_with_nan(gpu): c = Context() df = pd.DataFrame({ "a": [1, 2, float("nan"), 2], "b": [4, float("nan"), 5, float("inf")] }) c.create_table("df", df, gpu=gpu) df_result = c.sql("SELECT * FROM df ORDER BY a") assert_eq( df_result, pd.DataFrame({ "a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5] }), check_index=False, ) df_result = c.sql("SELECT * FROM df ORDER BY a NULLS FIRST") assert_eq( df_result, pd.DataFrame({ "a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")] }), check_index=False, ) df_result = c.sql("SELECT * FROM df ORDER BY a NULLS LAST") assert_eq( df_result, pd.DataFrame({ "a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5] }), check_index=False, ) df_result = c.sql("SELECT * FROM df ORDER BY a ASC") assert_eq( df_result, pd.DataFrame({ "a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5] }), check_index=False, ) df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST") assert_eq( df_result, pd.DataFrame({ "a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")] }), check_index=False, ) df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST") assert_eq( df_result, pd.DataFrame({ "a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5] }), check_index=False, ) df_result = c.sql("SELECT * FROM df ORDER BY a DESC") assert_eq( df_result, pd.DataFrame({ "a": [float("nan"), 2, 2, 1], "b": [5, float("nan"), float("inf"), 4] }), check_index=False, ) df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST") assert_eq( df_result, pd.DataFrame({ "a": [float("nan"), 2, 2, 1], "b": [5, float("nan"), float("inf"), 4] }), check_index=False, ) df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST") assert_eq( df_result, pd.DataFrame({ "a": [2, 2, 1, float("nan")], "b": [float("nan"), float("inf"), 4, 5] }), check_index=False, )
def cmd_loop( context: Context = None, client: Client = None, startup=False, log_level=None, ): # pragma: no cover """ Run a REPL for answering SQL queries using ``dask-sql``. Every SQL expression that ``dask-sql`` understands can be used here. Args: context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one. client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one. startup (:obj:`bool`): Whether to wait until Apache Calcite was loaded log_level: (:obj:`str`): The log level of the server and dask-sql Example: It is possible to run a REPL by using the CLI script in ``dask-sql`` or by calling this function directly in your user code: .. code-block:: python from dask_sql import cmd_loop # Create your pre-filled context c = Context() ... cmd_loop(context=c) Of course, it is also possible to call the usual ``CREATE TABLE`` commands. """ pd.set_option("display.max_rows", None) pd.set_option("display.max_columns", None) pd.set_option("display.width", None) pd.set_option("display.max_colwidth", None) logging.basicConfig(level=log_level) client = client or Client() context = context or Context() if startup: context.sql("SELECT 1 + 1").compute() session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer)) while True: try: text = session.prompt("(dask-sql) > ") except KeyboardInterrupt: continue except EOFError: break text = text.rstrip(";").strip() if not text: continue try: df = context.sql(text, return_futures=False) print(df) except Exception: traceback.print_exc()