Esempio n. 1
0
def test_sort_with_nan_many_partitions(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [float("nan"), 1] * 30,
        "b": [1, 2, 3] * 20,
    })
    c.create_table("df", dd.from_pandas(df, npartitions=10), gpu=gpu)

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST")

    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan")] * 30 + [1] * 30,
            "b":
            [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
        }),
        check_index=False,
    )

    df = pd.DataFrame({"a": [float("nan"), 1] * 30})
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = c.sql("SELECT * FROM df ORDER BY a")

    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1] * 30 + [float("nan")] * 30,
        }),
        check_index=False,
    )
Esempio n. 2
0
def test_intake_sql(intake_catalog_location):
    c = Context()
    c.sql(f"""
        CREATE TABLE df WITH (
         location = '{intake_catalog_location}',
         format = 'intake',
         intake_table_name = 'intake_table'
        )
    """)

    check_read_table(c)
Esempio n. 3
0
def test_sort_with_nan_more_columns(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [1, 1, 2, 2, float("nan"), float("nan")],
        "b": [1, 1, 2, float("nan"), float("inf"), 5],
        "c": [1, float("nan"), 3, 4, 5, 6],
    })
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), float("nan"), 1, 1, 2, 2],
            "b": [float("inf"), 5, 1, 1, 2,
                  float("nan")],
            "c": [5, 6, float("nan"), 1, 3, 4],
        }),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 1, 2, 2, float("nan"),
                  float("nan")],
            "b": [1, 1, float("nan"), 2,
                  float("inf"), 5],
            "c": [1, float("nan"), 4, 3, 5, 6],
        }),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), float("nan"), 1, 1, 2, 2],
            "b": [float("inf"), 5, 1, 1, 2,
                  float("nan")],
            "c": [5, 6, 1, float("nan"), 3, 4],
        }),
        check_index=False,
    )
Esempio n. 4
0
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df").compute().reset_index(drop=True)
    df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_frame_equal(df, result_df)
Esempio n. 5
0
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df")
    expected_df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_eq(result_df, expected_df, check_index=False)
Esempio n. 6
0
    def select(self, dfs: fugue.dataframe.DataFrames,
               statement: str) -> fugue.dataframe.DataFrame:
        """Send the SQL command to the dask-sql context and register all temporary dataframes"""
        c = Context()

        for k, v in dfs.items():
            c.create_table(k, self.execution_engine.to_df(v).native)

        df = c.sql(statement)
        return fugue_dask.dataframe.DaskDataFrame(df)
Esempio n. 7
0
def test_sort_with_nan_more_columns():
    c = Context()
    df = pd.DataFrame({
        "a": [1, 1, 2, 2, float("nan"), float("nan")],
        "b": [1, 1, 2, float("nan"), float("inf"), 5],
        "c": [1, float("nan"), 3, 4, 5, 6],
    })
    c.create_table("df", df)

    df_result = (c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
    ).c.compute().reset_index(drop=True))
    assert_series_equal(df_result,
                        pd.Series([5, 6, float("nan"), 1, 3, 4]),
                        check_names=False)

    df_result = (c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
    ).c.compute().reset_index(drop=True))
    assert_series_equal(df_result,
                        pd.Series([1, float("nan"), 4, 3, 5, 6]),
                        check_names=False)
Esempio n. 8
0
def test_sort_with_nan_many_partitions():
    c = Context()
    df = pd.DataFrame({
        "a": [float("nan"), 1] * 30,
        "b": [1, 2, 3] * 20,
    })
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST"
              ).compute().reset_index(drop=True))

    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan")] * 30 + [1] * 30,
            "b":
            [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
        }),
        check_names=False,
    )
Esempio n. 9
0
def test_sort_with_nan():
    c = Context()
    df = pd.DataFrame({
        "a": [1, 2, float("nan"), 2],
        "b": [4, float("nan"), 5, float("inf")]
    })
    c.create_table("df", df)

    df_result = c.sql("SELECT * FROM df ORDER BY a").compute().reset_index(
        drop=True)
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS LAST").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a ASC").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST").compute().
                 reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a DESC").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("inf"), float("nan"), 4]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("inf"), float("nan"), 4]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [2, 2, 1, float("nan")],
            "b": [float("inf"), float("nan"), 4, 5]
        }),
    )
Esempio n. 10
0
def cmd_loop(
    context: Context = None,
    client: Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite was loaded
        log_level: (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        meta_command_detected = _meta_commands(text, context=context, client=client)
        if isinstance(meta_command_detected, Client):
            client = meta_command_detected

        if not meta_command_detected:
            try:
                df = context.sql(text, return_futures=True)
                if df is not None:  # some sql commands returns None
                    df = df.persist()
                    # Now turn it into a list of futures
                    futures = client.futures_of(df)
                    with ProgressBar() as pb:
                        for _ in pb(
                            as_completed(futures), total=len(futures), label="Executing"
                        ):
                            continue
                        df = df.compute()
                        print(df.to_markdown(tablefmt="fancy_grid"))

            except Exception:
                traceback.print_exc()
Esempio n. 11
0
def test_sort_with_nan(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [1, 2, float("nan"), 2],
        "b": [4, float("nan"), 5, float("inf")]
    })
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql("SELECT * FROM df ORDER BY a")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("nan"), float("inf"), 4]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("nan"), float("inf"), 4]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [2, 2, 1, float("nan")],
            "b": [float("nan"), float("inf"), 4, 5]
        }),
        check_index=False,
    )
Esempio n. 12
0
def cmd_loop(
    context: Context = None,
    client: Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite was loaded
        log_level: (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        try:
            df = context.sql(text, return_futures=False)
            print(df)
        except Exception:
            traceback.print_exc()