Example #1
def test_sort_with_nan_many_partitions(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [float("nan"), 1] * 30,
        "b": [1, 2, 3] * 20,
    })
    c.create_table("df", dd.from_pandas(df, npartitions=10), gpu=gpu)

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST")

    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan")] * 30 + [1] * 30,
            "b":
            [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
        }),
        check_index=False,
    )

    df = pd.DataFrame({"a": [float("nan"), 1] * 30})
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = c.sql("SELECT * FROM df ORDER BY a")

    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1] * 30 + [float("nan")] * 30,
        }),
        check_index=False,
    )
Example #2
def c(
    df_simple,
    df,
    user_table_1,
    user_table_2,
    long_table,
    user_table_inf,
    user_table_nan,
    string_table,
    datetime_table,
):
    dfs = {
        "df_simple": df_simple,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
        "long_table": long_table,
        "user_table_inf": user_table_inf,
        "user_table_nan": user_table_nan,
        "string_table": string_table,
        "datetime_table": datetime_table,
    }

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    for df_name, df in dfs.items():
        dask_df = dd.from_pandas(df, npartitions=3)
        c.create_table(df_name, dask_df)

    yield c
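A test can then simply take the fixture as an argument and query any of the registered tables. The following is a minimal, hypothetical sketch (the test name is illustrative, and it assumes the fixture's tables are non-empty):

def test_tables_are_registered(c):
    # `c` is the Context yielded by the fixture above
    result = c.sql("SELECT * FROM user_table_1")
    assert len(result.compute()) > 0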
Example #3
def test_intake_location(intake_catalog_location):
    c = Context()
    c.create_table("df",
                   intake_catalog_location,
                   format="intake",
                   intake_table_name="intake_table")

    check_read_table(c)
Example #4
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df").compute().reset_index(drop=True)
    df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_frame_equal(df, result_df)
Example #5
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df")
    expected_df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_eq(result_df, expected_df, check_index=False)
Example #6
def main():  # pragma: no cover
    """
    CLI version of the :func:`run_server` function.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="The host interface to listen on (defaults to all interfaces)",
    )
    parser.add_argument(
        "--port", default=8080, help="The port to listen on (defaults to 8080)"
    )
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=uvicorn.config.LOG_LEVELS,
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = dask.distributed.Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    run_server(
        context=context,
        client=client,
        host=args.host,
        port=args.port,
        startup=args.startup,
        log_level=args.log_level,
    )
Example #7
    def select(self, dfs: fugue.dataframe.DataFrames,
               statement: str) -> fugue.dataframe.DataFrame:
        """Send the SQL command to the dask-sql context and register all temporary dataframes"""
        c = Context()

        for k, v in dfs.items():
            c.create_table(k, self.execution_engine.to_df(v).native)

        df = c.sql(statement)
        return fugue_dask.dataframe.DaskDataFrame(df)
Example #8
def test_intake_sql(intake_catalog_location):
    c = Context()
    c.sql(f"""
        CREATE TABLE df WITH (
         location = '{intake_catalog_location}',
         format = 'intake',
         intake_table_name = 'intake_table'
        )
    """)

    check_read_table(c)
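Once registered through CREATE TABLE, the intake-backed table can be queried like any other table; a small, hypothetical sketch (no particular columns are assumed):

df_result = c.sql("SELECT * FROM df")
print(df_result.compute().head())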
Example #9
def _init_app(
    app: FastAPI,
    context: Context = None,
    client: dask.distributed.Client = None,
):
    app.c = context or Context()
    app.future_list = {}

    try:
        client = client or dask.distributed.Client.current()
    except ValueError:
        client = dask.distributed.Client()
    app.client = client
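A rough sketch of wiring this helper into an application, assuming the same module-level imports as the snippet above (FastAPI, Context, dask.distributed); note that creating a Client may start a local cluster:

app = FastAPI()
_init_app(app, context=Context())
# the helper attaches app.c, app.future_list and app.client for route handlers to use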
Example #10
def c(
    df_simple,
    df_wide,
    df,
    user_table_1,
    user_table_2,
    long_table,
    user_table_inf,
    user_table_nan,
    string_table,
    datetime_table,
    parquet_ddf,
    gpu_user_table_1,
    gpu_df,
    gpu_long_table,
    gpu_string_table,
    gpu_datetime_table,
):
    dfs = {
        "df_simple": df_simple,
        "df_wide": df_wide,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
        "long_table": long_table,
        "user_table_inf": user_table_inf,
        "user_table_nan": user_table_nan,
        "string_table": string_table,
        "datetime_table": datetime_table,
        "parquet_ddf": parquet_ddf,
        "gpu_user_table_1": gpu_user_table_1,
        "gpu_df": gpu_df,
        "gpu_long_table": gpu_long_table,
        "gpu_string_table": gpu_string_table,
        "gpu_datetime_table": gpu_datetime_table,
    }

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    for df_name, df in dfs.items():
        if df is None:
            continue
        if hasattr(df, "npartitions"):
            # df is already a dask collection
            dask_df = df
        else:
            dask_df = dd.from_pandas(df, npartitions=3)
        c.create_table(df_name, dask_df)

    yield c
Example #11
def test_sort_with_nan_more_columns(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [1, 1, 2, 2, float("nan"), float("nan")],
        "b": [1, 1, 2, float("nan"), float("inf"), 5],
        "c": [1, float("nan"), 3, 4, 5, 6],
    })
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), float("nan"), 1, 1, 2, 2],
            "b": [float("inf"), 5, 1, 1, 2,
                  float("nan")],
            "c": [5, 6, float("nan"), 1, 3, 4],
        }),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 1, 2, 2, float("nan"),
                  float("nan")],
            "b": [1, 1, float("nan"), 2,
                  float("inf"), 5],
            "c": [1, float("nan"), 4, 3, 5, 6],
        }),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), float("nan"), 1, 1, 2, 2],
            "b": [float("inf"), 5, 1, 1, 2,
                  float("nan")],
            "c": [5, 6, 1, float("nan"), 3, 4],
        }),
        check_index=False,
    )
Example #12
def main():  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(context=context,
             client=client,
             startup=args.startup,
             log_level=args.log_level)
Example #13
def test_sort_with_nan_many_partitions():
    c = Context()
    df = pd.DataFrame({
        "a": [float("nan"), 1] * 30,
        "b": [1, 2, 3] * 20,
    })
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST")
        .compute()
        .reset_index(drop=True)
    )

    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan")] * 30 + [1] * 30,
            "b":
            [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
        }),
        check_names=False,
    )
Example #14
def test_sort_with_nan_more_columns():
    c = Context()
    df = pd.DataFrame({
        "a": [1, 1, 2, 2, float("nan"), float("nan")],
        "b": [1, 1, 2, float("nan"), float("inf"), 5],
        "c": [1, float("nan"), 3, 4, 5, 6],
    })
    c.create_table("df", df)

    # Only column "c" of the sorted result is compared below
    df_result = (
        c.sql(
            "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
        )["c"]
        .compute()
        .reset_index(drop=True)
    )
    assert_series_equal(
        df_result, pd.Series([5, 6, float("nan"), 1, 3, 4]), check_names=False
    )

    df_result = (
        c.sql(
            "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
        )["c"]
        .compute()
        .reset_index(drop=True)
    )
    assert_series_equal(
        df_result, pd.Series([1, float("nan"), 4, 3, 5, 6]), check_names=False
    )
Example #15
def test_sort_with_nan():
    c = Context()
    df = pd.DataFrame({
        "a": [1, 2, float("nan"), 2],
        "b": [4, float("nan"), 5, float("inf")]
    })
    c.create_table("df", df)

    df_result = c.sql("SELECT * FROM df ORDER BY a").compute().reset_index(
        drop=True)
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS LAST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a ASC")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST").compute().
                 reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a DESC")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("inf"), float("nan"), 4]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("inf"), float("nan"), 4]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [2, 2, 1, float("nan")],
            "b": [float("inf"), float("nan"), 4, 5]
        }),
    )
Example #16
def test_sort_with_nan(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [1, 2, float("nan"), 2],
        "b": [4, float("nan"), 5, float("inf")]
    })
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql("SELECT * FROM df ORDER BY a")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("nan"), float("inf"), 4]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("nan"), float("inf"), 4]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [2, 2, 1, float("nan")],
            "b": [float("nan"), float("inf"), 4, 5]
        }),
        check_index=False,
    )
Example #17
def cmd_loop(
    context: Context = None,
    client: Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite has been loaded
        log_level (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        try:
            df = context.sql(text, return_futures=False)
            print(df)
        except Exception:
            traceback.print_exc()
Example #18
def test_intake_catalog(intake_catalog_location):
    catalog = intake.open_catalog(intake_catalog_location)
    c = Context()
    c.create_table("df", catalog, intake_table_name="intake_table")

    check_read_table(c)
Example #19
def cmd_loop(
    context: Context = None,
    client: Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite has been loaded
        log_level (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        meta_command_detected = _meta_commands(text, context=context, client=client)
        if isinstance(meta_command_detected, Client):
            client = meta_command_detected

        if not meta_command_detected:
            try:
                df = context.sql(text, return_futures=True)
                if df is not None:  # some SQL commands return None
                    df = df.persist()
                    # Now turn it into a list of futures
                    futures = client.futures_of(df)
                    with ProgressBar() as pb:
                        for _ in pb(
                            as_completed(futures), total=len(futures), label="Executing"
                        ):
                            continue
                        df = df.compute()
                        print(df.to_markdown(tablefmt="fancy_grid"))

            except Exception:
                traceback.print_exc()
Example #20
def assert_query_gives_same_result(engine):
    np.random.seed(42)

    df1 = dd.from_pandas(
        pd.DataFrame(
            {
                "user_id": np.random.choice([1, 2, 3, 4, pd.NA], 100),
                "a": np.random.rand(100),
                "b": np.random.randint(-10, 10, 100),
            }
        ),
        npartitions=3,
    )
    df1["user_id"] = df1["user_id"].astype("Int64")

    df2 = dd.from_pandas(
        pd.DataFrame(
            {
                "user_id": np.random.choice([1, 2, 3, 4], 100),
                "c": np.random.randint(20, 30, 100),
                "d": np.random.choice(["a", "b", "c", None], 100),
            }
        ),
        npartitions=3,
    )

    df3 = dd.from_pandas(
        pd.DataFrame(
            {
                "s": [
                    "".join(np.random.choice(["a", "B", "c", "D"], 10))
                    for _ in range(100)
                ]
                + [None]
            }
        ),
        npartitions=3,
    )

    # the other user_id is also an Int64, which makes joining simpler
    df2["user_id"] = df2["user_id"].astype("Int64")

    # add some NaNs
    df1["a"] = df1["a"].apply(
        lambda a: float("nan") if a > 0.8 else a, meta=("a", "float")
    )
    df1["b_bool"] = df1["b"].apply(
        lambda b: pd.NA if b > 5 else b < 0, meta=("a", "bool")
    )

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    c.create_table("df1", df1)
    c.create_table("df2", df2)
    c.create_table("df3", df3)

    df1.compute().to_sql("df1", engine, index=False, if_exists="replace")
    df2.compute().to_sql("df2", engine, index=False, if_exists="replace")
    df3.compute().to_sql("df3", engine, index=False, if_exists="replace")

    def _assert_query_gives_same_result(query, sort_columns=None, **kwargs):
        sql_result = pd.read_sql_query(query, engine)
        dask_result = c.sql(query).compute()

        # allow that the names are different
        # as expressions are handled differently
        dask_result.columns = sql_result.columns

        if sort_columns:
            sql_result = sql_result.sort_values(sort_columns)
            dask_result = dask_result.sort_values(sort_columns)

        sql_result = sql_result.reset_index(drop=True)
        dask_result = dask_result.reset_index(drop=True)

        assert_frame_equal(sql_result, dask_result, check_dtype=False, **kwargs)

    return _assert_query_gives_same_result
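A hypothetical test built on top of this fixture could look like the following sketch (the test name is illustrative; the query only uses the tables and columns created above, and sort_columns removes any row-order ambiguity from the join):

def test_join_gives_same_result(assert_query_gives_same_result):
    assert_query_gives_same_result(
        """
        SELECT df1.user_id, df2.c
        FROM df1
        JOIN df2 ON df1.user_id = df2.user_id
        """,
        sort_columns=["user_id", "c"],
    )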