Example 1
def c(
    df_simple,
    df,
    user_table_1,
    user_table_2,
    long_table,
    user_table_inf,
    user_table_nan,
    string_table,
    datetime_table,
):
    dfs = {
        "df_simple": df_simple,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
        "long_table": long_table,
        "user_table_inf": user_table_inf,
        "user_table_nan": user_table_nan,
        "string_table": string_table,
        "datetime_table": datetime_table,
    }

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    for df_name, df in dfs.items():
        dask_df = dd.from_pandas(df, npartitions=3)
        c.create_table(df_name, dask_df)

    yield c
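
In the source repository this generator is registered as a pytest fixture (the @pytest.fixture decorator is not shown in the snippet). A minimal sketch, not taken from the repository, of a test consuming it:

def test_tables_are_registered(c):
    # every frame passed to the fixture above should end up registered in the context
    assert "df_simple" in c.schema[c.schema_name].tables
    # and be queryable via SQL
    c.sql("SELECT * FROM df_simple").compute()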
Example 2
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df")
    expected_df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_eq(result_df, expected_df, check_index=False)
Example 3
def test_intake_location(intake_catalog_location):
    c = Context()
    c.create_table("df",
                   intake_catalog_location,
                   format="intake",
                   intake_table_name="intake_table")

    check_read_table(c)
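
check_read_table is used in several of these examples but never shown. A minimal sketch of what such a helper could look like; the table name df comes from the example, while the assertions themselves are assumptions:

def check_read_table(c):
    # hypothetical helper: query the table registered from the intake catalog
    # and make sure something sensible comes back
    result_df = c.sql("SELECT * FROM df").compute().reset_index(drop=True)
    assert len(result_df) > 0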
Example 4
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df").compute().reset_index(drop=True)
    df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_frame_equal(df, result_df)
Example 5
def main():  # pragma: no cover
    """
    CLI version of the :func:`run_server` function.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="The host interface to listen on (defaults to all interfaces)",
    )
    parser.add_argument(
        "--port", default=8080, help="The port to listen on (defaults to 8080)"
    )
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=uvicorn.config.LOG_LEVELS,
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = dask.distributed.Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    run_server(
        context=context,
        client=client,
        host=args.host,
        port=args.port,
        startup=args.startup,
        log_level=args.log_level,
    )
Example 6
    def select(self, dfs: fugue.dataframe.DataFrames,
               statement: str) -> fugue.dataframe.DataFrame:
        """Send the SQL command to the dask-sql context and register all temporary dataframes"""
        c = Context()

        for k, v in dfs.items():
            c.create_table(k, self.execution_engine.to_df(v).native)

        df = c.sql(statement)
        return fugue_dask.dataframe.DaskDataFrame(df)
Example 7
def test_intake_sql(intake_catalog_location):
    c = Context()
    c.sql(f"""
        CREATE TABLE df WITH (
         location = '{intake_catalog_location}',
         format = 'intake',
         intake_table_name = 'intake_table'
        )
    """)

    check_read_table(c)
Example 8
def c(
    df_simple,
    df_wide,
    df,
    user_table_1,
    user_table_2,
    long_table,
    user_table_inf,
    user_table_nan,
    string_table,
    datetime_table,
    parquet_ddf,
    gpu_user_table_1,
    gpu_df,
    gpu_long_table,
    gpu_string_table,
    gpu_datetime_table,
):
    dfs = {
        "df_simple": df_simple,
        "df_wide": df_wide,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
        "long_table": long_table,
        "user_table_inf": user_table_inf,
        "user_table_nan": user_table_nan,
        "string_table": string_table,
        "datetime_table": datetime_table,
        "parquet_ddf": parquet_ddf,
        "gpu_user_table_1": gpu_user_table_1,
        "gpu_df": gpu_df,
        "gpu_long_table": gpu_long_table,
        "gpu_string_table": gpu_string_table,
        "gpu_datetime_table": gpu_datetime_table,
    }

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    for df_name, df in dfs.items():
        if df is None:
            continue
        if hasattr(df, "npartitions"):
            # df is already a dask collection
            dask_df = df
        else:
            dask_df = dd.from_pandas(df, npartitions=3)
        c.create_table(df_name, dask_df)

    yield c
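
The gpu_* fixtures are not shown here; the loop above only requires that each value either exposes npartitions (i.e. is already a dask collection) or is a plain pandas frame. A minimal sketch, assuming cudf and dask_cudf are installed, of how such a GPU-backed table could be built; the column is illustrative only:

import pandas as pd

def gpu_df():
    # hypothetical fixture body (would be decorated with @pytest.fixture);
    # return None when the GPU libraries are unavailable, matching the
    # `if df is None: continue` check in the loop above
    try:
        import cudf
        import dask_cudf
    except ImportError:
        return None
    pdf = pd.DataFrame({"a": [1, 2, 3]})
    # a dask_cudf collection exposes npartitions, so the loop above uses it as-is
    return dask_cudf.from_cudf(cudf.from_pandas(pdf), npartitions=2)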
Example 9
def test_sort_with_nan_many_partitions(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [float("nan"), 1] * 30,
        "b": [1, 2, 3] * 20,
    })
    c.create_table("df", dd.from_pandas(df, npartitions=10), gpu=gpu)

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST")

    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan")] * 30 + [1] * 30,
            "b":
            [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
        }),
        check_index=False,
    )

    df = pd.DataFrame({"a": [float("nan"), 1] * 30})
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = c.sql("SELECT * FROM df ORDER BY a")

    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1] * 30 + [float("nan")] * 30,
        }),
        check_index=False,
    )
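
For reference, the NULLS FIRST / NULLS LAST modifiers in these queries correspond to pandas' na_position argument, since SQL NULLs are represented as NaN here. A minimal, self-contained sketch of the equivalent direct sort for the single-column case above:

import pandas as pd

pdf = pd.DataFrame({"a": [float("nan"), 1] * 30})
# "ORDER BY a" behaves like NULLS LAST in the test above
expected = pdf.sort_values("a", na_position="last").reset_index(drop=True)
# "ORDER BY a NULLS FIRST" puts the NaN rows first instead
expected_nulls_first = pdf.sort_values("a", na_position="first").reset_index(drop=True)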
Example 10
def _init_app(
    app: FastAPI, context: Context = None, client: dask.distributed.Client = None,
):
    app.c = context or Context()
    app.future_list = {}

    try:
        client = client or dask.distributed.Client.current()
    except ValueError:
        client = dask.distributed.Client()
    app.client = client
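
A minimal sketch, assuming FastAPI and a local Dask cluster, of how this helper could be wired up; nothing beyond the call itself is taken from the example:

import dask.distributed
from fastapi import FastAPI

app = FastAPI()
# attach a fresh Context and a local in-process client to the app
_init_app(app, context=None, client=dask.distributed.Client(processes=False))
assert app.c is not None and app.client is not None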
Example 11
def main():  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(context=context,
             client=client,
             startup=args.startup,
             log_level=args.log_level)
Example 12
def test_sort_with_nan_many_partitions():
    c = Context()
    df = pd.DataFrame({
        "a": [float("nan"), 1] * 30,
        "b": [1, 2, 3] * 20,
    })
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST"
              ).compute().reset_index(drop=True))

    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan")] * 30 + [1] * 30,
            "b":
            [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
        }),
        check_names=False,
    )
Example 13
def test_sort_with_nan_more_columns(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [1, 1, 2, 2, float("nan"), float("nan")],
        "b": [1, 1, 2, float("nan"), float("inf"), 5],
        "c": [1, float("nan"), 3, 4, 5, 6],
    })
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), float("nan"), 1, 1, 2, 2],
            "b": [float("inf"), 5, 1, 1, 2,
                  float("nan")],
            "c": [5, 6, float("nan"), 1, 3, 4],
        }),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 1, 2, 2, float("nan"),
                  float("nan")],
            "b": [1, 1, float("nan"), 2,
                  float("inf"), 5],
            "c": [1, float("nan"), 4, 3, 5, 6],
        }),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), float("nan"), 1, 1, 2, 2],
            "b": [float("inf"), 5, 1, 1, 2,
                  float("nan")],
            "c": [5, 6, 1, float("nan"), 3, 4],
        }),
        check_index=False,
    )
Example 14
def test_sort_with_nan_more_columns():
    c = Context()
    df = pd.DataFrame({
        "a": [1, 1, 2, 2, float("nan"), float("nan")],
        "b": [1, 1, 2, float("nan"), float("inf"), 5],
        "c": [1, float("nan"), 3, 4, 5, 6],
    })
    c.create_table("df", df)

    df_result = (c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
    ).c.compute().reset_index(drop=True))
    assert_series_equal(df_result,
                        pd.Series([5, 6, float("nan"), 1, 3, 4]),
                        check_names=False)

    df_result = (c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
    ).c.compute().reset_index(drop=True))
    assert_series_equal(df_result,
                        pd.Series([1, float("nan"), 4, 3, 5, 6]),
                        check_names=False)
Example 15
def test_sort_with_nan():
    c = Context()
    df = pd.DataFrame({
        "a": [1, 2, float("nan"), 2],
        "b": [4, float("nan"), 5, float("inf")]
    })
    c.create_table("df", df)

    df_result = c.sql("SELECT * FROM df ORDER BY a").compute().reset_index(
        drop=True)
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS LAST").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a ASC").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST").compute().
                 reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a DESC").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("inf"), float("nan"), 4]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("inf"), float("nan"), 4]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [2, 2, 1, float("nan")],
            "b": [float("inf"), float("nan"), 4, 5]
        }),
    )
Example 16
def test_sort_with_nan(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [1, 2, float("nan"), 2],
        "b": [4, float("nan"), 5, float("inf")]
    })
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql("SELECT * FROM df ORDER BY a")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("nan"), float("inf"), 4]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("nan"), float("inf"), 4]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [2, 2, 1, float("nan")],
            "b": [float("nan"), float("inf"), 4, 5]
        }),
        check_index=False,
    )
Example 17
def create_meta_data(c: Context):
    """
    Creates the schema, table and column metadata for the prestodb JDBC driver so that data
    can be viewed in a database tool like DBeaver. It doesn't create a catalog entry, although
    JDBC expects one, as dask-sql doesn't support catalogs. For both catalogs and procedures,
    empty placeholder tables are created.

    The metadata appears in a separate schema called system_jdbc, largely because the JDBC
    driver tries to access system.jdbc; the similar name keeps it recognizable to the driver
    while not clashing with other schemas.

    A function is required in the /v1/statement handler to change system.jdbc to system_jdbc
    and to ignore ORDER BY statements from the driver (see adjust_for_presto_sql above).

    :param c: Context containing created tables
    :return:
    """

    if c is None:
        logger.warning("Context is None: JDBC metadata not created")
        return
    catalog = ""
    system_schema = "system_jdbc"
    c.create_schema(system_schema)

    # TODO: add support for catalogs in presto interface
    # see https://github.com/dask-contrib/dask-sql/pull/351
    # if catalog and len(catalog.strip()) > 0:
    #     catalogs = pd.DataFrame().append(create_catalog_row(catalog), ignore_index=True)
    #     c.create_table("catalogs", catalogs, schema_name=system_schema)

    schemas = pd.DataFrame().append(create_schema_row(), ignore_index=True)
    c.create_table("schemas", schemas, schema_name=system_schema)
    schema_rows = []

    tables = pd.DataFrame().append(create_table_row(), ignore_index=True)
    c.create_table("tables", tables, schema_name=system_schema)
    table_rows = []

    columns = pd.DataFrame().append(create_column_row(), ignore_index=True)
    c.create_table("columns", columns, schema_name=system_schema)
    column_rows = []

    for schema_name, schema in c.schema.items():
        schema_rows.append(create_schema_row(catalog, schema_name))
        for table_name, dc in schema.tables.items():
            df = dc.df
            logger.info(f"schema ${schema_name}, table {table_name}, {df}")
            table_rows.append(
                create_table_row(catalog, schema_name, table_name))
            pos: int = 0
            for column in df.columns:
                pos = pos + 1
                logger.debug(f"column {column}")
                dtype = "VARCHAR"
                if df[column].dtype == "int64" or df[column].dtype == "int":
                    dtype = "INTEGER"
                elif df[column].dtype == "float64" or df[
                        column].dtype == "float":
                    dtype = "FLOAT"
                elif (df[column].dtype == "datetime"
                      or df[column].dtype == "datetime64[ns]"):
                    dtype = "TIMESTAMP"
                column_rows.append(
                    create_column_row(
                        catalog,
                        schema_name,
                        table_name,
                        dtype,
                        df[column].name,
                        str(pos),
                    ))

    schemas = pd.DataFrame(schema_rows)
    c.create_table("schemas", schemas, schema_name=system_schema)
    tables = pd.DataFrame(table_rows)
    c.create_table("tables", tables, schema_name=system_schema)
    columns = pd.DataFrame(column_rows)
    c.create_table("columns", columns, schema_name=system_schema)

    logger.info(f"jdbc meta data ready for {len(table_rows)} tables")
Example 18
def test_intake_catalog(intake_catalog_location):
    catalog = intake.open_catalog(intake_catalog_location)
    c = Context()
    c.create_table("df", catalog, intake_table_name="intake_table")

    check_read_table(c)
Example 19
def assert_query_gives_same_result(engine):
    np.random.seed(42)

    df1 = dd.from_pandas(
        pd.DataFrame(
            {
                "user_id": np.random.choice([1, 2, 3, 4, pd.NA], 100),
                "a": np.random.rand(100),
                "b": np.random.randint(-10, 10, 100),
            }
        ),
        npartitions=3,
    )
    df1["user_id"] = df1["user_id"].astype("Int64")

    df2 = dd.from_pandas(
        pd.DataFrame(
            {
                "user_id": np.random.choice([1, 2, 3, 4], 100),
                "c": np.random.randint(20, 30, 100),
                "d": np.random.choice(["a", "b", "c", None], 100),
            }
        ),
        npartitions=3,
    )

    df3 = dd.from_pandas(
        pd.DataFrame(
            {
                "s": [
                    "".join(np.random.choice(["a", "B", "c", "D"], 10))
                    for _ in range(100)
                ]
                + [None]
            }
        ),
        npartitions=3,
    )

    # the other user_id is an Int64; casting this one too makes joining simpler
    df2["user_id"] = df2["user_id"].astype("Int64")

    # add some NaNs
    df1["a"] = df1["a"].apply(
        lambda a: float("nan") if a > 0.8 else a, meta=("a", "float")
    )
    df1["b_bool"] = df1["b"].apply(
        lambda b: pd.NA if b > 5 else b < 0, meta=("a", "bool")
    )

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    c.create_table("df1", df1)
    c.create_table("df2", df2)
    c.create_table("df3", df3)

    df1.compute().to_sql("df1", engine, index=False, if_exists="replace")
    df2.compute().to_sql("df2", engine, index=False, if_exists="replace")
    df3.compute().to_sql("df3", engine, index=False, if_exists="replace")

    def _assert_query_gives_same_result(query, sort_columns=None, **kwargs):
        sql_result = pd.read_sql_query(query, engine)
        dask_result = c.sql(query).compute()

        # allow that the names are different
        # as expressions are handled differently
        dask_result.columns = sql_result.columns

        if sort_columns:
            sql_result = sql_result.sort_values(sort_columns)
            dask_result = dask_result.sort_values(sort_columns)

        sql_result = sql_result.reset_index(drop=True)
        dask_result = dask_result.reset_index(drop=True)

        assert_frame_equal(sql_result, dask_result, check_dtype=False, **kwargs)

    return _assert_query_gives_same_result
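
A minimal sketch of how a test might use the returned helper; the query is illustrative, only the table and column names come from the fixture above:

def test_join_gives_same_result(assert_query_gives_same_result):
    # compare dask-sql against the SQL engine on the same join,
    # sorting so that row order does not influence the comparison
    assert_query_gives_same_result(
        "SELECT df1.user_id, df1.b, df2.c "
        "FROM df1 JOIN df2 ON df1.user_id = df2.user_id",
        sort_columns=["user_id", "b", "c"],
    )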
Example 20
def _meta_commands(sql: str, context: Context, client: Client) -> Union[bool, Client]:
    """
    Parses meta commands and prints their result.
    Returns True if a meta command was detected, False otherwise;
    for the \dsc command, the newly created Client is returned instead.
    """
    cmd, schema_name = _parse_meta_command(sql)
    available_commands = [
        ["\\l", "List schemas"],
        ["\\d?, help, ?", "Show available commands"],
        ["\\conninfo", "Show Dask cluster info"],
        ["\\dt [schema]", "List tables"],
        ["\\df [schema]", "List functions"],
        ["\\dm [schema]", "List models"],
        ["\\de [schema]", "List experiments"],
        ["\\dss [schema]", "Switch schema"],
        ["\\dsc [dask scheduler address]", "Switch Dask cluster"],
        ["quit", "Quits dask-sql-cli"],
    ]
    if cmd == "\\dsc":
        # Switch Dask cluster
        _, scheduler_address = _parse_meta_command(sql)
        client = Client(scheduler_address)
        return client  # pragma: no cover
    schema_name = schema_name or context.schema_name
    if cmd == "\\d?" or cmd == "help" or cmd == "?":
        _display_markdown(available_commands, columns=["Commands", "Description"])
    elif cmd == "\\l":
        _display_markdown(context.schema.keys(), columns=["Schemas"])
    elif cmd == "\\dt":
        _display_markdown(context.schema[schema_name].tables.keys(), columns=["Tables"])
    elif cmd == "\\df":
        _display_markdown(
            context.schema[schema_name].functions.keys(), columns=["Functions"]
        )
    elif cmd == "\\de":
        _display_markdown(
            context.schema[schema_name].experiments.keys(), columns=["Experiments"]
        )
    elif cmd == "\\dm":
        _display_markdown(context.schema[schema_name].models.keys(), columns=["Models"])
    elif cmd == "\\conninfo":
        cluster_info = [
            ["Dask scheduler", client.scheduler.__dict__["addr"]],
            ["Dask dashboard", client.dashboard_link],
            ["Cluster status", client.status],
            ["Dask workers", len(client.cluster.workers)],
        ]
        _display_markdown(
            cluster_info, columns=["components", "value"]
        )  # pragma: no cover
    elif cmd == "\\dss":
        if schema_name in context.schema:
            context.schema_name = schema_name
        else:
            print(f"Schema {schema_name} not available")
    elif cmd == "quit":
        print("Quitting dask-sql ...")
        client.close()  # for safer side
        sys.exit()
    elif cmd.startswith("\\"):
        print(
            f"The meta command {cmd} not available, please use commands from below list"
        )
        _display_markdown(available_commands, columns=["Commands", "Description"])
    else:
        # nothing detected probably not a meta command
        return False
    return True
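
_parse_meta_command is referenced above but not shown. A minimal sketch, assuming it simply splits the input into the command and an optional argument; the real helper in dask-sql may differ:

def _parse_meta_command(sql: str):
    # hypothetical implementation: "\\dt my_schema" -> ("\\dt", "my_schema"),
    # "\\l" -> ("\\l", None)
    command, _, argument = sql.strip().partition(" ")
    return command, argument.strip() or None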
Example 21
def cmd_loop(
    context: Context = None,
    client: Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite has been loaded
        log_level: (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        meta_command_detected = _meta_commands(text, context=context, client=client)
        if isinstance(meta_command_detected, Client):
            client = meta_command_detected

        if not meta_command_detected:
            try:
                df = context.sql(text, return_futures=True)
                if df is not None:  # some SQL commands return None
                    df = df.persist()
                    # Now turn it into a list of futures
                    futures = client.futures_of(df)
                    with ProgressBar() as pb:
                        for _ in pb(
                            as_completed(futures), total=len(futures), label="Executing"
                        ):
                            continue
                        df = df.compute()
                        print(df.to_markdown(tablefmt="fancy_grid"))

            except Exception:
                traceback.print_exc()
Example 22
def run_server(
    context: Context = None,
    client: dask.distributed.Client = None,
    host: str = "0.0.0.0",
    port: int = 8080,
    startup=False,
    log_level=None,
    blocking: bool = True,
    jdbc_metadata: bool = False,
):  # pragma: no cover
    """
    Run a HTTP server for answering SQL queries using ``dask-sql``.
    It uses the `Presto Wire Protocol <https://github.com/prestodb/presto/wiki/HTTP-Protocol>`_
    for communication.
    This means, it has a single POST endpoint `/v1/statement`, which answers
    SQL queries (as string in the body) with the output as a JSON
    (in the format described in the documentation above).
    Every SQL expression that ``dask-sql`` understands can be used here.

    See :ref:`server` for more information.

    Note:
        The presto protocol also includes some statistics on the query
        in the response.
        These statistics are currently only filled with placeholder variables.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        host (:obj:`str`): The host interface to listen on (defaults to all interfaces)
        port (:obj:`int`): The port to listen on (defaults to 8080)
        startup (:obj:`bool`): Whether to wait until Apache Calcite has been loaded
        log_level: (:obj:`str`): The log level of the server and dask-sql
        blocking: (:obj:`bool`): If running in an environment with an event loop (e.g. a jupyter notebook),
                do not block. The server can be stopped with `context.stop_server()` afterwards.
        jdbc_metadata: (:obj:`bool`): If enabled create JDBC metadata tables using schemas and tables in
                the current dask_sql context

    Example:
        It is possible to run an SQL server by using the CLI script ``dask-sql-server``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import run_server

            # Create your pre-filled context
            c = Context()
            ...

            run_server(context=c)

        After starting the server, it is possible to send queries to it, e.g. with the
        `presto CLI <https://prestosql.io/docs/current/installation/cli.html>`_
        or via sqlalchemy (e.g. using the `PyHive <https://github.com/dropbox/PyHive#sqlalchemy>`_ package):

        .. code-block:: python

            from sqlalchemy.engine import create_engine
            engine = create_engine('presto://localhost:8080/')

            import pandas as pd
            pd.read_sql_query("SELECT 1 + 1", con=engine)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.

        If in a jupyter notebook, you should run the following code

        .. code-block:: python

            from dask_sql import Context

            c = Context()
            c.run_server(blocking=False)

            ...

            c.stop_server()

        Note:
            When running in a jupyter notebook without blocking,
            it is not possible to access the SQL server from within the
            notebook, e.g. using sqlalchemy.
            Doing so will deadlock infinitely.

    """
    _init_app(app, context=context, client=client)
    if jdbc_metadata:
        create_meta_data(context)

    if startup:
        app.c.sql("SELECT 1 + 1").compute()

    config = Config(app, host=host, port=port, log_level=log_level)
    server = Server(config=config)

    loop = asyncio.get_event_loop()
    if blocking:
        if loop and loop.is_running():
            apply(loop=loop)

        server.run()
    else:
        if not loop or not loop.is_running():
            raise AttributeError(
                "blocking=True needs a running event loop (e.g. in a jupyter notebook)"
            )
        loop.create_task(server.serve())
        context.sql_server = server
Example 23
def cmd_loop(
    context: Context = None,
    client: Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite has been loaded
        log_level: (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        try:
            df = context.sql(text, return_futures=False)
            print(df)
        except Exception:
            traceback.print_exc()