Example no. 1
def test_dask_sql_sg_logistic_regression(
    datatype,
    nrows,
    ncols,
    n_parts,
    wrap_predict
):
    if wrap_predict:
        cuml.set_global_output_type("input")
    else:
        cuml.set_global_output_type("cudf")

    X, y = make_classification(
        n_samples=nrows, n_features=ncols, n_informative=5, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    train_df = cudf.DataFrame(
        X_train, dtype=datatype, columns=[chr(i) for i in range(ncols)]
    )
    train_df["target"] = y_train
    train_ddf = dask_cudf.from_cudf(train_df, npartitions=n_parts)

    c = Context()
    c.create_table("train_df", train_ddf)

    train_query = f"""
        CREATE MODEL model WITH (
            model_class = 'cuml.linear_model.LogisticRegression',
            wrap_predict = {wrap_predict},
            target_column = 'target'
        ) AS (
            SELECT * FROM train_df
        )
    """

    c.sql(train_query)

    skmodel = LogisticRegression().fit(X_train, y_train)

    test_df = cudf.DataFrame(
        X_test, dtype=datatype, columns=[chr(i) for i in range(ncols)]
    )
    test_ddf = dask_cudf.from_cudf(test_df, npartitions=n_parts)
    c.create_table("test_df", test_ddf)

    inference_query = """
        SELECT * FROM PREDICT(
            MODEL model,
            SELECT * FROM test_df
        )
    """

    preds = c.sql(inference_query).compute()
    score = cuml.metrics.accuracy_score(y_test, preds["target"].to_numpy())

    assert score >= skmodel.score(X_test, y_test) - 0.022
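The body above is only the test function; the imports it relies on are not shown. A hedged sketch of plausible imports follows (assumption: the data helpers and the CPU baseline come from scikit-learn, though the originating test module may import them from cuml instead):

import cudf
import cuml
import dask_cudf
from dask_sql import Context

# assumed sources; the original test module may differ
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split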
Example no. 2
def test_sql():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(result, pd.DataFrame)
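To run this snippet outside its test suite, only the imports are missing. A minimal self-contained sketch (nothing beyond the imports and the final print is new):

import pandas as pd

import dask.dataframe as dd
from dask_sql import Context

c = Context()
c.create_table("df", dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1))

lazy = c.sql("SELECT * FROM df")                         # lazy dask DataFrame
eager = c.sql("SELECT * FROM df", return_futures=False)  # computed pandas DataFrame
print(lazy.compute(), eager, sep="\n")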
Example no. 3
def app_client():
    c = Context()
    c.sql("SELECT 1 + 1").compute()
    _init_app(app, c)

    # late import for the importskip
    from fastapi.testclient import TestClient

    yield TestClient(app)

    # don't disconnect the client if using an independent cluster
    if os.getenv("DASK_SQL_TEST_SCHEDULER", None) is None:
        app.client.close()
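The function above appears to be a pytest fixture (it yields a TestClient wrapping the dask-sql FastAPI app). A hedged sketch of how a test might consume it; the endpoint path and payload are assumptions about the Presto-style HTTP API the dask-sql server exposes:

def test_select_one(app_client):
    # assumption: raw SQL is POSTed to the Presto-compatible statement endpoint
    response = app_client.post("/v1/statement", data="SELECT 1 + 1")
    assert response.status_code == 200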
Example no. 4
def test_query_case_sensitivity():
    c = Context()
    df = pd.DataFrame({"id": [0, 1]})

    c.create_table("test", df)

    try:
        c.sql(
            "select ID from test",
            config_options={"sql.identifier.case_sensitive": False},
        )
    except ParsingException as pe:
        assert False, f"Queries should be case insensitve but raised exception {pe}"
Example no. 5
class TimeDaskSQLJoins:
    params = [[10**4, 10**5], [10], [3], [6], [0.75]]

    param_names = [
        "rows",
        "cols",
        "number of join columns",
        "number of chained joins",
        "ratio of dictinct elements",
    ]

    def setup(self, N, ncols, njoin_columns, njoins, distinct_r):
        self.dfs = _generate_dataframes(N, ncols, njoin_columns, njoins,
                                        distinct_r)
        self.dfs = [dd.from_pandas(d, npartitions=1) for d in self.dfs]
        self.join_cols = [
            c for c in self.dfs[0].columns if c in self.dfs[1].columns
        ]
        self.ctx = Context()
        self._create_tables()
        self._create_sql_query()

    def _create_tables(self):
        self.tables = []
        for i, df in enumerate(self.dfs):
            _table_name = f"table_{i:03}"
            self.ctx.create_table(_table_name, df)
            _table = table(_table_name, *[column(c) for c in df.columns])
            self.tables.append(_table)

    def _create_sql_query(self):
        left = self.tables[0]
        joinq = left
        select_cols = list(left.c)
        for right in self.tables[1:]:
            on = and_(
                *(left.c.get(col) == right.c.get(col) for col in self.join_cols))
            joinq = joinq.join(right, on)
            select_cols += [c for c in right.c if c.name not in self.join_cols]
        query = select(*select_cols).select_from(joinq)
        self.sql_query = str(
            query.compile(
                dialect=postgresql.dialect(),
                compile_kwargs={"literal_binds": True},
            ))

    def time_joins(self, N, ncols, njoin_columns, njoins, distinct_r):
        start = time.perf_counter()
        print(f"Processing SQL query: {self.sql_query}")
        res = self.ctx.sql(self.sql_query)
        stop = time.perf_counter()
        print(f"Processing SQL query took {stop-start:0.4f} s.")
        start = time.perf_counter()
        print("Computing dask dataframe")
        res.compute()
        stop = time.perf_counter()
        print(f"Computing dask dataframe took {stop-start:0.4f} s.")
        # Visualize task graph
        # res.visualize('taskgraph.png')
        return res
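The benchmark depends on a _generate_dataframes helper that is not included in this snippet. A hypothetical stand-in, written only so the class above can be exercised, might look like this (the real helper in the benchmark suite may differ):

import numpy as np
import pandas as pd


def _generate_dataframes(N, ncols, njoin_columns, njoins, distinct_r):
    # Build njoins + 1 frames sharing njoin_columns key columns; value
    # columns are unique per table so they survive the final SELECT.
    rng = np.random.default_rng(0)
    key_cols = [f"key_{i}" for i in range(njoin_columns)]
    n_distinct = max(1, int(N * distinct_r))
    dfs = []
    for t in range(njoins + 1):
        data = {c: rng.integers(0, n_distinct, size=N) for c in key_cols}
        for j in range(max(0, ncols - njoin_columns)):
            data[f"val_{t}_{j}"] = rng.random(N)
        dfs.append(pd.DataFrame(data))
    return dfs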
Example no. 6
def test_tables(gpu):
    c = Context()
    c.create_table("table", pd.DataFrame(), gpu=gpu)

    result_df = c.sql(f'SHOW TABLES FROM "{c.schema_name}"')
    expected_df = pd.DataFrame({"Table": ["table"]})

    assert_eq(result_df, expected_df, check_index=False)
Example no. 7
def test_sql():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)
    assert_frame_equal(result.compute(), data_frame.compute())

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(result, pd.DataFrame)
    assert_frame_equal(result, data_frame.compute())

    result = c.sql("SELECT * FROM other_df",
                   dataframes={"other_df": data_frame})
    assert isinstance(result, dd.DataFrame)
    assert_frame_equal(result.compute(), data_frame.compute())
Example no. 8
def test_show_tables_no_schema(c):
    c = Context()

    df = pd.DataFrame({"id": [0, 1]})
    c.create_table("test", df)

    actual_df = c.sql("show tables").compute()
    expected_df = pd.DataFrame({"Table": ["test"]})
    assert_eq(actual_df, expected_df)
Example no. 9
    def select(self, dfs: fugue.dataframe.DataFrames,
               statement: str) -> fugue.dataframe.DataFrame:
        """Send the SQL command to the dask-sql context and register all temporary dataframes"""
        c = Context()

        for k, v in dfs.items():
            c.create_table(k, self.execution_engine.to_df(v).native)

        df = c.sql(statement)
        return fugue_dask.dataframe.DaskDataFrame(df)
Example no. 10
def test_sql(gpu):
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame, gpu=gpu)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert not isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)

    result = c.sql(
        "SELECT * FROM other_df", dataframes={"other_df": data_frame}, gpu=gpu
    )
    assert isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)
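The gpu argument is injected by the surrounding test suite. One plausible (assumed) way to supply it is a parametrized pytest fixture along these lines; the real conftest.py may use a different mechanism:

import pytest


@pytest.fixture(params=[False, pytest.param(True, marks=pytest.mark.gpu)])
def gpu(request):
    # False exercises the CPU path; True is opt-in via a custom "gpu" marker
    return request.param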
Example no. 11
def test_join_case_projection_subquery():
    c = Context()

    # Tables for query
    demo = pd.DataFrame({"demo_sku": [], "hd_dep_count": []})
    site_page = pd.DataFrame({"site_page_sk": [], "site_char_count": []})
    sales = pd.DataFrame({
        "sales_hdemo_sk": [],
        "sales_page_sk": [],
        "sold_time_sk": []
    })
    t_dim = pd.DataFrame({"t_time_sk": [], "t_hour": []})

    c.create_table("demos", demo, persist=False)
    c.create_table("site_page", site_page, persist=False)
    c.create_table("sales", sales, persist=False)
    c.create_table("t_dim", t_dim, persist=False)

    c.sql("""
    SELECT CASE WHEN pmc > 0.0 THEN CAST (amc AS DOUBLE) / CAST (pmc AS DOUBLE) ELSE -1.0 END AS am_pm_ratio
    FROM
    (
        SELECT SUM(amc1) AS amc, SUM(pmc1) AS pmc
        FROM
        (
            SELECT
                CASE WHEN t_hour BETWEEN 7 AND 8 THEN COUNT(1) ELSE 0 END AS amc1,
                CASE WHEN t_hour BETWEEN 19 AND 20 THEN COUNT(1) ELSE 0 END AS pmc1
            FROM sales ws
            JOIN demos hd ON (hd.demo_sku = ws.sales_hdemo_sk and hd.hd_dep_count = 5)
            JOIN site_page sp ON (sp.site_page_sk = ws.sales_page_sk and sp.site_char_count BETWEEN 5000 AND 6000)
            JOIN t_dim td ON (td.t_time_sk = ws.sold_time_sk and td.t_hour IN (7,8,19,20))
            GROUP BY t_hour
        ) cnt_am_pm
    ) sum_am_pm
    """).compute()
Example no. 12
def eq_sqlite(sql, **dfs):
    c = Context()
    engine = sqlite3.connect(":memory:")

    for name, df in dfs.items():
        c.create_table(name, df)
        df.to_sql(name, engine, index=False)

    dask_result = c.sql(sql).compute().reset_index(drop=True)
    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

    # Make sure SQL and Dask use the same "NULL" value
    dask_result = dask_result.fillna(np.nan)
    sqlite_result = sqlite_result.fillna(np.nan)

    assert_frame_equal(dask_result, sqlite_result, check_dtype=False)
Example no. 13
def eq_sqlite(sql, **dfs):
    c = Context()
    engine = sqlite3.connect(":memory:")

    for name, df in dfs.items():
        c.create_table(name, df)
        df.to_sql(name, engine, index=False)

    dask_result = c.sql(sql).reset_index(drop=True)
    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

    # casting to object to ensure equality with SQLite,
    # which returns object dtype for datetime inputs
    dask_result = cast_datetime_to_string(dask_result)

    # Make sure SQL and Dask use the same "NULL" value
    dask_result = dask_result.fillna(np.nan)
    sqlite_result = sqlite_result.fillna(np.nan)

    assert_eq(dask_result, sqlite_result, check_dtype=False)
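Both eq_sqlite variants are invoked the same way: the SQL text plus keyword arguments naming the input frames (cast_datetime_to_string in the second variant is another test helper not shown here). A small, illustrative usage sketch; the table and column names are made up:

import pandas as pd

eq_sqlite(
    'SELECT a, SUM(b) AS "s" FROM df GROUP BY a ORDER BY a',
    df=pd.DataFrame({"a": [1, 1, 2], "b": [10.0, 20.0, 5.0]}),
)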
Example no. 14
def cmd_loop(
    context: Context = None,
    client: dask.distributed.Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite has been loaded
        log_level (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or dask.distributed.Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = PromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        try:
            df = context.sql(text, return_futures=False)
            print(df)
        except Exception as e:
            print(e)
Example no. 15
        return dask_xgboost.predict(client, bst, X)

    # Create a context
    from dask_sql import Context, run_server
    c = Context()

    c.register_function(predict_price, "predict_price",
                        [("total_amount", np.float64),
                         ("trip_distance", np.float64),
                         ("passenger_count", np.float64)], np.float64)

    # Load the data from S3
    df = dd.read_csv("s3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv",
                     dtype={
                         "payment_type": "UInt8",
                         "VendorID": "UInt8",
                         "passenger_count": "UInt8",
                         "RatecodeIDq": "UInt8",
                     },
                     storage_options={
                         "anon": True
                     }).persist()

    wait(df)

    c.create_table("nyc-taxi", df)

    c.sql("SELECT 1 + 1").compute()

    # Finally, spin up the dask-sql server
    run_server(context=c, client=client)
Example no. 16
import dask.dataframe as dd
import pandas as pd

from dask_sql import Context

c = Context()

data = """
name,x
Alice,34
Bob,
"""

df = pd.DataFrame({"name": ["Alice", "Bob", "Chris"] * 100, "x": list(range(300))})
ddf = dd.from_pandas(df, npartitions=10)
c.create_table("my_data", ddf)

got = c.sql(
    """
    SELECT
        my_data.name,
        SUM(my_data.x) AS "S"
    FROM
        my_data
    GROUP BY
        my_data.name
"""
)
expect = pd.DataFrame({"name": ["Alice", "Bob", "Chris"], "S": [14850, 14950, 15050]})

dd.assert_eq(got, expect)
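For reference, the expected sums follow from how the frame is built: x is range(300) and the three names cycle in order, so Alice takes indices 0, 3, ..., 297 (sum 3 * 4950 = 14850), Bob's indices are each one larger (+100, giving 14950), and Chris's two larger (+200, giving 15050). A quick pure-pandas check of the same aggregates:

check = df.groupby("name", sort=True)["x"].sum()
assert list(check) == [14850, 14950, 15050]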
Example no. 17
def query(self, dataframe: DataFrame) -> Result[DataFrame, InvalidJob]:
    c = Context()
    c.create_table(self.table_name, dataframe)
    queried = c.sql(self.query_string)
    return Success(queried)
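The method above is cut out of a larger class: self.table_name, self.query_string, Result, Success, and InvalidJob are defined elsewhere. A hypothetical enclosing class, assuming the railway-style containers come from the returns library (every name outside the method body is illustrative):

from dask.dataframe import DataFrame  # assumption: dask frames are passed in
from dask_sql import Context
from returns.result import Result, Success  # assumption: `returns` provides these


class InvalidJob(Exception):
    """Placeholder failure type for the Result's error branch."""


class SqlJob:
    def __init__(self, table_name: str, query_string: str):
        self.table_name = table_name
        self.query_string = query_string

    def query(self, dataframe: DataFrame) -> Result[DataFrame, InvalidJob]:
        c = Context()
        c.create_table(self.table_name, dataframe)
        queried = c.sql(self.query_string)
        return Success(queried)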