Example 1
def test_explain(gpu):
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame, gpu=gpu)

    sql_string = c.explain("SELECT * FROM df")

    assert sql_string.startswith(
        "DaskTableScan(table=[[root, df]]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = "
    )

    c.create_table("df", data_frame, statistics=Statistics(row_count=1337), gpu=gpu)

    sql_string = c.explain("SELECT * FROM df")

    assert sql_string.startswith(
        "DaskTableScan(table=[[root, df]]): rowcount = 1337.0, cumulative cost = {1337.0 rows, 1338.0 cpu, 0.0 io}, id = "
    )

    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)

    sql_string = c.explain(
        "SELECT * FROM other_df", dataframes={"other_df": data_frame}, gpu=gpu
    )

    assert sql_string.startswith(
        "DaskTableScan(table=[[root, other_df]]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = "
    )
Example 2
class TimeDaskSQLJoins:
    params = [[10**4, 10**5], [10], [3], [6], [0.75]]

    param_names = [
        "rows",
        "cols",
        "number of join columns",
        "number of chained joins",
        "ratio of dictinct elements",
    ]

    def setup(self, N, ncols, njoin_columns, njoins, distinct_r):
        self.dfs = _generate_dataframes(N, ncols, njoin_columns, njoins,
                                        distinct_r)
        self.dfs = [dd.from_pandas(d, npartitions=1) for d in self.dfs]
        self.join_cols = [
            c for c in self.dfs[0].columns if c in self.dfs[1].columns
        ]
        self.ctx = Context()
        self._create_tables()
        self._create_sql_query()

    def _create_tables(self):
        self.tables = []
        for i, df in enumerate(self.dfs):
            _table_name = f"table_{i:03}"
            self.ctx.create_table(_table_name, df)
            _table = table(_table_name, *[column(c) for c in df.columns])
            self.tables.append(_table)

    def _create_sql_query(self):
        left = self.tables[0]
        joinq = left
        select_cols = list(left.c)
        for right in self.tables[1:]:
            on = and_(
                left.c.get(col) == right.c.get(col) for col in self.join_cols)
            joinq = joinq.join(right, on)
            select_cols += [c for c in right.c if c.name not in self.join_cols]
        query = select(*select_cols).select_from(joinq)
        self.sql_query = str(
            query.compile(
                dialect=postgresql.dialect(),
                compile_kwargs={"literal_binds": True},
            ))

    def time_joins(self, N, ncols, njoin_columns, njoins, distinct_r):
        start = time.perf_counter()
        print(f"Processing SQL query: {self.sql_query}")
        res = self.ctx.sql(self.sql_query)
        stop = time.perf_counter()
        print(f"Processing SQL query took {stop-start:0.4f} s.")
        start = time.perf_counter()
        print("Computing dask dataframe")
        res.compute()
        stop = time.perf_counter()
        print(f"Computing dask dataframe took {stop-start:0.4f} s.")
        # Visualize task graph
        # res.visualize('taskgraph.png')
        return res
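The benchmark's setup relies on a `_generate_dataframes` helper that is not shown; a hedged sketch of what it might look like, with column names and value distributions chosen purely for illustration:

import numpy as np
import pandas as pd


# Hypothetical stand-in for the omitted _generate_dataframes helper
def _generate_dataframes(N, ncols, njoin_columns, njoins, distinct_r):
    """Create njoins + 1 DataFrames that share njoin_columns join-key columns."""
    rng = np.random.default_rng(42)
    n_distinct = max(int(N * distinct_r), 1)
    # Shared join keys drawn from a reduced pool so the joins actually match rows
    keys = {f"key_{i}": rng.integers(0, n_distinct, size=N) for i in range(njoin_columns)}
    dfs = []
    for j in range(njoins + 1):
        payload = {f"val_{j}_{i}": rng.random(N) for i in range(ncols - njoin_columns)}
        dfs.append(pd.DataFrame({**keys, **payload}))
    return dfs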
Example 3
def test_tables(gpu):
    c = Context()
    c.create_table("table", pd.DataFrame(), gpu=gpu)

    result_df = c.sql(f'SHOW TABLES FROM "{c.schema_name}"')
    expected_df = pd.DataFrame({"Table": ["table"]})

    assert_eq(result_df, expected_df, check_index=False)
Example 4
def test_show_tables_no_schema(c):
    c = Context()

    df = pd.DataFrame({"id": [0, 1]})
    c.create_table("test", df)

    actual_df = c.sql("show tables").compute()
    expected_df = pd.DataFrame({"Table": ["test"]})
    assert_eq(actual_df, expected_df)
Example 5
def test_dask_sql_sg_logistic_regression(
    datatype,
    nrows,
    ncols,
    n_parts,
    wrap_predict
):
    if wrap_predict:
        cuml.set_global_output_type("input")
    else:
        cuml.set_global_output_type("cudf")

    X, y = make_classification(
        n_samples=nrows, n_features=ncols, n_informative=5, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    train_df = cudf.DataFrame(
        X_train, dtype=datatype, columns=[chr(i) for i in range(ncols)]
    )
    train_df["target"] = y_train
    train_ddf = dask_cudf.from_cudf(train_df, npartitions=n_parts)

    c = Context()
    c.create_table("train_df", train_ddf)

    train_query = f"""
        CREATE MODEL model WITH (
            model_class = 'cuml.linear_model.LogisticRegression',
            wrap_predict = {wrap_predict},
            target_column = 'target'
        ) AS (
            SELECT * FROM train_df
        )
    """

    c.sql(train_query)

    skmodel = LogisticRegression().fit(X_train, y_train)

    test_df = cudf.DataFrame(
        X_test, dtype=datatype, columns=[chr(i) for i in range(ncols)]
    )
    test_ddf = dask_cudf.from_cudf(test_df, npartitions=n_parts)
    c.create_table("test_df", test_ddf)

    inference_query = """
        SELECT * FROM PREDICT(
            MODEL model,
            SELECT * FROM test_df
        )
    """

    preds = c.sql(inference_query).compute()
    score = cuml.metrics.accuracy_score(y_test, preds["target"].to_numpy())

    assert score >= skmodel.score(X_test, y_test) - 0.022
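The `datatype`, `nrows`, `ncols`, `n_parts`, and `wrap_predict` arguments come from pytest parametrization that is omitted here; a hedged sketch of what it could look like (the concrete values are assumptions, not those of the real cuml test suite):

import numpy as np
import pytest


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("nrows", [1_000])
@pytest.mark.parametrize("ncols", [20])
@pytest.mark.parametrize("n_parts", [2, 8])
@pytest.mark.parametrize("wrap_predict", [True, False])
def test_dask_sql_sg_logistic_regression(datatype, nrows, ncols, n_parts, wrap_predict):
    ...  # body as shown above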
Example 6
def main():  # pragma: no cover
    """
    CLI version of the :func:`run_server` function.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="The host interface to listen on (defaults to all interfaces)",
    )
    parser.add_argument(
        "--port", default=8080, help="The port to listen on (defaults to 8080)"
    )
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=uvicorn.config.LOG_LEVELS,
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = dask.distributed.Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    run_server(
        context=context,
        client=client,
        host=args.host,
        port=args.port,
        startup=args.startup,
        log_level=args.log_level,
    )
Example 7
    def select(self, dfs: fugue.dataframe.DataFrames,
               statement: str) -> fugue.dataframe.DataFrame:
        """Send the SQL command to the dask-sql context and register all temporary dataframes"""
        c = Context()

        for k, v in dfs.items():
            c.create_table(k, self.execution_engine.to_df(v).native)

        df = c.sql(statement)
        return fugue_dask.dataframe.DaskDataFrame(df)
Example 8
def test_sql():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(result, pd.DataFrame)
Example 9
def c():
    c = Context()
    c.create_schema(schema)
    row = create_table_row()
    # DataFrame.append was removed in pandas 2.0; build the one-row frame directly
    tables = pd.DataFrame([row])
    tables = tables.astype({"AN_INT": "int64"})
    c.create_table(table, tables, schema_name=schema)

    yield c

    c.drop_schema(schema)
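This fixture uses `schema`, `table`, and `create_table_row`, which are defined elsewhere in the test module; illustrative stand-ins could look like this (names and values are assumptions):

# Hypothetical module-level names assumed by the fixture above
schema = "a_schema"
table = "a_table"


def create_table_row(**kwargs):
    row = {"AN_INT": 4711, "A_STR": "hello", "A_FLOAT": 1.5}
    row.update(kwargs)
    return row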
Example 10
def test_explain():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    sql_string = c.explain("SELECT * FROM df")

    assert (
        sql_string
        == f"LogicalProject(a=[$0]){os.linesep}  LogicalTableScan(table=[[schema, df]]){os.linesep}"
    )
Example 11
def test_query_case_sensitivity():
    c = Context()
    df = pd.DataFrame({"id": [0, 1]})

    c.create_table("test", df)

    try:
        c.sql(
            "select ID from test",
            config_options={"sql.identifier.case_sensitive": False},
        )
    except ParsingException as pe:
        assert False, f"Queries should be case insensitve but raised exception {pe}"
Example 12
def eq_sqlite(sql, **dfs):
    c = Context()
    engine = sqlite3.connect(":memory:")

    for name, df in dfs.items():
        c.create_table(name, df)
        df.to_sql(name, engine, index=False)

    dask_result = c.sql(sql).compute().reset_index(drop=True)
    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

    # Make sure SQL and Dask use the same "NULL" value
    dask_result = dask_result.fillna(np.NaN)
    sqlite_result = sqlite_result.fillna(np.NaN)

    assert_frame_equal(dask_result, sqlite_result, check_dtype=False)
Example 13
def test_add_remove_tables():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame(), npartitions=1)

    c.create_table("table", data_frame)
    assert "table" in c.tables

    c.drop_table("table")
    assert "table" not in c.tables

    with pytest.raises(KeyError):
        c.drop_table("table")

    c.create_table("table", [data_frame])
    assert "table" in c.tables
Example 14
def test_sql():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)
    assert_frame_equal(result.compute(), data_frame.compute())

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(result, pd.DataFrame)
    assert_frame_equal(result, data_frame.compute())

    result = c.sql("SELECT * FROM other_df",
                   dataframes={"other_df": data_frame})
    assert isinstance(result, dd.DataFrame)
    assert_frame_equal(result.compute(), data_frame.compute())
Example 15
def test_sql(gpu):
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame, gpu=gpu)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert not isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)

    result = c.sql(
        "SELECT * FROM other_df", dataframes={"other_df": data_frame}, gpu=gpu
    )
    assert isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)
Example 16
def eq_sqlite(sql, **dfs):
    c = Context()
    engine = sqlite3.connect(":memory:")

    for name, df in dfs.items():
        c.create_table(name, df)
        df.to_sql(name, engine, index=False)

    dask_result = c.sql(sql).reset_index(drop=True)
    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

    # casting to object to ensure equality with sql-lite
    # which returns object dtype for datetime inputs
    dask_result = cast_datetime_to_string(dask_result)

    # Make sure SQL and Dask use the same "NULL" value
    dask_result = dask_result.fillna(np.NaN)
    sqlite_result = sqlite_result.fillna(np.NaN)

    assert_eq(dask_result, sqlite_result, check_dtype=False)
Example 17
def main():  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = dask.distributed.Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(
        context=context, client=client, startup=args.startup, log_level=args.log_level
    )
Example 18
def test_join_case_projection_subquery():
    c = Context()

    # Tables for query
    demo = pd.DataFrame({"demo_sku": [], "hd_dep_count": []})
    site_page = pd.DataFrame({"site_page_sk": [], "site_char_count": []})
    sales = pd.DataFrame({
        "sales_hdemo_sk": [],
        "sales_page_sk": [],
        "sold_time_sk": []
    })
    t_dim = pd.DataFrame({"t_time_sk": [], "t_hour": []})

    c.create_table("demos", demo, persist=False)
    c.create_table("site_page", site_page, persist=False)
    c.create_table("sales", sales, persist=False)
    c.create_table("t_dim", t_dim, persist=False)

    c.sql("""
    SELECT CASE WHEN pmc > 0.0 THEN CAST (amc AS DOUBLE) / CAST (pmc AS DOUBLE) ELSE -1.0 END AS am_pm_ratio
    FROM
    (
        SELECT SUM(amc1) AS amc, SUM(pmc1) AS pmc
        FROM
        (
            SELECT
                CASE WHEN t_hour BETWEEN 7 AND 8 THEN COUNT(1) ELSE 0 END AS amc1,
                CASE WHEN t_hour BETWEEN 19 AND 20 THEN COUNT(1) ELSE 0 END AS pmc1
            FROM sales ws
            JOIN demos hd ON (hd.demo_sku = ws.sales_hdemo_sk and hd.hd_dep_count = 5)
            JOIN site_page sp ON (sp.site_page_sk = ws.sales_page_sk and sp.site_char_count BETWEEN 5000 AND 6000)
            JOIN t_dim td ON (td.t_time_sk = ws.sold_time_sk and td.t_hour IN (7,8,19,20))
            GROUP BY t_hour
        ) cnt_am_pm
    ) sum_am_pm
    """).compute()
Example 19
def test_fsql():
    def assert_eq(df: pd.DataFrame) -> None:
        assert_frame_equal(df, pd.DataFrame({"a": [1]}))

    # the simplest case: the SQL does not use any input and does not generate output
    fsql("""
    CREATE [[0],[1]] SCHEMA a:long
    SELECT * WHERE a>0
    OUTPUT USING assert_eq
    """)

    # it can directly use the dataframes inside dask-sql Context
    c = Context()
    c.create_table(
        "df",
        dd.from_pandas(pd.DataFrame([[0], [1]], columns=["a"]), npartitions=2))

    fsql(
        """
    SELECT * FROM df WHERE a>0
    OUTPUT USING assert_eq
    """,
        c,
    )

    # named dataframes can be registered back into the Context (register=True);
    # fsql returns a dict of all dask dataframes with explicit names
    result = fsql(
        """
    x=SELECT * FROM df WHERE a>0
    OUTPUT USING assert_eq
    """,
        c,
        register=True,
    )
    assert isinstance(result["x"], dd.DataFrame)
    assert "x" in c.tables

    # integration test with fugue transformer extension
    c = Context()
    c.create_table(
        "df1",
        dd.from_pandas(pd.DataFrame([[0, 1], [1, 2]], columns=["a", "b"]),
                       npartitions=2),
    )
    c.create_table(
        "df2",
        dd.from_pandas(pd.DataFrame([[1, 2], [3, 4], [-4, 5]],
                                    columns=["a", "b"]),
                       npartitions=2),
    )

    # schema: *
    def cumsum(df: pd.DataFrame) -> pd.DataFrame:
        return df.cumsum()

    fsql(
        """
    data = SELECT * FROM df1 WHERE a>0 UNION ALL SELECT * FROM df2 WHERE a>0 PERSIST
    result1 = TRANSFORM data PREPARTITION BY a PRESORT b USING cumsum
    result2 = TRANSFORM data PREPARTITION BY b PRESORT a USING cumsum
    PRINT result1, result2
    """,
        c,
        register=True,
    )
    assert "result1" in c.tables
    assert "result2" in c.tables
Example 20
        return dask_xgboost.predict(client, bst, X)

    # Create a context
    from dask_sql import Context, run_server
    c = Context()

    c.register_function(predict_price, "predict_price",
                        [("total_amount", np.float64),
                         ("trip_distance", np.float64),
                         ("passenger_count", np.float64)], np.float64)

    # Load the data from S3
    df = dd.read_csv("s3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv",
                     dtype={
                         "payment_type": "UInt8",
                         "VendorID": "UInt8",
                         "passenger_count": "UInt8",
                         "RatecodeIDq": "UInt8",
                     },
                     storage_options={
                         "anon": True
                     }).persist()

    wait(df)

    c.create_table("nyc-taxi", df)

    c.sql("SELECT 1 + 1").compute()

    # Finally, spin up the dask-sql server
    run_server(context=c, client=client)
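The snippet above opens with the `return` statement of an elided `predict_price` function; a hedged reconstruction of that UDF, assuming `client` is the distributed client used by the script and `bst` is a booster trained beforehand:

import dask.dataframe as dd
import dask_xgboost


# Hypothetical body of the elided UDF; the real example trains `bst` before this point
def predict_price(total_amount, trip_distance, passenger_count):
    # Stack the three feature columns into a single dask dataframe
    X = dd.concat([total_amount, trip_distance, passenger_count], axis=1)
    # Run distributed inference with the already trained xgboost model
    return dask_xgboost.predict(client, bst, X)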
Example 21
def test_input_types(temporary_data_file):
    c = Context()
    df = pd.DataFrame({"a": [1, 2, 3]})

    def assert_correct_output():
        result = c.sql("SELECT * FROM df")
        assert isinstance(result, dd.DataFrame)
        assert_frame_equal(result.compute(), df)

    c.create_table("df", df)
    assert_correct_output()

    c.create_table("df", dd.from_pandas(df, npartitions=1))
    assert_correct_output()

    df.to_csv(temporary_data_file, index=False)
    c.create_table("df", temporary_data_file)
    assert_correct_output()

    df.to_csv(temporary_data_file, index=False)
    c.create_table("df", temporary_data_file, format="csv")
    assert_correct_output()

    df.to_parquet(temporary_data_file, index=False)
    c.create_table("df", temporary_data_file, format="parquet")
    assert_correct_output()

    with pytest.raises(AttributeError):
        c.create_table("df", temporary_data_file, format="unknown")

    strangeThing = object()

    with pytest.raises(ValueError):
        c.create_table("df", strangeThing)
Example 22
    def query(self, dataframe: DataFrame) -> Result[DataFrame, InvalidJob]:
        c = Context()
        c.create_table(self.table_name, dataframe)
        queried = c.sql(self.query_string)
        return Success(queried)
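The `query` method above is a fragment of a larger class; a minimal sketch of the kind of host class it could belong to (the class name and constructor are assumptions, and the original's `Success`/`Result` wrapping is only noted in a comment because its import is not shown):

from dask.dataframe import DataFrame

from dask_sql import Context


# Hypothetical host class for the `query` method shown above
class SqlQueryJob:
    def __init__(self, table_name: str, query_string: str):
        self.table_name = table_name      # table registered in the dask-sql Context
        self.query_string = query_string  # SQL statement run against that table

    def query(self, dataframe: DataFrame):
        c = Context()
        c.create_table(self.table_name, dataframe)
        # The original wraps this lazy result in Success(...) for railway-style error handling
        return c.sql(self.query_string)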
Example 23
import dask.dataframe as dd
import pandas as pd

from dask_sql import Context

c = Context()

data = """
name,x
Alice,34
Bob,
"""

df = pd.DataFrame({"name": ["Alice", "Bob", "Chris"] * 100, "x": list(range(300))})
ddf = dd.from_pandas(df, npartitions=10)
c.create_table("my_data", ddf)

got = c.sql(
    """
    SELECT
        my_data.name,
        SUM(my_data.x) AS "S"
    FROM
        my_data
    GROUP BY
        my_data.name
"""
)
expect = pd.DataFrame({"name": ["Alice", "Bob", "Chris"], "S": [14850, 14950, 15050]})

dd.assert_eq(got, expect)