Example #1
def c(
    df_simple,
    df,
    user_table_1,
    user_table_2,
    long_table,
    user_table_inf,
    user_table_nan,
    string_table,
    datetime_table,
):
    dfs = {
        "df_simple": df_simple,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
        "long_table": long_table,
        "user_table_inf": user_table_inf,
        "user_table_nan": user_table_nan,
        "string_table": string_table,
        "datetime_table": datetime_table,
    }

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    for df_name, df in dfs.items():
        dask_df = dd.from_pandas(df, npartitions=3)
        c.create_table(df_name, dask_df)

    yield c
Example #2
def test_sort_with_nan_many_partitions(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [float("nan"), 1] * 30,
        "b": [1, 2, 3] * 20,
    })
    c.create_table("df", dd.from_pandas(df, npartitions=10), gpu=gpu)

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST")

    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan")] * 30 + [1] * 30,
            "b":
            [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
        }),
        check_index=False,
    )

    df = pd.DataFrame({"a": [float("nan"), 1] * 30})
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = c.sql("SELECT * FROM df ORDER BY a")

    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1] * 30 + [float("nan")] * 30,
        }),
        check_index=False,
    )
Example #3
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df").compute().reset_index(drop=True)
    df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_frame_equal(df, result_df)
Example #4
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df")
    expected_df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_eq(result_df, expected_df, check_index=False)
Example #5
def test_intake_location(intake_catalog_location):
    c = Context()
    c.create_table("df",
                   intake_catalog_location,
                   format="intake",
                   intake_table_name="intake_table")

    check_read_table(c)
Example #6
def main():  # pragma: no cover
    """
    CLI version of the :func:`run_server` function.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="The host interface to listen on (defaults to all interfaces)",
    )
    parser.add_argument(
        "--port", default=8080, help="The port to listen on (defaults to 8080)"
    )
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=uvicorn.config.LOG_LEVELS,
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = dask.distributed.Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    run_server(
        context=context,
        client=client,
        host=args.host,
        port=args.port,
        startup=args.startup,
        log_level=args.log_level,
    )
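For reference, the same server can also be started without the CLI wrapper. The sketch below reuses only the calls that appear in main() above; the import locations are an assumption, not something shown in this listing.

import dask.datasets
from dask_sql import Context, run_server  # assumed import locations

# build the context exactly as --load-test-data would
context = Context()
df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
context.create_table("timeseries", df.persist())

# serve on the same defaults the argument parser uses
run_server(context=context, client=None, host="0.0.0.0", port=8080)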
Example #7
    def select(self, dfs: fugue.dataframe.DataFrames,
               statement: str) -> fugue.dataframe.DataFrame:
        """Send the SQL command to the dask-sql context and register all temporary dataframes"""
        c = Context()

        for k, v in dfs.items():
            c.create_table(k, self.execution_engine.to_df(v).native)

        df = c.sql(statement)
        return fugue_dask.dataframe.DaskDataFrame(df)
Example #8
def c(
    df_simple,
    df_wide,
    df,
    user_table_1,
    user_table_2,
    long_table,
    user_table_inf,
    user_table_nan,
    string_table,
    datetime_table,
    parquet_ddf,
    gpu_user_table_1,
    gpu_df,
    gpu_long_table,
    gpu_string_table,
    gpu_datetime_table,
):
    dfs = {
        "df_simple": df_simple,
        "df_wide": df_wide,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
        "long_table": long_table,
        "user_table_inf": user_table_inf,
        "user_table_nan": user_table_nan,
        "string_table": string_table,
        "datetime_table": datetime_table,
        "parquet_ddf": parquet_ddf,
        "gpu_user_table_1": gpu_user_table_1,
        "gpu_df": gpu_df,
        "gpu_long_table": gpu_long_table,
        "gpu_string_table": gpu_string_table,
        "gpu_datetime_table": gpu_datetime_table,
    }

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    for df_name, df in dfs.items():
        if df is None:
            continue
        if hasattr(df, "npartitions"):
            # df is already a dask collection
            dask_df = df
        else:
            dask_df = dd.from_pandas(df, npartitions=3)
        c.create_table(df_name, dask_df)

    yield c
Example #9
def test_sort_with_nan_more_columns(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [1, 1, 2, 2, float("nan"), float("nan")],
        "b": [1, 1, 2, float("nan"), float("inf"), 5],
        "c": [1, float("nan"), 3, 4, 5, 6],
    })
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), float("nan"), 1, 1, 2, 2],
            "b": [float("inf"), 5, 1, 1, 2,
                  float("nan")],
            "c": [5, 6, float("nan"), 1, 3, 4],
        }),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 1, 2, 2, float("nan"),
                  float("nan")],
            "b": [1, 1, float("nan"), 2,
                  float("inf"), 5],
            "c": [1, float("nan"), 4, 3, 5, 6],
        }),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), float("nan"), 1, 1, 2, 2],
            "b": [float("inf"), 5, 1, 1, 2,
                  float("nan")],
            "c": [5, 6, 1, float("nan"), 3, 4],
        }),
        check_index=False,
    )
Example #10
def main():  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(context=context,
             client=client,
             startup=args.startup,
             log_level=args.log_level)
Example #11
def test_sort_with_nan_many_partitions():
    c = Context()
    df = pd.DataFrame({
        "a": [float("nan"), 1] * 30,
        "b": [1, 2, 3] * 20,
    })
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST"
              ).compute().reset_index(drop=True))

    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan")] * 30 + [1] * 30,
            "b":
            [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
        }),
        check_names=False,
    )
Example #12
def test_sort_with_nan_more_columns():
    c = Context()
    df = pd.DataFrame({
        "a": [1, 1, 2, 2, float("nan"), float("nan")],
        "b": [1, 1, 2, float("nan"), float("inf"), 5],
        "c": [1, float("nan"), 3, 4, 5, 6],
    })
    c.create_table("df", df)

    # ".c" picks the "c" column off the result, so only that column is compared below
    df_result = (c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
    ).c.compute().reset_index(drop=True))
    assert_series_equal(df_result,
                        pd.Series([5, 6, float("nan"), 1, 3, 4]),
                        check_names=False)

    df_result = (c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
    ).c.compute().reset_index(drop=True))
    assert_series_equal(df_result,
                        pd.Series([1, float("nan"), 4, 3, 5, 6]),
                        check_names=False)
Example #13
def test_sort_with_nan():
    c = Context()
    df = pd.DataFrame({
        "a": [1, 2, float("nan"), 2],
        "b": [4, float("nan"), 5, float("inf")]
    })
    c.create_table("df", df)

    df_result = c.sql("SELECT * FROM df ORDER BY a").compute().reset_index(
        drop=True)
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS LAST").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a ASC").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST").compute().
                 reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a DESC").compute().reset_index(
            drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("inf"), float("nan"), 4]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("inf"), float("nan"), 4]
        }),
    )

    df_result = (c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST").compute(
    ).reset_index(drop=True))
    assert_frame_equal(
        df_result,
        pd.DataFrame({
            "a": [2, 2, 1, float("nan")],
            "b": [float("inf"), float("nan"), 4, 5]
        }),
    )
Example #14
def test_sort_with_nan(gpu):
    c = Context()
    df = pd.DataFrame({
        "a": [1, 2, float("nan"), 2],
        "b": [4, float("nan"), 5, float("inf")]
    })
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql("SELECT * FROM df ORDER BY a")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 1, 2, 2],
            "b": [5, 4, float("nan"), float("inf")]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [1, 2, 2, float("nan")],
            "b": [4, float("nan"), float("inf"), 5]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("nan"), float("inf"), 4]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [float("nan"), 2, 2, 1],
            "b": [5, float("nan"), float("inf"), 4]
        }),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame({
            "a": [2, 2, 1, float("nan")],
            "b": [float("nan"), float("inf"), 4, 5]
        }),
        check_index=False,
    )
Example #15
def test_intake_catalog(intake_catalog_location):
    catalog = intake.open_catalog(intake_catalog_location)
    c = Context()
    c.create_table("df", catalog, intake_table_name="intake_table")

    check_read_table(c)
Example #16
def create_meta_data(c: Context):
    """
    Creates the schema, table and column metadata for the prestodb JDBC driver so that data can
    be browsed in a database tool like DBeaver. Because dask-sql does not support catalogs, no
    catalog entry is created even though JDBC expects one; empty placeholder tables are created
    for both catalogs and procedures instead.

    The metadata lives in a separate schema called system_jdbc, largely because the JDBC driver
    tries to access system.jdbc; the name is close enough to map onto while not clashing with
    other schemas.

    The /v1/statement endpoint therefore needs a function that rewrites system.jdbc to
    system_jdbc and drops ORDER BY clauses sent by the driver (see adjust_for_presto_sql above).

    :param c: Context containing created tables
    :return:
    """

    if c is None:
        logger.warn("Context None: jdbc meta data not created")
        return
    catalog = ""
    system_schema = "system_jdbc"
    c.create_schema(system_schema)

    # TODO: add support for catalogs in presto interface
    # see https://github.com/dask-contrib/dask-sql/pull/351
    # if catalog and len(catalog.strip()) > 0:
    #     catalogs = pd.DataFrame().append(create_catalog_row(catalog), ignore_index=True)
    #     c.create_table("catalogs", catalogs, schema_name=system_schema)

    schemas = pd.DataFrame().append(create_schema_row(), ignore_index=True)
    c.create_table("schemas", schemas, schema_name=system_schema)
    schema_rows = []

    tables = pd.DataFrame().append(create_table_row(), ignore_index=True)
    c.create_table("tables", tables, schema_name=system_schema)
    table_rows = []

    columns = pd.DataFrame().append(create_column_row(), ignore_index=True)
    c.create_table("columns", columns, schema_name=system_schema)
    column_rows = []

    for schema_name, schema in c.schema.items():
        schema_rows.append(create_schema_row(catalog, schema_name))
        for table_name, dc in schema.tables.items():
            df = dc.df
            logger.info(f"schema ${schema_name}, table {table_name}, {df}")
            table_rows.append(
                create_table_row(catalog, schema_name, table_name))
            pos: int = 0
            for column in df.columns:
                pos = pos + 1
                logger.debug(f"column {column}")
                dtype = "VARCHAR"
                if df[column].dtype == "int64" or df[column].dtype == "int":
                    dtype = "INTEGER"
                elif df[column].dtype == "float64" or df[
                        column].dtype == "float":
                    dtype = "FLOAT"
                elif (df[column].dtype == "datetime"
                      or df[column].dtype == "datetime64[ns]"):
                    dtype = "TIMESTAMP"
                column_rows.append(
                    create_column_row(
                        catalog,
                        schema_name,
                        table_name,
                        dtype,
                        df[column].name,
                        str(pos),
                    ))

    schemas = pd.DataFrame(schema_rows)
    c.create_table("schemas", schemas, schema_name=system_schema)
    tables = pd.DataFrame(table_rows)
    c.create_table("tables", tables, schema_name=system_schema)
    columns = pd.DataFrame(column_rows)
    c.create_table("columns", columns, schema_name=system_schema)

    logger.info(f"jdbc meta data ready for {len(table_rows)} tables")
Example #17
def assert_query_gives_same_result(engine):
    np.random.seed(42)

    df1 = dd.from_pandas(
        pd.DataFrame(
            {
                "user_id": np.random.choice([1, 2, 3, 4, pd.NA], 100),
                "a": np.random.rand(100),
                "b": np.random.randint(-10, 10, 100),
            }
        ),
        npartitions=3,
    )
    df1["user_id"] = df1["user_id"].astype("Int64")

    df2 = dd.from_pandas(
        pd.DataFrame(
            {
                "user_id": np.random.choice([1, 2, 3, 4], 100),
                "c": np.random.randint(20, 30, 100),
                "d": np.random.choice(["a", "b", "c", None], 100),
            }
        ),
        npartitions=3,
    )

    df3 = dd.from_pandas(
        pd.DataFrame(
            {
                "s": [
                    "".join(np.random.choice(["a", "B", "c", "D"], 10))
                    for _ in range(100)
                ]
                + [None]
            }
        ),
        npartitions=3,
    )

    # the other user_id is also cast to Int64, which makes joining simpler
    df2["user_id"] = df2["user_id"].astype("Int64")

    # add some NaNs
    df1["a"] = df1["a"].apply(
        lambda a: float("nan") if a > 0.8 else a, meta=("a", "float")
    )
    df1["b_bool"] = df1["b"].apply(
        lambda b: pd.NA if b > 5 else b < 0, meta=("a", "bool")
    )

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    c.create_table("df1", df1)
    c.create_table("df2", df2)
    c.create_table("df3", df3)

    df1.compute().to_sql("df1", engine, index=False, if_exists="replace")
    df2.compute().to_sql("df2", engine, index=False, if_exists="replace")
    df3.compute().to_sql("df3", engine, index=False, if_exists="replace")

    def _assert_query_gives_same_result(query, sort_columns=None, **kwargs):
        sql_result = pd.read_sql_query(query, engine)
        dask_result = c.sql(query).compute()

        # allow that the names are different
        # as expressions are handled differently
        dask_result.columns = sql_result.columns

        if sort_columns:
            sql_result = sql_result.sort_values(sort_columns)
            dask_result = dask_result.sort_values(sort_columns)

        sql_result = sql_result.reset_index(drop=True)
        dask_result = dask_result.reset_index(drop=True)

        assert_frame_equal(sql_result, dask_result, check_dtype=False, **kwargs)

    return _assert_query_gives_same_result
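The function above returns the comparison helper itself, so when used as a pytest fixture a test receives a callable. A minimal hedged sketch of such a test (the test name and query are illustrative, reusing the df1 and df2 tables registered above):

def test_join_matches_database(assert_query_gives_same_result):
    # hypothetical query; sort before comparing because row order is not guaranteed
    assert_query_gives_same_result(
        "SELECT df1.user_id, df2.c FROM df1 JOIN df2 ON df1.user_id = df2.user_id",
        sort_columns=["user_id", "c"],
    )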