Example 1
0
    def parse(sql, con, index_col, read_sql_engine, **kwargs):
        """Execute ``sql`` against ``con`` and return the result for Modin's reader.

        When ``read_sql_engine == "Connectorx"`` and the ``connectorx`` package is
        importable, the query runs through ``connectorx``; otherwise it falls back
        to ``pandas.read_sql`` (emitting a warning if connectorx was requested but
        is missing).

        If ``num_splits`` is absent from ``kwargs`` the raw frame is returned.
        Otherwise the frame is split via ``_split_result_for_readers`` and the
        split list is returned with the index (or row count when ``index_col`` is
        None) and the dtypes appended.
        """
        use_connectorx = False
        if read_sql_engine == "Connectorx":
            try:
                import connectorx as cx
            except ImportError:
                warnings.warn(
                    "Switch to 'pandas.read_sql' since 'connectorx' is not installed, please run 'pip install connectorx'."
                )
            else:
                use_connectorx = True

        num_splits = kwargs.pop("num_splits", None)
        # A ModinDatabaseConnection wraps either a URI string (what connectorx
        # wants) or a live DBAPI connection (what pandas wants).
        if isinstance(con, ModinDatabaseConnection):
            con = con.get_string() if use_connectorx else con.get_connection()

        # Note: connectorx does not accept the extra pandas kwargs.
        if use_connectorx:
            frame = cx.read_sql(con, sql, index_col=index_col)
        else:
            frame = pandas.read_sql(sql, con, index_col=index_col, **kwargs)

        if num_splits is None:
            return frame

        # With no index column the callers only need the row count, not the index.
        partition_index = len(frame) if index_col is None else frame.index
        return _split_result_for_readers(1, num_splits, frame) + [
            partition_index,
            frame.dtypes,
        ]
Example 2
0
def read_sql(
    sql: Union[List[str], str],
    connection_uri: str,
    partition_on: Optional[str] = None,
    partition_range: Optional[Tuple[int, int]] = None,
    partition_num: Optional[int] = None,
    protocol: Optional[str] = None,
) -> DataFrame:
    """
    Read a SQL query into a DataFrame.
    Make sure to install connectorx>=0.2

    # Sources
    Supports reading a sql query from the following data sources:

    * Postgres
    * Mysql
    * Sqlite
    * Redshift (through postgres protocol)
    * Clickhouse (through mysql protocol)

    ## Source not supported?
    If a database source is not supported, pandas can be used to load the query:

    >>> import pandas as pd
    >>> df = pl.from_pandas(pd.read_sql(sql, engine))  # doctest: +SKIP

    Parameters
    ----------
    sql
        raw sql query (or list of queries, one per partition).
    connection_uri
        connectorx connection uri:
            - "postgresql://*****:*****@server:port/database"
    partition_on
        the column on which to partition the result.
    partition_range
        the value range of the partition column.
    partition_num
        how many partitions to generate.
    protocol
        backend-specific transfer protocol directive; see connectorx documentation for details.

    Raises
    ------
    ImportError
        if ``connectorx`` is not installed.

    Examples
    --------

    ## Single threaded
    Read a DataFrame from a SQL query using a single thread:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(query, uri)  # doctest: +SKIP

    ## Using 10 threads
    Read a DataFrame in parallel using 10 threads by automatically partitioning the provided SQL on the partition column:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(
    ...     query, uri, partition_on="partition_col", partition_num=10
    ... )  # doctest: +SKIP

    ## Using 2 threads
    Read a DataFrame in parallel using 2 threads by explicitly providing two SQL queries:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> queries = [
    ...     "SELECT * FROM lineitem WHERE partition_col <= 10",
    ...     "SELECT * FROM lineitem WHERE partition_col > 10",
    ... ]
    >>> pl.read_sql(queries, uri)  # doctest: +SKIP

    """
    if _WITH_CX:
        # connectorx returns an Arrow table; convert it to a polars DataFrame.
        tbl = cx.read_sql(
            conn=connection_uri,
            query=sql,
            return_type="arrow",
            partition_on=partition_on,
            partition_range=partition_range,
            partition_num=partition_num,
            protocol=protocol,
        )
        return cast(DataFrame, from_arrow(tbl))
    else:
        raise ImportError(
            "connectorx is not installed. Please run `pip install connectorx>=0.2.2`."
        )
Example 3
0
def read_sql(
    sql: Union[List[str], str],
    connection_uri: str,
    partition_on: Optional[str] = None,
    partition_range: Optional[Tuple[int, int]] = None,
    partition_num: Optional[int] = None,
) -> "pl.DataFrame":
    """
    Read a SQL query into a DataFrame
    Make sure to install connectorx>=0.2

    # Sources
    Supports reading a sql query from the following data sources:

    * Postgres
    * Mysql
    * Sqlite
    * Redshift (through postgres protocol)
    * Clickhouse (through mysql protocol)

    ## Source not supported?
    If a database source is not supported, pandas can be used to load the query:

    >>> df = pl.from_pandas(pd.read_sql(sql, engine))  # doctest: +SKIP

    Parameters
    ----------
    sql
        raw sql query (or list of queries, one per partition)
    connection_uri
        connectorx connection uri:
            - "postgresql://*****:*****@server:port/database"
    partition_on
        the column to partition the result.
    partition_range
        the value range of the partition column.
    partition_num
        how many partitions to generate.

    Raises
    ------
    ImportError
        if ``connectorx`` is not installed.

    Examples
    --------

    ## Single threaded
    Read a DataFrame from a SQL query using a single thread:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(query, uri)  # doctest: +SKIP

    ## Using 10 threads
    Read a DataFrame in parallel using 10 threads by automatically partitioning the provided SQL on the partition column:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> read_sql(
    ...     query, uri, partition_on="partition_col", partition_num=10
    ... )  # doctest: +SKIP

    ## Using 2 threads
    Read a DataFrame in parallel using 2 threads by manually providing two partition SQLs:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> queries = [
    ...     "SELECT * FROM lineitem WHERE partition_col <= 10",
    ...     "SELECT * FROM lineitem WHERE partition_col > 10",
    ... ]
    >>> read_sql(queries, uri)  # doctest: +SKIP

    """
    if _WITH_CX:
        # connectorx returns an Arrow table; convert it to a polars DataFrame.
        tbl = cx.read_sql(
            conn=connection_uri,
            query=sql,
            return_type="arrow",
            partition_on=partition_on,
            partition_range=partition_range,
            partition_num=partition_num,
        )
        return pl.from_arrow(tbl)  # type: ignore[return-value]
    else:
        # Fixed: the original message concatenated two string literals without a
        # separating space ("not installed.Please run ...").
        raise ImportError(
            "connectorx is not installed. "
            "Please run pip install connectorx>=0.2.0a3"
        )
Example 4
0
  --protocol=<protocol>  The protocol to use [default: binary].
  --conn=<conn>          The connection url to use [default: POSTGRES_URL].
  -h --help              Show this screen.
  --version              Show version.
"""
import os

import connectorx as cx
from contexttimer import Timer
from docopt import docopt

def _run_benchmark() -> None:
    """Run the partitioned TPC-H read benchmark described in the module docstring.

    Reads CLI options via docopt, resolves the connection URL and table name
    from environment variables, times a partitioned `cx.read_sql` call, and
    prints the elapsed time plus a preview of the result.
    """
    args = docopt(__doc__, version="Naval Fate 2.0")
    # --conn names an environment variable that holds the actual URL.
    conn = os.environ[args["--conn"]]
    table = os.environ["TPCH_TABLE"]

    with Timer() as timer:
        df = cx.read_sql(
            conn,
            f"""SELECT * FROM {table}""",
            partition_on="L_ORDERKEY",
            partition_num=int(args["<num>"]),
            protocol=args["--protocol"],
        )
    print("time in total:", timer.elapsed)

    print(df.head())
    print(df.tail())
    print(len(df))


if __name__ == "__main__":
    _run_benchmark()
Example 5
0
File: io.py Project: pola-rs/polars
def read_sql(
    sql: list[str] | str,
    connection_uri: str,
    partition_on: str | None = None,
    partition_range: tuple[int, int] | None = None,
    partition_num: int | None = None,
    protocol: str | None = None,
) -> DataFrame:
    """
    Read a SQL query into a DataFrame.

    .. note::
        Make sure to install connectorx>=0.2.2. Read the documentation
        `here <https://sfu-db.github.io/connector-x/intro.html>`_.

    Reading a SQL query from the following data sources are supported:

        * Postgres
        * Mysql
        * Sqlite
        * Redshift (through postgres protocol)
        * Clickhouse (through mysql protocol)

    If a database source is not supported, an alternative solution is to first use pandas
    to load the SQL query, then converting the result into a polars DataFrame:

    >>> import pandas as pd
    >>> df = pl.from_pandas(pd.read_sql(sql, engine))  # doctest: +SKIP

    Parameters
    ----------
    sql
        Raw SQL query / queries.
    connection_uri
        Connectorx connection uri, for example

        * "postgresql://*****:*****@server:port/database"
    partition_on
        The column on which to partition the result.
    partition_range
        The value range of the partition column.
    partition_num
        How many partitions to generate.
    protocol
        Backend-specific transfer protocol directive; see connectorx documentation for details.

    Raises
    ------
    ImportError
        If ``connectorx`` is not installed.

    Examples
    --------
    Read a DataFrame from a SQL query using a single thread:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(query, uri)  # doctest: +SKIP

    Read a DataFrame in parallel using 10 threads by automatically partitioning the provided SQL on the partition column:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(
    ...     query, uri, partition_on="partition_col", partition_num=10
    ... )  # doctest: +SKIP

    Read a DataFrame in parallel using 2 threads by explicitly providing two SQL queries:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> queries = [
    ...     "SELECT * FROM lineitem WHERE partition_col <= 10",
    ...     "SELECT * FROM lineitem WHERE partition_col > 10",
    ... ]
    >>> pl.read_sql(queries, uri)  # doctest: +SKIP

    """
    # Guard clause: fail fast when the optional dependency is missing.
    if not _WITH_CX:
        raise ImportError(
            "connectorx is not installed. Please run `pip install connectorx>=0.2.2`."
        )
    # connectorx hands back an arrow2 table; convert it to a polars DataFrame.
    arrow_tbl = cx.read_sql(
        conn=connection_uri,
        query=sql,
        return_type="arrow2",
        partition_on=partition_on,
        partition_range=partition_range,
        partition_num=partition_num,
        protocol=protocol,
    )
    return cast(DataFrame, from_arrow(arrow_tbl))