def parse(sql, con, index_col, read_sql_engine, **kwargs):
    """
    Execute a SQL query and return its result, optionally pre-split for parallel readers.

    Parameters
    ----------
    sql : str
        The SQL query to execute.
    con : connection object or str
        Database connection, or a ``ModinDatabaseConnection`` wrapper that can
        yield either a URI string (for connectorx) or a live connection (for pandas).
    index_col : str or None
        Column to set as the index of the resulting frame.
    read_sql_engine : str
        When equal to ``"Connectorx"``, try to read through connectorx;
        otherwise (or if connectorx is missing) fall back to ``pandas.read_sql``.
    **kwargs : dict
        Extra keyword arguments. ``num_splits`` is popped out and, when present,
        requests the result be split for that many parallel readers; the rest is
        forwarded to ``pandas.read_sql`` only.

    Returns
    -------
    pandas.DataFrame or list
        The whole query result when ``num_splits`` is absent; otherwise the
        split partitions followed by the index (or row count) and the dtypes.
    """
    use_connectorx = False
    if read_sql_engine == "Connectorx":
        try:
            import connectorx as cx

            use_connectorx = True
        except ImportError:
            warnings.warn(
                "Switch to 'pandas.read_sql' since 'connectorx' is not installed, please run 'pip install connectorx'."
            )
    num_splits = kwargs.pop("num_splits", None)
    if isinstance(con, ModinDatabaseConnection):
        # connectorx consumes a URI string; pandas needs a live DBAPI connection.
        con = con.get_string() if use_connectorx else con.get_connection()

    def _execute():
        # Dispatch to whichever engine ended up enabled; extra kwargs are
        # meaningful only to pandas, mirroring the original call shapes.
        if use_connectorx:
            return cx.read_sql(con, sql, index_col=index_col)
        return pandas.read_sql(sql, con, index_col=index_col, **kwargs)

    if num_splits is None:
        return _execute()
    frame = _execute()
    # With no explicit index column the workers only need the row count to
    # rebuild a range index; otherwise ship the materialized index itself.
    idx = len(frame) if index_col is None else frame.index
    return _split_result_for_readers(1, num_splits, frame) + [idx, frame.dtypes]
def read_sql(
    sql: Union[List[str], str],
    connection_uri: str,
    partition_on: Optional[str] = None,
    partition_range: Optional[Tuple[int, int]] = None,
    partition_num: Optional[int] = None,
    protocol: Optional[str] = None,
) -> DataFrame:
    """
    Read a SQL query into a DataFrame.

    Make sure to install connectorx>=0.2

    # Sources
    Supports reading a sql query from the following data sources:

    * Postgres
    * Mysql
    * Sqlite
    * Redshift (through postgres protocol)
    * Clickhouse (through mysql protocol)

    ## Source not supported?
    If a database source is not supported, pandas can be used to load the query:

    >>> import pandas as pd
    >>> df = pl.from_pandas(pd.read_sql(sql, engine))  # doctest: +SKIP

    Parameters
    ----------
    sql
        raw sql query.
    connection_uri
        connectorx connection uri:
            - "postgresql://*****:*****@server:port/database"
    partition_on
        the column on which to partition the result.
    partition_range
        the value range of the partition column.
    partition_num
        how many partitions to generate.
    protocol
        backend-specific transfer protocol directive; see connectorx documentation
        for details.

    Examples
    --------
    ## Single threaded
    Read a DataFrame from a SQL query using a single thread:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(query, uri)  # doctest: +SKIP

    ## Using 10 threads
    Read a DataFrame in parallel using 10 threads by automatically partitioning
    the provided SQL on the partition column:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(
    ...     query, uri, partition_on="partition_col", partition_num=10
    ... )  # doctest: +SKIP

    ## Using 2 threads
    Read a DataFrame in parallel using 2 threads by explicitly providing two
    SQL queries:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> queries = [
    ...     "SELECT * FROM lineitem WHERE partition_col <= 10",
    ...     "SELECT * FROM lineitem WHERE partition_col > 10",
    ... ]
    >>> pl.read_sql(queries, uri)  # doctest: +SKIP
    """
    # Guard on the import flag: connectorx is an optional dependency.
    if _WITH_CX:
        tbl = cx.read_sql(
            conn=connection_uri,
            query=sql,
            return_type="arrow",
            partition_on=partition_on,
            partition_range=partition_range,
            partition_num=partition_num,
            protocol=protocol,
        )
        # connectorx hands back an Arrow table; convert zero-copy where possible.
        return cast(DataFrame, from_arrow(tbl))
    else:
        raise ImportError(
            "connectorx is not installed. Please run `pip install connectorx>=0.2.2`."
        )
def read_sql(
    sql: Union[List[str], str],
    connection_uri: str,
    partition_on: Optional[str] = None,
    partition_range: Optional[Tuple[int, int]] = None,
    partition_num: Optional[int] = None,
) -> "pl.DataFrame":
    """
    Read a SQL query into a DataFrame

    Make sure to install connectorx>=0.2

    # Sources
    Supports reading a sql query from the following data sources:

    * Postgres
    * Mysql
    * Sqlite
    * Redshift (through postgres protocol)
    * Clickhouse (through mysql protocol)

    ## Source not supported?
    If a database source is not supported, pandas can be used to load the query:

    >>> df = pl.from_pandas(pd.read_sql(sql, engine))

    Parameters
    ----------
    sql
        raw sql query
    connection_uri
        connectorx connection uri:
            - "postgresql://*****:*****@server:port/database"
    partition_on
        the column to partition the result.
    partition_range
        the value range of the partition column.
    partition_num
        how many partitions to generate.

    Examples
    --------
    ## Single threaded
    Read a DataFrame from a SQL using a single thread:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(query, uri)

    ## Using 10 threads
    Read a DataFrame in parallel using 10 threads by automatically partitioning
    the provided SQL on the partition column:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> read_sql(query, uri, partition_on="partition_col", partition_num=10)

    ## Using 2 threads
    Read a DataFrame in parallel using 2 threads by manually providing two
    partition SQLs:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> queries = [
    ...     "SELECT * FROM lineitem WHERE partition_col <= 10",
    ...     "SELECT * FROM lineitem WHERE partition_col > 10",
    ... ]
    >>> read_sql(queries, uri)
    """
    # connectorx is optional; _WITH_CX records whether its import succeeded.
    if _WITH_CX:
        tbl = cx.read_sql(
            conn=connection_uri,
            query=sql,
            return_type="arrow",
            partition_on=partition_on,
            partition_range=partition_range,
            partition_num=partition_num,
        )
        # Convert the Arrow table into a polars DataFrame.
        return pl.from_arrow(tbl)  # type: ignore[return-value]
    else:
        # Single string literal: the original implicit concatenation of two
        # adjacent literals produced "installed.Please" with no space.
        raise ImportError(
            "connectorx is not installed. Please run pip install connectorx>=0.2.0a3"
        )
--protocol=<protocol>  The protocol to use [default: binary].
--conn=<conn>  The connection url to use [default: POSTGRES_URL].
-h --help     Show this screen.
--version     Show version.
"""
import os

import connectorx as cx
from contexttimer import Timer
from docopt import docopt

if __name__ == "__main__":
    # docopt parses the module docstring above (__doc__) to build the CLI;
    # the docstring text is therefore runtime behavior, not mere documentation.
    args = docopt(__doc__, version="Naval Fate 2.0")
    # --conn gives the NAME of an environment variable holding the connection
    # URL (default POSTGRES_URL), not the URL itself.
    conn = os.environ[args["--conn"]]
    # Table to benchmark against; presumably a TPC-H lineitem-style table
    # (it is partitioned on L_ORDERKEY below) — TODO confirm with the caller.
    table = os.environ["TPCH_TABLE"]
    # Time only the query execution, not argument parsing or result printing.
    with Timer() as timer:
        df = cx.read_sql(
            conn,
            f"""SELECT * FROM {table}""",
            partition_on="L_ORDERKEY",
            # <num> positional argument controls the partition/thread count.
            partition_num=int(args["<num>"]),
            protocol=args["--protocol"],
        )
    print("time in total:", timer.elapsed)
    print(df.head())
    print(df.tail())
    print(len(df))
def read_sql(
    sql: list[str] | str,
    connection_uri: str,
    partition_on: str | None = None,
    partition_range: tuple[int, int] | None = None,
    partition_num: int | None = None,
    protocol: str | None = None,
) -> DataFrame:
    """
    Read a SQL query into a DataFrame.

    .. note::
        Make sure to install connectorx>=0.2.2. Read the documentation
        `here <https://sfu-db.github.io/connector-x/intro.html>`_.

    Reading a SQL query from the following data sources are supported:

    * Postgres
    * Mysql
    * Sqlite
    * Redshift (through postgres protocol)
    * Clickhouse (through mysql protocol)

    If a database source is not supported, an alternative solution is to first
    use pandas to load the SQL query, then converting the result into a polars
    DataFrame:

    >>> import pandas as pd
    >>> df = pl.from_pandas(pd.read_sql(sql, engine))  # doctest: +SKIP

    Parameters
    ----------
    sql
        Raw SQL query / queries.
    connection_uri
        Connectorx connection uri, for example

        * "postgresql://*****:*****@server:port/database"
    partition_on
        The column on which to partition the result.
    partition_range
        The value range of the partition column.
    partition_num
        How many partitions to generate.
    protocol
        Backend-specific transfer protocol directive; see connectorx
        documentation for details.

    Examples
    --------
    Read a DataFrame from a SQL query using a single thread:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(query, uri)  # doctest: +SKIP

    Read a DataFrame in parallel using 10 threads by automatically partitioning
    the provided SQL on the partition column:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(
    ...     query, uri, partition_on="partition_col", partition_num=10
    ... )  # doctest: +SKIP

    Read a DataFrame in parallel using 2 threads by explicitly providing two
    SQL queries:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> queries = [
    ...     "SELECT * FROM lineitem WHERE partition_col <= 10",
    ...     "SELECT * FROM lineitem WHERE partition_col > 10",
    ... ]
    >>> pl.read_sql(queries, uri)  # doctest: +SKIP
    """
    # Fail fast when the optional connectorx dependency is absent.
    if not _WITH_CX:
        raise ImportError(
            "connectorx is not installed. Please run `pip install connectorx>=0.2.2`."
        )
    # Fetch the result as an arrow2 table, then wrap it as a polars DataFrame.
    arrow_tbl = cx.read_sql(
        conn=connection_uri,
        query=sql,
        return_type="arrow2",
        partition_on=partition_on,
        partition_range=partition_range,
        partition_num=partition_num,
        protocol=protocol,
    )
    return cast(DataFrame, from_arrow(arrow_tbl))