Example #1
def test_no_rechunk() -> None:
    table = pa.Table.from_pydict(
        {"x": pa.chunked_array([list("ab"), list("cd")])})
    # table
    assert pl.from_arrow(table, rechunk=False).n_chunks() == 2
    # chunked array
    assert pl.from_arrow(table["x"], rechunk=False).n_chunks() == 2
Example #2
def test_from_arrow() -> None:
    data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    df = pl.from_arrow(data)
    assert df.shape == (3, 2)

    # if not a PyArrow type, raise a ValueError
    with pytest.raises(ValueError):
        _ = pl.from_arrow([1, 2])
Example #3
def build_tissue_synonym_df(tissue_file, output_dir):
    # Get metadata file and tissue_df (assume that tissue_df is also in output_dir)
    tissue_metadata = pl.read_csv(tissue_file)  # will read NA as string!
    tissue_df = pl.from_arrow(
        fread(os.path.join(output_dir, "tissue.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to tissueid
    tissue_cols = [
        col for col in tissue_metadata.columns
        if re.match(".*tissueid$", col) and col != "unique.tissueid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the tissueid columns to only valid datasets
    tissue_columns = [
        name for name in tissue_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with tissue_df
    tissue_meta_long = tissue_metadata \
        .melt(id_vars="unique.tissueid", value_vars=tissue_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "tissue_name", "variable": "dataset_id"})

    tissue_synonym_df = tissue_df \
        .join(tissue_meta_long, left_on="name", right_on="unique.tissueid", how="left") \
        .drop("name") \
        .rename({"id": "tissue_id"}) \
        .filter(col("tissue_name") != "") \
        .drop_duplicates() \
        .drop_nulls()

    # Create a map from dataset name to dataset id
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="records")
    }
    # Regex the dataset identifiers to match the dataset map
    tissue_synonym_df["dataset_id"] = tissue_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub(r"\.cellid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC$", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)

    tissue_synonym_df = tissue_synonym_df.drop_duplicates()
    tissue_synonym_df["id"] = range(1, tissue_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for fast write to disk
    tissue_synonym_dt = dt.Frame(tissue_synonym_df.to_arrow())
    tissue_synonym_dt.to_jay(os.path.join(output_dir, "tissue_synonym.jay"))
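
A minimal sketch of the identifier cleanup applied to "dataset_id" above, with the three substitutions collapsed into one helper; the `_clean` name and the sample column names are illustrative, not part of the original pipeline.

import re

def _clean(x: str) -> str:
    # same chain of substitutions as above, collapsed for illustration
    x = re.sub(r"\.cellid$|[_.].*$", "", x)  # strip the column suffix
    x = re.sub("GDSC$", "GDSC_v2", x)        # bare "GDSC" maps to GDSC_v2
    return re.sub("GDSC1.*$", "GDSC_v1", x)  # "GDSC1..." maps to GDSC_v1

assert _clean("CCLE.tissueid") == "CCLE"
assert _clean("GDSC.tissueid") == "GDSC_v2"
assert _clean("GDSC1.tissueid") == "GDSC_v1"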
Example #4
def build_compound_synonym_df(compound_file, output_dir):
    # Get metadata file and compound_df
    compound_metadata = pl.read_csv(compound_file, null_values="NA")
    compound_df = pl.from_arrow(
        fread(os.path.join(output_dir, "compound.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to drugid
    compound_cols = [
        col for col in compound_metadata.columns
        if re.match(".*drugid$", col) and col != "unique.drugid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the drugid columns to only valid datasets
    compound_columns = [
        name for name in compound_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with compound_df
    compound_meta_long = compound_metadata \
        .melt(id_vars="unique.drugid", value_vars=compound_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "compound_name", "variable": "dataset_id"}) \
        .filter(col("compound_name") != "")

    compound_synonym_df = compound_df \
        .join(compound_meta_long, left_on="name", right_on="unique.drugid", how="left") \
        .rename({"id": "compound_id"}) \
        .select(["compound_id", "dataset_id", "compound_name"]) \
        .drop_nulls() \
        .drop_duplicates()

    # Create a map from dataset name to dataset id
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="records")
    }
    # Regex the dataset identifiers to match the dataset map
    compound_synonym_df["dataset_id"] = compound_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub(r"\.drugid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC2019", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)

    compound_synonym_df = compound_synonym_df.drop_duplicates()
    compound_synonym_df["id"] = range(1, compound_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for memory mapped output file
    df = dt.Frame(compound_synonym_df.to_arrow())
    df.to_jay(os.path.join(output_dir, "compound_synonym.jay"))
Example #5
def test_from_optional_not_available() -> None:
    with patch("polars.convert._NUMPY_AVAILABLE", False):
        with pytest.raises(ImportError):
            pl.from_numpy(np.array([[1, 2], [3, 4]]), columns=["a", "b"])
    with patch("polars.convert._PYARROW_AVAILABLE", False):
        with pytest.raises(ImportError):
            pl.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))
        with pytest.raises(ImportError):
            pl.from_pandas(pd.Series([1, 2, 3]))
    with patch("polars.convert._PANDAS_AVAILABLE", False):
        with pytest.raises(ImportError):
            pl.from_pandas(pd.Series([1, 2, 3]))
Example #6
def test_timezone() -> None:
    ts = pa.timestamp("s")
    data = pa.array([1000, 2000], type=ts)
    s: pl.Series = pl.from_arrow(data)  # type: ignore

    # with timezone; we do expect a warning here
    tz_ts = pa.timestamp("s", tz="America/New_York")
    tz_data = pa.array([1000, 2000], type=tz_ts)
    with pytest.warns(Warning):
        tz_s: pl.Series = pl.from_arrow(tz_data)  # type: ignore

    # timezones have no effect, i.e. `s` equals `tz_s`
    assert s.series_equal(tz_s)
Example #7
def test_from_empty_arrow() -> None:
    df = pl.from_arrow(pa.table(pd.DataFrame({"a": [], "b": []})))
    assert df.columns == ["a", "b"]  # type: ignore
    assert df.dtypes == [pl.Float64, pl.Float64]  # type: ignore

    # 2705
    df1 = pd.DataFrame(columns=["b"], dtype=float)
    tbl = pa.Table.from_pandas(df1)
    out = pl.from_arrow(tbl)
    assert out.columns == ["b", "__index_level_0__"]  # type: ignore
    assert out.dtypes == [pl.Float64, pl.Utf8]  # type: ignore
    tbl = pa.Table.from_pandas(df1, preserve_index=False)
    out = pl.from_arrow(tbl)
    assert out.columns == ["b"]  # type: ignore
    assert out.dtypes == [pl.Float64]  # type: ignore
Example #8
def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        # we have to coerce before combining chunks, because pyarrow panics if
        # offsets overflow
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pyarrow does not seem to support casting from list to largelist,
            # so we convert to large list ourselves and do the re-alloc on the polars/arrow side
            chunks = []
            for arr in array.iterchunks():
                chunks.append(pl.from_arrow(arr).to_arrow())
            array = pa.chunked_array(chunks)

        array = array.combine_chunks()
    return array
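
A hedged sanity check of the chunk-combining branch of coerce_arrow above: a two-chunk utf8 array should come back cast to large_utf8 and collapsed into a single contiguous array. The sample data is made up, and pyarrow.compute is imported explicitly on the assumption that `pa.compute` is not otherwise available.

import pyarrow as pa
import pyarrow.compute  # noqa: F401 -- coerce_arrow uses pa.compute.cast

chunked = pa.chunked_array([["a", "b"], ["c"]])  # two chunks of plain utf8
out = coerce_arrow(chunked)
assert out.type == pa.large_utf8()  # cast before combining, so offsets cannot overflow
assert len(out) == 3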
Example #9
def concat_and_sort(blocks: List["pyarrow.Table"], key: "SortKeyT",
                    descending: bool) -> "pyarrow.Table":
    check_polars_installed()
    col, _ = key[0]
    blocks = [pl.from_arrow(block) for block in blocks]
    df = pl.concat(blocks).sort(col, reverse=descending)
    return df.to_arrow()
Example #10
def test_column_names():
    tbl = pa.table({
        "a": pa.array([1, 2, 3, 4, 5], pa.decimal128(38, 2)),
        "b": pa.array([1, 2, 3, 4, 5], pa.int64()),
    })
    df = pl.from_arrow(tbl)
    assert df.columns == ["a", "b"]
Example #11
def build_compound_target_table(chembl_df, drugbank_df, target_df, output_dir,
                                compound_synonym_file):
    """
    Using data from the Drugbank and ChEMBL drug target files and 
    the target table, build the drug target table.

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @param compound_synonym_file: [`string`] The file path to the compound synonym table
    @return: [`dt.Frame`] The drug target table
    """
    # Load compound synonym table from output_dir
    if not os.path.exists(compound_synonym_file):
        raise FileNotFoundError(
            f"The file {compound_synonym_file} doesn't exist!")
    drug_syn_df = dt.fread(compound_synonym_file)
    # Join drugbank df with drug table
    del drug_syn_df[:, ['dataset_id', 'id']]
    drug_syn_df = pl.from_arrow(drug_syn_df.to_arrow()) \
        .drop_duplicates()
    drugbank_df = pl.from_arrow(
        drugbank_df[:, ['name', 'compound_name']].to_arrow())
    drugbank_df = drugbank_df.join(drug_syn_df, on='compound_name')
    # Combine ChEMBL and Drugbank tables to make drug target table
    drug_target_df = pd.concat([
        chembl_df.to_pandas()[['name', 'compound_id']].copy(),
        drugbank_df.to_pandas()[['name', 'compound_id']].copy()
    ])
    drug_target_df.rename(columns={'name': 'target_id'}, inplace=True)
    drug_target_df.drop_duplicates(inplace=True)
    # Join with target table
    drug_target_df = dt.Frame(drug_target_df)
    drug_target_df = join_tables(drug_target_df, target_df, 'target_id')
    # Drop rows with no target_id, drop duplicates
    drug_target_df = drug_target_df[dt.f.target_id >= 1, :]
    drug_target_df = drug_target_df[0, :, dt.by(drug_target_df.names)]
    drug_target_df = dt.Frame(
        pl.from_arrow(drug_target_df.to_arrow()) \
            .drop_nulls() \
            .to_arrow())
    drug_target_df = write_table(drug_target_df,
                                 'compound_target',
                                 output_dir,
                                 add_index=False)
    return drug_target_df
Example #12
def test_from_time_arrow() -> None:
    times = pa.array([10, 20, 30], type=pa.time32("s"))
    times_table = pa.table([times], names=["times"])

    assert pl.from_arrow(times_table).to_series().to_list() == [
        time(0, 0, 10),
        time(0, 0, 20),
        time(0, 0, 30),
    ]
Example #13
def test_from_arrow():
    tbl = pa.table({
        "a": pa.array([1, 2], pa.timestamp("s")),
        "b": pa.array([1, 2], pa.timestamp("ms")),
        "c": pa.array([1, 2], pa.timestamp("us")),
        "d": pa.array([1, 2], pa.timestamp("ns")),
        "decimal1": pa.array([1, 2], pa.decimal128(2, 1)),
    })
    assert pl.from_arrow(tbl).shape == (2, 5)
Example #14
def test_categorical_round_trip() -> None:
    df = pl.DataFrame({"ints": [1, 2, 3], "cat": ["a", "b", "c"]})
    df = df.with_column(pl.col("cat").cast(pl.Categorical))

    tbl = df.to_arrow()
    assert "dictionary" in str(tbl["cat"].type)

    df2: pl.DataFrame = pl.from_arrow(tbl)  # type: ignore
    assert df2.dtypes == [pl.Int64, pl.Categorical]
Example #15
def test_arrow():
    a = pl.Series("a", [1, 2, 3, None])
    out = a.to_arrow()
    assert out == pa.array([1, 2, 3, None])

    a = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
    s = pl.Series("a", a)
    assert s.dtype == pl.Categorical
    assert (pl.from_arrow(
        pa.array([["foo"], ["foo", "bar"]],
                 pa.list_(pa.utf8()))).dtype == pl.List)
Example #16
def test_upcast_pyarrow_dicts() -> None:
    # 1752
    tbls = []
    for i in range(128):
        tbls.append(
            pa.table({
                "col_name":
                pa.array(["value_" + str(i)],
                         pa.dictionary(pa.int8(), pa.string())),
            }))

    tbl = pa.concat_tables(tbls, promote=True)
    out = pl.from_arrow(tbl)
    assert out.shape == (128, 1)
Example #17
def test_microseconds_accuracy() -> None:
    timestamps = [
        datetime(2600, 1, 1, 0, 0, 0, 123456),
        datetime(2800, 1, 1, 0, 0, 0, 456789),
    ]
    a = pa.Table.from_arrays(
        arrays=[timestamps, [128, 256]],
        schema=pa.schema([
            ("timestamp", pa.timestamp("us")),
            ("value", pa.int16()),
        ]),
    )

    assert pl.from_arrow(
        a)["timestamp"].to_list() == timestamps  # type: ignore
Example #18
def _scan_ds_impl(
    ds: pa.dataset.dataset, with_columns: list[str] | None
) -> pli.DataFrame:
    """
    Takes the projected columns and materializes an arrow table.

    Parameters
    ----------
    ds
    with_columns

    Returns
    -------

    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError("'pyarrow' is required for scanning from pyarrow datasets.")
    return pl.from_arrow(ds.to_table(columns=with_columns))  # type: ignore[return-value]
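
A minimal usage sketch, assuming it runs inside the same polars module as the function above (where `pl`, `pli` and `_PYARROW_AVAILABLE` are defined); the "data/" path and the column names are hypothetical.

import pyarrow.dataset as pads

dataset = pads.dataset("data/", format="parquet")     # hypothetical on-disk parquet dataset
df = _scan_ds_impl(dataset, with_columns=["a", "b"])  # materialize only the projected columns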
Example #19
def _scan_ds_impl(ds: "pa.dataset.dataset",
                  with_columns: Optional[List[str]]) -> "pli.DataFrame":
    """
    Takes the projected columns and materializes an arrow table.

    Parameters
    ----------
    ds
    with_columns

    Returns
    -------

    """
    if not _PYARROW_AVAILABLE:
        raise ImportError(  # pragma: no cover
            "'pyarrow' is required for scanning from pyarrow datasets.")
    return pl.from_arrow(ds.to_table(columns=with_columns))  # type: ignore
Example #20
def test_cat_int_types_3500() -> None:
    with pl.StringCache():
        # Create an enum / categorical / dictionary typed pyarrow array
        # Most simply done by creating a pandas categorical series first
        categorical_df = pd.Series(["a", "a", "b"], dtype="category")
        pyarrow_array = pa.Array.from_pandas(categorical_df)

        # The in-memory representation of each category can either be a signed or unsigned 8-bit integer
        # Pandas uses Int8...
        int_dict_type = pa.dictionary(index_type=pa.int8(),
                                      value_type=pa.utf8())
        # ... while DuckDB uses UInt8
        uint_dict_type = pa.dictionary(index_type=pa.uint8(),
                                       value_type=pa.utf8())

        for t in [int_dict_type, uint_dict_type]:
            s = pl.from_arrow(pyarrow_array.cast(t))
            assert s.series_equal(
                pl.Series(["a", "a", "b"]).cast(pl.Categorical))
Example #21
    def to_polars(self, *, skip_date_conversion: bool = False) -> pl.DataFrame:
        """Export the report data to a Polars DataFrame.

        Keyword Args:
            skip_date_conversion:
                Whether to skip automatically converting date columns to
                the ``datetime[ns]`` format. Defaults to ``False``.

        Returns:
            The newly created DataFrame.

        .. versionadded:: 3.6.0
        """

        if analytix.can_use("polars"):
            import polars as pl
        else:
            raise errors.MissingOptionalComponents("polars")

        return pl.from_arrow(
            self.to_arrow_table(skip_date_conversion=skip_date_conversion))
Example #22
def build_struct_df(data: list) -> DataFrame:
    """Build Polars df from list of dicts. Can't import directly because of issue #3145."""
    arrow_df = pa.Table.from_pylist(data)
    polars_df = pl.from_arrow(arrow_df)
    assert isinstance(polars_df, DataFrame)
    return polars_df
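
A quick sanity check of the helper above; the dict values are made up, and it assumes a pyarrow version that provides Table.from_pylist plus the imports the snippet already relies on.

import pyarrow as pa
import polars as pl
from polars import DataFrame

df = build_struct_df([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])
assert df.shape == (2, 2)
assert df.columns == ["a", "b"]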
Example #23
def sort(table: "pyarrow.Table", key: "SortKeyT",
         descending: bool) -> "pyarrow.Table":
    check_polars_installed()
    col, _ = key[0]
    df = pl.from_arrow(table)
    return df.sort(col, reverse=descending).to_arrow()
Example #24
def test_arrow_list_roundtrip() -> None:
    # https://github.com/pola-rs/polars/issues/1064
    tbl = pa.table({"a": [1], "b": [[1, 2]]})
    assert pl.from_arrow(tbl).to_arrow().shape == tbl.shape
Example #25
def test_arrow_list_chunked_array() -> None:
    a = pa.array([[1, 2], [3, 4]])
    ca = pa.chunked_array([a, a, a])
    s = pl.from_arrow(ca)
    assert s.dtype == pl.List
Example #26
def test_arrow_list_roundtrip():
    # https://github.com/pola-rs/polars/issues/1064
    pl.from_arrow(pa.table({"a": [1], "b": [[1, 2]]})).to_arrow()
Example #27
def test_struct_with_validity() -> None:
    data = [{"a": {"b": 1}}, {"a": None}]
    tbl = pa.Table.from_pylist(data)
    df = pl.from_arrow(tbl)
    assert isinstance(df, pl.DataFrame)
    assert df["a"].to_list() == [{"b": 1}, {"b": None}]
Example #28
def test_from_arrow_table():
    data = {"a": [1, 2], "b": [1, 2]}
    tbl = pa.table(data)

    df = pl.from_arrow(tbl)
    assert df.frame_equal(pl.DataFrame(data))
Example #29
def read_sql(
    sql: Union[List[str], str],
    connection_uri: str,
    partition_on: Optional[str] = None,
    partition_range: Optional[Tuple[int, int]] = None,
    partition_num: Optional[int] = None,
) -> "pl.DataFrame":
    """
    Read a SQL query into a DataFrame.
    Make sure to install connectorx>=0.2

    # Sources
    Supports reading a sql query from the following data sources:

    * Postgres
    * Mysql
    * Sqlite
    * Redshift (through postgres protocol)
    * Clickhouse (through mysql protocol)

    ## Source not supported?
    If a database source is not supported, pandas can be used to load the query:

    >>> df = pl.from_pandas(pd.read_sql(sql, engine))

    Parameters
    ----------
    sql
        raw sql query
    connection_uri
        connectorx connection uri:
            - "postgresql://*****:*****@server:port/database"
    partition_on
      the column on which to partition the result.
    partition_range
      the value range of the partition column.
    partition_num
      how many partitions to generate.


    Examples
    --------

    ## Single threaded
    Read a DataFrame from a SQL query using a single thread:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(query, uri)

    ## Using 10 threads
    Read a DataFrame in parallel using 10 threads by automatically partitioning the provided SQL on the partition column:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> read_sql(query, uri, partition_on="partition_col", partition_num=10)

    ## Using 2 threads
    Read a DataFrame in parallel using 2 threads by manually providing two partition SQLs:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> queries = ["SELECT * FROM lineitem WHERE partition_col <= 10", "SELECT * FROM lineitem WHERE partition_col > 10"]
    >>> read_sql(queries, uri)

    """
    if _WITH_CX:
        tbl = cx.read_sql(
            conn=connection_uri,
            query=sql,
            return_type="arrow",
            partition_on=partition_on,
            partition_range=partition_range,
            partition_num=partition_num,
        )
        return pl.from_arrow(tbl)  # type: ignore[return-value]
    else:
        raise ImportError("connectorx is not installed."
                          "Please run pip install connectorx>=0.2.0a3")
Example #30
def read_sql(
    conn: str,
    query: Union[List[str], str],
    *,
    return_type: str = "pandas",
    protocol: Optional[str] = None,
    partition_on: Optional[str] = None,
    partition_range: Optional[Tuple[int, int]] = None,
    partition_num: Optional[int] = None,
    index_col: Optional[str] = None,
):
    """
    Run the SQL query, download the data from the database into a dataframe.

    Parameters
    ==========
    conn
      the connection string.
    query
      a SQL query or a list of SQL queries.
    return_type
      the return type of this function; one of "arrow", "pandas", "modin", "dask" or "polars".
    protocol
      backend-specific transfer protocol directive; defaults to 'binary' (except for redshift
      connection strings, where 'cursor' will be used instead).
    partition_on
      the column on which to partition the result.
    partition_range
      the value range of the partition column.
    partition_num
      how many partitions to generate.
    index_col
      the index column to set; only applicable for return type "pandas", "modin", "dask".

    Examples
    ========
    Read a DataFrame from a SQL query using a single thread:

    >>> postgres_url = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> read_sql(postgres_url, query)

    Read a DataFrame in parallel using 10 threads by automatically partitioning the provided SQL on the partition column:

    >>> postgres_url = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> read_sql(postgres_url, query, partition_on="partition_col", partition_num=10)

    Read a DataFrame in parallel using 2 threads by explicitly providing two SQL queries:

    >>> postgres_url = "postgresql://*****:*****@server:port/database"
    >>> queries = ["SELECT * FROM lineitem WHERE partition_col <= 10", "SELECT * FROM lineitem WHERE partition_col > 10"]
    >>> read_sql(postgres_url, queries)

    """

    if isinstance(query, list) and len(query) == 1:
        query = query[0]

    if isinstance(query, str):
        if partition_on is None:
            queries = [query]
            partition_query = None
        else:
            partition_query = {
                "query": query,
                "column": partition_on,
                "min": partition_range[0] if partition_range else None,
                "max": partition_range[1] if partition_range else None,
                "num": partition_num,
            }
            queries = None
    elif isinstance(query, list):
        queries = query
        partition_query = None

        if partition_on is not None:
            raise ValueError("Partition on multiple queries is not supported.")
    else:
        raise ValueError("query must be either str or a list of str")

    if not protocol:
        # note: redshift/clickhouse are not compatible with the 'binary' protocol, and use other database
        # drivers to connect. set a compatible protocol and masquerade as the appropriate backend.
        backend, connection_details = conn.split(":", 1) if conn else ("", "")
        if "redshift" in backend:
            conn = f"postgresql:{connection_details}"
            protocol = "cursor"
        elif "clickhouse" in backend:
            conn = f"mysql:{connection_details}"
            protocol = "text"
        else:
            protocol = "binary"

    if return_type in {"modin", "dask", "pandas"}:
        try:
            import pandas
        except ModuleNotFoundError:
            raise ValueError("You need to install pandas first")

        result = _read_sql(
            conn,
            "pandas",
            queries=queries,
            protocol=protocol,
            partition_query=partition_query,
        )
        df = reconstruct_pandas(result)

        if index_col is not None:
            df.set_index(index_col, inplace=True)

        if return_type == "modin":
            try:
                import modin.pandas as mpd
            except ModuleNotFoundError:
                raise ValueError("You need to install modin first")

            df = mpd.DataFrame(df)
        elif return_type == "dask":
            try:
                import dask.dataframe as dd
            except ModuleNotFoundError:
                raise ValueError("You need to install dask first")

            df = dd.from_pandas(df, npartitions=1)

    elif return_type in {"arrow", "polars"}:
        try:
            import pyarrow
        except ModuleNotFoundError:
            raise ValueError("You need to install pyarrow first")

        result = _read_sql(
            conn,
            "arrow",
            queries=queries,
            protocol=protocol,
            partition_query=partition_query,
        )
        df = reconstruct_arrow(result)
        if return_type == "polars":
            try:
                import polars as pl
            except ModuleNotFoundError:
                raise ValueError("You need to install polars first")

            try:
                df = pl.DataFrame.from_arrow(df)
            except AttributeError:
                # api change for polars >= 0.8.*
                df = pl.from_arrow(df)
    else:
        raise ValueError(return_type)

    return df