def test_no_rechunk() -> None:
    table = pa.Table.from_pydict({"x": pa.chunked_array([list("ab"), list("cd")])})
    # table
    assert pl.from_arrow(table, rechunk=False).n_chunks() == 2
    # chunked array
    assert pl.from_arrow(table["x"], rechunk=False).n_chunks() == 2

def test_from_arrow() -> None:
    data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    df = pl.from_arrow(data)
    assert df.shape == (3, 2)

    # if not a PyArrow type, raise a ValueError
    with pytest.raises(ValueError):
        _ = pl.from_arrow([1, 2])

def build_tissue_synonym_df(tissue_file, output_dir):
    # Get metadata file and tissue_df (assume that tissue_df is also in output_dir)
    tissue_metadata = pl.read_csv(tissue_file)  # will read NA as string!
    tissue_df = pl.from_arrow(
        fread(os.path.join(output_dir, "tissue.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to tissueid
    tissue_cols = [
        col for col in tissue_metadata.columns
        if re.match(".*tissueid$", col) and col != "unique.tissueid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the tissueid columns to only valid datasets
    tissue_columns = [
        name for name in tissue_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with tissue_df
    tissue_meta_long = tissue_metadata \
        .melt(id_vars="unique.tissueid", value_vars=tissue_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "tissue_name", "variable": "dataset_id"})

    tissue_synonym_df = tissue_df \
        .join(tissue_meta_long, left_on="name", right_on="unique.tissueid",
              how="left") \
        .drop("name") \
        .rename({"id": "tissue_id"}) \
        .filter(col("tissue_name") != "") \
        .drop_duplicates() \
        .drop_nulls()

    # Create a map from dataset name to dataset id
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="records")
    }

    # Regex the dataset identifiers to match the dataset map
    tissue_synonym_df["dataset_id"] = tissue_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub(r"\.cellid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC$", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)

    tissue_synonym_df = tissue_synonym_df.drop_duplicates()
    tissue_synonym_df["id"] = range(1, tissue_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for fast write to disk
    tissue_synonym_dt = dt.Frame(tissue_synonym_df.to_arrow())
    tissue_synonym_dt.to_jay(os.path.join(output_dir, "tissue_synonym.jay"))

def build_compound_synonym_df(compound_file, output_dir):
    # Get metadata file and compound_df
    compound_metadata = pl.read_csv(compound_file, null_values="NA")
    compound_df = pl.from_arrow(
        fread(os.path.join(output_dir, "compound.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to drugid
    compound_cols = [
        col for col in compound_metadata.columns
        if re.match(".*drugid$", col) and col != "unique.drugid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the drugid columns to only valid datasets
    compound_columns = [
        name for name in compound_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with compound_df
    compound_meta_long = compound_metadata \
        .melt(id_vars="unique.drugid", value_vars=compound_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "compound_name", "variable": "dataset_id"}) \
        .filter(col("compound_name") != "")

    compound_synonym_df = compound_df \
        .join(compound_meta_long, left_on="name", right_on="unique.drugid",
              how="left") \
        .rename({"id": "compound_id"}) \
        .select(["compound_id", "dataset_id", "compound_name"]) \
        .drop_nulls() \
        .drop_duplicates()

    # Create a map from dataset name to dataset id
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="records")
    }

    # Regex the dataset identifiers to match the dataset map
    compound_synonym_df["dataset_id"] = compound_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub(r"\.drugid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC2019", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)

    compound_synonym_df = compound_synonym_df.drop_duplicates()
    compound_synonym_df["id"] = range(1, compound_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for memory mapped output file
    df = dt.Frame(compound_synonym_df.to_arrow())
    df.to_jay(os.path.join(output_dir, "compound_synonym.jay"))

def test_from_optional_not_available() -> None:
    with patch("polars.convert._NUMPY_AVAILABLE", False):
        with pytest.raises(ImportError):
            pl.from_numpy(np.array([[1, 2], [3, 4]]), columns=["a", "b"])
    with patch("polars.convert._PYARROW_AVAILABLE", False):
        with pytest.raises(ImportError):
            pl.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))
        with pytest.raises(ImportError):
            pl.from_pandas(pd.Series([1, 2, 3]))
    with patch("polars.convert._PANDAS_AVAILABLE", False):
        with pytest.raises(ImportError):
            pl.from_pandas(pd.Series([1, 2, 3]))

def test_timezone() -> None:
    ts = pa.timestamp("s")
    data = pa.array([1000, 2000], type=ts)
    s: pl.Series = pl.from_arrow(data)  # type: ignore

    # with timezone; we do expect a warning here
    tz_ts = pa.timestamp("s", tz="America/New_York")
    tz_data = pa.array([1000, 2000], type=tz_ts)
    with pytest.warns(Warning):
        tz_s: pl.Series = pl.from_arrow(tz_data)  # type: ignore

    # timezones have no effect, i.e. `s` equals `tz_s`
    assert s.series_equal(tz_s)

def test_from_empty_arrow() -> None:
    df = pl.from_arrow(pa.table(pd.DataFrame({"a": [], "b": []})))
    assert df.columns == ["a", "b"]  # type: ignore
    assert df.dtypes == [pl.Float64, pl.Float64]  # type: ignore

    # 2705
    df1 = pd.DataFrame(columns=["b"], dtype=float)
    tbl = pa.Table.from_pandas(df1)
    out = pl.from_arrow(tbl)
    assert out.columns == ["b", "__index_level_0__"]  # type: ignore
    assert out.dtypes == [pl.Float64, pl.Utf8]  # type: ignore

    tbl = pa.Table.from_pandas(df1, preserve_index=False)
    out = pl.from_arrow(tbl)
    assert out.columns == ["b"]  # type: ignore
    assert out.dtypes == [pl.Float64]  # type: ignore

def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        # we have to coerce before combining chunks, because pyarrow panics if
        # offsets overflow
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pyarrow does not seem to support casting from list to largelist,
            # so we convert to large list ourselves and do the re-alloc on the
            # polars/arrow side
            chunks = []
            for arr in array.iterchunks():
                chunks.append(pl.from_arrow(arr).to_arrow())
            array = pa.chunked_array(chunks)
        array = array.combine_chunks()
    return array

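# A minimal usage sketch for coerce_arrow above (not part of the original source):
# a multi-chunk string column is widened to large_utf8 before its chunks are
# combined, so the combined offsets cannot overflow the 32-bit string type.
def _demo_coerce_arrow() -> None:
    chunked = pa.chunked_array([["a", "b"], ["c"]])
    coerced = coerce_arrow(chunked)
    print(coerced.type)  # large_string
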
def concat_and_sort(blocks: List["pyarrow.Table"], key: "SortKeyT",
                    descending: bool) -> "pyarrow.Table":
    check_polars_installed()
    col, _ = key[0]
    blocks = [pl.from_arrow(block) for block in blocks]
    df = pl.concat(blocks).sort(col, reverse=descending)
    return df.to_arrow()

def test_column_names():
    tbl = pa.table({
        "a": pa.array([1, 2, 3, 4, 5], pa.decimal128(38, 2)),
        "b": pa.array([1, 2, 3, 4, 5], pa.int64()),
    })
    df = pl.from_arrow(tbl)
    assert df.columns == ["a", "b"]

def build_compound_target_table(chembl_df, drugbank_df, target_df, output_dir,
                                compound_synonym_file):
    """
    Using data from the Drugbank and ChEMBL drug target files and the target
    table, build the drug target table.

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param target_df: [`datatable.Frame`] The target table, keyed
    @param output_dir: [`string`] The file path with all final PharmacoDB tables
    @param compound_synonym_file: [`string`] The file path to the compound synonym table
    @return: [`dt.Frame`] The drug target table
    """
    # Load compound synonym table from output_dir
    if not os.path.exists(compound_synonym_file):
        raise FileNotFoundError(
            f"The file {compound_synonym_file} doesn't exist!")
    drug_syn_df = dt.fread(compound_synonym_file)

    # Join drugbank df with drug table
    del drug_syn_df[:, ['dataset_id', 'id']]
    drug_syn_df = pl.from_arrow(drug_syn_df.to_arrow()) \
        .drop_duplicates()
    drugbank_df = pl.from_arrow(
        drugbank_df[:, ['name', 'compound_name']].to_arrow())
    drugbank_df = drugbank_df.join(drug_syn_df, on='compound_name')

    # Combine ChEMBL and Drugbank tables to make drug target table
    drug_target_df = pd.concat([
        chembl_df.to_pandas()[['name', 'compound_id']].copy(),
        drugbank_df.to_pandas()[['name', 'compound_id']].copy()
    ])
    drug_target_df.rename(columns={'name': 'target_id'}, inplace=True)
    drug_target_df.drop_duplicates(inplace=True)

    # Join with target table
    drug_target_df = dt.Frame(drug_target_df)
    drug_target_df = join_tables(drug_target_df, target_df, 'target_id')

    # Drop rows with no target_id, drop duplicates
    drug_target_df = drug_target_df[dt.f.target_id >= 1, :]
    drug_target_df = drug_target_df[0, :, dt.by(drug_target_df.names)]
    drug_target_df = dt.Frame(
        pl.from_arrow(drug_target_df.to_arrow())
        .drop_nulls()
        .to_arrow())
    drug_target_df = write_table(drug_target_df, 'compound_target', output_dir,
                                 add_index=False)
    return drug_target_df

def test_from_time_arrow() -> None:
    times = pa.array([10, 20, 30], type=pa.time32("s"))
    times_table = pa.table([times], names=["times"])
    assert pl.from_arrow(times_table).to_series().to_list() == [
        time(0, 0, 10),
        time(0, 0, 20),
        time(0, 0, 30),
    ]

def test_from_arrow():
    tbl = pa.table({
        "a": pa.array([1, 2], pa.timestamp("s")),
        "b": pa.array([1, 2], pa.timestamp("ms")),
        "c": pa.array([1, 2], pa.timestamp("us")),
        "d": pa.array([1, 2], pa.timestamp("ns")),
        "decimal1": pa.array([1, 2], pa.decimal128(2, 1)),
    })
    assert pl.from_arrow(tbl).shape == (2, 5)

def test_categorical_round_trip() -> None:
    df = pl.DataFrame({"ints": [1, 2, 3], "cat": ["a", "b", "c"]})
    df = df.with_column(pl.col("cat").cast(pl.Categorical))

    tbl = df.to_arrow()
    assert "dictionary" in str(tbl["cat"].type)

    df2: pl.DataFrame = pl.from_arrow(tbl)  # type: ignore
    assert df2.dtypes == [pl.Int64, pl.Categorical]

def test_arrow():
    a = pl.Series("a", [1, 2, 3, None])
    out = a.to_arrow()
    assert out == pa.array([1, 2, 3, None])

    a = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
    s = pl.Series("a", a)
    assert s.dtype == pl.Categorical
    assert (
        pl.from_arrow(pa.array([["foo"], ["foo", "bar"]], pa.list_(pa.utf8()))).dtype
        == pl.List
    )

def test_upcast_pyarrow_dicts() -> None:
    # 1752
    tbls = []
    for i in range(128):
        tbls.append(
            pa.table({
                "col_name": pa.array(
                    ["value_" + str(i)], pa.dictionary(pa.int8(), pa.string())
                ),
            })
        )
    tbl = pa.concat_tables(tbls, promote=True)
    out = pl.from_arrow(tbl)
    assert out.shape == (128, 1)

def test_microseconds_accuracy() -> None:
    timestamps = [
        datetime(2600, 1, 1, 0, 0, 0, 123456),
        datetime(2800, 1, 1, 0, 0, 0, 456789),
    ]
    a = pa.Table.from_arrays(
        arrays=[timestamps, [128, 256]],
        schema=pa.schema([
            ("timestamp", pa.timestamp("us")),
            ("value", pa.int16()),
        ]),
    )
    assert pl.from_arrow(a)["timestamp"].to_list() == timestamps  # type: ignore

def _scan_ds_impl(
    ds: pa.dataset.dataset, with_columns: list[str] | None
) -> pli.DataFrame:
    """
    Takes the projected columns and materializes an arrow table.

    Parameters
    ----------
    ds
    with_columns

    Returns
    -------

    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError("'pyarrow' is required for scanning from pyarrow datasets.")
    return pl.from_arrow(ds.to_table(columns=with_columns))  # type: ignore[return-value]

def _scan_ds_impl(ds: "pa.dataset.dataset",
                  with_columns: Optional[List[str]]) -> "pli.DataFrame":
    """
    Takes the projected columns and materializes an arrow table.

    Parameters
    ----------
    ds
    with_columns

    Returns
    -------

    """
    if not _PYARROW_AVAILABLE:
        raise ImportError(  # pragma: no cover
            "'pyarrow' is required for scanning from pyarrow datasets.")
    return pl.from_arrow(ds.to_table(columns=with_columns))  # type: ignore

def test_cat_int_types_3500() -> None:
    with pl.StringCache():
        # Create an enum / categorical / dictionary typed pyarrow array
        # Most simply done by creating a pandas categorical series first
        categorical_df = pd.Series(["a", "a", "b"], dtype="category")
        pyarrow_array = pa.Array.from_pandas(categorical_df)

        # The in-memory representation of each category can either be a signed
        # or unsigned 8-bit integer. Pandas uses Int8...
        int_dict_type = pa.dictionary(index_type=pa.int8(), value_type=pa.utf8())
        # ... while DuckDB uses UInt8
        uint_dict_type = pa.dictionary(index_type=pa.uint8(), value_type=pa.utf8())

        for t in [int_dict_type, uint_dict_type]:
            s = pl.from_arrow(pyarrow_array.cast(t))
            assert s.series_equal(pl.Series(["a", "a", "b"]).cast(pl.Categorical))

def to_polars(self, *, skip_date_conversion: bool = False) -> pl.DataFrame:
    """Export the report data to a Polars DataFrame.

    Keyword Args:
        skip_date_conversion:
            Whether to skip automatically converting date columns to the
            ``datetime[ns]`` format. Defaults to ``False``.

    Returns:
        The newly created DataFrame.

    .. versionadded:: 3.6.0
    """
    if analytix.can_use("polars"):
        import polars as pl
    else:
        raise errors.MissingOptionalComponents("polars")

    return pl.from_arrow(
        self.to_arrow_table(skip_date_conversion=skip_date_conversion))

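# Hypothetical usage of to_polars above, assuming `report` is an analytix report
# object obtained elsewhere (the retrieval call is not shown in the original and
# is only illustrative).
def _export_report(report) -> "pl.DataFrame":
    # Keep raw date strings instead of converting them to datetime[ns].
    return report.to_polars(skip_date_conversion=True)
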
def build_struct_df(data: list) -> DataFrame:
    """Build Polars df from list of dicts.

    Can't import directly because of issue #3145.
    """
    arrow_df = pa.Table.from_pylist(data)
    polars_df = pl.from_arrow(arrow_df)
    assert isinstance(polars_df, DataFrame)
    return polars_df

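# A brief usage sketch for build_struct_df above (not from the original source):
# each dict becomes a row, each key becomes a column.
def _demo_build_struct_df() -> None:
    df = build_struct_df([{"x": 1, "y": "a"}, {"x": 2, "y": "b"}])
    print(df.shape)  # (2, 2)
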
def sort(table: "pyarrow.Table", key: "SortKeyT",
         descending: bool) -> "pyarrow.Table":
    check_polars_installed()
    col, _ = key[0]
    df = pl.from_arrow(table)
    return df.sort(col, reverse=descending).to_arrow()

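# Hypothetical usage sketch for sort above, assuming check_polars_installed is
# importable in this module and SortKeyT is a sequence of (column_name, order)
# pairs; only the first column name is used, and the order element is ignored
# here since the sort direction comes from `descending`.
def _demo_sort() -> None:
    tbl = pa.table({"x": [3, 1, 2]})
    out = sort(tbl, key=[("x", "ascending")], descending=False)
    print(out["x"].to_pylist())  # [1, 2, 3]
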
def test_arrow_list_roundtrip() -> None:
    # https://github.com/pola-rs/polars/issues/1064
    tbl = pa.table({"a": [1], "b": [[1, 2]]})
    assert pl.from_arrow(tbl).to_arrow().shape == tbl.shape

def test_arrow_list_chunked_array() -> None:
    a = pa.array([[1, 2], [3, 4]])
    ca = pa.chunked_array([a, a, a])
    s = pl.from_arrow(ca)
    assert s.dtype == pl.List

def test_arrow_list_roundtrip():
    # https://github.com/pola-rs/polars/issues/1064
    pl.from_arrow(pa.table({"a": [1], "b": [[1, 2]]})).to_arrow()

def test_struct_with_validity() -> None:
    data = [{"a": {"b": 1}}, {"a": None}]
    tbl = pa.Table.from_pylist(data)
    df = pl.from_arrow(tbl)
    assert isinstance(df, pl.DataFrame)
    assert df["a"].to_list() == [{"b": 1}, {"b": None}]

def test_from_arrow_table():
    data = {"a": [1, 2], "b": [1, 2]}
    tbl = pa.table(data)

    df = pl.from_arrow(tbl)
    assert df.frame_equal(pl.DataFrame(data))

def read_sql(
    sql: Union[List[str], str],
    connection_uri: str,
    partition_on: Optional[str] = None,
    partition_range: Optional[Tuple[int, int]] = None,
    partition_num: Optional[int] = None,
) -> "pl.DataFrame":
    """
    Read a SQL query into a DataFrame.

    Make sure to install connectorx>=0.2

    # Sources
    Supports reading a sql query from the following data sources:

    * Postgres
    * Mysql
    * Sqlite
    * Redshift (through postgres protocol)
    * Clickhouse (through mysql protocol)

    ## Source not supported?
    If a database source is not supported, pandas can be used to load the query:

    >>> df = pl.from_pandas(pd.read_sql(sql, engine))

    Parameters
    ----------
    sql
        raw sql query
    connection_uri
        connectorx connection uri:
        - "postgresql://*****:*****@server:port/database"
    partition_on
        the column to partition the result.
    partition_range
        the value range of the partition column.
    partition_num
        how many partitions to generate.

    Examples
    --------
    ## Single threaded
    Read a DataFrame from a SQL query using a single thread:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(query, uri)

    ## Using 10 threads
    Read a DataFrame in parallel using 10 threads by automatically partitioning
    the provided SQL on the partition column:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> read_sql(query, uri, partition_on="partition_col", partition_num=10)

    ## Using 2 threads
    Read a DataFrame in parallel using 2 threads by manually providing two
    partition SQLs:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> queries = [
    ...     "SELECT * FROM lineitem WHERE partition_col <= 10",
    ...     "SELECT * FROM lineitem WHERE partition_col > 10",
    ... ]
    >>> read_sql(queries, uri)
    """
    if _WITH_CX:
        tbl = cx.read_sql(
            conn=connection_uri,
            query=sql,
            return_type="arrow",
            partition_on=partition_on,
            partition_range=partition_range,
            partition_num=partition_num,
        )
        return pl.from_arrow(tbl)  # type: ignore[return-value]
    else:
        raise ImportError(
            "connectorx is not installed. "
            "Please run `pip install connectorx>=0.2.0a3`."
        )

def read_sql(
    conn: str,
    query: Union[List[str], str],
    *,
    return_type: str = "pandas",
    protocol: Optional[str] = None,
    partition_on: Optional[str] = None,
    partition_range: Optional[Tuple[int, int]] = None,
    partition_num: Optional[int] = None,
    index_col: Optional[str] = None,
):
    """
    Run the SQL query, download the data from the database into a dataframe.

    Parameters
    ==========
    conn
        the connection string.
    query
        a SQL query or a list of SQL queries.
    return_type
        the return type of this function; one of "arrow", "pandas", "modin",
        "dask" or "polars".
    protocol
        backend-specific transfer protocol directive; defaults to 'binary'
        (except for redshift connection strings, where 'cursor' will be used
        instead).
    partition_on
        the column on which to partition the result.
    partition_range
        the value range of the partition column.
    partition_num
        how many partitions to generate.
    index_col
        the index column to set; only applicable for return type "pandas",
        "modin", "dask".

    Examples
    ========
    Read a DataFrame from a SQL query using a single thread:

    >>> postgres_url = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> read_sql(postgres_url, query)

    Read a DataFrame in parallel using 10 threads by automatically partitioning
    the provided SQL on the partition column:

    >>> postgres_url = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> read_sql(postgres_url, query, partition_on="partition_col", partition_num=10)

    Read a DataFrame in parallel using 2 threads by explicitly providing two
    SQL queries:

    >>> postgres_url = "postgresql://*****:*****@server:port/database"
    >>> queries = [
    ...     "SELECT * FROM lineitem WHERE partition_col <= 10",
    ...     "SELECT * FROM lineitem WHERE partition_col > 10",
    ... ]
    >>> read_sql(postgres_url, queries)
    """
    if isinstance(query, list) and len(query) == 1:
        query = query[0]

    if isinstance(query, str):
        if partition_on is None:
            queries = [query]
            partition_query = None
        else:
            partition_query = {
                "query": query,
                "column": partition_on,
                "min": partition_range[0] if partition_range else None,
                "max": partition_range[1] if partition_range else None,
                "num": partition_num,
            }
            queries = None
    elif isinstance(query, list):
        queries = query
        partition_query = None
        if partition_on is not None:
            raise ValueError("Partition on multiple queries is not supported.")
    else:
        raise ValueError("query must be either str or a list of str")

    if not protocol:
        # note: redshift/clickhouse are not compatible with the 'binary' protocol,
        # and use other database drivers to connect. set a compatible protocol and
        # masquerade as the appropriate backend.
        backend, connection_details = conn.split(":", 1) if conn else ("", "")
        if "redshift" in backend:
            conn = f"postgresql:{connection_details}"
            protocol = "cursor"
        elif "clickhouse" in backend:
            conn = f"mysql:{connection_details}"
            protocol = "text"
        else:
            protocol = "binary"

    if return_type in {"modin", "dask", "pandas"}:
        try:
            import pandas
        except ModuleNotFoundError:
            raise ValueError("You need to install pandas first")

        result = _read_sql(
            conn,
            "pandas",
            queries=queries,
            protocol=protocol,
            partition_query=partition_query,
        )
        df = reconstruct_pandas(result)

        if index_col is not None:
            df.set_index(index_col, inplace=True)

        if return_type == "modin":
            try:
                import modin.pandas as mpd
            except ModuleNotFoundError:
                raise ValueError("You need to install modin first")

            df = mpd.DataFrame(df)
        elif return_type == "dask":
            try:
                import dask.dataframe as dd
            except ModuleNotFoundError:
                raise ValueError("You need to install dask first")

            df = dd.from_pandas(df, npartitions=1)

    elif return_type in {"arrow", "polars"}:
        try:
            import pyarrow
        except ModuleNotFoundError:
            raise ValueError("You need to install pyarrow first")

        result = _read_sql(
            conn,
            "arrow",
            queries=queries,
            protocol=protocol,
            partition_query=partition_query,
        )
        df = reconstruct_arrow(result)

        if return_type == "polars":
            try:
                import polars as pl
            except ModuleNotFoundError:
                raise ValueError("You need to install polars first")

            try:
                df = pl.DataFrame.from_arrow(df)
            except AttributeError:
                # api change for polars >= 0.8.*
                df = pl.from_arrow(df)
    else:
        raise ValueError(return_type)

    return df