def test_to_from_buffer(df: pl.DataFrame, compressions: list[str]) -> None:
    for compression in compressions:
        if compression == "lzo":
            # lzo compression is not supported now
            with pytest.raises(pl.ArrowError):
                buf = io.BytesIO()
                df.write_parquet(buf, compression=compression)
                buf.seek(0)
                _ = pl.read_parquet(buf)
            with pytest.raises(OSError):
                buf = io.BytesIO()
                df.write_parquet(buf, compression=compression, use_pyarrow=True)
                buf.seek(0)
                _ = pl.read_parquet(buf)
        else:
            buf = io.BytesIO()
            df.write_parquet(buf, compression=compression)
            buf.seek(0)
            read_df = pl.read_parquet(buf)
            assert_frame_equal_local_categoricals(df, read_df)

    for use_pyarrow in [True, False]:
        buf = io.BytesIO()
        df.write_parquet(buf, use_pyarrow=use_pyarrow)
        buf.seek(0)
        read_df = pl.read_parquet(buf, use_pyarrow=use_pyarrow)
        assert_frame_equal_local_categoricals(df, read_df)
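The `df` and `compressions` arguments above are pytest fixtures defined elsewhere in the suite. A minimal sketch of what they might look like; the exact frame contents are an assumption, but the codec list matches what the tests exercise (including the expected-to-fail "lzo"):

import polars as pl
import pytest


@pytest.fixture
def compressions() -> list[str]:
    # codecs exercised by the round-trip tests; "lzo" is expected to raise
    return ["uncompressed", "snappy", "gzip", "lzo", "brotli", "lz4", "zstd"]


@pytest.fixture
def df() -> pl.DataFrame:
    # hypothetical sample frame covering a few common dtypes
    return pl.DataFrame(
        {
            "ints": [1, 2, 3],
            "floats": [1.0, 2.0, 3.0],
            "strings": ["a", "b", "c"],
        }
    )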
def test_parquet_chunks() -> None:
    """
    This failed in https://github.com/pola-rs/polars/issues/545
    """
    cases = [
        1048576,
        1048577,
    ]

    for case in cases:
        f = io.BytesIO()
        # repeat until it has case instances
        df = pd.DataFrame(
            np.tile([1.0, pd.to_datetime("2010-10-10")], [case, 1]),
            columns=["floats", "dates"],
        )
        print(df)

        # write as parquet
        df.to_parquet(f)
        print(f"reading {case} dates with polars...", end="")
        f.seek(0)

        # read it with polars; completing without a panic is the point of the test
        polars_df = pl.read_parquet(f)
def test_parquet_datetime() -> None:
    """
    This failed because parquet writers cast datetime to Date
    """
    f = io.BytesIO()
    data = {
        "datetime": [
            # unix timestamp in ms
            1618354800000,
            1618354740000,
            1618354680000,
            1618354620000,
            1618354560000,
        ],
        "laf_max": [73.1999969482, 71.0999984741, 74.5, 69.5999984741, 69.6999969482],
        "laf_eq": [59.5999984741, 61.0, 62.2999992371, 56.9000015259, 60.0],
    }
    df = pl.DataFrame(data)
    df = df.with_column(df["datetime"].cast(pl.Datetime))
    # todo! test all compressions here
    df.write_parquet(f, use_pyarrow=True, compression="snappy")
    f.seek(0)
    read = pl.read_parquet(f)
    assert read.frame_equal(df)
def decode(
    self,
    ctx: FlyteContext,
    flyte_value: literals.StructuredDataset,
    current_task_metadata: StructuredDatasetMetadata,
) -> pl.DataFrame:
    local_dir = ctx.file_access.get_random_local_directory()
    ctx.file_access.get_data(flyte_value.uri, local_dir, is_multipart=True)
    path = f"{local_dir}/00000"
    if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
        columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
        return pl.read_parquet(path, columns=columns)
    return pl.read_parquet(path)
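This `decode` belongs to flytekit's polars structured-dataset plugin: it pulls the multipart parquet data to a local directory, then reads back only the columns the downstream task declared. With the decoder registered (i.e. the polars plugin installed), a task can accept or return `pl.DataFrame` directly. A minimal sketch, where the task body and the "score" column are illustrative assumptions:

import polars as pl
from flytekit import task


@task
def top_scores(scores: pl.DataFrame, n: int) -> pl.DataFrame:
    # flytekit routes the frame through the encoder/decoder shown above,
    # so only parquet bytes cross the task boundary
    return scores.sort("score", reverse=True).head(n)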
def test_to_from_file(io_test_dir: str, df: pl.DataFrame, compressions: List[str]) -> None:
    f = os.path.join(io_test_dir, "small.parquet")
    for compression in compressions:
        if compression == "lzo":
            # lzo compression is not supported now
            with pytest.raises(pl.ArrowError):
                df.write_parquet(f, compression=compression)
                _ = pl.read_parquet(f)
            with pytest.raises(OSError):
                df.write_parquet(f, compression=compression, use_pyarrow=True)
                _ = pl.read_parquet(f)
        else:
            df.write_parquet(f, compression=compression)
            read_df = pl.read_parquet(f)
            assert df.frame_equal(read_df)
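`io_test_dir` is another suite fixture. A plausible stand-in, assuming a scratch directory is enough (pytest's built-in `tmp_path` does the job); note that in the real suite the directory also holds pre-generated sample files such as `small*.parquet`, which `test_glob_parquet` further down relies on:

import pytest


@pytest.fixture
def io_test_dir(tmp_path) -> str:
    # hypothetical stand-in: any writable directory works for the write tests
    return str(tmp_path)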
def test_select_projection() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [True, False, True], "c": ["a", "b", "c"]})
    expected = pl.DataFrame({"b": [True, False, True], "c": ["a", "b", "c"]})
    f = io.BytesIO()
    df.write_parquet(f)
    f.seek(0)
    read_df = pl.read_parquet(f, columns=[1, 2], use_pyarrow=False)
    assert expected.frame_equal(read_df)
def _scan_parquet_impl(uri: str, with_columns: list[str] | None) -> pli.DataFrame:
    """
    Takes the projected columns and materializes an arrow table.

    Parameters
    ----------
    uri
    with_columns
    """
    import polars as pl

    return pl.read_parquet(uri, with_columns)
def from_bytes(self, b: bytes, extension=None):
    if extension is None:
        extension = self.default_extension()
    f = BytesIO()
    f.write(b)
    f.seek(0)
    if extension == "csv":
        return pl.read_csv(f)
    elif extension == "parquet":
        return pl.read_parquet(f)
    raise Exception(
        f"Deserialization: file extension {extension} is not supported by polars data-frame type."
    )
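A deserializer like this usually pairs with a write-side counterpart. A minimal sketch of what a `to_bytes` method could look like under the same extension convention; the method itself is an assumption, not part of the snippet above, and it leans on polars' `write_csv`/`write_parquet` accepting file-like objects:

def to_bytes(self, df, extension=None) -> bytes:
    if extension is None:
        extension = self.default_extension()
    f = BytesIO()
    if extension == "csv":
        df.write_csv(f)
    elif extension == "parquet":
        df.write_parquet(f)
    else:
        raise Exception(
            f"Serialization: file extension {extension} is not supported by polars data-frame type."
        )
    return f.getvalue()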
def test_read_utc_times_parquet() -> None:
    df = pd.DataFrame(
        data={
            "Timestamp": pd.date_range(
                "2022-01-01T00:00+00:00", "2022-01-01T10:00+00:00", freq="H"
            )
        }
    )
    f = io.BytesIO()
    df.to_parquet(f)
    f.seek(0)
    df_in = pl.read_parquet(f)
    assert df_in["Timestamp"][0] == datetime(2022, 1, 1, 0, 0)
def test_nested_parquet() -> None:
    f = io.BytesIO()
    data = [
        {"a": [{"b": 0}]},
        {"a": [{"b": 1}, {"b": 2}]},
    ]
    df = pd.DataFrame(data)
    df.to_parquet(f)
    read = pl.read_parquet(f, use_pyarrow=True)
    assert read.columns == ["a"]
    assert isinstance(read.dtypes[0], pl.datatypes.List)
    assert isinstance(read.dtypes[0].inner, pl.datatypes.Struct)
def test_to_from_buffer(df: pl.DataFrame, compressions: List[str]) -> None:
    for compression in compressions:
        if compression == "lzo":
            # lzo compression is not supported now
            with pytest.raises(pl.ArrowError):
                buf = io.BytesIO()
                df.write_parquet(buf, compression=compression)
                buf.seek(0)
                _ = pl.read_parquet(buf)
            with pytest.raises(OSError):
                buf = io.BytesIO()
                df.write_parquet(buf, compression=compression, use_pyarrow=True)
                buf.seek(0)
                _ = pl.read_parquet(buf)
        else:
            buf = io.BytesIO()
            df.write_parquet(buf, compression=compression)
            buf.seek(0)
            read_df = pl.read_parquet(buf)
            assert df.frame_equal(read_df, null_equal=True)
def test_nested_dictionary() -> None:
    with pl.StringCache():
        df = (
            pl.DataFrame({"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]})
            .with_column(pl.col("str").cast(pl.Categorical))
            .groupby("group")
            .agg([pl.col("str").list().alias("cat_list")])
        )
        f = io.BytesIO()
        df.write_parquet(f)
        f.seek(0)
        read_df = pl.read_parquet(f)
        assert df.frame_equal(read_df)
def recursive_logical_type() -> None:
    df = pl.DataFrame({"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]})
    df = df.with_column(pl.col("str").cast(pl.Categorical))
    df_groups = df.groupby("group").agg([pl.col("str").list().alias("cat_list")])
    f = io.BytesIO()
    df_groups.write_parquet(f, use_pyarrow=True)
    f.seek(0)
    read = pl.read_parquet(f, use_pyarrow=True)
    assert read.dtypes == [pl.Int64, pl.List(pl.Categorical)]
    assert read.shape == (2, 2)
def test_row_count(foods_parquet: str) -> None:
    df = pl.read_parquet(foods_parquet, row_count_name="row_count")
    assert df["row_count"].to_list() == list(range(27))

    df = (
        pl.scan_parquet(foods_parquet, row_count_name="row_count")
        .filter(pl.col("category") == pl.lit("vegetables"))
        .collect()
    )
    assert df["row_count"].to_list() == [0, 6, 11, 13, 14, 20, 25]

    df = (
        pl.scan_parquet(foods_parquet, row_count_name="row_count")
        .with_row_count("foo", 10)
        .filter(pl.col("category") == pl.lit("vegetables"))
        .collect()
    )
    assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]
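`foods_parquet` is a path fixture pointing at the 27-row foods sample the assertions above count on. A sketch with a hypothetical file location:

import os

import pytest


@pytest.fixture
def foods_parquet() -> str:
    # hypothetical path: the real suite ships a 27-row foods sample file
    return os.path.join(os.path.dirname(__file__), "files", "foods1.parquet")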
def test_chunked_round_trip() -> None:
    df1 = pl.DataFrame(
        {
            "a": [1] * 2,
            "l": [[1] for j in range(0, 2)],
        }
    )
    df2 = pl.DataFrame(
        {
            "a": [2] * 3,
            "l": [[2] for j in range(0, 3)],
        }
    )

    df = df1.vstack(df2)

    f = io.BytesIO()
    df.write_parquet(f)
    f.seek(0)
    assert pl.read_parquet(f).frame_equal(df)
def test_glob_parquet(io_test_dir: str) -> None:
    path = os.path.join(io_test_dir, "small*.parquet")
    assert pl.read_parquet(path).shape == (3, 16)
    assert pl.scan_parquet(path).collect().shape == (3, 16)
def test_null_parquet(io_test_dir: str) -> None:
    file = path.join(io_test_dir, "null.parquet")
    df = pl.DataFrame([pl.Series("foo", [], dtype=pl.Int8)])
    df.write_parquet(file)
    out = pl.read_parquet(file)
    assert out.frame_equal(df)