def test_slice_zero_row_groups(self):
    """Slicing a Parquet file that has zero row groups yields header-only text."""
    schema = pa.schema([("A", pa.string())])
    table = pa.Table.from_batches([], schema=schema)
    with parquet_file(table) as path:
        # csv: just the header row; json: an empty array
        for fmt, expected in (("csv", "A"), ("json", "[]")):
            self.assertEqual(
                parquet.read_slice_as_text(path, fmt, range(1), range(0)),
                expected,
            )
def read_cached_render_result_slice_as_text(
    crr: CachedRenderResult, format: str, only_columns: range, only_rows: range
) -> str:
    """
    Call `parquet-to-text-stream` and return its output.

    Ignore out-of-range rows and columns.

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

    * The cached Parquet file is corrupt
    * The cached Parquet file is missing
    * `crr` is stale -- the cached result is for a different delta. This could
      be detected by a `Workflow.cooperative_lock()`, too, should the caller
      want to distinguish this error from the others.

    To limit the amount of text stored in RAM, use relatively small ranges for
    `only_columns` and `only_rows`.

    This uses `parquet-to-text-stream` with `--row-range` and `--column-range`.
    Read `parquet-to-text-stream` documentation to see how nulls and floats are
    handled in your chosen format (`csv` or `json`). (In a nutshell: it's
    mostly non-lossy, though CSV can't represent `null`.)
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache. Return the same text
        # `parquet.read_slice_as_text()` would produce for an empty table:
        # an empty JSON array, or an empty CSV (no header, no rows).
        # (Previously this returned `{}`, a dict -- violating the `-> str`
        # contract.)
        return "[]" if format == "json" else ""
    try:
        with downloaded_parquet_file(crr) as parquet_path:
            return parquet.read_slice_as_text(
                parquet_path, format, only_columns, only_rows
            )
    except (pyarrow.ArrowIOError, FileNotFoundError):
        # FIXME unit-test
        raise CorruptCacheError
def test_slice_zero_rows(self):
    """Slicing zero rows from a file with a row group yields header-only text."""
    with tempfile_context() as path:
        # ensure at least 1 row group
        empty_strings = pa.array([], pa.string())
        columns = {
            "A": empty_strings,
            "B": pa.DictionaryArray.from_arrays(
                pa.array([], pa.int32()), pa.array([], pa.string())
            ),
            "C": pa.array([], pa.timestamp("ns")),
            "D": pa.array([], pa.float64()),
        }
        parquet.write(path, pa.table(columns))
        # csv: just the header row; json: an empty array
        for fmt, expected in (("csv", "A,B,C,D"), ("json", "[]")):
            self.assertEqual(
                parquet.read_slice_as_text(path, fmt, range(4), range(0)),
                expected,
            )