Code Example #1
def test_slice_zero_row_groups(self):
    # A Table assembled from zero batches is written with zero row groups.
    table = pa.Table.from_batches(
        [], schema=pa.schema([("A", pa.string())]))
    with parquet_file(table) as path:
        self.assertEqual(
            parquet.read_slice_as_text(path, "csv", range(1), range(0)),
            "A")
        self.assertEqual(
            parquet.read_slice_as_text(path, "json", range(1), range(0)),
            "[]")
Code Example #2
File: io.py Project: zhiliangpersonal/cjworkbench
def read_cached_render_result_slice_as_text(crr: CachedRenderResult,
                                            format: str, only_columns: range,
                                            only_rows: range) -> str:
    """
    Call `parquet-to-text-stream` and return its output.

    Ignore out-of-range rows and columns.

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

        * The cached Parquet file is corrupt
        * The cached Parquet file is missing
        * `crr` is stale -- the cached result is for a different delta. This
          could be detected by a `Workflow.cooperative_lock()`, too, should the
          caller want to distinguish this error from the others.

    To limit the amount of text stored in RAM, use relatively small ranges for
    `only_columns` and `only_rows`.

    This uses `parquet-to-text-stream` with `--row-range` and `--column-range`.
    Read `parquet-to-text-stream` documentation to see how nulls and floats are
    handled in your chosen format (`csv` or `json`). (In a nutshell: it's
    mostly non-lossy, though CSV can't represent `null`.)
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache: return the
        # format's empty encoding instead of reading a file.
        if format == "json":
            return "[]"
        return ""

    try:
        with downloaded_parquet_file(crr) as parquet_path:
            return parquet.read_slice_as_text(parquet_path, format,
                                              only_columns, only_rows)
    except (pyarrow.ArrowIOError, FileNotFoundError):  # FIXME unit-test
        raise CorruptCacheError
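
For illustration, a caller might use this function as follows (a hypothetical sketch: `crr` is assumed to be a valid `CachedRenderResult`, and the ranges are kept small per the docstring's advice):

# Hypothetical usage: stream a small slice of the cached result as JSON.
try:
    text = read_cached_render_result_slice_as_text(
        crr, "json", only_columns=range(10), only_rows=range(200))
except CorruptCacheError:
    text = "[]"  # e.g. treat a corrupt or stale cache as an empty result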
Code Example #3
def test_slice_zero_rows(self):
    with tempfile_context() as path:
        # Unlike the zero-row-group case above, writing a table (even
        # one with zero rows) ensures at least 1 row group.
        parquet.write(
            path,
            pa.table({
                "A": pa.array([], pa.string()),
                "B": pa.DictionaryArray.from_arrays(
                    pa.array([], pa.int32()), pa.array([], pa.string())),
                "C": pa.array([], pa.timestamp("ns")),
                "D": pa.array([], pa.float64()),
            }),
        )
        self.assertEqual(
            parquet.read_slice_as_text(path, "csv", range(4), range(0)),
            "A,B,C,D")
        self.assertEqual(
            parquet.read_slice_as_text(path, "json", range(4), range(0)),
            "[]")