Example #1
 def test_slice_zero_rows(self):
     with tempfile_context() as path:
         # ensure at least 1 row group
         cjwparquet.write(
             path,
             pa.table({
                 "A": pa.array([], pa.string()),
                 "B": pa.DictionaryArray.from_arrays(
                     pa.array([], pa.int32()), pa.array([], pa.string())
                 ),
                 "C": pa.array([], pa.timestamp("ns")),
                 "D": pa.array([], pa.float64()),
             }),
         )
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="csv",
                                           only_columns=range(4),
                                           only_rows=range(0)),
             "A,B,C,D",
         )
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="json",
                                           only_columns=range(4),
                                           only_rows=range(0)),
             "[]",
         )
Example #2
 def test_slice_rows(self):
     with parquet_file({"A": [0, 1, 2, 3, 4, 5, 6, 7]}) as path:
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="csv",
                                           only_columns=range(1),
                                           only_rows=range(2, 5)),
             "A\n2\n3\n4",
         )
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="json",
                                           only_columns=range(1),
                                           only_rows=range(2, 5)),
             '[{"A":2},{"A":3},{"A":4}]',
         )
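
The tests rely on two fixtures that aren't shown in these excerpts, `tempfile_context` and `parquet_file`. A minimal sketch of how they might look, assuming `parquet_file` simply writes its argument with `cjwparquet.write` (the real fixtures in the test suite may differ):

 import os
 import tempfile
 from contextlib import contextmanager
 from pathlib import Path

 import pyarrow as pa

 import cjwparquet


 @contextmanager
 def tempfile_context():
     # Yield a temporary file path; delete the file on exit.
     fd, filename = tempfile.mkstemp()
     try:
         os.close(fd)
         yield Path(filename)
     finally:
         os.unlink(filename)


 @contextmanager
 def parquet_file(table):
     # Accept a pyarrow.Table or a {column: values} dict, write it to a
     # temporary Parquet file, and yield the file's path.
     if isinstance(table, dict):
         table = pa.table(table)
     with tempfile_context() as path:
         cjwparquet.write(path, table)
         yield path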
Example #3
 def test_slice_ignore_missing_columns(self):
     with parquet_file({"A": [1]}) as path:
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="csv",
                                           only_columns=range(3),
                                           only_rows=range(1)),
             "A\n1",
         )
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="json",
                                           only_columns=range(3),
                                           only_rows=range(1)),
             '[{"A":1}]',
         )
Example #4
 def test_slice_ignore_missing_rows(self):
     with parquet_file({"A": [0, 1, 2, 3]}) as path:
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="csv",
                                           only_columns=range(1),
                                           only_rows=range(2, 5)),
             "A\n2\n3",
         )
Example #5
 def test_slice_zero_row_groups(self):
     table = pa.Table.from_batches([],
                                   schema=pa.schema([("A", pa.string())]))
     with parquet_file(table) as path:
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="csv",
                                           only_columns=range(1),
                                           only_rows=range(0)),
             "A",
         )
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="json",
                                           only_columns=range(1),
                                           only_rows=range(0)),
             "[]",
         )
Example #6
 def test_slice_lots_of_types(self):
     dt1 = datetime(2019, 12, 18, 23, 33, 55, 123000)
     dt2 = datetime(2019, 12, 18)
     with parquet_file({
             "str": ["x", "y", None, ""],
             "cat": pa.array(["x", "y", None, ""]).dictionary_encode(),
             "dt": pa.array([dt1, None, dt2, None], pa.timestamp("ns")),
             "int32": [1, 2, None, 2**31],
             "float": [1.1, None, 3.3, 4.4],
     }) as path:
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="csv",
                                           only_columns=range(5),
                                           only_rows=range(4)),
             "\n".join([
                 "str,cat,dt,int32,float",
                 "x,x,2019-12-18T23:33:55.123Z,1,1.1",
                 "y,y,,2,",
                 ",,2019-12-18,,3.3",
                 ",,,2147483648,4.4",
             ]),
         )
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="json",
                                           only_columns=range(5),
                                           only_rows=range(4)),
             "".join([
                 "[",
                 '{"str":"x","cat":"x","dt":"2019-12-18T23:33:55.123Z","int32":1,"float":1.1},',
                 '{"str":"y","cat":"y","dt":null,"int32":2,"float":null},',
                 '{"str":null,"cat":null,"dt":"2019-12-18","int32":null,"float":3.3},',
                 '{"str":"","cat":"","dt":null,"int32":2147483648,"float":4.4}',
                 "]",
             ]),
         )
Example #7
def read_cached_render_result_slice_as_text(crr: CachedRenderResult,
                                            format: str, only_columns: range,
                                            only_rows: range) -> str:
    """
    Call `parquet-to-text-stream` and return its output.

    Ignore out-of-range rows and columns.

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

        * The cached Parquet file is corrupt
        * The cached Parquet file is missing
        * `crr` is stale -- the cached result is for a different delta. This
          could be detected by a `Workflow.cooperative_lock()`, too, should the
          caller want to distinguish this error from the others.

    To limit the amount of text stored in RAM, use relatively small ranges for
    `only_columns` and `only_rows`.

    This uses `parquet-to-text-stream` with `--row-range` and `--column-range`.
    Read `parquet-to-text-stream` documentation to see how nulls and floats are
    handled in your chosen format (`csv` or `json`). (In a nutshell: it's
    mostly non-lossy, though CSV can't represent `null`.)
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache. Return the text a
        # zero-column slice would produce: no CSV header, an empty JSON array.
        return "" if format == "csv" else "[]"

    try:
        with downloaded_parquet_file(crr) as parquet_path:
            return cjwparquet.read_slice_as_text(
                parquet_path,
                format=format,
                only_columns=only_columns,
                only_rows=only_rows,
            )
    except (pyarrow.ArrowIOError, FileNotFoundError):  # FIXME unit-test
        raise CorruptCacheError
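
A hypothetical caller might look like the sketch below; `render_table_slice_json` is not part of the codebase, and it assumes recovery from a corrupt cache (e.g. re-rendering) is handled elsewhere:

 def render_table_slice_json(crr: CachedRenderResult) -> str:
     # Hypothetical helper: return the top-left corner of a cached render
     # as JSON text. Keep the ranges small to bound the text held in RAM.
     try:
         return read_cached_render_result_slice_as_text(
             crr, format="json", only_columns=range(50), only_rows=range(200)
         )
     except CorruptCacheError:
         # Cache missing, corrupt, or stale: the caller should re-render.
         return "[]"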