Example #1
 def test_sink_transform_multiple_row_group(self):
     with TemporaryDirectory() as tmp_dirname:
         path = os.path.join(tmp_dirname, "tmp_filename")
         with TestPipeline() as p:
             # writing ~623200 bytes with a 250000-byte row group buffer yields 3 row groups
             _ = p \
             | Create(self.RECORDS * 4000) \
             | WriteToParquet(
                 path, self.SCHEMA, num_shards=1, codec='none',
                 shard_name_template='', row_group_buffer_size=250000)
         self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
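These tests rely on two class fixtures that the snippets do not show: self.RECORDS, a list of plain dicts, and self.SCHEMA, a pyarrow schema describing those dicts. A minimal sketch of what such fixtures could look like (the field names and values below are purely illustrative, not the suite's actual data):

 import pyarrow as pa

 class ParquetExampleFixtures:
     # Hypothetical schema and records; the real test class defines its own.
     SCHEMA = pa.schema([
         ('name', pa.string()),
         ('favorite_number', pa.int64()),
         ('favorite_color', pa.string()),
     ])
     RECORDS = [
         {'name': 'Thomas', 'favorite_number': 1, 'favorite_color': 'blue'},
         {'name': 'Henry', 'favorite_number': 3, 'favorite_color': 'green'},
         {'name': 'Toby', 'favorite_number': 7, 'favorite_color': 'brown'},
     ]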
Example #2
 def test_batched_read(self):
     with TemporaryDirectory() as tmp_dirname:
         path = os.path.join(tmp_dirname, "tmp_filename")
         with TestPipeline() as p:
             _ = p \
             | Create(self.RECORDS, reshuffle=False) \
             | WriteToParquet(
                 path, self.SCHEMA, num_shards=1, shard_name_template='')
         with TestPipeline() as p:
             # each element read back is a pyarrow.Table covering one row group
             readback = \
                 p \
                 | ReadFromParquetBatched(path)
             assert_that(readback, equal_to([self._records_as_arrow()]))
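ReadFromParquetBatched emits one pyarrow.Table per row group rather than individual dicts, so the expected value comes from a helper such as _records_as_arrow(). A hedged sketch of that helper, assuming the fixture names sketched under Example #1:

 import pyarrow as pa

 def _records_as_arrow(self):
     # Assemble a single pyarrow.Table whose columns follow self.SCHEMA,
     # matching the one row group written by the pipeline above.
     columns = {
         name: [record[name] for record in self.RECORDS]
         for name in self.SCHEMA.names
     }
     return pa.Table.from_pydict(columns, schema=self.SCHEMA)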
Example #3
 def test_sink_transform(self):
   with TemporaryDirectory() as tmp_dirname:
     path = os.path.join(tmp_dirname, "tmp_filename")
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path) \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
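ReadFromParquet yields each row as a plain dict; mapping the rows through json.dumps turns them into strings that compare deterministically in assert_that, which is what the "stable sortability" comment refers to. The same round trip can also be written outside the test harness. A minimal standalone sketch, with an illustrative output prefix and records that are not part of the original suite:

 import json
 import apache_beam as beam
 import pyarrow as pa
 from apache_beam.io.parquetio import ReadFromParquet, WriteToParquet

 schema = pa.schema([('name', pa.string()), ('favorite_number', pa.int64())])
 records = [{'name': 'Thomas', 'favorite_number': 1},
            {'name': 'Henry', 'favorite_number': 3}]

 # Write a single unsharded file, then read it back and print each record.
 with beam.Pipeline() as p:
     _ = (
         p
         | beam.Create(records)
         | WriteToParquet(
             '/tmp/parquet_example', schema,
             num_shards=1, shard_name_template=''))

 with beam.Pipeline() as p:
     _ = (
         p
         | ReadFromParquet('/tmp/parquet_example')
         | beam.Map(json.dumps)
         | beam.Map(print))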
Example #4
 def test_sink_transform_compressed(self, compression_type):
   if compression_type == 'lz4' and int(pa.__version__.split('.')[0]) == 1:
      self.skipTest(
          "Writing with LZ4 compression is not supported in pyarrow 1.x")
   with TemporaryDirectory() as tmp_dirname:
     path = os.path.join(tmp_dirname, "tmp_filename")
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, codec=compression_type,
           num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path + '*') \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
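The compression_type argument in Example #4 is injected by a parameterizing decorator on the test method, which the snippet omits. One way that could look, using the parameterized package (a sketch; the exact codec list is an assumption, not necessarily the suite's):

 from parameterized import param, parameterized

 @parameterized.expand([
     param(compression_type='snappy'),
     param(compression_type='gzip'),
     param(compression_type='brotli'),
     param(compression_type='lz4'),
     param(compression_type='zstd'),
 ])
 def test_sink_transform_compressed(self, compression_type):
     ...  # body as shown in Example #4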
Example #5
 def test_wrapper_pass_through(self):
     # We use a file to check the result because the MyDoFn instance passed is
     # not the same one that actually runs in the pipeline (it is serialized
     # here and deserialized in the worker).
     with TemporaryDirectory() as tmp_dirname:
         path = os.path.join(tmp_dirname, "tmp_filename")
         dofn = MyDoFn(path)
         result = self.p | beam.Create([1, 2, 3]) | beam.ParDo(dofn)
         assert_that(result, equal_to([1, 2, 3]))
         self.p.run()
         with open(path, mode="r") as ft:
             lines = [line.strip() for line in ft]
             self.assertListEqual([
                 'setup',
                 'start_bundle',
                 'process',
                 'process',
                 'process',
                 'finish_bundle',
                 'teardown',
             ], lines)
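MyDoFn itself is not shown here. A hedged sketch of a DoFn that would satisfy the assertions above, appending each lifecycle method's name to the file as it is called (illustrative only; the real test defines its own MyDoFn):

 import apache_beam as beam

 class MyDoFn(beam.DoFn):
     def __init__(self, path):
         self.path = path

     def _log(self, name):
         # Append the lifecycle event so call order survives serialization
         # of the DoFn instance into the worker.
         with open(self.path, mode='a') as f:
             f.write(name + '\n')

     def setup(self):
         self._log('setup')

     def start_bundle(self):
         self._log('start_bundle')

     def process(self, element):
         self._log('process')
         yield element

     def finish_bundle(self):
         self._log('finish_bundle')

     def teardown(self):
         self._log('teardown')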