def test_sink_transform_multiple_row_group(self):
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      # writing 623200 bytes of data; with a 250000-byte row group buffer
      # the sink should flush three row groups
      _ = p \
          | Create(self.RECORDS * 4000) \
          | WriteToParquet(
              path,
              self.SCHEMA,
              num_shards=1,
              codec='none',
              shard_name_template='',
              row_group_buffer_size=250000)
    self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
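# SCHEMA and RECORDS are class-level fixtures defined elsewhere in this
# test module. A minimal sketch of the shape these tests assume, with
# hypothetical field names and values (not the actual fixtures):
#
#   SCHEMA = pa.schema([
#       ('name', pa.string()),
#       ('favorite_number', pa.int64()),
#   ])
#   RECORDS = [
#       {'name': 'Thomas', 'favorite_number': 1},
#       {'name': 'Henry', 'favorite_number': 3},
#   ]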
def test_batched_read(self):
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS, reshuffle=False) \
          | WriteToParquet(
              path, self.SCHEMA, num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      # the batched read yields pyarrow Tables, not individual records
      readback = \
          p \
          | ReadFromParquetBatched(path)
      assert_that(readback, equal_to([self._records_as_arrow()]))
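# _records_as_arrow is a helper defined elsewhere in this test module; the
# batched read above is compared against it because ReadFromParquetBatched
# yields pyarrow Tables rather than individual records. A minimal sketch of
# what the helper is assumed to do (hypothetical, not the actual helper):
def _records_as_arrow(self):
  columns = {
      name: [record[name] for record in self.RECORDS]
      for name in self.SCHEMA.names
  }
  return pa.Table.from_pydict(columns, schema=self.SCHEMA)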
def test_sink_transform(self):
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS) \
          | WriteToParquet(
              path, self.SCHEMA, num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      # records are serialized to JSON for a stable comparison
      readback = \
          p \
          | ReadFromParquet(path) \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def test_sink_transform_compressed(self, compression_type):
  # compression_type is supplied by a parameterized decorator in the full
  # test module
  if compression_type == 'lz4' and int(pa.__version__.split('.')[0]) == 1:
    self.skipTest(
        "Writing with LZ4 compression is not supported in pyarrow 1.x")
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS) \
          | WriteToParquet(
              path,
              self.SCHEMA,
              codec=compression_type,
              num_shards=1,
              shard_name_template='')
    with TestPipeline() as p:
      # records are serialized to JSON for a stable comparison
      readback = \
          p \
          | ReadFromParquet(path + '*') \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def test_wrapper_pass_through(self):
  # We use a file to check the result because the MyDoFn instance passed
  # in here is not the same one that actually runs in the pipeline (it is
  # serialized here and deserialized in the worker).
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    dofn = MyDoFn(path)
    result = self.p | beam.Create([1, 2, 3]) | beam.ParDo(dofn)
    assert_that(result, equal_to([1, 2, 3]))
    self.p.run()
    with open(path, mode="r") as ft:
      lines = [line.strip() for line in ft]
    self.assertListEqual([
        'setup',
        'start_bundle',
        'process',
        'process',
        'process',
        'finish_bundle',
        'teardown',
    ], lines)
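# MyDoFn is defined elsewhere in this test module. A minimal sketch that
# would satisfy the assertions above, assuming it appends the name of each
# DoFn lifecycle hook to the file at `path` (hypothetical, not the actual
# implementation):
class MyDoFn(beam.DoFn):
  def __init__(self, path):
    self.path = path

  def _log(self, name):
    # one line per lifecycle call, in invocation order
    with open(self.path, mode="a") as f:
      f.write(name + '\n')

  def setup(self):
    self._log('setup')

  def start_bundle(self):
    self._log('start_bundle')

  def process(self, element):
    self._log('process')
    yield element

  def finish_bundle(self):
    self._log('finish_bundle')

  def teardown(self):
    self._log('teardown')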