def test_file_sink_multi_shards(self):
  """Finalizes 1000 shards and checks retrying finalize is idempotent.

  Fix over previous version: shard files are read via ``with`` so the
  handles are closed promptly instead of being leaked.
  """
  temp_path = os.path.join(self._new_tempdir(), 'multishard')
  sink = MyFileSink(temp_path,
                    file_name_suffix='.output',
                    coder=coders.ToStringCoder())

  # Manually invoke the generic Sink API.
  init_token = sink.initialize_write()

  num_shards = 1000
  writer_results = []
  for i in range(num_shards):
    uuid = 'uuid-%05d' % i
    writer = sink.open_writer(init_token, uuid)
    writer.write('a')
    writer.write('b')
    writer.write(uuid)
    writer_results.append(writer.close())

  res_first = list(sink.finalize_write(init_token, writer_results))
  # Retry the finalize operation (as if the first attempt was lost).
  res_second = list(sink.finalize_write(init_token, writer_results))
  self.assertItemsEqual(res_first, res_second)

  res = sorted(res_second)
  for i in range(num_shards):
    shard_name = '%s-%05d-of-%05d.output' % (temp_path, i, num_shards)
    uuid = 'uuid-%05d' % i
    self.assertEqual(res[i], shard_name)
    # Close each shard file deterministically rather than leaking the
    # handle from a bare open().read().
    with open(shard_name) as f:
      self.assertEqual(f.read(), '[start][a][b][%s][end]' % uuid)

  # Check that any temp files are deleted.
  self.assertItemsEqual(res, glob.glob(temp_path + '*'))
def test_file_sink_writing(self):
  """Writes two bundles via the raw Sink API and verifies the shards.

  Fix over previous version: shard files are read via ``with`` so the
  handles are closed promptly instead of being leaked.
  """
  temp_path = os.path.join(self._new_tempdir(), 'filesink')
  sink = MyFileSink(temp_path,
                    file_name_suffix='.output',
                    coder=coders.ToStringCoder())

  # Manually invoke the generic Sink API.
  init_token = sink.initialize_write()

  writer1 = sink.open_writer(init_token, '1')
  writer1.write('a')
  writer1.write('b')
  res1 = writer1.close()

  writer2 = sink.open_writer(init_token, '2')
  writer2.write('x')
  writer2.write('y')
  writer2.write('z')
  res2 = writer2.close()

  _ = list(sink.finalize_write(init_token, [res1, res2]))
  # Retry the finalize operation (as if the first attempt was lost).
  res = list(sink.finalize_write(init_token, [res1, res2]))

  # Check the results.
  shard1 = temp_path + '-00000-of-00002.output'
  shard2 = temp_path + '-00001-of-00002.output'
  self.assertEqual(res, [shard1, shard2])
  # Close the shard files deterministically instead of leaking handles.
  with open(shard1) as f:
    self.assertEqual(f.read(), '[start][a][b][end]')
  with open(shard2) as f:
    self.assertEqual(f.read(), '[start][x][y][z][end]')

  # Check that any temp files are deleted.
  self.assertItemsEqual([shard1, shard2], glob.glob(temp_path + '*'))
def __init__(self,
             file_path_prefix,
             file_name_suffix='',
             append_trailing_newlines=True,
             num_shards=0,
             shard_name_template=None,
             coder=coders.ToStringCoder(),
             compression_type=CompressionTypes.AUTO):
  """Initialize a TextFileSink.

  Args:
    file_path_prefix: The file path to write to. The files written will
      begin with this prefix, followed by a shard identifier (see
      num_shards), and end in a common extension, if given by
      file_name_suffix. In most cases, only this argument is specified and
      num_shards, shard_name_template, and file_name_suffix use default
      values.
    file_name_suffix: Suffix for the files written.
    append_trailing_newlines: indicate whether this sink should write an
      additional newline char after writing each element.
    num_shards: The number of files (shards) used for output. If not set,
      the service will decide on the optimal number of shards.
      Constraining the number of shards is likely to reduce the
      performance of a pipeline. Setting this value is not recommended
      unless you require a specific number of output files.
    shard_name_template: A template string containing placeholders for
      the shard number and shard count. Currently only '' and
      '-SSSSS-of-NNNNN' are patterns accepted by the service. When
      constructing a filename for a particular shard number, the
      upper-case letters 'S' and 'N' are replaced with the 0-padded shard
      number and shard count respectively. This argument can be '' in
      which case it behaves as if num_shards was set to 1 and only one
      file will be generated. The default pattern used is
      '-SSSSS-of-NNNNN'.
    coder: Coder used to encode each line. NOTE(review): the default is a
      single coders.ToStringCoder() instance shared by every call that
      relies on the default; presumably the coder is stateless, so this
      is harmless — confirm before adding state to ToStringCoder.
    compression_type: Used to handle compressed output files. Typical
      value is CompressionTypes.AUTO, in which case the final file path's
      extension (as determined by file_path_prefix, file_name_suffix,
      num_shards and shard_name_template) will be used to detect the
      compression.

  Returns:
    A TextFileSink object usable for writing.
  """
  # Delegate shard naming / compression handling to the file-based base
  # class; this class only adds the trailing-newline behavior.
  super(TextFileSink, self).__init__(file_path_prefix,
                                     file_name_suffix=file_name_suffix,
                                     num_shards=num_shards,
                                     shard_name_template=shard_name_template,
                                     coder=coder,
                                     mime_type='text/plain',
                                     compression_type=compression_type)
  self.append_trailing_newlines = append_trailing_newlines

  # Warn only for direct instantiation, not for subclasses, hence the
  # exact type check rather than isinstance().
  if type(self) is TextFileSink:
    logging.warning(
        'Direct usage of TextFileSink is deprecated. Please use '
        '\'textio.WriteToText()\' instead of directly '
        'instantiating a TextFileSink object.')
def test_empty_write(self):
  """Writing an empty PCollection still produces one shard with markers.

  Fix over previous version: the output file is read via ``with`` so the
  handle is closed promptly instead of being leaked.
  """
  temp_path = tempfile.NamedTemporaryFile().name
  sink = MyFileSink(temp_path,
                    file_name_suffix='.output',
                    coder=coders.ToStringCoder())
  p = TestPipeline()
  p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
  p.run()
  # Close the shard file deterministically instead of leaking the handle.
  with open(temp_path + '-00000-of-00001.output') as f:
    self.assertEqual(f.read(), '[start][end]')
def test_static_value_provider_empty_write(self):
  """Same as test_empty_write, but the path/suffix are ValueProviders.

  Fix over previous version: the output file is read via ``with`` so the
  handle is closed promptly instead of being leaked.
  """
  temp_path = StaticValueProvider(
      value_type=str,
      value=tempfile.NamedTemporaryFile().name)
  sink = MyFileSink(
      temp_path,
      file_name_suffix=StaticValueProvider(value_type=str, value='.output'),
      coder=coders.ToStringCoder())
  p = TestPipeline()
  p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
  p.run()
  # Close the shard file deterministically instead of leaking the handle.
  with open(temp_path.get() + '-00000-of-00001.output') as f:
    self.assertEqual(f.read(), '[start][end]')
def test_file_sink_display_data(self):
  """Display data must expose the compression mode and the file pattern."""
  temp_path = os.path.join(self._new_tempdir(), 'display')
  sink = MyFileSink(temp_path,
                    file_name_suffix='.output',
                    coder=coders.ToStringCoder())
  display_data = DisplayData.create_from(sink)
  # The pattern is the prefix plus the %-style shard placeholder template.
  expected_pattern = (
      temp_path + '-%(shard_num)05d-of-%(num_shards)05d.output')
  expected_items = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher('file_pattern', expected_pattern),
  ]
  hc.assert_that(display_data.items, hc.contains_inanyorder(*expected_items))
def test_file_sink_display_data(self):
  """Display data must expose the compression mode and the file pattern."""
  temp_path = tempfile.NamedTemporaryFile().name
  sink = MyFileSink(temp_path,
                    file_name_suffix='.foo',
                    coder=coders.ToStringCoder())
  display_data = DisplayData.create_from(sink)
  # The pattern is the prefix plus the %-style shard placeholder template.
  expected_pattern = temp_path + '-%(shard_num)05d-of-%(num_shards)05d.foo'
  expected_items = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher('file_pattern', expected_pattern),
  ]
  hc.assert_that(display_data.items, hc.contains_inanyorder(*expected_items))
def __init__(self,
             file_path_prefix,
             file_name_suffix='',
             append_trailing_newlines=True,
             num_shards=0,
             shard_name_template=None,
             coder=coders.ToStringCoder(),
             compression_type=fileio.CompressionTypes.AUTO,
             header=None):
  """Initialize a WriteToText PTransform.

  Args:
    file_path_prefix: The file path to write to. The files written will
      begin with this prefix, followed by a shard identifier (see
      num_shards), and end in a common extension, if given by
      file_name_suffix. In most cases, only this argument is specified and
      num_shards, shard_name_template, and file_name_suffix use default
      values.
    file_name_suffix: Suffix for the files written.
    append_trailing_newlines: indicate whether this sink should write an
      additional newline char after writing each element.
    num_shards: The number of files (shards) used for output. If not set,
      the service will decide on the optimal number of shards.
      Constraining the number of shards is likely to reduce the
      performance of a pipeline. Setting this value is not recommended
      unless you require a specific number of output files.
    shard_name_template: A template string containing placeholders for
      the shard number and shard count. Currently only '' and
      '-SSSSS-of-NNNNN' are patterns accepted by the service. When
      constructing a filename for a particular shard number, the
      upper-case letters 'S' and 'N' are replaced with the 0-padded shard
      number and shard count respectively. This argument can be '' in
      which case it behaves as if num_shards was set to 1 and only one
      file will be generated. The default pattern used is
      '-SSSSS-of-NNNNN'.
    coder: Coder used to encode each line. NOTE(review): the default is a
      single coders.ToStringCoder() instance shared by every call that
      relies on the default; presumably the coder is stateless, so this
      is harmless — confirm before adding state to ToStringCoder.
    compression_type: Used to handle compressed output files. Typical
      value is CompressionTypes.AUTO, in which case the final file path's
      extension (as determined by file_path_prefix, file_name_suffix,
      num_shards and shard_name_template) will be used to detect the
      compression.
    header: String to write at beginning of file as a header. If not None
      and append_trailing_newlines is set, '\n' will be added.
  """
  # This PTransform is a thin wrapper: all configuration is forwarded,
  # positionally, to the underlying _TextSink.
  self._sink = _TextSink(file_path_prefix, file_name_suffix,
                         append_trailing_newlines, num_shards,
                         shard_name_template, coder, compression_type,
                         header)
def test_fixed_shard_write(self):
  """Writes with num_shards=3 and a custom shard name template.

  Fix over previous version: the shard files are read via ``with`` so all
  three handles are closed promptly instead of being leaked by a
  generator of bare open().read() calls.
  """
  temp_path = os.path.join(self._new_tempdir(), 'empty')
  sink = MyFileSink(temp_path,
                    file_name_suffix='.output',
                    num_shards=3,
                    shard_name_template='_NN_SSS_',
                    coder=coders.ToStringCoder())
  p = TestPipeline()
  p | beam.Create(['a', 'b']) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
  p.run()

  # Elements may land in any shard, so check the concatenation of all
  # three shard files. Read with `with` to close each handle promptly.
  shard_contents = []
  for shard_num in range(3):
    with open(temp_path + '_03_%03d_.output' % shard_num) as f:
      shard_contents.append(f.read())
  concat = ''.join(shard_contents)
  self.assertTrue('][a][' in concat, concat)
  self.assertTrue('][b][' in concat, concat)
def test_file_sink_io_error(self):
  """Deleting a writer's temp file must make finalize_write raise."""
  temp_path = os.path.join(self._new_tempdir(), 'ioerror')
  sink = MyFileSink(temp_path,
                    file_name_suffix='.output',
                    coder=coders.ToStringCoder())

  # Manually invoke the generic Sink API.
  init_token = sink.initialize_write()

  first_writer = sink.open_writer(init_token, '1')
  for element in ('a', 'b'):
    first_writer.write(element)
  first_result = first_writer.close()

  second_writer = sink.open_writer(init_token, '2')
  for element in ('x', 'y', 'z'):
    second_writer.write(element)
  second_result = second_writer.close()

  # Simulate a lost temp file for the second bundle.
  os.remove(second_result)

  with self.assertRaises(Exception):
    list(sink.finalize_write(init_token, [first_result, second_result]))
def __init__(self, file_to_write):
  # Target path for this writer's output.
  self.file_to_write = file_to_write
  # File object is not opened here; it stays None until writing begins
  # elsewhere. NOTE(review): presumably opened lazily by a write/open
  # method outside this view — confirm.
  self.file_obj = None
  # Encodes each element to a string before it is written.
  self.coder = coders.ToStringCoder()
def _get_temp_dir(file_path_prefix):
  """Return the init token produced by a MyFileSink for the given prefix."""
  throwaway_sink = MyFileSink(file_path_prefix,
                              file_name_suffix='.output',
                              coder=coders.ToStringCoder())
  return throwaway_sink.initialize_write()