Example #1
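Exercises the generic Sink API end to end with 1000 shards and checks that retrying finalize_write produces the same results.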
    def test_file_sink_multi_shards(self):
        temp_path = os.path.join(self._new_tempdir(), 'multishard')
        sink = MyFileSink(temp_path,
                          file_name_suffix='.output',
                          coder=coders.ToStringCoder())

        # Manually invoke the generic Sink API.
        init_token = sink.initialize_write()

        num_shards = 1000
        writer_results = []
        for i in range(num_shards):
            uuid = 'uuid-%05d' % i
            writer = sink.open_writer(init_token, uuid)
            writer.write('a')
            writer.write('b')
            writer.write(uuid)
            writer_results.append(writer.close())

        res_first = list(sink.finalize_write(init_token, writer_results))
        # Retry the finalize operation (as if the first attempt was lost).
        res_second = list(sink.finalize_write(init_token, writer_results))

        self.assertItemsEqual(res_first, res_second)

        res = sorted(res_second)
        for i in range(num_shards):
            shard_name = '%s-%05d-of-%05d.output' % (temp_path, i, num_shards)
            uuid = 'uuid-%05d' % i
            self.assertEqual(res[i], shard_name)
            self.assertEqual(
                open(shard_name).read(), ('[start][a][b][%s][end]' % uuid))

        # Check that any temp files are deleted.
        self.assertItemsEqual(res, glob.glob(temp_path + '*'))
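
The tests on this page never show MyFileSink itself. The sketch below is a plausible reconstruction inferred from the asserted output ('[start]' on open, each encoded record in brackets, '[end]' on close); it assumes the base class is apache_beam.io.fileio.FileSink, which newer Beam releases moved to filebasedsink:

from apache_beam.io import fileio  # filebasedsink in newer Beam releases


class MyFileSink(fileio.FileSink):
    # Write an opening marker when each shard is opened.
    def open(self, temp_path):
        file_handle = fileio.FileSink.open(self, temp_path)
        file_handle.write('[start]')
        return file_handle

    # Wrap every encoded element in brackets, e.g. '[a]'.
    def write_encoded_record(self, file_handle, encoded_value):
        file_handle.write('[%s]' % encoded_value)

    # Write a closing marker before the base class finalizes the shard.
    def close(self, file_handle):
        file_handle.write('[end]')
        fileio.FileSink.close(self, file_handle)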
Example #2
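Writes two shards through the generic Sink API, retries finalize_write, and checks the final shard names, contents, and cleanup of temp files.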
    def test_file_sink_writing(self):
        temp_path = os.path.join(self._new_tempdir(), 'filesink')
        sink = MyFileSink(temp_path,
                          file_name_suffix='.output',
                          coder=coders.ToStringCoder())

        # Manually invoke the generic Sink API.
        init_token = sink.initialize_write()

        writer1 = sink.open_writer(init_token, '1')
        writer1.write('a')
        writer1.write('b')
        res1 = writer1.close()

        writer2 = sink.open_writer(init_token, '2')
        writer2.write('x')
        writer2.write('y')
        writer2.write('z')
        res2 = writer2.close()

        _ = list(sink.finalize_write(init_token, [res1, res2]))
        # Retry the finalize operation (as if the first attempt was lost).
        res = list(sink.finalize_write(init_token, [res1, res2]))

        # Check the results.
        shard1 = temp_path + '-00000-of-00002.output'
        shard2 = temp_path + '-00001-of-00002.output'
        self.assertEqual(res, [shard1, shard2])
        self.assertEqual(open(shard1).read(), '[start][a][b][end]')
        self.assertEqual(open(shard2).read(), '[start][x][y][z][end]')

        # Check that any temp files are deleted.
        self.assertItemsEqual([shard1, shard2], glob.glob(temp_path + '*'))
Example #3
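The TextFileSink constructor, which defaults its coder argument to coders.ToStringCoder().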
    def __init__(self,
                 file_path_prefix,
                 file_name_suffix='',
                 append_trailing_newlines=True,
                 num_shards=0,
                 shard_name_template=None,
                 coder=coders.ToStringCoder(),
                 compression_type=CompressionTypes.AUTO):
        """Initialize a TextFileSink.

    Args:
      file_path_prefix: The file path to write to. The files written will begin
        with this prefix, followed by a shard identifier (see num_shards), and
        end in a common extension, if given by file_name_suffix. In most cases,
        only this argument is specified and num_shards, shard_name_template, and
        file_name_suffix use default values.
      file_name_suffix: Suffix for the files written.
      append_trailing_newlines: indicate whether this sink should write an
        additional newline char after writing each element.
      num_shards: The number of files (shards) used for output. If not set, the
        service will decide on the optimal number of shards.
        Constraining the number of shards is likely to reduce
        the performance of a pipeline.  Setting this value is not recommended
        unless you require a specific number of output files.
      shard_name_template: A template string containing placeholders for
        the shard number and shard count. Currently only '' and
        '-SSSSS-of-NNNNN' are patterns accepted by the service.
        When constructing a filename for a particular shard number, the
        upper-case letters 'S' and 'N' are replaced with the 0-padded shard
        number and shard count respectively.  This argument can be '' in which
        case it behaves as if num_shards was set to 1 and only one file will be
        generated. The default pattern used is '-SSSSS-of-NNNNN'.
      coder: Coder used to encode each line.
      compression_type: Used to handle compressed output files. Typical value
          is CompressionTypes.AUTO, in which case the final file path's
          extension (as determined by file_path_prefix, file_name_suffix,
          num_shards and shard_name_template) will be used to detect the
          compression.

    Returns:
      A TextFileSink object usable for writing.
    """
        super(TextFileSink, self).__init__(
            file_path_prefix,
            file_name_suffix=file_name_suffix,
            num_shards=num_shards,
            shard_name_template=shard_name_template,
            coder=coder,
            mime_type='text/plain',
            compression_type=compression_type)
        self.append_trailing_newlines = append_trailing_newlines

        if type(self) is TextFileSink:
            logging.warning(
                'Direct usage of TextFileSink is deprecated. Please use '
                '\'textio.WriteToText()\' instead of directly '
                'instantiating a TextFileSink object.')
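
As the warning suggests, pipelines should use the WriteToText transform instead of instantiating TextFileSink directly. A minimal sketch, assuming a Beam release where Pipeline works as a context manager; the output prefix is a hypothetical placeholder:

import apache_beam as beam
from apache_beam.io.textio import WriteToText

with beam.Pipeline() as p:
    (p
     | beam.Create(['line one', 'line two'])
     | WriteToText('/tmp/demo/output',  # hypothetical output prefix
                   file_name_suffix='.txt'))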
Example #4
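An empty write still finalizes to a single shard containing only the sink's open and close markers.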
    def test_empty_write(self):
        temp_path = tempfile.NamedTemporaryFile().name
        sink = MyFileSink(temp_path,
                          file_name_suffix='.output',
                          coder=coders.ToStringCoder())
        p = TestPipeline()
        p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
        p.run()
        self.assertEqual(
            open(temp_path + '-00000-of-00001.output').read(), '[start][end]')
Example #5
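The same empty write, with the file path prefix and suffix supplied as StaticValueProvider instances instead of plain strings.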
    def test_static_value_provider_empty_write(self):
        temp_path = StaticValueProvider(
            value_type=str, value=tempfile.NamedTemporaryFile().name)
        sink = MyFileSink(temp_path,
                          file_name_suffix=StaticValueProvider(
                              value_type=str, value='.output'),
                          coder=coders.ToStringCoder())
        p = TestPipeline()
        p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
        p.run()
        self.assertEqual(
            open(temp_path.get() + '-00000-of-00001.output').read(),
            '[start][end]')
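
StaticValueProvider wraps a concrete value behind the ValueProvider interface, so code written for runtime-provided options can call .get() uniformly. A small illustration; the import path assumes a recent Beam release (older versions exposed it under apache_beam.utils.value_provider):

from apache_beam.options.value_provider import StaticValueProvider

suffix = StaticValueProvider(value_type=str, value='.output')
assert suffix.is_accessible()     # static values are always available
assert suffix.get() == '.output'  # .get() unwraps the stored value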
Example #6
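Checks the display data reported by the sink, including the expanded file pattern.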
    def test_file_sink_display_data(self):
        temp_path = os.path.join(self._new_tempdir(), 'display')
        sink = MyFileSink(temp_path,
                          file_name_suffix='.output',
                          coder=coders.ToStringCoder())
        dd = DisplayData.create_from(sink)
        expected_items = [
            DisplayDataItemMatcher('compression', 'auto'),
            DisplayDataItemMatcher(
                'file_pattern',
                '{}{}'.format(temp_path,
                              '-%(shard_num)05d-of-%(num_shards)05d.output'))
        ]
        hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #7
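A variant of the display-data test using a NamedTemporaryFile path and a '.foo' suffix.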
    def test_file_sink_display_data(self):
        temp_path = tempfile.NamedTemporaryFile().name
        sink = MyFileSink(temp_path,
                          file_name_suffix='.foo',
                          coder=coders.ToStringCoder())
        dd = DisplayData.create_from(sink)
        expected_items = [
            DisplayDataItemMatcher('compression', 'auto'),
            DisplayDataItemMatcher(
                'file_pattern',
                '{}{}'.format(temp_path,
                              '-%(shard_num)05d-of-%(num_shards)05d.foo'))
        ]

        hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #8
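The WriteToText PTransform constructor, which wraps a _TextSink and likewise defaults its coder to coders.ToStringCoder().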
    def __init__(self,
                 file_path_prefix,
                 file_name_suffix='',
                 append_trailing_newlines=True,
                 num_shards=0,
                 shard_name_template=None,
                 coder=coders.ToStringCoder(),
                 compression_type=fileio.CompressionTypes.AUTO,
                 header=None):
        """Initialize a WriteToText PTransform.

    Args:
      file_path_prefix: The file path to write to. The files written will begin
        with this prefix, followed by a shard identifier (see num_shards), and
        end in a common extension, if given by file_name_suffix. In most cases,
        only this argument is specified and num_shards, shard_name_template, and
        file_name_suffix use default values.
      file_name_suffix: Suffix for the files written.
      append_trailing_newlines: indicate whether this sink should write an
        additional newline char after writing each element.
      num_shards: The number of files (shards) used for output. If not set, the
        service will decide on the optimal number of shards.
        Constraining the number of shards is likely to reduce
        the performance of a pipeline.  Setting this value is not recommended
        unless you require a specific number of output files.
      shard_name_template: A template string containing placeholders for
        the shard number and shard count. Currently only '' and
        '-SSSSS-of-NNNNN' are patterns accepted by the service.
        When constructing a filename for a particular shard number, the
        upper-case letters 'S' and 'N' are replaced with the 0-padded shard
        number and shard count respectively.  This argument can be '' in which
        case it behaves as if num_shards was set to 1 and only one file will be
        generated. The default pattern used is '-SSSSS-of-NNNNN'.
      coder: Coder used to encode each line.
      compression_type: Used to handle compressed output files. Typical value
          is CompressionTypes.AUTO, in which case the final file path's
          extension (as determined by file_path_prefix, file_name_suffix,
          num_shards and shard_name_template) will be used to detect the
          compression.
      header: String to write at beginning of file as a header. If not None and
          append_trailing_newlines is set, '\n' will be added.
    """

        self._sink = _TextSink(file_path_prefix, file_name_suffix,
                               append_trailing_newlines, num_shards,
                               shard_name_template, coder, compression_type,
                               header)
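
For instance, passing header writes the header line before any elements (with a trailing newline when append_trailing_newlines is set). A hypothetical invocation; the path prefix and data are placeholders:

import apache_beam as beam
from apache_beam.io.textio import WriteToText

with beam.Pipeline() as p:
    (p
     | beam.Create(['1,alice', '2,bob'])
     | WriteToText('/tmp/users',  # hypothetical output prefix
                   file_name_suffix='.csv',
                   header='id,name'))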
Example #9
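Writes two elements across three fixed shards named by a custom shard_name_template and checks that both elements land somewhere in the output.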
    def test_fixed_shard_write(self):
        temp_path = os.path.join(self._new_tempdir(), 'empty')
        sink = MyFileSink(temp_path,
                          file_name_suffix='.output',
                          num_shards=3,
                          shard_name_template='_NN_SSS_',
                          coder=coders.ToStringCoder())
        p = TestPipeline()
        p | beam.Create(['a', 'b']) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned

        p.run()

        concat = ''.join(
            open(temp_path + '_03_%03d_.output' % shard_num).read()
            for shard_num in range(3))
        self.assertTrue('][a][' in concat, concat)
        self.assertTrue('][b][' in concat, concat)
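
The '_NN_SSS_' template explains the '_03_%03d_' filenames asserted above: runs of 'N' become the zero-padded shard count and runs of 'S' the zero-padded shard number. A standalone re-implementation of that expansion, as a sketch rather than Beam's actual code:

import re


def expand_shard_template(template, shard_num, num_shards):
    # Replace each run of 'S' with the zero-padded shard number and each
    # run of 'N' with the zero-padded shard count; this reproduces the
    # '_03_000_' style filenames asserted in the test above.
    def repl(match):
        run = match.group(0)
        value = shard_num if run[0] == 'S' else num_shards
        return str(value).rjust(len(run), '0')
    return re.sub('S+|N+', repl, template)


assert expand_shard_template('_NN_SSS_', 0, 3) == '_03_000_'
assert expand_shard_template('-SSSSS-of-NNNNN', 2, 5) == '-00002-of-00005'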
Example #10
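Deletes one writer's temporary file before finalizing and expects finalize_write to raise.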
    def test_file_sink_io_error(self):
        temp_path = os.path.join(self._new_tempdir(), 'ioerror')
        sink = MyFileSink(temp_path,
                          file_name_suffix='.output',
                          coder=coders.ToStringCoder())

        # Manually invoke the generic Sink API.
        init_token = sink.initialize_write()

        writer1 = sink.open_writer(init_token, '1')
        writer1.write('a')
        writer1.write('b')
        res1 = writer1.close()

        writer2 = sink.open_writer(init_token, '2')
        writer2.write('x')
        writer2.write('y')
        writer2.write('z')
        res2 = writer2.close()

        os.remove(res2)
        with self.assertRaises(Exception):
            list(sink.finalize_write(init_token, [res1, res2]))
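
In other words, finalize_write is expected to fail loudly when a writer's temporary file has gone missing, rather than silently publishing fewer shards than were written.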
Example #11
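A writer fragment that defaults its coder to coders.ToStringCoder() and defers opening its file object.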
    def __init__(self, file_to_write):
        self.file_to_write = file_to_write
        self.file_obj = None
        self.coder = coders.ToStringCoder()
Example #12
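A helper that builds a sink and returns the token produced by initialize_write.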
    def _get_temp_dir(file_path_prefix):
        sink = MyFileSink(file_path_prefix,
                          file_name_suffix='.output',
                          coder=coders.ToStringCoder())
        return sink.initialize_write()