Example #1
 def test_temp_dir_uniqueness(self):
     temp_path = os.path.join(self._new_tempdir(), 'unique')
     sink = MyFileBasedSink(temp_path, coder=coders.ToStringCoder())
     init_list = [''] * 1000
     temp_dir_list = [sink._create_temp_dir(temp_path) for _ in init_list]
     temp_dir_set = set(temp_dir_list)
     self.assertEqual(len(temp_dir_list), len(temp_dir_set))
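
Most of these test snippets rely on helpers that are not shown: MyFileBasedSink, _new_tempdir, and _common_init. Judging from the shard contents expected in the later examples ('[start][a][b][end]' and '[start][x][y][z][end]'), they probably look roughly like the sketch below; the class body and helper names here are reconstructions, not the actual Apache Beam test code.

import tempfile
import unittest

from apache_beam.io import filebasedsink


class MyFileBasedSink(filebasedsink.FileBasedSink):
    """Sketch of a test sink that brackets records and adds start/end markers."""

    def open(self, temp_path):
        # Open the temp file and emit a start marker once per shard.
        file_handle = super(MyFileBasedSink, self).open(temp_path)
        file_handle.write(b'[start]')
        return file_handle

    def write_encoded_record(self, file_handle, encoded_value):
        # coders.ToStringCoder() has already encoded the element to bytes;
        # wrap it in brackets, e.g. 'a' -> '[a]'.
        file_handle.write(b'[' + encoded_value + b']')

    def close(self, file_handle):
        # Emit an end marker before handing the file back to the base class.
        file_handle.write(b'[end]')
        super(MyFileBasedSink, self).close(file_handle)


class _FileBasedSinkTestHelpers(unittest.TestCase):
    """Sketch of the helpers the tests call on self."""

    def _new_tempdir(self):
        # Fresh temporary directory for each test.
        return tempfile.mkdtemp()

    def _common_init(self, sink):
        # Drive the generic Sink API by hand: two writers, shards 'ab' and 'xyz'.
        init_token = sink.initialize_write()
        writer1 = sink.open_writer(init_token, '1')
        writer1.write('a')
        writer1.write('b')
        res1 = writer1.close()
        writer2 = sink.open_writer(init_token, '2')
        writer2.write('x')
        writer2.write('y')
        writer2.write('z')
        res2 = writer2.close()
        return init_token, [res1, res2]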
Example #2
    def test_pre_finalize(self):
        temp_path = os.path.join(self._new_tempdir(), 'pre_finalize')
        sink = MyFileBasedSink(temp_path,
                               file_name_suffix='.output',
                               coder=coders.ToStringCoder())
        init_token, [res1, res2] = self._common_init(sink)

        # no-op
        sink.pre_finalize(init_token, [res1, res2])

        # Create finalized outputs from a previous run, which pre_finalize should
        # delete.
        shard1 = temp_path + '-00000-of-00002.output'
        shard2 = temp_path + '-00001-of-00002.output'
        with open(shard1, 'w') as f:
            f.write('foo')
        with open(shard2, 'w') as f:
            f.write('foo')
        self.assertTrue(os.path.exists(res1))
        self.assertTrue(os.path.exists(res2))
        self.assertTrue(os.path.exists(shard1))
        self.assertTrue(os.path.exists(shard2))

        sink.pre_finalize(init_token, [res1, res2])
        self.assertTrue(os.path.exists(res1))
        self.assertTrue(os.path.exists(res2))
        self.assertFalse(os.path.exists(shard1))
        self.assertFalse(os.path.exists(shard2))
Example #3
 def open(self, temp_path):
     file_handle = super(_TextSink, self).open(temp_path)
     if self._header is not None:
         file_handle.write(coders.ToStringCoder().encode(self._header))
         if self._append_trailing_newlines:
             file_handle.write(b'\n')
     return file_handle
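
Example #3 only covers the header; for context, the per-record path of the same sink looks roughly like the following (a short sketch of _TextSink.write_encoded_record: the element arrives already encoded by the coder, and the same append_trailing_newlines flag controls the trailing newline).

 def write_encoded_record(self, file_handle, encoded_value):
     # The coder (coders.ToStringCoder() by default) has already produced bytes.
     file_handle.write(encoded_value)
     if self._append_trailing_newlines:
         file_handle.write(b'\n')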
Example #4
    def test_file_sink_writing(self):
        temp_path = os.path.join(self._new_tempdir(), 'FileBasedSink')
        sink = MyFileBasedSink(temp_path,
                               file_name_suffix='.output',
                               coder=coders.ToStringCoder())

        init_token, writer_results = self._common_init(sink)

        pre_finalize_results = sink.pre_finalize(init_token, writer_results)
        finalize_res1 = list(
            sink.finalize_write(init_token, writer_results,
                                pre_finalize_results))
        # Retry the finalize operation (as if the first attempt was lost).
        finalize_res2 = list(
            sink.finalize_write(init_token, writer_results,
                                pre_finalize_results))

        # Check the results.
        shard1 = temp_path + '-00000-of-00002.output'
        shard2 = temp_path + '-00001-of-00002.output'
        self.assertEqual(finalize_res1, [shard1, shard2])
        self.assertEqual(finalize_res2, [])
        self.assertEqual(open(shard1).read(), '[start][a][b][end]')
        self.assertEqual(open(shard2).read(), '[start][x][y][z][end]')

        # Check that any temp files are deleted.
        self.assertCountEqual([shard1, shard2], glob.glob(temp_path + '*'))
Example #5
def run(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        dest='input',
                        default='gs://url',
                        help='Input file')

    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write result')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = False

    p = beam.Pipeline(options=pipeline_options)
    query = "SELECT user_dim.user_id, user_dim.app_info.app_instance_id, user_dim.app_info.app_platform, " \
            "user_dim.app_info.app_version, user_dim.geo_info.country, event.* " \
            "FROM `table`, UNNEST(event_dim) as event "

    lines = p | 'read' >> beam.io.Read(
        beam.io.BigQuerySource(query=query, use_standard_sql=True))

    counts = (lines
              | 'data_extract' >>
              (beam.ParDo(JsonExtractEvent())).with_output_types(unicode))

    counts | 'write' >> WriteToText(
        known_args.output, coder=coders.ToStringCoder(), num_shards=1)

    result = p.run()
    result.wait_until_finish()
Example #6
    def test_file_sink_writing(self):
        temp_path = os.path.join(self._new_tempdir(), 'FileBasedSink')
        sink = MyFileBasedSink(temp_path,
                               file_name_suffix='.output',
                               coder=coders.ToStringCoder())

        # Manually invoke the generic Sink API.
        init_token = sink.initialize_write()

        writer1 = sink.open_writer(init_token, '1')
        writer1.write('a')
        writer1.write('b')
        res1 = writer1.close()

        writer2 = sink.open_writer(init_token, '2')
        writer2.write('x')
        writer2.write('y')
        writer2.write('z')
        res2 = writer2.close()

        _ = list(sink.finalize_write(init_token, [res1, res2]))
        # Retry the finalize operation (as if the first attempt was lost).
        res = list(sink.finalize_write(init_token, [res1, res2]))

        # Check the results.
        shard1 = temp_path + '-00000-of-00002.output'
        shard2 = temp_path + '-00001-of-00002.output'
        self.assertEqual(res, [shard1, shard2])
        self.assertEqual(open(shard1).read(), '[start][a][b][end]')
        self.assertEqual(open(shard2).read(), '[start][x][y][z][end]')

        # Check that any temp files are deleted.
        self.assertItemsEqual([shard1, shard2], glob.glob(temp_path + '*'))
Example #7
    def test_file_sink_dst_matches_src(self):
        temp_path = os.path.join(self._new_tempdir(), 'dst_matches_src')
        sink = MyFileBasedSink(temp_path,
                               file_name_suffix='.output',
                               coder=coders.ToStringCoder())
        init_token, [res1, res2] = self._common_init(sink)

        pre_finalize_results = sink.pre_finalize(init_token, [res1, res2])
        list(
            sink.finalize_write(init_token, [res1, res2],
                                pre_finalize_results))

        self.assertFalse(os.path.exists(res1))
        self.assertFalse(os.path.exists(res2))
        shard1 = temp_path + '-00000-of-00002.output'
        shard2 = temp_path + '-00001-of-00002.output'
        self.assertEqual(open(shard1).read(), '[start][a][b][end]')
        self.assertEqual(open(shard2).read(), '[start][x][y][z][end]')

        os.makedirs(os.path.dirname(res1))
        shutil.copyfile(shard1, res1)
        shutil.copyfile(shard2, res2)
        list(
            sink.finalize_write(init_token, [res1, res2],
                                pre_finalize_results))
Example #8
    def test_file_sink_multi_shards(self):
        temp_path = os.path.join(self._new_tempdir(), 'multishard')
        sink = MyFileBasedSink(temp_path,
                               file_name_suffix='.output',
                               coder=coders.ToStringCoder())

        # Manually invoke the generic Sink API.
        init_token = sink.initialize_write()

        num_shards = 1000
        writer_results = []
        for i in range(num_shards):
            uuid = 'uuid-%05d' % i
            writer = sink.open_writer(init_token, uuid)
            writer.write('a')
            writer.write('b')
            writer.write(uuid)
            writer_results.append(writer.close())

        pre_finalize_results = sink.pre_finalize(init_token, writer_results)
        res = sorted(
            sink.finalize_write(init_token, writer_results,
                                pre_finalize_results))

        for i in range(num_shards):
            shard_name = '%s-%05d-of-%05d.output' % (temp_path, i, num_shards)
            uuid = 'uuid-%05d' % i
            self.assertEqual(res[i], shard_name)
            self.assertEqual(
                open(shard_name).read(), ('[start][a][b][%s][end]' % uuid))

        # Check that any temp files are deleted.
        self.assertCountEqual(res, glob.glob(temp_path + '*'))
Example #9
 def test_empty_write(self):
     temp_path = tempfile.NamedTemporaryFile().name
     sink = MyFileBasedSink(temp_path,
                            file_name_suffix='.output',
                            coder=coders.ToStringCoder())
     with TestPipeline() as p:
         p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
     self.assertEqual(
         open(temp_path + '-00000-of-00001.output').read(), '[start][end]')
Example #10
  def __init__(
      self,
      file_path_prefix,  # type: str
      file_name_suffix='',
      append_trailing_newlines=True,
      num_shards=0,
      shard_name_template=None,  # type: Optional[str]
      coder=coders.ToStringCoder(),  # type: coders.Coder
      compression_type=CompressionTypes.AUTO,
      header=None):
    r"""Initialize a :class:`WriteToText` transform.

    Args:
      file_path_prefix (str): The file path to write to. The files written will
        begin with this prefix, followed by a shard identifier (see
        **num_shards**), and end in a common extension, if given by
        **file_name_suffix**. In most cases, only this argument is specified and
        **num_shards**, **shard_name_template**, and **file_name_suffix** use
        default values.
      file_name_suffix (str): Suffix for the files written.
      append_trailing_newlines (bool): indicate whether this sink should write
        an additional newline char after writing each element.
      num_shards (int): The number of files (shards) used for output.
        If not set, the service will decide on the optimal number of shards.
        Constraining the number of shards is likely to reduce
        the performance of a pipeline.  Setting this value is not recommended
        unless you require a specific number of output files.
      shard_name_template (str): A template string containing placeholders for
        the shard number and shard count. Currently only ``''`` and
        ``'-SSSSS-of-NNNNN'`` are patterns accepted by the service.
        When constructing a filename for a particular shard number, the
        upper-case letters ``S`` and ``N`` are replaced with the ``0``-padded
        shard number and shard count respectively.  This argument can be ``''``
        in which case it behaves as if num_shards was set to 1 and only one file
        will be generated. The default pattern used is ``'-SSSSS-of-NNNNN'``.
      coder (~apache_beam.coders.coders.Coder): Coder used to encode each line.
      compression_type (str): Used to handle compressed output files.
        Typical value is :class:`CompressionTypes.AUTO
        <apache_beam.io.filesystem.CompressionTypes.AUTO>`, in which case the
        final file path's extension (as determined by **file_path_prefix**,
        **file_name_suffix**, **num_shards** and **shard_name_template**) will
        be used to detect the compression.
      header (str): String to write at beginning of file as a header.
        If not :data:`None` and **append_trailing_newlines** is set, ``\n`` will
        be added.
    """

    self._sink = _TextSink(
        file_path_prefix,
        file_name_suffix,
        append_trailing_newlines,
        num_shards,
        shard_name_template,
        coder,
        compression_type,
        header)
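
The docstring above describes the parameters; here is a minimal usage sketch (the output path is a made-up local path, and the explicit coder argument just restates the default):

import apache_beam as beam
from apache_beam.coders import coders
from apache_beam.io.textio import WriteToText

with beam.Pipeline() as p:
    (p
     | beam.Create(['first line', 'second line'])
     | 'write' >> WriteToText('/tmp/demo/output',            # hypothetical path
                              file_name_suffix='.txt',
                              coder=coders.ToStringCoder(),   # the default coder
                              header='demo header'))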
Example #11
    def __init__(self,
                 file_path_prefix,
                 file_name_suffix='',
                 num_shards=0,
                 shard_name_template=None,
                 coder=coders.ToStringCoder(),
                 compression_type=CompressionTypes.AUTO):

        self._sink = _JsonSink(file_path_prefix, file_name_suffix, num_shards,
                               shard_name_template, coder, compression_type)
Example #12
    def __init__(
            self,
            file_path_prefix,
            file_name_suffix='',
            append_trailing_newlines=True,
            num_shards=0,
            shard_name_template=None,
            coder=coders.ToStringCoder(),  # type: coders.Coder
            compression_type=CompressionTypes.AUTO,
            header=None):
        """Initialize a _TextSink.

        Args:
          file_path_prefix: The file path to write to. The files written will begin
            with this prefix, followed by a shard identifier (see num_shards), and
            end in a common extension, if given by file_name_suffix. In most cases,
            only this argument is specified and num_shards, shard_name_template, and
            file_name_suffix use default values.
          file_name_suffix: Suffix for the files written.
          append_trailing_newlines: indicate whether this sink should write an
            additional newline char after writing each element.
          num_shards: The number of files (shards) used for output. If not set, the
            service will decide on the optimal number of shards.
            Constraining the number of shards is likely to reduce
            the performance of a pipeline.  Setting this value is not recommended
            unless you require a specific number of output files.
          shard_name_template: A template string containing placeholders for
            the shard number and shard count. When constructing a filename for a
            particular shard number, the upper-case letters 'S' and 'N' are
            replaced with the 0-padded shard number and shard count respectively.
            This argument can be '' in which case it behaves as if num_shards was
            set to 1 and only one file will be generated. The default pattern used
            is '-SSSSS-of-NNNNN' if None is passed as the shard_name_template.
          coder: Coder used to encode each line.
          compression_type: Used to handle compressed output files. Typical value
            is CompressionTypes.AUTO, in which case the final file path's
            extension (as determined by file_path_prefix, file_name_suffix,
            num_shards and shard_name_template) will be used to detect the
            compression.
          header: String to write at beginning of file as a header. If not None and
            append_trailing_newlines is set, '\n' will be added.

        Returns:
          A _TextSink object usable for writing.
        """
        super(_TextSink,
              self).__init__(file_path_prefix,
                             file_name_suffix=file_name_suffix,
                             num_shards=num_shards,
                             shard_name_template=shard_name_template,
                             coder=coder,
                             mime_type='text/plain',
                             compression_type=compression_type)
        self._append_trailing_newlines = append_trailing_newlines
        self._header = header
Example #13
  def test_file_sink_src_missing(self):
    temp_path = os.path.join(self._new_tempdir(), 'src_missing')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToStringCoder())
    init_token, writer_results = self._common_init(sink)
    pre_finalize_results = sink.pre_finalize(init_token, writer_results)

    os.remove(writer_results[0])
    with self.assertRaisesRegexp(Exception, r'not exist'):
      list(sink.finalize_write(init_token, writer_results,
                               pre_finalize_results))
Example #14
 def test_static_value_provider_empty_write(self):
     temp_path = StaticValueProvider(
         value_type=str, value=tempfile.NamedTemporaryFile().name)
     sink = MyFileBasedSink(temp_path,
                            file_name_suffix=StaticValueProvider(
                                value_type=str, value='.output'),
                            coder=coders.ToStringCoder())
     with TestPipeline() as p:
         p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
     self.assertEqual(
         open(temp_path.get() + '-00000-of-00001.output').read(),
         '[start][end]')
Example #15
  def test_file_sink_rename_error(self, rename_mock):
    temp_path = os.path.join(self._new_tempdir(), 'rename_error')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToStringCoder())
    init_token, writer_results = self._common_init(sink)
    pre_finalize_results = sink.pre_finalize(init_token, writer_results)

    error_str = 'mock rename error description'
    rename_mock.side_effect = BeamIOError(
        'mock rename error', {('src', 'dst'): error_str})
    with self.assertRaisesRegexp(Exception, error_str):
      list(sink.finalize_write(init_token, writer_results,
                               pre_finalize_results))
Example #16
 def test_file_sink_display_data(self):
     temp_path = os.path.join(self._new_tempdir(), 'display')
     sink = MyFileBasedSink(temp_path,
                            file_name_suffix='.output',
                            coder=coders.ToStringCoder())
     dd = DisplayData.create_from(sink)
     expected_items = [
         DisplayDataItemMatcher('compression', 'auto'),
         DisplayDataItemMatcher(
             'file_pattern',
             '{}{}'.format(temp_path,
                           '-%(shard_num)05d-of-%(num_shards)05d.output'))
     ]
     hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #17
    def test_fixed_shard_write(self):
        temp_path = os.path.join(self._new_tempdir(), 'empty')
        sink = MyFileBasedSink(temp_path,
                               file_name_suffix='.output',
                               num_shards=3,
                               shard_name_template='_NN_SSS_',
                               coder=coders.ToStringCoder())
        with TestPipeline() as p:
            p | beam.Create(['a', 'b']) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned

        concat = ''.join(
            open(temp_path + '_03_%03d_.output' % shard_num).read()
            for shard_num in range(3))
        self.assertTrue('][a][' in concat, concat)
        self.assertTrue('][b][' in concat, concat)
Example #18
    def __init__(self,
                 file_path_prefix,
                 file_name_suffix='',
                 num_shards=0,
                 shard_name_template=None,
                 coder=coders.ToStringCoder(),
                 compression_type=CompressionTypes.AUTO):

        super(_JsonSink,
              self).__init__(file_path_prefix,
                             file_name_suffix=file_name_suffix,
                             num_shards=num_shards,
                             shard_name_template=shard_name_template,
                             coder=coder,
                             mime_type='text/plain',
                             compression_type=compression_type)
        self.last_rows = dict()
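
Example #18 shows only the constructor. The last_rows dict suggests the sink remembers the previously written row per file handle, for instance so a separator can be emitted between rows but not after the last one; the write_record below is purely an illustration of that idea, not the original implementation.

import json

    def write_record(self, file_handle, value):
        # Illustration only: defer writing each row until the next one arrives,
        # so a ',' separator can be placed between rows (a close() override
        # would have to flush the final buffered row).
        if file_handle in self.last_rows:
            file_handle.write(self.coder.encode(self.last_rows[file_handle]))
            file_handle.write(b',\n')
        self.last_rows[file_handle] = json.dumps(value)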
Example #19
def run(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        dest='input',
                        default='gs://dev-temp/tmp/test2.json',
                        help='Input file')

    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write result')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = False


    p = beam.Pipeline(options=pipeline_options)
    client_query = "SELECT user_dim.user_id, user_dim.app_info.app_instance_id, user_dim.app_info.app_platform, " \
            "user_dim.app_info.app_version, user_dim.geo_info.country, event.* " \
            "FROM `com_bootstlab_retro_IOS.app_events_20180101`, UNNEST(event_dim) as event limit 1000"
    server_query = "SELECT user_id, device_id, event_name as name, CAST(timestamp_nanos/1000 AS INT64) as timestamp_micros, MAX(IF(event_params.key = 'StatusCode', event_params.value.string_value, null)) as status_code " \
            "FROM `server_logs_us.events_20180102`, UNNEST(event_params) AS event_params GROUP BY 1,2,3,4 HAVING status_code = '200' limit 100 "

    lines = p | 'read_ios' >> beam.io.Read(beam.io.BigQuerySource(
                query=client_query, use_standard_sql=True, flatten_results=False))

    counts = (lines
              | 'data_extract' >> (beam.ParDo(JsonExtractEvent())).with_output_types(unicode)
              | 'pair_with_one' >> beam.Map(lambda x: (str(x.split('\t')[:4]), 1))  # first 4 fields identify the event; 1 is the per-user count (DAU)
              # | 'group_by_key' >> beam.GroupByKey()  # GroupByKey requires the previous step to emit (key, value) tuples
              | 'count_per_key' >> beam.combiners.Count.PerKey()
              # | 'count_user' >> beam.FlatMap(lambda x: len(x))
              # | 'count_total' >> beam.Map(count_total)
              )

    counts | 'write' >> WriteToText(known_args.output, coder=coders.ToStringCoder(), num_shards=1)

    result = p.run()
    result.wait_until_finish()
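
Both pipeline examples (#5 and #19) assume a JsonExtractEvent DoFn that is not shown. The downstream x.split('\t')[:4] implies it emits one tab-separated unicode string per BigQuery row; here is a minimal sketch under that assumption (the field names are guesses based on the queries, not the real transform).

import apache_beam as beam

class JsonExtractEvent(beam.DoFn):
    def process(self, element):
        # element is a BigQuery row dict; the field names below are assumed.
        yield u'\t'.join(unicode(element.get(field, '')) for field in (
            'name', 'app_platform', 'app_version', 'country', 'user_id'))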
Example #20
 def __init__(self,
              file_path_prefix,
              file_name_suffix='',
              append_trailing_newlines=True,
              num_shards=0,
              shard_name_template=None,
              coder=coders.ToStringCoder(),
              compression_type=CompressionTypes.AUTO,
              header=None,  # avoid a mutable default argument
              delimiter='\x01',
              lineterminator='\r\n'):
     super(CsvFileSink,
           self).__init__(file_path_prefix=file_path_prefix,
                          file_name_suffix=file_name_suffix,
                          num_shards=num_shards,
                          shard_name_template=shard_name_template,
                          coder=coder,
                          mime_type='text/plain',
                          compression_type=compression_type)
      self.header = header if header is not None else []
     self.delimiter = delimiter
     self.lineterminator = lineterminator
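
The CsvFileSink constructor stores header, delimiter, and lineterminator, but the write path is not included above; one plausible way those attributes get used is sketched below (an assumed implementation, mirroring the _TextSink.open pattern from Example #3).

 def open(self, temp_path):
     # Write the header row once per shard, joined with the configured delimiter.
     file_handle = super(CsvFileSink, self).open(temp_path)
     if self.header:
         file_handle.write(self.coder.encode(self.delimiter.join(self.header)))
         file_handle.write(self.lineterminator)
     return file_handle

 def write_encoded_record(self, file_handle, encoded_value):
     # Each element is assumed to arrive already delimiter-joined and encoded.
     file_handle.write(encoded_value)
     file_handle.write(self.lineterminator)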
Example #21
    def test_file_sink_io_error(self):
        temp_path = os.path.join(self._new_tempdir(), 'ioerror')
        sink = MyFileBasedSink(temp_path,
                               file_name_suffix='.output',
                               coder=coders.ToStringCoder())

        # Manually invoke the generic Sink API.
        init_token = sink.initialize_write()

        writer1 = sink.open_writer(init_token, '1')
        writer1.write('a')
        writer1.write('b')
        res1 = writer1.close()

        writer2 = sink.open_writer(init_token, '2')
        writer2.write('x')
        writer2.write('y')
        writer2.write('z')
        res2 = writer2.close()

        os.remove(res2)
        with self.assertRaises(Exception):
            list(sink.finalize_write(init_token, [res1, res2]))
Example #22
  def test_pre_finalize_error(self, delete_mock):
    temp_path = os.path.join(self._new_tempdir(), 'pre_finalize')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToStringCoder())
    init_token, [res1, res2] = self._common_init(sink)

    # no-op
    sink.pre_finalize(init_token, [res1, res2])

    # Create finalized outputs from a previous run, which pre_finalize should
    # delete.
    shard1 = temp_path + '-00000-of-00002.output'
    shard2 = temp_path + '-00001-of-00002.output'
    with open(shard1, 'w') as f:
      f.write('foo')
    with open(shard2, 'w') as f:
      f.write('foo')

    error_str = 'mock rename error description'
    delete_mock.side_effect = BeamIOError(
        'mock rename error', {shard2: error_str})
    with self.assertRaisesRegexp(Exception, error_str):
      sink.pre_finalize(init_token, [res1, res2])
Example #23
 def _get_temp_dir(file_path_prefix):
     sink = MyFileBasedSink(file_path_prefix,
                            file_name_suffix='.output',
                            coder=coders.ToStringCoder())
     return sink.initialize_write()