Beispiel #1
0
 def _parse_text_source(specs, codec_specs, unused_context):
   if specs['@type'] == 'TextSource':
     coder = get_coder_from_spec(codec_specs)
     start_offset = None
     if 'start_offset' in specs:
       start_offset = int(specs['start_offset']['value'])
     end_offset = None
     if 'end_offset' in specs:
       end_offset = int(specs['end_offset']['value'])
     return io.TextFileSource(
         file_path=specs['filename']['value'],
         start_offset=start_offset,
         end_offset=end_offset,
         compression_type=specs['compression_type']['value'],
         strip_trailing_newlines=specs['strip_trailing_newlines']['value'],
         coder=coder,
     )
Beispiel #2
0
 def _parse_avro_source(specs, unused_codec_specs, unused_context):
   if specs['@type'] == 'AvroSource':
     # Note that the worker does not really implement AVRO yet.It takes
     # advantage that both reading and writing is done through the worker to
     # choose a supported format (text files with one pickled object per line).
     start_offset = None
     if 'start_offset' in specs:
       start_offset = int(specs['start_offset']['value'])
     end_offset = None
     if 'end_offset' in specs:
       end_offset = int(specs['end_offset']['value'])
     return io.TextFileSource(
         file_path=specs['filename']['value'],
         start_offset=start_offset,
         end_offset=end_offset,
         strip_trailing_newlines=True,
         coder=coders.Base64PickleCoder())
 def test_text_source_to_shuffle_sink(self):
     """Parse a text-source-to-shuffle-sink message and compare the result.

     The parsed work item must expose proto id 1234 and a map task made of
     three operations, in order: a read, a DoFn, and a shuffle write.
     """
     work = workitem.get_work_items(
         get_text_source_to_shuffle_sink_message())
     actual = (work.proto.id, work.map_task.operations)

     # Expected operations, built in the order they appear in the map task.
     read_op = maptask.WorkerRead(
         io.TextFileSource(
             file_path='gs://somefile',
             start_offset=123,
             end_offset=123123,
             strip_trailing_newlines=True,
             coder=CODER),
         output_coders=[CODER])
     do_fn_op = maptask.WorkerDoFn(
         serialized_fn='code',
         output_tags=['out'],
         input=(1, 0),
         side_inputs=[],
         output_coders=[CODER])
     shuffle_write_op = maptask.WorkerShuffleWrite(
         shuffle_kind='group_keys',
         shuffle_writer_config='opaque',
         input=(1, 0),
         output_coders=(CODER, ))

     expected = (1234, [read_op, do_fn_op, shuffle_write_op])
     self.assertEqual(actual, expected)