def _parse_text_source(specs, codec_specs, unused_context):
  """Builds an io.TextFileSource from a 'TextSource' spec dictionary.

  Args:
    specs: Dictionary describing the source; values are wrapped in
      single-key {'value': ...} dictionaries.
    codec_specs: Codec description passed to get_coder_from_spec.
    unused_context: Unused.

  Returns:
    An io.TextFileSource, or None if specs does not describe a TextSource.
  """
  if specs['@type'] != 'TextSource':
    return None
  coder = get_coder_from_spec(codec_specs)
  # Byte offsets are optional; a missing key means "unbounded" (None).
  offsets = {}
  for key in ('start_offset', 'end_offset'):
    offsets[key] = int(specs[key]['value']) if key in specs else None
  return io.TextFileSource(
      file_path=specs['filename']['value'],
      start_offset=offsets['start_offset'],
      end_offset=offsets['end_offset'],
      compression_type=specs['compression_type']['value'],
      strip_trailing_newlines=specs['strip_trailing_newlines']['value'],
      coder=coder,
  )
def _parse_avro_source(specs, unused_codec_specs, unused_context):
  """Builds a source for an 'AvroSource' spec dictionary.

  The worker does not really implement AVRO yet. It takes advantage of the
  fact that both reading and writing go through the worker to substitute a
  supported format: text files with one pickled object per line.

  Args:
    specs: Dictionary describing the source; values are wrapped in
      single-key {'value': ...} dictionaries.
    unused_codec_specs: Unused.
    unused_context: Unused.

  Returns:
    An io.TextFileSource, or None if specs does not describe an AvroSource.
  """
  if specs['@type'] != 'AvroSource':
    return None
  # Byte offsets are optional; a missing key means "unbounded" (None).
  start_offset = end_offset = None
  if 'start_offset' in specs:
    start_offset = int(specs['start_offset']['value'])
  if 'end_offset' in specs:
    end_offset = int(specs['end_offset']['value'])
  return io.TextFileSource(
      file_path=specs['filename']['value'],
      start_offset=start_offset,
      end_offset=end_offset,
      strip_trailing_newlines=True,
      coder=coders.Base64PickleCoder())
def test_text_source_to_shuffle_sink(self):
  """Checks parsing of a text-source -> DoFn -> shuffle-write work item."""
  work = workitem.get_work_items(
      get_text_source_to_shuffle_sink_message())
  # Build the expected operation list up front so the final comparison is a
  # single, readable assertEqual.
  expected_source = io.TextFileSource(
      file_path='gs://somefile',
      start_offset=123,
      end_offset=123123,
      strip_trailing_newlines=True,
      coder=CODER)
  expected_operations = [
      maptask.WorkerRead(expected_source, output_coders=[CODER]),
      maptask.WorkerDoFn(
          serialized_fn='code',
          output_tags=['out'],
          input=(1, 0),
          side_inputs=[],
          output_coders=[CODER]),
      maptask.WorkerShuffleWrite(
          shuffle_kind='group_keys',
          shuffle_writer_config='opaque',
          input=(1, 0),
          output_coders=(CODER,)),
  ]
  self.assertEqual((work.proto.id, work.map_task.operations),
                   (1234, expected_operations))