def test_create_from_null(self):
    """A ConcatSource constructed from None must yield no records."""
    with concat_reader.ConcatSource(None).reader() as reader:
        collected = [record for record in reader]
    self.assertEqual(0, len(collected))
def test_read_empty_list(self):
    """A ConcatSource over an empty list of sub-sources yields no records."""
    with concat_reader.ConcatSource([]).reader() as reader:
        collected = list(reader)
    self.assertEqual(0, len(collected))
def test_concat_source_to_shuffle_sink(self):
    """A ConcatSource work message parses into read/dofn/shuffle-write ops."""
    work = workitem.get_work_items(
        get_concat_source_to_shuffle_sink_message())
    self.assertIsNotNone(work)
    # The expected concat source wraps two 1MB text-file sub-sources.
    expected_sub_sources = [
        io.TextFileSource(
            file_path='gs://sort_g/input_small_files/'
                      'ascii_sort_1MB_input.0000006',
            start_offset=0,
            end_offset=1000000,
            strip_trailing_newlines=True,
            coder=CODER),
        io.TextFileSource(
            file_path='gs://sort_g/input_small_files/'
                      'ascii_sort_1MB_input.0000007',
            start_offset=0,
            end_offset=1000000,
            strip_trailing_newlines=True,
            coder=CODER),
    ]
    expected_concat_source = concat_reader.ConcatSource(expected_sub_sources)
    expected_operations = [
        maptask.WorkerRead(expected_concat_source, output_coders=[CODER]),
        maptask.WorkerDoFn(serialized_fn='code',
                           output_tags=['out'],
                           input=(1, 0),
                           side_inputs=[],
                           output_coders=[CODER]),
        maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                   shuffle_writer_config='opaque',
                                   input=(1, 0),
                                   output_coders=(CODER, )),
    ]
    self.assertEqual((work.proto.id, work.map_task.operations),
                     (1234, expected_operations))
def _parse_concat_source(specs, _, unused_context):
    """Build a ConcatSource from a 'ConcatSource' spec dictionary.

    Each entry of specs['sources'] holds a 'spec' dict and an optional
    'encoding' dict; both are handed to the worker environment's
    parse_source to construct the sub-source.

    Returns None (implicitly, as in the original) when specs['@type'] is
    not 'ConcatSource', so other parsers in the dispatch chain can try.
    """
    if specs['@type'] != 'ConcatSource':
        return None
    # NOTE: despite its name, unused_context is used — its
    # worker_environment parses each sub-source spec.
    assert unused_context.worker_environment is not None
    environment = unused_context.worker_environment
    sub_sources = [
        environment.parse_source(sub_source_dict['spec'],
                                 sub_source_dict.get('encoding'),
                                 unused_context)
        for sub_source_dict in specs['sources']]
    return concat_reader.ConcatSource(sub_sources)
def _create_concat_source(self, sub_source_sizes, output_record,
                          index_of_source_to_fail=-1,
                          index_to_fail_reading=-1,
                          fail_reader_at_close=False):
    """Build a ConcatSource of TestSources, optionally with one failing.

    All generated records are appended to output_record (mutated in
    place) so the caller can compare against what the reader produces.
    The sub-source at index_of_source_to_fail (if >= 0) is configured to
    fail at record index_to_fail_reading and/or at close; every other
    sub-source is a plain non-failing TestSource.
    """
    all_data = self.create_data(sub_source_sizes)
    # First pass: record the full expected output.
    for data in all_data:
        output_record.extend(data)
    # Second pass: wrap each data chunk, injecting failure config only
    # into the designated sub-source.
    sub_sources = [
        TestSource(
            data,
            index_to_fail_reading if i == index_of_source_to_fail else -1,
            fail_reader_at_close if i == index_of_source_to_fail else False)
        for i, data in enumerate(all_data)]
    return concat_reader.ConcatSource(sub_sources)