def test_create_do_with_side_in_memory_write(self):
  elements = ['abc', 'def', 'ghi']
  side_elements = ['x', 'y', 'z']
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0, end_index=3),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  lambda x, side: ['%s:%s' % (x, side)]),
              tag_and_type=('inmemory', pvalue.SingletonPCollectionView,
                            (False, None))),
          output_tags=['out'], input=(0, 0),
          side_inputs=[
              maptask.WorkerSideInputSource(
                  inmemory.InMemorySource(
                      elements=[pickler.dumps(e) for e in side_elements],
                      start_index=None, end_index=None),
                  tag='inmemory')],
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as a singleton, therefore we should see
  # only the first element appended.
  self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)

def _parse_inmemory_source(specs, codec_specs, unused_context):
  if specs['@type'] == 'InMemorySource':
    # We do not wrap values sent to the service in a Create transform and
    # received here in a WindowedValue wrapper, but the service needs to be
    # sent the wrapped encoding so subsequent GroupByKey operations work
    # correctly.
    #
    # Note: The service may create a dummy empty InMemorySource that is a
    # windowed value when processing a BigQuerySource. In that case, we do
    # not unwrap this coder.
    # TODO(ccy): investigate if we can make these semantics cleaner.
    coder = get_coder_from_spec(codec_specs)
    if isinstance(coder, coders.WindowedValueCoder):
      coder = coder.wrapped_value_coder
    # Handle the case where 'elements' for an InMemory source is an empty
    # list.
    if specs['elements']:
      # start_index/end_index could be missing if default behavior should be
      # used. For instance a list with one element will have start_index=0
      # and end_index=1 by default.
      start_index = (
          None if 'start_index' not in specs
          else int(specs['start_index']['value']))
      end_index = (
          None if 'end_index' not in specs
          else int(specs['end_index']['value']))
      return inmemory.InMemorySource(
          elements=[base64.b64decode(v['value']) for v in specs['elements']],
          coder=coder,
          start_index=start_index, end_index=end_index)
    else:
      return inmemory.InMemorySource(elements=[], coder=coder)

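# For reference, a minimal sketch of the spec shape _parse_inmemory_source
# expects, reconstructed from the lookups above (the literal values below are
# hypothetical, not taken from a real service response): a dict with '@type',
# base64-encoded 'elements', and optional 'start_index'/'end_index' entries,
# each wrapped in a {'value': ...} dict. codec_specs is whatever coder spec
# get_coder_from_spec understands.
#
#   specs = {
#       '@type': 'InMemorySource',
#       'elements': [{'value': base64.b64encode('abc')},
#                    {'value': base64.b64encode('def')}],
#       'start_index': {'value': '0'},
#       'end_index': {'value': '2'},
#   }
#   source = _parse_inmemory_source(specs, codec_specs, None)
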
def test_create_do_with_side_text_file_write(self):
  input_path = self.create_temp_file('x\ny\n')
  elements = ['aa', 'bb']
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0, end_index=2),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  lambda x, side: ['%s:%s' % (x, s) for s in side]),
              tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
          output_tags=['out'], input=(0, 0),
          side_inputs=[
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(
                      file_path=input_path, start_offset=None,
                      end_offset=None, strip_trailing_newlines=True,
                      coder=coders.StrUtf8Coder()),
                  tag='textfile')],
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as a collection, therefore we should see
  # all elements of the side source.
  self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                   sorted(output_buffer))

def test_create_do_write(self):
  output_path = self.create_temp_file('n/a')
  elements = ['abc', 'def', 'ghi']
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              # Start at the last element.
              start_index=2,
              # Go beyond the end to test that case is handled.
              end_index=15),
          output_coders=[coders.ToStringCoder()]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
          output_tags=['out'], output_coders=[self.OUTPUT_CODER],
          input=(0, 0), side_inputs=None),
      maptask.WorkerWrite(
          fileio.TextFileSink(
              file_path_prefix=output_path,
              append_trailing_newlines=True,
              coder=coders.ToStringCoder()),
          input=(1, 0),
          output_coders=(coders.ToStringCoder(),))
  ]))
  with open(output_path) as f:
    self.assertEqual('XYZ: ghi\n', f.read())

def test_create_do_avro_write(self):
  output_path = self.create_temp_file('n/a')
  elements = ['abc', 'def', 'ghi']
  work_item = workitem.BatchWorkItem(None)
  work_item.map_task = make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=2,  # Start at the last element.
              end_index=3),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
          output_tags=['out'], input=(0, 0), side_inputs=None,
          output_coders=[self.OUTPUT_CODER]),
      make_text_sink(output_path, input=(1, 0),
                     coder=coders.Base64PickleCoder())
  ])
  executor.MapTaskExecutor(work_item.map_task).execute()
  with open(output_path) as f:
    self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))

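# FakeCoder is referenced by the InMemorySource reader tests below but is not
# defined in this excerpt. The assertions (elements 1..5 coming back as
# 11..15) imply a coder whose decode adds 10 to each value; a minimal sketch
# under that assumption:
class FakeCoder(object):

  def encode(self, value):
    # Pass values through unchanged on the write path.
    return value

  def decode(self, value):
    # Shift decoded values by 10 so reads are distinguishable from the
    # original elements, matching the expectations in the tests below.
    return value + 10
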
def test_in_memory_source_updates_progress_many(self):
  source = inmemory.InMemorySource([1, 2, 3, 4, 5], coder=FakeCoder())
  with source.reader() as reader:
    self.assertEqual(0, reader.get_progress().percent_complete)
    i = 0
    for item in reader:
      i += 1
      self.assertEqual(i + 10, item)
      self.assertEqual(float(i) / 5,
                       reader.get_progress().percent_complete)
    self.assertEqual(5, i)
    self.assertEqual(1, reader.get_progress().percent_complete)

def test_in_memory_source_updates_progress_one(self):
  source = inmemory.InMemorySource([1], coder=FakeCoder())
  with source.reader() as reader:
    self.assertEqual(0, reader.get_progress().percent_complete)
    i = 0
    for item in reader:
      i += 1
      self.assertEqual(11, item)
      self.assertEqual(1, reader.get_progress().percent_complete)
    self.assertEqual(1, i)
    self.assertEqual(1, reader.get_progress().percent_complete)

def test_in_memory_source_updates_progress_many(self):
  source = inmemory.InMemorySource([1, 2, 3, 4, 5], coder=FakeCoder())
  with source.reader() as reader:
    self.assertEqual(None, reader.get_progress())
    i = 0
    for item in reader:
      self.assertEqual(i, reader.get_progress().position.record_index)
      self.assertEqual(11 + i, item)
      i += 1
    self.assertEqual(5, i)
    self.assertEqual(4, reader.get_progress().position.record_index)

def test_in_memory_source_to_flatten(self):
  work = workitem.get_work_items(
      get_in_memory_source_to_flatten_message())
  self.assertEqual(
      (work.proto.id, work.map_task.operations),
      (1234, [
          maptask.WorkerRead(
              inmemory.InMemorySource(
                  start_index=1, end_index=3,
                  elements=[base64.b64decode(v['value'])
                            for v in IN_MEMORY_ELEMENTS],
                  coder=CODER),
              output_coders=[CODER]),
          maptask.WorkerFlatten(inputs=[(0, 0)], output_coders=[CODER])
      ]))

def test_create_do_with_collection_side_bigquery_write(self):
  elements = ['aa', 'bb']
  side_elements = ['x', 'y']
  output_buffer = []
  patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
  with mock.patch(target=patch_target) as mock_class:
    # Set up the reader so it will yield the values in 'side_elements'.
    reader_mock = mock_class.return_value
    reader_mock.__enter__.return_value = reader_mock
    # Use a lambda so that multiple readers can be created, each reading
    # the entirety of the side elements.
    reader_mock.__iter__.side_effect = lambda: (x for x in side_elements)

    executor.MapTaskExecutor().execute(make_map_task([
        maptask.WorkerRead(
            inmemory.InMemorySource(
                elements=[pickler.dumps(e) for e in elements],
                start_index=0, end_index=3),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(
                    lambda x, side: ['%s:%s' % (x, s) for s in side]),
                tag_and_type=('bigquery', pvalue.IterablePCollectionView,
                              ())),
            output_tags=['out'], input=(0, 0),
            side_inputs=[
                maptask.WorkerSideInputSource(
                    bigquery.BigQuerySource(
                        project='project', dataset='dataset', table='table',
                        coder=get_bigquery_source_coder()),
                    tag='bigquery')],
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerInMemoryWrite(
            output_buffer=output_buffer,
            input=(1, 0),
            output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as a collection, therefore we should see
  # all elements of the side source.
  self.assertEqual(['aa:x', 'aa:y', 'bb:x', 'bb:y'], sorted(output_buffer))

def test_create_do_with_side_avro_file_write(self):
  input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
  input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
  elements = ['aa', 'bb']
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0, end_index=2),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  lambda x, side: ['%s:%s' % (x, s) for s in side]),
              tag_and_type=('sometag', pvalue.IterablePCollectionView, ())),
          output_tags=['out'], input=(0, 0),
          # Note that the two side inputs have the same tag. This is quite
          # common for intermediary PCollections used as side inputs that
          # are saved as AVRO files. The files will contain the sharded
          # PCollection.
          side_inputs=[
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(
                      file_path=input_path1,
                      coder=coders.Base64PickleCoder()),
                  tag='sometag'),
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(
                      file_path=input_path2,
                      coder=coders.Base64PickleCoder()),
                  tag='sometag')],
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as a collection, therefore we should see
  # all elements of the side source from both files.
  self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                   sorted(output_buffer))

def test_pgbk(self):
  elements = [('a', 1), ('b', 2), ('a', 3), ('a', 4)]
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0, end_index=100),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerPartialGroupByKey(
          combine_fn=None,
          input=(0, 0),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))
  ]))
  self.assertEqual([('a', [1, 3, 4]), ('b', [2])], sorted(output_buffer))

def test_in_memory_source_to_text_sink(self):
  work = workitem.get_work_items(
      get_in_memory_source_to_text_sink_message())
  self.assertEqual(
      (work.proto.id, work.map_task.operations),
      (1234, [
          maptask.WorkerRead(
              inmemory.InMemorySource(
                  start_index=1, end_index=3,
                  elements=[base64.b64decode(v['value'])
                            for v in IN_MEMORY_ELEMENTS],
                  coder=CODER),
              output_coders=[CODER]),
          maptask.WorkerWrite(
              fileio.NativeTextFileSink(
                  file_path_prefix='gs://somefile',
                  append_trailing_newlines=True,
                  coder=CODER),
              input=(0, 0),
              output_coders=(CODER,))
      ]))

def test_combine(self):
  elements = [('a', [1, 2, 3]), ('b', [10])]
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0, end_index=100),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerCombineFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CombineFn.from_callable(sum)),
          phase='all',
          input=(0, 0),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))
  ]))
  self.assertEqual([('a', 6), ('b', 10)], output_buffer)

def test_create_do_with_singleton_side_bigquery_write(self):
  elements = ['abc', 'def', 'ghi']
  side_elements = ['x', 'y', 'z']
  output_buffer = []
  patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
  with mock.patch(target=patch_target) as mock_class:
    # Set up the reader so it will yield the values in 'side_elements'.
    reader_mock = mock_class.return_value
    reader_mock.__enter__.return_value = reader_mock
    reader_mock.__iter__.return_value = (x for x in side_elements)

    pickled_elements = [pickler.dumps(e) for e in elements]
    executor.MapTaskExecutor().execute(make_map_task([
        maptask.WorkerRead(
            inmemory.InMemorySource(
                elements=pickled_elements,
                start_index=0, end_index=3),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(
                    lambda x, side: ['%s:%s' % (x, side)]),
                tag_and_type=('bigquery', pvalue.SingletonPCollectionView,
                              (False, None))),
            output_tags=['out'], input=(0, 0),
            side_inputs=[
                maptask.WorkerSideInputSource(
                    bigquery.BigQuerySource(
                        project='project', dataset='dataset', table='table',
                        coder=get_bigquery_source_coder()),
                    tag='bigquery')],
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerInMemoryWrite(
            output_buffer=output_buffer,
            input=(1, 0),
            output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as a singleton, therefore we should see
  # only the first element appended.
  self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)

def test_in_memory_source_dynamic_split(self):
  source = inmemory.InMemorySource([10, 20, 30, 40, 50, 60],
                                   coder=FakeCoder())

  # Unstarted reader
  with source.reader() as reader:
    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(record_index=2))),
        None)

  # Proposed split position out of range
  with source.reader() as reader:
    reader_iter = iter(reader)
    next(reader_iter)

    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(record_index=-1))),
        None)

    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(record_index=10))),
        None)

  # Already read past proposed split position
  with source.reader() as reader:
    reader_iter = iter(reader)
    next(reader_iter)
    next(reader_iter)
    next(reader_iter)

    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(record_index=1))),
        None)

    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(record_index=2))),
        None)

  # Successful split
  with source.reader() as reader:
    reader_iter = iter(reader)
    next(reader_iter)

    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(record_index=4))),
        iobase.DynamicSplitResultWithPosition(
            stop_position=iobase.ReaderPosition(record_index=4)))

    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(record_index=2))),
        iobase.DynamicSplitResultWithPosition(
            stop_position=iobase.ReaderPosition(record_index=2)))

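# try_splitting_reader_at is not shown in this excerpt. A minimal sketch of
# what the helper presumably does, assuming the reader exposes
# request_dynamic_split and that an expected_response of None means the
# proposed split should be rejected:
def try_splitting_reader_at(self, reader, split_request, expected_response):
  # Ask the reader to split at the proposed position.
  actual_response = reader.request_dynamic_split(split_request)
  if expected_response is None:
    # The split should have been rejected.
    self.assertIsNone(actual_response)
  else:
    # The split should have been accepted at the expected stop position.
    self.assertIsNotNone(actual_response)
    self.assertEqual(expected_response.stop_position.record_index,
                     actual_response.stop_position.record_index)
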
def test_inmemory(self):
  source = inmemory.InMemorySource([1, 2, 3, 4, 5], FakeCoder(), 1, 3)
  with source.reader() as reader:
    self.assertItemsEqual([12, 13], [i for i in reader])

def test_in_memory_source_updates_progress_none(self):
  source = inmemory.InMemorySource([], coder=FakeCoder())
  with source.reader() as reader:
    self.assertEqual(None, reader.get_progress())

def test_norange(self):
  source = inmemory.InMemorySource([1, 2, 3, 4, 5], coder=FakeCoder())
  with source.reader() as reader:
    self.assertItemsEqual([11, 12, 13, 14, 15], [i for i in reader])