def test_read_do_write_with_start_bundle(self):
  input_path = self.create_temp_file('01234567890123456789\n0123456789')
  output_path = '%s.out' % input_path
  finish_path = '%s.finish' % input_path
  executor.MapTaskExecutor().execute(
      make_map_task([
          maptask.WorkerRead(
              fileio.TextFileSource(file_path=input_path,
                                    start_offset=0,
                                    end_offset=15,
                                    strip_trailing_newlines=True,
                                    coder=coders.StrUtf8Coder()),
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerDoFn(
              serialized_fn=pickle_with_side_inputs(
                  DoFnUsingStartBundle(finish_path)),
              output_tags=['out'],
              output_coders=[self.OUTPUT_CODER],
              input=(0, 0),
              side_inputs=None),
          make_text_sink(output_path, input=(1, 0))
      ]))
  with open(output_path) as f:
    self.assertEqual('XYZ: 01234567890123456789\n', f.read())
  # Check that the finish_bundle method of the custom DoFn left the expected
  # side effect behind by writing a file with specific content.
  with open(finish_path) as f:
    self.assertEqual('finish called.', f.read())

def test_create_do_write(self):
  output_path = self.create_temp_file('n/a')
  elements = ['abc', 'def', 'ghi']
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              # Start at the last element.
              start_index=2,
              # Go beyond the end to test that case is handled.
              end_index=15),
          output_coders=[coders.ToStringCoder()]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
          output_tags=['out'],
          output_coders=[self.OUTPUT_CODER],
          input=(0, 0),
          side_inputs=None),
      maptask.WorkerWrite(
          fileio.TextFileSink(file_path_prefix=output_path,
                              append_trailing_newlines=True,
                              coder=coders.ToStringCoder()),
          input=(1, 0),
          output_coders=(coders.ToStringCoder(),))
  ]))
  with open(output_path) as f:
    self.assertEqual('XYZ: ghi\n', f.read())

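# Hedged sketch, not the actual helper from this module: make_text_sink is
# called throughout these tests but defined elsewhere. Judging from the
# expanded WorkerWrite/TextFileSink pattern in test_create_do_write above,
# an equivalent helper plausibly looks like the hypothetical version below.
# The ToStringCoder default is an assumption, inferred from the explicit
# coder argument passed in test_create_do_avro_write.
def _make_text_sink_sketch(output_path, input, coder=coders.ToStringCoder()):
  # Wrap a text file sink in a WorkerWrite operation reading from 'input'.
  return maptask.WorkerWrite(
      fileio.TextFileSink(file_path_prefix=output_path,
                          append_trailing_newlines=True,
                          coder=coder),
      input=input,
      output_coders=(coder,))
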
def test_create_do_with_side_in_memory_write(self):
  elements = ['abc', 'def', 'ghi']
  side_elements = ['x', 'y', 'z']
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=3),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  lambda x, side: ['%s:%s' % (x, side)]),
              tag_and_type=('inmemory', pvalue.SingletonPCollectionView,
                            (False, None))),
          output_tags=['out'],
          input=(0, 0),
          side_inputs=[
              maptask.WorkerSideInputSource(
                  inmemory.InMemorySource(
                      elements=[pickler.dumps(e) for e in side_elements],
                      start_index=None,
                      end_index=None),
                  tag='inmemory')],
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as a singleton, so only its first element
  # should be used.
  self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)

def test_read_do_shuffle_write(self):
  input_path = self.create_temp_file('a\nb\nc\nd\n')
  work_spec = [
      maptask.WorkerRead(
          fileio.TextFileSource(file_path=input_path,
                                start_offset=0,
                                end_offset=8,
                                strip_trailing_newlines=True,
                                coder=coders.StrUtf8Coder()),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(lambda x: [(x, 1)])),
          output_tags=['out'],
          output_coders=[self.OUTPUT_CODER],
          input=(0, 0),
          side_inputs=None),
      maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                 shuffle_writer_config='none',
                                 input=(1, 0),
                                 output_coders=(self.SHUFFLE_CODER,))
  ]
  shuffle_sink_mock = mock.MagicMock()
  executor.MapTaskExecutor().execute(
      make_map_task(work_spec),
      test_shuffle_sink=shuffle_sink_mock)
  # Make sure we have seen all the (k, v) writes.
  shuffle_sink_mock.writer().Write.assert_has_calls([
      mock.call('a', '', 1), mock.call('b', '', 1),
      mock.call('c', '', 1), mock.call('d', '', 1)])

def test_read_do_write(self):
  input_path = self.create_temp_file('01234567890123456789\n0123456789')
  output_path = '%s.out' % input_path
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          fileio.TextFileSource(file_path=input_path,
                                start_offset=0,
                                end_offset=15,
                                strip_trailing_newlines=True,
                                coder=coders.StrUtf8Coder()),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
          output_tags=['out'],
          output_coders=[self.OUTPUT_CODER],
          input=(0, 0),
          side_inputs=None),
      maptask.WorkerWrite(
          fileio.TextFileSink(file_path_prefix=output_path,
                              append_trailing_newlines=True,
                              coder=coders.ToStringCoder()),
          input=(1, 0),
          output_coders=(coders.ToStringCoder(),))
  ]))
  with open(output_path) as f:
    self.assertEqual('XYZ: 01234567890123456789\n', f.read())

def test_create_do_avro_write(self):
  output_path = self.create_temp_file('n/a')
  elements = ['abc', 'def', 'ghi']
  work_item = workitem.BatchWorkItem(None)
  work_item.map_task = make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=2,  # Start at the last element.
              end_index=3),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
          output_tags=['out'],
          input=(0, 0),
          side_inputs=None,
          output_coders=[self.OUTPUT_CODER]),
      make_text_sink(output_path, input=(1, 0),
                     coder=coders.Base64PickleCoder())
  ])
  executor.MapTaskExecutor(work_item.map_task).execute()
  with open(output_path) as f:
    self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))

def test_shuffle_read_do_write(self):
  output_path = self.create_temp_file('n/a')
  work_spec = [
      maptask.WorkerGroupingShuffleRead(
          shuffle_reader_config='none',
          start_shuffle_position='aaa',
          end_shuffle_position='zzz',
          coder=self.SHUFFLE_CODER,
          output_coders=[self.SHUFFLE_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  # Tuple-parameter unpacking in a lambda (Python 2 syntax,
                  # consistent with the rest of this code base).
                  lambda (k, vs): [str((k, v)) for v in vs])),
          output_tags=['out'],
          output_coders=[self.OUTPUT_CODER],
          input=(0, 0),
          side_inputs=None),
      make_text_sink(output_path, input=(1, 0))
  ]
  shuffle_source_mock = mock.MagicMock()
  shuffle_source_mock.reader().__enter__().__iter__.return_value = [
      (10, [1, 2]), (20, [3])]
  executor.MapTaskExecutor().execute(
      make_map_task(work_spec),
      test_shuffle_source=shuffle_source_mock)
  with open(output_path) as f:
    self.assertEqual('(10, 1)\n(10, 2)\n(20, 3)\n', f.read())

def process_work_item(self, computation_id, map_task_proto,
                      input_data_watermark, work_item):
  """Process a work item."""
  workitem_commit_request = windmill_pb2.WorkItemCommitRequest(
      key=work_item.key,
      work_token=work_item.work_token)
  env = maptask.WorkerEnvironment()
  context = maptask.StreamingExecutionContext()
  reader = windmillstate.WindmillStateReader(computation_id, work_item.key,
                                             work_item.work_token,
                                             self.windmill)
  state_internals = windmillstate.WindmillStateInternals(reader)
  state = windmillstate.WindmillUnmergedState(state_internals)
  output_data_watermark = windmillio.windmill_to_harness_timestamp(
      work_item.output_data_watermark)
  context.start(computation_id, work_item, input_data_watermark,
                output_data_watermark, workitem_commit_request, self.windmill,
                state)
  map_task = maptask.decode_map_task(map_task_proto, env, context)
  map_task_executor = executor.MapTaskExecutor(map_task)
  map_task_executor.execute()
  state_internals.persist_to(workitem_commit_request)

  # Send result to Windmill.
  # TODO(ccy): in the future, this will not be done serially with respect to
  # work execution.
  commit_request = windmill_pb2.CommitWorkRequest()
  computation_commit_request = windmill_pb2.ComputationCommitWorkRequest(
      computation_id=computation_id,
      requests=[workitem_commit_request])
  commit_request.requests.extend([computation_commit_request])
  self.windmill.CommitWork(commit_request)

def test_create_do_with_side_text_file_write(self):
  input_path = self.create_temp_file('x\ny\n')
  elements = ['aa', 'bb']
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=2),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  lambda x, side: ['%s:%s' % (x, s) for s in side]),
              tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
          output_tags=['out'],
          input=(0, 0),
          side_inputs=[
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(file_path=input_path,
                                        start_offset=None,
                                        end_offset=None,
                                        strip_trailing_newlines=True,
                                        coder=coders.StrUtf8Coder()),
                  tag='textfile')],
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as an iterable collection, so we should see
  # all of its elements paired with each main input element.
  self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                   sorted(output_buffer))

def get_executor_for_work_item(self, work_item):
  if work_item.map_task is not None:
    return executor.MapTaskExecutor(work_item.map_task)
  elif work_item.source_operation_split_task is not None:
    return executor.CustomSourceSplitExecutor(
        work_item.source_operation_split_task)
  else:
    # Use %-interpolation; passing the value as a second argument to
    # ValueError would leave the message unformatted.
    raise ValueError('Unknown type of work item: %s' % work_item)

def do_work(self, work_item, deferred_exception_details=None):
  """Executes worker operations and adds any failures to the report status."""
  logging.info('Executing %s', work_item)
  BatchWorker.log_memory_usage_if_needed(self.worker_id, force=True)
  work_executor = executor.MapTaskExecutor()
  progress_reporter = ProgressReporter(
      work_item, work_executor, self, self.client)

  if deferred_exception_details:
    # Report (fatal) deferred exceptions that happened earlier. This
    # workflow will fail with the deferred exception.
    with work_item.lock:
      self.report_completion_status(
          work_item, progress_reporter,
          exception_details=deferred_exception_details)
      work_item.done = True
    logging.error('Not processing WorkItem %s since a deferred exception '
                  'was found: %s', work_item, deferred_exception_details)
    return

  exception_details = None
  try:
    progress_reporter.start_reporting_progress()
    work_executor.execute(work_item.map_task)
  except Exception:  # pylint: disable=broad-except
    exception_details = traceback.format_exc()
    logging.error('An exception was raised when trying to execute the '
                  'work item %s: %s', work_item, exception_details,
                  exc_info=True)
  finally:
    try:
      progress_reporter.stop_reporting_progress()
    except Exception:  # pylint: disable=broad-except
      logging.error('An exception was raised when trying to stop the '
                    'progress reporter: %s', traceback.format_exc(),
                    exc_info=True)
      # If 'exception_details' was already set, we were already going to
      # mark this work item as failed. Hence only logging this error and
      # reporting the original error.
      if exception_details is None:
        # This will be reported to the service and the work item will be
        # marked as failed.
        exception_details = traceback.format_exc()
    with work_item.lock:
      self.report_completion_status(work_item, progress_reporter,
                                    exception_details=exception_details)
      work_item.done = True

def test_create_do_with_collection_side_bigquery_write(self):
  elements = ['aa', 'bb']
  side_elements = ['x', 'y']
  output_buffer = []
  patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
  with mock.patch(target=patch_target) as mock_class:
    # Set up the reader so it will yield the values in 'side_elements'.
    reader_mock = mock_class.return_value
    reader_mock.__enter__.return_value = reader_mock
    # Use a lambda so that multiple readers can be created, each reading the
    # entirety of the side elements.
    reader_mock.__iter__.side_effect = lambda: (x for x in side_elements)

    executor.MapTaskExecutor().execute(make_map_task([
        maptask.WorkerRead(
            inmemory.InMemorySource(
                elements=[pickler.dumps(e) for e in elements],
                start_index=0,
                end_index=3),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(
                    lambda x, side: ['%s:%s' % (x, s) for s in side]),
                tag_and_type=('bigquery', pvalue.IterablePCollectionView,
                              ())),
            output_tags=['out'],
            input=(0, 0),
            side_inputs=[
                maptask.WorkerSideInputSource(
                    bigquery.BigQuerySource(
                        project='project',
                        dataset='dataset',
                        table='table',
                        coder=get_bigquery_source_coder()),
                    tag='bigquery')],
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerInMemoryWrite(
            output_buffer=output_buffer,
            input=(1, 0),
            output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as an iterable collection, so we should see
  # all of its elements paired with each main input element.
  self.assertEqual(['aa:x', 'aa:y', 'bb:x', 'bb:y'], sorted(output_buffer))

def test_in_memory_source_progress_reporting(self):
  elements = [101, 201, 301, 401, 501, 601, 701]
  output_buffer = []
  source = ProgressRequestRecordingInMemorySource(
      elements=[pickler.dumps(e) for e in elements])
  work_item = workitem.BatchWorkItem(None)
  work_item.map_task = make_map_task([
      maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                  input=(0, 0),
                                  output_coders=(self.OUTPUT_CODER,))
  ])
  executor.MapTaskExecutor(work_item.map_task).execute()
  self.assertEqual(elements, output_buffer)

  expected_progress_record = range(len(elements))
  self.assertEqual(expected_progress_record,
                   source.last_reader.progress_record)

def test_create_do_with_side_avro_file_write(self):
  input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
  input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
  elements = ['aa', 'bb']
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=2),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  lambda x, side: ['%s:%s' % (x, s) for s in side]),
              tag_and_type=('sometag', pvalue.IterablePCollectionView, ())),
          output_tags=['out'],
          input=(0, 0),
          # Note that the two side inputs have the same tag. This is quite
          # common for intermediary PCollections used as side inputs that
          # are saved as AVRO files. The files will contain the sharded
          # PCollection.
          side_inputs=[
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(file_path=input_path1,
                                        coder=coders.Base64PickleCoder()),
                  tag='sometag'),
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(file_path=input_path2,
                                        coder=coders.Base64PickleCoder()),
                  tag='sometag')],
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as an iterable collection, so we should see
  # both of its elements (one from each shard) paired with each main input
  # element.
  self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                   sorted(output_buffer))

def test_ungrouped_shuffle_read_and_write(self):
  output_path = self.create_temp_file('n/a')
  work_spec = [
      maptask.WorkerUngroupedShuffleRead(
          shuffle_reader_config='none',
          start_shuffle_position='aaa',
          end_shuffle_position='zzz',
          coder=self.SHUFFLE_CODER,
          output_coders=[self.SHUFFLE_CODER]),
      make_text_sink(output_path, input=(0, 0))
  ]
  shuffle_source_mock = mock.MagicMock()
  shuffle_source_mock.reader().__enter__().__iter__.return_value = [1, 2, 3]
  executor.MapTaskExecutor().execute(
      make_map_task(work_spec),
      test_shuffle_source=shuffle_source_mock)
  with open(output_path) as f:
    self.assertEqual('1\n2\n3\n', f.read())

def test_pgbk(self):
  elements = [('a', 1), ('b', 2), ('a', 3), ('a', 4)]
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=100),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerPartialGroupByKey(
          combine_fn=None,
          input=(0, 0),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                  input=(1, 0),
                                  output_coders=(self.OUTPUT_CODER,))
  ]))
  self.assertEqual([('a', [1, 3, 4]), ('b', [2])], sorted(output_buffer))

def test_in_memory_source_progress_reporting(self):
  elements = [101, 201, 301, 401, 501, 601, 701]
  output_buffer = []
  source = ProgressRequestRecordingInMemorySource(
      elements=[pickler.dumps(e) for e in elements])
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                  input=(0, 0),
                                  output_coders=(self.OUTPUT_CODER,))
  ]))
  self.assertEqual(elements, output_buffer)

  # Progress is reported as the fraction of elements consumed so far.
  expected_progress_record = [
      float(i + 1) / len(elements) for i in range(len(elements))]
  self.assertEqual(expected_progress_record,
                   source.last_reader.progress_record)

def test_combine(self):
  elements = [('a', [1, 2, 3]), ('b', [10])]
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=100),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerCombineFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CombineFn.from_callable(sum)),
          phase='all',
          input=(0, 0),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                  input=(1, 0),
                                  output_coders=(self.OUTPUT_CODER,))
  ]))
  self.assertEqual([('a', 6), ('b', 10)], output_buffer)

def test_create_do_with_singleton_side_bigquery_write(self):
  elements = ['abc', 'def', 'ghi']
  side_elements = ['x', 'y', 'z']
  output_buffer = []
  patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
  with mock.patch(target=patch_target) as mock_class:
    # Set up the reader so it will yield the values in 'side_elements'.
    reader_mock = mock_class.return_value
    reader_mock.__enter__.return_value = reader_mock
    reader_mock.__iter__.return_value = (x for x in side_elements)

    pickled_elements = [pickler.dumps(e) for e in elements]
    executor.MapTaskExecutor().execute(make_map_task([
        maptask.WorkerRead(
            inmemory.InMemorySource(elements=pickled_elements,
                                    start_index=0,
                                    end_index=3),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(
                    lambda x, side: ['%s:%s' % (x, side)]),
                tag_and_type=('bigquery', pvalue.SingletonPCollectionView,
                              (False, None))),
            output_tags=['out'],
            input=(0, 0),
            side_inputs=[
                maptask.WorkerSideInputSource(
                    bigquery.BigQuerySource(
                        project='project',
                        dataset='dataset',
                        table='table',
                        coder=get_bigquery_source_coder()),
                    tag='bigquery')],
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerInMemoryWrite(
            output_buffer=output_buffer,
            input=(1, 0),
            output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as a singleton, so only its first element
  # should be used.
  self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)

def test_read_do_write_with_undeclared_output(self):
  input_path = self.create_temp_file('01234567890123456789\n0123456789')
  output_path = '%s.out' % input_path
  work_item = workitem.BatchWorkItem(None)
  work_item.map_task = make_map_task([
      maptask.WorkerRead(
          fileio.TextFileSource(file_path=input_path,
                                start_offset=0,
                                end_offset=15,
                                strip_trailing_newlines=True,
                                coder=coders.StrUtf8Coder()),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              DoFnUsingWithUndeclaredSideOutput()),
          output_tags=['out'],
          output_coders=[self.OUTPUT_CODER],
          input=(0, 0),
          side_inputs=None),
      make_text_sink(output_path, input=(1, 0))
  ])
  executor.MapTaskExecutor(work_item.map_task).execute()
  with open(output_path) as f:
    self.assertEqual('01234567890123456789\n', f.read())