Example #1
 def test_read_do_write_with_start_bundle(self):
     input_path = self.create_temp_file('01234567890123456789\n0123456789')
     output_path = '%s.out' % input_path
     finish_path = '%s.finish' % input_path
     executor.MapTaskExecutor().execute(
         make_map_task([
             maptask.WorkerRead(fileio.TextFileSource(
                 file_path=input_path,
                 start_offset=0,
                 end_offset=15,
                 strip_trailing_newlines=True,
                 coder=coders.StrUtf8Coder()),
                                output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                 DoFnUsingStartBundle(finish_path)),
                                output_tags=['out'],
                                output_coders=[self.OUTPUT_CODER],
                                input=(0, 0),
                                side_inputs=None),
             make_text_sink(output_path, input=(1, 0))
         ]))
     with open(output_path) as f:
         self.assertEqual('XYZ: 01234567890123456789\n', f.read())
     # Check that the finish_bundle method of the custom DoFn object left the
     # expected side effect by writing a file with specific content.
     with open(finish_path) as f:
         self.assertEqual('finish called.', f.read())
Example #2
 def test_create_do_write(self):
   output_path = self.create_temp_file('n/a')
   elements = ['abc', 'def', 'ghi']
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               # Start at the last element.
               start_index=2,
               # Go beyond the end to test that case is handled.
               end_index=15),
           output_coders=[coders.ToStringCoder()]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]))
   with open(output_path) as f:
     self.assertEqual('XYZ: ghi\n', f.read())
Example #3
 def test_create_do_with_side_in_memory_write(self):
   elements = ['abc', 'def', 'ghi']
   side_elements = ['x', 'y', 'z']
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=3),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, side)]),
               tag_and_type=('inmemory', pvalue.SingletonPCollectionView,
                             (False, None))),
           output_tags=['out'], input=(0, 0),
           side_inputs=[
               maptask.WorkerSideInputSource(
                   inmemory.InMemorySource(
                       elements=[pickler.dumps(e) for e in side_elements],
                       start_index=None,
                       end_index=None),
                   tag='inmemory')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(
           output_buffer=output_buffer,
           input=(1, 0),
           output_coders=(self.OUTPUT_CODER,))]))
   # The side source was specified as a singleton, therefore we should see
   # only the first element appended.
   self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)
Example #4
 def test_read_do_shuffle_write(self):
     input_path = self.create_temp_file('a\nb\nc\nd\n')
     work_spec = [
         maptask.WorkerRead(fileio.TextFileSource(
             file_path=input_path,
             start_offset=0,
             end_offset=8,
             strip_trailing_newlines=True,
             coder=coders.StrUtf8Coder()),
                            output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
             ptransform.CallableWrapperDoFn(lambda x: [(x, 1)])),
                            output_tags=['out'],
                            output_coders=[self.OUTPUT_CODER],
                            input=(0, 0),
                            side_inputs=None),
         maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                    shuffle_writer_config='none',
                                    input=(1, 0),
                                    output_coders=(self.SHUFFLE_CODER, ))
     ]
     shuffle_sink_mock = mock.MagicMock()
     executor.MapTaskExecutor().execute(make_map_task(work_spec),
                                        test_shuffle_sink=shuffle_sink_mock)
     # Make sure we have seen all the (k, v) writes.
     shuffle_sink_mock.writer().Write.assert_has_calls([
         mock.call('a', '', 1),
         mock.call('b', '', 1),
         mock.call('c', '', 1),
         mock.call('d', '', 1)
     ])
Example #5
 def test_read_do_write(self):
   input_path = self.create_temp_file('01234567890123456789\n0123456789')
   output_path = '%s.out' % input_path
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           fileio.TextFileSource(file_path=input_path,
                                 start_offset=0,
                                 end_offset=15,
                                 strip_trailing_newlines=True,
                                 coder=coders.StrUtf8Coder()),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]))
   with open(output_path) as f:
     self.assertEqual('XYZ: 01234567890123456789\n', f.read())
Example #6
    def test_create_do_avro_write(self):
        output_path = self.create_temp_file('n/a')
        elements = ['abc', 'def', 'ghi']
        work_item = workitem.BatchWorkItem(None)

        work_item.map_task = make_map_task([
            maptask.WorkerRead(
                inmemory.InMemorySource(
                    elements=[pickler.dumps(e) for e in elements],
                    start_index=2,  # Start at the last element.
                    end_index=3),
                output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=None,
                               output_coders=[self.OUTPUT_CODER]),
            make_text_sink(output_path,
                           input=(1, 0),
                           coder=coders.Base64PickleCoder())
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        with open(output_path) as f:
            self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))
Example #7
 def test_shuffle_read_do_write(self):
     output_path = self.create_temp_file('n/a')
     work_spec = [
         maptask.WorkerGroupingShuffleRead(
             shuffle_reader_config='none',
             start_shuffle_position='aaa',
             end_shuffle_position='zzz',
             coder=self.SHUFFLE_CODER,
             output_coders=[self.SHUFFLE_CODER]),
         maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
             ptransform.CallableWrapperDoFn(
                 lambda (k, vs): [str((k, v)) for v in vs])),
                            output_tags=['out'],
                            output_coders=[self.OUTPUT_CODER],
                            input=(0, 0),
                            side_inputs=None),
         make_text_sink(output_path, input=(1, 0))
     ]
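     # Simulate a grouping shuffle source whose reader yields (key, values)
     # pairs; the DoFn above flattens each group into '(key, value)' strings.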
     shuffle_source_mock = mock.MagicMock()
     shuffle_source_mock.reader().__enter__().__iter__.return_value = [
         (10, [1, 2]), (20, [3])
     ]
     executor.MapTaskExecutor().execute(
         make_map_task(work_spec), test_shuffle_source=shuffle_source_mock)
     with open(output_path) as f:
         self.assertEqual('(10, 1)\n(10, 2)\n(20, 3)\n', f.read())
Example #8
    def process_work_item(self, computation_id, map_task_proto,
                          input_data_watermark, work_item):
        """Process a work item."""
        workitem_commit_request = windmill_pb2.WorkItemCommitRequest(
            key=work_item.key, work_token=work_item.work_token)

        env = maptask.WorkerEnvironment()
        context = maptask.StreamingExecutionContext()

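        # Set up state access for this work item's key, backed by Windmill.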
        reader = windmillstate.WindmillStateReader(computation_id,
                                                   work_item.key,
                                                   work_item.work_token,
                                                   self.windmill)
        state_internals = windmillstate.WindmillStateInternals(reader)
        state = windmillstate.WindmillUnmergedState(state_internals)
        output_data_watermark = windmillio.windmill_to_harness_timestamp(
            work_item.output_data_watermark)

        context.start(computation_id, work_item, input_data_watermark,
                      output_data_watermark, workitem_commit_request,
                      self.windmill, state)

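        # Decode the map task, execute it, and persist any state changes into
        # the commit request.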
        map_task = maptask.decode_map_task(map_task_proto, env, context)
        map_task_executor = executor.MapTaskExecutor(map_task)
        map_task_executor.execute()
        state_internals.persist_to(workitem_commit_request)

        # Send result to Windmill.
        # TODO(ccy): in the future, this will not be done serially with respect to
        # work execution.
        commit_request = windmill_pb2.CommitWorkRequest()
        computation_commit_request = windmill_pb2.ComputationCommitWorkRequest(
            computation_id=computation_id, requests=[workitem_commit_request])
        commit_request.requests.extend([computation_commit_request])
        self.windmill.CommitWork(commit_request)
Example #9
 def test_create_do_with_side_text_file_write(self):
   input_path = self.create_temp_file('x\ny\n')
   elements = ['aa', 'bb']
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=2),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, s) for s in side]),
               tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
           output_tags=['out'], input=(0, 0),
           side_inputs=[
               maptask.WorkerSideInputSource(fileio.TextFileSource(
                   file_path=input_path, start_offset=None, end_offset=None,
                   strip_trailing_newlines=True,
                   coder=coders.StrUtf8Coder()),
                                             tag='textfile')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))]))
   # The side source was specified as a collection, therefore we should see
   # all elements of the side source.
   self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                    sorted(output_buffer))
Example #10
 def get_executor_for_work_item(self, work_item):
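     """Returns the executor matching the type of the given work item."""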
     if work_item.map_task is not None:
         return executor.MapTaskExecutor(work_item.map_task)
     elif work_item.source_operation_split_task is not None:
         return executor.CustomSourceSplitExecutor(
             work_item.source_operation_split_task)
     else:
         raise ValueError('Unknown type of work item: %s' % work_item)
Example #11
  def do_work(self, work_item, deferred_exception_details=None):
    """Executes worker operations and adds any failures to the report status."""
    logging.info('Executing %s', work_item)
    BatchWorker.log_memory_usage_if_needed(self.worker_id, force=True)

    work_executor = executor.MapTaskExecutor()
    progress_reporter = ProgressReporter(
        work_item, work_executor, self, self.client)

    if deferred_exception_details:
      # Report (fatal) deferred exceptions that happened earlier. This
      # workflow will fail with the deferred exception.
      with work_item.lock:
        self.report_completion_status(
            work_item,
            progress_reporter,
            exception_details=deferred_exception_details)
        work_item.done = True
        logging.error('Not processing WorkItem %s since a deferred exception '
                      'was found: %s', work_item, deferred_exception_details)
        return

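    # Execute the work item while reporting progress; any exception is
    # captured and reported to the service in the finally block below.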
    exception_details = None
    try:
      progress_reporter.start_reporting_progress()
      work_executor.execute(work_item.map_task)
    except Exception:  # pylint: disable=broad-except
      exception_details = traceback.format_exc()
      logging.error('An exception was raised when trying to execute the '
                    'work item %s : %s',
                    work_item,
                    exception_details, exc_info=True)
    finally:
      try:
        progress_reporter.stop_reporting_progress()
      except Exception:  # pylint: disable=broad-except
        logging.error('An exception was raised when trying to stop the '
                      'progress reporter : %s',
                      traceback.format_exc(), exc_info=True)
        # If 'exception_details' was already set, we were already going to
        # mark this work item as failed. Hence only logging this error and
        # reporting the original error.
        if exception_details is None:
          # This will be reported to the service and the work item will be
          # marked as failed.
          exception_details = traceback.format_exc()

      with work_item.lock:
        self.report_completion_status(work_item, progress_reporter,
                                      exception_details=exception_details)
        work_item.done = True
Example #12
    def test_create_do_with_collection_side_bigquery_write(self):
        elements = ['aa', 'bb']
        side_elements = ['x', 'y']
        output_buffer = []
        patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
        with mock.patch(target=patch_target) as mock_class:
            # Set up the reader so it will yield the values in 'side_elements'.
            reader_mock = mock_class.return_value
            reader_mock.__enter__.return_value = reader_mock
            # Use a lambda so that multiple readers can be created, each reading the
            # entirety of the side elements.
            reader_mock.__iter__.side_effect = lambda: (x
                                                        for x in side_elements)

            executor.MapTaskExecutor().execute(
                make_map_task([
                    maptask.WorkerRead(inmemory.InMemorySource(
                        elements=[pickler.dumps(e) for e in elements],
                        start_index=0,
                        end_index=3),
                                       output_coders=[self.OUTPUT_CODER]),
                    maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                        ptransform.CallableWrapperDoFn(
                            lambda x, side: ['%s:%s' % (x, s) for s in side]),
                        tag_and_type=('bigquery',
                                      pvalue.IterablePCollectionView, ())),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=[
                                   maptask.WorkerSideInputSource(
                                       bigquery.BigQuerySource(
                                           project='project',
                                           dataset='dataset',
                                           table='table',
                                           coder=get_bigquery_source_coder()),
                                       tag='bigquery')
                               ],
                               output_coders=[self.OUTPUT_CODER]),
                    maptask.WorkerInMemoryWrite(
                        output_buffer=output_buffer,
                        input=(1, 0),
                        output_coders=(self.OUTPUT_CODER, ))
                ]))
        # The side source was specified as a collection, therefore we should
        # see all elements of the side source.
        self.assertEqual(['aa:x', 'aa:y', 'bb:x', 'bb:y'],
                         sorted(output_buffer))
Example #13
    def test_in_memory_source_progress_reporting(self):
        elements = [101, 201, 301, 401, 501, 601, 701]
        output_buffer = []
        source = ProgressRequestRecordingInMemorySource(
            elements=[pickler.dumps(e) for e in elements])
        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task([
            maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                        input=(0, 0),
                                        output_coders=(self.OUTPUT_CODER, ))
        ])
        executor.MapTaskExecutor(work_item.map_task).execute()
        self.assertEqual(elements, output_buffer)

        expected_progress_record = range(len(elements))
        self.assertEqual(expected_progress_record,
                         source.last_reader.progress_record)
Example #14
 def test_create_do_with_side_avro_file_write(self):
     input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
     input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
     elements = ['aa', 'bb']
     output_buffer = []
     executor.MapTaskExecutor().execute(
         make_map_task([
             maptask.WorkerRead(inmemory.InMemorySource(
                 elements=[pickler.dumps(e) for e in elements],
                 start_index=0,
                 end_index=2),
                                output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerDoFn(
                 serialized_fn=pickle_with_side_inputs(
                     ptransform.CallableWrapperDoFn(
                         lambda x, side: ['%s:%s' % (x, s) for s in side]),
                     tag_and_type=('sometag',
                                   pvalue.IterablePCollectionView, ())),
                 output_tags=['out'],
                 input=(0, 0),
                 # Note that the two side inputs have the same tag. This is quite
                 # common for intermediary PCollections used as side inputs that
                 # are saved as AVRO files. The files will contain the sharded
                 # PCollection.
                 side_inputs=[
                     maptask.WorkerSideInputSource(fileio.TextFileSource(
                         file_path=input_path1,
                         coder=coders.Base64PickleCoder()),
                                                   tag='sometag'),
                     maptask.WorkerSideInputSource(fileio.TextFileSource(
                         file_path=input_path2,
                         coder=coders.Base64PickleCoder()),
                                                   tag='sometag')
                 ],
                 output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerInMemoryWrite(
                 output_buffer=output_buffer,
                 input=(1, 0),
                 output_coders=(self.OUTPUT_CODER, ))
         ]))
     # The side source was specified as a collection, therefore we should see
     # all elements of the side source.
     self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                      sorted(output_buffer))
Example #15
 def test_ungrouped_shuffle_read_and_write(self):
     output_path = self.create_temp_file('n/a')
     work_spec = [
         maptask.WorkerUngroupedShuffleRead(
             shuffle_reader_config='none',
             start_shuffle_position='aaa',
             end_shuffle_position='zzz',
             coder=self.SHUFFLE_CODER,
             output_coders=[self.SHUFFLE_CODER]),
         make_text_sink(output_path, input=(0, 0))
     ]
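     # Simulate an ungrouped shuffle source whose reader yields raw values.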
     shuffle_source_mock = mock.MagicMock()
     shuffle_source_mock.reader().__enter__().__iter__.return_value = [
         1, 2, 3
     ]
     executor.MapTaskExecutor().execute(
         make_map_task(work_spec), test_shuffle_source=shuffle_source_mock)
     with open(output_path) as f:
         self.assertEqual('1\n2\n3\n', f.read())
Example #16
 def test_pgbk(self):
   elements = [('a', 1), ('b', 2), ('a', 3), ('a', 4)]
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=100),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerPartialGroupByKey(
           combine_fn=None,
           input=(0, 0),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))
   ]))
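   # With no combine_fn, the partial group-by-key simply groups values per key.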
   self.assertEqual([('a', [1, 3, 4]), ('b', [2])], sorted(output_buffer))
Example #17
  def test_in_memory_source_progress_reporting(self):
    elements = [101, 201, 301, 401, 501, 601, 701]
    output_buffer = []
    source = ProgressRequestRecordingInMemorySource(
        elements=[pickler.dumps(e) for e in elements])
    executor.MapTaskExecutor().execute(make_map_task([
        maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                    input=(0, 0),
                                    output_coders=(self.OUTPUT_CODER,))
    ]))
    self.assertEqual(elements, output_buffer)

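    # The reader reports progress as the fraction of elements consumed so far.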
    expected_progress_record = []
    len_elements = len(elements)
    for i in range(len_elements):
      expected_progress_record.append(float(i + 1) / len_elements)

    self.assertEqual(expected_progress_record,
                     source.last_reader.progress_record)
Example #18
 def test_combine(self):
   elements = [('a', [1, 2, 3]), ('b', [10])]
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=100),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerCombineFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CombineFn.from_callable(sum)),
                               phase='all',
                               input=(0, 0),
                               output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))
   ]))
   self.assertEqual([('a', 6), ('b', 10)], output_buffer)
Example #19
  def test_create_do_with_singleton_side_bigquery_write(self):
    elements = ['abc', 'def', 'ghi']
    side_elements = ['x', 'y', 'z']
    output_buffer = []
    patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
    with mock.patch(target=patch_target) as mock_class:
      # Set up the reader so it will yield the values in 'side_elements'.
      reader_mock = mock_class.return_value
      reader_mock.__enter__.return_value = reader_mock
      reader_mock.__iter__.return_value = (x for x in side_elements)

      pickled_elements = [pickler.dumps(e) for e in elements]
      executor.MapTaskExecutor().execute(make_map_task([
          maptask.WorkerRead(
              inmemory.InMemorySource(elements=pickled_elements,
                                      start_index=0,
                                      end_index=3),
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerDoFn(
              serialized_fn=pickle_with_side_inputs(
                  ptransform.CallableWrapperDoFn(
                      lambda x, side: ['%s:%s' % (x, side)]),
                  tag_and_type=('bigquery', pvalue.SingletonPCollectionView,
                                (False, None))),
              output_tags=['out'], input=(0, 0),
              side_inputs=[
                  maptask.WorkerSideInputSource(
                      bigquery.BigQuerySource(
                          project='project',
                          dataset='dataset',
                          table='table',
                          coder=get_bigquery_source_coder()),
                      tag='bigquery')],
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerInMemoryWrite(
              output_buffer=output_buffer,
              input=(1, 0),
              output_coders=(self.OUTPUT_CODER,))]))
    # The side source was specified as a singleton, therefore we should see
    # only the first element appended.
    self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)
Example #20
    def test_read_do_write_with_undeclared_output(self):
        input_path = self.create_temp_file('01234567890123456789\n0123456789')
        output_path = '%s.out' % input_path
        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task([
            maptask.WorkerRead(fileio.TextFileSource(
                file_path=input_path,
                start_offset=0,
                end_offset=15,
                strip_trailing_newlines=True,
                coder=coders.StrUtf8Coder()),
                               output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                DoFnUsingWithUndeclaredSideOutput()),
                               output_tags=['out'],
                               output_coders=[self.OUTPUT_CODER],
                               input=(0, 0),
                               side_inputs=None),
            make_text_sink(output_path, input=(1, 0))
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        with open(output_path) as f:
            self.assertEqual('01234567890123456789\n', f.read())