Example #1
def _make_name_contexts(original_names, user_names, system_names):
    # TODO(BEAM-4028): Remove method once map task relies on name contexts.
    return [
        common.DataflowNameContext(step_name, user_name, system_name)
        for step_name, user_name, system_name in zip(
            original_names, user_names, system_names)
    ]
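
To see what the helper produces, here is a minimal, self-contained sketch: common.DataflowNameContext is stood in for by a namedtuple so the snippet runs without the dataflow_worker package, and the step names are hypothetical.

from collections import namedtuple

# Stand-in for common.DataflowNameContext, for this sketch only.
DataflowNameContext = namedtuple(
    'DataflowNameContext', ['step_name', 'user_name', 'system_name'])

def _make_name_contexts(original_names, user_names, system_names):
    return [
        DataflowNameContext(step_name, user_name, system_name)
        for step_name, user_name, system_name in zip(
            original_names, user_names, system_names)
    ]

# Hypothetical names for two steps of a map task.
contexts = _make_name_contexts(
    ['s1', 's2'], ['Read', 'ParDo(MyFn)'], ['s1-sys', 's2-sys'])
print(contexts[0].user_name)  # -> Read
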
Example #2
  def execute(self):
    """Executes all the operation_specs.Worker* instructions in a map task.

    We update the map_task with the execution status, expressed as counters.

    Raises:
      RuntimeError: if we find more than one read instruction in the task spec.
      TypeError: if the spec parameter is not an instance of the recognized
        operation_specs.Worker* classes.
    """

    # operations is a list of operation_specs.Worker* instances.
    # The order of the elements is important because the inputs use
    # list indexes as references.

    for ix, spec in enumerate(self._map_task.operations):
      # This is used for logging and assigning names to counters.
      name_context = common.DataflowNameContext(
          step_name=self._map_task.original_names[ix],
          user_name=self._map_task.step_names[ix],
          system_name=self._map_task.system_names[ix])
      op = create_operation(
          name_context, spec, self._counter_factory, None,
          self._state_sampler,
          test_shuffle_source=self._test_shuffle_source,
          test_shuffle_sink=self._test_shuffle_sink)
      self._ops.append(op)

      # Add receiver operations to the appropriate producers.
      if hasattr(op.spec, 'input'):
        producer, output_index = op.spec.input
        self._ops[producer].add_receiver(op, output_index)
      # Flatten has 'inputs', not 'input'
      if hasattr(op.spec, 'inputs'):
        for producer, output_index in op.spec.inputs:
          self._ops[producer].add_receiver(op, output_index)

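    # Start operations in reverse order (consumers before producers) so
    # that every receiver is already wired up before the read operation
    # at the head of the list begins emitting elements.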
    for ix, op in reversed(list(enumerate(self._ops))):
      logging.debug('Starting op %d %s', ix, op)
      with op.scoped_metrics_container:
        op.start()
    for op in self._ops:
      with op.scoped_metrics_container:
        op.finish()
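
The index-based wiring and the reverse start order can be seen in isolation with a toy sketch; the Op class below is hypothetical and far simpler than the real Worker* operations.

class Op:
  """Toy stand-in for an operation; input_ref mimics spec.input."""

  def __init__(self, name, input_ref=None):
    self.name = name
    self.input_ref = input_ref  # (producer_index, output_index) or None
    self.receivers = []

  def add_receiver(self, op, output_index):
    self.receivers.append((op, output_index))

  def start(self):
    print('started', self.name)

# Order matters: inputs refer to producers by list index.
ops = [Op('read'), Op('pardo', input_ref=(0, 0)), Op('write', input_ref=(1, 0))]
for op in ops:
  if op.input_ref is not None:
    producer, output_index = op.input_ref
    ops[producer].add_receiver(op, output_index)

# Reverse order: 'write' and 'pardo' start before 'read' does.
for ix, op in reversed(list(enumerate(ops))):
  op.start()
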
Example #3
def create_operation(name_context,
                     spec,
                     counter_factory,
                     step_name,
                     state_sampler,
                     test_shuffle_source=None,
                     test_shuffle_sink=None,
                     is_streaming=False):
    """Create Operation object for given operation specification."""
    if not isinstance(name_context, common.NameContext):
        # TODO(BEAM-4028): Remove ad-hoc NameContext once everything has been migrated.
        name_context = common.DataflowNameContext(step_name=name_context,
                                                  user_name=step_name,
                                                  system_name=None)

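    # Note: several branches below import their operation classes lazily,
    # presumably to defer loading shuffle/native dependencies (and avoid
    # import cycles) until the matching spec type is actually seen.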
    if isinstance(spec, operation_specs.WorkerRead):
        if isinstance(spec.source, iobase.SourceBundle):
            op = ReadOperation(name_context, spec, counter_factory,
                               state_sampler)
        else:
            from dataflow_worker.native_operations import NativeReadOperation
            op = NativeReadOperation(name_context, spec, counter_factory,
                                     state_sampler)
    elif isinstance(spec, operation_specs.WorkerWrite):
        from dataflow_worker.native_operations import NativeWriteOperation
        op = NativeWriteOperation(name_context, spec, counter_factory,
                                  state_sampler)
    elif isinstance(spec, operation_specs.WorkerCombineFn):
        op = CombineOperation(name_context, spec, counter_factory,
                              state_sampler)
    elif isinstance(spec, operation_specs.WorkerPartialGroupByKey):
        op = create_pgbk_op(name_context, spec, counter_factory, state_sampler)
    elif isinstance(spec, operation_specs.WorkerDoFn):
        op = DoOperation(name_context, spec, counter_factory, state_sampler)
    elif isinstance(spec, operation_specs.WorkerGroupingShuffleRead):
        from dataflow_worker.shuffle_operations import GroupedShuffleReadOperation
        op = GroupedShuffleReadOperation(name_context,
                                         spec,
                                         counter_factory,
                                         state_sampler,
                                         shuffle_source=test_shuffle_source)
    elif isinstance(spec, operation_specs.WorkerUngroupedShuffleRead):
        from dataflow_worker.shuffle_operations import UngroupedShuffleReadOperation
        op = UngroupedShuffleReadOperation(name_context,
                                           spec,
                                           counter_factory,
                                           state_sampler,
                                           shuffle_source=test_shuffle_source)
    elif isinstance(spec, operation_specs.WorkerInMemoryWrite):
        op = InMemoryWriteOperation(name_context, spec, counter_factory,
                                    state_sampler)
    elif isinstance(spec, operation_specs.WorkerShuffleWrite):
        from dataflow_worker.shuffle_operations import ShuffleWriteOperation
        op = ShuffleWriteOperation(name_context,
                                   spec,
                                   counter_factory,
                                   state_sampler,
                                   shuffle_sink=test_shuffle_sink)
    elif isinstance(spec, operation_specs.WorkerFlatten):
        op = FlattenOperation(name_context, spec, counter_factory,
                              state_sampler)
    elif isinstance(spec, operation_specs.WorkerMergeWindows):
        from dataflow_worker.shuffle_operations import BatchGroupAlsoByWindowsOperation
        from dataflow_worker.shuffle_operations import StreamingGroupAlsoByWindowsOperation
        if is_streaming:
            op = StreamingGroupAlsoByWindowsOperation(name_context, spec,
                                                      counter_factory,
                                                      state_sampler)
        else:
            op = BatchGroupAlsoByWindowsOperation(name_context, spec,
                                                  counter_factory,
                                                  state_sampler)
    elif isinstance(spec, operation_specs.WorkerReifyTimestampAndWindows):
        from dataflow_worker.shuffle_operations import ReifyTimestampAndWindowsOperation
        op = ReifyTimestampAndWindowsOperation(name_context, spec,
                                               counter_factory, state_sampler)
    else:
        raise TypeError(
            'Expected an instance of operation_specs.Worker* class '
            'instead of %s' % (spec, ))
    return op
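
As a side note on the pattern itself, a long isinstance chain like the one above can also be written as a type-to-constructor table. The sketch below uses hypothetical toy classes and is not the dataflow_worker API.

# Hypothetical, simplified illustration of the same dispatch expressed
# as a lookup table instead of an isinstance chain.
class WorkerRead:
    pass

class WorkerWrite:
    pass

class ReadOperation:
    def __init__(self, spec):
        self.spec = spec

class WriteOperation:
    def __init__(self, spec):
        self.spec = spec

_OP_FACTORIES = {
    WorkerRead: ReadOperation,
    WorkerWrite: WriteOperation,
}

def create_operation(spec):
    # isinstance inside the loop (rather than a plain dict lookup on
    # type(spec)) preserves subclass handling, at the cost of a linear scan.
    for spec_type, op_cls in _OP_FACTORIES.items():
        if isinstance(spec, spec_type):
            return op_cls(spec)
    raise TypeError(
        'Expected an instance of operation_specs.Worker* class '
        'instead of %s' % (spec,))

print(type(create_operation(WorkerRead())).__name__)  # -> ReadOperation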