def _make_name_contexts(original_names, user_names, system_names):
  """Builds a DataflowNameContext for each step from parallel name lists.

  The three lists are zipped positionally, so they are expected to be of
  equal length (one entry per operation).

  Args:
    original_names: original step names, one per operation.
    user_names: user-visible step names, one per operation.
    system_names: system-assigned step names, one per operation.

  Returns:
    A list of common.DataflowNameContext, one per name triple.
  """
  # TODO(BEAM-4028): Remove method once map task relies on name contexts.
  contexts = []
  for step_name, user_name, system_name in zip(
      original_names, user_names, system_names):
    contexts.append(
        common.DataflowNameContext(step_name, user_name, system_name))
  return contexts
def execute(self): """Executes all the operation_specs.Worker* instructions in a map task. We update the map_task with the execution status, expressed as counters. Raises: RuntimeError: if we find more than on read instruction in task spec. TypeError: if the spec parameter is not an instance of the recognized operation_specs.Worker* classes. """ # operations is a list of operation_specs.Worker* instances. # The order of the elements is important because the inputs use # list indexes as references. for ix, spec in enumerate(self._map_task.operations): # This is used for logging and assigning names to counters. name_context = common.DataflowNameContext( step_name=self._map_task.original_names[ix], user_name=self._map_task.step_names[ix], system_name=self._map_task.system_names[ix]) op = create_operation( name_context, spec, self._counter_factory, None, self._state_sampler, test_shuffle_source=self._test_shuffle_source, test_shuffle_sink=self._test_shuffle_sink) self._ops.append(op) # Add receiver operations to the appropriate producers. if hasattr(op.spec, 'input'): producer, output_index = op.spec.input self._ops[producer].add_receiver(op, output_index) # Flatten has 'inputs', not 'input' if hasattr(op.spec, 'inputs'): for producer, output_index in op.spec.inputs: self._ops[producer].add_receiver(op, output_index) for ix, op in reversed(list(enumerate(self._ops))): logging.debug('Starting op %d %s', ix, op) with op.scoped_metrics_container: op.start() for op in self._ops: with op.scoped_metrics_container: op.finish()
def create_operation(name_context,
                     spec,
                     counter_factory,
                     step_name,
                     state_sampler,
                     test_shuffle_source=None,
                     test_shuffle_sink=None,
                     is_streaming=False):
  """Create Operation object for given operation specification.

  Dispatches on the concrete operation_specs.Worker* type of `spec` and
  returns the matching Operation. Shuffle- and native-IO operation classes
  are imported lazily inside their branches.

  Args:
    name_context: a common.NameContext, or (legacy) a bare step name.
    spec: an operation_specs.Worker* instance describing the operation.
    counter_factory: factory for per-operation counters.
    step_name: user step name; only consulted for the legacy name fallback.
    state_sampler: sampler used to attribute execution time to steps.
    test_shuffle_source: optional shuffle source override for tests.
    test_shuffle_sink: optional shuffle sink override for tests.
    is_streaming: selects the streaming GroupAlsoByWindows implementation.

  Returns:
    An Operation corresponding to `spec`.

  Raises:
    TypeError: if `spec` is not a recognized operation_specs.Worker* class.
  """
  if not isinstance(name_context, common.NameContext):
    # TODO(BEAM-4028): Remove ad-hoc NameContext once all has been migrated.
    name_context = common.DataflowNameContext(
        step_name=name_context, user_name=step_name, system_name=None)

  if isinstance(spec, operation_specs.WorkerRead):
    if isinstance(spec.source, iobase.SourceBundle):
      return ReadOperation(name_context, spec, counter_factory, state_sampler)
    from dataflow_worker.native_operations import NativeReadOperation
    return NativeReadOperation(
        name_context, spec, counter_factory, state_sampler)
  if isinstance(spec, operation_specs.WorkerWrite):
    from dataflow_worker.native_operations import NativeWriteOperation
    return NativeWriteOperation(
        name_context, spec, counter_factory, state_sampler)
  if isinstance(spec, operation_specs.WorkerCombineFn):
    return CombineOperation(name_context, spec, counter_factory, state_sampler)
  if isinstance(spec, operation_specs.WorkerPartialGroupByKey):
    return create_pgbk_op(name_context, spec, counter_factory, state_sampler)
  if isinstance(spec, operation_specs.WorkerDoFn):
    return DoOperation(name_context, spec, counter_factory, state_sampler)
  if isinstance(spec, operation_specs.WorkerGroupingShuffleRead):
    from dataflow_worker.shuffle_operations import GroupedShuffleReadOperation
    return GroupedShuffleReadOperation(
        name_context, spec, counter_factory, state_sampler,
        shuffle_source=test_shuffle_source)
  if isinstance(spec, operation_specs.WorkerUngroupedShuffleRead):
    from dataflow_worker.shuffle_operations import (
        UngroupedShuffleReadOperation)
    return UngroupedShuffleReadOperation(
        name_context, spec, counter_factory, state_sampler,
        shuffle_source=test_shuffle_source)
  if isinstance(spec, operation_specs.WorkerInMemoryWrite):
    return InMemoryWriteOperation(
        name_context, spec, counter_factory, state_sampler)
  if isinstance(spec, operation_specs.WorkerShuffleWrite):
    from dataflow_worker.shuffle_operations import ShuffleWriteOperation
    return ShuffleWriteOperation(
        name_context, spec, counter_factory, state_sampler,
        shuffle_sink=test_shuffle_sink)
  if isinstance(spec, operation_specs.WorkerFlatten):
    return FlattenOperation(name_context, spec, counter_factory, state_sampler)
  if isinstance(spec, operation_specs.WorkerMergeWindows):
    from dataflow_worker.shuffle_operations import (
        BatchGroupAlsoByWindowsOperation)
    from dataflow_worker.shuffle_operations import (
        StreamingGroupAlsoByWindowsOperation)
    if is_streaming:
      return StreamingGroupAlsoByWindowsOperation(
          name_context, spec, counter_factory, state_sampler)
    return BatchGroupAlsoByWindowsOperation(
        name_context, spec, counter_factory, state_sampler)
  if isinstance(spec, operation_specs.WorkerReifyTimestampAndWindows):
    from dataflow_worker.shuffle_operations import (
        ReifyTimestampAndWindowsOperation)
    return ReifyTimestampAndWindowsOperation(
        name_context, spec, counter_factory, state_sampler)
  raise TypeError(
      'Expected an instance of operation_specs.Worker* class '
      'instead of %s' % (spec, ))