Example #1
    def scoped_state(self,
                     name_context,
                     state_name,
                     io_target=None,
                     metrics_container=None):
        """Returns a ScopedState object associated to a Step and a State.

    Args:
      name_context: common.NameContext with the step name information.
      state_name: str. The state name (e.g. 'process', 'start', 'finish').
      io_target: Optional IO target information passed through to the
        CounterName; may be None.
      metrics_container: MetricsContainer. The step's metrics container.

    Returns:
      A ScopedState that keeps the execution context and is able to switch it
      for the execution thread.
    """
        if not isinstance(name_context, common.NameContext):
            name_context = common.NameContext(name_context)

        counter_name = CounterName(state_name + '-msecs',
                                   stage_name=self._prefix,
                                   step_name=name_context.metrics_name(),
                                   io_target=io_target)
        if counter_name in self._states_by_name:
            return self._states_by_name[counter_name]
        else:
            output_counter = self._counter_factory.get_counter(
                counter_name, Counter.SUM)
            self._states_by_name[counter_name] = super(
                StateSampler, self)._scoped_state(counter_name, name_context,
                                                  output_counter,
                                                  metrics_container)
            return self._states_by_name[counter_name]
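The returned ScopedState is used as a context manager: entering it switches the sampler's tracked state for the executing thread, so time spent inside the block accumulates in the step's '<state>-msecs' counter built above. A minimal usage sketch, assuming a counter factory is already available (the step name, `element`, and `process_element` call are illustrative placeholders, not part of the example):

    sampler = StateSampler('stage-prefix', counter_factory)
    scoped_process = sampler.scoped_state('my_step', 'process')
    with scoped_process:
        # Work done here is attributed to the 'my_step' / 'process' state,
        # i.e. the 'process-msecs' counter created by scoped_state above.
        process_element(element)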
Example #2
    def __init__(
            self,
            name_context,  # type: Union[str, common.NameContext]
            spec,
            counter_factory,
            state_sampler  # type: StateSampler
    ):
        """Initializes a worker operation instance.

    Args:
      name_context: A NameContext instance or string (deprecated), with the
        name information for this operation.
      spec: An operation_specs.Worker* instance.
      counter_factory: The CounterFactory to use for our counters.
      state_sampler: The StateSampler for the current operation.
    """
        if isinstance(name_context, common.NameContext):
            # TODO(BEAM-4028): Clean this up once it's completely migrated.
            # We use the specific operation name that is used for metrics and state
            # sampling.
            self.name_context = name_context
        else:
            self.name_context = common.NameContext(name_context)

        self.spec = spec
        self.counter_factory = counter_factory
        self.execution_context = None  # type: Optional[ExecutionContext]
        self.consumers = collections.defaultdict(
            list)  # type: DefaultDict[int, List[Operation]]

        # These are overwritten in the legacy harness.
        self.metrics_container = MetricsContainer(
            self.name_context.metrics_name())

        self.state_sampler = state_sampler
        self.scoped_start_state = self.state_sampler.scoped_state(
            self.name_context,
            'start',
            metrics_container=self.metrics_container)
        self.scoped_process_state = self.state_sampler.scoped_state(
            self.name_context,
            'process',
            metrics_container=self.metrics_container)
        self.scoped_finish_state = self.state_sampler.scoped_state(
            self.name_context,
            'finish',
            metrics_container=self.metrics_container)
        # TODO(ccy): add the '-abort' state once abort is supported in
        # Operations.
        self.receivers = []  # type: List[ConsumerSet]
        # Legacy workers cannot call setup() until after setting additional state
        # on the operation.
        self.setup_done = False
        self.step_name = None  # type: Optional[str]
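Operation subclasses typically wrap their lifecycle methods in these scoped states so that execution time is attributed to the right step and state. A minimal sketch of that pattern in a hypothetical subclass (not taken from the source):

    class MyOperation(Operation):
        def start(self):
            with self.scoped_start_state:
                # One-time initialization is sampled under the 'start' state.
                super(MyOperation, self).start()

        def process(self, o):
            with self.scoped_process_state:
                # Per-element work is sampled under the 'process' state.
                self.output(o)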
Example #3
def _create_user_defined_function_operation(factory, transform_proto,
                                            consumers, udfs_proto,
                                            beam_operation_cls,
                                            internal_operation_cls):
    output_tags = list(transform_proto.outputs.keys())
    output_coders = factory.get_output_coders(transform_proto)
    spec = operation_specs.WorkerDoFn(
        serialized_fn=udfs_proto,
        output_tags=output_tags,
        input=None,
        side_inputs=None,
        output_coders=[output_coders[tag] for tag in output_tags])
    name = common.NameContext(transform_proto.unique_name)

    serialized_fn = spec.serialized_fn
    if hasattr(serialized_fn, "key_type"):
        # keyed operation, need to create the KeyedStateBackend.
        row_schema = serialized_fn.key_type.row_schema
        key_row_coder = FlattenRowCoder(
            [from_proto(f.type) for f in row_schema.fields])
        if serialized_fn.HasField('group_window'):
            if serialized_fn.group_window.is_time_window:
                window_coder = TimeWindowCoder()
            else:
                window_coder = CountWindowCoder()
        else:
            window_coder = None
        keyed_state_backend = RemoteKeyedStateBackend(
            factory.state_handler, key_row_coder, window_coder,
            serialized_fn.state_cache_size,
            serialized_fn.map_state_read_cache_size,
            serialized_fn.map_state_write_cache_size)

        return beam_operation_cls(name, spec, factory.counter_factory,
                                  factory.state_sampler, consumers,
                                  internal_operation_cls, keyed_state_backend)
    elif internal_operation_cls == datastream_operations.StatefulOperation:
        key_row_coder = from_type_info_proto(serialized_fn.key_type_info)
        keyed_state_backend = RemoteKeyedStateBackend(
            factory.state_handler, key_row_coder, None,
            serialized_fn.state_cache_size,
            serialized_fn.map_state_read_cache_size,
            serialized_fn.map_state_write_cache_size)
        return beam_operation_cls(name, spec, factory.counter_factory,
                                  factory.state_sampler, consumers,
                                  internal_operation_cls, keyed_state_backend)
    else:
        return beam_operation_cls(name, spec, factory.counter_factory,
                                  factory.state_sampler, consumers,
                                  internal_operation_cls)
Example #4
    def __init__(self, name_context, spec, counter_factory, state_sampler):
        """Initializes a worker operation instance.

    Args:
      name_context: A NameContext instance or string (deprecated), with the
        name information for this operation.
      spec: An operation_specs.Worker* instance.
      counter_factory: The CounterFactory to use for our counters.
      state_sampler: The StateSampler for the current operation.
    """
        if isinstance(name_context, common.NameContext):
            # TODO(BEAM-4028): Clean this up once it's completely migrated.
            # We use the specific operation name that is used for metrics and state
            # sampling.
            self.name_context = name_context
        else:
            self.name_context = common.NameContext(name_context)

        # TODO(BEAM-4028): Remove following two lines. Rely on name context.
        self.operation_name = self.name_context.step_name
        self.step_name = self.name_context.logging_name()

        self.spec = spec
        self.counter_factory = counter_factory
        self.consumers = collections.defaultdict(list)

        # These are overwritten in the legacy harness.
        self.metrics_container = MetricsContainer(
            self.name_context.metrics_name())
        # TODO(BEAM-4094): Remove ScopedMetricsContainer after Dataflow no longer
        # depends on it.
        self.scoped_metrics_container = ScopedMetricsContainer()

        self.state_sampler = state_sampler
        self.scoped_start_state = self.state_sampler.scoped_state(
            self.name_context.metrics_name(),
            'start',
            metrics_container=self.metrics_container)
        self.scoped_process_state = self.state_sampler.scoped_state(
            self.name_context.metrics_name(),
            'process',
            metrics_container=self.metrics_container)
        self.scoped_finish_state = self.state_sampler.scoped_state(
            self.name_context.metrics_name(),
            'finish',
            metrics_container=self.metrics_container)
        # TODO(ccy): add the '-abort' state once abort is supported in
        # Operations.
        self.receivers = []
Example #5
def create_operation(name_context,
                     spec,
                     counter_factory,
                     step_name=None,
                     state_sampler=None,
                     test_shuffle_source=None,
                     test_shuffle_sink=None,
                     is_streaming=False):
    # type: (...) -> Operation
    """Create Operation object for given operation specification."""

    # TODO(pabloem): Document arguments to this function call.
    if not isinstance(name_context, common.NameContext):
        name_context = common.NameContext(step_name=name_context)

    if isinstance(spec, operation_specs.WorkerRead):
        if isinstance(spec.source, iobase.SourceBundle):
            op = ReadOperation(name_context, spec, counter_factory,
                               state_sampler)  # type: Operation
        else:
            from dataflow_worker.native_operations import NativeReadOperation
            op = NativeReadOperation(name_context, spec, counter_factory,
                                     state_sampler)
    elif isinstance(spec, operation_specs.WorkerWrite):
        from dataflow_worker.native_operations import NativeWriteOperation
        op = NativeWriteOperation(name_context, spec, counter_factory,
                                  state_sampler)
    elif isinstance(spec, operation_specs.WorkerCombineFn):
        op = CombineOperation(name_context, spec, counter_factory,
                              state_sampler)
    elif isinstance(spec, operation_specs.WorkerPartialGroupByKey):
        op = create_pgbk_op(name_context, spec, counter_factory, state_sampler)
    elif isinstance(spec, operation_specs.WorkerDoFn):
        op = DoOperation(name_context, spec, counter_factory, state_sampler)
    elif isinstance(spec, operation_specs.WorkerGroupingShuffleRead):
        from dataflow_worker.shuffle_operations import GroupedShuffleReadOperation
        op = GroupedShuffleReadOperation(name_context,
                                         spec,
                                         counter_factory,
                                         state_sampler,
                                         shuffle_source=test_shuffle_source)
    elif isinstance(spec, operation_specs.WorkerUngroupedShuffleRead):
        from dataflow_worker.shuffle_operations import UngroupedShuffleReadOperation
        op = UngroupedShuffleReadOperation(name_context,
                                           spec,
                                           counter_factory,
                                           state_sampler,
                                           shuffle_source=test_shuffle_source)
    elif isinstance(spec, operation_specs.WorkerInMemoryWrite):
        op = InMemoryWriteOperation(name_context, spec, counter_factory,
                                    state_sampler)
    elif isinstance(spec, operation_specs.WorkerShuffleWrite):
        from dataflow_worker.shuffle_operations import ShuffleWriteOperation
        op = ShuffleWriteOperation(name_context,
                                   spec,
                                   counter_factory,
                                   state_sampler,
                                   shuffle_sink=test_shuffle_sink)
    elif isinstance(spec, operation_specs.WorkerFlatten):
        op = FlattenOperation(name_context, spec, counter_factory,
                              state_sampler)
    elif isinstance(spec, operation_specs.WorkerMergeWindows):
        from dataflow_worker.shuffle_operations import BatchGroupAlsoByWindowsOperation
        from dataflow_worker.shuffle_operations import StreamingGroupAlsoByWindowsOperation
        if is_streaming:
            op = StreamingGroupAlsoByWindowsOperation(name_context, spec,
                                                      counter_factory,
                                                      state_sampler)
        else:
            op = BatchGroupAlsoByWindowsOperation(name_context, spec,
                                                  counter_factory,
                                                  state_sampler)
    elif isinstance(spec, operation_specs.WorkerReifyTimestampAndWindows):
        from dataflow_worker.shuffle_operations import ReifyTimestampAndWindowsOperation
        op = ReifyTimestampAndWindowsOperation(name_context, spec,
                                               counter_factory, state_sampler)
    else:
        raise TypeError(
            'Expected an instance of an operation_specs.Worker* class '
            'instead of %s' % (spec, ))
    return op
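A short sketch of how create_operation might be invoked for a DoFn step, reusing the WorkerDoFn spec shape shown in Example #3 (the serialized payload, counter factory, and state sampler are placeholders normally supplied by the worker harness, not values from the source; `coders` is assumed to be apache_beam.coders):

    spec = operation_specs.WorkerDoFn(
        serialized_fn=serialized_do_fn_payload,  # placeholder: the serialized user DoFn
        output_tags=['out'],
        input=None,
        side_inputs=None,
        output_coders=[coders.FastPrimitivesCoder()])
    op = create_operation('my_step', spec, my_counter_factory,
                          state_sampler=my_state_sampler)
    # The harness then drives op.start(), op.process(...), and op.finish(),
    # each of which runs under the scoped states set up in Operation.__init__.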