def __iter__(self):
   output_stream = create_OutputStream()
   for encoded_key, values in self._table.items():
     key = self._key_coder.decode(encoded_key)
     self._post_grouped_coder.get_impl().encode_to_stream(
         GlobalWindows.windowed_value((key, values)), output_stream, True)
   return iter([output_stream.get()])
 def items(self):
   value_coder_impl = self._value_coder.get_impl()
   for window, values in self._values_by_window.items():
     encoded_window = self._window_coder.encode(window)
     output_stream = create_OutputStream()
     for value in values:
       value_coder_impl.encode_to_stream(value, output_stream, True)
     yield encoded_window, output_stream.get()
 def __iter__(self):
   output_stream = create_OutputStream()
   trigger_driver = trigger.create_trigger_driver(self._windowing, True)
   for encoded_key, windowed_values in self._table.items():
     key = self._key_coder.decode(encoded_key)
     for wkvs in trigger_driver.process_entire_key(key, windowed_values):
       self._post_grouped_coder.get_impl().encode_to_stream(
           wkvs, output_stream, True)
   return iter([output_stream.get()])
 def _commit(self):
   if self._cleared:
     self._state_handler.blocking_clear(self._state_key)
   if self._added_elements:
     value_coder_impl = self._value_coder.get_impl()
     out = coder_impl.create_OutputStream()
     for element in self._added_elements:
       value_coder_impl.encode_to_stream(element, out, True)
     self._state_handler.blocking_append(self._state_key, out.get())
Exemple #5
0
 def iterable_state_write(values, element_coder_impl):
   token = unique_name(None, 'iter').encode('ascii')
   out = create_OutputStream()
   for element in values:
     element_coder_impl.encode_to_stream(element, out, True)
   controller.state.blocking_append(
       beam_fn_api_pb2.StateKey(
           runner=beam_fn_api_pb2.StateKey.Runner(key=token)),
       out.get())
   return token
Exemple #6
0
 def encoded_items(self):
   value_coder_impl = self._value_coder.get_impl()
   key_coder_impl = self._key_coder.get_impl()
   for (key, window), values in self._values_by_window.items():
     encoded_window = self._window_coder.encode(window)
     encoded_key = key_coder_impl.encode_nested(key)
     output_stream = create_OutputStream()
     for value in values:
       value_coder_impl.encode_to_stream(value, output_stream, True)
     yield encoded_key, encoded_window, output_stream.get()
 def __iter__(self):
   output_stream = create_OutputStream()
   if self._windowing.is_default():
     globally_window = GlobalWindows.windowed_value(None).with_value
     windowed_key_values = lambda key, values: [globally_window((key, values))]
   else:
     trigger_driver = trigger.create_trigger_driver(self._windowing, True)
     windowed_key_values = trigger_driver.process_entire_key
   coder_impl = self._post_grouped_coder.get_impl()
   key_coder_impl = self._key_coder.get_impl()
   for encoded_key, windowed_values in self._table.items():
     key = key_coder_impl.decode(encoded_key)
     for wkvs in windowed_key_values(key, windowed_values):
       coder_impl.encode_to_stream(wkvs, output_stream, True)
   return iter([output_stream.get()])
Exemple #8
0
 def __init__(self, flatten_row_coder):
     self._flatten_row_coder = flatten_row_coder
     self._data_out_stream = create_OutputStream()
Exemple #9
0
from apache_beam.runners.worker.channel_factory import GRPCChannelFactory
from apache_beam.runners.worker.worker_id_interceptor import WorkerIdInterceptor

if TYPE_CHECKING:
    # TODO(BEAM-9372): move this out of the TYPE_CHECKING scope when we drop
    #  support for python < 3.5.3
    from types import TracebackType
    ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType]
    OptExcInfo = Union[ExcInfo, Tuple[None, None, None]]
    # TODO: move this out of the TYPE_CHECKING scope when we drop support for
    #  python < 3.6
    from typing import Collection  # pylint: disable=ungrouped-imports
    import apache_beam.coders.slow_stream
    OutputStream = apache_beam.coders.slow_stream.OutputStream
else:
    OutputStream = type(coder_impl.create_OutputStream())

# This module is experimental. No backwards-compatibility guarantees.

_LOGGER = logging.getLogger(__name__)

_DEFAULT_SIZE_FLUSH_THRESHOLD = 10 << 20  # 10MB
_DEFAULT_TIME_FLUSH_THRESHOLD_MS = 0  # disable time-based flush by default


class ClosableOutputStream(OutputStream):
    """A Outputstream for use with CoderImpls that has a close() method."""
    def __init__(self, close_callback=None):
        super(ClosableOutputStream, self).__init__()
        self._close_callback = close_callback
Exemple #10
0
def encode_nested(coder, value, nested=True):
    out = coder_impl.create_OutputStream()
    coder.get_impl().encode_to_stream(value, out, nested)
    return out.get()
Exemple #11
0
 def _reencode_elements(elements, element_coder):
   output_stream = create_OutputStream()
   for element in elements:
     element_coder.get_impl().encode_to_stream(element, output_stream, True)
   return output_stream.get()
Exemple #12
0
    def _map_task_registration(self, map_task, state_handler,
                               data_operation_spec):
        input_data = {}
        runner_sinks = {}
        transforms = []
        transform_index_to_id = {}

        # Maps coders to new coder objects and references.
        coders = {}

        def coder_id(coder):
            if coder not in coders:
                coders[coder] = beam_fn_api_pb2.Coder(
                    function_spec=sdk_worker.pack_function_spec_data(
                        json.dumps(coder.as_cloud_object()),
                        sdk_worker.PYTHON_CODER_URN,
                        id=self._next_uid()))

            return coders[coder].function_spec.id

        def output_tags(op):
            return getattr(op, 'output_tags', ['out'])

        def as_target(op_input):
            input_op_index, input_output_index = op_input
            input_op = map_task[input_op_index][1]
            return {
                'ignored_input_tag':
                beam_fn_api_pb2.Target.List(target=[
                    beam_fn_api_pb2.Target(
                        primitive_transform_reference=transform_index_to_id[
                            input_op_index],
                        name=output_tags(input_op)[input_output_index])
                ])
            }

        def outputs(op):
            return {
                tag:
                beam_fn_api_pb2.PCollection(coder_reference=coder_id(coder))
                for tag, coder in zip(output_tags(op), op.output_coders)
            }

        for op_ix, (stage_name, operation) in enumerate(map_task):
            transform_id = transform_index_to_id[op_ix] = self._next_uid()
            if isinstance(operation, operation_specs.WorkerInMemoryWrite):
                # Write this data back to the runner.
                fn = beam_fn_api_pb2.FunctionSpec(
                    urn=sdk_worker.DATA_OUTPUT_URN, id=self._next_uid())
                if data_operation_spec:
                    fn.data.Pack(data_operation_spec)
                inputs = as_target(operation.input)
                side_inputs = {}
                runner_sinks[(transform_id, 'out')] = operation

            elif isinstance(operation, operation_specs.WorkerRead):
                # A Read is either translated to a direct injection of windowed values
                # into the sdk worker, or an injection of the source object into the
                # sdk worker as data followed by an SDF that reads that source.
                if (isinstance(operation.source.source,
                               worker_runner_base.InMemorySource)
                        and isinstance(
                            operation.source.source.default_output_coder(),
                            WindowedValueCoder)):
                    output_stream = create_OutputStream()
                    element_coder = (operation.source.source.
                                     default_output_coder().get_impl())
                    # Re-encode the elements in the nested context and
                    # concatenate them together
                    for element in operation.source.source.read(None):
                        element_coder.encode_to_stream(element, output_stream,
                                                       True)
                    target_name = self._next_uid()
                    input_data[(transform_id,
                                target_name)] = output_stream.get()
                    fn = beam_fn_api_pb2.FunctionSpec(
                        urn=sdk_worker.DATA_INPUT_URN, id=self._next_uid())
                    if data_operation_spec:
                        fn.data.Pack(data_operation_spec)
                    inputs = {target_name: beam_fn_api_pb2.Target.List()}
                    side_inputs = {}
                else:
                    # Read the source object from the runner.
                    source_coder = beam.coders.DillCoder()
                    input_transform_id = self._next_uid()
                    output_stream = create_OutputStream()
                    source_coder.get_impl().encode_to_stream(
                        GlobalWindows.windowed_value(operation.source),
                        output_stream, True)
                    target_name = self._next_uid()
                    input_data[(input_transform_id,
                                target_name)] = output_stream.get()
                    input_ptransform = beam_fn_api_pb2.PrimitiveTransform(
                        id=input_transform_id,
                        function_spec=beam_fn_api_pb2.FunctionSpec(
                            urn=sdk_worker.DATA_INPUT_URN,
                            id=self._next_uid()),
                        # TODO(robertwb): Possible name collision.
                        step_name=stage_name + '/inject_source',
                        inputs={target_name: beam_fn_api_pb2.Target.List()},
                        outputs={
                            'out':
                            beam_fn_api_pb2.PCollection(
                                coder_reference=coder_id(source_coder))
                        })
                    if data_operation_spec:
                        input_ptransform.function_spec.data.Pack(
                            data_operation_spec)
                    transforms.append(input_ptransform)

                    # Read the elements out of the source.
                    fn = sdk_worker.pack_function_spec_data(
                        OLDE_SOURCE_SPLITTABLE_DOFN_DATA,
                        sdk_worker.PYTHON_DOFN_URN,
                        id=self._next_uid())
                    inputs = {
                        'ignored_input_tag':
                        beam_fn_api_pb2.Target.List(target=[
                            beam_fn_api_pb2.Target(
                                primitive_transform_reference=
                                input_transform_id,
                                name='out')
                        ])
                    }
                    side_inputs = {}

            elif isinstance(operation, operation_specs.WorkerDoFn):
                fn = sdk_worker.pack_function_spec_data(
                    operation.serialized_fn,
                    sdk_worker.PYTHON_DOFN_URN,
                    id=self._next_uid())
                inputs = as_target(operation.input)
                # Store the contents of each side input for state access.
                for si in operation.side_inputs:
                    assert isinstance(si.source, iobase.BoundedSource)
                    element_coder = si.source.default_output_coder()
                    view_id = self._next_uid()
                    # TODO(robertwb): Actually flesh out the ViewFn API.
                    side_inputs[si.tag] = beam_fn_api_pb2.SideInput(
                        view_fn=sdk_worker.serialize_and_pack_py_fn(
                            element_coder,
                            urn=sdk_worker.PYTHON_ITERABLE_VIEWFN_URN,
                            id=view_id))
                    # Re-encode the elements in the nested context and
                    # concatenate them together
                    output_stream = create_OutputStream()
                    for element in si.source.read(
                            si.source.get_range_tracker(None, None)):
                        element_coder.get_impl().encode_to_stream(
                            element, output_stream, True)
                    elements_data = output_stream.get()
                    state_key = beam_fn_api_pb2.StateKey(
                        function_spec_reference=view_id)
                    state_handler.Clear(state_key)
                    state_handler.Append(
                        beam_fn_api_pb2.SimpleStateAppendRequest(
                            state_key=state_key, data=[elements_data]))

            elif isinstance(operation, operation_specs.WorkerFlatten):
                fn = sdk_worker.pack_function_spec_data(
                    operation.serialized_fn,
                    sdk_worker.IDENTITY_DOFN_URN,
                    id=self._next_uid())
                inputs = {
                    'ignored_input_tag':
                    beam_fn_api_pb2.Target.List(target=[
                        beam_fn_api_pb2.Target(
                            primitive_transform_reference=
                            transform_index_to_id[input_op_index],
                            name=output_tags(map_task[input_op_index]
                                             [1])[input_output_index]) for
                        input_op_index, input_output_index in operation.inputs
                    ])
                }
                side_inputs = {}

            else:
                raise TypeError(operation)

            ptransform = beam_fn_api_pb2.PrimitiveTransform(
                id=transform_id,
                function_spec=fn,
                step_name=stage_name,
                inputs=inputs,
                side_inputs=side_inputs,
                outputs=outputs(operation))
            transforms.append(ptransform)

        process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
            id=self._next_uid(),
            coders=coders.values(),
            primitive_transform=transforms)
        return beam_fn_api_pb2.InstructionRequest(
            instruction_id=self._next_uid(),
            register=beam_fn_api_pb2.RegisterRequest(
                process_bundle_descriptor=[process_bundle_descriptor
                                           ])), runner_sinks, input_data
def _encode_gauge(coder, timestamp, value):
  timestamp_coder = coders.VarIntCoder().get_impl()
  stream = coder_impl.create_OutputStream()
  timestamp_coder.encode_to_stream(int(timestamp * 1000), stream, True)
  coder.get_impl().encode_to_stream(value, stream, True)
  return stream.get()
  def _map_task_registration(self, map_task, state_handler,
                             data_operation_spec):
    input_data = {}
    runner_sinks = {}
    transforms = []
    transform_index_to_id = {}

    # Maps coders to new coder objects and references.
    coders = {}

    def coder_id(coder):
      if coder not in coders:
        coders[coder] = beam_fn_api_pb2.Coder(
            function_spec=sdk_worker.pack_function_spec_data(
                json.dumps(coder.as_cloud_object()),
                sdk_worker.PYTHON_CODER_URN, id=self._next_uid()))

      return coders[coder].function_spec.id

    def output_tags(op):
      return getattr(op, 'output_tags', ['out'])

    def as_target(op_input):
      input_op_index, input_output_index = op_input
      input_op = map_task[input_op_index][1]
      return {
          'ignored_input_tag':
              beam_fn_api_pb2.Target.List(target=[
                  beam_fn_api_pb2.Target(
                      primitive_transform_reference=transform_index_to_id[
                          input_op_index],
                      name=output_tags(input_op)[input_output_index])
              ])
      }

    def outputs(op):
      return {
          tag: beam_fn_api_pb2.PCollection(coder_reference=coder_id(coder))
          for tag, coder in zip(output_tags(op), op.output_coders)
      }

    for op_ix, (stage_name, operation) in enumerate(map_task):
      transform_id = transform_index_to_id[op_ix] = self._next_uid()
      if isinstance(operation, operation_specs.WorkerInMemoryWrite):
        # Write this data back to the runner.
        fn = beam_fn_api_pb2.FunctionSpec(urn=sdk_worker.DATA_OUTPUT_URN,
                                          id=self._next_uid())
        if data_operation_spec:
          fn.data.Pack(data_operation_spec)
        inputs = as_target(operation.input)
        side_inputs = {}
        runner_sinks[(transform_id, 'out')] = operation

      elif isinstance(operation, operation_specs.WorkerRead):
        # A Read is either translated to a direct injection of windowed values
        # into the sdk worker, or an injection of the source object into the
        # sdk worker as data followed by an SDF that reads that source.
        if (isinstance(operation.source.source,
                       maptask_executor_runner.InMemorySource)
            and isinstance(operation.source.source.default_output_coder(),
                           WindowedValueCoder)):
          output_stream = create_OutputStream()
          element_coder = (
              operation.source.source.default_output_coder().get_impl())
          # Re-encode the elements in the nested context and
          # concatenate them together
          for element in operation.source.source.read(None):
            element_coder.encode_to_stream(element, output_stream, True)
          target_name = self._next_uid()
          input_data[(transform_id, target_name)] = output_stream.get()
          fn = beam_fn_api_pb2.FunctionSpec(urn=sdk_worker.DATA_INPUT_URN,
                                            id=self._next_uid())
          if data_operation_spec:
            fn.data.Pack(data_operation_spec)
          inputs = {target_name: beam_fn_api_pb2.Target.List()}
          side_inputs = {}
        else:
          # Read the source object from the runner.
          source_coder = beam.coders.DillCoder()
          input_transform_id = self._next_uid()
          output_stream = create_OutputStream()
          source_coder.get_impl().encode_to_stream(
              GlobalWindows.windowed_value(operation.source),
              output_stream,
              True)
          target_name = self._next_uid()
          input_data[(input_transform_id, target_name)] = output_stream.get()
          input_ptransform = beam_fn_api_pb2.PrimitiveTransform(
              id=input_transform_id,
              function_spec=beam_fn_api_pb2.FunctionSpec(
                  urn=sdk_worker.DATA_INPUT_URN,
                  id=self._next_uid()),
              # TODO(robertwb): Possible name collision.
              step_name=stage_name + '/inject_source',
              inputs={target_name: beam_fn_api_pb2.Target.List()},
              outputs={
                  'out':
                      beam_fn_api_pb2.PCollection(
                          coder_reference=coder_id(source_coder))
              })
          if data_operation_spec:
            input_ptransform.function_spec.data.Pack(data_operation_spec)
          transforms.append(input_ptransform)

          # Read the elements out of the source.
          fn = sdk_worker.pack_function_spec_data(
              OLDE_SOURCE_SPLITTABLE_DOFN_DATA,
              sdk_worker.PYTHON_DOFN_URN,
              id=self._next_uid())
          inputs = {
              'ignored_input_tag':
                  beam_fn_api_pb2.Target.List(target=[
                      beam_fn_api_pb2.Target(
                          primitive_transform_reference=input_transform_id,
                          name='out')
                  ])
          }
          side_inputs = {}

      elif isinstance(operation, operation_specs.WorkerDoFn):
        fn = sdk_worker.pack_function_spec_data(
            operation.serialized_fn,
            sdk_worker.PYTHON_DOFN_URN,
            id=self._next_uid())
        inputs = as_target(operation.input)
        # Store the contents of each side input for state access.
        for si in operation.side_inputs:
          assert isinstance(si.source, iobase.BoundedSource)
          element_coder = si.source.default_output_coder()
          view_id = self._next_uid()
          # TODO(robertwb): Actually flesh out the ViewFn API.
          side_inputs[si.tag] = beam_fn_api_pb2.SideInput(
              view_fn=sdk_worker.serialize_and_pack_py_fn(
                  element_coder, urn=sdk_worker.PYTHON_ITERABLE_VIEWFN_URN,
                  id=view_id))
          # Re-encode the elements in the nested context and
          # concatenate them together
          output_stream = create_OutputStream()
          for element in si.source.read(
              si.source.get_range_tracker(None, None)):
            element_coder.get_impl().encode_to_stream(
                element, output_stream, True)
          elements_data = output_stream.get()
          state_key = beam_fn_api_pb2.StateKey.MultimapSideInput(key=view_id)
          state_handler.Clear(state_key)
          state_handler.Append(state_key, elements_data)

      elif isinstance(operation, operation_specs.WorkerFlatten):
        fn = sdk_worker.pack_function_spec_data(
            operation.serialized_fn,
            sdk_worker.IDENTITY_DOFN_URN,
            id=self._next_uid())
        inputs = {
            'ignored_input_tag':
                beam_fn_api_pb2.Target.List(target=[
                    beam_fn_api_pb2.Target(
                        primitive_transform_reference=transform_index_to_id[
                            input_op_index],
                        name=output_tags(map_task[input_op_index][1])[
                            input_output_index])
                    for input_op_index, input_output_index in operation.inputs
                ])
        }
        side_inputs = {}

      else:
        raise TypeError(operation)

      ptransform = beam_fn_api_pb2.PrimitiveTransform(
          id=transform_id,
          function_spec=fn,
          step_name=stage_name,
          inputs=inputs,
          side_inputs=side_inputs,
          outputs=outputs(operation))
      transforms.append(ptransform)

    process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
        id=self._next_uid(), coders=coders.values(),
        primitive_transform=transforms)
    return beam_fn_api_pb2.InstructionRequest(
        instruction_id=self._next_uid(),
        register=beam_fn_api_pb2.RegisterRequest(
            process_bundle_descriptor=[process_bundle_descriptor
                                      ])), runner_sinks, input_data
 def _reencode_elements(elements, element_coder):
   output_stream = create_OutputStream()
   for element in elements:
     element_coder.get_impl().encode_to_stream(element, output_stream, True)
   return output_stream.get()
Exemple #16
0
  def run_stage(
      self,
      worker_handler_factory,
      pipeline_components,
      stage,
      pcoll_buffers,
      safe_coders):

    def iterable_state_write(values, element_coder_impl):
      token = unique_name(None, 'iter').encode('ascii')
      out = create_OutputStream()
      for element in values:
        element_coder_impl.encode_to_stream(element, out, True)
      controller.state.blocking_append(
          beam_fn_api_pb2.StateKey(
              runner=beam_fn_api_pb2.StateKey.Runner(key=token)),
          out.get())
      return token

    controller = worker_handler_factory(stage.environment)
    context = pipeline_context.PipelineContext(
        pipeline_components, iterable_state_write=iterable_state_write)
    data_api_service_descriptor = controller.data_api_service_descriptor()

    def extract_endpoints(stage):
      # Returns maps of transform names to PCollection identifiers.
      # Also mutates IO stages to point to the data ApiServiceDescriptor.
      data_input = {}
      data_side_input = {}
      data_output = {}
      for transform in stage.transforms:
        if transform.spec.urn in (bundle_processor.DATA_INPUT_URN,
                                  bundle_processor.DATA_OUTPUT_URN):
          pcoll_id = transform.spec.payload
          if transform.spec.urn == bundle_processor.DATA_INPUT_URN:
            target = transform.unique_name, only_element(transform.outputs)
            if pcoll_id == fn_api_runner_transforms.IMPULSE_BUFFER:
              data_input[target] = [ENCODED_IMPULSE_VALUE]
            else:
              data_input[target] = pcoll_buffers[pcoll_id]
            coder_id = pipeline_components.pcollections[
                only_element(transform.outputs.values())].coder_id
          elif transform.spec.urn == bundle_processor.DATA_OUTPUT_URN:
            target = transform.unique_name, only_element(transform.inputs)
            data_output[target] = pcoll_id
            coder_id = pipeline_components.pcollections[
                only_element(transform.inputs.values())].coder_id
          else:
            raise NotImplementedError
          data_spec = beam_fn_api_pb2.RemoteGrpcPort(coder_id=coder_id)
          if data_api_service_descriptor:
            data_spec.api_service_descriptor.url = (
                data_api_service_descriptor.url)
          transform.spec.payload = data_spec.SerializeToString()
        elif transform.spec.urn == common_urns.primitives.PAR_DO.urn:
          payload = proto_utils.parse_Bytes(
              transform.spec.payload, beam_runner_api_pb2.ParDoPayload)
          for tag, si in payload.side_inputs.items():
            data_side_input[transform.unique_name, tag] = (
                create_buffer_id(transform.inputs[tag]), si.access_pattern)
      return data_input, data_side_input, data_output

    logging.info('Running %s', stage.name)
    logging.debug('       %s', stage)
    data_input, data_side_input, data_output = extract_endpoints(stage)

    process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
        id=self._next_uid(),
        transforms={transform.unique_name: transform
                    for transform in stage.transforms},
        pcollections=dict(pipeline_components.pcollections.items()),
        coders=dict(pipeline_components.coders.items()),
        windowing_strategies=dict(
            pipeline_components.windowing_strategies.items()),
        environments=dict(pipeline_components.environments.items()))

    if controller.state_api_service_descriptor():
      process_bundle_descriptor.state_api_service_descriptor.url = (
          controller.state_api_service_descriptor().url)

    # Store the required side inputs into state.
    for (transform_id, tag), (buffer_id, si) in data_side_input.items():
      _, pcoll_id = split_buffer_id(buffer_id)
      value_coder = context.coders[safe_coders[
          pipeline_components.pcollections[pcoll_id].coder_id]]
      elements_by_window = _WindowGroupingBuffer(si, value_coder)
      for element_data in pcoll_buffers[buffer_id]:
        elements_by_window.append(element_data)
      for key, window, elements_data in elements_by_window.encoded_items():
        state_key = beam_fn_api_pb2.StateKey(
            multimap_side_input=beam_fn_api_pb2.StateKey.MultimapSideInput(
                ptransform_id=transform_id,
                side_input_id=tag,
                window=window,
                key=key))
        controller.state.blocking_append(state_key, elements_data)

    def get_buffer(buffer_id):
      kind, name = split_buffer_id(buffer_id)
      if kind in ('materialize', 'timers'):
        if buffer_id not in pcoll_buffers:
          # Just store the data chunks for replay.
          pcoll_buffers[buffer_id] = list()
      elif kind == 'group':
        # This is a grouping write, create a grouping buffer if needed.
        if buffer_id not in pcoll_buffers:
          original_gbk_transform = name
          transform_proto = pipeline_components.transforms[
              original_gbk_transform]
          input_pcoll = only_element(list(transform_proto.inputs.values()))
          output_pcoll = only_element(list(transform_proto.outputs.values()))
          pre_gbk_coder = context.coders[safe_coders[
              pipeline_components.pcollections[input_pcoll].coder_id]]
          post_gbk_coder = context.coders[safe_coders[
              pipeline_components.pcollections[output_pcoll].coder_id]]
          windowing_strategy = context.windowing_strategies[
              pipeline_components
              .pcollections[output_pcoll].windowing_strategy_id]
          pcoll_buffers[buffer_id] = _GroupingBuffer(
              pre_gbk_coder, post_gbk_coder, windowing_strategy)
      else:
        # These should be the only two identifiers we produce for now,
        # but special side input writes may go here.
        raise NotImplementedError(buffer_id)
      return pcoll_buffers[buffer_id]

    for k in range(self._bundle_repeat):
      try:
        controller.state.checkpoint()
        BundleManager(
            controller, lambda pcoll_id: [], process_bundle_descriptor,
            self._progress_frequency, k).process_bundle(data_input, data_output)
      finally:
        controller.state.restore()

    result = BundleManager(
        controller, get_buffer, process_bundle_descriptor,
        self._progress_frequency).process_bundle(data_input, data_output)

    while True:
      timer_inputs = {}
      for transform_id, timer_writes in stage.timer_pcollections:
        windowed_timer_coder_impl = context.coders[
            pipeline_components.pcollections[timer_writes].coder_id].get_impl()
        written_timers = get_buffer(
            create_buffer_id(timer_writes, kind='timers'))
        if written_timers:
          # Keep only the "last" timer set per key and window.
          timers_by_key_and_window = {}
          for elements_data in written_timers:
            input_stream = create_InputStream(elements_data)
            while input_stream.size() > 0:
              windowed_key_timer = windowed_timer_coder_impl.decode_from_stream(
                  input_stream, True)
              key, _ = windowed_key_timer.value
              # TODO: Explode and merge windows.
              assert len(windowed_key_timer.windows) == 1
              timers_by_key_and_window[
                  key, windowed_key_timer.windows[0]] = windowed_key_timer
          out = create_OutputStream()
          for windowed_key_timer in timers_by_key_and_window.values():
            windowed_timer_coder_impl.encode_to_stream(
                windowed_key_timer, out, True)
          timer_inputs[transform_id, 'out'] = [out.get()]
          written_timers[:] = []
      if timer_inputs:
        # The worker will be waiting on these inputs as well.
        for other_input in data_input:
          if other_input not in timer_inputs:
            timer_inputs[other_input] = []
        # TODO(robertwb): merge results
        BundleManager(
            controller,
            get_buffer,
            process_bundle_descriptor,
            self._progress_frequency,
            True).process_bundle(timer_inputs, data_output)
      else:
        break

    return result
Exemple #17
0
  def run_stage(
      self,
      worker_handler_factory,
      pipeline_components,
      stage,
      pcoll_buffers,
      safe_coders):

    def iterable_state_write(values, element_coder_impl):
      token = unique_name(None, 'iter').encode('ascii')
      out = create_OutputStream()
      for element in values:
        element_coder_impl.encode_to_stream(element, out, True)
      controller.state.blocking_append(
          beam_fn_api_pb2.StateKey(
              runner=beam_fn_api_pb2.StateKey.Runner(key=token)),
          out.get())
      return token

    controller = worker_handler_factory(stage.environment)
    context = pipeline_context.PipelineContext(
        pipeline_components, iterable_state_write=iterable_state_write)
    data_api_service_descriptor = controller.data_api_service_descriptor()

    def extract_endpoints(stage):
      # Returns maps of transform names to PCollection identifiers.
      # Also mutates IO stages to point to the data ApiServiceDescriptor.
      data_input = {}
      data_side_input = {}
      data_output = {}
      for transform in stage.transforms:
        if transform.spec.urn in (bundle_processor.DATA_INPUT_URN,
                                  bundle_processor.DATA_OUTPUT_URN):
          pcoll_id = transform.spec.payload
          if transform.spec.urn == bundle_processor.DATA_INPUT_URN:
            target = transform.unique_name, only_element(transform.outputs)
            if pcoll_id == fn_api_runner_transforms.IMPULSE_BUFFER:
              data_input[target] = [ENCODED_IMPULSE_VALUE]
            else:
              data_input[target] = pcoll_buffers[pcoll_id]
            coder_id = pipeline_components.pcollections[
                only_element(transform.outputs.values())].coder_id
          elif transform.spec.urn == bundle_processor.DATA_OUTPUT_URN:
            target = transform.unique_name, only_element(transform.inputs)
            data_output[target] = pcoll_id
            coder_id = pipeline_components.pcollections[
                only_element(transform.inputs.values())].coder_id
          else:
            raise NotImplementedError
          data_spec = beam_fn_api_pb2.RemoteGrpcPort(coder_id=coder_id)
          if data_api_service_descriptor:
            data_spec.api_service_descriptor.url = (
                data_api_service_descriptor.url)
          transform.spec.payload = data_spec.SerializeToString()
        elif transform.spec.urn in fn_api_runner_transforms.PAR_DO_URNS:
          payload = proto_utils.parse_Bytes(
              transform.spec.payload, beam_runner_api_pb2.ParDoPayload)
          for tag, si in payload.side_inputs.items():
            data_side_input[transform.unique_name, tag] = (
                create_buffer_id(transform.inputs[tag]), si.access_pattern)
      return data_input, data_side_input, data_output

    logging.info('Running %s', stage.name)
    logging.debug('       %s', stage)
    data_input, data_side_input, data_output = extract_endpoints(stage)

    process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
        id=self._next_uid(),
        transforms={transform.unique_name: transform
                    for transform in stage.transforms},
        pcollections=dict(pipeline_components.pcollections.items()),
        coders=dict(pipeline_components.coders.items()),
        windowing_strategies=dict(
            pipeline_components.windowing_strategies.items()),
        environments=dict(pipeline_components.environments.items()))

    if controller.state_api_service_descriptor():
      process_bundle_descriptor.state_api_service_descriptor.url = (
          controller.state_api_service_descriptor().url)

    # Store the required side inputs into state.
    for (transform_id, tag), (buffer_id, si) in data_side_input.items():
      _, pcoll_id = split_buffer_id(buffer_id)
      value_coder = context.coders[safe_coders[
          pipeline_components.pcollections[pcoll_id].coder_id]]
      elements_by_window = _WindowGroupingBuffer(si, value_coder)
      for element_data in pcoll_buffers[buffer_id]:
        elements_by_window.append(element_data)
      for key, window, elements_data in elements_by_window.encoded_items():
        state_key = beam_fn_api_pb2.StateKey(
            multimap_side_input=beam_fn_api_pb2.StateKey.MultimapSideInput(
                ptransform_id=transform_id,
                side_input_id=tag,
                window=window,
                key=key))
        controller.state.blocking_append(state_key, elements_data)

    def get_buffer(buffer_id):
      kind, name = split_buffer_id(buffer_id)
      if kind in ('materialize', 'timers'):
        if buffer_id not in pcoll_buffers:
          # Just store the data chunks for replay.
          pcoll_buffers[buffer_id] = list()
      elif kind == 'group':
        # This is a grouping write, create a grouping buffer if needed.
        if buffer_id not in pcoll_buffers:
          original_gbk_transform = name
          transform_proto = pipeline_components.transforms[
              original_gbk_transform]
          input_pcoll = only_element(list(transform_proto.inputs.values()))
          output_pcoll = only_element(list(transform_proto.outputs.values()))
          pre_gbk_coder = context.coders[safe_coders[
              pipeline_components.pcollections[input_pcoll].coder_id]]
          post_gbk_coder = context.coders[safe_coders[
              pipeline_components.pcollections[output_pcoll].coder_id]]
          windowing_strategy = context.windowing_strategies[
              pipeline_components
              .pcollections[output_pcoll].windowing_strategy_id]
          pcoll_buffers[buffer_id] = _GroupingBuffer(
              pre_gbk_coder, post_gbk_coder, windowing_strategy)
      else:
        # These should be the only two identifiers we produce for now,
        # but special side input writes may go here.
        raise NotImplementedError(buffer_id)
      return pcoll_buffers[buffer_id]

    for k in range(self._bundle_repeat):
      try:
        controller.state.checkpoint()
        BundleManager(
            controller, lambda pcoll_id: [], process_bundle_descriptor,
            self._progress_frequency, k).process_bundle(data_input, data_output)
      finally:
        controller.state.restore()

    result = BundleManager(
        controller, get_buffer, process_bundle_descriptor,
        self._progress_frequency).process_bundle(
            data_input, data_output)

    last_result = result
    while True:
      deferred_inputs = collections.defaultdict(list)
      for transform_id, timer_writes in stage.timer_pcollections:

        # Queue any set timers as new inputs.
        windowed_timer_coder_impl = context.coders[
            pipeline_components.pcollections[timer_writes].coder_id].get_impl()
        written_timers = get_buffer(
            create_buffer_id(timer_writes, kind='timers'))
        if written_timers:
          # Keep only the "last" timer set per key and window.
          timers_by_key_and_window = {}
          for elements_data in written_timers:
            input_stream = create_InputStream(elements_data)
            while input_stream.size() > 0:
              windowed_key_timer = windowed_timer_coder_impl.decode_from_stream(
                  input_stream, True)
              key, _ = windowed_key_timer.value
              # TODO: Explode and merge windows.
              assert len(windowed_key_timer.windows) == 1
              timers_by_key_and_window[
                  key, windowed_key_timer.windows[0]] = windowed_key_timer
          out = create_OutputStream()
          for windowed_key_timer in timers_by_key_and_window.values():
            windowed_timer_coder_impl.encode_to_stream(
                windowed_key_timer, out, True)
          deferred_inputs[transform_id, 'out'] = [out.get()]
          written_timers[:] = []

      # Queue any delayed bundle applications.
      for delayed_application in last_result.process_bundle.residual_roots:
        # Find the io transform that feeds this transform.
        # TODO(SDF): Memoize?
        application = delayed_application.application
        input_pcoll = process_bundle_descriptor.transforms[
            application.ptransform_id].inputs[application.input_id]
        for input_id, proto in process_bundle_descriptor.transforms.items():
          if (proto.spec.urn == bundle_processor.DATA_INPUT_URN
              and input_pcoll in proto.outputs.values()):
            deferred_inputs[input_id, 'out'].append(application.element)
            break
        else:
          raise RuntimeError(
              'No IO transform feeds %s' % application.ptransform_id)

      if deferred_inputs:
        # The worker will be waiting on these inputs as well.
        for other_input in data_input:
          if other_input not in deferred_inputs:
            deferred_inputs[other_input] = []
        # TODO(robertwb): merge results
        last_result = BundleManager(
            controller,
            get_buffer,
            process_bundle_descriptor,
            self._progress_frequency,
            True).process_bundle(deferred_inputs, data_output)
      else:
        break

    return result
def encode_nested(coder, value, nested=True):
  out = coder_impl.create_OutputStream()
  coder.get_impl().encode_to_stream(value, out, nested)
  return out.get()
Exemple #19
0
 def __init__(self, field_coder):
     self._field_coder = field_coder
     self.data_out_stream = create_OutputStream()