Example #1
0
 def process_bundle(self, request, instruction_id):
     """Process a bundle with a cached processor and build its response.

     A bundle processor is fetched from ``self.bundle_processor_cache`` for
     the descriptor id carried by ``request``. The bundle is processed inside
     the processor's state-handler scope (with the request's cache tokens)
     and inside ``self.maybe_profile``.

     Args:
       request: Request exposing ``process_bundle_descriptor_id`` and
         ``cache_tokens``.
       instruction_id: Id of the instruction being processed.

     Returns:
       A ``beam_fn_api_pb2.InstructionResponse`` carrying residual roots,
       metrics, monitoring infos (extended with the state cache metrics) and
       the finalization flag.

     Raises:
       Whatever processing raises; the processor is discarded from the cache
       before the exception propagates.
     """
     processor = self.bundle_processor_cache.get(
         instruction_id, request.process_bundle_descriptor_id)
     try:
         state_scope = processor.state_handler.process_instruction_id(
             instruction_id, request.cache_tokens)
         with state_scope, self.maybe_profile(instruction_id):
             residuals, needs_finalize = processor.process_bundle(
                 instruction_id)
             infos = processor.monitoring_infos()
             infos.extend(self.state_cache_metrics_fn())
             bundle_response = beam_fn_api_pb2.ProcessBundleResponse(
                 residual_roots=residuals,
                 metrics=processor.metrics(),
                 monitoring_infos=infos,
                 requires_finalization=needs_finalize)
             response = beam_fn_api_pb2.InstructionResponse(
                 instruction_id=instruction_id,
                 process_bundle=bundle_response)
         # The processor stays checked out while a finalization is pending.
         if not needs_finalize:
             self.bundle_processor_cache.release(instruction_id)
         return response
     except:  # pylint: disable=broad-except
         # A processor that failed must never be handed out again.
         self.bundle_processor_cache.discard(instruction_id)
         raise
Example #2
0
    def process_bundle(self, request, instruction_id):
        """Process one bundle with a freshly built BundleProcessor.

        The processor is created from the descriptor stored in ``self.fns``
        under ``request.process_bundle_descriptor_reference`` and is not kept
        after the call.

        Returns:
          An empty ``beam_fn_api_pb2.ProcessBundleResponse``.
        """
        descriptor = self.fns[request.process_bundle_descriptor_reference]
        processor = bundle_processor.BundleProcessor(
            descriptor, self.state_handler, self.data_channel_factory)
        processor.process_bundle(instruction_id)
        return beam_fn_api_pb2.ProcessBundleResponse()
Example #3
0
    def process_bundle(self, request, instruction_id):
        """Build the execution tree for a bundle and drive it to completion.

        Output operations are wired to their data-channel output streams,
        every operation is started in reverse list order, encoded elements
        read from the data channel are fed to each input operation, and
        finally every operation is finished in list order.

        Args:
          request: Request exposing ``process_bundle_descriptor_reference``,
            the key into ``self.fns``.
          instruction_id: Id used to open the data-channel streams.

        Returns:
          An empty ``beam_fn_api_pb2.ProcessBundleResponse``.
        """
        descriptor = self.fns[request.process_bundle_descriptor_reference]
        ops = self.create_execution_tree(descriptor)

        pending_inputs = []
        for op in ops:
            if isinstance(op, DataOutputOperation):
                # TODO(robertwb): Is there a better way to pass the instruction
                # id to the operation?
                stream = op.data_channel.output_stream(instruction_id, op.target)
                op.set_output_stream(stream)
            elif isinstance(op, DataInputOperation):
                # Each of these is drained below until its stream ends.
                pending_inputs.append(op)

        # Start in reverse order of the operation list.
        for op in reversed(ops):
            logging.info('start %s', op)
            op.start()

        # Feed every input operation from the data plane.
        for source in pending_inputs:
            elements = source.data_channel.input_elements(
                instruction_id, [source.target])
            for element in elements:
                # Only the payload is used; the input name is ignored.
                source.process_encoded(element.data)

        # Finish in list order.
        for op in ops:
            logging.info('finish %s', op)
            op.finish()

        return beam_fn_api_pb2.ProcessBundleResponse()
    def process_bundle(
        self,
        inputs,  # type: Mapping[str, execution.PartitionableBuffer]
        expected_outputs,  # type: DataOutput
        fired_timers,  # type: Mapping[Tuple[str, str], execution.PartitionableBuffer]
        expected_output_timers,  # type: Dict[Tuple[str, str], str]
        dry_run=False,
    ):
        # type: (...) -> BundleProcessResult
        """Partition the inputs over the workers and process them in parallel.

        Each input buffer is split into ``self._num_workers`` parts. One
        ``BundleManager`` per part is run through the shared thread pool and
        the per-part results are folded into a single response.

        Args:
          inputs: Input buffers keyed by name; each is partitioned.
          expected_outputs: Forwarded unchanged to every BundleManager.
          fired_timers: Timers handed to the first worker only.
          expected_output_timers: Forwarded unchanged to every BundleManager.
          dry_run: Forwarded unchanged to every BundleManager.

        Returns:
          A ``(merged_response, split_results)`` tuple. When more than one
          part is processed, the merged response holds only the consolidated
          monitoring infos and the first truthy error.
        """
        worker_count = self._num_workers
        per_worker_inputs = [
            {} for _ in range(worker_count)
        ]  # type: List[Dict[str, List[bytes]]]
        # Timers are only executed on the first worker
        # TODO(BEAM-9741): Split timers to multiple workers
        per_worker_timers = [
            {} if worker_ix else fired_timers
            for worker_ix in range(worker_count)
        ]
        for input_name, source_buffer in inputs.items():
            parts = source_buffer.partition(worker_count)
            for worker_ix, chunk in enumerate(parts):
                per_worker_inputs[worker_ix][input_name] = chunk

        combined = None  # type: Optional[beam_fn_api_pb2.InstructionResponse]
        all_splits = [
        ]  # type: List[beam_fn_api_pb2.ProcessBundleSplitResponse]

        def execute(worker_args):
            # type: (...) -> BundleProcessResult
            worker_inputs, worker_timers = worker_args
            manager = BundleManager(
                self.bundle_context_manager,
                self._progress_frequency,
                cache_token_generator=self._cache_token_generator)
            return manager.process_bundle(
                worker_inputs, expected_outputs, worker_timers,
                expected_output_timers, dry_run)

        with thread_pool_executor.shared_unbounded_instance() as executor:
            work_items = zip(  # pylint: disable=zip-builtin-not-iterating
                per_worker_inputs, per_worker_timers)
            for result, splits in executor.map(execute, work_items):
                all_splits.extend(splits)
                if combined is None:
                    combined = result
                    continue
                # Fold this part into the running response.
                merged_infos = monitoring_infos.consolidate(
                    itertools.chain(
                        result.process_bundle.monitoring_infos,
                        combined.process_bundle.monitoring_infos))
                combined = beam_fn_api_pb2.InstructionResponse(
                    process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
                        monitoring_infos=merged_infos),
                    error=result.error or combined.error)
        assert combined is not None
        return combined, all_splits
Example #5
0
 def process_bundle(self, request, instruction_id):
     """Process a bundle inside the ``get_bundle_processor`` context.

     Returns:
       A ``beam_fn_api_pb2.InstructionResponse`` for ``instruction_id``
       carrying the processor's metrics and monitoring infos.
     """
     processor_scope = self.get_bundle_processor(
         instruction_id, request.process_bundle_descriptor_reference)
     with processor_scope as processor:
         processor.process_bundle(instruction_id)
         bundle_response = beam_fn_api_pb2.ProcessBundleResponse(
             metrics=processor.metrics(),
             monitoring_infos=processor.monitoring_infos())
         # The response is built before the processor context is exited.
         return beam_fn_api_pb2.InstructionResponse(
             instruction_id=instruction_id, process_bundle=bundle_response)
 def merge_results(last_result):
     """Fold ``last_result`` into the enclosing scope's ``final_result``.

     Returns ``last_result`` unchanged while ``final_result`` is still None.
     Otherwise returns a new response holding the consolidated monitoring
     infos of both results and the first truthy error, with
     ``final_result`` taking precedence.
     """
     if final_result is None:
         return last_result
     combined_infos = monitoring_infos.consolidate(
         itertools.chain(
             final_result.process_bundle.monitoring_infos,
             last_result.process_bundle.monitoring_infos))
     return beam_fn_api_pb2.InstructionResponse(
         process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
             monitoring_infos=combined_infos),
         error=final_result.error or last_result.error)
Example #7
0
 def process_bundle(self, request, instruction_id):
     """Process a bundle under ``maybe_profile`` and report its results.

     Returns:
       A ``beam_fn_api_pb2.InstructionResponse`` for ``instruction_id``
       carrying the residual roots returned by the processor, its metrics
       and its monitoring infos.
     """
     processor_scope = self.get_bundle_processor(
         instruction_id, request.process_bundle_descriptor_reference)
     with processor_scope as processor:
         # Only the processing itself runs under the profiler scope.
         with self.maybe_profile(instruction_id):
             residuals = processor.process_bundle(instruction_id)
         bundle_response = beam_fn_api_pb2.ProcessBundleResponse(
             residual_roots=residuals,
             metrics=processor.metrics(),
             monitoring_infos=processor.monitoring_infos())
         return beam_fn_api_pb2.InstructionResponse(
             instruction_id=instruction_id, process_bundle=bundle_response)
Example #8
0
  def process_bundle(self, request, instruction_id):
    """Process a bundle, tracking its processor while it runs.

    The processor is stored in ``self.bundle_processors`` under
    ``instruction_id`` for the duration of processing and removed afterwards,
    whether or not processing succeeded.

    Returns:
      A ``beam_fn_api_pb2.InstructionResponse`` for ``instruction_id``
      carrying the processor's metrics.
    """
    descriptor = self.fns[request.process_bundle_descriptor_reference]
    processor = bundle_processor.BundleProcessor(
        descriptor, self.state_handler, self.data_channel_factory)
    self.bundle_processors[instruction_id] = processor
    try:
      processor.process_bundle(instruction_id)
    finally:
      del self.bundle_processors[instruction_id]

    bundle_response = beam_fn_api_pb2.ProcessBundleResponse(
        metrics=processor.metrics())
    return beam_fn_api_pb2.InstructionResponse(
        instruction_id=instruction_id, process_bundle=bundle_response)
Example #9
0
  def process_bundle(self,
                     inputs,  # type: Mapping[str, PartitionableBuffer]
                     expected_outputs,  # type: DataOutput
                     fired_timers,  # type: Mapping[Tuple[str, str], PartitionableBuffer]
                     expected_output_timers  # type: Dict[Tuple[str, str], str]
                     ):
    # type: (...) -> BundleProcessResult
    """Partition the inputs over the workers and process them in parallel.

    Each input buffer is split into ``self._num_workers`` parts. One
    ``BundleManager`` per part is run on an ``UnboundedThreadPoolExecutor``
    and the per-part results are folded into a single response.

    Args:
      inputs: Input buffers keyed by name; each is partitioned.
      expected_outputs: Forwarded unchanged to every BundleManager.
      fired_timers: Forwarded unchanged to every BundleManager.
      expected_output_timers: Forwarded unchanged to every BundleManager.

    Returns:
      A ``(merged_response, split_results)`` tuple. When more than one part
      is processed, the merged response holds only the consolidated
      monitoring infos and the first truthy error.
    """
    worker_count = self._num_workers
    per_worker_inputs = [
        {} for _ in range(worker_count)
    ]  # type: List[Dict[str, List[bytes]]]
    for input_name, source_buffer in inputs.items():
      parts = source_buffer.partition(worker_count)
      for worker_ix, chunk in enumerate(parts):
        per_worker_inputs[worker_ix][input_name] = chunk

    combined = None  # type: Optional[beam_fn_api_pb2.InstructionResponse]
    all_splits = [
    ]  # type: List[beam_fn_api_pb2.ProcessBundleSplitResponse]

    def execute(worker_inputs):
      # type: (...) -> BundleProcessResult
      manager = BundleManager(
          self._worker_handler_list,
          self._get_buffer,
          self._get_input_coder_impl,
          self._bundle_descriptor,
          self._progress_frequency,
          self._registered,
          cache_token_generator=self._cache_token_generator)
      return manager.process_bundle(
          worker_inputs, expected_outputs, fired_timers,
          expected_output_timers)

    with UnboundedThreadPoolExecutor() as executor:
      for result, splits in executor.map(execute, per_worker_inputs):
        all_splits.extend(splits)
        if combined is None:
          combined = result
          continue
        # Fold this part into the running response.
        merged_infos = monitoring_infos.consolidate(
            itertools.chain(
                result.process_bundle.monitoring_infos,
                combined.process_bundle.monitoring_infos))
        combined = beam_fn_api_pb2.InstructionResponse(
            process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
                monitoring_infos=merged_infos),
            error=result.error or combined.error)
    assert combined is not None

    return combined, all_splits
Example #10
0
    def process_bundle(self, request, instruction_id):
        """Process a bundle with a state handler created for its descriptor.

        The state handler is obtained from ``self.state_handler_factory`` for
        the descriptor's state API service. The processor is stored in
        ``self.bundle_processors`` under ``instruction_id`` while processing
        runs and removed afterwards, whether or not processing succeeded.

        Returns:
          A ``beam_fn_api_pb2.InstructionResponse`` for ``instruction_id``
          carrying the processor's metrics and monitoring infos.
        """
        descriptor = self.fns[request.process_bundle_descriptor_reference]
        handler = self.state_handler_factory.create_state_handler(
            descriptor.state_api_service_descriptor)
        processor = bundle_processor.BundleProcessor(
            descriptor, handler, self.data_channel_factory)
        self.bundle_processors[instruction_id] = processor
        try:
            with handler.process_instruction_id(instruction_id):
                processor.process_bundle(instruction_id)
        finally:
            del self.bundle_processors[instruction_id]

        bundle_response = beam_fn_api_pb2.ProcessBundleResponse(
            metrics=processor.metrics(),
            monitoring_infos=processor.monitoring_infos())
        return beam_fn_api_pb2.InstructionResponse(
            instruction_id=instruction_id, process_bundle=bundle_response)
Example #11
0
 def process_bundle(self, request, instruction_id):
     """Process a bundle with a cached processor and build its response.

     A bundle processor is fetched from ``self.bundle_processor_cache``, the
     bundle is processed inside the processor's state-handler scope and
     inside ``self.maybe_profile``, and the processor is released back to
     the cache on success.

     Returns:
       A ``beam_fn_api_pb2.InstructionResponse`` carrying residual roots,
       metrics and monitoring infos.

     Raises:
       Whatever processing raises; the processor is discarded from the cache
       before the exception propagates.
     """
     processor = self.bundle_processor_cache.get(
         instruction_id, request.process_bundle_descriptor_reference)
     try:
         state_scope = processor.state_handler.process_instruction_id(
             instruction_id)
         with state_scope, self.maybe_profile(instruction_id):
             residuals = processor.process_bundle(instruction_id)
             bundle_response = beam_fn_api_pb2.ProcessBundleResponse(
                 residual_roots=residuals,
                 metrics=processor.metrics(),
                 monitoring_infos=processor.monitoring_infos())
             response = beam_fn_api_pb2.InstructionResponse(
                 instruction_id=instruction_id,
                 process_bundle=bundle_response)
         # TODO(boyuanz): Don't release here if finalize is needed.
         self.bundle_processor_cache.release(instruction_id)
         return response
     except:  # pylint: disable=broad-except
         # A processor that failed must never be handed out again.
         self.bundle_processor_cache.discard(instruction_id)
         raise
Example #12
0
  def _run_stage(self,
                 runner_execution_context,  # type: execution.FnApiRunnerExecutionContext
                 bundle_context_manager,  # type: execution.BundleContextManager
                ):
    # type: (...) -> beam_fn_api_pb2.InstructionResponse

    """Run an individual stage.

    The stage is first processed over its regular data inputs, then
    re-processed in a loop for as long as the previous run leaves deferred
    inputs or fired timers behind.

    Args:
      runner_execution_context (execution.FnApiRunnerExecutionContext): An
        object containing execution information for the pipeline.
      bundle_context_manager (execution.BundleContextManager): Supplies the
        stage being run together with its worker handlers, process bundle
        descriptor, buffers and input coder implementations.

    Returns:
      beam_fn_api_pb2.InstructionResponse: The response of the first run when
        no re-processing was needed. Otherwise a response holding only the
        consolidated monitoring infos of all runs and the first truthy error.
    """
    worker_handler_list = bundle_context_manager.worker_handlers
    worker_handler_manager = runner_execution_context.worker_handler_manager
    _LOGGER.info('Running %s', bundle_context_manager.stage.name)
    (data_input, data_side_input, data_output,
     expected_timer_output) = self._extract_endpoints(
         bundle_context_manager, runner_execution_context)
    worker_handler_manager.register_process_bundle_descriptor(
        bundle_context_manager.process_bundle_descriptor)

    # Store the required side inputs into state so it is accessible for the
    # worker when it runs this bundle.
    self._store_side_inputs_in_state(runner_execution_context, data_side_input)

    # Change cache token across bundle repeats
    cache_token_generator = FnApiRunner.get_cache_token_generator(static=False)

    # Called with an empty fired-timers mapping; what this helper does with
    # the bundle is not visible from this function.
    self._run_bundle_multiple_times_for_testing(
        runner_execution_context,
        bundle_context_manager,
        data_input,
        data_output, {},
        expected_timer_output,
        cache_token_generator=cache_token_generator)

    bundle_manager = ParallelBundleManager(
        worker_handler_list,
        bundle_context_manager.get_buffer,
        bundle_context_manager.get_input_coder_impl,
        bundle_context_manager.process_bundle_descriptor,
        self._progress_frequency,
        num_workers=self._num_workers,
        cache_token_generator=cache_token_generator)

    # For the first time of processing, we don't have fired timers as inputs.
    result, splits = bundle_manager.process_bundle(data_input,
                                                   data_output,
                                                   {},
                                                   expected_timer_output)

    # `result` accumulates across iterations; `last_result` / `last_sent`
    # always describe the most recent run only.
    last_result = result
    last_sent = data_input

    # We cannot split deferred_input until we include residual_roots to
    # merged results. Without residual_roots, pipeline stops earlier and we
    # may miss some data.
    # We also don't partition fired timer inputs for the same reason.
    # NOTE: this overwrites the manager's private worker count for every
    # following process_bundle call on it.
    bundle_manager._num_workers = 1
    while True:
      deferred_inputs = {}  # type: Dict[str, PartitionableBuffer]
      fired_timers = {}

      # Fills `fired_timers` in place.
      self._collect_written_timers_and_add_to_fired_timers(
          bundle_context_manager, fired_timers)
      # Queue any process-initiated delayed bundle applications.
      for delayed_application in last_result.process_bundle.residual_roots:
        name = bundle_context_manager.input_for(
            delayed_application.application.transform_id,
            delayed_application.application.input_id)
        if name not in deferred_inputs:
          deferred_inputs[name] = ListBuffer(
              coder_impl=bundle_context_manager.get_input_coder_impl(name))
        deferred_inputs[name].append(delayed_application.application.element)
      # Queue any runner-initiated delayed bundle applications.
      self._add_residuals_and_channel_splits_to_deferred_inputs(
          splits, bundle_context_manager, last_sent, deferred_inputs)

      if deferred_inputs or fired_timers:
        # The worker will be waiting on these inputs as well.
        # Every regular input without deferred data gets an empty buffer.
        for other_input in data_input:
          if other_input not in deferred_inputs:
            deferred_inputs[other_input] = ListBuffer(
                coder_impl=bundle_context_manager.get_input_coder_impl(
                    other_input))
        # TODO(robertwb): merge results
        # NOTE(review): the merge below keeps only monitoring_infos and
        # error; no other response field is carried into `result`.
        last_result, splits = bundle_manager.process_bundle(
            deferred_inputs, data_output, fired_timers, expected_timer_output)
        last_sent = deferred_inputs
        result = beam_fn_api_pb2.InstructionResponse(
            process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
                monitoring_infos=monitoring_infos.consolidate(
                    itertools.chain(
                        result.process_bundle.monitoring_infos,
                        last_result.process_bundle.monitoring_infos))),
            error=result.error or last_result.error)
      else:
        # Nothing deferred and no timers fired: the stage is complete.
        break

    return result