def process_bundle(
        self,
        inputs,  # type: Mapping[str, execution.PartitionableBuffer]
        expected_outputs,  # type: DataOutput
        fired_timers,  # type: Mapping[Tuple[str, str], execution.PartitionableBuffer]
        expected_output_timers,  # type: Dict[Tuple[str, str], str]
        dry_run=False,
    ):
        # type: (...) -> BundleProcessResult
        """Process one bundle across ``self._num_workers`` partitions.

        Every input buffer is partitioned into ``self._num_workers`` parts and
        each partition is handed to its own ``BundleManager`` through the
        shared thread pool. The per-partition responses are folded into one.

        Args:
          inputs: Input buffers keyed by input name; each is partitioned.
          expected_outputs: Forwarded unchanged to every partition.
          fired_timers: Fired timers; forwarded to the first partition only.
          expected_output_timers: Forwarded unchanged to every partition.
          dry_run: Forwarded unchanged to every partition.

        Returns:
          A ``(merged_response, split_results)`` tuple. The merged response
          carries the consolidated monitoring infos of all partitions and the
          error of the most recently merged partition that reported one;
          ``split_results`` concatenates the split results of all partitions
          in partition order.
        """
        worker_count = self._num_workers

        # One input map per partition, keyed by input name.
        per_worker_inputs = [
            {} for _ in range(worker_count)
        ]  # type: List[Dict[str, List[bytes]]]
        for input_name, input_buffer in inputs.items():
            chunks = input_buffer.partition(worker_count)
            for worker_ix, chunk in enumerate(chunks):
                per_worker_inputs[worker_ix][input_name] = chunk

        # Timers are only executed on the first worker
        # TODO(BEAM-9741): Split timers to multiple workers
        per_worker_timers = [
            {} if worker_ix else fired_timers
            for worker_ix in range(worker_count)
        ]

        def run_partition(worker_args):
            # type: (...) -> BundleProcessResult
            worker_inputs, worker_timers = worker_args
            manager = BundleManager(
                self.bundle_context_manager,
                self._progress_frequency,
                cache_token_generator=self._cache_token_generator)
            return manager.process_bundle(
                worker_inputs,
                expected_outputs,
                worker_timers,
                expected_output_timers,
                dry_run)

        merged = None  # type: Optional[beam_fn_api_pb2.InstructionResponse]
        all_splits = [
        ]  # type: List[beam_fn_api_pb2.ProcessBundleSplitResponse]

        with thread_pool_executor.shared_unbounded_instance() as pool:
            worker_args = zip(  # pylint: disable=zip-builtin-not-iterating
                per_worker_inputs, per_worker_timers)
            for response, splits in pool.map(run_partition, worker_args):
                all_splits.extend(splits)
                if merged is None:
                    # First partition: nothing to merge with yet.
                    merged = response
                    continue
                combined_infos = monitoring_infos.consolidate(
                    itertools.chain(
                        response.process_bundle.monitoring_infos,
                        merged.process_bundle.monitoring_infos))
                merged = beam_fn_api_pb2.InstructionResponse(
                    process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
                        monitoring_infos=combined_infos),
                    error=response.error or merged.error)

        # No partition produced a response (e.g. zero workers).
        assert merged is not None
        return merged, all_splits
 def merge_results(last_result):
     """Fold ``last_result`` into the accumulated ``final_result``.

     ``final_result`` is read from the surrounding scope, which is not part
     of this block. While it is still ``None`` the latest result is returned
     as-is. Otherwise a new response is built from the consolidated
     monitoring infos of both, keeping the accumulated error when it is set
     and the latest error otherwise.
     """
     if final_result is None:
         return last_result

     combined_infos = monitoring_infos.consolidate(
         itertools.chain(
             final_result.process_bundle.monitoring_infos,
             last_result.process_bundle.monitoring_infos))
     return beam_fn_api_pb2.InstructionResponse(
         process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
             monitoring_infos=combined_infos),
         error=final_result.error or last_result.error)
Esempio n. 3
0
  def process_bundle(self,
                     inputs,  # type: Mapping[str, PartitionableBuffer]
                     expected_outputs,  # type: DataOutput
                     fired_timers,  # type: Mapping[Tuple[str, str], PartitionableBuffer]
                     expected_output_timers  # type: Dict[Tuple[str, str], str]
                     ):
    # type: (...) -> BundleProcessResult
    """Process one bundle across ``self._num_workers`` partitions.

    Every input buffer is partitioned into ``self._num_workers`` parts; each
    partition is processed by its own ``BundleManager`` on a thread pool and
    the per-partition responses are folded into a single one.

    Args:
      inputs: Input buffers keyed by input name; each is partitioned.
      expected_outputs: Forwarded unchanged to every partition.
      fired_timers: Forwarded unchanged to every partition.
      expected_output_timers: Forwarded unchanged to every partition.

    Returns:
      A ``(merged_response, split_results)`` tuple. The merged response
      carries the consolidated monitoring infos of all partitions and the
      error of the most recently merged partition that reported one;
      ``split_results`` concatenates the split results of all partitions in
      partition order.
    """
    worker_count = self._num_workers

    # One input map per partition, keyed by input name.
    per_worker_inputs = [
        {} for _ in range(worker_count)
    ]  # type: List[Dict[str, List[bytes]]]
    for input_name, input_buffer in inputs.items():
      chunks = input_buffer.partition(worker_count)
      for worker_ix, chunk in enumerate(chunks):
        per_worker_inputs[worker_ix][input_name] = chunk

    def run_partition(worker_inputs):
      # type: (...) -> BundleProcessResult
      manager = BundleManager(
          self._worker_handler_list,
          self._get_buffer,
          self._get_input_coder_impl,
          self._bundle_descriptor,
          self._progress_frequency,
          self._registered,
          cache_token_generator=self._cache_token_generator)
      return manager.process_bundle(
          worker_inputs,
          expected_outputs,
          fired_timers,
          expected_output_timers)

    merged = None  # type: Optional[beam_fn_api_pb2.InstructionResponse]
    all_splits = [
    ]  # type: List[beam_fn_api_pb2.ProcessBundleSplitResponse]

    with UnboundedThreadPoolExecutor() as pool:
      for response, splits in pool.map(run_partition, per_worker_inputs):
        all_splits.extend(splits)
        if merged is None:
          # First partition: nothing to merge with yet.
          merged = response
          continue
        combined_infos = monitoring_infos.consolidate(
            itertools.chain(
                response.process_bundle.monitoring_infos,
                merged.process_bundle.monitoring_infos))
        merged = beam_fn_api_pb2.InstructionResponse(
            process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
                monitoring_infos=combined_infos),
            error=response.error or merged.error)

    # No partition produced a response (e.g. zero workers).
    assert merged is not None

    return merged, all_splits
Esempio n. 4
0
  def _run_stage(self,
                 runner_execution_context,  # type: execution.FnApiRunnerExecutionContext
                 bundle_context_manager,  # type: execution.BundleContextManager
                ):
    # type: (...) -> beam_fn_api_pb2.InstructionResponse

    """Run an individual stage.

    The stage's inputs are processed once in parallel, and then follow-up
    bundles are processed on a single worker for as long as the previous
    bundle leaves deferred inputs or fired timers behind.

    Args:
      runner_execution_context (execution.FnApiRunnerExecutionContext): An
        object containing execution information for the pipeline.
      bundle_context_manager (execution.BundleContextManager): Supplies the
        stage being executed together with its worker handlers, process
        bundle descriptor, buffers and input coders.

    Returns:
      A beam_fn_api_pb2.InstructionResponse whose monitoring infos are
      consolidated over every bundle that was processed, and whose error is
      the first non-empty error among them.
    """
    worker_handler_list = bundle_context_manager.worker_handlers
    worker_handler_manager = runner_execution_context.worker_handler_manager
    _LOGGER.info('Running %s', bundle_context_manager.stage.name)
    (data_input, data_side_input, data_output,
     expected_timer_output) = self._extract_endpoints(
         bundle_context_manager, runner_execution_context)
    worker_handler_manager.register_process_bundle_descriptor(
        bundle_context_manager.process_bundle_descriptor)

    # Store the required side inputs into state so it is accessible for the
    # worker when it runs this bundle.
    self._store_side_inputs_in_state(runner_execution_context, data_side_input)

    # Change cache token across bundle repeats
    cache_token_generator = FnApiRunner.get_cache_token_generator(static=False)

    # NOTE(review): presumably a test-only hook that re-runs the bundle before
    # the real execution below; its implementation is not visible here -
    # confirm. It is called with no fired timers ({}).
    self._run_bundle_multiple_times_for_testing(
        runner_execution_context,
        bundle_context_manager,
        data_input,
        data_output, {},
        expected_timer_output,
        cache_token_generator=cache_token_generator)

    bundle_manager = ParallelBundleManager(
        worker_handler_list,
        bundle_context_manager.get_buffer,
        bundle_context_manager.get_input_coder_impl,
        bundle_context_manager.process_bundle_descriptor,
        self._progress_frequency,
        num_workers=self._num_workers,
        cache_token_generator=cache_token_generator)

    # For the first time of processing, we don't have fired timers as inputs.
    result, splits = bundle_manager.process_bundle(data_input,
                                                   data_output,
                                                   {},
                                                   expected_timer_output)

    # `result` accumulates the merged response that is eventually returned;
    # `last_result` and `last_sent` only track the most recent bundle and the
    # inputs that were sent for it.
    last_result = result
    last_sent = data_input

    # We cannot split deferred_input until we include residual_roots to
    # merged results. Without residual_roots, pipeline stops earlier and we
    # may miss some data.
    # We also don't partition fired timer inputs for the same reason.
    bundle_manager._num_workers = 1
    while True:
      # Both collections are rebuilt from scratch on every iteration and are
      # filled in place by the helper calls below.
      deferred_inputs = {}  # type: Dict[str, PartitionableBuffer]
      fired_timers = {}

      self._collect_written_timers_and_add_to_fired_timers(
          bundle_context_manager, fired_timers)
      # Queue any process-initiated delayed bundle applications.
      for delayed_application in last_result.process_bundle.residual_roots:
        name = bundle_context_manager.input_for(
            delayed_application.application.transform_id,
            delayed_application.application.input_id)
        if name not in deferred_inputs:
          deferred_inputs[name] = ListBuffer(
              coder_impl=bundle_context_manager.get_input_coder_impl(name))
        deferred_inputs[name].append(delayed_application.application.element)
      # Queue any runner-initiated delayed bundle applications.
      self._add_residuals_and_channel_splits_to_deferred_inputs(
          splits, bundle_context_manager, last_sent, deferred_inputs)

      if deferred_inputs or fired_timers:
        # The worker will be waiting on these inputs as well.
        for other_input in data_input:
          if other_input not in deferred_inputs:
            deferred_inputs[other_input] = ListBuffer(
                coder_impl=bundle_context_manager.get_input_coder_impl(
                    other_input))
        # TODO(robertwb): merge results
        # NOTE(review): the TODO above is only partly outstanding - the
        # monitoring infos and the error are merged into `result` below, but
        # no other field of `last_result` is carried over.
        last_result, splits = bundle_manager.process_bundle(
            deferred_inputs, data_output, fired_timers, expected_timer_output)
        last_sent = deferred_inputs
        result = beam_fn_api_pb2.InstructionResponse(
            process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
                monitoring_infos=monitoring_infos.consolidate(
                    itertools.chain(
                        result.process_bundle.monitoring_infos,
                        last_result.process_bundle.monitoring_infos))),
            error=result.error or last_result.error)
      else:
        # Neither deferred inputs nor fired timers remain: the stage is done.
        break

    return result