def process_bundle(self, request, instruction_id):
  bundle_processor = self.bundle_processor_cache.get(
      instruction_id, request.process_bundle_descriptor_id)
  try:
    with bundle_processor.state_handler.process_instruction_id(
        instruction_id, request.cache_tokens):
      with self.maybe_profile(instruction_id):
        delayed_applications, requests_finalization = (
            bundle_processor.process_bundle(instruction_id))
        monitoring_infos = bundle_processor.monitoring_infos()
        monitoring_infos.extend(self.state_cache_metrics_fn())
        response = beam_fn_api_pb2.InstructionResponse(
            instruction_id=instruction_id,
            process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
                residual_roots=delayed_applications,
                metrics=bundle_processor.metrics(),
                monitoring_infos=monitoring_infos,
                requires_finalization=requests_finalization))
    # Don't release here if finalize is needed.
    if not requests_finalization:
      self.bundle_processor_cache.release(instruction_id)
    return response
  except:  # pylint: disable=broad-except
    # Don't re-use bundle processors on failure.
    self.bundle_processor_cache.discard(instruction_id)
    raise

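# A minimal sketch, not the actual Beam implementation, of the
# BundleProcessorCache contract the method above relies on: get() reuses
# or creates a processor for a descriptor, release() returns a healthy
# processor to the pool, and discard() drops one after a failure. Only
# the three method names come from the code above; `create_processor`
# and all internals here are illustrative assumptions.
import collections


class BundleProcessorCacheSketch(object):
  def __init__(self, create_processor):
    self._create_processor = create_processor  # hypothetical factory
    self.active_bundle_processors = {}  # instruction_id -> (descriptor_id, processor)
    self.cached_bundle_processors = collections.defaultdict(list)

  def get(self, instruction_id, descriptor_id):
    try:
      processor = self.cached_bundle_processors[descriptor_id].pop()
    except IndexError:
      processor = self._create_processor(descriptor_id)
    self.active_bundle_processors[instruction_id] = (descriptor_id, processor)
    return processor

  def release(self, instruction_id):
    # Return the processor so later bundles with the same descriptor reuse it.
    descriptor_id, processor = self.active_bundle_processors.pop(instruction_id)
    self.cached_bundle_processors[descriptor_id].append(processor)

  def discard(self, instruction_id):
    # A failed bundle may leave the processor in a bad state; never reuse it.
    self.active_bundle_processors.pop(instruction_id, None)
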
def process_bundle(self, request, instruction_id):
  bundle_processor.BundleProcessor(
      self.fns[request.process_bundle_descriptor_reference],
      self.state_handler,
      self.data_channel_factory).process_bundle(instruction_id)
  return beam_fn_api_pb2.ProcessBundleResponse()

def process_bundle(self, request, instruction_id):
  ops = self.create_execution_tree(
      self.fns[request.process_bundle_descriptor_reference])

  expected_inputs = []
  for op in ops:
    if isinstance(op, DataOutputOperation):
      # TODO(robertwb): Is there a better way to pass the instruction id to
      # the operation?
      op.set_output_stream(
          op.data_channel.output_stream(instruction_id, op.target))
    elif isinstance(op, DataInputOperation):
      # We must wait until we receive "end of stream" for each of these ops.
      expected_inputs.append(op)

  # Start all operations.
  for op in reversed(ops):
    logging.info('start %s', op)
    op.start()

  # Inject inputs from data plane.
  for input_op in expected_inputs:
    for data in input_op.data_channel.input_elements(
        instruction_id, [input_op.target]):
      # Ignores input name.
      input_op.process_encoded(data.data)

  # Finish all operations.
  for op in ops:
    logging.info('finish %s', op)
    op.finish()

  return beam_fn_api_pb2.ProcessBundleResponse()

def process_bundle(
    self,
    inputs,  # type: Mapping[str, execution.PartitionableBuffer]
    expected_outputs,  # type: DataOutput
    fired_timers,  # type: Mapping[Tuple[str, str], execution.PartitionableBuffer]
    expected_output_timers,  # type: Dict[Tuple[str, str], str]
    dry_run=False,
):
  # type: (...) -> BundleProcessResult
  part_inputs = [{} for _ in range(self._num_workers)
                 ]  # type: List[Dict[str, List[bytes]]]
  # Timers are only executed on the first worker.
  # TODO(BEAM-9741): Split timers to multiple workers.
  timer_inputs = [
      fired_timers if i == 0 else {} for i in range(self._num_workers)
  ]
  for name, input in inputs.items():
    for ix, part in enumerate(input.partition(self._num_workers)):
      part_inputs[ix][name] = part

  merged_result = None  # type: Optional[beam_fn_api_pb2.InstructionResponse]
  split_result_list = [
  ]  # type: List[beam_fn_api_pb2.ProcessBundleSplitResponse]

  def execute(part_map_input_timers):
    # type: (...) -> BundleProcessResult
    part_map, input_timers = part_map_input_timers
    bundle_manager = BundleManager(
        self.bundle_context_manager,
        self._progress_frequency,
        cache_token_generator=self._cache_token_generator)
    return bundle_manager.process_bundle(
        part_map,
        expected_outputs,
        input_timers,
        expected_output_timers,
        dry_run)

  with thread_pool_executor.shared_unbounded_instance() as executor:
    for result, split_result in executor.map(execute,
                                             zip(part_inputs,  # pylint: disable=zip-builtin-not-iterating
                                                 timer_inputs)):
      split_result_list += split_result
      if merged_result is None:
        merged_result = result
      else:
        merged_result = beam_fn_api_pb2.InstructionResponse(
            process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
                monitoring_infos=monitoring_infos.consolidate(
                    itertools.chain(
                        result.process_bundle.monitoring_infos,
                        merged_result.process_bundle.monitoring_infos))),
            error=result.error or merged_result.error)
  assert merged_result is not None
  return merged_result, split_result_list

def process_bundle(self, request, instruction_id):
  with self.get_bundle_processor(
      instruction_id,
      request.process_bundle_descriptor_reference) as bundle_processor:
    bundle_processor.process_bundle(instruction_id)
    return beam_fn_api_pb2.InstructionResponse(
        instruction_id=instruction_id,
        process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
            metrics=bundle_processor.metrics(),
            monitoring_infos=bundle_processor.monitoring_infos()))

def merge_results(last_result):
  """Merge the latest result with other accumulated results."""
  return (
      last_result
      if final_result is None else beam_fn_api_pb2.InstructionResponse(
          process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
              monitoring_infos=monitoring_infos.consolidate(
                  itertools.chain(
                      final_result.process_bundle.monitoring_infos,
                      last_result.process_bundle.monitoring_infos))),
          error=final_result.error or last_result.error))

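# A hedged usage sketch for merge_results above, assuming it is defined
# where it can read the `final_result` accumulator (as a closure or at
# module scope). `bundle_responses` is a hypothetical iterable of
# beam_fn_api_pb2.InstructionResponse messages, e.g. one per worker:
final_result = None  # type: Optional[beam_fn_api_pb2.InstructionResponse]
for last_result in bundle_responses:
  final_result = merge_results(last_result)
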
def process_bundle(self, request, instruction_id):
  with self.get_bundle_processor(
      instruction_id,
      request.process_bundle_descriptor_reference) as bundle_processor:
    with self.maybe_profile(instruction_id):
      delayed_applications = bundle_processor.process_bundle(instruction_id)
    return beam_fn_api_pb2.InstructionResponse(
        instruction_id=instruction_id,
        process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
            residual_roots=delayed_applications,
            metrics=bundle_processor.metrics(),
            monitoring_infos=bundle_processor.monitoring_infos()))

def process_bundle(self, request, instruction_id):
  self.bundle_processors[
      instruction_id] = processor = bundle_processor.BundleProcessor(
          self.fns[request.process_bundle_descriptor_reference],
          self.state_handler,
          self.data_channel_factory)
  try:
    processor.process_bundle(instruction_id)
  finally:
    del self.bundle_processors[instruction_id]
  return beam_fn_api_pb2.InstructionResponse(
      instruction_id=instruction_id,
      process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
          metrics=processor.metrics()))

def process_bundle(self,
                   inputs,  # type: Mapping[str, PartitionableBuffer]
                   expected_outputs,  # type: DataOutput
                   fired_timers,  # type: Mapping[Tuple[str, str], PartitionableBuffer]
                   expected_output_timers  # type: Dict[Tuple[str, str], str]
                  ):
  # type: (...) -> BundleProcessResult
  part_inputs = [{} for _ in range(self._num_workers)
                 ]  # type: List[Dict[str, List[bytes]]]
  for name, input in inputs.items():
    for ix, part in enumerate(input.partition(self._num_workers)):
      part_inputs[ix][name] = part

  merged_result = None  # type: Optional[beam_fn_api_pb2.InstructionResponse]
  split_result_list = [
  ]  # type: List[beam_fn_api_pb2.ProcessBundleSplitResponse]

  def execute(part_map):
    # type: (...) -> BundleProcessResult
    bundle_manager = BundleManager(
        self._worker_handler_list,
        self._get_buffer,
        self._get_input_coder_impl,
        self._bundle_descriptor,
        self._progress_frequency,
        self._registered,
        cache_token_generator=self._cache_token_generator)
    return bundle_manager.process_bundle(
        part_map, expected_outputs, fired_timers, expected_output_timers)

  with UnboundedThreadPoolExecutor() as executor:
    for result, split_result in executor.map(execute, part_inputs):
      split_result_list += split_result
      if merged_result is None:
        merged_result = result
      else:
        merged_result = beam_fn_api_pb2.InstructionResponse(
            process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
                monitoring_infos=monitoring_infos.consolidate(
                    itertools.chain(
                        result.process_bundle.monitoring_infos,
                        merged_result.process_bundle.monitoring_infos))),
            error=result.error or merged_result.error)
  assert merged_result is not None
  return merged_result, split_result_list

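# Both ParallelBundleManager variants above assume each input buffer can
# split itself into `n` parts via partition(). A minimal sketch of that
# contract, assuming plain round-robin distribution of encoded elements
# (the real PartitionableBuffer may partition differently, e.g. to keep
# grouped elements together):
def partition(elements, n):
  # type: (List[bytes], int) -> List[List[bytes]]
  parts = [[] for _ in range(n)]
  for ix, element in enumerate(elements):
    parts[ix % n].append(element)
  return parts
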
def process_bundle(self, request, instruction_id):
  process_bundle_desc = self.fns[request.process_bundle_descriptor_reference]
  state_handler = self.state_handler_factory.create_state_handler(
      process_bundle_desc.state_api_service_descriptor)
  self.bundle_processors[
      instruction_id] = processor = bundle_processor.BundleProcessor(
          process_bundle_desc, state_handler, self.data_channel_factory)
  try:
    with state_handler.process_instruction_id(instruction_id):
      processor.process_bundle(instruction_id)
  finally:
    del self.bundle_processors[instruction_id]
  return beam_fn_api_pb2.InstructionResponse(
      instruction_id=instruction_id,
      process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
          metrics=processor.metrics(),
          monitoring_infos=processor.monitoring_infos()))

def process_bundle(self, request, instruction_id):
  bundle_processor = self.bundle_processor_cache.get(
      instruction_id, request.process_bundle_descriptor_reference)
  try:
    with bundle_processor.state_handler.process_instruction_id(
        instruction_id):
      with self.maybe_profile(instruction_id):
        delayed_applications = bundle_processor.process_bundle(
            instruction_id)
        response = beam_fn_api_pb2.InstructionResponse(
            instruction_id=instruction_id,
            process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
                residual_roots=delayed_applications,
                metrics=bundle_processor.metrics(),
                monitoring_infos=bundle_processor.monitoring_infos()))
    # TODO(boyuanz): Don't release here if finalize is needed.
    self.bundle_processor_cache.release(instruction_id)
    return response
  except:  # pylint: disable=broad-except
    # Don't re-use bundle processors on failure.
    self.bundle_processor_cache.discard(instruction_id)
    raise

def _run_stage(self,
               runner_execution_context,  # type: execution.FnApiRunnerExecutionContext
               bundle_context_manager,  # type: execution.BundleContextManager
              ):
  # type: (...) -> beam_fn_api_pb2.InstructionResponse

  """Run an individual stage.

  Args:
    runner_execution_context (execution.FnApiRunnerExecutionContext): An
      object containing execution information for the pipeline.
    bundle_context_manager (execution.BundleContextManager): A description
      of the stage to execute and its execution context.
  """
  worker_handler_list = bundle_context_manager.worker_handlers
  worker_handler_manager = runner_execution_context.worker_handler_manager
  _LOGGER.info('Running %s', bundle_context_manager.stage.name)
  (data_input, data_side_input, data_output,
   expected_timer_output) = self._extract_endpoints(
       bundle_context_manager, runner_execution_context)
  worker_handler_manager.register_process_bundle_descriptor(
      bundle_context_manager.process_bundle_descriptor)

  # Store the required side inputs in state so they are accessible to the
  # worker when it runs this bundle.
  self._store_side_inputs_in_state(runner_execution_context, data_side_input)

  # Change the cache token across bundle repeats.
  cache_token_generator = FnApiRunner.get_cache_token_generator(static=False)

  self._run_bundle_multiple_times_for_testing(
      runner_execution_context,
      bundle_context_manager,
      data_input,
      data_output, {},
      expected_timer_output,
      cache_token_generator=cache_token_generator)

  bundle_manager = ParallelBundleManager(
      worker_handler_list,
      bundle_context_manager.get_buffer,
      bundle_context_manager.get_input_coder_impl,
      bundle_context_manager.process_bundle_descriptor,
      self._progress_frequency,
      num_workers=self._num_workers,
      cache_token_generator=cache_token_generator)

  # On the first pass there are no fired timers as inputs.
  result, splits = bundle_manager.process_bundle(
      data_input, data_output, {}, expected_timer_output)

  last_result = result
  last_sent = data_input

  # We cannot split deferred_inputs until residual_roots are included in
  # the merged results; without them the pipeline would stop early and
  # miss some data.
  # We also don't partition fired timer inputs for the same reason.
  bundle_manager._num_workers = 1
  while True:
    deferred_inputs = {}  # type: Dict[str, PartitionableBuffer]
    fired_timers = {}

    self._collect_written_timers_and_add_to_fired_timers(
        bundle_context_manager, fired_timers)
    # Queue any process-initiated delayed bundle applications.
    for delayed_application in last_result.process_bundle.residual_roots:
      name = bundle_context_manager.input_for(
          delayed_application.application.transform_id,
          delayed_application.application.input_id)
      if name not in deferred_inputs:
        deferred_inputs[name] = ListBuffer(
            coder_impl=bundle_context_manager.get_input_coder_impl(name))
      deferred_inputs[name].append(delayed_application.application.element)
    # Queue any runner-initiated delayed bundle applications.
    self._add_residuals_and_channel_splits_to_deferred_inputs(
        splits, bundle_context_manager, last_sent, deferred_inputs)

    if deferred_inputs or fired_timers:
      # The worker will be waiting on these inputs as well.
      for other_input in data_input:
        if other_input not in deferred_inputs:
          deferred_inputs[other_input] = ListBuffer(
              coder_impl=bundle_context_manager.get_input_coder_impl(
                  other_input))
      # TODO(robertwb): merge results
      last_result, splits = bundle_manager.process_bundle(
          deferred_inputs, data_output, fired_timers, expected_timer_output)
      last_sent = deferred_inputs
      result = beam_fn_api_pb2.InstructionResponse(
          process_bundle=beam_fn_api_pb2.ProcessBundleResponse(
              monitoring_infos=monitoring_infos.consolidate(
                  itertools.chain(
                      result.process_bundle.monitoring_infos,
                      last_result.process_bundle.monitoring_infos))),
          error=result.error or last_result.error)
    else:
      break

  return result
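
# The deferred-input loop above only needs a ListBuffer that collects
# encoded elements and remembers the coder required to decode them. A
# minimal sketch under that assumption (the real class also supports
# clearing, copying, and partitioning):
class ListBufferSketch(object):
  def __init__(self, coder_impl):
    self._coder_impl = coder_impl  # used by the runner to decode elements
    self._inputs = []  # type: List[bytes]

  def append(self, element):
    # type: (bytes) -> None
    self._inputs.append(element)

  def __iter__(self):
    return iter(self._inputs)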