def __iter__(self): output_stream = create_OutputStream() for encoded_key, values in self._table.items(): key = self._key_coder.decode(encoded_key) self._post_grouped_coder.get_impl().encode_to_stream( GlobalWindows.windowed_value((key, values)), output_stream, True) return iter([output_stream.get()])
def items(self): value_coder_impl = self._value_coder.get_impl() for window, values in self._values_by_window.items(): encoded_window = self._window_coder.encode(window) output_stream = create_OutputStream() for value in values: value_coder_impl.encode_to_stream(value, output_stream, True) yield encoded_window, output_stream.get()
def __iter__(self): output_stream = create_OutputStream() trigger_driver = trigger.create_trigger_driver(self._windowing, True) for encoded_key, windowed_values in self._table.items(): key = self._key_coder.decode(encoded_key) for wkvs in trigger_driver.process_entire_key(key, windowed_values): self._post_grouped_coder.get_impl().encode_to_stream( wkvs, output_stream, True) return iter([output_stream.get()])
def _commit(self): if self._cleared: self._state_handler.blocking_clear(self._state_key) if self._added_elements: value_coder_impl = self._value_coder.get_impl() out = coder_impl.create_OutputStream() for element in self._added_elements: value_coder_impl.encode_to_stream(element, out, True) self._state_handler.blocking_append(self._state_key, out.get())
def iterable_state_write(values, element_coder_impl): token = unique_name(None, 'iter').encode('ascii') out = create_OutputStream() for element in values: element_coder_impl.encode_to_stream(element, out, True) controller.state.blocking_append( beam_fn_api_pb2.StateKey( runner=beam_fn_api_pb2.StateKey.Runner(key=token)), out.get()) return token
def encoded_items(self): value_coder_impl = self._value_coder.get_impl() key_coder_impl = self._key_coder.get_impl() for (key, window), values in self._values_by_window.items(): encoded_window = self._window_coder.encode(window) encoded_key = key_coder_impl.encode_nested(key) output_stream = create_OutputStream() for value in values: value_coder_impl.encode_to_stream(value, output_stream, True) yield encoded_key, encoded_window, output_stream.get()
def __iter__(self): output_stream = create_OutputStream() if self._windowing.is_default(): globally_window = GlobalWindows.windowed_value(None).with_value windowed_key_values = lambda key, values: [globally_window((key, values))] else: trigger_driver = trigger.create_trigger_driver(self._windowing, True) windowed_key_values = trigger_driver.process_entire_key coder_impl = self._post_grouped_coder.get_impl() key_coder_impl = self._key_coder.get_impl() for encoded_key, windowed_values in self._table.items(): key = key_coder_impl.decode(encoded_key) for wkvs in windowed_key_values(key, windowed_values): coder_impl.encode_to_stream(wkvs, output_stream, True) return iter([output_stream.get()])
def __init__(self, flatten_row_coder): self._flatten_row_coder = flatten_row_coder self._data_out_stream = create_OutputStream()
from apache_beam.runners.worker.channel_factory import GRPCChannelFactory from apache_beam.runners.worker.worker_id_interceptor import WorkerIdInterceptor if TYPE_CHECKING: # TODO(BEAM-9372): move this out of the TYPE_CHECKING scope when we drop # support for python < 3.5.3 from types import TracebackType ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType] OptExcInfo = Union[ExcInfo, Tuple[None, None, None]] # TODO: move this out of the TYPE_CHECKING scope when we drop support for # python < 3.6 from typing import Collection # pylint: disable=ungrouped-imports import apache_beam.coders.slow_stream OutputStream = apache_beam.coders.slow_stream.OutputStream else: OutputStream = type(coder_impl.create_OutputStream()) # This module is experimental. No backwards-compatibility guarantees. _LOGGER = logging.getLogger(__name__) _DEFAULT_SIZE_FLUSH_THRESHOLD = 10 << 20 # 10MB _DEFAULT_TIME_FLUSH_THRESHOLD_MS = 0 # disable time-based flush by default class ClosableOutputStream(OutputStream): """A Outputstream for use with CoderImpls that has a close() method.""" def __init__(self, close_callback=None): super(ClosableOutputStream, self).__init__() self._close_callback = close_callback
def encode_nested(coder, value, nested=True): out = coder_impl.create_OutputStream() coder.get_impl().encode_to_stream(value, out, nested) return out.get()
def _reencode_elements(elements, element_coder): output_stream = create_OutputStream() for element in elements: element_coder.get_impl().encode_to_stream(element, output_stream, True) return output_stream.get()
def _map_task_registration(self, map_task, state_handler, data_operation_spec): input_data = {} runner_sinks = {} transforms = [] transform_index_to_id = {} # Maps coders to new coder objects and references. coders = {} def coder_id(coder): if coder not in coders: coders[coder] = beam_fn_api_pb2.Coder( function_spec=sdk_worker.pack_function_spec_data( json.dumps(coder.as_cloud_object()), sdk_worker.PYTHON_CODER_URN, id=self._next_uid())) return coders[coder].function_spec.id def output_tags(op): return getattr(op, 'output_tags', ['out']) def as_target(op_input): input_op_index, input_output_index = op_input input_op = map_task[input_op_index][1] return { 'ignored_input_tag': beam_fn_api_pb2.Target.List(target=[ beam_fn_api_pb2.Target( primitive_transform_reference=transform_index_to_id[ input_op_index], name=output_tags(input_op)[input_output_index]) ]) } def outputs(op): return { tag: beam_fn_api_pb2.PCollection(coder_reference=coder_id(coder)) for tag, coder in zip(output_tags(op), op.output_coders) } for op_ix, (stage_name, operation) in enumerate(map_task): transform_id = transform_index_to_id[op_ix] = self._next_uid() if isinstance(operation, operation_specs.WorkerInMemoryWrite): # Write this data back to the runner. fn = beam_fn_api_pb2.FunctionSpec( urn=sdk_worker.DATA_OUTPUT_URN, id=self._next_uid()) if data_operation_spec: fn.data.Pack(data_operation_spec) inputs = as_target(operation.input) side_inputs = {} runner_sinks[(transform_id, 'out')] = operation elif isinstance(operation, operation_specs.WorkerRead): # A Read is either translated to a direct injection of windowed values # into the sdk worker, or an injection of the source object into the # sdk worker as data followed by an SDF that reads that source. if (isinstance(operation.source.source, worker_runner_base.InMemorySource) and isinstance( operation.source.source.default_output_coder(), WindowedValueCoder)): output_stream = create_OutputStream() element_coder = (operation.source.source. default_output_coder().get_impl()) # Re-encode the elements in the nested context and # concatenate them together for element in operation.source.source.read(None): element_coder.encode_to_stream(element, output_stream, True) target_name = self._next_uid() input_data[(transform_id, target_name)] = output_stream.get() fn = beam_fn_api_pb2.FunctionSpec( urn=sdk_worker.DATA_INPUT_URN, id=self._next_uid()) if data_operation_spec: fn.data.Pack(data_operation_spec) inputs = {target_name: beam_fn_api_pb2.Target.List()} side_inputs = {} else: # Read the source object from the runner. source_coder = beam.coders.DillCoder() input_transform_id = self._next_uid() output_stream = create_OutputStream() source_coder.get_impl().encode_to_stream( GlobalWindows.windowed_value(operation.source), output_stream, True) target_name = self._next_uid() input_data[(input_transform_id, target_name)] = output_stream.get() input_ptransform = beam_fn_api_pb2.PrimitiveTransform( id=input_transform_id, function_spec=beam_fn_api_pb2.FunctionSpec( urn=sdk_worker.DATA_INPUT_URN, id=self._next_uid()), # TODO(robertwb): Possible name collision. step_name=stage_name + '/inject_source', inputs={target_name: beam_fn_api_pb2.Target.List()}, outputs={ 'out': beam_fn_api_pb2.PCollection( coder_reference=coder_id(source_coder)) }) if data_operation_spec: input_ptransform.function_spec.data.Pack( data_operation_spec) transforms.append(input_ptransform) # Read the elements out of the source. fn = sdk_worker.pack_function_spec_data( OLDE_SOURCE_SPLITTABLE_DOFN_DATA, sdk_worker.PYTHON_DOFN_URN, id=self._next_uid()) inputs = { 'ignored_input_tag': beam_fn_api_pb2.Target.List(target=[ beam_fn_api_pb2.Target( primitive_transform_reference= input_transform_id, name='out') ]) } side_inputs = {} elif isinstance(operation, operation_specs.WorkerDoFn): fn = sdk_worker.pack_function_spec_data( operation.serialized_fn, sdk_worker.PYTHON_DOFN_URN, id=self._next_uid()) inputs = as_target(operation.input) # Store the contents of each side input for state access. for si in operation.side_inputs: assert isinstance(si.source, iobase.BoundedSource) element_coder = si.source.default_output_coder() view_id = self._next_uid() # TODO(robertwb): Actually flesh out the ViewFn API. side_inputs[si.tag] = beam_fn_api_pb2.SideInput( view_fn=sdk_worker.serialize_and_pack_py_fn( element_coder, urn=sdk_worker.PYTHON_ITERABLE_VIEWFN_URN, id=view_id)) # Re-encode the elements in the nested context and # concatenate them together output_stream = create_OutputStream() for element in si.source.read( si.source.get_range_tracker(None, None)): element_coder.get_impl().encode_to_stream( element, output_stream, True) elements_data = output_stream.get() state_key = beam_fn_api_pb2.StateKey( function_spec_reference=view_id) state_handler.Clear(state_key) state_handler.Append( beam_fn_api_pb2.SimpleStateAppendRequest( state_key=state_key, data=[elements_data])) elif isinstance(operation, operation_specs.WorkerFlatten): fn = sdk_worker.pack_function_spec_data( operation.serialized_fn, sdk_worker.IDENTITY_DOFN_URN, id=self._next_uid()) inputs = { 'ignored_input_tag': beam_fn_api_pb2.Target.List(target=[ beam_fn_api_pb2.Target( primitive_transform_reference= transform_index_to_id[input_op_index], name=output_tags(map_task[input_op_index] [1])[input_output_index]) for input_op_index, input_output_index in operation.inputs ]) } side_inputs = {} else: raise TypeError(operation) ptransform = beam_fn_api_pb2.PrimitiveTransform( id=transform_id, function_spec=fn, step_name=stage_name, inputs=inputs, side_inputs=side_inputs, outputs=outputs(operation)) transforms.append(ptransform) process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor( id=self._next_uid(), coders=coders.values(), primitive_transform=transforms) return beam_fn_api_pb2.InstructionRequest( instruction_id=self._next_uid(), register=beam_fn_api_pb2.RegisterRequest( process_bundle_descriptor=[process_bundle_descriptor ])), runner_sinks, input_data
def _encode_gauge(coder, timestamp, value): timestamp_coder = coders.VarIntCoder().get_impl() stream = coder_impl.create_OutputStream() timestamp_coder.encode_to_stream(int(timestamp * 1000), stream, True) coder.get_impl().encode_to_stream(value, stream, True) return stream.get()
def _map_task_registration(self, map_task, state_handler, data_operation_spec): input_data = {} runner_sinks = {} transforms = [] transform_index_to_id = {} # Maps coders to new coder objects and references. coders = {} def coder_id(coder): if coder not in coders: coders[coder] = beam_fn_api_pb2.Coder( function_spec=sdk_worker.pack_function_spec_data( json.dumps(coder.as_cloud_object()), sdk_worker.PYTHON_CODER_URN, id=self._next_uid())) return coders[coder].function_spec.id def output_tags(op): return getattr(op, 'output_tags', ['out']) def as_target(op_input): input_op_index, input_output_index = op_input input_op = map_task[input_op_index][1] return { 'ignored_input_tag': beam_fn_api_pb2.Target.List(target=[ beam_fn_api_pb2.Target( primitive_transform_reference=transform_index_to_id[ input_op_index], name=output_tags(input_op)[input_output_index]) ]) } def outputs(op): return { tag: beam_fn_api_pb2.PCollection(coder_reference=coder_id(coder)) for tag, coder in zip(output_tags(op), op.output_coders) } for op_ix, (stage_name, operation) in enumerate(map_task): transform_id = transform_index_to_id[op_ix] = self._next_uid() if isinstance(operation, operation_specs.WorkerInMemoryWrite): # Write this data back to the runner. fn = beam_fn_api_pb2.FunctionSpec(urn=sdk_worker.DATA_OUTPUT_URN, id=self._next_uid()) if data_operation_spec: fn.data.Pack(data_operation_spec) inputs = as_target(operation.input) side_inputs = {} runner_sinks[(transform_id, 'out')] = operation elif isinstance(operation, operation_specs.WorkerRead): # A Read is either translated to a direct injection of windowed values # into the sdk worker, or an injection of the source object into the # sdk worker as data followed by an SDF that reads that source. if (isinstance(operation.source.source, maptask_executor_runner.InMemorySource) and isinstance(operation.source.source.default_output_coder(), WindowedValueCoder)): output_stream = create_OutputStream() element_coder = ( operation.source.source.default_output_coder().get_impl()) # Re-encode the elements in the nested context and # concatenate them together for element in operation.source.source.read(None): element_coder.encode_to_stream(element, output_stream, True) target_name = self._next_uid() input_data[(transform_id, target_name)] = output_stream.get() fn = beam_fn_api_pb2.FunctionSpec(urn=sdk_worker.DATA_INPUT_URN, id=self._next_uid()) if data_operation_spec: fn.data.Pack(data_operation_spec) inputs = {target_name: beam_fn_api_pb2.Target.List()} side_inputs = {} else: # Read the source object from the runner. source_coder = beam.coders.DillCoder() input_transform_id = self._next_uid() output_stream = create_OutputStream() source_coder.get_impl().encode_to_stream( GlobalWindows.windowed_value(operation.source), output_stream, True) target_name = self._next_uid() input_data[(input_transform_id, target_name)] = output_stream.get() input_ptransform = beam_fn_api_pb2.PrimitiveTransform( id=input_transform_id, function_spec=beam_fn_api_pb2.FunctionSpec( urn=sdk_worker.DATA_INPUT_URN, id=self._next_uid()), # TODO(robertwb): Possible name collision. step_name=stage_name + '/inject_source', inputs={target_name: beam_fn_api_pb2.Target.List()}, outputs={ 'out': beam_fn_api_pb2.PCollection( coder_reference=coder_id(source_coder)) }) if data_operation_spec: input_ptransform.function_spec.data.Pack(data_operation_spec) transforms.append(input_ptransform) # Read the elements out of the source. fn = sdk_worker.pack_function_spec_data( OLDE_SOURCE_SPLITTABLE_DOFN_DATA, sdk_worker.PYTHON_DOFN_URN, id=self._next_uid()) inputs = { 'ignored_input_tag': beam_fn_api_pb2.Target.List(target=[ beam_fn_api_pb2.Target( primitive_transform_reference=input_transform_id, name='out') ]) } side_inputs = {} elif isinstance(operation, operation_specs.WorkerDoFn): fn = sdk_worker.pack_function_spec_data( operation.serialized_fn, sdk_worker.PYTHON_DOFN_URN, id=self._next_uid()) inputs = as_target(operation.input) # Store the contents of each side input for state access. for si in operation.side_inputs: assert isinstance(si.source, iobase.BoundedSource) element_coder = si.source.default_output_coder() view_id = self._next_uid() # TODO(robertwb): Actually flesh out the ViewFn API. side_inputs[si.tag] = beam_fn_api_pb2.SideInput( view_fn=sdk_worker.serialize_and_pack_py_fn( element_coder, urn=sdk_worker.PYTHON_ITERABLE_VIEWFN_URN, id=view_id)) # Re-encode the elements in the nested context and # concatenate them together output_stream = create_OutputStream() for element in si.source.read( si.source.get_range_tracker(None, None)): element_coder.get_impl().encode_to_stream( element, output_stream, True) elements_data = output_stream.get() state_key = beam_fn_api_pb2.StateKey.MultimapSideInput(key=view_id) state_handler.Clear(state_key) state_handler.Append(state_key, elements_data) elif isinstance(operation, operation_specs.WorkerFlatten): fn = sdk_worker.pack_function_spec_data( operation.serialized_fn, sdk_worker.IDENTITY_DOFN_URN, id=self._next_uid()) inputs = { 'ignored_input_tag': beam_fn_api_pb2.Target.List(target=[ beam_fn_api_pb2.Target( primitive_transform_reference=transform_index_to_id[ input_op_index], name=output_tags(map_task[input_op_index][1])[ input_output_index]) for input_op_index, input_output_index in operation.inputs ]) } side_inputs = {} else: raise TypeError(operation) ptransform = beam_fn_api_pb2.PrimitiveTransform( id=transform_id, function_spec=fn, step_name=stage_name, inputs=inputs, side_inputs=side_inputs, outputs=outputs(operation)) transforms.append(ptransform) process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor( id=self._next_uid(), coders=coders.values(), primitive_transform=transforms) return beam_fn_api_pb2.InstructionRequest( instruction_id=self._next_uid(), register=beam_fn_api_pb2.RegisterRequest( process_bundle_descriptor=[process_bundle_descriptor ])), runner_sinks, input_data
def _reencode_elements(elements, element_coder): output_stream = create_OutputStream() for element in elements: element_coder.get_impl().encode_to_stream(element, output_stream, True) return output_stream.get()
def run_stage( self, worker_handler_factory, pipeline_components, stage, pcoll_buffers, safe_coders): def iterable_state_write(values, element_coder_impl): token = unique_name(None, 'iter').encode('ascii') out = create_OutputStream() for element in values: element_coder_impl.encode_to_stream(element, out, True) controller.state.blocking_append( beam_fn_api_pb2.StateKey( runner=beam_fn_api_pb2.StateKey.Runner(key=token)), out.get()) return token controller = worker_handler_factory(stage.environment) context = pipeline_context.PipelineContext( pipeline_components, iterable_state_write=iterable_state_write) data_api_service_descriptor = controller.data_api_service_descriptor() def extract_endpoints(stage): # Returns maps of transform names to PCollection identifiers. # Also mutates IO stages to point to the data ApiServiceDescriptor. data_input = {} data_side_input = {} data_output = {} for transform in stage.transforms: if transform.spec.urn in (bundle_processor.DATA_INPUT_URN, bundle_processor.DATA_OUTPUT_URN): pcoll_id = transform.spec.payload if transform.spec.urn == bundle_processor.DATA_INPUT_URN: target = transform.unique_name, only_element(transform.outputs) if pcoll_id == fn_api_runner_transforms.IMPULSE_BUFFER: data_input[target] = [ENCODED_IMPULSE_VALUE] else: data_input[target] = pcoll_buffers[pcoll_id] coder_id = pipeline_components.pcollections[ only_element(transform.outputs.values())].coder_id elif transform.spec.urn == bundle_processor.DATA_OUTPUT_URN: target = transform.unique_name, only_element(transform.inputs) data_output[target] = pcoll_id coder_id = pipeline_components.pcollections[ only_element(transform.inputs.values())].coder_id else: raise NotImplementedError data_spec = beam_fn_api_pb2.RemoteGrpcPort(coder_id=coder_id) if data_api_service_descriptor: data_spec.api_service_descriptor.url = ( data_api_service_descriptor.url) transform.spec.payload = data_spec.SerializeToString() elif transform.spec.urn == common_urns.primitives.PAR_DO.urn: payload = proto_utils.parse_Bytes( transform.spec.payload, beam_runner_api_pb2.ParDoPayload) for tag, si in payload.side_inputs.items(): data_side_input[transform.unique_name, tag] = ( create_buffer_id(transform.inputs[tag]), si.access_pattern) return data_input, data_side_input, data_output logging.info('Running %s', stage.name) logging.debug(' %s', stage) data_input, data_side_input, data_output = extract_endpoints(stage) process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor( id=self._next_uid(), transforms={transform.unique_name: transform for transform in stage.transforms}, pcollections=dict(pipeline_components.pcollections.items()), coders=dict(pipeline_components.coders.items()), windowing_strategies=dict( pipeline_components.windowing_strategies.items()), environments=dict(pipeline_components.environments.items())) if controller.state_api_service_descriptor(): process_bundle_descriptor.state_api_service_descriptor.url = ( controller.state_api_service_descriptor().url) # Store the required side inputs into state. for (transform_id, tag), (buffer_id, si) in data_side_input.items(): _, pcoll_id = split_buffer_id(buffer_id) value_coder = context.coders[safe_coders[ pipeline_components.pcollections[pcoll_id].coder_id]] elements_by_window = _WindowGroupingBuffer(si, value_coder) for element_data in pcoll_buffers[buffer_id]: elements_by_window.append(element_data) for key, window, elements_data in elements_by_window.encoded_items(): state_key = beam_fn_api_pb2.StateKey( multimap_side_input=beam_fn_api_pb2.StateKey.MultimapSideInput( ptransform_id=transform_id, side_input_id=tag, window=window, key=key)) controller.state.blocking_append(state_key, elements_data) def get_buffer(buffer_id): kind, name = split_buffer_id(buffer_id) if kind in ('materialize', 'timers'): if buffer_id not in pcoll_buffers: # Just store the data chunks for replay. pcoll_buffers[buffer_id] = list() elif kind == 'group': # This is a grouping write, create a grouping buffer if needed. if buffer_id not in pcoll_buffers: original_gbk_transform = name transform_proto = pipeline_components.transforms[ original_gbk_transform] input_pcoll = only_element(list(transform_proto.inputs.values())) output_pcoll = only_element(list(transform_proto.outputs.values())) pre_gbk_coder = context.coders[safe_coders[ pipeline_components.pcollections[input_pcoll].coder_id]] post_gbk_coder = context.coders[safe_coders[ pipeline_components.pcollections[output_pcoll].coder_id]] windowing_strategy = context.windowing_strategies[ pipeline_components .pcollections[output_pcoll].windowing_strategy_id] pcoll_buffers[buffer_id] = _GroupingBuffer( pre_gbk_coder, post_gbk_coder, windowing_strategy) else: # These should be the only two identifiers we produce for now, # but special side input writes may go here. raise NotImplementedError(buffer_id) return pcoll_buffers[buffer_id] for k in range(self._bundle_repeat): try: controller.state.checkpoint() BundleManager( controller, lambda pcoll_id: [], process_bundle_descriptor, self._progress_frequency, k).process_bundle(data_input, data_output) finally: controller.state.restore() result = BundleManager( controller, get_buffer, process_bundle_descriptor, self._progress_frequency).process_bundle(data_input, data_output) while True: timer_inputs = {} for transform_id, timer_writes in stage.timer_pcollections: windowed_timer_coder_impl = context.coders[ pipeline_components.pcollections[timer_writes].coder_id].get_impl() written_timers = get_buffer( create_buffer_id(timer_writes, kind='timers')) if written_timers: # Keep only the "last" timer set per key and window. timers_by_key_and_window = {} for elements_data in written_timers: input_stream = create_InputStream(elements_data) while input_stream.size() > 0: windowed_key_timer = windowed_timer_coder_impl.decode_from_stream( input_stream, True) key, _ = windowed_key_timer.value # TODO: Explode and merge windows. assert len(windowed_key_timer.windows) == 1 timers_by_key_and_window[ key, windowed_key_timer.windows[0]] = windowed_key_timer out = create_OutputStream() for windowed_key_timer in timers_by_key_and_window.values(): windowed_timer_coder_impl.encode_to_stream( windowed_key_timer, out, True) timer_inputs[transform_id, 'out'] = [out.get()] written_timers[:] = [] if timer_inputs: # The worker will be waiting on these inputs as well. for other_input in data_input: if other_input not in timer_inputs: timer_inputs[other_input] = [] # TODO(robertwb): merge results BundleManager( controller, get_buffer, process_bundle_descriptor, self._progress_frequency, True).process_bundle(timer_inputs, data_output) else: break return result
def run_stage( self, worker_handler_factory, pipeline_components, stage, pcoll_buffers, safe_coders): def iterable_state_write(values, element_coder_impl): token = unique_name(None, 'iter').encode('ascii') out = create_OutputStream() for element in values: element_coder_impl.encode_to_stream(element, out, True) controller.state.blocking_append( beam_fn_api_pb2.StateKey( runner=beam_fn_api_pb2.StateKey.Runner(key=token)), out.get()) return token controller = worker_handler_factory(stage.environment) context = pipeline_context.PipelineContext( pipeline_components, iterable_state_write=iterable_state_write) data_api_service_descriptor = controller.data_api_service_descriptor() def extract_endpoints(stage): # Returns maps of transform names to PCollection identifiers. # Also mutates IO stages to point to the data ApiServiceDescriptor. data_input = {} data_side_input = {} data_output = {} for transform in stage.transforms: if transform.spec.urn in (bundle_processor.DATA_INPUT_URN, bundle_processor.DATA_OUTPUT_URN): pcoll_id = transform.spec.payload if transform.spec.urn == bundle_processor.DATA_INPUT_URN: target = transform.unique_name, only_element(transform.outputs) if pcoll_id == fn_api_runner_transforms.IMPULSE_BUFFER: data_input[target] = [ENCODED_IMPULSE_VALUE] else: data_input[target] = pcoll_buffers[pcoll_id] coder_id = pipeline_components.pcollections[ only_element(transform.outputs.values())].coder_id elif transform.spec.urn == bundle_processor.DATA_OUTPUT_URN: target = transform.unique_name, only_element(transform.inputs) data_output[target] = pcoll_id coder_id = pipeline_components.pcollections[ only_element(transform.inputs.values())].coder_id else: raise NotImplementedError data_spec = beam_fn_api_pb2.RemoteGrpcPort(coder_id=coder_id) if data_api_service_descriptor: data_spec.api_service_descriptor.url = ( data_api_service_descriptor.url) transform.spec.payload = data_spec.SerializeToString() elif transform.spec.urn in fn_api_runner_transforms.PAR_DO_URNS: payload = proto_utils.parse_Bytes( transform.spec.payload, beam_runner_api_pb2.ParDoPayload) for tag, si in payload.side_inputs.items(): data_side_input[transform.unique_name, tag] = ( create_buffer_id(transform.inputs[tag]), si.access_pattern) return data_input, data_side_input, data_output logging.info('Running %s', stage.name) logging.debug(' %s', stage) data_input, data_side_input, data_output = extract_endpoints(stage) process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor( id=self._next_uid(), transforms={transform.unique_name: transform for transform in stage.transforms}, pcollections=dict(pipeline_components.pcollections.items()), coders=dict(pipeline_components.coders.items()), windowing_strategies=dict( pipeline_components.windowing_strategies.items()), environments=dict(pipeline_components.environments.items())) if controller.state_api_service_descriptor(): process_bundle_descriptor.state_api_service_descriptor.url = ( controller.state_api_service_descriptor().url) # Store the required side inputs into state. for (transform_id, tag), (buffer_id, si) in data_side_input.items(): _, pcoll_id = split_buffer_id(buffer_id) value_coder = context.coders[safe_coders[ pipeline_components.pcollections[pcoll_id].coder_id]] elements_by_window = _WindowGroupingBuffer(si, value_coder) for element_data in pcoll_buffers[buffer_id]: elements_by_window.append(element_data) for key, window, elements_data in elements_by_window.encoded_items(): state_key = beam_fn_api_pb2.StateKey( multimap_side_input=beam_fn_api_pb2.StateKey.MultimapSideInput( ptransform_id=transform_id, side_input_id=tag, window=window, key=key)) controller.state.blocking_append(state_key, elements_data) def get_buffer(buffer_id): kind, name = split_buffer_id(buffer_id) if kind in ('materialize', 'timers'): if buffer_id not in pcoll_buffers: # Just store the data chunks for replay. pcoll_buffers[buffer_id] = list() elif kind == 'group': # This is a grouping write, create a grouping buffer if needed. if buffer_id not in pcoll_buffers: original_gbk_transform = name transform_proto = pipeline_components.transforms[ original_gbk_transform] input_pcoll = only_element(list(transform_proto.inputs.values())) output_pcoll = only_element(list(transform_proto.outputs.values())) pre_gbk_coder = context.coders[safe_coders[ pipeline_components.pcollections[input_pcoll].coder_id]] post_gbk_coder = context.coders[safe_coders[ pipeline_components.pcollections[output_pcoll].coder_id]] windowing_strategy = context.windowing_strategies[ pipeline_components .pcollections[output_pcoll].windowing_strategy_id] pcoll_buffers[buffer_id] = _GroupingBuffer( pre_gbk_coder, post_gbk_coder, windowing_strategy) else: # These should be the only two identifiers we produce for now, # but special side input writes may go here. raise NotImplementedError(buffer_id) return pcoll_buffers[buffer_id] for k in range(self._bundle_repeat): try: controller.state.checkpoint() BundleManager( controller, lambda pcoll_id: [], process_bundle_descriptor, self._progress_frequency, k).process_bundle(data_input, data_output) finally: controller.state.restore() result = BundleManager( controller, get_buffer, process_bundle_descriptor, self._progress_frequency).process_bundle( data_input, data_output) last_result = result while True: deferred_inputs = collections.defaultdict(list) for transform_id, timer_writes in stage.timer_pcollections: # Queue any set timers as new inputs. windowed_timer_coder_impl = context.coders[ pipeline_components.pcollections[timer_writes].coder_id].get_impl() written_timers = get_buffer( create_buffer_id(timer_writes, kind='timers')) if written_timers: # Keep only the "last" timer set per key and window. timers_by_key_and_window = {} for elements_data in written_timers: input_stream = create_InputStream(elements_data) while input_stream.size() > 0: windowed_key_timer = windowed_timer_coder_impl.decode_from_stream( input_stream, True) key, _ = windowed_key_timer.value # TODO: Explode and merge windows. assert len(windowed_key_timer.windows) == 1 timers_by_key_and_window[ key, windowed_key_timer.windows[0]] = windowed_key_timer out = create_OutputStream() for windowed_key_timer in timers_by_key_and_window.values(): windowed_timer_coder_impl.encode_to_stream( windowed_key_timer, out, True) deferred_inputs[transform_id, 'out'] = [out.get()] written_timers[:] = [] # Queue any delayed bundle applications. for delayed_application in last_result.process_bundle.residual_roots: # Find the io transform that feeds this transform. # TODO(SDF): Memoize? application = delayed_application.application input_pcoll = process_bundle_descriptor.transforms[ application.ptransform_id].inputs[application.input_id] for input_id, proto in process_bundle_descriptor.transforms.items(): if (proto.spec.urn == bundle_processor.DATA_INPUT_URN and input_pcoll in proto.outputs.values()): deferred_inputs[input_id, 'out'].append(application.element) break else: raise RuntimeError( 'No IO transform feeds %s' % application.ptransform_id) if deferred_inputs: # The worker will be waiting on these inputs as well. for other_input in data_input: if other_input not in deferred_inputs: deferred_inputs[other_input] = [] # TODO(robertwb): merge results last_result = BundleManager( controller, get_buffer, process_bundle_descriptor, self._progress_frequency, True).process_bundle(deferred_inputs, data_output) else: break return result
def encode_nested(coder, value, nested=True): out = coder_impl.create_OutputStream() coder.get_impl().encode_to_stream(value, out, nested) return out.get()
def __init__(self, field_coder): self._field_coder = field_coder self.data_out_stream = create_OutputStream()