def finish_bundle(self):
  data = self._read_from_pubsub(self.source.timestamp_attribute)
  if data:
    output_pcollection = list(self._outputs)[0]
    bundle = self._evaluation_context.create_bundle(output_pcollection)
    # TODO(ccy): Respect the PubSub source's id_label field.
    for timestamp, message in data:
      if self.source.with_attributes:
        element = message
      else:
        element = message.data
      bundle.output(
          GlobalWindows.windowed_value(element, timestamp=timestamp))
    bundles = [bundle]
  else:
    bundles = []
  if self._applied_ptransform.inputs:
    input_pvalue = self._applied_ptransform.inputs[0]
  else:
    input_pvalue = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
  unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)
  # TODO(udim): Correct value for watermark hold.
  return TransformResult(self, bundles, [unprocessed_bundle], None,
                         {None: Timestamp.of(time.time())})
def test_update_int(self):
  opcounts = OperationCounters(
      CounterFactory(), 'some-name', coders.PickleCoder(), 0)
  self.verify_counters(opcounts, 0)
  opcounts.update_from(GlobalWindows.windowed_value(1))
  opcounts.update_collect()
  self.verify_counters(opcounts, 1)
def __iter__(self):
  output_stream = create_OutputStream()
  for encoded_key, values in self._table.items():
    key = self._key_coder.decode(encoded_key)
    self._post_grouped_coder.get_impl().encode_to_stream(
        GlobalWindows.windowed_value((key, values)), output_stream, True)
  return iter([output_stream.get()])
def test_update_multiple(self):
  coder = coders.PickleCoder()
  total_size = 0
  opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
  self.verify_counters(opcounts, 0, float('nan'))
  value = GlobalWindows.windowed_value('abcde')
  opcounts.update_from(value)
  total_size += coder.estimate_size(value)
  value = GlobalWindows.windowed_value('defghij')
  opcounts.update_from(value)
  total_size += coder.estimate_size(value)
  self.verify_counters(opcounts, 2, (float(total_size) / 2))
  value = GlobalWindows.windowed_value('klmnop')
  opcounts.update_from(value)
  total_size += coder.estimate_size(value)
  self.verify_counters(opcounts, 3, (float(total_size) / 3))
def test_update_str(self):
  coder = coders.PickleCoder()
  opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
  self.verify_counters(opcounts, 0, float('nan'))
  value = GlobalWindows.windowed_value('abcde')
  opcounts.update_from(value)
  estimated_size = coder.estimate_size(value)
  self.verify_counters(opcounts, 1, estimated_size)
def test_update_old_object(self):
  coder = coders.PickleCoder()
  opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
  self.verify_counters(opcounts, 0, float('nan'))
  obj = OldClassThatDoesNotImplementLen()
  value = GlobalWindows.windowed_value(obj)
  opcounts.update_from(value)
  estimated_size = coder.estimate_size(value)
  self.verify_counters(opcounts, 1, estimated_size)
def finish_bundle(self):
  bundles = []
  transform = self._applied_ptransform.transform
  assert transform.value is not None
  create_result = [GlobalWindows.windowed_value(v) for v in transform.value]
  for result in create_result:
    self.bundle.output(result)
  bundles.append(self.bundle)
  return TransformResult(
      self._applied_ptransform, bundles, None, None, None, None)
def get_root_bundles(self):
  test_stream = self._applied_ptransform.transform
  bundles = []
  if len(test_stream.events) > 0:
    bundle = self._evaluation_context.create_bundle(
        pvalue.PBegin(self._applied_ptransform.transform.pipeline))
    # Explicitly set timestamp to MIN_TIMESTAMP to ensure that we hold the
    # watermark.
    bundle.add(GlobalWindows.windowed_value(0, timestamp=MIN_TIMESTAMP))
    bundle.commit(None)
    bundles.append(bundle)
  return bundles
def finish_bundle(self):
  unprocessed_bundles = []
  hold = None
  if self.current_index < len(self.test_stream.events) - 1:
    unprocessed_bundle = self._evaluation_context.create_bundle(
        pvalue.PBegin(self._applied_ptransform.transform.pipeline))
    unprocessed_bundle.add(GlobalWindows.windowed_value(
        self.current_index + 1, timestamp=self.watermark))
    unprocessed_bundles.append(unprocessed_bundle)
    hold = self.watermark
  return TransformResult(
      self._applied_ptransform, self.bundles, unprocessed_bundles, None, hold)
def finish_bundle(self):
  bundles = []
  bundle = None
  for encoded_k, vs in iteritems(self.gbk_items):
    if not bundle:
      bundle = self._evaluation_context.create_bundle(self.output_pcollection)
      bundles.append(bundle)
    kwi = KeyedWorkItem(encoded_k, elements=vs)
    bundle.add(GlobalWindows.windowed_value(kwi))
  return TransformResult(self, bundles, [], None, None)
def __iter__(self):
  output_stream = create_OutputStream()
  if self._windowing.is_default():
    globally_window = GlobalWindows.windowed_value(None).with_value
    windowed_key_values = lambda key, values: [globally_window((key, values))]
  else:
    trigger_driver = trigger.create_trigger_driver(self._windowing, True)
    windowed_key_values = trigger_driver.process_entire_key
  coder_impl = self._post_grouped_coder.get_impl()
  key_coder_impl = self._key_coder.get_impl()
  for encoded_key, windowed_values in self._table.items():
    key = key_coder_impl.decode(encoded_key)
    for wkvs in windowed_key_values(key, windowed_values):
      coder_impl.encode_to_stream(wkvs, output_stream, True)
  return iter([output_stream.get()])
def _flush_batch(self, destination):
  # Flush the current batch of rows to BigQuery.
  rows = self._rows_buffer[destination]
  table_reference = bigquery_tools.parse_table_reference(destination)

  if table_reference.projectId is None:
    table_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  logging.debug('Flushing data to %s. Total %s rows.',
                destination, len(rows))

  while True:
    # TODO: Figure out an insertId to make calls idempotent.
    passed, errors = self.bigquery_wrapper.insert_rows(
        project_id=table_reference.projectId,
        dataset_id=table_reference.datasetId,
        table_id=table_reference.tableId,
        rows=rows,
        skip_invalid_rows=True)

    logging.debug("Passed: %s. Errors are %s", passed, errors)
    failed_rows = [rows[entry.index] for entry in errors]
    should_retry = any(
        bigquery_tools.RetryStrategy.should_retry(
            self._retry_strategy, entry.errors[0].reason)
        for entry in errors)
    rows = failed_rows

    if not should_retry:
      break
    else:
      retry_backoff = next(self._backoff_calculator)
      logging.info('Sleeping %s seconds before retrying insertion.',
                   retry_backoff)
      time.sleep(retry_backoff)

  self._total_buffered_rows -= len(self._rows_buffer[destination])
  del self._rows_buffer[destination]

  return [pvalue.TaggedOutput(BigQueryWriteFn.FAILED_ROWS,
                              GlobalWindows.windowed_value(
                                  (destination, row)))
          for row in failed_rows]
def partition(self, n):
  # type: (int) -> List[List[bytes]]

  """Partition the _GroupingBuffer into N parts.

  Once partitioned, the buffer will not be re-partitioned with a different N;
  re-partitioning is not supported.
  """
  if not self._grouped_output:
    if self._windowing.is_default():
      globally_window = GlobalWindows.windowed_value(
          None,
          timestamp=GlobalWindow().max_timestamp(),
          pane_info=windowed_value.PaneInfo(
              is_first=True,
              is_last=True,
              timing=windowed_value.PaneInfoTiming.ON_TIME,
              index=0,
              nonspeculative_index=0)).with_value
      windowed_key_values = lambda key, values: [
          globally_window((key, values))
      ]
    else:
      # TODO(pabloem, BEAM-7514): Trigger driver needs access to the clock.
      # Note that this only comes through if windowing is default - but what
      # about having multiple firings on the global window?
      # May need to revise.
      trigger_driver = trigger.create_trigger_driver(self._windowing, True)
      windowed_key_values = trigger_driver.process_entire_key
    coder_impl = self._post_grouped_coder.get_impl()
    key_coder_impl = self._key_coder.get_impl()
    self._grouped_output = [[] for _ in range(n)]
    output_stream_list = [create_OutputStream() for _ in range(n)]
    for idx, (encoded_key, windowed_values) in enumerate(self._table.items()):
      key = key_coder_impl.decode(encoded_key)
      for wkvs in windowed_key_values(key, windowed_values):
        coder_impl.encode_to_stream(wkvs, output_stream_list[idx % n], True)
    for ix, output_stream in enumerate(output_stream_list):
      self._grouped_output[ix] = [output_stream.get()]
    self._table.clear()
  return self._grouped_output
def __iter__(self):
  if not self._grouped_output:
    output_stream = create_OutputStream()
    if self._windowing.is_default():
      globally_window = GlobalWindows.windowed_value(None).with_value
      windowed_key_values = lambda key, values: [
          globally_window((key, values))]
    else:
      trigger_driver = trigger.create_trigger_driver(self._windowing, True)
      windowed_key_values = trigger_driver.process_entire_key
    coder_impl = self._post_grouped_coder.get_impl()
    key_coder_impl = self._key_coder.get_impl()
    for encoded_key, windowed_values in self._table.items():
      key = key_coder_impl.decode(encoded_key)
      for wkvs in windowed_key_values(key, windowed_values):
        coder_impl.encode_to_stream(wkvs, output_stream, True)
    self._grouped_output = [output_stream.get()]
    self._table = None
  return iter(self._grouped_output)
def test_reshuffle_streaming_global_window(self):
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  pipeline = TestPipeline(options=options)
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
  before_reshuffle = (pipeline
                      | beam.Create(data)
                      | beam.WindowInto(GlobalWindows())
                      | beam.GroupByKey()
                      | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
  assert_that(before_reshuffle, equal_to(expected_data),
              label='before_reshuffle')
  after_reshuffle = before_reshuffle | beam.Reshuffle()
  assert_that(after_reshuffle, equal_to(expected_data),
              label='after reshuffle')
  pipeline.run()
def get_root_bundles(self):
  test_stream = self._applied_ptransform.transform

  # If there was an endpoint defined then get the events from the
  # TestStreamService.
  if test_stream.endpoint:
    _TestStreamEvaluator.event_stream = _TestStream.events_from_rpc(
        test_stream.endpoint, test_stream.output_tags, test_stream.coder,
        self._evaluation_context)
  else:
    _TestStreamEvaluator.event_stream = (
        _TestStream.events_from_script(test_stream._events))

  bundle = self._evaluation_context.create_bundle(
      pvalue.PBegin(self._applied_ptransform.transform.pipeline))
  bundle.add(GlobalWindows.windowed_value(b'', timestamp=MIN_TIMESTAMP))
  bundle.commit(None)
  return [bundle]
def create_trigger_driver(
    windowing, is_batch=False, phased_combine_fn=None, clock=None):
  """Create the TriggerDriver for the given windowing and options."""
  # TODO(robertwb): We can do more if we know elements are in timestamp
  # sorted order.
  if windowing.is_default() and is_batch:
    driver = BatchGlobalTriggerDriver()
  elif (windowing.windowfn == GlobalWindows() and
        (windowing.triggerfn in [AfterCount(1), Always()]) and is_batch):
    # Here we also just pass through all the values exactly once.
    driver = BatchGlobalTriggerDriver()
  else:
    driver = GeneralTriggerDriver(windowing, clock)

  if phased_combine_fn:
    # TODO(ccy): Refactor GeneralTriggerDriver to combine values eagerly using
    # the known phased_combine_fn here.
    driver = CombiningTriggerDriver(phased_combine_fn, driver)
  return driver
def finish_bundle(self):
  if self._is_final_bundle():
    if self.global_state.get_state(
        None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG):
      # Ignore empty bundles after emitting output. (This may happen because
      # empty bundles do not affect input watermarks.)
      bundles = []
    else:
      gbk_result = []
      # TODO(ccy): perhaps we can clean this up to not use this
      # internal attribute of the DirectStepContext.
      for encoded_k in self.step_context.keyed_existing_state:
        # Ignore global state.
        if encoded_k is None:
          continue
        k = self.key_coder.decode(encoded_k)
        state = self.step_context.get_keyed_state(encoded_k)
        vs = state.get_state(None, _GroupByKeyOnlyEvaluator.ELEMENTS_TAG)
        gbk_result.append(GlobalWindows.windowed_value((k, vs)))

      def len_element_fn(element):
        _, v = element.value
        return len(v)

      bundles = self._split_list_into_bundles(
          self.output_pcollection, gbk_result,
          _GroupByKeyOnlyEvaluator.MAX_ELEMENT_PER_BUNDLE, len_element_fn)

    self.global_state.add_state(
        None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG, True)
    hold = WatermarkManager.WATERMARK_POS_INF
  else:
    bundles = []
    hold = WatermarkManager.WATERMARK_NEG_INF
    self.global_state.set_timer(None, '', TimeDomain.WATERMARK,
                                WatermarkManager.WATERMARK_POS_INF)

  return TransformResult(self._applied_ptransform, bundles, [], None,
                         {None: hold})
def test_buffering_timer_in_global_window_streaming(self):
  max_buffering_duration_secs = 42

  start_time = timestamp.Timestamp(0)
  test_stream = TestStream().advance_watermark_to(start_time)
  for i, value in enumerate(GroupIntoBatchesTest._create_test_data()):
    test_stream.add_elements(
        [TimestampedValue(value, start_time + i)]) \
      .advance_processing_time(5)
  test_stream.advance_watermark_to(
      start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1) \
    .advance_watermark_to_infinity()

  with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
    # Set a batch size larger than the total number of elements.
    # Since we're in a global window, we would have been waiting
    # for all the elements to arrive without the buffering time limit.
    batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

    # To trigger the processing time timer, use a fake clock with start time
    # being Timestamp(0). Since the fake clock never really advances during
    # the pipeline execution, meaning that the timer is always set to the
    # same value, the timer will be fired on every element after the first
    # firing.
    fake_clock = FakeClock(now=start_time)

    num_elements_per_batch = (
        pipeline
        | test_stream
        | WindowInto(
            GlobalWindows(),
            trigger=Repeatedly(AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | util.GroupIntoBatches(
            batch_size, max_buffering_duration_secs, fake_clock)
        | 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
        | GroupByKey()
        | FlatMapTuple(lambda k, vs: vs))

    # We will flush twice when the max buffering duration is reached and
    # when the global window ends.
    assert_that(num_elements_per_batch, equal_to([9, 1]))
def finish_bundle(self):
  data = self._read_from_pubsub()
  if data:
    output_pcollection = list(self._outputs)[0]
    bundle = self._evaluation_context.create_bundle(output_pcollection)
    # TODO(ccy): we currently do not use the PubSub message timestamp or
    # respect the PubSub source's id_label field.
    now = Timestamp.of(time.time())
    for message_data in data:
      bundle.output(GlobalWindows.windowed_value(message_data, timestamp=now))
    bundles = [bundle]
  else:
    bundles = []
  if self._applied_ptransform.inputs:
    input_pvalue = self._applied_ptransform.inputs[0]
  else:
    input_pvalue = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
  unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)
  return TransformResult(
      self._applied_ptransform, bundles, [unprocessed_bundle], None,
      {None: Timestamp.of(time.time())})
def create_trigger_driver(windowing, is_batch=False, phased_combine_fn=None,
                          clock=None):
  """Create the TriggerDriver for the given windowing and options."""
  # TODO(robertwb): We can do more if we know elements are in timestamp
  # sorted order.
  if windowing.is_default() and is_batch:
    driver = DiscardingGlobalTriggerDriver()
  elif (windowing.windowfn == GlobalWindows() and
        windowing.triggerfn == AfterCount(1) and
        windowing.accumulation_mode == AccumulationMode.DISCARDING):
    # Here we also just pass through all the values every time.
    driver = DiscardingGlobalTriggerDriver()
  else:
    driver = GeneralTriggerDriver(windowing, clock)

  if phased_combine_fn:
    # TODO(ccy): Refactor GeneralTriggerDriver to combine values eagerly using
    # the known phased_combine_fn here.
    driver = CombiningTriggerDriver(phased_combine_fn, driver)
  return driver
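A minimal usage sketch of the factory above, not taken from the original sources; it assumes create_trigger_driver is importable from apache_beam.transforms.trigger and Windowing from apache_beam.transforms.core, which holds in recent Beam SDKs.

# Hypothetical call site for create_trigger_driver: with the default
# (global-window, default-trigger) Windowing in batch mode, the returned
# driver simply passes every value through once.
from apache_beam.transforms.core import Windowing
from apache_beam.transforms.trigger import create_trigger_driver
from apache_beam.transforms.window import GlobalWindows

batch_driver = create_trigger_driver(Windowing(GlobalWindows()), is_batch=True)
# Other windowing/trigger combinations generally fall through to the general
# driver, optionally wrapped in a combining driver when a phased_combine_fn
# is supplied.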
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   | WindowInto(GlobalWindows(),
                trigger=Repeatedly(
                    AfterAny(AfterCount(BATCH_SIZE),
                             AfterProcessingTime(BUFFERING_SECS))),
                accumulation_mode=AccumulationMode.DISCARDING)
   | GroupByKey()
   | Map(lambda kv: logging.info(
       'key: %s, value count: %s', kv[0], len(kv[1]))))

  run = p.run()
  run.wait_until_finish()
def process_element(self, element):
  index = element.value
  self.watermark = element.timestamp
  assert isinstance(index, int)
  assert 0 <= index <= len(self.test_stream.events)
  self.current_index = index
  event = self.test_stream.events[self.current_index]

  if isinstance(event, ElementEvent):
    assert len(self._outputs) == 1
    output_pcollection = list(self._outputs)[0]
    bundle = self._evaluation_context.create_bundle(output_pcollection)
    for tv in event.timestamped_values:
      bundle.output(
          GlobalWindows.windowed_value(tv.value, timestamp=tv.timestamp))
    self.bundles.append(bundle)
  elif isinstance(event, WatermarkEvent):
    assert event.new_watermark >= self.watermark
    self.watermark = event.new_watermark
  elif isinstance(event, ProcessingTimeEvent):
    # TODO(ccy): advance processing time in the context's mock clock.
    pass
  else:
    raise ValueError('Invalid TestStream event: %s.' % event)
def test_combining_with_accumulation_mode_and_fanout(self):
  # PCollection will contain elements from 1 to 5.
  elements = [i for i in range(1, 6)]

  ts = TestStream().advance_watermark_to(0)
  for i in elements:
    ts.add_elements([i])
  ts.advance_watermark_to_infinity()

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  with TestPipeline(options=options) as p:
    result = (
        p
        | ts
        | beam.WindowInto(
            GlobalWindows(),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
            trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
        | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

    # The firings for DISCARDING mode would be [1, 2, 3, 4, 5, 0, 0].
    firings = [1, 3, 6, 10, 15, 15, 15]
    assert_that(result, equal_to(firings))
def get_windowing(self, inputs):
  return Windowing(GlobalWindows())
def finish_bundle(self):
  # Emit the accumulated columns as a single element assigned to the global
  # window (a WindowedValue takes window instances, i.e. GlobalWindow).
  yield WindowedValue(
      list(self.all_columns), timestamp=0, windows=[GlobalWindow()])
def test_update_int(self):
  opcounts = OperationCounters(
      CounterFactory(), 'some-name', coders.PickleCoder(), 0)
  self.verify_counters(opcounts, 0)
  opcounts.update_from(GlobalWindows.windowed_value(1))
  self.verify_counters(opcounts, 1)
def _read_values_to_bundles(reader):
  read_result = [GlobalWindows.windowed_value(e) for e in reader]
  return self._split_list_into_bundles(
      output_pcollection, read_result,
      _BoundedReadEvaluator.MAX_ELEMENT_PER_BUNDLE, lambda _: 1)
def Do(self,
       input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Main execution logic for the Sequencer component.

  :param input_dict: input channels
  :param output_dict: output channels
  :param exec_properties: the execution properties defined in the spec
  """
  source = exec_properties[StepKeys.SOURCE]
  args = exec_properties[StepKeys.ARGS]

  c = source_utils.load_source_path_class(source)

  # Get the schema
  schema_path = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict[constants.SCHEMA]))
  schema = io_utils.SchemaReader().read(schema_path)

  # TODO: Getting the statistics might help the future implementations
  sequence_step: BaseSequencerStep = c(schema=schema,
                                       statistics=None,
                                       **args)

  # Get split names
  input_artifact = artifact_utils.get_single_instance(
      input_dict[constants.INPUT_EXAMPLES])
  split_names = artifact_utils.decode_split_names(
      input_artifact.split_names)

  # Create output artifact
  output_artifact = artifact_utils.get_single_instance(
      output_dict[constants.OUTPUT_EXAMPLES])
  output_artifact.split_names = artifact_utils.encode_split_names(
      split_names)

  with self._make_beam_pipeline() as p:
    for s in split_names:
      input_uri = io_utils.all_files_pattern(
          artifact_utils.get_split_uri(
              input_dict[constants.INPUT_EXAMPLES], s))

      output_uri = artifact_utils.get_split_uri(
          output_dict[constants.OUTPUT_EXAMPLES], s)
      output_path = os.path.join(output_uri, self._DEFAULT_FILENAME)

      # Read and decode the data
      data = \
          (p
           | 'Read_' + s >> beam.io.ReadFromTFRecord(file_pattern=input_uri)
           | 'Decode_' + s >> tf_example_decoder.DecodeTFExample()
           | 'ToDataFrame_' + s >> beam.ParDo(utils.ConvertToDataframe()))

      # Window into sessions
      s_data = \
          (data
           | 'AddCategory_' + s >> beam.ParDo(
               sequence_step.get_category_do_fn())
           | 'AddTimestamp_' + s >> beam.ParDo(
               sequence_step.get_timestamp_do_fn())
           | 'Sessions_' + s >> beam.WindowInto(sequence_step.get_window()))

      # Combine and transform
      p_data = \
          (s_data
           | 'Combine_' + s >> beam.CombinePerKey(
               sequence_step.get_combine_fn()))

      # Write the results
      _ = \
          (p_data
           | 'Global_' + s >> beam.WindowInto(GlobalWindows())
           | 'RemoveKey_' + s >> beam.ParDo(RemoveKey())
           | 'ToExample_' + s >> beam.Map(utils.df_to_example)
           | 'Serialize_' + s >> beam.Map(utils.serialize)
           | 'Write_' + s >> beam.io.WriteToTFRecord(
               output_path, file_name_suffix='.gz'))
from apache_beam.transforms import trigger
from apache_beam.transforms import window
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import GlobalWindows
from apache_beam.utils import proto_utils
from apache_beam.utils import windowed_value

if TYPE_CHECKING:
  from apache_beam.coders.coder_impl import CoderImpl
  from apache_beam.runners.portability.fn_api_runner import worker_handlers
  from apache_beam.runners.portability.fn_api_runner.translations import DataSideInput
  from apache_beam.transforms.window import BoundedWindow

ENCODED_IMPULSE_VALUE = WindowedValueCoder(
    BytesCoder(), GlobalWindowCoder()).get_impl().encode_nested(
        GlobalWindows.windowed_value(b''))

SAFE_WINDOW_FNS = set(window.WindowFn._known_urns.keys()) - set(
    [python_urns.PICKLED_WINDOWFN])


class Buffer(Protocol):
  def __iter__(self):
    # type: () -> Iterator[bytes]
    pass

  def append(self, item):
    # type: (bytes) -> None
    pass
def _map_task_registration(self, map_task, state_handler,
                           data_operation_spec):
  input_data = {}
  runner_sinks = {}
  transforms = []
  transform_index_to_id = {}

  # Maps coders to new coder objects and references.
  coders = {}

  def coder_id(coder):
    if coder not in coders:
      coders[coder] = beam_fn_api_pb2.Coder(
          function_spec=sdk_worker.pack_function_spec_data(
              json.dumps(coder.as_cloud_object()),
              sdk_worker.PYTHON_CODER_URN,
              id=self._next_uid()))
    return coders[coder].function_spec.id

  def output_tags(op):
    return getattr(op, 'output_tags', ['out'])

  def as_target(op_input):
    input_op_index, input_output_index = op_input
    input_op = map_task[input_op_index][1]
    return {
        'ignored_input_tag':
            beam_fn_api_pb2.Target.List(target=[
                beam_fn_api_pb2.Target(
                    primitive_transform_reference=transform_index_to_id[
                        input_op_index],
                    name=output_tags(input_op)[input_output_index])
            ])
    }

  def outputs(op):
    return {
        tag: beam_fn_api_pb2.PCollection(coder_reference=coder_id(coder))
        for tag, coder in zip(output_tags(op), op.output_coders)
    }

  for op_ix, (stage_name, operation) in enumerate(map_task):
    transform_id = transform_index_to_id[op_ix] = self._next_uid()
    if isinstance(operation, operation_specs.WorkerInMemoryWrite):
      # Write this data back to the runner.
      fn = beam_fn_api_pb2.FunctionSpec(
          urn=sdk_worker.DATA_OUTPUT_URN, id=self._next_uid())
      if data_operation_spec:
        fn.data.Pack(data_operation_spec)
      inputs = as_target(operation.input)
      side_inputs = {}
      runner_sinks[(transform_id, 'out')] = operation
    elif isinstance(operation, operation_specs.WorkerRead):
      # A Read is either translated to a direct injection of windowed values
      # into the sdk worker, or an injection of the source object into the
      # sdk worker as data followed by an SDF that reads that source.
      if (isinstance(operation.source.source,
                     maptask_executor_runner.InMemorySource)
          and isinstance(operation.source.source.default_output_coder(),
                         WindowedValueCoder)):
        output_stream = create_OutputStream()
        element_coder = (
            operation.source.source.default_output_coder().get_impl())
        # Re-encode the elements in the nested context and
        # concatenate them together.
        for element in operation.source.source.read(None):
          element_coder.encode_to_stream(element, output_stream, True)
        target_name = self._next_uid()
        input_data[(transform_id, target_name)] = output_stream.get()
        fn = beam_fn_api_pb2.FunctionSpec(
            urn=sdk_worker.DATA_INPUT_URN, id=self._next_uid())
        if data_operation_spec:
          fn.data.Pack(data_operation_spec)
        inputs = {target_name: beam_fn_api_pb2.Target.List()}
        side_inputs = {}
      else:
        # Read the source object from the runner.
        source_coder = beam.coders.DillCoder()
        input_transform_id = self._next_uid()
        output_stream = create_OutputStream()
        source_coder.get_impl().encode_to_stream(
            GlobalWindows.windowed_value(operation.source),
            output_stream, True)
        target_name = self._next_uid()
        input_data[(input_transform_id, target_name)] = output_stream.get()
        input_ptransform = beam_fn_api_pb2.PrimitiveTransform(
            id=input_transform_id,
            function_spec=beam_fn_api_pb2.FunctionSpec(
                urn=sdk_worker.DATA_INPUT_URN, id=self._next_uid()),
            # TODO(robertwb): Possible name collision.
            step_name=stage_name + '/inject_source',
            inputs={target_name: beam_fn_api_pb2.Target.List()},
            outputs={
                'out': beam_fn_api_pb2.PCollection(
                    coder_reference=coder_id(source_coder))
            })
        if data_operation_spec:
          input_ptransform.function_spec.data.Pack(data_operation_spec)
        transforms.append(input_ptransform)

        # Read the elements out of the source.
        fn = sdk_worker.pack_function_spec_data(
            OLDE_SOURCE_SPLITTABLE_DOFN_DATA,
            sdk_worker.PYTHON_DOFN_URN,
            id=self._next_uid())
        inputs = {
            'ignored_input_tag':
                beam_fn_api_pb2.Target.List(target=[
                    beam_fn_api_pb2.Target(
                        primitive_transform_reference=input_transform_id,
                        name='out')
                ])
        }
        side_inputs = {}
    elif isinstance(operation, operation_specs.WorkerDoFn):
      fn = sdk_worker.pack_function_spec_data(
          operation.serialized_fn,
          sdk_worker.PYTHON_DOFN_URN,
          id=self._next_uid())
      inputs = as_target(operation.input)
      # Store the contents of each side input for state access.
      for si in operation.side_inputs:
        assert isinstance(si.source, iobase.BoundedSource)
        element_coder = si.source.default_output_coder()
        view_id = self._next_uid()
        # TODO(robertwb): Actually flesh out the ViewFn API.
        side_inputs[si.tag] = beam_fn_api_pb2.SideInput(
            view_fn=sdk_worker.serialize_and_pack_py_fn(
                element_coder,
                urn=sdk_worker.PYTHON_ITERABLE_VIEWFN_URN,
                id=view_id))
        # Re-encode the elements in the nested context and
        # concatenate them together.
        output_stream = create_OutputStream()
        for element in si.source.read(
            si.source.get_range_tracker(None, None)):
          element_coder.get_impl().encode_to_stream(
              element, output_stream, True)
        elements_data = output_stream.get()
        state_key = beam_fn_api_pb2.StateKey.MultimapSideInput(key=view_id)
        state_handler.Clear(state_key)
        state_handler.Append(state_key, elements_data)
    elif isinstance(operation, operation_specs.WorkerFlatten):
      fn = sdk_worker.pack_function_spec_data(
          operation.serialized_fn,
          sdk_worker.IDENTITY_DOFN_URN,
          id=self._next_uid())
      inputs = {
          'ignored_input_tag':
              beam_fn_api_pb2.Target.List(target=[
                  beam_fn_api_pb2.Target(
                      primitive_transform_reference=transform_index_to_id[
                          input_op_index],
                      name=output_tags(map_task[input_op_index][1])[
                          input_output_index])
                  for input_op_index, input_output_index in operation.inputs
              ])
      }
      side_inputs = {}
    else:
      raise TypeError(operation)
    ptransform = beam_fn_api_pb2.PrimitiveTransform(
        id=transform_id,
        function_spec=fn,
        step_name=stage_name,
        inputs=inputs,
        side_inputs=side_inputs,
        outputs=outputs(operation))
    transforms.append(ptransform)

  process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
      id=self._next_uid(),
      coders=coders.values(),
      primitive_transform=transforms)
  return beam_fn_api_pb2.InstructionRequest(
      instruction_id=self._next_uid(),
      register=beam_fn_api_pb2.RegisterRequest(
          process_bundle_descriptor=[process_bundle_descriptor])
  ), runner_sinks, input_data
from apache_beam.transforms.window import GlobalWindows
from apache_beam.utils.windowed_value import WindowedValue

# Allow some "pure mode" declarations.
try:
  import cython
except ImportError:

  class FakeCython(object):
    @staticmethod
    def cast(type, value):
      return value

  globals()['cython'] = FakeCython()


_globally_windowed_value = GlobalWindows.windowed_value(None)
_global_window_type = type(_globally_windowed_value.windows[0])


class ConsumerSet(Receiver):
  """A ConsumerSet represents a graph edge between two Operation nodes.

  The ConsumerSet object collects information from the output of the
  Operation at one end of its edge and the input of the Operation at
  the other edge.
  ConsumerSet are attached to the outputting Operation.
  """

  def __init__(self, counter_factory, step_name, output_index, consumers,
               coder):
    self.consumers = consumers
    self.opcounter = opcounters.OperationCounters(counter_factory,
def set(self, ts):
  from apache_beam.transforms.window import GlobalWindows
  self._receiver.receive(
      GlobalWindows.windowed_value(
          (self._key, dict(timestamp=timestamp.Timestamp.of(ts)))))
def process_timer(self, timer_firing):
  """Default process_timer() impl. generating KeyedWorkItem element."""
  self.process_element(
      GlobalWindows.windowed_value(
          KeyedWorkItem(timer_firing.encoded_key,
                        timer_firings=[timer_firing])))
from apache_beam.transforms.combiners import curry_combine_fn
from apache_beam.transforms.window import GlobalWindows
from apache_beam.utils.windowed_value import WindowedValue

# Allow some "pure mode" declarations.
try:
  import cython
except ImportError:

  class FakeCython(object):
    @staticmethod
    def cast(type, value):
      return value

  globals()['cython'] = FakeCython()


_globally_windowed_value = GlobalWindows.windowed_value(None)
_global_window_type = type(_globally_windowed_value.windows[0])


class ConsumerSet(Receiver):
  """A ConsumerSet represents a graph edge between two Operation nodes.

  The ConsumerSet object collects information from the output of the
  Operation at one end of its edge and the input of the Operation at
  the other edge.
  ConsumerSet are attached to the outputting Operation.
  """

  def __init__(
      self, counter_factory, step_name, output_index, consumers, coder):
    self.consumers = consumers
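The module-level _globally_windowed_value / _global_window_type pair above relies on GlobalWindows.windowed_value() always placing the value in the single global window. A small illustrative sketch of that behavior, not taken from the snippet itself and assuming a recent apache_beam SDK:

# Illustrative sketch: GlobalWindows.windowed_value() wraps a value in the one
# GlobalWindow, which is why the snippet above can cache type(wv.windows[0])
# for cheap window-type checks.
from apache_beam.transforms.window import GlobalWindow, GlobalWindows

wv = GlobalWindows.windowed_value('element')
assert wv.value == 'element'
assert list(wv.windows) == [GlobalWindow()]  # exactly one window: the global one
assert type(wv.windows[0]) is GlobalWindow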
def finish_bundle(self):
  assert len(self._outputs) == 1
  output_pcollection = list(self._outputs)[0]
  bundle = self._evaluation_context.create_bundle(output_pcollection)
  bundle.output(GlobalWindows.windowed_value(b''))
  return TransformResult(self, [bundle], [], None, None)
def process(self, source):
  if isinstance(source, iobase.SourceBundle):
    for value in source.source.read(
        source.source.get_range_tracker(
            source.start_position, source.stop_position)):
      yield value
  else:
    # Dataflow native source
    with source.reader() as reader:
      for value in reader:
        yield value


# See DataflowRunner._pardo_fn_data
OLDE_SOURCE_SPLITTABLE_DOFN_DATA = pickler.dumps(
    (OldeSourceSplittableDoFn(), (), {}, [],
     beam.transforms.core.Windowing(GlobalWindows())))


class _GroupingBuffer(object):
  """Used to accumulate grouped (shuffled) results."""

  def __init__(self, pre_grouped_coder, post_grouped_coder, windowing):
    self._key_coder = pre_grouped_coder.key_coder()
    self._pre_grouped_coder = pre_grouped_coder
    self._post_grouped_coder = post_grouped_coder
    self._table = collections.defaultdict(list)
    self._windowing = windowing

  def append(self, elements_data):
    input_stream = create_InputStream(elements_data)
    while input_stream.size() > 0:
      windowed_key_value = self._pre_grouped_coder.get_impl(
def _test(self, trigger, lateness, expected):
  windowing = WindowInto(
      GlobalWindows(),
      trigger=trigger,
      accumulation_mode=AccumulationMode.ACCUMULATING,
      allowed_lateness=lateness).windowing
  self.assertEqual(trigger.may_lose_data(windowing), expected)