def test_always(self):
  with TestPipeline() as p:

    def construct_timestamped(k_t):
      return TimestampedValue((k_t[0], k_t[1]), k_t[1])

    def format_result(k_v):
      return ('%s-%s' % (k_v[0], len(k_v[1])), set(k_v[1]))

    result = (
        p
        | beam.Create([1, 1, 2, 3, 4, 5, 10, 11])
        | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
        | beam.Map(construct_timestamped)
        | beam.WindowInto(
            FixedWindows(10),
            trigger=Always(),
            accumulation_mode=AccumulationMode.DISCARDING)
        | beam.GroupByKey()
        | beam.Map(format_result))
    assert_that(
        result,
        equal_to(
            list({
                'A-2': {10, 11},  # Elements out of windows are also emitted.
                'A-6': {1, 2, 3, 4, 5},  # A,1 is emitted twice.
                'B-5': {6, 7, 8, 9},  # B,6 is emitted twice.
                'B-3': {10, 15, 16},
            }.items())))
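To run this test method standalone, it needs the usual Beam trigger-test imports. A minimal sketch follows; all names come from apache_beam's public API, but the grouping is assumed rather than copied from the original test module:

# Imports assumed for the test above (a sketch; the original test module
# may organize them differently).
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to
from apache_beam.transforms.trigger import AccumulationMode, Always
from apache_beam.transforms.window import FixedWindows, TimestampedValue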
def expand(self, pcoll):
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case we can use a trivial trigger driver
    # and avoid the (expensive) window param.
    globally_windowed = window.GlobalWindows.windowed_value(None)
    MIN_TIMESTAMP = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      if timestamp == MIN_TIMESTAMP:
        timestamp = None
      return key, (value, timestamp)

    def restore_timestamps(element):
      key, values = element
      return [
          globally_windowed.with_value((key, value)) if timestamp is None else
          window.GlobalWindows.windowed_value((key, value), timestamp)
          for (value, timestamp) in values
      ]
  else:

    # typing: All conditional function variants must have identical signatures.
    def reify_timestamps(  # type: ignore[misc]
        element, timestamp=DoFn.TimestampParam, window=DoFn.WindowParam):
      key, value = element
      # Transport the window as part of the value and restore it later.
      return key, windowed_value.WindowedValue(value, timestamp, [window])

    def restore_timestamps(element):
      key, windowed_values = element
      return [wv.with_value((key, wv.value)) for wv in windowed_values]

  ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

  # TODO(BEAM-8104): Use the global window as one of the standard windows.
  # This mitigates the Dataflow Java Runner Harness limitation of
  # accepting only standard coders.
  ungrouped._windowing = Windowing(
      window.GlobalWindows(),
      triggerfn=Always(),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  result = (
      ungrouped
      | GroupByKey()
      | FlatMap(restore_timestamps).with_output_types(Any))
  result._windowing = windowing_saved
  return result
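This `expand` reifies each element's timestamp (and, for non-default windowing, its window) into the value, groups under a global window with an Always() trigger so the GroupByKey acts as a pass-through shuffle, then restores the original timestamps, windows, and windowing strategy. A minimal usage sketch, assuming the method belongs to Beam's ReshufflePerKey transform in apache_beam.transforms.util (an assumption; the class name is not shown above):

import apache_beam as beam
from apache_beam.transforms.util import ReshufflePerKey

# ReshufflePerKey is semantically an identity on (key, value) pairs; its
# value is operational: it materializes the data and breaks fusion so
# downstream work can be redistributed across workers.
with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('a', 1), ('a', 2), ('b', 3)])
      | ReshufflePerKey()
      | beam.Map(print))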
def test_always(self):
  self._test(Always(), 0, DataLossReason.NO_POTENTIAL_LOSS)
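The `_test` helper is not shown here. Based on Beam's TriggerFn.may_lose_data API, it presumably builds a Windowing with the given trigger and allowed lateness and asserts the reported DataLossReason. A hedged, self-contained equivalent of the check (assuming the may_lose_data(windowing) signature and the Windowing allowed_lateness keyword from recent Beam releases):

from apache_beam.transforms.core import Windowing
from apache_beam.transforms.trigger import (
    AccumulationMode, Always, DataLossReason)
from apache_beam.transforms.window import GlobalWindows

# Always() fires on every element and never "finishes", so no pane can be
# silently dropped: its data-loss analysis should report no potential loss.
windowing = Windowing(
    GlobalWindows(),
    triggerfn=Always(),
    accumulation_mode=AccumulationMode.DISCARDING,
    allowed_lateness=0)
assert Always().may_lose_data(windowing) == DataLossReason.NO_POTENTIAL_LOSS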