def expand(self, pcoll):
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case we can use a trivial trigger driver
    # and avoid the (expensive) window param.
    globally_windowed = window.GlobalWindows.windowed_value(None)
    MIN_TIMESTAMP = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      if timestamp == MIN_TIMESTAMP:
        timestamp = None
      return key, (value, timestamp)

    def restore_timestamps(element):
      key, values = element
      return [
          globally_windowed.with_value((key, value)) if timestamp is None else
          window.GlobalWindows.windowed_value((key, value), timestamp)
          for (value, timestamp) in values
      ]
  else:
    # typing: All conditional function variants must have identical signatures
    def reify_timestamps(  # type: ignore[misc]
        element, timestamp=DoFn.TimestampParam, window=DoFn.WindowParam):
      key, value = element
      # Transport the window as part of the value and restore it later.
      return key, windowed_value.WindowedValue(value, timestamp, [window])

    def restore_timestamps(element):
      key, windowed_values = element
      return [wv.with_value((key, wv.value)) for wv in windowed_values]

  ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

  # TODO(BEAM-8104) Use the global window as one of the standard windows.
  # This is to mitigate the Dataflow Java Runner Harness limitation to
  # accept only standard coders.
  ungrouped._windowing = Windowing(
      window.GlobalWindows(),
      triggerfn=Always(),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  result = (
      ungrouped
      | GroupByKey()
      | FlatMap(restore_timestamps).with_output_types(Any))
  result._windowing = windowing_saved
  return result
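To see the reify/restore round-trip in isolation, here is a minimal, self-contained sketch (not part of this module; the helper names `reify` and `restore` and the sample data are illustrative) that carries element timestamps through a GroupByKey as plain data and reattaches them afterwards:

import apache_beam as beam
from apache_beam.transforms import window
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

def reify(element, timestamp=beam.DoFn.TimestampParam):
  # Capture the element timestamp and tuck it into the value.
  key, value = element
  return key, (value, timestamp)

def restore(element):
  # Re-emit each value with its saved timestamp reattached.
  key, values = element
  return [
      window.TimestampedValue((key, value), timestamp)
      for value, timestamp in values
  ]

with TestPipeline() as p:
  roundtripped = (
      p
      | beam.Create([
          window.TimestampedValue(('k', 'a'), 100),
          window.TimestampedValue(('k', 'b'), 200),
      ])
      | 'assign-ts' >> beam.Map(lambda x: x)  # force timestamp assignment
      | 'reify' >> beam.Map(reify)
      | beam.GroupByKey()
      | 'restore' >> beam.FlatMap(restore)
      | 'observe-ts' >> beam.Map(
          lambda kv, ts=beam.DoFn.TimestampParam: (kv, ts.micros // 1000000)))
  assert_that(
      roundtripped, equal_to([(('k', 'a'), 100), (('k', 'b'), 200)]))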
def test_globally(self):
  l = [
      window.TimestampedValue(3, 100),
      window.TimestampedValue(1, 200),
      window.TimestampedValue(2, 300)
  ]
  with TestPipeline() as p:
    # Map(lambda x: x) is added after Create here because, when a
    # PCollection of TimestampedValues is created with the Create PTransform,
    # timestamps are not assigned to its elements. Adding a Map forces the
    # PCollection to go through a DoFn, so the resulting PCollection contains
    # elements with their timestamps assigned rather than raw
    # TimestampedValue(element, timestamp) objects.
    pc = p | Create(l) | Map(lambda x: x)

    latest = pc | combine.Latest.Globally()
    assert_that(latest, equal_to([2]))
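The same Create-then-Map trick applies to the per-key variant of this combiner. A minimal sketch (the keys and values here are illustrative) using combiners.Latest.PerKey, which keeps the value with the greatest timestamp for each key:

import apache_beam as beam
from apache_beam.transforms import combiners, window
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
  latest = (
      p
      | beam.Create([
          window.TimestampedValue(('k1', 'old'), 100),
          window.TimestampedValue(('k1', 'new'), 300),
          window.TimestampedValue(('k2', 'only'), 200),
      ])
      # Same trick as above: Map forces timestamp assignment after Create.
      | beam.Map(lambda x: x)
      | combiners.Latest.PerKey())
  assert_that(latest, equal_to([('k1', 'new'), ('k2', 'only')]))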
def test_top(self):
  with TestPipeline() as pipeline:
    timestamp = 0

    # First for global combines.
    pcoll = pipeline | 'start' >> Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
    result_top = pcoll | 'top' >> combine.Top.Largest(5)
    result_bot = pcoll | 'bot' >> combine.Top.Smallest(4)
    assert_that(result_top, equal_to([[9, 6, 6, 5, 3]]), label='assert:top')
    assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')

    # Now for global combines without defaults.
    timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
    windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
    result_windowed_top = (
        windowed
        | 'top-wo-defaults' >> combine.Top.Largest(5, has_defaults=False))
    result_windowed_bot = (
        windowed
        | 'bot-wo-defaults' >> combine.Top.Smallest(4, has_defaults=False))
    assert_that(
        result_windowed_top,
        equal_to([[9, 6, 6, 5, 3]]),
        label='assert:top-wo-defaults')
    assert_that(
        result_windowed_bot,
        equal_to([[0, 1, 1, 1]]),
        label='assert:bot-wo-defaults')

    # Again for per-key combines.
    pcoll = pipeline | 'start-perkey' >> Create(
        [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
    result_key_top = pcoll | 'top-perkey' >> combine.Top.LargestPerKey(5)
    result_key_bot = pcoll | 'bot-perkey' >> combine.Top.SmallestPerKey(4)
    assert_that(
        result_key_top, equal_to([('a', [9, 6, 6, 5, 3])]), label='key:top')
    assert_that(
        result_key_bot, equal_to([('a', [0, 1, 1, 1])]), label='key:bot')
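Largest and Smallest are thin wrappers over Top.Of, which also accepts a key function. A minimal sketch, assuming the Top.Of(n, key=...) form available in current Beam releases (the sample words are illustrative):

import apache_beam as beam
from apache_beam.transforms import combiners
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
  words = p | beam.Create(['a', 'bbb', 'cc', 'dddd'])
  # Top 2 words by length, returned as one list in descending key order.
  longest = words | combiners.Top.Of(2, key=len)
  assert_that(longest, equal_to([['dddd', 'bbb']]))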
def test_avro_it(self):
  num_records = self.test_pipeline.get_option('records')
  num_records = int(num_records) if num_records else 1000000
  fastavro_output = '/'.join([self.output, 'fastavro'])

  # Seed a `PCollection` with indices that will each be FlatMap'd into
  # `batch_size` records, to avoid holding a too-large list in memory at
  # the outset.
  batch_size = self.test_pipeline.get_option('batch-size')
  batch_size = int(batch_size) if batch_size else 10000

  # pylint: disable=bad-option-value
  batches = range(int(num_records / batch_size))

  def batch_indices(start):
    # pylint: disable=bad-option-value
    return range(start * batch_size, (start + 1) * batch_size)

  # A `PCollection` with `num_records` avro records.
  records_pcoll = (
      self.test_pipeline
      | 'create-batches' >> Create(batches)
      | 'expand-batches' >> FlatMap(batch_indices)
      | 'create-records' >> Map(record))

  # pylint: disable=expression-not-assigned
  records_pcoll | 'write_fastavro' >> WriteToAvro(
      fastavro_output,
      parse_schema(json.loads(self.SCHEMA_STRING)),
  )
  result = self.test_pipeline.run()
  result.wait_until_finish()

  fastavro_pcoll = (
      self.test_pipeline
      | 'create-fastavro' >> Create(['%s*' % fastavro_output])
      | 'read-fastavro' >> ReadAllFromAvro())

  mapped_fastavro_pcoll = fastavro_pcoll | 'map_fastavro' >> Map(
      lambda x: (x['number'], x))
  mapped_record_pcoll = records_pcoll | 'map_record' >> Map(
      lambda x: (x['number'], x))

  def validate_record(elem):
    v = elem[1]

    def assertEqual(l, r):
      if l != r:
        raise BeamAssertException('Assertion failed: %s == %s' % (l, r))

    assertEqual(sorted(v.keys()), ['fastavro', 'record_pcoll'])
    record_pcoll_values = v['record_pcoll']
    fastavro_values = v['fastavro']
    assertEqual(record_pcoll_values, fastavro_values)
    assertEqual(len(record_pcoll_values), 1)

  # Pair each generated record with the record read back from disk,
  # keyed by its 'number' field, and check they match one-to-one.
  {
      'record_pcoll': mapped_record_pcoll,
      'fastavro': mapped_fastavro_pcoll
  } | CoGroupByKey() | Map(validate_record)

  result = self.test_pipeline.run()
  result.wait_until_finish()
  self.addCleanup(delete_files, [self.output])
  assert result.state == PipelineState.DONE
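The test assumes a module-level SCHEMA_STRING and a record() helper defined elsewhere in the test file. A hypothetical stand-in for both (field names beyond 'number' are invented; the pipeline only relies on the 'number' field that the validation step keys on) might look like:

import json

# Hypothetical schema: one long field for the join key, one string payload.
SCHEMA_STRING = json.dumps({
    'namespace': 'example.avro',
    'type': 'record',
    'name': 'Record',
    'fields': [
        {'name': 'number', 'type': 'long'},
        {'name': 'label', 'type': 'string'},
    ]
})

def record(i):
  # One dict per index; CoGroupByKey later joins written and re-read
  # records on the 'number' field.
  return {'number': i, 'label': 'label%d' % i}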