Example 1
File: util.py Project: raznem/beam
  def expand(self, pcoll):
    windowing_saved = pcoll.windowing
    if windowing_saved.is_default():
      # In this (common) case we can use a trivial trigger driver
      # and avoid the (expensive) window param.
      globally_windowed = window.GlobalWindows.windowed_value(None)
      MIN_TIMESTAMP = window.MIN_TIMESTAMP

      def reify_timestamps(element, timestamp=DoFn.TimestampParam):
        key, value = element
        if timestamp == MIN_TIMESTAMP:
          timestamp = None
        return key, (value, timestamp)

      def restore_timestamps(element):
        key, values = element
        return [
            globally_windowed.with_value((key, value)) if timestamp is None else
            window.GlobalWindows.windowed_value((key, value), timestamp)
            for (value, timestamp) in values
        ]
    else:

      # typing: All conditional function variants must have identical signatures
      def reify_timestamps(  # type: ignore[misc]
          element, timestamp=DoFn.TimestampParam, window=DoFn.WindowParam):
        key, value = element
        # Transport the window as part of the value and restore it later.
        return key, windowed_value.WindowedValue(value, timestamp, [window])

      def restore_timestamps(element):
        key, windowed_values = element
        return [wv.with_value((key, wv.value)) for wv in windowed_values]

    ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

    # TODO(BEAM-8104): Use the global window as one of the standard windows.
    # This mitigates the Dataflow Java Runner Harness limitation of
    # accepting only standard coders.
    ungrouped._windowing = Windowing(
        window.GlobalWindows(),
        triggerfn=Always(),
        accumulation_mode=AccumulationMode.DISCARDING,
        timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
    result = (
        ungrouped
        | GroupByKey()
        | FlatMap(restore_timestamps).with_output_types(Any))
    result._windowing = windowing_saved
    return result
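
This `expand` method is the body of Beam's `ReshufflePerKey` transform. In user code it is normally reached through `beam.Reshuffle`, which assigns random keys, applies `ReshufflePerKey`, and strips the keys again. A minimal usage sketch, not part of the original file and assuming only a standard `apache_beam` install:

import apache_beam as beam

with beam.Pipeline() as p:
  redistributed = (
      p
      | beam.Create([('a', 1), ('a', 2), ('b', 3)])
      # Reshuffle acts as a fusion break and redistributes elements while
      # preserving their timestamps and windows, as implemented above.
      | beam.Reshuffle())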
Example 2
  def test_globally(self):
    l = [window.TimestampedValue(3, 100),
         window.TimestampedValue(1, 200),
         window.TimestampedValue(2, 300)]
    with TestPipeline() as p:
      # Map(lambda x: x) PTransform is added after Create here, because when
      # a PCollection of TimestampedValues is created with Create PTransform,
      # the timestamps are not assigned to it. Adding a Map forces the
      # PCollection to go through a DoFn so that the PCollection consists of
      # the elements with timestamps assigned to them instead of a PCollection
      # of TimestampedValue(element, timestamp).
      pc = p | Create(l) | Map(lambda x: x)
      latest = pc | combine.Latest.Globally()
      assert_that(latest, equal_to([2]))
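
The same combiner has a keyed counterpart, `combine.Latest.PerKey()`, which picks, per key, the value carrying the latest timestamp. A small self-contained sketch of how it could be exercised (not taken from the test file above; imports spelled out so it runs on the direct runner):

from apache_beam import Create, Map
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to
from apache_beam.transforms import combiners as combine
from apache_beam.transforms import window

l = [window.TimestampedValue(('a', 1), 300),
     window.TimestampedValue(('b', 3), 100),
     window.TimestampedValue(('a', 2), 200)]
with TestPipeline() as p:
  # As in the test above, the extra Map forces timestamp assignment.
  pc = p | Create(l) | Map(lambda x: x)
  latest = pc | combine.Latest.PerKey()
  assert_that(latest, equal_to([('a', 1), ('b', 3)]))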
Example 3
    def test_top(self):
        with TestPipeline() as pipeline:
            timestamp = 0

            # First for global combines.
            pcoll = pipeline | 'start' >> Create(
                [6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
            result_top = pcoll | 'top' >> combine.Top.Largest(5)
            result_bot = pcoll | 'bot' >> combine.Top.Smallest(4)
            assert_that(result_top,
                        equal_to([[9, 6, 6, 5, 3]]),
                        label='assert:top')
            assert_that(result_bot,
                        equal_to([[0, 1, 1, 1]]),
                        label='assert:bot')

            # Now for global combines without default
            timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
            windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
            result_windowed_top = windowed | 'top-wo-defaults' >> combine.Top.Largest(
                5, has_defaults=False)
            result_windowed_bot = (windowed
                                   | 'bot-wo-defaults' >> combine.Top.Smallest(
                                       4, has_defaults=False))
            assert_that(result_windowed_top,
                        equal_to([[9, 6, 6, 5, 3]]),
                        label='assert:top-wo-defaults')
            assert_that(result_windowed_bot,
                        equal_to([[0, 1, 1, 1]]),
                        label='assert:bot-wo-defaults')

            # Again for per-key combines.
            pcoll = pipeline | 'start-perkey' >> Create(
                [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
            result_key_top = pcoll | 'top-perkey' >> combine.Top.LargestPerKey(
                5)
            result_key_bot = pcoll | 'bot-perkey' >> combine.Top.SmallestPerKey(
                4)
            assert_that(result_key_top,
                        equal_to([('a', [9, 6, 6, 5, 3])]),
                        label='key:top')
            assert_that(result_key_bot,
                        equal_to([('a', [0, 1, 1, 1])]),
                        label='key:bot')
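
`Largest` and `Smallest` are convenience wrappers around the general form `combine.Top.Of`, which in current Beam releases also accepts an optional `key` function. A short sketch of that form, assuming the same imports and the same `with TestPipeline() as pipeline:` harness as the test above (the `key=len` argument is the illustrative part, not something the original test uses):

            words = pipeline | 'start-words' >> Create(['a', 'bb', 'ccc', 'dddd'])
            longest = words | 'top-by-len' >> combine.Top.Of(2, key=len)
            assert_that(longest,
                        equal_to([['dddd', 'ccc']]),
                        label='assert:len')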
Example 4
    def test_avro_it(self):
        num_records = self.test_pipeline.get_option('records')
        num_records = int(num_records) if num_records else 1000000
        fastavro_output = '/'.join([self.output, 'fastavro'])

        # Seed a `PCollection` with indices that will each be FlatMap'd into
        # `batch_size` records, to avoid having a too-large list in memory at
        # the outset
        batch_size = self.test_pipeline.get_option('batch-size')
        batch_size = int(batch_size) if batch_size else 10000

        # pylint: disable=bad-option-value
        batches = range(int(num_records / batch_size))

        def batch_indices(start):
            # pylint: disable=bad-option-value
            return range(start * batch_size, (start + 1) * batch_size)

        # A `PCollection` with `num_records` avro records
        records_pcoll = \
            self.test_pipeline \
            | 'create-batches' >> Create(batches) \
            | 'expand-batches' >> FlatMap(batch_indices) \
            | 'create-records' >> Map(record)

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            parse_schema(json.loads(self.SCHEMA_STRING)),
        )
        result = self.test_pipeline.run()
        result.wait_until_finish()
        fastavro_pcoll = self.test_pipeline \
                         | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
                         | 'read-fastavro' >> ReadAllFromAvro()

        mapped_fastavro_pcoll = fastavro_pcoll | "map_fastavro" >> Map(
            lambda x: (x['number'], x))
        mapped_record_pcoll = records_pcoll | "map_record" >> Map(
            lambda x: (x['number'], x))

        def validate_record(elem):
            v = elem[1]

            def assertEqual(l, r):
                if l != r:
                    raise BeamAssertException('Assertion failed: %s == %s' %
                                              (l, r))

            assertEqual(sorted(v.keys()), ['fastavro', 'record_pcoll'])
            record_pcoll_values = v['record_pcoll']
            fastavro_values = v['fastavro']
            assertEqual(record_pcoll_values, fastavro_values)
            assertEqual(len(record_pcoll_values), 1)

        {
            "record_pcoll": mapped_record_pcoll,
            "fastavro": mapped_fastavro_pcoll
        } | CoGroupByKey() | Map(validate_record)

        result = self.test_pipeline.run()
        result.wait_until_finish()

        self.addCleanup(delete_files, [self.output])
        assert result.state == PipelineState.DONE
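
The `record` callable used in the 'create-records' step, and the `SCHEMA_STRING` it must match, are not part of this excerpt. A hypothetical stand-in, shown only to make the data flow concrete; the real test builds richer records, and only the 'number' field is actually required by the `CoGroupByKey` join above:

def record(i):
    # Hypothetical minimal record; the actual Avro schema (SCHEMA_STRING) is
    # not shown in this excerpt. 'number' is the key used for the join.
    return {'number': i}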