Ejemplo n.º 1
0
  def test_always(self):
    # Fans each input integer t out to ('A', t) and ('B', t + 5), stamps every
    # pair with its own integer value, windows into FixedWindows(10) with an
    # Always() trigger in discarding mode, groups by key, and checks the
    # formatted groups.

    def construct_timestamped(k_t):
      # The second component of the pair doubles as the element timestamp.
      key, stamp = k_t
      return TimestampedValue((key, stamp), stamp)

    def format_result(k_v):
      # Label is "<key>-<number of grouped values>"; the grouped values are
      # collapsed into a set, so duplicates count toward the label only.
      key, grouped = k_v
      return (f'{key}-{len(grouped)}', set(grouped))

    expected = [
        ('A-2', {10, 11}),
        # Elements out of windows are also emitted.
        # ('A', 1) is created twice: six grouped values, five distinct.
        ('A-6', {1, 2, 3, 4, 5}),
        # ('B', 6) is created twice: five grouped values, four distinct.
        ('B-5', {6, 7, 8, 9}),
        ('B-3', {10, 15, 16}),
    ]

    with TestPipeline() as p:
      keyed = (
          p
          | beam.Create([1, 1, 2, 3, 4, 5, 10, 11])
          | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)]))
      windowed = (
          keyed
          | beam.Map(construct_timestamped)
          | beam.WindowInto(
              FixedWindows(10),
              trigger=Always(),
              accumulation_mode=AccumulationMode.DISCARDING))
      result = windowed | beam.GroupByKey() | beam.Map(format_result)
      assert_that(result, equal_to(expected))
Ejemplo n.º 2
0
    def expand(self, pcoll):
        """Group ``pcoll`` by key and re-emit every element individually.

        Each input element is unpacked as a ``(key, value)`` pair. Before the
        grouping, the element's timestamp (and, for non-default windowing,
        its window) is packed into the value; after the grouping, it is
        unpacked again and each ``(key, value)`` is emitted as a windowed
        value carrying that timestamp/window.

        The grouping itself runs under a temporary windowing (global window,
        ``Always()`` trigger, discarding accumulation); the input's original
        windowing is put back on the returned PCollection.

        Args:
            pcoll: The input PCollection of ``(key, value)`` pairs.

        Returns:
            The PCollection produced by the group-and-restore steps, with
            ``_windowing`` set to the input's saved windowing.
        """
        # Remember the input windowing so it can be reinstated on the result.
        windowing_saved = pcoll.windowing
        if windowing_saved.is_default():
            # In this (common) case we can use a trivial trigger driver
            # and avoid the (expensive) window param.
            # Built once and reused (via with_value) for every element whose
            # timestamp was recorded as None.
            globally_windowed = window.GlobalWindows.windowed_value(None)
            MIN_TIMESTAMP = window.MIN_TIMESTAMP

            def reify_timestamps(element, timestamp=DoFn.TimestampParam):
                # Pair the value with its timestamp; MIN_TIMESTAMP is
                # recorded as None.
                key, value = element
                if timestamp == MIN_TIMESTAMP:
                    timestamp = None
                return key, (value, timestamp)

            def restore_timestamps(element):
                # Rebuild one globally-windowed value per grouped entry:
                # reuse the prebuilt template when the timestamp is None,
                # otherwise create a windowed value with that timestamp.
                key, values = element
                return [
                    globally_windowed.with_value((key, value)) if
                    timestamp is None else window.GlobalWindows.windowed_value(
                        (key, value), timestamp)
                    for (value, timestamp) in values
                ]
        else:

            # typing: All conditional function variants must have identical signatures
            def reify_timestamps(  # type: ignore[misc]
                    element,
                    timestamp=DoFn.TimestampParam,
                    window=DoFn.WindowParam):
                # NOTE: the `window` parameter shadows the `window` module
                # inside this function only.
                key, value = element
                # Transport the window as part of the value and restore it later.
                return key, windowed_value.WindowedValue(
                    value, timestamp, [window])

            def restore_timestamps(element):
                # Each grouped entry is already a WindowedValue; swap its
                # payload for the (key, value) pair.
                key, windowed_values = element
                return [
                    wv.with_value((key, wv.value)) for wv in windowed_values
                ]

        # Whichever reify_timestamps variant was defined above is applied here.
        ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

        # TODO(BEAM-8104) Using global window as one of the standard window.
        # This is to mitigate the Dataflow Java Runner Harness limitation to
        # accept only standard coders.
        # Override the windowing seen by the GroupByKey below.
        ungrouped._windowing = Windowing(
            window.GlobalWindows(),
            triggerfn=Always(),
            accumulation_mode=AccumulationMode.DISCARDING,
            timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
        result = (ungrouped
                  | GroupByKey()
                  | FlatMap(restore_timestamps).with_output_types(Any))
        # Reinstate the windowing the input originally had.
        result._windowing = windowing_saved
        return result
Ejemplo n.º 3
0
 def test_always(self):
     # Runs the shared _test helper with an Always() trigger, the value 0, and
     # DataLossReason.NO_POTENTIAL_LOSS.
     # NOTE(review): presumably the last argument is the expected outcome and
     # 0 is a lateness/duration setting -- confirm against _test, which is
     # not visible here.
     trigger = Always()
     expected_reason = DataLossReason.NO_POTENTIAL_LOSS
     self._test(trigger, 0, expected_reason)