Example #1
 def test_global_window(self):
   self.assertEqual(GlobalWindow(), GlobalWindow())
   self.assertNotEqual(GlobalWindow(),
                       IntervalWindow(MIN_TIMESTAMP, MAX_TIMESTAMP))
   self.assertNotEqual(IntervalWindow(MIN_TIMESTAMP, MAX_TIMESTAMP),
                       GlobalWindow())
   self.assertTrue(GlobalWindow().max_timestamp() < MAX_TIMESTAMP)
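
The test above relies on two properties of GlobalWindow: any two instances compare equal, and the window's maximum timestamp falls strictly before MAX_TIMESTAMP. A minimal standalone sketch of exactly those properties (assuming only that apache_beam is installed):

# Sketch: the GlobalWindow properties exercised by the test above.
from apache_beam.transforms.window import GlobalWindow
from apache_beam.utils.timestamp import MAX_TIMESTAMP

assert GlobalWindow() == GlobalWindow()  # any two instances are equal
assert GlobalWindow().max_timestamp() < MAX_TIMESTAMP
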
Example #2
  def test_windowedvalue_coder_paneinfo(self):
    coder = coders.WindowedValueCoder(coders.VarIntCoder(),
                                      coders.GlobalWindowCoder())
    test_paneinfo_values = [
        windowed_value.PANE_INFO_UNKNOWN,
        windowed_value.PaneInfo(
            True, True, windowed_value.PaneInfoTiming.EARLY, 0, -1),
        windowed_value.PaneInfo(
            True, False, windowed_value.PaneInfoTiming.ON_TIME, 0, 0),
        windowed_value.PaneInfo(
            True, False, windowed_value.PaneInfoTiming.ON_TIME, 10, 0),
        windowed_value.PaneInfo(
            False, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 23),
        windowed_value.PaneInfo(
            False, True, windowed_value.PaneInfoTiming.ON_TIME, 12, 23),
        windowed_value.PaneInfo(
            False, False, windowed_value.PaneInfoTiming.LATE, 0, 123),]

    test_values = [windowed_value.WindowedValue(123, 234, (GlobalWindow(),), p)
                   for p in test_paneinfo_values]

    # Test unnested.
    self.check_coder(coder, windowed_value.WindowedValue(
        123, 234, (GlobalWindow(),), windowed_value.PANE_INFO_UNKNOWN))
    for value in test_values:
      self.check_coder(coder, value)

    # Test nested.
    for value1 in test_values:
      for value2 in test_values:
        self.check_coder(coders.TupleCoder((coder, coder)), (value1, value2))
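
The positional PaneInfo arguments above are easier to read in their keyword form, which Example #10 below also uses: (is_first, is_last, timing, index, nonspeculative_index). A short sketch making the first entry of test_paneinfo_values explicit:

# Keyword form of PaneInfo(True, True, PaneInfoTiming.EARLY, 0, -1) above.
from apache_beam.utils import windowed_value

pane = windowed_value.PaneInfo(
    is_first=True,
    is_last=True,
    timing=windowed_value.PaneInfoTiming.EARLY,
    index=0,
    nonspeculative_index=-1)
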
Example #3
    def test_parse_windowedvalue_with_dicts(self):
        """Tests that dicts play well with WindowedValues.
    """
        from apache_beam.transforms.window import GlobalWindow

        els = [
            WindowedValue({
                'b': 2,
                'd': 4
            }, 1, [GlobalWindow()]),
            WindowedValue({
                'a': 1,
                'b': 2,
                'c': 3
            }, 1, [GlobalWindow()])
        ]

        actual_df = utils.elements_to_df(els, include_window_info=True)
        expected_df = pd.DataFrame(
            [[
                np.nan, 2, np.nan, 4,
                int(1e6), els[0].windows, els[0].pane_info
            ], [1, 2, 3, np.nan,
                int(1e6), els[1].windows, els[1].pane_info]],
            columns=['a', 'b', 'c', 'd', 'event_time', 'windows', 'pane_info'])
        pd.testing.assert_frame_equal(actual_df, expected_df)
Example #4
 def test_shard_naming(self):
     namer = fileio.default_file_naming(prefix='/path/to/file',
                                        suffix='.txt')
     self.assertEqual(namer(GlobalWindow(), None, None, None, None, None),
                      '/path/to/file.txt')
     self.assertEqual(namer(GlobalWindow(), None, 1, 5, None, None),
                      '/path/to/file-00001-of-00005.txt')
     self.assertEqual(namer(GlobalWindow(), None, 1, 5, 'gz', None),
                      '/path/to/file-00001-of-00005.txt.gz')
     self.assertEqual(
         namer(IntervalWindow(0, 100), None, 1, 5, None, None),
         '/path/to/file'
         '-1970-01-01T00:00:00-1970-01-01T00:01:40-00001-of-00005.txt')
Example #5
    def test_parse_windowedvalue(self):
        """Tests that WindowedValues are supported but not present.
    """
        from apache_beam.transforms.window import GlobalWindow

        els = [
            WindowedValue(('a', 2), 1, [GlobalWindow()]),
            WindowedValue(('b', 3), 1, [GlobalWindow()])
        ]

        actual_df = utils.elements_to_df(els, include_window_info=False)
        expected_df = pd.DataFrame([['a', 2], ['b', 3]], columns=[0, 1])
        pd.testing.assert_frame_equal(actual_df, expected_df)
Example #6
 def test_timestamp_in_value(self):
   l = [TimestampedValue(('a', 1), 100),
        TimestampedValue(('b', 2), 200),
        TimestampedValue(('c', 3), 300)]
   expected = [TestWindowedValue(('a', TimestampedValue(1, 100)), 100,
                                 [GlobalWindow()]),
               TestWindowedValue(('b', TimestampedValue(2, 200)), 200,
                                 [GlobalWindow()]),
               TestWindowedValue(('c', TimestampedValue(3, 300)), 300,
                                 [GlobalWindow()])]
   with TestPipeline() as p:
     pc = p | beam.Create(l) | beam.Map(lambda x: x)
     reified_pc = pc | util.Reify.TimestampInValue()
     assert_that(reified_pc, equal_to(expected), reify_windows=True)
Example #7
    def test_basic_wordcount(self):
        """A wordcount to be used as a smoke test."""

        # Create the pipeline that will emit 0, 1, 2.
        p = beam.Pipeline(InteractiveRunner())
        elems = p | beam.Create([0, 1, 2])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects. By calling `record`, a new
        # PipelineFragment is started to compute the given PCollections and
        # cache them to disk.
        rm = RecordingManager(p)
        recording = rm.record([elems], max_n=3, max_duration_secs=500)
        stream = recording.stream(elems)
        recording.wait_until_finish()

        # Once the pipeline fragment completes, we can read from the stream and know
        # that all elements were written to cache.
        elems = list(stream.read())
        expected_elems = [
            WindowedValue(i, MIN_TIMESTAMP, [GlobalWindow()]) for i in range(3)
        ]
        self.assertListEqual(elems, expected_elems)
Example #8
 def test_reified_value_assert_fail_unmatched_timestamp(self):
   expected = [TestWindowedValue(v, 1, [GlobalWindow()])
               for v in [1, 2, 3]]
   with self.assertRaises(Exception):
     with TestPipeline() as p:
       assert_that(p | Create([2, 3, 1]), equal_to(expected),
                   reify_windows=True)
Example #9
    def test_parse_windowedvalue_with_window_info(self):
        """Tests that WindowedValues are supported and have their own columns.
    """
        from apache_beam.transforms.window import GlobalWindow

        els = [
            WindowedValue(('a', 2), 1, [GlobalWindow()]),
            WindowedValue(('b', 3), 1, [GlobalWindow()])
        ]

        actual_df = utils.elements_to_df(els, include_window_info=True)
        expected_df = pd.DataFrame(
            [['a', 2, int(1e6), els[0].windows, els[0].pane_info],
             ['b', 3, int(1e6), els[1].windows, els[1].pane_info]],
            columns=[0, 1, 'event_time', 'windows', 'pane_info'])
        pd.testing.assert_frame_equal(actual_df, expected_df)
Example #10
class BatchGlobalTriggerDriver(TriggerDriver):
  """Groups all received values together.
  """
  GLOBAL_WINDOW_TUPLE = (GlobalWindow(), )
  ONLY_FIRING = windowed_value.PaneInfo(
      is_first=True,
      is_last=True,
      timing=windowed_value.PaneInfoTiming.ON_TIME,
      index=0,
      nonspeculative_index=0)

  def process_elements(
      self,
      state,
      windowed_values,
      unused_output_watermark,
      unused_input_watermark=MIN_TIMESTAMP):
    yield WindowedValue(
        _UnwindowedValues(windowed_values),
        MIN_TIMESTAMP,
        self.GLOBAL_WINDOW_TUPLE,
        self.ONLY_FIRING)

  def process_timer(
      self,
      window_id,
      name,
      time_domain,
      timestamp,
      state,
      input_watermark=None):
    raise TypeError('Triggers never set or called for batch default windowing.')
Example #11
  def _inner(window, pane, shard_index, total_shards, compression, destination):
    kwargs = {'prefix': prefix,
              'start': '',
              'end': '',
              'pane': '',
              'shard': 0,
              'total_shards': 0,
              'suffix': '',
              'compression': ''}
    if total_shards is not None and shard_index is not None:
      kwargs['shard'] = int(shard_index)
      kwargs['total_shards'] = int(total_shards)

    if window != GlobalWindow():
      kwargs['start'] = window.start.to_utc_datetime().isoformat()
      kwargs['end'] = window.end.to_utc_datetime().isoformat()

    # TODO(pabloem): Add support for PaneInfo
    # If the PANE is the ONLY firing in the window, we don't add it.
    #if pane and not (pane.is_first and pane.is_last):
    #  kwargs['pane'] = pane.index

    if compression:
      kwargs['compression'] = '.%s' % compression
    if suffix:
      kwargs['suffix'] = suffix

    return _DEFAULT_FILE_NAME_TEMPLATE.format(**kwargs)
Example #12
 def finish_bundle(self):
     from apache_beam.utils import timestamp
     from apache_beam.transforms.window import WindowedValue, GlobalWindow
     if len(self._buffer) != 0:
         logging.info("Final Buffer Length: {}".format(len(self._buffer)))
         yield WindowedValue(self._buffer, timestamp.MIN_TIMESTAMP,
                             [GlobalWindow()])
         self._buffer = []
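
The finish_bundle above flushes whatever remains in an element buffer when the bundle ends. For context, a hypothetical sketch of the DoFn it could belong to (the class name, buffer limit, and process() body are assumptions for illustration, not the original code):

import logging

import apache_beam as beam
from apache_beam.transforms.window import WindowedValue, GlobalWindow
from apache_beam.utils import timestamp

class _BufferingDoFn(beam.DoFn):  # hypothetical name
    def __init__(self, max_buffer_size=500):  # limit is an assumption
        self._max_buffer_size = max_buffer_size
        self._buffer = []

    def start_bundle(self):
        self._buffer = []

    def process(self, element):
        # Accumulate elements; emit a full batch as a single output element.
        self._buffer.append(element)
        if len(self._buffer) >= self._max_buffer_size:
            yield self._buffer
            self._buffer = []

    def finish_bundle(self):
        # As in the example above: emit any leftovers as a WindowedValue.
        if self._buffer:
            logging.info("Final Buffer Length: {}".format(len(self._buffer)))
            yield WindowedValue(self._buffer, timestamp.MIN_TIMESTAMP,
                                [GlobalWindow()])
            self._buffer = []
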
Example #13
    def __init__(self, keyed_state_backend):
        self._keyed_state_backend = keyed_state_backend
        self._current_watermark = None
        self._timer_coder_impl = None
        self._output_stream = None

        from apache_beam.transforms.window import GlobalWindow
        self._global_window = GlobalWindow()
Example #14
 def test_timestamp(self):
   l = [TimestampedValue('a', 100),
        TimestampedValue('b', 200),
        TimestampedValue('c', 300)]
   expected = [TestWindowedValue('a', 100, [GlobalWindow()]),
               TestWindowedValue('b', 200, [GlobalWindow()]),
               TestWindowedValue('c', 300, [GlobalWindow()])]
   with TestPipeline() as p:
     # Map(lambda x: x) PTransform is added after Create here, because when
     # a PCollection of TimestampedValues is created with Create PTransform,
     # the timestamps are not assigned to it. Adding a Map forces the
     # PCollection to go through a DoFn so that the PCollection consists of
     # the elements with timestamps assigned to them instead of a PCollection
     # of TimestampedValue(element, timestamp).
     pc = p | beam.Create(l) | beam.Map(lambda x: x)
     reified_pc = pc | util.Reify.Timestamp()
     assert_that(reified_pc, equal_to(expected), reify_windows=True)
Example #15
 def __init__(self):
     self.window = GlobalWindow()
     self.batch = []
     self.engagement_range = 10
     self.from_ts = 0
     self.to_ts = 0
     self.giap_es_username = ''
     self.giap_es_password = ''
     self.giap_es_index = ''
Example #16
 def test_reified_value_passes(self):
     expected = [
         TestWindowedValue(v, MIN_TIMESTAMP, [GlobalWindow()])
         for v in [1, 2, 3]
     ]
     with TestPipeline() as p:
         assert_that(p | Create([2, 3, 1]),
                     equal_to(expected),
                     reify_windows=True)
Example #17
 def test_timer_coder(self):
     self.check_coder(
         coders._TimerCoder(coders.StrUtf8Coder(),
                            coders.GlobalWindowCoder()),
         *[
             userstate.Timer(user_key="key",
                             dynamic_timer_tag="tag",
                             windows=(GlobalWindow(), ),
                             clear_bit=True,
                             fire_timestamp=None,
                             hold_timestamp=None,
                             paneinfo=None),
             userstate.Timer(user_key="key",
                             dynamic_timer_tag="tag",
                             windows=(GlobalWindow(), ),
                             clear_bit=False,
                             fire_timestamp=timestamp.Timestamp.of(123),
                             hold_timestamp=timestamp.Timestamp.of(456),
                             paneinfo=windowed_value.PANE_INFO_UNKNOWN)
         ])
Example #18
class DiscardingGlobalTriggerDriver(TriggerDriver):
    """Groups all received values together.
  """
    GLOBAL_WINDOW_TUPLE = (GlobalWindow(), )

    def process_elements(self, state, windowed_values,
                         unused_output_watermark):
        yield WindowedValue(_UnwindowedValues(windowed_values), MIN_TIMESTAMP,
                            self.GLOBAL_WINDOW_TUPLE)

    def process_timer(self, window_id, name, time_domain, timestamp, state):
        raise TypeError(
            'Triggers never set or called for batch default windowing.')
Example #19
  def _invoke_per_window(
      self, windowed_value, additional_args,
      additional_kwargs, output_processor):
    if self.has_windowed_inputs:
      window, = windowed_value.windows
      side_inputs = [si[window] for si in self.side_inputs]
      side_inputs.extend(additional_args)
      args_for_process, kwargs_for_process = util.insert_values_in_args(
          self.args_for_process, self.kwargs_for_process,
          side_inputs)
    elif self.cache_globally_windowed_args:
      # Attempt to cache additional args if all inputs are globally
      # windowed inputs when processing the first element.
      self.cache_globally_windowed_args = False

      # Fill in sideInputs if they are globally windowed
      global_window = GlobalWindow()
      self.args_for_process, self.kwargs_for_process = (
          util.insert_values_in_args(
              self.args_for_process, self.kwargs_for_process,
              [si[global_window] for si in self.side_inputs]))
      args_for_process, kwargs_for_process = (
          self.args_for_process, self.kwargs_for_process)
    else:
      args_for_process, kwargs_for_process = (
          self.args_for_process, self.kwargs_for_process)
    # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
    for i, p in self.placeholders:
      if p == core.DoFn.ElementParam:
        args_for_process[i] = windowed_value.value
      elif p == core.DoFn.WindowParam:
        args_for_process[i] = window
      elif p == core.DoFn.TimestampParam:
        args_for_process[i] = windowed_value.timestamp

    if additional_kwargs:
      if kwargs_for_process is None:
        kwargs_for_process = additional_kwargs
      else:
        for key in additional_kwargs:
          kwargs_for_process[key] = additional_kwargs[key]

    if kwargs_for_process:
      output_processor.process_outputs(
          windowed_value,
          self.process_method(*args_for_process, **kwargs_for_process))
    else:
      output_processor.process_outputs(
          windowed_value, self.process_method(*args_for_process))
Example #20
    def test_find_orphaned_files(self):
        dir = self._new_tempdir()

        write_transform = beam.io.fileio.WriteToFiles(path=dir)

        def write_orphaned_file(temp_dir, writer_key):
            temp_dir_path = FileSystems.join(dir, temp_dir)

            file_prefix_dir = FileSystems.join(temp_dir_path,
                                               str(abs(hash(writer_key))))

            file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
            with FileSystems.create(file_name) as f:
                f.write(b'Hello y\'all')

            return file_name

        with TestPipeline() as p:
            _ = (p
                 | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
                 | "Serialize" >> beam.Map(json.dumps)
                 | write_transform)

            # Pre-create the temp directory.
            temp_dir_path = FileSystems.mkdirs(
                FileSystems.join(dir, write_transform._temp_directory.get()))
            write_orphaned_file(write_transform._temp_directory.get(),
                                (None, GlobalWindow()))
            f2 = write_orphaned_file(write_transform._temp_directory.get(),
                                     ('other-dest', GlobalWindow()))

        temp_dir_path = FileSystems.join(dir,
                                         write_transform._temp_directory.get())
        leftovers = FileSystems.match(['%s%s*' % (temp_dir_path, os.sep)])
        found_files = [m.path for m in leftovers[0].metadata_list]
        self.assertListEqual(found_files, [f2])
Example #21
class DefaultGlobalBatchTriggerDriver(TriggerDriver):
    """Breaks a bundles into window (pane)s according to the default triggering.
  """
    GLOBAL_WINDOW_TUPLE = (GlobalWindow(), )

    def __init__(self):
        pass

    def process_elements(self, state, windowed_values,
                         unused_output_watermark):
        yield WindowedValue(_UnwindowedValues(windowed_values), MIN_TIMESTAMP,
                            self.GLOBAL_WINDOW_TUPLE)

    def process_timer(self, window_id, name, time_domain, timestamp, state):
        raise TypeError(
            'Triggers never set or called for batch default windowing.')
Example #22
 def test_reshuffle_timestamps_unchanged(self):
   pipeline = TestPipeline()
   timestamp = 5
   data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
   expected_result = [TestWindowedValue(v, timestamp, [GlobalWindow()])
                      for v in data]
   before_reshuffle = (pipeline
                       | 'start' >> beam.Create(data)
                       | 'add_timestamp' >> beam.Map(
                           lambda v: beam.window.TimestampedValue(v,
                                                                  timestamp)))
   assert_that(before_reshuffle, equal_to(expected_result),
               label='before_reshuffle', reify_windows=True)
   after_reshuffle = before_reshuffle | beam.Reshuffle()
   assert_that(after_reshuffle, equal_to(expected_result),
               label='after_reshuffle', reify_windows=True)
   pipeline.run()
Example #23
def _format_shard(
    window, pane, shard_index, total_shards, compression, prefix, suffix):
  kwargs = {
      'prefix': prefix,
      'start': '',
      'end': '',
      'pane': '',
      'shard': 0,
      'total_shards': 0,
      'suffix': '',
      'compression': ''
  }

  if total_shards is not None and shard_index is not None:
    kwargs['shard'] = int(shard_index)
    kwargs['total_shards'] = int(total_shards)

  if window != GlobalWindow():
    kwargs['start'] = window.start.to_utc_datetime().isoformat()
    kwargs['end'] = window.end.to_utc_datetime().isoformat()

  # TODO(BEAM-3759): Add support for PaneInfo
  # If the PANE is the ONLY firing in the window, we don't add it.
  #if pane and not (pane.is_first and pane.is_last):
  #  kwargs['pane'] = pane.index

  if suffix:
    kwargs['suffix'] = suffix

  if compression:
    kwargs['compression'] = '.%s' % compression

  # Remove separators for unused template parts.
  format = _DEFAULT_FILE_NAME_TEMPLATE
  if shard_index is None:
    format = format.replace('-{shard:05d}', '')
  if total_shards is None:
    format = format.replace('-of-{total_shards:05d}', '')
  for name, value in kwargs.items():
    if value in (None, ''):
      format = format.replace('-{%s}' % name, '')

  return format.format(**kwargs)
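
For intuition, the separator-stripping loop at the end is what produces compact names in the GlobalWindow case: 'start' and 'end' stay empty, so their '-{...}' template parts are removed. The resulting names match the assertions in Example #4, which exercises this logic through fileio.default_file_naming:

# Expected outputs (taken from Example #4's assertions, not new behavior):
#   GlobalWindow, shard 1 of 5, no compression:
#     '/path/to/file-00001-of-00005.txt'
#   Same, with compression 'gz':
#     '/path/to/file-00001-of-00005.txt.gz'
#   IntervalWindow(0, 100), shard 1 of 5:
#     '/path/to/file-1970-01-01T00:00:00-1970-01-01T00:01:40-00001-of-00005.txt'
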
Example #24
 def partition(self, n):
     # type: (int) -> List[List[bytes]]
     """ It is used to partition _GroupingBuffer to N parts. Once it is
 partitioned, it would not be re-partitioned with diff N. Re-partition
 is not supported now.
 """
     if not self._grouped_output:
         if self._windowing.is_default():
             globally_window = GlobalWindows.windowed_value(
                 None,
                 timestamp=GlobalWindow().max_timestamp(),
                 pane_info=windowed_value.PaneInfo(
                     is_first=True,
                     is_last=True,
                     timing=windowed_value.PaneInfoTiming.ON_TIME,
                     index=0,
                     nonspeculative_index=0)).with_value
             windowed_key_values = lambda key, values: [
                 globally_window((key, values))
             ]
         else:
             # TODO(pabloem, BEAM-7514): Trigger driver needs access to the clock
             #   note that this only comes through if windowing is default - but what
             #   about having multiple firings on the global window.
             #   May need to revise.
             trigger_driver = trigger.create_trigger_driver(
                 self._windowing, True)
             windowed_key_values = trigger_driver.process_entire_key
         coder_impl = self._post_grouped_coder.get_impl()
         key_coder_impl = self._key_coder.get_impl()
         self._grouped_output = [[] for _ in range(n)]
         output_stream_list = [create_OutputStream() for _ in range(n)]
         for idx, (encoded_key,
                   windowed_values) in enumerate(self._table.items()):
             key = key_coder_impl.decode(encoded_key)
             for wkvs in windowed_key_values(key, windowed_values):
                 coder_impl.encode_to_stream(wkvs,
                                             output_stream_list[idx % n],
                                             True)
         for ix, output_stream in enumerate(output_stream_list):
             self._grouped_output[ix] = [output_stream.get()]
         self._table.clear()
     return self._grouped_output
Example #25
  def test_windowed_value_coder(self):
    coder = coders.WindowedValueCoder(
        coders.VarIntCoder(), coders.GlobalWindowCoder())
    # Verify cloud object representation
    self.assertEqual({
        '@type': 'kind:windowed_value',
        'is_wrapper': True,
        'component_encodings': [
            coders.VarIntCoder().as_cloud_object(),
            coders.GlobalWindowCoder().as_cloud_object(),
        ],
    },
                     coder.as_cloud_object())
    # Test binary representation
    self.assertEqual(
        b'\x7f\xdf;dZ\x1c\xac\t\x00\x00\x00\x01\x0f\x01',
        coder.encode(window.GlobalWindows.windowed_value(1)))

    # Test decoding large timestamp
    self.assertEqual(
        coder.decode(b'\x7f\xdf;dZ\x1c\xac\x08\x00\x00\x00\x01\x0f\x00'),
        windowed_value.create(0, MIN_TIMESTAMP.micros, (GlobalWindow(), )))

    # Test unnested
    self.check_coder(
        coders.WindowedValueCoder(coders.VarIntCoder()),
        windowed_value.WindowedValue(3, -100, ()),
        windowed_value.WindowedValue(-1, 100, (1, 2, 3)))

    # Test Global Window
    self.check_coder(
        coders.WindowedValueCoder(
            coders.VarIntCoder(), coders.GlobalWindowCoder()),
        window.GlobalWindows.windowed_value(1))

    # Test nested
    self.check_coder(
        coders.TupleCoder((
            coders.WindowedValueCoder(coders.FloatCoder()),
            coders.WindowedValueCoder(coders.StrUtf8Coder()))),
        (
            windowed_value.WindowedValue(1.5, 0, ()),
            windowed_value.WindowedValue("abc", 10, ('window', ))))
Example #26
 def test_window(self):
   l = [GlobalWindows.windowed_value('a', 100),
        GlobalWindows.windowed_value('b', 200),
        GlobalWindows.windowed_value('c', 300)]
   expected = [TestWindowedValue(('a', 100, GlobalWindow()), 100,
                                 [GlobalWindow()]),
               TestWindowedValue(('b', 200, GlobalWindow()), 200,
                                 [GlobalWindow()]),
               TestWindowedValue(('c', 300, GlobalWindow()), 300,
                                 [GlobalWindow()])]
   with TestPipeline() as p:
     pc = p | beam.Create(l)
     reified_pc = pc | util.Reify.Window()
     assert_that(reified_pc, equal_to(expected), reify_windows=True)
Example #27
    def test_basic_execution(self):
        """A basic pipeline to be used as a smoke test."""

        # Create the pipeline that will emit 0, 1, 2.
        p = beam.Pipeline(InteractiveRunner())
        numbers = p | 'numbers' >> beam.Create([0, 1, 2])
        letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects. By calling `record`, a new
        # PipelineFragment is started to compute the given PCollections and
        # cache them to disk.
        rm = RecordingManager(p)
        numbers_recording = rm.record([numbers],
                                      max_n=3,
                                      max_duration_secs=500)
        numbers_stream = numbers_recording.stream(numbers)
        numbers_recording.wait_until_finish()

        # Once the pipeline fragment completes, we can read from the stream and know
        # that all elements were written to cache.
        elems = list(numbers_stream.read())
        expected_elems = [
            WindowedValue(i, MIN_TIMESTAMP, [GlobalWindow()]) for i in range(3)
        ]
        self.assertListEqual(elems, expected_elems)

        # Make an extra recording and test the description.
        letters_recording = rm.record([letters],
                                      max_n=3,
                                      max_duration_secs=500)
        letters_recording.wait_until_finish()

        self.assertEqual(
            rm.describe()['size'],
            numbers_recording.describe()['size'] +
            letters_recording.describe()['size'])

        rm.cancel()
Example #28
 def test_window_in_value(self):
   l = [GlobalWindows.windowed_value(('a', 1), 100),
        GlobalWindows.windowed_value(('b', 2), 200),
        GlobalWindows.windowed_value(('c', 3), 300)]
   expected = [TestWindowedValue(('a', (1, 100, GlobalWindow())), 100,
                                 [GlobalWindow()]),
               TestWindowedValue(('b', (2, 200, GlobalWindow())), 200,
                                 [GlobalWindow()]),
               TestWindowedValue(('c', (3, 300, GlobalWindow())), 300,
                                 [GlobalWindow()])]
   with TestPipeline() as p:
     # Map(lambda x: x) hack is used for the same reason here.
     # Also, this makes the typehint on Reify.WindowInValue work.
     pc = p | beam.Create(l) | beam.Map(lambda x: x)
     reified_pc = pc | util.Reify.WindowInValue()
     assert_that(reified_pc, equal_to(expected), reify_windows=True)
Example #29
    def process(self,
                element,
                batch=DoFn.StateParam(BATCH),
                batchSize=DoFn.StateParam(BATCH_SIZE),
                flushTimer=DoFn.TimerParam(FLUSH_TIMER),
                endOfTime=DoFn.TimerParam(EOW_TIMER)):

        from apache_beam.utils.timestamp import Timestamp, Duration
        from apache_beam.transforms.window import GlobalWindow

        currentSize = batchSize.read()
        if not currentSize:
            currentSize = 1
            flushTimer.set(Timestamp.now() +
                           Duration(micros=self.maxWaitTime * 1000))
            endOfTime.set(GlobalWindow().max_timestamp())
        else:
            currentSize += 1
        batchSize.write(currentSize)
        batch.add(element[1])
        if currentSize >= self.batchSize:
            return self.flush(batch, batchSize)
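
process defers the actual emission to self.flush once the batch reaches self.batchSize. A hypothetical flush, matching the state handles used above (the signature is taken from the call site; the body is an assumption for illustration):

    def flush(self, batch, batchSize):
        # Read and clear the buffered elements and the size counter, then
        # return the batch so Beam emits it downstream.
        elements = list(batch.read())
        batch.clear()
        batchSize.clear()
        return [elements]
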
Example #30
    def _invoke_process_per_window(
        self,
        windowed_value,  # type: WindowedValue
        additional_args,
        additional_kwargs,
    ):
        # type: (...) -> Optional[SplitResultResidual]
        if self.has_windowed_inputs:
            window, = windowed_value.windows
            side_inputs = [si[window] for si in self.side_inputs]
            side_inputs.extend(additional_args)
            args_for_process, kwargs_for_process = util.insert_values_in_args(
                self.args_for_process, self.kwargs_for_process, side_inputs)
        elif self.cache_globally_windowed_args:
            # Attempt to cache additional args if all inputs are globally
            # windowed inputs when processing the first element.
            self.cache_globally_windowed_args = False

            # Fill in sideInputs if they are globally windowed
            global_window = GlobalWindow()
            self.args_for_process, self.kwargs_for_process = (
                util.insert_values_in_args(
                    self.args_for_process, self.kwargs_for_process,
                    [si[global_window] for si in self.side_inputs]))
            args_for_process, kwargs_for_process = (self.args_for_process,
                                                    self.kwargs_for_process)
        else:
            args_for_process, kwargs_for_process = (self.args_for_process,
                                                    self.kwargs_for_process)

        # Extract key in the case of a stateful DoFn. Note that in the case of a
        # stateful DoFn, we set during __init__ self.has_windowed_inputs to be
        # True. Therefore, windows will be exploded coming into this method, and
        # we can rely on the window variable being set above.
        if self.user_state_context or self.is_key_param_required:
            try:
                key, unused_value = windowed_value.value
            except (TypeError, ValueError):
                raise ValueError((
                    'Input value to a stateful DoFn or KeyParam must be a KV tuple; '
                    'instead, got \'%s\'.') % (windowed_value.value, ))

        for i, p in self.placeholders:
            if core.DoFn.ElementParam == p:
                args_for_process[i] = windowed_value.value
            elif core.DoFn.KeyParam == p:
                args_for_process[i] = key
            elif core.DoFn.WindowParam == p:
                args_for_process[i] = window
            elif core.DoFn.TimestampParam == p:
                args_for_process[i] = windowed_value.timestamp
            elif core.DoFn.PaneInfoParam == p:
                args_for_process[i] = windowed_value.pane_info
            elif isinstance(p, core.DoFn.StateParam):
                assert self.user_state_context is not None
                args_for_process[i] = (self.user_state_context.get_state(
                    p.state_spec, key, window))
            elif isinstance(p, core.DoFn.TimerParam):
                assert self.user_state_context is not None
                args_for_process[i] = (self.user_state_context.get_timer(
                    p.timer_spec, key, window))
            elif core.DoFn.BundleFinalizerParam == p:
                args_for_process[i] = self.bundle_finalizer_param

        if additional_kwargs:
            if kwargs_for_process is None:
                kwargs_for_process = additional_kwargs
            else:
                for key in additional_kwargs:
                    kwargs_for_process[key] = additional_kwargs[key]

        if kwargs_for_process:
            self.output_processor.process_outputs(
                windowed_value,
                self.process_method(*args_for_process, **kwargs_for_process))
        else:
            self.output_processor.process_outputs(
                windowed_value, self.process_method(*args_for_process))

        if self.is_splittable:
            assert self.threadsafe_restriction_tracker is not None
            # TODO: Consider calling check_done right after SDF.Process() finishing.
            # In order to do this, we need to know that current invoking dofn is
            # ProcessSizedElementAndRestriction.
            self.threadsafe_restriction_tracker.check_done()
            deferred_status = self.threadsafe_restriction_tracker.deferred_status()
            current_watermark = None
            if self.watermark_estimator:
                current_watermark = self.watermark_estimator.current_watermark()
            if deferred_status:
                deferred_restriction, deferred_timestamp = deferred_status
                element = windowed_value.value
                size = self.signature.get_restriction_provider().restriction_size(
                    element, deferred_restriction)
                residual_value = ((element, deferred_restriction), size)
                return SplitResultResidual(
                    residual_value=windowed_value.with_value(residual_value),
                    current_watermark=current_watermark,
                    deferred_timestamp=deferred_timestamp)
        return None