Example 1
  def read(self, tail=True):
    # type: (bool) -> Any

    """Reads the elements currently recorded."""

    # Get the cache manager and wait until the file exists.
    cache_manager = ie.current_env().get_cache_manager(self._pipeline)

    # Retrieve the coder for the particular PCollection which will be used to
    # decode elements read from cache.
    coder = cache_manager.load_pcoder('full', self._cache_key)

    # Read the elements from the cache.
    # Import limiters here to prevent a circular import.
    from apache_beam.runners.interactive.options.capture_limiters import CountLimiter
    from apache_beam.runners.interactive.options.capture_limiters import ProcessingTimeLimiter
    reader, _ = cache_manager.read('full', self._cache_key, tail=tail)

    # Because a single TestStreamFileRecord can yield multiple elements, we
    # limit the count again here in the to_element_list call.
    #
    # There are two ways of exiting this loop: either a limiter was triggered,
    # or all elements from the cache were read. In the latter case the
    # pipeline may still be running, so another invocation of `read` can
    # yield new elements.
    count_limiter = CountLimiter(self._n)
    time_limiter = ProcessingTimeLimiter(self._duration_secs)
    limiters = (count_limiter, time_limiter)
    for e in utils.to_element_list(reader,
                                   coder,
                                   include_window_info=True,
                                   n=self._n,
                                   include_time_events=True):

      # to_element_list yields either TestStreamPayload.Events (when
      # include_time_events is set) or decoded elements from the reader. Only
      # the decoded elements are counted so that we can break early.
      if isinstance(e, TestStreamPayload.Event):
        time_limiter.update(e)
      else:
        count_limiter.update(e)
        yield e

      if any(l.is_triggered() for l in limiters):
        break

    # A limiter being triggered means that we have fulfilled the user's request.
    # This implies that reading from the cache again won't yield any new
    # elements. The same holds when the user pipeline has been terminated.
    if any(l.is_triggered()
           for l in limiters) or ie.current_env().is_terminated(self._pipeline):
      self._done = True
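Both limiter classes live in apache_beam.runners.interactive.options.capture_limiters and share the small interface that read relies on. For orientation, here is a minimal sketch of that interface, inferred only from the two methods exercised above; the exact base-class shape is an assumption, not the verbatim Beam source.

class Limiter(object):
  """Sketch of the interface read() assumes (assumed shape, for illustration)."""
  def update(self, e):
    # Folds one decoded element or TestStreamPayload.Event into internal
    # state. Called once per item pulled from the cache.
    raise NotImplementedError

  def is_triggered(self):
    # Returns True once the capture request has been fulfilled; read() stops
    # iterating as soon as any limiter reports True.
    raise NotImplementedError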
Example 2
  def test_count_limiter(self):
    limiter = CountLimiter(5)

    for e in range(4):
      limiter.update(e)

    self.assertFalse(limiter.is_triggered())
    limiter.update(4)
    self.assertTrue(limiter.is_triggered())
Example 3
  def test_count_limiter_with_dataframes(self):
    limiter = CountLimiter(5)

    # Test that empty dataframes don't count.
    for _ in range(10):
      df = WindowedValue(pd.DataFrame(), 0, [])
      limiter.update(df)

    self.assertFalse(limiter.is_triggered())
    df = WindowedValue(pd.DataFrame({'col': list(range(10))}), 0, [])
    limiter.update(df)
    self.assertTrue(limiter.is_triggered())
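Taken together, the two tests pin down CountLimiter's contract: a plain element counts as one, while a WindowedValue wrapping a DataFrame contributes its row count, so empty frames never advance the limiter. A minimal sketch consistent with both tests (SketchCountLimiter is a hypothetical name, not Beam's implementation):

import pandas as pd


class SketchCountLimiter(object):
  def __init__(self, max_count):
    self._max_count = max_count
    self._count = 0

  def update(self, e):
    # Unwrap WindowedValues; a DataFrame payload counts once per row, so an
    # empty frame leaves the count unchanged. Anything else counts as one.
    value = getattr(e, 'value', e)
    if isinstance(value, pd.DataFrame):
      self._count += len(value)
    else:
      self._count += 1

  def is_triggered(self):
    return self._count >= self._max_count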
Example 4
    def test_read_with_count_limiter(self):
        """Test the condition where the cache is read once after written once."""
        prefix = 'full'
        cache_label = 'some-cache-label'
        cache_version_one = ['cache', 'version', 'one']

        self.mock_write_cache(cache_version_one, prefix, cache_label)
        reader, version = self.cache_manager.read(prefix,
                                                  cache_label,
                                                  limiters=[CountLimiter(2)])
        pcoll_list = list(reader)
        self.assertListEqual(pcoll_list, ['cache', 'version'])
        self.assertEqual(version, 0)
        self.assertTrue(
            self.cache_manager.is_latest_version(version, prefix, cache_label))
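The limiters argument caps how much of the cache the returned reader yields; here CountLimiter(2) cuts a three-element cache down to two. A plausible sketch of the wrapping this implies (_limit_reader is a hypothetical helper, not the cache manager's actual code):

def _limit_reader(reader, limiters):
  # Feed every yielded element to each limiter and stop the stream as soon
  # as any limiter reports that it has been satisfied.
  for e in reader:
    for limiter in limiters:
      limiter.update(e)
    yield e
    if any(l.is_triggered() for l in limiters):
      return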
Example 5
    def test_single_reader_with_count_limiter(self):
        """Tests that we expect to see all the correctly emitted TestStreamPayloads.
    """
        CACHED_PCOLLECTION_KEY = repr(CacheKey('arbitrary_key', '', '', ''))

        values = (FileRecordsBuilder(tag=CACHED_PCOLLECTION_KEY)
                  .add_element(element=0, event_time_secs=0)
                  .advance_processing_time(1)
                  .add_element(element=1, event_time_secs=1)
                  .advance_processing_time(1)
                  .add_element(element=2, event_time_secs=2)
                  .build()) # yapf: disable

        cache = StreamingCache(cache_dir=None)
        cache.write(values, CACHED_PCOLLECTION_KEY)

        reader, _ = cache.read(CACHED_PCOLLECTION_KEY,
                               limiters=[CountLimiter(2)])
        coder = coders.FastPrimitivesCoder()
        events = list(reader)

        # Units here are in microseconds.
        # These are a slice of the original values such that we only get two
        # elements.
        expected = [
            TestStreamPayload.Event(
                element_event=TestStreamPayload.Event.AddElements(
                    elements=[
                        TestStreamPayload.TimestampedElement(
                            encoded_element=coder.encode(0), timestamp=0)
                    ],
                    tag=CACHED_PCOLLECTION_KEY)),
            TestStreamPayload.Event(
                processing_time_event=(
                    TestStreamPayload.Event.AdvanceProcessingTime(
                        advance_duration=1 * 10**6))),
            TestStreamPayload.Event(
                element_event=TestStreamPayload.Event.AddElements(
                    elements=[
                        TestStreamPayload.TimestampedElement(
                            encoded_element=coder.encode(1),
                            timestamp=1 * 10**6)
                    ],
                    tag=CACHED_PCOLLECTION_KEY)),
            TestStreamPayload.Event(
                processing_time_event=(
                    TestStreamPayload.Event.AdvanceProcessingTime(
                        advance_duration=1 * 10**6))),
        ]
        self.assertSequenceEqual(events, expected)
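The reader here yields TestStreamPayload.Event protos, with processing time expressed in microseconds via AdvanceProcessingTime. These are the events ProcessingTimeLimiter consumes in Examples 1 and 6; a sketch of time-based limiting consistent with these protos (a hypothetical class, not Beam's implementation):

class SketchProcessingTimeLimiter(object):
  def __init__(self, duration_secs):
    self._duration_us = duration_secs * 10**6
    self._elapsed_us = 0

  def update(self, e):
    # Only AdvanceProcessingTime events move the clock; element events are
    # ignored. advance_duration is in microseconds, as in the test above.
    if e.HasField('processing_time_event'):
      self._elapsed_us += e.processing_time_event.advance_duration

  def is_triggered(self):
    return self._elapsed_us >= self._duration_us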
Example 6
    def read(self, tail=True):
        # type: (bool) -> Any
        """Reads the elements currently recorded."""

        # Get the cache manager and wait until the file exists.
        cache_manager = ie.current_env().get_cache_manager(self._pipeline)

        # Retrieve the coder for the particular PCollection which will be used to
        # decode elements read from cache.
        coder = cache_manager.load_pcoder('full', self._cache_key)

        # Import limiters here to prevent a circular import.
        from apache_beam.runners.interactive.options.capture_limiters import CountLimiter
        from apache_beam.runners.interactive.options.capture_limiters import ProcessingTimeLimiter

        # Read the elements from the cache.
        limiters = [
            CountLimiter(self._n),
            ProcessingTimeLimiter(self._duration_secs)
        ]
        reader, _ = cache_manager.read('full', self._cache_key, tail=tail)

        # Because a single TestStreamFileRecord can yield multiple elements, we
        # limit the count again here in the to_element_list call.
        #
        # There are two ways of exiting this loop: either a limiter was
        # triggered, or all elements from the cache were read. In the latter
        # case the pipeline may still be running, so another invocation of
        # `read` can yield new elements.
        for e in utils.to_element_list(reader,
                                       coder,
                                       include_window_info=True,
                                       n=self._n):
            for l in limiters:
                l.update(e)

            yield e

            if any(l.is_triggered() for l in limiters):
                break

        # A limiter being triggered means that we have fulfilled the user's request.
        # This implies that reading from the cache again won't yield any new
        # elements. The same holds when the user pipeline has been terminated.
        if any(l.is_triggered()
               for l in limiters) or ie.current_env().is_terminated(
                   self._pipeline):
            self._done = True
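Either variant of read is consumed the same way: it is a generator, so a caller can drain whatever has been cached so far and return later while the pipeline keeps producing. A hypothetical driver (the stream name stands in for whichever object defines read above):

# Drain the elements recorded so far; the loop ends once a limiter fires or
# the cache is exhausted.
elements = list(stream.read(tail=False))

# If no limiter fired and the pipeline is still running, a later call can
# pick up elements recorded in the meantime.
more_elements = list(stream.read(tail=False))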