Ejemplo n.º 1
0
        def read(self):
            """Replays the events of all PCollection readers in timestamp order.

            This is a generator. Every pass of the outer loop pulls new events
            through ``_test_stream_events_before_target``, merges them with the
            events left over from the previous pass, and then emits events
            until ``_event_stream_caught_up_to_target`` reports that the
            pending events have caught up to the current target timestamp.
            Iteration ends as soon as a pass has nothing left to send.

            Yields:
                For each emitted record: first the result of
                ``_advance_processing_time`` (only when the record's
                processing time is ahead of ``self._monotonic_clock``), then
                the result of ``_add_element`` or ``_advance_watermark``,
                depending on which field the record carries.
            """

            # Timestamp the readers have to catch up to. Nothing is pending
            # before the first pass, so start from MAX_TIMESTAMP -- the value
            # _min_timestamp_of() produces for an empty event list -- instead
            # of 0, which would restrict the initial read to readers whose
            # recorded stream time is negative.
            target_timestamp = timestamp.MAX_TIMESTAMP

            # The events from last iteration that are past the target timestamp.
            unsent_events = []

            # Emit events until all events have been read.
            while True:
                # Read the next set of events. The read events will most likely be
                # out of order if there are multiple readers. Here we sort them into
                # a more manageable state.
                new_events = self._test_stream_events_before_target(
                    target_timestamp)
                events_to_send = self._merge_sort(unsent_events, new_events)
                if not events_to_send:
                    break

                # Get the next largest timestamp in the stream. This is used as the
                # timestamp for readers to "catch-up" to. This will only read from
                # readers with a timestamp less than this.
                target_timestamp = self._min_timestamp_of(events_to_send)

                # Loop through the elements with the correct timestamp.
                while not self._event_stream_caught_up_to_target(
                        events_to_send, target_timestamp):
                    # The merged list is sorted in descending order, so popping
                    # from the end takes the earliest pending event.
                    tag, record = events_to_send.pop()

                    # First advance the clock to match the time of the stream. This has
                    # a side-effect of also advancing this cache's clock.
                    curr_timestamp = Timestamp.from_proto(record.processing_time)
                    if curr_timestamp > self._monotonic_clock:
                        yield self._advance_processing_time(curr_timestamp)

                    # Then, send either a new element or watermark. Records
                    # carrying neither field are dropped without a yield.
                    if record.HasField('element'):
                        yield self._add_element(record.element, tag)
                    elif record.HasField('watermark'):
                        yield self._advance_watermark(record.watermark, tag)

                # Whatever was not emitted is carried over to the next pass, and
                # its earliest timestamp becomes the next catch-up target.
                unsent_events = events_to_send
                target_timestamp = self._min_timestamp_of(unsent_events)
Ejemplo n.º 2
0
    def _test_stream_events_before_target(self, target_timestamp):
      """Pulls at most one new record from every reader behind the target.

      A reader is polled only while the timestamp last recorded for it in
      ``self._stream_times`` is still below ``target_timestamp``. Reading a
      single record per reader per call keeps the amount of buffered data
      small, since the full set of events may not fit into memory. Readers
      that are exhausted are silently ignored.

      Args:
        target_timestamp: the timestamp the readers should catch up to.

      Returns:
        A list of ``(tag, record)`` tuples, one for each reader that produced
        a record during this call.
      """
      fetched = []
      for tag, reader in self._readers.items():
        # A reader whose latest record already reaches the target has nothing
        # to catch up on; leaving it alone avoids buffering its whole stream.
        already_caught_up = self._stream_times[tag] >= target_timestamp
        if already_caught_up:
          continue
        try:
          event = next(reader)
          fetched.append((tag, event))
          # Remember how far this reader has progressed.
          self._stream_times[tag] = Timestamp.from_proto(event.processing_time)
        except StopIteration:
          # This reader has no more records to offer.
          pass
      return fetched
Ejemplo n.º 3
0
 def test_from_proto_fails_with_truncation(self):
     """Checks that from_proto raises ValueError for nanos=56789.

     The nanos value is not a whole number of microseconds, so converting it
     would require truncation.
     """
     # TODO(BEAM-8738): Better define timestamps.
     sub_microsecond_proto = timestamp_pb2.Timestamp(
         seconds=1234, nanos=56789)
     with self.assertRaises(ValueError):
         Timestamp.from_proto(sub_microsecond_proto)
Ejemplo n.º 4
0
 def test_from_proto(self):
     """Checks that a proto with whole-microsecond nanos converts exactly.

     1234 s + 56000 ns is expected to become 1234 s + 56 us.
     """
     converted = Timestamp.from_proto(
         timestamp_pb2.Timestamp(seconds=1234, nanos=56000))
     self.assertEqual(converted, Timestamp(seconds=1234, micros=56))
Ejemplo n.º 5
0
 def _min_timestamp_of(self, events):
     """Returns the processing time of the last event in ``events``.

     This is the minimum timestamp only when ``events`` is sorted in
     descending processing-time order, which is how ``_merge_sort`` returns
     its result.

     Args:
       events: a sequence of ``(tag, record)`` pairs.

     Returns:
       The ``Timestamp`` of the last pair's record, or
       ``timestamp.MAX_TIMESTAMP`` when ``events`` is empty.
     """
     if not events:
         return timestamp.MAX_TIMESTAMP
     last_record = events[-1][1]
     return Timestamp.from_proto(last_record.processing_time)
Ejemplo n.º 6
0
 def _merge_sort(self, previous_events, new_events):
     """Combines both event lists, ordered latest-first by processing time.

     Args:
       previous_events: list of ``(tag, record)`` pairs carried over.
       new_events: list of ``(tag, record)`` pairs that were just read.

     Returns:
       A new list holding every pair from both inputs, sorted in descending
       order of the record's processing time, so the earliest event sits at
       the end of the list.
     """
     def processing_time_of(event):
         return Timestamp.from_proto(event[1].processing_time)

     merged = previous_events + new_events
     merged.sort(key=processing_time_of, reverse=True)
     return merged
Ejemplo n.º 7
0
 def test_from_proto_fails_with_truncation(self):
     """Checks that from_proto raises ValueError for nanos=56789.

     The nanos value is not a whole number of microseconds, so converting it
     would require truncation.
     """
     # TODO(https://github.com/apache/beam/issues/19922): Better define
     # timestamps.
     sub_microsecond_proto = timestamp_pb2.Timestamp(
         seconds=1234, nanos=56789)
     self.assertRaises(
         ValueError, Timestamp.from_proto, sub_microsecond_proto)