def read(self): """Reads records from PCollection readers. """ # The largest timestamp read from the different streams. target_timestamp = timestamp.Timestamp.of(0) # The events from last iteration that are past the target timestamp. unsent_events = [] # Emit events until all events have been read. while True: # Read the next set of events. The read events will most likely be # out of order if there are multiple readers. Here we sort them into # a more manageable state. new_events = self._test_stream_events_before_target( target_timestamp) events_to_send = self._merge_sort(unsent_events, new_events) if not events_to_send: break # Get the next largest timestamp in the stream. This is used as the # timestamp for readers to "catch-up" to. This will only read from # readers with a timestamp less than this. target_timestamp = self._min_timestamp_of(events_to_send) # Loop through the elements with the correct timestamp. while not self._event_stream_caught_up_to_target( events_to_send, target_timestamp): tag, r = events_to_send.pop() # First advance the clock to match the time of the stream. This has # a side-effect of also advancing this cache's clock. curr_timestamp = Timestamp.from_proto(r.processing_time) if curr_timestamp > self._monotonic_clock: yield self._advance_processing_time(curr_timestamp) # Then, send either a new element or watermark. if r.HasField('element'): yield self._add_element(r.element, tag) elif r.HasField('watermark'): yield self._advance_watermark(r.watermark, tag) unsent_events = events_to_send target_timestamp = self._min_timestamp_of(unsent_events)
def _test_stream_events_before_target(self, target_timestamp):
  """Reads the next iteration of elements from each stream.

  An element is pulled from a stream only if that stream's most recently
  read timestamp is still behind target_timestamp. Since the full set of
  events may not fit into memory, this StreamingCache reads at most one
  element from each stream per call.
  """
  results = []
  for tag, reader in self._readers.items():
    # The target_timestamp is the maximum timestamp that was read from the
    # stream. Streams that have already produced an event at or past this
    # timestamp are skipped so we don't read everything into memory.
    if self._stream_times[tag] < target_timestamp:
      try:
        event = next(reader)
      except StopIteration:
        # This stream is exhausted; nothing more to read from it.
        continue
      results.append((tag, event))
      self._stream_times[tag] = Timestamp.from_proto(event.processing_time)
  return results
def test_from_proto_fails_with_truncation(self):
  # Sub-microsecond precision (nanos not a multiple of 1000) cannot be
  # represented and must raise rather than silently truncate.
  # TODO(https://github.com/apache/beam/issues/19922): Better define
  # timestamps.
  with self.assertRaises(ValueError):
    Timestamp.from_proto(timestamp_pb2.Timestamp(seconds=1234, nanos=56789))
def test_from_proto(self):
  """from_proto converts whole-microsecond nanos exactly."""
  proto_ts = timestamp_pb2.Timestamp(seconds=1234, nanos=56000)
  # 56000 nanos is exactly 56 micros, so conversion is lossless.
  self.assertEqual(
      Timestamp.from_proto(proto_ts), Timestamp(seconds=1234, micros=56))
def _min_timestamp_of(self, events):
  """Returns the smallest processing time among the given events.

  Events are kept sorted newest-first, so the minimum is the last entry.
  Returns MAX_TIMESTAMP when there are no events.
  """
  if not events:
    return timestamp.MAX_TIMESTAMP
  _, record = events[-1]
  return Timestamp.from_proto(record.processing_time)
def _merge_sort(self, previous_events, new_events):
  """Combines two event lists, ordered newest-first by processing time."""
  combined = previous_events + new_events
  # Reverse order so callers can pop() the earliest event in O(1).
  combined.sort(
      key=lambda pair: Timestamp.from_proto(pair[1].processing_time),
      reverse=True)
  return combined
def test_from_proto_fails_with_truncation(self):
  """from_proto rejects nanos that are not a whole number of micros."""
  # TODO(https://github.com/apache/beam/issues/19922): Better define
  # timestamps.
  sub_micro_proto = timestamp_pb2.Timestamp(seconds=1234, nanos=56789)
  with self.assertRaises(ValueError):
    Timestamp.from_proto(sub_micro_proto)