def __init__(self, coder=coders.FastPrimitivesCoder(), events=()):
  """Initializes the TestStream.

  Args:
    coder: (apache_beam.Coder) the element coder for any ElementEvents.
      NOTE: the default is a shared instance evaluated at def time; the
      coder is presumably stateless — confirm before mutating it.
    events: (Iterable[Event]) instructions for the TestStream to execute.
  """
  super(TestStream, self).__init__()
  assert coder is not None
  self.coder = coder
  # Per-tag watermarks; the None key tracks the main output's watermark,
  # which starts at the lowest representable timestamp.
  self.watermarks = {None: timestamp.MIN_TIMESTAMP}
  # Output tags start empty and are populated as outputs are declared.
  self.output_tags = set()
  self._events = list(events)
def __init__(self, coder=coders.FastPrimitivesCoder(), events=None, output_tags=None, endpoint=None):
  """TestStream constructor.

  Args:
    coder: (apache_beam.Coder) the element coder for any ElementEvents.
    events: (List[Event]) a list of instructions for the TestStream to
      execute. Tags referenced by these events must be a subset of
      output_tags.
    output_tags: (List[str]) a list of PCollection output tags.
    endpoint: (str) a URL locating a TestStreamService. Mutually exclusive
      with events.
  """
  super(TestStream, self).__init__()
  assert coder is not None
  self.coder = coder
  # The None key holds the main output's watermark.
  self.watermarks = {None: timestamp.MIN_TIMESTAMP}
  self.output_tags = set(output_tags or [])
  self._events = list(events) if events is not None else []
  self._endpoint = endpoint

  # Every tag referenced by an event must have been declared up front.
  event_tags = {
      e.tag
      for e in self._events if isinstance(e, (WatermarkEvent, ElementEvent))
  }
  assert event_tags.issubset(self.output_tags), \
      '{} is not a subset of {}'.format(event_tags, output_tags)
  # A TestStream is driven either by in-memory events or by a remote
  # service endpoint — never both.
  assert not (self._events and self._endpoint), \
      'Only either events or an endpoint can be given at once.'
def __init__(self, coder=coders.FastPrimitivesCoder(), events=None, output_tags=None, endpoint=None):
  """Initializes the TestStream.

  Args:
    coder: (apache_beam.Coder) the coder to encode/decode elements.
    events: (List[Event]) a list of instructions for the TestStream to
      execute. If specified, the event tags must exist in output_tags.
    output_tags: (List[str]) Initial set of outputs. If no event references
      an output tag, no output will be produced for that tag.
    endpoint: (str) a URL locating a TestStreamService.
  """
  super(TestStream, self).__init__()
  assert coder is not None
  self.coder = coder
  self.watermarks = {None: timestamp.MIN_TIMESTAMP}
  if output_tags:
    self.output_tags = set(output_tags)
  else:
    self.output_tags = set()
  if events is None:
    self._events = []
  else:
    self._events = list(events)
  self._endpoint = endpoint

  # Validate that events only reference declared output tags.
  tagged_event_types = (WatermarkEvent, ElementEvent)
  event_tags = set(
      e.tag for e in self._events if isinstance(e, tagged_event_types))
  assert event_tags.issubset(self.output_tags), \
      '{} is not a subset of {}'.format(event_tags, output_tags)
  # Events and an endpoint are mutually exclusive input sources.
  assert not (self._events and self._endpoint), \
      'Only either events or an endpoint can be given at once.'
def __init__(self, output_tags, coder=coders.FastPrimitivesCoder(), events=None):
  """Initializes the object with the given outputs, coder, and events.

  Args:
    output_tags: the PCollection output tags the events refer to.
    coder: (apache_beam.Coder) the element coder for any ElementEvents.
    events: the raw events to execute; may be None.
  """
  assert coder is not None
  self.coder = coder
  # Normalize the raw events via _add_watermark_advancements — presumably
  # this interleaves watermark advancements per output tag; the helper is
  # defined elsewhere in this class (TODO confirm against its definition).
  self._events = self._add_watermark_advancements(output_tags, events)
def test_test_stream_payload_events(self):
  """Tests that the to_element_list can limit the count in a single bundle."""
  coder = coders.FastPrimitivesCoder()

  def bundle_reader():
    # Pack ten windowed elements into one TestStreamPayload bundle.
    timestamped = []
    for e in range(10):
      encoded = coder.encode(WindowedValueHolder(WindowedValue(e, 0, [])))
      timestamped.append(
          TestStreamPayload.TimestampedElement(
              encoded_element=encoded, timestamp=Timestamp.of(0).micros))
    yield TestStreamPayload.Event(
        element_event=TestStreamPayload.Event.AddElements(
            elements=timestamped))

  # The reader yields 10 elements in a single TestStreamPayload, but n=5
  # caps how many are surfaced — this exercises to_element_list's ability to
  # limit the number of elements within a single bundle.
  elements = utils.to_element_list(
      bundle_reader(), coder, include_window_info=False, n=5)
  self.assertSequenceEqual(list(elements), list(range(5)))
def test_default_fallback_path(self):
  """Test fallback path picks a matching coder if no coder is registered."""
  coder = coders.registry.get_coder(DummyClass)
  # With no coder registered for DummyClass, the registry falls back to its
  # last-resort coder, which is a FastPrimitivesCoder.
  self.assertEqual(coder, coders.FastPrimitivesCoder())
  # Round-trip an instance to check the fallback can actually handle it.
  roundtripped = coder.decode(coder.encode(DummyClass()))
  self.assertEqual(DummyClass(), roundtripped)
def test_read_duration(self):
  """Test that the stream only reads a 'duration' of elements."""

  def as_windowed_value(element):
    # Wrap a raw value the way the cache stores elements.
    return WindowedValueHolder(WindowedValue(element, 0, []))

  # Five elements, each preceded by a one-second processing-time advance.
  builder = FileRecordsBuilder(tag=self.cache_key)
  for element, event_time in ((0, 0), (1, 1), (2, 3), (3, 4), (4, 5)):
    builder = (builder
               .advance_processing_time(1)
               .add_element(element=as_windowed_value(element),
                            event_time_secs=event_time))
  values = builder.build()
  values = [
      v.recorded_event for v in values
      if isinstance(v, beam_interactive_api_pb2.TestStreamFileRecord)
  ]

  self.mock_result.set_state(PipelineState.DONE)
  self.cache.write(values, 'full', self.cache_key)
  self.cache.save_pcoder(coders.FastPrimitivesCoder(), 'full', self.cache_key)

  # Progressively larger read durations should surface progressively more of
  # the cached elements.
  for max_duration_secs, expected in ((1, [0]),
                                      (2, [0, 1]),
                                      (10, [0, 1, 2, 3, 4])):
    stream = ElementStream(
        self.pcoll, '', self.cache_key,
        max_n=100, max_duration_secs=max_duration_secs)
    self.assertSequenceEqual([e.value for e in stream.read()], expected)
def test_single_reader_with_processing_time_limiter(self):
  """Tests that we expect to see all the correctly emitted TestStreamPayloads.

  The reader is wrapped in a ProcessingTimeLimiter(2), so only events whose
  cumulative processing time is below two seconds should be emitted.
  """
  CACHED_PCOLLECTION_KEY = repr(CacheKey('arbitrary_key', '', '', ''))

  # Five elements; each is preceded by a processing-time advance. Only the
  # first two fall within the two-second processing-time limit below.
  values = (FileRecordsBuilder(tag=CACHED_PCOLLECTION_KEY)
            .advance_processing_time(1e-6)
            .add_element(element=0, event_time_secs=0)
            .advance_processing_time(1)
            .add_element(element=1, event_time_secs=1)
            .advance_processing_time(1)
            .add_element(element=2, event_time_secs=2)
            .advance_processing_time(1)
            .add_element(element=3, event_time_secs=2)
            .advance_processing_time(1)
            .add_element(element=4, event_time_secs=2)
            .build()) # yapf: disable

  cache = StreamingCache(cache_dir=None)
  cache.write(values, CACHED_PCOLLECTION_KEY)

  reader, _ = cache.read(CACHED_PCOLLECTION_KEY,
                         limiters=[ProcessingTimeLimiter(2)])
  coder = coders.FastPrimitivesCoder()
  events = list(reader)

  # Units here are in microseconds.
  # Expects that the elements are a slice of the original values where all
  # processing time is less than the duration.
  expected = [
      TestStreamPayload.Event(processing_time_event=TestStreamPayload.
                              Event.AdvanceProcessingTime(
                                  advance_duration=1)),
      TestStreamPayload.Event(
          element_event=TestStreamPayload.Event.AddElements(
              elements=[
                  TestStreamPayload.TimestampedElement(
                      encoded_element=coder.encode(0), timestamp=0)
              ],
              tag=CACHED_PCOLLECTION_KEY)),
      TestStreamPayload.Event(processing_time_event=TestStreamPayload.
                              Event.AdvanceProcessingTime(
                                  advance_duration=1 * 10**6)),
      TestStreamPayload.Event(element_event=TestStreamPayload.Event.
                              AddElements(elements=[
                                  TestStreamPayload.TimestampedElement(
                                      encoded_element=coder.encode(1),
                                      timestamp=1 * 10**6)
                              ],
                                          tag=CACHED_PCOLLECTION_KEY)),
  ]
  self.assertSequenceEqual(events, expected)
def get_elements(events):
  """Decodes and returns all elements carried by TestStreamFileRecords.

  Args:
    events: an iterable of events; anything that is not a
      TestStreamFileRecord is skipped.

  Returns:
    A list of the decoded elements, in encounter order.
  """
  coder = coders.FastPrimitivesCoder()
  elements = []
  for e in events:
    if not isinstance(e, TestStreamFileRecord):
      continue
    # Only records carrying an element event contribute elements.
    if e.recorded_event.element_event:
      decoded = [
          coder.decode(el.encoded_element)
          for el in e.recorded_event.element_event.elements
      ]
      elements.extend(decoded)
  return elements
def test_single_reader(self):
  """Tests that we expect to see all the correctly emitted TestStreamPayloads.
  """
  CACHED_PCOLLECTION_KEY = 'arbitrary_key'

  values = (FileRecordsBuilder(tag=CACHED_PCOLLECTION_KEY)
            .add_element(element=0, event_time_secs=0)
            .advance_processing_time(1)
            .add_element(element=1, event_time_secs=1)
            .advance_processing_time(1)
            .add_element(element=2, event_time_secs=2)
            .build()) # yapf: disable

  cache = StreamingCache(cache_dir=None)
  cache.write(values, CACHED_PCOLLECTION_KEY)
  reader, _ = cache.read(CACHED_PCOLLECTION_KEY)
  coder = coders.FastPrimitivesCoder()
  events = list(reader)

  def element_event(value, ts_micros):
    # Expected AddElements event for a single encoded value.
    return TestStreamPayload.Event(
        element_event=TestStreamPayload.Event.AddElements(
            elements=[
                TestStreamPayload.TimestampedElement(
                    encoded_element=coder.encode(value), timestamp=ts_micros)
            ],
            tag=CACHED_PCOLLECTION_KEY))

  def clock_event(advance_micros):
    # Expected processing-time advancement event.
    return TestStreamPayload.Event(
        processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
            advance_duration=advance_micros))

  # Units here are in microseconds.
  expected = [
      element_event(0, 0),
      clock_event(1 * 10**6),
      element_event(1, 1 * 10**6),
      clock_event(1 * 10**6),
      element_event(2, 2 * 10**6),
  ]
  self.assertSequenceEqual(events, expected)
def __init__(self, coder=coders.FastPrimitivesCoder(), events=None, output_tags=None):
  """Initializes the TestStream.

  Args:
    coder: (apache_beam.Coder) the element coder for any ElementEvents.
    events: (List[Event]) instructions for the TestStream to execute; tags
      referenced by these events must be a subset of output_tags.
    output_tags: (List[str]) a list of PCollection output tags.
  """
  super(TestStream, self).__init__()
  assert coder is not None
  self.coder = coder
  # The None key holds the main output's watermark.
  self.watermarks = {None: timestamp.MIN_TIMESTAMP}
  self.output_tags = set(output_tags) if output_tags else set()
  self._events = list(events) if events is not None else []

  # Every tag referenced by an event must have been declared up front.
  event_tags = set(
      e.tag
      for e in self._events if isinstance(e, (WatermarkEvent, ElementEvent)))
  assert event_tags.issubset(self.output_tags), \
      '{} is not a subset of {}'.format(event_tags, output_tags)
def test_capture_size_limit_reached_when_file_size_above_limit(self):
  # Shrink the capture budget so a single element blows past it.
  ib.options.capture_size_limit = 1
  cache = StreamingCache(cache_dir=None)
  cache.sink(['my_label'], is_capture=True)

  # One encoded element is already larger than the one-byte limit.
  record = TestStreamFileRecord(
      recorded_event=TestStreamPayload.Event(
          element_event=TestStreamPayload.Event.AddElements(
              elements=[
                  TestStreamPayload.TimestampedElement(
                      encoded_element=coders.FastPrimitivesCoder().encode(
                          'a'),
                      timestamp=0)
              ])))
  cache.write([record], 'my_label')
  self.assertTrue(cache.exists('my_label'))

  ie.current_env().set_cache_manager(cache)
  self.assertTrue(
      ie.current_env().options.capture_control.
      is_capture_size_limit_reached())
def test_single_reader(self):
  """Tests that we expect to see all the correctly emitted TestStreamPayloads.
  """
  in_memory_reader = InMemoryReader()
  # Three elements whose value, event time, and processing time coincide.
  for value in (0, 1, 2):
    in_memory_reader.add_element(
        element=value, event_time=value, processing_time=value)
  cache = StreamingCache([in_memory_reader])
  reader = cache.reader()
  coder = coders.FastPrimitivesCoder()
  events = all_events(reader)

  def element_event(value, ts_micros):
    # Expected AddElements event for a single encoded value.
    return TestStreamPayload.Event(
        element_event=TestStreamPayload.Event.AddElements(
            elements=[
                TestStreamPayload.TimestampedElement(
                    encoded_element=coder.encode(value), timestamp=ts_micros)
            ]))

  def clock_event(advance_micros):
    # Expected processing-time advancement event.
    return TestStreamPayload.Event(
        processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
            advance_duration=advance_micros))

  # Units here are in microseconds.
  expected = [
      element_event(0, 0),
      clock_event(1 * 10**6),
      element_event(1, 1 * 10**6),
      clock_event(1 * 10**6),
      element_event(2, 2 * 10**6),
  ]
  self.assertSequenceEqual(events, expected)
def test_capture_size_limit_reached_when_file_size_above_limit(self):
  # Shrink the capture budget so a single element blows past it.
  ib.options.capture_size_limit = 1
  cache = StreamingCache(cache_dir=None)
  cache.sink(['my_label'], is_capture=True)

  # One encoded element is already larger than the one-byte limit.
  record = TestStreamFileRecord(
      recorded_event=TestStreamPayload.Event(
          element_event=TestStreamPayload.Event.AddElements(
              elements=[
                  TestStreamPayload.TimestampedElement(
                      encoded_element=coders.FastPrimitivesCoder().encode(
                          'a'),
                      timestamp=0)
              ])))
  cache.write([record], 'my_label')
  self.assertTrue(cache.exists('my_label'))

  p = _build_an_empty_streaming_pipeline()
  ie.current_env().set_cache_manager(cache, p)

  limiter = capture_limiters.SizeLimiter(1)
  self.assertTrue(limiter.is_triggered())
def test_update_batch(self):
  """Checks element-count and mean-size counters across batched updates."""
  coder = coders.FastPrimitivesCoder()
  batch_converter = typehints.batch.BatchConverter.from_typehints(
      element_type=typehints.Any, batch_type=typehints.List[typehints.Any])
  opcounts = OperationCounters(
      CounterFactory(),
      'some-name',
      coder,
      0,
      producer_batch_converter=batch_converter)
  size_per_element = coder.estimate_size(50)

  # Nothing processed yet: zero elements, undefined (NaN) mean size.
  self.verify_counters(opcounts, 0, float('nan'))

  opcounts.update_from_batch(GlobalWindows.windowed_batch(list(range(100))))
  self.verify_counters(opcounts, 100, size_per_element)

  opcounts.update_from_batch(
      GlobalWindows.windowed_batch(list(range(100, 200))))
  self.verify_counters(opcounts, 200, size_per_element)
def __init__(self, coder=coders.FastPrimitivesCoder()):
  """Initializes the object with the given element coder.

  Args:
    coder: (apache_beam.Coder) the element coder; must not be None.
  """
  assert coder is not None
  self.coder = coder
  # No events recorded yet; the watermark starts at the lowest
  # representable timestamp.
  self.events = []
  self.current_watermark = timestamp.MIN_TIMESTAMP
def __init__(self, tag=None):
  """Initializes the builder for the given output tag.

  Args:
    tag: the output tag recorded in the file header; may be None.
  """
  # Records are encoded with the general-purpose FastPrimitivesCoder.
  self._coder = coders.FastPrimitivesCoder()
  self._records = []
  # The header identifies which output tag the records belong to.
  self._header = TestStreamFileHeader(tag=tag)
def test_multiple_readers(self):
  """Tests that the service advances the clock with multiple outputs."""
  letters = InMemoryReader('letters')
  letters.advance_watermark(0, 1)
  letters.add_element(element='a', event_time=0, processing_time=1)
  letters.advance_watermark(10, 11)
  letters.add_element(element='b', event_time=10, processing_time=11)

  numbers = InMemoryReader('numbers')
  for value, processing_time in ((1, 2), (2, 3), (2, 4)):
    numbers.add_element(
        element=value, event_time=0, processing_time=processing_time)

  late = InMemoryReader('late')
  late.add_element(element='late', event_time=0, processing_time=101)

  cache = StreamingCache([letters, numbers, late])
  reader = cache.reader()
  coder = coders.FastPrimitivesCoder()
  events = all_events(reader)

  def clock_event(advance_micros):
    # Expected processing-time advancement event.
    return TestStreamPayload.Event(
        processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
            advance_duration=advance_micros))

  def watermark_event(new_watermark, tag):
    # Expected watermark advancement event for one output tag.
    return TestStreamPayload.Event(
        watermark_event=TestStreamPayload.Event.AdvanceWatermark(
            new_watermark=new_watermark, tag=tag))

  def element_event(value, ts_micros, tag):
    # Expected AddElements event for a single encoded value.
    return TestStreamPayload.Event(
        element_event=TestStreamPayload.Event.AddElements(
            elements=[
                TestStreamPayload.TimestampedElement(
                    encoded_element=coder.encode(value), timestamp=ts_micros)
            ],
            tag=tag))

  expected = [
      clock_event(1 * 10**6),  # Advances clock from 0 to 1
      watermark_event(0, 'letters'),
      element_event('a', 0, 'letters'),
      clock_event(1 * 10**6),  # Advances clock from 1 to 2
      element_event(1, 0, 'numbers'),
      clock_event(1 * 10**6),  # Advances clock from 2 to 3
      element_event(2, 0, 'numbers'),
      clock_event(1 * 10**6),  # Advances clock from 3 to 4
      element_event(2, 0, 'numbers'),
      clock_event(7 * 10**6),  # Advances clock from 4 to 11
      watermark_event(10 * 10**6, 'letters'),
      element_event('b', 10 * 10**6, 'letters'),
      clock_event(90 * 10**6),  # Advances clock from 11 to 101
      element_event('late', 0, 'late'),
  ]
  self.assertSequenceEqual(events, expected)
def __init__(self, coder=coders.FastPrimitivesCoder(), events=()):
  """Initializes the TestStream.

  Args:
    coder: (apache_beam.Coder) the element coder for any ElementEvents.
    events: (Iterable[Event]) instructions for the TestStream to execute.
  """
  super(TestStream, self).__init__()
  assert coder is not None
  self.coder = coder
  self.events = list(events)
  # The watermark starts at the lowest representable timestamp.
  self.current_watermark = timestamp.MIN_TIMESTAMP
def __init__(self, tag=None):
  """Initializes the builder for the given output tag.

  Args:
    tag: the output tag recorded in the file header; may be None.
  """
  # Records are encoded with the general-purpose FastPrimitivesCoder.
  self._coder = coders.FastPrimitivesCoder()
  self._records = []
  # The header identifies which output tag the records belong to.
  self._header = beam_interactive_api_pb2.TestStreamFileHeader(tag=tag)
def test_multiple_readers(self):
  """Tests that the service advances the clock with multiple outputs.
  """
  CACHED_LETTERS = repr(CacheKey('letters', '', '', ''))
  CACHED_NUMBERS = repr(CacheKey('numbers', '', '', ''))
  CACHED_LATE = repr(CacheKey('late', '', '', ''))

  letters = (FileRecordsBuilder(CACHED_LETTERS)
             .advance_processing_time(1)
             .advance_watermark(watermark_secs=0)
             .add_element(element='a', event_time_secs=0)
             .advance_processing_time(10)
             .advance_watermark(watermark_secs=10)
             .add_element(element='b', event_time_secs=10)
             .build()) # yapf: disable

  numbers = (FileRecordsBuilder(CACHED_NUMBERS)
             .advance_processing_time(2)
             .add_element(element=1, event_time_secs=0)
             .advance_processing_time(1)
             .add_element(element=2, event_time_secs=0)
             .advance_processing_time(1)
             .add_element(element=2, event_time_secs=0)
             .build()) # yapf: disable

  late = (FileRecordsBuilder(CACHED_LATE)
          .advance_processing_time(101)
          .add_element(element='late', event_time_secs=0)
          .build()) # yapf: disable

  cache = StreamingCache(cache_dir=None)
  for values, key in ((letters, CACHED_LETTERS), (numbers, CACHED_NUMBERS),
                      (late, CACHED_LATE)):
    cache.write(values, key)

  reader = cache.read_multiple([[CACHED_LETTERS], [CACHED_NUMBERS],
                                [CACHED_LATE]])
  coder = coders.FastPrimitivesCoder()
  events = list(reader)

  def clock_event(advance_micros):
    # Expected processing-time advancement event.
    return TestStreamPayload.Event(
        processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
            advance_duration=advance_micros))

  def watermark_event(new_watermark, tag):
    # Expected watermark advancement event for one output tag.
    return TestStreamPayload.Event(
        watermark_event=TestStreamPayload.Event.AdvanceWatermark(
            new_watermark=new_watermark, tag=tag))

  def element_event(value, ts_micros, tag):
    # Expected AddElements event for a single encoded value.
    return TestStreamPayload.Event(
        element_event=TestStreamPayload.Event.AddElements(
            elements=[
                TestStreamPayload.TimestampedElement(
                    encoded_element=coder.encode(value), timestamp=ts_micros)
            ],
            tag=tag))

  # Units here are in microseconds.
  expected = [
      clock_event(1 * 10**6),  # Advances clock from 0 to 1
      watermark_event(0, CACHED_LETTERS),
      element_event('a', 0, CACHED_LETTERS),
      clock_event(1 * 10**6),  # Advances clock from 1 to 2
      element_event(1, 0, CACHED_NUMBERS),
      clock_event(1 * 10**6),  # Advances clock from 2 to 3
      element_event(2, 0, CACHED_NUMBERS),
      clock_event(1 * 10**6),  # Advances clock from 3 to 4
      element_event(2, 0, CACHED_NUMBERS),
      clock_event(7 * 10**6),  # Advances clock from 4 to 11
      watermark_event(10 * 10**6, CACHED_LETTERS),
      element_event('b', 10 * 10**6, CACHED_LETTERS),
      clock_event(90 * 10**6),  # Advances clock from 11 to 101
      element_event('late', 0, CACHED_LATE),
  ]
  self.assertSequenceEqual(events, expected)