def test_empty(self):
    """Checks that a key written with no records reads back as empty."""
    key = repr(CacheKey('arbitrary_key', '', '', ''))
    cache = StreamingCache(cache_dir=None)

    # Nothing has been written to this freshly created cache yet.
    self.assertFalse(cache.exists(key))

    cache.write([], key)
    reader, _ = cache.read(key)

    # Draining a reader over an empty write must yield an empty list.
    self.assertFalse(list(reader))
def test_single_reader_with_processing_time_limiter(self):
    """Tests that a ProcessingTimeLimiter truncates what the reader emits.

    Five elements are written, separated by processing-time advances, but the
    reader is created with ``ProcessingTimeLimiter(2)`` and is expected to
    emit only the leading events of the recording.
    """
    cache_key = repr(CacheKey('arbitrary_key', '', '', ''))

    # The first element follows an advance of 1e-6; every later element is
    # preceded by an advance of 1.
    builder = (
        FileRecordsBuilder(tag=cache_key)
        .advance_processing_time(1e-6)
        .add_element(element=0, event_time_secs=0))
    for element, event_time_secs in ((1, 1), (2, 2), (3, 2), (4, 2)):
        builder = (
            builder
            .advance_processing_time(1)
            .add_element(element=element, event_time_secs=event_time_secs))
    values = builder.build()

    cache = StreamingCache(cache_dir=None)
    cache.write(values, cache_key)
    reader, _ = cache.read(cache_key, limiters=[ProcessingTimeLimiter(2)])
    events = list(reader)

    coder = coders.FastPrimitivesCoder()

    def advance_event(duration_micros):
        # Expected payload for one processing-time advance.
        return TestStreamPayload.Event(
            processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
                advance_duration=duration_micros))

    def element_event(value, timestamp_micros):
        # Expected payload for one encoded element under the cache key's tag.
        return TestStreamPayload.Event(
            element_event=TestStreamPayload.Event.AddElements(
                elements=[
                    TestStreamPayload.TimestampedElement(
                        encoded_element=coder.encode(value),
                        timestamp=timestamp_micros)
                ],
                tag=cache_key))

    # Units here are in microseconds.
    # Only the slice of the original values whose processing time is less
    # than the limiter's duration is expected.
    expected = [
        advance_event(1),
        element_event(0, 0),
        advance_event(1 * 10**6),
        element_event(1, 1 * 10**6),
    ]
    self.assertSequenceEqual(events, expected)
def test_single_reader(self):
    """Tests that a single reader emits every written TestStreamPayload event.

    Three elements are written with processing-time advances between them and
    all five resulting events are expected back, in order.
    """
    cache_key = 'arbitrary_key'

    # Element 0 comes first; elements 1 and 2 each follow an advance of 1.
    builder = FileRecordsBuilder(tag=cache_key).add_element(
        element=0, event_time_secs=0)
    for n in (1, 2):
        builder = (
            builder
            .advance_processing_time(1)
            .add_element(element=n, event_time_secs=n))
    values = builder.build()

    cache = StreamingCache(cache_dir=None)
    cache.write(values, cache_key)
    reader, _ = cache.read(cache_key)
    events = list(reader)

    coder = coders.FastPrimitivesCoder()

    def advance_event(duration_micros):
        # Expected payload for one processing-time advance.
        return TestStreamPayload.Event(
            processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
                advance_duration=duration_micros))

    def element_event(value, timestamp_micros):
        # Expected payload for one encoded element under the cache key's tag.
        return TestStreamPayload.Event(
            element_event=TestStreamPayload.Event.AddElements(
                elements=[
                    TestStreamPayload.TimestampedElement(
                        encoded_element=coder.encode(value),
                        timestamp=timestamp_micros)
                ],
                tag=cache_key))

    # Units here are in microseconds.
    expected = [
        element_event(0, 0),
        advance_event(1 * 10**6),
        element_event(1, 1 * 10**6),
        advance_event(1 * 10**6),
        element_event(2, 2 * 10**6),
    ]
    self.assertSequenceEqual(events, expected)
def test_streaming_cache_uses_local_ib_cache_root(self):
    """Checks that StreamingCache honours a local Interactive Beam cache_root.

    Verifies that ``StreamingCache._cache_dir`` equals the ``cache_root`` set
    under Interactive Beam for a local directory, and that the values read
    back from that cache are the same as those read from a cache created with
    default settings (``cache_dir=None``).
    """
    CACHED_PCOLLECTION_KEY = repr(CacheKey('arbitrary_key', '', '', ''))
    values = (FileRecordsBuilder(CACHED_PCOLLECTION_KEY)
              .advance_processing_time(1)
              .advance_watermark(watermark_secs=0)
              .add_element(element=1, event_time_secs=0)
              .build())  # yapf: disable

    # Baseline: write and read through a cache using default settings.
    local_cache = StreamingCache(cache_dir=None)
    local_cache.write(values, CACHED_PCOLLECTION_KEY)
    reader_one, _ = local_cache.read(CACHED_PCOLLECTION_KEY)
    pcoll_list_one = list(reader_one)

    # Point the Interactive Beam cache root at a local directory.
    ib.options.cache_root = '/tmp/it-test/'
    try:
        cache_manager_with_ib_option = StreamingCache(
            cache_dir=ib.options.cache_root)

        self.assertEqual(
            ib.options.cache_root, cache_manager_with_ib_option._cache_dir)

        cache_manager_with_ib_option.write(values, CACHED_PCOLLECTION_KEY)
        reader_two, _ = cache_manager_with_ib_option.read(
            CACHED_PCOLLECTION_KEY)
        pcoll_list_two = list(reader_two)

        self.assertEqual(pcoll_list_one, pcoll_list_two)
    finally:
        # ib.options is shared global state: reset it even when an assertion
        # or a cache operation fails, so the setting cannot leak into other
        # tests.
        ib.options.cache_root = None
def test_read_and_write(self):
    """An integration test between the Sink and Source.

    This ensures that the sink and source speak the same language in terms of
    coders, protos, order, and units. It also checks that the key passed to
    the sink with ``is_capture=True`` shows up in ``cache.capture_keys``.
    """
    CACHED_RECORDS = repr(CacheKey('records', '', '', ''))

    # Units here are in seconds.
    # NOTE: output_tags must be a real one-element tuple. `(CACHED_RECORDS)`
    # without the trailing comma is just the string itself, so anything that
    # iterates it would see individual characters rather than one tag.
    test_stream = (
        TestStream(output_tags=(CACHED_RECORDS, ))
        .advance_watermark_to(0, tag=CACHED_RECORDS)
        .advance_processing_time(5)
        .add_elements(['a', 'b', 'c'], tag=CACHED_RECORDS)
        .advance_watermark_to(10, tag=CACHED_RECORDS)
        .advance_processing_time(1)
        .add_elements(
            [
                TimestampedValue('1', 15),
                TimestampedValue('2', 15),
                TimestampedValue('3', 15)
            ],
            tag=CACHED_RECORDS))  # yapf: disable

    coder = SafeFastPrimitivesCoder()
    cache = StreamingCache(cache_dir=None, sample_resolution_sec=1.0)

    # Assert that there are no capture keys at first.
    self.assertEqual(cache.capture_keys, set())

    options = StandardOptions(streaming=True)
    with TestPipeline(options=options) as p:
        records = (p | test_stream)[CACHED_RECORDS]

        # pylint: disable=expression-not-assigned
        records | cache.sink([CACHED_RECORDS], is_capture=True)

    # Read only after the `with` block has exited.
    reader, _ = cache.read(CACHED_RECORDS)
    actual_events = list(reader)

    # Assert that the capture keys are forwarded correctly.
    self.assertEqual(cache.capture_keys, {CACHED_RECORDS})

    # Units here are in microseconds.
    expected_events = [
        TestStreamPayload.Event(
            processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
                advance_duration=5 * 10**6)),
        TestStreamPayload.Event(
            watermark_event=TestStreamPayload.Event.AdvanceWatermark(
                new_watermark=0, tag=CACHED_RECORDS)),
        TestStreamPayload.Event(
            element_event=TestStreamPayload.Event.AddElements(
                elements=[
                    TestStreamPayload.TimestampedElement(
                        encoded_element=coder.encode('a'), timestamp=0),
                    TestStreamPayload.TimestampedElement(
                        encoded_element=coder.encode('b'), timestamp=0),
                    TestStreamPayload.TimestampedElement(
                        encoded_element=coder.encode('c'), timestamp=0),
                ],
                tag=CACHED_RECORDS)),
        TestStreamPayload.Event(
            processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
                advance_duration=1 * 10**6)),
        TestStreamPayload.Event(
            watermark_event=TestStreamPayload.Event.AdvanceWatermark(
                new_watermark=10 * 10**6, tag=CACHED_RECORDS)),
        TestStreamPayload.Event(
            element_event=TestStreamPayload.Event.AddElements(
                elements=[
                    TestStreamPayload.TimestampedElement(
                        encoded_element=coder.encode('1'),
                        timestamp=15 * 10**6),
                    TestStreamPayload.TimestampedElement(
                        encoded_element=coder.encode('2'),
                        timestamp=15 * 10**6),
                    TestStreamPayload.TimestampedElement(
                        encoded_element=coder.encode('3'),
                        timestamp=15 * 10**6),
                ],
                tag=CACHED_RECORDS)),
    ]
    self.assertEqual(actual_events, expected_events)
def test_read_and_write(self):
    """Integration test covering the cache Sink and Source together.

    Writes a TestStream through ``cache.sink`` and reads it back, to ensure
    that both sides agree on coders, protos, ordering, and units.
    """
    # NOTE(review): another `test_read_and_write` is defined earlier in this
    # view; if both live in the same class this definition shadows it and the
    # earlier test never runs - confirm.
    tag = 'records'

    # Units here are in seconds.
    late_values = [
        TimestampedValue('1', 15),
        TimestampedValue('2', 15),
        TimestampedValue('3', 15),
    ]
    test_stream = (
        TestStream()
        .advance_watermark_to(0, tag=tag)
        .advance_processing_time(5)
        .add_elements(['a', 'b', 'c'], tag=tag)
        .advance_watermark_to(10, tag=tag)
        .advance_processing_time(1)
        .add_elements(late_values, tag=tag))  # yapf: disable

    coder = SafeFastPrimitivesCoder()
    cache = StreamingCache(cache_dir=None, sample_resolution_sec=1.0)

    options = StandardOptions(streaming=True)
    options.view_as(DebugOptions).add_experiment(
        'passthrough_pcollection_output_ids')

    with TestPipeline(options=options) as p:
        # pylint: disable=expression-not-assigned
        p | test_stream | cache.sink([tag])

    # Read only after the `with` block has exited.
    reader, _ = cache.read(tag)
    actual_events = list(reader)

    def advance_event(duration_micros):
        # Expected payload for one processing-time advance.
        return TestStreamPayload.Event(
            processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
                advance_duration=duration_micros))

    def watermark_event(watermark_micros):
        # Expected payload for one watermark advance on the tag.
        return TestStreamPayload.Event(
            watermark_event=TestStreamPayload.Event.AdvanceWatermark(
                new_watermark=watermark_micros, tag=tag))

    def elements_event(raw_values, timestamp_micros):
        # Expected payload for a batch of elements sharing one timestamp.
        return TestStreamPayload.Event(
            element_event=TestStreamPayload.Event.AddElements(
                elements=[
                    TestStreamPayload.TimestampedElement(
                        encoded_element=coder.encode(raw),
                        timestamp=timestamp_micros) for raw in raw_values
                ],
                tag=tag))

    # Units here are in microseconds.
    expected_events = [
        advance_event(5 * 10**6),
        watermark_event(0),
        elements_event(['a', 'b', 'c'], 0),
        advance_event(1 * 10**6),
        watermark_event(10 * 10**6),
        elements_event(['1', '2', '3'], 15 * 10**6),
    ]
    self.assertEqual(actual_events, expected_events)