def run_benchmark(num_runs=50, input_per_source=4000, num_sources=4):
    """Benchmark reading fake side-input sources through the iterator fn.

    Runs ``num_runs`` timed iterations, each reading ``num_sources`` sources
    of ``input_per_source`` elements, and prints per-run, average and
    per-element timings.

    Args:
        num_runs: Number of timed repetitions to average over.
        input_per_source: Elements generated by each fake source.
        num_sources: Number of sources read concurrently per run.
    """
    print("Number of runs:", num_runs)
    print("Input size:", num_sources * input_per_source)
    print("Sources:", num_sources)
    times = []
    for i in range(num_runs):
        counter_factory = CounterFactory()
        state_sampler = statesampler.StateSampler('basic', counter_factory)
        # BUG FIX: the sampler must be started (and stopped below) for
        # scoped_state to track state transitions; the sibling variant of
        # this benchmark already does this.
        state_sampler.start()
        with state_sampler.scoped_state('step1', 'state'):
            si_counter = opcounters.SideInputReadCounter(
                counter_factory, state_sampler, 'step1', 1)
            # Deliberately replace with a no-op counter so that counter
            # bookkeeping overhead is excluded from the measured time.
            si_counter = opcounters.NoOpTransformIOCounter()
            sources = [
                FakeSource(long_generator(i, input_per_source))
                for i in range(num_sources)]
            iterator_fn = sideinputs.get_iterator_fn_for_sources(
                sources, read_counter=si_counter)
            start = time.time()
            list(iterator_fn())
            time_cost = time.time() - start
            times.append(time_cost)
        state_sampler.stop()
    print("Runtimes:", times)

    # BUG FIX: use true division. Floor division (//) on float seconds
    # truncated sub-second averages to 0.0, making the printed average and
    # per-element timings meaningless.
    avg_runtime = sum(times) / len(times)
    print("Average runtime:", avg_runtime)
    print("Time per element:", avg_runtime / (input_per_source * num_sources))
def test_source_iterator_fn_exception(self):
    """An exception raised by one of several sources reaches the consumer.

    Elements emitted by all sources before the failure must still have
    been observed.
    """
    class MyException(Exception):
        pass

    def exception_generator():
        yield 0
        time.sleep(0.1)
        raise MyException('I am an exception!')

    def perpetual_generator(value):
        while True:
            yield value
            time.sleep(0.1)

    sources = [
        FakeSource(perpetual_generator(1)),
        FakeSource(perpetual_generator(2)),
        FakeSource(perpetual_generator(3)),
        FakeSource(perpetual_generator(4)),
        FakeSource(exception_generator()),
    ]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)
    seen = set()
    with self.assertRaises(MyException):
        for value in iterator_fn():
            seen.add(value.value)
    # BUG FIX: in Python 3 a list is never equal to a range object, so the
    # original comparison against range(5) could not pass. Compare lists.
    self.assertEqual(sorted(seen), list(range(5)))
def test_source_iterator_fn_exception(self):
    """An exception raised by one of several sources reaches the consumer.

    Elements emitted by all sources before the failure must still have
    been observed.
    """
    class MyException(Exception):
        pass

    def exception_generator():
        yield 0
        time.sleep(0.1)
        raise MyException('I am an exception!')

    def perpetual_generator(value):
        while True:
            yield value
            time.sleep(0.1)

    sources = [
        FakeSource(perpetual_generator(1)),
        FakeSource(perpetual_generator(2)),
        FakeSource(perpetual_generator(3)),
        FakeSource(perpetual_generator(4)),
        FakeSource(exception_generator()),
    ]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)
    seen = set()
    with self.assertRaises(MyException):
        for value in iterator_fn():
            seen.add(value.value)
    # BUG FIX: in Python 3 a list is never equal to a range object, so the
    # original comparison against range(5) could not pass. Compare lists.
    self.assertEqual(sorted(seen), list(range(5)))
def _read_side_inputs(self, tags_and_types):
    # type: (...) -> Iterator[apache_sideinputs.SideInputMap]

    """Generator reading side inputs in the order prescribed by tags_and_types.

    Args:
      tags_and_types: List of tuples (tag, type). Each side input has a string
        tag that is specified in the worker instruction. The type is actually
        a boolean which is True for singleton input (read just first value)
        and False for collection input (read all values).

    Yields:
      With each iteration it yields the result of reading an entire side source
      either in singleton or collection mode according to the tags_and_types
      argument.
    """
    # Only call this on the old path where side_input_maps was not
    # provided directly.
    assert self.side_input_maps is None

    # We will read the side inputs in the order prescribed by the
    # tags_and_types argument because this is exactly the order needed to
    # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
    # getting the side inputs.
    #
    # Note that for each tag there could be several read operations in the
    # specification. This can happen for instance if the source has been
    # sharded into several files.
    for i, (side_tag, view_class, view_options) in enumerate(tags_and_types):
        sources = []
        # Using the side_tag in the lambda below will trigger a pylint warning.
        # However in this case it is fine because the lambda is used right away
        # while the variable has the value assigned by the current iteration of
        # the for loop.
        # pylint: disable=cell-var-from-loop
        for si in filter(lambda o: o.tag == side_tag, self.spec.side_inputs):
            if not isinstance(si, operation_specs.WorkerSideInputSource):
                raise NotImplementedError('Unknown side input type: %r' % si)
            sources.append(si.source)
        # The tracking of time spent reading and bytes read from side inputs
        # is behind an experiment flag to test its performance impact.
        si_counter = opcounters.SideInputReadCounter(
            self.counter_factory,
            self.state_sampler,
            declaring_step=self.name_context.step_name,
            # Inputs are 1-indexed, so we add 1 to i in the side input id
            input_index=i + 1)
        iterator_fn = sideinputs.get_iterator_fn_for_sources(
            sources, read_counter=si_counter)

        # Backwards compatibility for pre BEAM-733 SDKs.
        if isinstance(view_options, tuple):
            if view_class == pvalue.AsSingleton:
                has_default, default = view_options
                view_options = {'default': default} if has_default else {}
            else:
                view_options = {}

        yield apache_sideinputs.SideInputMap(
            view_class,
            view_options,
            sideinputs.EmulatedIterable(iterator_fn))
def test_single_source_iterator_fn(self):
    """A single source is read completely and its elements all appear."""
    sources = [
        FakeSource([0, 1, 2, 3, 4, 5]),
    ]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        sources, max_reader_threads=2)
    # BUG FIX: in Python 3 a list is never equal to a range object, so this
    # assertion could never pass. Compare against a list instead.
    assert list(strip_windows(iterator_fn())) == list(range(6))
def test_single_source_iterator_fn(self):
    """A single source is read completely and its elements all appear."""
    sources = [
        FakeSource([0, 1, 2, 3, 4, 5]),
    ]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        sources, max_reader_threads=2)
    # BUG FIX: in Python 3 a list is never equal to a range object, so this
    # assertion could never pass. Compare against a list instead.
    assert list(strip_windows(iterator_fn())) == list(range(6))
def run_benchmark(num_runs=50, input_per_source=4000, num_sources=4):
    """Time reading elements from several fake side-input sources.

    Performs ``num_runs`` timed passes, each consuming ``num_sources``
    sources of ``input_per_source`` elements via the side-input iterator,
    then prints the individual runtimes, their average, and the average
    time per element.
    """
    total_elements = num_sources * input_per_source
    print("Number of runs:", num_runs)
    print("Input size:", total_elements)
    print("Sources:", num_sources)

    runtimes = []
    for _ in range(num_runs):
        counter_factory = CounterFactory()
        sampler = statesampler.StateSampler('basic', counter_factory)
        sampler.start()
        with sampler.scoped_state('step1', 'state'):
            si_counter = opcounters.SideInputReadCounter(
                counter_factory, sampler, 'step1', 1)
            # Deliberately swap in a no-op counter so counter bookkeeping
            # does not skew the measured time.
            si_counter = opcounters.NoOpTransformIOCounter()
            sources = [
                FakeSource(long_generator(source_index, input_per_source))
                for source_index in range(num_sources)]
            iterator_fn = sideinputs.get_iterator_fn_for_sources(
                sources, read_counter=si_counter)
            started_at = time.time()
            list(iterator_fn())
            runtimes.append(time.time() - started_at)
        sampler.stop()

    print("Runtimes:", runtimes)
    average = sum(runtimes) / len(runtimes)
    print("Average runtime:", average)
    print("Time per element:", average / total_elements)
def _read_side_inputs(self, tags_and_types):
    """Generator reading side inputs in the order prescribed by tags_and_types.

    Args:
      tags_and_types: List of tuples (tag, type). Each side input has a string
        tag that is specified in the worker instruction. The type is actually
        a boolean which is True for singleton input (read just first value)
        and False for collection input (read all values).

    Yields:
      With each iteration it yields the result of reading an entire side source
      either in singleton or collection mode according to the tags_and_types
      argument.
    """
    # Only call this on the old path where side_input_maps was not
    # provided directly.
    assert self.side_input_maps is None

    # We will read the side inputs in the order prescribed by the
    # tags_and_types argument because this is exactly the order needed to
    # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
    # getting the side inputs.
    #
    # Note that for each tag there could be several read operations in the
    # specification. This can happen for instance if the source has been
    # sharded into several files.
    for i, (side_tag, view_class, view_options) in enumerate(tags_and_types):
        sources = []
        # Using the side_tag in the lambda below will trigger a pylint warning.
        # However in this case it is fine because the lambda is used right away
        # while the variable has the value assigned by the current iteration of
        # the for loop.
        # pylint: disable=cell-var-from-loop
        for si in filter(
                lambda o: o.tag == side_tag, self.spec.side_inputs):
            if not isinstance(si, operation_specs.WorkerSideInputSource):
                raise NotImplementedError('Unknown side input type: %r' % si)
            sources.append(si.source)
        # The tracking of time spent reading and bytes read from side inputs
        # is behind an experiment flag to test its performance impact.
        si_counter = opcounters.SideInputReadCounter(
            self.counter_factory,
            self.state_sampler,
            declaring_step=self.name_context.step_name,
            # Inputs are 1-indexed, so we add 1 to i in the side input id
            input_index=i + 1)
        iterator_fn = sideinputs.get_iterator_fn_for_sources(
            sources, read_counter=si_counter)

        # Backwards compatibility for pre BEAM-733 SDKs.
        if isinstance(view_options, tuple):
            if view_class == pvalue.AsSingleton:
                has_default, default = view_options
                view_options = {'default': default} if has_default else {}
            else:
                view_options = {}

        yield apache_sideinputs.SideInputMap(
            view_class,
            view_options,
            sideinputs.EmulatedIterable(iterator_fn))
def test_multiple_sources_single_reader_iterator_fn(self):
    """Multiple sources (including an empty one) read by a single thread."""
    sources = [
        FakeSource([0]),
        FakeSource([1, 2, 3, 4, 5]),
        FakeSource([]),
        FakeSource([6, 7, 8, 9, 10]),
    ]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        sources, max_reader_threads=1)
    # BUG FIX: in Python 3 a list is never equal to a range object, so this
    # assertion could never pass. Compare against a list instead.
    assert list(strip_windows(iterator_fn())) == list(range(11))
def test_bytes_read_behind_experiment(self):
    """Without the experiment flag enabled, no bytes-read count is reported."""
    read_counter = mock.MagicMock()
    records = ['a', 'b', 'c', 'd']
    sources = [FakeSource(records, notify_observers=True)]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        sources, max_reader_threads=3, read_counter=read_counter)
    observed = list(strip_windows(iterator_fn()))
    assert observed == records
    read_counter.add_bytes_read.assert_not_called()
def test_multiple_sources_single_reader_iterator_fn(self):
    """Multiple sources (including an empty one) read by a single thread."""
    sources = [
        FakeSource([0]),
        FakeSource([1, 2, 3, 4, 5]),
        FakeSource([]),
        FakeSource([6, 7, 8, 9, 10]),
    ]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        sources, max_reader_threads=1)
    # BUG FIX: in Python 3 a list is never equal to a range object, so this
    # assertion could never pass. Compare against a list instead.
    assert list(strip_windows(iterator_fn())) == list(range(11))
def test_bytes_read_behind_experiment(self):
    """Without the experiment flag enabled, no bytes-read count is reported."""
    read_counter = mock.MagicMock()
    records = ['a', 'b', 'c', 'd']
    sources = [FakeSource(records, notify_observers=True)]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        sources, max_reader_threads=3, read_counter=read_counter)
    observed = list(strip_windows(iterator_fn()))
    assert observed == records
    read_counter.add_bytes_read.assert_not_called()
def test_bytes_read_are_reported(self):
    """With the side-input IO metrics experiment on, bytes read are counted."""
    RuntimeValueProvider.set_runtime_options(
        {'experiments': 'sideinput_io_metrics,other'})
    read_counter = mock.MagicMock()
    records = ['a', 'b', 'c', 'd']
    sources = [FakeSource(records, notify_observers=True)]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        sources, max_reader_threads=3, read_counter=read_counter)
    observed = list(strip_windows(iterator_fn()))
    assert observed == records
    # Four single-character records -> four bytes reported.
    read_counter.add_bytes_read.assert_called_with(4)
    # Remove runtime options from the runtime value provider.
    RuntimeValueProvider.set_runtime_options({})
def test_bytes_read_are_reported(self):
    """With the side-input IO metrics v2 experiment on, bytes read are counted."""
    RuntimeValueProvider.set_runtime_options(
        {'experiments': ['sideinput_io_metrics_v2', 'other']})
    read_counter = mock.MagicMock()
    records = ['a', 'b', 'c', 'd']
    sources = [FakeSource(records, notify_observers=True)]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        sources, max_reader_threads=3, read_counter=read_counter)
    observed = list(strip_windows(iterator_fn()))
    assert observed == records
    # Four single-character records -> four bytes reported.
    read_counter.add_bytes_read.assert_called_with(4)
    # Remove runtime options from the runtime value provider.
    RuntimeValueProvider.set_runtime_options({})
def _read_side_inputs(self, tags_and_types):
    """Generator reading side inputs in the order prescribed by tags_and_types.

    Args:
      tags_and_types: List of tuples (tag, type). Each side input has a string
        tag that is specified in the worker instruction. The type is actually
        a boolean which is True for singleton input (read just first value)
        and False for collection input (read all values).

    Yields:
      With each iteration it yields the result of reading an entire side source
      either in singleton or collection mode according to the tags_and_types
      argument.
    """
    # We will read the side inputs in the order prescribed by the
    # tags_and_types argument because this is exactly the order needed to
    # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
    # getting the side inputs.
    #
    # Note that for each tag there could be several read operations in the
    # specification. This can happen for instance if the source has been
    # sharded into several files.
    for side_tag, view_class, view_options in tags_and_types:
        sources = []
        # Using the side_tag in the lambda below will trigger a pylint warning.
        # However in this case it is fine because the lambda is used right away
        # while the variable has the value assigned by the current iteration of
        # the for loop.
        # pylint: disable=cell-var-from-loop
        # BUG FIX: itertools.ifilter was removed in Python 3; the built-in
        # filter is equivalent here since the result is only iterated.
        for si in filter(lambda o: o.tag == side_tag, self.spec.side_inputs):
            if not isinstance(si, operation_specs.WorkerSideInputSource):
                raise NotImplementedError('Unknown side input type: %r' % si)
            sources.append(si.source)
        iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)

        # Backwards compatibility for pre BEAM-733 SDKs.
        if isinstance(view_options, tuple):
            if view_class == pvalue.AsSingleton:
                has_default, default = view_options
                view_options = {'default': default} if has_default else {}
            else:
                view_options = {}

        yield apache_sideinputs.SideInputMap(
            view_class,
            view_options,
            sideinputs.EmulatedIterable(iterator_fn))
def _read_side_inputs(self, tags_and_types):
    """Generator reading side inputs in the order prescribed by tags_and_types.

    Args:
      tags_and_types: List of tuples (tag, type). Each side input has a string
        tag that is specified in the worker instruction. The type is actually
        a boolean which is True for singleton input (read just first value)
        and False for collection input (read all values).

    Yields:
      With each iteration it yields the result of reading an entire side source
      either in singleton or collection mode according to the tags_and_types
      argument.
    """
    # We will read the side inputs in the order prescribed by the
    # tags_and_types argument because this is exactly the order needed to
    # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
    # getting the side inputs.
    #
    # Note that for each tag there could be several read operations in the
    # specification. This can happen for instance if the source has been
    # sharded into several files.
    for side_tag, view_class, view_options in tags_and_types:
        sources = []
        # Using the side_tag in the lambda below will trigger a pylint warning.
        # However in this case it is fine because the lambda is used right away
        # while the variable has the value assigned by the current iteration of
        # the for loop.
        # pylint: disable=cell-var-from-loop
        # BUG FIX: itertools.ifilter was removed in Python 3; the built-in
        # filter is equivalent here since the result is only iterated.
        for si in filter(
                lambda o: o.tag == side_tag, self.spec.side_inputs):
            if not isinstance(si, operation_specs.WorkerSideInputSource):
                raise NotImplementedError('Unknown side input type: %r' % si)
            sources.append(si.source)
        iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)

        # Backwards compatibility for pre BEAM-733 SDKs.
        if isinstance(view_options, tuple):
            if view_class == pvalue.AsSingleton:
                has_default, default = view_options
                view_options = {'default': default} if has_default else {}
            else:
                view_options = {}

        yield apache_sideinputs.SideInputMap(
            view_class,
            view_options,
            sideinputs.EmulatedIterable(iterator_fn))
def test_source_iterator_single_source_exception(self):
    """An exception from the sole source propagates after its elements."""
    class MyException(Exception):
        pass

    def exception_generator():
        yield 0
        raise MyException('I am an exception!')

    sources = [FakeSource(exception_generator())]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)
    seen = set()
    with self.assertRaises(MyException):
        for element in iterator_fn():
            seen.add(element.value)
    # The element yielded before the failure must have been delivered.
    self.assertEqual(sorted(seen), [0])
def test_source_iterator_single_source_exception(self):
    """An exception from the sole source propagates after its elements."""
    class MyException(Exception):
        pass

    def exception_generator():
        yield 0
        raise MyException('I am an exception!')

    sources = [FakeSource(exception_generator())]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)
    seen = set()
    with self.assertRaises(MyException):
        for element in iterator_fn():
            seen.add(element.value)
    # The element yielded before the failure must have been delivered.
    self.assertEqual(sorted(seen), [0])