예제 #1
0
def run_benchmark(num_runs=50, input_per_source=4000, num_sources=4):
  """Time full passes over a side-input iterator built from fake sources.

  For each run, `num_sources` FakeSource objects are created, wrapped with
  `sideinputs.get_iterator_fn_for_sources`, and the resulting iterator is
  fully consumed while being timed. The per-run timings, their average and
  the average time per element are printed.

  Args:
    num_runs: Number of timed repetitions.
    input_per_source: Per-source input size handed to `long_generator`.
    num_sources: Number of sources created for each run.
  """
  print("Number of runs:", num_runs)
  print("Input size:", num_sources * input_per_source)
  print("Sources:", num_sources)

  times = []
  for i in range(num_runs):
    counter_factory = CounterFactory()
    state_sampler = statesampler.StateSampler('basic', counter_factory)
    with state_sampler.scoped_state('step1', 'state'):
      # NOTE(review): this counter is replaced by the no-op counter on the
      # next statement, so only the no-op counter is actually benchmarked.
      # The construction is kept in case it has side effects on
      # counter_factory -- confirm before removing.
      si_counter = opcounters.SideInputReadCounter(
          counter_factory, state_sampler, 'step1', 1)
      si_counter = opcounters.NoOpTransformIOCounter()
      sources = [
          FakeSource(long_generator(i, input_per_source))
          for i in range(num_sources)]
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, read_counter=si_counter)
      # Only the consumption of the iterator is timed, not the setup above.
      start = time.time()
      list(iterator_fn())
      time_cost = time.time() - start
      times.append(time_cost)

  print("Runtimes:", times)

  # True division: the timings are floats, and floor division would truncate
  # the average to whole seconds (typically 0.0) and the per-element time
  # to 0.0.
  avg_runtime = sum(times) / len(times)
  print("Average runtime:", avg_runtime)
  print("Time per element:", avg_runtime / (input_per_source *
                                            num_sources))
예제 #2
0
  def test_source_iterator_fn_exception(self):
    """An exception raised by one of several sources reaches the consumer.

    Four never-ending sources are read together with one source that yields
    a single value and then raises. Iterating must raise that exception, and
    by then one value from every source must have been observed.
    """
    class MyException(Exception):
      pass

    def exception_generator():
      yield 0
      time.sleep(0.1)
      raise MyException('I am an exception!')

    def perpetual_generator(value):
      while True:
        yield value
        time.sleep(0.1)

    sources = [
        FakeSource(perpetual_generator(1)),
        FakeSource(perpetual_generator(2)),
        FakeSource(perpetual_generator(3)),
        FakeSource(perpetual_generator(4)),
        FakeSource(exception_generator()),
    ]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)
    seen = set()
    with self.assertRaises(MyException):
      for value in iterator_fn():
        seen.add(value.value)
    # Compare list to list: on Python 3 a list never equals a range object.
    self.assertEqual(sorted(seen), list(range(5)))
예제 #3
0
  def test_source_iterator_fn_exception(self):
    """An exception raised by one of several sources reaches the consumer.

    Four never-ending sources are read together with one source that yields
    a single value and then raises. Iterating must raise that exception, and
    by then one value from every source must have been observed.
    """
    class MyException(Exception):
      pass

    def exception_generator():
      yield 0
      time.sleep(0.1)
      raise MyException('I am an exception!')

    def perpetual_generator(value):
      while True:
        yield value
        time.sleep(0.1)

    sources = [
        FakeSource(perpetual_generator(1)),
        FakeSource(perpetual_generator(2)),
        FakeSource(perpetual_generator(3)),
        FakeSource(perpetual_generator(4)),
        FakeSource(exception_generator()),
    ]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)
    seen = set()
    with self.assertRaises(MyException):
      for value in iterator_fn():
        seen.add(value.value)
    # Compare list to list: on Python 3 a list never equals a range object.
    self.assertEqual(sorted(seen), list(range(5)))
예제 #4
0
  def _read_side_inputs(self, tags_and_types):
    # type: (...) -> Iterator[apache_sideinputs.SideInputMap]

    """Generator reading side inputs in the order prescribed by tags_and_types.

    Args:
      tags_and_types: Iterable of (tag, view_class, view_options) tuples. The
        tag is matched against the `tag` attribute of the entries in
        self.spec.side_inputs. view_class and view_options are forwarded to
        the SideInputMap; view_options may also be a legacy tuple, which is
        converted to a dict for backwards compatibility.

    Yields:
      One apache_sideinputs.SideInputMap per entry of tags_and_types, backed
      by an iterable over all the sources found for that entry's tag.

    Raises:
      NotImplementedError: If a side input matching a tag is not an
        operation_specs.WorkerSideInputSource.
    """
    # Only call this on the old path where side_input_maps was not
    # provided directly.
    assert self.side_input_maps is None

    # We will read the side inputs in the order prescribed by the
    # tags_and_types argument because this is exactly the order needed to
    # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
    # getting the side inputs.
    #
    # Note that for each tag there could be several read operations in the
    # specification. This can happen for instance if the source has been
    # sharded into several files.
    for i, (side_tag, view_class, view_options) in enumerate(tags_and_types):
      sources = []
      # Using the side_tag in the lambda below will trigger a pylint warning.
      # However in this case it is fine because the lambda is used right away
      # while the variable has the value assigned by the current iteration of
      # the for loop.
      # pylint: disable=cell-var-from-loop
      for si in filter(lambda o: o.tag == side_tag, self.spec.side_inputs):
        if not isinstance(si, operation_specs.WorkerSideInputSource):
          raise NotImplementedError('Unknown side input type: %r' % si)
        sources.append(si.source)

      # The tracking of time spent reading and bytes read from side inputs is
      # behind an experiment flag to test its performance impact.
      #
      # The counter is built once per side input, after the sources have been
      # collected, so that it is always bound for the call below even when no
      # source matches the tag, and is not rebuilt for every shard.
      si_counter = opcounters.SideInputReadCounter(
          self.counter_factory,
          self.state_sampler,
          declaring_step=self.name_context.step_name,
          # Inputs are 1-indexed, so we add 1 to i in the side input id
          input_index=i + 1)
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, read_counter=si_counter)

      # Backwards compatibility for pre BEAM-733 SDKs.
      if isinstance(view_options, tuple):
        if view_class == pvalue.AsSingleton:
          has_default, default = view_options
          view_options = {'default': default} if has_default else {}
        else:
          view_options = {}

      yield apache_sideinputs.SideInputMap(
          view_class, view_options, sideinputs.EmulatedIterable(iterator_fn))
예제 #5
0
 def test_single_source_iterator_fn(self):
   """Reading a single source yields its elements in their original order."""
   sources = [
       FakeSource([0, 1, 2, 3, 4, 5]),
   ]
   iterator_fn = sideinputs.get_iterator_fn_for_sources(
       sources, max_reader_threads=2)
   # Compare list to list: on Python 3 a list never equals a range object.
   assert list(strip_windows(iterator_fn())) == list(range(6))
예제 #6
0
 def test_single_source_iterator_fn(self):
   """Reading a single source yields its elements in their original order."""
   sources = [
       FakeSource([0, 1, 2, 3, 4, 5]),
   ]
   iterator_fn = sideinputs.get_iterator_fn_for_sources(
       sources, max_reader_threads=2)
   # Compare list to list: on Python 3 a list never equals a range object.
   assert list(strip_windows(iterator_fn())) == list(range(6))
def run_benchmark(num_runs=50, input_per_source=4000, num_sources=4):
  """Benchmark draining a side-input iterator built from several sources.

  Every run creates a fresh counter factory and a started state sampler,
  builds `num_sources` fake sources, and times one complete pass over the
  iterator returned by `sideinputs.get_iterator_fn_for_sources`. The list of
  timings, the mean timing and the mean time per element are printed.

  Args:
    num_runs: How many timed passes to perform.
    input_per_source: Per-source input size handed to `long_generator`.
    num_sources: How many sources are created for each pass.
  """
  total_inputs = num_sources * input_per_source
  print("Number of runs:", num_runs)
  print("Input size:", total_inputs)
  print("Sources:", num_sources)

  times = []
  for _ in range(num_runs):
    factory = CounterFactory()
    sampler = statesampler.StateSampler('basic', factory)
    sampler.start()
    with sampler.scoped_state('step1', 'state'):
      # A real read counter is constructed first and then superseded by the
      # no-op counter, which is the one handed to the iterator.
      read_counter = opcounters.SideInputReadCounter(
          factory, sampler, 'step1', 1)
      read_counter = opcounters.NoOpTransformIOCounter()

      fake_sources = []
      for source_index in range(num_sources):
        generated = long_generator(source_index, input_per_source)
        fake_sources.append(FakeSource(generated))

      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          fake_sources, read_counter=read_counter)

      # Time only the consumption of the iterator.
      began = time.time()
      list(iterator_fn())
      times.append(time.time() - began)
    sampler.stop()

  print("Runtimes:", times)

  run_count = len(times)
  avg_runtime = sum(times) / run_count
  print("Average runtime:", avg_runtime)
  print("Time per element:", avg_runtime / total_inputs)
예제 #8
0
  def _read_side_inputs(self, tags_and_types):
    """Generator reading side inputs in the order prescribed by tags_and_types.

    Args:
      tags_and_types: Iterable of (tag, view_class, view_options) tuples. The
        tag is matched against the `tag` attribute of the entries in
        self.spec.side_inputs. view_class and view_options are forwarded to
        the SideInputMap; view_options may also be a legacy tuple, which is
        converted to a dict for backwards compatibility.

    Yields:
      One apache_sideinputs.SideInputMap per entry of tags_and_types, backed
      by an iterable over all the sources found for that entry's tag.

    Raises:
      NotImplementedError: If a side input matching a tag is not an
        operation_specs.WorkerSideInputSource.
    """
    # Only call this on the old path where side_input_maps was not
    # provided directly.
    assert self.side_input_maps is None

    # We will read the side inputs in the order prescribed by the
    # tags_and_types argument because this is exactly the order needed to
    # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
    # getting the side inputs.
    #
    # Note that for each tag there could be several read operations in the
    # specification. This can happen for instance if the source has been
    # sharded into several files.
    for i, (side_tag, view_class, view_options) in enumerate(tags_and_types):
      sources = []
      # Using the side_tag in the lambda below will trigger a pylint warning.
      # However in this case it is fine because the lambda is used right away
      # while the variable has the value assigned by the current iteration of
      # the for loop.
      # pylint: disable=cell-var-from-loop
      for si in filter(
          lambda o: o.tag == side_tag, self.spec.side_inputs):
        if not isinstance(si, operation_specs.WorkerSideInputSource):
          raise NotImplementedError('Unknown side input type: %r' % si)
        sources.append(si.source)

      # The tracking of time spent reading and bytes read from side inputs is
      # behind an experiment flag to test its performance impact.
      #
      # The counter is built once per side input, after the sources have been
      # collected, so that it is always bound for the call below even when no
      # source matches the tag, and is not rebuilt for every shard.
      si_counter = opcounters.SideInputReadCounter(
          self.counter_factory,
          self.state_sampler,
          declaring_step=self.name_context.step_name,
          # Inputs are 1-indexed, so we add 1 to i in the side input id
          input_index=i + 1)
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, read_counter=si_counter)

      # Backwards compatibility for pre BEAM-733 SDKs.
      if isinstance(view_options, tuple):
        if view_class == pvalue.AsSingleton:
          has_default, default = view_options
          view_options = {'default': default} if has_default else {}
        else:
          view_options = {}

      yield apache_sideinputs.SideInputMap(
          view_class, view_options, sideinputs.EmulatedIterable(iterator_fn))
예제 #9
0
 def test_multiple_sources_single_reader_iterator_fn(self):
   """With one reader thread, sources are read back to back in list order.

   The sources include an empty one and ones of differing lengths; their
   concatenation is expected to be 0 through 10.
   """
   sources = [
       FakeSource([0]),
       FakeSource([1, 2, 3, 4, 5]),
       FakeSource([]),
       FakeSource([6, 7, 8, 9, 10]),
   ]
   iterator_fn = sideinputs.get_iterator_fn_for_sources(
       sources, max_reader_threads=1)
   # Compare list to list: on Python 3 a list never equals a range object.
   assert list(strip_windows(iterator_fn())) == list(range(11))
예제 #10
0
 def test_bytes_read_behind_experiment(self):
   """No bytes-read updates reach the counter in this configuration.

   Reads one observer-notifying source with a mock read counter and checks
   that add_bytes_read is never called on it. This test sets no runtime
   options, so presumably the metrics experiment is not enabled here.
   """
   records = ['a', 'b', 'c', 'd']
   read_counter = mock.MagicMock()
   iterator_fn = sideinputs.get_iterator_fn_for_sources(
       [FakeSource(records, notify_observers=True)],
       max_reader_threads=3,
       read_counter=read_counter)
   values_read = list(strip_windows(iterator_fn()))
   assert values_read == records
   read_counter.add_bytes_read.assert_not_called()
예제 #11
0
 def test_multiple_sources_single_reader_iterator_fn(self):
   """With one reader thread, sources are read back to back in list order.

   The sources include an empty one and ones of differing lengths; their
   concatenation is expected to be 0 through 10.
   """
   sources = [
       FakeSource([0]),
       FakeSource([1, 2, 3, 4, 5]),
       FakeSource([]),
       FakeSource([6, 7, 8, 9, 10]),
   ]
   iterator_fn = sideinputs.get_iterator_fn_for_sources(
       sources, max_reader_threads=1)
   # Compare list to list: on Python 3 a list never equals a range object.
   assert list(strip_windows(iterator_fn())) == list(range(11))
예제 #12
0
 def test_bytes_read_behind_experiment(self):
   """No bytes-read updates reach the counter in this configuration.

   Reads one observer-notifying source with a mock read counter and checks
   that add_bytes_read is never called on it. This test sets no runtime
   options, so presumably the metrics experiment is not enabled here.
   """
   records = ['a', 'b', 'c', 'd']
   read_counter = mock.MagicMock()
   iterator_fn = sideinputs.get_iterator_fn_for_sources(
       [FakeSource(records, notify_observers=True)],
       max_reader_threads=3,
       read_counter=read_counter)
   values_read = list(strip_windows(iterator_fn()))
   assert values_read == records
   read_counter.add_bytes_read.assert_not_called()
예제 #13
0
  def test_bytes_read_are_reported(self):
    """Bytes read are reported to the counter when the experiment is set.

    Sets the 'sideinput_io_metrics' experiment in the runtime options, reads
    one observer-notifying source with a mock read counter, and checks that
    add_bytes_read was called with 4.
    """
    RuntimeValueProvider.set_runtime_options(
        {'experiments': 'sideinput_io_metrics,other'})
    try:
      mock_read_counter = mock.MagicMock()
      source_records = ['a', 'b', 'c', 'd']
      sources = [
          FakeSource(source_records, notify_observers=True),
      ]
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, max_reader_threads=3, read_counter=mock_read_counter)
      assert list(strip_windows(iterator_fn())) == source_records
      mock_read_counter.add_bytes_read.assert_called_with(4)
    finally:
      # Remove runtime options from the runtime value provider. This runs
      # even when an assertion above fails, so the experiment flag cannot
      # leak into other tests.
      RuntimeValueProvider.set_runtime_options({})
예제 #14
0
  def test_bytes_read_are_reported(self):
    """Bytes read are reported to the counter when the experiment is set.

    Sets the 'sideinput_io_metrics_v2' experiment in the runtime options,
    reads one observer-notifying source with a mock read counter, and checks
    that add_bytes_read was called with 4.
    """
    RuntimeValueProvider.set_runtime_options(
        {'experiments': ['sideinput_io_metrics_v2', 'other']})
    try:
      mock_read_counter = mock.MagicMock()
      source_records = ['a', 'b', 'c', 'd']
      sources = [
          FakeSource(source_records, notify_observers=True),
      ]
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, max_reader_threads=3, read_counter=mock_read_counter)
      assert list(strip_windows(iterator_fn())) == source_records
      mock_read_counter.add_bytes_read.assert_called_with(4)
    finally:
      # Remove runtime options from the runtime value provider. This runs
      # even when an assertion above fails, so the experiment flag cannot
      # leak into other tests.
      RuntimeValueProvider.set_runtime_options({})
예제 #15
0
    def _read_side_inputs(self, tags_and_types):
        """Generator reading side inputs in the order prescribed by tags_and_types.

        Args:
          tags_and_types: Iterable of (tag, view_class, view_options) tuples.
            The tag is matched against the `tag` attribute of the entries in
            self.spec.side_inputs. view_class and view_options are forwarded
            to the SideInputMap; view_options may also be a legacy tuple,
            which is converted to a dict for backwards compatibility.

        Yields:
          One apache_sideinputs.SideInputMap per entry of tags_and_types,
          backed by an iterable over all the sources found for that entry's
          tag.

        Raises:
          NotImplementedError: If a side input matching a tag is not an
            operation_specs.WorkerSideInputSource.
        """
        # We will read the side inputs in the order prescribed by the
        # tags_and_types argument because this is exactly the order needed to
        # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
        # getting the side inputs.
        #
        # Note that for each tag there could be several read operations in the
        # specification. This can happen for instance if the source has been
        # sharded into several files.
        for side_tag, view_class, view_options in tags_and_types:
            sources = []
            # A plain loop with a condition replaces itertools.ifilter, which
            # only exists on Python 2; it visits the same side inputs lazily
            # and in the same order.
            for si in self.spec.side_inputs:
                if si.tag == side_tag:
                    if not isinstance(si,
                                      operation_specs.WorkerSideInputSource):
                        raise NotImplementedError(
                            'Unknown side input type: %r' % si)
                    sources.append(si.source)
            iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)

            # Backwards compatibility for pre BEAM-733 SDKs.
            if isinstance(view_options, tuple):
                if view_class == pvalue.AsSingleton:
                    has_default, default = view_options
                    view_options = {'default': default} if has_default else {}
                else:
                    view_options = {}

            yield apache_sideinputs.SideInputMap(
                view_class, view_options,
                sideinputs.EmulatedIterable(iterator_fn))
예제 #16
0
  def _read_side_inputs(self, tags_and_types):
    """Generator reading side inputs in the order prescribed by tags_and_types.

    Args:
      tags_and_types: Iterable of (tag, view_class, view_options) tuples. The
        tag is matched against the `tag` attribute of the entries in
        self.spec.side_inputs. view_class and view_options are forwarded to
        the SideInputMap; view_options may also be a legacy tuple, which is
        converted to a dict for backwards compatibility.

    Yields:
      One apache_sideinputs.SideInputMap per entry of tags_and_types, backed
      by an iterable over all the sources found for that entry's tag.

    Raises:
      NotImplementedError: If a side input matching a tag is not an
        operation_specs.WorkerSideInputSource.
    """
    # We will read the side inputs in the order prescribed by the
    # tags_and_types argument because this is exactly the order needed to
    # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
    # getting the side inputs.
    #
    # Note that for each tag there could be several read operations in the
    # specification. This can happen for instance if the source has been
    # sharded into several files.
    for side_tag, view_class, view_options in tags_and_types:
      sources = []
      # A plain loop with a condition replaces itertools.ifilter, which only
      # exists on Python 2; it visits the same side inputs lazily and in the
      # same order.
      for si in self.spec.side_inputs:
        if si.tag == side_tag:
          if not isinstance(si, operation_specs.WorkerSideInputSource):
            raise NotImplementedError('Unknown side input type: %r' % si)
          sources.append(si.source)
      iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)

      # Backwards compatibility for pre BEAM-733 SDKs.
      if isinstance(view_options, tuple):
        if view_class == pvalue.AsSingleton:
          has_default, default = view_options
          view_options = {'default': default} if has_default else {}
        else:
          view_options = {}

      yield apache_sideinputs.SideInputMap(
          view_class, view_options, sideinputs.EmulatedIterable(iterator_fn))
예제 #17
0
  def test_source_iterator_single_source_exception(self):
    """An exception raised by the only source reaches the consumer.

    The source yields 0 and then raises; iterating must raise that exception
    and the only value observed beforehand must be 0.
    """
    class MyException(Exception):
      pass

    def exception_generator():
      yield 0
      raise MyException('I am an exception!')

    failing_source = FakeSource(exception_generator())
    iterator_fn = sideinputs.get_iterator_fn_for_sources([failing_source])

    observed = set()
    with self.assertRaises(MyException):
      for windowed in iterator_fn():
        observed.add(windowed.value)

    self.assertEqual(sorted(observed), [0])
예제 #18
0
  def test_source_iterator_single_source_exception(self):
    """An exception raised by the only source reaches the consumer.

    The source yields 0 and then raises; iterating must raise that exception
    and the only value observed beforehand must be 0.
    """
    class MyException(Exception):
      pass

    def exception_generator():
      yield 0
      raise MyException('I am an exception!')

    failing_source = FakeSource(exception_generator())
    iterator_fn = sideinputs.get_iterator_fn_for_sources([failing_source])

    observed = set()
    with self.assertRaises(MyException):
      for windowed in iterator_fn():
        observed.add(windowed.value)

    self.assertEqual(sorted(observed), [0])