예제 #1
0
  def _read_side_inputs(self, tags_and_types):
    # type: (...) -> Iterator[apache_sideinputs.SideInputMap]

    """Generator reading side inputs in the order prescribed by tags_and_types.

    Args:
      tags_and_types: Iterable of (tag, view_class, view_options) triples.
        The tag is compared against the ``tag`` attribute of every entry in
        self.spec.side_inputs to select the sources to read. view_class and
        view_options are forwarded to apache_sideinputs.SideInputMap. For
        backwards compatibility view_options may also be a tuple; see the
        conversion at the end of the loop body.

    Yields:
      One apache_sideinputs.SideInputMap per entry of tags_and_types, in the
      same order, wrapping an iterable over all sources matching that tag.

    Raises:
      NotImplementedError: If a side input matching a tag is not an
        operation_specs.WorkerSideInputSource.
    """
    # Only call this on the old path where side_input_maps was not
    # provided directly.
    assert self.side_input_maps is None

    # We will read the side inputs in the order prescribed by the
    # tags_and_types argument because this is exactly the order needed to
    # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
    # getting the side inputs.
    #
    # Note that for each tag there could be several read operations in the
    # specification. This can happen for instance if the source has been
    # sharded into several files.
    for i, (side_tag, view_class, view_options) in enumerate(tags_and_types):
      sources = []
      for si in self.spec.side_inputs:
        if si.tag != side_tag:
          continue
        if not isinstance(si, operation_specs.WorkerSideInputSource):
          raise NotImplementedError('Unknown side input type: %r' % si)
        sources.append(si.source)

      # The read counter depends only on the position of the tag, not on the
      # individual sources, so it is created exactly once per tag. Creating it
      # inside the source loop rebuilt it for every shard and left it unbound
      # (or pointing at the previous tag's counter) when no source matched.
      si_counter = opcounters.SideInputReadCounter(
          self.counter_factory,
          self.state_sampler,
          declaring_step=self.name_context.step_name,
          # Inputs are 1-indexed, so we add 1 to i in the side input id
          input_index=i + 1)
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, read_counter=si_counter)

      # Backwards compatibility for pre BEAM-733 SDKs, which pass view_options
      # as a tuple. For AsSingleton the tuple is (has_default, default); for
      # any other view class the tuple carries no options.
      if isinstance(view_options, tuple):
        if view_class == pvalue.AsSingleton:
          has_default, default = view_options
          view_options = {'default': default} if has_default else {}
        else:
          view_options = {}

      yield apache_sideinputs.SideInputMap(
          view_class, view_options, sideinputs.EmulatedIterable(iterator_fn))
예제 #2
0
def run_benchmark(num_runs=50, input_per_source=4000, num_sources=4):
  """Time iterating over fake side-input sources and print the results.

  Each run builds ``num_sources`` FakeSource objects of ``input_per_source``
  elements each, obtains an iterator function for them through
  sideinputs.get_iterator_fn_for_sources and measures the wall-clock time
  needed to exhaust it.

  Args:
    num_runs: Number of timed repetitions. When it is zero or negative no run
      is executed and no averages are printed.
    input_per_source: Number of elements requested from each source.
    num_sources: Number of sources read in each run.

  Returns:
    None. All results are written to standard output.
  """
  total_elements = num_sources * input_per_source
  print("Number of runs:", num_runs)
  print("Input size:", total_elements)
  print("Sources:", num_sources)

  times = []
  for _ in range(num_runs):
    counter_factory = CounterFactory()
    state_sampler = statesampler.StateSampler('basic', counter_factory)
    with state_sampler.scoped_state('step1', 'state'):
      si_counter = opcounters.SideInputReadCounter(
          counter_factory, state_sampler, 'step1', 1)
      # NOTE(review): the counter built above is replaced immediately, so the
      # timed read below always runs with the no-op counter. Kept as found;
      # confirm which of the two counters this benchmark is meant to measure.
      si_counter = opcounters.NoOpTransformIOCounter()
      sources = [
          FakeSource(long_generator(source_index, input_per_source))
          for source_index in range(num_sources)]
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, read_counter=si_counter)
      # Only the full consumption of the iterator is timed; the setup above
      # is deliberately outside the measured region.
      start = time.time()
      list(iterator_fn())
      time_cost = time.time() - start
      times.append(time_cost)

  print("Runtimes:", times)

  if not times:
    # Nothing was measured, so there is no average to report.
    return

  # True division: the timings are fractional seconds, and floor division
  # would truncate every sub-second average to 0.0.
  avg_runtime = sum(times) / len(times)
  print("Average runtime:", avg_runtime)
  if total_elements:
    print("Time per element:", avg_runtime / total_elements)
예제 #3
0
    def test_basic_counters(self):
        """Check the counter names registered by a SideInputReadCounter.

        The counter is created for declaring step 'step1' while the sampler
        is in a 'step1' state, then used and refreshed while the sampler is
        in a 'step2' state. The test passes when the msecs and byte-count
        counter names for both side input ids are among the factory's
        counters.
        """
        factory = CounterFactory()
        state_sampler = statesampler.StateSampler('stage1', factory)
        state_sampler.start()

        with state_sampler.scoped_state('step1', 'stateA'):
            read_counter = opcounters.SideInputReadCounter(
                factory,
                state_sampler,
                declaring_step='step1',
                input_index=1)
        with state_sampler.scoped_state('step2', 'stateB'):
            with read_counter:
                read_counter.add_bytes_read(10)

            read_counter.update_current_step()

        state_sampler.stop()
        state_sampler.commit_counters()

        observed_names = {c.name for c in factory.get_counters()}

        # Both groups share step_name 'step1'; they differ only in the step
        # used to build the side input id.
        expected_names = set()
        for target_step in ('step1', 'step2'):
            io_target = counters.side_input_id(target_step, 1)
            expected_names.add(
                counters.CounterName('read-sideinput-msecs',
                                     stage_name='stage1',
                                     step_name='step1',
                                     io_target=io_target))
            expected_names.add(
                counters.CounterName('read-sideinput-byte-count',
                                     step_name='step1',
                                     io_target=io_target))

        # Other counters may exist too, so only containment is required.
        self.assertTrue(observed_names.issuperset(expected_names))