Example #1
0
 def _reader_thread(self):
     # pylint: disable=too-many-nested-blocks
     experiments = set(
         RuntimeValueProvider.get_value('experiments', str, '').split(','))
     try:
         while True:
             try:
                 source = self.sources_queue.get_nowait()
                 if isinstance(source, iobase.BoundedSource):
                     for value in source.read(
                             source.get_range_tracker(None, None)):
                         if self.has_errored:
                             # If any reader has errored, just return.
                             return
                         if isinstance(value, window.WindowedValue):
                             self.element_queue.put(value)
                         else:
                             self.element_queue.put(
                                 _globally_windowed(value))
                 else:
                     # Native dataflow source.
                     with source.reader() as reader:
                         # The tracking of time spend reading and bytes read from side
                         # inputs is kept behind an experiment flag to test performance
                         # impact.
                         if 'sideinput_io_metrics' in experiments:
                             self.add_byte_counter(reader)
                         returns_windowed_values = reader.returns_windowed_values
                         for value in reader:
                             if self.has_errored:
                                 # If any reader has errored, just return.
                                 return
                             if returns_windowed_values:
                                 self.element_queue.put(value)
                             else:
                                 self.element_queue.put(
                                     _globally_windowed(value))
             except Queue.Empty:
                 return
     except Exception as e:  # pylint: disable=broad-except
         logging.error(
             'Encountered exception in PrefetchingSourceSetIterable '
             'reader thread: %s', traceback.format_exc())
         self.reader_exceptions.put(e)
         self.has_errored = True
     finally:
         self.element_queue.put(READER_THREAD_IS_DONE_SENTINEL)
Example #2
0
 def _reader_thread(self):
   # pylint: disable=too-many-nested-blocks
   experiments = RuntimeValueProvider.get_value('experiments', list, [])
   try:
     while True:
       try:
         source = self.sources_queue.get_nowait()
         if isinstance(source, iobase.BoundedSource):
           for value in source.read(source.get_range_tracker(None, None)):
             if self.has_errored:
               # If any reader has errored, just return.
               return
             if isinstance(value, window.WindowedValue):
               self.element_queue.put(value)
             else:
               self.element_queue.put(_globally_windowed(value))
         else:
           # Native dataflow source.
           with source.reader() as reader:
             # The tracking of time spend reading and bytes read from side
             # inputs is kept behind an experiment flag to test performance
             # impact.
             if 'sideinput_io_metrics' in experiments:
               self.add_byte_counter(reader)
             returns_windowed_values = reader.returns_windowed_values
             for value in reader:
               if self.has_errored:
                 # If any reader has errored, just return.
                 return
               if returns_windowed_values:
                 self.element_queue.put(value)
               else:
                 self.element_queue.put(_globally_windowed(value))
       except Queue.Empty:
         return
   except Exception as e:  # pylint: disable=broad-except
     logging.error('Encountered exception in PrefetchingSourceSetIterable '
                   'reader thread: %s', traceback.format_exc())
     self.reader_exceptions.put(e)
     self.has_errored = True
   finally:
     self.element_queue.put(READER_THREAD_IS_DONE_SENTINEL)
Example #3
0
  def _read_side_inputs(self, tags_and_types):
    """Generator reading side inputs in the order prescribed by tags_and_types.

    Args:
      tags_and_types: List of tuples (tag, type). Each side input has a string
        tag that is specified in the worker instruction. The type is actually
        a boolean which is True for singleton input (read just first value)
        and False for collection input (read all values).

    Yields:
      With each iteration it yields the result of reading an entire side source
      either in singleton or collection mode according to the tags_and_types
      argument.
    """
    # Only call this on the old path where side_input_maps was not
    # provided directly.
    assert self.side_input_maps is None

    # Get experiments active in the worker to check for side input metrics exp.
    experiments = set(
        RuntimeValueProvider.get_value('experiments', str, '').split(','))

    # We will read the side inputs in the order prescribed by the
    # tags_and_types argument because this is exactly the order needed to
    # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
    # getting the side inputs.
    #
    # Note that for each tag there could be several read operations in the
    # specification. This can happen for instance if the source has been
    # sharded into several files.
    for i, (side_tag, view_class, view_options) in enumerate(tags_and_types):
      sources = []
      # Using the side_tag in the lambda below will trigger a pylint warning.
      # However in this case it is fine because the lambda is used right away
      # while the variable has the value assigned by the current iteration of
      # the for loop.
      # pylint: disable=cell-var-from-loop
      for si in itertools.ifilter(
          lambda o: o.tag == side_tag, self.spec.side_inputs):
        if not isinstance(si, operation_specs.WorkerSideInputSource):
          raise NotImplementedError('Unknown side input type: %r' % si)
        sources.append(si.source)
        # The tracking of time spend reading and bytes read from side inputs is
        # behind an experiment flag to test its performance impact.
        if 'sideinput_io_metrics' in experiments:
          si_counter = opcounters.SideInputReadCounter(
              self.counter_factory,
              self.state_sampler,
              declaring_step=self.operation_name,
              # Inputs are 1-indexed, so we add 1 to i in the side input id
              input_index=i + 1)
        else:
          si_counter = opcounters.TransformIOCounter()
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, read_counter=si_counter)

      # Backwards compatibility for pre BEAM-733 SDKs.
      if isinstance(view_options, tuple):
        if view_class == pvalue.AsSingleton:
          has_default, default = view_options
          view_options = {'default': default} if has_default else {}
        else:
          view_options = {}

      yield apache_sideinputs.SideInputMap(
          view_class, view_options, sideinputs.EmulatedIterable(iterator_fn))
Example #4
0
  def _read_side_inputs(self, tags_and_types):
    """Generator reading side inputs in the order prescribed by tags_and_types.

    Args:
      tags_and_types: List of tuples (tag, type). Each side input has a string
        tag that is specified in the worker instruction. The type is actually
        a boolean which is True for singleton input (read just first value)
        and False for collection input (read all values).

    Yields:
      With each iteration it yields the result of reading an entire side source
      either in singleton or collection mode according to the tags_and_types
      argument.
    """
    # Only call this on the old path where side_input_maps was not
    # provided directly.
    assert self.side_input_maps is None

    # Get experiments active in the worker to check for side input metrics exp.
    experiments = RuntimeValueProvider.get_value('experiments', list, [])

    # We will read the side inputs in the order prescribed by the
    # tags_and_types argument because this is exactly the order needed to
    # replace the ArgumentPlaceholder objects in the args/kwargs of the DoFn
    # getting the side inputs.
    #
    # Note that for each tag there could be several read operations in the
    # specification. This can happen for instance if the source has been
    # sharded into several files.
    for i, (side_tag, view_class, view_options) in enumerate(tags_and_types):
      sources = []
      # Using the side_tag in the lambda below will trigger a pylint warning.
      # However in this case it is fine because the lambda is used right away
      # while the variable has the value assigned by the current iteration of
      # the for loop.
      # pylint: disable=cell-var-from-loop
      for si in itertools.ifilter(
          lambda o: o.tag == side_tag, self.spec.side_inputs):
        if not isinstance(si, operation_specs.WorkerSideInputSource):
          raise NotImplementedError('Unknown side input type: %r' % si)
        sources.append(si.source)
        # The tracking of time spend reading and bytes read from side inputs is
        # behind an experiment flag to test its performance impact.
        if 'sideinput_io_metrics' in experiments:
          si_counter = opcounters.SideInputReadCounter(
              self.counter_factory,
              self.state_sampler,
              declaring_step=self.name_context.step_name,
              # Inputs are 1-indexed, so we add 1 to i in the side input id
              input_index=i + 1)
        else:
          si_counter = opcounters.TransformIOCounter()
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, read_counter=si_counter)

      # Backwards compatibility for pre BEAM-733 SDKs.
      if isinstance(view_options, tuple):
        if view_class == pvalue.AsSingleton:
          has_default, default = view_options
          view_options = {'default': default} if has_default else {}
        else:
          view_options = {}

      yield apache_sideinputs.SideInputMap(
          view_class, view_options, sideinputs.EmulatedIterable(iterator_fn))