Example #1
0
    def _read_side_inputs(self, tags_and_types):
        # type: (...) -> Iterator[apache_sideinputs.SideInputMap]
        """Lazily builds one SideInputMap per entry of tags_and_types, in order.

        Args:
          tags_and_types: Iterable of (tag, view_class, view_options) triples.
            The tag is compared against the ``tag`` attribute of every entry
            in ``self.spec.side_inputs``. view_class and view_options are
            handed unchanged to the SideInputMap, and
            ``view_options['coder']`` is additionally given to the element
            counter.

        Yields:
          One apache_sideinputs.SideInputMap per triple, wrapping an
          EmulatedIterable over every source whose tag matches the triple.

        Raises:
          NotImplementedError: A matching side input is not an
            operation_specs.WorkerSideInputSource.
        """
        # This generator serves the old path only; it must not be used when
        # side_input_maps was provided directly.
        assert self.side_input_maps is None

        # Entries are handled strictly in the given order, because that is
        # the order needed to replace the ArgumentPlaceholder objects in the
        # args/kwargs of the DoFn receiving the side inputs.
        for index, entry in enumerate(tags_and_types):
            side_tag, view_class, view_options = entry

            # A single tag may match several specs (for instance when the
            # source has been sharded into several files), so every matching
            # source is collected.
            matching_sources = []
            for side_input in self.spec.side_inputs:
                if side_input.tag == side_tag:
                    is_worker_source = isinstance(
                        side_input, operation_specs.WorkerSideInputSource)
                    if not is_worker_source:
                        raise NotImplementedError(
                            'Unknown side input type: %r' % side_input)
                    matching_sources.append(side_input.source)

            # Inputs are 1-indexed, so the side input id is index + 1; the
            # element counter below receives the zero-based index itself.
            read_counter = opcounters.SideInputReadCounter(
                self.counter_factory,
                self.state_sampler,
                declaring_step=self.name_context.step_name,
                input_index=index + 1)
            element_counter = opcounters.OperationCounters(
                self.counter_factory,
                self.name_context.step_name,
                view_options['coder'],
                index,
                suffix='side-input')

            iterator_fn = sideinputs.get_iterator_fn_for_sources(
                matching_sources,
                read_counter=read_counter,
                element_counter=element_counter)
            emulated = sideinputs.EmulatedIterable(iterator_fn)
            yield apache_sideinputs.SideInputMap(
                view_class, view_options, emulated)
Example #2
0
 def __init__(
     self, counter_factory, step_name, output_index, consumers, coder):
   """Stores the consumers and creates the operation counter for an output.

   Args:
     counter_factory: Forwarded to opcounters.OperationCounters.
     step_name: Forwarded to the counter and kept on the instance.
     output_index: Forwarded to the counter and kept on the instance.
     consumers: Stored as-is on ``self.consumers``.
     coder: Forwarded to the counter and kept on the instance.
   """
   self.consumers = consumers

   counters = opcounters.OperationCounters(
       counter_factory,
       step_name,
       coder,
       output_index)
   self.opcounter = counters

   # The remaining attributes are kept because repr uses them.
   self.step_name = step_name
   self.output_index = output_index
   self.coder = coder
Example #3
0
  def __init__(self,
               counter_factory,
               step_name,  # type: str
               output_index,
               consumers,  # type: List[Operation]
               coder,
               producer_type_hints
               ):
    """Stores the consumers and creates the operation counter for an output.

    Args:
      counter_factory: Forwarded to opcounters.OperationCounters.
      step_name: Forwarded to the counter and kept on the instance.
      output_index: Forwarded to the counter and kept on the instance.
      consumers: Stored as-is on ``self.consumers``.
      coder: Forwarded to the counter and kept on the instance.
      producer_type_hints: Passed to the counter under the keyword of the
        same name; not stored on the instance.
    """
    self.consumers = consumers

    counters = opcounters.OperationCounters(
        counter_factory, step_name, coder, output_index,
        producer_type_hints=producer_type_hints)
    self.opcounter = counters

    # The remaining attributes are kept because repr uses them.
    self.step_name = step_name
    self.output_index = output_index
    self.coder = coder