Exemple #1
0
  def _add_step(self, step_kind, step_label, transform_node, side_tags=()):
    """Builds a new Step, registers it with the job and the output cache.

    Args:
      step_kind: Kind passed through to the apiclient.Step constructor.
      step_label: Value stored under the step's USER_NAME property.
      transform_node: Node whose outputs get associated with the new step and
        whose `transform` attribute supplies the display data.
      side_tags: Iterable of extra output tags; each one is cached against the
        same step in addition to the main (None) output.

    Returns:
      The newly created apiclient.Step.
    """
    # Deferred import so local running scenarios do not need this dependency.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam.runners.dataflow.internal import apiclient
    new_step = apiclient.Step(step_kind, self._get_unique_step_name())
    self.job.proto.steps.append(new_step.proto)
    new_step.add_property(PropertyNames.USER_NAME, step_label)

    # The main output of the transform node is always cached under tag None.
    self._cache.cache_output(transform_node, None, new_step)
    # A non-empty side_tags means a multi-output transform node: cache lookups
    # always carry the tag, so every tag needs its own (node, tag, step) entry.
    for output_tag in side_tags:
      self._cache.cache_output(transform_node, output_tag, new_step)

    # Attach the transform's display data last; when the transform has none,
    # the property is simply set to an empty list.
    display_data = DisplayData.create_from(transform_node.transform)
    display_dicts = [entry.get_dict() for entry in display_data.items]
    new_step.add_property(PropertyNames.DISPLAY_DATA, display_dicts)

    return new_step
Exemple #2
0
 def _add_singleton_step(self, label, full_label, tag, input_step):
   """Builds the CollectionToSingleton step that feeds a ParDo side input.

   Args:
     label: Name given to the new step.
     full_label: Value stored under USER_NAME; also the prefix of the output's
       user name.
     tag: Tag passed to `input_step.get_output` to pick the consumed output.
     input_step: Step whose output and encoding the new step reads from.

   Returns:
     The newly created apiclient.Step, already appended to the job proto.
   """
   # Deferred import so local running scenarios do not need this dependency.
   from apache_beam.runners.dataflow.internal import apiclient
   singleton = apiclient.Step(TransformNames.COLLECTION_TO_SINGLETON, label)
   self.job.proto.steps.append(singleton.proto)
   singleton.add_property(PropertyNames.USER_NAME, full_label)

   # Reference to the upstream output this step consumes.
   input_reference = {
       '@type': 'OutputReference',
       PropertyNames.STEP_NAME: input_step.proto.name,
       PropertyNames.OUTPUT_NAME: input_step.get_output(tag),
   }
   singleton.add_property(PropertyNames.PARALLEL_INPUT, input_reference)

   # The step's encoding is derived from the encoding of its input step.
   singleton.encoding = self._get_side_input_encoding(input_step.encoding)

   # Single output descriptor for the step.
   output_descriptor = {
       PropertyNames.USER_NAME: '%s.%s' % (full_label, PropertyNames.OUTPUT),
       PropertyNames.ENCODING: singleton.encoding,
       PropertyNames.OUTPUT_NAME: PropertyNames.OUT,
   }
   singleton.add_property(PropertyNames.OUTPUT_INFO, [output_descriptor])
   return singleton