예제 #1
0
  def from_runner_api(proto, runner, options, return_context=False):
    """For internal use only; no backwards-compatibility guarantees.

    Builds a Pipeline from ``proto``: a PipelineContext is created from
    ``proto.components``, the single root transform becomes the only entry of
    the transforms stack, and every pcollection is attached to the pipeline.

    Args:
      proto: object exposing ``components`` and ``root_transform_ids``.
      runner: forwarded to the ``Pipeline`` constructor.
      options: forwarded to the ``Pipeline`` constructor.
      return_context: when true, the context is returned with the pipeline.

    Returns:
      The pipeline, or a ``(pipeline, context)`` tuple if ``return_context``.

    Raises:
      ValueError: if a pcollection has no producer, or if
        ``proto.root_transform_ids`` does not hold exactly one id.
    """
    p = Pipeline(runner=runner, options=options)
    from apache_beam.runners import pipeline_context
    components = proto.components
    context = pipeline_context.PipelineContext(components)
    (root_id,) = proto.root_transform_ids
    p.transforms_stack = [context.transforms.get_by_id(root_id)]
    # TODO(robertwb): These are only needed to continue construction. Omit?
    p.applied_labels = {
        t.unique_name for t in components.transforms.values()}
    for pcoll_id in components.pcollections:
      pcoll = context.pcollections.get_by_id(pcoll_id)
      pcoll.pipeline = p
      if not pcoll.producer:
        raise ValueError('No producer for %s' % pcoll_id)

    # Inject PBegin input where necessary.
    from apache_beam.io.iobase import Read
    from apache_beam.transforms.core import Create
    pbegin_rooted = (Read, Create)
    for transform_id in components.transforms:
      applied = context.transforms.get_by_id(transform_id)
      if applied.inputs:
        continue
      # Only exact Read/Create instances get a PBegin; subclasses do not.
      if applied.transform.__class__ in pbegin_rooted:
        applied.inputs = (pvalue.PBegin(p),)

    return (p, context) if return_context else p
예제 #2
0
    def test_root_transforms(self):
        """Check the visitor's roots, PBegin consumers and step-name count.

        Applies a Create and a Read to a PBegin, a FlatMap to the Create
        output and a Flatten to an empty list, then visits the pipeline.
        """
        class DummySource(iobase.BoundedSource):
            pass

        create_xform = Create('create', [[1, 2, 3]])
        read_xform = Read('read', DummySource())
        flatten_xform = Flatten('flatten', pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        created = pbegin | create_xform
        pbegin | read_xform
        created | FlatMap(lambda x: x)
        [] | flatten_xform

        self.pipeline.visit(self.visitor)

        # All three transforms are expected to be reported as roots.
        actual_roots = sorted(
            t.transform for t in self.visitor.root_transforms)
        expected_roots = sorted([read_xform, create_xform, flatten_xform])
        self.assertEqual(actual_roots, expected_roots)

        # Only the transforms applied to pbegin are expected as its consumers.
        actual_consumers = sorted(
            c.transform for c in self.visitor.value_to_consumers[pbegin])
        expected_consumers = sorted([read_xform, create_xform])
        self.assertEqual(actual_consumers, expected_consumers)

        self.assertEqual(len(self.visitor.step_names), 4)
    def finish_bundle(self):
        """Read from Pub/Sub and wrap whatever arrived in a TransformResult.

        Each ``(timestamp, message)`` pair returned by ``_read_from_pubsub`` is
        written to a bundle on the first output, in the global window, using
        the pair's timestamp. The element is the message itself when
        ``source.with_attributes`` is set, otherwise its ``payload``.

        Returns:
          A TransformResult built from the output bundles (empty when nothing
          was read), one bundle created on the transform's first input (or on
          a new PBegin when it has no inputs), and the current time under the
          ``None`` key.
        """
        messages = self._read_from_pubsub(self.source.timestamp_attribute)
        bundles = []
        if messages:
            out_bundle = self._evaluation_context.create_bundle(
                list(self._outputs)[0])
            # TODO(ccy): Respect the PubSub source's id_label field.
            for timestamp, message in messages:
                element = (
                    message if self.source.with_attributes
                    else message.payload)
                out_bundle.output(
                    GlobalWindows.windowed_value(element, timestamp=timestamp))
            bundles.append(out_bundle)

        inputs = self._applied_ptransform.inputs
        input_pvalue = inputs[0] if inputs else pvalue.PBegin(
            self._applied_ptransform.transform.pipeline)
        unprocessed_bundle = self._evaluation_context.create_bundle(
            input_pvalue)

        # TODO(udim): Correct value for watermark hold.
        holds = {None: Timestamp.of(time.time())}
        return TransformResult(
            self, bundles, [unprocessed_bundle], None, holds)
예제 #4
0
    def _extract_input_pvalues(self, pvalueish):
        """Extract all the pvalues contained in the input pvalueish.

        A ``Pipeline`` argument is first replaced by a ``PBegin`` on that
        pipeline. Tuples and dict values are then walked recursively,
        depth-first and in iteration order; anything else counts as a leaf.

        Generally only needs to be overridden for multi-input PTransforms.

        Args:
          pvalueish: a pipeline, a pvalue, or a nesting of tuples/dicts.

        Returns:
          ``(pvalueish, leaves)`` where ``leaves`` is a flat tuple. The
          pvalueish is returned too because inspection may be destructive and
          the input may have to be copied.
        """
        # pylint: disable=wrong-import-order
        from apache_beam import pipeline
        # pylint: enable=wrong-import-order
        if isinstance(pvalueish, pipeline.Pipeline):
            pvalueish = pvalue.PBegin(pvalueish)

        leaves = []
        # Explicit stack; children are pushed reversed so that they are popped
        # (and therefore emitted) in their original left-to-right order.
        pending = [pvalueish]
        while pending:
            node = pending.pop()
            if isinstance(node, tuple):
                pending.extend(reversed(node))
            elif isinstance(node, dict):
                pending.extend(reversed(list(node.values())))
            else:
                leaves.append(node)

        return pvalueish, tuple(leaves)
예제 #5
0
 def get_root_bundles(self):
   """Return one committed bundle that seeds the test stream.

   The bundle is created on a PBegin of the transform's pipeline and holds a
   single globally-windowed element, ``test_stream.begin()``, stamped with
   MIN_TIMESTAMP.
   """
   test_stream = self._applied_ptransform.transform
   root_bundle = self._evaluation_context.create_bundle(
       pvalue.PBegin(test_stream.pipeline))
   seed = GlobalWindows.windowed_value(
       test_stream.begin(), timestamp=MIN_TIMESTAMP)
   root_bundle.add(seed)
   root_bundle.commit(None)
   return [root_bundle]
예제 #6
0
 def get_root_bundles(self):
   """Return the seed bundle for the test stream, if it has any events.

   Returns:
     An empty list when ``test_stream.events`` is empty; otherwise a list with
     one committed bundle, created on a PBegin of the transform's pipeline and
     holding the element ``0`` in the global window.
   """
   test_stream = self._applied_ptransform.transform
   if len(test_stream.events) == 0:
     return []

   root_bundle = self._evaluation_context.create_bundle(
       pvalue.PBegin(test_stream.pipeline))
   # Explicitly set timestamp to MIN_TIMESTAMP to ensure that we hold the
   # watermark.
   seed = GlobalWindows.windowed_value(0, timestamp=MIN_TIMESTAMP)
   root_bundle.add(seed)
   root_bundle.commit(None)
   return [root_bundle]
예제 #7
0
 def finish_bundle(self):
   """Build the TransformResult, queueing the next event index if any remain.

   While ``current_index`` is before the last index of ``test_stream.events``,
   a bundle holding ``current_index + 1`` (stamped with the watermark) is
   created on a PBegin and the watermark is used as the value under the
   ``None`` key; otherwise no such bundle is created and that value is None.
   """
   pending = []
   hold = None
   last_index = len(self.test_stream.events) - 1
   if self.current_index < last_index:
     follow_up = self._evaluation_context.create_bundle(
         pvalue.PBegin(self._applied_ptransform.transform.pipeline))
     next_element = GlobalWindows.windowed_value(
         self.current_index + 1, timestamp=self.watermark)
     follow_up.add(next_element)
     pending.append(follow_up)
     hold = self.watermark
   return TransformResult(
       self._applied_ptransform, self.bundles, pending, None, {None: hold})
예제 #8
0
  def finish_bundle(self):
    """Build the TransformResult, queueing the next index unless at the end.

    ``test_stream.next(current_index)`` yields the following index. Unless
    ``test_stream.end`` reports it as the end, a bundle holding that index
    (stamped with the watermark) is created on a PBegin and passed along.
    """
    next_index = self.test_stream.next(self.current_index)
    pending = []
    if not self.test_stream.end(next_index):
      follow_up = self._evaluation_context.create_bundle(
          pvalue.PBegin(self._applied_ptransform.transform.pipeline))
      next_element = GlobalWindows.windowed_value(
          next_index, timestamp=self.watermark)
      follow_up.add(next_element)
      pending.append(follow_up)

    # Returning the watermark in the dict here is used as a watermark hold.
    holds = {None: self.watermark}
    return TransformResult(self, self.bundles, pending, None, holds)
예제 #9
0
    def finish_bundle(self):
        """Build the TransformResult, re-queueing a heartbeat until done.

        Unless ``is_done`` is set, a bundle holding a single ``b''`` element
        (stamped with the watermark) is created on a PBegin of the transform's
        pipeline and passed along with ``self.bundles``.
        """
        pending = []

        # Continue to send its own state to itself via an unprocessed bundle.
        # This acts as a heartbeat, where each element will read the next
        # event from the event stream.
        if not self.is_done:
            heartbeat = self._evaluation_context.create_bundle(
                pvalue.PBegin(self._applied_ptransform.transform.pipeline))
            tick = GlobalWindows.windowed_value(b'', timestamp=self.watermark)
            heartbeat.add(tick)
            pending.append(heartbeat)

        # Returning the watermark in the dict here is used as a watermark hold.
        holds = {None: self.watermark}
        return TransformResult(self, self.bundles, pending, None, holds)
예제 #10
0
  def get_root_bundles(self):
    """Set up the shared event stream and return the seed bundle.

    Stores the event source on ``_TestStreamEvaluator.event_stream`` (a class
    attribute, not an instance one) and returns a list with one committed
    bundle holding a single ``b''`` element stamped with MIN_TIMESTAMP.
    """
    test_stream = self._applied_ptransform.transform

    # If there was an endpoint defined then get the events from the
    # TestStreamService.
    if test_stream.endpoint:
      events = _TestStream.events_from_rpc(
          test_stream.endpoint, test_stream.output_tags, test_stream.coder)
    else:
      events = _TestStream.events_from_script(test_stream._events)
    _TestStreamEvaluator.event_stream = events

    root_bundle = self._evaluation_context.create_bundle(
        pvalue.PBegin(test_stream.pipeline))
    seed = GlobalWindows.windowed_value(b'', timestamp=MIN_TIMESTAMP)
    root_bundle.add(seed)
    root_bundle.commit(None)
    return [root_bundle]
    def test_root_transforms(self):
        """Check the visitor's roots, PBegin consumers and step-name count.

        Applies an Impulse to a PBegin, a FlatMap to its output and a Flatten
        to an empty list, then visits the pipeline.
        """
        impulse_xform = beam.Impulse()
        flatten_xform = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        impulse_out = pbegin | 'read' >> impulse_xform
        impulse_out | FlatMap(lambda x: x)
        [] | 'flatten' >> flatten_xform

        self.pipeline.visit(self.visitor)

        # Both the impulse and the flatten are expected as roots.
        actual_roots = [t.transform for t in self.visitor.root_transforms]
        self.assertCountEqual(actual_roots, [impulse_xform, flatten_xform])

        # Only the impulse was applied to pbegin.
        actual_consumers = [
            c.transform for c in self.visitor.value_to_consumers[pbegin]
        ]
        self.assertCountEqual(actual_consumers, [impulse_xform])

        self.assertEqual(len(self.visitor.step_names), 3)
예제 #12
0
    def _extract_input_pvalues(self, pvalueish):
        """Extract all the pvalues contained in the input pvalueish.

        A ``Pipeline`` argument is first replaced by a ``PBegin`` on that
        pipeline. The named pvalues are then obtained from
        ``get_named_nested_pvalues(pvalueish, as_inputs=True)``.

        Generally only needs to be overridden for multi-input PTransforms.

        Args:
          pvalueish: a pipeline, a pvalue, or a nested structure of pvalues.

        Returns:
          ``(pvalueish, inputs)`` where ``inputs`` maps ``str(tag)`` to the
          value for each yielded ``(tag, value)`` pair. The pvalueish is
          returned too because inspection may be destructive and the input
          may have to be copied.
        """
        # pylint: disable=wrong-import-order
        from apache_beam import pipeline
        # pylint: enable=wrong-import-order
        if isinstance(pvalueish, pipeline.Pipeline):
            pvalueish = pvalue.PBegin(pvalueish)

        named_inputs = {}
        for tag, value in get_named_nested_pvalues(pvalueish, as_inputs=True):
            named_inputs[str(tag)] = value
        return pvalueish, named_inputs
예제 #13
0
 def finish_bundle(self):
   """Read from Pub/Sub and wrap whatever arrived in a TransformResult.

   Every item returned by ``_read_from_pubsub`` is written to a bundle on the
   first output, in the global window, all sharing one wall-clock timestamp
   taken just before the items are written.

   Returns:
     A TransformResult built from the output bundles (empty when nothing was
     read), one bundle created on the transform's first input (or on a new
     PBegin when it has no inputs), and the current time under the ``None``
     key.
   """
   payloads = self._read_from_pubsub()
   bundles = []
   if payloads:
     out_bundle = self._evaluation_context.create_bundle(
         list(self._outputs)[0])
     # TODO(ccy): we currently do not use the PubSub message timestamp or
     # respect the PubSub source's id_label field.
     now = Timestamp.of(time.time())
     for payload in payloads:
       out_bundle.output(GlobalWindows.windowed_value(payload, timestamp=now))
     bundles.append(out_bundle)

   inputs = self._applied_ptransform.inputs
   input_pvalue = inputs[0] if inputs else pvalue.PBegin(
       self._applied_ptransform.transform.pipeline)
   unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)

   holds = {None: Timestamp.of(time.time())}
   return TransformResult(
       self._applied_ptransform, bundles, [unprocessed_bundle], None, holds)
    def test_root_transforms(self):
        """Check the visitor's roots, PBegin consumers and step-name count.

        Applies a Read of a dummy source to a PBegin, a FlatMap to its output
        and a Flatten to an empty list, then visits the pipeline.
        """
        class DummySource(iobase.BoundedSource):
            pass

        read_xform = Read(DummySource())
        flatten_xform = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        read_out = pbegin | 'read' >> read_xform
        read_out | FlatMap(lambda x: x)
        [] | 'flatten' >> flatten_xform

        self.pipeline.visit(self.visitor)

        # Both the read and the flatten are expected as roots.
        actual_roots = sorted(
            t.transform for t in self.visitor.root_transforms)
        expected_roots = sorted([read_xform, flatten_xform])
        self.assertEqual(actual_roots, expected_roots)

        # Only the read was applied to pbegin.
        actual_consumers = sorted(
            c.transform for c in self.visitor.value_to_consumers[pbegin])
        self.assertEqual(actual_consumers, sorted([read_xform]))

        self.assertEqual(len(self.visitor.step_names), 3)
예제 #15
0
 def get_root_bundles(self):
   """Return a one-element list with an empty committed bundle on a PBegin.

   The PBegin is created on the pipeline of the applied transform.
   """
   owning_pipeline = self._applied_ptransform.transform.pipeline
   root = pvalue.PBegin(owning_pipeline)
   return [self._evaluation_context.create_empty_committed_bundle(root)]
예제 #16
0
            def _replace_if_needed(self, original_transform_node):
                """Swap ``original_transform_node`` for its override, if any.

                ``override`` and ``output_map`` are names taken from the
                enclosing scope. When ``override.matches`` accepts the node, a
                replacement ``AppliedPTransform`` is created with the same
                parent, full label and inputs, stored at the original node's
                index among its siblings, and expanded. The new output is then
                recorded in ``output_map`` under the original output.

                Nothing happens when the override does not match, or when it
                hands back the very same transform object.

                Args:
                  original_transform_node: the applied transform to examine.

                Raises:
                  NotImplementedError: if the node has more than one input, or
                    if the original has several outputs or either output is
                    not a PCollection/PDone.
                """
                if not override.matches(original_transform_node):
                    return
                assert isinstance(original_transform_node, AppliedPTransform)

                replacement_transform = override.get_replacement_transform(
                    original_transform_node.transform)
                if replacement_transform is original_transform_node.transform:
                    return

                replacement_transform_node = AppliedPTransform(
                    original_transform_node.parent, replacement_transform,
                    original_transform_node.full_label,
                    original_transform_node.inputs)

                # Transform execution could depend on order in which nodes are
                # considered. Hence we insert the replacement transform node to
                # same index as the original transform node. Note that this
                # operation removes the original transform node.
                if original_transform_node.parent:
                    assert isinstance(original_transform_node.parent,
                                      AppliedPTransform)
                    parent_parts = original_transform_node.parent.parts
                    parent_parts[parent_parts.index(
                        original_transform_node)] = (
                            replacement_transform_node)
                else:
                    # Original transform has to be a root.
                    roots = self.pipeline.transforms_stack[0].parts
                    assert original_transform_node in roots
                    roots[roots.index(original_transform_node)] = (
                        replacement_transform_node)

                inputs = replacement_transform_node.inputs
                # TODO:  Support replacing PTransforms with multiple inputs.
                if len(inputs) > 1:
                    # Both values must be passed as one tuple: a bare
                    # `fmt % a, b` formats with `a` alone and raises TypeError.
                    raise NotImplementedError(
                        'PTransform overriding is only supported for PTransforms that '
                        'have a single input. Tried to replace input of '
                        'AppliedPTransform %r that has %d inputs' %
                        (original_transform_node, len(inputs)))
                elif len(inputs) == 1:
                    input_node = inputs[0]
                elif len(inputs) == 0:
                    # PBegin is built on the pipeline, not on this visitor.
                    input_node = pvalue.PBegin(self.pipeline)

                # We have to add the new AppliedTransform to the stack before
                # expand() and pop it out later to make sure that parts get
                # added correctly.
                self.pipeline.transforms_stack.append(
                    replacement_transform_node)

                # Keeping the same label for the replaced node but recursively
                # removing labels of child transforms of original transform
                # since they will be replaced during the expand below. This is
                # needed in case the replacement contains children that have
                # labels that conflicts with labels of the children of the
                # original.
                self.pipeline._remove_labels_recursively(
                    original_transform_node)

                new_output = replacement_transform.expand(input_node)

                # Clear the element type before asking the pipeline to infer it.
                new_output.element_type = None
                self.pipeline._infer_result_type(replacement_transform,
                                                 inputs, new_output)

                replacement_transform_node.add_output(new_output)
                if not new_output.producer:
                    new_output.producer = replacement_transform_node

                # We only support replacing transforms with a single output
                # with another transform that produces a single output.
                # TODO: Support replacing PTransforms with multiple outputs.
                if (len(original_transform_node.outputs) > 1
                        or not isinstance(
                            original_transform_node.outputs[None],
                            (PCollection, PDone))
                        or not isinstance(new_output,
                                          (PCollection, PDone))):
                    raise NotImplementedError(
                        'PTransform overriding is only supported for PTransforms that '
                        'have a single output. Tried to replace output of '
                        'AppliedPTransform %r with %r.' %
                        (original_transform_node, new_output))

                # Recording updated outputs. This cannot be done in the same
                # visitor since if we dynamically update output type here,
                # we'll run into errors when visiting child nodes.
                output_map[
                    original_transform_node.outputs[None]] = new_output

                self.pipeline.transforms_stack.pop()