Example #1
  def expand(self, pbegin):
    assert isinstance(pbegin, pvalue.PBegin)
    self.pipeline = pbegin.pipeline
    if not self.output_tags:
      self.output_tags = set([None])

    # For backwards compatibility return a single PCollection.
    if len(self.output_tags) == 1:
      return pvalue.PCollection(
          self.pipeline, is_bounded=False, tag=list(self.output_tags)[0])
    return {
        tag: pvalue.PCollection(self.pipeline, is_bounded=False, tag=tag)
        for tag in self.output_tags
    }
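A minimal standalone sketch (not part of the quoted source) of constructing the same kind of placeholder PCollection directly; the keyword arguments shown (tag, is_bounded) are the same ones used in the example above:

import apache_beam as beam
from apache_beam import pvalue

p = beam.Pipeline()
# Placeholder PCollections, as a primitive unbounded Read transform would
# return from expand(); no producer or elements are attached yet.
main = pvalue.PCollection(p, is_bounded=False, tag=None)
other = pvalue.PCollection(p, is_bounded=False, tag='other')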
Example #2
  def start_bundle(self):
    transform = self._applied_ptransform.transform

    self._tagged_receivers = _TaggedReceivers(self._evaluation_context)
    for output_tag in self._applied_ptransform.outputs:
      output_pcollection = pvalue.PCollection(None, tag=output_tag)
      output_pcollection.producer = self._applied_ptransform
      self._tagged_receivers[output_tag] = (
          self._evaluation_context.create_bundle(output_pcollection))
      self._tagged_receivers[output_tag].tag = output_tag

    self._counter_factory = counters.CounterFactory()

    # TODO(aaltay): Consider storing the serialized form as an optimization.
    dofn = pickler.loads(pickler.dumps(transform.dofn))

    pipeline_options = self._evaluation_context.pipeline_options
    if (pipeline_options is not None
        and pipeline_options.view_as(TypeOptions).runtime_type_check):
      dofn = TypeCheckWrapperDoFn(dofn, transform.get_type_hints())

    dofn = OutputCheckWrapperDoFn(dofn, self._applied_ptransform.full_label)
    self.runner = DoFnRunner(
        dofn, transform.args, transform.kwargs,
        self._side_inputs,
        self._applied_ptransform.inputs[0].windowing,
        tagged_receivers=self._tagged_receivers,
        step_name=self._applied_ptransform.full_label,
        state=DoFnState(self._counter_factory),
        scoped_metrics_container=self.scoped_metrics_container)
    self.runner.start()
Example #3
    def start_bundle(self):
        transform = self._applied_ptransform.transform

        self._tagged_receivers = _TaggedReceivers(self._evaluation_context)
        for output_tag in self._applied_ptransform.outputs:
            output_pcollection = pvalue.PCollection(None, tag=output_tag)
            output_pcollection.producer = self._applied_ptransform
            self._tagged_receivers[output_tag] = (
                self._evaluation_context.create_bundle(output_pcollection))
            self._tagged_receivers[output_tag].tag = output_tag

        self._counter_factory = counters.CounterFactory()

        # TODO(aaltay): Consider storing the serialized form as an optimization.
        dofn = (pickler.loads(pickler.dumps(transform.dofn))
                if self._perform_dofn_pickle_test else transform.dofn)

        args = transform.args if hasattr(transform, 'args') else []
        kwargs = transform.kwargs if hasattr(transform, 'kwargs') else {}

        self.runner = DoFnRunner(dofn,
                                 args,
                                 kwargs,
                                 self._side_inputs,
                                 self._applied_ptransform.inputs[0].windowing,
                                 tagged_receivers=self._tagged_receivers,
                                 step_name=self._applied_ptransform.full_label,
                                 state=DoFnState(self._counter_factory))
        self.runner.start()
Example #4
  def expand(self, pbegin):
    from apache_beam.options.pipeline_options import DebugOptions
    from apache_beam.transforms import util

    assert isinstance(pbegin, pvalue.PBegin)
    self.pipeline = pbegin.pipeline

    debug_options = self.pipeline._options.view_as(DebugOptions)
    if debug_options.experiments and 'beam_fn_api' in debug_options.experiments:
      source = self.source

      def split_source(unused_impulse):
        total_size = source.estimate_size()
        if total_size:
          # 1MB = 1 shard, 1GB = 32 shards, 1TB = 1000 shards, 1PB = 32k shards
          chunk_size = max(1 << 20, 1000 * int(math.sqrt(total_size)))
        else:
          chunk_size = 64 << 20  # 64mb
        return source.split(chunk_size)

      return (
          pbegin
          | core.Impulse()
          | 'Split' >> core.FlatMap(split_source)
          | util.Reshuffle()
          | 'ReadSplits' >> core.FlatMap(lambda split: split.source.read(
              split.source.get_range_tracker(
                  split.start_position, split.stop_position))))
    else:
      # Treat Read itself as a primitive.
      return pvalue.PCollection(self.pipeline)
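The shard-count comment inside split_source can be sanity-checked with a short standalone calculation (not part of the quoted source):

import math

for label, total_size in [('1MB', 1 << 20), ('1GB', 1 << 30),
                          ('1TB', 1 << 40), ('1PB', 1 << 50)]:
  chunk_size = max(1 << 20, 1000 * int(math.sqrt(total_size)))
  print(label, total_size // chunk_size)

This prints 1, 32, 1048 and 33554 shards respectively, matching the orders of magnitude stated in the comment in split_source.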
Example #5
  def apply_CombineValues(self, transform, pcoll):
    # TODO(BEAM-2937): Disable combiner lifting for fnapi. Remove this
    # restriction once this feature is supported in the dataflow runner
    # harness.
    # Import here to avoid adding the dependency for local running scenarios.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam.runners.dataflow.internal import apiclient
    if apiclient._use_fnapi(pcoll.pipeline._options):
      return self.apply_PTransform(transform, pcoll)

    return pvalue.PCollection(pcoll.pipeline)
Example #6
  def start_bundle(self):
    transform = self._applied_ptransform.transform

    self._tagged_receivers = _TaggedReceivers(self._evaluation_context)
    for output_tag in self._applied_ptransform.outputs:
      output_pcollection = pvalue.PCollection(None, tag=output_tag)
      output_pcollection.producer = self._applied_ptransform
      self._tagged_receivers[output_tag] = (
          self._evaluation_context.create_bundle(output_pcollection))
      self._tagged_receivers[output_tag].tag = output_tag

    self._counter_factory = counters.CounterFactory()

    # TODO(aaltay): Consider storing the serialized form as an optimization.
    dofn = (
        pickler.loads(pickler.dumps(transform.dofn))
        if self._perform_dofn_pickle_test else transform.dofn)

    args = transform.args if hasattr(transform, 'args') else []
    kwargs = transform.kwargs if hasattr(transform, 'kwargs') else {}

    self.user_state_context = None
    self.user_timer_map = {}
    if is_stateful_dofn(dofn):
      kv_type_hint = self._applied_ptransform.inputs[0].element_type
      if kv_type_hint and kv_type_hint != Any:
        coder = coders.registry.get_coder(kv_type_hint)
        self.key_coder = coder.key_coder()
      else:
        self.key_coder = coders.registry.get_coder(Any)

      self.user_state_context = DirectUserStateContext(
          self._step_context, dofn, self.key_coder)
      _, all_timer_specs = get_dofn_specs(dofn)
      for timer_spec in all_timer_specs:
        self.user_timer_map['user/%s' % timer_spec.name] = timer_spec

    self.runner = DoFnRunner(
        dofn,
        args,
        kwargs,
        self._side_inputs,
        self._applied_ptransform.inputs[0].windowing,
        tagged_receivers=self._tagged_receivers,
        step_name=self._applied_ptransform.full_label,
        state=DoFnState(self._counter_factory),
        user_state_context=self.user_state_context)
    self.runner.setup()
    self.runner.start()
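For context, here is a rough sketch (illustrative names, not from the quoted source) of the kind of DoFn that is_stateful_dofn() detects and that routes through DirectUserStateContext above; it declares a bag state cell and a watermark timer via the public userstate API:

import apache_beam as beam
from apache_beam import coders
from apache_beam.transforms import userstate
from apache_beam.transforms.timeutil import TimeDomain


class BufferingDoFn(beam.DoFn):
  # The presence of state and timer specs is what makes this DoFn "stateful".
  BUFFER = userstate.BagStateSpec('buffer', coders.VarIntCoder())
  FLUSH = userstate.TimerSpec('flush', TimeDomain.WATERMARK)

  def process(self,
              element,
              timestamp=beam.DoFn.TimestampParam,
              buffer=beam.DoFn.StateParam(BUFFER),
              flush_timer=beam.DoFn.TimerParam(FLUSH)):
    # Stateful DoFns require keyed (KV) input, which is why the evaluator
    # above looks up a key coder from the input element type.
    key, value = element
    buffer.add(value)
    flush_timer.set(timestamp)

  @userstate.on_timer(FLUSH)
  def flush(self, buffer=beam.DoFn.StateParam(BUFFER)):
    yield list(buffer.read())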
Example #7
  def apply_GroupByKey(self, transform, pcoll):
    # Infer coder of parent.
    #
    # TODO(ccy): make Coder inference and checking less specialized and more
    # comprehensive.
    parent = pcoll.producer
    if parent:
      coder = parent.transform._infer_output_coder()  # pylint: disable=protected-access
    else:
      coder = None
    if not coder:
      coder = self._get_coder(pcoll.element_type or typehints.Any, None)
    if not coder.is_kv_coder():
      raise ValueError(('Coder for the GroupByKey operation "%s" is not a '
                        'key-value coder: %s.') % (transform.label,
                                                   coder))
    # TODO(robertwb): Update the coder itself if it changed.
    coders.registry.verify_deterministic(
        coder.key_coder(), 'GroupByKey operation "%s"' % transform.label)

    return pvalue.PCollection(pcoll.pipeline)
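A small standalone sketch (not from the quoted source) of the two coder checks performed above, using the public coder registry and KV type hints:

from apache_beam import coders, typehints

kv_coder = coders.registry.get_coder(typehints.KV[str, int])
assert kv_coder.is_kv_coder()
# GroupByKey needs a deterministic key coder; verify_deterministic raises if
# the coder cannot guarantee that.
coders.registry.verify_deterministic(
    kv_coder.key_coder(), 'GroupByKey operation "MyGBK"')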
Example #8
File: iobase.py Project: zhangminglei/beam
    def expand(self, pbegin):
        from apache_beam.options.pipeline_options import DebugOptions
        from apache_beam.transforms import util

        assert isinstance(pbegin, pvalue.PBegin)
        self.pipeline = pbegin.pipeline

        debug_options = self.pipeline._options.view_as(DebugOptions)
        if debug_options.experiments and 'beam_fn_api' in debug_options.experiments:
            NUM_SPLITS = 1000
            source = self.source
            return (
                pbegin
                | core.Impulse()
                | 'Split' >> core.FlatMap(lambda _: source.split(NUM_SPLITS))
                | util.Reshuffle()
                | 'ReadSplits' >> core.FlatMap(lambda split: split.source.read(
                    split.source.get_range_tracker(
                        split.start_position, split.stop_position))))
        else:
            # Treat Read itself as a primitive.
            return pvalue.PCollection(self.pipeline)
Example #9
 def expand(self, pbegin):
     assert isinstance(pbegin, pvalue.PBegin), (
         'Input to transform must be a PBegin but found %s' % pbegin)
     return pvalue.PCollection(pbegin.pipeline, is_bounded=False)
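A small illustration (not from the quoted source) of the PBegin value that such an expand() receives when the transform is applied at the root of a pipeline, which is what the assertion above checks:

import apache_beam as beam
from apache_beam import pvalue

p = beam.Pipeline()
pbegin = pvalue.PBegin(p)
assert isinstance(pbegin, pvalue.PBegin)
assert pbegin.pipeline is p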
Example #10
 def apply_CombineValues(self, transform, pcoll):
     return pvalue.PCollection(pcoll.pipeline)
Example #11
 def expand(self, pcoll):
     return pvalue.PCollection(pcoll.pipeline)
Example #12
 def expand(self, pbegin):
   return pvalue.PCollection(
       self.pipeline, is_bounded=self.source.is_bounded())
Example #13
def _clone_items(pipeline, to_clone):
    """Clones dependency-sorted list of PCollections and PTransforms.

  Returns mappings of PCollection and PTransform replacements.

  Args:
    pipeline: The beam.Pipeline.
    to_clone: A dependency-sorted list of PCollections and PTransforms.

  Returns:
    pcollection_replacements: a dict mapping original to cloned PCollections.

  Raises:
    ValueError: if a clone is requested of an invalid object.
  """
    pcollection_replacements = {}
    ptransform_replacements = {}
    for item in to_clone:
        if isinstance(item, pvalue.PCollection):
            assert item not in pcollection_replacements
            copied = pvalue.PCollection(pipeline,
                                        tag=item.tag,
                                        element_type=item.element_type,
                                        windowing=item.windowing)
            copied.producer = item.producer
            # Update copied PCollection producer if its producer was copied as well.
            if copied.producer in ptransform_replacements:
                original_producer = copied.producer
                copied.producer = ptransform_replacements[original_producer]
                # Update producer outputs.
                for tag, output in original_producer.outputs.items():
                    if output == item:
                        copied.producer.outputs[tag] = copied
            assert copied.producer.transform is not None
            pcollection_replacements[item] = copied
        elif isinstance(item, beam_pipeline.AppliedPTransform):
            assert item.transform is not None
            assert item not in ptransform_replacements
            # The Beam pipeline graph keeps track of composite PTransforms by having
            # AppliedPTransform.parts be a list of "children" AppliedPTransforms that
            # are part of the "parent" AppliedPTransform. Any of these "composite
            # wrapper" AppliedPTransforms does not actually produce output independent
            # of the child non-composite transform. We therefore shouldn't ever clone
            # AppliedPTransforms with non-empty parts, since such AppliedPTransforms
            # are not reachable by tracing outputs in the pipeline graph.
            assert not item.parts, (
                'Reached invalid composite AppliedPTransform: %r.' % item)
            # Assign new label.
            new_label_prefix = item.full_label + '.Copy'
            new_label = new_label_prefix
            next_suffix = 0
            while new_label in pipeline.applied_labels:
                new_label = new_label_prefix + str(next_suffix)
                next_suffix += 1
            pipeline.applied_labels.add(new_label)

            # Update inputs.
            new_inputs = []
            for old_input in item.inputs:
                new_input = pcollection_replacements.get(old_input, old_input)
                new_inputs.append(new_input)
            new_inputs = tuple(new_inputs)

            # Create the copy. Note that in the copy, copied.outputs will start out
            # empty. Any outputs that are used will be repopulated in the PCollection
            # copy branch above.
            copied = beam_pipeline.AppliedPTransform(item.parent,
                                                     item.transform, new_label,
                                                     new_inputs)
            ptransform_replacements[item] = copied

            # Update composite transform parent to include this copy.
            # TODO(b/111366378): Reconcile the composite PTransform nesting hierarchy,
            # especially in the case where copied PTransforms should be copied in an
            # "all-or-nothing" manner. This would allow the deep copy operation to be
            # safe in the case runners replace well-known composite PTransforms in
            # their entirety during execution.
            copied.parent.parts.append(copied)
        else:
            raise ValueError('Invalid object to clone: %s' % item)

    return pcollection_replacements
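The label de-duplication loop in the AppliedPTransform branch can be illustrated in isolation (standalone snippet, not part of the quoted source):

# Suppose two copies already exist among the pipeline's applied labels.
applied_labels = {'Read.Copy', 'Read.Copy0'}
new_label_prefix = 'Read.Copy'
new_label = new_label_prefix
next_suffix = 0
while new_label in applied_labels:
    new_label = new_label_prefix + str(next_suffix)
    next_suffix += 1
applied_labels.add(new_label)
print(new_label)  # Read.Copy1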
Example #14
 def expand(self, pbegin):
     assert isinstance(pbegin, pvalue.PBegin)
     self.pipeline = pbegin.pipeline
     return pvalue.PCollection(self.pipeline, is_bounded=False)
Example #15
 def expand(self, pcoll):
   return pvalue.PCollection(pcoll.pipeline, is_bounded=False)
Example #16
 def test_expand_method_pcollection_errors(self):
     with self.assertRaises(error.TransformError):
         self.native_write.expand(None)
     with self.assertRaises(error.TransformError):
         pcoll = pvalue.PCollection(pipeline=None)
         self.native_write.expand(pcoll)
Example #17
def _clone_items(pipeline, to_clone):
    """Clones dependency-sorted list of PCollections and PTransforms.

  Returns mappings of PCollection and PTransform replacements.

  Args:
    pipeline: The beam.Pipeline.
    to_clone: A dependency-sorted list of PCollections and PTransforms.

  Returns:
    pcollection_replacements: a dict mapping original to cloned PCollections.

  Raises:
    ValueError: if a clone is requested of an invalid object.
  """
    pcollection_replacements = {}
    ptransform_replacements = {}
    for item in to_clone:
        if isinstance(item, pvalue.PCollection):
            assert item not in pcollection_replacements
            copied = pvalue.PCollection(pipeline,
                                        tag=item.tag,
                                        element_type=item.element_type,
                                        windowing=item.windowing)
            copied.producer = item.producer
            # Update copied PCollection producer if its producer was copied as well.
            if copied.producer in ptransform_replacements:
                original_producer = copied.producer
                copied.producer = ptransform_replacements[original_producer]
                # Update producer outputs.
                for tag, output in original_producer.outputs.items():
                    if output == item:
                        copied.producer.outputs[tag] = copied
            assert copied.producer.transform is not None
            pcollection_replacements[item] = copied
        elif isinstance(item, beam_pipeline.AppliedPTransform):
            assert item.transform is not None
            assert item not in ptransform_replacements
            # The Beam pipeline graph keeps track of composite PTransforms by having
            # AppliedPTransform.parts be a list of "children" AppliedPTransforms that
            # are part of the "parent" AppliedPTransform. Any of these "composite
            # wrapper" AppliedPTransforms does not actually produce output independent
            # of the child non-composite transform. We therefore shouldn't ever clone
            # AppliedPTransforms with non-empty parts, since such AppliedPTransforms
            # are not reachable by tracing outputs in the pipeline graph.
            assert not item.parts, (
                'Reached invalid composite AppliedPTransform: %r.' % item)

            # TODO(b/217271822): Implement resource hint 'close to resources' for
            # Beam/Dataflow, as when CSE makes it to Dataflow, 'close to resources'
            # cannot be recognized. Once this is fixed, we can change the tag prefix
            # to 'beam'.
            # TODO(b/238243699): Obviate the need for setting 'close to resources'
            # hints.
            close_to_resources_available = resources.ResourceHint.is_registered(
                'close_to_resources')

            if close_to_resources_available:
                # Assign close_to_resources resource hint to the original PTransforms.
                # The reason for adding this annotation is to prevent root Reads that are
                # generated from deep copy being merged due to common subexpression
                # elimination (CSE).
                item.resource_hints['beam:resources:close_to_resources:v1'] = (
                    b'/fake/DeepCopy.Original[0]')

            # Assign new label.
            count = 0
            copy_suffix = f'Copy{count}'
            new_label = f'{item.full_label}.{copy_suffix}'
            while new_label in pipeline.applied_labels:
                count += 1
                copy_suffix = f'Copy{count}'
                new_label = f'{item.full_label}.{copy_suffix}'
            pipeline.applied_labels.add(new_label)

            # Update inputs.
            new_inputs = {
                tag: pcollection_replacements.get(old_input, old_input)
                for tag, old_input in item.main_inputs.items()
            }

            # Create the copy. Note that in the copy, copied.outputs will start out
            # empty. Any outputs that are used will be repopulated in the PCollection
            # copy branch above.
            copied = beam_pipeline.AppliedPTransform(item.parent,
                                                     item.transform, new_label,
                                                     new_inputs)

            # Add a 'close to resource' resource hint to the copied PTransforms. The
            # PTransforms that are generated from each deep copy have the same unique
            # 'close to resource' resource hint. This is to make sure that the
            # PTransforms that are cloned from each deep copy can be fused together,
            # but not across copies nor with the original.
            if close_to_resources_available:
                copied.resource_hints[
                    'beam:resources:close_to_resources:v1'] = (
                        f'/fake/DeepCopy.{copy_suffix}[0]'.encode())

            ptransform_replacements[item] = copied

            # Update composite transform parent to include this copy.
            # TODO(b/111366378): Reconcile the composite PTransform nesting hierarchy,
            # especially in the case where copied PTransforms should be copied in an
            # "all-or-nothing" manner. This would allow the deep copy operation to be
            # safe in the case runners replace well-known composite PTransforms in
            # their entirety during execution.
            copied.parent.parts.append(copied)
        else:
            raise ValueError('Invalid object to clone: %s' % item)

    return pcollection_replacements