Example #1
    def Expand(self, request, context=None):
        try:
            pipeline = beam_pipeline.Pipeline(options=self._options)

            def with_pipeline(component, pcoll_id=None):
                component.pipeline = pipeline
                if pcoll_id:
                    component.producer, component.tag = producers[pcoll_id]
                    # We need the lookup to resolve back to this id.
                    context.pcollections._obj_to_id[component] = pcoll_id
                return component

            context = pipeline_context.PipelineContext(
                request.components,
                default_environment=portable_runner.PortableRunner.
                _create_environment(self._options),
                namespace=request.namespace)
            producers = {
                pcoll_id: (context.transforms.get_by_id(t_id), pcoll_tag)
                for t_id, t_proto in request.components.transforms.items()
                for pcoll_tag, pcoll_id in t_proto.outputs.items()
            }
            transform = with_pipeline(
                ptransform.PTransform.from_runner_api(request.transform,
                                                      context))
            inputs = transform._pvaluish_from_dict({
                tag: with_pipeline(context.pcollections.get_by_id(pcoll_id),
                                   pcoll_id)
                for tag, pcoll_id in request.transform.inputs.items()
            })
            if not inputs:
                inputs = pipeline
            with external.ExternalTransform.outer_namespace(request.namespace):
                result = pipeline.apply(transform, inputs,
                                        request.transform.unique_name)
            expanded_transform = pipeline._root_transform().parts[-1]
            # TODO(BEAM-1833): Use named outputs internally.
            if isinstance(result, dict):
                expanded_transform.outputs = result
            pipeline_proto = pipeline.to_runner_api(context=context)
            # TODO(BEAM-1833): Use named inputs internally.
            expanded_transform_id = context.transforms.get_id(
                expanded_transform)
            expanded_transform_proto = pipeline_proto.components.transforms.pop(
                expanded_transform_id)
            expanded_transform_proto.inputs.clear()
            expanded_transform_proto.inputs.update(request.transform.inputs)
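            # Drop the temporary pipeline's root transforms: only the expanded
            # transform (returned separately below) and its sub-components
            # belong in the response.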
            for transform_id in pipeline_proto.root_transform_ids:
                del pipeline_proto.components.transforms[transform_id]
            return beam_expansion_api_pb2.ExpansionResponse(
                components=pipeline_proto.components,
                transform=expanded_transform_proto,
                requirements=pipeline_proto.requirements)

        except Exception:  # pylint: disable=broad-except
            return beam_expansion_api_pb2.ExpansionResponse(
                error=traceback.format_exc())
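
A hedged sketch of driving this Expand method in-process follows; the ExpansionServiceServicer class name and constructor, and the pre-built components and transform_proto messages, are assumptions (in a real deployment the ExpansionRequest arrives over gRPC from another SDK):

from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.portability.api import beam_expansion_api_pb2

# `components` and `transform_proto` are assumed to be pre-built
# beam_runner_api_pb2.Components / PTransform messages describing the
# transform to expand and the PCollections it consumes.
service = ExpansionServiceServicer(options=PipelineOptions([]))  # hypothetical setup
request = beam_expansion_api_pb2.ExpansionRequest(
    components=components,
    transform=transform_proto,
    namespace='expansion_1')
response = service.Expand(request)
if response.error:
    raise RuntimeError(response.error)
# On success, response.transform is the expanded transform and
# response.components carries its sub-transforms, PCollections and coders.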
Example #2
 def __ror__(self, left, label=None):
     """Used to apply this PTransform to non-PValues, e.g., a tuple."""
     pvalueish, pvalues = self._extract_input_pvalues(left)
     if isinstance(pvalues, dict):
         pvalues = tuple(pvalues.values())
     pipelines = [
         v.pipeline for v in pvalues if isinstance(v, pvalue.PValue)
     ]
     if pvalues and not pipelines:
         deferred = False
         # pylint: disable=wrong-import-order, wrong-import-position
         from apache_beam import pipeline
         from apache_beam.options.pipeline_options import PipelineOptions
         # pylint: enable=wrong-import-order, wrong-import-position
         p = pipeline.Pipeline('DirectRunner', PipelineOptions(sys.argv))
     else:
         if not pipelines:
             if self.pipeline is not None:
                 p = self.pipeline
             else:
                 raise ValueError(
                     '"%s" requires a pipeline to be specified '
                     'as there are no deferred inputs.' % self.label)
         else:
             p = self.pipeline or pipelines[0]
             for pp in pipelines:
                 if p != pp:
                     raise ValueError(
                         'Mixing values in different pipelines is not allowed.'
                         '\n{%r} != {%r}' % (p, pp))
         deferred = not getattr(p.runner, 'is_eager', False)
     # pylint: disable=wrong-import-order, wrong-import-position
     from apache_beam.transforms.core import Create
     # pylint: enable=wrong-import-order, wrong-import-position
     replacements = {
         id(v): p | 'CreatePInput%s' % ix >> Create(v, reshuffle=False)
         for (ix, v) in enumerate(pvalues)
         if not isinstance(v, pvalue.PValue) and v is not None
     }
     pvalueish = _SetInputPValues().visit(pvalueish, replacements)
     self.pipeline = p
     result = p.apply(self, pvalueish, label)
     if deferred:
         return result
     _allocate_materialized_pipeline(p)
     materialized_result = _AddMaterializationTransforms().visit(result)
     p.run().wait_until_finish()
     _release_materialized_pipeline(p)
     return _FinalizeMaterialization().visit(materialized_result)
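
Given the branch above that builds a throwaway DirectRunner pipeline when the inputs contain no PValues, a minimal usage sketch (the exact type of the materialized result varies by SDK version, so treat it as illustrative):

import apache_beam as beam

# Applying a transform to a plain Python list routes through __ror__: the list
# is wrapped in a Create, an ad-hoc pipeline is built and run eagerly, and the
# materialized output is returned instead of a deferred PCollection.
doubled = [1, 2, 3] | 'Double' >> beam.Map(lambda x: x * 2)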
Example #3
 def __ror__(self, left, label=None):
     """Used to apply this PTransform to non-PValues, e.g., a tuple."""
     pvalueish, pvalues = self._extract_input_pvalues(left)
     pipelines = [
         v.pipeline for v in pvalues if isinstance(v, pvalue.PValue)
     ]
     if pvalues and not pipelines:
         deferred = False
         # pylint: disable=wrong-import-order, wrong-import-position
         from apache_beam import pipeline
         from apache_beam.utils.pipeline_options import PipelineOptions
         # pylint: enable=wrong-import-order, wrong-import-position
         p = pipeline.Pipeline('DirectRunner', PipelineOptions(sys.argv))
     else:
         if not pipelines:
             if self.pipeline is not None:
                 p = self.pipeline
             else:
                 raise ValueError(
                     '"%s" requires a pipeline to be specified '
                     'as there are no deferred inputs.' % self.label)
         else:
             p = self.pipeline or pipelines[0]
             for pp in pipelines:
                 if p != pp:
                     raise ValueError(
                          'Mixing values from different pipelines is not allowed.'
                     )
         deferred = not getattr(p.runner, 'is_eager', False)
     # pylint: disable=wrong-import-order, wrong-import-position
     from apache_beam.transforms.core import Create
     # pylint: enable=wrong-import-order, wrong-import-position
     replacements = {
         id(v): p | 'CreatePInput%s' % ix >> Create(v)
         for ix, v in enumerate(pvalues)
         if not isinstance(v, pvalue.PValue) and v is not None
     }
     pvalueish = _SetInputPValues().visit(pvalueish, replacements)
     self.pipeline = p
     result = p.apply(self, pvalueish, label)
     if deferred:
         return result
     else:
         # Get a reference to the runners internal cache, otherwise runner may
         # clean it after run.
         cache = p.runner.cache
         p.run().wait_until_finish()
         return _MaterializePValues(cache).visit(result)
Example #4
def test_direct_runner_metrics():
    class MyDoFn(beam.DoFn):
        def start_bundle(self):
            count = metric.Metrics.counter(self.__class__, "bundles")
            count.inc()

        def finish_bundle(self):
            count = metric.Metrics.counter(self.__class__, "finished_bundles")
            count.inc()

        def process(self, element):
            gauge = metric.Metrics.gauge(self.__class__, "latest_element")
            gauge.set(element)
            count = metric.Metrics.counter(self.__class__, "elements")
            count.inc()
            distro = metric.Metrics.distribution(self.__class__,
                                                 "element_dist")
            distro.update(element)
            return [element]

    p = pipeline.Pipeline(gke_direct.GkeDirectRunner())
    pcoll = (p
             | beam.Create([1, 2, 3, 4, 5], reshuffle=False)
             | "Do" >> beam.ParDo(MyDoFn()))
    util.assert_that(pcoll, util.equal_to([1, 2, 3, 4, 5]))
    result = p.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = "{}.{}".format(MyDoFn.__module__, MyDoFn.__name__)

    hc.assert_that(
        metrics["counters"],
        hc.contains_inanyorder(
            execution.MetricResult(
                execution.MetricKey(
                    "Do", metricbase.MetricName(namespace, "elements")),
                5,
                5,
            ),
            execution.MetricResult(
                execution.MetricKey(
                    "Do", metricbase.MetricName(namespace, "bundles")),
                1,
                1,
            ),
            execution.MetricResult(
                execution.MetricKey(
                    "Do", metricbase.MetricName(namespace,
                                                "finished_bundles")),
                1,
                1,
            ),
        ),
    )

    hc.assert_that(
        metrics["distributions"],
        hc.contains_inanyorder(
            execution.MetricResult(
                execution.MetricKey(
                    "Do", metricbase.MetricName(namespace, "element_dist")),
                cells.DistributionResult(cells.DistributionData(15, 5, 1, 5)),
                cells.DistributionResult(cells.DistributionData(15, 5, 1, 5)),
            )),
    )

    gauge_result = metrics["gauges"][0]
    hc.assert_that(
        gauge_result.key,
        hc.equal_to(
            execution.MetricKey(
                "Do", metricbase.MetricName(namespace, "latest_element"))),
    )
    hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
    hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
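
The unfiltered query above returns every metric in the job. As a hedged follow-up, MetricsFilter from apache_beam.metrics.metric can narrow the result to a single counter by step and name:

from apache_beam.metrics.metric import MetricsFilter

# Restrict the query to the "elements" counter emitted by the "Do" step.
filtered = result.metrics().query(
    MetricsFilter().with_step("Do").with_name("elements"))
element_counters = filtered["counters"]  # same result structure as the unfiltered query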