def Expand(self, request, context=None):
  try:
    pipeline = beam_pipeline.Pipeline(options=self._options)

    def with_pipeline(component, pcoll_id=None):
      """Attaches the scratch pipeline (and producer info) to a component."""
      component.pipeline = pipeline
      if pcoll_id:
        component.producer, component.tag = producers[pcoll_id]
        # We need the lookup to resolve back to this id.
        context.pcollections._obj_to_id[component] = pcoll_id
      return component

    context = pipeline_context.PipelineContext(
        request.components,
        default_environment=portable_runner.PortableRunner.
        _create_environment(self._options),
        namespace=request.namespace)
    # Map each PCollection id to its (producing transform, output tag).
    producers = {
        pcoll_id: (context.transforms.get_by_id(t_id), pcoll_tag)
        for t_id, t_proto in request.components.transforms.items()
        for pcoll_tag, pcoll_id in t_proto.outputs.items()
    }
    transform = with_pipeline(
        ptransform.PTransform.from_runner_api(request.transform, context))
    inputs = transform._pvaluish_from_dict({
        tag: with_pipeline(context.pcollections.get_by_id(pcoll_id), pcoll_id)
        for tag, pcoll_id in request.transform.inputs.items()
    })
    if not inputs:
      inputs = pipeline
    with external.ExternalTransform.outer_namespace(request.namespace):
      result = pipeline.apply(
          transform, inputs, request.transform.unique_name)
    expanded_transform = pipeline._root_transform().parts[-1]
    # TODO(BEAM-1833): Use named outputs internally.
    if isinstance(result, dict):
      expanded_transform.outputs = result
    pipeline_proto = pipeline.to_runner_api(context=context)
    # TODO(BEAM-1833): Use named inputs internally.
    expanded_transform_id = context.transforms.get_id(expanded_transform)
    expanded_transform_proto = pipeline_proto.components.transforms.pop(
        expanded_transform_id)
    expanded_transform_proto.inputs.clear()
    expanded_transform_proto.inputs.update(request.transform.inputs)
    # The response carries only the expanded transform and its components,
    # so drop the temporary root transforms of the scratch pipeline.
    for transform_id in pipeline_proto.root_transform_ids:
      del pipeline_proto.components.transforms[transform_id]
    return beam_expansion_api_pb2.ExpansionResponse(
        components=pipeline_proto.components,
        transform=expanded_transform_proto,
        requirements=pipeline_proto.requirements)
  except Exception:  # pylint: disable=broad-except
    return beam_expansion_api_pb2.ExpansionResponse(
        error=traceback.format_exc())
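# A minimal client-side sketch for exercising Expand over gRPC. The request
# and response fields mirror the handler above; the stub class, module path,
# and port are assumptions, not taken from this file.
import grpc
from apache_beam.portability.api import beam_expansion_api_pb2
from apache_beam.portability.api import beam_expansion_api_pb2_grpc

def expand_remote(components, transform_proto, namespace,
                  address='localhost:8097'):  # port is an assumption
  with grpc.insecure_channel(address) as channel:
    stub = beam_expansion_api_pb2_grpc.ExpansionServiceStub(channel)
    response = stub.Expand(
        beam_expansion_api_pb2.ExpansionRequest(
            components=components,      # components the transform references
            transform=transform_proto,  # the composite transform to expand
            namespace=namespace))       # prefix for ids minted during expansion
    if response.error:
      raise RuntimeError(response.error)
    return response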
def __ror__(self, left, label=None):
  """Used to apply this PTransform to non-PValues, e.g., a tuple."""
  pvalueish, pvalues = self._extract_input_pvalues(left)
  if isinstance(pvalues, dict):
    pvalues = tuple(pvalues.values())
  pipelines = [v.pipeline for v in pvalues if isinstance(v, pvalue.PValue)]
  if pvalues and not pipelines:
    # No deferred inputs: build a throwaway pipeline and run it eagerly.
    deferred = False
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam import pipeline
    from apache_beam.options.pipeline_options import PipelineOptions
    # pylint: enable=wrong-import-order, wrong-import-position
    p = pipeline.Pipeline('DirectRunner', PipelineOptions(sys.argv))
  else:
    if not pipelines:
      if self.pipeline is not None:
        p = self.pipeline
      else:
        raise ValueError(
            '"%s" requires a pipeline to be specified '
            'as there are no deferred inputs.' % self.label)
    else:
      p = self.pipeline or pipelines[0]
      for pp in pipelines:
        if p != pp:
          raise ValueError(
              'Mixing values in different pipelines is not allowed.'
              '\n{%r} != {%r}' % (p, pp))
    deferred = not getattr(p.runner, 'is_eager', False)
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam.transforms.core import Create
  # pylint: enable=wrong-import-order, wrong-import-position
  # Wrap each raw (non-PValue) input in a Create transform.
  replacements = {
      id(v): p | 'CreatePInput%s' % ix >> Create(v, reshuffle=False)
      for (ix, v) in enumerate(pvalues)
      if not isinstance(v, pvalue.PValue) and v is not None
  }
  pvalueish = _SetInputPValues().visit(pvalueish, replacements)
  self.pipeline = p
  result = p.apply(self, pvalueish, label)
  if deferred:
    return result
  # Eager execution: run the pipeline now and hand back materialized values.
  _allocate_materialized_pipeline(p)
  materialized_result = _AddMaterializationTransforms().visit(result)
  p.run().wait_until_finish()
  _release_materialized_pipeline(p)
  return _FinalizeMaterialization().visit(materialized_result)
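# A short sketch of the eager path this method enables: piping a plain
# Python list into a transform. Per the first branch above, a throwaway
# DirectRunner pipeline is created, run to completion, and the materialized
# output is returned directly rather than as a deferred PCollection.
import apache_beam as beam

squares = [1, 2, 3] | 'Square' >> beam.Map(lambda x: x * x)
print(squares)  # e.g. [1, 4, 9]; element order is not guaranteed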
def __ror__(self, left, label=None):
  """Used to apply this PTransform to non-PValues, e.g., a tuple."""
  pvalueish, pvalues = self._extract_input_pvalues(left)
  pipelines = [v.pipeline for v in pvalues if isinstance(v, pvalue.PValue)]
  if pvalues and not pipelines:
    deferred = False
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam import pipeline
    from apache_beam.utils.pipeline_options import PipelineOptions
    # pylint: enable=wrong-import-order, wrong-import-position
    p = pipeline.Pipeline('DirectRunner', PipelineOptions(sys.argv))
  else:
    if not pipelines:
      if self.pipeline is not None:
        p = self.pipeline
      else:
        raise ValueError(
            '"%s" requires a pipeline to be specified '
            'as there are no deferred inputs.' % self.label)
    else:
      p = self.pipeline or pipelines[0]
      for pp in pipelines:
        if p != pp:
          raise ValueError(
              'Mixing values from different pipelines is not allowed.')
    deferred = not getattr(p.runner, 'is_eager', False)
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam.transforms.core import Create
  # pylint: enable=wrong-import-order, wrong-import-position
  replacements = {
      id(v): p | 'CreatePInput%s' % ix >> Create(v)
      for ix, v in enumerate(pvalues)
      if not isinstance(v, pvalue.PValue) and v is not None
  }
  pvalueish = _SetInputPValues().visit(pvalueish, replacements)
  self.pipeline = p
  result = p.apply(self, pvalueish, label)
  if deferred:
    return result
  else:
    # Grab a reference to the runner's internal cache; otherwise the runner
    # may clean it up after the run.
    cache = p.runner.cache
    p.run().wait_until_finish()
    return _MaterializePValues(cache).visit(result)
def test_direct_runner_metrics():
  class MyDoFn(beam.DoFn):
    def start_bundle(self):
      count = metric.Metrics.counter(self.__class__, "bundles")
      count.inc()

    def finish_bundle(self):
      count = metric.Metrics.counter(self.__class__, "finished_bundles")
      count.inc()

    def process(self, element):
      gauge = metric.Metrics.gauge(self.__class__, "latest_element")
      gauge.set(element)
      count = metric.Metrics.counter(self.__class__, "elements")
      count.inc()
      distro = metric.Metrics.distribution(self.__class__, "element_dist")
      distro.update(element)
      return [element]

  p = pipeline.Pipeline(gke_direct.GkeDirectRunner())
  pcoll = (p
           | beam.Create([1, 2, 3, 4, 5], reshuffle=False)
           | "Do" >> beam.ParDo(MyDoFn()))
  util.assert_that(pcoll, util.equal_to([1, 2, 3, 4, 5]))
  result = p.run()
  result.wait_until_finish()
  metrics = result.metrics().query()
  namespace = "{}.{}".format(MyDoFn.__module__, MyDoFn.__name__)

  hc.assert_that(
      metrics["counters"],
      hc.contains_inanyorder(
          execution.MetricResult(
              execution.MetricKey(
                  "Do", metricbase.MetricName(namespace, "elements")),
              5, 5),
          execution.MetricResult(
              execution.MetricKey(
                  "Do", metricbase.MetricName(namespace, "bundles")),
              1, 1),
          execution.MetricResult(
              execution.MetricKey(
                  "Do", metricbase.MetricName(namespace, "finished_bundles")),
              1, 1)))
  hc.assert_that(
      metrics["distributions"],
      hc.contains_inanyorder(
          execution.MetricResult(
              execution.MetricKey(
                  "Do", metricbase.MetricName(namespace, "element_dist")),
              cells.DistributionResult(cells.DistributionData(15, 5, 1, 5)),
              cells.DistributionResult(cells.DistributionData(15, 5, 1, 5)))))
  gauge_result = metrics["gauges"][0]
  hc.assert_that(
      gauge_result.key,
      hc.equal_to(
          execution.MetricKey(
              "Do", metricbase.MetricName(namespace, "latest_element"))))
  hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
  hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
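# A brief sketch of narrowing a metrics query with a filter instead of
# scanning every result, written against the standard Beam metrics API
# (MetricsFilter with its with_step/with_name builders). The step and
# metric names match the "Do" / "elements" labels in the test above;
# `result` is a PipelineResult such as the one returned by p.run().
from apache_beam.metrics.metric import MetricsFilter

def query_elements_counter(result):
  filtered = result.metrics().query(
      MetricsFilter().with_step('Do').with_name('elements'))
  # For the pipeline above this should yield the committed count [5].
  return [c.committed for c in filtered['counters']]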