def from_runner_api(proto, runner, options, return_context=False):
    """For internal use only; no backwards-compatibility guarantees.

    Builds a Pipeline from a runner API proto.

    Args:
      proto: pipeline proto exposing ``components`` and ``root_transform_ids``;
        exactly one root transform id is expected (the unpacking below fails
        otherwise).
      runner: runner passed through to the Pipeline constructor.
      options: options passed through to the Pipeline constructor.
      return_context: when true, also return the PipelineContext that was
        built from ``proto.components``.

    Returns:
      The pipeline, or a ``(pipeline, context)`` tuple if ``return_context``.

    Raises:
      ValueError: if a deserialized pcollection has no producer.
    """
    from apache_beam.runners import pipeline_context

    p = Pipeline(runner=runner, options=options)
    context = pipeline_context.PipelineContext(proto.components)

    (root_transform_id,) = proto.root_transform_ids
    root_transform = context.transforms.get_by_id(root_transform_id)
    p.transforms_stack = [root_transform]

    # TODO(robertwb): These are only needed to continue construction. Omit?
    p.applied_labels = {
        t.unique_name for t in proto.components.transforms.values()
    }

    # Attach every deserialized pcollection to the new pipeline and make sure
    # each one knows which transform produced it.
    for pcoll_id in proto.components.pcollections:
        pcoll = context.pcollections.get_by_id(pcoll_id)
        pcoll.pipeline = p
        if not pcoll.producer:
            raise ValueError('No producer for %s' % pcoll_id)

    # Inject PBegin input where necessary.
    from apache_beam.io.iobase import Read
    from apache_beam.transforms.core import Create
    has_pbegin = (Read, Create)
    for transform_id in proto.components.transforms:
        applied = context.transforms.get_by_id(transform_id)
        needs_pbegin = (
            not applied.inputs and applied.transform.__class__ in has_pbegin)
        if needs_pbegin:
            applied.inputs = (pvalue.PBegin(p),)

    return (p, context) if return_context else p
def test_root_transforms(self):
    """Check the visitor's root transforms, PBegin consumers and step count.

    Applies a Create and a Read to a PBegin, a FlatMap to the Create output
    and a Flatten to an empty list, then visits the pipeline.
    """
    root_create = Create('create', [[1, 2, 3]])

    class DummySource(iobase.BoundedSource):
        pass

    root_read = Read('read', DummySource())
    root_flatten = Flatten('flatten', pipeline=self.pipeline)

    pbegin = pvalue.PBegin(self.pipeline)
    pcoll_create = pbegin | root_create
    pbegin | root_read
    pcoll_create | FlatMap(lambda x: x)
    [] | root_flatten

    self.pipeline.visit(self.visitor)

    observed_roots = sorted(
        applied.transform for applied in self.visitor.root_transforms)
    expected_roots = sorted([root_read, root_create, root_flatten])
    self.assertEqual(observed_roots, expected_roots)

    observed_consumers = sorted(
        consumer.transform
        for consumer in self.visitor.value_to_consumers[pbegin])
    expected_consumers = sorted([root_read, root_create])
    self.assertEqual(observed_consumers, expected_consumers)

    self.assertEqual(len(self.visitor.step_names), 4)
def finish_bundle(self):
    """Read from PubSub and package the messages into an output bundle.

    Each ``(timestamp, message)`` pair returned by ``_read_from_pubsub`` is
    emitted in the global window at its own timestamp; the element is the
    whole message when ``self.source.with_attributes`` is set, otherwise only
    ``message.payload``.

    Returns:
      A TransformResult holding the output bundle (if anything was read), one
      unprocessed bundle created for this transform's input, and the current
      wall-clock time under the ``None`` key of the final dict.
    """
    messages = self._read_from_pubsub(self.source.timestamp_attribute)

    bundles = []
    if messages:
        output_pcollection = list(self._outputs)[0]
        bundle = self._evaluation_context.create_bundle(output_pcollection)
        # TODO(ccy): Respect the PubSub source's id_label field.
        for timestamp, message in messages:
            element = (
                message if self.source.with_attributes else message.payload)
            bundle.output(
                GlobalWindows.windowed_value(element, timestamp=timestamp))
        bundles.append(bundle)

    # Use the transform's first input when it has one; otherwise stand in a
    # PBegin for the owning pipeline.
    applied_inputs = self._applied_ptransform.inputs
    if applied_inputs:
        input_pvalue = applied_inputs[0]
    else:
        input_pvalue = pvalue.PBegin(
            self._applied_ptransform.transform.pipeline)
    unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)

    # TODO(udim): Correct value for watermark hold.
    holds = {None: Timestamp.of(time.time())}
    return TransformResult(self, bundles, [unprocessed_bundle], None, holds)
def _extract_input_pvalues(self, pvalueish):
    """Extract all the pvalues contained in the input pvalueish.

    A Pipeline argument is first wrapped in a PBegin. Tuples and dict values
    are then walked recursively; anything else counts as a leaf.

    Generally only needs to be overridden for multi-input PTransforms.

    Returns:
      A pair ``(pvalueish, leaves)`` where ``leaves`` is a flat tuple of the
      leaf values. pvalueish is returned as well because the input may have
      to be copied, as inspection may be destructive.
    """
    # pylint: disable=wrong-import-order
    from apache_beam import pipeline
    # pylint: enable=wrong-import-order
    if isinstance(pvalueish, pipeline.Pipeline):
        pvalueish = pvalue.PBegin(pvalueish)

    def _dict_tuple_leaves(node):
        # Pick the children of a container; a non-container is itself a leaf.
        if isinstance(node, tuple):
            children = node
        elif isinstance(node, dict):
            children = node.values()
        else:
            yield node
            return
        for child in children:
            for leaf in _dict_tuple_leaves(child):
                yield leaf

    flat_inputs = tuple(_dict_tuple_leaves(pvalueish))
    return pvalueish, flat_inputs
def get_root_bundles(self):
    """Return the single committed root bundle for this test stream.

    The bundle is created for a PBegin of the transform's pipeline and holds
    one element, the value of ``test_stream.begin()``, in the global window
    at MIN_TIMESTAMP.
    """
    test_stream = self._applied_ptransform.transform
    root_input = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
    bundle = self._evaluation_context.create_bundle(root_input)

    first_element = GlobalWindows.windowed_value(
        test_stream.begin(), timestamp=MIN_TIMESTAMP)
    bundle.add(first_element)
    bundle.commit(None)
    return [bundle]
def get_root_bundles(self):
    """Return the root bundles for this test stream.

    Returns an empty list when the test stream has no events. Otherwise
    returns one committed bundle containing the element ``0`` in the global
    window.
    """
    test_stream = self._applied_ptransform.transform
    if len(test_stream.events) == 0:
        return []

    root_input = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
    bundle = self._evaluation_context.create_bundle(root_input)
    # Explicitly set timestamp to MIN_TIMESTAMP to ensure that we hold the
    # watermark.
    bundle.add(GlobalWindows.windowed_value(0, timestamp=MIN_TIMESTAMP))
    bundle.commit(None)
    return [bundle]
def finish_bundle(self):
    """Finish the bundle, scheduling the next event index if any remain.

    While ``current_index`` is not the last index of
    ``self.test_stream.events``, an unprocessed bundle carrying
    ``current_index + 1`` (timestamped at ``self.watermark``) is returned and
    the watermark is reported as the hold. Otherwise there are no unprocessed
    bundles and the hold is None.
    """
    pending = []
    watermark_hold = None

    has_more_events = self.current_index < len(self.test_stream.events) - 1
    if has_more_events:
        root_input = pvalue.PBegin(
            self._applied_ptransform.transform.pipeline)
        next_bundle = self._evaluation_context.create_bundle(root_input)
        next_bundle.add(
            GlobalWindows.windowed_value(
                self.current_index + 1, timestamp=self.watermark))
        pending.append(next_bundle)
        watermark_hold = self.watermark

    return TransformResult(
        self._applied_ptransform,
        self.bundles,
        pending,
        None,
        {None: watermark_hold})
def finish_bundle(self):
    """Finish the bundle, scheduling the next test-stream index if needed.

    Asks the test stream for the index following ``current_index``. Unless
    the stream reports that index as its end, an unprocessed bundle carrying
    it (timestamped at ``self.watermark``) is included in the result.
    """
    next_index = self.test_stream.next(self.current_index)

    pending = []
    if not self.test_stream.end(next_index):
        root_input = pvalue.PBegin(
            self._applied_ptransform.transform.pipeline)
        next_bundle = self._evaluation_context.create_bundle(root_input)
        next_bundle.add(
            GlobalWindows.windowed_value(
                next_index, timestamp=self.watermark))
        pending.append(next_bundle)

    # Returning the watermark in the dict here is used as a watermark hold.
    holds = {None: self.watermark}
    return TransformResult(self, self.bundles, pending, None, holds)
def finish_bundle(self):
    """Finish the bundle, re-queuing a heartbeat element unless done.

    Returns a TransformResult with ``self.bundles`` as output, at most one
    unprocessed bundle, and ``self.watermark`` as the hold.
    """
    pending = []

    # Continue to send its own state to itself via an unprocessed bundle.
    # This acts as a heartbeat, where each element will read the next event
    # from the event stream.
    if not self.is_done:
        root_input = pvalue.PBegin(
            self._applied_ptransform.transform.pipeline)
        heartbeat = self._evaluation_context.create_bundle(root_input)
        heartbeat.add(
            GlobalWindows.windowed_value(b'', timestamp=self.watermark))
        pending.append(heartbeat)

    # Returning the watermark in the dict here is used as a watermark hold.
    holds = {None: self.watermark}
    return TransformResult(self, self.bundles, pending, None, holds)
def get_root_bundles(self):
    """Set up the shared event stream and return the initial root bundle.

    Stores the event stream on ``_TestStreamEvaluator.event_stream`` (a
    class-level attribute), then returns one committed bundle containing an
    empty bytes element at MIN_TIMESTAMP.
    """
    test_stream = self._applied_ptransform.transform

    # If there was an endpoint defined then get the events from the
    # TestStreamService; otherwise replay the events scripted on the
    # transform.
    if test_stream.endpoint:
        events = _TestStream.events_from_rpc(
            test_stream.endpoint,
            test_stream.output_tags,
            test_stream.coder)
    else:
        events = _TestStream.events_from_script(test_stream._events)
    _TestStreamEvaluator.event_stream = events

    root_input = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
    bundle = self._evaluation_context.create_bundle(root_input)
    bundle.add(GlobalWindows.windowed_value(b'', timestamp=MIN_TIMESTAMP))
    bundle.commit(None)
    return [bundle]
def test_root_transforms(self):
    """Check the visitor's root transforms, PBegin consumers and step count.

    Applies an Impulse to a PBegin, a FlatMap to its output and a Flatten to
    an empty list, then visits the pipeline.
    """
    root_read = beam.Impulse()
    root_flatten = Flatten(pipeline=self.pipeline)

    pbegin = pvalue.PBegin(self.pipeline)
    pcoll_read = pbegin | 'read' >> root_read
    pcoll_read | FlatMap(lambda x: x)
    [] | 'flatten' >> root_flatten

    self.pipeline.visit(self.visitor)

    observed_roots = [
        applied.transform for applied in self.visitor.root_transforms
    ]
    self.assertCountEqual(observed_roots, [root_read, root_flatten])

    observed_consumers = [
        consumer.transform
        for consumer in self.visitor.value_to_consumers[pbegin]
    ]
    self.assertCountEqual(observed_consumers, [root_read])

    self.assertEqual(len(self.visitor.step_names), 3)
def _extract_input_pvalues(self, pvalueish):
    """Extract all the pvalues contained in the input pvalueish.

    A Pipeline argument is first wrapped in a PBegin. The nested values are
    then enumerated with ``get_named_nested_pvalues(..., as_inputs=True)``.

    Generally only needs to be overridden for multi-input PTransforms.

    Returns:
      A pair ``(pvalueish, inputs)`` where ``inputs`` is a dict mapping
      ``str(tag)`` to the corresponding value. pvalueish is returned as well
      because the input may have to be copied, as inspection may be
      destructive.
    """
    # pylint: disable=wrong-import-order
    from apache_beam import pipeline
    # pylint: enable=wrong-import-order
    if isinstance(pvalueish, pipeline.Pipeline):
        pvalueish = pvalue.PBegin(pvalueish)

    named_inputs = {}
    for tag, value in get_named_nested_pvalues(pvalueish, as_inputs=True):
        named_inputs[str(tag)] = value
    return pvalueish, named_inputs
def finish_bundle(self):
    """Read from PubSub and package the messages into an output bundle.

    Every message returned by ``_read_from_pubsub`` is emitted in the global
    window, all stamped with the same wall-clock time taken once before the
    loop.

    Returns:
      A TransformResult holding the output bundle (if anything was read), one
      unprocessed bundle created for this transform's input, and the current
      wall-clock time under the ``None`` key of the final dict.
    """
    messages = self._read_from_pubsub()

    bundles = []
    if messages:
        output_pcollection = list(self._outputs)[0]
        bundle = self._evaluation_context.create_bundle(output_pcollection)
        # TODO(ccy): we currently do not use the PubSub message timestamp or
        # respect the PubSub source's id_label field.
        now = Timestamp.of(time.time())
        for message_data in messages:
            bundle.output(
                GlobalWindows.windowed_value(message_data, timestamp=now))
        bundles.append(bundle)

    # Use the transform's first input when it has one; otherwise stand in a
    # PBegin for the owning pipeline.
    applied_inputs = self._applied_ptransform.inputs
    if applied_inputs:
        input_pvalue = applied_inputs[0]
    else:
        input_pvalue = pvalue.PBegin(
            self._applied_ptransform.transform.pipeline)
    unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)

    holds = {None: Timestamp.of(time.time())}
    return TransformResult(
        self._applied_ptransform, bundles, [unprocessed_bundle], None, holds)
def test_root_transforms(self):
    """Check the visitor's root transforms, PBegin consumers and step count.

    Applies a Read of a dummy bounded source to a PBegin, a FlatMap to its
    output and a Flatten to an empty list, then visits the pipeline.
    """
    class DummySource(iobase.BoundedSource):
        pass

    root_read = Read(DummySource())
    root_flatten = Flatten(pipeline=self.pipeline)

    pbegin = pvalue.PBegin(self.pipeline)
    pcoll_read = pbegin | 'read' >> root_read
    pcoll_read | FlatMap(lambda x: x)
    [] | 'flatten' >> root_flatten

    self.pipeline.visit(self.visitor)

    observed_roots = sorted(
        applied.transform for applied in self.visitor.root_transforms)
    expected_roots = sorted([root_read, root_flatten])
    self.assertEqual(observed_roots, expected_roots)

    observed_consumers = sorted(
        consumer.transform
        for consumer in self.visitor.value_to_consumers[pbegin])
    expected_consumers = sorted([root_read])
    self.assertEqual(observed_consumers, expected_consumers)

    self.assertEqual(len(self.visitor.step_names), 3)
def get_root_bundles(self):
    """Return a single empty committed bundle for this root transform.

    The bundle is created by the evaluation context for a PBegin of the
    transform's pipeline.
    """
    owning_pipeline = self._applied_ptransform.transform.pipeline
    root_input = pvalue.PBegin(owning_pipeline)
    return [
        self._evaluation_context.create_empty_committed_bundle(root_input)
    ]
def _replace_if_needed(self, original_transform_node):
    """Replace ``original_transform_node`` with its override, if one matches.

    Does nothing when the enclosing ``override`` does not match the node, or
    when the override hands back the very same transform object. Otherwise a
    new AppliedPTransform is built for the replacement transform, swapped
    into the original node's position, expanded, and its output recorded in
    the enclosing ``output_map`` keyed by the original node's output.

    Args:
      original_transform_node: the AppliedPTransform being considered.

    Raises:
      NotImplementedError: if the replacement node has more than one input,
        or if either the original or the replacement does not produce
        exactly one PCollection/PDone output.
    """
    if not override.matches(original_transform_node):
        return

    assert isinstance(original_transform_node, AppliedPTransform)
    replacement_transform = override.get_replacement_transform(
        original_transform_node.transform)
    if replacement_transform is original_transform_node.transform:
        return

    replacement_transform_node = AppliedPTransform(
        original_transform_node.parent,
        replacement_transform,
        original_transform_node.full_label,
        original_transform_node.inputs)

    # Transform execution could depend on order in which nodes are
    # considered. Hence we insert the replacement transform node to same
    # index as the original transform node. Note that this operation
    # removes the original transform node.
    if original_transform_node.parent:
        assert isinstance(original_transform_node.parent, AppliedPTransform)
        parent_parts = original_transform_node.parent.parts
        parent_parts[parent_parts.index(original_transform_node)] = (
            replacement_transform_node)
    else:
        # Original transform has to be a root.
        roots = self.pipeline.transforms_stack[0].parts
        assert original_transform_node in roots
        roots[roots.index(original_transform_node)] = (
            replacement_transform_node)

    inputs = replacement_transform_node.inputs
    # TODO: Support replacing PTransforms with multiple inputs.
    if len(inputs) > 1:
        # Both values must be passed as one tuple: a bare
        # `'...' % node, len(inputs)` formats with a single argument and
        # fails with TypeError before NotImplementedError can be raised.
        raise NotImplementedError(
            'PTransform overriding is only supported for PTransforms that '
            'have a single input. Tried to replace input of '
            'AppliedPTransform %r that has %d inputs' %
            (original_transform_node, len(inputs)))
    elif len(inputs) == 1:
        input_node = inputs[0]
    else:
        # A root transform has no inputs; begin from the pipeline itself
        # (not from this visitor object).
        input_node = pvalue.PBegin(self.pipeline)

    # We have to add the new AppliedTransform to the stack before expand()
    # and pop it out later to make sure that parts get added correctly.
    self.pipeline.transforms_stack.append(replacement_transform_node)

    # Keeping the same label for the replaced node but recursively
    # removing labels of child transforms of original transform since they
    # will be replaced during the expand below. This is needed in case
    # the replacement contains children that have labels that conflicts
    # with labels of the children of the original.
    self.pipeline._remove_labels_recursively(original_transform_node)

    new_output = replacement_transform.expand(input_node)
    # Clear the element type before asking the pipeline to infer the result
    # type for the replacement output.
    new_output.element_type = None
    self.pipeline._infer_result_type(
        replacement_transform, inputs, new_output)

    replacement_transform_node.add_output(new_output)
    if not new_output.producer:
        new_output.producer = replacement_transform_node

    # We only support replacing transforms with a single output with
    # another transform that produces a single output.
    # TODO: Support replacing PTransforms with multiple outputs.
    if (len(original_transform_node.outputs) > 1 or
            not isinstance(
                original_transform_node.outputs[None],
                (PCollection, PDone)) or
            not isinstance(new_output, (PCollection, PDone))):
        raise NotImplementedError(
            'PTransform overriding is only supported for PTransforms that '
            'have a single output. Tried to replace output of '
            'AppliedPTransform %r with %r.' %
            (original_transform_node, new_output))

    # Recording updated outputs. This cannot be done in the same visitor
    # since if we dynamically update output type here, we'll run into
    # errors when visiting child nodes.
    output_map[original_transform_node.outputs[None]] = new_output

    self.pipeline.transforms_stack.pop()