def test_visit_entire_graph(self):
    """Visit a small branching pipeline and check what the visitor records.

    The pipeline is an Impulse root feeding 'do1', whose output fans out to
    'do2' and 'do3'; a CustomTransform is then applied to the 'do3' output.
    """
    p = Pipeline()
    root = p | 'pcoll' >> beam.Impulse()
    shared = root | 'do1' >> FlatMap(lambda x: [x + 1])
    branch_a = shared | 'do2' >> FlatMap(lambda x: [x + 1])
    branch_b = shared | 'do3' >> FlatMap(lambda x: [x + 1])
    custom = PipelineTest.CustomTransform()
    custom_out = branch_b | custom

    recorder = PipelineTest.Visitor(visited=[])
    p.visit(recorder)

    # Every PCollection produced above must have been visited.
    expected_values = {root, shared, branch_a, branch_b, custom_out}
    self.assertEqual(expected_values, set(recorder.visited))

    # Each composite that was entered must also have been left.
    self.assertEqual(
        set(recorder.enter_composite), set(recorder.leave_composite))
    self.assertEqual(2, len(recorder.enter_composite))
    # CustomTransform is expected to be the second composite entered and
    # the first one left.
    self.assertEqual(recorder.enter_composite[1].transform, custom)
    self.assertEqual(recorder.leave_composite[0].transform, custom)
def test_visit_entire_graph(self):
    """Visit a small branching pipeline and check what the visitor records.

    The pipeline is a Create root feeding 'do1', whose output fans out to
    'do2' and 'do3'; a CustomTransform is then applied to the 'do3' output.
    """
    pipeline = Pipeline()
    pcoll1 = pipeline | 'pcoll' >> Create([1, 2, 3])
    pcoll2 = pcoll1 | 'do1' >> FlatMap(lambda x: [x + 1])
    pcoll3 = pcoll2 | 'do2' >> FlatMap(lambda x: [x + 1])
    pcoll4 = pcoll2 | 'do3' >> FlatMap(lambda x: [x + 1])
    transform = PipelineTest.CustomTransform()
    pcoll5 = pcoll4 | transform

    visitor = PipelineTest.Visitor(visited=[])
    pipeline.visit(visitor)

    # A set literal avoids building a throwaway list just to feed set().
    self.assertEqual(
        {pcoll1, pcoll2, pcoll3, pcoll4, pcoll5}, set(visitor.visited))
    # Each composite that was entered must also have been left.
    self.assertEqual(
        set(visitor.enter_composite), set(visitor.leave_composite))
    self.assertEqual(3, len(visitor.enter_composite))
    # CustomTransform is expected to be the third composite entered and the
    # second one left.
    self.assertEqual(visitor.enter_composite[2].transform, transform)
    self.assertEqual(visitor.leave_composite[1].transform, transform)
def test_visitor_not_sorted(self):
    """Consumer tracking must not depend on the order of subtransforms.

    The same pipeline is serialized twice; in one copy the first two
    subtransforms of the first root transform are swapped. Both copies are
    rebuilt and visited, and the stringified value-to-consumers mappings
    are required to be equal.
    """
    def consumer_labels(graph):
        # Rebuild a pipeline from the proto, visit it, and stringify the
        # recorded mapping so the two pipelines can be compared.
        rebuilt = beam.Pipeline().from_runner_api(
            graph, runner='BundleBasedDirectRunner', options=None)
        tracker = ConsumerTrackingPipelineVisitor()
        rebuilt.visit(tracker)
        consumers_by_value = tracker.value_to_consumers
        return {
            str(value): [str(node) for node in consumers_by_value[value]]
            for value in consumers_by_value
        }

    source = Pipeline()
    # pylint: disable=expression-not-assigned
    from apache_beam.testing.test_stream import TestStream
    source | TestStream().add_elements(['']) | beam.Map(lambda _: _)

    reference_graph = source.to_runner_api(return_context=False)
    shuffled_graph = source.to_runner_api(return_context=False)

    # Swap the first two subtransforms of the first root transform.
    first_root_id = shuffled_graph.root_transform_ids[0]
    subs = shuffled_graph.components.transforms[first_root_id].subtransforms
    subs[0], subs[1] = subs[1], subs[0]

    # Convert to string to assert they are equal.
    shuffled_labels = consumer_labels(shuffled_graph)
    reference_labels = consumer_labels(reference_graph)
    self.assertDictEqual(shuffled_labels, reference_labels)
class ConsumerTrackingPipelineVisitorTest(unittest.TestCase):
    """Exercise ConsumerTrackingPipelineVisitor on small pipelines."""

    def setUp(self):
        """Create a fresh DirectRunner pipeline and visitor for each test."""
        self.pipeline = Pipeline(DirectRunner())
        self.visitor = ConsumerTrackingPipelineVisitor()

    def test_root_transforms(self):
        """A Read on PBegin and an input-less Flatten are both roots."""

        class DummySource(iobase.BoundedSource):
            pass

        root_read = Read(DummySource())
        root_flatten = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        pcoll_read = pbegin | 'read' >> root_read
        pcoll_read | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        # Sort by id() so the order-insensitive comparison does not rely on
        # the transform objects themselves supporting "<".
        root_transforms = sorted(
            (t.transform for t in self.visitor.root_transforms), key=id)
        self.assertEqual(
            root_transforms, sorted([root_read, root_flatten], key=id))

        pbegin_consumers = sorted(
            (c.transform for c in self.visitor.value_to_consumers[pbegin]),
            key=id)
        self.assertEqual(pbegin_consumers, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)

    def test_side_inputs(self):
        """A ParDo consuming an AsList side input yields exactly one view."""

        class SplitNumbersFn(DoFn):
            def process(self, element):
                # Negative numbers go to the tagged output, the rest to main.
                if element < 0:
                    yield pvalue.OutputValue('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        class DummySource(iobase.BoundedSource):
            pass

        root_read = Read(DummySource())

        result = (
            self.pipeline
            | 'read' >> root_read
            | ParDo(SplitNumbersFn()).with_outputs(
                'tag_negative', main='positive'))
        positive, negative = result
        positive | ParDo(ProcessNumbersFn(), AsList(negative))
        self.pipeline.visit(self.visitor)

        # Sort by id() for the same reason as in test_root_transforms.
        root_transforms = sorted(
            (t.transform for t in self.visitor.root_transforms), key=id)
        self.assertEqual(root_transforms, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)
        self.assertEqual(len(self.visitor.views), 1)
        # assertIsInstance reports the offending object on failure.
        self.assertIsInstance(self.visitor.views[0], pvalue.AsList)

    def test_co_group_by_key(self):
        """Two Create roots feeding a CoGroupByKey produce no views."""
        emails = self.pipeline | 'email' >> Create([('joe', '*****@*****.**')])
        phones = self.pipeline | 'phone' >> Create([('mary', '111-222-3333')])
        {'emails': emails, 'phones': phones} | CoGroupByKey()

        self.pipeline.visit(self.visitor)

        # Only the count is checked, so no sorting is needed here.
        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(len(root_transforms), 2)
        self.assertGreater(
            len(self.visitor.step_names), 3)  # 2 creates + expanded CoGBK
        self.assertEqual(len(self.visitor.views), 0)
class ConsumerTrackingPipelineVisitorTest(unittest.TestCase):
    """Exercise ConsumerTrackingPipelineVisitor on small pipelines."""

    def setUp(self):
        """Create a fresh DirectRunner pipeline and visitor for each test."""
        self.pipeline = Pipeline(DirectRunner())
        self.visitor = ConsumerTrackingPipelineVisitor()

    def test_root_transforms(self):
        """A Read on PBegin and an input-less Flatten are both roots."""

        class DummySource(iobase.BoundedSource):
            pass

        root_read = Read(DummySource())
        root_flatten = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        pcoll_read = pbegin | 'read' >> root_read
        pcoll_read | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        # Sort by id() so the order-insensitive comparison does not rely on
        # the transform objects themselves supporting "<".
        root_transforms = sorted(
            (t.transform for t in self.visitor.root_transforms), key=id)
        self.assertEqual(
            root_transforms, sorted([root_read, root_flatten], key=id))

        pbegin_consumers = sorted(
            (c.transform for c in self.visitor.value_to_consumers[pbegin]),
            key=id)
        self.assertEqual(pbegin_consumers, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)

    def test_side_inputs(self):
        """A ParDo consuming an AsList side input yields exactly one view."""

        class SplitNumbersFn(DoFn):
            def process(self, element):
                # Negative numbers go to the tagged output, the rest to main.
                if element < 0:
                    yield pvalue.OutputValue('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        class DummySource(iobase.BoundedSource):
            pass

        root_read = Read(DummySource())

        result = (
            self.pipeline
            | 'read' >> root_read
            | ParDo(SplitNumbersFn()).with_outputs(
                'tag_negative', main='positive'))
        positive, negative = result
        positive | ParDo(ProcessNumbersFn(), AsList(negative))
        self.pipeline.visit(self.visitor)

        # Sort by id() for the same reason as in test_root_transforms.
        root_transforms = sorted(
            (t.transform for t in self.visitor.root_transforms), key=id)
        self.assertEqual(root_transforms, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)
        self.assertEqual(len(self.visitor.views), 1)
        # assertIsInstance reports the offending object on failure.
        self.assertIsInstance(self.visitor.views[0], pvalue.AsList)

    def test_co_group_by_key(self):
        """Two Create roots feeding a CoGroupByKey produce no views."""
        emails = self.pipeline | 'email' >> Create([('joe', '*****@*****.**')])
        phones = self.pipeline | 'phone' >> Create([('mary', '111-222-3333')])
        {'emails': emails, 'phones': phones} | CoGroupByKey()

        self.pipeline.visit(self.visitor)

        # Only the count is checked, so no sorting is needed here.
        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(len(root_transforms), 2)
        self.assertGreater(
            len(self.visitor.step_names), 3)  # 2 creates + expanded CoGBK
        self.assertEqual(len(self.visitor.views), 0)
class ConsumerTrackingPipelineVisitorTest(unittest.TestCase):
    """Exercise ConsumerTrackingPipelineVisitor on small pipelines."""

    def setUp(self):
        """Create a fresh DirectRunner pipeline and visitor for each test."""
        self.pipeline = Pipeline(DirectRunner())
        self.visitor = ConsumerTrackingPipelineVisitor()

    def test_root_transforms(self):
        """An Impulse on PBegin and an input-less Flatten are both roots."""
        root_read = beam.Impulse()
        root_flatten = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        pcoll_read = pbegin | 'read' >> root_read
        pcoll_read | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertCountEqual(root_transforms, [root_read, root_flatten])

        pbegin_consumers = [
            c.transform for c in self.visitor.value_to_consumers[pbegin]
        ]
        self.assertCountEqual(pbegin_consumers, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)

    def test_side_inputs(self):
        """One AsList side input shared by two ParDos yields one view."""

        class SplitNumbersFn(DoFn):
            def process(self, element):
                # Negative numbers go to the tagged output, the rest to main.
                if element < 0:
                    yield pvalue.TaggedOutput('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        def _process_numbers(pcoll, negatives):
            # Apply two chained ParDos that both take the same side input,
            # then flatten their outputs together.
            first_output = (
                pcoll
                | 'process numbers step 1' >> ParDo(
                    ProcessNumbersFn(), negatives))
            second_output = (
                first_output
                | 'process numbers step 2' >> ParDo(
                    ProcessNumbersFn(), negatives))
            output_pc = ((first_output, second_output)
                         | 'flatten results' >> beam.Flatten())
            return output_pc

        root_read = beam.Impulse()

        result = (
            self.pipeline
            | 'read' >> root_read
            | ParDo(SplitNumbersFn()).with_outputs(
                'tag_negative', main='positive'))
        positive, negative = result
        _process_numbers(positive, AsList(negative))
        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(root_transforms, [root_read])
        self.assertEqual(len(self.visitor.step_names), 5)
        self.assertEqual(len(self.visitor.views), 1)
        # assertIsInstance reports the offending object on failure.
        self.assertIsInstance(self.visitor.views[0], pvalue.AsList)

    def test_co_group_by_key(self):
        """Two Create roots feeding a CoGroupByKey produce no views."""
        emails = self.pipeline | 'email' >> Create([('joe', '*****@*****.**')])
        phones = self.pipeline | 'phone' >> Create([('mary', '111-222-3333')])
        {'emails': emails, 'phones': phones} | CoGroupByKey()

        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(len(root_transforms), 2)
        self.assertGreater(
            len(self.visitor.step_names), 3)  # 2 creates + expanded CoGBK
        self.assertEqual(len(self.visitor.views), 0)

    def test_visitor_not_sorted(self):
        """Consumer tracking must not depend on the order of subtransforms.

        The same pipeline is serialized twice; in one copy the first two
        subtransforms of the first root transform are swapped. Both copies
        are rebuilt and visited, and the stringified value-to-consumers
        mappings are required to be equal.
        """
        p = Pipeline()
        # pylint: disable=expression-not-assigned
        from apache_beam.testing.test_stream import TestStream
        p | TestStream().add_elements(['']) | beam.Map(lambda _: _)

        original_graph = p.to_runner_api(return_context=False)
        out_of_order_graph = p.to_runner_api(return_context=False)

        # Swap the first two subtransforms of the first root transform.
        root_id = out_of_order_graph.root_transform_ids[0]
        root = out_of_order_graph.components.transforms[root_id]
        root.subtransforms[0], root.subtransforms[1] = (
            root.subtransforms[1], root.subtransforms[0])

        p = beam.Pipeline().from_runner_api(
            out_of_order_graph, runner='BundleBasedDirectRunner', options=None)
        v_out_of_order = ConsumerTrackingPipelineVisitor()
        p.visit(v_out_of_order)

        p = beam.Pipeline().from_runner_api(
            original_graph, runner='BundleBasedDirectRunner', options=None)
        v_original = ConsumerTrackingPipelineVisitor()
        p.visit(v_original)

        # Convert to string to assert they are equal.
        out_of_order_labels = {
            str(k): [str(t) for t in v_out_of_order.value_to_consumers[k]]
            for k in v_out_of_order.value_to_consumers
        }

        original_labels = {
            str(k): [str(t) for t in v_original.value_to_consumers[k]]
            for k in v_original.value_to_consumers
        }
        self.assertDictEqual(out_of_order_labels, original_labels)
class ConsumerTrackingPipelineVisitorTest(unittest.TestCase):
    """Exercise ConsumerTrackingPipelineVisitor on small pipelines."""

    def setUp(self):
        """Create a fresh DirectRunner pipeline and visitor for each test."""
        self.pipeline = Pipeline(DirectRunner())
        self.visitor = ConsumerTrackingPipelineVisitor()
        # Alias the Python 2 assertion name so the tests below can call
        # assertCountEqual on either interpreter.
        try:  # Python 2
            self.assertCountEqual = self.assertItemsEqual
        except AttributeError:  # Python 3
            pass

    def test_root_transforms(self):
        """An Impulse on PBegin and an input-less Flatten are both roots."""
        root_read = beam.Impulse()
        root_flatten = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        pcoll_read = pbegin | 'read' >> root_read
        pcoll_read | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertCountEqual(root_transforms, [root_read, root_flatten])

        pbegin_consumers = [
            c.transform for c in self.visitor.value_to_consumers[pbegin]
        ]
        self.assertCountEqual(pbegin_consumers, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)

    def test_side_inputs(self):
        """One AsList side input shared by two ParDos yields one view."""

        class SplitNumbersFn(DoFn):
            def process(self, element):
                # Negative numbers go to the tagged output, the rest to main.
                if element < 0:
                    yield pvalue.TaggedOutput('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        def _process_numbers(pcoll, negatives):
            # Apply two chained ParDos that both take the same side input,
            # then flatten their outputs together.
            first_output = (
                pcoll
                | 'process numbers step 1' >> ParDo(
                    ProcessNumbersFn(), negatives))
            second_output = (
                first_output
                | 'process numbers step 2' >> ParDo(
                    ProcessNumbersFn(), negatives))
            output_pc = ((first_output, second_output)
                         | 'flatten results' >> beam.Flatten())
            return output_pc

        root_read = beam.Impulse()

        result = (
            self.pipeline
            | 'read' >> root_read
            | ParDo(SplitNumbersFn()).with_outputs(
                'tag_negative', main='positive'))
        positive, negative = result
        _process_numbers(positive, AsList(negative))
        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(root_transforms, [root_read])
        self.assertEqual(len(self.visitor.step_names), 5)
        self.assertEqual(len(self.visitor.views), 1)
        # assertIsInstance reports the offending object on failure.
        self.assertIsInstance(self.visitor.views[0], pvalue.AsList)

    def test_co_group_by_key(self):
        """Two Create roots feeding a CoGroupByKey produce no views."""
        emails = self.pipeline | 'email' >> Create([('joe', '*****@*****.**')])
        phones = self.pipeline | 'phone' >> Create([('mary', '111-222-3333')])
        {'emails': emails, 'phones': phones} | CoGroupByKey()

        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(len(root_transforms), 2)
        self.assertGreater(
            len(self.visitor.step_names), 3)  # 2 creates + expanded CoGBK
        self.assertEqual(len(self.visitor.views), 0)