def test_visit_entire_graph(self):
    """Visit a small branching pipeline and check what the visitor records.

    The visitor must report every PCollection produced in the graph, must
    enter and leave the same set of composites, and must see the custom
    transform as the second composite entered and the first one left.
    """
    p = Pipeline()
    impulse_out = p | 'pcoll' >> beam.Impulse()
    do1_out = impulse_out | 'do1' >> FlatMap(lambda x: [x + 1])
    # 'do2' and 'do3' both consume the output of 'do1' (a fan-out).
    do2_out = do1_out | 'do2' >> FlatMap(lambda x: [x + 1])
    do3_out = do1_out | 'do3' >> FlatMap(lambda x: [x + 1])
    custom = PipelineTest.CustomTransform()
    custom_out = do3_out | custom

    graph_visitor = PipelineTest.Visitor(visited=[])
    p.visit(graph_visitor)

    expected_pcolls = set(
        [impulse_out, do1_out, do2_out, do3_out, custom_out])
    self.assertEqual(expected_pcolls, set(graph_visitor.visited))

    entered = graph_visitor.enter_composite
    left = graph_visitor.leave_composite
    self.assertEqual(set(entered), set(left))
    self.assertEqual(2, len(entered))
    self.assertEqual(entered[1].transform, custom)
    self.assertEqual(left[0].transform, custom)
  def test_visit_entire_graph(self):
    """Visit a branching pipeline rooted at a Create and check the visitor.

    Every PCollection built below must be reported as visited, the sets of
    entered and left composites must match, and the custom transform must
    be the third composite entered and the second one left.
    """
    p = Pipeline()
    created = p | 'pcoll' >> Create([1, 2, 3])
    do1_out = created | 'do1' >> FlatMap(lambda x: [x + 1])
    # 'do2' and 'do3' both consume the output of 'do1' (a fan-out).
    do2_out = do1_out | 'do2' >> FlatMap(lambda x: [x + 1])
    do3_out = do1_out | 'do3' >> FlatMap(lambda x: [x + 1])
    custom = PipelineTest.CustomTransform()
    custom_out = do3_out | custom

    graph_visitor = PipelineTest.Visitor(visited=[])
    p.visit(graph_visitor)

    expected_pcolls = {created, do1_out, do2_out, do3_out, custom_out}
    self.assertEqual(expected_pcolls, set(graph_visitor.visited))

    entered = graph_visitor.enter_composite
    left = graph_visitor.leave_composite
    self.assertEqual(set(entered), set(left))
    self.assertEqual(3, len(entered))
    self.assertEqual(entered[2].transform, custom)
    self.assertEqual(left[1].transform, custom)
Exemple #3
0
    def test_visitor_not_sorted(self):
        """Check the visitor's consumer map ignores root subtransform order.

        The same pipeline is serialized twice; in one copy the first two
        subtransforms of the root are swapped. Both copies are rebuilt and
        visited, and the stringified value-to-consumers maps must be equal.
        """
        pipeline = Pipeline()
        # pylint: disable=expression-not-assigned
        from apache_beam.testing.test_stream import TestStream
        pipeline | TestStream().add_elements(['']) | beam.Map(lambda _: _)

        original_graph = pipeline.to_runner_api(return_context=False)
        out_of_order_graph = pipeline.to_runner_api(return_context=False)

        # Swap the first two subtransforms of the root in the second copy.
        root_id = out_of_order_graph.root_transform_ids[0]
        root = out_of_order_graph.components.transforms[root_id]
        first, second = root.subtransforms[0], root.subtransforms[1]
        root.subtransforms[0] = second
        root.subtransforms[1] = first

        def visit_rebuilt(graph_proto):
            # Rebuild a pipeline from the proto and run a fresh visitor on it.
            rebuilt = beam.Pipeline().from_runner_api(
                graph_proto, runner='BundleBasedDirectRunner', options=None)
            tracker = ConsumerTrackingPipelineVisitor()
            rebuilt.visit(tracker)
            return tracker

        def labels_of(tracker):
            # Convert keys and consumers to strings so the maps are comparable.
            return {
                str(value): [str(consumer) for consumer in consumers]
                for value, consumers in tracker.value_to_consumers.items()
            }

        v_out_of_order = visit_rebuilt(out_of_order_graph)
        v_original = visit_rebuilt(original_graph)

        out_of_order_labels = labels_of(v_out_of_order)
        original_labels = labels_of(v_original)
        self.assertDictEqual(out_of_order_labels, original_labels)
class ConsumerTrackingPipelineVisitorTest(unittest.TestCase):
  """Tests ConsumerTrackingPipelineVisitor on pipelines built for DirectRunner.

  Each test builds a small pipeline, visits it without running it, and checks
  the root transforms, step names, consumers and views the visitor collected.
  """

  def setUp(self):
    """Create a fresh pipeline and visitor for every test."""
    self.pipeline = Pipeline(DirectRunner())
    self.visitor = ConsumerTrackingPipelineVisitor()

  def test_root_transforms(self):
    """A Read and a Flatten over an empty input are both reported as roots."""
    class DummySource(iobase.BoundedSource):
      pass

    root_read = Read(DummySource())
    root_flatten = Flatten(pipeline=self.pipeline)

    pbegin = pvalue.PBegin(self.pipeline)
    pcoll_read = pbegin | 'read' >> root_read
    pcoll_read | FlatMap(lambda x: x)
    [] | 'flatten' >> root_flatten

    self.pipeline.visit(self.visitor)

    # Sort by identity: the transforms are compared as the very same objects,
    # and a key-less sorted() raises TypeError on Python 3 for objects that
    # define no ordering.
    root_transforms = sorted(
        [t.transform for t in self.visitor.root_transforms], key=id)

    self.assertEqual(root_transforms, sorted(
        [root_read, root_flatten], key=id))

    pbegin_consumers = sorted(
        [c.transform for c in self.visitor.value_to_consumers[pbegin]],
        key=id)
    self.assertEqual(pbegin_consumers, [root_read])
    self.assertEqual(len(self.visitor.step_names), 3)

  def test_side_inputs(self):
    """A ParDo taking an AsList side input yields three steps and one view."""

    class SplitNumbersFn(DoFn):

      def process(self, element):
        # NOTE(review): other snippets in this file use pvalue.TaggedOutput
        # here; confirm which name the installed Beam version provides.
        if element < 0:
          yield pvalue.OutputValue('tag_negative', element)
        else:
          yield element

    class ProcessNumbersFn(DoFn):

      def process(self, element, negatives):
        yield element

    class DummySource(iobase.BoundedSource):
      pass

    root_read = Read(DummySource())

    result = (self.pipeline
              | 'read' >> root_read
              | ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                     main='positive'))
    positive, negative = result
    positive | ParDo(ProcessNumbersFn(), AsList(negative))

    self.pipeline.visit(self.visitor)

    root_transforms = sorted(
        [t.transform for t in self.visitor.root_transforms], key=id)
    self.assertEqual(root_transforms, [root_read])
    self.assertEqual(len(self.visitor.step_names), 3)
    self.assertEqual(len(self.visitor.views), 1)
    self.assertIsInstance(self.visitor.views[0], pvalue.AsList)

  def test_co_group_by_key(self):
    """Two Creates feeding a CoGroupByKey give two roots and no views."""
    emails = self.pipeline | 'email' >> Create([('joe', '*****@*****.**')])
    phones = self.pipeline | 'phone' >> Create([('mary', '111-222-3333')])
    {'emails': emails, 'phones': phones} | CoGroupByKey()

    self.pipeline.visit(self.visitor)

    # Only the count is checked, so no ordering of the transforms is needed.
    root_transforms = [t.transform for t in self.visitor.root_transforms]
    self.assertEqual(len(root_transforms), 2)
    self.assertGreater(
        len(self.visitor.step_names), 3)  # 2 creates + expanded CoGBK
    self.assertEqual(len(self.visitor.views), 0)
class ConsumerTrackingPipelineVisitorTest(unittest.TestCase):
    """Tests ConsumerTrackingPipelineVisitor on DirectRunner pipelines.

    Each test builds a small pipeline, visits it without running it, and
    checks the root transforms, step names, consumers and views that the
    visitor collected.
    """

    def setUp(self):
        """Create a fresh pipeline and visitor for every test."""
        self.pipeline = Pipeline(DirectRunner())
        self.visitor = ConsumerTrackingPipelineVisitor()

    def test_root_transforms(self):
        """A Read and a Flatten over an empty input are both roots."""
        class DummySource(iobase.BoundedSource):
            pass

        root_read = Read(DummySource())
        root_flatten = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        pcoll_read = pbegin | 'read' >> root_read
        pcoll_read | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        # Sort by identity: the transforms are compared as the very same
        # objects, and a key-less sorted() raises TypeError on Python 3 for
        # objects that define no ordering.
        root_transforms = sorted(
            [t.transform for t in self.visitor.root_transforms], key=id)

        self.assertEqual(root_transforms,
                         sorted([root_read, root_flatten], key=id))

        pbegin_consumers = sorted(
            [c.transform for c in self.visitor.value_to_consumers[pbegin]],
            key=id)
        self.assertEqual(pbegin_consumers, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)

    def test_side_inputs(self):
        """A ParDo with an AsList side input yields three steps, one view."""
        class SplitNumbersFn(DoFn):
            def process(self, element):
                # NOTE(review): other snippets in this file use
                # pvalue.TaggedOutput here; confirm which name the installed
                # Beam version provides.
                if element < 0:
                    yield pvalue.OutputValue('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        class DummySource(iobase.BoundedSource):
            pass

        root_read = Read(DummySource())

        result = (self.pipeline
                  | 'read' >> root_read
                  | ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                         main='positive'))
        positive, negative = result
        positive | ParDo(ProcessNumbersFn(), AsList(negative))

        self.pipeline.visit(self.visitor)

        root_transforms = sorted(
            [t.transform for t in self.visitor.root_transforms], key=id)
        self.assertEqual(root_transforms, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)
        self.assertEqual(len(self.visitor.views), 1)
        self.assertIsInstance(self.visitor.views[0], pvalue.AsList)

    def test_co_group_by_key(self):
        """Two Creates feeding a CoGroupByKey give two roots and no views."""
        emails = self.pipeline | 'email' >> Create([('joe', '*****@*****.**')
                                                    ])
        phones = self.pipeline | 'phone' >> Create([('mary', '111-222-3333')])
        {'emails': emails, 'phones': phones} | CoGroupByKey()

        self.pipeline.visit(self.visitor)

        # Only the count is checked, so no ordering of transforms is needed.
        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(len(root_transforms), 2)
        self.assertGreater(len(self.visitor.step_names),
                           3)  # 2 creates + expanded CoGBK
        self.assertEqual(len(self.visitor.views), 0)
Exemple #6
0
class ConsumerTrackingPipelineVisitorTest(unittest.TestCase):
    """Tests ConsumerTrackingPipelineVisitor on DirectRunner pipelines.

    The tests build small pipelines, visit them without running them, and
    check the roots, step names, consumers and views the visitor recorded.
    """

    def setUp(self):
        """Create a fresh pipeline and visitor for every test."""
        self.pipeline = Pipeline(DirectRunner())
        self.visitor = ConsumerTrackingPipelineVisitor()

    def test_root_transforms(self):
        """An Impulse and a Flatten over an empty input are both roots."""
        root_read = beam.Impulse()
        root_flatten = Flatten(pipeline=self.pipeline)

        begin = pvalue.PBegin(self.pipeline)
        read_output = begin | 'read' >> root_read
        read_output | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        seen_roots = []
        for applied in self.visitor.root_transforms:
            seen_roots.append(applied.transform)
        self.assertCountEqual(seen_roots, [root_read, root_flatten])

        begin_consumers = []
        for applied in self.visitor.value_to_consumers[begin]:
            begin_consumers.append(applied.transform)
        self.assertCountEqual(begin_consumers, [root_read])

        step_count = len(self.visitor.step_names)
        self.assertEqual(step_count, 3)

    def test_side_inputs(self):
        """One AsList side input shared by two ParDos is recorded as one view."""
        class SplitNumbersFn(DoFn):
            def process(self, element):
                if element < 0:
                    yield pvalue.TaggedOutput('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        root_read = beam.Impulse()

        split = ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                     main='positive')
        positive, negative = self.pipeline | 'read' >> root_read | split

        # The same side-input view feeds both processing steps.
        negatives_view = AsList(negative)
        first_output = positive | 'process numbers step 1' >> ParDo(
            ProcessNumbersFn(), negatives_view)
        second_output = first_output | 'process numbers step 2' >> ParDo(
            ProcessNumbersFn(), negatives_view)
        (first_output, second_output) | 'flatten results' >> beam.Flatten()

        self.pipeline.visit(self.visitor)

        seen_roots = [
            applied.transform for applied in self.visitor.root_transforms
        ]
        self.assertEqual(seen_roots, [root_read])
        self.assertEqual(len(self.visitor.step_names), 5)

        recorded_views = self.visitor.views
        self.assertEqual(len(recorded_views), 1)
        self.assertTrue(isinstance(recorded_views[0], pvalue.AsList))

    def test_co_group_by_key(self):
        """Two Creates feeding a CoGroupByKey give two roots and no views."""
        email_rows = [('joe', '*****@*****.**')]
        phone_rows = [('mary', '111-222-3333')]
        emails = self.pipeline | 'email' >> Create(email_rows)
        phones = self.pipeline | 'phone' >> Create(phone_rows)
        {'emails': emails, 'phones': phones} | CoGroupByKey()

        self.pipeline.visit(self.visitor)

        seen_roots = [
            applied.transform for applied in self.visitor.root_transforms
        ]
        self.assertEqual(len(seen_roots), 2)
        # 2 creates + expanded CoGBK
        self.assertGreater(len(self.visitor.step_names), 3)
        self.assertEqual(len(self.visitor.views), 0)

    def test_visitor_not_sorted(self):
        """The consumer map must not depend on root subtransform order.

        The pipeline is serialized twice and the first two subtransforms of
        the root are swapped in one copy. Both copies are rebuilt and
        visited, and the stringified consumer maps must be equal.
        """
        pipeline = Pipeline()
        # pylint: disable=expression-not-assigned
        from apache_beam.testing.test_stream import TestStream
        pipeline | TestStream().add_elements(['']) | beam.Map(lambda _: _)

        original_graph = pipeline.to_runner_api(return_context=False)
        out_of_order_graph = pipeline.to_runner_api(return_context=False)

        # Swap the first two subtransforms of the root in the second copy.
        root_id = out_of_order_graph.root_transform_ids[0]
        root = out_of_order_graph.components.transforms[root_id]
        first, second = root.subtransforms[0], root.subtransforms[1]
        root.subtransforms[0] = second
        root.subtransforms[1] = first

        def visit_rebuilt(graph_proto):
            # Rebuild a pipeline from the proto and run a fresh visitor on it.
            rebuilt = beam.Pipeline().from_runner_api(
                graph_proto, runner='BundleBasedDirectRunner', options=None)
            tracker = ConsumerTrackingPipelineVisitor()
            rebuilt.visit(tracker)
            return tracker

        def labels_of(tracker):
            # Convert keys and consumers to strings so the maps are comparable.
            return {
                str(value): [str(consumer) for consumer in consumers]
                for value, consumers in tracker.value_to_consumers.items()
            }

        v_out_of_order = visit_rebuilt(out_of_order_graph)
        v_original = visit_rebuilt(original_graph)

        out_of_order_labels = labels_of(v_out_of_order)
        original_labels = labels_of(v_original)
        self.assertDictEqual(out_of_order_labels, original_labels)
Exemple #7
0
class ConsumerTrackingPipelineVisitorTest(unittest.TestCase):
    """Tests ConsumerTrackingPipelineVisitor on DirectRunner pipelines.

    The tests build small pipelines, visit them without running them, and
    check the roots, step names, consumers and views the visitor recorded.
    """

    def setUp(self):
        """Create a fresh pipeline and visitor, and alias assertCountEqual.

        Where the test case has assertItemsEqual (the Python 2 name), it is
        bound to assertCountEqual so the tests can use a single name.
        """
        self.pipeline = Pipeline(DirectRunner())
        self.visitor = ConsumerTrackingPipelineVisitor()
        try:  # Python 2
            legacy_assert = self.assertItemsEqual
        except AttributeError:  # Python 3
            pass
        else:
            self.assertCountEqual = legacy_assert

    def test_root_transforms(self):
        """An Impulse and a Flatten over an empty input are both roots."""
        root_read = beam.Impulse()
        root_flatten = Flatten(pipeline=self.pipeline)

        begin = pvalue.PBegin(self.pipeline)
        read_output = begin | 'read' >> root_read
        read_output | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        seen_roots = []
        for applied in self.visitor.root_transforms:
            seen_roots.append(applied.transform)
        self.assertCountEqual(seen_roots, [root_read, root_flatten])

        begin_consumers = []
        for applied in self.visitor.value_to_consumers[begin]:
            begin_consumers.append(applied.transform)
        self.assertCountEqual(begin_consumers, [root_read])

        step_count = len(self.visitor.step_names)
        self.assertEqual(step_count, 3)

    def test_side_inputs(self):
        """One AsList side input shared by two ParDos is recorded as one view."""
        class SplitNumbersFn(DoFn):
            def process(self, element):
                if element < 0:
                    yield pvalue.TaggedOutput('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        root_read = beam.Impulse()

        split = ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                     main='positive')
        positive, negative = self.pipeline | 'read' >> root_read | split

        # The same side-input view feeds both processing steps.
        negatives_view = AsList(negative)
        first_output = positive | 'process numbers step 1' >> ParDo(
            ProcessNumbersFn(), negatives_view)
        second_output = first_output | 'process numbers step 2' >> ParDo(
            ProcessNumbersFn(), negatives_view)
        (first_output, second_output) | 'flatten results' >> beam.Flatten()

        self.pipeline.visit(self.visitor)

        seen_roots = [
            applied.transform for applied in self.visitor.root_transforms
        ]
        self.assertEqual(seen_roots, [root_read])
        self.assertEqual(len(self.visitor.step_names), 5)

        recorded_views = self.visitor.views
        self.assertEqual(len(recorded_views), 1)
        self.assertTrue(isinstance(recorded_views[0], pvalue.AsList))

    def test_co_group_by_key(self):
        """Two Creates feeding a CoGroupByKey give two roots and no views."""
        email_rows = [('joe', '*****@*****.**')]
        phone_rows = [('mary', '111-222-3333')]
        emails = self.pipeline | 'email' >> Create(email_rows)
        phones = self.pipeline | 'phone' >> Create(phone_rows)
        {'emails': emails, 'phones': phones} | CoGroupByKey()

        self.pipeline.visit(self.visitor)

        seen_roots = [
            applied.transform for applied in self.visitor.root_transforms
        ]
        self.assertEqual(len(seen_roots), 2)
        # 2 creates + expanded CoGBK
        self.assertGreater(len(self.visitor.step_names), 3)
        self.assertEqual(len(self.visitor.views), 0)