Example #1
  def test_track_pcoll_unbounded(self):
    pipeline = TestPipeline()
    pcoll1 = pipeline | 'read' >> Read(FakeUnboundedSource())
    pcoll2 = pcoll1 | 'do1' >> FlatMap(lambda x: [x + 1])
    pcoll3 = pcoll2 | 'do2' >> FlatMap(lambda x: [x + 1])
    self.assertIs(pcoll1.is_bounded, False)
    self.assertIs(pcoll2.is_bounded, False)
    self.assertIs(pcoll3.is_bounded, False)
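
This test (and Example #5 below) references a FakeUnboundedSource helper that is not defined in the snippet. A minimal sketch, assuming the NativeSource-style test helper interface used in Beam's test suite; the reader is never exercised in these tests, so it may return None:

from apache_beam.runners.dataflow.native_io.iobase import NativeSource

class FakeUnboundedSource(NativeSource):
  """Fake unbounded source for tests only; does not work at runtime."""
  def reader(self):
    return None  # never called in these tests

  def is_bounded(self):
    return False  # marks every PCollection read from this source as unbounded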
Example #2
  def test_track_pcoll_bounded(self):
    pipeline = TestPipeline()
    pcoll1 = pipeline | 'label1' >> Create([1, 2, 3])
    pcoll2 = pcoll1 | 'do1' >> FlatMap(lambda x: [x + 1])
    pcoll3 = pcoll2 | 'do2' >> FlatMap(lambda x: [x + 1])
    self.assertIs(pcoll1.is_bounded, True)
    self.assertIs(pcoll2.is_bounded, True)
    self.assertIs(pcoll3.is_bounded, True)
Example #3
  def test_flatmap_builtin(self):
    with TestPipeline() as pipeline:
      pcoll = pipeline | 'label1' >> Create([1, 2, 3])
      assert_that(pcoll, equal_to([1, 2, 3]))

      pcoll2 = pcoll | 'do' >> FlatMap(lambda x: [x + 10])
      assert_that(pcoll2, equal_to([11, 12, 13]), label='pcoll2')

      pcoll3 = pcoll2 | 'm1' >> Map(lambda x: [x, 12])
      assert_that(
          pcoll3, equal_to([[11, 12], [12, 12], [13, 12]]), label='pcoll3')

      pcoll4 = pcoll3 | 'do2' >> FlatMap(set)
      assert_that(pcoll4, equal_to([11, 12, 12, 12, 13]), label='pcoll4')
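
The final assertion can look surprising: FlatMap(set) first converts each list element to a set, deduplicating within that single element, and only then flattens, so [12, 12] contributes one 12 while the 12s from different elements all survive. A plain-Python sketch of the same computation:

inputs = [[11, 12], [12, 12], [13, 12]]
flattened = [x for elem in inputs for x in set(elem)]
# As a multiset this equals [11, 12, 12, 12, 13], which is what equal_to checks.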
Example #4
    def test_track_pcoll_bounded_flatten(self):
        pipeline = TestPipeline()
        pcoll1_a = pipeline | 'label_a' >> Create([1, 2, 3])
        pcoll2_a = pcoll1_a | 'do_a' >> FlatMap(lambda x: [x + 1])

        pcoll1_b = pipeline | 'label_b' >> Create([1, 2, 3])
        pcoll2_b = pcoll1_b | 'do_b' >> FlatMap(lambda x: [x + 1])

        merged = (pcoll2_a, pcoll2_b) | beam.Flatten()

        self.assertIs(pcoll1_a.is_bounded, True)
        self.assertIs(pcoll2_a.is_bounded, True)
        self.assertIs(pcoll1_b.is_bounded, True)
        self.assertIs(pcoll2_b.is_bounded, True)
        self.assertIs(merged.is_bounded, True)
Example #5
  def test_track_pcoll_unbounded_flatten(self):
    pipeline = TestPipeline()
    pcoll1_bounded = pipeline | 'label1' >> Create([1, 2, 3])
    pcoll2_bounded = pcoll1_bounded | 'do1' >> FlatMap(lambda x: [x + 1])

    pcoll1_unbounded = pipeline | 'read' >> Read(FakeUnboundedSource())
    pcoll2_unbounded = pcoll1_unbounded | 'do2' >> FlatMap(lambda x: [x + 1])

    merged = (pcoll2_bounded, pcoll2_unbounded) | beam.Flatten()

    self.assertIs(pcoll1_bounded.is_bounded, True)
    self.assertIs(pcoll2_bounded.is_bounded, True)
    self.assertIs(pcoll1_unbounded.is_bounded, False)
    self.assertIs(pcoll2_unbounded.is_bounded, False)
    self.assertIs(merged.is_bounded, False)
Example #6
    def expand(self, pcoll):
        # This composite transform involves the following steps:
        #   1. Create a singleton of the user-provided `query` and apply a
        #   ``ParDo`` that splits the query into `num_splits` parts, assigning
        #   each split query a unique `int` as its key. The resulting output
        #   is of type ``PCollection[(int, Query)]``.
        #
        #   If the value of `num_splits` is less than or equal to 0, the
        #   number of splits is computed dynamically based on the size of the
        #   data for the `query`.
        #
        #   2. The resulting ``PCollection`` is sharded using a ``GroupByKey``
        #   operation. The queries are extracted from the
        #   (int, Iterable[Query]) pairs and flattened to output a
        #   ``PCollection[Query]``.
        #
        #   3. In the third step, a ``ParDo`` reads entities for each query
        #   and outputs a ``PCollection[Entity]``.

        queries = (pcoll.pipeline
                   | 'UserQuery' >> Create([self._query])
                   | 'SplitQuery' >> ParDo(
                       ReadFromDatastore.SplitQueryFn(
                           self._project, self._query,
                           self._datastore_namespace, self._num_splits)))

        sharded_queries = (queries
                           | GroupByKey()
                           | Values()
                           | 'Flatten' >> FlatMap(lambda x: x))

        entities = sharded_queries | 'Read' >> ParDo(
            ReadFromDatastore.ReadFn(self._project, self._datastore_namespace))
        return entities
Example #7
    def test_root_transforms(self):
        root_create = Create('create', [[1, 2, 3]])

        class DummySource(iobase.BoundedSource):
            pass

        root_read = Read('read', DummySource())
        root_flatten = Flatten('flatten', pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        pcoll_create = pbegin | root_create
        pbegin | root_read
        pcoll_create | FlatMap(lambda x: x)
        [] | root_flatten

        self.pipeline.visit(self.visitor)

        root_transforms = sorted(
            [t.transform for t in self.visitor.root_transforms])
        self.assertEqual(root_transforms,
                         sorted([root_read, root_create, root_flatten]))

        pbegin_consumers = sorted(
            [c.transform for c in self.visitor.value_to_consumers[pbegin]])
        self.assertEqual(pbegin_consumers, sorted([root_read, root_create]))
        self.assertEqual(len(self.visitor.step_names), 4)
Example #8
  def test_visit_entire_graph(self):
    pipeline = Pipeline()
    pcoll1 = pipeline | 'pcoll' >> beam.Impulse()
    pcoll2 = pcoll1 | 'do1' >> FlatMap(lambda x: [x + 1])
    pcoll3 = pcoll2 | 'do2' >> FlatMap(lambda x: [x + 1])
    pcoll4 = pcoll2 | 'do3' >> FlatMap(lambda x: [x + 1])
    transform = PipelineTest.CustomTransform()
    pcoll5 = pcoll4 | transform

    visitor = PipelineTest.Visitor(visited=[])
    pipeline.visit(visitor)
    self.assertEqual({pcoll1, pcoll2, pcoll3, pcoll4, pcoll5},
                     set(visitor.visited))
    self.assertEqual(set(visitor.enter_composite), set(visitor.leave_composite))
    self.assertEqual(2, len(visitor.enter_composite))
    self.assertEqual(visitor.enter_composite[1].transform, transform)
    self.assertEqual(visitor.leave_composite[0].transform, transform)
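
The Visitor and CustomTransform helpers are attributes of PipelineTest and are not shown above. A plausible sketch of the visitor, assuming Beam's PipelineVisitor hooks (visit_value plus the enter/leave composite callbacks); CustomTransform's expand is the one shown in Example #13 below:

from apache_beam.pipeline import PipelineVisitor

class Visitor(PipelineVisitor):
  def __init__(self, visited):
    self.visited = visited
    self.enter_composite = []
    self.leave_composite = []

  def visit_value(self, value, _):
    self.visited.append(value)  # records every PValue in the graph

  def enter_composite_transform(self, transform_node):
    self.enter_composite.append(transform_node)

  def leave_composite_transform(self, transform_node):
    self.leave_composite.append(transform_node)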
Example #9
    def test_create(self):
        with TestPipeline() as pipeline:
            pcoll = pipeline | 'label1' >> Create([1, 2, 3])
            assert_that(pcoll, equal_to([1, 2, 3]))

            # Test if initial value is an iterator object.
            pcoll2 = pipeline | 'label2' >> Create(iter((4, 5, 6)))
            pcoll3 = pcoll2 | 'do' >> FlatMap(lambda x: [x + 10])
            assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
Example #10
  def expand(self, pvalue):
    return (pvalue
            | FlatMap(self._create_image_annotation_pairs)
            | util.BatchElements(
                min_batch_size=self.min_batch_size,
                max_batch_size=self.max_batch_size)
            | ParDo(
                _ImageAnnotateFn(
                    features=self.features,
                    retry=self.retry,
                    timeout=self.timeout,
                    client_options=self.client_options,
                    metadata=self.metadata)))
Example #11
    def test_root_transforms(self):
        root_read = beam.Impulse()
        root_flatten = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        pcoll_read = pbegin | 'read' >> root_read
        pcoll_read | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]

        self.assertCountEqual(root_transforms, [root_read, root_flatten])

        pbegin_consumers = [
            c.transform for c in self.visitor.value_to_consumers[pbegin]
        ]
        self.assertCountEqual(pbegin_consumers, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)
Example #12
    def test_root_transforms(self):
        class DummySource(iobase.BoundedSource):
            pass

        root_read = Read(DummySource())
        root_flatten = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        pcoll_read = pbegin | 'read' >> root_read
        pcoll_read | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        root_transforms = sorted(
            [t.transform for t in self.visitor.root_transforms])

        self.assertEqual(root_transforms, sorted([root_read, root_flatten]))

        pbegin_consumers = sorted(
            [c.transform for c in self.visitor.value_to_consumers[pbegin]])
        self.assertEqual(pbegin_consumers, sorted([root_read]))
        self.assertEqual(len(self.visitor.step_names), 3)
Example #13
  def expand(self, pcoll):
    return pcoll | '+1' >> FlatMap(lambda x: [x + 1])
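
Since expand is the only required method, a transform like this is applied with the pipe operator just like a built-in. A minimal usage sketch, assuming the expand above lives on a hypothetical beam.PTransform subclass named AddOne:

with TestPipeline() as pipeline:
  result = pipeline | Create([1, 2, 3]) | AddOne()  # AddOne is a hypothetical name
  assert_that(result, equal_to([2, 3, 4]))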
Example #14
  def custom_callable(pcoll):
    return pcoll | '+1' >> FlatMap(lambda x: [x + 1])
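
Because custom_callable is an ordinary function from PCollection to PCollection, it composes by plain function call, with no PTransform subclass required:

with TestPipeline() as pipeline:
  pcoll = pipeline | Create([1, 2, 3])
  result = custom_callable(pcoll)  # plain call; attaches the '+1' FlatMap
  assert_that(result, equal_to([2, 3, 4]))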