def test_track_pcoll_unbounded(self):
    """Reading from an unbounded source marks every downstream PCollection unbounded."""
    pipeline = TestPipeline()
    source = pipeline | 'read' >> Read(FakeUnboundedSource())
    mapped_once = source | 'do1' >> FlatMap(lambda v: [v + 1])
    mapped_twice = mapped_once | 'do2' >> FlatMap(lambda v: [v + 1])
    # Unboundedness must propagate through each element-wise transform.
    for pc in (source, mapped_once, mapped_twice):
        self.assertIs(pc.is_bounded, False)
def test_track_pcoll_bounded(self):
    """A Create-rooted pipeline keeps every downstream PCollection bounded."""
    pipeline = TestPipeline()
    source = pipeline | 'label1' >> Create([1, 2, 3])
    mapped_once = source | 'do1' >> FlatMap(lambda v: [v + 1])
    mapped_twice = mapped_once | 'do2' >> FlatMap(lambda v: [v + 1])
    # Boundedness must survive each element-wise transform in the chain.
    for pc in (source, mapped_once, mapped_twice):
        self.assertIs(pc.is_bounded, True)
def test_flatmap_builtin(self):
    """FlatMap accepts a builtin callable (set) in addition to lambdas."""
    with TestPipeline() as pipeline:
        source = pipeline | 'label1' >> Create([1, 2, 3])
        assert_that(source, equal_to([1, 2, 3]))

        shifted = source | 'do' >> FlatMap(lambda v: [v + 10])
        assert_that(shifted, equal_to([11, 12, 13]), label='pcoll2')

        paired = shifted | 'm1' >> Map(lambda v: [v, 12])
        assert_that(
            paired, equal_to([[11, 12], [12, 12], [13, 12]]), label='pcoll3')

        # set() flattens each two-element list, deduplicating within a pair
        # (the [12, 12] pair collapses to a single 12).
        deduped = paired | 'do2' >> FlatMap(set)
        assert_that(deduped, equal_to([11, 12, 12, 12, 13]), label='pcoll4')
def test_track_pcoll_bounded_flatten(self):
    """Flattening two bounded branches yields a bounded PCollection."""
    pipeline = TestPipeline()
    branch_a_src = pipeline | 'label_a' >> Create([1, 2, 3])
    branch_a = branch_a_src | 'do_a' >> FlatMap(lambda v: [v + 1])
    branch_b_src = pipeline | 'label_b' >> Create([1, 2, 3])
    branch_b = branch_b_src | 'do_b' >> FlatMap(lambda v: [v + 1])
    merged = (branch_a, branch_b) | beam.Flatten()
    # Every input and the merged output must remain bounded.
    for pc in (branch_a_src, branch_a, branch_b_src, branch_b, merged):
        self.assertIs(pc.is_bounded, True)
def test_track_pcoll_unbounded_flatten(self):
    """Flattening a bounded branch with an unbounded one yields an unbounded result."""
    pipeline = TestPipeline()
    bounded_src = pipeline | 'label1' >> Create([1, 2, 3])
    bounded = bounded_src | 'do1' >> FlatMap(lambda v: [v + 1])
    unbounded_src = pipeline | 'read' >> Read(FakeUnboundedSource())
    unbounded = unbounded_src | 'do2' >> FlatMap(lambda v: [v + 1])
    merged = (bounded, unbounded) | beam.Flatten()
    # Each branch keeps its own boundedness...
    self.assertIs(bounded_src.is_bounded, True)
    self.assertIs(bounded.is_bounded, True)
    self.assertIs(unbounded_src.is_bounded, False)
    self.assertIs(unbounded.is_bounded, False)
    # ...while the merge is unbounded if any input is unbounded.
    self.assertIs(merged.is_bounded, False)
def expand(self, pcoll):
    """Build the Datastore read: split the query, shard the splits, read entities."""
    # Stage 1: wrap the user-supplied query in a singleton PCollection and
    # run a ParDo that splits it into `num_splits` keyed sub-queries,
    # producing PCollection[(int, Query)]. A `num_splits` <= 0 makes the
    # split count be computed dynamically from the query's data size.
    split_fn = ReadFromDatastore.SplitQueryFn(
        self._project, self._query, self._datastore_namespace, self._num_splits)
    split_queries = (
        pcoll.pipeline
        | 'UserQuery' >> Create([self._query])
        | 'SplitQuery' >> ParDo(split_fn))

    # Stage 2: shard via GroupByKey, drop the keys, and flatten the grouped
    # (int, Iterable[Query]) values back into a PCollection[Query].
    sharded_queries = (
        split_queries
        | GroupByKey()
        | Values()
        | 'Flatten' >> FlatMap(lambda queries: queries))

    # Stage 3: read the entities for each sub-query -> PCollection[Entity].
    return sharded_queries | 'Read' >> ParDo(
        ReadFromDatastore.ReadFn(self._project, self._datastore_namespace))
def test_root_transforms(self):
    """The visitor records every root transform and PBegin's consumers."""
    class DummySource(iobase.BoundedSource):
        pass

    root_create = Create('create', [[1, 2, 3]])
    root_read = Read('read', DummySource())
    root_flatten = Flatten('flatten', pipeline=self.pipeline)

    pbegin = pvalue.PBegin(self.pipeline)
    pcoll_create = pbegin | root_create
    pbegin | root_read
    pcoll_create | FlatMap(lambda x: x)
    [] | root_flatten

    self.pipeline.visit(self.visitor)

    # All three pipeline roots are seen; FlatMap is not a root.
    root_transforms = sorted(
        t.transform for t in self.visitor.root_transforms)
    self.assertEqual(
        root_transforms, sorted([root_read, root_create, root_flatten]))
    # Only the transforms applied directly to PBegin consume it.
    pbegin_consumers = sorted(
        c.transform for c in self.visitor.value_to_consumers[pbegin])
    self.assertEqual(pbegin_consumers, sorted([root_read, root_create]))
    self.assertEqual(len(self.visitor.step_names), 4)
def test_visit_entire_graph(self):
    """A visitor traversal reaches every PCollection and every composite."""
    pipeline = Pipeline()
    impulse_out = pipeline | 'pcoll' >> beam.Impulse()
    branch = impulse_out | 'do1' >> FlatMap(lambda x: [x + 1])
    leaf = branch | 'do2' >> FlatMap(lambda x: [x + 1])
    fork = branch | 'do3' >> FlatMap(lambda x: [x + 1])
    transform = PipelineTest.CustomTransform()
    composite_out = fork | transform

    visitor = PipelineTest.Visitor(visited=[])
    pipeline.visit(visitor)

    self.assertEqual(
        {impulse_out, branch, leaf, fork, composite_out},
        set(visitor.visited))
    # Every composite entered is also left.
    self.assertEqual(
        set(visitor.enter_composite), set(visitor.leave_composite))
    self.assertEqual(2, len(visitor.enter_composite))
    # The custom composite is entered last and left first (nesting order).
    self.assertEqual(visitor.enter_composite[1].transform, transform)
    self.assertEqual(visitor.leave_composite[0].transform, transform)
def test_create(self):
    """Create accepts both a list and a one-shot iterator as its source."""
    with TestPipeline() as pipeline:
        from_list = pipeline | 'label1' >> Create([1, 2, 3])
        assert_that(from_list, equal_to([1, 2, 3]))

        # Exercise Create with an iterator object rather than a sequence.
        from_iter = pipeline | 'label2' >> Create(iter((4, 5, 6)))
        shifted = from_iter | 'do' >> FlatMap(lambda v: [v + 10])
        assert_that(shifted, equal_to([14, 15, 16]), label='pcoll3')
def expand(self, pvalue):
    """Pair inputs with annotation requests, batch them, and call the API DoFn."""
    annotate_fn = _ImageAnnotateFn(
        features=self.features,
        retry=self.retry,
        timeout=self.timeout,
        client_options=self.client_options,
        metadata=self.metadata)
    # Batch the (image, context) pairs so each API call covers several images.
    batched = (
        pvalue
        | FlatMap(self._create_image_annotation_pairs)
        | util.BatchElements(
            min_batch_size=self.min_batch_size,
            max_batch_size=self.max_batch_size))
    return batched | ParDo(annotate_fn)
def test_root_transforms(self):
    """Impulse and Flatten applied at the root are both recorded as roots."""
    root_read = beam.Impulse()
    root_flatten = Flatten(pipeline=self.pipeline)

    pbegin = pvalue.PBegin(self.pipeline)
    pcoll_read = pbegin | 'read' >> root_read
    pcoll_read | FlatMap(lambda x: x)
    [] | 'flatten' >> root_flatten

    self.pipeline.visit(self.visitor)

    self.assertCountEqual(
        [t.transform for t in self.visitor.root_transforms],
        [root_read, root_flatten])
    # Only Impulse consumes PBegin; the empty-input Flatten does not.
    self.assertCountEqual(
        [c.transform for c in self.visitor.value_to_consumers[pbegin]],
        [root_read])
    self.assertEqual(len(self.visitor.step_names), 3)
def test_root_transforms(self):
    """Read and Flatten applied at the root are both recorded as roots."""
    class DummySource(iobase.BoundedSource):
        pass

    root_read = Read(DummySource())
    root_flatten = Flatten(pipeline=self.pipeline)

    pbegin = pvalue.PBegin(self.pipeline)
    pcoll_read = pbegin | 'read' >> root_read
    pcoll_read | FlatMap(lambda x: x)
    [] | 'flatten' >> root_flatten

    self.pipeline.visit(self.visitor)

    root_transforms = sorted(
        t.transform for t in self.visitor.root_transforms)
    self.assertEqual(root_transforms, sorted([root_read, root_flatten]))
    # Only the Read consumes PBegin; the empty-input Flatten does not.
    pbegin_consumers = sorted(
        c.transform for c in self.visitor.value_to_consumers[pbegin])
    self.assertEqual(pbegin_consumers, sorted([root_read]))
    self.assertEqual(len(self.visitor.step_names), 3)
def expand(self, pcoll):
    """Increment every element of the input PCollection by one."""
    increment = FlatMap(lambda n: [n + 1])
    return pcoll | '+1' >> increment
def custom_callable(pcoll):
    """Composite-as-callable: increment every element by one."""
    increment = FlatMap(lambda n: [n + 1])
    return pcoll | '+1' >> increment