    def test_visit_node_sub_graph(self):
        pipeline = Pipeline('DirectPipelineRunner')
        pcoll1 = pipeline | Create('pcoll', [1, 2, 3])
        pcoll2 = pcoll1 | FlatMap('do1', lambda x: [x + 1])
        pcoll3 = pcoll2 | FlatMap('do2', lambda x: [x + 1])
        pcoll4 = pcoll2 | FlatMap('do3', lambda x: [x + 1])

        visitor = PipelineTest.Visitor(visited=[])
        pipeline.visit(visitor, node=pcoll3)
        self.assertFalse(pcoll4 in visitor.visited)
        self.assertEqual(set([pcoll1, pcoll2, pcoll3]), set(visitor.visited))
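
Note: this test and test_visit_entire_graph (Example #6 below) rely on a PipelineTest.Visitor helper that the excerpts do not show. Below is a minimal sketch of what such a helper might look like, assuming the pre-Beam Dataflow SDK's PipelineVisitor hooks (visit_value, enter_composite_transform, leave_composite_transform); the import path is an assumption.

from google.cloud.dataflow.pipeline import PipelineVisitor  # assumed path


class Visitor(PipelineVisitor):

    def __init__(self, visited):
        self.visited = visited      # PValues seen while walking the graph.
        self.enter_composite = []   # Composite transform nodes entered.
        self.leave_composite = []   # Composite transform nodes left.

    def visit_value(self, value, producer_node):
        self.visited.append(value)

    def enter_composite_transform(self, transform_node):
        self.enter_composite.append(transform_node)

    def leave_composite_transform(self, transform_node):
        self.leave_composite.append(transform_node)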
Example #2
    def test_multi_valued_singleton_side_input(self):
        pipeline = Pipeline('DirectPipelineRunner')
        pcol = pipeline | Create('start', [1, 2])
        side = pipeline | Create('side', [3, 4])  # 2 values in side input.
        pcol | FlatMap('compute', lambda x, s: [x * s], AsSingleton(side))
        with self.assertRaises(ValueError) as e:
            pipeline.run()
Example #3
    def test_default_value_singleton_side_input(self):
        pipeline = Pipeline('DirectPipelineRunner')
        pcol = pipeline | Create('start', [1, 2])
        side = pipeline | Create('side', [])  # 0 values in side input.
        result = (pcol | FlatMap('compute', lambda x, s: [x * s],
                                 AsSingleton(side, 10)))
        assert_that(result, equal_to([10, 20]))
        pipeline.run()
Example #4
    def test_iterable_side_input(self):
        pipeline = Pipeline('DirectPipelineRunner')
        pcol = pipeline | Create('start', [1, 2])
        side = pipeline | Create('side', [3, 4])  # 2 values in side input.
        result = pcol | FlatMap('compute', lambda x, s: [x * y for y in s],
                                AllOf(side))
        assert_that(result, equal_to([3, 4, 6, 8]))
        pipeline.run()
Example #5
    def test_word_count_using_get(self):
        pipeline = Pipeline('DirectPipelineRunner')
        lines = pipeline | Create('SomeWords', [DataflowTest.SAMPLE_DATA])
        result = ((lines | FlatMap('GetWords',
                                   lambda x: re.findall(r'\w+', x)))
                  .apply('CountWords', DataflowTest.Count))
        assert_that(result, equal_to(DataflowTest.SAMPLE_RESULT))
        pipeline.run()
Example #6
    def test_visit_entire_graph(self):
        pipeline = Pipeline(self.runner_name)
        pcoll1 = pipeline | Create('pcoll', [1, 2, 3])
        pcoll2 = pcoll1 | FlatMap('do1', lambda x: [x + 1])
        pcoll3 = pcoll2 | FlatMap('do2', lambda x: [x + 1])
        pcoll4 = pcoll2 | FlatMap('do3', lambda x: [x + 1])
        transform = PipelineTest.CustomTransform()
        pcoll5 = pcoll4 | transform

        visitor = PipelineTest.Visitor(visited=[])
        pipeline.visit(visitor)
        self.assertEqual(set([pcoll1, pcoll2, pcoll3, pcoll4, pcoll5]),
                         set(visitor.visited))
        self.assertEqual(set(visitor.enter_composite),
                         set(visitor.leave_composite))
        self.assertEqual(2, len(visitor.enter_composite))
        self.assertEqual(visitor.enter_composite[1].transform, transform)
        self.assertEqual(visitor.leave_composite[0].transform, transform)
Example #7
    def test_create(self):
        pipeline = Pipeline('DirectPipelineRunner')
        pcoll = pipeline | Create('label1', [1, 2, 3])
        assert_that(pcoll, equal_to([1, 2, 3]))

        # Test if initial value is an iterator object.
        pcoll2 = pipeline | Create('label2', iter((4, 5, 6)))
        pcoll3 = pcoll2 | FlatMap('do', lambda x: [x + 10])
        assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
        pipeline.run()
Example #8
  def test_cached_pvalues_are_refcounted(self):
    """Test that cached PValues are refcounted and deleted.

    The intermediary PValues computed by the workflow below contain
    one million elements so if the refcounting does not work the number of
    objects tracked by the garbage collector will increase by a few millions
    by the time we execute the final Map checking the objects tracked.
    Anything that is much larger than what we started with will fail the test.
    """
    def check_memory(value, count_threshold):
      gc.collect()
      objects_count = len(gc.get_objects())
      if objects_count > count_threshold:
        raise RuntimeError(
            'PValues are not refcounted: %s, %s' % (
                objects_count, count_threshold))
      return value

    def create_dupes(o, _):
      yield o
      yield SideOutputValue('side', o)

    pipeline = Pipeline('DirectPipelineRunner')

    gc.collect()
    count_threshold = len(gc.get_objects()) + 10000
    biglist = pipeline | Create('oom:create', ['x'] * 1000000)
    dupes = (
        biglist
        | Map('oom:addone', lambda x: (x, 1))
        | FlatMap('oom:dupes', create_dupes,
                  AsIter(biglist)).with_outputs('side', main='main'))
    result = (
        (dupes.side, dupes.main, dupes.side)
        | Flatten('oom:flatten')
        | CombinePerKey('oom:combine', sum)
        | Map('oom:check', check_memory, count_threshold))

    assert_that(result, equal_to([('x', 3000000)]))
    pipeline.run()
    self.assertEqual(
        pipeline.runner.debug_counters['element_counts'],
        {
            'oom:flatten': 3000000,
            ('oom:combine/GroupByKey/reify_windows', None): 3000000,
            ('oom:dupes/oom:dupes', 'side'): 1000000,
            ('oom:dupes/oom:dupes', None): 1000000,
            'oom:create': 1000000,
            ('oom:addone', None): 1000000,
            'oom:combine/GroupByKey/group_by_key': 1,
            ('oom:check', None): 1,
            'assert_that/singleton': 1,
            ('assert_that/Map(match)', None): 1,
            ('oom:combine/GroupByKey/group_by_window', None): 1,
            ('oom:combine/Combine/ParDo(CombineValuesDoFn)', None): 1})
Example #9
    def test_par_do_with_side_input_as_arg(self):
        pipeline = Pipeline('DirectPipelineRunner')
        words_list = ['aa', 'bb', 'cc']
        words = pipeline | Create('SomeWords', words_list)
        prefix = pipeline | Create('SomeString', ['xyz'])  # side input
        suffix = 'zyx'
        result = words | FlatMap(
            'DecorateWords', lambda x, pfx, sfx: ['%s-%s-%s' % (pfx, x, sfx)],
            AsSingleton(prefix), suffix)
        assert_that(result, equal_to(['xyz-%s-zyx' % x for x in words_list]))
        pipeline.run()
Example #10
    def test_empty_side_outputs(self):
        pipeline = Pipeline('DirectPipelineRunner')
        nums = pipeline | Create('Some Numbers', [1, 3, 5])
        results = nums | FlatMap(
            'ClassifyNumbers',
            lambda x: [x, SideOutputValue('even' if x % 2 == 0 else 'odd', x)]
        ).with_outputs()
        assert_that(results[None], equal_to([1, 3, 5]))
        assert_that(results.odd, equal_to([1, 3, 5]), label='assert:odd')
        assert_that(results.even, equal_to([]), label='assert:even')
        pipeline.run()
Example #11
    def test_empty_singleton_side_input(self):
        pipeline = Pipeline('DirectPipelineRunner')
        pcol = pipeline | Create('start', [1, 2])
        side = pipeline | Create('side', [])  # Empty side input.

        def my_fn(k, s):
            v = ('empty' if isinstance(s, EmptySideInput) else 'full')
            return [(k, v)]

        result = pcol | FlatMap('compute', my_fn, AsSingleton(side))
        assert_that(result, equal_to([(1, 'empty'), (2, 'empty')]))
        pipeline.run()
Example #12
    def test_undeclared_side_outputs(self):
        pipeline = Pipeline('DirectPipelineRunner')
        nums = pipeline | Create('Some Numbers', [1, 2, 3, 4])
        results = nums | FlatMap(
            'ClassifyNumbers',
            lambda x: [x, SideOutputValue('even' if x % 2 == 0 else 'odd', x)]
        ).with_outputs()
        # TODO(silviuc): Revisit this test to check for undeclared side outputs.
        # This should work with .with_outputs() without any tags declared, and
        # results[None] should work as well.
        assert_that(results[None], equal_to([1, 2, 3, 4]))
        assert_that(results.odd, equal_to([1, 3]), label='assert:odd')
        assert_that(results.even, equal_to([2, 4]), label='assert:even')
        pipeline.run()
Example #13
    def test_par_do_with_multiple_outputs_and_using_return(self):
        def some_fn(v):
            if v % 2 == 0:
                return [v, SideOutputValue('even', v)]
            else:
                return [v, SideOutputValue('odd', v)]

        pipeline = Pipeline('DirectPipelineRunner')
        nums = pipeline | Create('Some Numbers', [1, 2, 3, 4])
        results = nums | FlatMap('ClassifyNumbers', some_fn).with_outputs(
            'odd', 'even', main='main')
        assert_that(results.main, equal_to([1, 2, 3, 4]))
        assert_that(results.odd, equal_to([1, 3]), label='assert:odd')
        assert_that(results.even, equal_to([2, 4]), label='assert:even')
        pipeline.run()
Example #14
    def test_as_singleton_with_different_defaults_without_unique_labels(self):
        # This should fail: AsSingleton with distinct default values creates
        # distinct PCollectionViews that share the same full_label, which is
        # rejected as an unstable, non-unique label.
        a_list = [2]
        pipeline = Pipeline('DirectPipelineRunner')
        main_input = pipeline | Create('main input', [1])
        side_list = pipeline | Create('side list', a_list)

        with self.assertRaises(RuntimeError) as e:
            _ = main_input | FlatMap('test', lambda x, s1, s2: [[x, s1, s2]],
                                     AsSingleton(side_list),
                                     AsSingleton(side_list, default_value=3))
        self.assertTrue(
            e.exception.message.startswith(
                'Transform "ViewAsSingleton(side list.None)" does not have a '
                'stable unique label.'))
Example #15
    def test_as_dict_with_unique_labels(self):
        some_kvs = [('a', 1), ('b', 2)]
        pipeline = Pipeline('DirectPipelineRunner')
        main_input = pipeline | Create('main input', [1])
        side_kvs = pipeline | Create('side kvs', some_kvs)
        results = main_input | FlatMap(
            'test', lambda x, dct1, dct2: [[x, dct1, dct2]], AsDict(side_kvs),
            AsDict(side_kvs, label='label'))

        def matcher(expected_elem, expected_kvs):
            def match(actual):
                [[actual_elem, actual_dict1, actual_dict2]] = actual
                equal_to([expected_elem])([actual_elem])
                equal_to(expected_kvs)(actual_dict1.iteritems())
                equal_to(expected_kvs)(actual_dict2.iteritems())

            return match

        assert_that(results, matcher(1, some_kvs))
        pipeline.run()
Example #16
    def test_as_list_with_unique_labels(self):
        a_list = [1, 2, 3]
        pipeline = Pipeline('DirectPipelineRunner')
        main_input = pipeline | Create('main input', [1])
        side_list = pipeline | Create('side list', a_list)
        results = main_input | FlatMap(
            'test', lambda x, ls1, ls2: [[x, ls1, ls2]], AsList(side_list),
            AsList(side_list, label='label'))

        def matcher(expected_elem, expected_list):
            def match(actual):
                [[actual_elem, actual_list1, actual_list2]] = actual
                equal_to([expected_elem])([actual_elem])
                equal_to(expected_list)(actual_list1)
                equal_to(expected_list)(actual_list2)

            return match

        assert_that(results, matcher(1, [1, 2, 3]))
        pipeline.run()
Example #17
    def test_as_singleton_with_different_defaults_with_unique_labels(self):
        a_list = []
        pipeline = Pipeline('DirectPipelineRunner')
        main_input = pipeline | Create('main input', [1])
        side_list = pipeline | Create('side list', a_list)
        results = main_input | FlatMap(
            'test', lambda x, s1, s2: [[x, s1, s2]],
            AsSingleton('si1', side_list, default_value=2),
            AsSingleton('si2', side_list, default_value=3))

        def matcher(expected_elem, expected_singleton1, expected_singleton2):
            def match(actual):
                [[actual_elem, actual_singleton1, actual_singleton2]] = actual
                equal_to([expected_elem])([actual_elem])
                equal_to([expected_singleton1])([actual_singleton1])
                equal_to([expected_singleton2])([actual_singleton2])

            return match

        assert_that(results, matcher(1, 2, 3))
        pipeline.run()
Example #18
    def test_as_list_without_unique_labels(self):
        # This should succeed as calling AsList on the same PCollection twice will
        # return the same PCollectionView.
        a_list = [1, 2, 3]
        pipeline = Pipeline('DirectPipelineRunner')
        main_input = pipeline | Create('main input', [1])
        side_list = pipeline | Create('side list', a_list)
        results = main_input | FlatMap(
            'test', lambda x, ls1, ls2: [[x, ls1, ls2]], AsList(side_list),
            AsList(side_list))

        def matcher(expected_elem, expected_list):
            def match(actual):
                [[actual_elem, actual_list1, actual_list2]] = actual
                equal_to([expected_elem])([actual_elem])
                equal_to(expected_list)(actual_list1)
                equal_to(expected_list)(actual_list2)

            return match

        assert_that(results, matcher(1, [1, 2, 3]))
        pipeline.run()
Example #19
    def test_as_singleton_without_unique_labels(self):
        # This should succeed as calling AsSingleton on the same PCollection twice
        # with the same defaults will return the same PCollectionView.
        a_list = [2]
        pipeline = Pipeline('DirectPipelineRunner')
        main_input = pipeline | Create('main input', [1])
        side_list = pipeline | Create('side list', a_list)
        results = main_input | FlatMap('test', lambda x, s1, s2: [[x, s1, s2]],
                                       AsSingleton(side_list),
                                       AsSingleton(side_list))

        def matcher(expected_elem, expected_singleton):
            def match(actual):
                [[actual_elem, actual_singleton1, actual_singleton2]] = actual
                equal_to([expected_elem])([actual_elem])
                equal_to([expected_singleton])([actual_singleton1])
                equal_to([expected_singleton])([actual_singleton2])

            return match

        assert_that(results, matcher(1, 2))
        pipeline.run()
Example #20
    def test_as_list_and_as_dict_side_inputs(self):
        a_list = [5, 1, 3, 2, 9]
        some_pairs = [('crouton', 17), ('supreme', None)]
        pipeline = Pipeline('DirectPipelineRunner')
        main_input = pipeline | Create('main input', [1])
        side_list = pipeline | Create('side list', a_list)
        side_pairs = pipeline | Create('side pairs', some_pairs)
        results = main_input | FlatMap(
            'concatenate',
            lambda x, the_list, the_dict: [[x, the_list, the_dict]],
            AsList(side_list), AsDict(side_pairs))

        def matcher(expected_elem, expected_list, expected_pairs):
            def match(actual):
                [[actual_elem, actual_list, actual_dict]] = actual
                equal_to([expected_elem])([actual_elem])
                equal_to(expected_list)(actual_list)
                equal_to(expected_pairs)(actual_dict.iteritems())

            return match

        assert_that(results, matcher(1, a_list, some_pairs))
        pipeline.run()
    # Likely PipelineTest.CustomTransform.apply (the composite applied in Example #6).
    def apply(self, pcoll):
        return pcoll | FlatMap('+1', lambda x: [x + 1])

    # A standalone callable that builds the same '+1' sub-graph when applied.
    def custom_callable(pcoll):
        return pcoll | FlatMap('+1', lambda x: [x + 1])
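
For context, here is a hedged sketch of how the composite-transform fragment above might be wrapped and applied, assuming the pre-Beam SDK's PTransform base class (whose composite-expansion hook was apply; Beam later renamed it expand). The PTransform import path is an assumption; Pipeline, Create, FlatMap, assert_that, and equal_to are used as in the excerpts above, with their imports likewise omitted, and the expected output simply follows from the '+1' semantics.

from google.cloud.dataflow.transforms import PTransform  # assumed path


class CustomTransform(PTransform):

    def apply(self, pcoll):
        # Composite expansion: build the sub-graph this transform stands for.
        return pcoll | FlatMap('+1', lambda x: [x + 1])


pipeline = Pipeline('DirectPipelineRunner')
pcoll = pipeline | Create('pcoll', [1, 2, 3])
result = pcoll | CustomTransform()        # applies the composite, as in Example #6
assert_that(result, equal_to([2, 3, 4]))  # each element incremented by one
pipeline.run()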