Example #1
    def test_bounded_sum(self):
        # [START combine_bounded_sum]
        pc = [1, 10, 100, 1000]

        def bounded_sum(values, bound=500):
            return min(sum(values), bound)

        small_sum = pc | df.CombineGlobally(bounded_sum)  # [500]
        large_sum = pc | df.CombineGlobally(bounded_sum, bound=5000)  # [1111]
        # [END combine_bounded_sum]
        self.assertEqual([500], small_sum)
        self.assertEqual([1111], large_sum)
Example #2
    def test_combine_concat(self):
        pc = ['a', 'b']

        # [START combine_concat]
        def concat(values, separator=', '):
            return separator.join(values)

        with_commas = pc | df.CombineGlobally(concat)
        with_dashes = pc | df.CombineGlobally(concat, separator='-')
        # [END combine_concat]
        self.assertEqual(1, len(with_commas))
        self.assertTrue(with_commas[0] in {'a, b', 'b, a'})
        self.assertEqual(1, len(with_dashes))
        self.assertTrue(with_dashes[0] in {'a-b', 'b-a'})
Example #3
def run(argv=None):
    # pylint: disable=expression-not-assigned

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file pattern to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file pattern to write results to.')
    parser.add_argument('--checksum_output',
                        required=True,
                        help='Checksum output file pattern.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file (or file pattern) into a PCollection.
    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

    # Split each line into a (key, value) pair: the first 10 bytes are the
    # key, bytes 10..98 the value. Group by key, then re-emit the lines.
    output = (lines
              | df.Map('split', lambda x: (x[:10], x[10:99]))
              | df.GroupByKey('group')
              | df.FlatMap(
                  'format',
                  lambda (key, vals): ['%s%s' % (key, val) for val in vals]))

    input_csum = (lines
                  | df.Map('input-csum', crc32line)
                  | df.CombineGlobally('combine-input-csum', sum)
                  | df.Map('hex-format', lambda x: '%x' % x))
    input_csum | df.io.Write(
        'write-input-csum',
        df.io.TextFileSink(known_args.checksum_output + '-input'))

    # Write the output using a "Write" transform that has side effects.
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))
    # Write the output checksum.
    output_csum = (output
                   | df.Map('output-csum', crc32line)
                   | df.CombineGlobally('combine-output-csum', sum)
                   | df.Map('hex-format-output', lambda x: '%x' % x))
    output_csum | df.io.Write(
        'write-output-csum',
        df.io.TextFileSink(known_args.checksum_output + '-output'))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
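
The pipeline relies on a crc32line helper defined elsewhere in the example. A
minimal sketch, assuming a standard zlib CRC-32 masked to an unsigned 32-bit
value so the per-line checksums can be summed:

import zlib

def crc32line(line):
    # Checksum one line; the mask keeps the result non-negative on Python 2.
    return zlib.crc32(line) & 0xffffffff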
Example #4
 def test_tuple_combine_fn(self):
     p = Pipeline('DirectPipelineRunner')
     result = (p
               | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
               | df.CombineGlobally(
                   # Combine each tuple position with its own CombineFn:
                   # max of the first, mean of the second, sum of the third.
                   combine.TupleCombineFn(max, combine.MeanCombineFn(),
                                          sum)).without_defaults())
     assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
     p.run()
Example #5
 def test_combine_reduce(self):
     factors = [2, 3, 5, 7]
     # [START combine_reduce]
     import functools
     import operator
     product = factors | df.CombineGlobally(
         functools.partial(reduce, operator.mul), 1)
     # [END combine_reduce]
     self.assertEqual([210], product)
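
This snippet assumes Python 2, where reduce is a builtin. On Python 3 the
same pipeline needs the functools version:

import functools
import operator

product = factors | df.CombineGlobally(
    functools.partial(functools.reduce, operator.mul), 1)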
Example #6
 def test_tuple_combine_fn_without_defaults(self):
     p = Pipeline('DirectPipelineRunner')
     result = (p
               | Create([1, 1, 2, 3])
               | df.CombineGlobally(
                   # with_common_input() feeds every element to all three
                   # CombineFns instead of splitting it as a tuple.
                   combine.TupleCombineFn(
                       min, combine.MeanCombineFn(),
                       max).with_common_input()).without_defaults())
     assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
     p.run()
Example #7
    def test_top_shorthands(self):
        pipeline = Pipeline('DirectPipelineRunner')

        pcoll = pipeline | Create('start', [6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
        result_top = pcoll | df.CombineGlobally('top', combiners.Largest(5))
        result_bot = pcoll | df.CombineGlobally('bot', combiners.Smallest(4))
        assert_that(result_top,
                    equal_to([[9, 6, 6, 5, 3]]),
                    label='assert:top')
        assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')

        pcoll = pipeline | Create(
            'start-perkey', [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
        result_ktop = pcoll | df.CombinePerKey('top-perkey',
                                               combiners.Largest(5))
        result_kbot = pcoll | df.CombinePerKey('bot-perkey',
                                               combiners.Smallest(4))
        assert_that(result_ktop,
                    equal_to([('a', [9, 6, 6, 5, 3])]),
                    label='k:top')
        assert_that(result_kbot,
                    equal_to([('a', [0, 1, 1, 1])]),
                    label='k:bot')
        pipeline.run()
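
The "shorthands" in the test name are the Largest/Smallest CombineFns; the
combiners module also exposes Top transforms that skip the explicit
CombineGlobally wrapping. The labeled signature below is an assumption based
on the Beam-era combiners.Top API (these lines would go before
pipeline.run()):

result_top = pcoll | combiners.Top.Largest('top-short', 5)
result_bot = pcoll | combiners.Top.Smallest('bot-short', 4)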
Example #8
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)
    # A hundred work items of a hundred thousand tries each.
    (p  # pylint: disable=expression-not-assigned
     | df.Create('Initialize', [100000] * 100).with_output_types(int)
     | df.Map('Run trials', run_trials)
     | df.CombineGlobally('Sum', combine_results).without_defaults()
     | df.io.Write('Write',
                   df.io.TextFileSink(known_args.output, coder=JsonCoder())))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
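
run_trials, combine_results, and JsonCoder are defined elsewhere in the
estimate-pi example. A minimal sketch of the two helpers, with the tuple
layout assumed to be (tries, hits):

import random

def run_trials(tries):
    # Throw `tries` darts at the unit square; count hits inside the
    # quarter circle of radius 1.
    inside = 0
    for _ in xrange(tries):  # Python 2; use range() on Python 3
        x, y = random.uniform(0, 1), random.uniform(0, 1)
        if x * x + y * y <= 1.0:
            inside += 1
    return tries, inside

def combine_results(results):
    # Sum the per-bundle counts; the hit ratio approximates pi / 4.
    total = sum(r[0] for r in results)
    inside = sum(r[1] for r in results)
    return total, inside, 4 * float(inside) / total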
Example #9
    def test_pardo_side_input(self):
        p = df.Pipeline('DirectPipelineRunner')
        words = p | df.Create('start', ['a', 'bb', 'ccc', 'dddd'])

        # [START model_pardo_side_input]
        # Callable takes additional arguments.
        def filter_using_length(word, lower_bound, upper_bound=float('inf')):
            if lower_bound <= len(word) <= upper_bound:
                yield word

        # Construct a deferred side input.
        avg_word_len = words | df.Map(len) | df.CombineGlobally(
            df.combiners.MeanCombineFn())

        # Call with explicitly supplied argument values (not side inputs).
        small_words = words | df.FlatMap('small', filter_using_length, 0, 3)

        # A single deferred side input.
        larger_than_average = words | df.FlatMap(
            'large',
            filter_using_length,
            lower_bound=pvalue.AsSingleton(avg_word_len))

        # Mix and match.
        small_but_nontrivial = words | df.FlatMap(
            filter_using_length,
            lower_bound=2,
            upper_bound=pvalue.AsSingleton(avg_word_len))
        # [END model_pardo_side_input]

        df.assert_that(small_words, df.equal_to(['a', 'bb', 'ccc']))
        df.assert_that(larger_than_average,
                       df.equal_to(['ccc', 'dddd']),
                       label='larger_than_average')
        df.assert_that(small_but_nontrivial,
                       df.equal_to(['bb']),
                       label='small_but_not_trivial')
        p.run()
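
pvalue.AsSingleton fits here because the mean is a single value. For
multi-element side inputs the pvalue module also provides list and iterable
views; the sketch below assumes pvalue.AsList is available (as in the
Beam-era API) and uses hypothetical stop-word data:

stop_words = p | df.Create('stop', ['a', 'bb'])

def drop_stop_words(word, stop):
    # `stop` arrives as a materialized list via the AsList view.
    if word not in stop:
        yield word

kept_words = words | df.FlatMap(
    'drop', drop_stop_words, stop=pvalue.AsList(stop_words))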
Example #10
 def test_global_sum(self):
     pc = [1, 2, 3]
     # [START global_sum]
     result = pc | df.CombineGlobally(sum)
     # [END global_sum]
     self.assertEqual([6], result)
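
In a real pipeline, CombineGlobally emits the combiner's default even when
the input PCollection is empty (0 for sum); that is why Examples #4, #6, and
#8 append .without_defaults(). A sketch of the contrast, with empty_pc
standing in for a hypothetical empty PCollection:

empty_sum = empty_pc | df.CombineGlobally(sum)                      # [0]
no_default = empty_pc | df.CombineGlobally(sum).without_defaults()  # []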
Example #11
    def test_combine_custom_average(self):
        pc = [2, 3, 5, 7]  # assumed values; any list whose mean is 4.25 works
        # [START combine_custom_average]
        class AverageFn(df.CombineFn):
            def create_accumulator(self):
                return (0.0, 0)

            def add_input(self, (sum, count), input):
                return sum + input, count + 1

            def merge_accumulators(self, accumulators):
                sums, counts = zip(*accumulators)
                return sum(sums), sum(counts)

            def extract_output(self, (sum, count)):
                return sum / count if count else float('NaN')

        average = pc | df.CombineGlobally(AverageFn())
        # [END combine_custom_average]
        self.assertEqual([4.25], average)
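
The same CombineFn can be reused per key. A minimal sketch with hypothetical
data, reusing AverageFn from the snippet above:

grouped = [('a', 1), ('a', 3), ('b', 10)]
averages = grouped | df.CombinePerKey(AverageFn())  # [('a', 2.0), ('b', 10.0)]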

    def test_keys(self):
        occurrences = [('cat', 1), ('cat', 5), ('dog', 5), ('cat', 9),
                       ('dog', 2)]
        unique_keys = occurrences | snippets.Keys()
        self.assertEqual({'cat', 'dog'}, set(unique_keys))

    def test_count(self):
        occurrences = ['cat', 'dog', 'cat', 'cat', 'dog']
        perkey_counts = occurrences | snippets.Count()
        self.assertEqual({('cat', 3), ('dog', 2)}, set(perkey_counts))