Example #1
    def test_bad_types(self):
        p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)

        # [START type_hints_missing_define_numbers]
        numbers = p | df.Create(['1', '2', '3'])
        # [END type_hints_missing_define_numbers]

        # Consider the following code.
        # [START type_hints_missing_apply]
        evens = numbers | df.Filter(lambda x: x % 2 == 0)
        # [END type_hints_missing_apply]

        # Now suppose numbers was defined as [snippet above].
        # When running this pipeline, you'd get a runtime error,
        # possibly on a remote machine, possibly very late.

        with self.assertRaises(TypeError):
            p.run()

        # To catch this early, we can assert what types we expect.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_takes]
            p.options.view_as(TypeOptions).pipeline_type_check = True
            evens = numbers | df.Filter(lambda x: x % 2 == 0).with_input_types(
                int)
            # [END type_hints_takes]

        # Type hints can also be declared on DoFns and callables themselves,
        # rather than where they're used, to be more self-contained.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_do_fn]
            @df.typehints.with_input_types(int)
            class FilterEvensDoFn(df.DoFn):
                def process(self, context):
                    if context.element % 2 == 0:
                        yield context.element

            evens = numbers | df.ParDo(FilterEvensDoFn())
            # [END type_hints_do_fn]

        words = p | df.Create('words', ['a', 'bb', 'c'])
        # Output types can be asserted on transforms as well. This helps
        # document the contract and checks it at pipeline construction time.
        # [START type_hints_transform]
        T = df.typehints.TypeVariable('T')

        @df.typehints.with_input_types(T)
        @df.typehints.with_output_types(df.typehints.Tuple[int, T])
        class MyTransform(df.PTransform):
            def apply(self, pcoll):
                return pcoll | df.Map(lambda x: (len(x), x))

        words_with_lens = words | MyTransform()
        # [END type_hints_transform]

        with self.assertRaises(typehints.TypeCheckError):
            words_with_lens | df.Map(lambda x: x).with_input_types(
                df.typehints.Tuple[int, int])
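A minimal sketch of how the mismatch above could be resolved, assuming the string elements are meant to be parsed as integers before filtering:

    numbers = p | df.Create(['1', '2', '3'])
    evens = (numbers
             | df.Map(int)  # parse the strings so the int hint is satisfied
             | df.Filter(lambda x: x % 2 == 0).with_input_types(int))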
Example #2
    def test_deferred_side_input_iterable(self):
        @typehints.with_input_types(str, typehints.Iterable[str])
        def concat(glue, items):
            return glue.join(sorted(items))

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', ['x', 'y', 'z'])
        result = main_input | df.Map(concat, pvalue.AsIter(side_input))
        assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))
        p.run()

        bad_side_input = p | df.Create('bad_side', [1, 2, 3])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('fail', concat, pvalue.AsIter(bad_side_input))
Example #3
 def test_runtime_checks_on(self):
     p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
     with self.assertRaises(typehints.TypeCheckError):
         # [START type_hints_runtime_on]
         p.options.view_as(TypeOptions).runtime_type_check = True
         p | df.Create(['a']) | df.Map(lambda x: 3).with_output_types(str)
         p.run()
 def test_basics(self):
     p = df.Pipeline('DirectPipelineRunner')
     rows = (p | df.Create('create', [
         {'month': 1, 'day': 1, 'tornado': False},
         {'month': 1, 'day': 2, 'tornado': True},
         {'month': 1, 'day': 3, 'tornado': True},
         {'month': 2, 'day': 1, 'tornado': True}]))
     results = bigquery_tornadoes.count_tornadoes(rows)
     df.assert_that(
         results,
         df.equal_to([{'month': 1, 'tornado_count': 2},
                      {'month': 2, 'tornado_count': 1}]))
     p.run()
    def _run_write_test(self,
                        data,
                        return_init_result=True,
                        return_write_results=True):
        write_to_test_sink = WriteToTestSink(return_init_result,
                                             return_write_results)
        p = Pipeline(options=PipelineOptions([]))
        result = p | df.Create('start', data) | write_to_test_sink

        assert_that(result, is_empty())
        p.run()

        sink = write_to_test_sink.last_sink
        self.assertIsNotNone(sink)

        self.assertEqual(sink.state, _TestSink.STATE_FINALIZED)
        if data:
            self.assertIsNotNone(sink.last_writer)
            self.assertEqual(sink.last_writer.state, _TestWriter.STATE_CLOSED)
            self.assertEqual(sink.last_writer.write_output, data)
            if return_init_result:
                self.assertEqual(sink.last_writer.init_result,
                                 _TestSink.TEST_INIT_RESULT)
                self.assertEqual(sink.init_result_at_finalize,
                                 _TestSink.TEST_INIT_RESULT)
            self.assertIsNotNone(sink.last_writer.uid)
            if return_write_results:
                self.assertEqual(sink.write_results_at_finalize,
                                 [_TestWriter.TEST_WRITE_RESULT])
        else:
            self.assertIsNone(sink.last_writer)
Example #6
    def test_deferred_side_inputs(self):
        @typehints.with_input_types(str, int)
        def repeat(s, times):
            return s * times

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', [3])
        result = main_input | df.Map(repeat, pvalue.AsSingleton(side_input))
        assert_that(result, equal_to(['aaa', 'bbbbbb', 'ccc']))
        p.run()

        bad_side_input = p | df.Create('bad_side', ['z'])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('again', repeat,
                                pvalue.AsSingleton(bad_side_input))
    def test_compute_top_sessions(self):
        p = df.Pipeline('DirectPipelineRunner')
        edits = p | df.Create('create', self.EDITS)
        result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

        df.assert_that(result, df.equal_to(self.EXPECTED))
        p.run()
Example #8
    def test_aggregation(self):

        mean = combiners.MeanCombineFn()
        mean.__name__ = 'mean'
        counter_types = [
            (sum, int, 6),
            (min, int, 0),
            (max, int, 3),
            (mean, int, 1),
            (sum, float, 6.0),
            (min, float, 0.0),
            (max, float, 3.0),
            (mean, float, 1.5),
            (any, int, True),
            (all, float, False),
        ]
        aggregators = [
            Aggregator('%s_%s' % (f.__name__, t.__name__), f, t)
            for f, t, _ in counter_types
        ]

        class UpdateAggregators(df.DoFn):
            def process(self, context):
                for a in aggregators:
                    context.aggregate_to(a, context.element)

        p = df.Pipeline('DirectPipelineRunner')
        p | df.Create([0, 1, 2, 3]) | df.ParDo(UpdateAggregators())
        res = p.run()
        for (_, _, expected), a in zip(counter_types, aggregators):
            actual = res.aggregated_values(a).values()[0]
            self.assertEqual(expected, actual)
            self.assertEqual(type(expected), type(actual))
Example #9
 def test_tfidf_transform(self):
     p = df.Pipeline('DirectPipelineRunner')
     uri_to_line = p | df.Create('create sample', [('1.txt', 'abc def ghi'),
                                                   ('2.txt', 'abc def'),
                                                   ('3.txt', 'abc')])
     result = (uri_to_line
               | tfidf.TfIdf()
               | df.Map('flatten', lambda (word, (uri, tfidf)):
                        (word, uri, tfidf)))
Example #10
 def test_empty_write(self):
     temp_path = tempfile.NamedTemporaryFile().name
     sink = MyFileSink(temp_path,
                       file_name_suffix='.foo',
                       coder=coders.ToStringCoder())
     p = df.Pipeline('DirectPipelineRunner')
     p | df.Create([]) | df.io.Write(sink)  # pylint: disable=expression-not-assigned
     p.run()
     self.assertEqual(
         open(temp_path + '-00000-of-00001.foo').read(), '[start][end]')
Example #11
def assert_that(pcoll, matcher):
    """Asserts that the give PCollection satisfies the constraints of the matcher
  in a way that is runnable locally or on a remote service.
  """
    singleton = pcoll.pipeline | df.Create('create_singleton', [None])

    def check_matcher(_, side_value):
        assert matcher(side_value)
        return []

    singleton | df.FlatMap(check_matcher, AsIter(pcoll))
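A typical call site, assuming the same equal_to matcher used in the tests above (a sketch, not part of the original module):

    output = p | df.Create('data', [1, 2, 3]) | df.Map(lambda x: x * 10)
    assert_that(output, equal_to([10, 20, 30]))
    p.run()  # the matcher only executes when the pipeline runs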
Example #12
def generate_julia_set_colors(pipeline, c, n, max_iterations):
    """Compute julia set coordinates for each point in our set."""
    def point_set(n):
        for x in range(n):
            for y in range(n):
                yield (x, y)

    julia_set_colors = (pipeline
                        | df.Create('add points', point_set(n))
                        | df.Map(get_julia_set_point_color, c, n,
                                 max_iterations))

    return julia_set_colors
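A call site might look like the sketch below; the complex constant, grid size, and iteration cap are illustrative values rather than the workflow's actual defaults:

    p = df.Pipeline('DirectPipelineRunner')
    colors = generate_julia_set_colors(p, c=complex(-0.62772, 0.42193),
                                       n=100, max_iterations=100)
    p.run()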
Example #13
    def test_create_groups(self):
        p = df.Pipeline('DirectPipelineRunner')

        group_ids_pcoll = p | df.Create('create_group_ids', ['A', 'B', 'C'])
        corpus_pcoll = p | df.Create(
            'create_corpus',
            [{'f': 'corpus1'}, {'f': 'corpus2'}, {'f': 'corpus3'}])
        words_pcoll = p | df.Create(
            'create_words',
            [{'f': 'word1'}, {'f': 'word2'}, {'f': 'word3'}])
        ignore_corpus_pcoll = p | df.Create('create_ignore_corpus',
                                            ['corpus1'])
        ignore_word_pcoll = p | df.Create('create_ignore_word', ['word1'])

        groups = bigquery_side_input.create_groups(group_ids_pcoll,
                                                   corpus_pcoll, words_pcoll,
                                                   ignore_corpus_pcoll,
                                                   ignore_word_pcoll)

        def group_matcher(actual):
            self.assertEqual(len(actual), 3)
            for group in actual:
                self.assertEqual(len(group), 3)
                self.assertTrue(group[1].startswith('corpus'))
                self.assertNotEqual(group[1], 'corpus1')
                self.assertTrue(group[2].startswith('word'))
                self.assertNotEqual(group[2], 'word1')

        df.assert_that(groups, group_matcher)
        p.run()
Example #14
    def test_combine_per_key_with_callable(self):
        """CombinePerKey using a standard callable reducing iterables.

    A common case for Dataflow combiners is to sum (or max or min) over the
    values of each key. Such standard functions can be used directly as combiner
    functions. In fact, any function "reducing" an iterable to a single value
    can be used.
    """
        result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
                  | df.Create(CombinersTest.SAMPLE_DATA)
                  | df.CombinePerKey(sum))

        df.assert_that(result, df.equal_to([('a', 6), ('b', 30), ('c', 100)]))
        result.pipeline.run()
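Any other callable that reduces an iterable to a single value works the same way, for instance max per key. The expected output below assumes SAMPLE_DATA holds key/value pairs equivalent to [('a', 1), ('a', 2), ('a', 3), ('b', 10), ('b', 20), ('c', 100)], which is consistent with the sums and products asserted in these tests:

    result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
              | df.Create(CombinersTest.SAMPLE_DATA)
              | df.CombinePerKey(max))

    df.assert_that(result, df.equal_to([('a', 3), ('b', 20), ('c', 100)]))
    result.pipeline.run()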
Example #16
    def test_combine_per_key_with_custom_callable(self):
        """CombinePerKey using a custom function reducing iterables."""
        def multiply(values):
            result = 1
            for v in values:
                result *= v
            return result

        result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
                  | df.Create(CombinersTest.SAMPLE_DATA)
                  | df.CombinePerKey(multiply))

        df.assert_that(result, df.equal_to([('a', 6), ('b', 200), ('c', 100)]))
        result.pipeline.run()
Example #17
def run(argv=None):
    """Run the workflow."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--output')
    parser.add_argument('--ignore_corpus', default='')
    parser.add_argument('--ignore_word', default='')
    parser.add_argument('--num_groups')

    known_args, pipeline_args = parser.parse_known_args(argv)
    p = df.Pipeline(argv=pipeline_args)

    group_ids = []
    for i in xrange(0, int(known_args.num_groups)):
        group_ids.append('id' + str(i))

    query_corpus = 'select UNIQUE(corpus) from publicdata:samples.shakespeare'
    query_word = 'select UNIQUE(word) from publicdata:samples.shakespeare'
    ignore_corpus = known_args.ignore_corpus
    ignore_word = known_args.ignore_word

    pcoll_corpus = p | df.Read('read corpus',
                               df.io.BigQuerySource(query=query_corpus))
    pcoll_word = p | df.Read('read words',
                             df.io.BigQuerySource(query=query_word))
    pcoll_ignore_corpus = p | df.Create('create_ignore_corpus',
                                        [ignore_corpus])
    pcoll_ignore_word = p | df.Create('create_ignore_word', [ignore_word])
    pcoll_group_ids = p | df.Create('create groups', group_ids)

    pcoll_groups = create_groups(pcoll_group_ids, pcoll_corpus, pcoll_word,
                                 pcoll_ignore_corpus, pcoll_ignore_word)

    # pylint:disable=expression-not-assigned
    pcoll_groups | df.io.Write('WriteToText',
                               df.io.TextFileSink(known_args.output))
    p.run()
Example #18
    def test_fixed_shard_write(self):
        temp_path = tempfile.NamedTemporaryFile().name
        sink = MyFileSink(temp_path,
                          file_name_suffix='.foo',
                          num_shards=3,
                          shard_name_template='_NN_SSS_',
                          coder=coders.ToStringCoder())
        p = df.Pipeline('DirectPipelineRunner')
        p | df.Create(['a', 'b']) | df.io.Write(sink)  # pylint: disable=expression-not-assigned

        p.run()

        concat = ''.join(
            open(temp_path + '_03_%03d_.foo' % shard_num).read()
            for shard_num in range(3))
        self.assertTrue('][a][' in concat, concat)
        self.assertTrue('][b][' in concat, concat)
Example #19
 def test_top_prefixes(self):
     p = df.Pipeline('DirectPipelineRunner')
     words = p | df.Create('create', self.WORDS)
     result = words | autocomplete.TopPerPrefix('test', 5)
     # values must be hashable for now
     result = result | df.Map(lambda (k, vs): (k, tuple(vs)))
     assert_that(
         result,
         contains_in_any_order([
             ('t', ((3, 'to'), (2, 'this'), (1, 'that'))),
             ('to', ((3, 'to'), )),
             ('th', ((2, 'this'), (1, 'that'))),
             ('thi', ((2, 'this'), )),
             ('this', ((2, 'this'), )),
             ('tha', ((1, 'that'), )),
             ('that', ((1, 'that'), )),
         ]))
     p.run()
Example #20
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)
    # A hundred work items of a hundred thousand tries each.
    (p  # pylint: disable=expression-not-assigned
     | df.Create('Initialize', [100000] * 100).with_output_types(int)
     | df.Map('Run trials', run_trials)
     | df.CombineGlobally('Sum', combine_results).without_defaults()
     | df.io.Write('Write',
                   df.io.TextFileSink(known_args.output, coder=JsonCoder())))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
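run_trials and combine_results are defined elsewhere in this workflow. A plausible sketch of their shape for a Monte Carlo pi estimate (an assumption for illustration, not the original code):

    import random

    def run_trials(num_trials):
        """Returns (trials, hits) for random points inside the unit quarter-circle."""
        hits = 0
        for _ in xrange(num_trials):
            x, y = random.random(), random.random()
            if x * x + y * y <= 1.0:
                hits += 1
        return num_trials, hits

    def combine_results(results):
        """Sums the per-bundle counts and derives the pi estimate."""
        total, hits = [sum(col) for col in zip(*results)]
        return total, hits, 4.0 * hits / total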
Example #21
 def test_after_count(self):
     p = Pipeline('DirectPipelineRunner')
     result = (p
               | df.Create([1, 2, 3, 4, 5, 10, 11])
               | df.FlatMap(lambda t: [('A', t), ('B', t + 5)])
               | df.Map(lambda (k, t): TimestampedValue((k, t), t))
               | df.WindowInto(
                   FixedWindows(10),
                   trigger=AfterCount(3),
                   accumulation_mode=AccumulationMode.DISCARDING)
               | df.GroupByKey()
               | df.Map(lambda (k, v): ('%s-%s' % (k, len(v)), set(v))))
     assert_that(
         result,
         equal_to({
             'A-5': {1, 2, 3, 4, 5},
             # A-10, A-11 never emitted due to AfterCount(3) never firing.
             'B-4': {6, 7, 8, 9},
             'B-3': {10, 15, 16},
         }.iteritems()))
Example #22
    def test_pardo_side_input(self):
        p = df.Pipeline('DirectPipelineRunner')
        words = p | df.Create('start', ['a', 'bb', 'ccc', 'dddd'])

        # [START model_pardo_side_input]
        # Callable takes additional arguments.
        def filter_using_length(word, lower_bound, upper_bound=float('inf')):
            if lower_bound <= len(word) <= upper_bound:
                yield word

        # Construct a deferred side input.
        avg_word_len = words | df.Map(len) | df.CombineGlobally(
            df.combiners.MeanCombineFn())

        # Call with explicit side inputs.
        small_words = words | df.FlatMap('small', filter_using_length, 0, 3)

        # A single deferred side input.
        larger_than_average = words | df.FlatMap(
            'large',
            filter_using_length,
            lower_bound=pvalue.AsSingleton(avg_word_len))

        # Mix and match.
        small_but_nontrivial = words | df.FlatMap(
            filter_using_length,
            lower_bound=2,
            upper_bound=pvalue.AsSingleton(avg_word_len))
        # [END model_pardo_side_input]

        df.assert_that(small_words, df.equal_to(['a', 'bb', 'ccc']))
        df.assert_that(larger_than_average,
                       df.equal_to(['ccc', 'dddd']),
                       label='larger_than_average')
        df.assert_that(small_but_nontrivial,
                       df.equal_to(['bb']),
                       label='small_but_not_trivial')
        p.run()
def run(argv=None):
    """Run the workflow."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--output',
        required=True,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    from google.cloud.dataflow.internal.clients import bigquery  # pylint: disable=g-import-not-at-top

    table_schema = bigquery.TableSchema()

    # Fields that use standard types.
    kind_schema = bigquery.TableFieldSchema()
    kind_schema.name = 'kind'
    kind_schema.type = 'string'
    kind_schema.mode = 'nullable'
    table_schema.fields.append(kind_schema)

    full_name_schema = bigquery.TableFieldSchema()
    full_name_schema.name = 'fullName'
    full_name_schema.type = 'string'
    full_name_schema.mode = 'required'
    table_schema.fields.append(full_name_schema)

    age_schema = bigquery.TableFieldSchema()
    age_schema.name = 'age'
    age_schema.type = 'integer'
    age_schema.mode = 'nullable'
    table_schema.fields.append(age_schema)

    gender_schema = bigquery.TableFieldSchema()
    gender_schema.name = 'gender'
    gender_schema.type = 'string'
    gender_schema.mode = 'nullable'
    table_schema.fields.append(gender_schema)

    # A nested field
    phone_number_schema = bigquery.TableFieldSchema()
    phone_number_schema.name = 'phoneNumber'
    phone_number_schema.type = 'record'
    phone_number_schema.mode = 'nullable'

    area_code = bigquery.TableFieldSchema()
    area_code.name = 'areaCode'
    area_code.type = 'integer'
    area_code.mode = 'nullable'
    phone_number_schema.fields.append(area_code)

    number = bigquery.TableFieldSchema()
    number.name = 'number'
    number.type = 'integer'
    number.mode = 'nullable'
    phone_number_schema.fields.append(number)
    table_schema.fields.append(phone_number_schema)

    # A repeated field.
    children_schema = bigquery.TableFieldSchema()
    children_schema.name = 'children'
    children_schema.type = 'string'
    children_schema.mode = 'repeated'
    table_schema.fields.append(children_schema)

    def create_random_record(record_id):
        return {
            'kind': 'kind' + record_id,
            'fullName': 'fullName' + record_id,
            'age': int(record_id) * 10,
            'gender': 'male',
            'phoneNumber': {
                'areaCode': int(record_id) * 100,
                'number': int(record_id) * 100000
            },
            'children': [
                'child' + record_id + '1', 'child' + record_id + '2',
                'child' + record_id + '3'
            ]
        }

    # pylint: disable=expression-not-assigned
    record_ids = p | df.Create('CreateIDs', ['1', '2', '3', '4', '5'])
    records = record_ids | df.Map('CreateRecords', create_random_record)
    records | df.io.Write(
        'write',
        df.io.BigQuerySink(
            known_args.output,
            schema=table_schema,
            create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE))

    # Run the pipeline (all operations are deferred until run() is called).
    p.run()
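For reference, the record the helper above builds for id '1' is fully determined by the code:

    assert create_random_record('1') == {
        'kind': 'kind1',
        'fullName': 'fullName1',
        'age': 10,
        'gender': 'male',
        'phoneNumber': {'areaCode': 100, 'number': 100000},
        'children': ['child11', 'child12', 'child13']
    }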
Example #24
  def _get_result_for_month(self, month):
    p = df.Pipeline('DirectPipelineRunner')
    rows = (p | df.Create('create', self.input_data))

    results = filters.filter_cold_days(rows, month)
    return results
Example #25
 def test_runtime_checks_off(self):
     p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
     # [START type_hints_runtime_off]
     p | df.Create(['a']) | df.Map(lambda x: 3).with_output_types(str)
     p.run()