Example #1
def create_groups(group_ids, corpus, word, ignore_corpus, ignore_word):
    """Generate groups given the input PCollections."""
    def attach_corpus_fn(group, corpus, ignore):
        selected = None
        len_corpus = len(corpus)
        while not selected:
            # Retry until we draw a corpus entry other than the ignored one;
            # randrange(len_corpus) can return any valid index.
            c = corpus[randrange(len_corpus)].values()[0]
            if c != ignore:
                selected = c

        yield (group, selected)

    def attach_word_fn(group, words, ignore):
        selected = None
        len_words = len(words)
        while not selected:
            # Same retry loop, drawing a word instead of a corpus entry.
            c = words[randrange(len_words)].values()[0]
            if c != ignore:
                selected = c

        yield group + (selected, )

    return (group_ids
            | df.FlatMap('attach corpus', attach_corpus_fn, AsList(corpus),
                         AsSingleton(ignore_corpus))
            | df.FlatMap('attach word', attach_word_fn, AsIter(word),
                         AsSingleton(ignore_word)))
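For orientation, here is a minimal sketch of how create_groups might be driven. The Pipeline/Create spellings and the pvalue import path below follow the pre-Beam google.cloud.dataflow SDK used throughout these examples, and the in-memory inputs are illustrative stand-ins for the BigQuery reads of the original workflow:

from random import randrange

import google.cloud.dataflow as df
from google.cloud.dataflow.pvalue import AsIter, AsList, AsSingleton

p = df.Pipeline('DirectPipelineRunner')
group_ids = p | df.Create('groups', ['A', 'B'])
corpus = p | df.Create('corpus', [{'f': 'king_lear'}, {'f': 'hamlet'}])
word = p | df.Create('word', [{'f': 'brave'}, {'f': 'night'}])
ignore_corpus = p | df.Create('ignore corpus', ['hamlet'])
ignore_word = p | df.Create('ignore word', ['night'])

# groups: a PCollection of (group_id, corpus, word) tuples.
groups = create_groups(group_ids, corpus, word, ignore_corpus, ignore_word)
p.run()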
Example #2
    def apply(self, pcoll):
        sink_coll = pcoll.pipeline | core.Create(
            'create_sink_collection', [self.sink])
        init_result_coll = sink_coll | core.Map(
            'initialize_write', lambda sink: sink.initialize_write())
        write_result_coll = pcoll | core.ParDo(
            'write_bundles', _WriteBundleDoFn(), AsSingleton(sink_coll),
            AsSingleton(init_result_coll))
        return sink_coll | core.FlatMap(
            'finalize_write',
            lambda sink, init_result, write_results:
                sink.finalize_write(init_result, write_results),
            AsSingleton(init_result_coll), AsIter(write_result_coll))
Example #3
    def apply(self, pcoll):
        sink_coll = pcoll.pipeline | core.Create(
            'create_sink_collection', [self.sink])
        init_result_coll = sink_coll | core.Map(
            'initialize_write', lambda sink: sink.initialize_write())
        write_result_coll = pcoll | core.ParDo(
            'write_bundles', _WriteBundleDoFn(), AsSingleton(sink_coll),
            AsSingleton(init_result_coll))
        return sink_coll | core.FlatMap(
            'finalize_write',
            lambda sink, init_result, write_results: (
                window.TimestampedValue(v, window.MAX_TIMESTAMP)
                for v in sink.finalize_write(init_result, write_results) or ()),
            AsSingleton(init_result_coll), AsIter(write_result_coll))
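Examples #2 and #3 differ only in that #3 stamps each finalize result with MAX_TIMESTAMP. Both assume the sink object implements a three-phase write protocol; below is a hypothetical sketch of just the two methods these excerpts call directly (the SDK's real Sink contract also covers per-bundle writing, which _WriteBundleDoFn drives):

class _ToySink(object):
    """Hypothetical sink following the initialize/write/finalize protocol."""

    def initialize_write(self):
        # Runs once before any bundle is written; the return value is
        # handed to every bundle and to finalize_write.
        return '/tmp/write-attempt'  # illustrative init result

    def finalize_write(self, init_result, write_results):
        # Runs once after all bundles; write_results holds one result per
        # bundle. May return None, which Example #3 guards with `or ()`.
        for result in write_results:
            print 'committing %s under %s' % (result, init_result)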
Example #4
    def test_as_singleton_with_different_defaults_without_unique_labels(self):
        # This should fail because AsSingleton with distinct default values
        # creates distinct PCollectionViews that share the same full_label.
        a_list = [2]
        pipeline = Pipeline('DirectPipelineRunner')
        main_input = pipeline | Create('main input', [1])
        side_list = pipeline | Create('side list', a_list)

        with self.assertRaises(RuntimeError) as e:
            _ = main_input | FlatMap('test', lambda x, s1, s2: [[x, s1, s2]],
                                     AsSingleton(side_list),
                                     AsSingleton(side_list, default_value=3))
        self.assertTrue(
            e.exception.message.startswith(
                'Transform "ViewAsSingleton(side list.None)" does not have a '
                'stable unique label.'))
Example #5
    def test_multi_valued_singleton_side_input(self):
        pipeline = Pipeline('DirectPipelineRunner')
        pcol = pipeline | Create('start', [1, 2])
        side = pipeline | Create('side', [3, 4])  # 2 values in side input.
        pcol | FlatMap('compute', lambda x, s: [x * s], AsSingleton(side))
        with self.assertRaises(ValueError) as e:
            pipeline.run()
Example #6
    def test_default_value_singleton_side_input(self):
        pipeline = Pipeline('DirectPipelineRunner')
        pcol = pipeline | Create('start', [1, 2])
        side = pipeline | Create('side', [])  # 0 values in side input.
        result = (pcol | FlatMap('compute', lambda x, s: [x * s],
                                 AsSingleton(side, 10)))
        assert_that(result, equal_to([10, 20]))
        pipeline.run()
Example #7
    def test_par_do_with_side_input_as_arg(self):
        pipeline = Pipeline('DirectPipelineRunner')
        words_list = ['aa', 'bb', 'cc']
        words = pipeline | Create('SomeWords', words_list)
        prefix = pipeline | Create('SomeString', ['xyz'])  # side in
        suffix = 'zyx'
        result = words | FlatMap(
            'DecorateWords', lambda x, pfx, sfx: ['%s-%s-%s' % (pfx, x, sfx)],
            AsSingleton(prefix), suffix)
        assert_that(result, equal_to(['xyz-%s-zyx' % x for x in words_list]))
        pipeline.run()
Example #8
    def test_as_singleton_with_different_defaults_with_unique_labels(self):
        a_list = []
        pipeline = Pipeline('DirectPipelineRunner')
        main_input = pipeline | Create('main input', [1])
        side_list = pipeline | Create('side list', a_list)
        results = main_input | FlatMap(
            'test', lambda x, s1, s2: [[x, s1, s2]],
            AsSingleton('si1', side_list, default_value=2),
            AsSingleton('si2', side_list, default_value=3))

        def matcher(expected_elem, expected_singleton1, expected_singleton2):
            def match(actual):
                [[actual_elem, actual_singleton1, actual_singleton2]] = actual
                equal_to([expected_elem])([actual_elem])
                equal_to([expected_singleton1])([actual_singleton1])
                equal_to([expected_singleton2])([actual_singleton2])

            return match

        assert_that(results, matcher(1, 2, 3))
        pipeline.run()
Example #9
    def test_empty_singleton_side_input(self):
        pipeline = Pipeline('DirectPipelineRunner')
        pcol = pipeline | Create('start', [1, 2])
        side = pipeline | Create('side', [])  # Empty side input.

        def my_fn(k, s):
            v = ('empty' if isinstance(s, EmptySideInput) else 'full')
            return [(k, v)]

        result = pcol | FlatMap('compute', my_fn, AsSingleton(side))
        assert_that(result, equal_to([(1, 'empty'), (2, 'empty')]))
        pipeline.run()
Example #10
    def test_as_singleton_without_unique_labels(self):
        # This should succeed because calling AsSingleton on the same
        # PCollection twice with the same defaults returns the same
        # PCollectionView.
        a_list = [2]
        pipeline = Pipeline('DirectPipelineRunner')
        main_input = pipeline | Create('main input', [1])
        side_list = pipeline | Create('side list', a_list)
        results = main_input | FlatMap('test', lambda x, s1, s2: [[x, s1, s2]],
                                       AsSingleton(side_list),
                                       AsSingleton(side_list))

        def matcher(expected_elem, expected_singleton):
            def match(actual):
                [[actual_elem, actual_singleton1, actual_singleton2]] = actual
                equal_to([expected_elem])([actual_elem])
                equal_to([expected_singleton])([actual_singleton1])
                equal_to([expected_singleton])([actual_singleton2])

            return match

        assert_that(results, matcher(1, 2))
        pipeline.run()
Example #11
    def test_pcollectionview_not_recreated(self):
        pipeline = Pipeline('DirectPipelineRunner')
        value = pipeline | Create('create1', [1, 2, 3])
        value2 = pipeline | Create('create2', [(1, 1), (2, 2), (3, 3)])
        self.assertEqual(AsSingleton(value), AsSingleton(value))
        self.assertEqual(AsSingleton('new', value, default_value=1),
                         AsSingleton('new', value, default_value=1))
        self.assertNotEqual(AsSingleton(value),
                            AsSingleton('new', value, default_value=1))
        self.assertEqual(AsIter(value), AsIter(value))
        self.assertEqual(AsList(value), AsList(value))
        self.assertEqual(AsDict(value2), AsDict(value2))

        self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
        self.assertNotEqual(AsIter(value), AsIter(value2))
        self.assertNotEqual(AsList(value), AsList(value2))
        self.assertNotEqual(AsDict(value), AsDict(value2))
Example #12
    def test_par_do_with_do_fn_object(self):
        class SomeDoFn(DoFn):
            """A custom DoFn for a FlatMap transform."""
            def process(self, context, prefix, suffix):
                return ['%s-%s-%s' % (prefix, context.element, suffix)]

        pipeline = Pipeline('DirectPipelineRunner')
        words_list = ['aa', 'bb', 'cc']
        words = pipeline | Create('SomeWords', words_list)
        prefix = 'zyx'
        suffix = pipeline | Create('SomeString', ['xyz'])  # side in
        result = words | ParDo('DecorateWordsDoFn',
                               SomeDoFn(),
                               prefix,
                               suffix=AsSingleton(suffix))
        assert_that(result, equal_to(['zyx-%s-xyz' % x for x in words_list]))
        pipeline.run()
Example #13
def filter_cold_days(input_data, month_filter):
    """Workflow computing rows in a specific month with low temperatures.

    Args:
      input_data: a PCollection of dictionaries representing table rows. Each
        dictionary must have the keys 'year', 'month', 'day', and 'mean_temp'.
      month_filter: an int representing the month for which colder-than-average
        days should be returned.

    Returns:
      A PCollection of dictionaries with the same keys described above. Each
      row represents a day in the specified month whose mean temperature was
      colder than the global mean temperature of the entire dataset.
    """

    # Project to only the desired fields from a complete input row.
    # E.g., SELECT f1, f2, f3, ... FROM InputTable.
    projection_fields = ['year', 'month', 'day', 'mean_temp']
    fields_of_interest = (
        input_data
        | df.Map('projected',
                 lambda row: {f: row[f]
                              for f in projection_fields}))

    # Compute the global mean temperature.
    global_mean = AsSingleton(
        fields_of_interest
        | df.Map('extract mean', lambda row: row['mean_temp'])
        | df.combiners.Mean.Globally('global mean'))

    # Filter to the rows representing days in the month of interest
    # in which the mean daily temperature is below the global mean.
    return (
        fields_of_interest
        | df.Filter('desired month', lambda row: row['month'] == month_filter)
        | df.Filter('below mean', lambda row, mean: row['mean_temp'] < mean,
                    global_mean))
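As a quick smoke test, the workflow can be exercised on in-memory rows. This is a sketch reusing the df alias from the example; the original workflow reads its rows from BigQuery:

p = df.Pipeline('DirectPipelineRunner')
rows = p | df.Create('input rows', [
    {'year': 2009, 'month': 7, 'day': 1, 'mean_temp': 52.0, 'station': 'a'},
    {'year': 2009, 'month': 7, 'day': 2, 'mean_temp': 71.5, 'station': 'b'},
    {'year': 2009, 'month': 8, 'day': 1, 'mean_temp': 40.0, 'station': 'c'},
])
cold_days = filter_cold_days(rows, 7)  # global mean is 54.5; only 07-01 passes
p.run()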
Example #14
        # Compute a mapping from each word to its document frequency.
        # A word's document frequency in a corpus is the number of
        # documents in which the word appears divided by the total
        # number of documents in the corpus.
        #
        # This calculation uses a side input: a Dataflow-computed auxiliary
        # value presented to each invocation of our MapFn lambda. The second
        # argument to the lambda (called total; note that we are unpacking
        # the first argument) receives the value we listed after the lambda
        # in Map(). Additional side inputs (and ordinary Python values, too)
        # can be provided to MapFns and DoFns in this way.
        word_to_df = (
            word_to_doc_count
            | df.Map(
                'compute doc frequencies',
                lambda (word, count), total: (word, float(count) / total),
                AsSingleton(total_documents)))

        # Join the term frequency and document frequency collections,
        # each keyed on the word.
        word_to_uri_and_tf_and_df = (
            {'tf': word_to_uri_and_tf, 'df': word_to_df}
            | df.CoGroupByKey('cogroup words by tf-df'))

        # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
        # There are a variety of definitions of TF-IDF
        # ("term frequency - inverse document frequency") score; here we use a
        # basic version that is the term frequency divided by the log of the
        # document frequency.
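The excerpt ends before the step this comment describes. Below is a sketch of what that step could look like under the same conventions; the name word_to_uri_and_tfidf and the exact weighting tf * log(1/df) are assumptions, since TF-IDF definitions vary:

        # math is assumed to be imported at module scope.
        word_to_uri_and_tfidf = (
            word_to_uri_and_tf_and_df
            | df.FlatMap(
                'compute tf-idf',
                lambda (word, tf_and_df): [
                    (word, (uri, tf * math.log(1 / df_value)))
                    for df_value in tf_and_df['df']
                    for (uri, tf) in tf_and_df['tf']]))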