Example #1
  def test_pcollectionview_not_recreated(self):
    pipeline = Pipeline('DirectPipelineRunner')
    value = pipeline | Create('create1', [1, 2, 3])
    value2 = pipeline | Create('create2', [(1, 1), (2, 2), (3, 3)])
    # Requesting the same view of the same PCollection twice yields equal
    # objects; a different label or default_value produces a distinct view.
    self.assertEqual(AsSingleton(value), AsSingleton(value))
    self.assertEqual(AsSingleton('new', value, default_value=1),
                     AsSingleton('new', value, default_value=1))
    self.assertNotEqual(AsSingleton(value),
                        AsSingleton('new', value, default_value=1))
    self.assertEqual(AsIter(value), AsIter(value))
    self.assertEqual(AsList(value), AsList(value))
    self.assertEqual(AsDict(value2), AsDict(value2))
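For context, here is a minimal end-to-end sketch of how each of these view types is consumed as a side input. It assumes the same legacy google.cloud.dataflow SDK used throughout these examples; every label and value is an illustrative assumption.

# A minimal sketch, assuming the legacy google.cloud.dataflow SDK and its
# DirectPipelineRunner; all labels and values here are made up.
import google.cloud.dataflow as df
from google.cloud.dataflow.pvalue import AsDict, AsIter, AsList, AsSingleton

p = df.Pipeline('DirectPipelineRunner')
one = p | df.Create('one', [10])
nums = p | df.Create('nums', [1, 2, 3])
pairs = p | df.Create('pairs', [('a', 1), ('b', 2)])

# Each argument after the callable is delivered in the declared shape:
# a single value, an iterable, a list, or a dict.
result = nums | df.Map(
    'use_views',
    lambda n, single, it, lst, d: n + single + sum(it) + len(lst) + len(d),
    AsSingleton(one), AsIter(nums), AsList(nums), AsDict(pairs))
p.run()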
Example #2
def create_groups(group_ids, corpus, word, ignore_corpus, ignore_word):
    """Generate groups given the input PCollections."""
    def attach_corpus_fn(group, corpus, ignore):
        # Draw random corpus rows until one differs from the ignored value.
        # randrange's upper bound is exclusive, so len_corpus keeps every
        # row reachable.
        selected = None
        len_corpus = len(corpus)
        while not selected:
            # Each row is a single-entry dict; .values()[0] extracts its
            # value (Python 2 semantics, like the rest of this SDK).
            c = corpus[randrange(len_corpus)].values()[0]
            if c != ignore:
                selected = c

        yield (group, selected)

    def attach_word_fn(group, words, ignore):
        # Same sampling loop over the word rows. Indexing into `words`
        # relies on the runner materializing the AsIter side input as a
        # sequence.
        selected = None
        len_words = len(words)
        while not selected:
            c = words[randrange(len_words)].values()[0]
            if c != ignore:
                selected = c

        yield group + (selected,)

    # The corpus is passed as a list (it is indexed randomly), each ignore
    # value arrives as a singleton, and the words as an iterable.
    return (group_ids
            | df.FlatMap('attach corpus', attach_corpus_fn, AsList(corpus),
                         AsSingleton(ignore_corpus))
            | df.FlatMap('attach word', attach_word_fn, AsIter(word),
                         AsSingleton(ignore_word)))
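A sketch of a driver for create_groups follows, loosely modeled on the BigQuery side-input example this function appears in; the in-memory rows, labels, and the randrange import are assumptions for illustration.

# A driver sketch; the rows stand in for BigQuery results and are
# assumptions, not data from the original example.
from random import randrange  # needed by attach_corpus_fn / attach_word_fn
import google.cloud.dataflow as df
from google.cloud.dataflow.pvalue import AsIter, AsList, AsSingleton

p = df.Pipeline('DirectPipelineRunner')
group_ids = p | df.Create('create_group_ids', ['id0', 'id1'])
corpus = p | df.Create('create_corpus',
                       [{'corpus': 'hamlet'}, {'corpus': 'kinglear'}])
word = p | df.Create('create_word',
                     [{'word': 'brave'}, {'word': 'sword'}])
ignore_corpus = p | df.Create('create_ignore_corpus', ['hamlet'])
ignore_word = p | df.Create('create_ignore_word', ['brave'])

# Yields tuples such as ('id0', 'kinglear', 'sword').
groups = create_groups(group_ids, corpus, word, ignore_corpus, ignore_word)
p.run()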
Example #3
  def test_cached_pvalues_are_refcounted(self):
    """Test that cached PValues are refcounted and deleted.

    The intermediary PValues computed by the workflow below contain
    one million elements so if the refcounting does not work the number of
    objects tracked by the garbage collector will increase by a few millions
    by the time we execute the final Map checking the objects tracked.
    Anything that is much larger than what we started with will fail the test.
    """
    def check_memory(value, count_threshold):
      gc.collect()
      objects_count = len(gc.get_objects())
      if objects_count > count_threshold:
        raise RuntimeError(
            'PValues are not refcounted: %s, %s' % (
                objects_count, count_threshold))
      return value

    def create_dupes(o, _):
      yield o
      yield SideOutputValue('side', o)

    pipeline = Pipeline('DirectPipelineRunner')

    gc.collect()
    count_threshold = len(gc.get_objects()) + 10000
    biglist = pipeline | Create('oom:create', ['x'] * 1000000)
    dupes = (
        biglist
        | Map('oom:addone', lambda x: (x, 1))
        | FlatMap('oom:dupes', create_dupes,
                  AsIter(biglist)).with_outputs('side', main='main'))
    result = (
        (dupes.side, dupes.main, dupes.side)
        | Flatten('oom:flatten')
        | CombinePerKey('oom:combine', sum)
        | Map('oom:check', check_memory, count_threshold))

    assert_that(result, equal_to([('x', 3000000)]))
    pipeline.run()
    self.assertEqual(
        pipeline.runner.debug_counters['element_counts'],
        {
            'oom:flatten': 3000000,
            ('oom:combine/GroupByKey/reify_windows', None): 3000000,
            ('oom:dupes/oom:dupes', 'side'): 1000000,
            ('oom:dupes/oom:dupes', None): 1000000,
            'oom:create': 1000000,
            ('oom:addone', None): 1000000,
            'oom:combine/GroupByKey/group_by_key': 1,
            ('oom:check', None): 1,
            'assert_that/singleton': 1,
            ('assert_that/Map(match)', None): 1,
            ('oom:combine/GroupByKey/group_by_window', None): 1,
            ('oom:combine/Combine/ParDo(CombineValuesDoFn)', None): 1})
Example #4
def assert_that(pcoll, matcher):
    """Asserts that the given PCollection satisfies the matcher's constraints
    in a way that is runnable locally or on a remote service.
    """
    singleton = pcoll.pipeline | df.Create('create_singleton', [None])

    def check_matcher(_, side_value):
        # The whole PCollection arrives as an iterable side input, so the
        # matcher sees every element at once.
        assert matcher(side_value)
        return []

    singleton | df.FlatMap(check_matcher, AsIter(pcoll))
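A usage sketch, with the pipeline contents assumed for illustration:

# A usage sketch; labels and data are illustrative assumptions.
import google.cloud.dataflow as df

p = df.Pipeline('DirectPipelineRunner')
squares = (p | df.Create('create', [1, 2, 3])
             | df.Map('square', lambda x: x * x))
assert_that(squares, lambda actual: sorted(actual) == [1, 4, 9])
p.run()  # the assert inside check_matcher runs during execution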
Example #5
def apply(self, pcoll):
    # Three-phase write: initialize the sink once, write bundles in
    # parallel, then finalize with all bundle results as a side input.
    sink_coll = pcoll.pipeline | core.Create('create_sink_collection',
                                             [self.sink])
    init_result_coll = sink_coll | core.Map(
        'initialize_write', lambda sink: sink.initialize_write())
    write_result_coll = pcoll | core.ParDo(
        'write_bundles', _WriteBundleDoFn(), AsSingleton(sink_coll),
        AsSingleton(init_result_coll))
    return sink_coll | core.FlatMap(
        'finalize_write',
        lambda sink, init_result, write_results:
            sink.finalize_write(init_result, write_results),
        AsSingleton(init_result_coll), AsIter(write_result_coll))
Example #6
def apply(self, pcoll):
    sink_coll = pcoll.pipeline | core.Create('create_sink_collection',
                                             [self.sink])
    init_result_coll = sink_coll | core.Map(
        'initialize_write', lambda sink: sink.initialize_write())
    write_result_coll = pcoll | core.ParDo(
        'write_bundles', _WriteBundleDoFn(), AsSingleton(sink_coll),
        AsSingleton(init_result_coll))
    # As in Example #5, but each finalize result is stamped with
    # MAX_TIMESTAMP; `or ()` guards against finalize_write returning None.
    return sink_coll | core.FlatMap(
        'finalize_write',
        lambda sink, init_result, write_results: (
            window.TimestampedValue(v, window.MAX_TIMESTAMP)
            for v in sink.finalize_write(init_result, write_results) or ()),
        AsSingleton(init_result_coll), AsIter(write_result_coll))
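Examples #5 and #6 are two revisions of the same Write apply: the pipeline shape is identical, but #6 wraps every value returned by finalize_write in window.TimestampedValue(v, window.MAX_TIMESTAMP) and tolerates a None return via `or ()`, so the finalize outputs carry the latest possible timestamp instead of the default.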