def expand(self, pcoll):
    """Expand into initialize -> write-bundles -> finalize stages.

    A singleton 'seed' collection drives the one-time sink
    initialization and finalization calls; the bundle-writing stage sits
    in between, and its per-bundle results are fed to finalization as a
    side input.
    """
    seed = pcoll.pipeline | 'DoOnce' >> core.Create([None])
    init_result = seed | 'InitializeWrite' >> core.Map(
        lambda _, sink: sink.initialize_write(), self.sink)

    shard_count = getattr(self.sink, 'num_shards', 0)
    if shard_count:
        # A fixed shard count was requested: key each element onto a
        # shard, group, and write every keyed bundle as a unit.
        if shard_count == 1:
            keyed = pcoll | core.Map(lambda x: (None, x))
        else:
            keyed = pcoll | core.ParDo(_RoundRobinKeyFn(shard_count))
        write_results = (
            keyed
            | core.WindowInto(window.GlobalWindows())
            | core.GroupByKey()
            | 'WriteBundles' >> core.ParDo(
                _WriteKeyedBundleDoFn(self.sink), AsSingleton(init_result)))
    else:
        # Runner-chosen bundling: write bundles directly, then funnel the
        # per-bundle write results through a GroupByKey so finalization
        # observes them all together.
        shard_count = 1
        write_results = (
            pcoll
            | 'WriteBundles' >> core.ParDo(
                _WriteBundleDoFn(self.sink), AsSingleton(init_result))
            | 'Pair' >> core.Map(lambda x: (None, x))
            | core.WindowInto(window.GlobalWindows())
            | core.GroupByKey()
            | 'Extract' >> core.FlatMap(lambda x: x[1]))

    return seed | 'FinalizeWrite' >> core.FlatMap(
        _finalize_write, self.sink, AsSingleton(init_result),
        AsIter(write_results), shard_count)
def expand(self, pcoll):
    """Produce the top-n elements of pcoll under the configured comparator."""
    compare = self._compare
    can_use_bundle_algorithm = (
        not self._args and not self._kwargs and pcoll.windowing.is_default())

    if not can_use_bundle_algorithm:
        # Extra combiner args or non-default windowing: fall back to a
        # plain global combine.  Note `compare` is still the raw
        # self._compare here; TopCombineFn handles `reverse` itself.
        combine = core.CombineGlobally(
            TopCombineFn(self._n, compare, self._key, self._reverse),
            *self._args, **self._kwargs)
        if self.has_defaults:
            return pcoll | combine
        return pcoll | combine.without_defaults()

    if self._reverse:
        # Express 'reverse' by flipping the comparator once, up front.
        if compare is None or compare is operator.lt:
            compare = operator.gt
        else:
            forward = compare
            compare = lambda a, b: forward(b, a)

    # This is a more efficient global algorithm: keep a top-n per bundle,
    # then merge the per-bundle results.
    per_bundle = pcoll | core.ParDo(
        _TopPerBundle(self._n, compare, self._key))
    # If pcoll is empty, we can't guarantee that per_bundle won't be
    # empty, so inject at least one empty accumulator so that downstream
    # is guaranteed to produce non-empty output.
    empty_seed = pcoll.pipeline | core.Create([(None, [])])
    return ((per_bundle, empty_seed)
            | core.Flatten()
            | core.GroupByKey()
            | core.ParDo(_MergeTopPerBundle(self._n, compare, self._key)))
def expand(self, pcoll):
    """Run the main write, then the success-marker write.

    The success write is driven by a one-element collection and takes
    the main write's output as a side input, so the main write's results
    are available to it when it runs.
    """
    trigger = pcoll.pipeline | 'DoOnceSuccess' >> core.Create([None])
    main_result = pcoll | 'MainWrite' >> Write(self.sink)
    return trigger | 'SuccessWrite' >> core.FlatMap(
        self._success_write, pvalue.AsIter(main_result))
def expand(self, pcoll):
    """Produce the top-n elements of pcoll as a single global result."""
    if not pcoll.windowing.is_default():
        # Non-default windowing: defer to a plain global combine.
        combine = core.CombineGlobally(
            TopCombineFn(self._n, self._key, self._reverse))
        if self.has_defaults:
            return pcoll | combine
        return pcoll | combine.without_defaults()

    # This is a more efficient global algorithm: keep a top-n per bundle,
    # then merge the per-bundle results.
    per_bundle = pcoll | core.ParDo(
        _TopPerBundle(self._n, self._key, self._reverse))
    # If pcoll is empty, we can't guarantee that per_bundle won't be
    # empty, so inject at least one empty accumulator so that downstream
    # is guaranteed to produce non-empty output.
    empty_seed = pcoll.pipeline | core.Create([(None, [])])
    return ((per_bundle, empty_seed)
            | core.Flatten()
            | core.GroupByKey()
            | core.ParDo(
                _MergeTopPerBundle(self._n, self._key, self._reverse)))