def process(self, key_and_bundles):
    """Merge per-bundle top lists into one list of at most ``self._n`` items.

    Args:
        key_and_bundles: ``(key, bundles)`` pair; the key is ignored. Each
            bundle is expected to be sorted in ascending order -- both the
            early ``break`` below and the use of the first bundle as a
            ready-made heap rely on that.

    Yields:
        A single list with the merged elements, largest first.
    """
    _, bundles = key_and_bundles
    # Elements are wrapped only when a custom comparator or key is set.
    wrap = bool(self._less_than or self._key)
    heap = []
    for bundle in bundles:
        if not heap:
            # Seed the heap from the first non-empty bundle. Always build a
            # fresh list so the caller's bundle is never mutated and so that
            # non-list iterables (tuples, generators) are accepted.
            if wrap:
                heap = [
                    cy_combiners.ComparableValue(
                        element, self._less_than, self._key)
                    for element in bundle
                ]
            else:
                heap = list(bundle)
            continue
        # reversed() needs a sequence; materialise anything else.
        if not isinstance(bundle, list):
            bundle = list(bundle)
        for element in reversed(bundle):
            if wrap:
                element = cy_combiners.ComparableValue(
                    element, self._less_than, self._key)
            if len(heap) < self._n:
                heapq.heappush(heap, element)
            elif element < heap[0]:
                # Because _TopPerBundle returns sorted lists, all other
                # elements will also be smaller.
                break
            else:
                heapq.heappushpop(heap, element)

    heap.sort()
    if wrap:
        yield [wrapper.value for wrapper in reversed(heap)]
    else:
        yield heap[::-1]
def add_input(self, accumulator, element, *args, **kwargs):
    """Add ``element`` to the heap in ``accumulator``, keeping at most ``self._n``.

    Args:
        accumulator: ``(holds_comparables, heap)`` pair, where the flag says
            whether the heap entries are ``cy_combiners.ComparableValue``
            wrappers.
        element: the value to add.
        *args, **kwargs: extra arguments forwarded to ``self._compare``; they
            are only captured the first time, when ``self._less_than`` is
            still ``None``.

    Returns:
        The updated ``(holds_comparables, heap)`` pair.
    """
    # Resolve the comparator once and cache it on the instance, so the
    # variadic expansion of args / kwargs is only paid for when needed.
    if self._less_than is None:
        if not (args or kwargs):
            self._less_than = self._compare
        else:
            self._less_than = lambda a, b: self._compare(
                a, b, *args, **kwargs)

    wrapped, entries = accumulator
    if self._less_than is not operator.lt or self._key:
        # A custom ordering is in effect: make sure every entry is wrapped.
        if not wrapped:
            entries = [
                cy_combiners.ComparableValue(
                    value, self._less_than, self._key)
                for value in entries
            ]
            wrapped = True
    else:
        assert not wrapped

    candidate = element
    if wrapped:
        candidate = cy_combiners.ComparableValue(
            element, self._less_than, self._key)

    # Grow until the heap holds n entries, then replace the smallest one.
    insert = heapq.heappush if len(entries) < self._n else heapq.heappushpop
    insert(entries, candidate)
    return (wrapped, entries)
def process(self, key_and_bundles):
    """Merge per-bundle top lists into one list of at most ``self._n`` items.

    Args:
        key_and_bundles: ``(key, bundles)`` pair; the key is ignored. Each
            bundle is expected to be sorted in ascending order -- both the
            early exit in ``push`` and the use of the first bundle as a
            ready-made heap rely on that.

    Yields:
        A single list with the merged elements, largest first.
    """
    _, bundles = key_and_bundles

    def push(hp, e):
        """Offer ``e`` to ``hp``; return True if the bundle can be abandoned."""
        if len(hp) < self._n:
            heapq.heappush(hp, e)
            return False
        elif e < hp[0]:
            # Because _TopPerBundle returns sorted lists, all other elements
            # will also be smaller.
            return True
        else:
            heapq.heappushpop(hp, e)
            return False

    if self._compare or self._key:
        heapc = []  # type: List[cy_combiners.ComparableValue]
        for bundle in bundles:
            if not heapc:
                heapc = [
                    cy_combiners.ComparableValue(
                        element, self._compare, self._key)
                    for element in bundle
                ]
                continue
            # TODO(https://github.com/apache/beam/issues/21205): Remove this
            # workaround once legacy dataflow correctly handles coders with
            # combiner packing and/or is deprecated.
            if not isinstance(bundle, list):
                bundle = list(bundle)
            for element in reversed(bundle):
                wrapped = cy_combiners.ComparableValue(
                    element, self._compare, self._key)
                if push(heapc, wrapped):
                    break
        heapc.sort()
        yield [wrapper.value for wrapper in reversed(heapc)]
    else:
        heap = []
        for bundle in bundles:
            if not heap:
                # Seed from a copy: using the bundle itself as the heap would
                # push into and sort the caller's list in place. The copy
                # also covers non-list bundles.
                heap = list(bundle)
                continue
            # TODO(https://github.com/apache/beam/issues/21205): Remove this
            # workaround once legacy dataflow correctly handles coders with
            # combiner packing and/or is deprecated.
            if not isinstance(bundle, list):
                bundle = list(bundle)
            for element in reversed(bundle):
                if push(heap, element):
                    break
        heap.sort()
        yield heap[::-1]
def add_input(self, accumulator, element, *args, **kwargs):
    """Add ``element`` to the heap, replacing entries that share its first field.

    Before insertion, every heap entry whose ``[0]`` item equals
    ``element[0]`` is dropped, so at most one entry per such identifier
    remains. The heap is then capped at ``self._n`` entries.

    Args:
        accumulator: ``(holds_comparables, heap)`` pair, where the flag says
            whether the heap entries are ``cy_combiners.ComparableValue``
            wrappers.
        element: an indexable value; ``element[0]`` is the de-duplication
            identifier.
        *args, **kwargs: extra arguments forwarded to ``self._compare``; they
            are only captured the first time, when ``self._less_than`` is
            still ``None``.

    Returns:
        The updated ``(holds_comparables, heap)`` pair.
    """
    # Caching to avoid paying the price of variadic expansion of args / kwargs
    # when it's not needed (for the 'if' case below).
    if self._less_than is None:
        if args or kwargs:
            self._less_than = lambda a, b: self._compare(
                a, b, *args, **kwargs)
        else:
            self._less_than = self._compare

    holds_comparables, heap = accumulator
    if self._less_than is not operator.lt or self._key:
        heap = self._hydrated_heap(heap)
        holds_comparables = True
    else:
        assert not holds_comparables

    # Drop existing entries with the same identifier. This is done as a
    # single filtering pass: removing items while iterating over the same
    # list skips entries, and raw (unwrapped) entries have no ``.value``.
    new_id = element[0]
    if holds_comparables:
        survivors = [
            entry for entry in heap if not (new_id == entry.value[0])
        ]
    else:
        survivors = [entry for entry in heap if not (new_id == entry[0])]
    if len(survivors) != len(heap):
        # Update in place, then restore the heap invariant once.
        heap[:] = survivors
        heapq.heapify(heap)

    comparable = (
        cy_combiners.ComparableValue(element, self._less_than, self._key)
        if holds_comparables else element)
    if len(heap) < self._n:
        heapq.heappush(heap, comparable)
    else:
        heapq.heappushpop(heap, comparable)
    return (holds_comparables, heap)
def merge_accumulators(self, accumulators, *args, **kwargs):
    """Fold several ``(holds_comparables, heap)`` accumulators into one.

    The first accumulator's heap becomes the result; every entry of the
    remaining heaps is fed through ``self.add_input``. ``self._less_than``
    is reset on every call from ``self._compare`` and the extra arguments.

    Args:
        accumulators: iterable of ``(holds_comparables, heap)`` pairs; must
            yield at least one pair (enforced with an ``assert``).
        *args, **kwargs: extra arguments forwarded to ``self._compare`` and
            ``self.add_input``.

    Returns:
        The merged ``(holds_comparables, heap)`` pair.
    """
    if args or kwargs:
        self._less_than = lambda a, b: self._compare(a, b, *args, **kwargs)

        def add_input(accumulator, element):
            return self.add_input(accumulator, element, *args, **kwargs)
    else:
        self._less_than = self._compare
        add_input = self.add_input

    merged = None
    wrapped = None
    for wrapped, entries in accumulators:
        if self._less_than is not operator.lt or self._key:
            # A custom ordering is in effect: wrap raw entries first.
            if not wrapped:
                entries = [
                    cy_combiners.ComparableValue(
                        value, self._less_than, self._key)
                    for value in entries
                ]
                wrapped = True
        else:
            assert not wrapped

        if merged is None:
            merged = entries
            continue
        for entry in entries:
            # add_input expects the raw value and wraps it again if needed.
            payload = entry.value if wrapped else entry
            _, merged = add_input((wrapped, merged), payload)

    assert merged is not None and wrapped is not None
    return (wrapped, merged)
def process(self, element):
    """Offer ``element`` to ``self._heap``, keeping at most ``self._n`` entries.

    When a comparator or key is configured the element is first wrapped in
    a ``cy_combiners.ComparableValue``. Returns ``None``; the heap is
    updated in place.
    """
    entry = element
    if self._compare or self._key:
        entry = cy_combiners.ComparableValue(
            element, self._compare, self._key)

    # Grow until the heap holds n entries, then replace the smallest one.
    if len(self._heap) >= self._n:
        heapq.heappushpop(self._heap, entry)
    else:
        heapq.heappush(self._heap, entry)
def process(self, key_and_bundles):
    """Merge per-bundle top lists into one list of at most ``self._n`` items.

    Args:
        key_and_bundles: ``(key, bundles)`` pair; the key is ignored. Each
            bundle is an iterable expected to be sorted in ascending order
            -- both the early exit in ``push`` and the use of the first
            bundle as a ready-made heap rely on that.

    Yields:
        A single list with the merged elements, largest first.
    """
    _, bundles = key_and_bundles

    def push(hp, e):
        """Offer ``e`` to ``hp``; return True if the bundle can be abandoned."""
        if len(hp) < self._n:
            heapq.heappush(hp, e)
            return False
        elif e < hp[0]:
            # Because _TopPerBundle returns sorted lists, all other elements
            # will also be smaller.
            return True
        else:
            heapq.heappushpop(hp, e)
            return False

    if self._compare or self._key:
        heapc = []  # type: List[cy_combiners.ComparableValue]
        for bundle in bundles:
            if not heapc:
                heapc = [
                    cy_combiners.ComparableValue(
                        element, self._compare, self._key)
                    for element in bundle
                ]
                continue
            # reversed() needs a sequence; materialise any other iterable.
            if not isinstance(bundle, list):
                bundle = list(bundle)
            for element in reversed(bundle):
                wrapped = cy_combiners.ComparableValue(
                    element, self._compare, self._key)
                if push(heapc, wrapped):
                    break
        heapc.sort()
        yield [wrapper.value for wrapper in reversed(heapc)]
    else:
        heap = []
        for bundle in bundles:
            if not heap:
                # Seed from a copy: using the bundle itself as the heap would
                # push into and sort the caller's list in place, and would
                # fail outright for non-list bundles such as tuples.
                heap = list(bundle)
                continue
            # reversed() needs a sequence; materialise any other iterable.
            if not isinstance(bundle, list):
                bundle = list(bundle)
            for element in reversed(bundle):
                if push(heap, element):
                    break
        heap.sort()
        yield heap[::-1]
def add_input(self, accumulator, element, *args, **kwargs):
    """Add ``element`` to the heap in ``accumulator``, keeping at most ``self._n``.

    Args:
        accumulator: ``(holds_comparables, heap)`` pair, where the flag says
            whether the heap entries are ``cy_combiners.ComparableValue``
            wrappers.
        element: the value to add.
        *args, **kwargs: accepted but not used by this method.

    Returns:
        The updated ``(holds_comparables, heap)`` pair.
    """
    wrapped, entries = accumulator
    if self._compare is not operator.lt or self._key:
        # A custom ordering is in effect: entries must be usable wrappers.
        entries = self._hydrated_heap(entries)
        wrapped = True
    else:
        assert not wrapped

    candidate = element
    if wrapped:
        candidate = cy_combiners.ComparableValue(
            element, self._compare, self._key)

    # Grow until the heap holds n entries, then replace the smallest one.
    insert = heapq.heappush if len(entries) < self._n else heapq.heappushpop
    insert(entries, candidate)
    return (wrapped, entries)
def _hydrated_heap(self, heap):
    """Return ``heap`` with every entry as a ready ``ComparableValue``.

    * An empty heap is returned as is.
    * A heap of raw elements (judged by its first entry) is replaced by a
      new list of ``cy_combiners.ComparableValue`` wrappers.
    * A heap of wrappers whose first entry reports ``requires_hydration``
      has ``hydrate(self._compare, self._key)`` called on every entry, in
      place; otherwise it is returned untouched.
    """
    if not heap:
        return heap

    head = heap[0]
    if not isinstance(head, cy_combiners.ComparableValue):
        return [
            cy_combiners.ComparableValue(element, self._compare, self._key)
            for element in heap
        ]

    if head.requires_hydration:
        # All entries are expected to be in the same state as the first.
        for entry in heap:
            assert entry.requires_hydration
            entry.hydrate(self._compare, self._key)
            assert not entry.requires_hydration
    return heap
def extract_output(self, accumulator, *args, **kwargs):
    """Return the accumulated elements as a list, largest first.

    ``self._less_than`` is reset on every call from ``self._compare`` and
    the extra arguments.

    Args:
        accumulator: ``(holds_comparables, heap)`` pair, where the flag says
            whether the heap entries are ``cy_combiners.ComparableValue``
            wrappers. The heap is left unmodified.
        *args, **kwargs: extra arguments forwarded to ``self._compare``.

    Returns:
        A new list of at most ``self._n`` raw (unwrapped) elements in
        descending order.
    """
    if args or kwargs:
        self._less_than = lambda a, b: self._compare(a, b, *args, **kwargs)
    else:
        self._less_than = self._compare

    holds_comparables, heap = accumulator
    if self._less_than is not operator.lt or self._key:
        if not holds_comparables:
            heap = [
                cy_combiners.ComparableValue(
                    value, self._less_than, self._key)
                for value in heap
            ]
            holds_comparables = True
    else:
        assert not holds_comparables

    assert len(heap) <= self._n
    # Sort a copy: sorting the accumulator's list in place (descending)
    # would break its min-heap invariant if the accumulator is used again.
    ordered = sorted(heap, reverse=True)
    if holds_comparables:
        return [comparable.value for comparable in ordered]
    return ordered