def __iter__(self):
    """Return an iterator over the in-memory data merged with any archives.

    When ``self.archives`` is empty, the items of ``self.combined`` are
    iterated directly (in the mapping's own order). Otherwise the in-memory
    items are sorted by key, appended to ``self.archives`` as one more
    stream, and all streams are merged:

    * for a group-by with ``iter_group`` enabled, through ``HeapOnKey.merge``
      wrapped in ``GroupByNestedIter``;
    * otherwise through ``heap_merged`` with the function returned by
      ``self._get_merge_function()``.

    NOTE(review): every call that reaches the merge path appends another
    iterator to ``self.archives``, so this looks intended to be called once
    per instance -- confirm against callers.
    """
    if not self.archives:
        return six.iteritems(self.combined)

    # ``dict.items()`` is a view without ``.sort()`` on Python 3, so build
    # the ordered list with ``sorted`` -- this works on both 2 and 3.
    sorted_items = sorted(six.iteritems(self.combined), key=itemgetter(0))
    self.archives.append(iter(sorted_items))
    iters = [iter(archive) for archive in self.archives]

    if self.rddconf.is_groupby and self.rddconf.iter_group:
        heap = HeapOnKey(key=lambda x: x[0], min_heap=True)
        return GroupByNestedIter(heap.merge(iters), "")
    return heap_merged(iters, self._get_merge_function())
class HeapAggregator:
    """Aggregator whose combiner is a list maintained through a ``HeapOnKey``.

    A combiner starts as a one-element list. While it holds fewer than
    ``heap_limit`` entries new values are added with ``heap.push``; once it
    has reached the limit they go through ``heap.push_pop`` instead
    (presumably push-then-evict, keeping the size bounded -- confirm against
    ``HeapOnKey``).
    """

    def __init__(self, heap_limit, key=None, order_reverse=False):
        """Store the limit and build the ``HeapOnKey`` helper.

        ``order_reverse`` is forwarded as the helper's ``min_heap`` flag and
        ``key`` as its ``key``. ``heap_limit`` must be positive (checked
        with an ``assert``).
        """
        heap = HeapOnKey(key=key, min_heap=order_reverse)
        self.heap = heap
        self.heap_limit = heap_limit
        assert (heap_limit > 0)

    def __getstate__(self):
        """Return the ``(heap, heap_limit)`` pair used as pickled state."""
        return (self.heap, self.heap_limit)

    def __setstate__(self, state):
        """Restore ``heap`` and ``heap_limit`` from a ``(heap, limit)`` pair."""
        heap, limit = state
        self.heap = heap
        self.heap_limit = limit

    def createCombiner(self, x):
        """Start a new combiner: a list holding only ``x``."""
        combiner = [x]
        return combiner

    def mergeValue(self, s, x):
        """Add value ``x`` to combiner ``s`` in place and return ``s``."""
        # Below the limit a plain push is used; at or above it, push_pop.
        add = self.heap.push if len(s) < self.heap_limit else self.heap.push_pop
        add(s, x)
        return s

    def mergeCombiners(self, x, y):
        """Fold every item of combiner ``y`` into ``x`` in place; return ``x``."""
        for item in y:
            # The length is re-checked for each item because pushes grow ``x``.
            if len(x) >= self.heap_limit:
                self.heap.push_pop(x, item)
            else:
                self.heap.push(x, item)
        return x
def test_merge(self):
    """``HeapOnKey.merge`` over sorted chunks must equal one global sort.

    A shuffled list of 100 ``(index, value)`` pairs is cut into chunks of
    13, each chunk is sorted by value, and the merged output is compared
    with sorting all pairs by value at once.
    """
    total = 100
    chunk_size = 13

    values = list(range(total))
    random.shuffle(values)
    pairs = list(enumerate(values))

    def by_value(pair):
        return pair[1]

    # Consecutive slices of ``chunk_size`` pairs, each sorted independently.
    sorted_chunks = [
        sorted(pairs[start:start + chunk_size], key=by_value)
        for start in range(0, len(pairs), chunk_size)
    ]

    heap = HeapOnKey(key=by_value, min_heap=True)
    merged = list(heap.merge(sorted_chunks))
    expected = sorted(pairs, key=by_value)
    assert merged == expected
def _merge_sorted(self, iters):
    """Merge ``iters`` by first element and wrap the result for grouping.

    The streams are combined with a min-heap ``HeapOnKey`` keyed on item
    ``[0]``; the merged stream is handed to ``GroupByNestedIter`` together
    with ``self.call_site``.
    """
    merger = HeapOnKey(key=lambda item: item[0], min_heap=True)
    merged_stream = merger.merge(iters)
    return GroupByNestedIter(merged_stream, self.call_site)
def _merge_sorted(self, iters):
    """Merge ``iters`` by first element and aggregate the merged stream.

    The streams are combined with a min-heap ``HeapOnKey`` keyed on item
    ``[0]``; the result of ``self.aggregator.aggregate_sorted`` on that
    merged stream is returned.
    """
    merger = HeapOnKey(key=lambda item: item[0], min_heap=True)
    return self.aggregator.aggregate_sorted(merger.merge(iters))
def merge(self, iters):
    """Merge ``iters`` by first element and store the grouped result.

    The merged stream from a min-heap ``HeapOnKey`` keyed on item ``[0]``
    is wrapped in ``GroupByNestedIter`` (with ``self.rdd_name``) and
    assigned to ``self.combined``. Nothing is returned.
    """
    merger = HeapOnKey(key=lambda item: item[0], min_heap=True)
    merged_stream = merger.merge(iters)
    self.combined = GroupByNestedIter(merged_stream, self.rdd_name)
def merge(self, iters):
    """Merge ``iters`` by first element and store the aggregated result.

    The merged stream from a min-heap ``HeapOnKey`` keyed on item ``[0]``
    is passed to ``self.aggregator.aggregate_sorted`` and the outcome is
    assigned to ``self.combined``. Nothing is returned.
    """
    merger = HeapOnKey(key=lambda item: item[0], min_heap=True)
    merged_stream = merger.merge(iters)
    self.combined = self.aggregator.aggregate_sorted(merged_stream)
def __init__(self, heap_limit, key=None, order_reverse=False):
    """Store the size limit and build the ``HeapOnKey`` helper.

    ``key`` is forwarded as the helper's ``key`` and ``order_reverse`` as
    its ``min_heap`` flag. ``heap_limit`` must be positive; this is checked
    with an ``assert`` after the attributes have been set.
    """
    heap = HeapOnKey(key=key, min_heap=order_reverse)
    self.heap = heap
    self.heap_limit = heap_limit
    assert (heap_limit > 0)