コード例 #1
0
 def __iter__(self):
     """Iterate over the combined data, merging in any archives.

     When ``self.archives`` is empty, the items of ``self.combined`` are
     returned directly (unsorted) via ``six.iteritems``.

     Otherwise the items of ``self.combined`` are sorted by key (the first
     element of each item), appended to ``self.archives`` as one more
     iterator, and all archives are merged into a single iterator.

     Returns:
         An iterator: a ``GroupByNestedIter`` when the rdd configuration
         asks for a group-by with iterable groups, otherwise the result of
         ``heap_merged`` with this object's merge function.
     """
     if not self.archives:
         return six.iteritems(self.combined)

     # ``dict.items()`` is a view on Python 3 and has no ``.sort()``;
     # ``sorted`` works for both the Python 2 list and the Python 3 view.
     combined = sorted(self.combined.items(), key=itemgetter(0))

     # NOTE(review): this appends to ``self.archives`` on every call, so
     # iterating twice adds the in-memory data twice -- confirm callers
     # iterate only once.
     self.archives.append(iter(combined))
     iters = list(map(iter, self.archives))

     if self.rddconf.is_groupby and self.rddconf.iter_group:
         heap = HeapOnKey(key=lambda x: x[0], min_heap=True)
         it = GroupByNestedIter(heap.merge(iters), "")
     else:
         it = heap_merged(iters, self._get_merge_function())
     return it
コード例 #2
0
ファイル: dependency.py プロジェクト: zhaochl/dpark
class HeapAggregator:
    """Aggregator whose combiners are lists maintained through a HeapOnKey.

    A combiner starts as a one-element list. Further values are added with
    ``heap.push`` while the list holds fewer than ``heap_limit`` elements,
    and with ``heap.push_pop`` once it has reached that size (presumably
    keeping the list bounded at ``heap_limit`` -- depends on ``HeapOnKey``).
    """

    def __init__(self, heap_limit, key=None, order_reverse=False):
        """Create the aggregator.

        Args:
            heap_limit: size threshold for a combiner list; must be > 0.
            key: key function forwarded to ``HeapOnKey``.
            order_reverse: forwarded to ``HeapOnKey`` as ``min_heap``.
        """
        self.heap = HeapOnKey(min_heap=order_reverse, key=key)
        self.heap_limit = heap_limit
        assert heap_limit > 0

    def __getstate__(self):
        """Return the pickled state: a ``(heap, heap_limit)`` tuple."""
        state = (self.heap, self.heap_limit)
        return state

    def __setstate__(self, state):
        """Restore ``heap`` and ``heap_limit`` from a two-element state."""
        heap, heap_limit = state
        self.heap = heap
        self.heap_limit = heap_limit

    def createCombiner(self, x):
        """Start a new combiner: a list holding only ``x``."""
        combiner = [x]
        return combiner

    def mergeValue(self, s, x):
        """Add value ``x`` to combiner ``s`` in place and return ``s``."""
        is_full = len(s) >= self.heap_limit
        insert = self.heap.push_pop if is_full else self.heap.push
        insert(s, x)
        return s

    def mergeCombiners(self, x, y):
        """Fold every element of combiner ``y`` into ``x``; return ``x``."""
        for element in y:
            # The size check is repeated per element because ``x`` may
            # grow while elements are being pushed.
            has_room = len(x) < self.heap_limit
            insert = self.heap.push if has_room else self.heap.push_pop
            insert(x, element)
        return x
コード例 #3
0
    def test_merge(self):
        """Merging individually sorted chunks must give a fully sorted list.

        A shuffled range of 100 values is paired with its positions, cut
        into chunks of 13, each chunk is sorted by value, and
        ``HeapOnKey.merge`` is expected to return the same sequence as
        sorting all pairs by value.
        """
        total, chunk_size = 100, 13
        values = list(range(total))
        random.shuffle(values)
        pairs = list(enumerate(values))

        def by_value(pair):
            return pair[1]

        # Consecutive slices of ``chunk_size`` pairs, each sorted by value.
        chunks = [
            sorted(pairs[start:start + chunk_size], key=by_value)
            for start in range(0, len(pairs), chunk_size)
        ]

        merger = HeapOnKey(key=by_value, min_heap=True)
        merged = list(merger.merge(chunks))
        expected = sorted(pairs, key=by_value)

        assert merged == expected
コード例 #4
0
 def _merge_sorted(self, iters):
     """Merge ``iters`` by item key and wrap the result for group-by.

     The iterators are merged with a ``HeapOnKey`` keyed on the first
     element of each item; the merged stream is handed to
     ``GroupByNestedIter`` together with ``self.call_site``.
     """
     merger = HeapOnKey(min_heap=True, key=lambda item: item[0])
     merged = merger.merge(iters)
     return GroupByNestedIter(merged, self.call_site)
コード例 #5
0
 def _merge_sorted(self, iters):
     """Merge ``iters`` by item key and aggregate the merged stream.

     The iterators are merged with a ``HeapOnKey`` keyed on the first
     element of each item; the result is passed to
     ``self.aggregator.aggregate_sorted``, whose return value is returned.
     """
     merger = HeapOnKey(min_heap=True, key=lambda item: item[0])
     return self.aggregator.aggregate_sorted(merger.merge(iters))
コード例 #6
0
ファイル: shuffle.py プロジェクト: weiqiangzheng/dpark
 def merge(self, iters):
     """Merge ``iters`` by item key and store the group-by view.

     The merged stream (keyed on the first element of each item) is
     wrapped in ``GroupByNestedIter`` with ``self.rdd_name`` and assigned
     to ``self.combined``. Nothing is returned.
     """
     merger = HeapOnKey(min_heap=True, key=lambda item: item[0])
     merged = merger.merge(iters)
     self.combined = GroupByNestedIter(merged, self.rdd_name)
コード例 #7
0
ファイル: shuffle.py プロジェクト: weiqiangzheng/dpark
 def merge(self, iters):
     """Merge ``iters`` by item key and store the aggregated result.

     The merged stream (keyed on the first element of each item) is
     passed to ``self.aggregator.aggregate_sorted`` and the result is
     assigned to ``self.combined``. Nothing is returned.
     """
     merger = HeapOnKey(min_heap=True, key=lambda item: item[0])
     merged = merger.merge(iters)
     self.combined = self.aggregator.aggregate_sorted(merged)
コード例 #8
0
ファイル: dependency.py プロジェクト: zhaochl/dpark
 def __init__(self, heap_limit, key=None, order_reverse=False):
     """Set up the heap helper and the size limit.

     Args:
         heap_limit: size threshold stored on the instance; must be > 0.
         key: key function forwarded to ``HeapOnKey``.
         order_reverse: forwarded to ``HeapOnKey`` as ``min_heap``.
     """
     self.heap = HeapOnKey(min_heap=order_reverse, key=key)
     self.heap_limit = heap_limit
     assert heap_limit > 0