def sorted(self, iterator, key=None, reverse=False):
    """
    Sort the elements in iterator, doing an external sort when memory
    usage goes above the limit.
    """
    global MemoryBytesSpilled, DiskBytesSpilled
    batch, limit = 100, self.memory_limit
    chunks, current_chunk = [], []
    iterator = iter(iterator)
    while True:
        # pick elements in batch
        chunk = list(itertools.islice(iterator, batch))
        current_chunk.extend(chunk)
        if len(chunk) < batch:
            break

        used_memory = get_used_memory()
        if used_memory > limit:
            # sorting them in place saves memory
            current_chunk.sort(key=key, reverse=reverse)
            path = self._get_path(len(chunks))
            with open(path, 'wb') as f:
                self.serializer.dump_stream(current_chunk, f)

            def load(f):
                for v in self.serializer.load_stream(f):
                    yield v
                # close the file explicitly once all the items are consumed,
                # to avoid a ResourceWarning in Python 3
                f.close()

            chunks.append(load(open(path, 'rb')))
            current_chunk = []
            gc.collect()
            batch //= 2
            limit = self._next_limit()
            MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20
            DiskBytesSpilled += os.path.getsize(path)
            os.unlink(path)  # data will be deleted after close

        elif not chunks:
            batch = min(int(batch * 1.5), 10000)

    current_chunk.sort(key=key, reverse=reverse)
    if not chunks:
        return current_chunk

    if current_chunk:
        chunks.append(iter(current_chunk))

    return heapq.merge(chunks, key=key, reverse=reverse)

def sorted(self, iterator, key=None, reverse=False):
    """
    Sort the elements in iterator, doing an external sort when memory
    usage goes above the limit.
    """
    global MemoryBytesSpilled, DiskBytesSpilled
    batch, limit = 100, self.memory_limit
    chunks, current_chunk = [], []
    iterator = iter(iterator)
    while True:
        # pick elements in batch
        chunk = list(itertools.islice(iterator, batch))
        current_chunk.extend(chunk)
        if len(chunk) < batch:
            break

        used_memory = get_used_memory()
        if used_memory > limit:
            # sorting them in place saves memory
            current_chunk.sort(key=key, reverse=reverse)
            path = self._get_path(len(chunks))
            with open(path, 'wb') as f:
                self.serializer.dump_stream(current_chunk, f)

            def load(f):
                for v in self.serializer.load_stream(f):
                    yield v
                # close the file explicitly once all the items are consumed,
                # to avoid a ResourceWarning in Python 3
                f.close()

            chunks.append(load(open(path, 'rb')))
            current_chunk = []
            gc.collect()
            batch //= 2
            limit = self._next_limit()
            MemoryBytesSpilled += (used_memory - get_used_memory()) << 20
            DiskBytesSpilled += os.path.getsize(path)
            os.unlink(path)  # data will be deleted after close

        elif not chunks:
            batch = min(int(batch * 1.5), 10000)

    current_chunk.sort(key=key, reverse=reverse)
    if not chunks:
        return current_chunk

    if current_chunk:
        chunks.append(iter(current_chunk))

    return heapq.merge(chunks, key=key, reverse=reverse)

def sorted(self, iterator, key=None, reverse=False):
    """
    Sort the elements in iterator, doing an external sort when memory
    usage goes above the limit.
    """
    global MemoryBytesSpilled, DiskBytesSpilled
    batch, limit = 100, self._next_limit()
    chunks, current_chunk = [], []
    iterator = iter(iterator)
    while True:
        # pick elements in batch
        chunk = list(itertools.islice(iterator, batch))
        current_chunk.extend(chunk)
        if len(chunk) < batch:
            break

        used_memory = get_used_memory()
        if used_memory > self.memory_limit:
            # sorting them in place saves memory
            current_chunk.sort(key=key, reverse=reverse)
            path = self._get_path(len(chunks))
            with open(path, 'w') as f:
                self.serializer.dump_stream(current_chunk, f)
            chunks.append(self.serializer.load_stream(open(path)))
            current_chunk = []
            gc.collect()
            limit = self._next_limit()
            MemoryBytesSpilled += (used_memory - get_used_memory()) << 20
            DiskBytesSpilled += os.path.getsize(path)
            os.unlink(path)  # data will be deleted after close

        elif not chunks:
            batch = min(batch * 2, 10000)

    current_chunk.sort(key=key, reverse=reverse)
    if not chunks:
        return current_chunk

    if current_chunk:
        chunks.append(iter(current_chunk))

    return heapq.merge(chunks, key=key, reverse=reverse)

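# The variants above all follow the same spill-and-merge pattern.  What follows
# is a minimal, self-contained sketch of that pattern written against only the
# standard library; it is an illustration under simplified assumptions, not the
# module's implementation.  spill_sort and its parameters are made-up names,
# pickle stands in for the real serializer, and spilling every run replaces the
# real memory-pressure check purely to keep the sketch short.
import heapq
import itertools
import os
import pickle
import tempfile


def spill_sort(iterator, key=None, batch=4):
    """Sort `iterator` by writing fixed-size sorted runs to disk, then lazily
    merging the runs back together with heapq.merge."""
    runs = []
    while True:
        chunk = list(itertools.islice(iterator, batch))
        if not chunk:
            break
        chunk.sort(key=key)            # sort each run in memory
        fd, path = tempfile.mkstemp()
        with os.fdopen(fd, 'wb') as f:
            pickle.dump(chunk, f)      # spill the sorted run to disk
        runs.append(path)

    def load(path):
        with open(path, 'rb') as f:
            for item in pickle.load(f):
                yield item
        os.unlink(path)                # delete the run once it is consumed

    return heapq.merge(*(load(p) for p in runs), key=key)


# Usage: eight elements spilled as two sorted runs, merged back into one stream.
print(list(spill_sort(iter([5, 3, 8, 1, 9, 2, 7, 4]), batch=4)))
# [1, 2, 3, 4, 5, 7, 8, 9]
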
def _merge_sorted_items(self, index):
    """ load a partition from disk, then sort and group by key """
    def load_partition(j):
        path = self._get_spill_dir(j)
        p = os.path.join(path, str(index))
        return self.serializer.load_stream(open(p, 'r', 65536))

    disk_items = [load_partition(j) for j in range(self.spills)]

    if self._sorted:
        # all the partitions are already sorted
        sorted_items = heapq.merge(disk_items, key=operator.itemgetter(0))
    else:
        # Flatten the combined values, so it will not consume huge
        # memory during merging sort.
        ser = self.flattened_serializer()
        sorter = ExternalSorter(self.memory_limit, ser)
        sorted_items = sorter.sorted(itertools.chain(*disk_items),
                                     key=operator.itemgetter(0))
    return ((k, vs) for k, vs in GroupByKey(sorted_items))

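# A small illustration, using only standard-library names, of the merge-then-
# group step performed above: per-spill streams that are already sorted by key
# are merged lazily, and consecutive records with equal keys are collapsed into
# (key, values) pairs.  The spill_0/spill_1 data is made up, and
# itertools.groupby stands in here for the GroupByKey helper used above.
import heapq
import itertools
import operator

spill_0 = [('a', 1), ('c', 3)]
spill_1 = [('a', 2), ('b', 5)]

# the merge stays lazy, so only one record per stream is held at a time
merged = heapq.merge(spill_0, spill_1, key=operator.itemgetter(0))
grouped = ((k, [v for _, v in pairs])
           for k, pairs in itertools.groupby(merged, key=operator.itemgetter(0)))
print(list(grouped))  # [('a', [1, 2]), ('b', [5]), ('c', [3])]
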
def sorted(self, iterator, key=None, reverse=False):
    """
    Sort the elements in iterator, doing an external sort when memory
    usage goes above the limit.
    """
    batch = 10
    chunks, current_chunk = [], []
    iterator = iter(iterator)
    while True:
        # pick elements in batch
        chunk = list(itertools.islice(iterator, batch))
        current_chunk.extend(chunk)
        if len(chunk) < batch:
            break

        if get_used_memory() > self.memory_limit:
            # sorting them in place saves memory
            current_chunk.sort(key=key, reverse=reverse)
            path = self._get_path(len(chunks))
            with open(path, 'w') as f:
                self.serializer.dump_stream(current_chunk, f)
            self._spilled_bytes += os.path.getsize(path)
            chunks.append(self.serializer.load_stream(open(path)))
            current_chunk = []

        elif not chunks:
            batch = min(batch * 2, 10000)

    current_chunk.sort(key=key, reverse=reverse)
    if not chunks:
        return current_chunk

    if current_chunk:
        chunks.append(iter(current_chunk))

    return heapq.merge(chunks, key=key, reverse=reverse)