def sort_by_physical_order(self):
    """Rewrite the bucket file with its key records sorted by physical position.

    External merge sort in two phases:
      phase 1 - stream key records out of ``self.bucket_file``, accumulating
                them into in-memory chunks of roughly ``max_chunks_size`` bytes;
                each full chunk is handed to ``self._sort_chunk`` which
                (presumably) sorts it and spills it to a temp file appended to
                ``chunk_files`` -- TODO confirm it also empties ``chunk``,
                which the trailing ``if chunk:`` flush implies;
      phase 2 - k-way merge of all chunk files via a min-heap, writing the
                merged records back over ``self.bucket_file`` in place.

    Temp chunk files are always unlinked in the ``finally`` block.

    NOTE(review): Python 2 only -- relies on ``__cmp__``, builtin ``cmp`` and
    generator ``.next()``.
    """
    chunk_files = []
    try:
        log.info("Sort bucket: phase 1: sort chunks: group_id: {0}, bucket: {1}"
                 .format(self.group_id, self.bucket_file.name))
        # Soft cap on the in-memory chunk before it is spilled to disk.
        max_chunks_size = 500 * 1024 * 1024  # 500 Mb
        chunk = []
        chunk_size = 0
        self.bucket_file.seek(0)
        for key_data in load_key_data_from_file(self.bucket_file):
            # Rough (shallow) memory estimate: the tuple itself, its two
            # elements, and each info object inside key_data[1].
            # sys.getsizeof does not follow nested references beyond this.
            chunk_size += sys.getsizeof(key_data) + sys.getsizeof(key_data[0]) + sys.getsizeof(key_data[1]) + \
                sum(sys.getsizeof(info) for info in key_data[1])
            chunk.append((self._get_physical_position(key_data), key_data))
            if chunk_size > max_chunks_size:
                chunk_size = 0
                # Spill the current chunk; appears to append a temp file to
                # chunk_files and reset ``chunk`` (see docstring note).
                self._sort_chunk(chunk, chunk_files)
        if chunk:
            # Flush the final, partially filled chunk.
            self._sort_chunk(chunk, chunk_files)

        log.info("Sort bucket: phase 2: merge chunks: group_id: {0}, bucket: {1}"
                 .format(self.group_id, self.bucket_file.name))

        class MergeDataWrapper(object):
            # Heap entry wrapping one chunk's record generator; always holds
            # the chunk's current record plus its physical position so the
            # heap can order entries by position.
            def __init__(self, gen, outer):
                self.gen = gen
                self.outer = outer
                # Prime self.key_data / self.physical_position from the
                # first record.
                self.next()

            def __cmp__(self, other):
                # Python 2 rich-comparison shim: heapq orders wrappers by
                # the physical position of their current record.
                return cmp(self.physical_position, other.physical_position)

            def next(self):
                # Advance to the chunk's next record.  Returns True on
                # success, False when the chunk is exhausted.
                try:
                    self.key_data = self.gen.next()
                    self.physical_position = self.outer._get_physical_position(self.key_data)
                except StopIteration:
                    return False
                return True

        heap = []
        for chunk in chunk_files:
            chunk.seek(0)
            heapq.heappush(heap, MergeDataWrapper(load_key_data_from_file(chunk), self))

        # Rewrite the bucket file in place with the merged, ordered records.
        self.bucket_file.seek(0)
        self.bucket_file.truncate()
        while len(heap):
            wrapper = heapq.heappop(heap)
            dump_key_data(wrapper.key_data, self.bucket_file)
            # Re-insert the wrapper only while its chunk still has records.
            if wrapper.next():
                heapq.heappush(heap, wrapper)
    finally:
        # Best-effort cleanup of all spilled chunk files.
        for f in chunk_files:
            os.unlink(f.name)
def get_keys(self, max_keys_num):
    '''
    Yields bunches of keys from the bucket file.

    @max_keys_num defines max number of keys in a bunch; every yielded list
    holds exactly @max_keys_num keys except possibly the last one.
    '''
    self.bucket_file.seek(0)
    # Group consecutive records by their running index floor-divided by the
    # batch size: indices 0..max_keys_num-1 -> group 0, and so on.
    # ``//`` (not ``/``) keeps the group key an int even under
    # ``from __future__ import division`` / Python 3, where ``/`` would
    # produce float keys and silently break the batching.
    for _, batch in groupby(enumerate(load_key_data_from_file(self.bucket_file)),
                            key=lambda x: x[0] // max_keys_num):
        yield [item[1] for item in batch]