Beispiel #1
0
    def sort_by_physical_order(self, max_chunks_size=500 * 1024 * 1024):
        '''
        Rewrites the bucket file so that its records are ordered by the value
        returned from self._get_physical_position().

        Works as an external merge sort:
          phase 1: records are read from self.bucket_file and handed to
                   self._sort_chunk() in portions, each portion producing one
                   chunk file appended to a local list;
          phase 2: the bucket file is truncated and the chunk files are
                   merged back into it in ascending position order.
        Every chunk file is closed and removed on exit, even on failure.

        @max_chunks_size approximate amount of memory (in bytes) a portion
        may occupy before it is flushed; the estimate is built from shallow
        sys.getsizeof() values, so it is a lower bound. Defaults to 500 Mb.
        '''
        import errno  # local import: only needed by the cleanup code below

        chunk_files = []
        try:
            log.info("Sort bucket: phase 1: sort chunks: group_id: {0}, bucket: {1}"
                     .format(self.group_id, self.bucket_file.name))

            pending = []
            pending_size = 0

            self.bucket_file.seek(0)
            for key_data in load_key_data_from_file(self.bucket_file):
                pending_size += (sys.getsizeof(key_data) +
                                 sys.getsizeof(key_data[0]) +
                                 sys.getsizeof(key_data[1]) +
                                 sum(sys.getsizeof(info) for info in key_data[1]))
                pending.append((self._get_physical_position(key_data), key_data))
                if pending_size > max_chunks_size:
                    self._sort_chunk(pending, chunk_files)
                    # Start a fresh portion explicitly, so flushed records can
                    # never be written twice; this is a no-op if _sort_chunk
                    # already empties the list in place.
                    pending = []
                    pending_size = 0
            if pending:
                self._sort_chunk(pending, chunk_files)

            log.info("Sort bucket: phase 2: merge chunks: group_id: {0}, bucket: {1}"
                     .format(self.group_id, self.bucket_file.name))

            def positioned(index, chunk_file):
                # Decorate every record as (position, stream index, record).
                # Stream indexes are unique, so records with equal positions
                # are ordered by chunk number and the records themselves are
                # never compared. An empty chunk simply yields nothing.
                for key_data in load_key_data_from_file(chunk_file):
                    yield self._get_physical_position(key_data), index, key_data

            streams = []
            for index, chunk_file in enumerate(chunk_files):
                chunk_file.seek(0)
                streams.append(positioned(index, chunk_file))

            self.bucket_file.seek(0)
            self.bucket_file.truncate()
            # heapq.merge lazily pulls one record per chunk at a time, so
            # memory usage is proportional to the number of chunk files.
            for _, _, key_data in heapq.merge(*streams):
                dump_key_data(key_data, self.bucket_file)
        finally:
            for chunk_file in chunk_files:
                try:
                    # Release the descriptor before removing the file.
                    chunk_file.close()
                finally:
                    try:
                        os.unlink(chunk_file.name)
                    except OSError as err:
                        # A file that is already gone needs no cleanup;
                        # anything else is a real failure.
                        if err.errno != errno.ENOENT:
                            raise
Beispiel #2
0
 def get_keys(self, max_keys_num):
     '''
     Yields bunch of keys from the bucket file.
     @max_keys_num defines max number of keys in the bunch.

     The bucket file is rewound first; records are then read with
     load_key_data_from_file() and yielded as lists of consecutive records.
     For a positive @max_keys_num every list holds exactly that many
     records, except the last one, which may be shorter.
     '''
     self.bucket_file.seek(0)
     indexed = enumerate(load_key_data_from_file(self.bucket_file))
     # Floor division keeps the bunch number an integer on every Python
     # version; true division would give each record its own float key
     # and shrink every bunch to a single element.
     for _, batch in groupby(indexed, key=lambda x: x[0] // max_keys_num):
         yield [item[1] for item in batch]
Beispiel #3
0
 def get_keys(self, max_keys_num):
     '''
     Yields bunch of keys from the bucket file.
     @max_keys_num defines max number of keys in the bunch.

     The bucket file is rewound first; records are then read with
     load_key_data_from_file() and yielded as lists of consecutive records.
     For a positive @max_keys_num every list holds exactly that many
     records, except the last one, which may be shorter.
     '''
     self.bucket_file.seek(0)
     indexed = enumerate(load_key_data_from_file(self.bucket_file))
     # Floor division keeps the bunch number an integer on every Python
     # version; true division would give each record its own float key
     # and shrink every bunch to a single element.
     for _, batch in groupby(indexed, key=lambda x: x[0] // max_keys_num):
         yield [item[1] for item in batch]