def merge_sort_stupid(fin: io.BufferedIOBase, fout: io.BufferedIOBase, memory_size: int, left=0, count=None):
    """Recursively merge-sort a range of ``fin`` into ``fout``.

    ``fout`` is rewound to offset 0 before anything else happens.  When
    ``count`` is ``None`` it is taken from
    ``content_length(fin, preserve_pos=False)``.

    * If ``count <= memory_size`` the range is positioned with
      ``go_to_pos(fin, left)``, read, sorted with ``sorted`` and written to
      ``fout``.
    * Otherwise the range is split into a first half of ``count // 2`` items
      and a second half with the remainder.  Each half is sorted recursively
      into its own temporary file, and the two files are combined into
      ``fout`` with ``heapq.merge``.

    NOTE(review): ``left`` and ``count`` are presumably measured in items
    rather than bytes -- confirm against ``go_to_pos`` / ``read_content``.
    """
    fout.seek(0)
    if count is None:
        count = content_length(fin, preserve_pos=False)

    if count > memory_size:
        first_half = count // 2
        second_half = count - first_half
        # Each of the two run readers gets half of the memory budget.
        run_batch = memory_size // 2

        with tmp_file() as left_f, tmp_file() as right_f:
            merge_sort_stupid(fin, left_f, memory_size, left, count=first_half)
            merge_sort_stupid(fin, right_f, memory_size, left + first_half, count=second_half)

            # Rewind both sorted runs before reading them back.
            left_f.seek(0)
            right_f.seek(0)

            left_run = read_content(left_f, batch_size=run_batch)
            right_run = read_content(right_f, batch_size=run_batch)
            write_content(fout, heapq.merge(left_run, right_run), batch_size=memory_size)
    else:
        # Small enough to sort in memory in one go.
        go_to_pos(fin, left)
        in_memory = sorted(read_content(fin, count=count))
        write_content(fout, in_memory, batch_size=memory_size)
def _to_sorted_blocks(fin: io.BufferedIOBase, memory_size: int):
    """Yield closed temporary files, each holding one sorted chunk of ``fin``.

    Repeatedly reads from ``fin`` via ``read_content(fin, memory_size)``,
    sorts what was read in memory, writes it to a fresh ``tmp_file()`` and
    closes that file before yielding it.  Iteration stops at the first read
    that produces no values.

    NOTE(review): ``memory_size`` is passed as the second positional argument
    of ``read_content``; presumably this caps how many values one chunk
    holds -- confirm against ``read_content``.

    Yields:
        The (already closed) temporary file object for each sorted chunk.
    """
    while True:
        sorted_values = sorted(read_content(fin, memory_size))
        if not sorted_values:
            break
        f = tmp_file()
        try:
            write_content(f, sorted_values)
        finally:
            # Close even when the write fails so the handle is never leaked.
            f.close()
        yield f
def _check_sorted(self, source: io.BufferedIOBase, res: io.BufferedIOBase):
    """Assert that ``res`` is in order and matches the content of ``source``.

    Both streams are rewound and read with ``read_content``.  Instead of
    keeping every value, each stream is summarised as a table of ``2**20``
    counters indexed by ``hash(value) % 2**20``, and the two tables must be
    equal.  Because of hash collisions, equal tables are a strong hint of
    equal content rather than a proof.

    While ``res`` is read, every value must also be ``>=`` its predecessor.
    """
    bucket_count = 2**20

    def bucket_of(value):
        return hash(value) % bucket_count

    source.seek(0)
    expected_buckets = [0] * bucket_count
    for value in read_content(source):
        expected_buckets[bucket_of(value)] += 1

    res.seek(0)
    actual_buckets = [0] * bucket_count
    previous = None
    for current in read_content(res):
        actual_buckets[bucket_of(current)] += 1
        # The very first value has nothing to be compared with.
        in_order = previous is None or previous <= current
        self.assertTrue(in_order)
        previous = current

    self.assertTrue(expected_buckets == actual_buckets, 'Content differs')
def _merge_blocks(tmp_files, fout: io.BufferedIOBase, memory_size: int):
    """Merge the sorted block files in ``tmp_files`` into ``fout``.

    Every entry of ``tmp_files`` is reopened by its ``name`` attribute,
    rewound to offset 0, and stored back into ``tmp_files`` at the same index
    (the list is modified in place).  The reopened files are read with
    ``read_content``, combined with ``heapq.merge`` and written to ``fout``.

    All reopened handles are closed before returning, including when opening,
    reading or writing raises.

    Args:
        tmp_files: Mutable sequence of objects exposing ``name``; entries are
            replaced by the reopened file objects.
        fout: Destination stream for the merged output.
        memory_size: Budget used to derive the per-file buffer size.
    """
    # let's make output buffer slightly larger
    # we can use 3 times `memory_size` for buffers
    # NOTE(review): this is 0 once len(tmp_files) + 2 > 3 * memory_size --
    # confirm how read_content / write_content treat batch_size=0.
    buffer_size = 3 * memory_size // (len(tmp_files) + 2)

    reopened = []
    try:
        for i, f in enumerate(tmp_files):
            handle = open(f.name, 'a+b')
            # Track the handle right away so it is closed on any later failure.
            reopened.append(handle)
            handle.seek(0)
            tmp_files[i] = handle

        generators = [read_content(handle, batch_size=buffer_size) for handle in reopened]
        write_content(fout, heapq.merge(*generators), batch_size=2 * buffer_size)
    finally:
        for handle in reopened:
            handle.close()