Esempio n. 1
0
def merge_sort_stupid(fin: io.BufferedIOBase, fout: io.BufferedIOBase, memory_size: int, left=0, count=None):
    """Sort ``count`` values of ``fin`` starting at ``left`` and write them to ``fout``.

    A range no larger than ``memory_size`` is read, sorted in memory and
    written out directly. A larger range is split into two halves; each half
    is sorted recursively into its own temporary file and the two sorted
    files are then merged into ``fout``.

    Args:
        fin: Source stream, read through ``read_content``.
        fout: Destination stream; rewound to offset 0 before anything else.
        memory_size: Largest ``count`` that is sorted directly in memory. Also
            used as the write batch size, and halved for each merge input.
        left: Position handed to ``go_to_pos`` as the start of the range.
        count: Size of the range. When ``None`` it is taken from
            ``content_length(fin, preserve_pos=False)``.
    """
    fout.seek(0)
    if count is None:
        count = content_length(fin, preserve_pos=False)

    if count > memory_size:
        left_count = count // 2
        right_count = count - left_count
        read_batch = memory_size // 2
        with tmp_file() as left_f, tmp_file() as right_f:
            merge_sort_stupid(fin, left_f, memory_size, left, count=left_count)
            merge_sort_stupid(fin, right_f, memory_size, left + left_count, count=right_count)
            # Rewind both halves before reading them back for the merge.
            for half in (left_f, right_f):
                half.seek(0)
            merged = heapq.merge(read_content(left_f, batch_size=read_batch),
                                 read_content(right_f, batch_size=read_batch))
            write_content(fout, merged, batch_size=memory_size)
        return

    # The range fits in memory: sort it in one go.
    go_to_pos(fin, left)
    chunk = sorted(read_content(fin, count=count))
    write_content(fout, chunk, batch_size=memory_size)
Esempio n. 2
0
def _to_sorted_blocks(fin: io.BufferedIOBase, memory_size):
    """Split ``fin`` into sorted blocks, each stored in its own temporary file.

    Repeatedly takes ``read_content(fin, memory_size)``, sorts what was read
    and writes it to a fresh ``tmp_file()``. Iteration stops at the first read
    that produces no values.

    Args:
        fin: Source stream, read through ``read_content``.
        memory_size: Second argument forwarded to ``read_content`` for every
            block.

    Yields:
        The temporary file object of each block, already closed.
    """
    while True:
        sorted_values = sorted(read_content(fin, memory_size))
        if not sorted_values:
            break

        f = tmp_file()
        try:
            write_content(f, sorted_values)
        finally:
            # Close even when the write fails so the handle is not leaked.
            f.close()
        yield f
Esempio n. 3
0
    def _check_sorted(self, source: io.BufferedIOBase, res: io.BufferedIOBase):
        """Assert that ``res`` is in non-decreasing order and matches ``source``.

        Both streams are rewound and read through ``read_content``. Instead of
        keeping every value, each stream is summarised as a table of counts
        indexed by ``hash(value) % 2**20``; the two tables must be equal.
        Because values are bucketed by hash, colliding values are not
        distinguished by this comparison.

        Args:
            source: Stream holding the original, unsorted content.
            res: Stream holding the content that is expected to be sorted.
        """
        bucket_count = 2**20

        def bucket_of(item):
            return hash(item) % bucket_count

        source.seek(0)
        expected = [0] * bucket_count
        for item in read_content(source):
            expected[bucket_of(item)] += 1

        res.seek(0)
        actual = [0] * bucket_count
        previous = None
        for item in read_content(res):
            actual[bucket_of(item)] += 1
            # Every value must be at least as large as the one before it.
            self.assertTrue(previous is None or previous <= item)
            previous = item

        self.assertTrue(expected == actual, 'Content differs')
Esempio n. 4
0
def _merge_blocks(tmp_files, fout: io.BufferedIOBase, memory_size: int):
    """Merge already-sorted block files into ``fout``.

    Every entry of ``tmp_files`` is reopened by its ``name`` and the entry is
    replaced in place with the reopened handle. The blocks are then merged
    with ``heapq.merge`` and written to ``fout``. All handles opened here are
    closed before returning, including when an error occurs.

    Args:
        tmp_files: Mutable sequence of objects exposing a ``name`` attribute
            that points at a sorted block file.
        fout: Destination stream passed to ``write_content``.
        memory_size: Budget from which the per-block read buffer and the
            output buffer sizes are derived.
    """
    # let's make output buffer slightly larger
    # we can use 3 times `memory_size` for buffers
    # NOTE(review): with many blocks the division floors to 0; clamping to 1
    # assumes a batch size of 0 is not meaningful to read_content/write_content.
    buffer_size = max(1, 3 * memory_size // (len(tmp_files) + 2))

    # Track only the handles opened here, so that a failure part-way through
    # never closes objects that still belong to the caller.
    opened = []
    try:
        for i, block in enumerate(tmp_files):
            handle = open(block.name, 'a+b')
            opened.append(handle)
            handle.seek(0)
            tmp_files[i] = handle

        generators = [read_content(f, batch_size=buffer_size) for f in tmp_files]
        write_content(fout, heapq.merge(*generators), batch_size=2 * buffer_size)
    finally:
        for handle in opened:
            handle.close()