Ejemplo n.º 1
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Spill ``(key, value)`` pairs to *filename*, sort the file, and yield
    the records read back from it.

    Each pair from *input* is written as
    ``key + 0xFF + encode(pickle_dumps(value, 0)) + 0x00``.  The file is then
    passed to ``unix_sort`` and re-read through ``sort_reader``.

    This is a generator, so none of the work (including the progress
    messages) happens until the first item is requested.

    :param worker: object exposing ``send(type, message)``; when falsy, no
        progress messages are sent.
    :param input: iterable of ``(key, value)`` pairs.  Keys must be ``bytes``
        and must not contain the 0xFF or 0x00 delimiter bytes.
    :param filename: path of the scratch file that is written and sorted.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :raises ValueError: if a key is not ``bytes`` or contains a delimiter
        byte.
    :returns: iterator of ``(key, value)`` pairs.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile

    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))

    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        # 0xFF separates key from value and 0x00 terminates a record, so
        # neither byte may appear inside a key.
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # value pickled using protocol 0 will always be printable ASCII
        out_fd.write(key + b'\xff')
        out_fd.write(encode(pickle_dumps(value, 0)) + b'\x00')
    # Deliberately not closed on error: out_fd.close() is only reached when
    # every record was written.
    out_fd.close()

    if worker:
        worker.send('MSG', "Downloaded {0:s} OK".format(format_size(getsize(filename))))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if worker:
        worker.send('MSG', "Finished sorting")

    fd = open_local(filename)
    for k, v in sort_reader(fd, fd.url):
        # NOTE(review): the write path applies encode() *after* pickling,
        # but here pickle_loads() runs *before* decode().  Whether this is
        # the correct inverse depends on what sort_reader yields -- confirm.
        yield k, bytes_to_str(decode(str_to_bytes(pickle_loads(v))))
Ejemplo n.º 2
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Spill ``(key, value)`` pairs to *filename*, sort the file, and yield
    the unpickled records read back from it.

    Each pair from *input* is written as
    ``key + 0xFF + pickle_dumps(value, 0) + 0x00``.  The file is then passed
    to ``unix_sort`` and re-read with ``re_reader`` using a pattern that
    splits on those two delimiter bytes.

    This is a generator, so none of the work (including the progress
    messages) happens until the first item is requested.

    :param worker: object exposing ``send(type, message)``, or ``None`` to
        run without progress messages.
    :param input: iterable of ``(key, value)`` pairs.  Keys must be ``bytes``
        and must not contain the 0xFF or 0x00 delimiter bytes.
    :param filename: path of the scratch file that is written and sorted.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :raises ValueError: if a key is not ``bytes`` or contains a delimiter
        byte.
    :returns: iterator of ``(key, value)`` pairs.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    # Progress reporting is optional: a None worker used to raise
    # AttributeError on the first send(); any non-None worker behaves
    # exactly as before.
    report = worker is not None

    if report:
        worker.send('MSG', "Downloading {0}".format(filename))

    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        # 0xFF separates key from value and 0x00 terminates a record, so
        # neither byte may appear inside a key.
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # value pickled using protocol 0 will always be printable ASCII
        out_fd.write(key + b'\xff')
        out_fd.write(pickle_dumps(value, 0) + b'\x00')
    # Deliberately not closed on error: out_fd.close() is only reached when
    # every record was written.
    out_fd.close()

    if report:
        worker.send('MSG',
                    "Downloaded {0:s} OK".format(format_size(getsize(filename))))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if report:
        worker.send('MSG', "Finished sorting")

    fd = open_local(filename)
    for k, v in re_reader(b"(?s)(.*?)\xff(.*?)\x00", fd, len(fd), fd.url):
        yield k, pickle_loads(v)
Ejemplo n.º 3
0
 def append(self, record):
     """Serialize *record* into the current hunk and flush when it is large.

     The record is passed through ``pickle_dumps`` with ``1`` as the second
     argument and handed to ``self.hunk_write``.  If ``self.hunk_size`` then
     exceeds ``self.min_hunk_size``, ``self.flush()`` is called.
     """
     serialized = pickle_dumps(record, 1)
     self.hunk_write(serialized)
     # Checked only after the write, so the new record counts toward the size.
     hunk_is_large_enough = self.hunk_size > self.min_hunk_size
     if hunk_is_large_enough:
         self.flush()
Ejemplo n.º 4
0
 def append(self, record):
     """Write the pickled *record* to the hunk, flushing past the threshold.

     ``pickle_dumps(record, 1)`` produces the payload given to
     ``self.hunk_write``; ``self.flush()`` runs only when ``self.hunk_size``
     is greater than ``self.min_hunk_size`` after that write.
     """
     payload = pickle_dumps(record, 1)
     self.hunk_write(payload)
     # Guard clause: nothing more to do while the hunk is still small.
     if not (self.hunk_size > self.min_hunk_size):
         return
     self.flush()
Ejemplo n.º 5
0
 def transform(self, s):
     """Assert that ``decode(encode(...))`` returns the pickled *s* unchanged.

     *s* is pickled via ``pickle_dumps`` with ``2`` as the second argument;
     the result is encoded, decoded again, and compared with the original
     pickled value using ``self.assertEqual``.
     """
     expected = pickle_dumps(s, 2)
     encoded = encode(expected)
     round_tripped = decode(encoded)
     self.assertEqual(expected, round_tripped)
Ejemplo n.º 6
0
 def transform(self, s):
     """Check the encode/decode round trip on the pickled form of *s*.

     The value returned by ``pickle_dumps(s, 2)`` must equal the result of
     running it through ``encode`` and then ``decode``; the comparison is
     made with ``self.assertEqual``.
     """
     original = pickle_dumps(s, 2)
     restored = decode(encode(original))
     # Original first, restored second -- same argument order as before so
     # failure messages read the same way.
     self.assertEqual(original, restored)