Example #1
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Sort ``(key, value)`` pairs on disk and yield them back in sorted order.

    Each pair from ``input`` is appended to ``filename`` as
    ``key + 0xFF + encode(pickle(value)) + 0x00``.  The file is then passed to
    ``unix_sort`` and read back through ``sort_reader``.

    :param worker: object whose ``send(type, payload)`` receives progress
                   messages; a falsy value disables every message.
    :param input: iterable of ``(key, value)`` pairs; keys must be ``bytes``.
    :param filename: path of the scratch file that is written and sorted.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :raises ValueError: if a key is not ``bytes`` or contains a 0xFF or 0x00
                        byte (both bytes serve as separators in the file).
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    from disco.worker.task_io import re_reader  # imported but not referenced below

    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))

    spool = AtomicFile(filename)
    for key, value in input:
        # Guard clauses: reject anything that would corrupt the record framing.
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # A value pickled with protocol 0 is always printable ASCII.
        spool.write(key + b'\xff')
        spool.write(encode(pickle_dumps(value, 0)) + b'\x00')
    spool.close()

    if worker:
        size_text = format_size(getsize(filename))
        worker.send('MSG', "Downloaded {0:s} OK".format(size_text))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if worker:
        worker.send('MSG', ("Finished sorting"))

    sorted_fd = open_local(filename)
    for sorted_key, blob in sort_reader(sorted_fd, sorted_fd.url):
        decoded = decode(str_to_bytes(pickle_loads(blob)))
        yield sorted_key, bytes_to_str(decoded)
Example #2
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Sort ``(key, value)`` pairs on disk and yield them back in sorted order.

    Each pair is written to ``filename`` as ``key + 0xFF + pickle(value) +
    0x00``.  The file is passed to ``unix_sort`` and then parsed again with
    ``re_reader``; every value is unpickled before it is yielded.

    :param worker: object whose ``send(type, payload)`` receives progress
                   messages; it is called unconditionally.
    :param input: iterable of ``(key, value)`` pairs; keys must be ``bytes``.
    :param filename: path of the scratch file that is written and sorted.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :raises ValueError: if a key is not ``bytes`` or contains a 0xFF or 0x00
                        byte (both bytes serve as separators in the file).
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    worker.send('MSG', "Downloading {0}".format(filename))

    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # A value pickled with protocol 0 is always printable ASCII.
        spool.write(key + b'\xff')
        spool.write(pickle_dumps(value, 0) + b'\x00')
    spool.close()

    size_text = format_size(getsize(filename))
    worker.send('MSG', "Downloaded {0:s} OK".format(size_text))
    worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', ("Finished sorting"))

    sorted_fd = open_local(filename)
    records = re_reader(b"(?s)(.*?)\xff(.*?)\x00",
                        sorted_fd,
                        len(sorted_fd),
                        sorted_fd.url)
    for sorted_key, pickled in records:
        yield sorted_key, pickle_loads(pickled)
Example #3
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Sort ``(key, value)`` pairs on disk and yield them back in sorted order.

    Each pair is written to ``filename`` as ``key 0xFF pickle(value) 0x00``.
    The file is passed to ``unix_sort`` and then read back with
    ``delimited_reader``; every value is unpickled before it is yielded.

    :param worker: object whose ``send(type, payload)`` receives progress
                   messages; it is called unconditionally.
    :param input: iterable of ``(key, value)`` pairs; keys must be ``str``.
    :param filename: path of the scratch file that is written and sorted.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :raises ValueError: if a key is not a ``str`` or contains a 0xFF or 0x00
                        character (both serve as separators in the file).
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    worker.send('MSG', "Downloading %s" % filename)

    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        if '\xff' in key or '\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # A value pickled with protocol 0 is always printable ASCII.
        pickled = cPickle.dumps(value, 0)
        spool.write('%s\xff%s\x00' % (key, pickled))
    spool.close()

    size_text = format_size(getsize(filename))
    worker.send('MSG', "Downloaded %s OK" % size_text)
    worker.send('MSG', "Sorting %s..." % filename)
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', ("Finished sorting"))

    sorted_fd = open_local(filename)
    records = delimited_reader(sorted_fd,
                               len(sorted_fd),
                               sorted_fd.url,
                               delimiter='\xff',
                               line_terminator='\x00')
    for sorted_key, blob in records:
        yield sorted_key, cPickle.loads(blob)
Example #4
0
def input_stream(fd, size, url, params):
    """Open the input named by `url` from the directory `[Task.root]/input`.

    The first six characters of `url` are skipped, the remainder is split at
    its first "/" and only the part after that slash is used as the file name.
    `fd`, `size` and `params` are not used.

    Raises ValueError if the remainder of `url` contains no "/".
    """
    import os
    from disco.comm import open_local
    _, relative_name = url[6:].split("/", 1)
    return open_local(os.path.join(Task.root, "input", relative_name))
Example #5
0
def disk_sort(input, filename, sort_keys, binaries=(), sort_buffer_size='10%',
              desc=False):
    """Sort multi-column keys on disk and yield them back in sorted order.

    Every key from ``input`` is a sequence of parts.  Each part is serialized
    and followed by a 0xFF byte, and each key is terminated by a newline:

    * ``None`` is written as a single 0x00 byte,
    * a part of a column listed in ``binaries`` whose column was not inferred
      to be numeric is written base64-encoded,
    * every other part is written as JSON (``ujson.dumps``).

    Column types are inferred from the first key only: a column is numeric
    (``'n'``) when ``float(part)`` succeeds, otherwise its type is ``''``.
    The file is then passed to ``unix_sort`` and read back through
    ``sort_reader``, reversing the serialization above.

    :param input: iterable of ``(key, value)`` pairs; the value is ignored and
                  the key must be a sequence, not a string.
    :param filename: scratch file written through ``AtomicFile``.
    :param sort_keys: column indexes to sort on; each is handed to
                      ``unix_sort`` paired with its inferred column type.
    :param binaries: column indexes holding binary data.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :param desc: forwarded unchanged to ``unix_sort``.
    :returns: generator of ``(parts, ())`` tuples, ``parts`` being a list.
    :raises ValueError: if a key is a string instead of a sequence.
    """
    import ujson
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    import base64

    out_fd = AtomicFile(filename)
    key_types = None
    MPT = ()
    for key, _ in input:
        if isinstance(key, (str, unicode)):
            raise ValueError("Keys must be sequences", key)

        # Infer the column types once, from the first key that is seen.
        if key_types is None:
            key_types = []
            for kt in key:
                try:
                    float(kt)
                except (TypeError, ValueError, OverflowError):
                    # Only conversion failures mean "not numeric"; anything
                    # else (e.g. KeyboardInterrupt) must propagate.
                    key_types.append('')
                else:
                    key_types.append('n')

        # Serialize each part as NULL marker, base64 or JSON (see docstring).
        for i, each_key in enumerate(key):
            if each_key is None:
                ukey = b'\x00'
            elif i in binaries and key_types[i] != 'n':
                ukey = base64.b64encode(each_key)
            else:
                ukey = ujson.dumps(each_key)
            out_fd.write(ukey)
            out_fd.write(b'\xff')
        out_fd.write('\n')
    out_fd.flush()
    out_fd.close()

    if key_types is None:
        # Empty input: nothing was written, so there is nothing to sort or
        # yield.  Without this guard ``key_types[sk]`` below raises TypeError.
        return

    unix_sort(filename,
              [(sk, key_types[sk]) for sk in sort_keys],
              sort_buffer_size=sort_buffer_size,
              desc=desc)
    fd = open_local(filename)
    for k in sort_reader(fd, fd.url):
        rval = []
        for i, key in enumerate(k):
            if key == b'\x00':
                rkey = None
            elif i in binaries and key_types[i] != 'n':
                # Must mirror the write-side condition exactly: numeric
                # columns were JSON-encoded even when listed in ``binaries``.
                rkey = base64.b64decode(key)
            else:
                rkey = ujson.loads(key)
            rval.append(rkey)
        yield rval, MPT
Example #6
0
def input_stream(fd, size, url, params):
    """Open a `disco://host/path` style url, locally when possible.

    The first eight characters of `url` (the length of `disco://`) are
    skipped and the remainder is split into `host` and file name at the first
    "/".  When `host` equals `Task.host`, or the task has the `resultfs`
    flag, the file `[Task.root]/data/<name>` is opened locally.  Otherwise
    `http://<host>:<Task.port>/<name>` is opened with an http client.

    `fd`, `size` and `params` are not used.
    """
    host, fname = url[8:].split("/", 1)
    readable_locally = host == Task.host or Task.has_flag("resultfs")
    if not readable_locally:
        return comm.open_remote("http://%s:%s/%s" % (host, Task.port, fname))
    local_path = os.path.join(Task.root, "data", fname)
    return comm.open_local(local_path, url)
Example #7
0
def input_stream(fd, size, url, params):
    """
    Open `url` from the local data directories when it lives on this host,
    otherwise fetch it with an http client.

    The path part of `url` is split at its first "/" into a prefix and a file
    name.  If the first element of the url's netloc equals the first element
    of `Task.netloc`, the file is opened locally below the `DDFS_ROOT` setting
    (prefix `ddfs`) or the `DISCO_DATA` setting (any other prefix).
    Otherwise `http://<netloc>/<prefix>/<name>` is opened remotely.
    """
    scheme, netloc, rest = urlsplit(url)
    prefix, fname = rest.split('/', 1)
    on_this_host = netloc[0] == Task.netloc[0]
    if not on_this_host:
        return comm.open_remote('http://%s/%s/%s' % (netloc, prefix, fname))
    root_setting = 'DDFS_ROOT' if prefix == 'ddfs' else 'DISCO_DATA'
    return comm.open_local(os.path.join(Task.settings[root_setting], fname))
Example #8
0
    def sorted_entries(self):
        """Spool `self.entries` to disk, sort the file and yield the result.

        Every `(key, value)` entry is written to the file
        `reduce-in-<id>.dl` (resolved through `self.path`) as
        `key 0xFF pickle(value) 0x00`.  After `self.disk_sort` has processed
        the file it is parsed again and `(key, unpickled value)` pairs are
        yielded.

        Raises ValueError if a key is not a `str` or contains a 0xFF or 0x00
        character, since both are used as separators in the file.
        """
        spool_path = self.path('reduce-in-%d.dl' % self.id)
        Message("Downloading %s" % spool_path)

        spool = AtomicFile(spool_path, 'w')
        for key, value in self.entries:
            if not isinstance(key, str):
                raise ValueError("Keys must be strings for external sort")
            if '\xff' in key or '\x00' in key:
                raise ValueError("Cannot sort keys with 0xFF or 0x00 bytes")
            # A value pickled with protocol 0 is always printable ASCII.
            pickled = cPickle.dumps(value, 0)
            spool.write('%s\xff%s\x00' % (key, pickled))
        spool.close()
        Message("Downloaded OK")

        self.disk_sort(spool_path)
        handle, length, local_url = comm.open_local(spool_path)
        records = func.re_reader("(?s)(.*?)\xff(.*?)\x00",
                                 handle, length, local_url)
        for sorted_key, blob in records:
            yield sorted_key, cPickle.loads(blob)
Example #9
0
    def sorted_entries(self):
        """Spool `self.entries` to disk, sort the file and yield the result.

        Every `(key, value)` entry is written to the file
        `reduce-in-<id>.dl` (resolved through `self.path`) as
        `key 0xFF pickle(value) 0x00`.  After `self.disk_sort` has processed
        the file it is parsed again and `(key, unpickled value)` pairs are
        yielded.

        Raises ValueError (carrying the offending key) if a key is not a
        `str` or contains a 0xFF or 0x00 character, since both are used as
        separators in the file.
        """
        spool_path = self.path('reduce-in-%d.dl' % self.id)
        Status("Downloading %s" % spool_path)

        spool = AtomicFile(spool_path, 'w')
        for key, value in self.entries:
            if not isinstance(key, str):
                raise ValueError("Keys must be strings for external sort", key)
            if '\xff' in key or '\x00' in key:
                raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
            # A value pickled with protocol 0 is always printable ASCII.
            pickled = cPickle.dumps(value, 0)
            spool.write('%s\xff%s\x00' % (key, pickled))
        spool.close()
        Status("Downloaded OK")

        self.disk_sort(spool_path)
        handle, length, local_url = comm.open_local(spool_path)
        records = func.re_reader("(?s)(.*?)\xff(.*?)\x00",
                                 handle, length, local_url)
        for sorted_key, blob in records:
            yield sorted_key, cPickle.loads(blob)
Example #10
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Externally sort ``(key, value)`` pairs and yield them in sorted order.

    The pairs are spooled to ``filename`` as ``key 0xFF pickle(value) 0x00``
    records, the file is passed to ``unix_sort`` and the records are read
    back with ``delimited_reader``.  Values are unpickled before being
    yielded.

    :param worker: object whose ``send(type, payload)`` receives progress
                   messages; it is called unconditionally.
    :param input: iterable of ``(key, value)`` pairs; keys must be ``str``.
    :param filename: path of the scratch file that is written and sorted.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :raises ValueError: if a key is not a ``str`` or contains a 0xFF or 0x00
                        character (both serve as separators in the file).
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    worker.send('MSG', "Downloading %s" % filename)

    scratch = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        if '\xff' in key or '\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # A value pickled with protocol 0 is always printable ASCII.
        record = '%s\xff%s\x00' % (key, cPickle.dumps(value, 0))
        scratch.write(record)
    scratch.close()

    human_size = format_size(getsize(filename))
    worker.send('MSG', "Downloaded %s OK" % human_size)
    worker.send('MSG', "Sorting %s..." % filename)
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', ("Finished sorting"))

    sorted_fd = open_local(filename)
    reader = delimited_reader(sorted_fd,
                              len(sorted_fd),
                              sorted_fd.url,
                              delimiter='\xff',
                              line_terminator='\x00')
    for sorted_key, pickled in reader:
        yield sorted_key, cPickle.loads(pickled)
Example #11
0
def input_stream(fd, size, url, params):
    """Open `url` locally on the node.

    The path given to `open_local` is `url` without its first seven
    characters (presumably a `file://` prefix -- confirm against callers);
    the unmodified `url` is passed along as the second argument.
    `fd`, `size` and `params` are not used.
    """
    local_path = url[7:]
    return open_local(local_path, url)
Example #12
0
def input_stream(fd, size, url, params):
    """Open `url` locally on the node.

    `schemesplit` separates `url` into scheme and path; only the path is
    opened, with the unmodified `url` passed as the second argument to
    `open_local`.  `fd`, `size` and `params` are not used.
    """
    _, local_path = schemesplit(url)
    return open_local(local_path, url)
Example #13
0
 def sort_reader(self, url):
     """Yield `(key, unpickled value)` pairs from the local file at `url`.

     The file is opened with `comm.open_local` and parsed as a sequence of
     `key 0xFF pickled-value 0x00` records.
     """
     handle, length, resolved_url = comm.open_local(url, url)
     record_pattern = "(?s)(.*?)\xff(.*?)\x00"
     for key, pickled in func.re_reader(record_pattern, handle, length,
                                        resolved_url):
         yield key, cPickle.loads(pickled)
Example #14
0
def input_stream(fd, size, url, params):
    """Open `url` locally on the node.

    `schemesplit` separates `url` into scheme and path; the scheme is
    discarded and the path is opened with `open_local`.
    `fd`, `size` and `params` are not used.
    """
    from disco.comm import open_local
    from disco.util import schemesplit
    _, local_path = schemesplit(url)
    return open_local(local_path)
Example #15
0
 def open_url(self, url):
     """Open `url` locally or remotely depending on its scheme.

     `url` is split with `util.urlsplit`, using `self.host` as localhost.
     A missing scheme or the scheme `file` opens the path part locally;
     any other scheme is re-assembled as `scheme://netloc/path` and opened
     remotely.
     """
     scheme, netloc, rest = util.urlsplit(url, localhost=self.host)
     is_local = not scheme or scheme == 'file'
     if is_local:
         return comm.open_local(rest)
     remote_url = '%s://%s/%s' % (scheme, netloc, rest)
     return comm.open_remote(remote_url)