Esempio n. 1
0
    def download_and_sort(self, params):
        """
        Spool all reduce inputs into one local file, sort it with the
        external ``sort`` program and return an iterator over the result.

        Every ``(key, value)`` pair read from ``self.inputs`` is written as
        ``key<space>value<NUL>``.  Keys containing a space and values
        containing a NUL byte are reported through ``err`` because they
        would clash with those delimiters.

        Returns whatever ``self.multi_file_iterator`` returns for the sorted
        file, using a reader that splits records on the same delimiters.
        """
        download_path = Task.path("REDUCE_DL", Task.id)
        msg("Reduce will be downloaded to %s" % download_path)

        spool = AtomicFile(download_path, "w")
        for input_url in self.inputs:
            stream, size, input_url = connect_input(input_url, params)
            for key, value in fun_reader(stream, size, input_url):
                # A space separates key from value and NUL terminates the
                # record, so neither may appear inside its field.
                if " " in key:
                    err("Spaces are not allowed in keys with external sort.")
                if "\0" in value:
                    err("Zero bytes are not allowed in values with external "
                        "sort. Consider using base64 encoding.")
                spool.write("%s %s\0" % (key, value))
        spool.close()
        msg("Reduce input downloaded ok")

        msg("Starting external sort")
        sorted_path = Task.path("REDUCE_SORTED", Task.id)
        ensure_path(os.path.dirname(sorted_path))

        # sort(1): numeric compare (-n) on the first field only (-k 1,1),
        # NUL-terminated records (-z), space as field separator (-t).
        sort_cmd = ["sort", "-n", "-k", "1,1", "-z", "-t", " "]
        sort_cmd += ["-o", sorted_path, download_path]

        exit_status = subprocess.Popen(sort_cmd).wait()
        if exit_status:
            err("Sorting %s to %s failed (%d)" %
                (download_path, sorted_path, exit_status))

        msg("External sort done: %s" % sorted_path)

        def sorted_record_reader(fd, sze, url):
            # Non-greedy groups pick up the key (up to the first space) and
            # the value (up to the terminating NUL) of each record.
            return re_reader("(?s)(.*?) (.*?)\000", fd, sze, url)

        return self.multi_file_iterator([sorted_path], params,
                                        reader=sorted_record_reader)
Esempio n. 2
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """
    Generator that spools ``(key, value)`` pairs from *input* into
    *filename*, sorts the file with ``unix_sort`` and yields the pairs read
    back from it.

    Records are stored as ``key 0xFF pickled-value 0x00``, so keys must be
    ``bytes`` that contain neither delimiter byte.  Progress messages are
    sent with ``worker.send('MSG', ...)``.

    :param worker: object whose ``send`` method receives progress messages.
    :param input: iterable of ``(key, value)`` pairs.
    :param filename: path of the spool file that gets written and sorted.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :raises ValueError: for a non-bytes key or a key containing 0xFF/0x00.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    field_sep, record_end = b'\xff', b'\x00'

    worker.send('MSG', "Downloading {0}".format(filename))
    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if field_sep in key or record_end in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # value pickled using protocol 0 will always be printable ASCII,
        # so it cannot collide with either delimiter byte
        spool.write(key + field_sep)
        spool.write(pickle_dumps(value, 0) + record_end)
    spool.close()

    spooled_size = format_size(getsize(filename))
    worker.send('MSG', "Downloaded {0:s} OK".format(spooled_size))
    worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', "Finished sorting")

    sorted_fd = open_local(filename)
    records = re_reader(b"(?s)(.*?)\xff(.*?)\x00",
                        sorted_fd, len(sorted_fd), sorted_fd.url)
    for sorted_key, pickled_value in records:
        yield sorted_key, pickle_loads(pickled_value)
Esempio n. 3
0
def task_output_stream(stream, partition, url, params):
    """
    An :func:`output_stream` that opens the task output located at *url*.

    Returns an ``AtomicFile`` created for *url*, so that (per the intent of
    ``AtomicFile``) partially written data is ignored if the task fails.
    *stream*, *partition* and *params* are accepted for the
    :func:`output_stream` signature and are not used here.
    """
    from disco.fileutils import AtomicFile
    handle = AtomicFile(url)
    return handle
Esempio n. 4
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """
    Generator that spools ``(key, value)`` pairs from *input* into
    *filename*, sorts the file with ``unix_sort`` and yields the decoded
    pairs read back through ``sort_reader``.

    Records are stored as ``key 0xFF encoded-pickle 0x00``, so keys must be
    ``bytes`` that contain neither delimiter byte.

    :param worker: receives progress messages via ``send('MSG', ...)``;
        pass a falsy value to suppress all messages.
    :param input: iterable of ``(key, value)`` pairs.
    :param filename: path of the spool file that gets written and sorted.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :raises ValueError: for a non-bytes key or a key containing 0xFF/0x00.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    from disco.worker.task_io import re_reader

    field_sep, record_end = b'\xff', b'\x00'

    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))

    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if field_sep in key or record_end in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # The value is pickled with protocol 0 and then passed through
        # encode(); presumably this keeps delimiter bytes out of the
        # payload -- verify against encode()'s definition.
        spool.write(key + field_sep)
        spool.write(encode(pickle_dumps(value, 0)) + record_end)
    spool.close()

    if worker:
        spooled_size = format_size(getsize(filename))
        worker.send('MSG', "Downloaded {0:s} OK".format(spooled_size))
        worker.send('MSG', "Sorting {0}...".format(filename))

    unix_sort(filename, sort_buffer_size=sort_buffer_size)

    if worker:
        worker.send('MSG', "Finished sorting")

    sorted_fd = open_local(filename)
    for sorted_key, payload in sort_reader(sorted_fd, sorted_fd.url):
        # Undo the write-side transformation step by step.
        unpickled = pickle_loads(payload)
        decoded = decode(str_to_bytes(unpickled))
        yield sorted_key, bytes_to_str(decoded)
Esempio n. 5
0
def reduce_output_stream(stream, partition, url, params):
    """
    An :func:`output_stream` that opens the reduce output of the current task.

    The output location is taken from ``Task.reduce_output`` (a
    ``(path, url)`` pair); the incoming *url* argument is not used.  The path
    is appended to ``Task.blobs`` before the file is opened.

    Returns ``(AtomicFile opened for writing, url)``.  Per the intent of
    ``AtomicFile``, partially written data is ignored if the task fails.
    """
    from disco.fileutils import AtomicFile
    out_path, out_url = Task.reduce_output
    Task.blobs.append(out_path)
    handle = AtomicFile(out_path, 'w')
    return handle, out_url
Esempio n. 6
0
    def sorted_entries(self):
        """
        Generator that spools ``self.entries`` to a local file, sorts it
        with ``self.disk_sort`` and yields the ``(key, value)`` pairs read
        back from it.

        Records are stored as ``key 0xFF pickled-value 0x00``, so keys must
        be ``str`` and contain neither delimiter character.

        Raises ``ValueError`` for a non-string key or a key containing
        0xFF/0x00.
        """
        download_path = self.path('reduce-in-%d.dl' % self.id)
        Message("Downloading %s" % download_path)

        spool = AtomicFile(download_path, 'w')
        for key, value in self.entries:
            if not isinstance(key, str):
                raise ValueError("Keys must be strings for external sort")
            if any(marker in key for marker in ('\xff', '\x00')):
                raise ValueError("Cannot sort keys with 0xFF or 0x00 bytes")
            # value pickled using protocol 0 will always be printable ASCII,
            # so it cannot collide with either delimiter
            pickled = cPickle.dumps(value, 0)
            spool.write('%s\xff%s\x00' % (key, pickled))
        spool.close()
        Message("Downloaded OK")

        self.disk_sort(download_path)

        fd, size, url = comm.open_local(download_path)
        records = func.re_reader("(?s)(.*?)\xff(.*?)\x00", fd, size, url)
        for sorted_key, pickled_value in records:
            yield sorted_key, cPickle.loads(pickled_value)
Esempio n. 7
0
def map_output_stream(stream, partition, url, params):
    """
    An :func:`output_stream` that opens the map output for *partition*.

    The location comes from ``Task.partition_output(partition)`` when
    ``Task.ispartitioned`` is true and from ``Task.map_output(partition)``
    otherwise; the incoming *url* argument is not used.  The chosen path is
    appended to ``Task.blobs`` before the file is opened.

    Returns ``(AtomicFile opened for writing, url)``.  Per the intent of
    ``AtomicFile``, partially written data is ignored if the task fails.
    """
    from disco.fileutils import AtomicFile
    locate = Task.partition_output if Task.ispartitioned else Task.map_output
    out_path, out_url = locate(partition)
    Task.blobs.append(out_path)
    handle = AtomicFile(out_path, 'w')
    return handle, out_url
Esempio n. 8
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """
    Generator that spools ``(key, value)`` pairs from *input* into
    *filename*, sorts the file with ``unix_sort`` and yields the pairs read
    back through ``delimited_reader``.

    Records are stored as ``key 0xFF pickled-value 0x00``, so keys must be
    ``str`` and contain neither delimiter character.  Progress messages are
    sent with ``worker.send('MSG', ...)``.

    :param worker: object whose ``send`` method receives progress messages.
    :param input: iterable of ``(key, value)`` pairs.
    :param filename: path of the spool file that gets written and sorted.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :raises ValueError: for a non-string key or a key containing 0xFF/0x00.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    worker.send('MSG', "Downloading %s" % filename)
    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        if any(marker in key for marker in ('\xff', '\x00')):
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # value pickled using protocol 0 will always be printable ASCII,
        # so it cannot collide with either delimiter
        pickled = cPickle.dumps(value, 0)
        spool.write('%s\xff%s\x00' % (key, pickled))
    spool.close()

    spooled_size = format_size(getsize(filename))
    worker.send('MSG', "Downloaded %s OK" % spooled_size)
    worker.send('MSG', "Sorting %s..." % filename)
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', "Finished sorting")

    sorted_fd = open_local(filename)
    records = delimited_reader(sorted_fd,
                               len(sorted_fd),
                               sorted_fd.url,
                               delimiter='\xff',
                               line_terminator='\x00')
    for sorted_key, pickled_value in records:
        yield sorted_key, cPickle.loads(pickled_value)
Esempio n. 9
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """
    Sort ``(key, value)`` pairs on disk and yield them back.

    The pairs from *input* are written to *filename* as
    ``key 0xFF pickled-value 0x00`` records, the file is sorted with
    ``unix_sort``, and the records are then read with ``delimited_reader``
    and yielded with the value unpickled.  This is a generator, so nothing
    happens until it is iterated.

    :param worker: object whose ``send`` method receives progress messages.
    :param input: iterable of ``(key, value)`` pairs; keys must be ``str``.
    :param filename: path of the spool file that gets written and sorted.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :raises ValueError: for a non-string key or a key containing 0xFF/0x00.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    def report(text):
        # All progress output goes to the worker as 'MSG' events.
        worker.send('MSG', text)

    report("Downloading %s" % filename)
    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        if '\xff' in key or '\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # value pickled using protocol 0 will always be printable ASCII
        record = '%s\xff%s\x00' % (key, cPickle.dumps(value, 0))
        spool.write(record)
    spool.close()

    report("Downloaded %s OK" % format_size(getsize(filename)))
    report("Sorting %s..." % filename)
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    report("Finished sorting")

    sorted_fd = open_local(filename)
    reader_options = dict(delimiter='\xff', line_terminator='\x00')
    for sorted_key, pickled_value in delimited_reader(sorted_fd,
                                                      len(sorted_fd),
                                                      sorted_fd.url,
                                                      **reader_options):
        yield sorted_key, cPickle.loads(pickled_value)
Esempio n. 10
0
    def download_and_sort(self):
        """
        Spool all reduce inputs into one local file, sort it with the
        external ``sort`` program and return an iterator over the result.

        Each ``(key, value)`` pair from ``self.inputs`` is serialized with
        ``self.sort_writer``.  The sorted file is handed to
        ``self.multi_file_iterator`` together with ``self.sort_reader``,
        and that call's result is returned.
        """
        task = self.task

        download_path = task.path('REDUCE_DL', task.id)
        Message("Reduce will be downloaded to %s" % download_path)

        spool = AtomicFile(download_path, 'w')
        for input_url in self.inputs:
            reader, _size, _url = task.connect_input(input_url)
            for key, value in reader:
                self.sort_writer(spool, key, value)
        spool.close()
        Message("Reduce input downloaded ok")

        Message("Starting external sort")
        sorted_path = task.path('REDUCE_SORTED', task.id)
        ensure_path(os.path.dirname(sorted_path))

        # sort(1): numeric compare (-n) on the first field only (-k 1,1),
        # temp files in the current directory (-T .), NUL-terminated
        # records (-z), 0xFF as field separator (-t).
        sort_cmd = ['sort', '-n', '-k', '1,1', '-T', '.', '-z', '-t', '\xff']
        sort_cmd += ['-o', sorted_path, download_path]

        exit_status = subprocess.Popen(sort_cmd).wait()
        if exit_status:
            # NOTE(review): TaskFailed is constructed but not raised;
            # presumably instantiation emits the failure event the same way
            # Message does -- confirm, since execution continues below.
            TaskFailed("Sorting %s to %s failed (%d)" %
                       (download_path, sorted_path, exit_status))

        Message("External sort done: %s" % sorted_path)
        return self.multi_file_iterator(self.sort_reader,
                                        inputs=[sorted_path])
Esempio n. 11
0
 def concat_input(cls, task, output_label, replicas):
     """
     Concatenate the inputs named by *replicas* into a single task output.

     For each entry of *replicas* only its first URL is opened; its
     contents are copied in 1 MiB chunks to an ``AtomicFile`` created at
     ``task.output_path(output_label)``.

     Returns ``(output.path, output.size())`` of the finished file.

     Each input handle is closed even when reading or writing fails.  The
     output is deliberately closed only on success, so a failed copy does
     not go through ``AtomicFile.close()``.
     """
     output = AtomicFile(task.output_path(output_label))
     BUFFER_SIZE = 1024 * 1024
     for reps in replicas:
         # Use only the first replica for now, since a set of one
         # is the most common case.
         # TODO: handle falling back to alternative replicas.
         inp = open_url(reps[0])
         try:
             buf = inp.read(BUFFER_SIZE)
             while len(buf) > 0:
                 output.write(buf)
                 buf = inp.read(BUFFER_SIZE)
         finally:
             # Do not leak the input handle if read() or write() raises.
             inp.close()
     output.close()
     return output.path, output.size()
Esempio n. 12
0
 def concat_input(cls, task, output_label, replicas):
     """
     Copy the first replica of every input into one task output file.

     *replicas* is iterated; for each element only ``reps[0]`` is opened
     with ``open_url`` and streamed in 1 MiB chunks into an ``AtomicFile``
     created at ``task.output_path(output_label)``.

     Returns the tuple ``(output.path, output.size())``.

     The input handle is released in a ``finally`` clause so it is not
     leaked when a read or write raises.  The output is closed only after
     every input was copied successfully.
     """
     output = AtomicFile(task.output_path(output_label))
     BUFFER_SIZE = 1024*1024
     for reps in replicas:
         # Use only the first replica for now, since a set of one
         # is the most common case.
         # TODO: handle falling back to alternative replicas.
         inp = open_url(reps[0])
         try:
             buf = inp.read(BUFFER_SIZE)
             while len(buf) > 0:
                 output.write(buf)
                 buf = inp.read(BUFFER_SIZE)
         finally:
             # Always close the input, including on a failed copy.
             inp.close()
     output.close()
     return output.path, output.size()
Esempio n. 13
0
    def sorted_entries(self):
        """
        Generator that spools ``self.entries`` to a local file, sorts it
        with ``self.disk_sort`` and yields the ``(key, value)`` pairs read
        back from it.

        Records are stored as ``key 0xFF pickled-value 0x00``, so keys must
        be ``str`` and contain neither delimiter character.

        Raises ``ValueError`` (carrying the offending key) for a non-string
        key or a key containing 0xFF/0x00.
        """
        download_path = self.path('reduce-in-%d.dl' % self.id)
        Status("Downloading %s" % download_path)

        spool = AtomicFile(download_path, 'w')
        for key, value in self.entries:
            if not isinstance(key, str):
                raise ValueError("Keys must be strings for external sort", key)
            if any(marker in key for marker in ('\xff', '\x00')):
                raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
            # value pickled using protocol 0 will always be printable ASCII,
            # so it cannot collide with either delimiter
            pickled = cPickle.dumps(value, 0)
            spool.write('%s\xff%s\x00' % (key, pickled))
        spool.close()
        Status("Downloaded OK")

        self.disk_sort(download_path)

        fd, size, url = comm.open_local(download_path)
        records = func.re_reader("(?s)(.*?)\xff(.*?)\x00", fd, size, url)
        for sorted_key, pickled_value in records:
            yield sorted_key, cPickle.loads(pickled_value)
Esempio n. 14
0
def disk_sort(input, filename, sort_keys, binaries=(), sort_buffer_size='10%', desc=False):
    """
    Generator that spools the keys of *input* to *filename*, sorts the file
    with ``unix_sort`` and yields the keys back from the sorted file.

    Every key must be a sequence of fields.  Each field is written followed
    by 0xFF and every record ends with a newline.  A field is encoded as:

    * a single 0x00 byte when it is ``None``;
    * base64 when its index is in *binaries* and the field is not numeric;
    * JSON (``ujson``) otherwise.

    Whether a field is numeric is decided once, from the first key, by
    trying ``float()`` on each of its fields.

    :param input: iterable of ``(key, value)`` pairs; values are discarded.
    :param filename: path of the spool file that gets written and sorted.
    :param sort_keys: field indexes passed to ``unix_sort`` together with
        the detected type of each (``'n'`` for numeric, ``''`` otherwise).
    :param binaries: indexes of fields to base64-encode instead of JSON.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :param desc: forwarded unchanged to ``unix_sort``.
    :return: yields ``(fields, ())`` for every sorted record, where
        *fields* is the decoded list of key fields.
    :raises ValueError: if a key is a plain string rather than a sequence.
    """
    import ujson
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    import base64

    out_fd = AtomicFile(filename)
    key_types = None
    MPT = ()
    for key, _ in input:
        if isinstance(key, (str, unicode)):
            raise ValueError("Keys must be sequences", key)

        # determine if the key is numeric (first record only)
        if key_types is None:
            key_types = []
            for kt in key:
                try:
                    float(kt)
                except Exception:
                    # Not convertible to a number.  Only Exception is
                    # caught so KeyboardInterrupt/SystemExit propagate.
                    key_types.append('')
                else:
                    key_types.append('n')

        # serialize the key - each field is NULL, json, or b64
        for i, each_key in enumerate(key):
            if each_key is None:
                ukey = b'\x00'
            elif i in binaries and key_types[i] != 'n':
                ukey = base64.b64encode(each_key)
            else:
                ukey = ujson.dumps(each_key)
            out_fd.write(ukey)
            out_fd.write(b'\xff')
        out_fd.write('\n')
    out_fd.flush()
    out_fd.close()

    if key_types is None:
        # Empty input: no field types were detected, so there is nothing
        # to sort or read back (indexing key_types would raise TypeError).
        return

    unix_sort(filename,
              [(sk, key_types[sk]) for sk in sort_keys],
              sort_buffer_size=sort_buffer_size,
              desc=desc)
    fd = open_local(filename)
    for k in sort_reader(fd, fd.url):
        rval = []
        for i, key in enumerate(k):
            if key == b'\x00':
                rkey = None
            elif i in binaries and key_types[i] != 'n':
                # Must mirror the write side: numeric fields were stored as
                # JSON even when listed in binaries.
                rkey = base64.b64decode(key)
            else:
                rkey = ujson.loads(key)
            rval.append(rkey)
        yield rval, MPT
Esempio n. 15
0
def disk_sort(input, filename, sort_keys, binaries=(), sort_buffer_size='10%',
              desc=False):
    """
    Sort the keys of *input* on disk and yield them back in sorted order.

    This is a generator.  Each key (a sequence of fields) is written to
    *filename* as one newline-terminated record whose fields are each
    followed by 0xFF.  The file is sorted by ``unix_sort`` and then read
    back with ``sort_reader``.

    Field encoding, applied identically when writing and reading:

    * ``None`` is stored as a single 0x00 byte;
    * a field whose index is in *binaries* and which is not numeric is
      stored base64-encoded;
    * every other field is stored as JSON via ``ujson``.

    The numeric/non-numeric decision is taken once, from the first key, by
    attempting ``float()`` on each field.

    :param input: iterable of ``(key, value)`` pairs; values are discarded.
    :param filename: path of the spool file that gets written and sorted.
    :param sort_keys: field indexes passed to ``unix_sort`` together with
        the detected type of each (``'n'`` for numeric, ``''`` otherwise).
    :param binaries: indexes of fields to base64-encode instead of JSON.
    :param sort_buffer_size: forwarded unchanged to ``unix_sort``.
    :param desc: forwarded unchanged to ``unix_sort``.
    :return: yields ``(fields, ())`` for every sorted record.
    :raises ValueError: if a key is a plain string rather than a sequence.
    """
    import ujson
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    import base64

    out_fd = AtomicFile(filename)
    key_types = None
    MPT = ()
    for key, _ in input:
        if isinstance(key, (str, unicode)):
            raise ValueError("Keys must be sequences", key)

        # determine if the key is numeric (first record only)
        if key_types is None:
            key_types = []
            for kt in key:
                try:
                    float(kt)
                except Exception:
                    # Field is not a number.  A bare except would also
                    # swallow KeyboardInterrupt and SystemExit.
                    key_types.append('')
                else:
                    key_types.append('n')

        # serialize the key - each field is NULL, json, or b64
        for i, each_key in enumerate(key):
            if each_key is None:
                ukey = b'\x00'
            elif i in binaries and key_types[i] != 'n':
                ukey = base64.b64encode(each_key)
            else:
                ukey = ujson.dumps(each_key)
            out_fd.write(ukey)
            out_fd.write(b'\xff')
        out_fd.write('\n')
    out_fd.flush()
    out_fd.close()

    if key_types is None:
        # No records were written, so no field types are known; building
        # the sort specification below would fail on key_types[sk].
        return

    unix_sort(filename,
              [(sk, key_types[sk]) for sk in sort_keys],
              sort_buffer_size=sort_buffer_size,
              desc=desc)
    fd = open_local(filename)
    for k in sort_reader(fd, fd.url):
        rval = []
        for i, key in enumerate(k):
            if key == b'\x00':
                rkey = None
            elif i in binaries and key_types[i] != 'n':
                # Same condition as on the write side; numeric fields
                # listed in binaries were stored as JSON, not base64.
                rkey = base64.b64decode(key)
            else:
                rkey = ujson.loads(key)
            rval.append(rkey)
        yield rval, MPT