Ejemplo n.º 1
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Spool key/value pairs to ``filename``, sort the file, and yield them back.

    This is a generator, so nothing below runs until it is first iterated.

    Every pair from ``input`` is written as one record: the key, a 0xFF
    byte, the value pickled with protocol 0, and a terminating 0x00 byte.
    The file is then passed to ``unix_sort`` (defined elsewhere; presumably
    sorts the file in place -- confirm) and read back record by record.

    Progress is reported through ``worker.send('MSG', ...)``.

    Yields:
        ``(key, value)`` tuples, with the value unpickled again.

    Raises:
        ValueError: if a key is not a ``str``, or contains a 0xFF or 0x00
            byte (those bytes are reserved as field/record separators).
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    worker.send('MSG', "Downloading %s" % filename)
    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        if any(reserved in key for reserved in ('\xff', '\x00')):
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # A protocol-0 pickle is always printable ASCII, so it cannot
        # collide with the 0xFF / 0x00 separators.
        pickled = cPickle.dumps(value, 0)
        spool.write('%s\xff%s\x00' % (key, pickled))
    spool.close()

    size_text = format_size(getsize(filename))
    worker.send('MSG', "Downloaded %s OK" % size_text)
    worker.send('MSG', "Sorting %s..." % filename)
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', "Finished sorting")

    sorted_fd = open_local(filename)
    records = delimited_reader(sorted_fd,
                               len(sorted_fd),
                               sorted_fd.url,
                               delimiter='\xff',
                               line_terminator='\x00')
    for sorted_key, blob in records:
        yield sorted_key, cPickle.loads(blob)
Ejemplo n.º 2
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Spool key/value pairs to ``filename``, sort the file, and yield them back.

    This is a generator, so nothing below runs until it is first iterated.

    For every pair from ``input`` the key followed by a 0xFF byte is
    written, then the value pickled with protocol 0 followed by a 0x00
    byte. The file is then passed to ``unix_sort`` (defined elsewhere;
    presumably sorts the file in place -- confirm) and parsed back with
    ``re_reader``.

    Progress is reported through ``worker.send('MSG', ...)``.

    Yields:
        ``(key, value)`` tuples, with the value unpickled again.

    Raises:
        ValueError: if a key is not ``bytes``, or contains a 0xFF or 0x00
            byte (those bytes are reserved as field/record separators).
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    worker.send('MSG', "Downloading {0}".format(filename))
    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if any(reserved in key for reserved in (b'\xff', b'\x00')):
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # A protocol-0 pickle is always printable ASCII, so it cannot
        # collide with the 0xFF / 0x00 separators.
        key_field = key + b'\xff'
        spool.write(key_field)
        value_field = pickle_dumps(value, 0) + b'\x00'
        spool.write(value_field)
    spool.close()

    size_text = format_size(getsize(filename))
    worker.send('MSG', "Downloaded {0:s} OK".format(size_text))
    worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', "Finished sorting")

    sorted_fd = open_local(filename)
    records = re_reader(b"(?s)(.*?)\xff(.*?)\x00",
                        sorted_fd,
                        len(sorted_fd),
                        sorted_fd.url)
    for sorted_key, blob in records:
        yield sorted_key, pickle_loads(blob)
Ejemplo n.º 3
0
    def _run(self):
        """Run the reduce function over the inputs and publish the output URL.

        Steps, in order:
          1. Obtain the entries via ``self.track_status`` (presumably a
             progress-reporting wrapper -- confirm) and connect the output.
          2. If ``self.ext_reduce`` is set, prepare the external reducer and
             replace ``self.reduce`` with a function built from
             ``external.ext_reduce``'s code object.
          3. Report the summed size of ``self.connected_inputs``.
          4. Call ``self.init`` and then ``self.reduce(entries, out, params)``.
          5. Close the output and call ``external.close_ext()`` (this is
             called whether or not an external reducer was used).
          6. Either save the blobs to DDFS, or record ``"<id> <out_url>"``
             in the reduce index, and emit the resulting URL.
        """
        tracked = self.track_status(self, "%s entries reduced")
        sink, out_url, handles = self.connect_output()
        job_params = self.params

        if self.ext_reduce:
            external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
            # Rebind the external reducer's code so that its globals are the
            # ``external`` module namespace.
            ext_code = external.ext_reduce.func_code
            ext_namespace = external.__dict__
            self.reduce = FunctionType(ext_code, globals=ext_namespace)
            self.insert_globals([self.reduce])

        input_bytes = sum(size for _fd, size, _url in self.connected_inputs)
        size_text = util.format_size(input_bytes)
        Message("Input is %s" % size_text)

        self.init(tracked, job_params)
        self.reduce(tracked, sink, job_params)

        self.close_output(handles)
        external.close_ext()

        if self.save:
            saved_url = util.ddfs_save(self.blobs, self.jobname, self.master)
            OutputURL(saved_url)
            Message("Results pushed to DDFS")
            return

        index, index_url = self.reduce_index
        index_line = '%d %s' % (self.id, out_url)
        safe_update(index, [index_line])
        OutputURL(index_url)
Ejemplo n.º 4
0
    def _run(self):
        """Run the reduce function over the inputs and publish the output URL.

        Steps, in order:
          1. Obtain the entries via ``self.track_status`` (presumably a
             progress-reporting wrapper -- confirm) and connect the output.
          2. If ``self.ext_reduce`` is set, prepare the external reducer and
             replace ``self.reduce`` with a function built from
             ``external.ext_reduce``'s code object.
          3. Report the summed size of ``self.connected_inputs``.
          4. Call ``self.init`` and then the reducer. A reducer for which
             ``util.argcount`` reports fewer than three arguments is treated
             as an iterator of ``(key, value)`` pairs that are added to the
             output here; otherwise it is handed the output object directly.
          5. Close the output and call ``external.close_ext()`` (this is
             called whether or not an external reducer was used).
          6. Either save the blobs to DDFS, or write ``"<id> <out_url>"`` to
             the reduce index file, and emit the resulting URL.
        """
        entries = self.track_status(self, "%s entries reduced")
        red_out, out_url, fd_list = self.connect_output()
        params = self.params

        if self.ext_reduce:
            external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
            # Rebind the external reducer's code so that its globals are the
            # ``external`` module namespace.
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        total_size = sum(size for fd, size, url in self.connected_inputs)
        Status("Input is %s" % (util.format_size(total_size)))

        self.init(entries, params)
        if util.argcount(self.reduce) < 3:
            # Iterator-style reducer: it yields pairs instead of writing them.
            for k, v in self.reduce(entries, params):
                red_out.add(k, v)
        else:
            self.reduce(entries, red_out, params)

        self.close_output(fd_list)
        external.close_ext()

        if self.save:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            index, index_url = self.reduce_index
            f = file(index, 'w')
            try:
                print >> f, '%d %s' % (self.id, out_url)
                sync(f)
            finally:
                # Release the handle even if the write or sync fails; the
                # exception still propagates to the caller.
                f.close()
            OutputURL(index_url)
Ejemplo n.º 5
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Externally sort key/value pairs through the file ``filename``.

    This is a generator, so nothing below runs until it is first iterated.

    Each pair from ``input`` becomes one record on disk: the key, a 0xFF
    byte, the protocol-0 pickle of the value, and a closing 0x00 byte.
    ``unix_sort`` (defined elsewhere; presumably sorts the file in place --
    confirm) is then run on the file, and the records are read back with
    ``delimited_reader``.

    Status messages are sent with ``worker.send('MSG', ...)``.

    Yields:
        ``(key, value)`` tuples, with the value unpickled again.

    Raises:
        ValueError: if a key is not a ``str``, or contains a 0xFF or 0x00
            byte (those bytes are reserved as field/record separators).
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    worker.send('MSG', "Downloading %s" % filename)
    unsorted_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        if not ('\xff' not in key and '\x00' not in key):
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # The protocol-0 pickle is printable ASCII only, so the separator
        # bytes can never appear inside the serialized value.
        serialized = cPickle.dumps(value, 0)
        record = '%s\xff%s\x00' % (key, serialized)
        unsorted_fd.write(record)
    unsorted_fd.close()

    downloaded = format_size(getsize(filename))
    worker.send('MSG', "Downloaded %s OK" % downloaded)
    worker.send('MSG', "Sorting %s..." % filename)
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', "Finished sorting")

    result_fd = open_local(filename)
    reader = delimited_reader(result_fd, len(result_fd), result_fd.url,
                              delimiter='\xff', line_terminator='\x00')
    for out_key, out_blob in reader:
        yield out_key, cPickle.loads(out_blob)
Ejemplo n.º 6
0
Archivo: func.py Proyecto: dangra/disco
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Externally sort key/value pairs through the file ``filename``.

    This is a generator, so nothing below runs until it is first iterated.

    For each pair from ``input`` the key plus a 0xFF byte is written first,
    followed by the protocol-0 pickle of the value plus a 0x00 byte.
    ``unix_sort`` (defined elsewhere; presumably sorts the file in place --
    confirm) is then run on the file, and the records are parsed back with
    ``re_reader``.

    Status messages are sent with ``worker.send('MSG', ...)``.

    Yields:
        ``(key, value)`` tuples, with the value unpickled again.

    Raises:
        ValueError: if a key is not ``bytes``, or contains a 0xFF or 0x00
            byte (those bytes are reserved as field/record separators).
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    worker.send('MSG', "Downloading {0}".format(filename))
    unsorted_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if not (b'\xff' not in key and b'\x00' not in key):
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # The protocol-0 pickle is printable ASCII only, so the separator
        # bytes can never appear inside the serialized value.
        unsorted_fd.write(key + b'\xff')
        serialized = pickle_dumps(value, 0)
        unsorted_fd.write(serialized + b'\x00')
    unsorted_fd.close()

    downloaded = format_size(getsize(filename))
    worker.send('MSG', "Downloaded {0:s} OK".format(downloaded))
    worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', "Finished sorting")

    result_fd = open_local(filename)
    reader = re_reader(b"(?s)(.*?)\xff(.*?)\x00",
                       result_fd, len(result_fd), result_fd.url)
    for out_key, out_blob in reader:
        yield out_key, pickle_loads(out_blob)