def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    import cPickle
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile
    worker.send('MSG', "Downloading %s" % filename)
    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        if '\xff' in key or '\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        else:
            # value pickled using protocol 0 will always be printable ASCII
            out_fd.write('%s\xff%s\x00' % (key, cPickle.dumps(value, 0)))
    out_fd.close()
    worker.send('MSG', "Downloaded %s OK" % format_size(getsize(filename)))
    worker.send('MSG', "Sorting %s..." % filename)
    # unix_sort and delimited_reader are helpers defined elsewhere in this module
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', "Finished sorting")
    fd = open_local(filename)
    for k, v in delimited_reader(fd, len(fd), fd.url,
                                 delimiter='\xff', line_terminator='\x00'):
        yield k, cPickle.loads(v)
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    from os.path import getsize
    # stdlib aliases for the pickle helpers used below
    from pickle import dumps as pickle_dumps, loads as pickle_loads
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile
    worker.send('MSG', "Downloading {0}".format(filename))
    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        else:
            # value pickled using protocol 0 will always be printable ASCII
            out_fd.write(key + b'\xff')
            out_fd.write(pickle_dumps(value, 0) + b'\x00')
    out_fd.close()
    worker.send('MSG', "Downloaded {0:s} OK".format(format_size(getsize(filename))))
    worker.send('MSG', "Sorting {0}...".format(filename))
    # unix_sort and re_reader are helpers defined elsewhere in this module
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', "Finished sorting")
    fd = open_local(filename)
    for k, v in re_reader(b"(?s)(.*?)\xff(.*?)\x00", fd, len(fd), fd.url):
        yield k, pickle_loads(v)
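# The unix_sort helper called above is not shown in this listing. Below is a
# minimal sketch of what it might look like, assuming GNU sort and bash are
# available: with records terminated by \x00 ("-z") and the key split off by
# the \xff delimiter ("-t", "-k1,1"), sorting under LC_ALL=C orders records by
# raw key bytes, which is why keys may not contain 0xFF or 0x00. The exact
# flags here are an illustration, not the project's verbatim implementation.
def unix_sort(filename, sort_buffer_size='10%'):
    import subprocess
    # bash's $'\xff' quoting passes the raw delimiter byte to sort; -S bounds
    # memory use, -T keeps temporary files next to the data, and "-o FILE FILE"
    # sorts the file in place.
    cmd = ("LC_ALL=C sort -z -t$'\\xff' -k1,1 -S {0} -T . -o {1} {1}"
           .format(sort_buffer_size, filename))
    subprocess.check_call(cmd, shell=True, executable='/bin/bash')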
def _run(self):
    entries = self.track_status(self, "%s entries reduced")
    red_out, out_url, fd_list = self.connect_output()
    params = self.params

    if self.ext_reduce:
        external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
        self.reduce = FunctionType(external.ext_reduce.func_code,
                                   globals=external.__dict__)
        self.insert_globals([self.reduce])

    total_size = sum(size for fd, size, url in self.connected_inputs)
    Message("Input is %s" % (util.format_size(total_size)))

    self.init(entries, params)
    self.reduce(entries, red_out, params)
    self.close_output(fd_list)

    external.close_ext()

    if self.save:
        OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
        Message("Results pushed to DDFS")
    else:
        index, index_url = self.reduce_index
        safe_update(index, ['%d %s' % (self.id, out_url)])
        OutputURL(index_url)
def _run(self):
    entries = self.track_status(self, "%s entries reduced")
    red_out, out_url, fd_list = self.connect_output()
    params = self.params

    if self.ext_reduce:
        external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
        self.reduce = FunctionType(external.ext_reduce.func_code,
                                   globals=external.__dict__)
        self.insert_globals([self.reduce])

    total_size = sum(size for fd, size, url in self.connected_inputs)
    Status("Input is %s" % (util.format_size(total_size)))

    self.init(entries, params)
    if util.argcount(self.reduce) < 3:
        # two-argument reduce: a generator that yields (key, value) pairs
        for k, v in self.reduce(entries, *(params, )):
            red_out.add(k, v)
    else:
        # three-argument reduce: writes directly to the output object
        self.reduce(entries, red_out, params)
    self.close_output(fd_list)

    external.close_ext()

    if self.save:
        OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
        Status("Results pushed to DDFS")
    else:
        index, index_url = self.reduce_index
        f = file(index, 'w')
        print >> f, '%d %s' % (self.id, out_url)
        sync(f)
        f.close()
        OutputURL(index_url)
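# Hedged illustration of the two user-facing reduce signatures that the
# util.argcount dispatch above supports; the function names below are
# hypothetical examples, not part of the project's API.
def reduce_yielding(entries, params):
    # two-argument form: _run iterates the generator and adds each yielded
    # (key, value) pair to red_out itself
    for key, value in entries:
        yield key, value

def reduce_writing(entries, red_out, params):
    # three-argument form: the function writes results to the output object
    # directly
    for key, value in entries:
        red_out.add(key, value)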