def track_status(self, iterator, message_template):
    status_interval = self.status_interval
    n = -1
    for n, item in enumerate(iterator):
        if status_interval and (n + 1) % status_interval == 0:
            Status(message_template % (n + 1))
        yield item
    Status("Done: %s" % (message_template % (n + 1)))
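# Usage note: the _run methods below call self.track_status(self, ...), i.e.
# the task object is itself the iterator being wrapped (its __iter__ selects
# the sorted, merge-sorted, or plain entry stream), and a Status() message is
# emitted every status_interval entries while items pass through unchanged.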
def __init__(self, mode, **taskargs):
    from disco import task
    AnnouncePID(os.getpid())
    Status("Received a new %s task!" % mode)
    self.task = getattr(task, mode.capitalize())(**taskargs)
    self.task.run()
    WorkerDone("Worker done")
class Reduce(Task):
    def _run(self):
        entries = self.track_status(self, "%s entries reduced")
        red_out, out_url, fd_list = self.connect_output()
        params = self.params

        if self.ext_reduce:
            # Rebind self.reduce to the external-reduce stub so the generic
            # code path below drives the external process.
            external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        total_size = sum(size for fd, size, url in self.connected_inputs)
        Status("Input is %s" % (util.format_size(total_size)))

        self.init(entries, params)
        if util.argcount(self.reduce) < 3:
            # Two-argument reduce: yields (key, value) pairs for the worker.
            for k, v in self.reduce(entries, *(params, )):
                red_out.add(k, v)
        else:
            # Three-argument reduce: writes directly to the output object.
            self.reduce(entries, red_out, params)
        self.close_output(fd_list)
        external.close_ext()

        if self.save:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            # Write the reduce index: one "<id> <url>" line for this task.
            index, index_url = self.reduce_index
            f = file(index, 'w')
            print >> f, '%d %s' % (self.id, out_url)
            sync(f)
            f.close()
            OutputURL(index_url)

    def __iter__(self):
        if self.sort == 'merge':
            return self.merge_sorted_entries
        elif self.sort:
            return self.sorted_entries
        return self.entries

    def disk_sort(self, filename):
        Status("Sorting %s..." % filename)
        try:
            # Records are 0x00-terminated (-z); sort on the first
            # 0xFF-delimited field, i.e. the key.
            subprocess.check_call(['sort', '-z', '-t', '\xff', '-k', '1,1',
                                   '-T', '.', '-S', self.sort_buffer_size,
                                   '-o', filename, filename])
        except subprocess.CalledProcessError, e:
            raise DataError("Sorting %s failed: %s" % (filename, e), filename)
        Status("Finished sorting")
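# For reference, the util.argcount() branch in _run supports two user reduce
# signatures. A hypothetical sketch of each (user code, not part of the worker):

def reduce_gen(iter, params):
    # Two-argument form: the worker iterates the yielded (key, value) pairs
    # and calls red_out.add() itself.
    for key, value in iter:
        yield key, value

def reduce_out(iter, out, params):
    # Three-argument form: the worker hands over the output object and the
    # function writes results directly with out.add(key, value).
    for key, value in iter:
        out.add(key, value)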
def sorted_entries(self):
    dlname = self.path('reduce-in-%d.dl' % self.id)
    Status("Downloading %s" % dlname)
    out_fd = AtomicFile(dlname, 'w')
    for key, value in self.entries:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        if '\xff' in key or '\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        else:
            # value pickled using protocol 0 will always be printable ASCII,
            # so 0xFF and 0x00 are safe as field and record separators
            out_fd.write('%s\xff%s\x00' % (key, cPickle.dumps(value, 0)))
    out_fd.close()
    Status("Downloaded OK")

    self.disk_sort(dlname)
    fd, size, url = comm.open_local(dlname)
    for k, v in func.re_reader("(?s)(.*?)\xff(.*?)\x00", fd, size, url):
        yield k, cPickle.loads(v)
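# The external sort relies on simple framing: key, a 0xFF separator, the
# protocol-0 pickle of the value, and a 0x00 record terminator; this is the
# same layout that the regex passed to func.re_reader unpacks. A self-contained
# sketch of the round trip, independent of Disco's comm/func helpers:

import re
import cPickle

def frame(key, value):
    # key must be a str free of 0xFF/0x00 bytes; protocol-0 pickles are ASCII
    return '%s\xff%s\x00' % (key, cPickle.dumps(value, 0))

def unframe(data):
    for key, pickled in re.findall('(?s)(.*?)\xff(.*?)\x00', data):
        yield key, cPickle.loads(pickled)

records = frame('alpha', [1, 2]) + frame('beta', {'x': 3})
assert list(unframe(records)) == [('alpha', [1, 2]), ('beta', {'x': 3})]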
def _run(self):
    if len(self.inputs) != 1:
        TaskFailed("Map takes 1 input, got: %s" % ' '.join(self.inputs))
    if self.save and not self.reduce and self.ispartitioned:
        TaskFailed("Storing partitioned outputs in DDFS is not yet supported")

    if self.ext_map:
        # Rebind self.map to the external-map stub, mirroring ext_reduce.
        external.prepare(self.map, self.ext_params, self.path('ext.map'))
        self.map = FunctionType(external.ext_map.func_code,
                                globals=external.__dict__)
        self.insert_globals([self.map])

    entries = self.track_status(self, "%s entries mapped")
    params = self.params
    outputs = [MapOutput(self, i)
               for i in xrange(max(1, int(self.jobdict['partitions'])))]

    self.init(entries, params)
    for entry in entries:
        for k, v in self.map(entry, params):
            # Route each pair to the partition chosen by the partitioner.
            outputs[self.partition(k, len(outputs), params)].add(k, v)
    external.close_ext()

    # Write the map index: one "<partition> <url>" line per output file.
    index, index_url = self.map_index
    f = file(index, 'w')
    for i, output in enumerate(outputs):
        print >> f, '%d %s' % (i, output.url)
        output.close()
    sync(f)
    f.close()

    if self.save and not self.reduce:
        OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
        Status("Results pushed to DDFS")
    else:
        OutputURL(index_url)
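# The map index written above holds one "<partition> <url>" line per output
# file (Reduce._run writes its index in the same "<id> <url>" shape). A
# hypothetical reader, not part of Disco, just to illustrate the format:

def read_index(path):
    with open(path) as f:
        for line in f:
            partition, url = line.split(None, 1)
            yield int(partition), url.strip()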
def map(e, params):
    Status("Internal msg")
    return []
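# The stub above only emits a status message. As a sketch of the contract
# Map._run expects, a map function returns (or yields) (key, value) pairs for
# each input entry; a hypothetical word-count style map:

def word_count_map(e, params):
    # 'e' is assumed here to be one line of text; each emitted pair is routed
    # to a partition by partition(k, len(outputs), params) in Map._run.
    return [(word, 1) for word in e.split()]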