def _run(self):
    red_out, out_url, fd_list = self.connect_output()
    red_in = iter(ReduceReader(self))
    params = self.params

    if self.ext_reduce:
        path = self.path('EXT_REDUCE')
        external.prepare(self.reduce, self.ext_params, path)
        self.reduce = FunctionType(external.ext_reduce.func_code,
                                   globals=external.__dict__)
        self.insert_globals([self.reduce])

    Message("Starting reduce")
    self.init(red_in, params)
    self.reduce(red_in, red_out, params)
    Message("Reduce done")

    self.close_output(fd_list)
    external.close_ext()

    if self.save:
        OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
        Message("Results pushed to DDFS")
    else:
        index, index_url = self.reduce_index
        safe_update(index, {'%d %s' % (self.id, out_url): True})
        OutputURL(index_url)
def _run(self):
    entries = self.track_status(self, "%s entries reduced")
    red_out, out_url, fd_list = self.connect_output()
    params = self.params

    # Replace the user reduce with the external (non-Python) reduce if requested.
    if self.ext_reduce:
        external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
        self.reduce = FunctionType(external.ext_reduce.func_code,
                                   globals=external.__dict__)
        self.insert_globals([self.reduce])

    total_size = sum(size for fd, size, url in self.connected_inputs)
    Status("Input is %s" % (util.format_size(total_size)))

    self.init(entries, params)
    # A two-argument reduce yields (key, value) pairs; the three-argument
    # form writes to red_out itself.
    if util.argcount(self.reduce) < 3:
        for k, v in self.reduce(entries, *(params, )):
            red_out.add(k, v)
    else:
        self.reduce(entries, red_out, params)

    self.close_output(fd_list)
    external.close_ext()

    if self.save:
        OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
        Status("Results pushed to DDFS")
    else:
        # Write the local result index: one "<id> <url>" line.
        index, index_url = self.reduce_index
        f = file(index, 'w')
        print >> f, '%d %s' % (self.id, out_url)
        sync(f)
        f.close()
        OutputURL(index_url)
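# For reference, a minimal sketch (not part of the worker code above) of the two
# user-level reduce signatures that the argcount dispatch supports. The function
# names and bodies are illustrative assumptions; only the signatures and the use
# of red_out.add() come from the code above.

def count_reduce_2arg(entries, params):
    # Two-argument form: yield (key, value) pairs; the worker adds them to
    # red_out itself via the `util.argcount(self.reduce) < 3` branch.
    counts = {}
    for key, value in entries:
        counts[key] = counts.get(key, 0) + 1
    for key, count in counts.iteritems():
        yield key, count

def count_reduce_3arg(entries, red_out, params):
    # Three-argument form: write results to the output object directly.
    counts = {}
    for key, value in entries:
        counts[key] = counts.get(key, 0) + 1
    for key, count in counts.iteritems():
        red_out.add(key, count)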
def _run(self):
    entries = self.track_status(self, "%s entries reduced")
    red_out, out_url, fd_list = self.connect_output()
    params = self.params

    if self.ext_reduce:
        external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
        self.reduce = FunctionType(external.ext_reduce.func_code,
                                   globals=external.__dict__)
        self.insert_globals([self.reduce])

    total_size = sum(size for fd, size, url in self.connected_inputs)
    Message("Input is %s" % (util.format_size(total_size)))

    self.init(entries, params)
    self.reduce(entries, red_out, params)

    self.close_output(fd_list)
    external.close_ext()

    if self.save:
        OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
        Message("Results pushed to DDFS")
    else:
        index, index_url = self.reduce_index
        safe_update(index, ['%d %s' % (self.id, out_url)])
        OutputURL(index_url)
def _run(self):
    if len(self.inputs) != 1:
        TaskFailed("Map can only handle one input. Got: %s" %
                   ' '.join(self.inputs))

    # Replace the user map with the external (non-Python) map if requested.
    if self.ext_map:
        external.prepare(self.map, self.ext_params, self.path('EXT_MAP'))
        self.map = FunctionType(external.ext_map.func_code,
                                globals=external.__dict__)
        self.insert_globals([self.map])

    partitions = [MapOutput(self, i) for i in xrange(self.num_partitions)]
    reader, sze, url = self.connect_input(self.inputs[0])
    params = self.params
    self.init(reader, params)

    # Map each input entry and route the resulting (key, value) pairs to
    # their partitions.
    entries = (self.map(entry, params) for entry in reader)
    for kvs in self.track_status(entries, "%s entries mapped"):
        for k, v in kvs:
            p = self.partition(k, self.num_partitions, params)
            partitions[p].add(k, v)

    external.close_ext()

    urls = {}
    for i, partition in enumerate(partitions):
        partition.close()
        urls['%d %s' % (i, partition.url)] = True

    index, index_url = self.map_index
    safe_update(index, urls)

    if self.save and not self.reduce:
        if self.ispartitioned:
            TaskFailed("Storing partitioned outputs in DDFS is not yet supported")
        else:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Message("Results pushed to DDFS")
    else:
        OutputURL(index_url)
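# For reference, a minimal sketch (not part of the worker code above) of a
# user-level map function matching the call self.map(entry, params): it is
# invoked once per input entry and must return an iterable of (key, value)
# pairs. The name and body are an illustrative (word-count style) assumption.

def word_count_map(entry, params):
    for word in entry.split():
        yield word, 1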
def _run(self):
    if len(self.inputs) != 1:
        TaskFailed("Map takes 1 input, got: %s" % ' '.join(self.inputs))
    if self.save and not self.reduce and self.ispartitioned:
        TaskFailed("Storing partitioned outputs in DDFS is not yet supported")

    # Replace the user map with the external (non-Python) map if requested.
    if self.ext_map:
        external.prepare(self.map, self.ext_params, self.path('ext.map'))
        self.map = FunctionType(external.ext_map.func_code,
                                globals=external.__dict__)
        self.insert_globals([self.map])

    entries = self.track_status(self, "%s entries mapped")
    params = self.params
    outputs = [MapOutput(self, i)
               for i in xrange(max(1, int(self.jobdict['partitions'])))]

    self.init(entries, params)
    # Map each entry and route the resulting (key, value) pairs to their
    # partitions.
    for entry in entries:
        for k, v in self.map(entry, params):
            outputs[self.partition(k, len(outputs), params)].add(k, v)

    external.close_ext()

    # Write the local map index: one "<partition> <url>" line per output.
    index, index_url = self.map_index
    f = file(index, 'w')
    for i, output in enumerate(outputs):
        print >> f, '%d %s' % (i, output.url)
        output.close()
    sync(f)
    f.close()

    if self.save and not self.reduce:
        OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
        Status("Results pushed to DDFS")
    else:
        OutputURL(index_url)
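# For reference, a minimal sketch (not part of the worker code above) of a
# partition function with the signature used above (key, number of partitions,
# params). A hash-based partitioner is a common choice; this particular name
# and body are an illustrative assumption, not necessarily the framework's
# default.

def hash_partition(key, nr_partitions, params):
    return hash(str(key)) % nr_partitions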