def run(self):
    # Short circuit: with no finalize step, pass objects straight through
    if not self._finalize_text:
        for obj in self.input:
            self.output.put(obj)
        self.output.put(StopIteration)
        return
    # Spool the incoming objects to disk as BSON so the finalize workers can
    # consume them as a single memory-mapped chunk
    with open(self._fn, 'wb') as fp:
        for obj in self.input:
            fp.write(bson.BSON.encode(obj))
    mm = self._job.map_read(self._fn)
    if not mm:
        self.output.put(StopIteration)
        return
    hdr = dict(
        jobtype='finalize',
        finalize_text=self._finalize_text,
        finalize_name=self._finalize_name,
        compress=self._job.options.zmr['compress'])
    def chunk_gen():
        yield dict(hdr), lambda: mm
    result_iter = self._job.router.job_manager(
        self._job, 'finalize', chunk_gen())
    for i, (header, content) in enumerate(result_iter):
        self._job.status['finalize'] = 'Processed %d chunks' % i
        r = resource.getrusage(resource.RUSAGE_SELF)
        log.debug('Retire finalize, rss %s', r.ru_maxrss)
        for part in content:
            for obj in util.bson_iter(part):
                self.output.put(obj)
    # StopIteration is used as an end-of-stream sentinel for the consumer
    self.output.put(StopIteration)
    del self._job.status['finalize']
def run(self):
    # Short circuit: with no reduce step, stream the raw map output straight through
    if not self._reduce_text:
        pattern = os.path.join(
            self._job.jobdir, self._job.MAP_OUTPUT_TPL % '*')
        for fn in glob(pattern):
            if not os.path.exists(fn):
                continue
            mm = self._job.map_read(fn)
            for obj in util.bson_iter(mm):
                self.output.put(obj)
        self.output.put(StopIteration)
        return
    # Divvy out jobs to workers, one chunk per reduce partition file
    def chunk_gen(hdr):
        for j in xrange(self._job.options.zmr['reduce_count']):
            fn = os.path.join(
                self._job.jobdir, self._job.MAP_OUTPUT_TPL % j)
            if not os.path.exists(fn):
                continue
            # bind fn as a default argument so each lambda reads its own
            # partition file, not whatever fn points to when it is called
            yield dict(hdr), lambda fn=fn: [self._job.map_read(fn)]
    if self._job.command == 'mapreduce':
        jobtype = 'reduce'
    elif self._job.command == 'xmapreduce':
        jobtype = 'xreduce'
    hdr = dict(
        jobtype=jobtype,
        reduce_text=self._reduce_text,
        reduce_name=self._reduce_name,
        compress=self._job.options.zmr['compress'])
    result_iter = self._job.router.job_manager(
        self._job, jobtype, chunk_gen(hdr))
    for i, (header, content) in enumerate(result_iter):
        self._job.status['reduce'] = 'Processed %d chunks' % i
        r = resource.getrusage(resource.RUSAGE_SELF)
        log.debug('Retire reduce, rss %s', r.ru_maxrss)
        for part in content:
            for d in util.bson_iter(part):
                for obj in d['result']:
                    self.output.put(obj)
    self.output.put(StopIteration)
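# The worker-side reduce handler is not shown in this section; run() above only
# assumes that each reply part decodes to BSON docs whose 'result' field is a
# list of output objects. A hypothetical sketch of such a handler, mirroring the
# map-side handler shown below and assuming a MongoDB-style reduce(key, values)
# function, might look roughly like this (names are illustrative, not the
# actual API):
def _handle_reduce_sketch(self, header, parts):
    from collections import defaultdict
    ns = {}
    exec header['reduce_text'] in ns
    func = ns[header['reduce_name']]
    # gather the mapped values for each _id emitted by the map phase
    grouped = defaultdict(list)
    for part in parts:
        for doc in util.bson_iter(part):
            grouped[doc['_id']].append(doc['value'])
    results = [dict(_id=k, value=func(k, vs)) for k, vs in grouped.iteritems()]
    # reply with a single doc whose 'result' list is what run() unpacks
    util.send_bson(self._sink, header, zmq.SNDMORE)
    self._sink.send(bson.BSON.encode(dict(result=results)), zmq.SNDMORE)
    self._sink.send('')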
def _handle_maplike(self, key, header, parts):
    ns = {}
    exec header['map_text'] in ns
    func = ns[header['map_name']]
    reduce_count = header['reduce_count']
    result = [[] for _ in range(reduce_count)]
    # Iterate, grouping the mapped objects by reduce chunk ID
    sz_input = 0
    for part in parts:
        sz_input += len(part)
        for obj in func(util.bson_iter(part)):
            chunk_key = hash(key(obj)) % reduce_count
            result[chunk_key].append(obj)
    assert sz_input, 'There was no input!'
    # Emit reduce chunks one at a time; each mapped object is a (key, value) pair
    util.send_bson(self._sink, header, zmq.SNDMORE)
    for result_chunk in result:
        self._sink.send(
            ''.join(
                bson.BSON.encode(dict(_id=k, value=v))
                for k, v in result_chunk),
            zmq.SNDMORE)
    # Empty frame terminates the multipart reply
    self._sink.send('')
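# A minimal sketch of the inputs _handle_maplike expects. The encoding loop
# above unpacks each mapped object as a (key, value) pair, so a user-supplied
# map function presumably yields such pairs from an iterator of decoded BSON
# docs, and the routing callable 'key' picks the grouping key out of a pair.
# The names and the 'text' field below are illustrative assumptions, not part
# of the actual API:
example_map_text = '''
def count_words(docs):
    for doc in docs:
        for word in doc.get('text', '').split():
            yield word, 1
'''
# header would then carry map_text=example_map_text and map_name='count_words',
# and the routing callable could simply be:
def example_key(pair):
    return pair[0]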
def obj_iter():
    for part in parts:
        for obj in util.bson_iter(part):
            yield obj
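# util.bson_iter is used throughout to walk a buffer of concatenated BSON
# documents (e.g. a memory-mapped map-output file). Its real implementation is
# not shown in this section; a rough, assumption-based equivalent that relies
# only on the BSON length prefix would look like this:
def bson_iter_sketch(buf):
    import struct
    pos = 0
    while pos < len(buf):
        # every BSON document begins with its total size as a little-endian int32
        doc_len = struct.unpack('<i', buf[pos:pos + 4])[0]
        yield bson.BSON(buf[pos:pos + doc_len]).decode()
        pos += doc_len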