def finish(self, termswriter, doccount, lengthfile):
    """Finalize a multi-task index build.

    Sends a stop sentinel to every worker task, joins them, collects their
    partial posting runs and per-field length statistics, merges the
    per-task length files into ``lengthfile``, and streams the merged
    postings into ``termswriter``.  All temporary run and length files are
    deleted, and ``self.cleanup()`` is called last.

    :param termswriter: object whose ``add_iter(iterator, get_length)``
        consumes the merged posting stream.
    :param doccount: total number of documents indexed.
    :param lengthfile: destination file for the combined field lengths.
    """

    _fieldlength_totals = self._fieldlength_totals
    if not self.tasks:
        return

    jobqueue = self.jobqueue
    rqueue = self.resultqueue

    # A (None, doccount) job is the sentinel telling each task to finish.
    for task in self.tasks:
        jobqueue.put((None, doccount))
    for task in self.tasks:
        task.join()

    # Collect each task's runs and fold its field-length statistics into
    # the shared totals/maxes.
    runs = []
    lenfilenames = []
    for task in self.tasks:
        taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
        runs.extend(taskruns)
        lenfilenames.append(lenfilename)
        for fieldnum, total in flentotals.iteritems():
            _fieldlength_totals[fieldnum] += total
        for fieldnum, length in flenmaxes.iteritems():
            if length > self._fieldlength_maxes.get(fieldnum, 0):
                self._fieldlength_maxes[fieldnum] = length

    jobqueue.close()
    rqueue.close()

    # Merge the per-task length files into the final length file.
    lw = LengthWriter(lengthfile, doccount)
    for lenfilename in lenfilenames:
        sublengths = LengthReader(StructFile(open(lenfilename, "rb")),
                                  doccount)
        lw.add_all(sublengths)
        os.remove(lenfilename)
    lw.close()
    lengths = lw.reader()

    # Merge-sort the runs' postings and hand the stream to the terms
    # writer, then delete the temporary run files.
    iterator = imerge([read_run(runname, count)
                       for runname, count in runs])
    termswriter.add_iter(iterator, lengths.get)
    for runname, count in runs:
        os.remove(runname)

    self.cleanup()
def finish(self, termswriter, doccount, lengthfile):
    """Finalize a multi-task index build.

    Flushes any locally buffered documents, sends a stop sentinel to every
    worker task, joins them, collects their partial posting runs and
    per-field length statistics, merges the per-task length files into
    ``lengthfile``, and streams the merged postings into ``termswriter``.
    All temporary run and length files are deleted, and ``self.cleanup()``
    is called last.

    :param termswriter: object whose ``add_iter(iterator, get_length)``
        consumes the merged posting stream.
    :param doccount: total number of documents indexed.
    :param lengthfile: destination file for the combined field lengths.
    """

    # Push any documents still sitting in the local buffer to the workers
    # before telling them to shut down.
    if self.buffer:
        self._enqueue()

    _fieldlength_totals = self._fieldlength_totals
    if not self.tasks:
        return

    jobqueue = self.jobqueue
    rqueue = self.resultqueue

    # A (None, doccount) job is the sentinel telling each task to finish.
    for task in self.tasks:
        jobqueue.put((None, doccount))
    for task in self.tasks:
        task.join()

    # Collect each task's runs and fold its field-length statistics into
    # the shared totals/maxes.
    runs = []
    lenfilenames = []
    for task in self.tasks:
        taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
        runs.extend(taskruns)
        lenfilenames.append(lenfilename)
        for fieldnum, total in flentotals.iteritems():
            _fieldlength_totals[fieldnum] += total
        for fieldnum, length in flenmaxes.iteritems():
            if length > self._fieldlength_maxes.get(fieldnum, 0):
                self._fieldlength_maxes[fieldnum] = length

    jobqueue.close()
    rqueue.close()

    # Merge the per-task length files into the final length file.
    lw = LengthWriter(lengthfile, doccount)
    for lenfilename in lenfilenames:
        sublengths = LengthReader(StructFile(open(lenfilename, "rb")),
                                  doccount)
        lw.add_all(sublengths)
        os.remove(lenfilename)
    lw.close()
    lengths = lw.reader()

    # Merge-sort the runs' postings and hand the stream to the terms
    # writer, then delete the temporary run files.
    iterator = imerge([read_run(runname, count)
                       for runname, count in runs])
    termswriter.add_iter(iterator, lengths.get)
    for runname, count in runs:
        os.remove(runname)

    self.cleanup()
def finish(self, doccount, lengthfile, termtable, postingwriter):
    """Finalize a multi-process index build.

    Sends a stop sentinel to every worker process, joins the tasks,
    collects their partial posting runs and per-field length statistics,
    merges the per-task length files into ``lengthfile``, and writes the
    merged posting stream via ``write_postings``.  Temporary run and
    length files are deleted, and ``self.cleanup()`` is called last.

    :param doccount: total number of documents indexed.
    :param lengthfile: destination file for the combined field lengths.
    :param termtable: term table passed through to ``write_postings``.
    :param postingwriter: posting writer passed through to
        ``write_postings``.
    """

    _fieldlength_totals = self._fieldlength_totals
    if not self.tasks:
        return

    pqueue = self.postingqueue
    rqueue = self.resultsqueue

    # A (-1, doccount) job is the sentinel telling each worker to finish.
    for _ in xrange(self.procs):
        pqueue.put((-1, doccount))
    for task in self.tasks:
        task.join()

    # Collect each task's runs and fold its field-length statistics into
    # the shared totals/maxes.
    runs = []
    lenfilenames = []
    for task in self.tasks:
        taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
        runs.extend(taskruns)
        lenfilenames.append(lenfilename)
        for fieldnum, total in flentotals.iteritems():
            _fieldlength_totals[fieldnum] += total
        for fieldnum, length in flenmaxes.iteritems():
            if length > self._fieldlength_maxes.get(fieldnum, 0):
                self._fieldlength_maxes[fieldnum] = length

    # NOTE(review): unlike the sibling finish() implementations, the
    # queues are not closed here — confirm whether pqueue.close() /
    # rqueue.close() is needed.

    # Merge the per-task length files into the final length file.
    lw = LengthWriter(lengthfile, doccount)
    for lenfilename in lenfilenames:
        sublengths = LengthReader(StructFile(open(lenfilename, "rb")),
                                  doccount)
        lw.add_all(sublengths)
        os.remove(lenfilename)
    lw.close()
    lengths = lw.reader()

    # Merge-sort the runs' postings, write them out, then delete the
    # temporary run files.
    iterator = imerge([read_run(runname, count)
                       for runname, count in runs])
    write_postings(self.schema, termtable, lengths, postingwriter, iterator)
    for runname, count in runs:
        os.remove(runname)

    self.cleanup()
def _write_lengths(self, lengthfile, doccount):
    """Write the accumulated per-field length arrays to ``lengthfile``."""
    # Pad/complete the length arrays so they cover all doccount documents.
    self._fill_lengths(doccount)
    writer = LengthWriter(lengthfile, doccount, lengths=self.length_arrays)
    writer.close()