def finish(self, termswriter, doccount, lengthfile):
    _fieldlength_totals = self._fieldlength_totals
    if not self.tasks:
        return

    jobqueue = self.jobqueue
    rqueue = self.resultqueue

    # Tell each worker task there are no more jobs coming
    for task in self.tasks:
        jobqueue.put((None, doccount))

    # Wait for the workers to finish
    for task in self.tasks:
        task.join()

    # Collect the sorted runs and per-field length statistics from each worker
    runs = []
    lenfilenames = []
    for task in self.tasks:
        taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
        runs.extend(taskruns)
        lenfilenames.append(lenfilename)
        for fieldnum, total in flentotals.iteritems():
            _fieldlength_totals[fieldnum] += total
        for fieldnum, length in flenmaxes.iteritems():
            if length > self._fieldlength_maxes.get(fieldnum, 0):
                self._fieldlength_maxes[fieldnum] = length

    jobqueue.close()
    rqueue.close()

    # Merge the per-worker length files into a single length file
    lw = LengthWriter(lengthfile, doccount)
    for lenfilename in lenfilenames:
        sublengths = LengthReader(StructFile(open(lenfilename, "rb")), doccount)
        lw.add_all(sublengths)
        os.remove(lenfilename)
    lw.close()
    lengths = lw.reader()

    # if len(runs) >= self.procs * 2:
    #     pool = Pool(self.procs)
    #     tempname = lambda: tempfile.mktemp(suffix=".run", dir=self.dir)
    #     while len(runs) >= self.procs * 2:
    #         runs2 = [(runs[i:i+4], tempname())
    #                  for i in xrange(0, len(runs), 4)]
    #         if len(runs) % 4:
    #             last = runs2.pop()[0]
    #             runs2[-1][0].extend(last)
    #         runs = pool.map(merge_runs, runs2)
    #     pool.close()

    # Merge the sorted runs and feed the merged postings to the terms writer
    iterator = imerge([read_run(runname, count) for runname, count in runs])
    total = sum(count for runname, count in runs)
    termswriter.add_iter(iterator, lengths.get)

    # Remove the temporary run files
    for runname, count in runs:
        os.remove(runname)
    self.cleanup()

def finish(self, termswriter, doccount, lengthfile):
    self._write_lengths(lengthfile, doccount)
    lengths = LengthReader(None, doccount, self.length_arrays)

    if not self._flushed:
        # Nothing has been spilled to disk yet: read straight from the buffer
        gen = self.readback_buffer()
    else:
        # Flush any remaining buffered postings, then read back from disk
        if self.postbuf:
            self.flush()
        gen = self.readback()

    termswriter.add_iter(gen, lengths.get)

def finish(self, doccount, lengthfile, termtable, postingwriter):
    _fieldlength_totals = self._fieldlength_totals
    if not self.tasks:
        return

    pqueue = self.postingqueue
    rqueue = self.resultsqueue

    for _ in xrange(self.procs):
        pqueue.put((-1, doccount))

    #print "Joining..."
    t = now()
    for task in self.tasks:
        task.join()
    #print "Join:", now() - t

    #print "Getting results..."
    t = now()
    runs = []
    lenfilenames = []
    for task in self.tasks:
        taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
        runs.extend(taskruns)
        lenfilenames.append(lenfilename)
        for fieldnum, total in flentotals.iteritems():
            _fieldlength_totals[fieldnum] += total
        for fieldnum, length in flenmaxes.iteritems():
            if length > self._fieldlength_maxes.get(fieldnum, 0):
                self._fieldlength_maxes[fieldnum] = length
    #print "Results:", now() - t

    #print "Writing lengths..."
    t = now()
    lw = LengthWriter(lengthfile, doccount)
    for lenfilename in lenfilenames:
        sublengths = LengthReader(StructFile(open(lenfilename, "rb")), doccount)
        lw.add_all(sublengths)
        os.remove(lenfilename)
    lw.close()
    lengths = lw.reader()
    #print "Lengths:", now() - t

    t = now()
    iterator = imerge([read_run(runname, count) for runname, count in runs])
    total = sum(count for runname, count in runs)
    write_postings(self.schema, termtable, lengths, postingwriter, iterator)
    for runname, count in runs:
        os.remove(runname)
    #print "Merge:", now() - t

    self.cleanup()

def __init__(self, storage, schema, segment):
    self.storage = storage
    self.schema = schema
    self.segment = segment

    if hasattr(self.segment, "uuid"):
        self.uuid_string = str(self.segment.uuid)
    else:
        import uuid
        self.uuid_string = str(uuid.uuid4())

    # Term index
    tf = storage.open_file(segment.termsindex_filename)
    self.termsindex = TermIndexReader(tf)

    # Term vector index, and vector postings: lazy load
    self.vectorindex = None
    self.vpostfile = None

    # Stored fields file
    sf = storage.open_file(segment.storedfields_filename, mapped=False)
    self.storedfields = StoredFieldReader(sf)

    # Field length file
    self.fieldlengths = None
    if self.schema.has_scorable_fields():
        flf = storage.open_file(segment.fieldlengths_filename)
        self.fieldlengths = LengthReader(flf, segment.doc_count_all())

    # Copy info from underlying segment
    self._has_deletions = segment.has_deletions()
    self._doc_count = segment.doc_count()

    # Postings file
    self.postfile = self.storage.open_file(segment.termposts_filename,
                                           mapped=False)

    # Dawg file
    self.dawg = None
    if any(field.spelling for field in self.schema):
        fname = segment.dawg_filename
        if self.storage.file_exists(fname):
            dawgfile = self.storage.open_file(fname, mapped=False)
            self.dawg = DiskNode.load(dawgfile, expand=False)

    self.dc = segment.doc_count_all()
    assert self.dc == self.storedfields.length

    self.set_caching_policy()

    self.is_closed = False
    self._sync_lock = Lock()

def finish(self, termswriter, doccount, lengthfile):
    from itertools import izip

    pbuf = self.postbuf
    self._write_lengths(lengthfile, doccount)
    lengths = LengthReader(None, doccount, self.length_arrays)

    def gen():
        for term in sorted(pbuf):
            fieldname, text = term
            for docnum, weight, valuestring in izip(*pbuf[term]):
                yield (fieldname, text, docnum, weight, valuestring)

    termswriter.add_iter(gen(), lengths.get)

def finish(self, termswriter, doccount, lengthfile):
    self._write_lengths(lengthfile, doccount)
    lengths = LengthReader(None, doccount, self.length_arrays)

    if self.postings or self.runs:
        if self.postings and len(self.runs) == 0:
            # Everything fit in memory: sort the buffered postings in place
            self.postings.sort()
            postiter = iter(self.postings)
        elif not self.postings and not self.runs:
            # Unreachable under the enclosing "if self.postings or self.runs"
            postiter = iter([])
        else:
            # Spill the remaining buffered postings and merge all runs on disk
            self.dump_run()
            postiter = imerge([read_run(runname, count)
                               for runname, count in self.runs])

        termswriter.add_iter(postiter, lengths.get)

    self.cleanup()

def __init__(self, storage, schema, segment):
    self.storage = storage
    self.schema = schema
    self.segment = segment

    if hasattr(self.segment, "uuid"):
        self.uuid_string = str(self.segment.uuid)
    else:
        import uuid
        self.uuid_string = str(uuid.uuid4())

    # Term index
    tf = storage.open_file(segment.termsindex_filename)
    self.termsindex = TermIndexReader(tf)

    # Term postings file, vector index, and vector postings: lazy load
    self.postfile = None
    self.vectorindex = None
    self.vpostfile = None

    # Stored fields file
    sf = storage.open_file(segment.storedfields_filename, mapped=False)
    self.storedfields = StoredFieldReader(sf)

    # Field length file
    self.fieldlengths = None
    if self.schema.has_scorable_fields():
        flf = storage.open_file(segment.fieldlengths_filename)
        self.fieldlengths = LengthReader(flf, segment.doc_count_all())

    # Copy methods from underlying segment
    self.has_deletions = segment.has_deletions
    self.is_deleted = segment.is_deleted
    self.doc_count = segment.doc_count

    # Postings file
    self.postfile = self.storage.open_file(segment.termposts_filename,
                                           mapped=False)

    self.dc = segment.doc_count_all()
    assert self.dc == self.storedfields.length

    self.set_caching_policy()

    self.is_closed = False
    self._sync_lock = Lock()

def finish(self, doccount, lengthfile, termtable, postingwriter):
    self._write_lengths(lengthfile, doccount)
    lengths = LengthReader(None, doccount, self.length_arrays)

    if self.postings or self.runs:
        if self.postings and len(self.runs) == 0:
            # Everything fit in memory: sort the buffered postings in place
            self.postings.sort()
            postiter = iter(self.postings)
        elif not self.postings and not self.runs:
            # Unreachable under the enclosing "if self.postings or self.runs"
            postiter = iter([])
        else:
            # Spill the remaining buffered postings and merge all runs on disk
            self.dump_run()
            postiter = imerge([read_run(runname, count)
                               for runname, count in self.runs])

        write_postings(self.schema, termtable, lengths, postingwriter, postiter)

    self.cleanup()

def finish(self, termswriter, doccount, lengthfile):
    self._write_lengths(lengthfile, doccount)
    lengths = LengthReader(None, doccount, self.length_arrays)
    self.postbuf.sort()
    termswriter.add_iter(self.postbuf, lengths.get)