def test_termkey():
    with TempStorage("termkey") as st:
        tw = TermIndexWriter(st.create_file("test.trm"))
        tw.add(("alfa", u("bravo")), FileTermInfo(1.0, 3))
        tw.add(("alfa", u('\xc3\xa6\xc3\xaf\xc5\ufffd\xc3\xba')), FileTermInfo(4.0, 6))
        tw.add(("text", u('\xe6\u2014\xa5\xe6\u0153\xac\xe8\xaa\u017e')), FileTermInfo(7.0, 9))
        tw.close()

        tr = TermIndexReader(st.open_file("test.trm"))
        assert ("alfa", u("bravo")) in tr
        assert ("alfa", u('\xc3\xa6\xc3\xaf\xc5\ufffd\xc3\xba')) in tr
        assert ("text", u('\xe6\u2014\xa5\xe6\u0153\xac\xe8\xaa\u017e')) in tr
        tr.close()


def test_termindex():
    terms = [("a", "alfa"), ("a", "bravo"), ("a", "charlie"), ("a", "delta"),
             ("b", "able"), ("b", "baker"), ("b", "dog"), ("b", "easy")]
    st = RamStorage()

    tw = TermIndexWriter(st.create_file("test.trm"))
    for i, t in enumerate(terms):
        tw.add(t, FileTermInfo(1.0, i))
    tw.close()

    tr = TermIndexReader(st.open_file("test.trm"))
    for i, (t1, t2) in enumerate(zip(tr.keys(), terms)):
        assert_equal(t1, t2)
        ti = tr.get(t1)
        assert_equal(ti.weight(), 1.0)
        assert_equal(ti.doc_frequency(), i)


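# A minimal sketch (not part of the original tests) showing how the reader API
# exercised above could be used to dump the contents of a terms index file. It
# only relies on calls already used in these tests (keys(), get(), weight(),
# doc_frequency()); the helper name `dump_termindex` is hypothetical.
def dump_termindex(storage, filename="test.trm"):
    tr = TermIndexReader(storage.open_file(filename))
    try:
        for term in tr.keys():
            fieldname, text = term
            ti = tr.get(term)
            print("%s:%s weight=%s docfreq=%s"
                  % (fieldname, text, ti.weight(), ti.doc_frequency()))
    finally:
        tr.close()

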
def test_random_termkeys():
    def random_fieldname():
        return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20))

    def random_token():
        return "".join(unichr(random.randint(0, 0xd7ff)) for _ in xrange(1, 20))

    domain = sorted([(random_fieldname(), random_token())
                     for _ in xrange(1000)])

    st = RamStorage()
    tw = TermIndexWriter(st.create_file("test.trm"))
    for term in domain:
        tw.add(term, FileTermInfo(1.0, 1))
    tw.close()

    tr = TermIndexReader(st.open_file("test.trm"))
    for term in domain:
        assert term in tr


def __init__(self, ix, poolclass=None, procs=0, blocklimit=128, timeout=0.0,
             delay=0.1, name=None, _l=True, **poolargs):
    self.writelock = None
    if _l:
        self.writelock = ix.lock("WRITELOCK")
        if not try_for(self.writelock.acquire, timeout=timeout, delay=delay):
            raise LockError
    self.readlock = ix.lock("READLOCK")

    info = ix._read_toc()
    self.schema = info.schema
    self.segments = info.segments
    self.storage = ix.storage
    self.indexname = ix.indexname
    self.is_closed = False

    self.blocklimit = blocklimit
    self.segment_number = info.segment_counter + 1
    self.generation = info.generation + 1

    self._doc_offsets = []
    base = 0
    for s in self.segments:
        self._doc_offsets.append(base)
        base += s.doc_count_all()

    self.name = name or Segment.basename(self.indexname, self.segment_number)
    self.docnum = 0
    self.fieldlength_totals = defaultdict(int)
    self._added = False
    self._unique_cache = {}

    # Create a temporary segment to use its .*_filename attributes
    segment = Segment(self.name, self.generation, 0, None, None)

    # Terms index
    tf = self.storage.create_file(segment.termsindex_filename)
    ti = TermIndexWriter(tf)
    # Term postings file
    pf = self.storage.create_file(segment.termposts_filename)
    pw = FilePostingWriter(pf, blocklimit=blocklimit)
    # Terms writer
    self.termswriter = TermsWriter(self.schema, ti, pw)

    if self.schema.has_vectored_fields():
        # Vector index
        vf = self.storage.create_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorWriter(vf)

        # Vector posting file
        vpf = self.storage.create_file(segment.vectorposts_filename)
        self.vpostwriter = FilePostingWriter(vpf, stringids=True)
    else:
        self.vectorindex = None
        self.vpostwriter = None

    # Stored fields file
    sf = self.storage.create_file(segment.storedfields_filename)
    self.storedfields = StoredFieldWriter(sf, self.schema.stored_names())

    # Field lengths file
    self.lengthfile = self.storage.create_file(segment.fieldlengths_filename)

    # Create the pool
    if poolclass is None:
        if procs > 1:
            from whoosh.filedb.multiproc import MultiPool
            poolclass = MultiPool
        else:
            poolclass = TempfilePool
    self.pool = poolclass(self.schema, procs=procs, **poolargs)


class SegmentWriter(IndexWriter):
    def __init__(self, ix, poolclass=None, procs=0, blocklimit=128,
                 timeout=0.0, delay=0.1, name=None, **poolargs):
        self.writelock = ix.lock("WRITELOCK")
        if not try_for(self.writelock.acquire, timeout=timeout, delay=delay):
            raise LockError

        self.ix = ix
        self.storage = ix.storage
        self.indexname = ix.indexname
        self.is_closed = False

        info = ix._read_toc()
        self.schema = info.schema
        self.segments = info.segments
        self.blocklimit = blocklimit
        self.segment_number = info.segment_counter + 1
        self.generation = info.generation + 1

        self._doc_offsets = []
        base = 0
        for s in self.segments:
            self._doc_offsets.append(base)
            base += s.doc_count_all()

        self.name = name or "_%s_%s" % (self.indexname, self.segment_number)
        self.docnum = 0
        self.fieldlength_totals = defaultdict(int)
        self._added = False
        self._unique_cache = {}

        # Create a temporary segment to use its .*_filename attributes
        segment = Segment(self.name, 0, None, None)

        # Terms index
        tf = self.storage.create_file(segment.termsindex_filename)
        self.termsindex = TermIndexWriter(tf)

        # Term postings file
        pf = self.storage.create_file(segment.termposts_filename)
        self.postwriter = FilePostingWriter(pf, blocklimit=blocklimit)

        if self.schema.has_vectored_fields():
            # Vector index
            vf = self.storage.create_file(segment.vectorindex_filename)
            self.vectorindex = TermVectorWriter(vf)

            # Vector posting file
            vpf = self.storage.create_file(segment.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpf, stringids=True)
        else:
            self.vectorindex = None
            self.vpostwriter = None

        # Stored fields file
        sf = self.storage.create_file(segment.storedfields_filename)
        self.storedfields = StoredFieldWriter(sf, self.schema.stored_names())

        # Field lengths file
        self.lengthfile = self.storage.create_file(segment.fieldlengths_filename)

        # Create the pool
        if poolclass is None:
            if procs > 1:
                from whoosh.filedb.multiproc import MultiPool
                poolclass = MultiPool
            else:
                poolclass = TempfilePool
        self.pool = poolclass(self.schema, procs=procs, **poolargs)

    def _check_state(self):
        if self.is_closed:
            raise IndexingError("This writer is closed")

    def add_field(self, fieldname, fieldspec):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).add_field(fieldname, fieldspec)

    def remove_field(self, fieldname):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).remove_field(fieldname)

    def _document_segment(self, docnum):
        # Returns the index.Segment object containing the given document
        # number.
        offsets = self._doc_offsets
        if len(offsets) == 1:
            return 0
        return bisect_right(offsets, docnum) - 1

    def _segment_and_docnum(self, docnum):
        # Returns an (index.Segment, segment_docnum) pair for the segment
        # containing the given document number.
        segmentnum = self._document_segment(docnum)
        offset = self._doc_offsets[segmentnum]
        segment = self.segments[segmentnum]
        return segment, docnum - offset

    def has_deletions(self):
        """
        :returns: True if this index has documents that are marked deleted
            but haven't been optimized out of the index yet.
        """
        return any(s.has_deletions() for s in self.segments)

    def delete_document(self, docnum, delete=True):
        self._check_state()
        segment, segdocnum = self._segment_and_docnum(docnum)
        segment.delete_document(segdocnum, delete=delete)

    def deleted_count(self):
        """
        :returns: the total number of deleted documents in the index.
""" return sum(s.deleted_count() for s in self.segments) def is_deleted(self, docnum): segment, segdocnum = self._segment_and_docnum(docnum) return segment.is_deleted(segdocnum) def searcher(self): self._check_state() from whoosh.filedb.fileindex import FileIndex return FileIndex(self.storage, indexname=self.indexname).searcher() def add_reader(self, reader): self._check_state() startdoc = self.docnum has_deletions = reader.has_deletions() if has_deletions: docmap = {} fieldnames = set(self.schema.names()) # Add stored documents, vectors, and field lengths for docnum in xrange(reader.doc_count_all()): if (not has_deletions) or (not reader.is_deleted(docnum)): d = dict(item for item in reader.stored_fields(docnum).iteritems() if item[0] in fieldnames) # We have to append a dictionary for every document, even if # it's empty. self.storedfields.append(d) if has_deletions: docmap[docnum] = self.docnum for fieldname, length in reader.doc_field_lengths(docnum): if fieldname in fieldnames: self.pool.add_field_length(self.docnum, fieldname, length) for fieldname in reader.vector_names(): if (fieldname in fieldnames and reader.has_vector(docnum, fieldname)): vpostreader = reader.vector(docnum, fieldname) self._add_vector_reader(self.docnum, fieldname, vpostreader) self.docnum += 1 for fieldname, text, _, _ in reader: if fieldname in fieldnames: postreader = reader.postings(fieldname, text) while postreader.is_active(): docnum = postreader.id() valuestring = postreader.value() if has_deletions: newdoc = docmap[docnum] else: newdoc = startdoc + docnum self.pool.add_posting(fieldname, text, newdoc, postreader.weight(), valuestring) postreader.next() self._added = True def add_document(self, **fields): self._check_state() schema = self.schema # Sort the keys fieldnames = sorted( [name for name in fields.keys() if not name.startswith("_")]) # Check if the caller gave us a bogus field for name in fieldnames: if name not in schema: raise UnknownFieldError("No field named %r in %s" % (name, schema)) self.storedfields storedvalues = {} docnum = self.docnum for fieldname in fieldnames: value = fields.get(fieldname) if value is not None: field = schema[fieldname] if field.indexed: self.pool.add_content(docnum, fieldname, field, value) vformat = field.vector if vformat: vlist = sorted((w, weight, valuestring) for w, freq, weight, valuestring in vformat.word_values(value, mode="index")) self._add_vector(docnum, fieldname, vlist) if field.stored: # Caller can override the stored value by including a key # _stored_<fieldname> storedvalue = value storedname = "_stored_" + fieldname if storedname in fields: storedvalue = fields[storedname] storedvalues[fieldname] = storedvalue self._added = True self.storedfields.append(storedvalues) self.docnum += 1 def update_document(self, **fields): self._check_state() _unique_cache = self._unique_cache # Check which of the supplied fields are unique unique_fields = [ name for name, field in self.schema.items() if name in fields and field.unique ] if not unique_fields: raise IndexingError("None of the fields in %r" " are unique" % fields.keys()) # Delete documents matching the unique terms delset = set() for name in unique_fields: field = self.schema[name] text = field.to_text(fields[name]) # If we've seen an update_document with this unique field before... if name in _unique_cache: # Get the cache for this field term2docnum = _unique_cache[name] # If the cache is None, that means we've seen this field once # before but didn't cache it the first time. Cache it now. 
                if term2docnum is None:
                    # Read the first document number found for every term in
                    # this field and cache the mapping from term to doc num
                    term2docnum = {}
                    s = self.searcher()
                    term2docnum = dict(s.first_ids(name))
                    s.close()
                    _unique_cache[name] = term2docnum

                # Look up the cached document number for this term
                if text in term2docnum:
                    delset.add(term2docnum[text])
            else:
                # This is the first time we've seen an update_document with
                # this field. Mark it by putting None in the cache for this
                # field, but don't cache it. We'll only build the cache if we
                # see an update_document on this field again. This is to
                # prevent caching a field even when the user is only going to
                # call update_document once.
                reader = self.searcher().reader()
                try:
                    delset.add(reader.postings(name, text).id())
                    _unique_cache[name] = None
                except TermNotFound:
                    pass
                finally:
                    reader.close()

        # Delete the old docs
        for docnum in delset:
            self.delete_document(docnum)

        # Add the given fields
        self.add_document(**fields)

    def _add_vector(self, docnum, fieldname, vlist):
        vpostwriter = self.vpostwriter
        offset = vpostwriter.start(self.schema[fieldname].vector)
        for text, weight, valuestring in vlist:
            assert isinstance(text, unicode), "%r is not unicode" % text
            vpostwriter.write(text, weight, valuestring, 0)
        vpostwriter.finish()

        self.vectorindex.add((docnum, fieldname), offset)

    def _add_vector_reader(self, docnum, fieldname, vreader):
        vpostwriter = self.vpostwriter
        offset = vpostwriter.start(self.schema[fieldname].vector)
        while vreader.is_active():
            # text, weight, valuestring, fieldlen
            vpostwriter.write(vreader.id(), vreader.weight(), vreader.value(), 0)
            vreader.next()
        vpostwriter.finish()

        self.vectorindex.add((docnum, fieldname), offset)

    def _close_all(self):
        self.is_closed = True
        self.termsindex.close()
        self.postwriter.close()
        self.storedfields.close()
        if not self.lengthfile.is_closed:
            self.lengthfile.close()
        if self.vectorindex:
            self.vectorindex.close()
        if self.vpostwriter:
            self.vpostwriter.close()

    def _getsegment(self):
        return Segment(self.name, self.docnum,
                       self.pool.fieldlength_totals(),
                       self.pool.fieldlength_maxes())

    def commit(self, mergetype=None, optimize=False, merge=True):
        """Finishes writing and saves all additions and changes to disk.

        There are four possible ways to use this method::

            # Merge small segments but leave large segments, trying to
            # balance fast commits with fast searching:
            writer.commit()

            # Merge all segments into a single segment:
            writer.commit(optimize=True)

            # Don't merge any existing segments:
            writer.commit(merge=False)

            # Use a custom merge function
            writer.commit(mergetype=my_merge_function)

        :param mergetype: a custom merge function taking a Writer object and
            a segment list as arguments, and returning a new segment list. If
            you supply a ``mergetype`` function, the values of the
            ``optimize`` and ``merge`` arguments are ignored.
        :param optimize: if True, all existing segments are merged with the
            documents you've added to this writer (and the value of the
            ``merge`` argument is ignored).
        :param merge: if False, do not merge small segments.
        """
        self._check_state()
        try:
            if mergetype:
                pass
            elif optimize:
                mergetype = OPTIMIZE
            elif not merge:
                mergetype = NO_MERGE
            else:
                mergetype = MERGE_SMALL

            # Call the merge policy function. The policy may choose to merge
            # other segments into this writer's pool
            new_segments = mergetype(self, self.segments)

            # Tell the pool we're finished adding information, it should add
            # its accumulated data to the lengths, terms index, and posting
            # files.
            if self._added:
                self.pool.finish(self.docnum, self.lengthfile,
                                 self.termsindex, self.postwriter)

                # Create a Segment object for the segment created by this
                # writer and add it to the list of remaining segments
                # returned by the merge policy function
                new_segments.append(self._getsegment())

            # Close all files, write a new TOC with the new segment list,
            # and release the lock.
            self._close_all()

            from whoosh.filedb.fileindex import _write_toc, _clean_files
            _write_toc(self.storage, self.schema, self.indexname,
                       self.generation, self.segment_number, new_segments)

            readlock = self.ix.lock("READLOCK")
            readlock.acquire(True)
            try:
                _clean_files(self.storage, self.indexname, self.generation,
                             new_segments)
            finally:
                readlock.release()
        finally:
            self.writelock.release()

    def cancel(self):
        self._check_state()
        try:
            self.pool.cancel()
            self._close_all()
        finally:
            self.writelock.release()
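

# A minimal usage sketch (not part of the original module), assuming an
# already-created FileIndex `ix` (for example from whoosh.index.open_dir)
# whose schema has a unique "path" field and a stored "title" field; the
# field names and the helper name `example_usage` are hypothetical.
def example_usage(ix):
    writer = SegmentWriter(ix)
    writer.add_document(path=u"/a", title=u"First document")
    # update_document deletes any existing document whose unique "path"
    # field has the same value, then adds the new version.
    writer.update_document(path=u"/a", title=u"Replaced document")
    writer.commit()  # default policy: merge small segments
    # (call writer.cancel() instead of commit() to discard the changes)


# A custom merge policy is just a callable taking the writer and the list of
# existing segments and returning the segments to keep; returning them
# unchanged merges nothing, like passing merge=False. The name
# `keep_everything` is illustrative only.
def keep_everything(writer, segments):
    return segments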