def update_document(self, **fields):
    """Replace the document(s) sharing a unique field value, then add a new one.

    For every schema field that is flagged ``unique`` and is present in
    ``fields``, the document number currently stored under that field's
    term is looked up and scheduled for deletion.  After all matching
    documents have been deleted, the new document is added through
    ``add_document``.

    Lookups are cached lazily in ``self._unique_cache``:

    * The first time a term is *found* for a field, a single postings
      lookup is done and the field is only marked (value ``None``).  If the
      term is not found, nothing is recorded for the field.
    * When a marked field is seen again, the full term -> document number
      mapping is read from ``first_ids`` and stored, so later calls are
      plain dictionary lookups.

    :param fields: field names mapped to the values of the new document.
    :raises IndexingError: if none of the supplied fields is unique in the
        schema.
    """
    self._check_state()
    unique_cache = self._unique_cache

    # Check which of the supplied fields are unique
    unique_fields = [
        name for name, field in self.schema.items()
        if name in fields and field.unique
    ]
    if not unique_fields:
        raise IndexingError("None of the fields in %r"
                            " are unique" % fields.keys())

    # Collect the document numbers matching the unique terms
    delset = set()
    for name in unique_fields:
        field = self.schema[name]
        text = field.to_text(fields[name])

        if name in unique_cache:
            # We have seen an update_document on this field before
            term2docnum = unique_cache[name]

            if term2docnum is None:
                # Seen once but not cached yet: read the first document
                # number of every term in this field and cache the mapping.
                # The searcher is closed even if reading fails, so a failed
                # read does not leak it; the marker stays None in that case
                # and the next call simply retries.
                searcher = self.searcher()
                try:
                    term2docnum = dict(searcher.first_ids(name))
                finally:
                    searcher.close()
                unique_cache[name] = term2docnum

            # Look up the cached document number for this term
            if text in term2docnum:
                delset.add(term2docnum[text])
        else:
            # First update_document on this field: do a one-off postings
            # lookup instead of building the whole cache, so a caller who
            # updates only once does not pay for caching the field.  The
            # field is marked with None only after a successful lookup.
            reader = self.searcher().reader()
            try:
                delset.add(reader.postings(name, text).id())
                unique_cache[name] = None
            except TermNotFound:
                # No existing document has this term; nothing to delete
                pass
            finally:
                reader.close()

    # Delete the old docs
    for docnum in delset:
        self.delete_document(docnum)

    # Add the given fields
    self.add_document(**fields)
def delete_document(self, docnum, delete=True):
    """Delete a document identified by its index-wide document number.

    The number is validated against the combined document count of all
    segments, translated to a (segment, segment-local number) pair with
    ``_segment_and_docnum``, and the deletion is delegated to that segment.

    :param docnum: document number counted across all segments; must
        satisfy ``0 <= docnum < total document count``.
    :param delete: forwarded unchanged to the segment's
        ``delete_document``.  NOTE(review): presumably ``False`` reverses a
        deletion -- confirm against the segment implementation.
    :raises IndexingError: if ``docnum`` is negative or not smaller than
        the total number of documents.
    """
    self._check_state()

    total = sum(seg.doccount for seg in self.segments)
    # Reject negative numbers as well as too-large ones: a negative value
    # would otherwise slip past the upper-bound test and be handed to the
    # segment lookup as if it were a real document number.
    if docnum < 0 or docnum >= total:
        raise IndexingError("No document ID %r in this index" % (docnum,))

    segment, segdocnum = self._segment_and_docnum(docnum)
    segment.delete_document(segdocnum, delete=delete)
def _check_state(self):
    """Ensure this writer can still be used.

    :raises IndexingError: if ``self.is_closed`` is true.
    """
    if not self.is_closed:
        return
    raise IndexingError("This writer is closed")