from bisect import bisect_left

from whoosh.compat import iteritems, xrange
from whoosh.filedb.fieldcache import DefaultFieldCachingPolicy, FieldCache
from whoosh.filedb.filestore import OverlayStorage
from whoosh.matching import FilterMatcher
from whoosh.support import dawg

# IndexReader, TermNotFound, and SAVE_BY_DEFAULT are defined earlier in
# this module (whoosh/reading.py).


class SegmentReader(IndexReader):
    GZIP_CACHES = False

    def __init__(self, storage, schema, segment, generation=None,
                 codec=None):
        self.storage = storage
        self.schema = schema
        self.segment = segment
        self._gen = generation
        self.is_closed = False

        # Copy info from underlying segment
        self._has_deletions = segment.has_deletions()
        self._dc = segment.doc_count()
        self._dc_all = segment.doc_count_all()

        if hasattr(self.segment, "segment_id"):
            self.segid = self.segment.segment_id()
        else:
            from whoosh.codec.base import Segment
            self.segid = Segment._random_id()

        # self.files is a storage object from which to load the segment
        # files. This is different from the general storage (which will be
        # used for caches) if the segment is in a compound file.
        if segment.is_compound():
            # Use an overlay here instead of just the compound storage
            # because in rare circumstances a segment file may be added
            # after the segment is written
            self.files = OverlayStorage(segment.open_compound_file(storage),
                                        self.storage)
        else:
            self.files = storage

        # Get microreaders from codec
        if codec is None:
            from whoosh.codec import default_codec
            codec = default_codec()
        self._codec = codec
        self._terms = codec.terms_reader(self.files, self.segment)
        self._lengths = codec.lengths_reader(self.files, self.segment)
        self._stored = codec.stored_fields_reader(self.files, self.segment)
        self._vectors = None  # Lazy open with self._open_vectors()
        self._graph = None  # Lazy open with self._open_dawg()

        self.set_caching_policy()

    def _open_vectors(self):
        if self._vectors:
            return
        self._vectors = self._codec.vector_reader(self.files, self.segment)

    def _open_dawg(self):
        if self._graph:
            return
        self._graph = self._codec.graph_reader(self.files, self.segment)

    def has_deletions(self):
        return self._has_deletions

    def doc_count(self):
        return self._dc

    def doc_count_all(self):
        return self._dc_all

    def is_deleted(self, docnum):
        return self.segment.is_deleted(docnum)

    def generation(self):
        return self._gen

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.segment)

    def __contains__(self, term):
        return term in self._terms

    def close(self):
        self._terms.close()
        self._stored.close()
        if self._lengths:
            self._lengths.close()
        if self._vectors:
            self._vectors.close()
        if self._graph:
            self._graph.close()
        self.files.close()

        self.caching_policy = None
        self.is_closed = True

    def stored_fields(self, docnum):
        assert docnum >= 0
        schema = self.schema
        return dict(item for item in iteritems(self._stored[docnum])
                    if item[0] in schema)

    def all_stored_fields(self):
        is_deleted = self.segment.is_deleted
        sf = self.stored_fields
        for docnum in xrange(self._dc_all):
            if not is_deleted(docnum):
                yield sf(docnum)

    def field_length(self, fieldname):
        return self._lengths.field_length(fieldname)

    def min_field_length(self, fieldname):
        return self._lengths.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        return self._lengths.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        return self._lengths.doc_field_length(docnum, fieldname,
                                              default=default)

    def has_vector(self, docnum, fieldname):
        if self.schema[fieldname].vector:
            try:
                self._open_vectors()
            except (NameError, IOError):
                return False
            return (docnum, fieldname) in self._vectors
        else:
            return False

    def _test_field(self, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)
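    # A minimal usage sketch for the per-document accessors above. A
    # reader would normally come from Index.reader() rather than being
    # constructed directly, and "title" is a hypothetical field name:
    #
    #     for docnum in xrange(reader.doc_count_all()):
    #         if reader.is_deleted(docnum):
    #             continue
    #         fields = reader.stored_fields(docnum)
    #         length = reader.doc_field_length(docnum, "title")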
    def all_terms(self):
        schema = self.schema
        return ((fieldname, text) for fieldname, text in self._terms.keys()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        self._test_field(fieldname)
        schema = self.schema
        return ((fname, text) for fname, text
                in self._terms.keys_from((fieldname, prefix))
                if fname in schema)

    def term_info(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self._terms[fieldname, text]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def _texts_in_fieldcache(self, fieldname, prefix=''):
        # The first value in a fieldcache is the default
        texts = self.fieldcache(fieldname).texts[1:]
        if prefix:
            i = bisect_left(texts, prefix)
            while i < len(texts) and texts[i].startswith(prefix):
                yield texts[i]
                i += 1
        else:
            for text in texts:
                yield text

    def expand_prefix(self, fieldname, prefix):
        self._test_field(fieldname)
        # If a fieldcache for the field is already loaded, we already have
        # the values for the field in memory, so just yield them from there
        if self.fieldcache_loaded(fieldname):
            return self._texts_in_fieldcache(fieldname, prefix)
        else:
            # Call super
            return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        self._test_field(fieldname)
        # If a fieldcache for the field is already loaded, we already have
        # the values for the field in memory, so just yield them from there
        if self.fieldcache_loaded(fieldname):
            return self._texts_in_fieldcache(fieldname)
        else:
            # Call super
            return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        schema = self.schema
        return ((term, terminfo) for term, terminfo in self._terms.items()
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        schema = self.schema
        self._test_field(fieldname)
        for term, terminfo in self._terms.items_from((fieldname, text)):
            if term[0] not in schema:
                continue
            yield (term, terminfo)

    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self._terms.frequency((fieldname, text))
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self._terms.doc_frequency((fieldname, text))
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        format_ = self.schema[fieldname].format
        matcher = self._terms.matcher(fieldname, text, format_,
                                      scorer=scorer)
        deleted = self.segment.deleted
        if deleted:
            matcher = FilterMatcher(matcher, deleted, exclude=True)
        return matcher

    def vector(self, docnum, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        vformat = self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r"
                            % fieldname)
        self._open_vectors()
        return self._vectors.matcher(docnum, fieldname, vformat)

    # DAWG methods

    def has_word_graph(self, fieldname):
        if fieldname not in self.schema:
            return False
        if not self.schema[fieldname].spelling:
            return False
        try:
            self._open_dawg()
        except (NameError, IOError, dawg.FileVersionError):
            return False
        return self._graph.has_root(fieldname)

    def word_graph(self, fieldname):
        if not self.has_word_graph(fieldname):
            raise KeyError("No word graph for field %r" % fieldname)
        return dawg.Node(self._graph, self._graph.root(fieldname))

    def terms_within(self, fieldname, text, maxdist, prefix=0):
        if not self.has_word_graph(fieldname):
            # This reader doesn't have a graph stored, use the slow method
            return IndexReader.terms_within(self, fieldname, text, maxdist,
                                            prefix=prefix)
        return dawg.within(self._graph, text, k=maxdist, prefix=prefix,
                           address=self._graph.root(fieldname))
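    # A minimal sketch of the word-graph (DAWG) spelling support above.
    # "content" and the misspelled text are hypothetical, and the field
    # must have been indexed with spelling=True:
    #
    #     if reader.has_word_graph("content"):
    #         suggestions = list(reader.terms_within("content", u"recieve",
    #                                                maxdist=1))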
    # Field cache methods

    def supports_caches(self):
        return True

    def set_caching_policy(self, cp=None, save=True, storage=None):
        """This method lets you control the caching policy of the reader.

        You can either pass a
        :class:`whoosh.filedb.fieldcache.FieldCachingPolicy` as the first
        argument, *or* use the `save` and `storage` keywords to alter the
        default caching policy::

            # Use a custom field caching policy object
            reader.set_caching_policy(MyPolicy())

            # Use the default caching policy but turn off saving caches
            # to disk
            reader.set_caching_policy(save=False)

            # Use the default caching policy but save caches to a custom
            # storage
            from whoosh.filedb.filestore import FileStorage
            mystorage = FileStorage("path/to/cachedir")
            reader.set_caching_policy(storage=mystorage)

        :param cp: a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
            object. If this argument is not given, the default caching
            policy is used.
        :param save: save field caches to disk for re-use. If a caching
            policy object is specified using `cp`, this argument is
            ignored.
        :param storage: a custom :class:`whoosh.store.Storage` object to
            use for saving field caches. If a caching policy object is
            specified using `cp` or `save` is `False`, this argument is
            ignored.
        """

        if not cp:
            if save and storage is None:
                storage = self.storage
            elif not save:
                storage = None
            cp = DefaultFieldCachingPolicy(self.segment.segment_id(),
                                           storage=storage)

        if type(cp) is type:
            cp = cp()

        self.caching_policy = cp

    def _fieldkey(self, fieldname):
        return "%s/%s" % (self.segid, fieldname)

    def fieldcache(self, fieldname, save=SAVE_BY_DEFAULT):
        """Returns a :class:`whoosh.filedb.fieldcache.FieldCache` object
        for the given field.

        :param fieldname: the name of the field to get a cache for.
        :param save: if True (the default), the cache is saved to disk if
            it doesn't already exist.
        """

        key = self._fieldkey(fieldname)
        fc = self.caching_policy.get(key)
        if not fc:
            fc = FieldCache.from_field(self, fieldname)
            self.caching_policy.put(key, fc, save=save)
        return fc

    def fieldcache_available(self, fieldname):
        """Returns True if a field cache exists for the given field (either
        in memory already or on disk).
        """

        return self._fieldkey(fieldname) in self.caching_policy

    def fieldcache_loaded(self, fieldname):
        """Returns True if a field cache for the given field is in memory.
        """

        return self.caching_policy.is_loaded(self._fieldkey(fieldname))

    def unload_fieldcache(self, name):
        self.caching_policy.delete(self._fieldkey(name))
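# A minimal sketch of the field cache API above. "title" is a hypothetical
# field name; by default the cache is built lazily and saved to storage:
#
#     if reader.supports_caches():
#         fc = reader.fieldcache("title")         # load or build the cache
#         if reader.fieldcache_loaded("title"):
#             reader.unload_fieldcache("title")   # evict it from memory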