# Imports used by this class. In the full module (whoosh.reading), the base
# class IndexReader and the exceptions TermNotFound and ReaderClosed are
# defined earlier in the same file.
from whoosh import columns
from whoosh.compat import iteritems
from whoosh.filedb.filestore import OverlayStorage


class SegmentReader(IndexReader):
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        self.schema = schema
        self.is_closed = False
        self._segment = segment
        self._segid = self._segment.segment_id()
        self._gen = generation

        # self._storage is a storage object from which to load the segment
        # files. This is different from the general storage (which will be
        # used for caches) if the segment is in a compound file.
        if segment.is_compound():
            # Open the compound file as a storage object
            files = segment.open_compound_file(storage)
            # Use an overlay here instead of just the compound storage,
            # because in rare circumstances a segment file may be added after
            # the segment is written
            self._storage = OverlayStorage(files, storage)
        else:
            self._storage = storage

        # Get subreaders from codec
        self._codec = codec if codec else segment.codec()
        self._terms = self._codec.terms_reader(self._storage, segment)
        self._perdoc = self._codec.per_document_reader(self._storage, segment)

    def codec(self):
        return self._codec

    def segment(self):
        return self._segment

    def storage(self):
        return self._storage

    def has_deletions(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.has_deletions()

    def doc_count(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_count()

    def doc_count_all(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_count_all()

    def is_deleted(self, docnum):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.is_deleted(docnum)

    def generation(self):
        return self._gen

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self._storage,
                               self._segment)

    def __contains__(self, term):
        if self.is_closed:
            raise ReaderClosed
        fieldname, text = term
        if fieldname not in self.schema:
            return False
        text = self._text_to_bytes(fieldname, text)
        return (fieldname, text) in self._terms

    def close(self):
        if self.is_closed:
            raise ReaderClosed("Reader already closed")
        self._terms.close()
        self._perdoc.close()

        # It's possible some weird codec that doesn't use storage might have
        # passed None instead of a storage object
        if self._storage:
            self._storage.close()

        self.is_closed = True

    def stored_fields(self, docnum):
        if self.is_closed:
            raise ReaderClosed
        assert docnum >= 0
        schema = self.schema
        sfs = self._perdoc.stored_fields(docnum)
        # Double-check with schema to filter out removed fields
        return dict(item for item in iteritems(sfs) if item[0] in schema)

    # Delegate doc methods to the per-doc reader

    def all_doc_ids(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.all_doc_ids()

    def iter_docs(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.iter_docs()

    def all_stored_fields(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.all_stored_fields()

    def field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.field_length(fieldname)

    def min_field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_field_length(docnum, fieldname, default)

    def has_vector(self, docnum, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.has_vector(docnum, fieldname)

    #

    def _test_field(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    def indexed_field_names(self):
        return self._terms.indexed_field_names()

    def all_terms(self):
        if self.is_closed:
            raise ReaderClosed
        schema = self.schema
        return ((fieldname, text) for fieldname, text in self._terms.terms()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        schema = self.schema
        return ((fname, text) for fname, text
                in self._terms.terms_from(fieldname, prefix)
                if fname in schema)

    def term_info(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.term_info(fieldname, text)
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def expand_prefix(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        self._test_field(fieldname)
        return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        if self.is_closed:
            raise ReaderClosed
        schema = self.schema
        return ((term, terminfo) for term, terminfo
                in list(self._terms.items())
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        self._test_field(fieldname)
        schema = self.schema
        text = self._text_to_bytes(fieldname, text)
        for term, terminfo in self._terms.items_from(fieldname, text):
            if term[0] not in schema:
                continue
            yield (term, terminfo)

    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.frequency(fieldname, text)
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.doc_frequency(fieldname, text)
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        from whoosh.matching.wrappers import FilterMatcher

        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        text = self._text_to_bytes(fieldname, text)
        format_ = self.schema[fieldname].format
        matcher = self._terms.matcher(fieldname, text, format_, scorer=scorer)
        deleted = frozenset(self._perdoc.deleted_docs())
        if deleted:
            matcher = FilterMatcher(matcher, deleted, exclude=True)
        return matcher

    def vector(self, docnum, fieldname, format_=None):
        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        vformat = format_ or self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)
        return self._perdoc.vector(docnum, fieldname, vformat)

    def cursor(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        fieldobj = self.schema[fieldname]
        return self._terms.cursor(fieldname, fieldobj)

    def terms_within(self, fieldname, text, maxdist, prefix=0):
        # Replaces the horribly inefficient base implementation with one
        # based on skipping through the word list efficiently using a DFA
        fieldobj = self.schema[fieldname]
        spellfield = fieldobj.spelling_fieldname(fieldname)
        auto = self._codec.automata(self._storage, self._segment)
        fieldcur = self.cursor(spellfield)
        return auto.terms_within(fieldcur, text, maxdist, prefix)

    # Column methods

    def has_column(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        coltype = self.schema[fieldname].column_type
        return coltype and self._perdoc.has_column(fieldname)

    def column_reader(self, fieldname, column=None, reverse=False,
                      translate=True):
        if self.is_closed:
            raise ReaderClosed
        fieldobj = self.schema[fieldname]
        column = column or fieldobj.column_type
        if not column:
            raise Exception("No column for field %r in %r"
                            % (fieldname, self))

        if self._perdoc.has_column(fieldname):
            creader = self._perdoc.column_reader(fieldname, column)
            if reverse:
                creader.set_reverse()
        else:
            # This segment doesn't have a column file for this field, so
            # create a fake column reader that always returns the default
            # value
            default = column.default_value(reverse)
            creader = columns.EmptyColumnReader(default, self.doc_count_all())

        if translate:
            # Wrap the column in a Translator to give the caller
            # nice values instead of sortable representations
            fcv = fieldobj.from_column_value
            creader = columns.TranslatingColumnReader(creader, fcv)

        return creader
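# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original module).
# A minimal walk through the reader API defined above, assuming an existing
# on-disk Whoosh index; the directory name "indexdir" and the field name
# "content" are hypothetical.

def _demo_reader_usage():
    from whoosh import index

    ix = index.open_dir("indexdir")
    r = ix.reader()  # a SegmentReader when the index has a single segment
    try:
        print(r.doc_count())      # undeleted documents only
        print(r.doc_count_all())  # includes deleted documents

        # all_terms() yields (fieldname, termbytes) pairs, filtered so that
        # terms from fields removed from the schema don't appear
        for fieldname, text in r.all_terms():
            pass

        # postings() returns a matcher with deleted documents filtered out;
        # it raises TermNotFound if the term is not in this segment
        m = r.postings("content", "whoosh")
        while m.is_active():
            print(m.id(), m.weight())
            m.next()
    finally:
        r.close()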
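# ---------------------------------------------------------------------------
# Toy illustration of the overlay lookup order used for compound segments in
# SegmentReader.__init__ above. This dict-based sketch only mirrors the
# read-precedence idea behind OverlayStorage(files, storage); it is not
# Whoosh's implementation: reads try the compound file first and fall back to
# the general storage, so a file added after the segment was written is still
# visible.

def _overlay_lookup(compound_files, general_storage, name):
    # Prefer the compound file's contents; fall back to the outer storage.
    # Both arguments are modeled as plain dicts purely for illustration.
    if name in compound_files:
        return compound_files[name]
    return general_storage[name]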
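# ---------------------------------------------------------------------------
# Usage sketch for column_reader() (illustrative addition). Assumes a schema
# with a column-enabled field named "price"; the field name is hypothetical.
# With translate=True (the default), values come back through the field's
# from_column_value(); with translate=False you get the raw sortable
# representations, which is what sorting code typically wants.

def _demo_column_reader(reader):
    if reader.has_column("price"):
        creader = reader.column_reader("price")  # translated values
        for docnum in reader.all_doc_ids():
            print(docnum, creader[docnum])

        # Raw sortable keys in reverse order; reverse=True requires a column
        # type that supports reverse ordering
        raw = reader.column_reader("price", reverse=True, translate=False)
        print(raw[0])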
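# ---------------------------------------------------------------------------
# Usage sketch for terms_within() (illustrative addition). The automaton-based
# implementation above skips through the term list with a cursor instead of
# testing every term against the query, which is what makes fuzzy matching
# and spelling suggestions practical. The field name "content" and the query
# string are hypothetical.

def _demo_terms_within(reader):
    # Terms in "content" within edit distance 1 of "whosh"
    for term in reader.terms_within("content", "whosh", maxdist=1):
        print(term)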