Example #1
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        self.schema = schema
        self.is_closed = False

        self._segment = segment
        self._segid = self._segment.segment_id()
        self._gen = generation

        # self._storage is a storage object from which to load the segment
        # files. This is different from the general storage (which will be
        # used for caches) if the segment is in a compound file.
        if segment.is_compound():
            # Open the compound file as a storage object
            files = segment.open_compound_file(storage)
            # Use an overlay here instead of just the compound storage; in rare
            # circumstances a segment file may be added after the segment is
            # written.
            self._storage = OverlayStorage(files, storage)
        else:
            self._storage = storage

        # Get subreaders from codec
        self._codec = codec if codec else segment.codec()
        self._terms = self._codec.terms_reader(self._storage, segment)
        self._perdoc = self._codec.per_document_reader(self._storage, segment)
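
A minimal usage sketch, assuming the typical Whoosh entry points: application code rarely constructs SegmentReader directly; Index.reader() opens the segment(s) via SegmentReader under the hood, passing the storage, schema, and generation shown above. The directory and field names below are illustrative assumptions, not part of the example itself.

import os
from whoosh import index
from whoosh.fields import Schema, TEXT, ID

# Illustrative schema and directory; hypothetical names for this sketch.
schema = Schema(path=ID(stored=True), body=TEXT)
os.makedirs("indexdir", exist_ok=True)
ix = index.create_in("indexdir", schema)

with ix.writer() as w:
    w.add_document(path=u"/a", body=u"hello world")

# Index.reader() constructs SegmentReader instances internally.
reader = ix.reader()
print(reader.doc_count())
reader.close()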
Example #2
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        self.schema = schema
        self.is_closed = False

        self._segment = segment
        self._segid = self._segment.segment_id()
        self._gen = generation

        # self._storage is a storage object from which to load the segment
        # files. This is different from the general storage (which will be
        # used for caches) if the segment is in a compound file.
        if segment.is_compound():
            # Open the compound file as a storage object
            files = segment.open_compound_file(storage)
            # Use an overlay here instead of just the compound storage; in rare
            # circumstances a segment file may be added after the segment is
            # written.
            self._storage = OverlayStorage(files, storage)
        else:
            self._storage = storage

        # Get subreaders from codec
        self._codec = codec if codec else segment.codec()
        self._terms = self._codec.terms_reader(self._storage, segment)
        self._perdoc = self._codec.per_document_reader(self._storage, segment)
        self._graph = None  # Lazy open with self._get_graph()
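
The only change from Example #1 is the lazily opened word-graph reader. A standalone sketch of that lazy-open pattern follows; the names are placeholders, with open_fn standing in for the codec's graph_reader call.

class LazyResource:
    # Minimal sketch of the lazy-open pattern behind self._graph above:
    # defer an expensive open until the resource is first requested.
    def __init__(self, open_fn):
        self._open_fn = open_fn  # callable that performs the expensive open
        self._resource = None    # not opened until first use

    def get(self):
        if self._resource is None:
            self._resource = self._open_fn()
        return self._resource

    def close(self):
        if self._resource is not None:
            self._resource.close()
            self._resource = None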
Example #3
class SegmentReader(IndexReader):
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        self.schema = schema
        self.is_closed = False

        self._segment = segment
        self._segid = self._segment.segment_id()
        self._gen = generation

        # self._storage is a storage object from which to load the segment
        # files. This is different from the general storage (which will be
        # used for caches) if the segment is in a compound file.
        if segment.is_compound():
            # Open the compound file as a storage object
            files = segment.open_compound_file(storage)
            # Use an overlay here instead of just the compound storage; in rare
            # circumstances a segment file may be added after the segment is
            # written.
            self._storage = OverlayStorage(files, storage)
        else:
            self._storage = storage

        # Get subreaders from codec
        self._codec = codec if codec else segment.codec()
        self._terms = self._codec.terms_reader(self._storage, segment)
        self._perdoc = self._codec.per_document_reader(self._storage, segment)

    def codec(self):
        return self._codec

    def segment(self):
        return self._segment

    def storage(self):
        return self._storage

    def has_deletions(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.has_deletions()

    def doc_count(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_count()

    def doc_count_all(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_count_all()

    def is_deleted(self, docnum):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.is_deleted(docnum)

    def generation(self):
        return self._gen

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self._storage,
                               self._segment)

    def __contains__(self, term):
        if self.is_closed:
            raise ReaderClosed
        fieldname, text = term
        if fieldname not in self.schema:
            return False
        text = self._text_to_bytes(fieldname, text)
        return (fieldname, text) in self._terms

    def close(self):
        if self.is_closed:
            raise ReaderClosed("Reader already closed")
        self._terms.close()
        self._perdoc.close()

        # It's possible some weird codec that doesn't use storage might have
        # passed None instead of a storage object
        if self._storage:
            self._storage.close()

        self.is_closed = True

    def stored_fields(self, docnum):
        if self.is_closed:
            raise ReaderClosed
        assert docnum >= 0
        schema = self.schema
        sfs = self._perdoc.stored_fields(docnum)
        # Double-check with schema to filter out removed fields
        return dict(item for item in iteritems(sfs) if item[0] in schema)

    # Delegate doc methods to the per-doc reader

    def all_doc_ids(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.all_doc_ids()

    def iter_docs(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.iter_docs()

    def all_stored_fields(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.all_stored_fields()

    def field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.field_length(fieldname)

    def min_field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_field_length(docnum, fieldname, default)

    def has_vector(self, docnum, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.has_vector(docnum, fieldname)

    #

    def _test_field(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    def indexed_field_names(self):
        return self._terms.indexed_field_names()

    def all_terms(self):
        if self.is_closed:
            raise ReaderClosed
        schema = self.schema
        return ((fieldname, text) for fieldname, text in self._terms.terms()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        schema = self.schema
        return ((fname, text) for fname, text
                in self._terms.terms_from(fieldname, prefix)
                if fname in schema)

    def term_info(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.term_info(fieldname, text)
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def expand_prefix(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        self._test_field(fieldname)
        return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        if self.is_closed:
            raise ReaderClosed
        schema = self.schema
        return ((term, terminfo) for term, terminfo in list(self._terms.items())
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        self._test_field(fieldname)
        schema = self.schema
        text = self._text_to_bytes(fieldname, text)
        for term, terminfo in self._terms.items_from(fieldname, text):
            if term[0] not in schema:
                continue
            yield (term, terminfo)

    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.frequency(fieldname, text)
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.doc_frequency(fieldname, text)
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        from whoosh.matching.wrappers import FilterMatcher

        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        text = self._text_to_bytes(fieldname, text)
        format_ = self.schema[fieldname].format
        matcher = self._terms.matcher(fieldname, text, format_, scorer=scorer)
        deleted = frozenset(self._perdoc.deleted_docs())
        if deleted:
            matcher = FilterMatcher(matcher, deleted, exclude=True)
        return matcher

    def vector(self, docnum, fieldname, format_=None):
        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = format_ or self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)
        return self._perdoc.vector(docnum, fieldname, vformat)

    def cursor(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        fieldobj = self.schema[fieldname]
        return self._terms.cursor(fieldname, fieldobj)

    def terms_within(self, fieldname, text, maxdist, prefix=0):
        # Replaces the horribly inefficient base implementation with one based
        # on skipping through the word list efficiently using a DFA

        fieldobj = self.schema[fieldname]
        spellfield = fieldobj.spelling_fieldname(fieldname)
        auto = self._codec.automata(self._storage, self._segment)
        fieldcur = self.cursor(spellfield)
        return auto.terms_within(fieldcur, text, maxdist, prefix)

    # Column methods

    def has_column(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        coltype = self.schema[fieldname].column_type
        return coltype and self._perdoc.has_column(fieldname)

    def column_reader(self, fieldname, column=None, reverse=False,
                      translate=True):
        if self.is_closed:
            raise ReaderClosed

        fieldobj = self.schema[fieldname]
        column = column or fieldobj.column_type
        if not column:
            raise Exception("No column for field %r in %r"
                            % (fieldname, self))

        if self._perdoc.has_column(fieldname):
            creader = self._perdoc.column_reader(fieldname, column)
            if reverse:
                creader.set_reverse()
        else:
            # This segment doesn't have a column file for this field, so create
            # a fake column reader that always returns the default value.
            default = column.default_value(reverse)
            creader = columns.EmptyColumnReader(default, self.doc_count_all())

        if translate:
            # Wrap the column in a Translator to give the caller
            # nice values instead of sortable representations
            fcv = fieldobj.from_column_value
            creader = columns.TranslatingColumnReader(creader, fcv)

        return creader
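
A hedged sketch of calling the column_reader shown above. It assumes a reader whose schema has a sortable "title" field (column_type set, e.g. via sortable=True); that field is an illustration, not part of the example.

def titles_in_doc_order(reader):
    # translate=True (the default) wraps the raw column in a
    # TranslatingColumnReader so callers see user-facing values
    # rather than sortable representations.
    creader = reader.column_reader("title", translate=True)
    return [creader[docnum] for docnum in reader.all_doc_ids()]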
Example #4
class SegmentReader(IndexReader):
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        self.schema = schema
        self.is_closed = False

        self._segment = segment
        self._segid = self._segment.segment_id()
        self._gen = generation

        # self._storage is a storage object from which to load the segment
        # files. This is different from the general storage (which will be
        # used for caches) if the segment is in a compound file.
        if segment.is_compound():
            # Open the compound file as a storage object
            files = segment.open_compound_file(storage)
            # Use an overlay here instead of just the compound storage; in rare
            # circumstances a segment file may be added after the segment is
            # written.
            self._storage = OverlayStorage(files, storage)
        else:
            self._storage = storage

        # Get subreaders from codec
        self._codec = codec if codec else segment.codec()
        self._terms = self._codec.terms_reader(self._storage, segment)
        self._perdoc = self._codec.per_document_reader(self._storage, segment)
        self._graph = None  # Lazy open with self._get_graph()

    def _get_graph(self):
        if not self._graph:
            self._graph = self._codec.graph_reader(self._storage, self._segment)
        return self._graph

    def codec(self):
        return self._codec

    def segment(self):
        return self._segment

    def storage(self):
        return self._storage

    def has_deletions(self):
        return self._perdoc.has_deletions()

    def doc_count(self):
        return self._perdoc.doc_count()

    def doc_count_all(self):
        return self._perdoc.doc_count_all()

    def is_deleted(self, docnum):
        return self._perdoc.is_deleted(docnum)

    def generation(self):
        return self._gen

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self._storage,
                               self._segment)

    def __contains__(self, term):
        fieldname, text = term
        if fieldname not in self.schema:
            return False
        text = self._text_to_bytes(fieldname, text)
        return (fieldname, text) in self._terms

    def close(self):
        self._terms.close()
        self._perdoc.close()
        if self._graph:
            self._graph.close()

        # It's possible some weird codec that doesn't use storage might have
        # passed None instead of a storage object
        if self._storage:
            self._storage.close()

        self.is_closed = True

    def stored_fields(self, docnum):
        assert docnum >= 0
        schema = self.schema
        sfs = self._perdoc.stored_fields(docnum)
        # Double-check with schema to filter out removed fields
        return dict(item for item in iteritems(sfs) if item[0] in schema)

    # Delegate doc methods to the per-doc reader

    def all_doc_ids(self):
        return self._perdoc.all_doc_ids()

    def iter_docs(self):
        return self._perdoc.iter_docs()

    def all_stored_fields(self):
        return self._perdoc.all_stored_fields()

    def field_length(self, fieldname):
        return self._perdoc.field_length(fieldname)

    def min_field_length(self, fieldname):
        return self._perdoc.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        return self._perdoc.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        return self._perdoc.doc_field_length(docnum, fieldname, default)

    def has_vector(self, docnum, fieldname):
        return self._perdoc.has_vector(docnum, fieldname)

    #

    def _test_field(self, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    def all_terms(self):
        schema = self.schema
        return ((fieldname, text) for fieldname, text in self._terms.terms()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        schema = self.schema
        return ((fname, text) for fname, text
                in self._terms.terms_from(fieldname, prefix)
                if fname in schema)

    def term_info(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.term_info(fieldname, text)
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def expand_prefix(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        self._test_field(fieldname)
        return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        schema = self.schema
        return ((term, terminfo) for term, terminfo in self._terms.items()
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        schema = self.schema
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        for term, terminfo in self._terms.items_from(fieldname, text):
            if term[0] not in schema:
                continue
            yield (term, terminfo)

    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.frequency(fieldname, text)
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.doc_frequency(fieldname, text)
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        from whoosh.matching.wrappers import FilterMatcher

        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        text = self._text_to_bytes(fieldname, text)
        format_ = self.schema[fieldname].format
        matcher = self._terms.matcher(fieldname, text, format_, scorer=scorer)
        deleted = frozenset(self._perdoc.deleted_docs())
        if deleted:
            matcher = FilterMatcher(matcher, deleted, exclude=True)
        return matcher

    def vector(self, docnum, fieldname, format_=None):
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = format_ or self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)
        return self._perdoc.vector(docnum, fieldname, vformat)

    # Graph methods

    def has_word_graph(self, fieldname):
        if fieldname not in self.schema:
            return False
        if not self.schema[fieldname].spelling:
            return False

        try:
            gr = self._get_graph()
        except NoGraphError:
            return False

        return gr.has_root(fieldname)

    def word_graph(self, fieldname):
        if not self.has_word_graph(fieldname):
            raise KeyError("No word graph for field %r" % fieldname)
        gr = self._get_graph()
        return fst.Node(gr, gr.root(fieldname))

    def terms_within(self, fieldname, text, maxdist, prefix=0):
        if not self.has_word_graph(fieldname):
            # This reader doesn't have a graph stored, so use the slow method
            return IndexReader.terms_within(self, fieldname, text, maxdist,
                                            prefix=prefix)
        gr = self._get_graph()
        return fst.within(gr, text, k=maxdist, prefix=prefix,
                          address=gr.root(fieldname))

    # Column methods

    def has_column(self, fieldname):
        coltype = self.schema[fieldname].column_type
        return coltype and self._perdoc.has_column(fieldname)

    def column_reader(self, fieldname, column=None):
        fieldobj = self.schema[fieldname]
        column = column or fieldobj.column_type
        reader = self._perdoc.column_reader(fieldname, column)

        translate = fieldobj.from_column_value
        creader = columns.TranslatingColumnReader(reader, translate)
        return creader
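
A brief sketch of how the graph-backed terms_within above might drive fuzzy term suggestions. The field name "body" (assumed to be declared with spelling=True) is an illustration only.

def fuzzy_terms(reader, word, maxdist=1):
    # Uses the word graph when has_word_graph() is true; otherwise
    # terms_within itself falls back to the slow base implementation.
    return list(reader.terms_within("body", word, maxdist))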
Example #5
class SegmentReader(IndexReader):
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        self.schema = schema
        self.is_closed = False

        self._segment = segment
        self._segid = self._segment.segment_id()
        self._gen = generation

        # self._storage is a storage object from which to load the segment
        # files. This is different from the general storage (which will be
        # used for caches) if the segment is in a compound file.
        if segment.is_compound():
            # Open the compound file as a storage object
            files = segment.open_compound_file(storage)
            # Use an overlay here instead of just the compound storage; in rare
            # circumstances a segment file may be added after the segment is
            # written.
            self._storage = OverlayStorage(files, storage)
        else:
            self._storage = storage

        # Get subreaders from codec
        self._codec = codec if codec else segment.codec()
        self._terms = self._codec.terms_reader(self._storage, segment)
        self._perdoc = self._codec.per_document_reader(self._storage, segment)

    def codec(self):
        return self._codec

    def segment(self):
        return self._segment

    def storage(self):
        return self._storage

    def has_deletions(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.has_deletions()

    def doc_count(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_count()

    def doc_count_all(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_count_all()

    def is_deleted(self, docnum):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.is_deleted(docnum)

    def generation(self):
        return self._gen

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self._storage,
                               self._segment)

    def __contains__(self, term):
        if self.is_closed:
            raise ReaderClosed
        fieldname, text = term
        if fieldname not in self.schema:
            return False
        text = self._text_to_bytes(fieldname, text)
        return (fieldname, text) in self._terms

    def close(self):
        if self.is_closed:
            raise ReaderClosed("Reader already closed")
        self._terms.close()
        self._perdoc.close()

        # It's possible some weird codec that doesn't use storage might have
        # passed None instead of a storage object
        if self._storage:
            self._storage.close()

        self.is_closed = True

    def stored_fields(self, docnum):
        if self.is_closed:
            raise ReaderClosed
        assert docnum >= 0
        schema = self.schema
        sfs = self._perdoc.stored_fields(docnum)
        # Double-check with schema to filter out removed fields
        return dict(item for item in iteritems(sfs) if item[0] in schema)

    # Delegate doc methods to the per-doc reader

    def all_doc_ids(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.all_doc_ids()

    def iter_docs(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.iter_docs()

    def all_stored_fields(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.all_stored_fields()

    def field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.field_length(fieldname)

    def min_field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_field_length(docnum, fieldname, default)

    def has_vector(self, docnum, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.has_vector(docnum, fieldname)

    #

    def _test_field(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    def indexed_field_names(self):
        return self._terms.indexed_field_names()

    def all_terms(self):
        if self.is_closed:
            raise ReaderClosed
        schema = self.schema
        return ((fieldname, text) for fieldname, text in self._terms.terms()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        schema = self.schema
        return ((fname, text) for fname, text
                in self._terms.terms_from(fieldname, prefix)
                if fname in schema)

    def term_info(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.term_info(fieldname, text)
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def expand_prefix(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        self._test_field(fieldname)
        return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        if self.is_closed:
            raise ReaderClosed
        schema = self.schema
        return ((term, terminfo) for term, terminfo in self._terms.items()
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        self._test_field(fieldname)
        schema = self.schema
        text = self._text_to_bytes(fieldname, text)
        for term, terminfo in self._terms.items_from(fieldname, text):
            if term[0] not in schema:
                continue
            yield (term, terminfo)

    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.frequency(fieldname, text)
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.doc_frequency(fieldname, text)
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        from whoosh.matching.wrappers import FilterMatcher

        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        text = self._text_to_bytes(fieldname, text)
        format_ = self.schema[fieldname].format
        matcher = self._terms.matcher(fieldname, text, format_, scorer=scorer)
        deleted = frozenset(self._perdoc.deleted_docs())
        if deleted:
            matcher = FilterMatcher(matcher, deleted, exclude=True)
        return matcher

    def vector(self, docnum, fieldname, format_=None):
        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = format_ or self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)
        return self._perdoc.vector(docnum, fieldname, vformat)

    def cursor(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        fieldobj = self.schema[fieldname]
        return self._terms.cursor(fieldname, fieldobj)

    def terms_within(self, fieldname, text, maxdist, prefix=0):
        # Replaces the horribly inefficient base implementation with one based
        # on skipping through the word list efficiently using a DFA

        fieldobj = self.schema[fieldname]
        spellfield = fieldobj.spelling_fieldname(fieldname)
        auto = self._codec.automata(self._storage, self._segment)
        fieldcur = self.cursor(spellfield)
        return auto.terms_within(fieldcur, text, maxdist, prefix)

    # Column methods

    def has_column(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        coltype = self.schema[fieldname].column_type
        return coltype and self._perdoc.has_column(fieldname)

    def column_reader(self, fieldname, column=None, reverse=False,
                      translate=True):
        if self.is_closed:
            raise ReaderClosed

        fieldobj = self.schema[fieldname]
        column = column or fieldobj.column_type
        if not column:
            raise Exception("No column for field %r in %r"
                            % (fieldname, self))

        if self._perdoc.has_column(fieldname):
            creader = self._perdoc.column_reader(fieldname, column)
            if reverse:
                creader.set_reverse()
        else:
            # This segment doesn't have a column file for this field, so create
            # a fake column reader that always returns the default value.
            default = column.default_value(reverse)
            creader = columns.EmptyColumnReader(default, self.doc_count_all())

        if translate:
            # Wrap the column in a Translator to give the caller
            # nice values instead of sortable representations
            fcv = fieldobj.from_column_value
            creader = columns.TranslatingColumnReader(creader, fcv)

        return creader
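
A small sketch of consuming the matcher returned by postings above. Deleted documents are already excluded by the FilterMatcher wrapper, so the loop only needs the standard matcher protocol.

def doc_ids_for_term(reader, fieldname, text):
    # Illustrative driver loop; is_active()/id()/next() are the
    # core whoosh matcher methods.
    m = reader.postings(fieldname, text)
    ids = []
    while m.is_active():
        ids.append(m.id())
        m.next()
    return ids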
Example #6
class SegmentReader(IndexReader):
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        self.schema = schema
        self.is_closed = False

        self._segment = segment
        self._segid = self._segment.segment_id()
        self._gen = generation

        # self._storage is a storage object from which to load the segment
        # files. This is different from the general storage (which will be
        # used for caches) if the segment is in a compound file.
        if segment.is_compound():
            # Open the compound file as a storage object
            files = segment.open_compound_file(storage)
            # Use an overlay here instead of just the compound storage; in rare
            # circumstances a segment file may be added after the segment is
            # written.
            self._storage = OverlayStorage(files, storage)
        else:
            self._storage = storage

        # Get subreaders from codec
        self._codec = codec if codec else segment.codec()
        self._terms = self._codec.terms_reader(self._storage, segment)
        self._perdoc = self._codec.per_document_reader(self._storage, segment)
        self._graph = None  # Lazy open with self._get_graph()

    def _get_graph(self):
        if not self._graph:
            self._graph = self._codec.graph_reader(self._storage,
                                                   self._segment)
        return self._graph

    def codec(self):
        return self._codec

    def segment(self):
        return self._segment

    def storage(self):
        return self._storage

    def has_deletions(self):
        return self._perdoc.has_deletions()

    def doc_count(self):
        return self._perdoc.doc_count()

    def doc_count_all(self):
        return self._perdoc.doc_count_all()

    def is_deleted(self, docnum):
        return self._perdoc.is_deleted(docnum)

    def generation(self):
        return self._gen

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self._storage,
                               self._segment)

    def __contains__(self, term):
        fieldname, text = term
        if fieldname not in self.schema:
            return False
        text = self._text_to_bytes(fieldname, text)
        return (fieldname, text) in self._terms

    def close(self):
        self._terms.close()
        self._perdoc.close()
        if self._graph:
            self._graph.close()

        # It's possible some weird codec that doesn't use storage might have
        # passed None instead of a storage object
        if self._storage:
            self._storage.close()

        self.is_closed = True

    def stored_fields(self, docnum):
        assert docnum >= 0
        schema = self.schema
        sfs = self._perdoc.stored_fields(docnum)
        # Double-check with schema to filter out removed fields
        return dict(item for item in iteritems(sfs) if item[0] in schema)

    # Delegate doc methods to the per-doc reader

    def all_doc_ids(self):
        return self._perdoc.all_doc_ids()

    def iter_docs(self):
        return self._perdoc.iter_docs()

    def all_stored_fields(self):
        return self._perdoc.all_stored_fields()

    def field_length(self, fieldname):
        return self._perdoc.field_length(fieldname)

    def min_field_length(self, fieldname):
        return self._perdoc.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        return self._perdoc.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        return self._perdoc.doc_field_length(docnum, fieldname, default)

    def has_vector(self, docnum, fieldname):
        return self._perdoc.has_vector(docnum, fieldname)

    #

    def _test_field(self, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    def all_terms(self):
        schema = self.schema
        return ((fieldname, text) for fieldname, text in self._terms.terms()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        schema = self.schema
        return ((fname, text)
                for fname, text in self._terms.terms_from(fieldname, prefix)
                if fname in schema)

    def term_info(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.term_info(fieldname, text)
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def expand_prefix(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        self._test_field(fieldname)
        return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        schema = self.schema
        return ((term, terminfo) for term, terminfo in self._terms.items()
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        schema = self.schema
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        for term, terminfo in self._terms.items_from(fieldname, text):
            if term[0] not in schema:
                continue
            yield (term, terminfo)

    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.frequency(fieldname, text)
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.doc_frequency(fieldname, text)
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        from whoosh.matching.wrappers import FilterMatcher

        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        text = self._text_to_bytes(fieldname, text)
        format_ = self.schema[fieldname].format
        matcher = self._terms.matcher(fieldname, text, format_, scorer=scorer)
        deleted = frozenset(self._perdoc.deleted_docs())
        if deleted:
            matcher = FilterMatcher(matcher, deleted, exclude=True)
        return matcher

    def vector(self, docnum, fieldname, format_=None):
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = format_ or self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)
        return self._perdoc.vector(docnum, fieldname, vformat)

    # Graph methods

    def has_word_graph(self, fieldname):
        if fieldname not in self.schema:
            return False
        if not self.schema[fieldname].spelling:
            return False

        try:
            gr = self._get_graph()
        except NoGraphError:
            return False

        return gr.has_root(fieldname)

    def word_graph(self, fieldname):
        if not self.has_word_graph(fieldname):
            raise KeyError("No word graph for field %r" % fieldname)
        gr = self._get_graph()
        return fst.Node(gr, gr.root(fieldname))

    def terms_within(self, fieldname, text, maxdist, prefix=0):
        if not self.has_word_graph(fieldname):
            # This reader doesn't have a graph stored, so use the slow method
            return IndexReader.terms_within(self, fieldname, text, maxdist,
                                            prefix=prefix)
        gr = self._get_graph()
        return fst.within(gr, text, k=maxdist, prefix=prefix,
                          address=gr.root(fieldname))

    # Column methods

    def has_column(self, fieldname):
        coltype = self.schema[fieldname].column_type
        return coltype and self._perdoc.has_column(fieldname)

    def column_reader(self, fieldname, column=None, translate=True):
        fieldobj = self.schema[fieldname]
        if not self.has_column(fieldname):
            raise Exception("No column for field %r" % fieldname)

        ctype = column or fieldobj.column_type
        creader = self._perdoc.column_reader(fieldname, ctype)
        if translate:
            # Wrap the column in a Translator to give the caller
            # nice values instead of sortable representations
            fcv = fieldobj.from_column_value
            creader = columns.TranslatingColumnReader(creader, fcv)

        return creader
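
Finally, a hedged lifecycle sketch implied by close() above: unlike Examples #3 and #5, this variant does not raise ReaderClosed on a second close(), so callers should pair each reader with exactly one close(). Here ix is assumed to be a whoosh Index object.

def count_docs(ix):
    # ix.reader() builds the SegmentReader(s) shown above.
    reader = ix.reader()
    try:
        return reader.doc_count()
    finally:
        reader.close()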