Esempio n. 1
0
    def __init__(self, storage, schema, segment):
        """Open the files of one segment for reading.

        :param storage: storage object the segment's files are opened from.
        :param schema: the schema describing the indexed fields.
        :param segment: segment object supplying the file names, document
            counts and deletion information.
        """

        self.storage = storage
        self.schema = schema
        self.segment = segment

        # Use the segment's own uuid when it has one; otherwise make up a
        # random one for this reader
        if not hasattr(segment, "uuid"):
            import uuid
            self.uuid_string = str(uuid.uuid4())
        else:
            self.uuid_string = str(segment.uuid)

        # Term index
        self.termsindex = TermIndexReader(
            storage.open_file(segment.termsindex_filename))

        # The vector index and vector postings file start out unopened
        self.vectorindex = self.vpostfile = None

        # Stored fields
        self.storedfields = StoredFieldReader(
            storage.open_file(segment.storedfields_filename, mapped=False))

        # Field lengths are only read when the schema has scorable fields
        self.fieldlengths = None
        if schema.has_scorable_fields():
            lengthsfile = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(lengthsfile,
                                             segment.doc_count_all())

        # Snapshot the deletion flag and document count of the segment
        self._has_deletions = segment.has_deletions()
        self._doc_count = segment.doc_count()

        # Term postings
        self.postfile = storage.open_file(segment.termposts_filename,
                                          mapped=False)

        # The dawg is loaded only if some field has `spelling` set and the
        # dawg file is actually present in the storage
        self.dawg = None
        if any(field.spelling for field in schema):
            dawgname = segment.dawg_filename
            if storage.file_exists(dawgname):
                self.dawg = DiskNode.load(
                    storage.open_file(dawgname, mapped=False), expand=False)

        # The stored fields file must hold one entry per document
        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()
Esempio n. 2
0
    def __init__(self, storage, schema, segment):
        """Open the files of one segment for reading.

        :param storage: storage object the segment's files are opened from.
        :param schema: the schema describing the indexed fields.
        :param segment: segment object supplying the file names, document
            counts and deletion information.
        """

        self.storage = storage
        self.schema = schema
        self.segment = segment

        # Use the segment's own uuid when it has one; otherwise make up a
        # random one for this reader
        if hasattr(self.segment, "uuid"):
            self.uuid_string = str(self.segment.uuid)
        else:
            import uuid
            self.uuid_string = str(uuid.uuid4())

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Vector index and vector postings: lazy load. (The term postings
        # file is NOT lazy; it is opened unconditionally below.)
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file: only read when the schema has scorable fields
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy methods from underlying segment, so the reader answers these
        # calls with the segment's own bound methods
        self.has_deletions = segment.has_deletions
        self.is_deleted = segment.is_deleted
        self.doc_count = segment.doc_count

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        # The stored fields file must hold one entry per document
        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()
Esempio n. 3
0
    def finish(self, termswriter, doccount, lengthfile):
        """Collect the results of the worker tasks, then write the combined
        field lengths and the merged postings.

        Returns immediately, without writing anything, if there are no tasks.

        :param termswriter: object whose ``add_iter()`` method receives the
            merged posting iterator and a length lookup function.
        :param doccount: document count; sent to every task together with
            the final ``None`` job and used to size the length reader/writer.
        :param lengthfile: file the combined field lengths are written to.
        """

        _fieldlength_totals = self._fieldlength_totals
        if not self.tasks:
            return

        jobqueue = self.jobqueue
        rqueue = self.resultqueue

        # Queue one (None, doccount) item per task -- presumably the signal
        # for the task to finish up and report its results
        for _ in self.tasks:
            jobqueue.put((None, doccount))

        # Take each task's result off the queue BEFORE joining the tasks. If
        # the tasks are multiprocessing processes, a process that has put
        # data on a queue may not terminate until that data has been
        # consumed, so joining first can deadlock.
        runs = []
        lenfilenames = []
        for _ in self.tasks:
            taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
            runs.extend(taskruns)
            lenfilenames.append(lenfilename)
            for fieldnum, total in flentotals.iteritems():
                _fieldlength_totals[fieldnum] += total
            for fieldnum, length in flenmaxes.iteritems():
                if length > self._fieldlength_maxes.get(fieldnum, 0):
                    self._fieldlength_maxes[fieldnum] = length

        for task in self.tasks:
            task.join()

        jobqueue.close()
        rqueue.close()

        # Fold the per-task length files into the final length file, deleting
        # each temporary file once it has been added
        lw = LengthWriter(lengthfile, doccount)
        for lenfilename in lenfilenames:
            sublengths = LengthReader(StructFile(open(lenfilename, "rb")),
                                      doccount)
            lw.add_all(sublengths)
            os.remove(lenfilename)
        lw.close()
        lengths = lw.reader()

        # All runs are merged in a single pass
        iterator = imerge(
            [read_run(runname, count) for runname, count in runs])
        termswriter.add_iter(iterator, lengths.get)
        for runname, count in runs:
            os.remove(runname)

        self.cleanup()
Esempio n. 4
0
    def finish(self, termswriter, doccount, lengthfile):
        """Write the field lengths, then hand all postings to ``termswriter``.

        :param termswriter: object whose ``add_iter()`` method receives the
            posting iterator and a length lookup function.
        :param doccount: document count passed to the length writer/reader.
        :param lengthfile: file the field lengths are written to.
        """

        self._write_lengths(lengthfile, doccount)
        length_reader = LengthReader(None, doccount, self.length_arrays)

        if self._flushed:
            # `_flushed` is set: push out whatever is still in `postbuf`
            # before reading the postings back
            if self.postbuf:
                self.flush()
            postings = self.readback()
        else:
            postings = self.readback_buffer()

        termswriter.add_iter(postings, length_reader.get)
Esempio n. 5
0
 def finish(self, doccount, lengthfile, termtable, postingwriter):
     """Collect the results of the worker tasks, then write the combined
     field lengths and the merged postings.

     Returns immediately, without writing anything, if there are no tasks.

     :param doccount: document count; sent to every worker together with
         the final ``-1`` item and used to size the length reader/writer.
     :param lengthfile: file the combined field lengths are written to.
     :param termtable: passed through to ``write_postings()``.
     :param postingwriter: passed through to ``write_postings()``.
     """

     _fieldlength_totals = self._fieldlength_totals
     if not self.tasks:
         return

     pqueue = self.postingqueue
     rqueue = self.resultsqueue

     # Queue one (-1, doccount) item per worker -- presumably the signal
     # for the worker to finish up and report its results
     for _ in range(self.procs):
         pqueue.put((-1, doccount))

     # Take each task's result off the queue BEFORE joining the tasks. If
     # the tasks are multiprocessing processes, a process that has put data
     # on a queue may not terminate until that data has been consumed, so
     # joining first can deadlock.
     runs = []
     lenfilenames = []
     for _ in self.tasks:
         taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
         runs.extend(taskruns)
         lenfilenames.append(lenfilename)
         for fieldnum, total in flentotals.iteritems():
             _fieldlength_totals[fieldnum] += total
         for fieldnum, length in flenmaxes.iteritems():
             if length > self._fieldlength_maxes.get(fieldnum, 0):
                 self._fieldlength_maxes[fieldnum] = length

     for task in self.tasks:
         task.join()

     # Fold the per-task length files into the final length file, deleting
     # each temporary file once it has been added
     lw = LengthWriter(lengthfile, doccount)
     for lenfilename in lenfilenames:
         sublengths = LengthReader(StructFile(open(lenfilename, "rb")), doccount)
         lw.add_all(sublengths)
         os.remove(lenfilename)
     lw.close()
     lengths = lw.reader()

     # Merge all runs in one pass and write the postings
     iterator = imerge([read_run(runname, count) for runname, count in runs])
     write_postings(self.schema, termtable, lengths, postingwriter, iterator)
     for runname, count in runs:
         os.remove(runname)

     self.cleanup()
Esempio n. 6
0
    def __init__(self, storage, schema, segment):
        """Open the files of one segment for reading.

        :param storage: storage object the segment's files are opened from.
        :param schema: the schema describing the indexed fields.
        :param segment: segment object supplying the file names, document
            counts and deletion information.
        """

        self.storage = storage
        self.schema = schema
        self.segment = segment

        # Use the segment's own uuid when it has one; otherwise make up a
        # random one for this reader
        if not hasattr(segment, "uuid"):
            import uuid
            self.uuid_string = str(uuid.uuid4())
        else:
            self.uuid_string = str(segment.uuid)

        # Term index
        self.termsindex = TermIndexReader(
            storage.open_file(segment.termsindex_filename))

        # The vector index and vector postings file start out unopened
        self.vectorindex = self.vpostfile = None

        # Stored fields
        self.storedfields = StoredFieldReader(
            storage.open_file(segment.storedfields_filename, mapped=False))

        # Field lengths are only read when the schema has scorable fields
        self.fieldlengths = None
        if schema.has_scorable_fields():
            lengthsfile = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(lengthsfile,
                                             segment.doc_count_all())

        # Snapshot the deletion flag and document count of the segment
        self._has_deletions = segment.has_deletions()
        self._doc_count = segment.doc_count()

        # Term postings
        self.postfile = storage.open_file(segment.termposts_filename,
                                          mapped=False)

        # The dawg is loaded only if some field has `spelling` set and the
        # dawg file is actually present in the storage
        self.dawg = None
        if any(field.spelling for field in schema):
            dawgname = segment.dawg_filename
            if storage.file_exists(dawgname):
                self.dawg = DiskNode.load(
                    storage.open_file(dawgname, mapped=False), expand=False)

        # The stored fields file must hold one entry per document
        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()
Esempio n. 7
0
    def finish(self, termswriter, doccount, lengthfile):
        """Write the field lengths, then hand the buffered postings to
        ``termswriter`` in sorted term order.

        :param termswriter: object whose ``add_iter()`` method receives the
            posting iterator and a length lookup function.
        :param doccount: document count passed to the length writer/reader.
        :param lengthfile: file the field lengths are written to.
        """

        from itertools import izip

        postbuf = self.postbuf
        self._write_lengths(lengthfile, doccount)
        length_reader = LengthReader(None, doccount, self.length_arrays)

        def sorted_postings():
            # Each buffer entry is keyed by (fieldname, text); its value is
            # unpacked into parallel sequences that are zipped back into
            # (docnum, weight, valuestring) postings
            for key in sorted(postbuf):
                fieldname, text = key
                for posting in izip(*postbuf[key]):
                    docnum, weight, valuestring = posting
                    yield (fieldname, text, docnum, weight, valuestring)

        termswriter.add_iter(sorted_postings(), length_reader.get)
Esempio n. 8
0
    def finish(self, termswriter, doccount, lengthfile):
        """Write the field lengths, hand any postings to ``termswriter`` and
        clean up.

        :param termswriter: object whose ``add_iter()`` method receives the
            posting iterator and a length lookup function. It is not called
            at all when there are neither in-memory postings nor runs.
        :param doccount: document count passed to the length writer/reader.
        :param lengthfile: file the field lengths are written to.
        """

        self._write_lengths(lengthfile, doccount)
        lengths = LengthReader(None, doccount, self.length_arrays)

        # The "no postings and no runs" case needs no branch of its own:
        # nothing is written for it.
        if self.runs:
            # Runs already exist: dump the current postings as one more run
            # and merge all of the runs
            self.dump_run()
            postiter = imerge([read_run(runname, count)
                               for runname, count in self.runs])
            termswriter.add_iter(postiter, lengths.get)
        elif self.postings:
            # No runs were written: sort the postings in place and use them
            # directly
            self.postings.sort()
            termswriter.add_iter(iter(self.postings), lengths.get)

        self.cleanup()
Esempio n. 9
0
    def __init__(self, storage, schema, segment):
        """Open the files of one segment for reading.

        :param storage: storage object the segment's files are opened from.
        :param schema: the schema describing the indexed fields.
        :param segment: segment object supplying the file names, document
            counts and deletion information.
        """

        self.storage = storage
        self.schema = schema
        self.segment = segment

        # Use the segment's own uuid when it has one; otherwise make up a
        # random one for this reader
        if hasattr(self.segment, "uuid"):
            self.uuid_string = str(self.segment.uuid)
        else:
            import uuid
            self.uuid_string = str(uuid.uuid4())

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Vector index and vector postings: lazy load. (The term postings
        # file is NOT lazy; it is opened unconditionally below.)
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file: only read when the schema has scorable fields
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy methods from underlying segment, so the reader answers these
        # calls with the segment's own bound methods
        self.has_deletions = segment.has_deletions
        self.is_deleted = segment.is_deleted
        self.doc_count = segment.doc_count

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        # The stored fields file must hold one entry per document
        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()
Esempio n. 10
0
    def finish(self, doccount, lengthfile, termtable, postingwriter):
        """Write the field lengths, write any postings and clean up.

        :param doccount: document count passed to the length writer/reader.
        :param lengthfile: file the field lengths are written to.
        :param termtable: passed through to ``write_postings()``.
        :param postingwriter: passed through to ``write_postings()``.
            ``write_postings()`` is not called at all when there are neither
            in-memory postings nor runs.
        """

        self._write_lengths(lengthfile, doccount)
        lengths = LengthReader(None, doccount, self.length_arrays)

        # The "no postings and no runs" case needs no branch of its own:
        # nothing is written for it.
        if self.runs:
            # Runs already exist: dump the current postings as one more run
            # and merge all of the runs
            self.dump_run()
            postiter = imerge(
                [read_run(runname, count) for runname, count in self.runs])
            write_postings(self.schema, termtable, lengths, postingwriter,
                           postiter)
        elif self.postings:
            # No runs were written: sort the postings in place and use them
            # directly
            self.postings.sort()
            write_postings(self.schema, termtable, lengths, postingwriter,
                           iter(self.postings))

        self.cleanup()
Esempio n. 11
0
class SegmentReader(IndexReader):
    """Reads the terms, postings, stored fields, field lengths and term
    vectors of one index segment, and manages the field caches built for it.
    """

    GZIP_CACHES = False

    def __init__(self, storage, schema, segment):
        """Open the files of one segment for reading.

        :param storage: storage object the segment's files are opened from.
        :param schema: the schema describing the indexed fields.
        :param segment: segment object supplying the file names, document
            counts and deletion information.
        """

        self.storage = storage
        self.schema = schema
        self.segment = segment

        # Field caches are keyed by this string (see _fieldkey()). A segment
        # without a "uuid" attribute gets a random one.
        if hasattr(self.segment, "uuid"):
            self.uuid_string = str(self.segment.uuid)
        else:
            import uuid
            self.uuid_string = str(uuid.uuid4())

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Vector index and vector postings: lazy load (see _open_vectors()).
        # The term postings file is NOT lazy; it is opened below.
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file: only read when the schema has scorable fields
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy methods from underlying segment
        self.has_deletions = segment.has_deletions
        self.is_deleted = segment.is_deleted
        self.doc_count = segment.doc_count

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        # The stored fields file must hold one entry per document
        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()

    def generation(self):
        """Returns the ``generation`` attribute of the underlying segment."""

        return self.segment.generation

    def _open_vectors(self):
        """Opens the vector index and the vector postings file, unless the
        vector index is already open.
        """

        if self.vectorindex:
            return

        storage, segment = self.storage, self.segment

        # Vector index
        vf = storage.open_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorReader(vf)

        # Vector postings file
        self.vpostfile = storage.open_file(segment.vectorposts_filename,
                                           mapped=False)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.segment)

    @protected
    def __contains__(self, term):
        """Returns True if ``term`` is a key of the term index."""

        return term in self.termsindex

    def close(self):
        """Closes the stored fields reader, the term index and whichever of
        the postings and vector files are open, drops the caching policy and
        marks the reader as closed.
        """

        self.storedfields.close()
        self.termsindex.close()
        if self.postfile:
            self.postfile.close()
        if self.vectorindex:
            self.vectorindex.close()
        if self.vpostfile:
            self.vpostfile.close()
        #if self.fieldlengths:
        #    self.fieldlengths.close()
        self.caching_policy = None
        self.is_closed = True

    def doc_count_all(self):
        """Returns the document count read from the segment's
        ``doc_count_all()`` when the reader was created.
        """

        return self.dc

    @protected
    def stored_fields(self, docnum):
        """Returns a dictionary of the stored fields of document ``docnum``,
        leaving out any field name that is not in the schema.
        """

        schema = self.schema
        return dict(item for item in self.storedfields[docnum].iteritems()
                    if item[0] in schema)

    @protected
    def all_stored_fields(self):
        """Yields the stored fields dictionary of every document the segment
        does not report as deleted.
        """

        is_deleted = self.segment.is_deleted
        sf = self.stored_fields
        for docnum in xrange(self.segment.doc_count_all()):
            if not is_deleted(docnum):
                yield sf(docnum)

    def field_length(self, fieldname):
        """Returns the segment's ``field_length()`` for the given field."""

        return self.segment.field_length(fieldname)

    @protected
    def doc_field_length(self, docnum, fieldname, default=0):
        """Returns the length of field ``fieldname`` in document ``docnum``
        from the field length reader, or ``default`` if no field length file
        was loaded.
        """

        if self.fieldlengths is None:
            return default
        return self.fieldlengths.get(docnum, fieldname, default=default)

    def max_field_length(self, fieldname):
        """Returns the segment's ``max_field_length()`` for the given field.
        """

        return self.segment.max_field_length(fieldname)

    @protected
    def has_vector(self, docnum, fieldname):
        """Returns True if the vector index has an entry for the given
        document and field. Always False, without opening the vector files,
        for a field whose schema entry has no vector format.
        """

        if self.schema[fieldname].vector:
            self._open_vectors()
            return (docnum, fieldname) in self.vectorindex
        else:
            return False

    @protected
    def __iter__(self):
        """Yields ``(fieldname, text, postcount, totalfreq)`` for every term
        in the term index whose field is in the schema.
        """

        schema = self.schema
        for (fieldname, t), (totalfreq, _, postcount) in self.termsindex:
            if fieldname not in schema:
                continue
            yield (fieldname, t, postcount, totalfreq)

    def _test_field(self, fieldname):
        """Raises ``TermNotFound`` if the field is not in the schema or has
        no format (is not indexed).
        """

        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    @protected
    def iter_from(self, fieldname, text):
        """Yields ``(fieldname, text, postcount, totalfreq)`` for the terms
        the term index returns from ``items_from((fieldname, text))``,
        skipping fields that are not in the schema.

        :raises TermNotFound: if the field does not exist or is not indexed.
        """

        schema = self.schema
        self._test_field(fieldname)
        for (fn, t), (totalfreq, _, postcount) in self.termsindex.items_from(
            (fieldname, text)):
            if fn not in schema:
                continue
            yield (fn, t, postcount, totalfreq)

    @protected
    def _term_info(self, fieldname, text):
        """Returns the term index entry for the given term.

        :raises TermNotFound: if the field does not exist, is not indexed, or
            the term is not in the term index.
        """

        self._test_field(fieldname)
        try:
            return self.termsindex[fieldname, text]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def doc_frequency(self, fieldname, text):
        """Returns item 2 of the term's index entry (yielded as ``postcount``
        by ``__iter__``), or 0 if the term is not found.
        """

        try:
            return self._term_info(fieldname, text)[2]
        except TermNotFound:
            return 0

    def frequency(self, fieldname, text):
        """Returns item 0 of the term's index entry (yielded as ``totalfreq``
        by ``__iter__``), or 0 if the term is not found.
        """

        try:
            return self._term_info(fieldname, text)[0]
        except TermNotFound:
            return 0

    def lexicon(self, fieldname):
        """Returns an iterator of the term texts in the given field.

        :raises TermNotFound: if the field does not exist or is not indexed.
        """

        # The base class has a lexicon() implementation that uses iter_from()
        # and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)

        # If a field cache happens to already be loaded for this field, use it
        # instead of loading the field values from disk
        if self.fieldcache_loaded(fieldname):
            fieldcache = self.fieldcache(fieldname)
            it = iter(fieldcache.texts)
            # The first value in fieldcache.texts is the default; throw it away
            it.next()
            return it

        return self.expand_prefix(fieldname, '')

    @protected
    def expand_prefix(self, fieldname, prefix):
        """Yields the term texts in the given field that start with
        ``prefix``.

        :raises TermNotFound: if the field does not exist or is not indexed.
        """

        # The base class has an expand_prefix() implementation that uses
        # iter_from() and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)

        if self.fieldcache_loaded(fieldname):
            # Bisect into the loaded cache's texts and walk forward while the
            # prefix still matches
            texts = self.fieldcache(fieldname).texts
            i = bisect_left(texts, prefix)
            while i < len(texts) and texts[i].startswith(prefix):
                yield texts[i]
                i += 1
        else:
            for fn, t in self.termsindex.keys_from((fieldname, prefix)):
                if fn != fieldname or not t.startswith(prefix):
                    break
                yield t

    def postings(self, fieldname, text, scorer=None):
        """Returns a matcher over the postings of the given term, with the
        segment's deleted documents filtered out if there are any.

        :param scorer: passed through to the posting reader/matcher.
        :raises TermNotFound: if the term is not in the term index.
        """

        try:
            offset = self.termsindex[fieldname, text][1]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        format = self.schema[fieldname].format
        if isinstance(offset, (int, long)):
            # An integer is an offset into the postings file
            postreader = FilePostingReader(self.postfile,
                                           offset,
                                           format,
                                           scorer=scorer,
                                           fieldname=fieldname,
                                           text=text)
        else:
            # Otherwise the term index entry carries the postings themselves
            docids, weights, values, maxwol, minlength = offset
            postreader = ListMatcher(docids,
                                     weights,
                                     values,
                                     format,
                                     scorer,
                                     maxwol=maxwol,
                                     minlength=minlength)

        deleted = self.segment.deleted
        if deleted:
            postreader = FilterMatcher(postreader, deleted, exclude=True)

        return postreader

    def vector(self, docnum, fieldname):
        """Returns a posting reader over the stored term vector of the given
        document and field.

        :raises TermNotFound: if the field is not in the schema.
        :raises Exception: if the field has no vector format, or no vector
            is stored for the document.
        """

        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)

        self._open_vectors()
        offset = self.vectorindex.get((docnum, fieldname))
        if offset is None:
            raise Exception("No vector found for document"
                            " %s field %r" % (docnum, fieldname))

        return FilePostingReader(self.vpostfile,
                                 offset,
                                 vformat,
                                 stringids=True)

    # Field cache methods

    def supports_caches(self):
        """Always returns True."""

        return True

    def set_caching_policy(self, cp=None, save=True, storage=None):
        """This method lets you control the caching policy of the reader. You
        can either pass a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
        as the first argument, *or* use the `save` and `storage` keywords to
        alter the default caching policy::

            # Use a custom field caching policy object
            reader.set_caching_policy(MyPolicy())

            # Use the default caching policy but turn off saving caches to disk
            reader.set_caching_policy(save=False)

            # Use the default caching policy but save caches to a custom storage
            from whoosh.filedb.filestore import FileStorage
            mystorage = FileStorage("path/to/cachedir")
            reader.set_caching_policy(storage=mystorage)

        :param cp: a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
            object. If this argument is not given, the default caching policy
            is used.
        :param save: save field caches to disk for re-use. If a caching policy
            object is specified using `cp`, this argument is ignored.
        :param storage: a custom :class:`whoosh.store.Storage` object to use
            for saving field caches. If a caching policy object is specified
            using `cp` or `save` is `False`, this argument is ignored.
        """

        if not cp:
            if not save:
                # Saving is turned off: the policy gets no storage at all
                storage = None
            elif storage is None:
                # Saving is on and no custom storage was given: save next to
                # the index. A custom storage, when given, is kept as-is.
                storage = self.storage
            cp = DefaultFieldCachingPolicy(self.segment.name, storage=storage)

        # A policy class (rather than an instance) was given: instantiate it
        if type(cp) is type:
            cp = cp()

        self.caching_policy = cp

    def _fieldkey(self, fieldname):
        """Returns the caching policy key for the given field name: the
        reader's uuid string and the field name joined by a slash.
        """

        return "%s/%s" % (self.uuid_string, fieldname)

    def define_facets(self, name, qs, save=SAVE_BY_DEFAULT):
        """Builds a cache from ``qs`` with the caching policy's cache class
        and stores it under ``name``. Does nothing if a cache is already
        available under that name.

        :param name: the name to store the cache under; must not be the name
            of a field in the schema.
        :param qs: passed to the cache class's ``from_lists()``.
        :param save: passed to the caching policy's ``put()``.
        :raises Exception: if ``name`` is the name of a field.
        """

        if name in self.schema:
            raise Exception(
                "Can't define facets using the name of a field (%r)" % name)

        if self.fieldcache_available(name):
            # Don't recreate the cache if it already exists
            return

        cache = self.caching_policy.get_class().from_lists(
            qs, self.doc_count_all())
        self.caching_policy.put(self._fieldkey(name), cache, save=save)

    def fieldcache(self, fieldname, save=SAVE_BY_DEFAULT):
        """Returns a :class:`whoosh.filedb.fieldcache.FieldCache` object for
        the given field.

        :param fieldname: the name of the field to get a cache for.
        :param save: if True (the default), the cache is saved to disk if it
            doesn't already exist.
        """

        key = self._fieldkey(fieldname)
        fc = self.caching_policy.get(key)
        if not fc:
            # Nothing cached yet: build the cache from the field and hand it
            # to the caching policy
            fc = FieldCache.from_field(self, fieldname)
            self.caching_policy.put(key, fc, save=save)
        return fc

    def fieldcache_available(self, fieldname):
        """Returns True if a field cache exists for the given field (either in
        memory already or on disk).
        """

        return self._fieldkey(fieldname) in self.caching_policy

    def fieldcache_loaded(self, fieldname):
        """Returns True if a field cache for the given field is in memory.
        """

        return self.caching_policy.is_loaded(self._fieldkey(fieldname))

    def unload_fieldcache(self, name):
        """Asks the caching policy to delete the cache stored under
        ``name``.
        """

        self.caching_policy.delete(self._fieldkey(name))

    # Sorting and faceting methods

    def key_fn(self, fields):
        """Returns a function mapping a document number to its sort key.

        :param fields: a field name, or a sequence of field names. With more
            than one field the key is a tuple with one item per field;
            otherwise it is the single field cache's ``key_for`` method.
        """

        if isinstance(fields, basestring):
            fields = (fields, )

        if len(fields) > 1:
            fcs = [self.fieldcache(fn) for fn in fields]
            return lambda docnum: tuple(fc.key_for(docnum) for fc in fcs)
        else:
            return self.fieldcache(fields[0]).key_for

    def sort_docs_by(self, fields, docnums, reverse=False):
        """Returns ``docnums`` as a list sorted by the keys of the given
        field(s).
        """

        keyfn = self.key_fn(fields)
        return sorted(docnums, key=keyfn, reverse=reverse)

    def key_docs_by(self, fields, docnums, limit, reverse=False, offset=0):
        """Returns a list of ``(key, docnum + offset)`` pairs for the given
        document numbers.

        :param limit: if None, all pairs are returned unsorted; otherwise
            only the ``limit`` smallest pairs (or largest, if ``reverse`` is
            true) are returned.
        :param offset: added to every document number in the result.
        """

        keyfn = self.key_fn(fields)

        if limit is None:
            # Don't bother sorting, the caller will do that
            return [(keyfn(docnum), docnum + offset) for docnum in docnums]
        else:
            # A non-reversed sort (the usual case) is inefficient because we
            # have to use nsmallest, but I can't think of a cleverer thing to
            # do right now. I thought I had an idea, but I was wrong.
            op = nlargest if reverse else nsmallest

            return op(limit,
                      ((keyfn(docnum), docnum + offset) for docnum in docnums))
Esempio n. 12
0
class SegmentReader(IndexReader):
    """Reads the terms, postings, stored fields, field lengths and term
    vectors of one index segment.
    """

    def __init__(self, storage, schema, segment, generation=None):
        """Open the files of one segment for reading.

        :param storage: storage object the segment's files are opened from.
        :param schema: the schema describing the indexed fields.
        :param segment: segment object supplying the file names, document
            counts and deletion information.
        :param generation: value returned by :meth:`generation`.
        """

        self.storage = storage
        self.schema = schema
        self.segment = segment
        self._generation = generation

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Vector index and vector postings: lazy load (see _open_vectors()).
        # The term postings file is NOT lazy; it is opened below.
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file: only read when the schema has scorable fields
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy methods from underlying segment
        self.has_deletions = segment.has_deletions
        self.is_deleted = segment.is_deleted
        self.doc_count = segment.doc_count

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        # The stored fields file must hold one entry per document
        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.is_closed = False
        self._sync_lock = Lock()

    def _open_vectors(self):
        """Opens the vector index and the vector postings file, unless the
        vector index is already open.
        """

        if self.vectorindex:
            return

        storage, segment = self.storage, self.segment

        # Vector index
        vf = storage.open_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorReader(vf)

        # Vector postings file
        self.vpostfile = storage.open_file(segment.vectorposts_filename,
                                           mapped=False)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.segment)

    @protected
    def __contains__(self, term):
        """Returns True if ``term`` is a key of the term index."""

        return term in self.termsindex

    def generation(self):
        """Returns the generation given to the constructor."""

        return self._generation

    def close(self):
        """Closes the stored fields reader, the term index and whichever of
        the postings and vector files are open, and marks the reader as
        closed.
        """

        self.storedfields.close()
        self.termsindex.close()
        if self.postfile:
            self.postfile.close()
        if self.vectorindex:
            self.vectorindex.close()
        # _open_vectors() opens the vector postings file together with the
        # vector index, so it has to be closed here as well
        if self.vpostfile:
            self.vpostfile.close()
        #if self.fieldlengths:
        #    self.fieldlengths.close()
        self.is_closed = True

    def doc_count_all(self):
        """Returns the document count read from the segment's
        ``doc_count_all()`` when the reader was created.
        """

        return self.dc

    def field(self, fieldname):
        """Returns the schema's entry for the given field name."""

        return self.schema[fieldname]

    def scorable(self, fieldname):
        """Returns the ``scorable`` attribute of the given field."""

        return self.schema[fieldname].scorable

    def scorable_names(self):
        """Returns the schema's ``scorable_names()``."""

        return self.schema.scorable_names()

    def vector_names(self):
        """Returns the schema's ``vector_names()``."""

        return self.schema.vector_names()

    def format(self, fieldname):
        """Returns the ``format`` attribute of the given field."""

        return self.schema[fieldname].format

    def vector_format(self, fieldname):
        """Returns the ``vector`` attribute of the given field."""

        return self.schema[fieldname].vector

    @protected
    def stored_fields(self, docnum):
        """Returns a dictionary of the stored fields of document ``docnum``,
        leaving out any field name that is not in the schema.
        """

        schema = self.schema
        return dict(item for item
                    in self.storedfields[docnum].iteritems()
                    if item[0] in schema)

    @protected
    def all_stored_fields(self):
        """Yields the stored fields dictionary of every document the segment
        does not report as deleted.
        """

        is_deleted = self.segment.is_deleted
        sf = self.stored_fields
        for docnum in xrange(self.segment.doc_count_all()):
            if not is_deleted(docnum):
                yield sf(docnum)

    def field_length(self, fieldname):
        """Returns the segment's ``field_length()`` for the given field."""

        return self.segment.field_length(fieldname)

    @protected
    def doc_field_length(self, docnum, fieldname, default=0):
        """Returns the length of field ``fieldname`` in document ``docnum``
        from the field length reader, or ``default`` if no field length file
        was loaded.
        """

        if self.fieldlengths is None:
            return default
        return self.fieldlengths.get(docnum, fieldname, default=default)

    def max_field_length(self, fieldname):
        """Returns the segment's ``max_field_length()`` for the given field.
        """

        return self.segment.max_field_length(fieldname)

    @protected
    def has_vector(self, docnum, fieldname):
        """Returns True if the vector index has an entry for the given
        document and field. Opens the vector files if necessary.
        """

        self._open_vectors()
        return (docnum, fieldname) in self.vectorindex

    @protected
    def __iter__(self):
        """Yields ``(fieldname, text, postcount, totalfreq)`` for every term
        in the term index whose field is in the schema.
        """

        schema = self.schema
        for (fieldname, t), (totalfreq, _, postcount) in self.termsindex:
            if fieldname not in schema:
                continue
            yield (fieldname, t, postcount, totalfreq)

    def _test_field(self, fieldname):
        """Raises ``TermNotFound`` if the field is not in the schema or has
        no format (is not indexed).
        """

        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    @protected
    def iter_from(self, fieldname, text):
        """Yields ``(fieldname, text, postcount, totalfreq)`` for the terms
        the term index returns from ``items_from((fieldname, text))``,
        skipping fields that are not in the schema.

        :raises TermNotFound: if the field does not exist or is not indexed.
        """

        schema = self.schema
        self._test_field(fieldname)
        for (fn, t), (totalfreq, _, postcount) in self.termsindex.items_from((fieldname, text)):
            if fn not in schema:
                continue
            yield (fn, t, postcount, totalfreq)

    @protected
    def _term_info(self, fieldname, text):
        """Returns the term index entry for the given term.

        :raises TermNotFound: if the field does not exist, is not indexed, or
            the term is not in the term index.
        """

        self._test_field(fieldname)
        try:
            return self.termsindex[(fieldname, text)]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def doc_frequency(self, fieldname, text):
        """Returns item 2 of the term's index entry (yielded as ``postcount``
        by ``__iter__``), or 0 if the term is not in the term index.

        :raises TermNotFound: if the field does not exist or is not indexed.
        """

        self._test_field(fieldname)
        try:
            return self._term_info(fieldname, text)[2]
        except TermNotFound:
            return 0

    def frequency(self, fieldname, text):
        """Returns item 0 of the term's index entry (yielded as ``totalfreq``
        by ``__iter__``), or 0 if the term is not in the term index.

        :raises TermNotFound: if the field does not exist or is not indexed.
        """

        self._test_field(fieldname)
        try:
            return self._term_info(fieldname, text)[0]
        except TermNotFound:
            return 0

    def lexicon(self, fieldname):
        """Returns an iterator of the term texts in the given field.

        :raises TermNotFound: if the field does not exist or is not indexed.
        """

        # The base class has a lexicon() implementation that uses iter_from()
        # and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)
        return self.expand_prefix(fieldname, '')

    @protected
    def expand_prefix(self, fieldname, prefix):
        """Yields the term texts in the given field that start with
        ``prefix``.

        :raises TermNotFound: if the field does not exist or is not indexed.
        """

        # The base class has an expand_prefix() implementation that uses
        # iter_from() and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)
        for fn, t in self.termsindex.keys_from((fieldname, prefix)):
            if fn != fieldname or not t.startswith(prefix):
                return
            yield t

    def postings(self, fieldname, text, exclude_docs=frozenset(), scorer=None):
        """Returns a posting reader for the given term, wrapped so that the
        segment's deleted documents and ``exclude_docs`` are left out when
        there is anything to exclude.

        :param exclude_docs: document numbers to leave out, in addition to
            the segment's deleted documents.
        :param scorer: passed through to the posting reader.
        :raises TermNotFound: if the field does not exist, is not indexed, or
            the term is not in the term index.
        """

        self._test_field(fieldname)
        format = self.format(fieldname)
        try:
            offset = self.termsindex[(fieldname, text)][1]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        # Combine the segment's deleted documents with the caller's set
        if self.segment.deleted and exclude_docs:
            exclude_docs = self.segment.deleted | exclude_docs
        elif self.segment.deleted:
            exclude_docs = self.segment.deleted

        postreader = FilePostingReader(self.postfile, offset, format,
                                       scorer=scorer, fieldname=fieldname,
                                       text=text)
        if exclude_docs:
            postreader = ExcludeMatcher(postreader, exclude_docs)

        return postreader

    def vector(self, docnum, fieldname):
        """Returns a posting reader over the stored term vector of the given
        document and field.

        :raises TermNotFound: if the field is not in the schema.
        :raises Exception: if the field has no vector format, or no vector
            is stored for the document.
        """

        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = self.vector_format(fieldname)
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)

        self._open_vectors()
        offset = self.vectorindex.get((docnum, fieldname))
        if offset is None:
            raise Exception("No vector found"
                            " for document %s field %r" % (docnum, fieldname))

        return FilePostingReader(self.vpostfile, offset, vformat, stringids=True)
Esempio n. 13
0
class SegmentReader(IndexReader):
    """Reader over the files of a single index segment.

    The constructor opens the segment's term index, stored fields, field
    lengths (when the schema has scorable fields), postings and, when
    present, word graph (DAWG) files. The term vector files are opened
    lazily the first time a vector is requested.
    """

    GZIP_CACHES = False

    def __init__(self, storage, schema, segment):
        """Open the segment's files and install the default caching policy.

        :param storage: storage object used to open the segment's files.
        :param schema: the index schema.
        :param segment: segment object supplying filenames, document counts
            and deletion information.
        """
        self.storage = storage
        self.schema = schema
        self.segment = segment

        # This string prefixes every field cache key (see _fieldkey()).
        # Segments without a uuid attribute get a random one.
        if hasattr(self.segment, "uuid"):
            self.uuid_string = str(self.segment.uuid)
        else:
            import uuid
            self.uuid_string = str(uuid.uuid4())

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Term vector index, and vector postings: lazy load
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy info from underlying segment
        self._has_deletions = segment.has_deletions()
        self._doc_count = segment.doc_count()

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        # Dawg file
        self.dawg = None
        if any(field.spelling for field in self.schema):
            fname = segment.dawg_filename
            if self.storage.file_exists(fname):
                dawgfile = self.storage.open_file(fname, mapped=False)
                self.dawg = DiskNode.load(dawgfile, expand=False)

        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()

    def has_deletions(self):
        """Return the segment's has_deletions() value captured at open time.
        """
        return self._has_deletions

    def doc_count(self):
        """Return the segment's doc_count() value captured at open time."""
        return self._doc_count

    def is_deleted(self, docnum):
        """Return the segment's answer to whether ``docnum`` is deleted."""
        return self.segment.is_deleted(docnum)

    def generation(self):
        """Return the generation of the underlying segment."""
        return self.segment.generation

    def _open_vectors(self):
        """Open the vector index and vector postings files on first use."""
        # Compare against None instead of relying on truthiness, so that a
        # reader object that evaluates as false is not opened a second time.
        if self.vectorindex is not None:
            return

        storage, segment = self.storage, self.segment

        # Vector index
        vf = storage.open_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorReader(vf)

        # Vector postings file
        self.vpostfile = storage.open_file(segment.vectorposts_filename,
                                           mapped=False)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.segment)

    def __contains__(self, term):
        """Return True if ``term`` (a ``(fieldname, text)`` key) is in the
        term index.
        """
        return term in self.termsindex

    def close(self):
        """Close the files held by this reader and drop the caching policy.
        """
        self.storedfields.close()
        self.termsindex.close()
        if self.postfile:
            self.postfile.close()
        # "is not None" for the same reason as in _open_vectors()
        if self.vectorindex is not None:
            self.vectorindex.close()
        if self.vpostfile:
            self.vpostfile.close()
        #if self.fieldlengths:
        #    self.fieldlengths.close()
        self.caching_policy = None
        self.is_closed = True

    def doc_count_all(self):
        """Return the segment's doc_count_all() value captured at open time.
        """
        return self.dc

    def stored_fields(self, docnum):
        """Return a dict of the stored fields of document ``docnum``,
        restricted to field names that are in the schema.
        """
        assert docnum >= 0
        schema = self.schema
        return dict(item for item in iteritems(self.storedfields[docnum])
                    if item[0] in schema)

    def all_stored_fields(self):
        """Yield the stored fields dict of every undeleted document."""
        is_deleted = self.segment.is_deleted
        sf = self.stored_fields
        for docnum in xrange(self.segment.doc_count_all()):
            if not is_deleted(docnum):
                yield sf(docnum)

    def field_length(self, fieldname):
        """Delegate to the segment's field_length()."""
        return self.segment.field_length(fieldname)

    def min_field_length(self, fieldname):
        """Delegate to the segment's min_field_length()."""
        return self.segment.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        """Delegate to the segment's max_field_length()."""
        return self.segment.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        """Return the length of ``fieldname`` in document ``docnum``.

        ``default`` is returned when no field lengths file was opened (the
        schema has no scorable fields) and is also passed on to the length
        reader's lookup.
        """
        if self.fieldlengths is None:
            return default
        return self.fieldlengths.get(docnum, fieldname, default=default)

    def has_vector(self, docnum, fieldname):
        """Return True if a term vector is stored for the document/field.

        The vector files are only opened when the schema says the field
        has a vector format.
        """
        if self.schema[fieldname].vector:
            self._open_vectors()
            return (docnum, fieldname) in self.vectorindex
        else:
            return False

    def _test_field(self, fieldname):
        """Raise TermNotFound unless ``fieldname`` is in the schema and has
        an index format.
        """
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    def all_terms(self):
        """Return an iterator of ``(fieldname, text)`` for every term whose
        field is in the schema.
        """
        schema = self.schema
        return ((fieldname, text)
                for fieldname, text in self.termsindex.keys()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        """Return an iterator of ``(fieldname, text)`` term keys starting
        from ``(fieldname, prefix)``, skipping fields not in the schema.
        """
        self._test_field(fieldname)
        schema = self.schema
        return ((fname, text)
                for fname, text in self.termsindex.keys_from((fieldname,
                                                              prefix))
                if fname in schema)

    def term_info(self, fieldname, text):
        """Return the term index entry for ``fieldname:text``.

        :raises TermNotFound: when the term is not in the term index.
        """
        self._test_field(fieldname)
        try:
            return self.termsindex[fieldname, text]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def _texts_in_fieldcache(self, fieldname, prefix=''):
        """Yield the texts held in the field cache for ``fieldname``,
        optionally limited to those starting with ``prefix``.
        """
        # The first value in a fieldcache is the default
        texts = self.fieldcache(fieldname).texts[1:]
        if prefix:
            i = bisect_left(texts, prefix)
            while i < len(texts) and texts[i].startswith(prefix):
                yield texts[i]
                i += 1
        else:
            for text in texts:
                yield text

    def expand_prefix(self, fieldname, prefix):
        """Return an iterator of the terms in ``fieldname`` that start with
        ``prefix``.
        """
        self._test_field(fieldname)
        # If a fieldcache for the field is already loaded, we already have the
        # values for the field in memory, so just yield them from there
        if self.fieldcache_loaded(fieldname):
            return self._texts_in_fieldcache(fieldname, prefix)
        else:
            return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        """Return an iterator of all the terms in ``fieldname``."""
        self._test_field(fieldname)
        # If a fieldcache for the field is already loaded, we already have the
        # values for the field in memory, so just yield them from there
        if self.fieldcache_loaded(fieldname):
            return self._texts_in_fieldcache(fieldname)
        else:
            return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        """Return an iterator of ``(term, terminfo)`` pairs for every term
        whose field is in the schema.
        """
        schema = self.schema
        return ((term, terminfo) for term, terminfo in self.termsindex.items()
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        """Yield ``(term, terminfo)`` pairs starting from the key
        ``(fieldname, text)``, skipping fields not in the schema.
        """
        schema = self.schema
        self._test_field(fieldname)
        for term, terminfo in self.termsindex.items_from((fieldname, text)):
            if term[0] not in schema:
                continue
            yield (term, terminfo)

    def frequency(self, fieldname, text):
        """Return the term index's frequency for the term, or 0 when the
        term is not in the index.
        """
        self._test_field(fieldname)
        try:
            return self.termsindex.frequency((fieldname, text))
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        """Return the term index's document frequency for the term, or 0
        when the term is not in the index.
        """
        self._test_field(fieldname)
        try:
            return self.termsindex.doc_frequency((fieldname, text))
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        """Return a matcher over the postings of the term ``fieldname:text``.

        :param scorer: optional scorer passed through to the matcher.
        :raises TermNotFound: when the term is not in the term index.
        """
        try:
            terminfo = self.termsindex[fieldname, text]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        format = self.schema[fieldname].format
        postings = terminfo.postings
        if isinstance(postings, integer_types):
            # An integer is an offset into the postings file
            postreader = FilePostingReader(self.postfile,
                                           postings,
                                           format,
                                           scorer=scorer,
                                           term=(fieldname, text))
        else:
            # Otherwise the postings are stored inline in the term info
            docids, weights, values = postings
            postreader = ListMatcher(docids,
                                     weights,
                                     values,
                                     format,
                                     scorer=scorer,
                                     term=(fieldname, text),
                                     terminfo=terminfo)

        # Hide documents that are marked as deleted in the segment
        deleted = self.segment.deleted
        if deleted:
            postreader = FilterMatcher(postreader, deleted, exclude=True)

        return postreader

    def vector(self, docnum, fieldname):
        """Return a posting reader over the stored term vector of a document.

        :raises TermNotFound: when ``fieldname`` is not in the schema.
        :raises Exception: when the field stores no vectors.
        :raises KeyError: when no vector is recorded for the document/field.
        """
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)

        self._open_vectors()
        # A missing entry may surface either as a KeyError or as a None
        # return value from get(); treat both as "no vector" so a None
        # offset is never handed to the posting reader.
        try:
            offset = self.vectorindex.get((docnum, fieldname))
        except KeyError:
            offset = None
        if offset is None:
            raise KeyError("No vector found for document "
                           "%s field %r" % (docnum, fieldname))

        return FilePostingReader(self.vpostfile,
                                 offset,
                                 vformat,
                                 stringids=True)

    # DAWG methods

    def has_word_graph(self, fieldname):
        """Return True if the field is a spelling field in the schema and
        is present in the loaded word graph.
        """
        if fieldname not in self.schema:
            return False
        if not self.schema[fieldname].spelling:
            return False
        if self.dawg:
            return fieldname in self.dawg
        return False

    def word_graph(self, fieldname):
        """Return the word graph edge for ``fieldname``.

        :raises Exception: when has_word_graph() is false for the field.
        """
        if not self.has_word_graph(fieldname):
            raise Exception("No word graph for field %r" % fieldname)
        return self.dawg.edge(fieldname)

    # Field cache methods

    def supports_caches(self):
        return True

    def set_caching_policy(self, cp=None, save=True, storage=None):
        """This method lets you control the caching policy of the reader. You
        can either pass a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
        as the first argument, *or* use the `save` and `storage` keywords to
        alter the default caching policy::

            # Use a custom field caching policy object
            reader.set_caching_policy(MyPolicy())

            # Use the default caching policy but turn off saving caches to disk
            reader.set_caching_policy(save=False)

            # Use the default caching policy but save caches to a custom
            # storage
            from whoosh.filedb.filestore import FileStorage
            mystorage = FileStorage("path/to/cachedir")
            reader.set_caching_policy(storage=mystorage)

        :param cp: a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
            object. If this argument is not given, the default caching policy
            is used.
        :param save: save field caches to disk for re-use. If a caching policy
            object is specified using `cp`, this argument is ignored.
        :param storage: a custom :class:`whoosh.store.Storage` object to use
            for saving field caches. If a caching policy object is specified
            using `cp` or `save` is `False`, this argument is ignored.
        """

        if not cp:
            if save and storage is None:
                storage = self.storage
            elif not save:
                storage = None
            cp = DefaultFieldCachingPolicy(self.segment.name, storage=storage)

        # A policy class (rather than an instance) is instantiated here
        if type(cp) is type:
            cp = cp()

        self.caching_policy = cp

    def _fieldkey(self, fieldname):
        """Return the caching policy key for ``fieldname`` in this segment.
        """
        return "%s/%s" % (self.uuid_string, fieldname)

    def fieldcache(self, fieldname, save=SAVE_BY_DEFAULT):
        """Returns a :class:`whoosh.filedb.fieldcache.FieldCache` object for
        the given field.

        :param fieldname: the name of the field to get a cache for.
        :param save: if True (the default), the cache is saved to disk if it
            doesn't already exist.
        """

        key = self._fieldkey(fieldname)
        fc = self.caching_policy.get(key)
        if not fc:
            fc = FieldCache.from_field(self, fieldname)
            self.caching_policy.put(key, fc, save=save)
        return fc

    def fieldcache_available(self, fieldname):
        """Returns True if a field cache exists for the given field (either in
        memory already or on disk).
        """

        return self._fieldkey(fieldname) in self.caching_policy

    def fieldcache_loaded(self, fieldname):
        """Returns True if a field cache for the given field is in memory.
        """

        return self.caching_policy.is_loaded(self._fieldkey(fieldname))

    def unload_fieldcache(self, name):
        """Delete the field cache for ``name`` from the caching policy."""
        self.caching_policy.delete(self._fieldkey(name))
Esempio n. 14
0
class SegmentReader(IndexReader):
    """Reader over the files of a single index segment.

    The constructor opens the segment's term index, stored fields, field
    lengths (when the schema has scorable fields) and postings files. The
    term vector files are opened lazily the first time they are needed.
    """

    GZIP_CACHES = False

    def __init__(self, storage, schema, segment):
        """Open the segment's files and install the default caching policy.

        :param storage: storage object used to open the segment's files.
        :param schema: the index schema.
        :param segment: segment object supplying filenames, document counts
            and deletion information.
        """
        self.storage = storage
        self.schema = schema
        self.segment = segment

        # This string prefixes every field cache key (see _fieldkey()).
        # Segments without a uuid attribute get a random one.
        if hasattr(self.segment, "uuid"):
            self.uuid_string = str(self.segment.uuid)
        else:
            import uuid
            self.uuid_string = str(uuid.uuid4())

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Vector index and vector postings: lazy load (the term postings
        # file is opened eagerly further down)
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy methods from underlying segment
        self.has_deletions = segment.has_deletions
        self.is_deleted = segment.is_deleted
        self.doc_count = segment.doc_count

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()

    def generation(self):
        """Return the generation of the underlying segment."""
        return self.segment.generation

    def _open_vectors(self):
        """Open the vector index and vector postings files on first use."""
        # Compare against None instead of relying on truthiness, so that a
        # reader object that evaluates as false is not opened a second time.
        if self.vectorindex is not None:
            return

        storage, segment = self.storage, self.segment

        # Vector index
        vf = storage.open_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorReader(vf)

        # Vector postings file
        self.vpostfile = storage.open_file(segment.vectorposts_filename,
                                           mapped=False)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.segment)

    @protected
    def __contains__(self, term):
        """Return True if ``term`` (a ``(fieldname, text)`` key) is in the
        term index.
        """
        return term in self.termsindex

    def close(self):
        """Close the files held by this reader and drop the caching policy.
        """
        self.storedfields.close()
        self.termsindex.close()
        if self.postfile:
            self.postfile.close()
        # "is not None" for the same reason as in _open_vectors()
        if self.vectorindex is not None:
            self.vectorindex.close()
        if self.vpostfile:
            self.vpostfile.close()
        #if self.fieldlengths:
        #    self.fieldlengths.close()
        self.caching_policy = None
        self.is_closed = True

    def doc_count_all(self):
        """Return the segment's doc_count_all() value captured at open time.
        """
        return self.dc

    @protected
    def stored_fields(self, docnum):
        """Return a dict of the stored fields of document ``docnum``,
        restricted to field names that are in the schema.
        """
        schema = self.schema
        return dict(item for item
                    in self.storedfields[docnum].iteritems()
                    if item[0] in schema)

    @protected
    def all_stored_fields(self):
        """Yield the stored fields dict of every undeleted document."""
        is_deleted = self.segment.is_deleted
        sf = self.stored_fields
        for docnum in xrange(self.segment.doc_count_all()):
            if not is_deleted(docnum):
                yield sf(docnum)

    def field_length(self, fieldname):
        """Delegate to the segment's field_length()."""
        return self.segment.field_length(fieldname)

    @protected
    def doc_field_length(self, docnum, fieldname, default=0):
        """Return the length of ``fieldname`` in document ``docnum``.

        ``default`` is returned when no field lengths file was opened (the
        schema has no scorable fields) and is also passed on to the length
        reader's lookup.
        """
        if self.fieldlengths is None:
            return default
        return self.fieldlengths.get(docnum, fieldname, default=default)

    def max_field_length(self, fieldname):
        """Delegate to the segment's max_field_length()."""
        return self.segment.max_field_length(fieldname)

    @protected
    def has_vector(self, docnum, fieldname):
        """Return True if a term vector is stored for the document/field.

        The vector files are only opened when the schema says the field
        has a vector format.
        """
        if self.schema[fieldname].vector:
            self._open_vectors()
            return (docnum, fieldname) in self.vectorindex
        else:
            return False

    @protected
    def __iter__(self):
        """Yield ``(fieldname, text, postcount, totalfreq)`` for every term
        whose field is in the schema.
        """
        schema = self.schema
        for (fieldname, t), (totalfreq, _, postcount) in self.termsindex:
            if fieldname not in schema:
                continue
            yield (fieldname, t, postcount, totalfreq)

    def _test_field(self, fieldname):
        """Raise TermNotFound unless ``fieldname`` is in the schema and has
        an index format.
        """
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    @protected
    def iter_from(self, fieldname, text):
        """Yield ``(fieldname, text, postcount, totalfreq)`` tuples starting
        from the key ``(fieldname, text)``, skipping fields not in the schema.
        """
        schema = self.schema
        self._test_field(fieldname)
        start = (fieldname, text)
        for (fn, t), (totalfreq, _, postcount) in self.termsindex.items_from(start):
            if fn not in schema:
                continue
            yield (fn, t, postcount, totalfreq)

    @protected
    def _term_info(self, fieldname, text):
        """Return the term index entry for ``fieldname:text``.

        :raises TermNotFound: when the term is not in the term index.
        """
        self._test_field(fieldname)
        try:
            return self.termsindex[fieldname, text]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def doc_frequency(self, fieldname, text):
        """Return the third item of the term's index entry, or 0 when the
        term is not in the index.
        """
        self._test_field(fieldname)
        try:
            return self._term_info(fieldname, text)[2]
        except TermNotFound:
            return 0

    def frequency(self, fieldname, text):
        """Return the first item of the term's index entry, or 0 when the
        term is not in the index.
        """
        self._test_field(fieldname)
        try:
            return self._term_info(fieldname, text)[0]
        except TermNotFound:
            return 0

    def lexicon(self, fieldname):
        """Return an iterator of all the terms in ``fieldname``."""
        # The base class has a lexicon() implementation that uses iter_from()
        # and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)

        # If a field cache happens to already be loaded for this field, use it
        # instead of loading the field values from disk
        if self.fieldcache_loaded(fieldname):
            fieldcache = self.fieldcache(fieldname)
            it = iter(fieldcache.texts)
            # The first value in fieldcache.texts is the default; throw it away
            it.next()
            return it

        return self.expand_prefix(fieldname, '')

    @protected
    def expand_prefix(self, fieldname, prefix):
        """Yield each term text in ``fieldname`` that starts with ``prefix``.
        """
        # The base class has an expand_prefix() implementation that uses
        # iter_from() and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)

        if self.fieldcache_loaded(fieldname):
            texts = self.fieldcache(fieldname).texts
            # texts[0] is the cache's default value, not a real term (see
            # lexicon()), so the search starts at index 1. Starting at 0
            # would let the placeholder leak into the results.
            i = bisect_left(texts, prefix, 1)
            while i < len(texts) and texts[i].startswith(prefix):
                yield texts[i]
                i += 1
        else:
            for fn, t in self.termsindex.keys_from((fieldname, prefix)):
                if fn != fieldname or not t.startswith(prefix):
                    break
                yield t

    def postings(self, fieldname, text, scorer=None):
        """Return a matcher over the postings of the term ``fieldname:text``.

        :param scorer: optional scorer passed through to the matcher.
        :raises TermNotFound: when the field cannot be searched or the term
            is not in the term index.
        """
        self._test_field(fieldname)
        format = self.schema[fieldname].format
        try:
            offset = self.termsindex[fieldname, text][1]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        if isinstance(offset, (int, long)):
            # An integer is an offset into the postings file
            postreader = FilePostingReader(self.postfile, offset, format,
                                           scorer=scorer, fieldname=fieldname,
                                           text=text)
        else:
            # Otherwise the postings are stored inline in the index entry
            docids, weights, values, maxwol, minlength = offset
            postreader = ListMatcher(docids, weights, values, format, scorer,
                                     maxwol=maxwol, minlength=minlength)

        # Hide documents that are marked as deleted in the segment
        deleted = self.segment.deleted
        if deleted:
            postreader = FilterMatcher(postreader, deleted, exclude=True)

        return postreader

    def vector(self, docnum, fieldname):
        """Return a posting reader over the stored term vector of a document.

        :raises TermNotFound: when ``fieldname`` is not in the schema.
        :raises Exception: when the field stores no vectors, or no vector
            is recorded for this document/field pair.
        """
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)

        self._open_vectors()
        offset = self.vectorindex.get((docnum, fieldname))
        if offset is None:
            raise Exception("No vector found for document"
                            " %s field %r" % (docnum, fieldname))

        return FilePostingReader(self.vpostfile, offset, vformat, stringids=True)

    # Field cache methods

    def supports_caches(self):
        return True

    def set_caching_policy(self, cp=None, save=True, storage=None):
        """This method lets you control the caching policy of the reader. You
        can either pass a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
        as the first argument, *or* use the `save` and `storage` keywords to
        alter the default caching policy::

            # Use a custom field caching policy object
            reader.set_caching_policy(MyPolicy())

            # Use the default caching policy but turn off saving caches to disk
            reader.set_caching_policy(save=False)

            # Use the default caching policy but save caches to a custom storage
            from whoosh.filedb.filestore import FileStorage
            mystorage = FileStorage("path/to/cachedir")
            reader.set_caching_policy(storage=mystorage)

        :param cp: a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
            object. If this argument is not given, the default caching policy
            is used.
        :param save: save field caches to disk for re-use. If a caching policy
            object is specified using `cp`, this argument is ignored.
        :param storage: a custom :class:`whoosh.store.Storage` object to use
            for saving field caches. If a caching policy object is specified
            using `cp` or `save` is `False`, this argument is ignored.
        """

        if not cp:
            if not save:
                # Saving is off: never hand a storage to the policy
                storage = None
            elif storage is None:
                # Saving is on and no custom storage given: use our own.
                # A caller-supplied storage is kept as-is.
                storage = self.storage
            cp = DefaultFieldCachingPolicy(self.segment.name, storage=storage)

        # A policy class (rather than an instance) is instantiated here
        if type(cp) is type:
            cp = cp()

        self.caching_policy = cp

    def _fieldkey(self, fieldname):
        """Return the caching policy key for ``fieldname`` in this segment.
        """
        return "%s/%s" % (self.uuid_string, fieldname)

    def define_facets(self, name, qs, save=SAVE_BY_DEFAULT):
        """Build a cache named ``name`` from ``qs`` and store it in the
        caching policy, unless a cache with that name already exists.

        :raises Exception: when ``name`` is the name of a schema field.
        """
        if name in self.schema:
            raise Exception("Can't define facets using the name of a field (%r)" % name)

        if self.fieldcache_available(name):
            # Don't recreate the cache if it already exists
            return

        cache = self.caching_policy.get_class().from_lists(qs, self.doc_count_all())
        self.caching_policy.put(self._fieldkey(name), cache, save=save)

    def fieldcache(self, fieldname, save=SAVE_BY_DEFAULT):
        """Returns a :class:`whoosh.filedb.fieldcache.FieldCache` object for
        the given field.

        :param fieldname: the name of the field to get a cache for.
        :param save: if True (the default), the cache is saved to disk if it
            doesn't already exist.
        """

        key = self._fieldkey(fieldname)
        fc = self.caching_policy.get(key)
        if not fc:
            fc = FieldCache.from_field(self, fieldname)
            self.caching_policy.put(key, fc, save=save)
        return fc

    def fieldcache_available(self, fieldname):
        """Returns True if a field cache exists for the given field (either in
        memory already or on disk).
        """

        return self._fieldkey(fieldname) in self.caching_policy

    def fieldcache_loaded(self, fieldname):
        """Returns True if a field cache for the given field is in memory.
        """

        return self.caching_policy.is_loaded(self._fieldkey(fieldname))

    def unload_fieldcache(self, name):
        """Delete the field cache for ``name`` from the caching policy."""
        self.caching_policy.delete(self._fieldkey(name))

    # Sorting and faceting methods

    def key_fn(self, fields):
        """Return a function mapping a document number to its sort key.

        :param fields: a field name, or a sequence of field names. With
            several fields the key is a tuple of per-field keys.
        """
        if isinstance(fields, basestring):
            fields = (fields, )

        if len(fields) > 1:
            fcs = [self.fieldcache(fn) for fn in fields]
            return lambda docnum: tuple(fc.key_for(docnum) for fc in fcs)
        else:
            return self.fieldcache(fields[0]).key_for

    def sort_docs_by(self, fields, docnums, reverse=False):
        """Return ``docnums`` as a list sorted by the keys of ``fields``."""
        keyfn = self.key_fn(fields)
        return sorted(docnums, key=keyfn, reverse=reverse)

    def key_docs_by(self, fields, docnums, limit, reverse=False, offset=0):
        """Return ``(key, docnum + offset)`` pairs for ``docnums``.

        With ``limit`` set to None all pairs are returned unsorted;
        otherwise only the ``limit`` smallest (or largest, when ``reverse``
        is true) pairs are returned.
        """
        keyfn = self.key_fn(fields)

        if limit is None:
            # Don't bother sorting, the caller will do that
            return [(keyfn(docnum), docnum + offset) for docnum in docnums]
        else:
            # A non-reversed sort (the usual case) is inefficient because we
            # have to use nsmallest, but I can't think of a cleverer thing to
            # do right now. I thought I had an idea, but I was wrong.
            op = nlargest if reverse else nsmallest

            return op(limit, ((keyfn(docnum), docnum + offset)
                              for docnum in docnums))
Esempio n. 15
0
class SegmentReader(IndexReader):
    """Reader over the files of a single index segment.

    The constructor opens the segment's term index, stored fields, field
    lengths (when the schema has scorable fields) and postings files. The
    term vector files are opened lazily the first time they are needed.
    """

    def __init__(self, storage, schema, segment, generation=None):
        """Open the segment's files.

        :param storage: storage object used to open the segment's files.
        :param schema: the index schema.
        :param segment: segment object supplying filenames, document counts
            and deletion information.
        :param generation: value returned by :meth:`generation`.
        """
        self.storage = storage
        self.schema = schema
        self.segment = segment
        self._generation = generation

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Vector index and vector postings: lazy load (the term postings
        # file is opened eagerly further down)
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy methods from underlying segment
        self.has_deletions = segment.has_deletions
        self.is_deleted = segment.is_deleted
        self.doc_count = segment.doc_count

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.is_closed = False
        self._sync_lock = Lock()

    def _open_vectors(self):
        """Open the vector index and vector postings files on first use."""
        # Compare against None instead of relying on truthiness, so that a
        # reader object that evaluates as false is not opened a second time.
        if self.vectorindex is not None:
            return

        storage, segment = self.storage, self.segment

        # Vector index
        vf = storage.open_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorReader(vf)

        # Vector postings file
        self.vpostfile = storage.open_file(segment.vectorposts_filename,
                                           mapped=False)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.segment)

    @protected
    def __contains__(self, term):
        """Return True if ``term`` (a ``(fieldname, text)`` key) is in the
        term index.
        """
        return term in self.termsindex

    def generation(self):
        """Return the generation value given to the constructor."""
        return self._generation

    def close(self):
        """Close the files held by this reader."""
        self.storedfields.close()
        self.termsindex.close()
        if self.postfile:
            self.postfile.close()
        if self.vectorindex is not None:
            self.vectorindex.close()
        # _open_vectors() opens the vector postings file together with the
        # vector index, so it has to be released here as well.
        if self.vpostfile:
            self.vpostfile.close()
        #if self.fieldlengths:
        #    self.fieldlengths.close()
        self.is_closed = True

    def doc_count_all(self):
        """Return the segment's doc_count_all() value captured at open time.
        """
        return self.dc

    def field(self, fieldname):
        """Return the schema's field object for ``fieldname``."""
        return self.schema[fieldname]

    def scorable(self, fieldname):
        """Return the ``scorable`` attribute of the schema field."""
        return self.schema[fieldname].scorable

    def scorable_names(self):
        """Delegate to the schema's scorable_names()."""
        return self.schema.scorable_names()

    def vector_names(self):
        """Delegate to the schema's vector_names()."""
        return self.schema.vector_names()

    def format(self, fieldname):
        """Return the ``format`` attribute of the schema field."""
        return self.schema[fieldname].format

    def vector_format(self, fieldname):
        """Return the ``vector`` attribute of the schema field."""
        return self.schema[fieldname].vector

    @protected
    def stored_fields(self, docnum):
        """Return a dict of the stored fields of document ``docnum``,
        restricted to field names that are in the schema.
        """
        schema = self.schema
        return dict(item for item in self.storedfields[docnum].iteritems()
                    if item[0] in schema)

    @protected
    def all_stored_fields(self):
        """Yield the stored fields dict of every undeleted document."""
        is_deleted = self.segment.is_deleted
        sf = self.stored_fields
        for docnum in xrange(self.segment.doc_count_all()):
            if not is_deleted(docnum):
                yield sf(docnum)

    def field_length(self, fieldname):
        """Delegate to the segment's field_length()."""
        return self.segment.field_length(fieldname)

    @protected
    def doc_field_length(self, docnum, fieldname, default=0):
        """Return the length of ``fieldname`` in document ``docnum``.

        ``default`` is returned when no field lengths file was opened (the
        schema has no scorable fields) and is also passed on to the length
        reader's lookup.
        """
        if self.fieldlengths is None:
            return default
        return self.fieldlengths.get(docnum, fieldname, default=default)

    def max_field_length(self, fieldname):
        """Delegate to the segment's max_field_length()."""
        return self.segment.max_field_length(fieldname)

    @protected
    def has_vector(self, docnum, fieldname):
        """Return True if a term vector is stored for the document/field.

        Fields that are not in the schema, or that have no vector format,
        are answered with False without touching the vector files (which
        may not exist for such a segment).
        """
        if fieldname not in self.schema or not self.vector_format(fieldname):
            return False
        self._open_vectors()
        return (docnum, fieldname) in self.vectorindex

    @protected
    def __iter__(self):
        """Yield ``(fieldname, text, postcount, totalfreq)`` for every term
        whose field is in the schema.
        """
        schema = self.schema
        for (fieldname, t), (totalfreq, _, postcount) in self.termsindex:
            if fieldname not in schema:
                continue
            yield (fieldname, t, postcount, totalfreq)

    def _test_field(self, fieldname):
        """Raise TermNotFound unless ``fieldname`` is in the schema and has
        an index format.
        """
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    @protected
    def iter_from(self, fieldname, text):
        """Yield ``(fieldname, text, postcount, totalfreq)`` tuples starting
        from the key ``(fieldname, text)``, skipping fields not in the schema.
        """
        schema = self.schema
        self._test_field(fieldname)
        for (fn, t), (totalfreq, _, postcount) in self.termsindex.items_from(
            (fieldname, text)):
            if fn not in schema:
                continue
            yield (fn, t, postcount, totalfreq)

    @protected
    def _term_info(self, fieldname, text):
        """Return the term index entry for ``fieldname:text``.

        :raises TermNotFound: when the term is not in the term index.
        """
        self._test_field(fieldname)
        try:
            return self.termsindex[(fieldname, text)]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def doc_frequency(self, fieldname, text):
        """Return the third item of the term's index entry, or 0 when the
        term is not in the index.
        """
        self._test_field(fieldname)
        try:
            return self._term_info(fieldname, text)[2]
        except TermNotFound:
            return 0

    def frequency(self, fieldname, text):
        """Return the first item of the term's index entry, or 0 when the
        term is not in the index.
        """
        self._test_field(fieldname)
        try:
            return self._term_info(fieldname, text)[0]
        except TermNotFound:
            return 0

    def lexicon(self, fieldname):
        """Return an iterator of all the terms in ``fieldname``."""
        # The base class has a lexicon() implementation that uses iter_from()
        # and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)
        return self.expand_prefix(fieldname, '')

    @protected
    def expand_prefix(self, fieldname, prefix):
        """Yield each term text in ``fieldname`` that starts with ``prefix``.
        """
        # The base class has an expand_prefix() implementation that uses
        # iter_from() and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)
        for fn, t in self.termsindex.keys_from((fieldname, prefix)):
            if fn != fieldname or not t.startswith(prefix):
                return
            yield t

    def postings(self, fieldname, text, exclude_docs=frozenset(), scorer=None):
        """Return a posting reader/matcher for the term ``fieldname:text``.

        :param exclude_docs: optional set of document numbers to leave out;
            the segment's deleted documents are always added to it.
        :param scorer: optional scorer passed through to the posting reader.
        :raises TermNotFound: when the field cannot be searched or the term
            is not in the term index.
        """
        self._test_field(fieldname)
        format = self.format(fieldname)
        try:
            offset = self.termsindex[(fieldname, text)][1]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        # Fold the segment's deleted documents into the exclusion set
        if self.segment.deleted and exclude_docs:
            exclude_docs = self.segment.deleted | exclude_docs
        elif self.segment.deleted:
            exclude_docs = self.segment.deleted

        postreader = FilePostingReader(self.postfile,
                                       offset,
                                       format,
                                       scorer=scorer,
                                       fieldname=fieldname,
                                       text=text)
        if exclude_docs:
            postreader = ExcludeMatcher(postreader, exclude_docs)

        return postreader

    def vector(self, docnum, fieldname):
        """Return a posting reader over the stored term vector of a document.

        :raises TermNotFound: when ``fieldname`` is not in the schema.
        :raises Exception: when the field stores no vectors, or no vector
            is recorded for this document/field pair.
        """
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = self.vector_format(fieldname)
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)

        self._open_vectors()
        offset = self.vectorindex.get((docnum, fieldname))
        if offset is None:
            raise Exception("No vector found"
                            " for document %s field %r" % (docnum, fieldname))

        return FilePostingReader(self.vpostfile,
                                 offset,
                                 vformat,
                                 stringids=True)
Esempio n. 16
0
 def finish(self, termswriter, doccount, lengthfile):
     """Write out the field lengths, then feed the sorted postings buffer
     to ``termswriter``.

     :param termswriter: object whose ``add_iter`` receives the sorted
         postings and a length lookup function.
     :param doccount: document count passed to the length writer/reader.
     :param lengthfile: file passed to ``_write_lengths``.
     """
     self._write_lengths(lengthfile, doccount)
     # Postings are sorted in place before being handed to the writer
     self.postbuf.sort()
     # Length reader built over the in-memory length arrays (no file)
     length_reader = LengthReader(None, doccount, self.length_arrays)
     termswriter.add_iter(self.postbuf, length_reader.get)
Esempio n. 17
0
class SegmentReader(IndexReader):
    GZIP_CACHES = False

    def __init__(self, storage, schema, segment):
        """Open the files of ``segment`` for reading.

        :param storage: storage object used to open the segment's files.
        :param schema: the schema describing the indexed fields.
        :param segment: the segment object providing file names, document
            counts and deletion information.
        """
        self.storage = storage
        self.schema = schema
        self.segment = segment

        # Use the segment's own UUID when it has one, otherwise generate a
        # random one; this string prefixes the field cache keys.
        if not hasattr(self.segment, "uuid"):
            import uuid
            self.uuid_string = str(uuid.uuid4())
        else:
            self.uuid_string = str(self.segment.uuid)

        # Terms index
        terms_file = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(terms_file)

        # The vector index and vector postings are opened lazily by
        # _open_vectors()
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields
        stored_file = storage.open_file(segment.storedfields_filename,
                                        mapped=False)
        self.storedfields = StoredFieldReader(stored_file)

        # Field lengths are only loaded when the schema has scorable fields
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            lengths_file = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(lengths_file,
                                             segment.doc_count_all())

        # Snapshot of the segment's deletion state and live document count
        self._has_deletions = segment.has_deletions()
        self._doc_count = segment.doc_count()

        # Term postings
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        # Word graph (DAWG): only looked for when some field has spelling
        # enabled, and only loaded when the file actually exists
        self.dawg = None
        if any(field.spelling for field in self.schema):
            dawg_name = segment.dawg_filename
            if self.storage.file_exists(dawg_name):
                dawg_file = self.storage.open_file(dawg_name, mapped=False)
                self.dawg = DiskNode.load(dawg_file, expand=False)

        # Total document count (including deleted documents); sanity-check
        # it against the stored fields file
        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()

    def has_deletions(self):
        return self._has_deletions

    def doc_count(self):
        return self._doc_count

    def is_deleted(self, docnum):
        return self.segment.is_deleted(docnum)

    def generation(self):
        return self.segment.generation

    def _open_vectors(self):
        if self.vectorindex:
            return

        storage, segment = self.storage, self.segment

        # Vector index
        vf = storage.open_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorReader(vf)

        # Vector postings file
        self.vpostfile = storage.open_file(segment.vectorposts_filename,
                                           mapped=False)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.segment)

    def __contains__(self, term):
        return term in self.termsindex

    def close(self):
        self.storedfields.close()
        self.termsindex.close()
        if self.postfile:
            self.postfile.close()
        if self.vectorindex:
            self.vectorindex.close()
        if self.vpostfile:
            self.vpostfile.close()
        #if self.fieldlengths:
        #    self.fieldlengths.close()
        self.caching_policy = None
        self.is_closed = True

    def doc_count_all(self):
        return self.dc

    def stored_fields(self, docnum):
        """Return a dictionary of the stored fields of a document.

        Stored values whose field name is no longer in the schema are left
        out.

        :param docnum: the document number; must not be negative.
        """
        assert docnum >= 0
        schema = self.schema
        fields = {}
        for name, value in iteritems(self.storedfields[docnum]):
            if name in schema:
                fields[name] = value
        return fields

    def all_stored_fields(self):
        """Yield the stored fields dictionary of every document in the
        segment that is not deleted, in document number order.
        """
        deleted = self.segment.is_deleted
        fetch = self.stored_fields
        total = self.segment.doc_count_all()
        for docnum in xrange(total):
            if deleted(docnum):
                continue
            yield fetch(docnum)

    def field_length(self, fieldname):
        return self.segment.field_length(fieldname)

    def min_field_length(self, fieldname):
        return self.segment.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        return self.segment.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        if self.fieldlengths is None:
            return default
        return self.fieldlengths.get(docnum, fieldname, default=default)

    def has_vector(self, docnum, fieldname):
        if self.schema[fieldname].vector:
            self._open_vectors()
            return (docnum, fieldname) in self.vectorindex
        else:
            return False

    def _test_field(self, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    def all_terms(self):
        schema = self.schema
        return ((fieldname, text) for fieldname, text
                in self.termsindex.keys()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        self._test_field(fieldname)
        schema = self.schema
        return ((fname, text) for fname, text
                in self.termsindex.keys_from((fieldname, prefix))
                if fname in schema)

    def term_info(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self.termsindex[fieldname, text]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def _texts_in_fieldcache(self, fieldname, prefix=''):
        # The first value in a fieldcache is the default
        texts = self.fieldcache(fieldname).texts[1:]
        if prefix:
            i = bisect_left(texts, prefix)
            while i < len(texts) and texts[i].startswith(prefix):
                yield texts[i]
                i += 1
        else:
            for text in texts:
                yield text

    def expand_prefix(self, fieldname, prefix):
        self._test_field(fieldname)
        # If a fieldcache for the field is already loaded, we already have the
        # values for the field in memory, so just yield them from there
        if self.fieldcache_loaded(fieldname):
            return self._texts_in_fieldcache(fieldname, prefix)
        else:
            return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        self._test_field(fieldname)
        # If a fieldcache for the field is already loaded, we already have the
        # values for the field in memory, so just yield them from there
        if self.fieldcache_loaded(fieldname):
            return self._texts_in_fieldcache(fieldname)
        else:
            return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        schema = self.schema
        return ((term, terminfo) for term, terminfo
                in self.termsindex.items()
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        schema = self.schema
        self._test_field(fieldname)
        for term, terminfo in self.termsindex.items_from((fieldname, text)):
            if term[0] not in schema:
                continue
            yield (term, terminfo)

    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self.termsindex.frequency((fieldname, text))
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self.termsindex.doc_frequency((fieldname, text))
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        """Return a matcher over the postings of the term
        ``(fieldname, text)``.

        :param fieldname: the name of the field.
        :param text: the term text.
        :param scorer: passed through to the matcher.
        :raises TermNotFound: if the field is unknown or not indexed, or the
            term is not in the terms index.
        """
        # Validate the field up front like the other term methods do, so a
        # field missing from the schema is reported as TermNotFound instead
        # of surfacing as a lookup error from the schema further down.
        self._test_field(fieldname)

        term = (fieldname, text)
        try:
            terminfo = self.termsindex[term]
        except KeyError:
            raise TermNotFound("%s:%r" % term)

        postformat = self.schema[fieldname].format
        postings = terminfo.postings
        if isinstance(postings, integer_types):
            # An integer is an offset into the postings file
            postreader = FilePostingReader(self.postfile, postings,
                                           postformat, scorer=scorer,
                                           term=term)
        else:
            # Otherwise the postings are carried in the term info itself as
            # a (docids, weights, values) triple
            docids, weights, values = postings
            postreader = ListMatcher(docids, weights, values, postformat,
                                     scorer=scorer, term=term,
                                     terminfo=terminfo)

        # Hide documents deleted from the segment
        deleted = self.segment.deleted
        if deleted:
            postreader = FilterMatcher(postreader, deleted, exclude=True)

        return postreader

    def vector(self, docnum, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)

        self._open_vectors()
        try:
            offset = self.vectorindex.get((docnum, fieldname))
        except KeyError:
            raise KeyError("No vector found for document "
                           "%s field %r" % (docnum, fieldname))

        return FilePostingReader(self.vpostfile, offset, vformat,
                                 stringids=True)

    # DAWG methods

    def has_word_graph(self, fieldname):
        if fieldname not in self.schema:
            return False
        if not self.schema[fieldname].spelling:
            return False
        if self.dawg:
            return fieldname in self.dawg
        return False

    def word_graph(self, fieldname):
        if not self.has_word_graph(fieldname):
            raise Exception("No word graph for field %r" % fieldname)
        return self.dawg.edge(fieldname)

    # Field cache methods

    def supports_caches(self):
        return True

    def set_caching_policy(self, cp=None, save=True, storage=None):
        """This method lets you control the caching policy of the reader. You
        can either pass a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
        as the first argument, *or* use the `save` and `storage` keywords to
        alter the default caching policy::
        
            # Use a custom field caching policy object
            reader.set_caching_policy(MyPolicy())
            
            # Use the default caching policy but turn off saving caches to disk
            reader.set_caching_policy(save=False)
            
            # Use the default caching policy but save caches to a custom
            # storage
            from whoosh.filedb.filestore import FileStorage
            mystorage = FileStorage("path/to/cachedir")
            reader.set_caching_policy(storage=mystorage)
        
        :param cp: a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
            object. If this argument is not given, the default caching policy
            is used.
        :param save: save field caches to disk for re-use. If a caching policy
            object is specified using `cp`, this argument is ignored.
        :param storage: a custom :class:`whoosh.store.Storage` object to use
            for saving field caches. If a caching policy object is specified
            using `cp` or `save` is `False`, this argument is ignored. 
        """

        if not cp:
            if save and storage is None:
                storage = self.storage
            else:
                storage = None
            cp = DefaultFieldCachingPolicy(self.segment.name, storage=storage)

        if type(cp) is type:
            cp = cp()

        self.caching_policy = cp

    def _fieldkey(self, fieldname):
        return "%s/%s" % (self.uuid_string, fieldname)

    def fieldcache(self, fieldname, save=SAVE_BY_DEFAULT):
        """Returns a :class:`whoosh.filedb.fieldcache.FieldCache` object for
        the given field.
        
        :param fieldname: the name of the field to get a cache for.
        :param save: if True (the default), the cache is saved to disk if it
            doesn't already exist.
        """

        key = self._fieldkey(fieldname)
        fc = self.caching_policy.get(key)
        if not fc:
            fc = FieldCache.from_field(self, fieldname)
            self.caching_policy.put(key, fc, save=save)
        return fc

    def fieldcache_available(self, fieldname):
        """Returns True if a field cache exists for the given field (either in
        memory already or on disk).
        """

        return self._fieldkey(fieldname) in self.caching_policy

    def fieldcache_loaded(self, fieldname):
        """Returns True if a field cache for the given field is in memory.
        """

        return self.caching_policy.is_loaded(self._fieldkey(fieldname))

    def unload_fieldcache(self, name):
        self.caching_policy.delete(self._fieldkey(name))