def finish(self, termswriter, doccount, lengthfile):
    _fieldlength_totals = self._fieldlength_totals
    if not self.tasks:
        return

    jobqueue = self.jobqueue
    rqueue = self.resultqueue

    for task in self.tasks:
        jobqueue.put((None, doccount))

    for task in self.tasks:
        task.join()

    runs = []
    lenfilenames = []
    for task in self.tasks:
        taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
        runs.extend(taskruns)
        lenfilenames.append(lenfilename)
        for fieldnum, total in flentotals.iteritems():
            _fieldlength_totals[fieldnum] += total
        for fieldnum, length in flenmaxes.iteritems():
            if length > self._fieldlength_maxes.get(fieldnum, 0):
                self._fieldlength_maxes[fieldnum] = length

    jobqueue.close()
    rqueue.close()

    lw = LengthWriter(lengthfile, doccount)
    for lenfilename in lenfilenames:
        sublengths = LengthReader(StructFile(open(lenfilename, "rb")), doccount)
        lw.add_all(sublengths)
        os.remove(lenfilename)
    lw.close()
    lengths = lw.reader()

    #if len(runs) >= self.procs * 2:
    #    pool = Pool(self.procs)
    #    tempname = lambda: tempfile.mktemp(suffix=".run", dir=self.dir)
    #    while len(runs) >= self.procs * 2:
    #        runs2 = [(runs[i:i+4], tempname())
    #                 for i in xrange(0, len(runs), 4)]
    #        if len(runs) % 4:
    #            last = runs2.pop()[0]
    #            runs2[-1][0].extend(last)
    #        runs = pool.map(merge_runs, runs2)
    #    pool.close()

    iterator = imerge([read_run(runname, count) for runname, count in runs])
    total = sum(count for runname, count in runs)
    termswriter.add_iter(iterator, lengths.get)
    for runname, count in runs:
        os.remove(runname)

    self.cleanup()
def finish(self, termswriter, doccount, lengthfile):
    self._write_lengths(lengthfile, doccount)
    lengths = LengthReader(None, doccount, self.length_arrays)

    if not self._flushed:
        gen = self.readback_buffer()
    else:
        if self.postbuf:
            self.flush()
        gen = self.readback()

    termswriter.add_iter(gen, lengths.get)
def finish(self, doccount, lengthfile, termtable, postingwriter):
    _fieldlength_totals = self._fieldlength_totals
    if not self.tasks:
        return

    pqueue = self.postingqueue
    rqueue = self.resultsqueue

    for _ in xrange(self.procs):
        pqueue.put((-1, doccount))

    #print "Joining..."
    t = now()
    for task in self.tasks:
        task.join()
    #print "Join:", now() - t

    #print "Getting results..."
    t = now()
    runs = []
    lenfilenames = []
    for task in self.tasks:
        taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
        runs.extend(taskruns)
        lenfilenames.append(lenfilename)
        for fieldnum, total in flentotals.iteritems():
            _fieldlength_totals[fieldnum] += total
        for fieldnum, length in flenmaxes.iteritems():
            if length > self._fieldlength_maxes.get(fieldnum, 0):
                self._fieldlength_maxes[fieldnum] = length
    #print "Results:", now() - t

    #print "Writing lengths..."
    t = now()
    lw = LengthWriter(lengthfile, doccount)
    for lenfilename in lenfilenames:
        sublengths = LengthReader(StructFile(open(lenfilename, "rb")), doccount)
        lw.add_all(sublengths)
        os.remove(lenfilename)
    lw.close()
    lengths = lw.reader()
    #print "Lengths:", now() - t

    t = now()
    iterator = imerge([read_run(runname, count) for runname, count in runs])
    total = sum(count for runname, count in runs)
    write_postings(self.schema, termtable, lengths, postingwriter, iterator)
    for runname, count in runs:
        os.remove(runname)
    #print "Merge:", now() - t

    self.cleanup()
def finish(self, termswriter, doccount, lengthfile):
    from itertools import izip

    pbuf = self.postbuf
    self._write_lengths(lengthfile, doccount)
    lengths = LengthReader(None, doccount, self.length_arrays)

    def gen():
        for term in sorted(pbuf):
            fieldname, text = term
            for docnum, weight, valuestring in izip(*pbuf[term]):
                yield (fieldname, text, docnum, weight, valuestring)

    termswriter.add_iter(gen(), lengths.get)
def finish(self, termswriter, doccount, lengthfile):
    self._write_lengths(lengthfile, doccount)
    lengths = LengthReader(None, doccount, self.length_arrays)

    if self.postings or self.runs:
        if self.postings and len(self.runs) == 0:
            self.postings.sort()
            postiter = iter(self.postings)
        elif not self.postings and not self.runs:
            postiter = iter([])
        else:
            self.dump_run()
            postiter = imerge([read_run(runname, count)
                               for runname, count in self.runs])

        termswriter.add_iter(postiter, lengths.get)

    self.cleanup()
def finish(self, doccount, lengthfile, termtable, postingwriter):
    self._write_lengths(lengthfile, doccount)
    lengths = LengthReader(None, doccount, self.length_arrays)

    if self.postings or self.runs:
        if self.postings and len(self.runs) == 0:
            self.postings.sort()
            postiter = iter(self.postings)
        elif not self.postings and not self.runs:
            postiter = iter([])
        else:
            self.dump_run()
            postiter = imerge([read_run(runname, count)
                               for runname, count in self.runs])

        write_postings(self.schema, termtable, lengths, postingwriter, postiter)

    self.cleanup()
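# The run-based finish() variants above all follow the same external-sort
# pattern: sorted batches of (fieldname, text, docnum, weight, valuestring)
# tuples are dumped to temporary run files, and imerge() then lazily
# interleaves the runs back into one globally sorted posting stream for the
# terms writer. The helpers below are an illustrative, self-contained sketch
# of that pattern only -- stand-ins, not this module's actual write_run,
# read_run, or imerge implementations.
import heapq
import os
import pickle
import tempfile

def write_run(postings):
    # Sort one in-memory batch and spill it to a temporary run file.
    postings.sort()
    fd, name = tempfile.mkstemp(suffix=".run")
    with os.fdopen(fd, "wb") as f:
        for posting in postings:
            pickle.dump(posting, f)
    return name, len(postings)

def read_run(name, count):
    # Stream a run file back as an iterator of posting tuples.
    with open(name, "rb") as f:
        for _ in range(count):
            yield pickle.load(f)

def merge_runs(runs):
    # heapq.merge performs the same lazy k-way merge of sorted iterators
    # that imerge() provides above.
    return heapq.merge(*[read_run(name, count) for name, count in runs])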
class SegmentReader(IndexReader):
    GZIP_CACHES = False

    def __init__(self, storage, schema, segment):
        self.storage = storage
        self.schema = schema
        self.segment = segment

        if hasattr(self.segment, "uuid"):
            self.uuid_string = str(self.segment.uuid)
        else:
            import uuid
            self.uuid_string = str(uuid.uuid4())

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Term postings file, vector index, and vector postings: lazy load
        self.postfile = None
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy methods from underlying segment
        self.has_deletions = segment.has_deletions
        self.is_deleted = segment.is_deleted
        self.doc_count = segment.doc_count

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()

    def generation(self):
        return self.segment.generation

    def _open_vectors(self):
        if self.vectorindex:
            return

        storage, segment = self.storage, self.segment

        # Vector index
        vf = storage.open_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorReader(vf)

        # Vector postings file
        self.vpostfile = storage.open_file(segment.vectorposts_filename,
                                           mapped=False)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.segment)

    @protected
    def __contains__(self, term):
        return term in self.termsindex

    def close(self):
        self.storedfields.close()
        self.termsindex.close()
        if self.postfile:
            self.postfile.close()
        if self.vectorindex:
            self.vectorindex.close()
        if self.vpostfile:
            self.vpostfile.close()
        #if self.fieldlengths:
        #    self.fieldlengths.close()
        self.caching_policy = None
        self.is_closed = True

    def doc_count_all(self):
        return self.dc

    @protected
    def stored_fields(self, docnum):
        schema = self.schema
        return dict(item for item in self.storedfields[docnum].iteritems()
                    if item[0] in schema)

    @protected
    def all_stored_fields(self):
        is_deleted = self.segment.is_deleted
        sf = self.stored_fields
        for docnum in xrange(self.segment.doc_count_all()):
            if not is_deleted(docnum):
                yield sf(docnum)

    def field_length(self, fieldname):
        return self.segment.field_length(fieldname)

    @protected
    def doc_field_length(self, docnum, fieldname, default=0):
        if self.fieldlengths is None:
            return default
        return self.fieldlengths.get(docnum, fieldname, default=default)

    def max_field_length(self, fieldname):
        return self.segment.max_field_length(fieldname)

    @protected
    def has_vector(self, docnum, fieldname):
        if self.schema[fieldname].vector:
            self._open_vectors()
            return (docnum, fieldname) in self.vectorindex
        else:
            return False

    @protected
    def __iter__(self):
        schema = self.schema
        for (fieldname, t), (totalfreq, _, postcount) in self.termsindex:
            if fieldname not in schema:
                continue
            yield (fieldname, t, postcount, totalfreq)

    def _test_field(self, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    @protected
    def iter_from(self, fieldname, text):
        schema = self.schema
        self._test_field(fieldname)
        for (fn, t), (totalfreq, _, postcount) in self.termsindex.items_from((fieldname, text)):
            if fn not in schema:
                continue
            yield (fn, t, postcount, totalfreq)

    @protected
    def _term_info(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self.termsindex[fieldname, text]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def doc_frequency(self, fieldname, text):
        try:
            return self._term_info(fieldname, text)[2]
        except TermNotFound:
            return 0

    def frequency(self, fieldname, text):
        try:
            return self._term_info(fieldname, text)[0]
        except TermNotFound:
            return 0

    def lexicon(self, fieldname):
        # The base class has a lexicon() implementation that uses iter_from()
        # and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)

        # If a field cache happens to already be loaded for this field, use it
        # instead of loading the field values from disk
        if self.fieldcache_loaded(fieldname):
            fieldcache = self.fieldcache(fieldname)
            it = iter(fieldcache.texts)
            # The first value in fieldcache.texts is the default; throw it away
            it.next()
            return it

        return self.expand_prefix(fieldname, '')

    @protected
    def expand_prefix(self, fieldname, prefix):
        # The base class has an expand_prefix() implementation that uses
        # iter_from() and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)

        if self.fieldcache_loaded(fieldname):
            texts = self.fieldcache(fieldname).texts
            i = bisect_left(texts, prefix)
            while i < len(texts) and texts[i].startswith(prefix):
                yield texts[i]
                i += 1
        else:
            for fn, t in self.termsindex.keys_from((fieldname, prefix)):
                if fn != fieldname or not t.startswith(prefix):
                    break
                yield t

    def postings(self, fieldname, text, scorer=None):
        try:
            offset = self.termsindex[fieldname, text][1]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        format = self.schema[fieldname].format
        if isinstance(offset, (int, long)):
            postreader = FilePostingReader(self.postfile, offset, format,
                                           scorer=scorer, fieldname=fieldname,
                                           text=text)
        else:
            docids, weights, values, maxwol, minlength = offset
            postreader = ListMatcher(docids, weights, values, format, scorer,
                                     maxwol=maxwol, minlength=minlength)

        deleted = self.segment.deleted
        if deleted:
            postreader = FilterMatcher(postreader, deleted, exclude=True)

        return postreader

    def vector(self, docnum, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        vformat = self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)

        self._open_vectors()
        offset = self.vectorindex.get((docnum, fieldname))
        if offset is None:
            raise Exception("No vector found for document"
                            " %s field %r" % (docnum, fieldname))

        return FilePostingReader(self.vpostfile, offset, vformat,
                                 stringids=True)

    # Field cache methods

    def supports_caches(self):
        return True

    def set_caching_policy(self, cp=None, save=True, storage=None):
        """This method lets you control the caching policy of the reader.

        You can either pass a
        :class:`whoosh.filedb.fieldcache.FieldCachingPolicy` as the first
        argument, *or* use the `save` and `storage` keywords to alter the
        default caching policy::

            # Use a custom field caching policy object
            reader.set_caching_policy(MyPolicy())

            # Use the default caching policy but turn off saving caches to disk
            reader.set_caching_policy(save=False)

            # Use the default caching policy but save caches to a custom storage
            from whoosh.filedb.filestore import FileStorage
            mystorage = FileStorage("path/to/cachedir")
            reader.set_caching_policy(storage=mystorage)

        :param cp: a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
            object. If this argument is not given, the default caching policy
            is used.
        :param save: save field caches to disk for re-use. If a caching policy
            object is specified using `cp`, this argument is ignored.
        :param storage: a custom :class:`whoosh.store.Storage` object to use
            for saving field caches. If a caching policy object is specified
            using `cp` or `save` is `False`, this argument is ignored.
        """

        if not cp:
            if save and storage is None:
                storage = self.storage
            else:
                storage = None
            cp = DefaultFieldCachingPolicy(self.segment.name, storage=storage)

        if type(cp) is type:
            cp = cp()

        self.caching_policy = cp

    def _fieldkey(self, fieldname):
        return "%s/%s" % (self.uuid_string, fieldname)

    def define_facets(self, name, qs, save=SAVE_BY_DEFAULT):
        if name in self.schema:
            raise Exception("Can't define facets using the name of a field (%r)" % name)

        if self.fieldcache_available(name):
            # Don't recreate the cache if it already exists
            return

        cache = self.caching_policy.get_class().from_lists(qs, self.doc_count_all())
        self.caching_policy.put(self._fieldkey(name), cache, save=save)

    def fieldcache(self, fieldname, save=SAVE_BY_DEFAULT):
        """Returns a :class:`whoosh.filedb.fieldcache.FieldCache` object for
        the given field.

        :param fieldname: the name of the field to get a cache for.
        :param save: if True (the default), the cache is saved to disk if it
            doesn't already exist.
        """

        key = self._fieldkey(fieldname)
        fc = self.caching_policy.get(key)
        if not fc:
            fc = FieldCache.from_field(self, fieldname)
            self.caching_policy.put(key, fc, save=save)
        return fc

    def fieldcache_available(self, fieldname):
        """Returns True if a field cache exists for the given field (either in
        memory already or on disk).
        """

        return self._fieldkey(fieldname) in self.caching_policy

    def fieldcache_loaded(self, fieldname):
        """Returns True if a field cache for the given field is in memory.
        """

        return self.caching_policy.is_loaded(self._fieldkey(fieldname))

    def unload_fieldcache(self, name):
        self.caching_policy.delete(self._fieldkey(name))

    # Sorting and faceting methods

    def key_fn(self, fields):
        if isinstance(fields, basestring):
            fields = (fields, )

        if len(fields) > 1:
            fcs = [self.fieldcache(fn) for fn in fields]
            return lambda docnum: tuple(fc.key_for(docnum) for fc in fcs)
        else:
            return self.fieldcache(fields[0]).key_for

    def sort_docs_by(self, fields, docnums, reverse=False):
        keyfn = self.key_fn(fields)
        return sorted(docnums, key=keyfn, reverse=reverse)

    def key_docs_by(self, fields, docnums, limit, reverse=False, offset=0):
        keyfn = self.key_fn(fields)

        if limit is None:
            # Don't bother sorting, the caller will do that
            return [(keyfn(docnum), docnum + offset) for docnum in docnums]
        else:
            # A non-reversed sort (the usual case) is inefficient because we
            # have to use nsmallest, but I can't think of a cleverer thing to
            # do right now. I thought I had an idea, but I was wrong.
            op = nlargest if reverse else nsmallest
            return op(limit, ((keyfn(docnum), docnum + offset)
                              for docnum in docnums))
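# key_docs_by() above relies on heapq.nlargest/nsmallest to select the top
# `limit` keyed documents without sorting the whole docnum sequence. The
# snippet below is a stand-alone illustration of that top-k pattern; the
# document numbers and field keys are invented sample data, not anything read
# from a real field cache.
from heapq import nlargest, nsmallest

sample_docnums = [0, 1, 2, 3, 4]
sample_keys = {0: "pear", 1: "apple", 2: "plum", 3: "apple", 4: "fig"}
keyfn = sample_keys.get

def key_docs(docnums, limit, reverse=False, offset=0):
    # Same selection step as key_docs_by(), minus the field cache plumbing.
    op = nlargest if reverse else nsmallest
    return op(limit, ((keyfn(docnum), docnum + offset) for docnum in docnums))

print(key_docs(sample_docnums, 2))                # [('apple', 1), ('apple', 3)]
print(key_docs(sample_docnums, 2, reverse=True))  # [('plum', 2), ('pear', 0)]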
class SegmentReader(IndexReader):
    def __init__(self, storage, schema, segment, generation=None):
        self.storage = storage
        self.schema = schema
        self.segment = segment
        self._generation = generation

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Term postings file, vector index, and vector postings: lazy load
        self.postfile = None
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy methods from underlying segment
        self.has_deletions = segment.has_deletions
        self.is_deleted = segment.is_deleted
        self.doc_count = segment.doc_count

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.is_closed = False
        self._sync_lock = Lock()

    def _open_vectors(self):
        if self.vectorindex:
            return

        storage, segment = self.storage, self.segment

        # Vector index
        vf = storage.open_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorReader(vf)

        # Vector postings file
        self.vpostfile = storage.open_file(segment.vectorposts_filename,
                                           mapped=False)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.segment)

    @protected
    def __contains__(self, term):
        return term in self.termsindex

    def generation(self):
        return self._generation

    def close(self):
        self.storedfields.close()
        self.termsindex.close()
        if self.postfile:
            self.postfile.close()
        if self.vectorindex:
            self.vectorindex.close()
        #if self.fieldlengths:
        #    self.fieldlengths.close()
        self.is_closed = True

    def doc_count_all(self):
        return self.dc

    def field(self, fieldname):
        return self.schema[fieldname]

    def scorable(self, fieldname):
        return self.schema[fieldname].scorable

    def scorable_names(self):
        return self.schema.scorable_names()

    def vector_names(self):
        return self.schema.vector_names()

    def format(self, fieldname):
        return self.schema[fieldname].format

    def vector_format(self, fieldname):
        return self.schema[fieldname].vector

    @protected
    def stored_fields(self, docnum):
        schema = self.schema
        return dict(item for item in self.storedfields[docnum].iteritems()
                    if item[0] in schema)

    @protected
    def all_stored_fields(self):
        is_deleted = self.segment.is_deleted
        sf = self.stored_fields
        for docnum in xrange(self.segment.doc_count_all()):
            if not is_deleted(docnum):
                yield sf(docnum)

    def field_length(self, fieldname):
        return self.segment.field_length(fieldname)

    @protected
    def doc_field_length(self, docnum, fieldname, default=0):
        if self.fieldlengths is None:
            return default
        return self.fieldlengths.get(docnum, fieldname, default=default)

    def max_field_length(self, fieldname):
        return self.segment.max_field_length(fieldname)

    @protected
    def has_vector(self, docnum, fieldname):
        self._open_vectors()
        return (docnum, fieldname) in self.vectorindex

    @protected
    def __iter__(self):
        schema = self.schema
        for (fieldname, t), (totalfreq, _, postcount) in self.termsindex:
            if fieldname not in schema:
                continue
            yield (fieldname, t, postcount, totalfreq)

    def _test_field(self, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    @protected
    def iter_from(self, fieldname, text):
        schema = self.schema
        self._test_field(fieldname)
        for (fn, t), (totalfreq, _, postcount) in self.termsindex.items_from((fieldname, text)):
            if fn not in schema:
                continue
            yield (fn, t, postcount, totalfreq)

    @protected
    def _term_info(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self.termsindex[(fieldname, text)]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self._term_info(fieldname, text)[2]
        except TermNotFound:
            return 0

    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self._term_info(fieldname, text)[0]
        except TermNotFound:
            return 0

    def lexicon(self, fieldname):
        # The base class has a lexicon() implementation that uses iter_from()
        # and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)
        return self.expand_prefix(fieldname, '')

    @protected
    def expand_prefix(self, fieldname, prefix):
        # The base class has an expand_prefix() implementation that uses
        # iter_from() and throws away the value, but overriding to use
        # FileTableReader.keys_from() is much, much faster.

        self._test_field(fieldname)
        for fn, t in self.termsindex.keys_from((fieldname, prefix)):
            if fn != fieldname or not t.startswith(prefix):
                return
            yield t

    def postings(self, fieldname, text, exclude_docs=frozenset(), scorer=None):
        self._test_field(fieldname)
        format = self.format(fieldname)
        try:
            offset = self.termsindex[(fieldname, text)][1]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        if self.segment.deleted and exclude_docs:
            exclude_docs = self.segment.deleted | exclude_docs
        elif self.segment.deleted:
            exclude_docs = self.segment.deleted

        postreader = FilePostingReader(self.postfile, offset, format,
                                       scorer=scorer, fieldname=fieldname,
                                       text=text)
        if exclude_docs:
            postreader = ExcludeMatcher(postreader, exclude_docs)

        return postreader

    def vector(self, docnum, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        vformat = self.vector_format(fieldname)
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)

        self._open_vectors()
        offset = self.vectorindex.get((docnum, fieldname))
        if offset is None:
            raise Exception("No vector found"
                            " for document %s field %r" % (docnum, fieldname))

        return FilePostingReader(self.vpostfile, offset, vformat,
                                 stringids=True)
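# expand_prefix() above works because the terms index keeps its
# (fieldname, text) keys in sorted order: keys_from() seeks to the first key
# at or after the prefix and the reader yields terms until one stops matching.
# The same idea can be shown with the stdlib bisect module over a plain sorted
# list; the terms below are illustrative data only.
from bisect import bisect_left

sorted_terms = sorted([("title", "apple"), ("title", "apply"),
                       ("title", "banana"), ("body", "apple")])

def expand_prefix_demo(fieldname, prefix):
    # Seek to the first key >= (fieldname, prefix), then scan forward while
    # the field matches and the term still starts with the prefix.
    i = bisect_left(sorted_terms, (fieldname, prefix))
    while i < len(sorted_terms):
        fn, t = sorted_terms[i]
        if fn != fieldname or not t.startswith(prefix):
            break
        yield t
        i += 1

print(list(expand_prefix_demo("title", "app")))  # ['apple', 'apply']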
class SegmentReader(IndexReader):
    GZIP_CACHES = False

    def __init__(self, storage, schema, segment):
        self.storage = storage
        self.schema = schema
        self.segment = segment

        if hasattr(self.segment, "uuid"):
            self.uuid_string = str(self.segment.uuid)
        else:
            import uuid
            self.uuid_string = str(uuid.uuid4())

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Term vector index, and vector postings: lazy load
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy info from underlying segment
        self._has_deletions = segment.has_deletions()
        self._doc_count = segment.doc_count()

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        # Dawg file
        self.dawg = None
        if any(field.spelling for field in self.schema):
            fname = segment.dawg_filename
            if self.storage.file_exists(fname):
                dawgfile = self.storage.open_file(fname, mapped=False)
                self.dawg = DiskNode.load(dawgfile, expand=False)

        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()

    def has_deletions(self):
        return self._has_deletions

    def doc_count(self):
        return self._doc_count

    def is_deleted(self, docnum):
        return self.segment.is_deleted(docnum)

    def generation(self):
        return self.segment.generation

    def _open_vectors(self):
        if self.vectorindex:
            return

        storage, segment = self.storage, self.segment

        # Vector index
        vf = storage.open_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorReader(vf)

        # Vector postings file
        self.vpostfile = storage.open_file(segment.vectorposts_filename,
                                           mapped=False)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.segment)

    def __contains__(self, term):
        return term in self.termsindex

    def close(self):
        self.storedfields.close()
        self.termsindex.close()
        if self.postfile:
            self.postfile.close()
        if self.vectorindex:
            self.vectorindex.close()
        if self.vpostfile:
            self.vpostfile.close()
        #if self.fieldlengths:
        #    self.fieldlengths.close()
        self.caching_policy = None
        self.is_closed = True

    def doc_count_all(self):
        return self.dc

    def stored_fields(self, docnum):
        assert docnum >= 0
        schema = self.schema
        return dict(item for item in iteritems(self.storedfields[docnum])
                    if item[0] in schema)

    def all_stored_fields(self):
        is_deleted = self.segment.is_deleted
        sf = self.stored_fields
        for docnum in xrange(self.segment.doc_count_all()):
            if not is_deleted(docnum):
                yield sf(docnum)

    def field_length(self, fieldname):
        return self.segment.field_length(fieldname)

    def min_field_length(self, fieldname):
        return self.segment.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        return self.segment.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        if self.fieldlengths is None:
            return default
        return self.fieldlengths.get(docnum, fieldname, default=default)

    def has_vector(self, docnum, fieldname):
        if self.schema[fieldname].vector:
            self._open_vectors()
            return (docnum, fieldname) in self.vectorindex
        else:
            return False

    def _test_field(self, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    def all_terms(self):
        schema = self.schema
        return ((fieldname, text) for fieldname, text in self.termsindex.keys()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        self._test_field(fieldname)
        schema = self.schema
        return ((fname, text) for fname, text
                in self.termsindex.keys_from((fieldname, prefix))
                if fname in schema)

    def term_info(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self.termsindex[fieldname, text]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def _texts_in_fieldcache(self, fieldname, prefix=''):
        # The first value in a fieldcache is the default
        texts = self.fieldcache(fieldname).texts[1:]
        if prefix:
            i = bisect_left(texts, prefix)
            while i < len(texts) and texts[i].startswith(prefix):
                yield texts[i]
                i += 1
        else:
            for text in texts:
                yield text

    def expand_prefix(self, fieldname, prefix):
        self._test_field(fieldname)
        # If a fieldcache for the field is already loaded, we already have the
        # values for the field in memory, so just yield them from there
        if self.fieldcache_loaded(fieldname):
            return self._texts_in_fieldcache(fieldname, prefix)
        else:
            return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        self._test_field(fieldname)
        # If a fieldcache for the field is already loaded, we already have the
        # values for the field in memory, so just yield them from there
        if self.fieldcache_loaded(fieldname):
            return self._texts_in_fieldcache(fieldname)
        else:
            return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        schema = self.schema
        return ((term, terminfo) for term, terminfo in self.termsindex.items()
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        schema = self.schema
        self._test_field(fieldname)
        for term, terminfo in self.termsindex.items_from((fieldname, text)):
            if term[0] not in schema:
                continue
            yield (term, terminfo)

    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self.termsindex.frequency((fieldname, text))
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        try:
            return self.termsindex.doc_frequency((fieldname, text))
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        try:
            terminfo = self.termsindex[fieldname, text]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        format = self.schema[fieldname].format
        postings = terminfo.postings
        if isinstance(postings, integer_types):
            postreader = FilePostingReader(self.postfile, postings, format,
                                           scorer=scorer,
                                           term=(fieldname, text))
        else:
            docids, weights, values = postings
            postreader = ListMatcher(docids, weights, values, format,
                                     scorer=scorer, term=(fieldname, text),
                                     terminfo=terminfo)

        deleted = self.segment.deleted
        if deleted:
            postreader = FilterMatcher(postreader, deleted, exclude=True)

        return postreader

    def vector(self, docnum, fieldname):
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        vformat = self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)

        self._open_vectors()
        try:
            offset = self.vectorindex.get((docnum, fieldname))
        except KeyError:
            raise KeyError("No vector found for document "
                           "%s field %r" % (docnum, fieldname))

        return FilePostingReader(self.vpostfile, offset, vformat,
                                 stringids=True)

    # DAWG methods

    def has_word_graph(self, fieldname):
        if fieldname not in self.schema:
            return False
        if not self.schema[fieldname].spelling:
            return False
        if self.dawg:
            return fieldname in self.dawg
        return False

    def word_graph(self, fieldname):
        if not self.has_word_graph(fieldname):
            raise Exception("No word graph for field %r" % fieldname)
        return self.dawg.edge(fieldname)

    # Field cache methods

    def supports_caches(self):
        return True

    def set_caching_policy(self, cp=None, save=True, storage=None):
        """This method lets you control the caching policy of the reader.

        You can either pass a
        :class:`whoosh.filedb.fieldcache.FieldCachingPolicy` as the first
        argument, *or* use the `save` and `storage` keywords to alter the
        default caching policy::

            # Use a custom field caching policy object
            reader.set_caching_policy(MyPolicy())

            # Use the default caching policy but turn off saving caches to disk
            reader.set_caching_policy(save=False)

            # Use the default caching policy but save caches to a custom
            # storage
            from whoosh.filedb.filestore import FileStorage
            mystorage = FileStorage("path/to/cachedir")
            reader.set_caching_policy(storage=mystorage)

        :param cp: a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`
            object. If this argument is not given, the default caching policy
            is used.
        :param save: save field caches to disk for re-use. If a caching policy
            object is specified using `cp`, this argument is ignored.
        :param storage: a custom :class:`whoosh.store.Storage` object to use
            for saving field caches. If a caching policy object is specified
            using `cp` or `save` is `False`, this argument is ignored.
        """

        if not cp:
            if save and storage is None:
                storage = self.storage
            elif not save:
                storage = None
            cp = DefaultFieldCachingPolicy(self.segment.name, storage=storage)

        if type(cp) is type:
            cp = cp()

        self.caching_policy = cp

    def _fieldkey(self, fieldname):
        return "%s/%s" % (self.uuid_string, fieldname)

    def fieldcache(self, fieldname, save=SAVE_BY_DEFAULT):
        """Returns a :class:`whoosh.filedb.fieldcache.FieldCache` object for
        the given field.

        :param fieldname: the name of the field to get a cache for.
        :param save: if True (the default), the cache is saved to disk if it
            doesn't already exist.
        """

        key = self._fieldkey(fieldname)
        fc = self.caching_policy.get(key)
        if not fc:
            fc = FieldCache.from_field(self, fieldname)
            self.caching_policy.put(key, fc, save=save)
        return fc

    def fieldcache_available(self, fieldname):
        """Returns True if a field cache exists for the given field (either in
        memory already or on disk).
        """

        return self._fieldkey(fieldname) in self.caching_policy

    def fieldcache_loaded(self, fieldname):
        """Returns True if a field cache for the given field is in memory.
        """

        return self.caching_policy.is_loaded(self._fieldkey(fieldname))

    def unload_fieldcache(self, name):
        self.caching_policy.delete(self._fieldkey(name))
class SegmentReader(IndexReader): GZIP_CACHES = False def __init__(self, storage, schema, segment): self.storage = storage self.schema = schema self.segment = segment if hasattr(self.segment, "uuid"): self.uuid_string = str(self.segment.uuid) else: import uuid self.uuid_string = str(uuid.uuid4()) # Term index tf = storage.open_file(segment.termsindex_filename) self.termsindex = TermIndexReader(tf) # Term postings file, vector index, and vector postings: lazy load self.postfile = None self.vectorindex = None self.vpostfile = None # Stored fields file sf = storage.open_file(segment.storedfields_filename, mapped=False) self.storedfields = StoredFieldReader(sf) # Field length file self.fieldlengths = None if self.schema.has_scorable_fields(): flf = storage.open_file(segment.fieldlengths_filename) self.fieldlengths = LengthReader(flf, segment.doc_count_all()) # Copy methods from underlying segment self.has_deletions = segment.has_deletions self.is_deleted = segment.is_deleted self.doc_count = segment.doc_count # Postings file self.postfile = self.storage.open_file(segment.termposts_filename, mapped=False) self.dc = segment.doc_count_all() assert self.dc == self.storedfields.length self.set_caching_policy() self.is_closed = False self._sync_lock = Lock() def generation(self): return self.segment.generation def _open_vectors(self): if self.vectorindex: return storage, segment = self.storage, self.segment # Vector index vf = storage.open_file(segment.vectorindex_filename) self.vectorindex = TermVectorReader(vf) # Vector postings file self.vpostfile = storage.open_file(segment.vectorposts_filename, mapped=False) def __repr__(self): return "%s(%s)" % (self.__class__.__name__, self.segment) @protected def __contains__(self, term): return term in self.termsindex def close(self): self.storedfields.close() self.termsindex.close() if self.postfile: self.postfile.close() if self.vectorindex: self.vectorindex.close() if self.vpostfile: self.vpostfile.close() #if self.fieldlengths: # self.fieldlengths.close() self.caching_policy = None self.is_closed = True def doc_count_all(self): return self.dc @protected def stored_fields(self, docnum): schema = self.schema return dict(item for item in self.storedfields[docnum].iteritems() if item[0] in schema) @protected def all_stored_fields(self): is_deleted = self.segment.is_deleted sf = self.stored_fields for docnum in xrange(self.segment.doc_count_all()): if not is_deleted(docnum): yield sf(docnum) def field_length(self, fieldname): return self.segment.field_length(fieldname) @protected def doc_field_length(self, docnum, fieldname, default=0): if self.fieldlengths is None: return default return self.fieldlengths.get(docnum, fieldname, default=default) def max_field_length(self, fieldname): return self.segment.max_field_length(fieldname) @protected def has_vector(self, docnum, fieldname): if self.schema[fieldname].vector: self._open_vectors() return (docnum, fieldname) in self.vectorindex else: return False @protected def __iter__(self): schema = self.schema for (fieldname, t), (totalfreq, _, postcount) in self.termsindex: if fieldname not in schema: continue yield (fieldname, t, postcount, totalfreq) def _test_field(self, fieldname): if fieldname not in self.schema: raise TermNotFound("No field %r" % fieldname) if self.schema[fieldname].format is None: raise TermNotFound("Field %r is not indexed" % fieldname) @protected def iter_from(self, fieldname, text): schema = self.schema self._test_field(fieldname) for (fn, t), (totalfreq, _, postcount) in 
self.termsindex.items_from((fieldname, text)): if fn not in schema: continue yield (fn, t, postcount, totalfreq) @protected def _term_info(self, fieldname, text): self._test_field(fieldname) try: return self.termsindex[fieldname, text] except KeyError: raise TermNotFound("%s:%r" % (fieldname, text)) def doc_frequency(self, fieldname, text): self._test_field(fieldname) try: return self._term_info(fieldname, text)[2] except TermNotFound: return 0 def frequency(self, fieldname, text): self._test_field(fieldname) try: return self._term_info(fieldname, text)[0] except TermNotFound: return 0 def lexicon(self, fieldname): # The base class has a lexicon() implementation that uses iter_from() # and throws away the value, but overriding to use # FileTableReader.keys_from() is much, much faster. self._test_field(fieldname) # If a field cache happens to already be loaded for this field, use it # instead of loading the field values from disk if self.fieldcache_loaded(fieldname): fieldcache = self.fieldcache(fieldname) it = iter(fieldcache.texts) # The first value in fieldcache.texts is the default; throw it away it.next() return it return self.expand_prefix(fieldname, '') @protected def expand_prefix(self, fieldname, prefix): # The base class has an expand_prefix() implementation that uses # iter_from() and throws away the value, but overriding to use # FileTableReader.keys_from() is much, much faster. self._test_field(fieldname) if self.fieldcache_loaded(fieldname): texts = self.fieldcache(fieldname).texts i = bisect_left(texts, prefix) while i < len(texts) and texts[i].startswith(prefix): yield texts[i] i += 1 else: for fn, t in self.termsindex.keys_from((fieldname, prefix)): if fn != fieldname or not t.startswith(prefix): break yield t def postings(self, fieldname, text, scorer=None): self._test_field(fieldname) format = self.schema[fieldname].format try: offset = self.termsindex[fieldname, text][1] except KeyError: raise TermNotFound("%s:%r" % (fieldname, text)) if isinstance(offset, (int, long)): postreader = FilePostingReader(self.postfile, offset, format, scorer=scorer, fieldname=fieldname, text=text) else: docids, weights, values, maxwol, minlength = offset postreader = ListMatcher(docids, weights, values, format, scorer, maxwol=maxwol, minlength=minlength) deleted = self.segment.deleted if deleted: postreader = FilterMatcher(postreader, deleted, exclude=True) return postreader def vector(self, docnum, fieldname): if fieldname not in self.schema: raise TermNotFound("No field %r" % fieldname) vformat = self.schema[fieldname].vector if not vformat: raise Exception("No vectors are stored for field %r" % fieldname) self._open_vectors() offset = self.vectorindex.get((docnum, fieldname)) if offset is None: raise Exception("No vector found for document" " %s field %r" % (docnum, fieldname)) return FilePostingReader(self.vpostfile, offset, vformat, stringids=True) # Field cache methods def supports_caches(self): return True def set_caching_policy(self, cp=None, save=True, storage=None): """This method lets you control the caching policy of the reader. 
You can either pass a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy` as the first argument, *or* use the `save` and `storage` keywords to alter the default caching policy:: # Use a custom field caching policy object reader.set_caching_policy(MyPolicy()) # Use the default caching policy but turn off saving caches to disk reader.set_caching_policy(save=False) # Use the default caching policy but save caches to a custom storage from whoosh.filedb.filestore import FileStorage mystorage = FileStorage("path/to/cachedir") reader.set_caching_policy(storage=mystorage) :param cp: a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy` object. If this argument is not given, the default caching policy is used. :param save: save field caches to disk for re-use. If a caching policy object is specified using `cp`, this argument is ignored. :param storage: a custom :class:`whoosh.store.Storage` object to use for saving field caches. If a caching policy object is specified using `cp` or `save` is `False`, this argument is ignored. """ if not cp: if save and storage is None: storage = self.storage else: storage = None cp = DefaultFieldCachingPolicy(self.segment.name, storage=storage) if type(cp) is type: cp = cp() self.caching_policy = cp def _fieldkey(self, fieldname): return "%s/%s" % (self.uuid_string, fieldname) def define_facets(self, name, qs, save=SAVE_BY_DEFAULT): if name in self.schema: raise Exception("Can't define facets using the name of a field (%r)" % name) if self.fieldcache_available(name): # Don't recreate the cache if it already exists return cache = self.caching_policy.get_class().from_lists(qs, self.doc_count_all()) self.caching_policy.put(self._fieldkey(name), cache, save=save) def fieldcache(self, fieldname, save=SAVE_BY_DEFAULT): """Returns a :class:`whoosh.filedb.fieldcache.FieldCache` object for the given field. :param fieldname: the name of the field to get a cache for. :param save: if True (the default), the cache is saved to disk if it doesn't already exist. """ key = self._fieldkey(fieldname) fc = self.caching_policy.get(key) if not fc: fc = FieldCache.from_field(self, fieldname) self.caching_policy.put(key, fc, save=save) return fc def fieldcache_available(self, fieldname): """Returns True if a field cache exists for the given field (either in memory already or on disk). """ return self._fieldkey(fieldname) in self.caching_policy def fieldcache_loaded(self, fieldname): """Returns True if a field cache for the given field is in memory. """ return self.caching_policy.is_loaded(self._fieldkey(fieldname)) def unload_fieldcache(self, name): self.caching_policy.delete(self._fieldkey(name)) # Sorting and faceting methods def key_fn(self, fields): if isinstance(fields, basestring): fields = (fields, ) if len(fields) > 1: fcs = [self.fieldcache(fn) for fn in fields] return lambda docnum: tuple(fc.key_for(docnum) for fc in fcs) else: return self.fieldcache(fields[0]).key_for def sort_docs_by(self, fields, docnums, reverse=False): keyfn = self.key_fn(fields) return sorted(docnums, key=keyfn, reverse=reverse) def key_docs_by(self, fields, docnums, limit, reverse=False, offset=0): keyfn = self.key_fn(fields) if limit is None: # Don't bother sorting, the caller will do that return [(keyfn(docnum), docnum + offset) for docnum in docnums] else: # A non-reversed sort (the usual case) is inefficient because we # have to use nsmallest, but I can't think of a cleverer thing to # do right now. I thought I had an idea, but I was wrong. 
op = nlargest if reverse else nsmallest return op(limit, ((keyfn(docnum), docnum + offset) for docnum in docnums))
def finish(self, termswriter, doccount, lengthfile):
    self._write_lengths(lengthfile, doccount)
    lengths = LengthReader(None, doccount, self.length_arrays)
    self.postbuf.sort()
    termswriter.add_iter(self.postbuf, lengths.get)
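# This simplest finish() variant keeps every posting in memory and relies on
# plain tuple ordering: sorting (fieldname, text, docnum, weight, valuestring)
# tuples groups the buffer by field, then term, then document, which is the
# order the terms writer consumes. The postings below are invented sample
# data, shown only to illustrate the sort order.
sample_postbuf = [
    ("title", "apple", 3, 1.0, None),
    ("body", "pear", 1, 2.0, None),
    ("title", "apple", 1, 1.0, None),
]
sample_postbuf.sort()
for posting in sample_postbuf:
    print(posting)
# ('body', 'pear', 1, 2.0, None)
# ('title', 'apple', 1, 1.0, None)
# ('title', 'apple', 3, 1.0, None)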
class SegmentReader(IndexReader): GZIP_CACHES = False def __init__(self, storage, schema, segment): self.storage = storage self.schema = schema self.segment = segment if hasattr(self.segment, "uuid"): self.uuid_string = str(self.segment.uuid) else: import uuid self.uuid_string = str(uuid.uuid4()) # Term index tf = storage.open_file(segment.termsindex_filename) self.termsindex = TermIndexReader(tf) # Term vector index, and vector postings: lazy load self.vectorindex = None self.vpostfile = None # Stored fields file sf = storage.open_file(segment.storedfields_filename, mapped=False) self.storedfields = StoredFieldReader(sf) # Field length file self.fieldlengths = None if self.schema.has_scorable_fields(): flf = storage.open_file(segment.fieldlengths_filename) self.fieldlengths = LengthReader(flf, segment.doc_count_all()) # Copy info from underlying segment self._has_deletions = segment.has_deletions() self._doc_count = segment.doc_count() # Postings file self.postfile = self.storage.open_file(segment.termposts_filename, mapped=False) # Dawg file self.dawg = None if any(field.spelling for field in self.schema): fname = segment.dawg_filename if self.storage.file_exists(fname): dawgfile = self.storage.open_file(fname, mapped=False) self.dawg = DiskNode.load(dawgfile, expand=False) self.dc = segment.doc_count_all() assert self.dc == self.storedfields.length self.set_caching_policy() self.is_closed = False self._sync_lock = Lock() def has_deletions(self): return self._has_deletions def doc_count(self): return self._doc_count def is_deleted(self, docnum): return self.segment.is_deleted(docnum) def generation(self): return self.segment.generation def _open_vectors(self): if self.vectorindex: return storage, segment = self.storage, self.segment # Vector index vf = storage.open_file(segment.vectorindex_filename) self.vectorindex = TermVectorReader(vf) # Vector postings file self.vpostfile = storage.open_file(segment.vectorposts_filename, mapped=False) def __repr__(self): return "%s(%s)" % (self.__class__.__name__, self.segment) def __contains__(self, term): return term in self.termsindex def close(self): self.storedfields.close() self.termsindex.close() if self.postfile: self.postfile.close() if self.vectorindex: self.vectorindex.close() if self.vpostfile: self.vpostfile.close() #if self.fieldlengths: # self.fieldlengths.close() self.caching_policy = None self.is_closed = True def doc_count_all(self): return self.dc def stored_fields(self, docnum): assert docnum >= 0 schema = self.schema return dict(item for item in iteritems(self.storedfields[docnum]) if item[0] in schema) def all_stored_fields(self): is_deleted = self.segment.is_deleted sf = self.stored_fields for docnum in xrange(self.segment.doc_count_all()): if not is_deleted(docnum): yield sf(docnum) def field_length(self, fieldname): return self.segment.field_length(fieldname) def min_field_length(self, fieldname): return self.segment.min_field_length(fieldname) def max_field_length(self, fieldname): return self.segment.max_field_length(fieldname) def doc_field_length(self, docnum, fieldname, default=0): if self.fieldlengths is None: return default return self.fieldlengths.get(docnum, fieldname, default=default) def has_vector(self, docnum, fieldname): if self.schema[fieldname].vector: self._open_vectors() return (docnum, fieldname) in self.vectorindex else: return False def _test_field(self, fieldname): if fieldname not in self.schema: raise TermNotFound("No field %r" % fieldname) if self.schema[fieldname].format is None: raise 
TermNotFound("Field %r is not indexed" % fieldname) def all_terms(self): schema = self.schema return ((fieldname, text) for fieldname, text in self.termsindex.keys() if fieldname in schema) def terms_from(self, fieldname, prefix): self._test_field(fieldname) schema = self.schema return ((fname, text) for fname, text in self.termsindex.keys_from((fieldname, prefix)) if fname in schema) def term_info(self, fieldname, text): self._test_field(fieldname) try: return self.termsindex[fieldname, text] except KeyError: raise TermNotFound("%s:%r" % (fieldname, text)) def _texts_in_fieldcache(self, fieldname, prefix=''): # The first value in a fieldcache is the default texts = self.fieldcache(fieldname).texts[1:] if prefix: i = bisect_left(texts, prefix) while i < len(texts) and texts[i].startswith(prefix): yield texts[i] i += 1 else: for text in texts: yield text def expand_prefix(self, fieldname, prefix): self._test_field(fieldname) # If a fieldcache for the field is already loaded, we already have the # values for the field in memory, so just yield them from there if self.fieldcache_loaded(fieldname): return self._texts_in_fieldcache(fieldname, prefix) else: return IndexReader.expand_prefix(self, fieldname, prefix) def lexicon(self, fieldname): self._test_field(fieldname) # If a fieldcache for the field is already loaded, we already have the # values for the field in memory, so just yield them from there if self.fieldcache_loaded(fieldname): return self._texts_in_fieldcache(fieldname) else: return IndexReader.lexicon(self, fieldname) def __iter__(self): schema = self.schema return ((term, terminfo) for term, terminfo in self.termsindex.items() if term[0] in schema) def iter_from(self, fieldname, text): schema = self.schema self._test_field(fieldname) for term, terminfo in self.termsindex.items_from((fieldname, text)): if term[0] not in schema: continue yield (term, terminfo) def frequency(self, fieldname, text): self._test_field(fieldname) try: return self.termsindex.frequency((fieldname, text)) except KeyError: return 0 def doc_frequency(self, fieldname, text): self._test_field(fieldname) try: return self.termsindex.doc_frequency((fieldname, text)) except KeyError: return 0 def postings(self, fieldname, text, scorer=None): try: terminfo = self.termsindex[fieldname, text] except KeyError: raise TermNotFound("%s:%r" % (fieldname, text)) format = self.schema[fieldname].format postings = terminfo.postings if isinstance(postings, integer_types): postreader = FilePostingReader(self.postfile, postings, format, scorer=scorer, term=(fieldname, text)) else: docids, weights, values = postings postreader = ListMatcher(docids, weights, values, format, scorer=scorer, term=(fieldname, text), terminfo=terminfo) deleted = self.segment.deleted if deleted: postreader = FilterMatcher(postreader, deleted, exclude=True) return postreader def vector(self, docnum, fieldname): if fieldname not in self.schema: raise TermNotFound("No field %r" % fieldname) vformat = self.schema[fieldname].vector if not vformat: raise Exception("No vectors are stored for field %r" % fieldname) self._open_vectors() try: offset = self.vectorindex.get((docnum, fieldname)) except KeyError: raise KeyError("No vector found for document " "%s field %r" % (docnum, fieldname)) return FilePostingReader(self.vpostfile, offset, vformat, stringids=True) # DAWG methods def has_word_graph(self, fieldname): if fieldname not in self.schema: return False if not self.schema[fieldname].spelling: return False if self.dawg: return fieldname in self.dawg return 
False def word_graph(self, fieldname): if not self.has_word_graph(fieldname): raise Exception("No word graph for field %r" % fieldname) return self.dawg.edge(fieldname) # Field cache methods def supports_caches(self): return True def set_caching_policy(self, cp=None, save=True, storage=None): """This method lets you control the caching policy of the reader. You can either pass a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy` as the first argument, *or* use the `save` and `storage` keywords to alter the default caching policy:: # Use a custom field caching policy object reader.set_caching_policy(MyPolicy()) # Use the default caching policy but turn off saving caches to disk reader.set_caching_policy(save=False) # Use the default caching policy but save caches to a custom # storage from whoosh.filedb.filestore import FileStorage mystorage = FileStorage("path/to/cachedir") reader.set_caching_policy(storage=mystorage) :param cp: a :class:`whoosh.filedb.fieldcache.FieldCachingPolicy` object. If this argument is not given, the default caching policy is used. :param save: save field caches to disk for re-use. If a caching policy object is specified using `cp`, this argument is ignored. :param storage: a custom :class:`whoosh.store.Storage` object to use for saving field caches. If a caching policy object is specified using `cp` or `save` is `False`, this argument is ignored. """ if not cp: if save and storage is None: storage = self.storage else: storage = None cp = DefaultFieldCachingPolicy(self.segment.name, storage=storage) if type(cp) is type: cp = cp() self.caching_policy = cp def _fieldkey(self, fieldname): return "%s/%s" % (self.uuid_string, fieldname) def fieldcache(self, fieldname, save=SAVE_BY_DEFAULT): """Returns a :class:`whoosh.filedb.fieldcache.FieldCache` object for the given field. :param fieldname: the name of the field to get a cache for. :param save: if True (the default), the cache is saved to disk if it doesn't already exist. """ key = self._fieldkey(fieldname) fc = self.caching_policy.get(key) if not fc: fc = FieldCache.from_field(self, fieldname) self.caching_policy.put(key, fc, save=save) return fc def fieldcache_available(self, fieldname): """Returns True if a field cache exists for the given field (either in memory already or on disk). """ return self._fieldkey(fieldname) in self.caching_policy def fieldcache_loaded(self, fieldname): """Returns True if a field cache for the given field is in memory. """ return self.caching_policy.is_loaded(self._fieldkey(fieldname)) def unload_fieldcache(self, name): self.caching_policy.delete(self._fieldkey(name))