from collections import defaultdict

from whoosh.reading import MultiReader

# LOGGER is assumed to be a module-level logging.Logger configured elsewhere.


def get_database_tfs(ix_reader: MultiReader, field_name='body'):
    """Build a term -> collection frequency mapping for one field of the index."""
    LOGGER.info('Building TF for [{}] field of the Index'.format(field_name))
    tfs = defaultdict(int)
    all_terms = ix_reader.field_terms(field_name)
    for term in all_terms:
        # Total number of occurrences of the term across the whole collection
        tfs[term] = ix_reader.frequency(field_name, term)
    return tfs
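# Usage sketch (not from the original project): assumes get_database_tfs above is
# importable from a hypothetical module named tf_utils whose module level also
# defines LOGGER, and that a whoosh index already exists on disk at 'index_dir'.
from whoosh.index import open_dir

from tf_utils import get_database_tfs  # hypothetical module holding the helper above

ix = open_dir('index_dir')  # assumed location of an existing whoosh index
ix_reader = ix.reader()
try:
    tfs = get_database_tfs(ix_reader, field_name='body')
    print('distinct terms in "body":', len(tfs))
finally:
    ix_reader.close()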
def reader(self, **kwargs):
    from whoosh.reading import MultiReader

    writer = self._get_writer()
    ramreader = self.ramindex
    if self.index.is_empty():
        return ramreader
    else:
        reader = writer.reader(**kwargs)
        if reader.is_atomic():
            reader = MultiReader([reader, ramreader])
        else:
            reader.add_reader(ramreader)
        return reader
def reader(self, **kwargs):
    from whoosh.reading import MultiReader

    reader = self.writer.reader(**kwargs)
    with self.lock:
        ramreader = self._get_ram_reader()

    # If there are in-memory docs, combine the readers
    if ramreader.doc_count():
        if reader.is_atomic():
            reader = MultiReader([reader, ramreader])
        else:
            reader.add_reader(ramreader)

    return reader
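# Context sketch: the two reader() variants above appear to be versions of whoosh's
# buffered/batch writer, which merges the on-disk reader with a reader over
# not-yet-committed in-memory documents. A minimal, hedged example of obtaining such
# a combined view via whoosh.writing.BufferedWriter (directory and schema illustrative):
import os

from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in
from whoosh.writing import BufferedWriter

schema = Schema(id=ID(stored=True), body=TEXT)
os.makedirs('buffered_ix_dir', exist_ok=True)  # assumed empty index directory
ix = create_in('buffered_ix_dir', schema)

writer = BufferedWriter(ix, period=None, limit=10)
try:
    writer.add_document(id=u'1', body=u'hello buffered world')
    # The searcher is built on a reader that combines committed segments with
    # the buffered in-memory documents, as in the methods above.
    with writer.searcher() as searcher:
        print(searcher.doc_count_all())  # counts the buffered document too
finally:
    writer.close()  # commits any buffered documents and releases resources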
def get_docs_tfs(article_ids: list, ix_reader: MultiReader,
                 fieldname='body') -> defaultdict:
    """Build a per-document term-frequency map: articleID -> {term: frequency}."""
    # get_index_docnum_of_article_id and LOGGER are module-level helpers
    # defined elsewhere in the project.
    docs_tfs = defaultdict(lambda: defaultdict(int))
    for aId in article_ids:
        dn = get_index_docnum_of_article_id(aId, ix_reader)
        if dn == -1:
            continue
        if ix_reader.has_vector(dn, fieldname):
            # Read the stored (forward) term vector for this document
            tf_d = defaultdict(int, ix_reader.vector_as('frequency', dn, fieldname))
            docs_tfs[aId] = tf_d
        else:
            LOGGER.warning('No forward vector was found for docnum {}, articleID {}'
                           .format(dn, aId))
    return docs_tfs
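# Note on get_docs_tfs above: it only works if the field was indexed with term
# vectors (whoosh "forward vectors"); otherwise has_vector() is always False and
# every article is skipped with a warning. A minimal, hedged sketch of building
# such an index (field names and directory are illustrative, not from the project):
import os

from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in

schema = Schema(articleID=ID(stored=True, unique=True),
                body=TEXT(vector=True))  # vector=True stores per-document term vectors

os.makedirs('vector_ix_dir', exist_ok=True)  # assumed index location
ix = create_in('vector_ix_dir', schema)
with ix.writer() as w:
    w.add_document(articleID=u'a1', body=u'term frequencies need stored vectors')

with ix.searcher() as s:
    # Per-document term frequencies read back from the forward vector (docnum 0)
    print(dict(s.reader().vector_as('frequency', 0, 'body')))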
def reader(self, storage, schema):
    from whoosh.filedb.filereading import SegmentReader

    segments = self.segments
    if len(segments) == 1:
        return SegmentReader(storage, segments[0], schema)
    else:
        from whoosh.reading import MultiReader
        readers = [SegmentReader(storage, segment, schema)
                   for segment in segments]
        return MultiReader(readers, self._doc_offsets, schema)
def _reader(cls, storage, schema, segments, generation, reuse=None):
    # Returns a reader for the given segments, possibly reusing already
    # opened readers
    from whoosh.reading import SegmentReader, MultiReader, EmptyReader

    reusable = {}
    try:
        if len(segments) == 0:
            # This index has no segments! Return an EmptyReader object,
            # which simply returns empty or zero to every method
            return EmptyReader(schema)

        if reuse:
            # Put all atomic readers in a dictionary keyed by their
            # generation, so we can re-use them if possible
            readers = [r for r, _ in reuse.leaf_readers()]
            reusable = dict((r.generation(), r) for r in readers)

        # Make a function to open readers, which reuses reusable readers.
        # It removes any readers it reuses from the "reusable" dictionary,
        # so later we can close any readers left in the dictionary.
        def segreader(segment):
            segid = segment.segment_id()
            if segid in reusable:
                r = reusable[segid]
                del reusable[segid]
                return r
            else:
                return SegmentReader(storage, schema, segment,
                                     generation=generation)

        if len(segments) == 1:
            # This index has one segment, so return a SegmentReader object
            # for the segment
            return segreader(segments[0])
        else:
            # This index has multiple segments, so create a list of
            # SegmentReaders for the segments, then composite them with a
            # MultiReader
            readers = [segreader(segment) for segment in segments]
            return MultiReader(readers, generation=generation)
    finally:
        # Close any reusable readers that were not actually reused
        for r in reusable.values():
            r.close()
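# Context sketch: _reader() above (whoosh's FileIndex._reader) can recycle segment
# readers when a `reuse` reader is passed in. From application code this usually
# happens indirectly via Searcher.refresh(), which hands its old reader back to the
# index. A minimal, hedged example (directory and schema are illustrative):
import os

from whoosh.fields import Schema, TEXT
from whoosh.index import create_in

os.makedirs('refresh_ix_dir', exist_ok=True)  # assumed index location
ix = create_in('refresh_ix_dir', Schema(body=TEXT))

with ix.writer() as w:
    w.add_document(body=u'first commit')

searcher = ix.searcher()
try:
    with ix.writer() as w:
        w.add_document(body=u'second commit')

    # refresh() reopens the index with reuse=<old reader>, so readers for
    # unchanged segments can be recycled instead of reopened.
    searcher = searcher.refresh()
    print(searcher.doc_count_all())  # now sees both documents
finally:
    searcher.close()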