Esempio n. 1
0
def get_database_tfs(ix_reader: MultiReader, field_name='body'):
    """Collect the index-wide frequency of every term in one field.

    Args:
        ix_reader: Reader over the index; only its ``field_terms`` and
            ``frequency`` methods are used here.
        field_name: Name of the indexed field to scan.

    Returns:
        A ``defaultdict(int)`` mapping each term yielded by
        ``ix_reader.field_terms(field_name)`` to the value returned by
        ``ix_reader.frequency(field_name, term)``. A term that was never
        stored reads as ``0``.
    """
    LOGGER.info('Building TF for [{}] field of the Index'.format(field_name))
    # The stored values are plain frequencies, so the default for a missing
    # term has to be 0 rather than an empty nested mapping.
    tfs = defaultdict(int)
    for term in ix_reader.field_terms(field_name):
        tfs[term] = ix_reader.frequency(field_name, term)
    return tfs
Esempio n. 2
0
File: writing.py Progetto: oier/Yaki
    def reader(self, **kwargs):
        """Return a reader that also covers the in-memory index.

        If ``self.index`` reports that it is empty, ``self.ramindex`` is
        returned on its own. Otherwise a reader is requested from the
        writer (forwarding ``kwargs``) and ``self.ramindex`` is combined
        with it: an atomic reader is wrapped together with it in a new
        ``MultiReader``, a non-atomic one gets it via ``add_reader``.
        """
        from whoosh.reading import MultiReader

        # The writer is obtained before the emptiness check on purpose:
        # that is the order the calls have always been made in.
        disk_writer = self._get_writer()
        in_memory = self.ramindex

        if self.index.is_empty():
            return in_memory

        combined = disk_writer.reader(**kwargs)
        if not combined.is_atomic():
            combined.add_reader(in_memory)
            return combined
        return MultiReader([combined, in_memory])
Esempio n. 3
0
    def reader(self, **kwargs):
        """Build a reader spanning the stored index and ``self.ramindex``.

        ``self.ramindex`` alone is returned when ``self.index.is_empty()``
        is true. In every other case the writer's reader (created with
        ``kwargs``) is merged with ``self.ramindex`` and returned.
        """
        from whoosh.reading import MultiReader

        index_writer = self._get_writer()
        ram_part = self.ramindex
        if self.index.is_empty():
            return ram_part

        stored_part = index_writer.reader(**kwargs)
        if stored_part.is_atomic():
            # An atomic reader cannot take extra readers, so wrap both.
            stored_part = MultiReader([stored_part, ram_part])
        else:
            stored_part.add_reader(ram_part)
        return stored_part
Esempio n. 4
0
    def reader(self, **kwargs):
        """Return the writer's reader, extended with any in-memory documents.

        ``kwargs`` is accepted but not forwarded to ``self.writer.reader()``.
        The RAM reader is fetched while holding ``self.lock``; it is only
        combined with the writer's reader when its ``doc_count()`` is
        non-zero.
        """
        from whoosh.reading import MultiReader

        base = self.writer.reader()
        with self.lock:
            buffered = self._get_ram_reader()

        # Nothing held in memory: the writer's reader is already complete.
        if not buffered.doc_count():
            return base

        if base.is_atomic():
            return MultiReader([base, buffered])
        base.add_reader(buffered)
        return base
Esempio n. 5
0
    def reader(self, **kwargs):
        """Combine the writer's reader with the RAM reader when needed.

        The reader from ``self.writer.reader()`` is returned unchanged when
        the RAM reader (obtained under ``self.lock``) reports a zero
        ``doc_count()``. ``kwargs`` is not used by this method.
        """
        from whoosh.reading import MultiReader

        on_disk = self.writer.reader()
        with self.lock:
            in_ram = self._get_ram_reader()

        has_ram_docs = bool(in_ram.doc_count())
        if has_ram_docs and on_disk.is_atomic():
            # Atomic readers are wrapped together with the RAM reader.
            on_disk = MultiReader([on_disk, in_ram])
        elif has_ram_docs:
            on_disk.add_reader(in_ram)

        return on_disk
def get_docs_tfs(article_ids: list,
                 ix_reader: MultiReader,
                 fieldname='body') -> defaultdict:
    """Build per-article term-frequency tables from stored term vectors.

    Args:
        article_ids: Article identifiers to look up.
        ix_reader: Reader used to resolve document numbers and to read
            the term vectors.
        fieldname: Field whose vector is read for each document.

    Returns:
        A ``defaultdict`` keyed by article id. Each value is a
        ``defaultdict(int)`` built from
        ``ix_reader.vector_as('frequency', docnum, fieldname)``. Articles
        whose document number resolves to ``-1``, or whose document has no
        vector for ``fieldname``, get no entry.
    """
    docs_tfs = defaultdict(lambda: defaultdict(int))
    for article_id in article_ids:
        docnum = get_index_docnum_of_article_id(article_id, ix_reader)
        if docnum == -1:
            # A -1 result is skipped (presumably "article not in the
            # index" -- confirm against the lookup helper).
            continue
        if not ix_reader.has_vector(docnum, fieldname):
            LOGGER.warning(
                'No forward vector was found for docnum {}, articleID {}'.
                format(docnum, article_id))
            continue
        docs_tfs[article_id] = defaultdict(
            int, ix_reader.vector_as('frequency', docnum, fieldname))
    return docs_tfs
Esempio n. 7
0
 def reader(self, storage, schema):
     """Open a reader over ``self.segments``.

     A single segment yields a bare ``SegmentReader``. Any other number of
     segments yields a ``MultiReader`` built from one ``SegmentReader`` per
     segment plus ``self._doc_offsets`` and ``schema``.
     """
     from whoosh.filedb.filereading import SegmentReader
     segs = self.segments
     if len(segs) != 1:
         # MultiReader is imported only on the path that needs it.
         from whoosh.reading import MultiReader
         per_segment = []
         for seg in segs:
             per_segment.append(SegmentReader(storage, seg, schema))
         return MultiReader(per_segment, self._doc_offsets, schema)
     return SegmentReader(storage, segs[0], schema)
Esempio n. 8
0
    def _reader(cls, storage, schema, segments, generation, reuse=None):
        """Return a reader for ``segments``, reusing open readers if possible.

        With no segments an ``EmptyReader`` is returned; with exactly one, a
        single segment reader; otherwise a ``MultiReader`` over one reader
        per segment. When ``reuse`` is truthy, its leaf readers are offered
        for reuse, and any of them left unclaimed are closed before this
        method returns or raises.
        """
        from whoosh.reading import SegmentReader, MultiReader, EmptyReader

        # Candidate readers for reuse; whatever remains here at the end
        # gets closed in the ``finally`` clause.
        leftovers = {}
        try:
            if len(segments) == 0:
                # No segments at all: hand back the reader that answers
                # empty or zero to every method.
                return EmptyReader(schema)

            if reuse:
                # Index the atomic readers by their generation so they can
                # be re-used if possible.
                # NOTE(review): entries are keyed by ``generation()`` but
                # looked up below by ``segment_id()`` -- confirm the two
                # share a key space.
                leaves = [leaf for leaf, _ in reuse.leaf_readers()]
                leftovers = {leaf.generation(): leaf for leaf in leaves}

            def open_segment(seg):
                # Claim a matching reader from ``leftovers`` (removing it so
                # it is not closed later), or open a fresh one.
                key = seg.segment_id()
                if key not in leftovers:
                    return SegmentReader(storage,
                                         schema,
                                         seg,
                                         generation=generation)
                return leftovers.pop(key)

            if len(segments) == 1:
                # Exactly one segment: its reader is returned directly.
                return open_segment(segments[0])

            # Several segments: one reader each, composited together.
            opened = [open_segment(seg) for seg in segments]
            return MultiReader(opened, generation=generation)
        finally:
            for stale in leftovers.values():
                stale.close()