Esempio n. 1
0
def MERGE_SMALL(writer, segments):
    """Merge the smallest segments into ``writer`` and return the rest.

    "Small" is defined using a heuristic based on the fibonacci sequence.
    Segments are visited in ascending order of ``doc_count_all()`` while a
    running total of documents is kept. The merge point is the first
    position ``i > 3`` at which the running total is below ``fib(i + 5)``.
    Every segment up to and including the merge point is fed to the writer;
    the segments after it are left untouched.

    :param writer: the writer receiving the merged segments. Its ``storage``
        and ``schema`` attributes are used to open readers and its
        ``add_reader`` method is called once per merged segment.
    :param segments: the current segments, each providing
        ``doc_count_all()``.
    :returns: the segments that were not merged (in ascending size order)
        when a merge happened; otherwise the original ``segments`` object.
    """

    from whoosh.reading import SegmentReader

    # NOTE: ``fib`` is not imported here; it is resolved from the namespace
    # of the module that contains this function.
    by_size = sorted(segments, key=lambda s: s.doc_count_all())
    to_merge = []
    unchanged = []
    total_docs = 0
    merge_point_found = False

    for i, seg in enumerate(by_size):
        count = seg.doc_count_all()
        if count > 0:
            total_docs += count

        if merge_point_found:
            # Everything past the merge point is kept as-is.
            unchanged.append(seg)
            continue

        # Still looking for the merge point: this segment will be merged
        # if a merge point is eventually found.
        to_merge.append(seg)
        if i > 3 and total_docs < fib(i + 5):
            merge_point_found = True

    if not (merge_point_found and len(to_merge) > 1):
        return segments

    for seg in to_merge:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        try:
            writer.add_reader(reader)
        finally:
            # Close the reader even if the writer rejects it, so the
            # segment's files are not left open.
            reader.close()
    return unchanged
Esempio n. 2
0
def OPTIMIZE(writer, segments):
    """Merge every existing segment into ``writer``.

    :param writer: the writer receiving the segments. Its ``storage`` and
        ``schema`` attributes are used to open a reader for each segment,
        which is then passed to ``writer.add_reader``.
    :param segments: the segments to merge.
    :returns: an empty list, since no segment is left unmerged.
    """

    from whoosh.reading import SegmentReader

    for seg in segments:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        try:
            writer.add_reader(reader)
        finally:
            # Close the reader even when add_reader fails so that the
            # segment's files are not left open.
            reader.close()
    return []
Esempio n. 3
0
 def segreader(segment):
     """Return a reader for ``segment``, reusing a pooled one if present.

     A reader found in ``reusable`` under ``segment`` is removed from the
     pool before it is returned; otherwise a fresh ``SegmentReader`` is
     created for the enclosing scope's storage, schema and generation.
     """
     if segment not in reusable:
         return SegmentReader(storage, schema, segment,
                              generation=generation)

     # Take the pooled reader out so it cannot be handed out twice.
     pooled = reusable[segment]
     del reusable[segment]
     return pooled
Esempio n. 4
0
 def segreader(segment):
     """Return a reader for ``segment``, reusing a pooled one if present.

     The pool ``reusable`` is keyed by ``segment.segment_id()``. A pooled
     reader is removed from the pool before it is returned; otherwise a
     fresh ``SegmentReader`` is created for the enclosing scope's storage,
     schema and generation.
     """
     key = segment.segment_id()
     if key not in reusable:
         return SegmentReader(storage, schema, segment,
                              generation=generation)

     # Take the pooled reader out so it cannot be handed out twice.
     pooled = reusable[key]
     del reusable[key]
     return pooled
Esempio n. 5
0
def MERGE_CUSTOM(writer, segments):
    """Merge the smallest segments into ``writer``, logging each decision.

    "Small" is defined using a heuristic based on the fibonacci sequence.
    Segments are visited in ascending order of ``doc_count_all()`` while a
    running total of documents is kept. The merge point is the first
    position ``i > 3`` at which the running total is below ``fib(i + 5)``.
    Every segment up to and including the merge point is fed to the writer;
    the segments after it are left untouched.

    :param writer: the writer receiving the merged segments. Its ``storage``
        and ``schema`` attributes are used to open readers and its
        ``add_reader`` method is called once per merged segment.
    :param segments: the current segments, each providing
        ``doc_count_all()``.
    :returns: the segments that were not merged (in ascending size order)
        when a merge happened; otherwise the original ``segments`` object.
    """

    from whoosh.reading import SegmentReader
    from whoosh.util import fib

    # NOTE: ``logger`` is resolved from the namespace of the module that
    # contains this function.
    unchanged_segments = []
    segments_to_merge = []

    sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all())
    total_docs = 0

    # Flip to True while debugging to trace the running totals per segment.
    log_stats = False

    merge_point_found = False
    for i, seg in enumerate(sorted_segment_list):
        count = seg.doc_count_all()
        if count > 0:
            total_docs += count

        if log_stats:
            logger.debug("%s: %s/%s, fib %s", i, count, total_docs, fib(i + 5))

        if merge_point_found:
            # Everything past the merge point is kept as-is.
            unchanged_segments.append(seg)
        else:
            # The position is kept alongside the segment for the log
            # message emitted when the segment is merged.
            segments_to_merge.append((seg, i))
            if i > 3 and total_docs < fib(i + 5):
                logger.debug("Merge point found at %s - %s", i, total_docs)
                merge_point_found = True

    if merge_point_found and len(segments_to_merge) > 1:
        for seg, i in segments_to_merge:
            logger.info("Merging segment %s having size %s", i,
                        seg.doc_count_all())
            reader = SegmentReader(writer.storage, writer.schema, seg)
            try:
                writer.add_reader(reader)
            finally:
                # Close the reader even if the writer rejects it, so the
                # segment's files are not left open.
                reader.close()
        return unchanged_segments
    else:
        logger.debug("No merge point found, no merge yet")
        return segments
Esempio n. 6
0
def MERGE_SMALL(writer, segments):
    """Merge small segments into ``writer`` and return the ones kept.

    "Small" is defined using a heuristic based on the fibonacci sequence.
    Segments are visited in ascending order of ``doc_count_all()`` while a
    running total of documents is kept. A segment at position ``i`` is
    merged when the running total (including it) is below ``fib(i + 5)``,
    otherwise it is kept.

    Segments whose ``doc_count_all()`` is not positive are neither merged
    nor kept: they are left out of the returned list.

    :param writer: the writer receiving the merged segments. Its ``storage``
        and ``schema`` attributes are used to open readers and its
        ``add_reader`` method is called once per merged segment.
    :param segments: the current segments, each providing
        ``doc_count_all()``.
    :returns: a new list of the segments that were not merged, in ascending
        size order.
    """

    from whoosh.reading import SegmentReader

    # NOTE: ``fib`` is not imported here; it is resolved from the namespace
    # of the module that contains this function.
    newsegments = []
    sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all())
    total_docs = 0
    for i, seg in enumerate(sorted_segment_list):
        count = seg.doc_count_all()
        if count <= 0:
            # Empty segments are dropped from the result entirely.
            continue

        total_docs += count
        if total_docs < fib(i + 5):
            reader = SegmentReader(writer.storage, writer.schema, seg)
            try:
                writer.add_reader(reader)
            finally:
                # Close the reader even if the writer rejects it, so the
                # segment's files are not left open.
                reader.close()
        else:
            newsegments.append(seg)
    return newsegments
Esempio n. 7
0
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    For every segment of the index a word graph file is created (using the
    segment codec's ``FST_EXT`` extension) and filled with the lexicon of
    each given field.

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), the writer opened on ``ix`` is
        committed with ``merge=False`` after the word graphs are written.
        If False, this function does not commit the writer.
    """

    from whoosh.automata import fst
    from whoosh.reading import SegmentReader

    # The names are walked once per segment and once more for the schema
    # update, so materialize them in case a one-shot iterable was passed.
    fieldnames = list(fieldnames)

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema
    segments = writer.segments

    for segment in segments:
        ext = segment.codec().FST_EXT

        r = SegmentReader(storage, schema, segment)
        try:
            f = segment.create_file(storage, ext)
            gw = fst.GraphWriter(f)
            for fieldname in fieldnames:
                gw.start_field(fieldname)
                for word in r.lexicon(fieldname):
                    gw.insert(word)
                gw.finish_field()
            gw.close()
        finally:
            # The reader is only needed to enumerate the lexicon; close it
            # so each segment's files are released, even on error.
            r.close()

    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
Esempio n. 8
0
 def reader(self, schema):
     """Open a ``SegmentReader`` over this object's storage and segment.

     :param schema: the schema passed through to the reader.
     :returns: a new ``SegmentReader`` created with ``codec=self``.
     """
     reader_args = (self.storage, schema, self.segment)
     return SegmentReader(*reader_args, codec=self)