Example #1
def MERGE_SQUARES(writer, segments):
    """This is an alternative merge policy similar to Lucene's. It is less
    optimal than the default MERGE_SMALL.
    """

    from whoosh.filedb.filereading import SegmentReader

    sizedsegs = [(s.doc_count_all(), s) for s in segments]
    tomerge = []
    # Bucket segments by decimal order of magnitude; when ten or more
    # segments land in the same bucket, queue them all for merging
    for size in (10, 100, 1000, 10000, 100000):
        smaller = [
            seg for segsize, seg in sizedsegs
            if segsize < size - 1 and segsize >= size // 10
        ]
        if len(smaller) >= 10:
            tomerge.extend(smaller)
            for seg in smaller:
                segments.remove(seg)

    for seg in tomerge:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        writer.add_reader(reader)
        reader.close()

    return segments
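A policy like this is used by passing it to the writer's commit() call via the mergetype keyword. A minimal usage sketch, assuming an existing index in "indexdir" whose schema has a "content" field (both names are illustrative):

import whoosh.index as index

# Open an existing index, add a document, and commit, letting
# MERGE_SQUARES decide which segments to merge instead of the default
ix = index.open_dir("indexdir")
writer = ix.writer()
writer.add_document(content=u"some new document text")
writer.commit(mergetype=MERGE_SQUARES)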
Example #2
def OPTIMIZE(writer, segments):
    """This policy merges all existing segments.
    """

    from whoosh.filedb.filereading import SegmentReader
    for seg in segments:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        writer.add_reader(reader)
        reader.close()
    return []
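OPTIMIZE returns an empty list because every existing segment has been folded into the segment the writer is creating. You rarely pass it by hand; whoosh's commit() substitutes this policy itself when asked to optimize:

# Equivalent to writer.commit(mergetype=OPTIMIZE)
writer.commit(optimize=True)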
Example #4
def reader(self, storage, schema):
    from whoosh.filedb.filereading import SegmentReader
    segments = self.segments
    if len(segments) == 1:
        # A single segment can be read directly
        return SegmentReader(storage, segments[0], schema)
    else:
        # Multiple segments are wrapped in a MultiReader, which uses the
        # stored document offsets to present them as one index
        from whoosh.reading import MultiReader
        readers = [SegmentReader(storage, segment, schema)
                   for segment in segments]
        return MultiReader(readers, self._doc_offsets, schema)
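This appears to be the method behind the index's public reader() call: a single segment is served by a bare SegmentReader, while several segments are wrapped in a MultiReader that uses self._doc_offsets to map per-segment document numbers to global ones. Callers never need to distinguish the two cases; a brief sketch (the directory name is illustrative):

ix = index.open_dir("indexdir")
r = ix.reader()  # SegmentReader or MultiReader, depending on segment count
try:
    print(r.doc_count_all())
finally:
    r.close()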
Example #5
def CUSTOM_MERGE_SMALL(writer, segments):
    """This policy merges small segments, where "small" is defined using a
    fixed number of documents. Unlike whoosh.filedb.filewriting.MERGE_SMALL,
    this one does nothing unless there's more than one segment to merge.
    """

    from whoosh.filedb.filereading import SegmentReader
    unchanged_segments = []
    segments_to_merge = []

    for segment in segments:
        if segment.doc_count_all() < 10000:
            segments_to_merge.append(segment)
        else:
            unchanged_segments.append(segment)

    if len(segments_to_merge) > 1:
        for segment in segments_to_merge:
            with SegmentReader(writer.storage, writer.schema,
                               segment) as reader:
                writer.add_reader(reader)
    else:
        # don't bother merging a single segment
        unchanged_segments.extend(segments_to_merge)

    return unchanged_segments
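Because merge policies are plain callables taking (writer, segments), the fixed 10000-document cutoff is easy to parameterize. A sketch using functools.partial (merge_smaller_than and the 5000 threshold are illustrative, not part of whoosh):

from functools import partial

def merge_smaller_than(threshold, writer, segments):
    # Same policy as CUSTOM_MERGE_SMALL, with the cutoff as a parameter
    from whoosh.filedb.filereading import SegmentReader
    unchanged, to_merge = [], []
    for segment in segments:
        if segment.doc_count_all() < threshold:
            to_merge.append(segment)
        else:
            unchanged.append(segment)
    if len(to_merge) > 1:
        for segment in to_merge:
            with SegmentReader(writer.storage, writer.schema,
                               segment) as reader:
                writer.add_reader(reader)
    else:
        # don't bother merging a single segment
        unchanged.extend(to_merge)
    return unchanged

# The partial has the (writer, segments) signature commit() expects
MERGE_UNDER_5000 = partial(merge_smaller_than, 5000)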
Example #6
def OPTIMIZE(ix, writer, segments):
    """This policy merges all existing segments.
    """

    # Note: this variant uses an older whoosh API in which merge policies
    # also received the index; SegmentSet comes from whoosh.filedb.fileindex
    from whoosh.filedb.fileindex import SegmentSet
    from whoosh.filedb.filereading import SegmentReader
    for seg in segments:
        writer.add_reader(SegmentReader(ix.storage, seg, ix.schema))
    return SegmentSet()
Example #7
def MERGE_SMALL(writer, segments):
    """This policy merges small segments, where "small" is defined using a
    heuristic based on the Fibonacci sequence.
    """

    from whoosh.filedb.filereading import SegmentReader
    newsegments = []
    sorted_segment_list = sorted((s.doc_count_all(), s) for s in segments)
    total_docs = 0
    for i, (count, seg) in enumerate(sorted_segment_list):
        if count > 0:
            total_docs += count
            if total_docs < fib(i + 5):
                reader = SegmentReader(writer.storage, writer.schema, seg)
                writer.add_reader(reader)
                reader.close()
            else:
                newsegments.append(seg)
    return newsegments
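The fib helper is not part of this snippet; in whoosh it is imported from whoosh.util. A minimal stand-in with the same base cases (fib(1) == 1, fib(2) == 2), so the thresholds for successive segments run fib(5) == 8, fib(6) == 13, fib(7) == 21, and so on:

def fib(n):
    # Iterative Fibonacci with fib(1) == 1 and fib(2) == 2
    if n <= 2:
        return n
    a, b = 1, 2
    for _ in range(n - 2):
        a, b = b, a + b
    return b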
Example #8
def segreader(segment):
    # Reuse (and claim) an already-open reader for this segment if the
    # enclosing scope has one cached; otherwise open a fresh one
    segid = segment.segment_id()
    if segid in reusable:
        r = reusable[segid]
        del reusable[segid]
        return r
    else:
        return SegmentReader(storage,
                             schema,
                             segment,
                             generation=generation)
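segreader closes over storage, schema, generation, and the reusable dict from its enclosing scope; it belongs to the reader-refresh path, where readers for segments that survived a commit are recycled instead of reopened. A hedged sketch of the surrounding pattern (existing_readers and the _segment attribute are illustrative):

# Index the existing sub-readers by the ID of the segment each one covers
reusable = dict((r._segment.segment_id(), r) for r in existing_readers)
try:
    # Build the new reader list, reusing readers for unchanged segments
    readers = [segreader(segment) for segment in segments]
finally:
    # Anything left in reusable covers a segment that no longer exists
    for r in reusable.values():
        r.close()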
Example #9
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.
    
    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])
    
    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), the writer is committed (without
        merging segments) after the spelling files are added.
    """

    from whoosh.filedb.filereading import SegmentReader
    from whoosh.support import dawg

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema
    segments = writer.segments

    for segment in segments:
        r = SegmentReader(storage, schema, segment)
        f = segment.create_file(storage, ".dag")
        gw = dawg.GraphWriter(f)
        for fieldname in fieldnames:
            gw.start_field(fieldname)
            for word in r.lexicon(fieldname):
                gw.insert(word)
            gw.finish_field()
        gw.close()

    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
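Once the word graphs exist, the spelling data can be used through a searcher. A short sketch, assuming the "content" field was processed and using whoosh's corrector API (Searcher.corrector / Corrector.suggest):

ix = index.open_dir("testindex")
add_spelling(ix, ["content"])

with ix.searcher() as s:
    corrector = s.corrector("content")
    print(corrector.suggest("whosh", limit=3))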