def MERGE_SQUARES(writer, segments):
    """This is an alternative merge policy similar to Lucene's. It is less
    optimal than the default MERGE_SMALL.

    Segments are grouped into power-of-ten buckets by ``doc_count_all()``:
    1-9, 10-99, 100-999, 1000-9999 and 10000-99999 documents. Every bucket
    holding at least ten segments is merged into ``writer`` through
    ``writer.add_reader``. Segments outside all buckets (zero documents, or
    100000 and more) are always kept.

    :param writer: object providing ``storage``, ``schema`` and
        ``add_reader(reader)``.
    :param segments: list of segment objects providing ``doc_count_all()``.
        The list itself is left untouched.
    :returns: a new list with the segments that were NOT merged, in their
        original order.
    """

    # NOTE(review): MERGE_SQUARES is defined a second time further down in
    # this file; that later definition replaces this one at import time.
    # Consider dropping one of the two.

    from whoosh.filedb.filereading import SegmentReader

    sizedsegs = [(s.doc_count_all(), s) for s in segments]
    tomerge = []
    for size in (10, 100, 1000, 10000, 100000):
        # The upper bound is exclusive ``size`` (not ``size - 1``) so that
        # sizes 9, 99, 999, ... are not left out of every bucket.
        bucket = [seg for segsize, seg in sizedsegs
                  if size // 10 <= segsize < size]
        if len(bucket) >= 10:
            tomerge.extend(bucket)

    # Build the result as a new list instead of removing from the caller's
    # list, matching what the other merge policies do.
    merged_ids = set(id(seg) for seg in tomerge)
    remaining = [seg for seg in segments if id(seg) not in merged_ids]

    for seg in tomerge:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        try:
            writer.add_reader(reader)
        finally:
            # Release the reader even when the merge raises.
            reader.close()

    return remaining
def OPTIMIZE(writer, segments):
    """This policy merges all existing segments.

    Every segment is opened with a ``SegmentReader`` and handed to
    ``writer.add_reader``; the reader is closed afterwards, including when
    ``add_reader`` raises.

    :param writer: object providing ``storage``, ``schema`` and
        ``add_reader(reader)``.
    :param segments: iterable of existing segment objects.
    :returns: an empty list, since no existing segment is kept as-is.
    """

    from whoosh.filedb.filereading import SegmentReader

    for seg in segments:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        try:
            writer.add_reader(reader)
        finally:
            # Release the reader even when the merge raises.
            reader.close()
    return []
def MERGE_SMALL(writer, segments):
    """This policy merges small segments, where "small" is defined using a
    heuristic based on the fibonacci sequence.

    Segments are visited from smallest to largest ``doc_count_all()``. A
    running total of the visited document counts is kept; the segment at
    sorted position ``i`` is merged into ``writer`` while that total is below
    ``fib(i + 5)``, otherwise the segment is kept. Segments reporting zero
    documents are neither merged nor returned.

    :param writer: object providing ``storage``, ``schema`` and
        ``add_reader(reader)``.
    :param segments: iterable of segment objects providing
        ``doc_count_all()``.
    :returns: a new list with the segments that were kept unmerged.
    """

    from whoosh.filedb.filereading import SegmentReader

    newsegments = []
    # Sort on the document count only. Sorting the bare (count, segment)
    # tuples would compare the segment objects themselves on equal counts,
    # which raises TypeError on Python 3 when segments are not orderable.
    sorted_segment_list = sorted(
        ((s.doc_count_all(), s) for s in segments),
        key=lambda pair: pair[0]
    )
    total_docs = 0
    for i, (count, seg) in enumerate(sorted_segment_list):
        if count > 0:
            total_docs += count
            if total_docs < fib(i + 5):
                reader = SegmentReader(writer.storage, writer.schema, seg)
                try:
                    writer.add_reader(reader)
                finally:
                    # Release the reader even when the merge raises.
                    reader.close()
            else:
                newsegments.append(seg)
    return newsegments
def MERGE_SQUARES(writer, segments):
    """This is an alternative merge policy similar to Lucene's. It is less
    optimal than the default MERGE_SMALL.

    Segments are grouped into power-of-ten buckets by ``doc_count_all()``:
    1-9, 10-99, 100-999, 1000-9999 and 10000-99999 documents. Every bucket
    holding at least ten segments is merged into ``writer`` through
    ``writer.add_reader``. Segments outside all buckets (zero documents, or
    100000 and more) are always kept.

    :param writer: object providing ``storage``, ``schema`` and
        ``add_reader(reader)``.
    :param segments: list of segment objects providing ``doc_count_all()``.
        The list itself is left untouched.
    :returns: a new list with the segments that were NOT merged, in their
        original order.
    """

    # NOTE(review): an earlier definition of MERGE_SQUARES exists above in
    # this file; this later one is the definition that takes effect.
    # Consider dropping one of the two.

    from whoosh.filedb.filereading import SegmentReader

    sizedsegs = [(s.doc_count_all(), s) for s in segments]
    tomerge = []
    for size in (10, 100, 1000, 10000, 100000):
        # The upper bound is exclusive ``size`` (not ``size - 1``) so that
        # sizes 9, 99, 999, ... are not left out of every bucket.
        bucket = [seg for segsize, seg in sizedsegs
                  if size // 10 <= segsize < size]
        if len(bucket) >= 10:
            tomerge.extend(bucket)

    # Build the result as a new list instead of removing from the caller's
    # list, matching what the other merge policies do.
    merged_ids = set(id(seg) for seg in tomerge)
    remaining = [seg for seg in segments if id(seg) not in merged_ids]

    for seg in tomerge:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        try:
            writer.add_reader(reader)
        finally:
            # Release the reader even when the merge raises.
            reader.close()

    return remaining