def MERGE_SQUARES(writer, segments):
    """Alternative merge policy similar to Lucene's; less optimal than the
    default MERGE_SMALL.

    Buckets segments by rough size (powers of ten) and, whenever a bucket
    holds at least ten segments, feeds those segments back into the writer
    to be merged.
    """
    from whoosh.filedb.filereading import SegmentReader

    # Pair each segment with its total document count up front.
    by_size = [(seg.doc_count_all(), seg) for seg in segments]

    merge_queue = []
    for bucket in (10, 100, 1000, 10000, 100000):
        # Collect segments whose size falls inside this bucket's range.
        candidates = []
        for doc_count, seg in by_size:
            if bucket // 10 <= doc_count < bucket - 1:
                candidates.append(seg)
        # Only merge when the bucket is crowded enough to be worth it.
        if len(candidates) >= 10:
            merge_queue.extend(candidates)
            for seg in candidates:
                segments.remove(seg)

    # Fold every queued segment into the writer; the merged output
    # replaces them, so they are no longer listed in `segments`.
    for seg in merge_queue:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        writer.add_reader(reader)
        reader.close()

    return segments
def OPTIMIZE(writer, segments):
    """Merge policy that folds every existing segment into the writer so
    the index collapses into a single segment.

    Returns an empty list because no segment survives unmerged.
    """
    from whoosh.filedb.filereading import SegmentReader

    for segment in segments:
        segment_reader = SegmentReader(writer.storage, writer.schema, segment)
        writer.add_reader(segment_reader)
        segment_reader.close()
    return []
def reader(self, storage, schema):
    """Return an index reader over this object's segments: a plain
    SegmentReader when there is exactly one segment, otherwise a
    MultiReader wrapping one SegmentReader per segment.
    """
    from whoosh.filedb.filereading import SegmentReader

    segs = self.segments
    if len(segs) == 1:
        # Single segment: skip the MultiReader layer entirely.
        return SegmentReader(storage, segs[0], schema)

    from whoosh.reading import MultiReader
    sub_readers = []
    for seg in segs:
        sub_readers.append(SegmentReader(storage, seg, schema))
    return MultiReader(sub_readers, self._doc_offsets, schema)
def CUSTOM_MERGE_SMALL(writer, segments):
    """Merge policy that combines "small" segments, where "small" means
    fewer than 10000 documents. Unlike
    whoosh.filedb.filewriting.MERGE_SMALL, this does nothing unless there
    is more than one small segment to merge.
    """
    from whoosh.filedb.filereading import SegmentReader

    keep = []
    small = []
    # Partition segments in a single pass by document count.
    for seg in segments:
        (small if seg.doc_count_all() < 10000 else keep).append(seg)

    if len(small) > 1:
        for seg in small:
            with SegmentReader(writer.storage, writer.schema, seg) as rdr:
                writer.add_reader(rdr)
    else:
        # A lone small segment is not worth rewriting; keep it as-is.
        keep.extend(small)

    return keep
def OPTIMIZE(ix, writer, segments):
    """Merge policy that feeds every existing segment into the writer so
    the whole index is rewritten as one segment.

    Returns an empty SegmentSet because no segment is left unmerged.
    """
    from whoosh.filedb.filereading import SegmentReader

    for segment in segments:
        rdr = SegmentReader(ix.storage, segment, ix.schema)
        writer.add_reader(rdr)
    return SegmentSet()
def MERGE_SMALL(writer, segments):
    """Merge policy that merges small segments, where "small" is decided
    by a heuristic based on the fibonacci sequence.
    """
    from whoosh.filedb.filereading import SegmentReader

    kept = []
    # Rank segments smallest-first by document count.
    ranked = sorted((s.doc_count_all(), s) for s in segments)
    running_total = 0
    for pos, (doc_count, seg) in enumerate(ranked):
        # Empty segments are silently dropped (neither merged nor kept).
        if doc_count <= 0:
            continue
        running_total += doc_count
        if running_total < fib(pos + 5):
            # Still under the fibonacci threshold: fold into the writer.
            rdr = SegmentReader(writer.storage, writer.schema, seg)
            writer.add_reader(rdr)
            rdr.close()
        else:
            kept.append(seg)
    return kept
def segreader(segment):
    """Return a reader for *segment*, consuming a cached reusable reader
    when one exists for the same segment id, otherwise opening a new
    SegmentReader for the current generation.
    """
    segid = segment.segment_id()
    try:
        # pop() both removes and returns the cached reader in one step.
        return reusable.pop(segid)
    except KeyError:
        return SegmentReader(storage, schema, segment, generation=generation)
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the
    ``spelling`` attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), commit the writer (without
        merging segments) after the word graphs have been written.
    """
    from whoosh.filedb.filereading import SegmentReader
    from whoosh.support import dawg

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema
    segments = writer.segments

    for segment in segments:
        r = SegmentReader(storage, schema, segment)
        f = segment.create_file(storage, ".dag")
        gw = dawg.GraphWriter(f)
        for fieldname in fieldnames:
            gw.start_field(fieldname)
            # Insert every term of this field into the word graph.
            for word in r.lexicon(fieldname):
                gw.insert(word)
            gw.finish_field()
        gw.close()
        # Close the segment reader so its underlying files are released
        # (the original leaked one open reader per segment).
        r.close()

    # Mark the fields as having spelling support in the schema.
    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
def MERGE_SQUARES(writer, segments):
    """This is an alternative merge policy similar to Lucene's. It is less
    optimal than the default MERGE_SMALL.

    Scans fixed power-of-ten size buckets; any bucket containing ten or
    more segments gets its members merged back through the writer.
    """
    from whoosh.filedb.filereading import SegmentReader

    counted = [(seg.doc_count_all(), seg) for seg in segments]

    queued = []
    for limit in (10, 100, 1000, 10000, 100000):
        lower, upper = limit // 10, limit - 1
        bucket = [seg for n, seg in counted if lower <= n < upper]
        if len(bucket) >= 10:
            # Enough segments in this size class: schedule them all.
            queued.extend(bucket)
            for seg in bucket:
                segments.remove(seg)

    # Merge the scheduled segments into the writer and drop them from
    # the surviving segment list.
    for seg in queued:
        rdr = SegmentReader(writer.storage, writer.schema, seg)
        writer.add_reader(rdr)
        rdr.close()

    return segments