def MERGE_SMALL(writer, segments):
    """This policy merges small segments, where "small" is defined using a
    heuristic based on the fibonacci sequence.
    """

    from whoosh.reading import SegmentReader
    from whoosh.util import fib

    unchanged_segments = []
    segments_to_merge = []

    # Consider segments from smallest to largest
    sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all())
    total_docs = 0
    merge_point_found = False
    for i, seg in enumerate(sorted_segment_list):
        count = seg.doc_count_all()
        if count > 0:
            total_docs += count
            if merge_point_found:
                # Past the merge point: append the remaining segments to the
                # unchanged list
                unchanged_segments.append(seg)
            else:
                # Still looking for a merge point; every segment up to the
                # merge point (inclusive) is a merge candidate
                segments_to_merge.append((seg, i))
                if i > 3 and total_docs < fib(i + 5):
                    merge_point_found = True

    if merge_point_found and len(segments_to_merge) > 1:
        # Copy the documents of each candidate into the writer, merging them
        # into the segment being written
        for seg, i in segments_to_merge:
            reader = SegmentReader(writer.storage, writer.schema, seg)
            writer.add_reader(reader)
            reader.close()
        return unchanged_segments
    else:
        return segments
def OPTIMIZE(writer, segments):
    """This policy merges all existing segments.
    """

    from whoosh.reading import SegmentReader

    for seg in segments:
        reader = SegmentReader(writer.storage, writer.schema, seg)
        writer.add_reader(reader)
        reader.close()
    return []
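# Usage sketch: merge policies such as MERGE_SMALL and OPTIMIZE above are
# not called directly; they are passed to IndexWriter.commit() via the
# ``mergetype`` keyword, which calls them with the writer and the list of
# existing segments. The directory name and document fields below are
# illustrative assumptions, not taken from the snippets above.

from whoosh import index

ix = index.open_dir("indexdir")
writer = ix.writer()
writer.add_document(title=u"Hello", content=u"There")

# MERGE_SMALL is the default policy; OPTIMIZE merges every existing segment
writer.commit(mergetype=OPTIMIZE)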
def segreader(segment):
    # Note: ``reusable``, ``storage``, ``schema``, and ``generation`` are
    # free variables from an enclosing scope; this closure keys the reuse
    # pool by the segment object itself
    if segment in reusable:
        # Reuse the existing reader for this segment and remove it from the
        # pool so it isn't closed as a leftover
        r = reusable[segment]
        del reusable[segment]
        return r
    else:
        return SegmentReader(storage, schema, segment,
                             generation=generation)
def segreader(segment):
    # Variant of the closure above that keys the reuse pool by segment ID,
    # which is stable across distinct segment object instances
    segid = segment.segment_id()
    if segid in reusable:
        r = reusable[segid]
        del reusable[segid]
        return r
    else:
        return SegmentReader(storage, schema, segment,
                             generation=generation)
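# Context sketch: a minimal enclosing function for the segreader closures
# above. The helper name is hypothetical; it assumes ``leaf_readers()`` on
# the reader being reused and a ``segment()`` accessor on its sub-readers,
# while ``reusable``, ``storage``, ``schema``, and ``generation`` match the
# names the closures expect.

from whoosh.reading import SegmentReader


def open_segment_readers(storage, schema, segments, generation, reuse=None):
    # Pool of recyclable readers, keyed by segment ID as in the second
    # variant above
    reusable = {}
    if reuse is not None:
        # leaf_readers() yields (reader, docoffset) pairs for each
        # sub-reader of an existing, possibly multi-segment, reader
        for r, _offset in reuse.leaf_readers():
            reusable[r.segment().segment_id()] = r

    def segreader(segment):
        segid = segment.segment_id()
        if segid in reusable:
            r = reusable[segid]
            del reusable[segid]
            return r
        else:
            return SegmentReader(storage, schema, segment,
                                 generation=generation)

    try:
        return [segreader(seg) for seg in segments]
    finally:
        # Close any pooled readers that were not recycled
        for r in reusable.values():
            r.close()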
import logging

logger = logging.getLogger(__name__)


def MERGE_CUSTOM(writer, segments):
    """This policy merges small segments, where "small" is defined using a
    heuristic based on the fibonacci sequence, and logs its decisions.
    """

    from whoosh.reading import SegmentReader
    from whoosh.util import fib

    unchanged_segments = []
    segments_to_merge = []

    sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all())
    total_docs = 0
    log_stats = False
    merge_point_found = False
    for i, seg in enumerate(sorted_segment_list):
        count = seg.doc_count_all()
        if count > 0:
            total_docs += count
            if log_stats:
                logger.debug("%s: %s/%s, fib %s", i, count, total_docs,
                             fib(i + 5))
            if merge_point_found:
                # Past the merge point: leave the remaining segments as-is
                unchanged_segments.append(seg)
            else:
                # Every segment up to the merge point (inclusive) is a
                # merge candidate
                segments_to_merge.append((seg, i))
                if i > 3 and total_docs < fib(i + 5):
                    logger.debug("Merge point found at %s - %s",
                                 i, total_docs)
                    merge_point_found = True

    if merge_point_found and len(segments_to_merge) > 1:
        for seg, i in segments_to_merge:
            logger.info("Merging segment %s having size %s",
                        i, seg.doc_count_all())
            reader = SegmentReader(writer.storage, writer.schema, seg)
            writer.add_reader(reader)
            reader.close()
        return unchanged_segments
    else:
        logger.debug("No merge point found, no merge yet")
        return segments
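# Because commit() accepts any callable with the (writer, segments)
# signature, the instrumented MERGE_CUSTOM policy can be swapped in the
# same way; the logging configuration here is an illustrative assumption.

import logging

logging.basicConfig(level=logging.DEBUG)

writer = ix.writer()  # ``ix`` as opened in the earlier usage sketch
writer.commit(mergetype=MERGE_CUSTOM)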
def MERGE_SMALL(writer, segments):
    """This policy merges small segments, where "small" is defined using a
    heuristic based on the fibonacci sequence.
    """

    from whoosh.reading import SegmentReader
    from whoosh.util import fib

    newsegments = []
    sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all())
    total_docs = 0
    for i, seg in enumerate(sorted_segment_list):
        count = seg.doc_count_all()
        if count > 0:
            total_docs += count
            if total_docs < fib(i + 5):
                # Segment is still "small": fold it into the new segment
                reader = SegmentReader(writer.storage, writer.schema, seg)
                writer.add_reader(reader)
                reader.close()
            else:
                # Segment is large enough to keep as-is
                newsegments.append(seg)
    return newsegments
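# The heuristic above compares the running document total against Fibonacci
# numbers, so only a prefix of the size-sorted segment list whose cumulative
# size stays under fib(i + 5) gets merged. whoosh.util.fib is a memoized
# Fibonacci function; an equivalent sketch (the exact base cases used by
# Whoosh are an assumption here):

_fib_cache = {}


def fib(n):
    """Return the nth value in the Fibonacci sequence (memoized)."""
    if n <= 2:
        return n
    if n in _fib_cache:
        return _fib_cache[n]
    result = fib(n - 1) + fib(n - 2)
    _fib_cache[n] = result
    return result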
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), commits the writer, without
        merging, after the spelling files are added.
    """

    from whoosh.automata import fst
    from whoosh.reading import SegmentReader

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema
    segments = writer.segments

    for segment in segments:
        ext = segment.codec().FST_EXT

        # Write a word graph file for this segment containing the lexicon
        # of each requested field
        r = SegmentReader(storage, schema, segment)
        f = segment.create_file(storage, ext)
        gw = fst.GraphWriter(f)
        for fieldname in fieldnames:
            gw.start_field(fieldname)
            for word in r.lexicon(fieldname):
                gw.insert(word)
            gw.finish_field()
        gw.close()

    # Mark the fields as having spelling support in the schema
    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
def reader(self, schema):
    # Open a reader over this object's segment using this codec; assumes
    # ``self.storage`` and ``self.segment`` are set on the owning class
    from whoosh.reading import SegmentReader

    return SegmentReader(self.storage, schema, self.segment, codec=self)