def test_dawg():
    """Build a small field-rooted DAWG, write it to a temporary storage file,
    and check that the words under the "test" field flatten back out in order.
    """
    from whoosh.support.dawg import DawgBuilder

    expected = ["special", "specials"]

    with TempStorage() as st:
        dawgfile = st.create_file("test.dawg")

        builder = DawgBuilder(field_root=True)
        for word in expected:
            # With field_root=True the first key element is the field name,
            # followed by the individual characters of the word
            builder.insert(["test"] + list(word))
        builder.write(dawgfile)

        field_node = builder.root.edge("test")
        assert_equal(list(dawg.flatten(field_node)), expected)
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    One word graph file is written per segment, containing the lexicon of
    every field in ``fieldnames``.

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), the writer opened on the index is
        committed (without merging segments) once the word graphs have been
        written. If False, the writer is not committed by this function.
    """

    from whoosh.filedb.filereading import SegmentReader

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema

    for segment in writer.segments:
        graph_name = segment.dawg_filename
        reader = SegmentReader(storage, schema, segment)
        graph_file = storage.create_file(graph_name)

        builder = DawgBuilder(field_root=True)
        for fieldname in fieldnames:
            # Each key is the field name followed by the word's characters
            prefix = (fieldname,)
            for word in reader.lexicon(fieldname):
                builder.insert(prefix + tuple(word))
        builder.write(graph_file)

    # Mark the fields as having spelling information from now on
    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    For every segment of the index a word graph file is created and filled
    with the terms of each of the given fields.

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), commits the writer opened on the
        index, without merging, after all word graphs are written. If False,
        this function leaves the writer uncommitted.
    """

    from whoosh.filedb.filereading import SegmentReader

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema

    for seg in writer.segments:
        dawg_name = seg.dawg_filename
        seg_reader = SegmentReader(storage, schema, seg)
        out = storage.create_file(dawg_name)

        graph = DawgBuilder(field_root=True)
        for name in fieldnames:
            # Keys are rooted at the field name, then one element per
            # character of the word
            root_key = (name,)
            for word in seg_reader.lexicon(name):
                graph.insert(root_key + tuple(word))
        graph.write(out)

    # Flag the fields in the schema as carrying spelling data
    for name in fieldnames:
        schema[name].spelling = True

    if commit:
        writer.commit(merge=False)
class StdFieldWriter(base.FieldWriter):
    """Field writer that records terms in a term index file and their
    postings, grouped into blocks, in a separate postings file.

    Terms of fields that have spelling enabled are additionally inserted
    into a DAWG, which is only created once the first such word shows up.

    :param storage: the storage object the segment's files are created in.
    :param segment: the segment being written; used to create the files.
    :param blocklimit: a block is written out as soon as it holds more than
        this many postings.
    :param compression: passed through to the block's ``to_file`` method
        when a block is written to the postings file.
    :param inlinelimit: when no block has been written yet for a term and
        the pending block holds fewer postings than this, the postings are
        stored directly on the term info instead of in the postings file.
    """

    def __init__(self, storage, segment, blocklimit=128, compression=3,
                 inlinelimit=1):
        self.storage = storage
        self.segment = segment

        # Settings
        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit

        # State of the field and term currently being written
        self.fieldname = None
        self.field = None
        self.format = None
        self.spelling = False
        self.text = None
        self.terminfo = None
        self.block = None

        # The DAWG builder is not created until a word from a spelled field
        # is actually added
        self.dawg = None

        # Output files: the term index first, then the postings
        terms_file = segment.create_file(storage, StdCodec.TERMS_EXT)
        self.termsindex = TermIndexWriter(terms_file)
        self.postfile = segment.create_file(storage, StdCodec.POSTS_EXT)

    def _make_dawg_files(self):
        """Create the segment's DAWG file and the builder that fills it."""
        self.dawg = DawgBuilder(
            self.segment.create_file(self.storage, StdCodec.DAWG_EXT),
            field_root=True)

    def _reset_block(self):
        """Replace the current block with a fresh, empty one."""
        posting_size = self.format.posting_size
        self.block = StdBlock(posting_size)

    def _write_block(self):
        """Register the current block on the term info, write it to the
        postings file and start a new block.
        """
        full = self.block
        self.terminfo.add_block(full)
        full.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        """Begin a new list of posting blocks in the postings file."""
        out = self.postfile
        self._reset_block()

        # Remember where this list starts, then write the magic number
        self.startoffset = out.tell()
        out.write(StdBlock.magic)

        # The real block count is filled in by _finish_blocklist(); write a
        # placeholder for now
        self.blockcount = 0
        out.write_uint(0)

    def _finish_blocklist(self):
        """Write any pending block and patch the block count into the
        header of the current block list.
        """
        if self.block:
            self._write_block()

        out = self.postfile
        out.flush()
        end = out.tell()

        # Jump back to the placeholder (4 bytes after the start of the list,
        # presumably the length of the magic number), overwrite it with the
        # number of blocks, and return to the end of the file
        out.seek(self.startoffset + 4)
        out.write_uint(self.blockcount)
        out.seek(end)

        self.block = None

    def start_field(self, fieldname, fieldobj):
        """Set the field whose terms are written by the following calls.

        :param fieldname: the name of the field.
        :param fieldobj: the field object; supplies the posting format and
            the spelling settings.
        """
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format
        # Terms only go into the DAWG here when the field does not provide
        # its spelling words separately
        self.spelling = (fieldobj.spelling
                         and not fieldobj.separate_spelling())

    def start_term(self, text):
        """Begin writing the postings of the term ``text`` in the current
        field.

        :raises Exception: if the previous term has not been finished.
        """
        if self.block is not None:
            raise Exception("Called start_term in a block")

        self.text = text
        self.terminfo = base.FileTermInfo()

        if self.spelling:
            if self.dawg is None:
                self._make_dawg_files()
            key = (self.fieldname,) + tuple(text)
            self.dawg.insert(key)

        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        """Add one posting to the current term, writing the block out once
        it grows beyond ``blocklimit`` postings.
        """
        current = self.block
        current.add(docnum, weight, valuestring, length)
        if len(current) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        """Insert a word for ``fieldname`` into the DAWG, creating the DAWG
        first if it does not exist yet.
        """
        if self.dawg is None:
            self._make_dawg_files()
        key = (fieldname,) + tuple(text)
        self.dawg.insert(key)

    def finish_term(self):
        """Finish the current term and add it to the term index.

        :raises Exception: if no term is currently being written.
        """
        if self.block is None:
            raise Exception("Called finish_term when not in a block")

        pending = self.block
        info = self.terminfo

        nothing_written = self.blockcount < 1
        if nothing_written and pending and len(pending) < self.inlinelimit:
            # Inline the single block: keep the postings on the term info
            # itself
            info.add_block(pending)
            values = tuple(pending.values) if pending.values else None
            postings = (tuple(pending.ids), tuple(pending.weights), values)
        else:
            # The postings live in the postings file; store the offset of
            # the block list
            self._finish_blocklist()
            postings = self.startoffset

        self.block = None
        info.postings = postings
        self.termsindex.add((self.fieldname, self.text), info)

    def close(self):
        """Close the term index, the postings file and, if one was created,
        the DAWG.
        """
        self.termsindex.close()
        self.postfile.close()
        if self.dawg is not None:
            self.dawg.close()
class StdFieldWriter(base.FieldWriter):
    """Writes the terms of a segment to a term index file and the postings
    of each term, in blocks, to a postings file.

    Words of fields with spelling enabled are also fed into a DAWG; the DAWG
    file and builder are created lazily when the first such word is added.

    :param storage: the storage object the segment's files are created in.
    :param segment: the segment being written; used to create the files.
    :param blocklimit: once a block holds more than this many postings it is
        written to the postings file.
    :param compression: handed to the block's ``to_file`` method whenever a
        block is written out.
    :param inlinelimit: if no block has been written for a term and its
        pending block has fewer postings than this, the postings are kept
        on the term info rather than written to the postings file.
    """

    def __init__(self, storage, segment, blocklimit=128, compression=3,
                 inlinelimit=1):
        self.storage = storage
        self.segment = segment

        # Configuration
        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit

        # Per-field and per-term state, filled in by start_field() and
        # start_term()
        self.fieldname = None
        self.field = None
        self.format = None
        self.spelling = False
        self.text = None
        self.terminfo = None
        self.block = None

        # Created on demand, the first time a spelled word is added
        self.dawg = None

        # Open the term index before the postings file
        index_file = segment.create_file(storage, StdCodec.TERMS_EXT)
        self.termsindex = TermIndexWriter(index_file)
        self.postfile = segment.create_file(storage, StdCodec.POSTS_EXT)

    def _make_dawg_files(self):
        """Create the DAWG file for the segment and a builder writing to it.
        """
        dawg_file = self.segment.create_file(self.storage,
                                             StdCodec.DAWG_EXT)
        self.dawg = DawgBuilder(dawg_file, field_root=True)

    def _reset_block(self):
        """Start a new, empty block sized for the current field's format."""
        self.block = StdBlock(self.format.posting_size)

    def _write_block(self):
        """Add the current block to the term info, write it to the postings
        file, and replace it with an empty block.
        """
        blk = self.block
        self.terminfo.add_block(blk)
        blk.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        """Write the header of a new block list to the postings file."""
        pf = self.postfile
        self._reset_block()

        # Record the start of the list and mark it with the magic number
        self.startoffset = pf.tell()
        pf.write(StdBlock.magic)

        # Placeholder for the block count, patched in _finish_blocklist()
        self.blockcount = 0
        pf.write_uint(0)

    def _finish_blocklist(self):
        """Flush the last block, if any, and write the final block count
        into the block list's header.
        """
        if self.block:
            self._write_block()

        pf = self.postfile
        pf.flush()
        resume_at = pf.tell()

        # The count placeholder sits 4 bytes after the start of the list
        # (presumably right behind the magic number); overwrite it and go
        # back to where writing left off
        pf.seek(self.startoffset + 4)
        pf.write_uint(self.blockcount)
        pf.seek(resume_at)

        self.block = None

    def start_field(self, fieldname, fieldobj):
        """Switch to a new field.

        :param fieldname: the name of the field.
        :param fieldobj: the field object, from which the posting format and
            the spelling settings are read.
        """
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format
        # Only index terms as spelling words if the field does not supply
        # them separately
        self.spelling = (fieldobj.spelling
                         and not fieldobj.separate_spelling())

    def start_term(self, text):
        """Start a new term in the current field.

        :raises Exception: if the previous term was not finished first.
        """
        if self.block is not None:
            raise Exception("Called start_term in a block")

        self.text = text
        self.terminfo = base.FileTermInfo()

        if self.spelling:
            if self.dawg is None:
                self._make_dawg_files()
            self.dawg.insert((self.fieldname,) + tuple(text))

        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        """Append a posting to the current block; the block is written out
        when it exceeds ``blocklimit`` postings.
        """
        blk = self.block
        blk.add(docnum, weight, valuestring, length)
        if len(blk) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        """Add a spelling word for ``fieldname`` to the DAWG, creating the
        DAWG on first use.
        """
        if self.dawg is None:
            self._make_dawg_files()
        self.dawg.insert((fieldname,) + tuple(text))

    def finish_term(self):
        """Complete the current term and register it in the term index.

        :raises Exception: if there is no term in progress.
        """
        if self.block is None:
            raise Exception("Called finish_term when not in a block")

        blk = self.block
        info = self.terminfo

        unwritten = self.blockcount < 1
        if unwritten and blk and len(blk) < self.inlinelimit:
            # Inline the single block instead of pointing into the postings
            # file
            info.add_block(blk)
            if blk.values:
                values = tuple(blk.values)
            else:
                values = None
            postings = (tuple(blk.ids), tuple(blk.weights), values)
        else:
            # Postings are in the postings file; reference them by the
            # offset at which the block list starts
            self._finish_blocklist()
            postings = self.startoffset

        self.block = None
        info.postings = postings
        self.termsindex.add((self.fieldname, self.text), info)

    def close(self):
        """Close the term index and postings file, and the DAWG if one was
        created.
        """
        self.termsindex.close()
        self.postfile.close()
        if self.dawg is not None:
            self.dawg.close()