def _make_dawg_files(self):
    """Create the DAWG file for this segment and attach a writer to it.

    The file is created through ``self.segment.create_file`` using
    ``self.storage`` and the ``W2Codec.DAWG_EXT`` extension. The resulting
    ``GraphWriter`` is stored on ``self.dawg``.
    """
    create = self.segment.create_file
    graph_file = create(self.storage, W2Codec.DAWG_EXT)
    self.dawg = GraphWriter(graph_file)
class W2FieldWriter(base.FieldWriter):
    """Write the terms and posting blocks of one segment.

    Expected call sequence, as enforced by the state checks in the methods::

        start_field(fieldname, fieldobj)
            start_term(text)
                add(docnum, weight, valuestring, length)   # repeated
            finish_term()
        finish_field()
        close()

    Term information is added to the terms index file, posting blocks are
    written to the posts file, and a DAWG file is created lazily the first
    time a field (or ``add_spell_word``) needs one.
    """

    def __init__(self, storage, segment, blocklimit=128, compression=3,
                 inlinelimit=1):
        """Create the terms and posts files for ``segment``.

        :param storage: ``Storage`` object passed to ``segment.create_file``.
        :param segment: ``base.Segment`` the files are created for.
        :param blocklimit: a posting block is flushed to the posts file once
            it holds more than this many postings.
        :param compression: value passed as ``compression`` when a block is
            written to the posts file.
        :param inlinelimit: postings are kept inline in the term info only
            when no block has been written for the term and the current block
            holds fewer than this many postings.
        """
        assert isinstance(storage, Storage)
        assert isinstance(segment, base.Segment)
        assert isinstance(blocklimit, int)
        assert isinstance(compression, int)
        assert isinstance(inlinelimit, int)

        self.storage = storage
        self.segment = segment
        self.fieldname = None
        self.text = None
        self.field = None
        self.format = None
        self.spelling = False

        tifile = segment.create_file(storage, W2Codec.TERMS_EXT)
        self.termsindex = TermIndexWriter(tifile)
        self.postfile = segment.create_file(storage, W2Codec.POSTS_EXT)

        # We'll wait to create the DAWG builder until someone actually adds
        # a spelled field
        self.dawg = None

        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit
        self.block = None
        self.terminfo = None
        # True between start_field() and finish_field()
        self._infield = False
        # True while the current field has an open field in the DAWG writer;
        # initialised here so the attribute exists before start_field()
        self._dawgfield = False

    def _make_dawg_files(self):
        """Create the DAWG file and store a ``GraphWriter`` on ``self.dawg``."""
        dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT)
        self.dawg = GraphWriter(dawgfile)

    def _new_block(self):
        """Return an empty ``W2Block`` sized for the current field format."""
        return W2Block(self.format.posting_size)

    def _reset_block(self):
        """Replace the current block with a fresh, empty one."""
        self.block = self._new_block()

    def _write_block(self):
        """Record the current block in the term info and write it to disk.

        Afterwards a new empty block is started and the block count for the
        current term is incremented.
        """
        self.terminfo.add_block(self.block)
        self.block.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        """Begin a new list of posting blocks in the posts file.

        Remembers the current file offset, writes the block magic, and writes
        a zero placeholder that ``finish_term`` later overwrites with the
        real block count.
        """
        postfile = self.postfile
        self._reset_block()

        # Magic number
        self.startoffset = postfile.tell()
        postfile.write(W2Block.magic)
        # Placeholder for block count
        self.blockcount = 0
        postfile.write_uint(0)

    def start_field(self, fieldname, fieldobj):
        """Start writing terms for the field ``fieldname``.

        :param fieldname: name of the field; used as part of each term key.
        :param fieldobj: field object providing ``format``, ``spelling`` and
            ``separate_spelling()``.
        """
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format

        # Evaluate once; the result is needed for both decisions below
        separate = fieldobj.separate_spelling()
        # Index terms are inserted into the DAWG only when the field is
        # spelled and does not use separate spelling
        self.spelling = fieldobj.spelling and not separate

        self._dawgfield = False
        if self.spelling or separate:
            if self.dawg is None:
                self._make_dawg_files()
            self.dawg.start_field(fieldname)
            self._dawgfield = True
        self._infield = True

    def start_term(self, text):
        """Start a new term with the given text in the current field.

        :raises Exception: if the previous term was not finished.
        """
        if self.block is not None:
            raise Exception("Called start_term in a block")
        self.text = text
        self.terminfo = base.FileTermInfo()
        if self.spelling:
            self.dawg.insert(text)
        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        """Add one posting to the current term.

        The current block is flushed to the posts file as soon as it holds
        more than ``blocklimit`` postings.
        """
        self.block.add(docnum, weight, valuestring, length)
        if len(self.block) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        """Insert ``text`` into the DAWG, creating the DAWG file if needed.

        ``fieldname`` is accepted but not used here.
        """
        if self.dawg is None:
            self._make_dawg_files()
        self.dawg.insert(text)

    def finish_term(self):
        """Finish the current term and add it to the terms index.

        :raises Exception: if no term is in progress.
        """
        block = self.block
        if block is None:
            raise Exception("Called finish_term when not in a block")

        terminfo = self.terminfo
        # NOTE(review): with the default inlinelimit=1 this requires a truthy
        # block holding fewer than one posting; if block truthiness follows
        # its length the inline branch can never run -- confirm whether "<="
        # was intended before changing it, since it affects what is written.
        if self.blockcount < 1 and block and len(block) < self.inlinelimit:
            # Inline the single block
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            if block:
                # Write the current unfinished block to disk
                self._write_block()

            # Seek back to the start of this list of posting blocks and write
            # the number of blocks. The +4 skips what _start_blocklist wrote
            # before the placeholder (assumes W2Block.magic is 4 bytes --
            # TODO confirm).
            postfile = self.postfile
            postfile.flush()
            here = postfile.tell()
            postfile.seek(self.startoffset + 4)
            postfile.write_uint(self.blockcount)
            postfile.seek(here)

            postings = self.startoffset

        # Mark that no term is in progress
        self.block = None
        terminfo.postings = postings
        self.termsindex.add((self.fieldname, self.text), terminfo)

    def finish_field(self):
        """Finish the current field.

        :raises Exception: if ``start_field`` was not called first.
        """
        if not self._infield:
            raise Exception("Called finish_field before start_field")
        self._infield = False

        # Only close the DAWG field if start_field opened one for this field
        if self._dawgfield:
            self.dawg.finish_field()
            self._dawgfield = False

    def close(self):
        """Close the terms index, the posts file and the DAWG writer (if any).

        Each resource is closed even if closing an earlier one raises, so a
        failure does not leave the remaining files open.
        """
        try:
            self.termsindex.close()
        finally:
            try:
                self.postfile.close()
            finally:
                if self.dawg is not None:
                    self.dawg.close()
class W2FieldWriter(base.FieldWriter):
    """Write the terms and posting blocks of one segment.

    Typical call sequence::

        start_field(fieldname, fieldobj)
            start_term(text)
                add(docnum, weight, valuestring, length)   # repeated
            finish_term()
        finish_field()
        close()

    Term information is added to the terms index file, posting blocks are
    written to the posts file, and a DAWG file is created lazily the first
    time a field (or ``add_spell_word``) needs one.
    """

    def __init__(self, storage, segment, blocklimit=128, compression=3,
                 inlinelimit=1):
        """Create the terms and posts files for ``segment``.

        :param storage: ``Storage`` object passed to ``segment.create_file``.
        :param segment: ``base.Segment`` the files are created for.
        :param blocklimit: a posting block is flushed to the posts file once
            it holds more than this many postings.
        :param compression: value passed as ``compression`` when a block is
            written to the posts file.
        :param inlinelimit: postings are kept inline in the term info only
            when no block has been written for the term and the current block
            holds fewer than this many postings.
        """
        assert isinstance(storage, Storage)
        assert isinstance(segment, base.Segment)
        assert isinstance(blocklimit, int)
        assert isinstance(compression, int)
        assert isinstance(inlinelimit, int)

        self.storage = storage
        self.segment = segment
        self.fieldname = None
        self.text = None
        self.field = None
        self.format = None
        self.spelling = False

        tifile = segment.create_file(storage, W2Codec.TERMS_EXT)
        self.termsindex = TermIndexWriter(tifile)
        self.postfile = segment.create_file(storage, W2Codec.POSTS_EXT)

        # We'll wait to create the DAWG builder until someone actually adds
        # a spelled field
        self.dawg = None

        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit
        self.block = None
        self.terminfo = None
        # True while the current field has an open field in the DAWG writer.
        # Starts False so finish_field() without a spelled field is a no-op.
        self._dawgfield = False

    def _make_dawg_files(self):
        """Create the DAWG file and store a ``GraphWriter`` on ``self.dawg``."""
        dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT)
        self.dawg = GraphWriter(dawgfile)

    def _new_block(self):
        """Return an empty ``W2Block`` sized for the current field format."""
        return W2Block(self.format.posting_size)

    def _reset_block(self):
        """Replace the current block with a fresh, empty one."""
        self.block = self._new_block()

    def _write_block(self):
        """Record the current block in the term info and write it to disk.

        Afterwards a new empty block is started and the block count for the
        current term is incremented.
        """
        self.terminfo.add_block(self.block)
        self.block.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        """Begin a new list of posting blocks in the posts file.

        Remembers the current file offset, writes the block magic, and writes
        a zero placeholder that ``finish_term`` later overwrites with the
        real block count.
        """
        postfile = self.postfile
        self._reset_block()

        # Magic number
        self.startoffset = postfile.tell()
        postfile.write(W2Block.magic)
        # Placeholder for block count
        self.blockcount = 0
        postfile.write_uint(0)

    def start_field(self, fieldname, fieldobj):
        """Start writing terms for the field ``fieldname``.

        :param fieldname: name of the field; used as part of each term key.
        :param fieldobj: field object providing ``format``, ``spelling`` and
            ``separate_spelling()``.
        """
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format

        # Evaluate once; the result is needed for both decisions below
        separate = fieldobj.separate_spelling()
        # Index terms are inserted into the DAWG only when the field is
        # spelled and does not use separate spelling
        self.spelling = fieldobj.spelling and not separate

        # Remember whether this particular field opened a DAWG field, so that
        # finish_field() does not close one that was never started
        self._dawgfield = False
        if self.spelling or separate:
            if self.dawg is None:
                self._make_dawg_files()
            self.dawg.start_field(fieldname)
            self._dawgfield = True

    def start_term(self, text):
        """Start a new term with the given text in the current field.

        :raises Exception: if the previous term was not finished.
        """
        if self.block is not None:
            raise Exception("Called start_term in a block")
        self.text = text
        self.terminfo = base.FileTermInfo()
        if self.spelling:
            self.dawg.insert(text)
        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        """Add one posting to the current term.

        The current block is flushed to the posts file as soon as it holds
        more than ``blocklimit`` postings.
        """
        self.block.add(docnum, weight, valuestring, length)
        if len(self.block) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        """Insert ``text`` into the DAWG, creating the DAWG file if needed.

        ``fieldname`` is accepted but not used here.
        """
        if self.dawg is None:
            self._make_dawg_files()
        self.dawg.insert(text)

    def finish_term(self):
        """Finish the current term and add it to the terms index.

        :raises Exception: if no term is in progress.
        """
        block = self.block
        if block is None:
            raise Exception("Called finish_term when not in a block")

        terminfo = self.terminfo
        # NOTE(review): with the default inlinelimit=1 this requires a truthy
        # block holding fewer than one posting; if block truthiness follows
        # its length the inline branch can never run -- confirm whether "<="
        # was intended before changing it, since it affects what is written.
        if self.blockcount < 1 and block and len(block) < self.inlinelimit:
            # Inline the single block
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            if block:
                # Write the current unfinished block to disk
                self._write_block()

            # Seek back to the start of this list of posting blocks and write
            # the number of blocks. The +4 skips what _start_blocklist wrote
            # before the placeholder (assumes W2Block.magic is 4 bytes --
            # TODO confirm).
            postfile = self.postfile
            postfile.flush()
            here = postfile.tell()
            postfile.seek(self.startoffset + 4)
            postfile.write_uint(self.blockcount)
            postfile.seek(here)

            postings = self.startoffset

        # Mark that no term is in progress
        self.block = None
        terminfo.postings = postings
        self.termsindex.add((self.fieldname, self.text), terminfo)

    def finish_field(self):
        """Finish the current field.

        The DAWG field is closed only when ``start_field`` opened one for
        this field. A DAWG writer that exists because of an earlier field or
        ``add_spell_word`` is left untouched.
        """
        if self._dawgfield:
            self.dawg.finish_field()
            self._dawgfield = False

    def close(self):
        """Close the terms index, the posts file and the DAWG writer (if any).

        Each resource is closed even if closing an earlier one raises, so a
        failure does not leave the remaining files open.
        """
        try:
            self.termsindex.close()
        finally:
            try:
                self.postfile.close()
            finally:
                if self.dawg is not None:
                    self.dawg.close()