Ejemplo n.º 1
0
 def _make_dawg_files(self):
     dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT)
     self.dawg = GraphWriter(dawgfile)
Ejemplo n.º 2
0
 def _make_dawg_files(self):
     dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT)
     self.dawg = GraphWriter(dawgfile)
Ejemplo n.º 3
0
class W2FieldWriter(base.FieldWriter):
    def __init__(self, storage, segment, blocklimit=128, compression=3,
                 inlinelimit=1):
        assert isinstance(storage, Storage)
        assert isinstance(segment, base.Segment)
        assert isinstance(blocklimit, int)
        assert isinstance(compression, int)
        assert isinstance(inlinelimit, int)

        self.storage = storage
        self.segment = segment
        self.fieldname = None
        self.text = None
        self.field = None
        self.format = None
        self.spelling = False

        tifile = segment.create_file(storage, W2Codec.TERMS_EXT)
        self.termsindex = TermIndexWriter(tifile)
        self.postfile = segment.create_file(storage, W2Codec.POSTS_EXT)

        # We'll wait to create the DAWG builder until someone actually adds
        # a spelled field
        self.dawg = None

        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit
        self.block = None
        self.terminfo = None
        self._infield = False

    def _make_dawg_files(self):
        dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT)
        self.dawg = GraphWriter(dawgfile)

    def _new_block(self):
        return W2Block(self.format.posting_size)

    def _reset_block(self):
        self.block = self._new_block()

    def _write_block(self):
        self.terminfo.add_block(self.block)
        self.block.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        postfile = self.postfile
        self._reset_block()

        # Magic number
        self.startoffset = postfile.tell()
        postfile.write(W2Block.magic)
        # Placeholder for block count
        self.blockcount = 0
        postfile.write_uint(0)

    def start_field(self, fieldname, fieldobj):
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format
        self.spelling = fieldobj.spelling and not fieldobj.separate_spelling()
        self._dawgfield = False
        if self.spelling or fieldobj.separate_spelling():
            if self.dawg is None:
                self._make_dawg_files()
            self.dawg.start_field(fieldname)
            self._dawgfield = True
        self._infield = True

    def start_term(self, text):
        if self.block is not None:
            raise Exception("Called start_term in a block")
        self.text = text
        self.terminfo = base.FileTermInfo()
        if self.spelling:
            self.dawg.insert(text)
        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        self.block.add(docnum, weight, valuestring, length)
        if len(self.block) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        if self.dawg is None:
            self._make_dawg_files()
        self.dawg.insert(text)

    def finish_term(self):
        block = self.block
        if block is None:
            raise Exception("Called finish_term when not in a block")

        terminfo = self.terminfo
        if self.blockcount < 1 and block and len(block) < self.inlinelimit:
            # Inline the single block
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            if block:
                # Write the current unfinished block to disk
                self._write_block()

            # Seek back to the start of this list of posting blocks and write
            # the number of blocks
            postfile = self.postfile
            postfile.flush()
            here = postfile.tell()
            postfile.seek(self.startoffset + 4)
            postfile.write_uint(self.blockcount)
            postfile.seek(here)

            self.block = None
            postings = self.startoffset

        self.block = None
        terminfo.postings = postings
        self.termsindex.add((self.fieldname, self.text), terminfo)

    def finish_field(self):
        if not self._infield:
            raise Exception("Called finish_field before start_field")
        self._infield = False

        if self._dawgfield:
            self.dawg.finish_field()
            self._dawgfield = False

    def close(self):
        self.termsindex.close()
        self.postfile.close()
        if self.dawg is not None:
            self.dawg.close()
Ejemplo n.º 4
0
class W2FieldWriter(base.FieldWriter):
    def __init__(self,
                 storage,
                 segment,
                 blocklimit=128,
                 compression=3,
                 inlinelimit=1):
        assert isinstance(storage, Storage)
        assert isinstance(segment, base.Segment)
        assert isinstance(blocklimit, int)
        assert isinstance(compression, int)
        assert isinstance(inlinelimit, int)

        self.storage = storage
        self.segment = segment
        self.fieldname = None
        self.text = None
        self.field = None
        self.format = None
        self.spelling = False

        tifile = segment.create_file(storage, W2Codec.TERMS_EXT)
        self.termsindex = TermIndexWriter(tifile)
        self.postfile = segment.create_file(storage, W2Codec.POSTS_EXT)

        # We'll wait to create the DAWG builder until someone actually adds
        # a spelled field
        self.dawg = None

        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit
        self.block = None
        self.terminfo = None

    def _make_dawg_files(self):
        dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT)
        self.dawg = GraphWriter(dawgfile)

    def _new_block(self):
        return W2Block(self.format.posting_size)

    def _reset_block(self):
        self.block = self._new_block()

    def _write_block(self):
        self.terminfo.add_block(self.block)
        self.block.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        postfile = self.postfile
        self._reset_block()

        # Magic number
        self.startoffset = postfile.tell()
        postfile.write(W2Block.magic)
        # Placeholder for block count
        self.blockcount = 0
        postfile.write_uint(0)

    def start_field(self, fieldname, fieldobj):
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format
        self.spelling = fieldobj.spelling and not fieldobj.separate_spelling()
        if self.spelling or fieldobj.separate_spelling():
            if self.dawg is None:
                self._make_dawg_files()
            self.dawg.start_field(fieldname)

    def start_term(self, text):
        if self.block is not None:
            raise Exception("Called start_term in a block")
        self.text = text
        self.terminfo = base.FileTermInfo()
        if self.spelling:
            self.dawg.insert(text)
        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        self.block.add(docnum, weight, valuestring, length)
        if len(self.block) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        if self.dawg is None:
            self._make_dawg_files()
        self.dawg.insert(text)

    def finish_term(self):
        block = self.block
        if block is None:
            raise Exception("Called finish_term when not in a block")

        terminfo = self.terminfo
        if self.blockcount < 1 and block and len(block) < self.inlinelimit:
            # Inline the single block
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            if block:
                # Write the current unfinished block to disk
                self._write_block()

            # Seek back to the start of this list of posting blocks and write
            # the number of blocks
            postfile = self.postfile
            postfile.flush()
            here = postfile.tell()
            postfile.seek(self.startoffset + 4)
            postfile.write_uint(self.blockcount)
            postfile.seek(here)

            self.block = None
            postings = self.startoffset

        self.block = None
        terminfo.postings = postings
        self.termsindex.add((self.fieldname, self.text), terminfo)

    def finish_field(self):
        if self.dawg:
            self.dawg.finish_field()

    def close(self):
        self.termsindex.close()
        self.postfile.close()
        if self.dawg is not None:
            self.dawg.close()