Example #1
0
def test_termkey():
    """Round-trip term keys through a term index file.

    Three (fieldname, text) terms, two of them built from escaped
    non-ASCII sequences, are written with ``TermIndexWriter``; the file is
    then reopened with ``TermIndexReader`` and each term must be reported
    as present by the ``in`` operator.
    """
    # Each entry pairs a term with the two arguments given to FileTermInfo.
    entries = [
        (("alfa", u("bravo")), (1.0, 3)),
        (("alfa", u('\xc3\xa6\xc3\xaf\xc5\ufffd\xc3\xba')), (4.0, 6)),
        (("text", u('\xe6\u2014\xa5\xe6\u0153\xac\xe8\xaa\u017e')), (7.0, 9)),
    ]

    with TempStorage("termkey") as st:
        writer = TermIndexWriter(st.create_file("test.trm"))
        for term, info_args in entries:
            writer.add(term, FileTermInfo(*info_args))
        writer.close()

        reader = TermIndexReader(st.open_file("test.trm"))
        for term, _ in entries:
            assert term in reader
        reader.close()
    def start(self, format):
        """Begin a new posting list at the current position of the posting file.

        Stores ``format`` on the writer, zeroes the block counter, creates a
        fresh ``FileTermInfo``, writes the list header (the block class's
        magic number followed by a zero placeholder for the block count) and
        opens a new block.

        :param format: kept as ``self.format`` for later use by the writer.
        :returns: the file offset at which this posting list begins.
        :raises Exception: if a block is already open.
        """
        already_open = self.block is not None
        if already_open:
            raise Exception("Called start() in a block")

        postfile = self.postfile
        self.format, self.blockcount = format, 0
        self.startoffset = postfile.tell()
        self.terminfo = FileTermInfo()

        # Header: the magic number first, then the block count. The count is
        # not known yet, so a zero is written in its place for now.
        postfile.write_int(self.blockclass.magic)
        postfile.write_uint(0)

        self._reset_block()
        return self.startoffset
Example #3
0
def test_block():
    """Exercise a posting block end to end.

    Four postings are appended to a block; its ids, weights and length /
    weight statistics are checked, as are those of a ``FileTermInfo`` the
    block is added to. The block is then written with compression level 3,
    the file is reopened, and the same values are checked on the block
    loaded by ``current.from_file``.
    """
    # (id, weight, length) for each posting; every value string is empty.
    postings = [(0, 1.0, 1), (1, 2.0, 2), (2, 12.0, 6), (5, 6.5, 420)]
    expected_ids = [docid for docid, _, _ in postings]
    expected_weights = [weight for _, weight, _ in postings]

    def check_stats(obj):
        # The same four statistics are expected from the in-memory block,
        # the term info built from it, and the block read back from file.
        assert_equal(obj.min_length(), 1)
        # The longest length (420) is expected back only after a
        # length_to_byte / byte_to_length round trip.
        assert_equal(obj.max_length(), byte_to_length(length_to_byte(420)))
        assert_equal(obj.max_weight(), 12.0)
        assert_equal(obj.max_wol(), 2.0)

    storage = RamStorage()
    postfile = storage.create_file("postfile")

    block = current(postfile, 0)
    for docid, weight, length in postings:
        block.append(docid, weight, '', length)
    assert block

    assert_equal(len(block), 4)
    assert_equal(list(block.ids), expected_ids)
    assert_equal(list(block.weights), expected_weights)
    assert_equal(block.values, None)
    check_stats(block)

    terminfo = FileTermInfo()
    terminfo.add_block(block)
    assert_equal(terminfo.weight(), 21.5)
    assert_equal(terminfo.doc_frequency(), 4)
    check_stats(terminfo)

    block.write(compression=3)
    postfile.close()
    postfile = storage.open_file("postfile")
    reloaded = current.from_file(postfile, 0)

    reloaded.read_ids()
    assert_equal(list(reloaded.ids), expected_ids)
    reloaded.read_weights()
    assert_equal(list(reloaded.weights), expected_weights)
    reloaded.read_values()
    # NOTE(review): this inspects the original in-memory block, not the one
    # just read back -- confirm whether `reloaded.values` was intended.
    assert_equal(block.values, None)
    check_stats(reloaded)
Example #4
0
    def start(self, format):
        """Open a new posting list where the posting file currently stands.

        Remembers ``format``, resets the block counter to zero, attaches a new
        ``FileTermInfo`` and emits the list header: the block class's magic
        number and a zero standing in for the block count. A fresh block is
        then created via ``_reset_block()``.

        :param format: saved on the writer as ``self.format``.
        :returns: the offset in the posting file where this list starts.
        :raises Exception: if called while a block is still open.
        """
        in_block = self.block is not None
        if in_block:
            raise Exception("Called start() in a block")

        out = self.postfile
        self.format, self.blockcount = format, 0
        self.startoffset = out.tell()
        self.terminfo = FileTermInfo()

        # The header is the magic number followed by the block count; the
        # count is unknown at this point, so zero is written as a placeholder.
        out.write_int(self.blockclass.magic)
        out.write_uint(0)

        self._reset_block()
        return self.startoffset
Example #5
0
def test_termindex():
    """Write eight terms to a term index and read them back in order.

    Every term is stored with a ``FileTermInfo(1.0, i)`` where ``i`` is its
    position in ``terms``. The reader must yield exactly those keys, in the
    same order, and ``get()`` must return a term info whose weight is 1.0
    and whose document frequency is ``i``.
    """
    terms = [("a", "alfa"), ("a", "bravo"), ("a", "charlie"), ("a", "delta"),
             ("b", "able"), ("b", "baker"), ("b", "dog"), ("b", "easy")]
    st = RamStorage()

    tw = TermIndexWriter(st.create_file("test.trm"))
    for i, t in enumerate(terms):
        tw.add(t, FileTermInfo(1.0, i))
    tw.close()

    tr = TermIndexReader(st.open_file("test.trm"))
    try:
        keys = list(tr.keys())
        # zip() stops at the shorter input, so without this check a reader
        # that yields too few keys (or none) would pass the loop vacuously.
        assert_equal(len(keys), len(terms))
        for i, (t1, t2) in enumerate(zip(keys, terms)):
            assert_equal(t1, t2)
            ti = tr.get(t1)
            assert_equal(ti.weight(), 1.0)
            assert_equal(ti.doc_frequency(), i)
    finally:
        # Release the reader even when an assertion above fails.
        tr.close()
Example #6
0
def test_random_termkeys():
    """Check that 1000 randomly generated terms survive a term index round trip.

    Random (fieldname, token) pairs are sorted, written with
    ``TermIndexWriter`` and then each one is looked up in a
    ``TermIndexReader`` with the ``in`` operator.
    """
    def random_fieldname():
        # 19 random uppercase ASCII letters (code points 65-90).
        return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20))

    def random_token():
        # 19 random code points in 0..0xd7ff; the upper bound is the last
        # code point before the UTF-16 surrogate range.
        return "".join(
            unichr(random.randint(0, 0xd7ff)) for _ in xrange(1, 20))

    # Terms are handed to the writer in sorted order
    # (presumably required by the writer -- TODO confirm).
    domain = sorted([(random_fieldname(), random_token())
                     for _ in xrange(1000)])

    st = RamStorage()
    tw = TermIndexWriter(st.create_file("test.trm"))
    for term in domain:
        tw.add(term, FileTermInfo(1.0, 1))
    tw.close()

    tr = TermIndexReader(st.open_file("test.trm"))
    try:
        for term in domain:
            assert term in tr
    finally:
        # Release the reader even when a lookup assertion fails.
        tr.close()
Example #7
0
class FilePostingWriter(PostingWriter):
    """Writes posting lists to a file as a sequence of blocks.

    Usage is ``start()``, any number of ``write()`` calls, then ``finish()``
    for each posting list, and finally ``close()``. ``self.block`` is ``None``
    between lists and holds the block currently being filled inside one.
    """

    # Block implementation used to buffer and serialise postings.
    blockclass = postblocks.current

    def __init__(self, postfile, stringids=False, blocklimit=128,
                 compression=3):
        """
        :param postfile: file object the postings are written to.
        :param stringids: passed through to every block that is created.
        :param blocklimit: number of postings after which the current block
            is written out; must be between 1 and 255 inclusive.
        :param compression: passed to ``block.write()`` for every block.
        :raises ValueError: if ``blocklimit`` is outside 1..255.
        """
        self.postfile = postfile
        self.stringids = stringids

        if blocklimit > 255:
            raise ValueError("blocklimit argument must be <= 255")
        elif blocklimit < 1:
            raise ValueError("blocklimit argument must be > 0")
        self.blocklimit = blocklimit
        self.compression = compression
        self.block = None

    def _reset_block(self):
        """Replace the current block with a new, empty one."""
        self.block = self.blockclass(self.postfile, self.format.posting_size,
                                     stringids=self.stringids)

    def start(self, format):
        """Begin a new posting list at the current file position.

        :param format: stored as ``self.format``; its ``posting_size`` is
            used when blocks are created.
        :returns: the file offset at which this posting list starts.
        :raises Exception: if a posting list is already in progress.
        """
        if self.block is not None:
            raise Exception("Called start() in a block")

        self.format = format
        self.blockcount = 0
        self.startoffset = self.postfile.tell()
        self.terminfo = FileTermInfo()

        # Magic number
        self.postfile.write_int(self.blockclass.magic)
        # Placeholder for block count; finish() seeks back and fills it in
        self.postfile.write_uint(0)

        self._reset_block()
        return self.startoffset

    def write(self, id, weight, valuestring, dfl):
        """Append one posting to the current block.

        The block is written out as soon as it holds ``blocklimit`` postings.
        The four arguments are passed unchanged to ``block.append()``.
        """
        self.block.append(id, weight, valuestring, dfl)
        if len(self.block) >= self.blocklimit:
            self._write_block()

    def finish(self, inlinelimit=1):
        """End the current posting list and return its term info.

        If no block has been written yet and the current block is non-empty
        with at most ``inlinelimit`` postings, the postings are not written
        as a block; they are returned inline as an
        ``(ids, weights, values)`` tuple. Otherwise any remaining postings
        are written as a final block, the block-count placeholder is filled
        in, and the list's start offset is used instead.

        :param inlinelimit: largest number of postings stored inline.
        :returns: the term info for this list, with ``postings`` set to
            either the inline tuple or the start offset.
        :raises Exception: if no posting list is in progress.
        """
        assert isinstance(inlinelimit, integer_types)
        if self.block is None:
            raise Exception("Called finish() when not in a block")

        block = self.block
        terminfo = self.terminfo

        if self.blockcount < 1 and block and len(block) <= inlinelimit:
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            if block:
                self._write_block()

            # Seek back to the start of this list of posting blocks and write
            # the number of blocks. The count sits right after the magic
            # number (assumes _INT_SIZE is the width of that int -- confirm).
            pf = self.postfile
            pf.flush()
            offset = pf.tell()
            pf.seek(self.startoffset + _INT_SIZE)
            pf.write_uint(self.blockcount)
            pf.seek(offset)
            postings = self.startoffset

        self.block = None

        terminfo.postings = postings
        return terminfo

    def close(self):
        """Close the posting file.

        :raises Exception: if a posting list was started but not finished.
        """
        # Compare against None instead of relying on truthiness: finish()
        # treats an empty block as falsy, so a truthiness test would let
        # close() succeed inside an unfinished list whenever the current
        # block happens to be empty (e.g. right after a full block was
        # flushed), leaving the block-count placeholder unwritten.
        if self.block is not None:
            raise Exception("Closed posting writer without finishing")
        self.postfile.close()

    def block_stats(self):
        """Return ``stats()`` of the block currently being filled."""
        return self.block.stats()

    def _write_block(self):
        """Write the current block, record it, and start a new one."""
        self.block.write(compression=self.compression)
        self.terminfo.add_block(self.block)
        self._reset_block()
        self.blockcount += 1
class FilePostingWriter(PostingWriter):
    """Writes posting lists to a file as a sequence of blocks.

    Each posting list is produced by calling ``start()``, then ``write()``
    once per posting, then ``finish()``; ``close()`` is called when all lists
    are done. ``self.block`` is ``None`` outside a list and holds the block
    being filled while one is in progress.
    """

    # Block implementation used to buffer and serialise postings.
    blockclass = postblocks.current

    def __init__(self,
                 postfile,
                 stringids=False,
                 blocklimit=128,
                 compression=3):
        """
        :param postfile: file object the postings are written to.
        :param stringids: passed through to every block that is created.
        :param blocklimit: number of postings after which the current block
            is written out; must be between 1 and 255 inclusive.
        :param compression: passed to ``block.write()`` for every block.
        :raises ValueError: if ``blocklimit`` is outside 1..255.
        """
        self.postfile = postfile
        self.stringids = stringids

        if blocklimit > 255:
            raise ValueError("blocklimit argument must be <= 255")
        elif blocklimit < 1:
            raise ValueError("blocklimit argument must be > 0")
        self.blocklimit = blocklimit
        self.compression = compression
        self.block = None

    def _reset_block(self):
        """Replace the current block with a new, empty one."""
        self.block = self.blockclass(self.postfile,
                                     self.format.posting_size,
                                     stringids=self.stringids)

    def start(self, format):
        """Begin a new posting list at the current file position.

        :param format: stored as ``self.format``; its ``posting_size`` is
            used when blocks are created.
        :returns: the file offset at which this posting list starts.
        :raises Exception: if a posting list is already in progress.
        """
        if self.block is not None:
            raise Exception("Called start() in a block")

        self.format = format
        self.blockcount = 0
        self.startoffset = self.postfile.tell()
        self.terminfo = FileTermInfo()

        # Magic number
        self.postfile.write_int(self.blockclass.magic)
        # Placeholder for block count; finish() seeks back and fills it in
        self.postfile.write_uint(0)

        self._reset_block()
        return self.startoffset

    def write(self, id, weight, valuestring, dfl):
        """Append one posting to the current block.

        The block is written out as soon as it holds ``blocklimit`` postings.
        The four arguments are passed unchanged to ``block.append()``.
        """
        self.block.append(id, weight, valuestring, dfl)
        if len(self.block) >= self.blocklimit:
            self._write_block()

    def finish(self, inlinelimit=1):
        """End the current posting list and return its term info.

        If no block has been written yet and the current block is non-empty
        with at most ``inlinelimit`` postings, the postings are not written
        as a block; they are returned inline as an
        ``(ids, weights, values)`` tuple. Otherwise any remaining postings
        are written as a final block, the block-count placeholder is filled
        in, and the list's start offset is used instead.

        :param inlinelimit: largest number of postings stored inline.
        :returns: the term info for this list, with ``postings`` set to
            either the inline tuple or the start offset.
        :raises Exception: if no posting list is in progress.
        """
        assert isinstance(inlinelimit, integer_types)
        if self.block is None:
            raise Exception("Called finish() when not in a block")

        block = self.block
        terminfo = self.terminfo

        if self.blockcount < 1 and block and len(block) <= inlinelimit:
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            if block:
                self._write_block()

            # Seek back to the start of this list of posting blocks and write
            # the number of blocks. The count sits right after the magic
            # number (assumes _INT_SIZE is the width of that int -- confirm).
            pf = self.postfile
            pf.flush()
            offset = pf.tell()
            pf.seek(self.startoffset + _INT_SIZE)
            pf.write_uint(self.blockcount)
            pf.seek(offset)
            postings = self.startoffset

        self.block = None

        terminfo.postings = postings
        return terminfo

    def close(self):
        """Close the posting file.

        :raises Exception: if a posting list was started but not finished.
        """
        # Compare against None instead of relying on truthiness: finish()
        # treats an empty block as falsy, so a truthiness test would let
        # close() succeed inside an unfinished list whenever the current
        # block happens to be empty (e.g. right after a full block was
        # flushed), leaving the block-count placeholder unwritten.
        if self.block is not None:
            raise Exception("Closed posting writer without finishing")
        self.postfile.close()

    def block_stats(self):
        """Return ``stats()`` of the block currently being filled."""
        return self.block.stats()

    def _write_block(self):
        """Write the current block, record it, and start a new one."""
        self.block.write(compression=self.compression)
        self.terminfo.add_block(self.block)
        self._reset_block()
        self.blockcount += 1