def test_termkey():
    with TempStorage("termkey") as st:
        tw = TermIndexWriter(st.create_file("test.trm"))
        tw.add(("alfa", u("bravo")), FileTermInfo(1.0, 3))
        tw.add(("alfa", u('\xc3\xa6\xc3\xaf\xc5\ufffd\xc3\xba')), FileTermInfo(4.0, 6))
        tw.add(("text", u('\xe6\u2014\xa5\xe6\u0153\xac\xe8\xaa\u017e')), FileTermInfo(7.0, 9))
        tw.close()

        tr = TermIndexReader(st.open_file("test.trm"))
        assert ("alfa", u("bravo")) in tr
        assert ("alfa", u('\xc3\xa6\xc3\xaf\xc5\ufffd\xc3\xba')) in tr
        assert ("text", u('\xe6\u2014\xa5\xe6\u0153\xac\xe8\xaa\u017e')) in tr
        tr.close()
def test_block():
    st = RamStorage()
    f = st.create_file("postfile")

    b = current(f, 0)
    b.append(0, 1.0, '', 1)
    b.append(1, 2.0, '', 2)
    b.append(2, 12.0, '', 6)
    b.append(5, 6.5, '', 420)

    assert b
    assert_equal(len(b), 4)
    assert_equal(list(b.ids), [0, 1, 2, 5])
    assert_equal(list(b.weights), [1.0, 2.0, 12.0, 6.5])
    assert_equal(b.values, None)
    assert_equal(b.min_length(), 1)
    assert_equal(b.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(b.max_weight(), 12.0)
    assert_equal(b.max_wol(), 2.0)

    ti = FileTermInfo()
    ti.add_block(b)
    assert_equal(ti.weight(), 21.5)
    assert_equal(ti.doc_frequency(), 4)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(ti.max_weight(), 12.0)
    assert_equal(ti.max_wol(), 2.0)

    b.write(compression=3)
    f.close()

    f = st.open_file("postfile")
    bb = current.from_file(f, 0)

    bb.read_ids()
    assert_equal(list(bb.ids), [0, 1, 2, 5])
    bb.read_weights()
    assert_equal(list(bb.weights), [1.0, 2.0, 12.0, 6.5])
    bb.read_values()
    assert_equal(bb.values, None)

    assert_equal(bb.min_length(), 1)
    assert_equal(bb.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(bb.max_weight(), 12.0)
    assert_equal(bb.max_wol(), 2.0)
def test_termindex():
    terms = [("a", "alfa"), ("a", "bravo"), ("a", "charlie"), ("a", "delta"),
             ("b", "able"), ("b", "baker"), ("b", "dog"), ("b", "easy")]
    st = RamStorage()

    tw = TermIndexWriter(st.create_file("test.trm"))
    for i, t in enumerate(terms):
        tw.add(t, FileTermInfo(1.0, i))
    tw.close()

    tr = TermIndexReader(st.open_file("test.trm"))
    for i, (t1, t2) in enumerate(zip(tr.keys(), terms)):
        assert_equal(t1, t2)
        ti = tr.get(t1)
        assert_equal(ti.weight(), 1.0)
        assert_equal(ti.doc_frequency(), i)
def test_random_termkeys():
    def random_fieldname():
        return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20))

    def random_token():
        return "".join(unichr(random.randint(0, 0xd7ff))
                       for _ in xrange(1, 20))

    domain = sorted([(random_fieldname(), random_token())
                     for _ in xrange(1000)])

    st = RamStorage()
    tw = TermIndexWriter(st.create_file("test.trm"))
    for term in domain:
        tw.add(term, FileTermInfo(1.0, 1))
    tw.close()

    tr = TermIndexReader(st.open_file("test.trm"))
    for term in domain:
        assert term in tr
class FilePostingWriter(PostingWriter):
    blockclass = postblocks.current

    def __init__(self, postfile, stringids=False, blocklimit=128,
                 compression=3):
        self.postfile = postfile
        self.stringids = stringids

        if blocklimit > 255:
            raise ValueError("blocklimit argument must be <= 255")
        elif blocklimit < 1:
            raise ValueError("blocklimit argument must be > 0")
        self.blocklimit = blocklimit
        self.compression = compression
        self.block = None

    def _reset_block(self):
        self.block = self.blockclass(self.postfile, self.format.posting_size,
                                     stringids=self.stringids)

    def start(self, format):
        if self.block is not None:
            raise Exception("Called start() in a block")

        self.format = format
        self.blockcount = 0
        self.startoffset = self.postfile.tell()
        self.terminfo = FileTermInfo()

        # Magic number
        self.postfile.write_int(self.blockclass.magic)
        # Placeholder for block count
        self.postfile.write_uint(0)

        self._reset_block()
        return self.startoffset

    def write(self, id, weight, valuestring, dfl):
        self.block.append(id, weight, valuestring, dfl)
        if len(self.block) >= self.blocklimit:
            self._write_block()

    def finish(self, inlinelimit=1):
        assert isinstance(inlinelimit, integer_types)
        if self.block is None:
            raise Exception("Called finish() when not in a block")

        block = self.block
        terminfo = self.terminfo

        if self.blockcount < 1 and block and len(block) <= inlinelimit:
            # A single small block: keep the postings inline in the term info
            # instead of pointing at a position in the posting file
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            if block:
                # Flush the remaining partial block
                self._write_block()

            # Seek back to the start of this list of posting blocks and write
            # the number of blocks
            pf = self.postfile
            pf.flush()
            offset = pf.tell()
            pf.seek(self.startoffset + _INT_SIZE)
            pf.write_uint(self.blockcount)
            pf.seek(offset)

            postings = self.startoffset

        self.block = None
        terminfo.postings = postings
        return terminfo

    def close(self):
        if self.block:
            raise Exception("Closed posting writer without finishing")
        self.postfile.close()

    def block_stats(self):
        return self.block.stats()

    def _write_block(self):
        self.block.write(compression=self.compression)
        self.terminfo.add_block(self.block)
        self._reset_block()
        self.blockcount += 1
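
# Minimal illustrative sketch (not part of the module) of how FilePostingWriter
# is driven for one term's posting list: start() a list, write() postings,
# finish() to get a FileTermInfo, then close(). The function name, the
# _FakeFormat stand-in (only the posting_size attribute the writer reads), and
# the RamStorage import path are assumptions made for this example; a real
# caller would pass one of the library's format objects instead.
def _example_posting_writer_usage():
    from whoosh.filedb.filestore import RamStorage  # assumed import path

    class _FakeFormat(object):
        # Hypothetical stand-in: posting_size = 0 means "no value strings",
        # matching the blocks built with '' values in the tests above.
        posting_size = 0

    st = RamStorage()
    postfile = st.create_file("postfile")
    fpw = FilePostingWriter(postfile, blocklimit=128, compression=3)

    offset = fpw.start(_FakeFormat())  # begin a posting list, keep its offset
    fpw.write(0, 1.0, '', 1)           # (docnum, weight, valuestring, field length)
    fpw.write(5, 2.0, '', 4)
    terminfo = fpw.finish()            # FileTermInfo; postings are either inlined
                                       # in it or referenced via `offset`
    fpw.close()
    return offset, terminfo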