def test_termkey():
    """Write ASCII and non-ASCII term keys to a term index and read them back.

    Three (fieldname, text) keys are added to "test.trm" inside a temporary
    storage, the file is reopened with a reader, and each key is checked for
    membership.
    """
    with TempStorage("termkey") as st:
        writer = TermIndexWriter(st.create_file("test.trm"))
        # (term key, weight argument, second FileTermInfo argument)
        entries = [
            (("alfa", u("bravo")), 1.0, 3),
            (("alfa", u('\xc3\xa6\xc3\xaf\xc5\ufffd\xc3\xba')), 4.0, 6),
            (("text", u('\xe6\u2014\xa5\xe6\u0153\xac\xe8\xaa\u017e')), 7.0, 9),
        ]
        for key, weight, count in entries:
            writer.add(key, FileTermInfo(weight, count))
        writer.close()

        reader = TermIndexReader(st.open_file("test.trm"))
        for key, _weight, _count in entries:
            assert key in reader
        reader.close()
def test_termindex():
    """Check that a term index returns its keys in order with matching info.

    Eight (fieldname, text) terms are written with a weight of 1.0 and a
    second FileTermInfo argument equal to the term's position.  The reader
    must yield exactly those keys, in the same order, and ``get()`` must
    return info whose ``weight()`` is 1.0 and whose ``doc_frequency()`` is
    the position.
    """
    terms = [("a", "alfa"), ("a", "bravo"), ("a", "charlie"), ("a", "delta"),
             ("b", "able"), ("b", "baker"), ("b", "dog"), ("b", "easy")]
    st = RamStorage()

    tw = TermIndexWriter(st.create_file("test.trm"))
    for i, t in enumerate(terms):
        tw.add(t, FileTermInfo(1.0, i))
    tw.close()

    tr = TermIndexReader(st.open_file("test.trm"))
    try:
        keys = list(tr.keys())
        # zip() stops at the shorter input, so without this check a reader
        # that yields too few keys (or none) would pass the loop vacuously.
        assert_equal(len(keys), len(terms))
        for i, (t1, t2) in enumerate(zip(keys, terms)):
            assert_equal(t1, t2)
            ti = tr.get(t1)
            assert_equal(ti.weight(), 1.0)
            assert_equal(ti.doc_frequency(), i)
    finally:
        # Close the reader even when an assertion fails.
        tr.close()
def test_random_termkeys():
    """Round-trip 1000 randomly generated term keys through a term index.

    Each key is a (fieldname, token) pair.  The pairs are sorted before
    being added to the writer, then every pair is checked for membership
    in a reader opened on the same file.
    """
    def random_fieldname():
        # 19 characters (xrange(1, 20)) drawn from ASCII codes 65..90, "A"-"Z".
        return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20))

    def random_token():
        # 19 characters with code points in 0..0xD7FF, i.e. below the
        # UTF-16 surrogate range that starts at 0xD800.
        return "".join(unichr(random.randint(0, 0xd7ff))
                       for _ in xrange(1, 20))

    domain = sorted([(random_fieldname(), random_token())
                     for _ in xrange(1000)])

    st = RamStorage()
    tw = TermIndexWriter(st.create_file("test.trm"))
    for term in domain:
        tw.add(term, FileTermInfo(1.0, 1))
    tw.close()

    tr = TermIndexReader(st.open_file("test.trm"))
    try:
        for term in domain:
            assert term in tr
    finally:
        # Close the reader even when a membership assertion fails.
        tr.close()
def __init__(self, ix, poolclass=None, procs=0, blocklimit=128,
             timeout=0.0, delay=0.1, name=None, _l=True, **poolargs):
    """Prepare a writer for a new segment of the index ``ix``.

    :param ix: index object; this method uses its ``lock()``,
        ``_read_toc()``, ``storage`` and ``indexname``.
    :param poolclass: class used to build ``self.pool``.  When None,
        ``MultiPool`` is used if ``procs > 1``, otherwise ``TempfilePool``.
    :param procs: forwarded to the pool as ``procs``; also selects the
        default pool class.
    :param blocklimit: stored on the writer and forwarded to the term
        postings ``FilePostingWriter``.
    :param timeout: forwarded to ``try_for`` when acquiring the write lock.
    :param delay: forwarded to ``try_for`` when acquiring the write lock.
    :param name: name for the new segment.  When falsy, it is taken from
        ``Segment.basename(indexname, segment_number)``.
    :param _l: when true, obtain and acquire the "WRITELOCK" lock.
    :param poolargs: extra keyword arguments forwarded to the pool class.
    :raises LockError: if ``try_for`` reports that the write lock could not
        be acquired.
    """
    # Write lock is optional (callers may pass _l=False); the read lock
    # object is only obtained here, acquire() is not called on it.
    self.writelock = None
    if _l:
        self.writelock = ix.lock("WRITELOCK")
        acquired = try_for(self.writelock.acquire, timeout=timeout,
                           delay=delay)
        if not acquired:
            raise LockError
    self.readlock = ix.lock("READLOCK")

    # Snapshot of the current table of contents.
    toc = ix._read_toc()
    self.schema = toc.schema
    self.segments = toc.segments
    self.storage = ix.storage
    self.indexname = ix.indexname
    self.is_closed = False

    self.blocklimit = blocklimit
    self.segment_number = toc.segment_counter + 1
    self.generation = toc.generation + 1

    # For each existing segment, record the sum of doc_count_all() over
    # the segments that precede it.
    self._doc_offsets = []
    running_total = 0
    for seg in self.segments:
        self._doc_offsets.append(running_total)
        running_total += seg.doc_count_all()

    if name:
        self.name = name
    else:
        self.name = Segment.basename(self.indexname, self.segment_number)

    self.docnum = 0
    self.fieldlength_totals = defaultdict(int)
    self._added = False
    self._unique_cache = {}

    # Throwaway Segment object, used only for its *_filename attributes.
    filenames = Segment(self.name, self.generation, 0, None, None)

    # Terms index and term postings files, combined into the terms writer.
    terms_file = self.storage.create_file(filenames.termsindex_filename)
    terms_index = TermIndexWriter(terms_file)
    posts_file = self.storage.create_file(filenames.termposts_filename)
    posts_writer = FilePostingWriter(posts_file, blocklimit=blocklimit)
    self.termswriter = TermsWriter(self.schema, terms_index, posts_writer)

    # Vector files are created only when the schema has vectored fields.
    if not self.schema.has_vectored_fields():
        self.vectorindex = None
        self.vpostwriter = None
    else:
        vector_file = self.storage.create_file(
            filenames.vectorindex_filename)
        self.vectorindex = TermVectorWriter(vector_file)
        vector_posts_file = self.storage.create_file(
            filenames.vectorposts_filename)
        self.vpostwriter = FilePostingWriter(vector_posts_file,
                                             stringids=True)

    # Stored fields file.
    stored_file = self.storage.create_file(filenames.storedfields_filename)
    self.storedfields = StoredFieldWriter(stored_file,
                                          self.schema.stored_names())

    # Field lengths file.
    self.lengthfile = self.storage.create_file(
        filenames.fieldlengths_filename)

    # Choose a default pool class; MultiPool is imported lazily so the
    # multiproc module is only loaded when it is actually selected.
    if poolclass is None and procs > 1:
        from whoosh.filedb.multiproc import MultiPool
        poolclass = MultiPool
    elif poolclass is None:
        poolclass = TempfilePool
    self.pool = poolclass(self.schema, procs=procs, **poolargs)