def test_multisegment():
    """Spelling suggestions and word graphs must work across multiple
    segments, and survive optimizing down to a single segment."""
    schema = fields.Schema(text=fields.TEXT(spelling=True))
    ix = RamStorage().create_index(schema)
    domain = u("special specious spectacular spongy spring specials").split()

    # Commit each word in its own writer without merging so the index
    # ends up with one segment per document.
    for word in domain:
        writer = ix.writer()
        writer.add_document(text=word)
        writer.commit(merge=False)

    with ix.reader() as reader:
        # Multiple segments -> the reader is a multi-reader
        assert not reader.is_atomic()
        flat = list(dawg.flatten(reader.word_graph("text")))
        assert_equal(flat, sorted(domain))
        corrector = reader.corrector("text")
        assert_equal(corrector.suggest("specail", maxdist=2),
                     ["special", "specials"])

    # Collapse everything into a single segment and re-check
    ix.optimize()
    with ix.reader() as reader:
        assert reader.is_atomic()
        assert_equal(list(reader.lexicon("text")), sorted(domain))
        assert reader.has_word_graph("text")
        flat = list(dawg.flatten(reader.word_graph("text")))
        assert_equal(flat, sorted(domain))
        corrector = reader.corrector("text")
        assert_equal(corrector.suggest("specail", maxdist=2),
                     ["special", "specials"])
def test_dawg():
    """A DawgBuilder with field_root=True should store words under a
    per-field root edge and flatten back to the inserted words."""
    from whoosh.support.dawg import DawgBuilder

    with TempStorage() as storage:
        dawgfile = storage.create_file("test.dawg")
        builder = DawgBuilder(field_root=True)
        # Keys are sequences: field name first, then the word's characters
        builder.insert(["test"] + list("special"))
        builder.insert(["test"] + list("specials"))
        builder.write(dawgfile)
        result = list(dawg.flatten(builder.root.edge("test")))
        assert_equal(result, ["special", "specials"])
def test_bypass_stemming():
    """With a stemming analyzer and spelling=True, the lexicon holds the
    stemmed terms while the word graph keeps the original surface words."""
    from whoosh.support.dawg import flatten

    analyzer = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=analyzer, spelling=True))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(text=u("rendering shading modeling reactions"))
    writer.commit()

    with ix.reader() as reader:
        # Indexed terms are stemmed...
        assert_equal(list(reader.lexicon("text")),
                     ["model", "reaction", "render", "shade"])
        # ...but the spelling graph bypasses the stemmer
        assert_equal(list(flatten(reader.word_graph("text"))),
                     ["modeling", "reactions", "rendering", "shading"])
def add_reader(self, reader):
    """Merge the documents of an existing reader (segment) into this
    writer: stored fields, field lengths, vectors, spelling word graphs,
    and postings are copied, skipping deleted documents and remapping
    document numbers onto this writer's numbering.
    """
    self._check_state()
    first_new_doc = self.docnum

    has_deletions = reader.has_deletions()
    # Maps the reader's document numbers to our new numbers; only
    # consulted when the reader has deletions (otherwise the mapping is
    # a simple offset by first_new_doc).
    docmap = {}

    known_fields = set(self.schema.names())

    # Copy stored documents, field lengths, and term vectors
    for rdocnum in reader.all_doc_ids():
        if has_deletions and reader.is_deleted(rdocnum):
            continue

        stored = {fname: value
                  for fname, value
                  in iteritems(reader.stored_fields(rdocnum))
                  if fname in known_fields}
        # A dictionary must be appended for every document, even if it
        # is empty.
        self.storedfields.append(stored)

        if has_deletions:
            docmap[rdocnum] = self.docnum

        for fname in reader.schema.scorable_names():
            length = reader.doc_field_length(rdocnum, fname)
            if length and fname in known_fields:
                self.pool.add_field_length(self.docnum, fname, length)

        for fname in reader.schema.vector_names():
            if fname in known_fields and reader.has_vector(rdocnum, fname):
                vreader = reader.vector(rdocnum, fname)
                self._add_vector_reader(self.docnum, fname, vreader)

        self.docnum += 1

    # Merge dawg contents into the word sets of fields that keep their
    # spelling words separately from the indexed terms
    for fname in self.schema.separate_spelling_names():
        if reader.has_word_graph(fname):
            self.add_spell_words(fname, flatten(reader.word_graph(fname)))

    # Copy postings, remapping each document number
    for fname, text in reader.all_terms():
        if fname not in known_fields:
            continue
        postreader = reader.postings(fname, text)
        while postreader.is_active():
            rdocnum = postreader.id()
            value = postreader.value()
            newdoc = (docmap[rdocnum] if has_deletions
                      else first_new_doc + rdocnum)
            self.pool.add_posting(fname, text, newdoc,
                                  postreader.weight(), value)
            postreader.next()

    self._added = True