def test_keyterms():
    """The top key terms extracted for document /a come back in the
    expected order."""
    ix = create_index()
    with ix.searcher() as s:
        target = s.document_number(path="/a")
        terms = [t[0] for t in s.key_terms([target], "content", numterms=3)]
        assert terms == [u("collision"), u("calculations"), u("damped")]
def __unicode__(self):
    """Render this query as ``DisMax(<sorted subqueries>)``, with a
    ``~tiebreak`` suffix when a tiebreak factor is set."""
    pieces = [u("DisMax(")]
    pieces.append(" ".join(sorted(text_type(sq) for sq in self.subqueries)))
    pieces.append(u(")"))
    if self.tiebreak:
        pieces.append(u("~") + text_type(self.tiebreak))
    return u("").join(pieces)
def test_andnot():
    """The ANDNOT operator binds its immediate neighbours and groups an
    explicit AND expression on its left-hand side."""
    qp = default.QueryParser("content", None)

    parsed = qp.parse(u("this ANDNOT that"))
    assert parsed.__class__ == query.AndNot
    assert parsed.a.__class__ == query.Term
    assert parsed.b.__class__ == query.Term
    assert parsed.a.text == "this"
    assert parsed.b.text == "that"

    # ANDNOT only grabs its neighbours; the trailing word is ANDed on.
    parsed = qp.parse(u("foo ANDNOT bar baz"))
    assert parsed.__class__ == query.And
    assert len(parsed) == 2
    classes = [parsed[i].__class__ for i in range(len(parsed))]
    assert classes == [query.AndNot, query.Term]

    parsed = qp.parse(u("foo fie ANDNOT bar baz"))
    assert parsed.__class__ == query.And
    assert len(parsed) == 3
    classes = [parsed[i].__class__ for i in range(len(parsed))]
    assert classes == [query.Term, query.AndNot, query.Term]

    # An explicit AND group becomes the positive side of the ANDNOT.
    parsed = qp.parse(u("a AND b ANDNOT c"))
    assert parsed.__class__ == query.AndNot
    assert text_type(parsed) == "((content:a AND content:b) ANDNOT content:c)"
def test_phrase_andmaybe():
    """A quoted phrase on the right of ANDMAYBE parses as a Phrase query."""
    qp = default.QueryParser("f", None)
    parsed = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"'))
    assert isinstance(parsed, query.AndMaybe)
    expected = [query.Term("f", u("Dahmen")),
                query.Phrase("f", [u("Besov"), u("Spaces")])]
    assert parsed[0] == expected[0]
    assert parsed[1] == expected[1]
def test_compatibility(): from whoosh.scoring import Weighting # This is the old way of doing a custom weighting model, check that # it's still supported... class LegacyWeighting(Weighting): use_final = True def score(self, searcher, fieldname, text, docnum, weight): return weight + 0.5 def final(self, searcher, docnum, score): return score * 1.5 schema = fields.Schema(text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() domain = "alfa bravo charlie delta".split() for ls in permutations(domain, 3): w.add_document(text=u(" ").join(ls)) w.commit() s = ix.searcher(weighting=LegacyWeighting()) r = s.search(query.Term("text", u("bravo"))) assert r.score(0) == 2.25
def __unicode__(self):
    """Join the subqueries with this query's JOINT separator inside
    parentheses, adding a ``>n`` minimum-match suffix when one is set."""
    inner = (self.JOINT).join([text_type(s) for s in self.subqueries])
    result = u("(") + inner + u(")")
    if self.minmatch:
        result += u(">%s") % self.minmatch
    return result
def test_andnot():
    """ANDNOT parsing: binding, grouping, and string rendering.

    Converted from the deprecated nose-style ``assert_equal`` to plain
    ``assert`` statements, consistent with the rest of the suite.
    """
    qp = default.QueryParser("content", None)

    q = qp.parse(u("this ANDNOT that"))
    assert q.__class__ == query.AndNot
    assert q.a.__class__ == query.Term
    assert q.b.__class__ == query.Term
    assert q.a.text == "this"
    assert q.b.text == "that"

    # ANDNOT only binds its immediate neighbours.
    q = qp.parse(u("foo ANDNOT bar baz"))
    assert q.__class__ == query.And
    assert len(q) == 2
    assert q[0].__class__ == query.AndNot
    assert q[1].__class__ == query.Term

    q = qp.parse(u("foo fie ANDNOT bar baz"))
    assert q.__class__ == query.And
    assert len(q) == 3
    assert q[0].__class__ == query.Term
    assert q[1].__class__ == query.AndNot
    assert q[2].__class__ == query.Term

    # An explicit AND group forms the positive side of the ANDNOT.
    q = qp.parse(u("a AND b ANDNOT c"))
    assert q.__class__ == query.AndNot
    assert text_type(q) == "((content:a AND content:b) ANDNOT content:c)"
def test_deleteall():
    # Deleting every document one by one, then optimizing, empties the index.
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "deleteall") as ix:
        w = ix.writer()
        domain = u("alfa bravo charlie delta echo").split()
        for i, ls in enumerate(permutations(domain)):
            w.add_document(text=u(" ").join(ls))
            # Commit every tenth document so the data spans segments.
            if not i % 10:
                w.commit()
                w = ix.writer()
        w.commit()

        # This is just a test, don't use this method to delete all docs IRL!
        doccount = ix.doc_count_all()
        w = ix.writer()
        for docnum in xrange(doccount):
            w.delete_document(docnum)
        w.commit()

        with ix.searcher() as s:
            r = s.search(query.Or([query.Term("text", u("alfa")),
                                   query.Term("text", u("bravo"))]))
            assert len(r) == 0

        # Optimizing physically removes the deleted documents.
        ix.optimize()
        assert ix.doc_count_all() == 0

        with ix.reader() as r:
            assert list(r) == []
def test_boolean_strings():
    """BOOLEAN fields accept both str and unicode spellings of true/false,
    and the parser understands true/True/t and false/False/f."""
    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(i=0, b="true")
        w.add_document(i=1, b="True")
        w.add_document(i=2, b="false")
        w.add_document(i=3, b="False")
        w.add_document(i=4, b=u("true"))
        w.add_document(i=5, b=u("True"))
        w.add_document(i=6, b=u("false"))
        w.add_document(i=7, b=u("False"))

    with ix.searcher() as s:
        qp = qparser.QueryParser("b", ix.schema)

        def check(qs, nums):
            hits = s.search(qp.parse(qs), limit=None)
            assert [hit["i"] for hit in hits] == nums

        trues = [0, 1, 4, 5]
        falses = [2, 3, 6, 7]
        for spelling in ("true", "True", "t"):
            check(spelling, trues)
        for spelling in ("false", "False", "f"):
            check(spelling, falses)
def test_requires():
    """requires() reports which terms must match for each query type."""
    a = Term("f", u("a"))
    b = Term("f", u("b"))
    # And requires all of its subqueries; Or requires none of them.
    assert set([a, b]) == And([a, b]).requires()
    assert set() == Or([a, b]).requires()
    # AndMaybe requires only its mandatory side; a bare Term requires itself.
    assert set([a]) == AndMaybe(a, b).requires()
    assert set([a]) == a.requires()
def test_colonspace():
    """How ``field:`` parses with and without a following space, and when
    the field name is unknown.

    Converted from nose-style ``assert_equal`` to plain asserts.
    """
    s = fields.Schema(content=fields.TEXT, url=fields.ID)
    qp = default.QueryParser("content", s)

    # "url:test" is a single term in the url field.
    q = qp.parse(u("url:test"))
    assert q.__class__ == query.Term
    assert q.fieldname == "url"
    assert q.text == "test"

    # A space after the colon breaks the field syntax: two content terms.
    q = qp.parse(u("url: test"))
    assert q.__class__ == query.And
    assert q[0].__class__ == query.Term
    assert q[1].__class__ == query.Term
    assert q[0].fieldname == "content"
    assert q[1].fieldname == "content"
    assert q[0].text == "url"
    assert q[1].text == "test"

    # A bare trailing colon on a known field falls back to a content term.
    q = qp.parse(u("url:"))
    assert q.__class__ == query.Term
    assert q.fieldname == "content"
    assert q.text == "url"

    # With an unknown field name, the colon stays part of the term text.
    s = fields.Schema(foo=fields.KEYWORD)
    qp = default.QueryParser("foo", s)
    q = qp.parse(u("blah:"))
    assert q.__class__ == query.Term
    assert q.fieldname == "foo"
    assert q.text == "blah:"
def test_datetime():
    # DATETIME fields: exact-day terms, year-month prefixes, explicit ranges.
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    # One document per day at 14:00 for months 1-11, days 1-27.
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        # A full yyyymmdd date matches the single document for that day.
        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        # A year-month prefix matches every indexed day in that month.
        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        # An explicit range parses to a NumericRange over the full span.
        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)
def test_boolean():
    """BOOLEAN fields index Python bools and answer true/yes/false/no
    query spellings."""
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for name, flag in [(u("a"), True), (u("b"), False), (u("c"), True),
                       (u("d"), False), (u("e"), True)]:
        w.add_document(id=name, done=flag)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        # Both "true" and "yes" find the True documents.
        for qs in ("done:true", "done:yes"):
            r = s.search(qp.parse(qs))
            assert sorted(d["id"] for d in r) == ["a", "c", "e"]
            assert all(d["done"] for d in r)

        # "false" parses to a Term whose text is the Python value False.
        q = qp.parse("done:false")
        assert q.__class__ == query.Term
        assert q.text is False
        assert schema["done"].to_bytes(False) == b("f")
        r = s.search(q)
        assert sorted(d["id"] for d in r) == ["b", "d"]
        assert not any(d["done"] for d in r)

        r = s.search(qp.parse("done:no"))
        assert sorted(d["id"] for d in r) == ["b", "d"]
        assert not any(d["done"] for d in r)
def test_substitution():
    """SubstitutionFilter rewrites token text with literal and regex
    patterns.  Converted from assert_equal to plain asserts.
    """
    # Literal replacement: strip hyphens from tokens.
    mf = analysis.RegexTokenizer(r"\S+") | analysis.SubstitutionFilter("-", "")
    assert ([t.text for t in mf(u("one-two th-re-ee four"))]
            == ["onetwo", "threee", "four"])

    # Regex replacement with group references: swap the halves around "=".
    mf = (analysis.RegexTokenizer(r"\S+")
          | analysis.SubstitutionFilter("([^=]*)=(.*)", r"\2=\1"))
    assert [t.text for t in mf(u("a=b c=d ef"))] == ["b=a", "d=c", "ef"]
def test_requires():
    """requires() reports which terms must match for each query type.

    Converted from nose-style ``assert_equal`` to plain asserts.
    """
    a = Term("f", u("a"))
    b = Term("f", u("b"))
    assert And([a, b]).requires() == set([a, b])
    assert Or([a, b]).requires() == set()
    assert AndMaybe(a, b).requires() == set([a])
    assert a.requires() == set([a])
def __unicode__(self):
    """Render as ``field:text~`` plus optional max-distance and boost
    suffixes (``~2``, ``^1.5``)."""
    parts = [u("%s:%s") % (self.fieldname, self.text), u("~")]
    if self.maxdist > 1:
        parts.append(u("%d") % self.maxdist)
    if self.boost != 1.0:
        parts.append(u("^%f") % self.boost)
    return u("").join(parts)
def test_missing_field_scoring():
    """Field lengths stay correct when a document omits a field, and
    multifield searching still works afterwards.

    Converted from nose-style ``assert_equal`` to plain asserts.
    """
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    storage = RamStorage()
    ix = storage.create_index(schema)

    writer = ix.writer()
    writer.add_document(name=u('Frank'), hobbies=u('baseball, basketball'))
    writer.commit()
    r = ix.reader()
    assert r.field_length("hobbies") == 2
    assert r.field_length("name") == 1
    r.close()

    # The second document has no "hobbies" value at all.
    writer = ix.writer()
    writer.add_document(name=u('Jonny'))
    writer.commit()

    with ix.searcher() as s:
        r = s.reader()
        assert len(ix._segments()) == 1
        # hobbies length unchanged; name length grew by one.
        assert r.field_length("hobbies") == 2
        assert r.field_length("name") == 2

        parser = qparser.MultifieldParser(['name', 'hobbies'], schema)
        q = parser.parse(u("baseball"))
        result = s.search(q)
        assert len(result) == 1
def __unicode__(self):
    """Render as ``text~`` plus optional max-distance and boost suffixes."""
    pieces = [self.text, u("~")]
    if self.maxdist > 1:
        pieces.append(u("%d") % self.maxdist)
    if self.boost != 1.0:
        pieces.append(u("^%f") % self.boost)
    return u("").join(pieces)
def test_boost_phrase():
    """A massively boosted Phrase outscores boosted title terms."""
    schema = fields.Schema(title=fields.TEXT(field_boost=5.0, stored=True),
                           text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    domain = u("alfa bravo charlie delta").split()
    w = ix.writer()
    for perm in permutations(domain):
        joined = u(" ").join(perm)
        w.add_document(title=joined, text=joined)
    w.commit()

    q = Or([Term("title", u("alfa")),
            Term("title", u("bravo")),
            Phrase("text", [u("bravo"), u("charlie"), u("delta")])])

    def boost_phrases(node):
        # Multiply the boost of every Phrase node in the tree by 1000.
        if isinstance(node, Phrase):
            node.boost *= 1000.0
            return node
        return node.apply(boost_phrases)

    q = boost_phrases(q)

    with ix.searcher() as s:
        for hit in s.search(q, limit=None):
            if "bravo charlie delta" in hit["title"]:
                assert hit.score > 100.0
def test_fractional_weights():
    """Delimited boosts like ``word^1.5`` are stored as fractional posting
    weights, with both the Positions and Frequency posting formats.

    Converted from assert_equal to plain asserts, and the duplicated
    index-build/verify sequence is extracted into a helper.
    """
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()

    def check(schema):
        # Build a one-document index and verify the per-term weights.
        ix = RamStorage().create_index(schema)
        w = ix.writer()
        w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
        w.commit()

        with ix.searcher() as s:
            wts = []
            for word in s.lexicon("f"):
                p = s.postings("f", word)
                wts.append(p.weight())
            assert wts == [0.5, 1.5, 2.0, 1.5]

    # With Positions format
    check(fields.Schema(f=fields.TEXT(analyzer=ana)))
    # Try again with Frequency format
    check(fields.Schema(f=fields.TEXT(analyzer=ana, phrase=False)))
def test_delete_nonexistant():
    """Deleting a nonexistent document number raises IndexingError, with
    one segment and with several.

    Converted from nose's ``assert_raises`` to ``pytest.raises``; the
    file already uses pytest elsewhere.
    """
    from whoosh.writing import IndexingError

    schema = fields.Schema(id=fields.ID(stored=True))

    # Single segment
    with TempIndex(schema, "deletenon1") as ix:
        w = ix.writer()
        for char in u("ABC"):
            w.add_document(id=char)
        w.commit()

        try:
            w = ix.writer()
            with pytest.raises(IndexingError):
                w.delete_document(5)
        finally:
            # Always release the writer lock, even if the assert fails.
            w.cancel()

    # Multiple segments
    with TempIndex(schema, "deletenon1") as ix:
        for char in u("ABC"):
            w = ix.writer()
            w.add_document(id=char)
            w.commit(merge=False)

        try:
            w = ix.writer()
            with pytest.raises(IndexingError):
                w.delete_document(5)
        finally:
            w.cancel()
def test_gtlt():
    """GtLtPlugin turns >/</>=/<= markers into open-ended range queries.

    Converted from nose-style ``assert_equal`` to plain asserts.
    """
    schema = fields.Schema(a=fields.KEYWORD, b=fields.NUMERIC,
                           c=fields.KEYWORD, d=fields.NUMERIC(float),
                           e=fields.DATETIME)
    qp = qparser.QueryParser("a", schema)
    qp.add_plugin(plugins.GtLtPlugin())
    qp.add_plugin(dateparse.DateParserPlugin())

    q = qp.parse(u("a:hello b:>100 c:<=z there"))
    assert q.__class__ == query.And
    assert len(q) == 4
    assert q[0] == query.Term("a", "hello")
    assert q[1] == query.NumericRange("b", 100, None, startexcl=True)
    assert q[2] == query.TermRange("c", None, 'z')
    assert q[3] == query.Term("a", "there")

    q = qp.parse(u("hello e:>'29 mar 2001' there"))
    assert q.__class__ == query.And
    assert len(q) == 3
    assert q[0] == query.Term("a", "hello")
    # As of this writing, date ranges don't support startexcl/endexcl
    assert q[1] == query.DateRange("e", datetime(2001, 3, 29, 0, 0), None)
    assert q[2] == query.Term("a", "there")

    # A space after the marker detaches it from its operand.
    q = qp.parse(u("a:> alfa c:<= bravo"))
    assert text_type(q) == "(a:a: AND a:alfa AND a:c: AND a:bravo)"

    # Without the fields/range plugins the markers parse as plain words.
    qp.remove_plugin_class(plugins.FieldsPlugin)
    qp.remove_plugin_class(plugins.RangePlugin)
    q = qp.parse(u("hello a:>500 there"))
    assert text_type(q) == "(a:hello AND a:a: AND a:500 AND a:there)"
def test_pseudofield():
    """PseudoFieldPlugin maps fake field names to node-transforming
    functions.  Converted from assert_equal to plain asserts.
    """
    schema = fields.Schema(a=fields.KEYWORD, b=fields.TEXT)

    def regex_maker(node):
        # Replace a word node with a regex node aimed at "content".
        if node.has_text:
            node = qparser.RegexPlugin.RegexNode(node.text)
            node.set_fieldname("content")
        return node

    qp = qparser.QueryParser("a", schema)
    qp.add_plugin(qparser.PseudoFieldPlugin({"regex": regex_maker}))
    q = qp.parse(u("alfa regex:br.vo"))
    assert q.__unicode__() == '(a:alfa AND content:r"br.vo")'

    def rev_text(node):
        if node.has_text:
            # Create a word node for the reversed text
            revtext = node.text[::-1]  # Reverse the text
            rnode = qparser.WordNode(revtext)
            # Duplicate the original node's start and end char
            rnode.set_range(node.startchar, node.endchar)
            # Put the original node and the reversed node in an OrGroup
            group = qparser.OrGroup([node, rnode])
            # Need to set the fieldname here because the PseudoFieldPlugin
            # removes the field name syntax
            group.set_fieldname("reverse")
            return group

    qp = qparser.QueryParser("content", schema)
    qp.add_plugin(qparser.PseudoFieldPlugin({"reverse": rev_text}))
    q = qp.parse(u("alfa reverse:bravo"))
    assert (q.__unicode__()
            == '(content:alfa AND (reverse:bravo OR reverse:ovarb))')
def test_hit_column():
    """Hit field access: an unstored field raises KeyError, a sortable
    column is readable from the hit."""
    # Not stored
    schema = fields.Schema(text=fields.TEXT())
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("alfa bravo charlie"))
    with ix.searcher() as s:
        results = s.search(query.Term("text", "alfa"))
        assert len(results) == 1
        first = results[0]
        with pytest.raises(KeyError):
            _ = first["text"]

    # With column
    schema = fields.Schema(text=fields.TEXT(sortable=True))
    ix = RamStorage().create_index(schema)
    with ix.writer(codec=W3Codec()) as w:
        w.add_document(text=u("alfa bravo charlie"))
    with ix.searcher() as s:
        results = s.search(query.Term("text", "alfa"))
        assert len(results) == 1
        first = results[0]
        assert first["text"] == u("alfa bravo charlie")
def test_sorting_function():
    # Sorting by a FunctionFacet computed from each document's term vector.
    schema = fields.Schema(id=fields.STORED,
                           text=fields.TEXT(stored=True, vector=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    domain = ("alfa", "bravo", "charlie")
    count = 1
    # Index every 4-word combination over the 3-word domain (81 docs).
    for w1 in domain:
        for w2 in domain:
            for w3 in domain:
                for w4 in domain:
                    w.add_document(id=count,
                                   text=u(" ").join((w1, w2, w3, w4)))
                    count += 1
    w.commit()

    def fn(searcher, docnum):
        v = dict(searcher.vector_as("frequency", docnum, "text"))
        # Sort documents that have equal number of "alfa"
        # and "bravo" first
        return 0 - 1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0)

    fnfacet = sorting.FunctionFacet(fn)
    with ix.searcher() as s:
        q = query.And([query.Term("text", u("alfa")),
                       query.Term("text", u("bravo"))])
        results = s.search(q, sortedby=fnfacet)
        r = [hit["text"] for hit in results]
        # The best-ranked documents contain "alfa" and "bravo" equally often.
        for t in r[:10]:
            tks = t.split()
            assert tks.count("alfa") == tks.count("bravo")
def test_compound_sort():
    # Sorting by multiple FieldFacets, with the middle key reversed.
    fspec = fields.KEYWORD(stored=True, sortable=True)
    schema = fields.Schema(a=fspec, b=fspec, c=fspec)
    ix = RamStorage().create_index(schema)

    alist = u("alfa bravo alfa bravo alfa bravo alfa bravo alfa bravo").split()
    blist = u("alfa bravo charlie alfa bravo charlie alfa bravo charlie alfa").split()
    clist = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet").split()
    assert all(len(ls) == 10 for ls in (alist, blist, clist))

    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(a=alist[i], b=blist[i], c=clist[i])

    with ix.searcher() as s:
        q = query.Every()
        # Ascending on a, descending on b, ascending on c.
        sortedby = [sorting.FieldFacet("a"),
                    sorting.FieldFacet("b", reverse=True),
                    sorting.FieldFacet("c")]

        r = s.search(q, sortedby=sortedby)
        output = []
        for hit in r:
            output.append(" ".join((hit["a"], hit["b"], hit["c"])))

        assert output == ["alfa charlie charlie",
                          "alfa charlie india",
                          "alfa bravo echo",
                          "alfa alfa alfa",
                          "alfa alfa golf",
                          "bravo charlie foxtrot",
                          "bravo bravo bravo",
                          "bravo bravo hotel",
                          "bravo alfa delta",
                          "bravo alfa juliet",
                          ]
def test_overlapping_lists():
    """allow_overlap grouping puts a document into every group its
    keywords belong to, both via FieldFacet and via Facets."""
    schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, tags=u("alfa bravo charlie"))
        w.add_document(id=1, tags=u("bravo charlie delta"))
        w.add_document(id=2, tags=u("charlie delta echo"))
        w.add_document(id=3, tags=u("delta echo alfa"))
        w.add_document(id=4, tags=u("echo alfa bravo"))

    expected = {"alfa": [0, 3, 4],
                "bravo": [0, 1, 4],
                "charlie": [0, 1, 2],
                "delta": [1, 2, 3],
                "echo": [2, 3, 4]}

    with ix.searcher() as s:
        of = sorting.FieldFacet("tags", allow_overlap=True)
        cat = of.categorizer(s)
        # The categorizer should use the posting lists, not vectors.
        assert not cat._use_vectors
        r = s.search(query.Every(), groupedby={"tags": of})
        assert r.groups("tags") == expected

        fcts = sorting.Facets()
        fcts.add_field("tags", allow_overlap=True)
        r = s.search(query.Every(), groupedby=fcts)
        assert r.groups("tags") == expected
def check(method):
    """Run *method* against a fresh temp index, then verify the expected
    tag groupings."""
    expected = [(u("one"), [0, 6]),
                (u("three"), [1, 3, 7, 8]),
                (u("two"), [2, 4, 5])]
    with TempIndex(get_schema()) as ix:
        method(ix)
        with ix.searcher() as s:
            results = s.search(query.Every(), groupedby="tag")
            groups = results.groups()
            assert sorted(groups.items()) == expected
def test_term_inspection():
    # Reader APIs for inspecting indexed terms: listing, prefix expansion,
    # per-field statistics, and frequency/distinctiveness rankings.
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    writer = ix.writer()
    writer.add_document(title=u("My document"),
                        content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE"))
    writer.add_document(title=u("My other document"),
                        content=u("AA AB BB CC EE EE AX AX DD"))
    writer.commit()

    reader = ix.reader()
    assert " ".join(reader.field_terms("content")) == "aa ab ax bb cc dd ee"
    assert list(reader.expand_prefix("content", "a")) == [b('aa'), b('ab'), b('ax')]
    assert set(reader.all_terms()) == set([('content', b('aa')),
                                           ('content', b('ab')),
                                           ('content', b('ax')),
                                           ('content', b('bb')),
                                           ('content', b('cc')),
                                           ('content', b('dd')),
                                           ('content', b('ee')),
                                           ('title', b('document')),
                                           ('title', b('my')),
                                           ('title', b('other'))])

    # (text, doc_freq, index_freq)
    assert _fstats(reader.iter_field("content")) == [(b('aa'), 2, 6),
                                                     (b('ab'), 1, 1),
                                                     (b('ax'), 1, 2),
                                                     (b('bb'), 2, 5),
                                                     (b('cc'), 2, 3),
                                                     (b('dd'), 2, 2),
                                                     (b('ee'), 2, 4)]
    assert _fstats(reader.iter_field("content", prefix="c")) == [(b('cc'), 2, 3),
                                                                 (b('dd'), 2, 2),
                                                                 (b('ee'), 2, 4)]

    assert list(reader.most_frequent_terms("content")) == [(6, b('aa')),
                                                           (5, b('bb')),
                                                           (4, b('ee')),
                                                           (3, b('cc')),
                                                           (2, b('dd'))]
    assert list(reader.most_frequent_terms("content", prefix="a")) == [(6, b('aa')),
                                                                       (2, b('ax')),
                                                                       (1, b('ab'))]
    assert list(reader.most_distinctive_terms("content", 3)) == [(1.3862943611198906, b('ax')),
                                                                 (0.6931471805599453, b('ab')),
                                                                 (0.0, b('ee'))]
def test_suggest_prefix():
    # suggest() with a prefix length only proposes words sharing that prefix.
    domain = ("Shoot To Kill",
              "Bloom, Split and Deviate",
              "Rankle the Seas and the Skies",
              "Lightning Flash Flame Shell",
              "Flower Wind Rage and Flower God Roar, Heavenly Wind Rage and "
              "Heavenly Demon Sneer",
              "All Waves, Rise now and Become my Shield, Lightning, Strike "
              "now and Become my Blade",
              "Cry, Raise Your Head, Rain Without end",
              "Sting All Enemies To Death",
              "Reduce All Creation to Ash",
              "Sit Upon the Frozen Heavens",
              "Call forth the Twilight")

    schema = fields.Schema(content=fields.TEXT(stored=True, spelling=True),
                           quick=fields.NGRAM(maxsize=10, stored=True))

    with TempIndex(schema, "sugprefix") as ix:
        with ix.writer() as w:
            for item in domain:
                content = u(item)
                w.add_document(content=content, quick=content)

        with ix.searcher() as s:
            # prefix=2: candidates must start with "ra".
            sugs = s.suggest("content", u("ra"), maxdist=2, prefix=2)
            assert sugs == ['rage', 'rain']

            # prefix=1: candidates only need to start with "r".
            sugs = s.suggest("content", "ra", maxdist=2, prefix=1)
            assert sugs == ["rage", "rain", "roar"]
def test_variations():
    """A Variations query on "render" matches its morphological variants."""
    q = query.Variations("value", u("render"))
    _run_query(q, [u("A"), u("C"), u("E")])
def test_find_missing():
    """``NOT id:*`` finds the documents that have no id value.

    Converted from nose-style ``assert_equal`` to plain asserts.
    """
    schema = fields.Schema(id=fields.ID, text=fields.KEYWORD(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=u("1"), text=u("alfa"))
    w.add_document(id=u("2"), text=u("bravo"))
    w.add_document(text=u("charlie"))
    w.add_document(id=u("4"), text=u("delta"))
    w.add_document(text=u("echo"))
    w.add_document(id=u("6"), text=u("foxtrot"))
    w.add_document(text=u("golf"))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("text", schema)
        q = qp.parse(u("NOT id:*"))
        r = s.search(q, limit=None)
        # Only the three documents added without an id should match.
        assert list(h["text"] for h in r) == ["charlie", "echo", "golf"]
def test_missing_wildcard():
    """query.Every("field") matches only documents that have that field.

    Converted from nose-style ``assert_equal`` to plain asserts.
    """
    schema = fields.Schema(id=fields.ID(stored=True), f1=fields.TEXT,
                           f2=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(id=u("1"), f1=u("alfa"), f2=u("apple"))
    w.add_document(id=u("2"), f1=u("bravo"))
    w.add_document(id=u("3"), f1=u("charlie"), f2=u("candy"))
    w.add_document(id=u("4"), f2=u("donut"))
    w.add_document(id=u("5"))
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Every("id"))
        assert sorted([d['id'] for d in r]) == ["1", "2", "3", "4", "5"]

        r = s.search(query.Every("f1"))
        assert sorted([d['id'] for d in r]) == ["1", "2", "3"]

        r = s.search(query.Every("f2"))
        assert sorted([d['id'] for d in r]) == ["1", "3", "4"]
def test_wildcard():
    """Wildcard queries match substrings across fields; an impossible
    pattern matches nothing."""
    combined = query.Or([query.Wildcard('value', u('*red*')),
                         query.Wildcard('name', u('*yellow*'))])
    _run_query(combined, [u("A"), u("C"), u("D"), u("E")])

    # Missing
    _run_query(query.Wildcard('value', 'glonk*'), [])
def test_phrase_score():
    """A document containing the phrase more times weighs more.

    Converted from nose-style ``assert_equal`` to plain asserts.
    """
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("A"),
                        value=u("Little Miss Muffet sat on a tuffet"))
    writer.add_document(name=u("D"),
                        value=u("Gibberish blonk falunk miss muffet sat " +
                                "tuffet garbonzo"))
    writer.add_document(name=u("E"), value=u("Blah blah blah pancakes"))
    writer.add_document(name=u("F"),
                        value=u("Little miss muffet little miss muffet"))
    writer.commit()

    with ix.searcher() as s:
        q = query.Phrase("value", [u("little"), u("miss"), u("muffet")])
        m = q.matcher(s)
        assert m.id() == 0
        score1 = m.weight()
        assert score1 > 0
        m.next()
        # Document F (docnum 3) contains the phrase twice, so it weighs more.
        assert m.id() == 3
        assert m.weight() > score1
def test_posting_phrase():
    """Phrase queries: exact matches, slop, and repeated-word phrases.

    Converted from nose-style ``assert_equal`` to plain asserts; the
    local helper no longer shadows the ``fields`` module.
    """
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("A"),
                        value=u("Little Miss Muffet sat on a tuffet"))
    writer.add_document(name=u("B"), value=u("Miss Little Muffet tuffet"))
    writer.add_document(name=u("C"), value=u("Miss Little Muffet tuffet sat"))
    writer.add_document(name=u("D"),
                        value=u("Gibberish blonk falunk miss muffet sat " +
                                "tuffet garbonzo"))
    writer.add_document(name=u("E"), value=u("Blah blah blah pancakes"))
    writer.commit()

    with ix.searcher() as s:
        def names(results):
            return sorted([hit['name'] for hit in results])

        q = query.Phrase("value", [u("little"), u("miss"), u("muffet"),
                                   u("sat"), u("tuffet")])
        m = q.matcher(s)
        assert m.__class__.__name__ == "SpanNearMatcher"

        r = s.search(q)
        assert names(r) == ["A"]
        assert len(r) == 1

        q = query.Phrase("value", [u("miss"), u("muffet"), u("sat"),
                                   u("tuffet")])
        assert names(s.search(q)) == ["A", "D"]

        # Out-of-order words don't match as a phrase...
        q = query.Phrase("value", [u("falunk"), u("gibberish")])
        r = s.search(q)
        assert names(r) == []
        assert len(r) == 0

        # ...unless slop allows the reordering distance.
        q = query.Phrase("value", [u("gibberish"), u("falunk")], slop=2)
        assert names(s.search(q)) == ["D"]

        q = query.Phrase("value", [u("blah")] * 4)
        assert names(s.search(q)) == []  # blah blah blah blah

        q = query.Phrase("value", [u("blah")] * 3)
        m = q.matcher(s)
        assert names(s.search(q)) == ["E"]
def test_multireader():
    """Searching works with one segment and after adding a second one.

    Converted from nose-style ``assert_equal`` to plain asserts; the
    repetitive add_document calls are collapsed into loops and stale
    commented-out code is removed.
    """
    sc = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(sc)

    w = ix.writer()
    for word in u("alfa bravo charlie delta echo foxtrot "
                  "golf hotel india").split():
        w.add_document(id=word, content=word)
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Term("content", u("bravo")))
        assert len(r) == 1
        assert r[0]["id"] == "bravo"

    # Add a second batch that becomes another segment.
    w = ix.writer()
    for word in u("juliet kilo lima mike november oscar papa "
                  "quebec romeo").split():
        w.add_document(id=word, content=word)
    w.commit()
    assert len(ix._segments()) == 2

    with ix.searcher() as s:
        r = s.search(query.Term("content", u("bravo")))
        assert len(r) == 1
        assert r[0]["id"] == "bravo"
def test_merged():
    """A default commit merges the segments back down to one.

    Converted from nose-style ``assert_equal`` to plain asserts, with the
    repetitive add_document calls collapsed into a loop.
    """
    sc = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(sc)
    w = ix.writer()
    for word in u("alfa bravo charlie delta").split():
        w.add_document(id=word, content=word)
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Term("content", u("bravo")))
        assert len(r) == 1
        assert r[0]["id"] == "bravo"

    w = ix.writer()
    w.add_document(id=u("echo"), content=u("echo"))
    w.commit()
    # The default merge policy folded everything into a single segment.
    assert len(ix._segments()) == 1

    with ix.searcher() as s:
        r = s.search(query.Term("content", u("bravo")))
        assert len(r) == 1
        assert r[0]["id"] == "bravo"
def test_range():
    """Range syntax: inclusive/exclusive brackets and fieldless ranges.

    Converted from nose-style ``assert_equal`` to plain asserts.
    """
    schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    w.add_document(id=u("A"), content=u("alfa bravo charlie delta echo"))
    w.add_document(id=u("B"), content=u("bravo charlie delta echo foxtrot"))
    w.add_document(id=u("C"), content=u("charlie delta echo foxtrot golf"))
    w.add_document(id=u("D"), content=u("delta echo foxtrot golf hotel"))
    w.add_document(id=u("E"), content=u("echo foxtrot golf hotel india"))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("content", schema)

        # [x TO y] is inclusive on both ends.
        q = qp.parse(u("charlie [delta TO foxtrot]"))
        assert q.__class__ == query.And
        assert q[0].__class__ == query.Term
        assert q[1].__class__ == query.TermRange
        assert q[1].start == "delta"
        assert q[1].end == "foxtrot"
        assert q[1].startexcl is False
        assert q[1].endexcl is False
        ids = sorted([d['id'] for d in s.search(q)])
        assert ids == [u('A'), u('B'), u('C')]

        # {x TO y] excludes the start value.
        q = qp.parse(u("foxtrot {echo TO hotel]"))
        assert q.__class__ == query.And
        assert q[0].__class__ == query.Term
        assert q[1].__class__ == query.TermRange
        assert q[1].start == "echo"
        assert q[1].end == "hotel"
        assert q[1].startexcl is True
        assert q[1].endexcl is False
        ids = sorted([d['id'] for d in s.search(q)])
        assert ids == [u('B'), u('C'), u('D'), u('E')]

        # {x TO y} excludes both ends.
        q = qp.parse(u("{bravo TO delta}"))
        assert q.__class__ == query.TermRange
        assert q.start == "bravo"
        assert q.end == "delta"
        assert q.startexcl is True
        assert q.endexcl is True
        ids = sorted([d['id'] for d in s.search(q)])
        assert ids == [u('A'), u('B'), u('C')]

        # Shouldn't match anything
        q = qp.parse(u("[1 to 10]"))
        assert q.__class__ == query.TermRange
        assert len(s.search(q)) == 0
def test_not2():
    """NOT queries exclude matches, including after deletions.

    Converted from nose-style ``assert_equal`` to plain asserts; the
    parser shared by both searcher blocks is created up front.
    """
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("a"), value=u("alfa bravo charlie delta echo"))
    writer.add_document(name=u("b"), value=u("bravo charlie delta echo foxtrot"))
    writer.add_document(name=u("c"), value=u("charlie delta echo foxtrot golf"))
    writer.add_document(name=u("d"), value=u("delta echo golf hotel india"))
    writer.add_document(name=u("e"), value=u("echo golf hotel india juliet"))
    writer.commit()

    p = qparser.QueryParser("value", None)
    with ix.searcher() as s:
        results = s.search(p.parse("echo NOT golf"))
        assert sorted([d["name"] for d in results]) == ["a", "b"]

        results = s.search(p.parse("echo NOT bravo"))
        assert sorted([d["name"] for d in results]) == ["c", "d", "e"]

    # Deleted documents must not leak back into NOT results.
    ix.delete_by_term("value", u("bravo"))

    with ix.searcher() as s:
        results = s.search(p.parse("echo NOT charlie"))
        assert sorted([d["name"] for d in results]) == ["d", "e"]
def make_index():
    """Build the small in-memory five-document index used by these
    query tests."""
    schema = fields.Schema(key=fields.ID(stored=True), name=fields.TEXT,
                           value=fields.TEXT)
    ix = RamStorage().create_index(schema)
    docs = [
        (u("A"), u("Yellow brown"), u("Blue red green render purple?")),
        (u("B"), u("Alpha beta"), u("Gamma delta epsilon omega.")),
        (u("C"), u("One two"), u("Three rendered four five.")),
        (u("D"), u("Quick went"), u("Every red town.")),
        (u("E"), u("Yellow uptown"), u("Interest rendering outer photo!")),
    ]
    w = ix.writer()
    for key, name, value in docs:
        w.add_document(key=key, name=name, value=value)
    w.commit()
    return ix
def test_collect_limit():
    """len(results) and iteration agree when the limit exceeds the hit
    count, with one segment and with multiple unmerged segments.

    Converted from nose-style ``assert_equal`` to plain asserts; the
    manual counting loops are replaced with ``sum()``.
    """
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id="a", text=u("alfa bravo charlie delta echo"))
    w.add_document(id="b", text=u("bravo charlie delta echo foxtrot"))
    w.add_document(id="c", text=u("charlie delta echo foxtrot golf"))
    w.add_document(id="d", text=u("delta echo foxtrot golf hotel"))
    w.add_document(id="e", text=u("echo foxtrot golf hotel india"))
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Term("text", u("golf")), limit=10)
        assert len(r) == 3
        assert sum(1 for _ in r) == 3

    w = ix.writer()
    w.add_document(id="f", text=u("foxtrot golf hotel india juliet"))
    w.add_document(id="g", text=u("golf hotel india juliet kilo"))
    w.add_document(id="h", text=u("hotel india juliet kilo lima"))
    w.add_document(id="i", text=u("india juliet kilo lima mike"))
    w.add_document(id="j", text=u("juliet kilo lima mike november"))
    w.commit(merge=False)

    with ix.searcher() as s:
        r = s.search(query.Term("text", u("golf")), limit=20)
        assert len(r) == 5
        assert sum(1 for _ in r) == 5
def test_filter():
    """A filter query restricts results correctly across segments.

    Converted from nose-style ``assert_equal`` to plain asserts.
    """
    schema = fields.Schema(id=fields.STORED, path=fields.ID, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, path=u("/a/1"), text=u("alfa bravo charlie"))
    w.add_document(id=2, path=u("/b/1"), text=u("bravo charlie delta"))
    w.add_document(id=3, path=u("/c/1"), text=u("charlie delta echo"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(id=4, path=u("/a/2"), text=u("delta echo alfa"))
    w.add_document(id=5, path=u("/b/2"), text=u("echo alfa bravo"))
    w.add_document(id=6, path=u("/c/2"), text=u("alfa bravo charlie"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(id=7, path=u("/a/3"), text=u("bravo charlie delta"))
    w.add_document(id=8, path=u("/b/3"), text=u("charlie delta echo"))
    w.add_document(id=9, path=u("/c/3"), text=u("delta echo alfa"))
    w.commit(merge=False)

    with ix.searcher() as s:
        # Only documents under /a or /b should pass the filter.
        fq = query.Or([query.Prefix("path", "/a"),
                       query.Prefix("path", "/b")])

        r = s.search(query.Term("text", "alfa"), filter=fq)
        assert [d["id"] for d in r] == [1, 4, 5]

        r = s.search(query.Term("text", "bravo"), filter=fq)
        assert [d["id"] for d in r] == [1, 2, 5, 7]
def test_fieldboost():
    """Boosting one field via a query-tree visitor changes result order.

    Converted from nose-style ``assert_equal`` to plain asserts.
    """
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, a=u("alfa bravo charlie"), b=u("echo foxtrot india"))
    w.add_document(id=1, a=u("delta bravo charlie"), b=u("alfa alfa alfa"))
    w.add_document(id=2, a=u("alfa alfa alfa"), b=u("echo foxtrot india"))
    w.add_document(id=3, a=u("alfa sierra romeo"), b=u("alfa tango echo"))
    w.add_document(id=4, a=u("bravo charlie delta"), b=u("alfa foxtrot india"))
    w.add_document(id=5, a=u("alfa alfa echo"), b=u("tango tango tango"))
    w.add_document(id=6, a=u("alfa bravo echo"), b=u("alfa alfa tango"))
    w.commit()

    def field_booster(fieldname, factor=2.0):
        "Returns a function which will boost the given field in a query tree"
        def booster_fn(obj):
            if obj.is_leaf() and obj.field() == fieldname:
                # Copy before mutating so the original tree is untouched.
                obj = copy.deepcopy(obj)
                obj.boost *= factor
                return obj
            else:
                return obj
        return booster_fn

    with ix.searcher() as s:
        q = query.Or([query.Term("a", u("alfa")),
                      query.Term("b", u("alfa"))])
        q = q.accept(field_booster("a", 100.0))
        assert text_type(q) == text_type("(a:alfa^100.0 OR b:alfa)")
        r = s.search(q)
        # Documents strong in field "a" now outrank those strong in "b".
        assert [hit["id"] for hit in r] == [2, 5, 6, 3, 0, 1, 4]
def test_random_intersections():
    """Sanity-check And-query matchers against a brute-force intersection."""
    domain = [u("alpha"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
              u("kilo"), u("lima"), u("mike")]
    segments = 5
    docsperseg = 50
    fieldlimits = (3, 10)
    documents = []

    schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    # Create docsperseg * segments documents of random domain words, keeping
    # every (docnum, text) pair around for the brute-force check below.
    for segnum in xrange(segments):
        writer = ix.writer()
        for offset in xrange(docsperseg):
            docnum = segnum * docsperseg + offset
            # Build a random string of words and index it
            text = u(" ").join(choice(domain)
                               for _ in xrange(randint(*fieldlimits)))
            writer.add_document(key=docnum, value=text)
            documents.append((docnum, text))
        writer.commit()
    assert len(ix._segments()) != 1

    testcount = 20
    testlimits = (2, 5)
    with ix.searcher() as s:
        for docnum in xrange(s.doc_count_all()):
            assert s.stored_fields(docnum).get("key") is not None

        for _ in xrange(testcount):
            # Brute force: docnums whose text contains every sampled word.
            words = sample(domain, randint(*testlimits))
            target = sorted(docnum for docnum, text in documents
                            if all(word in text for word in words))

            # Build the equivalent query and pull two matchers from it.
            q = And([Term("value", word) for word in words])
            m1 = q.matcher(s)
            m2 = q.matcher(s)

            # all_ids() and manual id()/next() stepping must agree...
            ids1 = list(m1.all_ids())
            ids2 = []
            while m2.is_active():
                ids2.append(m2.id())
                m2.next()
            assert ids1 == ids2

            # ...and both must match the brute-force result.
            assert _keys(s, ids1) == target
def test_andnot():
    """AndNot keeps docs matching the first query minus the second."""
    nq = query.AndNot(query.Term("name", u("yellow")),
                      query.Term("value", u("purple")))
    _run_query(nq, [u("E")])
def test_intersection():
    """Intersecting two terms across multiple segments yields the right docs."""
    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    writer = ix.writer()
    writer.add_document(key=u("a"), value=u("alpha bravo charlie delta"))
    writer.add_document(key=u("b"), value=u("echo foxtrot alpha bravo"))
    writer.add_document(key=u("c"), value=u("charlie delta golf hotel"))
    writer.commit()

    # Second commit creates a second segment.
    writer = ix.writer()
    writer.add_document(key=u("d"), value=u("india alpha bravo charlie"))
    writer.add_document(key=u("e"), value=u("delta bravo india bravo"))
    writer.commit()

    with ix.searcher() as s:
        cases = [((u("bravo"), u("delta")), ["a", "e"]),
                 ((u("bravo"), u("alpha")), ["a", "b", "d"])]
        for terms, expected in cases:
            q = And([Term("value", t) for t in terms])
            matcher = q.matcher(s)
            assert _keys(s, matcher.all_ids()) == expected
def test_topnot():
    """A top-level Not matches every doc NOT matching the wrapped query."""
    # Literals wrapped in u() consistently, matching the sibling tests.
    _run_query(query.Not(query.Term("value", u("red"))),
               [u("B"), u("C"), u("E")])
    _run_query(query.Not(query.Term("name", u("yellow"))),
               [u("B"), u("C"), u("D")])
def test_add_spelling():
    """add_spelling() retrofits word graphs onto fields indexed without them."""
    schema = fields.Schema(text1=fields.TEXT, text2=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text1=u("render zorro kaori postal"), text2=u("alfa"))
    w.add_document(text1=u("reader zebra koala pastry"), text2=u("alpa"))
    w.add_document(text1=u("leader libra ooala paster"), text2=u("alpha"))
    w.add_document(text1=u("feeder lorry zoala baster"), text2=u("olfo"))
    w.commit()

    with ix.reader() as r:
        # Fields were not created with spelling=True, so no graphs yet.
        assert not r.has_word_graph("text1")
        assert not r.has_word_graph("text2")

    from whoosh.filedb.filewriting import add_spelling
    add_spelling(ix, ["text1", "text2"])

    with ix.reader() as r:
        assert r.has_word_graph("text1")
        assert r.has_word_graph("text2")

        sp = spelling.ReaderCorrector(r, "text1")
        assert sp.suggest(u("kaola"), maxdist=1) == [u('koala')]
        assert (sp.suggest(u("kaola"), maxdist=2)
                == [u('koala'), u('kaori'), u('ooala'), u('zoala')])

        sp = spelling.ReaderCorrector(r, "text2")
        assert sp.suggest(u("alfo"), maxdist=1) == [u("alfa"), u("olfo")]
def test_require():
    """Require matches docs satisfying both queries (scored by the first)."""
    req = query.Require(query.Term("value", u("red")),
                        query.Term("name", u("yellow")))
    _run_query(req, [u("A")])
def test_addfield():
    """Fields added after indexing apply to later docs; older docs lack them."""
    schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    with TempIndex(schema, "addfield") as ix:
        w = ix.writer()
        w.add_document(id=u("a"), content=u("alfa"))
        w.add_document(id=u("b"), content=u("bravo"))
        w.add_document(id=u("c"), content=u("charlie"))
        w.commit()

        ix.add_field("added", fields.KEYWORD(stored=True))

        w = ix.writer()
        w.add_document(id=u("d"), content=u("delta"), added=u("fourth"))
        w.add_document(id=u("e"), content=u("echo"), added=u("fifth"))
        w.commit(merge=False)

        with ix.searcher() as s:
            assert ("id", "d") in s.reader()
            assert s.document(id="d") == {"id": "d", "added": "fourth"}
            # Docs indexed before add_field() have no "added" value.
            assert s.document(id="b") == {"id": "b"}
from __future__ import with_statement

import gzip

from whoosh import analysis, fields, highlight, query, spelling
from whoosh.compat import u
from whoosh.qparser import QueryParser
from whoosh.support.levenshtein import levenshtein
from whoosh.util.testing import TempIndex

# NOTE: each fragment ends with a space so implicit string concatenation
# doesn't fuse adjacent words into bogus tokens like "koalaready" or
# "jumpedover" before split() runs.
_wordlist = sorted(
    u("render animation animate shader shading zebra koala "
      "ready kismet reaction page delete quick fox jumped "
      "over lazy dog wicked erase red team yellow under interest "
      "open print acrid sear deaf feed grow heal jolly kilt "
      "low zone xylophone crown vale brown neat meat reduction "
      "blunder preaction lamppost").split())


def test_list_corrector():
    """ListCorrector suggestions come back ordered by edit distance, then
    alphabetically within each distance band."""
    corr = spelling.ListCorrector(_wordlist)
    typo = "reoction"
    sugs = list(corr.suggest(typo, maxdist=2))
    target = []
    for lev_dist in range(1, 3):
        # sugs will return suggest first ordered by levenshtein distance
        # then second order by dictionary order
        target += [w for w in _wordlist
                   if levenshtein(typo, w) <= lev_dist and w not in target]
    assert sugs == target
def test_removefield():
    """remove_field() drops the field from the schema and from stored docs."""
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.TEXT,
                           city=fields.KEYWORD(stored=True))
    with TempIndex(schema, "removefield") as ix:
        w = ix.writer()
        w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad"))
        w.add_document(id=u("c"), content=u("charlie"), city=u("cairo"))
        w.add_document(id=u("d"), content=u("delta"), city=u("dakar"))
        w.commit()

        with ix.searcher() as s:
            assert s.document(id=u("c")) == {"id": "c", "city": "cairo"}

        w = ix.writer()
        w.remove_field("content")
        w.remove_field("city")
        w.commit()

        ixschema = ix._current_schema()
        assert ixschema.names() == ["id"]
        assert ixschema.stored_names() == ["id"]

        with ix.searcher() as s:
            # The removed fields are gone from both the terms and the
            # stored document.
            assert ("content", u("charlie")) not in s.reader()
            assert s.document(id=u("c")) == {"id": u("c")}
def test_reader_corrector():
    """ReaderCorrector suggests terms from a spelling-enabled field's graph."""
    schema = fields.Schema(text=fields.TEXT(spelling=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("render zorro kaori postal"))
    w.add_document(text=u("reader zebra koala pastry"))
    w.add_document(text=u("leader libra ooala paster"))
    w.add_document(text=u("feeder lorry zoala baster"))
    w.commit()

    with ix.reader() as r:
        assert r.has_word_graph("text")
        sp = spelling.ReaderCorrector(r, "text")
        # maxdist=1 finds only the single closest term; maxdist=2 widens
        # the suggestion set.
        assert sp.suggest(u("kaola"), maxdist=1) == [u('koala')]
        assert (sp.suggest(u("kaola"), maxdist=2)
                == [u('koala'), u('kaori'), u('ooala'), u('zoala')])
def _multi_segment_index():
    """Build the shared test index with its documents spread over three
    segments (later commits use merge=False to keep segments separate)."""
    ix = _create_index()
    batches = [
        [(u("A B C"), u("1 2 3"), u("X Y Z")),
         (u("D E F"), u("4 5 6"), u("Q R S"))],
        [(u("A E C"), u("1 4 6"), u("X Q S")),
         (u("A A A"), u("2 3 5"), u("Y R Z"))],
        [(u("A B"), u("1 2"), u("X Y"))],
    ]
    for i, batch in enumerate(batches):
        w = ix.writer()
        for f1, f2, f3 in batch:
            w.add_document(f1=f1, f2=f2, f3=f3)
        if i == 0:
            w.commit()
        else:
            w.commit(merge=False)
    return ix
def test_optimize_away():
    """Removed fields stay gone even after an optimizing (merging) commit."""
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.TEXT,
                           city=fields.KEYWORD(stored=True))
    with TempIndex(schema, "optimizeaway") as ix:
        w = ix.writer()
        w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad"))
        w.add_document(id=u("c"), content=u("charlie"), city=u("cairo"))
        w.add_document(id=u("d"), content=u("delta"), city=u("dakar"))
        w.commit()

        with ix.searcher() as s:
            assert s.document(id=u("c")) == {"id": "c", "city": "cairo"}

        w = ix.writer()
        w.remove_field("content")
        w.remove_field("city")
        # optimize=True forces a full merge, rewriting every segment.
        w.commit(optimize=True)

        with ix.searcher() as s:
            assert ("content", u("charlie")) not in s.reader()
            assert s.document(id=u("c")) == {"id": u("c")}
def test_short_prefix():
    """A one-letter wildcard such as "s*" parses to a Prefix query."""
    schema = fields.Schema(name=fields.ID, value=fields.TEXT)
    qp = qparser.QueryParser("value", schema=schema)
    q = qp.parse(u("s*"))
    assert q.__class__.__name__ == "Prefix"
    assert q.text == "s"
def __unicode__(self):
    """Render this query as ``fieldname:<text>``."""
    template = u("%s:<%s>")
    return template % (self.fieldname, self.text)
def test_term():
    """Single-term queries return exactly the docs containing the term."""
    cases = [
        (query.Term("name", u("yellow")), [u("A"), u("E")]),
        (query.Term("value", u("zeta")), []),
        (query.Term("value", u("red")), [u("A"), u("D")]),
    ]
    for q, expected in cases:
        _run_query(q, expected)
def test_first_id():
    """first_id() finds a term's first posting in single- and multi-segment
    indexes; MultiReader does not support cursor()."""
    schema = fields.Schema(path=fields.ID(stored=True))

    # Single-segment index.
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for p in (u("/a"), u("/b"), u("/c")):
        w.add_document(path=p)
    w.commit()
    r = ix.reader()
    assert r.stored_fields(r.first_id("path", u("/b"))) == {"path": "/b"}

    # Multi-segment index: three commits with merge=False.
    ix = RamStorage().create_index(schema)
    batches = ((u("/a"), u("/b"), u("/c")),
               (u("/d"), u("/e"), u("/f")),
               (u("/g"), u("/h"), u("/i")))
    for batch in batches:
        w = ix.writer()
        for p in batch:
            w.add_document(path=p)
        w.commit(merge=False)

    r = ix.reader()
    assert r.__class__ == reading.MultiReader
    assert r.stored_fields(r.first_id("path", u("/e"))) == {"path": "/e"}

    with pytest.raises(NotImplementedError):
        r.cursor("path")