def _create_index():
    s = fields.Schema(f1=fields.KEYWORD(stored=True),
                      f2=fields.KEYWORD,
                      f3=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)
    return ix
def test_term_inspection():
    schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    writer = ix.writer()
    writer.add_document(title=u("My document"),
                        content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE"))
    writer.add_document(title=u("My other document"),
                        content=u("AA AB BB CC EE EE AX AX DD"))
    writer.commit()

    reader = ix.reader()
    assert_equal(list(reader.lexicon("content")),
                 [u('aa'), u('ab'), u('ax'), u('bb'), u('cc'), u('dd'), u('ee')])
    assert_equal(list(reader.expand_prefix("content", "a")),
                 [u('aa'), u('ab'), u('ax')])
    assert (set(reader.all_terms())
            == set([('content', u('aa')), ('content', u('ab')),
                    ('content', u('ax')), ('content', u('bb')),
                    ('content', u('cc')), ('content', u('dd')),
                    ('content', u('ee')), ('title', u('document')),
                    ('title', u('my')), ('title', u('other'))]))
    # (text, doc_freq, index_freq)
    assert_equal(_fstats(reader.iter_field("content")),
                 [(u('aa'), 2, 6), (u('ab'), 1, 1), (u('ax'), 1, 2),
                  (u('bb'), 2, 5), (u('cc'), 2, 3), (u('dd'), 2, 2),
                  (u('ee'), 2, 4)])
    assert_equal(_fstats(reader.iter_field("content", prefix="c")),
                 [(u('cc'), 2, 3), (u('dd'), 2, 2), (u('ee'), 2, 4)])
    assert_equal(list(reader.most_frequent_terms("content")),
                 [(6, u('aa')), (5, u('bb')), (4, u('ee')), (3, u('cc')),
                  (2, u('dd'))])
    assert_equal(list(reader.most_frequent_terms("content", prefix="a")),
                 [(6, u('aa')), (2, u('ax')), (1, u('ab'))])
def test_ramstorage():
    from whoosh.filedb.filestore import RamStorage

    st = RamStorage()
    lock = st.lock("test")
    lock.acquire()
    lock.release()
def test_decimal_numeric():
    from decimal import Decimal

    f = fields.NUMERIC(int, decimal_places=4)
    schema = fields.Schema(id=fields.ID(stored=True), deci=f)
    ix = RamStorage().create_index(schema)
    # assert f.from_text(f.to_text(Decimal("123.56"))) == Decimal("123.56")

    w = ix.writer()
    w.add_document(id=u("a"), deci=Decimal("123.56"))
    w.add_document(id=u("b"), deci=Decimal("0.536255"))
    w.add_document(id=u("c"), deci=Decimal("2.5255"))
    w.add_document(id=u("d"), deci=Decimal("58"))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("deci", schema)
        q = qp.parse(u("123.56"))
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["id"] == "a"

        r = s.search(qp.parse(u("0.536255")))
        assert len(r) == 1
        assert r[0]["id"] == "b"
def test_add_spelling():
    schema = fields.Schema(text1=fields.TEXT, text2=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text1=u("render zorro kaori postal"), text2=u("alfa"))
    w.add_document(text1=u("reader zebra koala pastry"), text2=u("alpa"))
    w.add_document(text1=u("leader libra ooala paster"), text2=u("alpha"))
    w.add_document(text1=u("feeder lorry zoala baster"), text2=u("olfo"))
    w.commit()

    with ix.reader() as r:
        assert not r.has_word_graph("text1")
        assert not r.has_word_graph("text2")

    from whoosh.writing import add_spelling
    add_spelling(ix, ["text1", "text2"])

    with ix.reader() as r:
        assert r.has_word_graph("text1")
        assert r.has_word_graph("text2")

        sp = spelling.ReaderCorrector(r, "text1")
        assert sp.suggest(u("kaola"), maxdist=1) == [u('koala')]
        assert sp.suggest(u("kaola"), maxdist=2) == [u('koala'), u('kaori'),
                                                     u('ooala'), u('zoala')]

        sp = spelling.ReaderCorrector(r, "text2")
        assert sp.suggest(u("alfo"), maxdist=1) == [u("alfa"), u("olfo")]
def test_compatibility():
    from whoosh.scoring import Weighting

    # This is the old way of doing a custom weighting model, check that
    # it's still supported...
    class LegacyWeighting(Weighting):
        use_final = True

        def score(self, searcher, fieldname, text, docnum, weight):
            return weight + 0.5

        def final(self, searcher, docnum, score):
            return score * 1.5

    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    domain = "alfa bravo charlie delta".split()
    for ls in permutations(domain, 3):
        w.add_document(text=u(" ").join(ls))
    w.commit()

    s = ix.searcher(weighting=LegacyWeighting())
    r = s.search(query.Term("text", u("bravo")))
    assert r.score(0) == 2.25
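# For contrast with the legacy Weighting subclass above, a minimal sketch of
# the newer callable-based API (assuming whoosh.scoring.FunctionWeighting as
# in Whoosh 2.x; the function name here is illustrative):
#
#   from whoosh import scoring
#
#   def bonus_fn(searcher, fieldname, text, matcher):
#       # Same per-document score as LegacyWeighting.score() above
#       return matcher.weight() + 0.5
#
#   s = ix.searcher(weighting=scoring.FunctionWeighting(bonus_fn))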
def test_boolean_strings():
    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(i=0, b="true")
        w.add_document(i=1, b="True")
        w.add_document(i=2, b="false")
        w.add_document(i=3, b="False")
        w.add_document(i=4, b=u("true"))
        w.add_document(i=5, b=u("True"))
        w.add_document(i=6, b=u("false"))
        w.add_document(i=7, b=u("False"))

    with ix.searcher() as s:
        qp = qparser.QueryParser("b", ix.schema)

        def check(qs, nums):
            q = qp.parse(qs)
            r = s.search(q, limit=None)
            assert [hit["i"] for hit in r] == nums

        trues = [0, 1, 4, 5]
        falses = [2, 3, 6, 7]
        check("true", trues)
        check("True", trues)
        check("false", falses)
        check("False", falses)
        check("t", trues)
        check("f", falses)
def test_numeric_ranges():
    schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for i in xrange(400):
        w.add_document(id=i, num=i)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, target):
            q = qp.parse(qs)
            result = [s.stored_fields(d)["id"] for d in q.docs(s)]
            assert result == target

        # Note that range() is always inclusive-exclusive
        check("[10 to 390]", list(range(10, 390 + 1)))
        check("[100 to]", list(range(100, 400)))
        check("[to 350]", list(range(0, 350 + 1)))
        check("[16 to 255]", list(range(16, 255 + 1)))
        check("{10 to 390]", list(range(11, 390 + 1)))
        check("[10 to 390}", list(range(10, 390)))
        check("{10 to 390}", list(range(11, 390)))
        check("{16 to 255}", list(range(17, 255)))
def test_nested_parent():
    schema = fields.Schema(name=fields.ID(stored=True), type=fields.ID,
                           part=fields.ID, price=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(name=u("iPad"), type=u("product"))
            w.add_document(part=u("screen"), price=100)
            w.add_document(part=u("battery"), price=50)
            w.add_document(part=u("case"), price=20)

        with w.group():
            w.add_document(name=u("iPhone"), type=u("product"))
            w.add_document(part=u("screen"), price=60)
            w.add_document(part=u("battery"), price=30)
            w.add_document(part=u("case"), price=10)

        with w.group():
            w.add_document(name=u("Mac mini"), type=u("product"))
            w.add_document(part=u("hard drive"), price=50)
            w.add_document(part=u("case"), price=50)

    with ix.searcher() as s:
        price = s.schema["price"]

        pq = query.Term("type", "product")
        cq = query.Term("price", 50)
        q = query.NestedParent(pq, cq)

        r = s.search(q)
        assert sorted([hit["name"] for hit in r]) == ["Mac mini", "iPad"]
def test_datetime():
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)
def test_merged_lengths(self):
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    st = RamStorage()
    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(f1=u"A B C", f2=u"X")
    w.add_document(f1=u"B C D E", f2=u"Y Z")
    w.commit()

    w = ix.writer()
    w.add_document(f1=u"A", f2=u"B C D E X Y")
    w.add_document(f1=u"B C", f2=u"X")
    w.commit(NO_MERGE)

    w = ix.writer()
    w.add_document(f1=u"A B X Y Z", f2=u"B C")
    w.add_document(f1=u"Y X", f2=u"A B")
    w.commit(NO_MERGE)

    dr = ix.reader()
    self.assertEqual(dr.stored_fields(0)["f1"], u"A B C")
    self.assertEqual(dr.doc_field_length(0, "f1"), 3)
    self.assertEqual(dr.doc_field_length(2, "f2"), 6)
    self.assertEqual(dr.doc_field_length(4, "f1"), 5)
    dr.close()
def test_no_parents():
    schema = fields.Schema(id=fields.STORED, kind=fields.ID,
                           name=fields.ID(stored=True))
    k = u("alfa")
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, kind=k, name=u("one"))
        w.add_document(id=1, kind=k, name=u("two"))
        w.add_document(id=2, kind=k, name=u("three"))
        w.add_document(id=3, kind=k, name=u("four"))
        w.add_document(id=4, kind=k, name=u("one"))
        w.add_document(id=5, kind=k, name=u("two"))
        w.add_document(id=6, kind=k, name=u("three"))
        w.add_document(id=7, kind=k, name=u("four"))
        w.add_document(id=8, kind=k, name=u("one"))
        w.add_document(id=9, kind=k, name=u("two"))
        w.add_document(id=10, kind=k, name=u("three"))
        w.add_document(id=11, kind=k, name=u("four"))

    with ix.searcher() as s:
        pq = query.Term("kind", "bravo")
        cq = query.Or([query.Term("name", "two"), query.Term("name", "four")])
        q = query.NestedParent(pq, cq)
        r = s.search(q)
        assert r.is_empty()
def test_filter():
    schema = fields.Schema(id=fields.STORED, path=fields.ID, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, path=u("/a/1"), text=u("alfa bravo charlie"))
    w.add_document(id=2, path=u("/b/1"), text=u("bravo charlie delta"))
    w.add_document(id=3, path=u("/c/1"), text=u("charlie delta echo"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(id=4, path=u("/a/2"), text=u("delta echo alfa"))
    w.add_document(id=5, path=u("/b/2"), text=u("echo alfa bravo"))
    w.add_document(id=6, path=u("/c/2"), text=u("alfa bravo charlie"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(id=7, path=u("/a/3"), text=u("bravo charlie delta"))
    w.add_document(id=8, path=u("/b/3"), text=u("charlie delta echo"))
    w.add_document(id=9, path=u("/c/3"), text=u("delta echo alfa"))
    w.commit(merge=False)

    with ix.searcher() as s:
        fq = Or([Prefix("path", "/a"), Prefix("path", "/b")])

        r = s.search(Term("text", "alfa"), filter=fq)
        assert_equal([d["id"] for d in r], [1, 4, 5])

        r = s.search(Term("text", "bravo"), filter=fq)
        assert_equal([d["id"] for d in r], [1, 2, 5, 7])
def test_frequency_keyword(self):
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)

    w = ix.writer()
    w.add_document(content=u"A B C D E")
    w.add_document(content=u"B B B B C D D")
    w.add_document(content=u"D E F")
    w.commit()

    tr = ix.reader()
    self.assertEqual(tr.doc_frequency("content", u"B"), 2)
    self.assertEqual(tr.frequency("content", u"B"), 5)
    self.assertEqual(tr.doc_frequency("content", u"E"), 2)
    self.assertEqual(tr.frequency("content", u"E"), 2)
    self.assertEqual(tr.doc_frequency("content", u"A"), 1)
    self.assertEqual(tr.frequency("content", u"A"), 1)
    self.assertEqual(tr.doc_frequency("content", u"D"), 3)
    self.assertEqual(tr.frequency("content", u"D"), 4)
    self.assertEqual(tr.doc_frequency("content", u"F"), 1)
    self.assertEqual(tr.frequency("content", u"F"), 1)
    self.assertEqual(tr.doc_frequency("content", u"Z"), 0)
    self.assertEqual(tr.frequency("content", u"Z"), 0)
    self.assertEqual(list(tr), [(0, u"A", 1, 1), (0, u"B", 2, 5),
                                (0, u"C", 2, 2), (0, u"D", 3, 4),
                                (0, u"E", 2, 2), (0, u"F", 1, 1)])
    tr.close()
def test_frequency_text(self):
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)

    w = ix.writer()
    w.add_document(content=u"alfa bravo charlie delta echo")
    w.add_document(content=u"bravo bravo bravo bravo charlie delta delta")
    w.add_document(content=u"delta echo foxtrot")
    w.commit()

    tr = ix.reader()
    self.assertEqual(tr.doc_frequency("content", u"bravo"), 2)
    self.assertEqual(tr.frequency("content", u"bravo"), 5)
    self.assertEqual(tr.doc_frequency("content", u"echo"), 2)
    self.assertEqual(tr.frequency("content", u"echo"), 2)
    self.assertEqual(tr.doc_frequency("content", u"alfa"), 1)
    self.assertEqual(tr.frequency("content", u"alfa"), 1)
    self.assertEqual(tr.doc_frequency("content", u"delta"), 3)
    self.assertEqual(tr.frequency("content", u"delta"), 4)
    self.assertEqual(tr.doc_frequency("content", u"foxtrot"), 1)
    self.assertEqual(tr.frequency("content", u"foxtrot"), 1)
    self.assertEqual(tr.doc_frequency("content", u"zulu"), 0)
    self.assertEqual(tr.frequency("content", u"zulu"), 0)
    self.assertEqual(list(tr), [(0, u"alfa", 1, 1), (0, u"bravo", 2, 5),
                                (0, u"charlie", 2, 2), (0, u"delta", 3, 4),
                                (0, u"echo", 2, 2), (0, u"foxtrot", 1, 1)])
    tr.close()
def test_frequency_keyword():
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)

    w = ix.writer()
    w.add_document(content=u("A B C D E"))
    w.add_document(content=u("B B B B C D D"))
    w.add_document(content=u("D E F"))
    w.commit()

    with ix.reader() as tr:
        assert tr.doc_frequency("content", u("B")) == 2
        assert tr.frequency("content", u("B")) == 5
        assert tr.doc_frequency("content", u("E")) == 2
        assert tr.frequency("content", u("E")) == 2
        assert tr.doc_frequency("content", u("A")) == 1
        assert tr.frequency("content", u("A")) == 1
        assert tr.doc_frequency("content", u("D")) == 3
        assert tr.frequency("content", u("D")) == 4
        assert tr.doc_frequency("content", u("F")) == 1
        assert tr.frequency("content", u("F")) == 1
        assert tr.doc_frequency("content", u("Z")) == 0
        assert tr.frequency("content", u("Z")) == 0

        stats = [(fname, text, ti.doc_frequency(), ti.weight())
                 for (fname, text), ti in tr]
        assert stats == [("content", b("A"), 1, 1), ("content", b("B"), 2, 5),
                         ("content", b("C"), 2, 2), ("content", b("D"), 3, 4),
                         ("content", b("E"), 2, 2), ("content", b("F"), 1, 1)]
def test_frequency_text():
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)

    w = ix.writer()
    w.add_document(content=u("alfa bravo charlie delta echo"))
    w.add_document(content=u("bravo bravo bravo bravo charlie delta delta"))
    w.add_document(content=u("delta echo foxtrot"))
    w.commit()

    with ix.reader() as tr:
        assert tr.doc_frequency("content", u("bravo")) == 2
        assert tr.frequency("content", u("bravo")) == 5
        assert tr.doc_frequency("content", u("echo")) == 2
        assert tr.frequency("content", u("echo")) == 2
        assert tr.doc_frequency("content", u("alfa")) == 1
        assert tr.frequency("content", u("alfa")) == 1
        assert tr.doc_frequency("content", u("delta")) == 3
        assert tr.frequency("content", u("delta")) == 4
        assert tr.doc_frequency("content", u("foxtrot")) == 1
        assert tr.frequency("content", u("foxtrot")) == 1
        assert tr.doc_frequency("content", u("zulu")) == 0
        assert tr.frequency("content", u("zulu")) == 0

        stats = [(fname, text, ti.doc_frequency(), ti.weight())
                 for (fname, text), ti in tr]
        assert stats == [("content", b("alfa"), 1, 1),
                         ("content", b("bravo"), 2, 5),
                         ("content", b("charlie"), 2, 2),
                         ("content", b("delta"), 3, 4),
                         ("content", b("echo"), 2, 2),
                         ("content", b("foxtrot"), 1, 1)]
def test_intersection(self):
    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    w.add_document(key=u"a", value=u"alpha bravo charlie delta")
    w.add_document(key=u"b", value=u"echo foxtrot alpha bravo")
    w.add_document(key=u"c", value=u"charlie delta golf hotel")
    w.commit()

    w = ix.writer()
    w.add_document(key=u"d", value=u"india alpha bravo charlie")
    w.add_document(key=u"e", value=u"delta bravo india bravo")
    w.commit()

    searcher = ix.searcher()

    q = And([Term("value", u"bravo"), Term("value", u"delta")])
    sc = q.scorer(searcher)
    self.assertEqual(self._keys(searcher, sc.all_ids()), ["a", "e"])

    q = And([Term("value", u"bravo"), Term("value", u"alpha")])
    sc = q.scorer(searcher)
    self.assertEqual(self._keys(searcher, sc.all_ids()), ["a", "b", "d"])
def test_whole_noterms():
    schema = fields.Schema(text=fields.TEXT(stored=True), tag=fields.KEYWORD)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("alfa bravo charlie delta echo foxtrot golf"),
                       tag=u("foo"))

    with ix.searcher() as s:
        r = s.search(query.Term("text", u("delta")))
        assert len(r) == 1

        r.fragmenter = highlight.WholeFragmenter()
        r.formatter = highlight.UppercaseFormatter()
        hi = r[0].highlights("text")
        assert hi == u("alfa bravo charlie DELTA echo foxtrot golf")

        r = s.search(query.Term("tag", u("foo")))
        assert len(r) == 1
        r.fragmenter = highlight.WholeFragmenter()
        r.formatter = highlight.UppercaseFormatter()
        hi = r[0].highlights("text")
        assert hi == u("")

        hi = r[0].highlights("text", minscore=0)
        assert hi == u("alfa bravo charlie delta echo foxtrot golf")
def test_everything_is_a_parent():
    schema = fields.Schema(id=fields.STORED, kind=fields.ID,
                           name=fields.ID(stored=True))
    k = u("alfa")
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, kind=k, name=u("one"))
        w.add_document(id=1, kind=k, name=u("two"))
        w.add_document(id=2, kind=k, name=u("three"))
        w.add_document(id=3, kind=k, name=u("four"))
        w.add_document(id=4, kind=k, name=u("one"))
        w.add_document(id=5, kind=k, name=u("two"))
        w.add_document(id=6, kind=k, name=u("three"))
        w.add_document(id=7, kind=k, name=u("four"))
        w.add_document(id=8, kind=k, name=u("one"))
        w.add_document(id=9, kind=k, name=u("two"))
        w.add_document(id=10, kind=k, name=u("three"))
        w.add_document(id=11, kind=k, name=u("four"))

    with ix.searcher() as s:
        pq = query.Term("kind", k)
        cq = query.Or([query.Term("name", "two"), query.Term("name", "four")])
        q = query.NestedParent(pq, cq)
        r = s.search(q)
        assert [hit["id"] for hit in r] == [1, 3, 5, 7, 9, 11]
def test_pinpoint():
    domain = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet "
               "kilo lima mike november oskar papa quebec romeo sierra tango")
    schema = fields.Schema(text=fields.TEXT(stored=True, chars=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=domain)
    w.commit()

    assert ix.schema["text"].supports("characters")
    with ix.searcher() as s:
        r = s.search(query.Term("text", "juliet"), terms=True)
        hit = r[0]
        hi = highlight.Highlighter()
        hi.formatter = highlight.UppercaseFormatter()

        assert not hi.can_load_chars(r, "text")
        assert hi.highlight_hit(hit, "text") == "golf hotel india JULIET kilo lima mike november"

        hi.fragmenter = highlight.PinpointFragmenter()
        assert hi.can_load_chars(r, "text")
        assert hi.highlight_hit(hit, "text") == "ot golf hotel india JULIET kilo lima mike nove"

        hi.fragmenter.autotrim = True
        assert hi.highlight_hit(hit, "text") == "golf hotel india JULIET kilo lima mike"
def test_scoring():
    schema = fields.Schema(kind=fields.ID,
                           name=fields.KEYWORD(scorable=True, stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(kind=u("class"), name=u("Index"))
            w.add_document(kind=u("method"), name=u("add document"))
            w.add_document(kind=u("method"), name=u("add reader"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Accumulator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("get result"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Calculator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("add all"))
            w.add_document(kind=u("method"), name=u("add some"))
            w.add_document(kind=u("method"), name=u("multiply"))
            w.add_document(kind=u("method"), name=u("close"))

    with ix.searcher() as s:
        q = query.NestedParent(query.Term("kind", "class"),
                               query.Term("name", "add"))
        r = s.search(q)
        assert [hit["name"] for hit in r] == ["Calculator", "Index",
                                              "Accumulator"]
def test_missing_field():
    schema = fields.Schema()
    ix = RamStorage().create_index(schema)

    with ix.searcher() as s:
        with pytest.raises(KeyError):
            s.document_numbers(id=u("test"))
def test_boolean():
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("a"), done=True)
    w.add_document(id=u("b"), done=False)
    w.add_document(id=u("c"), done=True)
    w.add_document(id=u("d"), done=False)
    w.add_document(id=u("e"), done=True)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("done:true"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        r = s.search(qp.parse("done:yes"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        q = qp.parse("done:false")
        assert q.__class__ == query.Term
        assert q.text is False
        assert schema["done"].to_bytes(False) == b("f")

        r = s.search(q)
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

        r = s.search(qp.parse("done:no"))
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(spelling=True), b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("alfa bravo charlie delta"))
    w.add_document(a=u("delta echo foxtrot golf"))
    w.add_document(a=u("golf hotel india juliet"))
    w.add_document(a=u("juliet kilo lima mike"))
    w.commit()

    s = ix.searcher()
    qp = QueryParser("a", ix.schema)

    qtext = u('alpha ("brovo november" OR b:dolta) detail')
    # QueryParser.parse() takes only the query text; the parser already
    # holds the schema
    q = qp.parse(qtext)
    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND (a:"bravo november" OR b:dolta) AND a:detail)'
    assert c.string == 'alfa ("bravo november" OR b:dolta) detail'

    qtext = u('alpha b:("brovo november" a:delta) detail')
    q = qp.parse(qtext)
    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
    assert c.string == 'alfa b:("brovo november" a:delta) detail'

    hf = highlight.HtmlFormatter(classname="c")
    assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
def create_index(cls, app, wh):
    """Creates and opens an index for the given whoosheer and app.
    If the index already exists, it just opens it, otherwise it creates
    it first.

    :param app: The application instance.
    :param wh: The whoosheer instance for which an index should be created.
    """
    # TODO: do we really want/need to use camel casing?
    # everywhere else, there is just .lower()
    if app.extensions['whooshee']['memory_storage']:
        storage = RamStorage()
        index = storage.create_index(wh.schema)
        assert index
        return index
    else:
        index_path = os.path.join(app.extensions['whooshee']['index_path_root'],
                                  getattr(wh, 'index_subdir',
                                          cls.camel_to_snake(wh.__name__)))
        if whoosh.index.exists_in(index_path):
            index = whoosh.index.open_dir(index_path)
        else:
            if not os.path.exists(index_path):
                os.makedirs(index_path)
            index = whoosh.index.create_in(index_path, wh.schema)
        return index
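# Hedged usage sketch for create_index() above. The owning class and the
# whoosheer name are hypothetical (not from the source); with memory_storage
# enabled the index lives in a RamStorage and disappears with the process:
#
#   index = Whooshee.create_index(app, EntryWhoosheer)
#   with index.writer() as writer:
#       writer.add_document(id=u("1"), title=u("hello"))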
def test_numeric():
    schema = fields.Schema(id=fields.ID(stored=True),
                           integer=fields.NUMERIC(int),
                           floating=fields.NUMERIC(float))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("a"), integer=5820, floating=1.2)
    w.add_document(id=u("b"), integer=22, floating=2.3)
    w.add_document(id=u("c"), integer=78, floating=3.4)
    w.add_document(id=u("d"), integer=13, floating=4.5)
    w.add_document(id=u("e"), integer=9, floating=5.6)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("integer", schema)

        q = qp.parse(u("5820"))
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["id"] == "a"

    with ix.searcher() as s:
        r = s.search(qp.parse("floating:4.5"))
        assert len(r) == 1
        assert r[0]["id"] == "d"

        q = qp.parse("integer:*")
        assert q.__class__ == query.Every
        assert q.field() == "integer"

        q = qp.parse("integer:5?6")
        assert q == query.NullQuery
def test_wildcard_existing_terms():
    s = fields.Schema(key=fields.ID, value=fields.TEXT)
    ix = RamStorage().create_index(s)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta"))
    w.add_document(key=u("a"), value=u("boggle echo render rendering renders"))
    w.commit()
    r = ix.reader()
    qp = QueryParser("value", ix.schema)

    def words(terms):
        z = []
        for t in terms:
            assert t[0] == "value"
            z.append(t[1])
        return " ".join(sorted(z))

    q = qp.parse(u("b*"))
    ts = q.existing_terms(r)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "bear boggle bravo")

    q = qp.parse(u("[a TO f]"))
    ts = q.existing_terms(r)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "alfa bear boggle bravo charlie delta echo")

    q = query.Variations("value", "render")
    ts = q.existing_terms(r, expand=False)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "render rendering renders")
def test_highlight_daterange():
    from datetime import datetime

    schema = fields.Schema(id=fields.ID(unique=True, stored=True),
                           title=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True),
                           released=fields.DATETIME(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.update_document(
        id=u('1'),
        title=u('Life Aquatic'),
        content=u('A nautic film crew sets out to kill a gigantic shark.'),
        released=datetime(2004, 12, 25)
    )
    w.update_document(
        id=u('2'),
        title=u('Darjeeling Limited'),
        content=u('Three brothers meet in India for a life changing train journey.'),
        released=datetime(2007, 10, 27)
    )
    w.commit()

    s = ix.searcher()
    r = s.search(Term('content', u('train')), terms=True)
    assert_equal(len(r), 1)
    assert_equal(r[0]["id"], "2")
    assert_equal(r[0].highlights("content"),
                 'for a life changing <b class="match term0">train</b> journey')

    r = s.search(DateRange('released', datetime(2007, 1, 1), None))
    assert_equal(len(r), 1)
    assert_equal(r[0].highlights("content"), '')
def test_decimal_ranges():
    from decimal import Decimal

    schema = fields.Schema(id=fields.STORED,
                           num=fields.NUMERIC(int, decimal_places=2))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    count = Decimal("0.0")
    inc = Decimal("0.2")
    for _ in xrange(500):
        w.add_document(id=str(count), num=count)
        count += inc
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, start, end):
            q = qp.parse(qs)
            result = [s.stored_fields(d)["id"] for d in q.docs(s)]

            target = []
            count = Decimal(start)
            limit = Decimal(end)
            while count <= limit:
                target.append(str(count))
                count += inc

            assert result == target

        check("[10.2 to 80.8]", "10.2", "80.8")
        check("{10.2 to 80.8]", "10.4", "80.8")
        check("[10.2 to 80.8}", "10.2", "80.6")
        check("{10.2 to 80.8}", "10.4", "80.6")
def setup(self):
    """
    Defers loading until needed.
    """
    from haystack import connections

    new_index = False

    # Make sure the index is there.
    if self.use_file_storage and not os.path.exists(self.path):
        os.makedirs(self.path)
        new_index = True

    if self.use_file_storage and not os.access(self.path, os.W_OK):
        raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

    if self.use_file_storage:
        self.storage = FileStorage(self.path)
    else:
        global LOCALS

        if getattr(LOCALS, 'RAM_STORE', None) is None:
            LOCALS.RAM_STORE = RamStorage()

        self.storage = LOCALS.RAM_STORE

    self.content_field_name, self.schema = self.build_schema(
        connections[self.connection_alias].get_unified_index().all_searchfields())
    self.parser = QueryParser(self.content_field_name, schema=self.schema)

    if new_index is True:
        self.index = self.storage.create_index(self.schema)
    else:
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except index.EmptyIndexError:
            self.index = self.storage.create_index(self.schema)

    self.setup_complete = True
def test_index_decimals():
    from decimal import Decimal

    schema = fields.Schema(name=fields.KEYWORD(stored=True),
                           num=fields.NUMERIC(int))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        # An int NUMERIC field should reject Decimal values
        with pytest.raises(TypeError):
            w.add_document(name=u("hello"), num=Decimal("3.2"))

    schema = fields.Schema(name=fields.KEYWORD(stored=True),
                           num=fields.NUMERIC(Decimal, decimal_places=5))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(name=u("hello"), num=Decimal("3.2"))
def test_term_stats():
    schema = fields.Schema(t=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(t=u("alfa bravo charlie delta echo"))
    w.add_document(t=u("bravo charlie delta echo foxtrot"))
    w.add_document(t=u("charlie delta echo foxtrot golf"))
    w.add_document(t=u("delta echo foxtrot"))
    w.add_document(t=u("echo foxtrot golf hotel india juliet"))
    w.add_document(t=u("foxtrot alfa alfa alfa"))
    w.commit()

    with ix.reader() as r:
        ti = r.term_info("t", u("alfa"))
        assert_equal(ti.weight(), 4.0)
        assert_equal(ti.doc_frequency(), 2)
        assert_equal(ti.min_length(), 4)
        assert_equal(ti.max_length(), 5)
        assert_equal(ti.max_weight(), 3.0)

        assert_equal(r.term_info("t", u("echo")).min_length(), 3)

        assert_equal(r.doc_field_length(3, "t"), 3)
        assert_equal(r.min_field_length("t"), 3)
        assert_equal(r.max_field_length("t"), 6)

    w = ix.writer()
    w.add_document(t=u("alfa"))
    w.add_document(t=u("bravo charlie"))
    w.add_document(t=u("echo foxtrot tango bravo"))
    w.add_document(t=u("golf hotel"))
    w.add_document(t=u("india"))
    w.add_document(t=u("juliet alfa bravo charlie delta echo foxtrot"))
    w.commit(merge=False)

    with ix.reader() as r:
        ti = r.term_info("t", u("alfa"))
        assert_equal(ti.weight(), 6.0)
        assert_equal(ti.doc_frequency(), 4)
        assert_equal(ti.min_length(), 1)
        assert_equal(ti.max_length(), 7)
        assert_equal(ti.max_weight(), 3.0)

        assert_equal(r.term_info("t", u("echo")).min_length(), 3)

        assert_equal(r.min_field_length("t"), 1)
        assert_equal(r.max_field_length("t"), 7)
def test_spelling(self):
    st = RamStorage()

    sp = spelling.SpellChecker(st, mingram=2)

    wordlist = ["render", "animation", "animate", "shader", "shading",
                "zebra", "koala", "lamppost", "ready", "kismet", "reaction",
                "page", "delete", "quick", "brown", "fox", "jumped", "over",
                "lazy", "dog", "wicked", "erase", "red", "team", "yellow",
                "under", "interest", "open", "print", "acrid", "sear",
                "deaf", "feed", "grow", "heal", "jolly", "kilt", "low",
                "zone", "xylophone", "crown", "vale", "brown", "neat",
                "meat", "reduction", "blunder", "preaction"]
    sp.add_words([unicode(w) for w in wordlist])

    sugs = sp.suggest(u"reoction")
    self.assertNotEqual(len(sugs), 0)
    self.assertEqual(sugs, [u"reaction", u"reduction", u"preaction"])
class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses the
    "whoosh" search library.
    """

    def __init__(self, toolbox):
        """
        Create a searcher for `toolbox`.
        """
        self.toolbox = toolbox
        self.enabled = tool_search_enabled
        if tool_search_enabled:
            self.build_index()

    def build_index(self):
        self.storage = RamStorage()
        self.index = self.storage.create_index(schema)
        writer = self.index.writer()
        ## TODO: would also be nice to search section headers.
        for id, tool in self.toolbox.tools_by_id.iteritems():
            writer.add_document(id=id,
                                title=to_unicode(tool.name),
                                description=to_unicode(tool.description),
                                help=to_unicode(tool.help))
        writer.commit()

    def search(self, query, return_attribute='id'):
        if not tool_search_enabled:
            return []
        # Change field boosts for searcher to place more weight on title,
        # description than help.
        searcher = self.index.searcher(
            weighting=BM25F(field_B={'title_B': 3,
                                     'description_B': 2,
                                     'help_B': 1}))
        # Set query to search title, description, and help.
        parser = MultifieldParser(['title', 'description', 'help'],
                                  schema=schema)
        results = searcher.search(parser.parse(query), minscore=2.0)
        return [result[return_attribute] for result in results]
def test_translate():
    domain = [("alfa", 100, 50), ("bravo", 20, 80), ("charlie", 10, 10),
              ("delta", 82, 39), ("echo", 20, 73), ("foxtrot", 81, 59),
              ("golf", 39, 93), ("hotel", 57, 48), ("india", 84, 75)]

    schema = fields.Schema(name=fields.TEXT(sortable=True),
                           a=fields.NUMERIC(sortable=True),
                           b=fields.NUMERIC(sortable=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for name, a, b in domain:
            w.add_document(name=u(name), a=a, b=b)

    with ix.searcher() as s:
        q = query.Every()

        # Baseline: just sort by a field
        r = s.search(q, sortedby="a")
        assert " ".join([hit["name"] for hit in r]) == "charlie bravo echo golf hotel foxtrot delta india alfa"

        # Sort by reversed name
        target = [x[0] for x in sorted(domain, key=lambda x: x[0][::-1])]
        tf = sorting.TranslateFacet(lambda name: name[::-1],
                                    sorting.FieldFacet("name"))
        r = s.search(q, sortedby=tf)
        assert [hit["name"] for hit in r] == target

        # Sort by average of a and b
        def avg(a, b):
            return (a + b) / 2

        target = [x[0] for x in sorted(domain, key=lambda x: (x[1] + x[2]) / 2)]
        af = sorting.FieldFacet("a")
        bf = sorting.FieldFacet("b")
        tf = sorting.TranslateFacet(avg, af, bf)
        r = s.search(q, sortedby=tf)
        assert [hit["name"] for hit in r] == target
def test_missing():
    schema = fields.Schema(kind=fields.ID,
                           name=fields.KEYWORD(scorable=True, stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(kind=u("class"), name=u("Index"))
            w.add_document(kind=u("method"), name=u("add document"))
            w.add_document(kind=u("method"), name=u("add reader"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Accumulator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("get result"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Calculator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("add all"))
            w.add_document(kind=u("method"), name=u("add some"))
            w.add_document(kind=u("method"), name=u("multiply"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Deleter"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("delete"))

    with ix.searcher() as s:
        q = query.NestedParent(query.Term("kind", "class"),
                               query.Term("name", "add"))

        r = s.search(q)
        assert [hit["name"] for hit in r] == ["Calculator", "Index",
                                              "Accumulator", "Deleter"]

    with ix.writer() as w:
        w.delete_by_term("name", "Accumulator")
        w.delete_by_term("name", "Calculator")

    with ix.searcher() as s:
        pq = query.Term("kind", "class")
        assert len(list(pq.docs(s))) == 2
        q = query.NestedParent(pq, query.Term("name", "add"))
        r = s.search(q)
        assert [hit["name"] for hit in r] == ["Index", "Deleter"]
def setup(self):
    """
    Defers loading until needed.
    """
    new_index = False

    # Make sure the index is there.
    if self.use_file_storage and not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
        os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
        new_index = True

    if self.use_file_storage and not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
        raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % settings.HAYSTACK_WHOOSH_PATH)

    if self.use_file_storage:
        self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
    else:
        global LOCALS

        if LOCALS.RAM_STORE is None:
            LOCALS.RAM_STORE = RamStorage()

        self.storage = LOCALS.RAM_STORE

    self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
    self.parser = QueryParser(self.content_field_name, schema=self.schema)

    if new_index is True:
        self.index = self.storage.create_index(self.schema)
    else:
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except index.EmptyIndexError:
            self.index = self.storage.create_index(self.schema)

    self.setup_complete = True
def test_min_max_id():
    schema = fields.Schema(id=fields.STORED, t=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, t=u("alfa bravo charlie"))
    w.add_document(id=1, t=u("bravo charlie delta"))
    w.add_document(id=2, t=u("charlie delta echo"))
    w.add_document(id=3, t=u("delta echo foxtrot"))
    w.add_document(id=4, t=u("echo foxtrot golf"))
    w.commit()

    with ix.reader() as r:
        ti = r.term_info("t", u("delta"))
        assert_equal(ti.min_id(), 1)
        assert_equal(ti.max_id(), 3)

        ti = r.term_info("t", u("alfa"))
        assert_equal(ti.min_id(), 0)
        assert_equal(ti.max_id(), 0)

        ti = r.term_info("t", u("foxtrot"))
        assert_equal(ti.min_id(), 3)
        assert_equal(ti.max_id(), 4)

    w = ix.writer()
    w.add_document(id=5, t=u("foxtrot golf hotel"))
    w.add_document(id=6, t=u("golf hotel alfa"))
    w.add_document(id=7, t=u("hotel alfa bravo"))
    w.add_document(id=8, t=u("alfa bravo charlie"))
    w.commit(merge=False)

    with ix.reader() as r:
        ti = r.term_info("t", u("delta"))
        assert_equal(ti.min_id(), 1)
        assert_equal(ti.max_id(), 3)

        ti = r.term_info("t", u("alfa"))
        assert_equal(ti.min_id(), 0)
        assert_equal(ti.max_id(), 8)

        ti = r.term_info("t", u("foxtrot"))
        assert_equal(ti.min_id(), 3)
        assert_equal(ti.max_id(), 5)
def test_clear():
    schema = fields.Schema(a=fields.KEYWORD)
    ix = RamStorage().create_index(schema)

    # Add some segments
    with ix.writer() as w:
        w.add_document(a=u"one two three")
        w.merge = False
    with ix.writer() as w:
        w.add_document(a=u"two three four")
        w.merge = False
    with ix.writer() as w:
        w.add_document(a=u"three four five")
        w.merge = False

    # Clear
    with ix.writer() as w:
        w.add_document(a=u"foo bar baz")
        w.mergetype = writing.CLEAR

    with ix.searcher() as s:
        assert s.doc_count_all() == 1
        assert list(s.reader().lexicon("a")) == [b("bar"), b("baz"), b("foo")]
def _rt(c, values, default):
    # Continuous
    st = RamStorage()
    f = st.create_file("test1")
    f.write(b("hello"))
    w = c.writer(f)
    for docnum, v in enumerate(values):
        w.add(docnum, v)
    w.finish(len(values))
    length = f.tell() - 5
    f.close()

    f = st.open_file("test1")
    r = c.reader(f, 5, length, len(values))
    assert values == list(r)
    for x in range(len(values)):
        assert values[x] == r[x]
    f.close()

    # Sparse
    doccount = len(values) * 7 + 15
    target = [default] * doccount

    f = st.create_file("test2")
    f.write(b("hello"))
    w = c.writer(f)
    for docnum, v in izip(xrange(10, doccount, 7), values):
        target[docnum] = v
        w.add(docnum, v)
    w.finish(doccount)
    length = f.tell() - 5
    f.close()

    f = st.open_file("test2")
    r = c.reader(f, 5, length, doccount)
    assert target == list(r)
    for x in range(doccount):
        assert target[x] == r[x]

    lr = r.load()
    assert target == list(lr)
    f.close()
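# Hedged usage sketch for the _rt() round-trip helper above, assuming the
# whoosh.columns API (column objects expose the writer()/reader() methods
# used above; the sample values are illustrative):
#
#   from whoosh import columns
#   _rt(columns.VarBytesColumn(),
#       [b("a"), b("ccc"), b("bbb"), b("e"), b("dd")], b(""))
#   _rt(columns.NumericColumn("i"), [10, -20, 30, -25, 15], 0)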
def test_missing_column():
    from whoosh import collectors

    schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, tags=u("alfa bravo charlie"))
        w.add_document(id=1, tags=u("bravo charlie delta"))
        w.add_document(id=2, tags=u("charlie delta echo"))
        w.merge = False

    with ix.writer() as w:
        w.add_field("age", fields.NUMERIC(sortable=True))
        w.add_document(id=3, tags=u("delta echo foxtrot"), age=10)
        w.add_document(id=4, tags=u("echo foxtrot golf"), age=5)
        w.add_document(id=5, tags=u("foxtrot golf alfa"), age=20)
        w.merge = False

    with ix.writer() as w:
        w.add_document(id=6, tags=u("golf alfa bravo"), age=2)
        w.add_document(id=7, tags=u("alfa hotel india"), age=50)
        w.add_document(id=8, tags=u("hotel india bravo"), age=15)
        w.merge = False

    with ix.searcher() as s:
        assert not s.is_atomic()

        q = query.Term("tags", u("alfa"))

        # Have to use yucky low-level collector API to make sure we used a
        # ColumnCategorizer to do the sorting
        c = s.collector(sortedby="age")
        assert isinstance(c, collectors.SortingCollector)
        s.search_with_collector(q, c)
        assert isinstance(c.categorizer, sorting.ColumnCategorizer)

        r = c.results()
        assert [hit["id"] for hit in r] == [6, 5, 7, 0]

        r = s.search(q, sortedby="age", reverse=True)
        assert [hit["id"] for hit in r] == [0, 7, 5, 6]
def test_doc_boost():
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, a=u("alfa alfa alfa"), b=u("bravo"))
    w.add_document(id=1, a=u("alfa"), b=u("bear"), _a_boost=5.0)
    w.add_document(id=2, a=u("alfa alfa alfa alfa"), _boost=0.5)
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Term("a", "alfa"))
        assert [hit["id"] for hit in r] == [1, 0, 2]

    w = ix.writer()
    w.add_document(id=3, a=u("alfa"), b=u("bottle"))
    w.add_document(id=4, b=u("bravo"), _b_boost=2.0)
    w.commit(merge=False)

    with ix.searcher() as s:
        r = s.search(query.Term("a", "alfa"))
        assert [hit["id"] for hit in r] == [1, 0, 3, 2]
def test_score_facet():
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT,
                           c=fields.ID)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, a=u("alfa alfa bravo"), b=u("bottle"), c=u("c"))
    w.add_document(id=2, a=u("alfa alfa alfa"), b=u("bottle"), c=u("c"))
    w.commit()
    w = ix.writer()
    w.add_document(id=3, a=u("alfa bravo bravo"), b=u("bottle"), c=u("c"))
    w.add_document(id=4, a=u("alfa bravo alfa"), b=u("apple"), c=u("c"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=5, a=u("alfa bravo bravo"), b=u("apple"), c=u("c"))
    w.add_document(id=6, a=u("alfa alfa alfa"), b=u("apple"), c=u("c"))
    w.commit(merge=False)

    with ix.searcher() as s:
        facet = sorting.MultiFacet(["b", sorting.ScoreFacet()])
        r = s.search(q=query.Term("a", u("alfa")), sortedby=facet)
        assert [h["id"] for h in r] == [6, 4, 5, 2, 1, 3]
def test_numeric_field_facet():
    schema = fields.Schema(id=fields.STORED, v1=fields.NUMERIC,
                           v2=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, v1=2, v2=100)
    w.add_document(id=2, v1=1, v2=50)
    w.commit()
    w = ix.writer()
    w.add_document(id=3, v1=2, v2=200)
    w.add_document(id=4, v1=1, v2=100)
    w.commit()
    w = ix.writer(merge=False)
    w.add_document(id=5, v1=2, v2=50)
    w.add_document(id=6, v1=1, v2=200)
    w.commit()

    with ix.searcher() as s:
        mf = sorting.MultiFacet().add_field("v1").add_field("v2", reverse=True)
        r = s.search(query.Every(), sortedby=mf)
        assert [hit["id"] for hit in r] == [6, 4, 2, 3, 1, 5]
def test_checksum_file():
    from whoosh.filedb.structfile import ChecksumFile
    from zlib import crc32

    def wr(f):
        f.write(b("Testing"))
        f.write_int(-100)
        f.write_varint(10395)
        f.write_string(b("Hello"))
        f.write_ushort(32959)

    st = RamStorage()

    # Write a file normally
    f = st.create_file("control")
    wr(f)
    f.close()

    # Checksum the contents; mask to unsigned because Python 2's crc32 can
    # return a negative (signed) value
    f = st.open_file("control")
    target = crc32(f.read()) & 0xffffffff
    f.close()

    # Write a file with checksumming
    f = st.create_file("test")
    cf = ChecksumFile(f)
    wr(cf)
    assert cf.checksum() == target
    f.close()

    # Read the file with checksumming
    f = st.open_file("test")
    cf = ChecksumFile(f)
    assert cf.read(7) == b("Testing")
    assert cf.read_int() == -100
    assert cf.read_varint() == 10395
    assert cf.read_string() == b("Hello")
    assert cf.read_ushort() == 32959
    assert cf.checksum() == target
    cf.close()
def test_empty_index():
    schema = fields.Schema(key=fields.ID(stored=True), value=fields.TEXT)
    st = RamStorage()
    assert_raises(index.EmptyIndexError, st.open_index, schema=schema)
class WhooshStore(SAMLStoreBase):

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in ATTRS.keys():
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in ix.searcher().search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in info.pop('entity_attributes').items():
                info[a] = v
        for a, v in info.items():
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]
            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)
        for a in info.keys():
            if not a in self.schema.names():
                del info[a]
        for a, v in info.items():
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return self._collections

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield ATTRS[n]

    def attributes(self):
        return list(self._attributes())

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return list(searcher.lexicon(n))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return self.objects.values()
            else:
                return self.infos.values()

        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in ATTRS_INV.items():
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return list(lst)
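# Note on lookup() above: the key rewriting turns a filter expression such as
# "foo=bar+baz=quux" into the Whoosh query string "foo:bar AND baz:quux"
# ('+' becomes AND, '=' becomes ':', and {uri}value attribute forms are mapped
# through ATTRS_INV) before it is handed to QueryParser. The example values
# here are illustrative only.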
def test_doc_count():
    schema = fields.Schema(id=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(id=i)

    r = ix.reader()
    assert r.doc_count() == 10
    assert r.doc_count_all() == 10

    w = ix.writer()
    w.delete_document(2)
    w.delete_document(4)
    w.delete_document(6)
    w.delete_document(8)
    w.commit()

    r = ix.reader()
    assert r.doc_count() == 6
    assert r.doc_count_all() == 10

    w = ix.writer()
    for i in xrange(10, 15):
        w.add_document(id=i)
    w.commit(merge=False)

    r = ix.reader()
    assert r.doc_count() == 11
    assert r.doc_count_all() == 15

    w = ix.writer()
    w.delete_document(10)
    w.delete_document(12)
    w.delete_document(14)
    w.commit(merge=False)

    r = ix.reader()
    assert r.doc_count() == 8
    assert r.doc_count_all() == 15

    ix.optimize()
    r = ix.reader()
    assert r.doc_count() == 8
    assert r.doc_count_all() == 8
def test_nested_children():
    schema = fields.Schema(t=fields.ID(stored=True),
                           track=fields.NUMERIC(stored=True),
                           album_name=fields.TEXT(stored=True),
                           song_name=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(t=u("album"), album_name=u("alfa bravo charlie"))
            w.add_document(t=u("track"), track=1,
                           song_name=u("delta echo foxtrot"))
            w.add_document(t=u("track"), track=2,
                           song_name=u("golf hotel india"))
            w.add_document(t=u("track"), track=3,
                           song_name=u("juliet kilo lima"))
        with w.group():
            w.add_document(t=u("album"), album_name=u("mike november oskar"))
            w.add_document(t=u("track"), track=1,
                           song_name=u("papa quebec romeo"))
            w.add_document(t=u("track"), track=2,
                           song_name=u("sierra tango ultra"))
            w.add_document(t=u("track"), track=3,
                           song_name=u("victor whiskey xray"))
        with w.group():
            w.add_document(t=u("album"), album_name=u("yankee zulu one"))
            w.add_document(t=u("track"), track=1,
                           song_name=u("two three four"))
            w.add_document(t=u("track"), track=2,
                           song_name=u("five six seven"))
            w.add_document(t=u("track"), track=3,
                           song_name=u("eight nine ten"))

    with ix.searcher() as s:
        pq = query.Term("t", "album")
        aq = query.Term("album_name", "november")

        r = s.search(query.NestedChildren(pq, pq), limit=None)
        assert len(r) == 9
        assert [str(hit["t"]) for hit in r] == ["track"] * 9

        ncq = query.NestedChildren(pq, aq)
        assert list(ncq.docs(s)) == [5, 6, 7]
        r = s.search(ncq, limit=None)
        assert len(r) == 3
        assert [str(hit["song_name"]) for hit in r] == ["papa quebec romeo",
                                                        "sierra tango ultra",
                                                        "victor whiskey xray"]

        zq = query.NestedChildren(pq, query.Term("album_name", "zulu"))
        f = sorting.StoredFieldFacet("song_name")
        r = s.search(zq, sortedby=f)
        assert [hit["track"] for hit in r] == [3, 2, 1]
def words_to_corrector(words):
    st = RamStorage()
    f = st.create_file("test")
    spelling.wordlist_to_graph_file(words, f)
    f = st.open_file("test")
    return spelling.GraphCorrector(fst.GraphReader(f))
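# Hedged usage sketch for words_to_corrector() above (wordlist_to_graph_file
# builds an FST, so the input word list should be in sorted order; the sample
# words are illustrative):
#
#   corrector = words_to_corrector(sorted([u("render"), u("rendering"),
#                                          u("renders")]))
#   print(corrector.suggest(u("rendar"), limit=3))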
def __init__(self, data_source):
    self.index = RamStorage().create_index(SCHEMA)
    self.data_source = data_source
from whoosh.index import create_in
from whoosh.fields import *
from app import cursor
from whoosh.qparser import QueryParser
from whoosh.filedb.filestore import RamStorage
from whoosh.analysis import NgramAnalyzer
import pdb
from whoosh import query

storage = RamStorage()


def load_states():
    analyzer = NgramAnalyzer(1, 2)
    state_schema = Schema(state=ID(stored=True, analyzer=analyzer))
    with cursor() as cur:
        print('Loading states...')
        cur.execute('SELECT DISTINCT state FROM msa')
        state_index = storage.create_index(state_schema)
        writer = state_index.writer()
        for s in cur.fetchall():
            writer.add_document(state=s[u'state'])
        writer.commit()
    return state_index


def load_cities():
    analyzer = NgramAnalyzer(1)
    city_schema = Schema(state=ID(stored=True),
                         city=ID(stored=True, analyzer=analyzer))
    with cursor() as cur:
def __init__(self):
    storage = RamStorage()
    self.index = storage.create_index(schema)
def test_simple_compound_nomap():
    st = RamStorage()
    _test_simple_compound(st)
class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses the
    Whoosh search library.
    """

    def __init__(self, toolbox, index_help=True):
        """
        Create a searcher for `toolbox`.
        """
        self.schema = Schema(id=STORED,
                             stub=KEYWORD,
                             name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                             description=TEXT,
                             section=TEXT,
                             help=TEXT,
                             labels=KEYWORD)
        self.rex = analysis.RegexTokenizer()
        self.toolbox = toolbox
        self.build_index(index_help)

    def build_index(self, index_help=True):
        """Prepare search index for tools loaded in toolbox."""
        # Works around https://bitbucket.org/mchaput/whoosh/issues/391/race-conditions-with-temp-storage
        RamStorage.temp_storage = _temp_storage
        self.storage = RamStorage()
        self.index = self.storage.create_index(self.schema)
        writer = self.index.writer()
        start_time = datetime.now()
        log.debug('Starting to build toolbox index.')
        for id, tool in self.toolbox.tools():
            # Do not add data managers to the public index
            if tool.tool_type == 'manage_data':
                continue
            add_doc_kwds = {
                "id": id,
                "description": to_unicode(tool.description),
                "section": to_unicode(tool.get_panel_section()[1] if len(tool.get_panel_section()) == 2 else ''),
                "help": to_unicode("")
            }
            if tool.name.find('-') != -1:
                # Hyphens are wildcards in Whoosh causing bad things
                add_doc_kwds['name'] = (' ').join([token.text for token in self.rex(to_unicode(tool.name))])
            else:
                add_doc_kwds['name'] = to_unicode(tool.name)
            if tool.guid:
                # Create a stub consisting of owner, repo, and tool from guid
                slash_indexes = [m.start() for m in re.finditer('/', tool.guid)]
                id_stub = tool.guid[(slash_indexes[1] + 1):slash_indexes[4]]
                add_doc_kwds['stub'] = (' ').join([token.text for token in self.rex(to_unicode(id_stub))])
            else:
                add_doc_kwds['stub'] = to_unicode(id)
            if tool.labels:
                add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
            if index_help and tool.help:
                try:
                    add_doc_kwds['help'] = to_unicode(tool.help.render(host_url="", static_path=""))
                except Exception:
                    # Don't fail to build index just because a help message
                    # won't render.
                    pass
            writer.add_document(**add_doc_kwds)
        writer.commit()
        stop_time = datetime.now()
        log.debug('Toolbox index finished. It took: ' + str(stop_time - start_time))

    def search(self, q, tool_name_boost, tool_section_boost,
               tool_description_boost, tool_label_boost, tool_stub_boost,
               tool_help_boost, tool_search_limit, tool_enable_ngram_search,
               tool_ngram_minsize, tool_ngram_maxsize):
        """
        Perform search on the in-memory index. Weight in the given boosts.
        """
        # Change field boosts for searcher
        searcher = self.index.searcher(weighting=BM25F(field_B={
            'name_B': float(tool_name_boost),
            'section_B': float(tool_section_boost),
            'description_B': float(tool_description_boost),
            'labels_B': float(tool_label_boost),
            'stub_B': float(tool_stub_boost),
            'help_B': float(tool_help_boost)
        }))
        # Set query to search name, description, section, help, and labels.
        parser = MultifieldParser(['name', 'description', 'section', 'help',
                                   'labels', 'stub'], schema=self.schema)
        # Hyphens are wildcards in Whoosh causing bad things
        if q.find('-') != -1:
            q = (' ').join([token.text for token in self.rex(to_unicode(q))])
        # Perform tool search with ngrams if set to true in the config file
        if (tool_enable_ngram_search is True or
                tool_enable_ngram_search == "True"):
            hits_with_score = {}
            token_analyzer = StandardAnalyzer() | analysis.NgramFilter(
                minsize=int(tool_ngram_minsize),
                maxsize=int(tool_ngram_maxsize))
            ngrams = [token.text for token in token_analyzer(q)]
            for query in ngrams:
                # Get the tool list with respective scores for each qgram
                curr_hits = searcher.search(parser.parse('*' + query + '*'),
                                            limit=float(tool_search_limit))
                for i, curr_hit in enumerate(curr_hits):
                    is_present = False
                    for prev_hit in hits_with_score:
                        # Check if the tool appears again for the next qgram search
                        if curr_hit['id'] == prev_hit:
                            is_present = True
                            # Add the current score with the previous one if
                            # the tool appears again for the next qgram
                            hits_with_score[prev_hit] = curr_hits.score(i) + hits_with_score[prev_hit]
                    # Add the tool if not present to the collection with its score
                    if not is_present:
                        hits_with_score[curr_hit['id']] = curr_hits.score(i)
            # Sort the results based on aggregated BM25 score in decreasing
            # order of scores
            hits_with_score = sorted(hits_with_score.items(),
                                     key=lambda x: x[1], reverse=True)
            # Return the tool ids
            return [item[0] for item in hits_with_score[0:int(tool_search_limit)]]
        else:
            # Perform the search
            hits = searcher.search(parser.parse('*' + q + '*'),
                                   limit=float(tool_search_limit))
            return [hit['id'] for hit in hits]
def test_multireader_not():
    schema = fields.Schema(id=fields.STORED, f=fields.TEXT)

    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, f=u("alfa bravo chralie"))
    w.add_document(id=1, f=u("bravo chralie delta"))
    w.add_document(id=2, f=u("charlie delta echo"))
    w.add_document(id=3, f=u("delta echo foxtrot"))
    w.add_document(id=4, f=u("echo foxtrot golf"))
    w.commit()

    with ix.searcher() as s:
        q = And([Term("f", "delta"), Not(Term("f", "delta"))])
        r = s.search(q)
        assert_equal(len(r), 0)

    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=5, f=u("alfa bravo chralie"))
    w.add_document(id=6, f=u("bravo chralie delta"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=7, f=u("charlie delta echo"))
    w.add_document(id=8, f=u("delta echo foxtrot"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=9, f=u("echo foxtrot golf"))
    w.add_document(id=10, f=u("foxtrot golf delta"))
    w.commit(merge=False)
    assert len(ix._segments()) > 1

    with ix.searcher() as s:
        q = And([Term("f", "delta"), Not(Term("f", "delta"))])
        r = s.search(q)
        assert_equal(len(r), 0)
class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses the
    Whoosh search library.
    """

    def __init__(self, toolbox, index_help=True):
        """
        Create a searcher for `toolbox`.
        """
        self.toolbox = toolbox
        self.build_index(index_help)

    def build_index(self, index_help=True):
        log.debug('Starting to build toolbox index.')
        self.storage = RamStorage()
        self.index = self.storage.create_index(schema)
        writer = self.index.writer()
        for id, tool in self.toolbox.tools():
            # Do not add data managers to the public index
            if tool.tool_type == 'manage_data':
                continue
            add_doc_kwds = {
                "id": id,
                "name": to_unicode(tool.name),
                "description": to_unicode(tool.description),
                "section": to_unicode(tool.get_panel_section()[1] if len(tool.get_panel_section()) == 2 else ''),
                "help": to_unicode("")
            }
            if tool.labels:
                add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
            if index_help and tool.help:
                try:
                    add_doc_kwds['help'] = to_unicode(tool.help.render(host_url="", static_path=""))
                except Exception:
                    # Don't fail to build index just because a help message
                    # won't render.
                    pass
            writer.add_document(**add_doc_kwds)
        writer.commit()
        log.debug('Toolbox index finished.')

    def search(self, q, tool_name_boost, tool_section_boost,
               tool_description_boost, tool_help_boost, tool_search_limit):
        """
        Perform search on the in-memory index. Weight in the given boosts.
        """
        # Change field boosts for searcher
        searcher = self.index.searcher(weighting=BM25F(field_B={
            'name_B': float(tool_name_boost),
            'section_B': float(tool_section_boost),
            'description_B': float(tool_description_boost),
            'help_B': float(tool_help_boost)
        }))
        # Set query to search name, description, section, help, and labels.
        parser = MultifieldParser(['name', 'description', 'section', 'help',
                                   'labels'], schema=schema)
        # Perform the search
        hits = searcher.search(parser.parse('*' + q + '*'),
                               limit=float(tool_search_limit))
        return [hit['id'] for hit in hits]
class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses the
    Whoosh search library.
    """

    def __init__(self, toolbox, index_help=True):
        """
        Create a searcher for `toolbox`.
        """
        self.schema = Schema(id=STORED,
                             stub=KEYWORD,
                             name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                             description=TEXT,
                             section=TEXT,
                             help=TEXT,
                             labels=KEYWORD)
        self.rex = analysis.RegexTokenizer()
        self.toolbox = toolbox
        self.build_index(index_help)

    def build_index(self, index_help=True):
        # Works around https://bitbucket.org/mchaput/whoosh/issues/391/race-conditions-with-temp-storage
        RamStorage.temp_storage = _temp_storage
        self.storage = RamStorage()
        self.index = self.storage.create_index(self.schema)
        writer = self.index.writer()
        start_time = datetime.now()
        log.debug('Starting to build toolbox index.')
        for id, tool in self.toolbox.tools():
            # Do not add data managers to the public index
            if tool.tool_type == 'manage_data':
                continue
            add_doc_kwds = {
                "id": id,
                "description": to_unicode(tool.description),
                "section": to_unicode(tool.get_panel_section()[1] if len(tool.get_panel_section()) == 2 else ''),
                "help": to_unicode("")
            }
            # Hyphens are wildcards in Whoosh causing bad things
            if tool.name.find('-') != -1:
                add_doc_kwds['name'] = (' ').join([token.text for token in self.rex(to_unicode(tool.name))])
            else:
                add_doc_kwds['name'] = to_unicode(tool.name)
            # We do not want to search Tool Shed or version parts
            # of the long ids
            if id.find('/') != -1:
                slash_indexes = [m.start() for m in re.finditer('/', id)]
                id_stub = id[(slash_indexes[1] + 1):slash_indexes[4]]
                add_doc_kwds['stub'] = (' ').join([token.text for token in self.rex(to_unicode(id_stub))])
            else:
                add_doc_kwds['stub'] = to_unicode(id)
            if tool.labels:
                add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
            if index_help and tool.help:
                try:
                    add_doc_kwds['help'] = to_unicode(tool.help.render(host_url="", static_path=""))
                except Exception:
                    # Don't fail to build index just because a help message
                    # won't render.
                    pass
            writer.add_document(**add_doc_kwds)
        writer.commit()
        stop_time = datetime.now()
        log.debug('Toolbox index finished. It took: ' + str(stop_time - start_time))

    def search(self, q, tool_name_boost, tool_section_boost,
               tool_description_boost, tool_label_boost, tool_stub_boost,
               tool_help_boost, tool_search_limit):
        """
        Perform search on the in-memory index. Weight in the given boosts.
        """
        # Change field boosts for searcher
        searcher = self.index.searcher(weighting=BM25F(field_B={
            'name_B': float(tool_name_boost),
            'section_B': float(tool_section_boost),
            'description_B': float(tool_description_boost),
            'labels_B': float(tool_label_boost),
            'stub_B': float(tool_stub_boost),
            'help_B': float(tool_help_boost)
        }))
        # Set query to search name, description, section, help, and labels.
        parser = MultifieldParser(['name', 'description', 'section', 'help',
                                   'labels', 'stub'], schema=self.schema)
        # Hyphens are wildcards in Whoosh causing bad things
        if q.find('-') != -1:
            q = (' ').join([token.text for token in self.rex(to_unicode(q))])
        # Perform the search
        hits = searcher.search(parser.parse('*' + q + '*'),
                               limit=float(tool_search_limit))
        return [hit['id'] for hit in hits]
def __init__(self):
    from whoosh.filedb.filestore import RamStorage

    self.storage = RamStorage()
    self.segment = MemSegment(self, "blah")