def _create_index():
    s = fields.Schema(f1=fields.KEYWORD(stored=True),
                      f2=fields.KEYWORD,
                      f3=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)
    return ix

def test_term_inspection():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    writer = ix.writer()
    writer.add_document(title=u("My document"),
                        content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE"))
    writer.add_document(title=u("My other document"),
                        content=u("AA AB BB CC EE EE AX AX DD"))
    writer.commit()
    
    reader = ix.reader()
    assert_equal(list(reader.lexicon("content")), [u('aa'), u('ab'), u('ax'), u('bb'), u('cc'), u('dd'), u('ee')])
    assert_equal(list(reader.expand_prefix("content", "a")), [u('aa'), u('ab'), u('ax')])
    assert (set(reader.all_terms())
            == set([('content', u('aa')), ('content', u('ab')), ('content', u('ax')),
                    ('content', u('bb')), ('content', u('cc')), ('content', u('dd')),
                    ('content', u('ee')), ('title', u('document')), ('title', u('my')),
                    ('title', u('other'))]))
    # (text, doc_freq, index_freq)
    assert_equal(_fstats(reader.iter_field("content")),
                 [(u('aa'), 2, 6), (u('ab'), 1, 1), (u('ax'), 1, 2), (u('bb'), 2, 5),
                  (u('cc'), 2, 3), (u('dd'), 2, 2), (u('ee'), 2, 4)])
    assert_equal(_fstats(reader.iter_field("content", prefix="c")),
                 [(u('cc'), 2, 3), (u('dd'), 2, 2), (u('ee'), 2, 4)])
    assert_equal(list(reader.most_frequent_terms("content")),
                 [(6, u('aa')), (5, u('bb')), (4, u('ee')), (3, u('cc')), (2, u('dd'))])
    assert_equal(list(reader.most_frequent_terms("content", prefix="a")),
                 [(6, u('aa')), (2, u('ax')), (1, u('ab'))])
Example #3
def test_ramstorage():
    from whoosh.filedb.filestore import RamStorage

    st = RamStorage()
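    # RamStorage keeps the whole index (and this lock object) in memory,
    # so nothing here touches the filesystem.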
    lock = st.lock("test")
    lock.acquire()
    lock.release()
Example #4
def test_decimal_numeric():
    from decimal import Decimal

    f = fields.NUMERIC(int, decimal_places=4)
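    # decimal_places=4 lets the int-based field accept Decimal values by
    # storing them as integers scaled by 10**4.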
    schema = fields.Schema(id=fields.ID(stored=True), deci=f)
    ix = RamStorage().create_index(schema)

    # assert f.from_text(f.to_text(Decimal("123.56"))) == Decimal("123.56")

    w = ix.writer()
    w.add_document(id=u("a"), deci=Decimal("123.56"))
    w.add_document(id=u("b"), deci=Decimal("0.536255"))
    w.add_document(id=u("c"), deci=Decimal("2.5255"))
    w.add_document(id=u("d"), deci=Decimal("58"))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("deci", schema)
        q = qp.parse(u("123.56"))
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["id"] == "a"

        r = s.search(qp.parse(u("0.536255")))
        assert len(r) == 1
        assert r[0]["id"] == "b"
Example #5
def test_add_spelling():
    schema = fields.Schema(text1=fields.TEXT, text2=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text1=u("render zorro kaori postal"), text2=u("alfa"))
    w.add_document(text1=u("reader zebra koala pastry"), text2=u("alpa"))
    w.add_document(text1=u("leader libra ooala paster"), text2=u("alpha"))
    w.add_document(text1=u("feeder lorry zoala baster"), text2=u("olfo"))
    w.commit()

    with ix.reader() as r:
        assert not r.has_word_graph("text1")
        assert not r.has_word_graph("text2")

    from whoosh.writing import add_spelling
    add_spelling(ix, ["text1", "text2"])

    with ix.reader() as r:
        assert r.has_word_graph("text1")
        assert r.has_word_graph("text2")

        sp = spelling.ReaderCorrector(r, "text1")
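        # ReaderCorrector pulls suggestions from the field's word graph,
        # limited to the given maximum edit distance.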
        assert sp.suggest(u("kaola"), maxdist=1) == [u('koala')]
        assert sp.suggest(u("kaola"), maxdist=2) == [u('koala'), u('kaori'), u('ooala'), u('zoala')]

        sp = spelling.ReaderCorrector(r, "text2")
        assert sp.suggest(u("alfo"), maxdist=1) == [u("alfa"), u("olfo")]
Example #6
def test_compatibility():
    from whoosh.scoring import Weighting

    # This is the old way of doing a custom weighting model, check that
    # it's still supported...
    class LegacyWeighting(Weighting):
        use_final = True

        def score(self, searcher, fieldname, text, docnum, weight):
            return weight + 0.5

        def final(self, searcher, docnum, score):
            return score * 1.5

    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    domain = "alfa bravo charlie delta".split()
    for ls in permutations(domain, 3):
        w.add_document(text=u(" ").join(ls))
    w.commit()

    s = ix.searcher(weighting=LegacyWeighting())
    r = s.search(query.Term("text", u("bravo")))
    assert r.score(0) == 2.25
Example #7
def test_boolean_strings():
    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(i=0, b="true")
        w.add_document(i=1, b="True")
        w.add_document(i=2, b="false")
        w.add_document(i=3, b="False")
        w.add_document(i=4, b=u("true"))
        w.add_document(i=5, b=u("True"))
        w.add_document(i=6, b=u("false"))
        w.add_document(i=7, b=u("False"))

    with ix.searcher() as s:
        qp = qparser.QueryParser("b", ix.schema)

        def check(qs, nums):
            q = qp.parse(qs)
            r = s.search(q, limit=None)
            assert [hit["i"] for hit in r] == nums

        trues = [0, 1, 4, 5]
        falses = [2, 3, 6, 7]
        check("true", trues)
        check("True", trues)
        check("false", falses)
        check("False", falses)
        check("t", trues)
        check("f", falses)
Example #8
def test_numeric_ranges():
    schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    w = ix.writer()

    for i in xrange(400):
        w.add_document(id=i, num=i)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, target):
            q = qp.parse(qs)
            result = [s.stored_fields(d)["id"] for d in q.docs(s)]
            assert result == target

        # Note that range() is always inclusive-exclusive
        check("[10 to 390]", list(range(10, 390 + 1)))
        check("[100 to]", list(range(100, 400)))
        check("[to 350]", list(range(0, 350 + 1)))
        check("[16 to 255]", list(range(16, 255 + 1)))
        check("{10 to 390]", list(range(11, 390 + 1)))
        check("[10 to 390}", list(range(10, 390)))
        check("{10 to 390}", list(range(11, 390)))
        check("{16 to 255}", list(range(17, 255)))
Example #9
def test_nested_parent():
    schema = fields.Schema(name=fields.ID(stored=True), type=fields.ID,
                           part=fields.ID, price=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(name=u("iPad"), type=u("product"))
            w.add_document(part=u("screen"), price=100)
            w.add_document(part=u("battery"), price=50)
            w.add_document(part=u("case"), price=20)

        with w.group():
            w.add_document(name=u("iPhone"), type=u("product"))
            w.add_document(part=u("screen"), price=60)
            w.add_document(part=u("battery"), price=30)
            w.add_document(part=u("case"), price=10)

        with w.group():
            w.add_document(name=u("Mac mini"), type=u("product"))
            w.add_document(part=u("hard drive"), price=50)
            w.add_document(part=u("case"), price=50)

    with ix.searcher() as s:
        price = s.schema["price"]

        pq = query.Term("type", "product")
        cq = query.Term("price", 50)
        q = query.NestedParent(pq, cq)
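        # NestedParent matches the parent ("product") document of any child
        # document that matches the price query.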

        r = s.search(q)
        assert sorted([hit["name"] for hit in r]) == ["Mac mini", "iPad"]
Example #10
def test_datetime():
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)
Example #11
def test_merged_lengths(self):
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    st = RamStorage()
    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(f1=u"A B C", f2=u"X")
    w.add_document(f1=u"B C D E", f2=u"Y Z")
    w.commit()

    w = ix.writer()
    w.add_document(f1=u"A", f2=u"B C D E X Y")
    w.add_document(f1=u"B C", f2=u"X")
    w.commit(NO_MERGE)

    w = ix.writer()
    w.add_document(f1=u"A B X Y Z", f2=u"B C")
    w.add_document(f1=u"Y X", f2=u"A B")
    w.commit(NO_MERGE)

    dr = ix.reader()
    self.assertEqual(dr.stored_fields(0)["f1"], u"A B C")
    self.assertEqual(dr.doc_field_length(0, "f1"), 3)
    self.assertEqual(dr.doc_field_length(2, "f2"), 6)
    self.assertEqual(dr.doc_field_length(4, "f1"), 5)
    dr.close()
Example #12
def test_no_parents():
    schema = fields.Schema(id=fields.STORED, kind=fields.ID,
                           name=fields.ID(stored=True))
    k = u("alfa")
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, kind=k, name=u("one"))
        w.add_document(id=1, kind=k, name=u("two"))
        w.add_document(id=2, kind=k, name=u("three"))
        w.add_document(id=3, kind=k, name=u("four"))
        w.add_document(id=4, kind=k, name=u("one"))
        w.add_document(id=5, kind=k, name=u("two"))
        w.add_document(id=6, kind=k, name=u("three"))
        w.add_document(id=7, kind=k, name=u("four"))
        w.add_document(id=8, kind=k, name=u("one"))
        w.add_document(id=9, kind=k, name=u("two"))
        w.add_document(id=10, kind=k, name=u("three"))
        w.add_document(id=11, kind=k, name=u("four"))

    with ix.searcher() as s:
        pq = query.Term("kind", "bravo")
        cq = query.Or([query.Term("name", "two"), query.Term("name", "four")])
        q = query.NestedParent(pq, cq)
        r = s.search(q)
        assert r.is_empty()
Example #13
def test_filter():
    schema = fields.Schema(id=fields.STORED, path=fields.ID, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, path=u("/a/1"), text=u("alfa bravo charlie"))
    w.add_document(id=2, path=u("/b/1"), text=u("bravo charlie delta"))
    w.add_document(id=3, path=u("/c/1"), text=u("charlie delta echo"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=4, path=u("/a/2"), text=u("delta echo alfa"))
    w.add_document(id=5, path=u("/b/2"), text=u("echo alfa bravo"))
    w.add_document(id=6, path=u("/c/2"), text=u("alfa bravo charlie"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=7, path=u("/a/3"), text=u("bravo charlie delta"))
    w.add_document(id=8, path=u("/b/3"), text=u("charlie delta echo"))
    w.add_document(id=9, path=u("/c/3"), text=u("delta echo alfa"))
    w.commit(merge=False)

    with ix.searcher() as s:
        fq = Or([Prefix("path", "/a"), Prefix("path", "/b")])
        r = s.search(Term("text", "alfa"), filter=fq)
        assert_equal([d["id"] for d in r], [1, 4, 5])

        r = s.search(Term("text", "bravo"), filter=fq)
        assert_equal([d["id"] for d in r], [1, 2, 5, 7, ])
Example #14
def test_frequency_keyword(self):
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)

    w = ix.writer()
    w.add_document(content=u"A B C D E")
    w.add_document(content=u"B B B B C D D")
    w.add_document(content=u"D E F")
    w.commit()

    tr = ix.reader()
    self.assertEqual(tr.doc_frequency("content", u"B"), 2)
    self.assertEqual(tr.frequency("content", u"B"), 5)
    self.assertEqual(tr.doc_frequency("content", u"E"), 2)
    self.assertEqual(tr.frequency("content", u"E"), 2)
    self.assertEqual(tr.doc_frequency("content", u"A"), 1)
    self.assertEqual(tr.frequency("content", u"A"), 1)
    self.assertEqual(tr.doc_frequency("content", u"D"), 3)
    self.assertEqual(tr.frequency("content", u"D"), 4)
    self.assertEqual(tr.doc_frequency("content", u"F"), 1)
    self.assertEqual(tr.frequency("content", u"F"), 1)
    self.assertEqual(tr.doc_frequency("content", u"Z"), 0)
    self.assertEqual(tr.frequency("content", u"Z"), 0)
    self.assertEqual(list(tr), [(0, u"A", 1, 1), (0, u"B", 2, 5),
                                (0, u"C", 2, 2), (0, u"D", 3, 4),
                                (0, u"E", 2, 2), (0, u"F", 1, 1)])
    tr.close()
Example #15
def test_frequency_text(self):
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)

    w = ix.writer()
    w.add_document(content=u"alfa bravo charlie delta echo")
    w.add_document(content=u"bravo bravo bravo bravo charlie delta delta")
    w.add_document(content=u"delta echo foxtrot")
    w.commit()

    tr = ix.reader()
    self.assertEqual(tr.doc_frequency("content", u"bravo"), 2)
    self.assertEqual(tr.frequency("content", u"bravo"), 5)
    self.assertEqual(tr.doc_frequency("content", u"echo"), 2)
    self.assertEqual(tr.frequency("content", u"echo"), 2)
    self.assertEqual(tr.doc_frequency("content", u"alfa"), 1)
    self.assertEqual(tr.frequency("content", u"alfa"), 1)
    self.assertEqual(tr.doc_frequency("content", u"delta"), 3)
    self.assertEqual(tr.frequency("content", u"delta"), 4)
    self.assertEqual(tr.doc_frequency("content", u"foxtrot"), 1)
    self.assertEqual(tr.frequency("content", u"foxtrot"), 1)
    self.assertEqual(tr.doc_frequency("content", u"zulu"), 0)
    self.assertEqual(tr.frequency("content", u"zulu"), 0)
    self.assertEqual(list(tr), [(0, u"alfa", 1, 1), (0, u"bravo", 2, 5),
                                (0, u"charlie", 2, 2), (0, u"delta", 3, 4),
                                (0, u"echo", 2, 2), (0, u"foxtrot", 1, 1)])
    tr.close()
Example #16
def test_frequency_keyword():
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)

    w = ix.writer()
    w.add_document(content=u("A B C D E"))
    w.add_document(content=u("B B B B C D D"))
    w.add_document(content=u("D E F"))
    w.commit()

    with ix.reader() as tr:
        assert tr.doc_frequency("content", u("B")) == 2
        assert tr.frequency("content", u("B")) == 5
        assert tr.doc_frequency("content", u("E")) == 2
        assert tr.frequency("content", u("E")) == 2
        assert tr.doc_frequency("content", u("A")) == 1
        assert tr.frequency("content", u("A")) == 1
        assert tr.doc_frequency("content", u("D")) == 3
        assert tr.frequency("content", u("D")) == 4
        assert tr.doc_frequency("content", u("F")) == 1
        assert tr.frequency("content", u("F")) == 1
        assert tr.doc_frequency("content", u("Z")) == 0
        assert tr.frequency("content", u("Z")) == 0

        stats = [(fname, text, ti.doc_frequency(), ti.weight())
                 for (fname, text), ti in tr]
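        # (Iterating the reader yields ((fieldname, termbytes), TermInfo)
        # pairs in this version of the API.)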

        assert stats == [("content", b("A"), 1, 1), ("content", b("B"), 2, 5),
                         ("content", b("C"), 2, 2), ("content", b("D"), 3, 4),
                         ("content", b("E"), 2, 2), ("content", b("F"), 1, 1)]
Example #17
def test_frequency_text():
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)

    w = ix.writer()
    w.add_document(content=u("alfa bravo charlie delta echo"))
    w.add_document(content=u("bravo bravo bravo bravo charlie delta delta"))
    w.add_document(content=u("delta echo foxtrot"))
    w.commit()

    with ix.reader() as tr:
        assert tr.doc_frequency("content", u("bravo")) == 2
        assert tr.frequency("content", u("bravo")) == 5
        assert tr.doc_frequency("content", u("echo")) == 2
        assert tr.frequency("content", u("echo")) == 2
        assert tr.doc_frequency("content", u("alfa")) == 1
        assert tr.frequency("content", u("alfa")) == 1
        assert tr.doc_frequency("content", u("delta")) == 3
        assert tr.frequency("content", u("delta")) == 4
        assert tr.doc_frequency("content", u("foxtrot")) == 1
        assert tr.frequency("content", u("foxtrot")) == 1
        assert tr.doc_frequency("content", u("zulu")) == 0
        assert tr.frequency("content", u("zulu")) == 0

        stats = [(fname, text, ti.doc_frequency(), ti.weight())
                 for (fname, text), ti in tr]

        assert stats == [("content", b("alfa"), 1, 1),
                         ("content", b("bravo"), 2, 5),
                         ("content", b("charlie"), 2, 2),
                         ("content", b("delta"), 3, 4),
                         ("content", b("echo"), 2, 2),
                         ("content", b("foxtrot"), 1, 1)]
Example #18
def test_intersection(self):
    schema = fields.Schema(key=fields.ID(stored=True), value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    w.add_document(key=u"a", value=u"alpha bravo charlie delta")
    w.add_document(key=u"b", value=u"echo foxtrot alpha bravo")
    w.add_document(key=u"c", value=u"charlie delta golf hotel")
    w.commit()

    w = ix.writer()
    w.add_document(key=u"d", value=u"india alpha bravo charlie")
    w.add_document(key=u"e", value=u"delta bravo india bravo")
    w.commit()

    searcher = ix.searcher()

    q = And([Term("value", u"bravo"), Term("value", u"delta")])
    sc = q.scorer(searcher)
    self.assertEqual(self._keys(searcher, sc.all_ids()), ["a", "e"])

    q = And([Term("value", u"bravo"), Term("value", u"alpha")])
    sc = q.scorer(searcher)
    self.assertEqual(self._keys(searcher, sc.all_ids()), ["a", "b", "d"])
Example #19
def test_whole_noterms():
    schema = fields.Schema(text=fields.TEXT(stored=True), tag=fields.KEYWORD)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("alfa bravo charlie delta echo foxtrot golf"),
                       tag=u("foo"))

    with ix.searcher() as s:
        r = s.search(query.Term("text", u("delta")))
        assert len(r) == 1

        r.fragmenter = highlight.WholeFragmenter()
        r.formatter = highlight.UppercaseFormatter()
        hi = r[0].highlights("text")
        assert hi == u("alfa bravo charlie DELTA echo foxtrot golf")

        r = s.search(query.Term("tag", u("foo")))
        assert len(r) == 1
        r.fragmenter = highlight.WholeFragmenter()
        r.formatter = highlight.UppercaseFormatter()
        hi = r[0].highlights("text")
        assert hi == u("")

        hi = r[0].highlights("text", minscore=0)
        assert hi == u("alfa bravo charlie delta echo foxtrot golf")
Example #20
def test_everything_is_a_parent():
    schema = fields.Schema(id=fields.STORED, kind=fields.ID,
                           name=fields.ID(stored=True))
    k = u("alfa")
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, kind=k, name=u("one"))
        w.add_document(id=1, kind=k, name=u("two"))
        w.add_document(id=2, kind=k, name=u("three"))
        w.add_document(id=3, kind=k, name=u("four"))
        w.add_document(id=4, kind=k, name=u("one"))
        w.add_document(id=5, kind=k, name=u("two"))
        w.add_document(id=6, kind=k, name=u("three"))
        w.add_document(id=7, kind=k, name=u("four"))
        w.add_document(id=8, kind=k, name=u("one"))
        w.add_document(id=9, kind=k, name=u("two"))
        w.add_document(id=10, kind=k, name=u("three"))
        w.add_document(id=11, kind=k, name=u("four"))

    with ix.searcher() as s:
        pq = query.Term("kind", k)
        cq = query.Or([query.Term("name", "two"), query.Term("name", "four")])
        q = query.NestedParent(pq, cq)
        r = s.search(q)
        assert [hit["id"] for hit in r] == [1, 3, 5, 7, 9, 11]
Example #21
def test_pinpoint():
    domain = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet "
               "kilo lima mike november oskar papa quebec romeo sierra tango")
    schema = fields.Schema(text=fields.TEXT(stored=True, chars=True))
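    # chars=True stores character offsets in the postings; the
    # PinpointFragmenter used below needs them to slice the stored text.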
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=domain)
    w.commit()

    assert ix.schema["text"].supports("characters")
    with ix.searcher() as s:
        r = s.search(query.Term("text", "juliet"), terms=True)
        hit = r[0]
        hi = highlight.Highlighter()
        hi.formatter = highlight.UppercaseFormatter()

        assert not hi.can_load_chars(r, "text")
        assert hi.highlight_hit(hit, "text") == "golf hotel india JULIET kilo lima mike november"

        hi.fragmenter = highlight.PinpointFragmenter()
        assert hi.can_load_chars(r, "text")
        assert hi.highlight_hit(hit, "text") == "ot golf hotel india JULIET kilo lima mike nove"

        hi.fragmenter.autotrim = True
        assert hi.highlight_hit(hit, "text") == "golf hotel india JULIET kilo lima mike"
Example #22
def test_scoring():
    schema = fields.Schema(kind=fields.ID,
                           name=fields.KEYWORD(scorable=True, stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(kind=u("class"), name=u("Index"))
            w.add_document(kind=u("method"), name=u("add document"))
            w.add_document(kind=u("method"), name=u("add reader"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Accumulator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("get result"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Calculator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("add all"))
            w.add_document(kind=u("method"), name=u("add some"))
            w.add_document(kind=u("method"), name=u("multiply"))
            w.add_document(kind=u("method"), name=u("close"))

    with ix.searcher() as s:
        q = query.NestedParent(query.Term("kind", "class"),
                               query.Term("name", "add"))
        r = s.search(q)
        assert [hit["name"] for hit in r] == ["Calculator", "Index", "Accumulator"]
Example #23
def test_missing_field():
    schema = fields.Schema()
    ix = RamStorage().create_index(schema)

    with ix.searcher() as s:
        with pytest.raises(KeyError):
            s.document_numbers(id=u("test"))
Example #24
def test_boolean():
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("a"), done=True)
    w.add_document(id=u("b"), done=False)
    w.add_document(id=u("c"), done=True)
    w.add_document(id=u("d"), done=False)
    w.add_document(id=u("e"), done=True)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("done:true"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        r = s.search(qp.parse("done:yes"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        q = qp.parse("done:false")
        assert q.__class__ == query.Term
        assert q.text is False
        assert schema["done"].to_bytes(False) == b("f")
        r = s.search(q)
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

        r = s.search(qp.parse("done:no"))
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)
Example #25
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(spelling=True), b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("alfa bravo charlie delta"))
    w.add_document(a=u("delta echo foxtrot golf"))
    w.add_document(a=u("golf hotel india juliet"))
    w.add_document(a=u("juliet kilo lima mike"))
    w.commit()

    s = ix.searcher()
    qp = QueryParser("a", ix.schema)
    qtext = u('alpha ("brovo november" OR b:dolta) detail')
    q = qp.parse(qtext, ix.schema)

    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND (a:"bravo november" OR b:dolta) AND a:detail)'
    assert c.string == 'alfa ("bravo november" OR b:dolta) detail'

    qtext = u('alpha b:("brovo november" a:delta) detail')
    q = qp.parse(qtext, ix.schema)
    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
    assert c.string == 'alfa b:("brovo november" a:delta) detail'

    hf = highlight.HtmlFormatter(classname="c")
    assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
Example #26
    def create_index(cls, app, wh):
        """Creates and opens an index for the given whoosheer and app.
        If the index already exists, it just opens it, otherwise it creates
        it first.

        :param app: The application instance.
        :param wh: The whoosheer instance for which a index should be created.
        """
        # TODO: do we really want/need to use camel casing?
        # everywhere else, there is just .lower()
        if app.extensions['whooshee']['memory_storage']:
            storage = RamStorage()
            index = storage.create_index(wh.schema)
            assert index
            return index
        else:
            index_path = os.path.join(app.extensions['whooshee']['index_path_root'],
                                      getattr(wh, 'index_subdir', cls.camel_to_snake(wh.__name__)))
            if whoosh.index.exists_in(index_path):
                index = whoosh.index.open_dir(index_path)
            else:
                if not os.path.exists(index_path):
                    os.makedirs(index_path)
                index = whoosh.index.create_in(index_path, wh.schema)
            return index
Example #27
def test_numeric():
    schema = fields.Schema(id=fields.ID(stored=True),
                           integer=fields.NUMERIC(int),
                           floating=fields.NUMERIC(float))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("a"), integer=5820, floating=1.2)
    w.add_document(id=u("b"), integer=22, floating=2.3)
    w.add_document(id=u("c"), integer=78, floating=3.4)
    w.add_document(id=u("d"), integer=13, floating=4.5)
    w.add_document(id=u("e"), integer=9, floating=5.6)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("integer", schema)

        q = qp.parse(u("5820"))
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["id"] == "a"

    with ix.searcher() as s:
        r = s.search(qp.parse("floating:4.5"))
        assert len(r) == 1
        assert r[0]["id"] == "d"

    q = qp.parse("integer:*")
    assert q.__class__ == query.Every
    assert q.field() == "integer"

    q = qp.parse("integer:5?6")
    assert q == query.NullQuery
Example #28
def test_wildcard_existing_terms():
    s = fields.Schema(key=fields.ID, value=fields.TEXT)
    ix = RamStorage().create_index(s)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta"))
    w.add_document(key=u("a"), value=u("boggle echo render rendering renders"))
    w.commit()
    r = ix.reader()
    qp = QueryParser("value", ix.schema)

    def words(terms):
        z = []
        for t in terms:
            assert t[0] == "value"
            z.append(t[1])
        return " ".join(sorted(z))

    q = qp.parse(u("b*"))
    ts = q.existing_terms(r)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "bear boggle bravo")

    q = qp.parse(u("[a TO f]"))
    ts = q.existing_terms(r)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "alfa bear boggle bravo charlie delta echo")

    q = query.Variations("value", "render")
    ts = q.existing_terms(r, expand=False)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "render rendering renders")
Example #29
def test_highlight_daterange():
    from datetime import datetime

    schema = fields.Schema(id=fields.ID(unique=True, stored=True),
                           title=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True),
                           released=fields.DATETIME(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.update_document(
        id=u('1'),
        title=u('Life Aquatic'),
        content=u('A nautic film crew sets out to kill a gigantic shark.'),
        released=datetime(2004, 12, 25)
    )
    w.update_document(
        id=u('2'),
        title=u('Darjeeling Limited'),
        content=u('Three brothers meet in India for a life changing train journey.'),
        released=datetime(2007, 10, 27)
    )
    w.commit()

    s = ix.searcher()
    r = s.search(Term('content', u('train')), terms=True)
    assert_equal(len(r), 1)
    assert_equal(r[0]["id"], "2")
    assert_equal(r[0].highlights("content"), 'for a life changing <b class="match term0">train</b> journey')

    r = s.search(DateRange('released', datetime(2007, 1, 1), None))
    assert_equal(len(r), 1)
    assert_equal(r[0].highlights("content"), '')
Example #30
def test_decimal_ranges():
    from decimal import Decimal

    schema = fields.Schema(id=fields.STORED,
                           num=fields.NUMERIC(int, decimal_places=2))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    count = Decimal("0.0")
    inc = Decimal("0.2")
    for _ in xrange(500):
        w.add_document(id=str(count), num=count)
        count += inc
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, start, end):
            q = qp.parse(qs)
            result = [s.stored_fields(d)["id"] for d in q.docs(s)]

            target = []
            count = Decimal(start)
            limit = Decimal(end)
            while count <= limit:
                target.append(str(count))
                count += inc

            assert result == target

        check("[10.2 to 80.8]", "10.2", "80.8")
        check("{10.2 to 80.8]", "10.4", "80.8")
        check("[10.2 to 80.8}", "10.2", "80.6")
        check("{10.2 to 80.8}", "10.4", "80.6")
Example #31
    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if getattr(LOCALS, 'RAM_STORE', None) is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[
            self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True
Example #32
def test_index_decimals():
    from decimal import Decimal

    schema = fields.Schema(name=fields.KEYWORD(stored=True),
                           num=fields.NUMERIC(int))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        with pytest.raises(TypeError):
            w.add_document(name=u("hello"), num=Decimal("3.2"))

    schema = fields.Schema(name=fields.KEYWORD(stored=True),
                           num=fields.NUMERIC(Decimal, decimal_places=5))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(name=u("hello"), num=Decimal("3.2"))
Example #33
def test_term_stats():
    schema = fields.Schema(t=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(t=u("alfa bravo charlie delta echo"))
    w.add_document(t=u("bravo charlie delta echo foxtrot"))
    w.add_document(t=u("charlie delta echo foxtrot golf"))
    w.add_document(t=u("delta echo foxtrot"))
    w.add_document(t=u("echo foxtrot golf hotel india juliet"))
    w.add_document(t=u("foxtrot alfa alfa alfa"))
    w.commit()

    with ix.reader() as r:
        ti = r.term_info("t", u("alfa"))
        assert_equal(ti.weight(), 4.0)
        assert_equal(ti.doc_frequency(), 2)
        assert_equal(ti.min_length(), 4)
        assert_equal(ti.max_length(), 5)
        assert_equal(ti.max_weight(), 3.0)

        assert_equal(r.term_info("t", u("echo")).min_length(), 3)

        assert_equal(r.doc_field_length(3, "t"), 3)
        assert_equal(r.min_field_length("t"), 3)
        assert_equal(r.max_field_length("t"), 6)

    w = ix.writer()
    w.add_document(t=u("alfa"))
    w.add_document(t=u("bravo charlie"))
    w.add_document(t=u("echo foxtrot tango bravo"))
    w.add_document(t=u("golf hotel"))
    w.add_document(t=u("india"))
    w.add_document(t=u("juliet alfa bravo charlie delta echo foxtrot"))
    w.commit(merge=False)

    with ix.reader() as r:
        ti = r.term_info("t", u("alfa"))
        assert_equal(ti.weight(), 6.0)
        assert_equal(ti.doc_frequency(), 4)
        assert_equal(ti.min_length(), 1)
        assert_equal(ti.max_length(), 7)
        assert_equal(ti.max_weight(), 3.0)

        assert_equal(r.term_info("t", u("echo")).min_length(), 3)

        assert_equal(r.min_field_length("t"), 1)
        assert_equal(r.max_field_length("t"), 7)
Example #34
def test_spelling(self):
    st = RamStorage()

    sp = spelling.SpellChecker(st, mingram=2)

    wordlist = ["render", "animation", "animate", "shader",
                "shading", "zebra", "koala", "lamppost",
                "ready", "kismet", "reaction", "page",
                "delete", "quick", "brown", "fox", "jumped",
                "over", "lazy", "dog", "wicked", "erase",
                "red", "team", "yellow", "under", "interest",
                "open", "print", "acrid", "sear", "deaf",
                "feed", "grow", "heal", "jolly", "kilt",
                "low", "zone", "xylophone", "crown",
                "vale", "brown", "neat", "meat", "reduction",
                "blunder", "preaction"]

    sp.add_words([unicode(w) for w in wordlist])

    sugs = sp.suggest(u"reoction")
    self.assertNotEqual(len(sugs), 0)
    self.assertEqual(sugs, [u"reaction", u"reduction", u"preaction"])
Example #35
class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses
    the "whoosh" search library.
    """
    def __init__(self, toolbox):
        """
        Create a searcher for `toolbox`. 
        """
        self.toolbox = toolbox
        self.enabled = tool_search_enabled
        if tool_search_enabled:
            self.build_index()

    def build_index(self):
        self.storage = RamStorage()
        self.index = self.storage.create_index(schema)
        writer = self.index.writer()
        ## TODO: would also be nice to search section headers.
        for id, tool in self.toolbox.tools_by_id.iteritems():
            writer.add_document(id=id,
                                title=to_unicode(tool.name),
                                description=to_unicode(tool.description),
                                help=to_unicode(tool.help))
        writer.commit()

    def search(self, query, return_attribute='id'):
        if not tool_search_enabled:
            return []
        # Change field boosts for searcher to place more weight on title, description than help.
        searcher = self.index.searcher( \
                        weighting=BM25F( field_B={ 'title_B' : 3, 'description_B' : 2, 'help_B' : 1 } \
                                    ) )
        # Set query to search title, description, and help.
        parser = MultifieldParser(['title', 'description', 'help'],
                                  schema=schema)
        results = searcher.search(parser.parse(query), minscore=2.0)
        return [result[return_attribute] for result in results]
Example #36
def test_translate():
    domain = [("alfa", 100, 50), ("bravo", 20, 80), ("charlie", 10, 10),
              ("delta", 82, 39), ("echo", 20, 73), ("foxtrot", 81, 59),
              ("golf", 39, 93), ("hotel", 57, 48), ("india", 84, 75),
              ]

    schema = fields.Schema(name=fields.TEXT(sortable=True),
                           a=fields.NUMERIC(sortable=True),
                           b=fields.NUMERIC(sortable=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for name, a, b in domain:
            w.add_document(name=u(name), a=a, b=b)

    with ix.searcher() as s:
        q = query.Every()

        # Baseline: just sort by a field
        r = s.search(q, sortedby="a")
        assert " ".join([hit["name"] for hit in r]) == "charlie bravo echo golf hotel foxtrot delta india alfa"

        # Sort by reversed name
        target = [x[0] for x in sorted(domain, key=lambda x: x[0][::-1])]
        tf = sorting.TranslateFacet(lambda name: name[::-1], sorting.FieldFacet("name"))
        r = s.search(q, sortedby=tf)
        assert [hit["name"] for hit in r] == target

        # Sort by average of a and b
        def avg(a, b):
            return (a + b) / 2

        target = [x[0] for x in sorted(domain, key=lambda x: (x[1] + x[2]) / 2)]
        af = sorting.FieldFacet("a")
        bf = sorting.FieldFacet("b")
        tf = sorting.TranslateFacet(avg, af, bf)
        r = s.search(q, sortedby=tf)
        assert [hit["name"] for hit in r] == target
Example #37
def test_missing():
    schema = fields.Schema(kind=fields.ID,
                           name=fields.KEYWORD(scorable=True, stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(kind=u("class"), name=u("Index"))
            w.add_document(kind=u("method"), name=u("add document"))
            w.add_document(kind=u("method"), name=u("add reader"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Accumulator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("get result"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Calculator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("add all"))
            w.add_document(kind=u("method"), name=u("add some"))
            w.add_document(kind=u("method"), name=u("multiply"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Deleter"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("delete"))

    with ix.searcher() as s:
        q = query.NestedParent(query.Term("kind", "class"),
                               query.Term("name", "add"))

        r = s.search(q)
        assert [hit["name"] for hit in r
                ] == ["Calculator", "Index", "Accumulator", "Deleter"]

    with ix.writer() as w:
        w.delete_by_term("name", "Accumulator")
        w.delete_by_term("name", "Calculator")

    with ix.searcher() as s:
        pq = query.Term("kind", "class")
        assert len(list(pq.docs(s))) == 2
        q = query.NestedParent(pq, query.Term("name", "add"))
        r = s.search(q)
        assert [hit["name"] for hit in r] == ["Index", "Deleter"]
Example #38
def setup(self):
    """
    Defers loading until needed.
    """
    new_index = False

    # Make sure the index is there.
    if self.use_file_storage and not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
        os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
        new_index = True

    if self.use_file_storage and not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
        raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % settings.HAYSTACK_WHOOSH_PATH)

    if self.use_file_storage:
        self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
    else:
        global LOCALS

        if LOCALS.RAM_STORE is None:
            LOCALS.RAM_STORE = RamStorage()

        self.storage = LOCALS.RAM_STORE

    self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
    self.parser = QueryParser(self.content_field_name, schema=self.schema)

    if new_index is True:
        self.index = self.storage.create_index(self.schema)
    else:
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except index.EmptyIndexError:
            self.index = self.storage.create_index(self.schema)

    self.setup_complete = True
Example #39
def test_min_max_id():
    schema = fields.Schema(id=fields.STORED, t=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, t=u("alfa bravo charlie"))
    w.add_document(id=1, t=u("bravo charlie delta"))
    w.add_document(id=2, t=u("charlie delta echo"))
    w.add_document(id=3, t=u("delta echo foxtrot"))
    w.add_document(id=4, t=u("echo foxtrot golf"))
    w.commit()

    with ix.reader() as r:
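        # TermInfo.min_id()/max_id() report the lowest and highest document
        # numbers the term occurs in.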
        ti = r.term_info("t", u("delta"))
        assert_equal(ti.min_id(), 1)
        assert_equal(ti.max_id(), 3)

        ti = r.term_info("t", u("alfa"))
        assert_equal(ti.min_id(), 0)
        assert_equal(ti.max_id(), 0)

        ti = r.term_info("t", u("foxtrot"))
        assert_equal(ti.min_id(), 3)
        assert_equal(ti.max_id(), 4)

    w = ix.writer()
    w.add_document(id=5, t=u("foxtrot golf hotel"))
    w.add_document(id=6, t=u("golf hotel alfa"))
    w.add_document(id=7, t=u("hotel alfa bravo"))
    w.add_document(id=8, t=u("alfa bravo charlie"))
    w.commit(merge=False)

    with ix.reader() as r:
        ti = r.term_info("t", u("delta"))
        assert_equal(ti.min_id(), 1)
        assert_equal(ti.max_id(), 3)

        ti = r.term_info("t", u("alfa"))
        assert_equal(ti.min_id(), 0)
        assert_equal(ti.max_id(), 8)

        ti = r.term_info("t", u("foxtrot"))
        assert_equal(ti.min_id(), 3)
        assert_equal(ti.max_id(), 5)
Example #40
def test_clear():
    schema = fields.Schema(a=fields.KEYWORD)
    ix = RamStorage().create_index(schema)

    # Add some segments
    with ix.writer() as w:
        w.add_document(a=u"one two three")
        w.merge = False
    with ix.writer() as w:
        w.add_document(a=u"two three four")
        w.merge = False
    with ix.writer() as w:
        w.add_document(a=u"three four five")
        w.merge = False

    # Clear
    with ix.writer() as w:
        w.add_document(a=u"foo bar baz")
        w.mergetype = writing.CLEAR
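        # writing.CLEAR makes the commit drop all existing segments, leaving
        # only the documents added by this writer.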

    with ix.searcher() as s:
        assert s.doc_count_all() == 1
        assert list(s.reader().lexicon("a")) == [b("bar"), b("baz"), b("foo")]
Example #41
def _rt(c, values, default):
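    # Round-trip helper for column implementations: write `values` through
    # the column's writer, then read them back, first densely packed and then
    # sparsely. The b("hello") prefix offsets the data by 5 bytes to check
    # that readers respect a nonzero start position.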
    # Continuous
    st = RamStorage()
    f = st.create_file("test1")
    f.write(b("hello"))
    w = c.writer(f)
    for docnum, v in enumerate(values):
        w.add(docnum, v)
    w.finish(len(values))
    length = f.tell() - 5
    f.close()

    f = st.open_file("test1")
    r = c.reader(f, 5, length, len(values))
    assert values == list(r)
    for x in range(len(values)):
        assert values[x] == r[x]
    f.close()

    # Sparse
    doccount = len(values) * 7 + 15
    target = [default] * doccount

    f = st.create_file("test2")
    f.write(b("hello"))
    w = c.writer(f)
    for docnum, v in izip(xrange(10, doccount, 7), values):
        target[docnum] = v
        w.add(docnum, v)
    w.finish(doccount)
    length = f.tell() - 5
    f.close()

    f = st.open_file("test2")
    r = c.reader(f, 5, length, doccount)
    assert target == list(r)
    for x in range(doccount):
        assert target[x] == r[x]

    lr = r.load()
    assert target == list(lr)
    f.close()
Example #42
def test_missing_column():
    from whoosh import collectors

    schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, tags=u("alfa bravo charlie"))
        w.add_document(id=1, tags=u("bravo charlie delta"))
        w.add_document(id=2, tags=u("charlie delta echo"))
        w.merge = False

    with ix.writer() as w:
        w.add_field("age", fields.NUMERIC(sortable=True))

        w.add_document(id=3, tags=u("delta echo foxtrot"), age=10)
        w.add_document(id=4, tags=u("echo foxtrot golf"), age=5)
        w.add_document(id=5, tags=u("foxtrot golf alfa"), age=20)
        w.merge = False

    with ix.writer() as w:
        w.add_document(id=6, tags=u("golf alfa bravo"), age=2)
        w.add_document(id=7, tags=u("alfa hotel india"), age=50)
        w.add_document(id=8, tags=u("hotel india bravo"), age=15)
        w.merge = False

    with ix.searcher() as s:
        assert not s.is_atomic()

        q = query.Term("tags", u("alfa"))

        # Have to use yucky low-level collector API to make sure we used a
        # ColumnCategorizer to do the sorting
        c = s.collector(sortedby="age")
        assert isinstance(c, collectors.SortingCollector)
        s.search_with_collector(q, c)
        assert isinstance(c.categorizer, sorting.ColumnCategorizer)

        r = c.results()
        assert [hit["id"] for hit in r] == [6, 5, 7, 0]

        r = s.search(q, sortedby="age", reverse=True)
        assert [hit["id"] for hit in r] == [0, 7, 5, 6]
Example #43
def test_doc_boost():
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, a=u("alfa alfa alfa"), b=u("bravo"))
    w.add_document(id=1, a=u("alfa"), b=u("bear"), _a_boost=5.0)
    w.add_document(id=2, a=u("alfa alfa alfa alfa"), _boost=0.5)
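    # _boost scales scoring for the whole document; _a_boost / _b_boost scale
    # only the named field's weights.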
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Term("a", "alfa"))
        assert [hit["id"] for hit in r] == [1, 0, 2]

    w = ix.writer()
    w.add_document(id=3, a=u("alfa"), b=u("bottle"))
    w.add_document(id=4, b=u("bravo"), _b_boost=2.0)
    w.commit(merge=False)

    with ix.searcher() as s:
        r = s.search(query.Term("a", "alfa"))
        assert [hit["id"] for hit in r] == [1, 0, 3, 2]
Example #44
def test_score_facet():
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT,
                           c=fields.ID)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, a=u("alfa alfa bravo"), b=u("bottle"), c=u("c"))
    w.add_document(id=2, a=u("alfa alfa alfa"), b=u("bottle"), c=u("c"))
    w.commit()
    w = ix.writer()
    w.add_document(id=3, a=u("alfa bravo bravo"), b=u("bottle"), c=u("c"))
    w.add_document(id=4, a=u("alfa bravo alfa"), b=u("apple"), c=u("c"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=5, a=u("alfa bravo bravo"), b=u("apple"), c=u("c"))
    w.add_document(id=6, a=u("alfa alfa alfa"), b=u("apple"), c=u("c"))
    w.commit(merge=False)

    with ix.searcher() as s:
        facet = sorting.MultiFacet(["b", sorting.ScoreFacet()])
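        # Sort by the "b" field first, breaking ties by score.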
        r = s.search(q=query.Term("a", u("alfa")), sortedby=facet)
        assert [h["id"] for h in r] == [6, 4, 5, 2, 1, 3]
Example #45
def test_numeric_field_facet():
    schema = fields.Schema(id=fields.STORED, v1=fields.NUMERIC,
                           v2=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, v1=2, v2=100)
    w.add_document(id=2, v1=1, v2=50)
    w.commit()
    w = ix.writer()
    w.add_document(id=3, v1=2, v2=200)
    w.add_document(id=4, v1=1, v2=100)
    w.commit()
    w = ix.writer(merge=False)
    w.add_document(id=5, v1=2, v2=50)
    w.add_document(id=6, v1=1, v2=200)
    w.commit()

    with ix.searcher() as s:
        mf = sorting.MultiFacet().add_field("v1").add_field("v2", reverse=True)
        r = s.search(query.Every(), sortedby=mf)
        assert [hit["id"] for hit in r] == [6, 4, 2, 3, 1, 5]
Example #46
def test_checksum_file():
    from whoosh.filedb.structfile import ChecksumFile
    from zlib import crc32

    def wr(f):
        f.write(b("Testing"))
        f.write_int(-100)
        f.write_varint(10395)
        f.write_string(b("Hello"))
        f.write_ushort(32959)

    st = RamStorage()
    # Write a file normally
    f = st.create_file("control")
    wr(f)
    f.close()
    # Checksum the contents
    f = st.open_file("control")
    target = crc32(f.read()) & 0xffffffff
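    # (The 0xffffffff mask keeps the expected CRC unsigned, since zlib.crc32
    # may return signed values on some Python versions.)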
    f.close()

    # Write a file with checksumming
    f = st.create_file("test")
    cf = ChecksumFile(f)
    wr(cf)
    assert cf.checksum() == target
    f.close()

    # Read the file with checksumming
    f = st.open_file("test")
    cf = ChecksumFile(f)
    assert cf.read(7) == b("Testing")
    assert cf.read_int() == -100
    assert cf.read_varint() == 10395
    assert cf.read_string() == b("Hello")
    assert cf.read_ushort() == 32959
    assert cf.checksum() == target
    cf.close()
Example #47
def test_empty_index():
    schema = fields.Schema(key=fields.ID(stored=True), value=fields.TEXT)
    st = RamStorage()
    assert_raises(index.EmptyIndexError, st.open_index, schema=schema)
Example #48
class WhooshStore(SAMLStoreBase):
    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in ATTRS.keys():
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in ix.searcher().search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in info.pop('entity_attributes').items():
                info[a] = v
        for a, v in info.items():
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]

            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)

        for a in info.keys():
            if not a in self.schema.names():
                del info[a]

        for a, v in info.items():
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return self._collections

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield ATTRS[n]

    def attributes(self):
        return list(self._attributes())

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return list(searcher.lexicon(n))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return self.objects.values()
            else:
                return self.infos.values()

        from whoosh.qparser import QueryParser
        #import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in ATTRS_INV.items():
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return list(lst)
Example #49
def test_doc_count():
    schema = fields.Schema(id=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(id=i)

    r = ix.reader()
    assert r.doc_count() == 10
    assert r.doc_count_all() == 10

    w = ix.writer()
    w.delete_document(2)
    w.delete_document(4)
    w.delete_document(6)
    w.delete_document(8)
    w.commit()

    r = ix.reader()
    assert r.doc_count() == 6
    assert r.doc_count_all() == 10

    w = ix.writer()
    for i in xrange(10, 15):
        w.add_document(id=i)
    w.commit(merge=False)

    r = ix.reader()
    assert r.doc_count() == 11
    assert r.doc_count_all() == 15

    w = ix.writer()
    w.delete_document(10)
    w.delete_document(12)
    w.delete_document(14)
    w.commit(merge=False)

    r = ix.reader()
    assert r.doc_count() == 8
    assert r.doc_count_all() == 15

    ix.optimize()
    r = ix.reader()
    assert r.doc_count() == 8
    assert r.doc_count_all() == 8
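
# A minimal sketch (an addition) restating the invariant the test exercises:
# doc_count() counts live documents, while doc_count_all() still includes
# deleted slots until the segments holding them are merged or optimized away.
from whoosh import fields
from whoosh.filedb.filestore import RamStorage

sketch_ix = RamStorage().create_index(fields.Schema(id=fields.NUMERIC))
with sketch_ix.writer() as w:
    for i in range(3):
        w.add_document(id=i)
w = sketch_ix.writer()
w.delete_document(1)
w.commit(merge=False)
with sketch_ix.reader() as r:
    assert r.doc_count() == 2
    assert r.doc_count_all() == 3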
Example #50
def test_nested_children():
    schema = fields.Schema(t=fields.ID(stored=True),
                           track=fields.NUMERIC(stored=True),
                           album_name=fields.TEXT(stored=True),
                           song_name=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(t=u("album"), album_name=u("alfa bravo charlie"))
            w.add_document(t=u("track"),
                           track=1,
                           song_name=u("delta echo foxtrot"))
            w.add_document(t=u("track"),
                           track=2,
                           song_name=u("golf hotel india"))
            w.add_document(t=u("track"),
                           track=3,
                           song_name=u("juliet kilo lima"))
        with w.group():
            w.add_document(t=u("album"), album_name=u("mike november oskar"))
            w.add_document(t=u("track"),
                           track=1,
                           song_name=u("papa quebec romeo"))
            w.add_document(t=u("track"),
                           track=2,
                           song_name=u("sierra tango ultra"))
            w.add_document(t=u("track"),
                           track=3,
                           song_name=u("victor whiskey xray"))
        with w.group():
            w.add_document(t=u("album"), album_name=u("yankee zulu one"))
            w.add_document(t=u("track"),
                           track=1,
                           song_name=u("two three four"))
            w.add_document(t=u("track"),
                           track=2,
                           song_name=u("five six seven"))
            w.add_document(t=u("track"),
                           track=3,
                           song_name=u("eight nine ten"))

    with ix.searcher() as s:
        pq = query.Term("t", "album")
        aq = query.Term("album_name", "november")

        r = s.search(query.NestedChildren(pq, pq), limit=None)
        assert len(r) == 9
        assert [str(hit["t"]) for hit in r] == ["track"] * 9

        ncq = query.NestedChildren(pq, aq)
        assert list(ncq.docs(s)) == [5, 6, 7]
        r = s.search(ncq, limit=None)
        assert len(r) == 3
        assert [str(hit["song_name"]) for hit in r] == [
            "papa quebec romeo", "sierra tango ultra", "victor whiskey xray"
        ]

        zq = query.NestedChildren(pq, query.Term("album_name", "zulu"))
        f = sorting.StoredFieldFacet("song_name")
        r = s.search(zq, sortedby=f)
        assert [hit["track"] for hit in r] == [3, 2, 1]
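
    # Hedged companion check (an addition, not in the original test):
    # NestedParent is the inverse of NestedChildren, mapping matching child
    # documents back to their parent "album" documents.
    with ix.searcher() as s:
        pq = query.Term("t", "album")
        npq = query.NestedParent(pq, query.Term("song_name", "quebec"))
        r = s.search(npq)
        assert [hit["album_name"] for hit in r] == ["mike november oskar"]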
Example #51
def words_to_corrector(words):
    st = RamStorage()
    f = st.create_file("test")
    spelling.wordlist_to_graph_file(words, f)
    f = st.open_file("test")
    return spelling.GraphCorrector(fst.GraphReader(f))
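
# Hedged usage sketch for the helper above (an addition; the word list is
# hypothetical). wordlist_to_graph_file expects its input in sorted order,
# hence the sorted() call.
corrector = words_to_corrector(sorted([u("leader"), u("reader"), u("render")]))
print(corrector.suggest(u("rendar"), limit=2))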
Example #52
    def __init__(self, data_source):
        self.index = RamStorage().create_index(SCHEMA)
        self.data_source = data_source
Example #53
from whoosh.index import create_in
from whoosh.fields import *
from app import cursor
from whoosh.qparser import QueryParser
from whoosh.filedb.filestore import RamStorage
from whoosh.analysis import NgramAnalyzer
from whoosh import query

storage = RamStorage()


def load_states():
    analyzer = NgramAnalyzer(1, 2)
    state_schema = Schema(state=ID(stored=True, analyzer=analyzer))
    with cursor() as cur:
        print('Loading states...')
        cur.execute('SELECT DISTINCT state FROM msa')
        state_index = storage.create_index(state_schema)
        writer = state_index.writer()
        for s in cur.fetchall():
            writer.add_document(state=s[u'state'])
        writer.commit()
    return state_index
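
# Hedged usage sketch (an addition, not in the original module): querying
# the ngram-analyzed state index built by load_states().
def search_states(state_index, text, limit=10):
    parser = QueryParser("state", schema=state_index.schema)
    with state_index.searcher() as searcher:
        return [hit['state'] for hit in searcher.search(parser.parse(text),
                                                        limit=limit)]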


def load_cities():
    analyzer = NgramAnalyzer(1)
    city_schema = Schema(state=ID(stored=True),
                         city=ID(stored=True, analyzer=analyzer))
    with cursor() as cur:
        print('Loading cities...')
        # The original snippet breaks off here; the rest of the body is a
        # reconstruction that mirrors load_states(). The SQL query and the
        # explicit indexname (which keeps this index from replacing the
        # state index in the shared storage) are assumptions.
        cur.execute('SELECT DISTINCT state, city FROM msa')
        city_index = storage.create_index(city_schema, indexname='cities')
        writer = city_index.writer()
        for row in cur.fetchall():
            writer.add_document(state=row[u'state'], city=row[u'city'])
        writer.commit()
    return city_index
Example #54
    def __init__(self):
        storage = RamStorage()
        self.index = storage.create_index(schema)
Example #55
def test_simple_compound_nomap():
    st = RamStorage()
    _test_simple_compound(st)
Example #56
class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses
    the Whoosh search library.
    """
    def __init__(self, toolbox, index_help=True):
        """
        Create a searcher for `toolbox`.
        """
        self.schema = Schema(id=STORED,
                             stub=KEYWORD,
                             name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                             description=TEXT,
                             section=TEXT,
                             help=TEXT,
                             labels=KEYWORD)
        self.rex = analysis.RegexTokenizer()
        self.toolbox = toolbox
        self.build_index(index_help)

    def build_index(self, index_help=True):
        """Prepare search index for tools loaded in toolbox."""
        # Works around https://bitbucket.org/mchaput/whoosh/issues/391/race-conditions-with-temp-storage
        RamStorage.temp_storage = _temp_storage
        self.storage = RamStorage()
        self.index = self.storage.create_index(self.schema)
        writer = self.index.writer()
        start_time = datetime.now()
        log.debug('Starting to build toolbox index.')
        for id, tool in self.toolbox.tools():
            #  Do not add data managers to the public index
            if tool.tool_type == 'manage_data':
                continue
            add_doc_kwds = {
                "id": id,
                "description": to_unicode(tool.description),
                "section": to_unicode(tool.get_panel_section()[1]
                                      if len(tool.get_panel_section()) == 2 else ''),
                "help": to_unicode("")
            }
            if tool.name.find('-') != -1:
                # Hyphens are wildcards in Whoosh causing bad things
                add_doc_kwds['name'] = (' ').join(
                    [token.text for token in self.rex(to_unicode(tool.name))])
            else:
                add_doc_kwds['name'] = to_unicode(tool.name)
            if tool.guid:
                # Create a stub consisting of owner, repo, and tool from guid
                slash_indexes = [
                    m.start() for m in re.finditer('/', tool.guid)
                ]
                id_stub = tool.guid[(slash_indexes[1] + 1):slash_indexes[4]]
                add_doc_kwds['stub'] = (' ').join(
                    [token.text for token in self.rex(to_unicode(id_stub))])
            else:
                add_doc_kwds['stub'] = to_unicode(id)
            if tool.labels:
                add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
            if index_help and tool.help:
                try:
                    add_doc_kwds['help'] = to_unicode(
                        tool.help.render(host_url="", static_path=""))
                except Exception:
                    # Don't fail to build index just because a help message
                    # won't render.
                    pass
            writer.add_document(**add_doc_kwds)
        writer.commit()
        stop_time = datetime.now()
        log.debug('Toolbox index finished. It took: ' +
                  str(stop_time - start_time))

    def search(self, q, tool_name_boost, tool_section_boost,
               tool_description_boost, tool_label_boost, tool_stub_boost,
               tool_help_boost, tool_search_limit, tool_enable_ngram_search,
               tool_ngram_minsize, tool_ngram_maxsize):
        """
        Perform search on the in-memory index. Weight in the given boosts.
        """
        # Change field boosts for searcher
        searcher = self.index.searcher(weighting=BM25F(
            # BM25F reads per-field boosts from "<fieldname>_B" keyword
            # arguments; wrapping them in a field_B dict would leave the
            # boosts unused.
            name_B=float(tool_name_boost),
            section_B=float(tool_section_boost),
            description_B=float(tool_description_boost),
            labels_B=float(tool_label_boost),
            stub_B=float(tool_stub_boost),
            help_B=float(tool_help_boost)))
        # Set query to search name, description, section, help, and labels.
        parser = MultifieldParser(
            ['name', 'description', 'section', 'help', 'labels', 'stub'],
            schema=self.schema)
        # Hyphens are wildcards in Whoosh causing bad things
        if q.find('-') != -1:
            q = (' ').join([token.text for token in self.rex(to_unicode(q))])
        # Perform tool search with ngrams if set to true in the config file
        if (tool_enable_ngram_search is True
                or tool_enable_ngram_search == "True"):
            hits_with_score = {}
            token_analyzer = StandardAnalyzer() | analysis.NgramFilter(
                minsize=int(tool_ngram_minsize),
                maxsize=int(tool_ngram_maxsize))
            ngrams = [token.text for token in token_analyzer(q)]
            for ngram in ngrams:
                # Get the tool list with respective scores for each qgram
                curr_hits = searcher.search(parser.parse('*' + ngram + '*'),
                                            limit=float(tool_search_limit))
                for i, curr_hit in enumerate(curr_hits):
                    is_present = False
                    for prev_hit in hits_with_score:
                        # Check if the tool appears again for the next qgram search
                        if curr_hit['id'] == prev_hit:
                            is_present = True
                            # Add the current score with the previous one if the
                            # tool appears again for the next qgram
                            hits_with_score[prev_hit] = curr_hits.score(
                                i) + hits_with_score[prev_hit]
                    # Add the tool if not present to the collection with its score
                    if not is_present:
                        hits_with_score[curr_hit['id']] = curr_hits.score(i)
            # Sort the results based on aggregated BM25 score in decreasing order of scores
            hits_with_score = sorted(hits_with_score.items(),
                                     key=lambda x: x[1],
                                     reverse=True)
            # Return the tool ids
            return [
                item[0] for item in hits_with_score[0:int(tool_search_limit)]
            ]
        else:
            # Perform the search
            hits = searcher.search(parser.parse('*' + q + '*'),
                                   limit=float(tool_search_limit))
            return [hit['id'] for hit in hits]
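
# Minimal sketch (an addition) of the boost convention relied on above:
# whoosh's BM25F reads per-field boosts from keyword arguments named
# "<fieldname>_B".
from whoosh import scoring
weighting = scoring.BM25F(name_B=3.0, help_B=0.5)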
Example #57
def test_multireader_not():
    schema = fields.Schema(id=fields.STORED, f=fields.TEXT)

    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, f=u("alfa bravo charlie"))
    w.add_document(id=1, f=u("bravo charlie delta"))
    w.add_document(id=2, f=u("charlie delta echo"))
    w.add_document(id=3, f=u("delta echo foxtrot"))
    w.add_document(id=4, f=u("echo foxtrot golf"))
    w.commit()

    with ix.searcher() as s:
        q = And([Term("f", "delta"), Not(Term("f", "delta"))])
        r = s.search(q)
        assert_equal(len(r), 0)

    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=5, f=u("alfa bravo charlie"))
    w.add_document(id=6, f=u("bravo charlie delta"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=7, f=u("charlie delta echo"))
    w.add_document(id=8, f=u("delta echo foxtrot"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=9, f=u("echo foxtrot golf"))
    w.add_document(id=10, f=u("foxtrot golf delta"))
    w.commit(merge=False)
    assert len(ix._segments()) > 1

    with ix.searcher() as s:
        q = And([Term("f", "delta"), Not(Term("f", "delta"))])
        r = s.search(q)
        assert_equal(len(r), 0)
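
    # Hedged follow-up check (an addition): with several segments in the
    # index, ix.reader() returns a MultiReader, whose is_atomic() is False.
    r = ix.reader()
    assert not r.is_atomic()
    r.close()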
Example #58
class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses
    the Whoosh search library.
    """
    def __init__(self, toolbox, index_help=True):
        """
        Create a searcher for `toolbox`.
        """
        self.toolbox = toolbox
        self.build_index(index_help)

    def build_index(self, index_help=True):
        log.debug('Starting to build toolbox index.')
        self.storage = RamStorage()
        self.index = self.storage.create_index(schema)
        writer = self.index.writer()
        for id, tool in self.toolbox.tools():
            #  Do not add data managers to the public index
            if tool.tool_type == 'manage_data':
                continue
            add_doc_kwds = {
                "id": id,
                "name": to_unicode(tool.name),
                "description": to_unicode(tool.description),
                "section": to_unicode(tool.get_panel_section()[1]
                                      if len(tool.get_panel_section()) == 2 else ''),
                "help": to_unicode("")
            }
            if tool.labels:
                add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
            if index_help and tool.help:
                try:
                    add_doc_kwds['help'] = to_unicode(
                        tool.help.render(host_url="", static_path=""))
                except Exception:
                    # Don't fail to build index just because a help message
                    # won't render.
                    pass
            writer.add_document(**add_doc_kwds)
        writer.commit()
        log.debug('Toolbox index finished.')

    def search(self, q, tool_name_boost, tool_section_boost,
               tool_description_boost, tool_help_boost, tool_search_limit):
        """
        Perform search on the in-memory index. Weight in the given boosts.
        """
        # Change field boosts for searcher
        searcher = self.index.searcher(weighting=BM25F(
            # BM25F reads per-field boosts from "<fieldname>_B" keyword
            # arguments; wrapping them in a field_B dict would leave the
            # boosts unused.
            name_B=float(tool_name_boost),
            section_B=float(tool_section_boost),
            description_B=float(tool_description_boost),
            help_B=float(tool_help_boost)))
        # Set query to search name, description, section, help, and labels.
        parser = MultifieldParser(
            ['name', 'description', 'section', 'help', 'labels'],
            schema=schema)
        # Perform the search
        hits = searcher.search(parser.parse('*' + q + '*'),
                               limit=float(tool_search_limit))
        return [hit['id'] for hit in hits]
Example #59
class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses
    the Whoosh search library.
    """
    def __init__(self, toolbox, index_help=True):
        """
        Create a searcher for `toolbox`.
        """
        self.schema = Schema(id=STORED,
                             stub=KEYWORD,
                             name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                             description=TEXT,
                             section=TEXT,
                             help=TEXT,
                             labels=KEYWORD)
        self.rex = analysis.RegexTokenizer()
        self.toolbox = toolbox
        self.build_index(index_help)

    def build_index(self, index_help=True):
        # Works around https://bitbucket.org/mchaput/whoosh/issues/391/race-conditions-with-temp-storage
        RamStorage.temp_storage = _temp_storage
        self.storage = RamStorage()
        self.index = self.storage.create_index(self.schema)
        writer = self.index.writer()
        start_time = datetime.now()
        log.debug('Starting to build toolbox index.')
        for id, tool in self.toolbox.tools():
            #  Do not add data managers to the public index
            if tool.tool_type == 'manage_data':
                continue
            add_doc_kwds = {
                "id": id,
                "description": to_unicode(tool.description),
                "section": to_unicode(tool.get_panel_section()[1]
                                      if len(tool.get_panel_section()) == 2 else ''),
                "help": to_unicode("")
            }
            # Hyphens are wildcards in Whoosh causing bad things
            if tool.name.find('-') != -1:
                add_doc_kwds['name'] = (' ').join(
                    [token.text for token in self.rex(to_unicode(tool.name))])
            else:
                add_doc_kwds['name'] = to_unicode(tool.name)
            # We do not want to search Tool Shed or version parts
            # of the long ids
            if id.find('/') != -1:
                slash_indexes = [m.start() for m in re.finditer('/', id)]
                id_stub = id[(slash_indexes[1] + 1):slash_indexes[4]]
                add_doc_kwds['stub'] = (' ').join(
                    [token.text for token in self.rex(to_unicode(id_stub))])
            else:
                add_doc_kwds['stub'] = to_unicode(id)
            if tool.labels:
                add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
            if index_help and tool.help:
                try:
                    add_doc_kwds['help'] = to_unicode(
                        tool.help.render(host_url="", static_path=""))
                except Exception:
                    # Don't fail to build index just because a help message
                    # won't render.
                    pass
            writer.add_document(**add_doc_kwds)
        writer.commit()
        stop_time = datetime.now()
        log.debug('Toolbox index finished. It took: ' +
                  str(stop_time - start_time))

    def search(self, q, tool_name_boost, tool_section_boost,
               tool_description_boost, tool_label_boost, tool_stub_boost,
               tool_help_boost, tool_search_limit):
        """
        Perform search on the in-memory index. Weight in the given boosts.
        """
        # Change field boosts for searcher
        searcher = self.index.searcher(weighting=BM25F(
            # BM25F reads per-field boosts from "<fieldname>_B" keyword
            # arguments; wrapping them in a field_B dict would leave the
            # boosts unused.
            name_B=float(tool_name_boost),
            section_B=float(tool_section_boost),
            description_B=float(tool_description_boost),
            labels_B=float(tool_label_boost),
            stub_B=float(tool_stub_boost),
            help_B=float(tool_help_boost)))
        # Set query to search name, description, section, help, and labels.
        parser = MultifieldParser(
            ['name', 'description', 'section', 'help', 'labels', 'stub'],
            schema=self.schema)
        # Hyphens are wildcards in Whoosh causing bad things
        if q.find('-') != -1:
            q = (' ').join([token.text for token in self.rex(to_unicode(q))])
        # Perform the search
        hits = searcher.search(parser.parse('*' + q + '*'),
                               limit=float(tool_search_limit))
        return [hit['id'] for hit in hits]
Example #60
    def __init__(self):
        from whoosh.filedb.filestore import RamStorage

        self.storage = RamStorage()
        self.segment = MemSegment(self, "blah")