Example #1
0
def test_keyterms():
    ix = create_index()
    with ix.searcher() as s:
        docnum = s.document_number(path="/a")
        keys = list(s.key_terms([docnum], "content", numterms=3))
        assert ([t[0] for t in keys]
                == [u("collision"), u("calculations"), u("damped")])
Example #2
0
 def __unicode__(self):
     r = u("DisMax(")
     r += " ".join(sorted(text_type(s) for s in self.subqueries))
     r += u(")")
     if self.tiebreak:
         r += u("~") + text_type(self.tiebreak)
     return r
Example #3
0
def test_andnot():
    qp = default.QueryParser("content", None)
    q = qp.parse(u("this ANDNOT that"))
    assert q.__class__ == query.AndNot
    assert q.a.__class__ == query.Term
    assert q.b.__class__ == query.Term
    assert q.a.text == "this"
    assert q.b.text == "that"

    q = qp.parse(u("foo ANDNOT bar baz"))
    assert q.__class__ == query.And
    assert len(q) == 2
    assert q[0].__class__ == query.AndNot
    assert q[1].__class__ == query.Term

    q = qp.parse(u("foo fie ANDNOT bar baz"))
    assert q.__class__ == query.And
    assert len(q) == 3
    assert q[0].__class__ == query.Term
    assert q[1].__class__ == query.AndNot
    assert q[2].__class__ == query.Term

    q = qp.parse(u("a AND b ANDNOT c"))
    assert q.__class__ == query.AndNot
    assert text_type(q) == "((content:a AND content:b) ANDNOT content:c)"
Example #4
0
def test_phrase_andmaybe():
    qp = default.QueryParser("f", None)

    q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"'))
    assert isinstance(q, query.AndMaybe)
    assert q[0] == query.Term("f", u("Dahmen"))
    assert q[1] == query.Phrase("f", [u("Besov"), u("Spaces")])
Example #5
0
def test_compatibility():
    from whoosh.scoring import Weighting

    # This is the old way of doing a custom weighting model, check that
    # it's still supported...
    class LegacyWeighting(Weighting):
        use_final = True

        def score(self, searcher, fieldname, text, docnum, weight):
            return weight + 0.5

        def final(self, searcher, docnum, score):
            return score * 1.5

    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    domain = "alfa bravo charlie delta".split()
    for ls in permutations(domain, 3):
        w.add_document(text=u(" ").join(ls))
    w.commit()

    s = ix.searcher(weighting=LegacyWeighting())
    r = s.search(query.Term("text", u("bravo")))
    assert r.score(0) == 2.25
Example #6
0
 def __unicode__(self):
     r = u("(")
     r += (self.JOINT).join([text_type(s) for s in self.subqueries])
     r += u(")")
     if self.minmatch:
         r += u(">%s") % self.minmatch
     return r
Example #7
0
def test_andnot():
    qp = default.QueryParser("content", None)
    q = qp.parse(u("this ANDNOT that"))
    assert_equal(q.__class__, query.AndNot)
    assert_equal(q.a.__class__, query.Term)
    assert_equal(q.b.__class__, query.Term)
    assert_equal(q.a.text, "this")
    assert_equal(q.b.text, "that")

    q = qp.parse(u("foo ANDNOT bar baz"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.AndNot)
    assert_equal(q[1].__class__, query.Term)

    q = qp.parse(u("foo fie ANDNOT bar baz"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 3)
    assert_equal(q[0].__class__, query.Term)
    assert_equal(q[1].__class__, query.AndNot)
    assert_equal(q[2].__class__, query.Term)

    q = qp.parse(u("a AND b ANDNOT c"))
    assert_equal(q.__class__, query.AndNot)
    assert_equal(text_type(q), "((content:a AND content:b) ANDNOT content:c)")
Example #8
0
def test_deleteall():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "deleteall") as ix:
        w = ix.writer()
        domain = u("alfa bravo charlie delta echo").split()
        for i, ls in enumerate(permutations(domain)):
            w.add_document(text=u(" ").join(ls))
            if not i % 10:
                w.commit()
                w = ix.writer()
        w.commit()

        # This is just a test, don't use this method to delete all docs IRL!
        doccount = ix.doc_count_all()
        w = ix.writer()
        for docnum in xrange(doccount):
            w.delete_document(docnum)
        w.commit()

        with ix.searcher() as s:
            r = s.search(query.Or([query.Term("text", u("alfa")),
                                   query.Term("text", u("bravo"))]))
            assert len(r) == 0

        ix.optimize()
        assert ix.doc_count_all() == 0

        with ix.reader() as r:
            assert list(r) == []
Example #9
0
def test_boolean_strings():
    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(i=0, b="true")
        w.add_document(i=1, b="True")
        w.add_document(i=2, b="false")
        w.add_document(i=3, b="False")
        w.add_document(i=4, b=u("true"))
        w.add_document(i=5, b=u("True"))
        w.add_document(i=6, b=u("false"))
        w.add_document(i=7, b=u("False"))

    with ix.searcher() as s:
        qp = qparser.QueryParser("b", ix.schema)

        def check(qs, nums):
            q = qp.parse(qs)
            r = s.search(q, limit=None)
            assert [hit["i"] for hit in r] == nums

        trues = [0, 1, 4, 5]
        falses = [2, 3, 6, 7]
        check("true", trues)
        check("True", trues)
        check("false", falses)
        check("False", falses)
        check("t", trues)
        check("f", falses)
Example #10
0
def test_requires():
    a = Term("f", u("a"))
    b = Term("f", u("b"))
    assert And([a, b]).requires() == set([a, b])
    assert Or([a, b]).requires() == set()
    assert AndMaybe(a, b).requires() == set([a])
    assert a.requires() == set([a])
Example #11
0
def test_colonspace():
    s = fields.Schema(content=fields.TEXT, url=fields.ID)
    qp = default.QueryParser("content", s)
    q = qp.parse(u("url:test"))
    assert_equal(q.__class__, query.Term)
    assert_equal(q.fieldname, "url")
    assert_equal(q.text, "test")

    q = qp.parse(u("url: test"))
    assert_equal(q.__class__, query.And)
    assert_equal(q[0].__class__, query.Term)
    assert_equal(q[1].__class__, query.Term)
    assert_equal(q[0].fieldname, "content")
    assert_equal(q[1].fieldname, "content")
    assert_equal(q[0].text, "url")
    assert_equal(q[1].text, "test")

    q = qp.parse(u("url:"))
    assert_equal(q.__class__, query.Term)
    assert_equal(q.fieldname, "content")
    assert_equal(q.text, "url")

    s = fields.Schema(foo=fields.KEYWORD)
    qp = default.QueryParser("foo", s)
    q = qp.parse(u("blah:"))
    assert_equal(q.__class__, query.Term)
    assert_equal(q.fieldname, "foo")
    assert_equal(q.text, "blah:")
Example #12
0
def test_datetime():
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)
Example #13
0
def test_boolean():
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("a"), done=True)
    w.add_document(id=u("b"), done=False)
    w.add_document(id=u("c"), done=True)
    w.add_document(id=u("d"), done=False)
    w.add_document(id=u("e"), done=True)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("done:true"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        r = s.search(qp.parse("done:yes"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        q = qp.parse("done:false")
        assert q.__class__ == query.Term
        assert q.text is False
        assert schema["done"].to_bytes(False) == b("f")
        r = s.search(q)
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

        r = s.search(qp.parse("done:no"))
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)
Example #14
0
def test_substitution():
    mf = analysis.RegexTokenizer(r"\S+") | analysis.SubstitutionFilter("-", "")
    assert_equal([t.text for t in mf(u("one-two th-re-ee four"))],
                 ["onetwo", "threee", "four"])
    
    mf = analysis.RegexTokenizer(r"\S+") | analysis.SubstitutionFilter("([^=]*)=(.*)", r"\2=\1")
    assert_equal([t.text for t in mf(u("a=b c=d ef"))], ["b=a", "d=c", "ef"])
Example #15
0
def test_requires():
    a = Term("f", u("a"))
    b = Term("f", u("b"))
    assert_equal(And([a, b]).requires(), set([a, b]))
    assert_equal(Or([a, b]).requires(), set())
    assert_equal(AndMaybe(a, b).requires(), set([a]))
    assert_equal(a.requires(), set([a]))
 def __unicode__(self):
     r = u("%s:%s") % (self.fieldname, self.text) + u("~")
     if self.maxdist > 1:
         r += u("%d") % self.maxdist
     if self.boost != 1.0:
         r += u("^%f") % self.boost
     return r
Example #17
0
def test_missing_field_scoring():
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u('Frank'), hobbies=u('baseball, basketball'))
    writer.commit()
    r = ix.reader()
    assert_equal(r.field_length("hobbies"), 2)
    assert_equal(r.field_length("name"), 1)
    r.close()

    writer = ix.writer()
    writer.add_document(name=u('Jonny'))
    writer.commit()

    with ix.searcher() as s:
        r = s.reader()
        assert_equal(len(ix._segments()), 1)
        assert_equal(r.field_length("hobbies"), 2)
        assert_equal(r.field_length("name"), 2)

        parser = qparser.MultifieldParser(['name', 'hobbies'], schema)
        q = parser.parse(u("baseball"))
        result = s.search(q)
        assert_equal(len(result), 1)
Example #18
0
 def __unicode__(self):
     r = self.text + u("~")
     if self.maxdist > 1:
         r += u("%d") % self.maxdist
     if self.boost != 1.0:
         r += u("^%f") % self.boost
     return r
Example #19
0
def test_boost_phrase():
    schema = fields.Schema(title=fields.TEXT(field_boost=5.0, stored=True), text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    domain = u("alfa bravo charlie delta").split()
    w = ix.writer()
    for ls in permutations(domain):
        t = u(" ").join(ls)
        w.add_document(title=t, text=t)
    w.commit()

    q = Or([Term("title", u("alfa")), Term("title", u("bravo")), Phrase("text", [u("bravo"), u("charlie"), u("delta")])])

    def boost_phrases(q):
        if isinstance(q, Phrase):
            q.boost *= 1000.0
            return q
        else:
            return q.apply(boost_phrases)
    q = boost_phrases(q)

    with ix.searcher() as s:
        r = s.search(q, limit=None)
        for hit in r:
            if "bravo charlie delta" in hit["title"]:
                assert hit.score > 100.0
Example #20
0
def test_fractional_weights():
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    
    # With Positions format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
    w.commit()
    
    with ix.searcher() as s:
        wts = []
        for word in s.lexicon("f"):
            p = s.postings("f", word)
            wts.append(p.weight())
        assert_equal(wts, [0.5, 1.5, 2.0, 1.5])
    
    # Try again with Frequency format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana, phrase=False))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
    w.commit()
    
    with ix.searcher() as s:
        wts = []
        for word in s.lexicon("f"):
            p = s.postings("f", word)
            wts.append(p.weight())
        assert_equal(wts, [0.5, 1.5, 2.0, 1.5])
Example #21
0
def test_delete_nonexistant():
    from whoosh.writing import IndexingError
    
    schema = fields.Schema(id=fields.ID(stored=True))
    # Single segment
    with TempIndex(schema, "deletenon1") as ix:
        w = ix.writer()
        for char in u("ABC"):
            w.add_document(id=char)
        w.commit()
        
        try:
            w = ix.writer()
            assert_raises(IndexingError, w.delete_document, 5)
        finally:
            w.cancel()
    
    # Multiple segments
    with TempIndex(schema, "deletenon1") as ix:
        for char in u("ABC"):
            w = ix.writer()
            w.add_document(id=char)
            w.commit(merge=False)
        
        try:
            w = ix.writer()
            assert_raises(IndexingError, w.delete_document, 5)
        finally:
            w.cancel()
def test_gtlt():
    schema = fields.Schema(a=fields.KEYWORD, b=fields.NUMERIC,
                           c=fields.KEYWORD,
                           d=fields.NUMERIC(float), e=fields.DATETIME)
    qp = qparser.QueryParser("a", schema)
    qp.add_plugin(plugins.GtLtPlugin())
    qp.add_plugin(dateparse.DateParserPlugin())

    q = qp.parse(u("a:hello b:>100 c:<=z there"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 4)
    assert_equal(q[0], query.Term("a", "hello"))
    assert_equal(q[1], query.NumericRange("b", 100, None, startexcl=True))
    assert_equal(q[2], query.TermRange("c", None, 'z'))
    assert_equal(q[3], query.Term("a", "there"))

    q = qp.parse(u("hello e:>'29 mar 2001' there"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 3)
    assert_equal(q[0], query.Term("a", "hello"))
    # As of this writing, date ranges don't support startexcl/endexcl
    assert_equal(q[1], query.DateRange("e", datetime(2001, 3, 29, 0, 0), None))
    assert_equal(q[2], query.Term("a", "there"))

    q = qp.parse(u("a:> alfa c:<= bravo"))
    assert_equal(text_type(q), "(a:a: AND a:alfa AND a:c: AND a:bravo)")

    qp.remove_plugin_class(plugins.FieldsPlugin)
    qp.remove_plugin_class(plugins.RangePlugin)
    q = qp.parse(u("hello a:>500 there"))
    assert_equal(text_type(q), "(a:hello AND a:a: AND a:500 AND a:there)")
def test_pseudofield():
    schema = fields.Schema(a=fields.KEYWORD, b=fields.TEXT)

    def regex_maker(node):
        if node.has_text:
            node = qparser.RegexPlugin.RegexNode(node.text)
            node.set_fieldname("content")
            return node

    qp = qparser.QueryParser("a", schema)
    qp.add_plugin(qparser.PseudoFieldPlugin({"regex": regex_maker}))
    q = qp.parse(u("alfa regex:br.vo"))
    assert_equal(q.__unicode__(), '(a:alfa AND content:r"br.vo")')

    def rev_text(node):
        if node.has_text:
            # Create a word node for the reversed text
            revtext = node.text[::-1]  # Reverse the text
            rnode = qparser.WordNode(revtext)
            # Duplicate the original node's start and end char
            rnode.set_range(node.startchar, node.endchar)

            # Put the original node and the reversed node in an OrGroup
            group = qparser.OrGroup([node, rnode])

            # Need to set the fieldname here because the PseudoFieldPlugin
            # removes the field name syntax
            group.set_fieldname("reverse")

            return group

    qp = qparser.QueryParser("content", schema)
    qp.add_plugin(qparser.PseudoFieldPlugin({"reverse": rev_text}))
    q = qp.parse(u("alfa reverse:bravo"))
    assert_equal(q.__unicode__(), '(content:alfa AND (reverse:bravo OR reverse:ovarb))')
Example #24
0
def test_hit_column():
    # Not stored
    schema = fields.Schema(text=fields.TEXT())
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("alfa bravo charlie"))

    with ix.searcher() as s:
        r = s.search(query.Term("text", "alfa"))
        assert len(r) == 1
        hit = r[0]
        with pytest.raises(KeyError):
            _ = hit["text"]

    # With column
    schema = fields.Schema(text=fields.TEXT(sortable=True))
    ix = RamStorage().create_index(schema)
    with ix.writer(codec=W3Codec()) as w:
        w.add_document(text=u("alfa bravo charlie"))

    with ix.searcher() as s:
        r = s.search(query.Term("text", "alfa"))
        assert len(r) == 1
        hit = r[0]
        assert hit["text"] == u("alfa bravo charlie")
Example #25
0
def test_sorting_function():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT(stored=True, vector=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    domain = ("alfa", "bravo", "charlie")
    count = 1
    for w1 in domain:
        for w2 in domain:
            for w3 in domain:
                for w4 in domain:
                    w.add_document(id=count, text=u(" ").join((w1, w2, w3, w4)))
                    count += 1
    w.commit()

    def fn(searcher, docnum):
        v = dict(searcher.vector_as("frequency", docnum, "text"))
        # Sort documents that have equal number of "alfa"
        # and "bravo" first
        return 0 - 1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0)

    fnfacet = sorting.FunctionFacet(fn)

    with ix.searcher() as s:
        q = query.And([query.Term("text", u("alfa")), query.Term("text", u("bravo"))])
        results = s.search(q, sortedby=fnfacet)
        r = [hit["text"] for hit in results]
        for t in r[:10]:
            tks = t.split()
            assert tks.count("alfa") == tks.count("bravo")
Example #26
0
def test_compound_sort():
    fspec = fields.KEYWORD(stored=True, sortable=True)
    schema = fields.Schema(a=fspec, b=fspec, c=fspec)
    ix = RamStorage().create_index(schema)

    alist = u("alfa bravo alfa bravo alfa bravo alfa bravo alfa bravo").split()
    blist = u("alfa bravo charlie alfa bravo charlie alfa bravo charlie alfa").split()
    clist = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet").split()
    assert all(len(ls) == 10 for ls in (alist, blist, clist))

    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(a=alist[i], b=blist[i], c=clist[i])

    with ix.searcher() as s:
        q = query.Every()
        sortedby = [sorting.FieldFacet("a"), sorting.FieldFacet("b", reverse=True), sorting.FieldFacet("c")]

        r = s.search(q, sortedby=sortedby)
        output = []
        for hit in r:
            output.append(" ".join((hit["a"], hit["b"], hit["c"])))

        assert output == [
            "alfa charlie charlie",
            "alfa charlie india",
            "alfa bravo echo",
            "alfa alfa alfa",
            "alfa alfa golf",
            "bravo charlie foxtrot",
            "bravo bravo bravo",
            "bravo bravo hotel",
            "bravo alfa delta",
            "bravo alfa juliet",
        ]
Example #27
0
def test_overlapping_lists():
    schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, tags=u("alfa bravo charlie"))
        w.add_document(id=1, tags=u("bravo charlie delta"))
        w.add_document(id=2, tags=u("charlie delta echo"))
        w.add_document(id=3, tags=u("delta echo alfa"))
        w.add_document(id=4, tags=u("echo alfa bravo"))

    with ix.searcher() as s:
        of = sorting.FieldFacet("tags", allow_overlap=True)
        cat = of.categorizer(s)
        assert not cat._use_vectors

        r = s.search(query.Every(), groupedby={"tags": of})
        assert r.groups("tags") == {
            "alfa": [0, 3, 4],
            "bravo": [0, 1, 4],
            "charlie": [0, 1, 2],
            "delta": [1, 2, 3],
            "echo": [2, 3, 4],
        }

        fcts = sorting.Facets()
        fcts.add_field("tags", allow_overlap=True)
        r = s.search(query.Every(), groupedby=fcts)
        assert r.groups("tags") == {
            "alfa": [0, 3, 4],
            "bravo": [0, 1, 4],
            "charlie": [0, 1, 2],
            "delta": [1, 2, 3],
            "echo": [2, 3, 4],
        }
Example #28
0
 def check(method):
     with TempIndex(get_schema()) as ix:
         method(ix)
         with ix.searcher() as s:
             results = s.search(query.Every(), groupedby="tag")
             groups = results.groups()
             assert sorted(groups.items()) == [(u("one"), [0, 6]), (u("three"), [1, 3, 7, 8]), (u("two"), [2, 4, 5])]
Example #29
0
def test_term_inspection():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    writer = ix.writer()
    writer.add_document(title=u("My document"),
                        content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE"))
    writer.add_document(title=u("My other document"),
                        content=u("AA AB BB CC EE EE AX AX DD"))
    writer.commit()

    reader = ix.reader()
    assert " ".join(reader.field_terms("content")) == "aa ab ax bb cc dd ee"
    assert list(reader.expand_prefix("content", "a")) == [b('aa'), b('ab'), b('ax')]
    assert set(reader.all_terms()) == set([('content', b('aa')), ('content', b('ab')),
                                           ('content', b('ax')), ('content', b('bb')),
                                           ('content', b('cc')), ('content', b('dd')),
                                           ('content', b('ee')), ('title', b('document')),
                                           ('title', b('my')), ('title', b('other'))])
    # (text, doc_freq, index_freq)
    assert _fstats(reader.iter_field("content")) == [(b('aa'), 2, 6), (b('ab'), 1, 1), (b('ax'), 1, 2),
                                                     (b('bb'), 2, 5), (b('cc'), 2, 3), (b('dd'), 2, 2),
                                                     (b('ee'), 2, 4)]
    assert _fstats(reader.iter_field("content", prefix="c")) == [(b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)]
    assert list(reader.most_frequent_terms("content")) == [(6, b('aa')), (5, b('bb')), (4, b('ee')), (3, b('cc')), (2, b('dd'))]
    assert list(reader.most_frequent_terms("content", prefix="a")) == [(6, b('aa')), (2, b('ax')), (1, b('ab'))]
    assert list(reader.most_distinctive_terms("content", 3)) == [(1.3862943611198906, b('ax')), (0.6931471805599453, b('ab')), (0.0, b('ee'))]
Example #30
0
def test_suggest_prefix():
    domain = ("Shoot To Kill",
              "Bloom, Split and Deviate",
              "Rankle the Seas and the Skies",
              "Lightning Flash Flame Shell",
              "Flower Wind Rage and Flower God Roar, Heavenly Wind Rage and "
              "Heavenly Demon Sneer",
              "All Waves, Rise now and Become my Shield, Lightning, Strike "
              "now and Become my Blade",
              "Cry, Raise Your Head, Rain Without end",
              "Sting All Enemies To Death",
              "Reduce All Creation to Ash",
              "Sit Upon the Frozen Heavens",
              "Call forth the Twilight")

    schema = fields.Schema(content=fields.TEXT(stored=True, spelling=True),
                           quick=fields.NGRAM(maxsize=10, stored=True))

    with TempIndex(schema, "sugprefix") as ix:
        with ix.writer() as w:
            for item in domain:
                content = u(item)
                w.add_document(content=content, quick=content)

        with ix.searcher() as s:
            sugs = s.suggest("content", u("ra"), maxdist=2, prefix=2)
            assert sugs == ['rage', 'rain']

            sugs = s.suggest("content", "ra", maxdist=2, prefix=1)
            assert sugs == ["rage", "rain", "roar"]
Example #31
0
def test_variations():
    _run_query(query.Variations("value", u("render")),
               [u("A"), u("C"), u("E")])
Example #32
0
def test_find_missing():
    schema = fields.Schema(id=fields.ID, text=fields.KEYWORD(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("1"), text=u("alfa"))
    w.add_document(id=u("2"), text=u("bravo"))
    w.add_document(text=u("charlie"))
    w.add_document(id=u("4"), text=u("delta"))
    w.add_document(text=u("echo"))
    w.add_document(id=u("6"), text=u("foxtrot"))
    w.add_document(text=u("golf"))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("text", schema)
        q = qp.parse(u("NOT id:*"))
        r = s.search(q, limit=None)
        assert_equal(list(h["text"] for h in r), ["charlie", "echo", "golf"])
Example #33
0
def test_missing_wildcard():
    schema = fields.Schema(id=fields.ID(stored=True), f1=fields.TEXT,
                           f2=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    w.add_document(id=u("1"), f1=u("alfa"), f2=u("apple"))
    w.add_document(id=u("2"), f1=u("bravo"))
    w.add_document(id=u("3"), f1=u("charlie"), f2=u("candy"))
    w.add_document(id=u("4"), f2=u("donut"))
    w.add_document(id=u("5"))
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Every("id"))
        assert_equal(sorted([d['id'] for d in r]), ["1", "2", "3", "4", "5"])

        r = s.search(query.Every("f1"))
        assert_equal(sorted([d['id'] for d in r]), ["1", "2", "3"])

        r = s.search(query.Every("f2"))
        assert_equal(sorted([d['id'] for d in r]), ["1", "3", "4"])
Example #34
0
def test_wildcard():
    _run_query(query.Or([query.Wildcard('value', u('*red*')),
                         query.Wildcard('name', u('*yellow*'))]),
               [u("A"), u("C"), u("D"), u("E")])
    # Missing
    _run_query(query.Wildcard('value', 'glonk*'), [])
Example #35
0
def test_phrase_score():
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("A"),
                        value=u("Little Miss Muffet sat on a tuffet"))
    writer.add_document(name=u("D"),
                        value=u("Gibberish blonk falunk miss muffet sat " +
                                "tuffet garbonzo"))
    writer.add_document(name=u("E"), value=u("Blah blah blah pancakes"))
    writer.add_document(name=u("F"),
                        value=u("Little miss muffet little miss muffet"))
    writer.commit()

    with ix.searcher() as s:
        q = query.Phrase("value", [u("little"), u("miss"), u("muffet")])
        m = q.matcher(s)
        assert_equal(m.id(), 0)
        score1 = m.weight()
        assert score1 > 0
        m.next()
        assert_equal(m.id(), 3)
        assert m.weight() > score1
Example #36
0
def test_posting_phrase():
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("A"),
                        value=u("Little Miss Muffet sat on a tuffet"))
    writer.add_document(name=u("B"), value=u("Miss Little Muffet tuffet"))
    writer.add_document(name=u("C"), value=u("Miss Little Muffet tuffet sat"))
    writer.add_document(name=u("D"),
                        value=u("Gibberish blonk falunk miss muffet sat " +
                                "tuffet garbonzo"))
    writer.add_document(name=u("E"), value=u("Blah blah blah pancakes"))
    writer.commit()

    with ix.searcher() as s:
        def names(results):
            return sorted([fields['name'] for fields in results])

        q = query.Phrase("value", [u("little"), u("miss"), u("muffet"),
                                   u("sat"), u("tuffet")])
        m = q.matcher(s)
        assert_equal(m.__class__.__name__, "SpanNearMatcher")

        r = s.search(q)
        assert_equal(names(r), ["A"])
        assert_equal(len(r), 1)

        q = query.Phrase("value", [u("miss"), u("muffet"), u("sat"),
                                   u("tuffet")])
        assert_equal(names(s.search(q)), ["A", "D"])

        q = query.Phrase("value", [u("falunk"), u("gibberish")])
        r = s.search(q)
        assert_equal(names(r), [])
        assert_equal(len(r), 0)

        q = query.Phrase("value", [u("gibberish"), u("falunk")], slop=2)
        assert_equal(names(s.search(q)), ["D"])

        q = query.Phrase("value", [u("blah")] * 4)
        assert_equal(names(s.search(q)), [])  # blah blah blah blah

        q = query.Phrase("value", [u("blah")] * 3)
        m = q.matcher(s)
        assert_equal(names(s.search(q)), ["E"])
Example #37
0
def test_multireader():
    sc = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(sc)
    w = ix.writer()
    w.add_document(id=u("alfa"), content=u("alfa"))
    w.add_document(id=u("bravo"), content=u("bravo"))
    w.add_document(id=u("charlie"), content=u("charlie"))
    w.add_document(id=u("delta"), content=u("delta"))
    w.add_document(id=u("echo"), content=u("echo"))
    w.add_document(id=u("foxtrot"), content=u("foxtrot"))
    w.add_document(id=u("golf"), content=u("golf"))
    w.add_document(id=u("hotel"), content=u("hotel"))
    w.add_document(id=u("india"), content=u("india"))
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Term("content", u("bravo")))
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], "bravo")

    w = ix.writer()
    w.add_document(id=u("juliet"), content=u("juliet"))
    w.add_document(id=u("kilo"), content=u("kilo"))
    w.add_document(id=u("lima"), content=u("lima"))
    w.add_document(id=u("mike"), content=u("mike"))
    w.add_document(id=u("november"), content=u("november"))
    w.add_document(id=u("oscar"), content=u("oscar"))
    w.add_document(id=u("papa"), content=u("papa"))
    w.add_document(id=u("quebec"), content=u("quebec"))
    w.add_document(id=u("romeo"), content=u("romeo"))
    w.commit()
    assert_equal(len(ix._segments()), 2)

    #r = ix.reader()
    #assert r.__class__.__name__, "MultiReader")
    #pr = r.postings("content", u("bravo"))

    with ix.searcher() as s:
        r = s.search(query.Term("content", u("bravo")))
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], "bravo")
Example #38
0
def test_merged():
    sc = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(sc)
    w = ix.writer()
    w.add_document(id=u("alfa"), content=u("alfa"))
    w.add_document(id=u("bravo"), content=u("bravo"))
    w.add_document(id=u("charlie"), content=u("charlie"))
    w.add_document(id=u("delta"), content=u("delta"))
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Term("content", u("bravo")))
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], "bravo")

    w = ix.writer()
    w.add_document(id=u("echo"), content=u("echo"))
    w.commit()
    assert_equal(len(ix._segments()), 1)

    with ix.searcher() as s:
        r = s.search(query.Term("content", u("bravo")))
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], "bravo")
Example #39
0
def test_range():
    schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    w.add_document(id=u("A"), content=u("alfa bravo charlie delta echo"))
    w.add_document(id=u("B"), content=u("bravo charlie delta echo foxtrot"))
    w.add_document(id=u("C"), content=u("charlie delta echo foxtrot golf"))
    w.add_document(id=u("D"), content=u("delta echo foxtrot golf hotel"))
    w.add_document(id=u("E"), content=u("echo foxtrot golf hotel india"))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("content", schema)

        q = qp.parse(u("charlie [delta TO foxtrot]"))
        assert_equal(q.__class__, query.And)
        assert_equal(q[0].__class__, query.Term)
        assert_equal(q[1].__class__, query.TermRange)
        assert_equal(q[1].start, "delta")
        assert_equal(q[1].end, "foxtrot")
        assert_equal(q[1].startexcl, False)
        assert_equal(q[1].endexcl, False)
        ids = sorted([d['id'] for d in s.search(q)])
        assert_equal(ids, [u('A'), u('B'), u('C')])

        q = qp.parse(u("foxtrot {echo TO hotel]"))
        assert_equal(q.__class__, query.And)
        assert_equal(q[0].__class__, query.Term)
        assert_equal(q[1].__class__, query.TermRange)
        assert_equal(q[1].start, "echo")
        assert_equal(q[1].end, "hotel")
        assert_equal(q[1].startexcl, True)
        assert_equal(q[1].endexcl, False)
        ids = sorted([d['id'] for d in s.search(q)])
        assert_equal(ids, [u('B'), u('C'), u('D'), u('E')])

        q = qp.parse(u("{bravo TO delta}"))
        assert_equal(q.__class__, query.TermRange)
        assert_equal(q.start, "bravo")
        assert_equal(q.end, "delta")
        assert_equal(q.startexcl, True)
        assert_equal(q.endexcl, True)
        ids = sorted([d['id'] for d in s.search(q)])
        assert_equal(ids, [u('A'), u('B'), u('C')])

        # Shouldn't match anything
        q = qp.parse(u("[1 to 10]"))
        assert_equal(q.__class__, query.TermRange)
        assert_equal(len(s.search(q)), 0)
Example #40
0
def test_not2():
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("a"), value=u("alfa bravo charlie delta echo"))
    writer.add_document(name=u("b"),
                        value=u("bravo charlie delta echo foxtrot"))
    writer.add_document(name=u("c"),
                        value=u("charlie delta echo foxtrot golf"))
    writer.add_document(name=u("d"), value=u("delta echo golf hotel india"))
    writer.add_document(name=u("e"), value=u("echo golf hotel india juliet"))
    writer.commit()

    with ix.searcher() as s:
        p = qparser.QueryParser("value", None)
        results = s.search(p.parse("echo NOT golf"))
        assert_equal(sorted([d["name"] for d in results]), ["a", "b"])

        results = s.search(p.parse("echo NOT bravo"))
        assert_equal(sorted([d["name"] for d in results]), ["c", "d", "e"])

    ix.delete_by_term("value", u("bravo"))

    with ix.searcher() as s:
        results = s.search(p.parse("echo NOT charlie"))
        assert_equal(sorted([d["name"] for d in results]), ["d", "e"])
Example #41
0
def make_index():
    s = fields.Schema(key=fields.ID(stored=True),
                      name=fields.TEXT,
                      value=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(s)

    w = ix.writer()
    w.add_document(key=u("A"), name=u("Yellow brown"),
                   value=u("Blue red green render purple?"))
    w.add_document(key=u("B"), name=u("Alpha beta"),
                   value=u("Gamma delta epsilon omega."))
    w.add_document(key=u("C"), name=u("One two"),
                   value=u("Three rendered four five."))
    w.add_document(key=u("D"), name=u("Quick went"),
                   value=u("Every red town."))
    w.add_document(key=u("E"), name=u("Yellow uptown"),
                   value=u("Interest rendering outer photo!"))
    w.commit()

    return ix
Example #42
0
def test_collect_limit():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id="a", text=u("alfa bravo charlie delta echo"))
    w.add_document(id="b", text=u("bravo charlie delta echo foxtrot"))
    w.add_document(id="c", text=u("charlie delta echo foxtrot golf"))
    w.add_document(id="d", text=u("delta echo foxtrot golf hotel"))
    w.add_document(id="e", text=u("echo foxtrot golf hotel india"))
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Term("text", u("golf")), limit=10)
        assert_equal(len(r), 3)
        count = 0
        for _ in r:
            count += 1
        assert_equal(count, 3)

    w = ix.writer()
    w.add_document(id="f", text=u("foxtrot golf hotel india juliet"))
    w.add_document(id="g", text=u("golf hotel india juliet kilo"))
    w.add_document(id="h", text=u("hotel india juliet kilo lima"))
    w.add_document(id="i", text=u("india juliet kilo lima mike"))
    w.add_document(id="j", text=u("juliet kilo lima mike november"))
    w.commit(merge=False)

    with ix.searcher() as s:
        r = s.search(query.Term("text", u("golf")), limit=20)
        assert_equal(len(r), 5)
        count = 0
        for _ in r:
            count += 1
        assert_equal(count, 5)
Example #43
0
def test_filter():
    schema = fields.Schema(id=fields.STORED, path=fields.ID, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, path=u("/a/1"), text=u("alfa bravo charlie"))
    w.add_document(id=2, path=u("/b/1"), text=u("bravo charlie delta"))
    w.add_document(id=3, path=u("/c/1"), text=u("charlie delta echo"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=4, path=u("/a/2"), text=u("delta echo alfa"))
    w.add_document(id=5, path=u("/b/2"), text=u("echo alfa bravo"))
    w.add_document(id=6, path=u("/c/2"), text=u("alfa bravo charlie"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=7, path=u("/a/3"), text=u("bravo charlie delta"))
    w.add_document(id=8, path=u("/b/3"), text=u("charlie delta echo"))
    w.add_document(id=9, path=u("/c/3"), text=u("delta echo alfa"))
    w.commit(merge=False)

    with ix.searcher() as s:
        fq = query.Or([query.Prefix("path", "/a"),
                       query.Prefix("path", "/b")])
        r = s.search(query.Term("text", "alfa"), filter=fq)
        assert_equal([d["id"] for d in r], [1, 4, 5])

        r = s.search(query.Term("text", "bravo"), filter=fq)
        assert_equal([d["id"] for d in r], [1, 2, 5, 7, ])
Example #44
0
def test_fieldboost():
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, a=u("alfa bravo charlie"), b=u("echo foxtrot india"))
    w.add_document(id=1, a=u("delta bravo charlie"), b=u("alfa alfa alfa"))
    w.add_document(id=2, a=u("alfa alfa alfa"), b=u("echo foxtrot india"))
    w.add_document(id=3, a=u("alfa sierra romeo"), b=u("alfa tango echo"))
    w.add_document(id=4, a=u("bravo charlie delta"), b=u("alfa foxtrot india"))
    w.add_document(id=5, a=u("alfa alfa echo"), b=u("tango tango tango"))
    w.add_document(id=6, a=u("alfa bravo echo"), b=u("alfa alfa tango"))
    w.commit()

    def field_booster(fieldname, factor=2.0):
        "Returns a function which will boost the given field in a query tree"
        def booster_fn(obj):
            if obj.is_leaf() and obj.field() == fieldname:
                obj = copy.deepcopy(obj)
                obj.boost *= factor
                return obj
            else:
                return obj
        return booster_fn

    with ix.searcher() as s:
        q = query.Or([query.Term("a", u("alfa")),
                      query.Term("b", u("alfa"))])
        q = q.accept(field_booster("a", 100.0))
        assert_equal(text_type(q), text_type("(a:alfa^100.0 OR b:alfa)"))
        r = s.search(q)
        assert_equal([hit["id"] for hit in r], [2, 5, 6, 3, 0, 1, 4])
Example #45
0
def test_random_intersections():
    domain = [
        u("alpha"),
        u("bravo"),
        u("charlie"),
        u("delta"),
        u("echo"),
        u("foxtrot"),
        u("golf"),
        u("hotel"),
        u("india"),
        u("juliet"),
        u("kilo"),
        u("lima"),
        u("mike")
    ]
    segments = 5
    docsperseg = 50
    fieldlimits = (3, 10)
    documents = []

    schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    # Create docsperseg * segments documents containing random words from
    # the domain list. Add the documents to the index, but also keep them
    # in the "documents" list for the sanity check
    for i in xrange(segments):
        w = ix.writer()
        for j in xrange(docsperseg):
            docnum = i * docsperseg + j
            # Create a string of random words
            doc = u(" ").join(
                choice(domain) for _ in xrange(randint(*fieldlimits)))
            # Add the string to the index
            w.add_document(key=docnum, value=doc)
            # Add a (docnum, string) tuple to the documents list
            documents.append((docnum, doc))
        w.commit()
    assert len(ix._segments()) != 1

    testcount = 20
    testlimits = (2, 5)

    with ix.searcher() as s:
        for i in xrange(s.doc_count_all()):
            assert s.stored_fields(i).get("key") is not None

        for _ in xrange(testcount):
            # Create a random list of words and manually do an intersection of
            # items in "documents" that contain the words ("target").
            words = sample(domain, randint(*testlimits))
            target = []
            for docnum, doc in documents:
                if all((doc.find(w) > -1) for w in words):
                    target.append(docnum)
            target.sort()

            # Create a query from the list of words and get two matchers from
            # it.
            q = And([Term("value", w) for w in words])
            m1 = q.matcher(s)
            m2 = q.matcher(s)

            # Try getting the list of IDs from all_ids()
            ids1 = list(m1.all_ids())

            # Try getting the list of IDs using id()/next()
            ids2 = []
            while m2.is_active():
                ids2.append(m2.id())
                m2.next()

            # Check that the two methods return the same list
            assert ids1 == ids2

            # Check that the IDs match the ones we manually calculated
            assert _keys(s, ids1) == target
Example #46
0
def test_andnot():
    _run_query(query.AndNot(query.Term("name", u("yellow")),
                            query.Term("value", u("purple"))),
               [u("E")])
Example #47
0
def test_intersection():
    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alpha bravo charlie delta"))
    w.add_document(key=u("b"), value=u("echo foxtrot alpha bravo"))
    w.add_document(key=u("c"), value=u("charlie delta golf hotel"))
    w.commit()

    w = ix.writer()
    w.add_document(key=u("d"), value=u("india alpha bravo charlie"))
    w.add_document(key=u("e"), value=u("delta bravo india bravo"))
    w.commit()

    with ix.searcher() as s:
        q = And([Term("value", u("bravo")), Term("value", u("delta"))])
        m = q.matcher(s)
        assert _keys(s, m.all_ids()) == ["a", "e"]

        q = And([Term("value", u("bravo")), Term("value", u("alpha"))])
        m = q.matcher(s)
        assert _keys(s, m.all_ids()) == ["a", "b", "d"]
Example #48
0
def test_topnot():
    _run_query(query.Not(query.Term("value", "red")), [u("B"), "C", "E"])
    _run_query(query.Not(query.Term("name", "yellow")), [u("B"), u("C"),
                                                         u("D")])
Example #49
0
def test_add_spelling():
    schema = fields.Schema(text1=fields.TEXT, text2=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text1=u("render zorro kaori postal"), text2=u("alfa"))
    w.add_document(text1=u("reader zebra koala pastry"), text2=u("alpa"))
    w.add_document(text1=u("leader libra ooala paster"), text2=u("alpha"))
    w.add_document(text1=u("feeder lorry zoala baster"), text2=u("olfo"))
    w.commit()

    with ix.reader() as r:
        assert not r.has_word_graph("text1")
        assert not r.has_word_graph("text2")

    from whoosh.filedb.filewriting import add_spelling
    add_spelling(ix, ["text1", "text2"])

    with ix.reader() as r:
        assert r.has_word_graph("text1")
        assert r.has_word_graph("text2")

        sp = spelling.ReaderCorrector(r, "text1")
        assert_equal(sp.suggest(u("kaola"), maxdist=1), [u('koala')])
        assert_equal(
            sp.suggest(u("kaola"), maxdist=2),
            [u('koala'), u('kaori'),
             u('ooala'), u('zoala')])

        sp = spelling.ReaderCorrector(r, "text2")
        assert_equal(sp.suggest(u("alfo"), maxdist=1), [u("alfa"), u("olfo")])
Example #50
0
def test_require():
    _run_query(query.Require(query.Term("value", u("red")),
                             query.Term("name", u("yellow"))),
               [u("A")])
Example #51
0
def test_addfield():
    schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    with TempIndex(schema, "addfield") as ix:
        w = ix.writer()
        w.add_document(id=u("a"), content=u("alfa"))
        w.add_document(id=u("b"), content=u("bravo"))
        w.add_document(id=u("c"), content=u("charlie"))
        w.commit()

        ix.add_field("added", fields.KEYWORD(stored=True))

        w = ix.writer()
        w.add_document(id=u("d"), content=u("delta"), added=u("fourth"))
        w.add_document(id=u("e"), content=u("echo"), added=u("fifth"))
        w.commit(merge=False)

        with ix.searcher() as s:
            assert ("id", "d") in s.reader()
            assert_equal(s.document(id="d"), {"id": "d", "added": "fourth"})
            assert_equal(s.document(id="b"), {"id": "b"})
Example #52
0
from __future__ import with_statement
import gzip

from whoosh import analysis, fields, highlight, query, spelling
from whoosh.compat import u
from whoosh.qparser import QueryParser
from whoosh.support.levenshtein import levenshtein
from whoosh.util.testing import TempIndex

_wordlist = sorted(
    u("render animation animate shader shading zebra koala"
      "ready kismet reaction page delete quick fox jumped"
      "over lazy dog wicked erase red team yellow under interest"
      "open print acrid sear deaf feed grow heal jolly kilt"
      "low zone xylophone crown vale brown neat meat reduction"
      "blunder preaction lamppost").split())


def test_list_corrector():
    corr = spelling.ListCorrector(_wordlist)
    typo = "reoction"
    sugs = list(corr.suggest(typo, maxdist=2))
    target = []
    for lev_dist in range(1, 3):
        # sugs will return suggest first ordered by levenshtein distance
        # then second order by dictionary order
        target += [
            w for w in _wordlist
            if levenshtein(typo, w) <= lev_dist and w not in target
        ]
    assert sugs == target
Example #53
0
def test_removefield():
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.TEXT,
                           city=fields.KEYWORD(stored=True))
    with TempIndex(schema, "removefield") as ix:
        w = ix.writer()
        w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad"))
        w.add_document(id=u("c"), content=u("charlie"), city=u("cairo"))
        w.add_document(id=u("d"), content=u("delta"), city=u("dakar"))
        w.commit()

        with ix.searcher() as s:
            assert_equal(s.document(id=u("c")), {"id": "c", "city": "cairo"})

        w = ix.writer()
        w.remove_field("content")
        w.remove_field("city")
        w.commit()

        ixschema = ix._current_schema()
        assert_equal(ixschema.names(), ["id"])
        assert_equal(ixschema.stored_names(), ["id"])

        with ix.searcher() as s:
            assert ("content", u("charlie")) not in s.reader()
            assert_equal(s.document(id=u("c")), {"id": u("c")})
Example #54
0
def test_reader_corrector():
    schema = fields.Schema(text=fields.TEXT(spelling=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("render zorro kaori postal"))
    w.add_document(text=u("reader zebra koala pastry"))
    w.add_document(text=u("leader libra ooala paster"))
    w.add_document(text=u("feeder lorry zoala baster"))
    w.commit()

    with ix.reader() as r:
        assert r.has_word_graph("text")
        sp = spelling.ReaderCorrector(r, "text")
        assert_equal(sp.suggest(u("kaola"), maxdist=1), [u('koala')])
        assert_equal(
            sp.suggest(u("kaola"), maxdist=2),
            [u('koala'), u('kaori'),
             u('ooala'), u('zoala')])
Example #55
0
def _multi_segment_index():
    ix = _create_index()
    w = ix.writer()
    w.add_document(f1=u("A B C"), f2=u("1 2 3"), f3=u("X Y Z"))
    w.add_document(f1=u("D E F"), f2=u("4 5 6"), f3=u("Q R S"))
    w.commit()

    w = ix.writer()
    w.add_document(f1=u("A E C"), f2=u("1 4 6"), f3=u("X Q S"))
    w.add_document(f1=u("A A A"), f2=u("2 3 5"), f3=u("Y R Z"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(f1=u("A B"), f2=u("1 2"), f3=u("X Y"))
    w.commit(merge=False)

    return ix
Example #56
0
def test_optimize_away():
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.TEXT,
                           city=fields.KEYWORD(stored=True))
    with TempIndex(schema, "optimizeaway") as ix:
        w = ix.writer()
        w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad"))
        w.add_document(id=u("c"), content=u("charlie"), city=u("cairo"))
        w.add_document(id=u("d"), content=u("delta"), city=u("dakar"))
        w.commit()

        with ix.searcher() as s:
            assert_equal(s.document(id=u("c")), {"id": "c", "city": "cairo"})

        w = ix.writer()
        w.remove_field("content")
        w.remove_field("city")
        w.commit(optimize=True)

        with ix.searcher() as s:
            assert ("content", u("charlie")) not in s.reader()
            assert_equal(s.document(id=u("c")), {"id": u("c")})
Example #57
0
def test_short_prefix():
    s = fields.Schema(name=fields.ID, value=fields.TEXT)
    qp = qparser.QueryParser("value", schema=s)
    q = qp.parse(u("s*"))
    assert_equal(q.__class__.__name__, "Prefix")
    assert_equal(q.text, "s")
Example #58
0
 def __unicode__(self):
     return u("%s:<%s>") % (self.fieldname, self.text)
Example #59
0
def test_term():
    _run_query(query.Term("name", u("yellow")), [u("A"), u("E")])
    _run_query(query.Term("value", u("zeta")), [])
    _run_query(query.Term("value", u("red")), [u("A"), u("D")])
Example #60
0
def test_first_id():
    schema = fields.Schema(path=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(path=u("/a"))
    w.add_document(path=u("/b"))
    w.add_document(path=u("/c"))
    w.commit()

    r = ix.reader()
    docid = r.first_id("path", u("/b"))
    assert r.stored_fields(docid) == {"path": "/b"}

    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(path=u("/a"))
    w.add_document(path=u("/b"))
    w.add_document(path=u("/c"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(path=u("/d"))
    w.add_document(path=u("/e"))
    w.add_document(path=u("/f"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(path=u("/g"))
    w.add_document(path=u("/h"))
    w.add_document(path=u("/i"))
    w.commit(merge=False)

    r = ix.reader()
    assert r.__class__ == reading.MultiReader
    docid = r.first_id("path", u("/e"))
    assert r.stored_fields(docid) == {"path": "/e"}

    with pytest.raises(NotImplementedError):
        r.cursor("path")