Example #1
# The test snippets in Examples #1-#5 come from Whoosh's test suite and
# assume the following imports (the old suite used nose for assert_equal):
from nose.tools import assert_equal

from whoosh import analysis, fields, query
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage


def test_multifilter():
    iwf_for_index = analysis.IntraWordFilter(mergewords=True, mergenums=False)
    iwf_for_query = analysis.IntraWordFilter(mergewords=False, mergenums=False)
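    # MultiFilter picks a sub-filter by the tokenizer's mode: "index" while
    # indexing documents, "query" while parsing queries.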
    mf = analysis.MultiFilter(index=iwf_for_index, query=iwf_for_query)
    ana = analysis.RegexTokenizer() | mf | analysis.LowercaseFilter()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("Our BabbleTron5000 is great"))
    w.commit()

    with ix.searcher() as s:
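        # Indexing split "BabbleTron5000" at the letter/digit boundary, so
        # "5000" exists as its own term and can be matched and highlighted.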
        hit = s.search(query.Term("text", "5000"))[0]
        assert_equal(hit.highlights("text"),
                     'Our BabbleTron<b class="match term0">5000</b> is great')
Example #2
def test_intraword_possessive():
    iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
    ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter()

    target = u("O'Malley's-Bar")
    tokens = [(t.text, t.startchar, t.endchar)
              for t in ana(target, chars=True)]
    assert_equal(tokens, [("o", 0, 1), ("malley", 2, 8), ("bar", 11, 14),
                          ("omalleybar", 0, 14)])
Example #3
def test_frowny_face():
    # See https://bitbucket.org/mchaput/whoosh/issue/166/
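    # Tokens made entirely of delimiter characters should simply disappear
    # from the stream.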
    ana = analysis.RegexTokenizer(r"\S+") | analysis.IntraWordFilter()
    # text is all delimiters
    tokens = [t.text for t in ana(u(":-("))]
    assert_equal(tokens, [])

    # text has consecutive delimiters
    tokens = [t.text for t in ana(u("LOL:)"))]
    assert_equal(tokens, ["LOL"])
Example #4
def test_intraword():
    iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
    ana = analysis.RegexTokenizer(r"\S+") | iwf

    def check(text, ls):
        assert_equal([(t.pos, t.text) for t in ana(text)], ls)

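    # A merged token shares the position of its final component, so
    # "PowerShot" appears at the same position as "Shot".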
    check(u("PowerShot)"), [(0, "Power"), (1, "Shot"), (1, "PowerShot")])
    check(u("A's+B's&C's"), [(0, "A"), (1, "B"), (2, "C"), (2, "ABC")])
    check(u("Super-Duper-XL500-42-AutoCoder!"),
          [(0, "Super"), (1, "Duper"), (2, "XL"), (2, "SuperDuperXL"),
           (3, "500"), (4, "42"), (4, "50042"), (5, "Auto"), (6, "Coder"),
           (6, "AutoCoder")])
Example #5
def test_intraword_chars():
    iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
    ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter()

    target = u("WiKiWo-rd")
    tokens = [(t.text, t.startchar, t.endchar)
              for t in ana(target, chars=True)]
    assert_equal(tokens, [("wi", 0, 2), ("ki", 2, 4), ("wo", 4, 6),
                          ("rd", 7, 9), ("wikiword", 0, 9)])

    target = u("Zo WiKiWo-rd")
    tokens = [(t.text, t.startchar, t.endchar)
              for t in ana(target, chars=True)]
    assert_equal(tokens, [("zo", 0, 2), ("wi", 3, 5), ("ki", 5, 7),
                          ("wo", 7, 9), ("rd", 10, 12), ("wikiword", 3, 12)])
Example #6
import re

from whoosh import analysis, fields
# Assumed source of the `stoplists` mapping used below:
from whoosh.lang.stopwords import stoplists

# reST-style section titles: a line of text underlined with punctuation
title_re = re.compile(r"^\s*((\w|[:]).*?)\n[-*=#+%]{3,}$", re.MULTILINE)
# Sphinx roles such as :class:`~whoosh.fields.TEXT`
charclass_re = re.compile(r":(?P<cls>[^:]+):`~?(?P<ref>[^`\n]+)`", re.MULTILINE)
# Directive definitions such as ".. class:: Name"
def_re = re.compile(r"^[.][.] ?([^:]+):: (.*?)$", re.MULTILINE)

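# Directive names and role names mapped to index field names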
deffields = {"class": "cls", "module": "mod"}
reffields = {
    "mod": "modref",
    "class": "clsref",
    "func": "funcref",
    "pep": "pep"
}

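# Main prose analyzer: stemming, English stopwords, tokens longer than
# 40 characters dropped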
ana = analysis.StemmingAnalyzer(stoplist=stoplists["en"], maxsize=40)

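# Analyzer for class/module names: splits on case and punctuation
# boundaries and also emits the merged word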
cls_ana = (analysis.SpaceSeparatedTokenizer()
           | analysis.IntraWordFilter(mergewords=True)
           | analysis.LowercaseFilter())

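# Bare-bones analyzer for technical terms: word characters, lowercased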
tech_ana = (analysis.RegexTokenizer(r"\w+") | analysis.LowercaseFilter())


class PydocSchema(fields.SchemaClass):
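    # Document path, stored for retrieval but not indexed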
    path = fields.STORED

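    # Title text: stored for display, sortable, with spelling support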
    title = fields.TEXT(stored=True,
                        sortable=True,
                        spelling=True,
                        analyzer=ana)
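    # N-grams of the title words, for partial-word matching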
    tgrams = fields.NGRAMWORDS

    content = fields.TEXT(spelling=True, analyzer=ana)