Beispiel #1
0
def test_tee_filter():
    """TeeFilter should run every token through each branch, interleaving
    the branches' outputs in order."""
    source = u("Alfa Bravo Charlie")
    lower = analysis.LowercaseFilter()
    reverse = analysis.ReverseTextFilter()

    ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(lower, reverse)
    assert_equal(" ".join(t.text for t in ana(source)),
                 "alfa aflA bravo ovarB charlie eilrahC")

    class ucfilter(analysis.Filter):
        # Uppercases every token passing through.
        def __call__(self, tokens):
            for token in tokens:
                token.text = token.text.upper()
                yield token

    # A branch may itself be a composed pipeline.
    reversed_upper = analysis.ReverseTextFilter() | ucfilter()
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.TeeFilter(lower, reversed_upper))
    assert_equal(" ".join(t.text for t in ana(source)),
                 "alfa AFLA bravo OVARB charlie EILRAHC")

    # Filters after the tee see the merged stream from both branches.
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.TeeFilter(analysis.PassFilter(),
                                analysis.BiWordFilter())
           | analysis.LowercaseFilter())
    assert_equal(" ".join(t.text for t in ana(source)),
                 "alfa alfa-bravo bravo bravo-charlie charlie")
Beispiel #2
0
def test_substitution():
    """SubstitutionFilter should rewrite token text via regex replacement."""
    ana = analysis.RegexTokenizer(r"\S+") | analysis.SubstitutionFilter("-", "")
    tokens = [t.text for t in ana(u("one-two th-re-ee four"))]
    assert_equal(tokens, ["onetwo", "threee", "four"])

    # Backreferences work too: swap the halves on either side of "=".
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.SubstitutionFilter("([^=]*)=(.*)", r"\2=\1"))
    tokens = [t.text for t in ana(u("a=b c=d ef"))]
    assert_equal(tokens, ["b=a", "d=c", "ef"])
Beispiel #3
0
def test_regextokenizer():
    """RegexTokenizer yields the regex matches, or with gaps=True the text
    between matches instead."""
    value = u("AAAaaaBBBbbbCCCcccDDDddd")

    tokenizer = analysis.RegexTokenizer("[A-Z]+")
    assert_equal([token.text for token in tokenizer(value)],
                 ["AAA", "BBB", "CCC", "DDD"])

    tokenizer = analysis.RegexTokenizer("[A-Z]+", gaps=True)
    assert_equal([token.text for token in tokenizer(value)],
                 ["aaa", "bbb", "ccc", "ddd"])
Beispiel #4
0
def test_stop_lang():
    """StopFilter drops stop words; lang= selects the stop-list language."""
    english = analysis.RegexTokenizer() | analysis.StopFilter()
    tokens = [token.text for token in english(u("this is a test"))]
    assert tokens == [u("test")]

    spanish = analysis.RegexTokenizer() | analysis.StopFilter(lang="es")
    tokens = [token.text for token in spanish(u("el lapiz es en la mesa"))]
    assert tokens == ["lapiz", "mesa"]
Beispiel #5
0
def test_double_metaphone():
    """Check double_metaphone() codes for a sample of names, then check
    DoubleMetaphoneFilter in plain and combine modes."""
    from whoosh.lang.dmetaphone import double_metaphone

    # name -> (primary code, secondary code or None)
    names = {'maurice': ('MRS', None),
             'aubrey': ('APR', None),
             'cambrillo': ('KMPRL', 'KMPR'),
             'heidi': ('HT', None),
             'katherine': ('K0RN', 'KTRN'),
             'Thumbail': ('0MPL', 'TMPL'),
             'catherine': ('K0RN', 'KTRN'),
             'richard': ('RXRT', 'RKRT'),
             'bob': ('PP', None),
             'eric': ('ARK', None),
             'geoff': ('JF', 'KF'),
             'Through': ('0R', 'TR'),
             'Schwein': ('XN', 'XFN'),
             'dave': ('TF', None),
             'ray': ('R', None),
             'steven': ('STFN', None),
             'bryce': ('PRS', None),
             'randy': ('RNT', None),
             'bryan': ('PRN', None),
             'Rapelje': ('RPL', None),
             'brian': ('PRN', None),
             'otto': ('AT', None),
             'auto': ('AT', None),
             'Dallas': ('TLS', None),
             'maisey': ('MS', None),
             'zhang': ('JNK', None),
             'Chile': ('XL', None),
             'Jose': ('HS', None),
             'Arnow': ('ARN', 'ARNF'),
             'solilijs': ('SLLS', None),
             'Parachute': ('PRKT', None),
             'Nowhere': ('NR', None),
             'Tux': ('TKS', None)}

    # BUG FIX: the assertion used to sit OUTSIDE the loop, so only the
    # last name in iteration order was actually verified. Check them all.
    for name, expected in names.items():
        assert double_metaphone(name) == expected

    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter())
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    # Secondary metaphone codes are emitted with a reduced boost.
    assert results == [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)]

    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter(combine=True))
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    # combine=True keeps the original tokens alongside the codes.
    assert results == [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
                       ('F', 1.0), ('FF', 0.5)]

    namefield = fields.TEXT(analyzer=mf)
    texts = list(namefield.process_text(u("Spruce View"), mode="query"))
    assert texts == [u('spruce'), 'SPRS', u('view'), 'F', 'FF']
Beispiel #6
0
def populate_whoosh(text_dir, whoosh_dir):
    """Index every file under *text_dir* into four whoosh indexes, one per
    analyzer variant, written to ``whoosh_dir + str(i) + '/'``.

    Prints a progress dot per file and a final document count.
    """
    loaded = 0

    # Analyzer variants: raw tokens; +lowercase; +stopwords; stemming.
    # 000, 001, 010, 011,
    my_analyzers = [(analysis.RegexTokenizer()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()
                     | analysis.StopFilter()), (analysis.StemmingAnalyzer())]

    # One schema per analyzer.
    schemas = []
    for my_analyzer in my_analyzers:
        schema = Schema(url=ID(stored=True),
                        body=TEXT(stored=True, analyzer=my_analyzer))
        schemas.append(schema)

    # Create one index directory per analyzer.
    ixs = []
    for i, my_analyzer in enumerate(my_analyzers):
        whoosh_dir_current = whoosh_dir + str(i) + '/'
        os.makedirs(whoosh_dir_current, exist_ok=True)
        ix = index.create_in(whoosh_dir_current, schemas[i])
        ixs.append(ix)

    # Clear each index and open a fresh writer for it.
    # BUG FIX: this loop used to iterate enumerate(my_analyzer) — a single
    # analyzer left over from the loop above — instead of my_analyzers, so
    # the wrong number of writers was created.
    writers = []
    for i, my_analyzer in enumerate(my_analyzers):
        writer = ixs[i].writer()
        writer.commit(mergetype=writing.CLEAR)
        writer = ixs[i].writer()
        writers.append(writer)

    # Add every document to every index.
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            print('.', end='')
            # utf8 matches the sibling populate_whoosh in this file; the
            # platform default encoding would be fragile across systems.
            with open(text_file, encoding="utf8") as tf:
                body = tf.read()
                url = text_file.replace(text_dir, "")
                for writer in writers:
                    writer.add_document(url=url, body=body)
                loaded += 1

    for writer in writers:
        writer.commit()

    print("\n\nLoaded", loaded, "documents")
Beispiel #7
0
def test_double_metaphone():
    """DoubleMetaphoneFilter replaces tokens with their metaphone codes
    (secondaries at boost 0.5); combine=True keeps the originals too."""
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.DoubleMetaphoneFilter())
    pairs = [(t.text, t.boost) for t in ana(u("Spruce View"))]
    assert_equal(pairs, [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)])

    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.DoubleMetaphoneFilter(combine=True))
    pairs = [(t.text, t.boost) for t in ana(u("Spruce View"))]
    assert_equal(pairs, [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
                         ('F', 1.0), ('FF', 0.5)])

    # The combining analyzer should behave the same inside a TEXT field.
    namefield = fields.TEXT(analyzer=ana)
    texts = list(namefield.process_text(u("Spruce View"), mode="query"))
    assert_equal(texts, [u('spruce'), 'SPRS', u('view'), 'F', 'FF'])
def populate_whoosh(text_dir, whoosh_dir):
    """Index every file under *text_dir* into a whoosh index at *whoosh_dir*."""
    loaded = 0

    # Tokenize, lowercase, and strip stop words.
    analyzer = (analysis.RegexTokenizer()
                | analysis.LowercaseFilter()
                | analysis.StopFilter())

    schema = Schema(url=ID(stored=True),
                    body=TEXT(stored=True, analyzer=analyzer))

    # Create (or replace) the index directory.
    os.makedirs(whoosh_dir, exist_ok=True)
    ix = index.create_in(whoosh_dir, schema)

    # Start from an empty index.
    ix.writer().commit(mergetype=writing.CLEAR)

    writer = ix.writer()
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            with open(text_file, encoding="utf8") as tf:
                body = tf.read()
            # The document's url is its path relative to the corpus root.
            url = text_file.replace(text_dir, "")
            writer.add_document(url=url, body=body)
            print("Added", url)
            loaded += 1

    writer.commit()
    print("\n\nLoaded", loaded, "documents")
Beispiel #9
0
def test_word_segments():
    """CompoundWordFilter splits compound tokens into known subwords."""
    wordset = set(u("alfa bravo charlie delta").split())
    target = u("alfacharlie bravodelta delto bravo subalfa")

    # keep_compound=True emits the original token before its parts.
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.CompoundWordFilter(wordset, keep_compound=True))
    assert [t.text for t in ana(target)] == [
        "alfacharlie", "alfa", "charlie", "bravodelta",
        "bravo", "delta", "delto", "bravo", "subalfa"]

    # keep_compound=False drops the original compound token.
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.CompoundWordFilter(wordset, keep_compound=False))
    assert [t.text for t in ana(target)] == [
        "alfa", "charlie", "bravo", "delta", "delto", "bravo", "subalfa"]
Beispiel #10
0
def test_fractional_weights():
    """DelimitedAttributeFilter should carry word^boost weights into the
    postings, in both Positions and Frequency formats."""
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()

    def weights_for(schema):
        # Index one boosted document and collect each term's posting weight.
        ix = RamStorage().create_index(schema)
        writer = ix.writer()
        writer.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
        writer.commit()
        with ix.searcher() as s:
            return [s.postings("f", word).weight() for word in s.lexicon("f")]

    # With Positions format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana))
    assert_equal(weights_for(schema), [0.5, 1.5, 2.0, 1.5])

    # Try again with Frequency format (phrase=False)
    schema = fields.Schema(f=fields.TEXT(analyzer=ana, phrase=False))
    assert_equal(weights_for(schema), [0.5, 1.5, 2.0, 1.5])
Beispiel #11
0
def test_composing_functions():
    """A bare generator function can be composed into an analyzer with |."""
    # Renamed from `filter` to avoid shadowing the builtin of that name.
    def upper_filter(tokens):
        for t in tokens:
            t.text = t.text.upper()
            yield t

    analyzer = analysis.RegexTokenizer() | upper_filter
    assert_equal([t.text for t in analyzer(u("abc def"))], ["ABC", "DEF"])
Beispiel #12
0
def test_biword_stopwords():
    """BiWordFilter after StopFilter joins whatever tokens survive."""
    phrase = u("stuff and then some")

    # Note that the stop list is None here: only minsize filtering applies.
    ana = (analysis.RegexTokenizer()
           | analysis.StopFilter(stoplist=None, minsize=3)
           | analysis.BiWordFilter())
    assert [t.text for t in ana(phrase)] == ["stuff-and", "and-then",
                                             "then-some"]

    # With the default stop list, biwords bridge the removed words.
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.BiWordFilter())
    assert [t.text for t in ana(phrase)] == ["stuff-then", "then-some"]
Beispiel #13
0
def test_shared_composition():
    """Two analyzers may share a common tokenizer+filter prefix without
    interfering with each other."""
    base = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()

    ngram_ana = base | analysis.NgramFilter(3)
    metaphone_ana = base | analysis.DoubleMetaphoneFilter()

    assert_equal([t.text for t in ngram_ana(u("hello"))],
                 ["hel", "ell", "llo"])
    assert_equal([t.text for t in metaphone_ana(u("hello"))], ["HL"])
Beispiel #14
0
def test_multifilter():
    """MultiFilter selects a sub-filter based on the analyzer mode."""
    mf = analysis.MultiFilter(a=analysis.LowercaseFilter(),
                              b=analysis.PassFilter())
    ana = analysis.RegexTokenizer(r"\S+") | mf

    text = u("ALFA BRAVO CHARLIE")
    assert [t.text for t in ana(text, mode="a")] == ["alfa", "bravo",
                                                     "charlie"]
    assert [t.text for t in ana(text, mode="b")] == ["ALFA", "BRAVO",
                                                     "CHARLIE"]
Beispiel #15
0
def test_composition2():
    """Composing onto an existing composite flattens into one analyzer."""
    base = analysis.RegexTokenizer() | analysis.LowercaseFilter()
    sa = base | analysis.StopFilter()

    assert_equal(len(sa), 3)
    assert_equal(sa.__class__.__name__, "CompositeAnalyzer")
    # The three stages appear in pipeline order, not nested.
    expected_stages = ["RegexTokenizer", "LowercaseFilter", "StopFilter"]
    for i, stage_name in enumerate(expected_stages):
        assert_equal(sa[i].__class__.__name__, stage_name)
    assert_equal([t.text for t in sa(u("The ABC 123"))], ["abc", "123"])
Beispiel #16
0
def test_posboost_postings():
    """PositionBoosts format should round-trip boosts, positions, and
    frequencies through the index."""
    pbs = PositionBoosts()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")

    expected_boosts = [("alfa", [(0, 2), (4, 1), (5, 1)]),
                       ("bravo", [(1, 0.1), (3, 0.5)]),
                       ("charlie", [(2, 2)])]
    assert _roundtrip(content, pbs, "position_boosts", ana) == expected_boosts
    assert _roundtrip(content, pbs, "positions", ana) == [
        ("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])]
    assert _roundtrip(content, pbs, "frequency", ana) == [
        ("alfa", 3), ("bravo", 2), ("charlie", 1)]
Beispiel #17
0
def test_intraword_possessive():
    """IntraWordFilter splits on apostrophes/hyphens and merges the word
    parts, keeping character offsets intact."""
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.IntraWordFilter(mergewords=True, mergenums=True)
           | analysis.LowercaseFilter())

    observed = [(t.text, t.startchar, t.endchar)
                for t in ana(u("O'Malley's-Bar"), chars=True)]
    assert_equal(observed, [("o", 0, 1), ("malley", 2, 8), ("bar", 11, 14),
                            ("omalleybar", 0, 14)])
Beispiel #18
0
def test_composing_functions():
    """Composing a tokenizer with a bare function must raise TypeError in
    this API version (both sides of | must be Composable)."""
    tokenizer = analysis.RegexTokenizer()

    # Renamed from `filter` to avoid shadowing the builtin of that name.
    def upper_filter(tokens):
        for t in tokens:
            t.text = t.text.upper()
            yield t

    with pytest.raises(TypeError):
        tokenizer | upper_filter
Beispiel #19
0
def test_frowny_face():
    """IntraWordFilter must cope with all-delimiter and trailing-delimiter
    tokens. See https://bitbucket.org/mchaput/whoosh/issue/166/"""
    ana = analysis.RegexTokenizer(r"\S+") | analysis.IntraWordFilter()

    # A token made entirely of delimiters yields nothing.
    assert_equal([t.text for t in ana(u(":-("))], [])

    # Consecutive trailing delimiters are dropped cleanly.
    assert_equal([t.text for t in ana(u("LOL:)"))], ["LOL"])
Beispiel #20
0
def test_shingles():
    """ShingleFilter should emit sliding 3-word windows with sequential
    positions and character offsets that slice back to the source."""
    ana = analysis.RegexTokenizer(r"\w+") | analysis.ShingleFilter(3, " ")
    source = u("better a witty fool than a foolish wit")
    shingles = [t.copy() for t in ana(source, positions=True, chars=True)]

    expected = [u('better a witty'), u('a witty fool'), u('witty fool than'),
                u('fool than a'), u('than a foolish'), u('a foolish wit')]
    assert [t.text for t in shingles] == expected
    assert [t.pos for t in shingles] == list(range(len(shingles)))
    # Every shingle's char range must reproduce its own text.
    for t in shingles:
        assert t.text == source[t.startchar:t.endchar]
Beispiel #21
0
def test_start_pos():
    """formats.tokens() should honor the start_pos keyword."""
    from whoosh import formats
    ana = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()
    kw = {"positions": True}

    stream = formats.tokens(u("alfa bravo charlie delta"), ana, kw)
    assert_equal([t.pos for t in stream], [0, 1, 2, 3])

    # Positions continue from start_pos instead of zero.
    kw["start_pos"] = 3
    copies = [t.copy() for t in formats.tokens(u("A B C D").split(), ana, kw)]
    assert_equal(" ".join([t.text for t in copies]), "A B C D")
    assert_equal([t.pos for t in copies], [3, 4, 5, 6])
Beispiel #22
0
def test_biword():
    """BiWordFilter joins adjacent tokens, tracking chars and positions;
    a single-token input passes through unchanged."""
    ana = analysis.RegexTokenizer(r"\w+") | analysis.BiWordFilter()

    tokens = [t.copy() for t
              in ana(u("the sign of four"), chars=True, positions=True)]
    assert [t.text for t in tokens] == ["the-sign", "sign-of", "of-four"]
    assert [(t.startchar, t.endchar) for t in tokens] == [(0, 8), (4, 11),
                                                          (9, 16)]
    assert [t.pos for t in tokens] == [0, 1, 2]

    # With only one token there is nothing to pair, so it is emitted as-is.
    tokens = [t.copy() for t in ana(u("single"))]
    assert len(tokens) == 1
    assert tokens[0].text == "single"
Beispiel #23
0
def test_shingle_stopwords():
    """ShingleFilter should bridge over tokens removed by StopFilter."""
    phrase = u("some other stuff and then some things To Check     ")

    # Note that the stop list is None here: only minsize filtering applies.
    ana = (analysis.RegexTokenizer()
           | analysis.StopFilter(stoplist=None, minsize=3)
           | analysis.ShingleFilter(size=3))
    assert [t.text for t in ana(phrase)] == [
        "some-other-stuff", "other-stuff-and", "stuff-and-then",
        "and-then-some", "then-some-things", "some-things-Check"]

    # With the default stop list, shingles skip the removed words.
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.ShingleFilter(size=3))
    assert [t.text for t in ana(phrase)] == [
        "some-other-stuff", "other-stuff-then", "stuff-then-some",
        "then-some-things", "some-things-check"]
Beispiel #24
0
def test_intraword():
    """IntraWordFilter splits on case changes, digits, and symbols,
    merging word runs and number runs when asked to."""
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.IntraWordFilter(mergewords=True, mergenums=True))

    def check(text, expected):
        assert_equal([(t.pos, t.text) for t in ana(text)], expected)

    check(u("PowerShot)"), [(0, "Power"), (1, "Shot"), (1, "PowerShot")])
    check(u("A's+B's&C's"), [(0, "A"), (1, "B"), (2, "C"), (2, "ABC")])
    check(u("Super-Duper-XL500-42-AutoCoder!"),
          [(0, "Super"), (1, "Duper"), (2, "XL"), (2, "SuperDuperXL"),
           (3, "500"), (4, "42"), (4, "50042"), (5, "Auto"), (6, "Coder"),
           (6, "AutoCoder")])
Beispiel #25
0
 def __init__(self, toolbox, index_help=True):
     """
     Create a searcher for `toolbox`.
     """
     # id is stored only; tool names use a simple analyzer so they
     # tokenize predictably; the remaining fields are text/keywords.
     field_types = dict(id=STORED,
                        stub=KEYWORD,
                        name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                        description=TEXT,
                        section=TEXT,
                        help=TEXT,
                        labels=KEYWORD)
     self.schema = Schema(**field_types)
     self.rex = analysis.RegexTokenizer()
     self.toolbox = toolbox
     # Build the index immediately; index_help controls whether help
     # text is included (handled inside build_index).
     self.build_index(index_help)
Beispiel #26
0
 def __init__(self, toolbox, panel_view_id: str, index_dir: str, index_help: bool = True):
     """Create a tool searcher whose index lives under `index_dir`."""
     # id is the unique document key here, unlike the STORED-only variant.
     # NOTE(review): old_id presumably tracks prior tool ids — confirm.
     field_types = dict(id=ID(stored=True, unique=True),
                        old_id=ID,
                        stub=KEYWORD,
                        name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                        description=TEXT,
                        section=TEXT,
                        help=TEXT,
                        labels=KEYWORD)
     self.schema = Schema(**field_types)
     self.rex = analysis.RegexTokenizer()
     self.index_dir = index_dir
     self.toolbox = toolbox
     self.panel_view_id = panel_view_id
     self.index = self._index_setup()
Beispiel #27
0
def test_multifilter():
    """Indexing with a word-merging IntraWordFilter while querying with a
    non-merging one must still match and highlight correctly."""
    iwf_index = analysis.IntraWordFilter(mergewords=True, mergenums=False)
    iwf_query = analysis.IntraWordFilter(mergewords=False, mergenums=False)
    mf = analysis.MultiFilter(index=iwf_index, query=iwf_query)
    ana = analysis.RegexTokenizer() | mf | analysis.LowercaseFilter()

    schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True))
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    writer.add_document(text=u("Our BabbleTron5000 is great"))
    writer.commit()

    with ix.searcher() as searcher:
        hit = searcher.search(query.Term("text", "5000"))[0]
        assert_equal(hit.highlights("text"),
                     'Our BabbleTron<b class="match term0">5000</b> is great')
Beispiel #28
0
def test_charboost_postings():
    """CharacterBoosts format should round-trip character offsets, boosts,
    positions, and frequencies through the index."""
    cbs = CharacterBoosts()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")

    # (pos, startchar, endchar, boost) per occurrence.
    char_boosts = [("alfa", [(0, 0, 4, 2), (4, 37, 41, 1), (5, 42, 46, 1)]),
                   ("bravo", [(1, 7, 12, 0.1), (3, 27, 32, 0.5)]),
                   ("charlie", [(2, 17, 24, 2)])]
    pos_boosts = [("alfa", [(0, 2), (4, 1), (5, 1)]),
                  ("bravo", [(1, 0.1), (3, 0.5)]),
                  ("charlie", [(2, 2)])]
    chars = [("alfa", [(0, 0, 4), (4, 37, 41), (5, 42, 46)]),
             ("bravo", [(1, 7, 12), (3, 27, 32)]),
             ("charlie", [(2, 17, 24)])]

    assert _roundtrip(content, cbs, "character_boosts", ana) == char_boosts
    assert _roundtrip(content, cbs, "position_boosts", ana) == pos_boosts
    assert _roundtrip(content, cbs, "characters", ana) == chars
    assert _roundtrip(content, cbs, "positions", ana) == [
        ("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])]
    assert _roundtrip(content, cbs, "frequency", ana) == [
        ("alfa", 3), ("bravo", 2), ("charlie", 1)]
Beispiel #29
0
def test_intraword_chars():
    """IntraWordFilter must keep character offsets correct, including when
    the word is preceded by another token."""
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.IntraWordFilter(mergewords=True, mergenums=True)
           | analysis.LowercaseFilter())

    def offsets(text):
        return [(t.text, t.startchar, t.endchar)
                for t in ana(text, chars=True)]

    assert_equal(offsets(u("WiKiWo-rd")),
                 [("wi", 0, 2), ("ki", 2, 4), ("wo", 4, 6),
                  ("rd", 7, 9), ("wikiword", 0, 9)])

    # A leading token shifts all of the following offsets.
    assert_equal(offsets(u("Zo WiKiWo-rd")),
                 [("zo", 0, 2), ("wi", 3, 5), ("ki", 5, 7),
                  ("wo", 7, 9), ("rd", 10, 12), ("wikiword", 3, 12)])
Beispiel #30
0
 def __init__(self, toolbox, index_help=True):
     """Create a searcher for `toolbox`."""
     # NOTE(review): index_help is not used in this constructor — confirm
     # whether _index_setup consumes it elsewhere or it is vestigial.
     field_types = dict(id=STORED,
                        stub=KEYWORD,
                        name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                        description=TEXT,
                        section=TEXT,
                        help=TEXT,
                        labels=KEYWORD)
     self.schema = Schema(**field_types)
     self.rex = analysis.RegexTokenizer()
     self.toolbox = toolbox
     self.storage, self.index = self._index_setup()
     # We keep track of how many times the tool index has been rebuilt.
     # We start at -1, so that after the first index the count is at 0,
     # which is the same as the toolbox reload count. This way we can skip
     # reindexing if the index count is equal to the toolbox reload count.
     self.index_count = -1