def test_tee_filter():
    target = u("Alfa Bravo Charlie")
    f1 = analysis.LowercaseFilter()
    f2 = analysis.ReverseTextFilter()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2)
    result = " ".join([t.text for t in ana(target)])
    assert_equal(result, "alfa aflA bravo ovarB charlie eilrahC")

    class ucfilter(analysis.Filter):
        def __call__(self, tokens):
            for t in tokens:
                t.text = t.text.upper()
                yield t

    f2 = analysis.ReverseTextFilter() | ucfilter()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2)
    result = " ".join([t.text for t in ana(target)])
    assert_equal(result, "alfa AFLA bravo OVARB charlie EILRAHC")

    f1 = analysis.PassFilter()
    f2 = analysis.BiWordFilter()
    ana = (analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2)
           | analysis.LowercaseFilter())
    result = " ".join([t.text for t in ana(target)])
    assert_equal(result, "alfa alfa-bravo bravo bravo-charlie charlie")
def test_double_metaphone():
    from whoosh.lang.dmetaphone import double_metaphone

    names = {'maurice': ('MRS', None),
             'aubrey': ('APR', None),
             'cambrillo': ('KMPRL', 'KMPR'),
             'heidi': ('HT', None),
             'katherine': ('K0RN', 'KTRN'),
             'Thumbail': ('0MPL', 'TMPL'),
             'catherine': ('K0RN', 'KTRN'),
             'richard': ('RXRT', 'RKRT'),
             'bob': ('PP', None),
             'eric': ('ARK', None),
             'geoff': ('JF', 'KF'),
             'Through': ('0R', 'TR'),
             'Schwein': ('XN', 'XFN'),
             'dave': ('TF', None),
             'ray': ('R', None),
             'steven': ('STFN', None),
             'bryce': ('PRS', None),
             'randy': ('RNT', None),
             'bryan': ('PRN', None),
             'Rapelje': ('RPL', None),
             'brian': ('PRN', None),
             'otto': ('AT', None),
             'auto': ('AT', None),
             'Dallas': ('TLS', None),
             'maisey': ('MS', None),
             'zhang': ('JNK', None),
             'Chile': ('XL', None),
             'Jose': ('HS', None),
             'Arnow': ('ARN', 'ARNF'),
             'solilijs': ('SLLS', None),
             'Parachute': ('PRKT', None),
             'Nowhere': ('NR', None),
             'Tux': ('TKS', None)}

    for name in names.keys():
        dmn = double_metaphone(name)
        assert dmn == names[name]

    mf = (analysis.RegexTokenizer() | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter())
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)]

    mf = (analysis.RegexTokenizer() | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter(combine=True))
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
                       ('F', 1.0), ('FF', 0.5)]

    namefield = fields.TEXT(analyzer=mf)
    texts = list(namefield.process_text(u("Spruce View"), mode="query"))
    assert texts == [u('spruce'), 'SPRS', u('view'), 'F', 'FF']
def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create the analyzers used for tokenizing and normalizing tokens
    # 000, 001, 010, 011
    my_analyzers = [(analysis.RegexTokenizer()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()
                     | analysis.StopFilter()),
                    (analysis.StemmingAnalyzer())]

    # Create schemas
    schemas = []
    for my_analyzer in my_analyzers:
        schema = Schema(url=ID(stored=True),
                        body=TEXT(stored=True, analyzer=my_analyzer))
        schemas.append(schema)

    # Set up one index per analyzer
    ixs = []
    for i, my_analyzer in enumerate(my_analyzers):
        whoosh_dir_current = whoosh_dir + str(i) + '/'
        os.makedirs(whoosh_dir_current, exist_ok=True)
        ix = index.create_in(whoosh_dir_current, schemas[i])
        ixs.append(ix)

    # Clear the indexes
    writers = []
    for i, my_analyzer in enumerate(my_analyzers):
        writer = ixs[i].writer()
        writer.commit(mergetype=writing.CLEAR)
        writer = ixs[i].writer()
        writers.append(writer)

    # Index documents
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            print('.', end='')
            with open(text_file) as tf:
                body = tf.read()
            url = text_file.replace(text_dir, "")
            for writer in writers:
                writer.add_document(url=url, body=body)
                # print("Added", url)
            loaded += 1

    for writer in writers:
        writer.commit()
    print("\n\nLoaded", loaded, "documents")
def test_double_metaphone():
    mf = (analysis.RegexTokenizer() | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter())
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert_equal(results, [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)])

    mf = (analysis.RegexTokenizer() | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter(combine=True))
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert_equal(results, [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
                           ('F', 1.0), ('FF', 0.5)])

    namefield = fields.TEXT(analyzer=mf)
    texts = list(namefield.process_text(u("Spruce View"), mode="query"))
    assert_equal(texts, [u('spruce'), 'SPRS', u('view'), 'F', 'FF'])
def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create the analyzer used for tokenizing and normalizing tokens
    my_analyzer = (analysis.RegexTokenizer() | analysis.LowercaseFilter()
                   | analysis.StopFilter())

    # Create schema
    schema = Schema(url=ID(stored=True),
                    body=TEXT(stored=True, analyzer=my_analyzer))

    # Set up the index
    os.makedirs(whoosh_dir, exist_ok=True)
    ix = index.create_in(whoosh_dir, schema)

    # Clear the index
    writer = ix.writer()
    writer.commit(mergetype=writing.CLEAR)

    # Index documents
    writer = ix.writer()
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            with open(text_file, encoding="utf8") as tf:
                body = tf.read()
            url = text_file.replace(text_dir, "")
            writer.add_document(url=url, body=body)
            print("Added", url)
            loaded += 1

    writer.commit()
    print("\n\nLoaded", loaded, "documents")
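# For reference, a minimal sketch of querying an index built by
# populate_whoosh above. It relies on the standard whoosh.index /
# whoosh.qparser API; the function name, the "limit" value, and the
# query string passed by the caller are illustrative assumptions only.
def search_whoosh(whoosh_dir, querystring):
    from whoosh import index
    from whoosh.qparser import QueryParser

    ix = index.open_dir(whoosh_dir)
    with ix.searcher() as searcher:
        # Parse the query against the same "body" field the writer populated;
        # the schema's analyzer (with its StopFilter) is applied automatically.
        query = QueryParser("body", ix.schema).parse(querystring)
        for hit in searcher.search(query, limit=10):
            print(hit["url"])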
def test_shared_composition():
    shared = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()
    ana1 = shared | analysis.NgramFilter(3)
    ana2 = shared | analysis.DoubleMetaphoneFilter()
    assert_equal([t.text for t in ana1(u("hello"))], ["hel", "ell", "llo"])
    assert_equal([t.text for t in ana2(u("hello"))], ["HL"])
def test_multifilter():
    f1 = analysis.LowercaseFilter()
    f2 = analysis.PassFilter()
    mf = analysis.MultiFilter(a=f1, b=f2)
    ana = analysis.RegexTokenizer(r"\S+") | mf
    text = u("ALFA BRAVO CHARLIE")
    assert [t.text for t in ana(text, mode="a")] == ["alfa", "bravo", "charlie"]
    assert [t.text for t in ana(text, mode="b")] == ["ALFA", "BRAVO", "CHARLIE"]
def test_composition2():
    ca = analysis.RegexTokenizer() | analysis.LowercaseFilter()
    sa = ca | analysis.StopFilter()
    assert_equal(len(sa), 3)
    assert_equal(sa.__class__.__name__, "CompositeAnalyzer")
    assert_equal(sa[0].__class__.__name__, "RegexTokenizer")
    assert_equal(sa[1].__class__.__name__, "LowercaseFilter")
    assert_equal(sa[2].__class__.__name__, "StopFilter")
    assert_equal([t.text for t in sa(u("The ABC 123"))], ["abc", "123"])
def test_intraword_possessive():
    iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
    ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter()

    target = u("O'Malley's-Bar")
    tokens = [(t.text, t.startchar, t.endchar)
              for t in ana(target, chars=True)]
    assert_equal(tokens, [("o", 0, 1), ("malley", 2, 8), ("bar", 11, 14),
                          ("omalleybar", 0, 14)])
def test_start_pos():
    from whoosh import formats

    ana = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()
    kw = {"positions": True}
    tks = formats.tokens(u("alfa bravo charlie delta"), ana, kw)
    assert_equal([t.pos for t in tks], [0, 1, 2, 3])

    kw["start_pos"] = 3
    ts = [t.copy() for t in formats.tokens(u("A B C D").split(), ana, kw)]
    assert_equal(" ".join([t.text for t in ts]), "A B C D")
    assert_equal([t.pos for t in ts], [3, 4, 5, 6])
def test_multifilter():
    iwf_for_index = analysis.IntraWordFilter(mergewords=True, mergenums=False)
    iwf_for_query = analysis.IntraWordFilter(mergewords=False, mergenums=False)
    mf = analysis.MultiFilter(index=iwf_for_index, query=iwf_for_query)
    ana = analysis.RegexTokenizer() | mf | analysis.LowercaseFilter()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("Our BabbleTron5000 is great"))
    w.commit()

    with ix.searcher() as s:
        hit = s.search(query.Term("text", "5000"))[0]
        assert_equal(hit.highlights("text"),
                     'Our BabbleTron<b class="match term0">5000</b> is great')
def test_intraword_chars():
    iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
    ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter()

    target = u("WiKiWo-rd")
    tokens = [(t.text, t.startchar, t.endchar)
              for t in ana(target, chars=True)]
    assert_equal(tokens, [("wi", 0, 2), ("ki", 2, 4), ("wo", 4, 6),
                          ("rd", 7, 9), ("wikiword", 0, 9)])

    target = u("Zo WiKiWo-rd")
    tokens = [(t.text, t.startchar, t.endchar)
              for t in ana(target, chars=True)]
    assert_equal(tokens, [("zo", 0, 2), ("wi", 3, 5), ("ki", 5, 7),
                          ("wo", 7, 9), ("rd", 10, 12), ("wikiword", 3, 12)])
def _get_analyzer(self):
    if self.args.run == 0:
        # baseline
        final = WA.tokenizers.RegexTokenizer()
    else:
        # constraint run (use only titles from queries)
        if self.args.lang == 'english':
            final = WA.analyzers.StemmingAnalyzer(stoplist=self.stopwords,
                                                  cachesize=300000)
        elif self.args.lang == 'czech':
            tokenizer = MyLemmaTokenizer(self.tagger_path)
            filterI = (WA.LowercaseFilter()
                       | WA.filters.StopFilter(self.stopwords)
                       | RemoveCzechChars()
                       | CleanupFilter(1))  # remove tokens t such that len(t) <= 1
            final = tokenizer | filterI
        else:
            raise ValueError('wrong lang')
    return final
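# CleanupFilter is not shown in this snippet. A minimal sketch of what it
# might look like, assuming it subclasses whoosh.analysis.Filter (aliased
# here as WA) and drops tokens at or below the given length; the real
# implementation and the "maxlen" parameter name may differ.
class CleanupFilter(WA.Filter):
    def __init__(self, maxlen=1):
        # Tokens with len(text) <= maxlen are discarded.
        self.maxlen = maxlen

    def __call__(self, tokens):
        for t in tokens:
            if len(t.text) > self.maxlen:
                yield t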
def test_name_field():
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.LowercaseFilter()
           | analysis.DoubleMetaphoneFilter(combine=True))
    namefield = fields.TEXT(analyzer=ana, multitoken_query="or")
    schema = fields.Schema(id=fields.STORED, name=namefield)

    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=u("one"), name=u("Leif Ericson"))
    w.commit()

    s = ix.searcher()
    qp = qparser.QueryParser("name", schema)
    q = qp.parse(u("leaf eriksen"), normalize=False)
    r = s.search(q)
    assert_equal(len(r), 1)
def test_biword_stopwords():
    # Note that the stop list is None here
    ana = (analysis.RegexTokenizer()
           | analysis.StopFilter(stoplist=None, minsize=3)
           | analysis.BiWordFilter())
    texts = [t.text for t in ana(u("stuff and then some"))]
    assert texts == ["stuff-and", "and-then", "then-some"]

    # Use a stop list here
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.BiWordFilter())
    texts = [t.text for t in ana(u("stuff and then some"))]
    assert texts == ["stuff-then", "then-some"]
def test_pystemmer():
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.PyStemmerFilter())
    schema = fields.Schema(text=fields.TEXT(analyzer=ana))
    st = RamStorage()
    ix = st.create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("rains falling strangely"))

    ix = st.open_index()
    with ix.writer() as w:
        w.add_document(text=u("pains stalling strongly"))

    ix = st.open_index()
    with ix.reader() as r:
        assert_equal(list(r.lexicon("text")),
                     ["fall", "pain", "rain", "stall", "strang", "strong"])
def test_shingle_stopwords():
    # Note that the stop list is None here
    ana = (analysis.RegexTokenizer()
           | analysis.StopFilter(stoplist=None, minsize=3)
           | analysis.ShingleFilter(size=3))
    texts = [t.text for t in
             ana(u("some other stuff and then some things To Check "))]
    assert texts == ["some-other-stuff", "other-stuff-and", "stuff-and-then",
                     "and-then-some", "then-some-things", "some-things-Check"]

    # Use a stop list here
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.ShingleFilter(size=3))
    texts = [t.text for t in
             ana(u("some other stuff and then some things To Check "))]
    assert texts == ["some-other-stuff", "other-stuff-then", "stuff-then-some",
                     "then-some-things", "some-things-check"]
def test_composition1():
    ca = analysis.RegexTokenizer() | analysis.LowercaseFilter()
    assert_equal(ca.__class__.__name__, "CompositeAnalyzer")
    assert_equal(ca[0].__class__.__name__, "RegexTokenizer")
    assert_equal(ca[1].__class__.__name__, "LowercaseFilter")
    assert_equal([t.text for t in ca(u("ABC 123"))], ["abc", "123"])
def test_composition1():
    ca = analysis.RegexTokenizer() | analysis.LowercaseFilter()
    assert ca.__class__.__name__ == "CompositeAnalyzer"
    assert ca[0].__class__.__name__ == "RegexTokenizer"
    assert ca[1].__class__.__name__ == "LowercaseFilter"
    assert [t.text for t in ca(u("ABC 123"))] == ["abc", "123"]
def __init__(self, split_restricts=True):
    self._tokenizer = (analysis.RegexTokenizer()
                       | analysis.LowercaseFilter())
    self._split_restricts = split_restricts
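# A hedged sketch of how this wrapper might expose the composed analyzer;
# the tokenize() method below is an assumption for illustration, not part
# of the original class.
def tokenize(self, text):
    # Run the RegexTokenizer | LowercaseFilter chain and collect token text.
    return [t.text for t in self._tokenizer(text)]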
charclass_re = re.compile(r":(?P<cls>[^:]+):`~?(?P<ref>[^`\n]+)`",
                          re.MULTILINE)
def_re = re.compile(r"^[.][.] ?([^:]+):: (.*?)$", re.MULTILINE)

deffields = {"class": "cls", "module": "mod"}
reffields = {"mod": "modref", "class": "clsref", "func": "funcref",
             "pep": "pep"}

ana = analysis.StemmingAnalyzer(stoplist=stoplists["en"], maxsize=40)
cls_ana = (analysis.SpaceSeparatedTokenizer()
           | analysis.IntraWordFilter(mergewords=True)
           | analysis.LowercaseFilter())
tech_ana = (analysis.RegexTokenizer(r"\w+") | analysis.LowercaseFilter())


class PydocSchema(fields.SchemaClass):
    path = fields.STORED
    title = fields.TEXT(stored=True, sortable=True, spelling=True,
                        analyzer=ana)
    tgrams = fields.NGRAMWORDS
    content = fields.TEXT(spelling=True, analyzer=ana)
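# For illustration, what charclass_re extracts from a reST-style cross
# reference; the sample string below is an assumption, not from the source.
sample = "See :class:`~whoosh.analysis.RegexTokenizer` for details."
for m in charclass_re.finditer(sample):
    # Prints the role name and the referenced dotted path:
    # class whoosh.analysis.RegexTokenizer
    print(m.group("cls"), m.group("ref"))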