from whoosh import analysis
from whoosh.compat import u


def test_stop_lang():
    stopper = analysis.RegexTokenizer() | analysis.StopFilter()
    ls = [token.text for token in stopper(u("this is a test"))]
    assert ls == [u("test")]

    es_stopper = analysis.RegexTokenizer() | analysis.StopFilter(lang="es")
    ls = [token.text for token in es_stopper(u("el lapiz es en la mesa"))]
    assert ls == ["lapiz", "mesa"]

import os

from whoosh import analysis, index, writing
from whoosh.fields import Schema, ID, TEXT


def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create analyzer used for tokenizing and normalizing tokens
    my_analyzer = (analysis.RegexTokenizer()
                   | analysis.LowercaseFilter()
                   | analysis.StopFilter())

    # Create schema
    schema = Schema(url=ID(stored=True),
                    body=TEXT(stored=True, analyzer=my_analyzer))

    # Setup index
    os.makedirs(whoosh_dir, exist_ok=True)
    ix = index.create_in(whoosh_dir, schema)

    # Clear index
    writer = ix.writer()
    writer.commit(mergetype=writing.CLEAR)

    # Index documents
    writer = ix.writer()
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            with open(text_file, encoding="utf8") as tf:
                body = tf.read()
            url = text_file.replace(text_dir, "")
            writer.add_document(url=url, body=body)
            print("Added", url)
            loaded += 1

    writer.commit()
    print("\n\nLoaded", loaded, "documents")

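A minimal sketch of how an index built by populate_whoosh might then be searched. The whoosh_dir argument and the url/body field names come from the schema above; search_whoosh and the query string are illustrative names, not part of the original.

from whoosh import index
from whoosh.qparser import QueryParser


def search_whoosh(whoosh_dir, query_string):
    # Open the index created by populate_whoosh and run a query over the body field.
    ix = index.open_dir(whoosh_dir)
    with ix.searcher() as searcher:
        query = QueryParser("body", schema=ix.schema).parse(query_string)
        for hit in searcher.search(query, limit=10):
            print(hit["url"])
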
def test_biword_stopwords():
    # Note that the stop list is None here
    ana = (analysis.RegexTokenizer()
           | analysis.StopFilter(stoplist=None, minsize=3)
           | analysis.BiWordFilter())
    texts = [t.text for t in ana(u("stuff and then some"))]
    assert texts == ["stuff-and", "and-then", "then-some"]

    # Use a stop list here
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.BiWordFilter())
    texts = [t.text for t in ana(u("stuff and then some"))]
    assert texts == ["stuff-then", "then-some"]

from nose.tools import assert_equal


def test_composition2():
    ca = analysis.RegexTokenizer() | analysis.LowercaseFilter()
    sa = ca | analysis.StopFilter()

    assert_equal(len(sa), 3)
    assert_equal(sa.__class__.__name__, "CompositeAnalyzer")
    assert_equal(sa[0].__class__.__name__, "RegexTokenizer")
    assert_equal(sa[1].__class__.__name__, "LowercaseFilter")
    assert_equal(sa[2].__class__.__name__, "StopFilter")
    assert_equal([t.text for t in sa(u("The ABC 123"))], ["abc", "123"])

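For comparison, a small sketch (an assumption on my part, not from the original tests) showing the same tokenize/lowercase/stop pipeline obtained from Whoosh's StandardAnalyzer helper; it should produce the same tokens as the hand-built composite above.

from whoosh import analysis

# StandardAnalyzer bundles a RegexTokenizer with LowercaseFilter and StopFilter,
# so it is assumed here to behave like the composition built in test_composition2.
sa = analysis.StandardAnalyzer()
print([t.text for t in sa("The ABC 123")])  # expected: ['abc', '123']
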
def test_shingle_stopwords():
    # Note that the stop list is None here
    ana = (analysis.RegexTokenizer()
           | analysis.StopFilter(stoplist=None, minsize=3)
           | analysis.ShingleFilter(size=3))
    texts = [t.text for t in ana(u("some other stuff and then some things To Check "))]
    assert texts == ["some-other-stuff", "other-stuff-and", "stuff-and-then",
                     "and-then-some", "then-some-things", "some-things-Check"]

    # Use a stop list here
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.ShingleFilter(size=3))
    texts = [t.text for t in ana(u("some other stuff and then some things To Check "))]
    assert texts == ["some-other-stuff", "other-stuff-then", "stuff-then-some",
                     "then-some-things", "some-things-check"]

import os

from whoosh import analysis, index, writing
from whoosh.fields import Schema, ID, TEXT


def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create analyzers used for tokenizing and normalizing tokens
    # 000, 001, 010, 011,
    my_analyzers = [(analysis.RegexTokenizer()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()
                     | analysis.StopFilter()),
                    (analysis.StemmingAnalyzer())]

    # Create schemas
    schemas = []
    for my_analyzer in my_analyzers:
        schema = Schema(url=ID(stored=True),
                        body=TEXT(stored=True, analyzer=my_analyzer))
        schemas.append(schema)

    # Setup one index per analyzer
    ixs = []
    for i, my_analyzer in enumerate(my_analyzers):
        whoosh_dir_current = whoosh_dir + str(i) + '/'
        os.makedirs(whoosh_dir_current, exist_ok=True)
        ix = index.create_in(whoosh_dir_current, schemas[i])
        ixs.append(ix)

    # Clear each index and open a writer for it
    writers = []
    for i, my_analyzer in enumerate(my_analyzers):
        writer = ixs[i].writer()
        writer.commit(mergetype=writing.CLEAR)
        writer = ixs[i].writer()
        writers.append(writer)

    # Index documents
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            print('.', end='')
            with open(text_file) as tf:
                body = tf.read()
            url = text_file.replace(text_dir, "")
            for writer in writers:
                writer.add_document(url=url, body=body)
            # print("Added", url)
            loaded += 1

    for writer in writers:
        writer.commit()
    print("\n\nLoaded", loaded, "documents")

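The multi-index variant above writes one index per analyzer, appending the suffixes 0 through 3 to whoosh_dir. A possible invocation, with placeholder directory names, might look like this.

# Placeholder paths, purely illustrative; whoosh_dir gets "0".."3" appended,
# one directory per analyzer configuration.
populate_whoosh("corpus_text/", "whoosh_index_")
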
def test_composition3():
    sa = analysis.RegexTokenizer() | analysis.StopFilter()
    assert_equal(sa.__class__.__name__, "CompositeAnalyzer")