def test_names():
    """Round-trip an automaton through file and string serialization.

    A reloaded automaton must reproduce the original's items, prefixes,
    and match spans exactly.
    """
    auto = Automaton()
    auto.add_all(NAMES)
    auto.update_automaton()
    auto_matches = [(m.start, m.end) for m in auto.get_matches(TEXT)]

    # File round-trip.
    with TemporaryDirectory() as tmpdir:
        fnm = os.path.join(tmpdir, 'test.aca')
        auto.save_to_file(fnm)
        auto2 = Automaton()
        auto2.load_from_file(fnm)
        auto2_matches = [(m.start, m.end) for m in auto2.get_matches(TEXT)]
        assert list(auto.items()) == list(auto2.items())
        assert list(auto.prefixes()) == list(auto2.prefixes())
        assert auto_matches == auto2_matches

    # String round-trip (no filesystem needed).
    auto3 = Automaton()
    auto3.load_from_string(auto2.save_to_string())
    auto3_matches = [(m.start, m.end) for m in auto3.get_matches(TEXT)]
    # BUG FIX: these two asserts previously compared against auto2 again,
    # leaving the string round-trip's contents unverified.
    assert list(auto.items()) == list(auto3.items())
    assert list(auto.prefixes()) == list(auto3.prefixes())
    assert auto_matches == auto3_matches
def test_with_updating():
    """Entries added after a query take effect on subsequent queries."""
    automaton = Automaton()
    automaton.add('hers')
    assert len(automaton.get_matches('ushers')) == 1
    # Adding 'us' makes a second (non-overlapping) match available.
    automaton.add('us')
    assert len(automaton.get_matches('ushers')) == 2
def test_automaton_with_words():
    """Overlap handling: all matches vs. the non-overlapping subset.

    In 'ushers': 'she' spans (1, 4), 'he' spans (2, 4), 'hers' spans (2, 6).
    With ``exclude_overlaps=True`` only the maximal set survives.
    """
    auto = Automaton()
    for token in ['he', 'she', 'his', 'hers']:
        auto.add(token)

    expected_all_matches = [
        Match(1, 4, 'Y'),
        Match(2, 4, 'Y'),
        Match(2, 6, 'Y'),
    ]
    # Removed a leftover debug print of all_matches.
    all_matches = auto.get_matches('ushers', exclude_overlaps=False)
    assert expected_all_matches == all_matches

    expected_nonoverlap_matches = [Match(2, 6, 'Y')]
    nonoverlap_matches = auto.get_matches('ushers', exclude_overlaps=True)
    assert expected_nonoverlap_matches == nonoverlap_matches
def test_names_with_tokens():
    """Token-list matching recovers every name from NAMES in TEXT.

    NOTE(review): renamed from ``test_names`` — this module defines
    ``test_names`` twice, so one definition shadowed the other and pytest
    collected only one of them.
    """
    auto = Automaton(NAMES)
    # NOTE(review): NAMES is passed to the constructor *and* added again
    # here — presumably idempotent; confirm whether one call is redundant.
    auto.add_all(NAMES)
    print(auto)
    matches = set(' '.join(match.elems) for match in auto.get_matches(TEXT.split()))
    names = set(' '.join(name) for name in NAMES)
    assert names == matches
def test_with_words():
    """Multi-token entries match token spans of a tokenized text."""
    auto = Automaton()
    for entry in (['funderbeam'], ['mattermark'], ['500', 'startups']):
        auto.add(entry)
    tokens = 'funderbeam and mattermark along with 500 startups'.split()
    # Spans are token indices: [start, end).
    expected = [Match(0, 1, 'Y'), Match(2, 3, 'Y'), Match(5, 7, 'Y')]
    assert expected == auto.get_matches(tokens)
def test_lemmas():
    """Labelled single-token entries are found in a lemma sequence."""
    auto = Automaton()
    for word in ('sunlabob', 'renewable'):
        auto.add([word], 'CO')
    lemmas = [
        'sunlabob', 'renewable', 'energy', 'receive',
        '$', '2.1', 'million', 'investment',
    ]
    print(auto.str())
    assert len(auto.get_matches(lemmas)) == 2
def test_map_interface():
    """The automaton supports mapping syntax: setitem, getitem, contains."""
    auto = Automaton()
    auto['us'] = 'USA'
    auto['her'] = 'EUROPE'

    assert auto['us'] == 'USA'
    assert auto['her'] == 'EUROPE'

    # Labels come back on matches, in match order.
    matches = auto.get_matches('usher')
    assert len(matches) == 2
    assert [m.label for m in matches] == ['USA', 'EUROPE']

    assert 'us' in auto
    assert 'his' not in auto
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton

automaton = Automaton()

# dictionary entries may be token lists rather than plain strings
names = [
    (['Yuri', 'Artyukhin'], 'developer'),
    (['Tom', 'Anderson', 'Jr'], 'designer'),
]
automaton.add_all(names)

# a single entry can also be added through the mapping interface
automaton[['Tom', 'Anderson']] = 'manager'

# when entries are token lists, the query text must be tokenized the same way
text = 'Tom Anderson Jr and Yuri Artyukhin work on my project'.split()

print('matches that maximize the number of matched words')
for match in automaton.get_matches(text):
    print(match.start, match.end, match.elems, match.label)

print('all matches')
for match in automaton.get_matches(text, exclude_overlaps=False):
    print(match.start, match.end, match.elems, match.label)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton

automaton = Automaton()

# load a word dictionary into the automaton
painkillers = ['paracetamol', 'ibuprofen', 'hydrocloride']
automaton.add_all(painkillers)

# scan a text against the dictionary and report each match span
text = 'paracetamol and hydrocloride are a medications to relieve pain and fever. paracetamol is less efficient than ibuprofen'
for match in automaton.get_matches(text):
    print(match.start, match.end, match.elems)