def test_names():
    """Round-trip an automaton through file and string serialization.

    Builds an automaton over NAMES, then verifies that matches, items
    and prefixes are identical after save_to_file/load_from_file and
    after save_to_string/load_from_string.
    """
    auto = Automaton()
    auto.add_all(NAMES)
    auto.update_automaton()
    auto_matches = [(m.start, m.end) for m in auto.get_matches(TEXT)]
    with TemporaryDirectory() as tmpdir:
        fnm = os.path.join(tmpdir, 'test.aca')
        auto.save_to_file(fnm)

        # file round-trip
        auto2 = Automaton()
        auto2.load_from_file(fnm)
        auto2_matches = [(m.start, m.end) for m in auto2.get_matches(TEXT)]
        assert list(auto.items()) == list(auto2.items())
        assert list(auto.prefixes()) == list(auto2.prefixes())
        assert auto_matches == auto2_matches

        # string round-trip
        auto3 = Automaton()
        auto3.load_from_string(auto2.save_to_string())
        auto3_matches = [(m.start, m.end) for m in auto3.get_matches(TEXT)]
        # BUG FIX: the original asserted auto vs auto2 again here, so the
        # string round-trip's items/prefixes were never actually checked.
        assert list(auto.items()) == list(auto3.items())
        assert list(auto.prefixes()) == list(auto3.prefixes())
        assert auto_matches == auto3_matches
def __init__(self, service_type, label_conf_dict_path):
    """Load level-1 keyword rules for *service_type* and build one
    Aho-Corasick automaton per rule line.

    Reads ``<label_conf_dict_path>/<service_type>.rule.dat``, where each
    line has the form ``kw1 kw2 ...:tag1,tag2,...``.  Blank lines and
    lines starting with ``#`` are skipped.  For every rule line, the
    keyword list, tag list and a matching Automaton are appended to the
    three parallel instance lists (same index = same rule).
    """
    self.__level1_keywords__ = []   # list of keyword lists, one per rule line
    self.__level1_tag__ = []        # list of tag lists, parallel to keywords
    self.__level1_automaton__ = []  # one Automaton per rule line
    label_file = "%s.rule.dat" % (service_type)
    level1_keywords_map_file_path = os.path.join(label_conf_dict_path, label_file)
    # BUG FIX: the original called line.decode("utf-8") on lines read from
    # a text-mode file; Python 3 str has no .decode(), so any non-empty
    # line raised AttributeError.  Open with an explicit encoding instead.
    # The redundant pre-strip emptiness check is also gone: lines yielded
    # by file iteration always contain at least '\n'.
    with open(level1_keywords_map_file_path, encoding="utf-8") as level1_f:
        for line in level1_f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            line_arr = line.split(":")
            keywords_list = line_arr[0].split(" ")
            level1_list = line_arr[1].split(",")
            self.__level1_keywords__.append(keywords_list)
            self.__level1_tag__.append(level1_list)
            automaton = Automaton()
            automaton.add_all(keywords_list)
            self.__level1_automaton__.append(automaton)
def test_items():
    """The automaton enumerates its (key, value) items in sorted key order."""
    auto = Automaton()
    auto.add_all(names)
    expected_keys, expected_values = zip(*sorted(names))
    got = list(auto.items())
    got_keys = [''.join(key) for key, _ in got]
    got_values = [value for _, value in got]
    assert got_keys == list(expected_keys)
    assert got_values == list(expected_values)
def test_names():
    """Every token-list entry in NAMES is found when matching tokenized TEXT."""
    auto = Automaton(NAMES)
    auto.add_all(NAMES)
    print(auto)
    found = set(' '.join(m.elems) for m in auto.get_matches(TEXT.split()))
    expected = set(' '.join(entry) for entry in NAMES)
    assert expected == found
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# Demo: an AC automaton keyed on token lists rather than plain strings.
from aca import Automaton

automaton = Automaton()

# Each entry pairs a token list with a label.
names = [
    (['Yuri', 'Artyukhin'], 'developer'),
    (['Tom', 'Anderson', 'Jr'], 'designer'),
]
automaton.add_all(names)

# Single entries can also be assigned with index syntax.
automaton[['Tom', 'Anderson']] = 'manager'

# With token-list keys, the text must be tokenized the same way.
text = 'Tom Anderson Jr and Yuri Artyukhin work on my project'.split()

print('matches that maximize the number of matched words')
for match in automaton.get_matches(text):
    print(match.start, match.end, match.elems, match.label)

print('all matches')
for match in automaton.get_matches(text, exclude_overlaps=False):
    print(match.start, match.end, match.elems, match.label)
# Import the library and initiate the automaton
from aca import Automaton

ac = Automaton()

# Register the entities, then finalize the automaton before matching.
ac.add_all(['Funderbeam', 'Funderbeam Data', 'Funderbeam Markets'])
ac.update_automaton()

text = 'Funderbeam Data and Funderbeam Markets are two different products of Funderbeam'

# First pass: report every match, overlapping ones included.
for hit in ac.get_matches(text, exclude_overlaps=False):
    print(hit.start, hit.end, hit.elems)

# Second pass: keep only a non-overlapping subset of matches.
for hit in ac.get_matches(text, exclude_overlaps=True):
    print(hit.start, hit.end, hit.elems)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# Demo: match a plain word dictionary against a text with an AC automaton.
from aca import Automaton

ac = Automaton()

# The dictionary of entities to search for.
painkillers = ['paracetamol', 'ibuprofen', 'hydrocloride']
ac.add_all(painkillers)

# Scan the text and print the span of every dictionary hit.
text = 'paracetamol and hydrocloride are a medications to relieve pain and fever. paracetamol is less efficient than ibuprofen'
for hit in ac.get_matches(text):
    print(hit.start, hit.end, hit.elems)
def test_prefixes():
    """prefixes() enumerates every distinct key prefix in lexicographic order."""
    auto = Automaton()
    auto.add_all(['jaanus', 'janek', 'janis'])
    found = [''.join(chars) for chars, _ in auto.prefixes()]
    expected = ['', 'j', 'ja', 'jaa', 'jaan', 'jaanu', 'jaanus',
                'jan', 'jane', 'janek', 'jani', 'janis']
    assert found == expected