# Constructor of a rule-based labeling class (only this method is shown);
# it relies on `os` and aca's Automaton.
def __init__(self, service_type, label_conf_dict_path):
    self.__level1_keywords__ = []
    self.__level1_tag__ = []
    self.__level1_automaton__ = []
    label_file = "%s.rule.dat" % service_type
    level1_keywords_map_file_path = os.path.join(label_conf_dict_path, label_file)
    # each rule line maps space-separated keywords to comma-separated tags
    with open(level1_keywords_map_file_path, encoding="utf-8") as level1_f:
        for line in level1_f:
            line = line.strip()
            # skip blank lines and comments
            if not line or line.startswith("#"):
                continue
            line_arr = line.split(":")
            keywords_list = line_arr[0].split(" ")
            level1_list = line_arr[1].split(",")
            self.__level1_keywords__.append(keywords_list)
            self.__level1_tag__.append(level1_list)
            # build one Aho-Corasick automaton per rule line
            automaton = Automaton()
            automaton.add_all(keywords_list)
            self.__level1_automaton__.append(automaton)
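# For illustration, a minimal sketch of the rule file the constructor above
# parses; the file-name pattern comes from the code, but the keywords and
# tags below are hypothetical. Each non-comment line maps space-separated
# keywords to comma-separated tags ("kw1 kw2:tag1,tag2"):
#
#   # comments and blank lines are skipped
#   refund return:after_sales,refund
#   invoice receipt:billing
#
# Every such line yields one keyword list, one tag list, and one automaton.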
def test_items():
    auto = Automaton()
    auto.add_all(names)
    ens, evs = zip(*sorted(names))
    ns, vs = zip(*list(auto.items()))
    ns = [''.join(n) for n in ns]
    assert list(ens) == list(ns)
    assert list(evs) == list(vs)
def test_names():
    auto = Automaton()
    auto.add_all(NAMES)
    print(auto)
    matches = set(' '.join(match.elems) for match in auto.get_matches(TEXT.split()))
    names = set(' '.join(name) for name in NAMES)
    assert names == matches
import os
from tempfile import TemporaryDirectory

from aca import Automaton


def test_names():
    auto = Automaton()
    auto[KEY] = VAL
    auto.update_automaton()
    with TemporaryDirectory() as tmpdir:
        fnm = os.path.join(tmpdir, 'test.aca')
        auto.save_to_file(fnm)
        auto2 = Automaton()
        auto2.load_from_file(fnm)
        assert auto2[KEY] == VAL
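# Hypothetical module-level fixtures these tests rely on (they are not part
# of this excerpt; the values here are illustrative only). test_items()
# expects `names` as (key, value) pairs; the name tests use token lists plus
# a matching text.
names = [('he', 'PRON'), ('her', 'PRON'), ('hers', 'PRON')]
NAMES = [['Yuri', 'Artyukhin'], ['Tom', 'Anderson', 'Jr']]
TEXT = 'Tom Anderson Jr and Yuri Artyukhin work on my project'
KEY = 'Estonia'
VAL = 'Tallinn'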
def test_map_interface():
    auto = Automaton()
    auto['us'] = 'USA'
    auto['her'] = 'EUROPE'
    assert auto['us'] == 'USA'
    assert auto['her'] == 'EUROPE'
    matches = auto.get_matches('usher')
    assert len(matches) == 2
    assert matches[0].label == 'USA'
    assert matches[1].label == 'EUROPE'
    assert 'us' in auto
    assert 'his' not in auto
def test_with_updating():
    auto = Automaton()
    auto.add('hers')
    matches = auto.get_matches('ushers')
    assert len(matches) == 1
    auto.add('us')
    matches = auto.get_matches('ushers')
    assert len(matches) == 2
def test_lemmas():
    auto = Automaton()
    auto.add(['sunlabob'], 'CO')
    auto.add(['renewable'], 'CO')
    lemmas = [
        'sunlabob', 'renewable', 'energy', 'receive',
        '$', '2.1', 'million', 'investment'
    ]
    print(auto.str())
    matches = auto.get_matches(lemmas)
    assert len(matches) == 2
def test_has_prefix():
    automaton = Automaton()
    automaton.add('himalaya')
    print(automaton)
    assert automaton.has_prefix('him')
    assert automaton.has_prefix('himalaya')
    assert not automaton.has_prefix('himalayas')
# Match is assumed to be importable from aca alongside Automaton
from aca import Automaton, Match


def test_with_words():
    auto = Automaton()
    auto.add(['funderbeam'])
    auto.add(['mattermark'])
    auto.add(['500', 'startups'])
    txt = 'funderbeam and mattermark along with 500 startups'.split()
    # 'Y' is the default label assigned when none is given
    expected = [Match(0, 1, 'Y'), Match(2, 3, 'Y'), Match(5, 7, 'Y')]
    actual = auto.get_matches(txt)
    assert expected == actual
class Dictionary(object):
    # `const` is the project's configuration module (not shown here)
    automaton = Automaton()
    with open(const.get_token_dictionary_file_name()) as f:
        token_dict = f.read().split()
    token_dictionary = [x.strip() for x in token_dict]
    automaton.add_all(token_dictionary)

    with open(const.get_garbage_dictionary_file_name()) as f:
        garbage_dict = f.readlines()
    garbage_dictionary = [x.strip() for x in garbage_dict]
    # sort longest entries first, ties broken alphabetically
    garbage_dictionary.sort(key=lambda item: (-len(item), item))
from pyspark import SparkFiles


class Dictionary(object):
    # same structure as above, but the dictionaries are shipped via SparkFiles
    automaton = Automaton()
    with open(SparkFiles.get('dict_token.txt')) as f:
        token_dict = f.read().split()
    token_dictionary = [x.strip() for x in token_dict]
    automaton.add_all(token_dictionary)

    with open(SparkFiles.get('dict_garbage.txt')) as f:
        garbage_dict = f.readlines()
    garbage_dictionary = [x.strip() for x in garbage_dict]
    # sort longest entries first, ties broken alphabetically
    garbage_dictionary.sort(key=lambda item: (-len(item), item))
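# For illustration, the shape of the two dictionary files the classes above
# load (file names are from the code; the contents below are hypothetical).
# The token dictionary is read with read().split(), so entries may be
# separated by any whitespace; the garbage dictionary is read with
# readlines(), so it holds one entry per line:
#
#   dict_token.txt:    the a an and or
#   dict_garbage.txt:  -----
#                      ###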
# Match is assumed to be importable from aca alongside Automaton
from aca import Automaton, Match


def test_automaton_with_words():
    auto = Automaton()
    for token in ['he', 'she', 'his', 'hers']:
        auto.add(token)
    expected_all_matches = [
        Match(1, 4, 'Y'),
        Match(2, 4, 'Y'),
        Match(2, 6, 'Y'),
    ]
    all_matches = auto.get_matches('ushers', exclude_overlaps=False)
    print(all_matches)
    assert expected_all_matches == all_matches
    expected_nonoverlap_matches = [Match(2, 6, 'Y')]
    nonoverlap_matches = auto.get_matches('ushers', exclude_overlaps=True)
    assert expected_nonoverlap_matches == nonoverlap_matches
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton

automaton = Automaton()

# instead of plain strings, you can also use lists of tokens
names = [
    (['Yuri', 'Artyukhin'], 'developer'),
    (['Tom', 'Anderson', 'Jr'], 'designer'),
]
automaton.add_all(names)

# you can add an item like this as well
automaton[['Tom', 'Anderson']] = 'manager'

# if you are not using plain strings, make sure you tokenize the text as well
text = 'Tom Anderson Jr and Yuri Artyukhin work on my project'.split()

print('matches that maximize the number of matched words')
for match in automaton.get_matches(text):
    print(match.start, match.end, match.elems, match.label)

print('all matches')
for match in automaton.get_matches(text, exclude_overlaps=False):
    print(match.start, match.end, match.elems, match.label)
# Import the library and initiate the automaton
from aca import Automaton

automaton = Automaton()

# add the entities and build the automaton
automaton.add_all(['Funderbeam', 'Funderbeam Data', 'Funderbeam Markets'])
automaton.update_automaton()

# find matches
text = 'Funderbeam Data and Funderbeam Markets are two different products of Funderbeam'

# all matches, including overlapping ones
for match in automaton.get_matches(text, exclude_overlaps=False):
    print(match.start, match.end, match.elems)

# only non-overlapping matches
for match in automaton.get_matches(text, exclude_overlaps=True):
    print(match.start, match.end, match.elems)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton

automaton = Automaton()
automaton['Estonia'] = 'Tallinn'
automaton['Germany'] = 'Berlin'
automaton['Finland'] = 'Helsinki'

# serialize to disk
automaton.save_to_file('myautomaton.bin')

# load from disk
automaton2 = Automaton()
automaton2.load_from_file('myautomaton.bin')

# save / load to binary string
automaton3 = Automaton()
automaton3.load_from_string(automaton.save_to_string())

print(automaton2['Estonia'])
print(automaton3['Germany'])
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton

map = Automaton()

# use the automaton as a map
map['electrify'] = 'verb'
map['elegant'] = 'adjective'
map['acid'] = 'noun'
map['acidic'] = 'adjective'

# access it like a Python dictionary
print(map['acid'])

# using an invalid key raises a KeyError
#print(map['invalid key'])

# you can use get to provide a default value when the key is missing
print(map.get('invalid key', 'default value'))

# NB! Implementation-specific special case: empty strings
# denote "missing" values, so you cannot store them
map['special'] = ''
#print(map['special'])

# you can delete items
del map['electrify']

# trying to delete a non-existent item also raises a KeyError
#del map['invalid key']
import os
from tempfile import TemporaryDirectory

from aca import Automaton


def test_names():
    auto = Automaton()
    auto.add_all(NAMES)
    auto.update_automaton()
    auto_matches = [(m.start, m.end) for m in auto.get_matches(TEXT)]
    with TemporaryDirectory() as tmpdir:
        fnm = os.path.join(tmpdir, 'test.aca')
        auto.save_to_file(fnm)
        # round-trip through a file
        auto2 = Automaton()
        auto2.load_from_file(fnm)
        auto2_matches = [(m.start, m.end) for m in auto2.get_matches(TEXT)]
        assert list(auto.items()) == list(auto2.items())
        assert list(auto.prefixes()) == list(auto2.prefixes())
        assert auto_matches == auto2_matches
        # round-trip through a binary string
        auto3 = Automaton()
        auto3.load_from_string(auto2.save_to_string())
        auto3_matches = [(m.start, m.end) for m in auto3.get_matches(TEXT)]
        assert list(auto.items()) == list(auto3.items())
        assert list(auto.prefixes()) == list(auto3.prefixes())
        assert auto_matches == auto3_matches
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton

automaton = Automaton()

# add a dictionary of words to the automaton
painkillers = ['paracetamol', 'ibuprofen', 'hydrochloride']
automaton.add_all(painkillers)

# match the dictionary on a text
text = 'paracetamol and hydrochloride are medications that relieve pain and fever. paracetamol is less effective than ibuprofen'

for match in automaton.get_matches(text):
    print(match.start, match.end, match.elems)
def test_has_pattern():
    automaton = Automaton()
    automaton.add('he')
    automaton.add('she')
    automaton.add('us')
    assert automaton.has_pattern('he')
    assert automaton.has_pattern('she')
    assert automaton.has_pattern('us')
    assert not automaton.has_pattern('they')
    assert not automaton.has_pattern('e')
    assert not automaton.has_pattern('use')
def test_prefixes():
    auto = Automaton()
    auto.add_all(['jaanus', 'janek', 'janis'])
    prefixes, values = zip(*auto.prefixes())
    prefixes = [''.join(prefix) for prefix in prefixes]
    assert prefixes == [
        '', 'j', 'ja', 'jaa', 'jaan', 'jaanu', 'jaanus',
        'jan', 'jane', 'janek', 'jani', 'janis'
    ]