Example #1
    def __init__(self, service_type, label_conf_dict_path):
        self.__level1_keywords__ = []
        self.__level1_tag__ = []
        self.__level1_automaton__ = []

        label_file = "%s.rule.dat" % (service_type)
        level1_keywords_map_file_path = os.path.join(label_conf_dict_path,
                                                     label_file)

        # each non-empty, non-comment rule line maps space-separated keywords
        # to a comma-separated list of level-1 tags
        with open(level1_keywords_map_file_path, encoding="utf-8") as level1_f:
            for line in level1_f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                line_arr = line.split(":")
                keywords_list = line_arr[0].split(" ")
                level1_list = line_arr[1].split(",")

                self.__level1_keywords__.append(keywords_list)
                self.__level1_tag__.append(level1_list)
                automaton = Automaton()
                automaton.add_all(keywords_list)
                self.__level1_automaton__.append(automaton)
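
A minimal sketch of the rule-file format the loop above expects: space-separated keywords, a colon, then comma-separated tags (the sample line is hypothetical, not taken from the source project):

# a hypothetical line from <service_type>.rule.dat
line = "cheap flight ticket:travel,commerce"
keywords, tags = line.split(":")
print(keywords.split(" "))  # ['cheap', 'flight', 'ticket']
print(tags.split(","))      # ['travel', 'commerce']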
Example #2
def test_items():
    auto = Automaton()
    auto.add_all(names)
    # items() should yield the (name, value) pairs in sorted key order
    ens, evs = zip(*sorted(names))
    ns, vs = zip(*auto.items())
    ns = [''.join(n) for n in ns]
    assert list(ens) == list(ns)
    assert list(evs) == list(vs)
Example #3
def test_names():
    auto = Automaton()
    auto.add_all(NAMES)
    print(auto)

    matches = set(' '.join(match.elems) for match in auto.get_matches(TEXT.split()))
    names = set(' '.join(name) for name in NAMES)

    assert names == matches
Example #4
def test_names():
    auto = Automaton()
    auto[KEY] = VAL
    auto.update_automaton()

    with TemporaryDirectory() as tmpdir:
        fnm = os.path.join(tmpdir, 'test.aca')
        auto.save_to_file(fnm)
        auto2 = Automaton()
        auto2.load_from_file(fnm)

    assert auto2[KEY] == VAL
Example #5
def test_map_interface():
    auto = Automaton()
    auto['us'] = 'USA'
    auto['her'] = 'EUROPE'
    assert auto['us'] == 'USA'
    assert auto['her'] == 'EUROPE'
    matches = auto.get_matches('usher')
    assert len(matches) == 2
    assert matches[0].label == 'USA'
    assert matches[1].label == 'EUROPE'

    assert 'us' in auto
    assert 'his' not in auto
Example #6
def test_with_updating():
    auto = Automaton()
    auto.add('hers')
    matches = auto.get_matches('ushers')
    assert len(matches) == 1
    auto.add('us')
    matches = auto.get_matches('ushers')
    assert len(matches) == 2
Example #7
def test_lemmas():
    auto = Automaton()
    auto.add(['sunlabob'], 'CO')
    auto.add(['renewable'], 'CO')
    lemmas = [
        'sunlabob', 'renewable', 'energy', 'receive', '$', '2.1', 'million',
        'investment'
    ]
    print(auto.str())
    matches = auto.get_matches(lemmas)
    assert len(matches) == 2
Example #8
def test_has_pattern():
    automaton = Automaton()
    automaton.add('himalaya')

    print(automaton)

    assert automaton.has_prefix('him')
    assert automaton.has_prefix('himalaya')
    assert not automaton.has_prefix('himalayas')
Example #9
def test_with_words():
    auto = Automaton()
    auto.add(['funderbeam'])
    auto.add(['mattermark'])
    auto.add(['500', 'startups'])

    txt = 'funderbeam and mattermark along with 500 startups'.split()
    expected = [Match(0, 1, 'Y'), Match(2, 3, 'Y'), Match(5, 7, 'Y')]
    actual = auto.get_matches(txt)
    assert expected == actual
Example #10
class Dictionary(object):
    automaton = Automaton()
    with open(const.get_token_dictionary_file_name()) as f:
        token_dict = f.read().split()

    token_dictionary = [x.strip() for x in token_dict]
    automaton.add_all(token_dictionary)

    with open(const.get_garbage_dictionary_file_name()) as f:
        garbage_dict = f.readlines()

    garbage_dictionary = [x.strip() for x in garbage_dict]
    # longest entries first, ties broken alphabetically
    garbage_dictionary.sort(key=lambda item: (-len(item), item))
Example #11
class Dictionary(object):
    automaton = Automaton()
    with open(SparkFiles.get('dict_token.txt')) as f:
        token_dict = f.read().split()

    token_dictionary = [x.strip() for x in token_dict]
    automaton.add_all(token_dictionary)

    with open(SparkFiles.get('dict_garbage.txt')) as f:
        garbage_dict = f.readlines()

    garbage_dictionary = [x.strip() for x in garbage_dict]
    garbage_dictionary.sort(key=lambda item: (-len(item), item))
Example #12
def test_automaton_with_words():
    auto = Automaton()
    for token in ['he', 'she', 'his', 'hers']:
        auto.add(token)

    expected_all_matches = [
        Match(1, 4, 'Y'), Match(2, 4, 'Y'),
        Match(2, 6, 'Y')
    ]
    all_matches = auto.get_matches('ushers', exclude_overlaps=False)
    print(all_matches)
    assert expected_all_matches == all_matches

    expected_nonoverlap_matches = [Match(2, 6, 'Y')]
    nonoverlap_matches = auto.get_matches('ushers', exclude_overlaps=True)
    assert expected_nonoverlap_matches == nonoverlap_matches
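
The spans in a Match index directly into the input, so the matched substrings can be sliced back out; a minimal sketch, assuming only the start/end attributes already used above:

from aca import Automaton

auto = Automaton()
for token in ['he', 'she', 'his', 'hers']:
    auto.add(token)

text = 'ushers'
for m in auto.get_matches(text, exclude_overlaps=False):
    # prints she, he, hers for the spans (1, 4), (2, 4), (2, 6)
    print(text[m.start:m.end])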
Example #13
File: example2.py  Project: WeeJang/aca
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton
automaton = Automaton()

# instead of plain strings, you can also use lists of tokens
names = [
    (['Yuri', 'Artyukhin'], 'developer'),
    (['Tom', 'Anderson', 'Jr'], 'designer'),
]
automaton.add_all(names)

# you can add an item like this as well
automaton[['Tom', 'Anderson']] = 'manager'

# if you are not using plain strings, make sure you tokenize the text as well
text = 'Tom Anderson Jr and Yuri Artyukhin work on my project'.split()

print('matches that maximize the number of matched words')
for match in automaton.get_matches(text):
    print(match.start, match.end, match.elems, match.label)

print('all matches')
for match in automaton.get_matches(text, exclude_overlaps=False):
    print(match.start, match.end, match.elems, match.label)
Example #14
File: example5.py  Project: WeeJang/aca
# Import the library and initiate the automaton
from aca import Automaton
automaton = Automaton()

# add the entities and build the automaton
automaton.add_all(['Funderbeam', 'Funderbeam Data', 'Funderbeam Markets'])
automaton.update_automaton()

# find all matches, including overlapping ones
text = 'Funderbeam Data and Funderbeam Markets are two different products of Funderbeam'
for match in automaton.get_matches(text, exclude_overlaps=False):
    print(match.start, match.end, match.elems)

# keep only the best non-overlapping matches
for match in automaton.get_matches(text, exclude_overlaps=True):
    print(match.start, match.end, match.elems)
Example #15
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton

automaton = Automaton()
automaton['Estonia'] = 'Tallinn'
automaton['Germany'] = 'Berlin'
automaton['Finland'] = 'Helsinki'

# serialize to disk
automaton.save_to_file('myautomaton.bin')

# load from disk
automaton2 = Automaton()
automaton2.load_from_file('myautomaton.bin')

# save / load to binary string
automaton3 = Automaton()
automaton3.load_from_string(automaton.save_to_string())

print(automaton2['Estonia'])
print(automaton3['Germany'])
Example #16
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton
map = Automaton()

# use the automaton as a map
map['electrify'] = 'verb'
map['elegant'] = 'adjective'
map['acid'] = 'noun'
map['acidic'] = 'adjective'

# access it like a Python dictionary
print(map['acid'])

# using an invalid key raises a KeyError
#print (map['invalid key'])

# you can use get to provide a default value when key is missing
print(map.get('invalid key', 'default value'))

# NB! Implementation specific special case: empty strings
# denote "missing" values, so you can't use these
map['special'] = ''
#print (map['special'])

# you can delete items
del map['electrify']

# trying to delete a non-existent item raises KeyError
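# for instance, a guarded delete (a sketch; the key here is hypothetical):
try:
    del map['does not exist']
except KeyError:
    pass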
Example #17
def test_names():
    auto = Automaton()
    auto.add_all(NAMES)
    auto.update_automaton()
    auto_matches = [(m.start, m.end) for m in auto.get_matches(TEXT)]

    with TemporaryDirectory() as tmpdir:
        fnm = os.path.join(tmpdir, 'test.aca')
        auto.save_to_file(fnm)
        auto2 = Automaton()
        auto2.load_from_file(fnm)

    auto2_matches = [(m.start, m.end) for m in auto2.get_matches(TEXT)]
    assert list(auto.items()) == list(auto2.items())
    assert list(auto.prefixes()) == list(auto2.prefixes())
    assert auto_matches == auto2_matches

    auto3 = Automaton()
    auto3.load_from_string(auto2.save_to_string())
    auto3_matches = [(m.start, m.end) for m in auto3.get_matches(TEXT)]

    assert list(auto.items()) == list(auto2.items())
    assert list(auto.prefixes()) == list(auto2.prefixes())
    assert auto_matches == auto3_matches
Example #18
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton
automaton = Automaton()

# add a dictionary of words to the automaton
painkillers = ['paracetamol', 'ibuprofen', 'hydrocloride']
automaton.add_all(painkillers)

# match the dictionary on a text
text = 'paracetamol and hydrocloride are medications to relieve pain and fever. paracetamol is less efficient than ibuprofen'

for match in automaton.get_matches(text):
    print(match.start, match.end, match.elems)
Example #19
def test_has_pattern():
    automaton = Automaton()
    automaton.add('he')
    automaton.add('she')
    automaton.add('us')

    assert automaton.has_pattern('he')
    assert automaton.has_pattern('she')
    assert automaton.has_pattern('us')
    assert not automaton.has_pattern('they')
    assert not automaton.has_pattern('e')
    assert not automaton.has_pattern('use')
Example #20
def test_prefixes():
    auto = Automaton()
    auto.add_all(['jaanus', 'janek', 'janis'])
    prefixes, values = zip(*auto.prefixes())
    prefixes = [''.join(prefix) for prefix in prefixes]
    assert prefixes == ['', 'j', 'ja', 'jaa', 'jaan', 'jaanu', 'jaanus', 'jan', 'jane', 'janek', 'jani', 'janis']