예제 #1
0
def test_names():
    """Check that an automaton survives a file and a string round trip.

    An automaton is built from ``NAMES`` and matched against ``TEXT``. It is
    then saved to a temporary file and loaded into a second instance, which
    in turn is serialized to a string and loaded into a third instance.
    Each copy must report the same items, the same prefixes and the same
    match spans as the original automaton.
    """
    auto = Automaton()
    auto.add_all(NAMES)
    auto.update_automaton()
    auto_matches = [(m.start, m.end) for m in auto.get_matches(TEXT)]

    # Round trip through a file. The load happens inside the ``with`` block
    # so the file is still present when it is read back.
    with TemporaryDirectory() as tmpdir:
        fnm = os.path.join(tmpdir, 'test.aca')
        auto.save_to_file(fnm)
        auto2 = Automaton()
        auto2.load_from_file(fnm)

    auto2_matches = [(m.start, m.end) for m in auto2.get_matches(TEXT)]
    assert list(auto.items()) == list(auto2.items())
    assert list(auto.prefixes()) == list(auto2.prefixes())
    assert auto_matches == auto2_matches

    # Round trip through an in-memory string.
    auto3 = Automaton()
    auto3.load_from_string(auto2.save_to_string())
    auto3_matches = [(m.start, m.end) for m in auto3.get_matches(TEXT)]

    # Verify the string-loaded copy itself; comparing against auto2 here
    # would only repeat the file round-trip checks above.
    assert list(auto.items()) == list(auto3.items())
    assert list(auto.prefixes()) == list(auto3.prefixes())
    assert auto_matches == auto3_matches
예제 #2
0
def test_with_updating():
    """Patterns added after a first search must be picked up by later ones.

    'ushers' is searched once with only 'hers' registered (one match
    expected) and again after 'us' has been added (two matches expected).
    """
    text = 'ushers'
    auto = Automaton()

    auto.add('hers')
    first_pass = auto.get_matches(text)
    assert len(first_pass) == 1

    # Adding a pattern after the automaton was already queried.
    auto.add('us')
    second_pass = auto.get_matches(text)
    assert len(second_pass) == 2
예제 #3
0
def test_automaton_with_words():
    """Match 'he', 'she', 'his' and 'hers' against 'ushers'.

    With ``exclude_overlaps=False`` three matches are expected, at spans
    (1, 4), (2, 4) and (2, 6). With ``exclude_overlaps=True`` only the
    (2, 6) match is expected. All expected matches carry the label 'Y'
    (presumably the library default, since no label is passed to ``add``).
    """
    text = 'ushers'
    vocabulary = ['he', 'she', 'his', 'hers']

    auto = Automaton()
    for word in vocabulary:
        auto.add(word)

    # Overlapping matches allowed.
    all_matches = auto.get_matches(text, exclude_overlaps=False)
    print(all_matches)
    expected_all_matches = [Match(1, 4, 'Y'), Match(2, 4, 'Y'), Match(2, 6, 'Y')]
    assert expected_all_matches == all_matches

    # Overlapping matches filtered out.
    nonoverlap_matches = auto.get_matches(text, exclude_overlaps=True)
    expected_nonoverlap_matches = [Match(2, 6, 'Y')]
    assert expected_nonoverlap_matches == nonoverlap_matches
예제 #4
0
def test_names():
    """Every entry of ``NAMES`` must be matched in the tokenized ``TEXT``.

    The matched token sequences and the entries of ``NAMES`` are each joined
    with single spaces and compared as sets.
    """
    # NOTE(review): NAMES is passed to the constructor and then registered
    # again through add_all -- confirm whether both are intended.
    auto = Automaton(NAMES)
    auto.add_all(NAMES)
    print(auto)

    tokens = TEXT.split()
    matches = {' '.join(match.elems) for match in auto.get_matches(tokens)}
    names = {' '.join(name) for name in NAMES}

    assert names == matches
예제 #5
0
def test_with_words():
    """Token-list patterns must match at token offsets in a tokenized text.

    Three patterns (two single-token, one two-token) are registered and the
    whitespace-split sentence is expected to yield matches at token spans
    (0, 1), (2, 3) and (5, 7), each labelled 'Y'.
    """
    auto = Automaton()
    patterns = (['funderbeam'], ['mattermark'], ['500', 'startups'])
    for pattern in patterns:
        auto.add(pattern)

    sentence = 'funderbeam and mattermark along with 500 startups'
    txt = sentence.split()

    actual = auto.get_matches(txt)
    expected = [Match(0, 1, 'Y'), Match(2, 3, 'Y'), Match(5, 7, 'Y')]
    assert expected == actual
예제 #6
0
def test_lemmas():
    """Two single-token patterns labelled 'CO' must both match a lemma list."""
    auto = Automaton()
    for company in ('sunlabob', 'renewable'):
        # Each pattern is a one-element token list with the label 'CO'.
        auto.add([company], 'CO')

    lemmas = [
        'sunlabob',
        'renewable',
        'energy',
        'receive',
        '$',
        '2.1',
        'million',
        'investment',
    ]
    print(auto.str())

    found = auto.get_matches(lemmas)
    assert len(found) == 2
예제 #7
0
def test_map_interface():
    """Exercise item assignment, item lookup and ``in`` on an automaton.

    Keys assigned with ``auto[key] = label`` must be readable back, must
    show up as match labels when searching 'usher', and must be visible to
    membership tests.
    """
    auto = Automaton()

    # Store two key -> label pairs through the mapping-style interface.
    auto['us'] = 'USA'
    auto['her'] = 'EUROPE'

    # Stored labels can be read back by key.
    assert auto['us'] == 'USA'
    assert auto['her'] == 'EUROPE'

    # Both keys occur in 'usher'; labels are expected in this order.
    found = auto.get_matches('usher')
    assert len(found) == 2
    first, second = found[0], found[1]
    assert first.label == 'USA'
    assert second.label == 'EUROPE'

    # Membership tests.
    assert 'us' in auto
    assert 'his' not in auto
예제 #8
0
파일: example2.py 프로젝트: WeeJang/aca
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton
def _print_matches(matches):
    """Print start, end, elements and label of every match, one per line."""
    for match in matches:
        print(match.start, match.end, match.elems, match.label)


automaton = Automaton()

# patterns may be lists of tokens rather than plain strings;
# each entry pairs a token list with its label
names = [
    (['Yuri', 'Artyukhin'], 'developer'),
    (['Tom', 'Anderson', 'Jr'], 'designer'),
]
automaton.add_all(names)

# a single pattern can also be registered through item assignment
automaton[['Tom', 'Anderson']] = 'manager'

# token-list patterns require the searched text to be tokenized too
text = 'Tom Anderson Jr and Yuri Artyukhin work on my project'.split()

print('matches that maximize the number of matched words')
_print_matches(automaton.get_matches(text))

print('all matches')
_print_matches(automaton.get_matches(text, exclude_overlaps=False))
예제 #9
0
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton
automaton = Automaton()

# register every dictionary word as a pattern
painkillers = [
    'paracetamol',
    'ibuprofen',
    'hydrocloride',
]
automaton.add_all(painkillers)

# the text to search; adjacent literals are concatenated into one string
text = (
    'paracetamol and hydrocloride are a medications to relieve pain and fever. '
    'paracetamol is less efficient than ibuprofen'
)

# report the start, end and matched elements of each match
for match in automaton.get_matches(text):
    print(match.start, match.end, match.elems)