Ejemplo n.º 1
0
def test_names():
    """Check that a serialized automaton round-trips without losing data.

    Builds an automaton from ``NAMES``, then reloads it twice: once through
    a file (``save_to_file`` / ``load_from_file``) and once through a string
    (``save_to_string`` / ``load_from_string``). Each reloaded automaton
    must expose the same items, the same prefixes and the same match spans
    on ``TEXT`` as the automaton it was derived from.
    """
    auto = Automaton()
    auto.add_all(NAMES)
    auto.update_automaton()
    auto_matches = [(m.start, m.end) for m in auto.get_matches(TEXT)]

    # File round trip. The file only has to live until it is loaded back,
    # so the temporary directory can be discarded right after the load.
    with TemporaryDirectory() as tmpdir:
        fnm = os.path.join(tmpdir, 'test.aca')
        auto.save_to_file(fnm)
        auto2 = Automaton()
        auto2.load_from_file(fnm)

    auto2_matches = [(m.start, m.end) for m in auto2.get_matches(TEXT)]
    assert list(auto.items()) == list(auto2.items())
    assert list(auto.prefixes()) == list(auto2.prefixes())
    assert auto_matches == auto2_matches

    # String round trip, starting from the automaton reloaded from file.
    auto3 = Automaton()
    auto3.load_from_string(auto2.save_to_string())
    auto3_matches = [(m.start, m.end) for m in auto3.get_matches(TEXT)]

    # Compare against auto3 (not auto2) so that the string round trip itself
    # is what gets verified here.
    assert list(auto.items()) == list(auto3.items())
    assert list(auto.prefixes()) == list(auto3.prefixes())
    assert auto_matches == auto3_matches
Ejemplo n.º 2
0
    def __init__(self, service_type, label_conf_dict_path):
        """Load the level-1 keyword rules of a service and build automatons.

        Reads ``<service_type>.rule.dat`` from ``label_conf_dict_path`` as
        UTF-8 text. Blank lines and lines starting with ``#`` are skipped.
        Every other line is split on ``:``; the first field is a
        space-separated keyword list and the second field a comma-separated
        tag list. One ``Automaton`` is built per rule line, so the three
        ``__level1_*__`` lists stay index-aligned.

        Args:
            service_type: Value interpolated into the rule file name.
            label_conf_dict_path: Directory containing the rule file.

        Raises:
            IOError: If the rule file cannot be opened (``OSError`` on
                Python 3).
            IndexError: If a rule line contains no ``:`` separator.
        """
        # Function-scope import keeps this method self-contained.
        import io

        self.__level1_keywords__ = []
        self.__level1_tag__ = []
        self.__level1_automaton__ = []

        label_file = "%s.rule.dat" % (service_type)
        level1_keywords_map_file_path = os.path.join(label_conf_dict_path,
                                                     label_file)

        # io.open decodes while reading, so every line is already text on
        # both Python 2 and Python 3. Calling .decode() on lines from a
        # text-mode open() fails on Python 3, where str has no decode().
        with io.open(level1_keywords_map_file_path,
                     encoding="utf-8") as level1_f:
            for line in level1_f:
                line = line.strip()
                # Skip blank lines and comment lines.
                if not line or line.startswith("#"):
                    continue

                line_arr = line.split(":")
                keywords_list = line_arr[0].split(" ")
                level1_list = line_arr[1].split(",")

                self.__level1_keywords__.append(keywords_list)
                self.__level1_tag__.append(level1_list)
                automaton = Automaton()
                automaton.add_all(keywords_list)
                self.__level1_automaton__.append(automaton)
Ejemplo n.º 3
0
def test_items():
    """Assert that ``items()`` yields the inserted pairs in sorted order.

    The automaton is filled from the module-level ``names`` pairs. The keys
    returned by ``items()`` are joined back into strings before being
    compared with the sorted input keys; values are compared as returned.
    """
    auto = Automaton()
    auto.add_all(names)

    # Split the expected and the stored pairs into parallel key/value tuples.
    expected_keys, expected_values = zip(*sorted(names))
    stored_keys, stored_values = zip(*auto.items())

    # Stored keys are sequences of elements; rebuild the strings.
    joined_keys = [''.join(key) for key in stored_keys]

    assert list(expected_keys) == joined_keys
    assert list(expected_values) == list(stored_values)
Ejemplo n.º 4
0
def test_names():
    """Assert that every token-list name in ``NAMES`` is matched in ``TEXT``.

    ``TEXT`` is split on whitespace before matching. The matched element
    sequences and the entries of ``NAMES`` are both joined with single
    spaces, and the two resulting sets must be equal.
    """
    auto = Automaton(NAMES)
    auto.add_all(NAMES)
    print(auto)

    tokens = TEXT.split()
    found = {' '.join(match.elems) for match in auto.get_matches(tokens)}
    expected = {' '.join(name) for name in NAMES}

    assert expected == found
Ejemplo n.º 5
0
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton
automaton = Automaton()

# Keys do not have to be plain strings: here each key is a list of tokens.
# Every entry is a (token list, value) pair; the value is what is later
# printed as ``match.label``.
names = [
    (['Yuri', 'Artyukhin'], 'developer'),
    (['Tom', 'Anderson', 'Jr'], 'designer'),
]
automaton.add_all(names)

# A single key/value pair can also be added with item assignment.
# Note that ['Tom', 'Anderson'] is a prefix of ['Tom', 'Anderson', 'Jr'],
# so the two keys overlap in the text below.
automaton[['Tom', 'Anderson']] = 'manager'

# Because the keys are token lists, the text has to be tokenized the same
# way (here: split on whitespace) before matching.
text = 'Tom Anderson Jr and Yuri Artyukhin work on my project'.split()

# Default call: per the message printed below, overlapping candidates are
# resolved in favour of the matches covering the most words.
print('matches that maximize the number of matched words')
for match in automaton.get_matches(text):
    print(match.start, match.end, match.elems, match.label)

# exclude_overlaps=False: per the message printed below, every match is
# reported, including overlapping ones.
print('all matches')
for match in automaton.get_matches(text, exclude_overlaps=False):
    print(match.start, match.end, match.elems, match.label)
Ejemplo n.º 6
0
# Import the library and initiate the automaton
from aca import Automaton
automaton = Automaton()

# Add the entities and build the automaton.
# 'Funderbeam' is a prefix of the other two entries, so matches overlap.
automaton.add_all(['Funderbeam', 'Funderbeam Data', 'Funderbeam Markets'])
automaton.update_automaton()

# Find matches in a plain string.
# NOTE(review): this snippet has no ``from __future__ import print_function``;
# on Python 2 the print calls below would output a tuple -- confirm the
# intended interpreter version.
text = 'Funderbeam Data and Funderbeam Markets are two different products of Funderbeam'

# First pass: overlap exclusion switched off.
for match in automaton.get_matches(text, exclude_overlaps=False):
    print(match.start, match.end, match.elems)

# Second pass: overlap exclusion switched on.
for match in automaton.get_matches(text, exclude_overlaps=True):
    print(match.start, match.end, match.elems)
Ejemplo n.º 7
0
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import

# create a new AC automaton
from aca import Automaton
automaton = Automaton()

# Add a dictionary of plain-string words to the automaton.
painkillers = ['paracetamol', 'ibuprofen', 'hydrocloride']
automaton.add_all(painkillers)

# Match the dictionary against a text. The spelling of the words in the
# text is the same as in the dictionary above.
text = 'paracetamol and hydrocloride are a medications to relieve pain and fever. paracetamol is less efficient than ibuprofen'

# Print the start, the end and the matched elements of every match.
for match in automaton.get_matches(text):
    print(match.start, match.end, match.elems)
Ejemplo n.º 8
0
def test_prefixes():
    """Assert that ``prefixes()`` lists every prefix of the stored keys.

    Three names sharing the prefix ``'ja'`` are added. The prefixes returned
    by the automaton are joined back into strings and must equal the
    expected list, which starts with the empty prefix.
    """
    automaton = Automaton()
    automaton.add_all(['jaanus', 'janek', 'janis'])

    # prefixes() yields pairs; only the first component is checked here.
    raw_prefixes, _values = zip(*automaton.prefixes())
    joined = [''.join(raw) for raw in raw_prefixes]

    expected = [
        '',
        'j', 'ja',
        'jaa', 'jaan', 'jaanu', 'jaanus',
        'jan',
        'jane', 'janek',
        'jani', 'janis',
    ]
    assert joined == expected