Esempio n. 1
0
def find_entites(text: str, trie: StringTrie):
    """Greedily scan ``text`` for entities stored in ``trie``.

    The text is split on whitespace and the tokens of the span currently
    being matched are joined with "/" and lower-cased to form the trie key.
    A span keeps growing while its key is a prefix of longer trie keys; it is
    closed on an exact key hit, or when the next token can no longer extend it.

    Args:
        text: Raw input text; tokenized with ``str.split()``.
        trie: Trie keyed by "/"-joined, lower-cased token sequences. Must
            support ``has_subtrie``, ``has_key``, ``has_node`` and item
            lookup.

    Returns:
        Whatever ``reduce_entities`` returns for the collected mapping
        ``{match_number: (entity, start_token, end_token)}``, where
        ``end_token`` is exclusive and match numbers start at 1.
    """
    tokens = text.split()
    last = len(tokens) - 1
    start = 0
    count = 1  # start at 1, 0 is for the "NO_MATCH"
    entities = {}
    for i, token in enumerate(tokens):
        key = "/".join(tokens[start : i + 1]).lower()
        if trie.has_subtrie(key):
            # The span is still a prefix of longer keys: keep extending it,
            # unless there is no more input, in which case resolve it now.
            if i == last:
                entities[count] = (get_entity(trie, key), start, i + 1)
        elif trie.has_key(key):  # noqa: W601
            # Exact match that cannot be extended any further.
            entities[count] = (trie[key], start, i + 1)
            count += 1
            start = i + 1
        elif start < i:
            # The current token broke a span that was a valid prefix up to
            # the previous token: resolve that shorter span.
            old_key = "/".join(tokens[start:i]).lower()
            entities[count] = (get_entity(trie, old_key), start, i)
            count += 1
            token_key = token.lower()
            if trie.has_node(token_key):
                # The breaking token may itself begin a new span.
                start = i
                if i == last:
                    # No later iteration will close this one-token span, so
                    # resolve it here; otherwise the final token is silently
                    # dropped. Uses the same lookup as an unfinished span.
                    entities[count] = (get_entity(trie, token_key), i, i + 1)
            else:
                start = i + 1
        else:
            # Single token that is not in the trie at all.
            start = i + 1
    return reduce_entities(entities)
Esempio n. 2
0
def find_entites(text: str, trie: StringTrie, mask: str = MASK_TOKEN):
    """Greedily scan ``text`` for trie entities and rebuild the text.

    The text is split on whitespace, passed through ``fix_punct_tokens``, and
    the tokens of the span currently being matched are joined with "/" and
    lower-cased to form the trie key. A span keeps growing while its key is a
    prefix of longer trie keys; it is closed on an exact key hit, or when the
    next token can no longer extend it. Matched spans are replaced in the
    output by ``add_bold(get_entity(match))``; unmatched tokens are copied
    through unchanged.

    Args:
        text: Raw input text.
        trie: Trie keyed by "/"-joined, lower-cased token sequences. Must
            support ``has_subtrie``, ``has_key``, ``has_node`` and item
            lookup.
        mask: Not used by this function body; kept so existing callers that
            pass it keep working.

    Returns:
        A tuple ``(retokenized, reduced)``: the rebuilt text, in which pieces
        are space-separated except those that start with "'" or are in
        ``PUNCT``, and the result of ``reduce_entities`` on the mapping
        ``{match_number: match}`` (match numbers start at 1).
    """
    tokens = fix_punct_tokens(text.split())
    last = len(tokens) - 1
    start = 0
    count = 1  # start at 1, 0 is for the "NO_MATCH"
    entities = {}
    out = []
    for i, token in enumerate(tokens):
        key = "/".join(tokens[start:i + 1]).lower()
        if trie.has_subtrie(key):
            # The span is still a prefix of longer keys: keep extending it,
            # unless there is no more input, in which case resolve it now.
            if i == last:
                entities[count] = get_partial_match(trie, key)
                out.append(add_bold(get_entity(entities[count])))
        elif trie.has_key(key):  # noqa: W601
            # Exact match that cannot be extended any further.
            entities[count] = trie[key]
            out.append(add_bold(get_entity(entities[count])))
            count += 1
            start = i + 1
        elif start < i:
            # The current token broke a span that was a valid prefix up to
            # the previous token: resolve that shorter span.
            old_key = "/".join(tokens[start:i]).lower()
            entities[count] = get_partial_match(trie, old_key)
            out.append(add_bold(get_entity(entities[count])))
            count += 1
            token_key = token.lower()
            if trie.has_node(token_key):
                # The breaking token may itself begin a new span; it is
                # emitted later, when that span is resolved.
                start = i
                if i == last:
                    # No later iteration will resolve this one-token span, so
                    # do it here; otherwise the final token would vanish from
                    # both the entities and the rebuilt text.
                    entities[count] = get_partial_match(trie, token_key)
                    out.append(add_bold(get_entity(entities[count])))
            else:
                out.append(token)
                start = i + 1
        else:
            # Single token that is not in the trie at all.
            out.append(token)
            start = i + 1
    # Glue pieces back together: clitics ("'s", "'t", ...) and punctuation
    # attach to the previous piece, everything else gets a leading space.
    retokenized = "".join(
        piece if piece.startswith("'") or piece in PUNCT else " " + piece
        for piece in out
    ).strip()
    return retokenized, reduce_entities(entities)