from pygtrie import StringTrie

def find_entites(text: str, trie: StringTrie):
    """First version: return the matched entities and their token spans."""
    tokens = text.split()
    start = 0
    count = 1  # start at 1, 0 is for the "NO_MATCH"
    entities = dict()
    for i in range(len(tokens)):
        key = "/".join(tokens[start : i + 1]).lower()
        if trie.has_subtrie(key):  # Not done yet
            if i == len(tokens) - 1:  # Reached the end of the string
                entities[count] = (get_entity(trie, key), start, i + 1)
        elif trie.has_key(key):  # noqa: W601  # Found a perfect match
            entities[count] = (trie[key], start, i + 1)
            count += 1
            start = i + 1
        elif start < i:  # Found a partial prefix match before this token
            old_key = "/".join(tokens[start:i]).lower()
            entities[count] = (get_entity(trie, old_key), start, i)
            count += 1
            # Check whether the current token starts a new match of its own
            if trie.has_node(tokens[i].lower()):
                start = i
            else:
                start = i + 1
        else:  # No match
            start = i + 1
    return reduce_entities(entities)
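To see the matching loop in action, here is a minimal harness. The two helpers are hypothetical stand-ins I am assuming for this sketch (the real get_entity and reduce_entities are defined elsewhere): get_entity resolves a prefix to the first value stored at or below it, and reduce_entities deduplicates spans per entity.

def get_entity(trie, key):
    # Stand-in: value stored under the first key at or below `key`.
    for _, value in trie.iteritems(prefix=key):
        return value

def reduce_entities(entities):
    # Stand-in: keep the first span seen for each distinct entity.
    seen, reduced = set(), {}
    for idx, (entity, start, end) in entities.items():
        if entity not in seen:
            seen.add(entity)
            reduced[idx] = (entity, start, end)
    return reduced

trie = StringTrie(separator="/")
trie["new/york"] = "Q60"       # keys are '/'-joined lowercase tokens
trie["new/york/city"] = "Q60"

print(find_entites("I moved to New York last year", trie))
# {1: ('Q60', 3, 5)} -- 'New York' covers tokens[3:5]

Because "new/york" is also a prefix of "new/york/city", the loop keeps extending past "York" and only emits the entity via the partial-match branch once "last" fails to extend the key.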
A revised version returns the retokenized text alongside the entities, bolding each match:

def find_entites(text: str, trie: StringTrie, mask: str = MASK_TOKEN):
    """Revised version: also rebuild the text, bolding each matched entity."""
    # NOTE: `mask` is not used in this version.
    tokens = fix_punct_tokens(text.split())
    start = 0
    count = 1  # start at 1, 0 is for the "NO_MATCH"
    entities = dict()
    out = []
    for i in range(len(tokens)):
        key = "/".join(tokens[start : i + 1]).lower()
        if trie.has_subtrie(key):  # Not done yet
            if i == len(tokens) - 1:  # Reached the end of the string
                entities[count] = get_partial_match(trie, key)
                out.append(add_bold(get_entity(entities[count])))
        elif trie.has_key(key):  # noqa: W601  # Found a perfect match
            entities[count] = trie[key]
            out.append(add_bold(get_entity(entities[count])))
            count += 1
            start = i + 1
        elif start < i:  # Found a partial prefix match before this token
            old_key = "/".join(tokens[start:i]).lower()
            entities[count] = get_partial_match(trie, old_key)
            out.append(add_bold(get_entity(entities[count])))
            count += 1
            # Check whether the current token starts a new match of its own
            if trie.has_node(tokens[i].lower()):
                start = i
            else:
                out.append(tokens[i])
                start = i + 1
        else:  # No match
            out.append(tokens[i])
            start = i + 1
    # Reattach punctuation and contractions without a leading space
    retokenized = "".join(
        " " + tok if not tok.startswith("'") and tok not in PUNCT else tok
        for tok in out
    ).strip()
    return retokenized, reduce_entities(entities)
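Wiring this version up end to end needs the helpers and constants it references. The stand-ins below are my assumptions about the real fix_punct_tokens, get_partial_match, get_entity, add_bold, reduce_entities, MASK_TOKEN and PUNCT, which live elsewhere in the project; if you run this section standalone, define them before the function above (MASK_TOKEN is evaluated as a default argument when the def statement executes).

import string

MASK_TOKEN = "[MASK]"            # assumed placeholder value
PUNCT = set(string.punctuation)

def fix_punct_tokens(tokens):
    # Stand-in: detach trailing punctuation into its own token.
    out = []
    for tok in tokens:
        if len(tok) > 1 and tok[-1] in PUNCT:
            out.extend([tok[:-1], tok[-1]])
        else:
            out.append(tok)
    return out

def get_partial_match(trie, key):
    # Stand-in: value stored under the first key at or below `key`.
    for _, value in trie.iteritems(prefix=key):
        return value

def get_entity(entity):
    # Stand-in: render the stored entity value as display text.
    return str(entity)

def add_bold(text):
    # Stand-in: markdown bold markers around a match.
    return "**" + text + "**"

def reduce_entities(entities):
    return entities  # stand-in: pass through unchanged

trie = StringTrie(separator="/")
trie["new/york"] = "New York"

text, entities = find_entites("I moved to New York.", trie)
print(text)      # I moved to **New York**.
print(entities)  # {1: 'New York'}

The trailing period is split off by fix_punct_tokens, matched against PUNCT in the final join, and so reattached without a leading space.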