def test_fusion_strategy_with_multiple_overlaps_highest_score_last():
    """Check `fusion` on three overlapping candidates where the best score is listed last.

    All candidates start at the same offset; only the EMAIL entity (score 1.0)
    is expected to remain.
    """
    local_part = NamedEntity(
        start_char=64,
        end_char=71,
        tag="MISC",
        text="han.solo",
        score=0.92,
        recognizer="SomeRecognizer",
    )
    org_guess = NamedEntity(
        start_char=64,
        end_char=85,
        tag="ORG",
        text="*****@*****.**",
        score=0.83,
        recognizer="SomeRecognizer",
    )
    email = NamedEntity(
        start_char=64,
        end_char=85,
        tag="EMAIL",
        text="*****@*****.**",
        score=1.0,
        recognizer="AnotherRecognizer",
    )
    candidates = [local_part, org_guess, email]

    # The expectation is a separate instance so the check relies on equality, not identity.
    expected = [
        NamedEntity(
            start_char=64,
            end_char=85,
            tag="EMAIL",
            text="*****@*****.**",
            score=1.0,
            recognizer="AnotherRecognizer",
        ),
    ]

    fused = combine(candidates, strategy="fusion")
    assert fused == expected
def test_smart_fusion_strategy_with_double_match_and_overlap():
    """Check `smart-fusion` when two recognizers report the same EMAIL span.

    The input holds two disjoint entities (PER, LOC), one MISC match scored 0.99
    and two identical EMAIL matches scored 0.5 that overlap it. The expected
    output keeps PER and LOC untouched and contains a single EMAIL entity with
    score 1.0 in place of the overlapping group.
    """
    person = NamedEntity(
        start_char=0, end_char=8, tag="PER", text="Han Solo", score=0.94, recognizer="SomeRecognizer"
    )
    local_part = NamedEntity(
        start_char=64, end_char=71, tag="MISC", text="han.solo", score=0.99, recognizer="SomeRecognizer"
    )
    email_first = NamedEntity(
        start_char=64, end_char=85, tag="EMAIL", text="*****@*****.**", score=0.5, recognizer="SomeRecognizer"
    )
    email_second = NamedEntity(
        start_char=64, end_char=85, tag="EMAIL", text="*****@*****.**", score=0.5, recognizer="AnotherRecognizer"
    )
    location = NamedEntity(
        start_char=100, end_char=108, tag="LOC", text="Tatooine", score=0.98, recognizer="SomeRecognizer"
    )
    candidates = [person, local_part, email_first, email_second, location]

    # Expected entities are fresh instances, independent of the inputs above.
    expected = [
        NamedEntity(
            start_char=0, end_char=8, tag="PER", text="Han Solo", score=0.94, recognizer="SomeRecognizer"
        ),
        NamedEntity(
            start_char=64, end_char=85, tag="EMAIL", text="*****@*****.**", score=1.0, recognizer="AnotherRecognizer"
        ),
        NamedEntity(
            start_char=100, end_char=108, tag="LOC", text="Tatooine", score=0.98, recognizer="SomeRecognizer"
        ),
    ]

    fused = combine(candidates, strategy="smart-fusion")
    assert fused == expected
def test_disjunctive_union_strategy_with_overlapping_ents():
    """Check that `disjunctive_union` raises an AssertionError for overlapping entities."""
    person = NamedEntity(
        start_char=0, end_char=8, tag="PER", text="Han Solo", score=0.94, recognizer="SomeRecognizer"
    )
    # The next two entities share the same start offset and therefore overlap.
    local_part = NamedEntity(
        start_char=64, end_char=71, tag="MISC", text="han.solo", score=0.92, recognizer="SomeRecognizer"
    )
    email = NamedEntity(
        start_char=64, end_char=85, tag="EMAIL", text="*****@*****.**", score=1.0, recognizer="AnotherRecognizer"
    )
    overlapping = [person, local_part, email]

    with pytest.raises(AssertionError):
        combine(overlapping, strategy="disjunctive_union")
def recognize(text: str, config: Config, combination_strategy=None, context_words=False, return_tokens=True) -> dict:
    """Find personally identifiable data in the given text and return it.

    :param text: the text that is searched for named entities
    :param config: pass a config object to configure the recognition methods
    :param combination_strategy: choose from None, `disjunctive_union`, `fusion`, and `smart-fusion`; see the docs of
        `combination_strategies.combine` for more details
    :param context_words: if True, use context words to boost the score of entities: this is the case, if one of a
        recognizer's context words appears in the entity's sentence. Setting `context_words` to True will also align
        each entity's start/end to the nearest token's start/end
    :param return_tokens: compute and return the tokenization; this will also align each entity's start/end to the
        nearest token's start/end
    :return: a dict that always holds the combined entities under `ents` and, if `return_tokens` is True, the
        tokenization under `tokens`
    """
    analyzer.update_config(config)
    recognition_results = analyzer.run_recognition(text)

    # flatten the nested recognition results; an empty result list simply yields no entities
    ents = [ent for recognized_ents in recognition_results for ent in recognized_ents]

    tokens = []
    if return_tokens or context_words:
        # tokenize
        analyzer.tokenizer.tokenize(text)
        tokens = analyzer.tokenizer.get_tokens()

        # align entities with tokens
        entity_aligner = EntityAligner()
        entity_aligner.align_entities_with_tokens(ents, tokens)

    # combine entities after they have been aligned
    ents = combine(ents, strategy=combination_strategy)

    result = {}
    if return_tokens:
        result["tokens"] = tokens

    if context_words:
        for ent in ents:
            # look at the entity's sentence without the entity's own tokens
            sentence_tokens = analyzer.tokenizer.get_sentence_for_token(
                ent.start_tok, exclude_tokens=list(range(ent.start_tok, ent.end_tok))
            )
            sentence_words = [token.text for token in sentence_tokens]

            # use a dedicated name here so the boolean `context_words` parameter is not overwritten
            recognizer_context_words = analyzer.recognizer_lookup[ent.recognizer].CONTEXT_WORDS
            if any(word in sentence_words for word in recognizer_context_words):
                # boost the score in place, capped at 1.0
                ent.score = min(ent.score * analyzer.config.context_word_confidence_boost_factor, 1.0)

    result["ents"] = ents
    return result
def test_fusion_strategy_with_disjoint_ents():
    """Check that `fusion` returns non-overlapping entities equal to the input list."""
    person = NamedEntity(
        start_char=0, end_char=8, tag="PER", text="Han Solo", score=0.94, recognizer="SomeRecognizer"
    )
    mail_mention = NamedEntity(
        start_char=47, end_char=59, tag="MISC", text="Han's E-Mail", score=0.83, recognizer="SomeRecognizer"
    )
    email = NamedEntity(
        start_char=64, end_char=85, tag="EMAIL", text="*****@*****.**", score=1.0, recognizer="AnotherRecognizer"
    )
    disjoint = [person, mail_mention, email]

    fused = combine(disjoint, strategy="fusion")
    assert fused == disjoint
def test_none_strategy():
    """Check that `combine` with strategy None returns a list equal to its input, overlaps included."""
    local_part = NamedEntity(
        start_char=64, end_char=71, tag="MISC", text="han.solo", score=0.92, recognizer="SomeRecognizer"
    )
    email = NamedEntity(
        start_char=64, end_char=85, tag="EMAIL", text="*****@*****.**", score=1.0, recognizer="AnotherRecognizer"
    )
    org_guess = NamedEntity(
        start_char=64, end_char=85, tag="ORG", text="*****@*****.**", score=0.83, recognizer="SomeRecognizer"
    )
    overlapping = [local_part, email, org_guess]

    untouched = combine(overlapping, strategy=None)
    assert untouched == overlapping
def test_fusion_with_same_score_overlapping():
    """Check `fusion` on two overlapping entities that carry the same score.

    Both entities start at the same character and token; the expected result
    contains only the LOC entity, which covers the wider span.
    """

    def make_location():
        # Return a fresh instance on every call so input and expectation never share an object.
        return NamedEntity(
            start_char=301,
            end_char=313,
            tag="LOC",
            text="12345 Berlin",
            score=0.8,
            recognizer="AnotherRecognizer",
            start_tok=44,
            end_tok=46,
        )

    zip_as_phone = NamedEntity(
        start_char=301,
        end_char=306,
        tag="PHONE",
        text="12345",
        score=0.8,
        recognizer="AnotherRecognizer",
        start_tok=44,
        end_tok=45,
    )
    candidates = [zip_as_phone, make_location()]
    expected = [make_location()]

    fused = combine(candidates, strategy="fusion")
    assert fused == expected