def test_entity_level():
    gold = [
        Document(name='doc_a', text='', annotations=[Annotation('', 3, 6, 'MISC')]),
        Document(name='doc_b', text='', annotations=[Annotation('', 0, 2, 'PER')])
    ]
    predicted = [
        Document(name='doc_a', text='', annotations=[Annotation('', 2, 6, 'MISC')]),
        Document(name='doc_b', text='', annotations=[Annotation('', 0, 2, 'PER')])
    ]

    evaluator = Evaluator(gold, predicted)
    scores = evaluator.entity_level()

    assert scores.micro_avg_f_score() == 0.5
    assert scores.macro_avg_f_score() == 0.5
    assert scores.f_score('PER') == 1
    assert scores.f_score('MISC') == 0
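
# For reference, the expected values above follow from exact-match entity counting:
# the predicted MISC span (2, 6) in doc_a misses the gold span (3, 6), while the PER
# span in doc_b matches exactly. A quick sketch of the arithmetic (plain Python,
# independent of the Evaluator API):
tp, fp, fn = 1, 1, 1                                        # PER match; MISC FP + FN
precision = tp / (tp + fp)                                  # 0.5
recall = tp / (tp + fn)                                     # 0.5
micro_f1 = 2 * precision * recall / (precision + recall)    # 0.5
macro_f1 = (1.0 + 0.0) / 2                                  # per-tag F1: PER=1, MISC=0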
def sents_to_standoff(sentence_tags: List[List[str]],
                      docs: List[ParsedDoc]) -> List[Document]:
    """Convert BIO tagged documents to standoff annotated documents.

    Parameters
    ----------
    sentence_tags : List[List[str]]
        List of sentences of BIO tagged tokens.
    docs : List[ParsedDoc]
        The documents corresponding to each sentence.

    Returns
    -------
    annotated_docs : List[Document]
        The documents with annotated entities in standoff format.

    """
    tags_by_doc = _group_sentences(sentence_tags, docs)

    annotated_docs = []
    for doc, tags in tags_by_doc:
        try:
            annotated_docs.append(
                Document(name=doc.name,
                         text=doc.text,
                         annotations=_bio_to_standoff(tags, doc.spacy_doc)))
        except Exception as e:
            logger.warning(
                'Could not convert document {} to standoff.\n tags = {}\n{}'.format(
                    doc.name, tags, e))
            annotated_docs.append(
                Document(name=doc.name, text=doc.text, annotations=[]))

    return annotated_docs
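
# To make the conversion concrete: standoff annotations store character offsets into
# the original text, while BIO tags label tokens. The sketch below is a generic
# illustration over (text, start_char, end_char) token triples -- it is NOT the
# library's _bio_to_standoff, which operates on a spaCy Doc:
def bio_to_spans(tokens, tags):
    """tokens: list of (text, start_char, end_char); tags: parallel BIO labels."""
    spans, current = [], None
    for (_, start, end), tag in zip(tokens, tags):
        if tag.startswith('B-'):
            if current:
                spans.append(tuple(current))
            current = [start, end, tag[2:]]  # open a new entity span
        elif tag.startswith('I-') and current:
            current[1] = end                 # extend the open span
        else:
            if current:
                spans.append(tuple(current))
            current = None
    if current:
        spans.append(tuple(current))
    return spans

# bio_to_spans([('Jan', 0, 3), ('Jansen', 4, 10), ('belt', 11, 15)],
#              ['B-PER', 'I-PER', 'O']) == [(0, 10, 'PER')]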
def test_token_level():
    text = 'A B C D.'
    gold_a = [Annotation('B C', 2, 5, 'PER')]
    gold_b = [Annotation('A', 0, 1, 'ORG'), Annotation('B', 2, 3, 'PER')]
    pred_a = [Annotation('B', 2, 3, 'PER'), Annotation('C', 4, 5, 'PER')]
    pred_b = [Annotation('A', 0, 1, 'ORG'), Annotation('B', 2, 3, 'ORG')]

    gold = [
        Document(name='doc_a', text=text, annotations=gold_a),
        Document(name='doc_b', text=text, annotations=gold_b)
    ]
    predicted = [
        Document(name='doc_a', text=text, annotations=pred_a),
        Document(name='doc_b', text=text, annotations=pred_b)
    ]

    evaluator = Evaluator(gold, predicted)
    scores = evaluator.token_level()

    assert scores.precision('PER') == 1
    assert scores.recall('PER') == 0.6667
    assert scores.f_score('PER') == 0.8
    assert scores.precision('ORG') == 0.5
    assert scores.recall('ORG') == 1
    assert scores.f_score('ORG') == 0.6667
def test_flair_sentence_with_whitespace_tokens():
    # Note the double space after the first sentence: it is essential to this test,
    # since it produces the whitespace token asserted below.
    text = 'Mw geniet zichtbaar.  Maarten is de afgelopen periode veelal afwezig.'
    annotation = Annotation(text='Maarten',
                            start=text.index('Maarten'),
                            end=text.index('Maarten') + len('Maarten'),
                            tag='PERSON')
    doc = Document(name='', text=text, annotations=[annotation])

    tokenizer = TokenizerFactory().tokenizer('ons')
    flair_sents, docs = flair_utils.standoff_to_flair_sents([doc], tokenizer)

    # spaCy emits consecutive whitespace as a single whitespace token. These should be
    # retained in the Flair sentence, otherwise it's not possible to reconstruct the
    # original document from the tokenized representation.
    assert [token.text for token in flair_sents[0]] == \
        ['Mw', 'geniet', 'zichtbaar', '.', '<SPACE>']

    spacy_doc = docs[0].spacy_doc
    spacy_sents = list(spacy_doc.sents)
    assert len(flair_sents) == 2
    assert len(spacy_sents) == 2
    assert len(flair_sents[0]) == 5
    assert len(spacy_sents[0]) == 5
    assert len(flair_sents[1]) == 8
    assert len(spacy_sents[1]) == 8
def test_surrogate_annotations():
    text = "De patient J. Jansen (e: [email protected], t: 06-12345678)"
    annotations = [
        Annotation(text='J. Jansen', start=11, end=20, tag='Name', doc_id='', ann_id='T0'),
        Annotation(text='[email protected]', start=25, end=43, tag='Email', doc_id='', ann_id='T1'),
        Annotation(text='06-12345678', start=48, end=59, tag='Phone_fax', doc_id='', ann_id='T2')
    ]
    doc = Document(name='test_doc', text=text, annotations=annotations)

    surrogate_doc = list(surrogate_annotations([doc]))[0]

    assert len(surrogate_doc.annotations) == len(doc.annotations)
    assert re.match(r'De patient .* \(e: .*, t: .*\)', surrogate_doc.text)
    assert not surrogate_doc.annotations_without_surrogates

    for ann in surrogate_doc.annotations:
        assert surrogate_doc.text[ann.start:ann.end] == ann.text
def load_data():
    corpus = CONLL_03_DUTCH()
    sentences = corpus.train[:N_SENTS]
    tokens = sum(len(sent) for sent in sentences)
    docs = [
        Document(name='', text=sent.to_plain_string(), annotations=[])
        for sent in sentences
    ]
    return docs, tokens
def apply_surrogates(text, annotations, surrogates, errors='raise'):
    adjusted_annotations = []
    # Amount of characters by which the start point of an annotation is adjusted.
    # Positive shift if surrogates are longer than the original annotations,
    # negative shift if they are shorter.
    shift = 0
    original_text_pointer = 0
    text_rewritten = ''
    failed_replacements = []

    for annotation, surrogate in zip(annotations, surrogates):
        if not surrogate:
            if errors == 'raise':
                raise ValueError(f'No valid surrogate for {annotation}')
            if errors == 'ignore':
                surrogate = annotation.text
            elif errors == 'coerce':
                surrogate = f'[{annotation.tag}]'
            failed_replacements.append(annotation)

        part = text[original_text_pointer:annotation.start]
        start = annotation.start + shift
        end = start + len(surrogate)
        shift += len(surrogate) - len(annotation.text)

        adjusted_annotations.append(Annotation(
            text=surrogate,
            start=start,
            end=end,
            tag=annotation.tag,
            doc_id=annotation.doc_id,
            ann_id=annotation.ann_id
        ))

        text_rewritten += part + surrogate
        original_text_pointer = annotation.end

    text_rewritten += text[original_text_pointer:]
    doc_rewritten = Document(name='', text=text_rewritten, annotations=adjusted_annotations)
    doc_rewritten.annotations_without_surrogates = failed_replacements
    return doc_rewritten
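
# A minimal usage sketch of the shift bookkeeping (hand-picked offsets; relies on
# Annotation's doc_id/ann_id defaulting to empty strings, as in the tests above):
doc = apply_surrogates(
    text='Bel Bob vandaag',
    annotations=[Annotation(text='Bob', start=4, end=7, tag='Name')],
    surrogates=['Alice'],
)
# 'Alice' is two characters longer than 'Bob', so shift becomes +2 after the
# replacement and the rewritten annotation covers text_rewritten[4:9].
assert doc.text == 'Bel Alice vandaag'
assert (doc.annotations[0].start, doc.annotations[0].end) == (4, 9)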
def test_token_annotations():
    evaluator = Evaluator(gold=(), predicted=())
    doc = Document(name='doc_a', text='A B C D.', annotations=[
        Annotation('B C', 2, 5, 'PER'),
        Annotation('D.', 6, 8, 'ORG')
    ])

    assert evaluator.token_annotations(doc) == ['O', 'PER', 'PER', 'ORG']
    assert evaluator.token_annotations(doc, tag_blind=True) == ['O', 'ENT', 'ENT', 'ENT']
def _load_folder(path):
    files = glob.glob(join(path, '*.ann'))
    files = sorted(files)

    documents = []
    for file in files:
        doc_name = get_basename(file)
        annotations, text = brat.load_brat_document(path, doc_name)
        doc = Document(name=doc_name, text=text, annotations=annotations)
        documents.append(doc)

    return documents
def _unflatten_tags(tag_mapping, documents):
    replaced_docs = []
    for doc in documents:
        new_annotations = []
        for ann in doc.annotations:
            ann_key = (doc.name, ann.start, ann.end, ann.tag)
            new_tag = tag_mapping.get(ann_key, ann.tag)
            new_ann = ann._replace(tag=new_tag)
            new_annotations.append(new_ann)

        replaced_docs.append(
            Document(name=doc.name, text=doc.text, annotations=new_annotations))

    return replaced_docs
def test_surrogate_annotations_errors_raise():
    doc = Document(name='test_doc',
                   text='This document was written on INVALID_DATE.',
                   annotations=[
                       Annotation(text='INVALID_DATE', start=29, end=41, tag='Date',
                                  doc_id='', ann_id='T0')
                   ])

    with pytest.raises(
            ValueError,
            match=r'No valid surrogate for Annotation\(.*INVALID_DATE.*\)'):
        _ = list(surrogate_annotations([doc]))[0]
def documents_iter(notes):
    lines = readlines(notes)

    record_lines = []
    for line in lines:
        if line.startswith('START_OF_RECORD'):
            record_lines = []
            patient_id, record_id = re.findall(r'\d+', line)
        elif line.startswith('||||END_OF_RECORD'):
            yield Document(
                name='note-{}-{}'.format(patient_id, record_id),
                text=''.join(record_lines).rstrip(),
                annotations=[]
            )
        else:
            record_lines.append(line)
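
# For context, a sketch of the record framing this parser expects. The exact header
# layout is an assumption inferred from the string checks above (two integers in the
# START line, a ||||END_OF_RECORD terminator):
sample = (
    'START_OF_RECORD=42||||7||||\n'
    'Patient stable overnight.\n'
    'Plan: discharge tomorrow.\n'
    '||||END_OF_RECORD\n'
)
# With readlines(notes) yielding these lines, documents_iter produces a single
# Document(name='note-42-7',
#          text='Patient stable overnight.\nPlan: discharge tomorrow.',
#          annotations=[])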
def test_mask_annotations():
    text = "De patient J. Jansen (e: [email protected], t: 06-12345678)"
    annotations = [
        Annotation(text='J. Jansen', start=11, end=20, tag='Name', doc_id='', ann_id='T0'),
        Annotation(text='[email protected]', start=25, end=43, tag='Email', doc_id='', ann_id='T1'),
        Annotation(text='06-12345678', start=48, end=59, tag='Phone_fax', doc_id='', ann_id='T2')
    ]
    doc = Document(name='test_doc', text=text, annotations=annotations)

    doc = mask_annotations(doc)

    assert doc.text == "De patient [NAME] (e: [EMAIL], t: [PHONE_FAX])"
    assert doc.annotations == [
        Annotation(text='[NAME]', start=11, end=17, tag='Name', doc_id='', ann_id='T0'),
        Annotation(text='[EMAIL]', start=22, end=29, tag='Email', doc_id='', ann_id='T1'),
        Annotation(text='[PHONE_FAX]', start=34, end=45, tag='Phone_fax', doc_id='', ann_id='T2')
    ]
def test_surrogate_annotations_errors_ignore():
    original_doc = Document(name='test_doc',
                            text='This document was written on INVALID_DATE.',
                            annotations=[
                                Annotation(text='INVALID_DATE', start=29, end=41, tag='Date',
                                           doc_id='', ann_id='T0')
                            ])

    gen = surrogate_annotations([original_doc], errors='ignore')
    surrogate_doc = list(gen)[0]

    assert surrogate_doc.text == original_doc.text
    assert surrogate_doc.annotations == original_doc.annotations
    assert surrogate_doc.annotations_without_surrogates == original_doc.annotations
def predict(documents: List[Document], corpus_name='ons', verbose=False) -> List[Document]:
    predictions = []

    for doc in tqdm(documents, disable=not verbose, desc='Tag documents'):
        annotator = DeduceAnnotator(doc.text)
        annotations = annotator.annotations()

        if corpus_name == 'ons':
            annotations = rewrite_annotations(doc.text, annotations)

        new_doc = Document(name=doc.name, text=doc.text, annotations=annotations)
        predictions.append(new_doc)

    return predictions
def mask_annotations(document: Document,
                     replacement_formatter: Callable[[Annotation], str] = _uppercase_formatter
                     ) -> Document:
    """Utility function to replace sensitive PHI spans with a placeholder.

    Parameters
    ----------
    document : Document
        The document whose PHI annotations should be replaced.
    replacement_formatter : Callable[[Annotation], str]
        A callable that can be used to configure the formatting of the replacement.
        The default formatter replaces an annotation with `[annotation.tag.upper()]`.

    Returns
    -------
    Document
        The document with masked annotations.

    """
    # Amount of characters by which the start point of an annotation is adjusted.
    # Positive shift if the replacement is longer than the original annotation,
    # negative shift if it is shorter.
    shift = 0
    original_text_pointer = 0
    text_rewritten = ''
    annotations_rewritten = []

    for annotation in document.annotations:
        replacement = replacement_formatter(annotation)

        part = document.text[original_text_pointer:annotation.start]
        start = annotation.start + shift
        end = start + len(replacement)
        shift += len(replacement) - len(annotation.text)

        text_rewritten += part + replacement
        original_text_pointer = annotation.end

        annotations_rewritten.append(annotation._replace(
            start=start,
            end=end,
            text=replacement
        ))

    text_rewritten += document.text[original_text_pointer:]
    return Document(name=document.name, text=text_rewritten,
                    annotations=annotations_rewritten)
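
# A minimal usage sketch (offsets chosen by hand). Note that the default placeholder
# [PHONE_FAX] happens to be exactly as long as the original span here, so the shift
# stays zero:
doc = Document(name='', text='Bel 06-12345678.', annotations=[
    Annotation(text='06-12345678', start=4, end=15, tag='Phone_fax')
])
masked = mask_annotations(doc)
assert masked.text == 'Bel [PHONE_FAX].'
assert masked.annotations[0].text == '[PHONE_FAX]'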
def test_annotate():
    tagger = DeduceTagger()
    doc = Document(name='', text='Jan Jannsen vanuit het UMCU.', annotations=[])
    anns = tagger.annotate([doc])[0].annotations

    assert anns == [
        Annotation(text='Jan Jannsen', start=0, end=11, tag='Name', doc_id='', ann_id='T0'),
        Annotation(text='UMCU', start=23, end=27, tag='Named_Location', doc_id='', ann_id='T1')
    ]
def test_annotate():
    doc = Document(
        name='',
        text='Hij werd op 10 oktober door arts Peter de Visser ontslagen van de kliniek.',
        annotations=[])
    # `tagger` is expected to be defined at module scope in the original test file.
    anns = tagger.annotate([doc])[0].annotations

    assert anns == [
        Annotation(text='10 oktober', start=12, end=22, tag='Date', doc_id='', ann_id='T0'),
        Annotation(text='Peter de Visser', start=33, end=48, tag='Name', doc_id='', ann_id='T1')
    ]
def get_documents(docs_path, anns_path) -> List[Document]:
    if not isdir(docs_path):
        raise ValueError('docs_path = {} does not exist.'.format(docs_path))
    if not isdir(anns_path):
        raise ValueError('anns_path = {} does not exist.'.format(anns_path))

    txt_files = sorted(glob.glob(join(docs_path, '*.txt')))
    ann_files = sorted(glob.glob(join(anns_path, '*.ann')))
    assert ann_files and txt_files and _basenames(txt_files) == _basenames(ann_files)

    docs = []
    for txt_file, ann_file in zip(txt_files, ann_files):
        doc_name = splitext(basename(txt_file))[0]
        doc_txt = brat.load_brat_text(txt_file)
        doc_annos = brat.load_brat_annotations(ann_file)
        docs.append(
            Document(name=doc_name, text=doc_txt, annotations=doc_annos))

    return docs
def xml_to_document(xml_file):
    """Converts an i2b2/UTHealth XML document to a `deidentify.base.Document`.

    XML Structure:
    ```
    <?xml version="1.0" encoding="UTF-8" ?>
    <deIdi2b2>
    <TEXT><![CDATA[ this is the record content ]]></TEXT>
    <TAGS>
    <DATE id="P0" start="16" end="26" text="2067-05-03" TYPE="DATE" comment="" />
    <AGE id="P1" start="50" end="52" text="55" TYPE="AGE" comment="" />
    </TAGS>
    </deIdi2b2>
    ```
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()
    text = root.find('TEXT').text
    doc_name = 'doc-' + splitext(basename(xml_file))[0]

    annotations = []
    for tag_element in root.find('TAGS'):
        # Example: NAME:DOCTOR
        tag_name = tag_element.tag + ':' + tag_element.attrib['TYPE']
        annotations.append(Annotation(
            text=tag_element.attrib['text'],
            # XML attribute values are strings, so cast the offsets to int.
            start=int(tag_element.attrib['start']),
            end=int(tag_element.attrib['end']),
            tag=TAG_MAPPING.get(tag_name, tag_name),
            doc_id=doc_name,
            # i2b2 annotation ids are prefixed with P. Example: P12
            ann_id='T{}'.format(tag_element.attrib['id'][1:])
        ))

    return Document(name=doc_name, text=text, annotations=annotations)
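
# A minimal round-trip sketch (hypothetical file name and record content, written to
# a temporary directory):
import tempfile
from os.path import join

sample_xml = '''<?xml version="1.0" encoding="UTF-8" ?>
<deIdi2b2>
<TEXT><![CDATA[Seen on 2067-05-03.]]></TEXT>
<TAGS>
<DATE id="P0" start="8" end="18" text="2067-05-03" TYPE="DATE" comment="" />
</TAGS>
</deIdi2b2>'''

with tempfile.TemporaryDirectory() as tmp:
    path = join(tmp, '220-01.xml')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(sample_xml)
    doc = xml_to_document(path)
    # doc.name == 'doc-220-01'; the DATE element becomes one Annotation covering
    # doc.text[8:18] == '2067-05-03', tagged TAG_MAPPING.get('DATE:DATE', 'DATE:DATE')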
import spacy

from deidentify.base import Annotation, Document
from deidentify.taggers import FlairTagger
from deidentify.tokenizer import TokenizerFactory
from deidentify.util import mask_annotations


def anonymize(input_file):
    with open(input_file, "r", encoding="utf-8") as f:
        text = f.read()

    # Wrap text in a document
    documents = [Document(name='doc_01', text=text)]

    # Select the downloaded model
    model = 'models/model_bilstmcrf_ons_fast-v0.1.0/final-model.pt'

    nlp = spacy.load('de_core_news_sm')

    # Instantiate the tokenizer
    tokenizer = TokenizerFactory().tokenizer(corpus='germeval',
                                             disable=("tagger", "ner"),
                                             model=nlp)

    # Load the tagger with a downloaded model file and the tokenizer
    tagger = FlairTagger(model=model, tokenizer=tokenizer, verbose=False)

    # Annotate the document
    annotated_doc = tagger.annotate(documents)[0]

    # spaCy NER extraction
    ners = nlp(text)

    # Mapping from spaCy NER labels to deidentify tags
    tag_dict = {
        "PER": "Name",
        "LOC": "Address",
        "ORG": "Organization_Company",
        "MISC": "Other"
    }

    # Add all spaCy-detected entities to the list
    filtered_annotations = [
        Annotation(text=ent.text, start=ent.start_char, end=ent.end_char,
                   tag=tag_dict[ent.label_])
        for ent in ners.ents
    ]

    for ann in annotated_doc.annotations:
        # Discard names; they have a high likelihood of false positives since
        # nouns are capitalized in German, unlike in Dutch.
        if ann.tag == "Name":
            continue
        # Don't add the entity if it overlaps with spaCy's - spaCy makes fewer mistakes.
        if any(ent.start_char <= ann.end <= ent.end_char for ent in ners.ents) or \
                any(ann.start <= ent.end_char <= ann.end for ent in ners.ents):
            continue
        filtered_annotations.append(ann)

    filtered_annotations.sort(key=lambda ann: ann.start)

    # mask_annotations expects a Document, so attach the filtered annotations first
    masked_doc = mask_annotations(Document(name=annotated_doc.name,
                                           text=annotated_doc.text,
                                           annotations=filtered_annotations))
    print(masked_doc.text)
from deidentify.base import Document
from deidentify.taggers import FlairTagger
from deidentify.tokenizer import TokenizerFactory

# Create some text
text = (
    "Dit is stukje tekst met daarin de naam Jan Jansen. De patient J. Jansen (e: "
    "[email protected], t: 06-12345678) is 64 jaar oud en woonachtig in Utrecht. Hij werd op 10 "
    "oktober door arts Peter de Visser ontslagen van de kliniek van het UMCU."
)

# Wrap text in document
documents = [
    Document(name='doc_01', text=text)
]

# Select downloaded model
model = 'model_bilstmcrf_ons_fast-v0.2.0'

# Instantiate tokenizer
tokenizer = TokenizerFactory().tokenizer(corpus='ons', disable=("tagger", "ner"))

# Load tagger with a downloaded model file and tokenizer
tagger = FlairTagger(model=model, tokenizer=tokenizer, verbose=False)

# Annotate your documents
annotated_docs = tagger.annotate(documents)

# Inspect the predicted annotations
from pprint import pprint
pprint(annotated_docs[0].annotations)
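
# From here, the tagged document can be masked with the mask_annotations helper shown
# earlier in this section (assuming it is exported from deidentify.util, as in the
# project's README):
from deidentify.util import mask_annotations

masked_doc = mask_annotations(annotated_docs[0])
print(masked_doc.text)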