def test_entity_level():
    """Entity-level scores for one mismatching and one matching prediction.

    The predicted MISC span starts one character earlier than the gold span and
    is expected to score 0; the PER span is identical and is expected to score 1.
    """
    gold_docs = [
        Document(name='doc_a', text='', annotations=[Annotation('', 3, 6, 'MISC')]),
        Document(name='doc_b', text='', annotations=[Annotation('', 0, 2, 'PER')]),
    ]
    predicted_docs = [
        Document(name='doc_a', text='', annotations=[Annotation('', 2, 6, 'MISC')]),
        Document(name='doc_b', text='', annotations=[Annotation('', 0, 2, 'PER')]),
    ]

    result = Evaluator(gold_docs, predicted_docs).entity_level()

    # Aggregated scores.
    assert result.micro_avg_f_score() == 0.5
    assert result.macro_avg_f_score() == 0.5
    # Per-tag scores.
    assert result.f_score('PER') == 1
    assert result.f_score('MISC') == 0
def test_annotation():
    """Annotations compare by value, reject attribute assignment and are hashable."""
    common = dict(start=12, end=15, tag='ABC', doc_id='123', ann_id='456')
    ann_a = Annotation(text='test', **common)
    ann_b = Annotation(text='test', **common)
    ann_c = Annotation(text='test2', **common)

    # Same field values -> equal; a different text -> not equal.
    assert ann_a == ann_b
    assert ann_a != ann_c

    # Assigning to a field must fail.
    with pytest.raises(AttributeError):
        ann_a.text = "Annotation should be immutable"

    # Annotation should also be hashable: the two equal instances collapse into one.
    assert len({ann_a, ann_b, ann_c}) == 2
def test_surrogate_annotations():
    """Surrogate generation keeps all annotations and yields consistent offsets."""
    # NOTE(review): the e-mail address in this fixture (and the matching annotation text)
    # looks like it was redacted by an extraction step; the annotation text does not equal
    # text[25:42]. Confirm against the original fixture.
    text = "De patient J. Jansen (e: [email protected], t: 06-12345678)"
    doc = Document(
        name='test_doc',
        text=text,
        annotations=[
            Annotation(text='J. Jansen', start=11, end=20, tag='Name', doc_id='', ann_id='T0'),
            Annotation(text='*****@*****.**', start=25, end=42, tag='Email', doc_id='',
                       ann_id='T1'),
            Annotation(text='06-12345678', start=47, end=58, tag='Phone_fax', doc_id='',
                       ann_id='T2'),
        ],
    )

    surrogate_doc = list(surrogate_annotations([doc]))[0]

    assert len(surrogate_doc.annotations) == len(doc.annotations)
    # NOTE(review): this pattern is matched against the original document, not the
    # surrogate document -- verify that this is intended.
    assert re.match(r'De patient .* \(e: .*, t: .*\)', doc.text)
    assert not surrogate_doc.annotations_without_surrogates

    # Every rewritten annotation has to point at its own text in the rewritten document.
    for surrogate_ann in surrogate_doc.annotations:
        span = surrogate_doc.text[surrogate_ann.start:surrogate_ann.end]
        assert span == surrogate_ann.text
def test_generate_surrogates_shuffle_choices():
    """Two documents with different Hospital mentions are expected to swap them."""
    text_1 = 'Patient is being treated at UMCU.'
    start_1 = text_1.index('UMCU')
    doc_1 = Document([Annotation('UMCU', start_1, start_1 + 4, 'Hospital')], text_1)

    text_2 = 'Patient is being treated at MST.'
    start_2 = text_2.index('MST')
    doc_2 = Document([Annotation('MST', start_2, start_2 + 3, 'Hospital')], text_2)

    surrogate_docs = DatasetDeidentifier().generate_surrogates([doc_1, doc_2])

    # First document: UMCU is replaced by the only other choice, MST.
    originals, replacements = surrogate_docs[0].annotation_surrogate_pairs()
    assert len(originals) == 1 and len(replacements) == 1
    assert originals[0].text == 'UMCU'
    assert replacements[0] == 'MST'

    # Second document: MST is replaced by UMCU.
    originals, replacements = surrogate_docs[1].annotation_surrogate_pairs()
    assert len(originals) == 1 and len(replacements) == 1
    assert originals[0].text == 'MST'
    assert replacements[0] == 'UMCU'
def test_token_annotations():
    """Token-level tags are derived per token, optionally ignoring the entity type."""
    doc = Document(
        name='doc_a',
        text='A B C D.',
        annotations=[
            Annotation('B C', 2, 5, 'PER'),
            Annotation('D.', 6, 8, 'ORG'),
        ],
    )
    evaluator = Evaluator(gold=(), predicted=())

    tagged = evaluator.token_annotations(doc)
    assert tagged == ['O', 'PER', 'PER', 'ORG']

    # In tag-blind mode every entity token carries the generic ENT label.
    blind = evaluator.token_annotations(doc, tag_blind=True)
    assert blind == ['O', 'ENT', 'ENT', 'ENT']
def test_apply_surrogates_errors_raise():
    """A missing surrogate raises ValueError, both by default and with errors='raise'."""
    text = 'ccc cc ccc'
    annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B'),
    ]
    # The second annotation has no surrogate.
    surrogates = ['a', None, 'b']

    # Default error mode.
    with pytest.raises(ValueError):
        rewrite_dataset.apply_surrogates(text, annotations, surrogates)

    # Explicit error mode.
    with pytest.raises(ValueError):
        rewrite_dataset.apply_surrogates(text, annotations, surrogates, errors='raise')
def test_flair_sentence_with_whitespace_tokens():
    """Whitespace tokens produced by spaCy have to survive the conversion to Flair sentences."""
    # The two consecutive spaces after the first sentence are deliberate: they are what the
    # '<SPACE>' token and the token counts asserted below rely on.
    text = 'Mw geniet zichtbaar.  Maarten is de afgelopen periode veelal afwezig.'
    name = 'Maarten'
    name_start = text.index(name)
    annotation = Annotation(text=name, start=name_start, end=name_start + len(name),
                            tag='PERSON')
    doc = Document(name='', text=text, annotations=[annotation])

    tokenizer = TokenizerFactory().tokenizer('ons')
    flair_sents, docs = flair_utils.standoff_to_flair_sents([doc], tokenizer)

    # spaCy adds consecutive whitespace tokens as a single whitespace. These should be retained
    # in the Flair sentence, otherwise it's not possible to reconstruct the original document
    # from the tokenized representation.
    first_sentence_tokens = [token.text for token in flair_sents[0]]
    assert first_sentence_tokens == ['Mw', 'geniet', 'zichtbaar', '.', '<SPACE>']

    spacy_sents = list(docs[0].spacy_doc.sents)

    # Flair and spaCy have to agree on the sentence split and on the sentence lengths.
    assert len(flair_sents) == 2
    assert len(spacy_sents) == 2

    assert len(flair_sents[0]) == 5
    assert len(spacy_sents[0]) == 5

    assert len(flair_sents[1]) == 8
    assert len(spacy_sents[1]) == 8
def annotations_iter(annotations):
    """Yield the annotations of one note at a time.

    The lines obtained from `readlines(annotations)` are expected to hold six
    whitespace-separated fields: `pid rid start end tag text`. Consecutive lines with the
    same `pid` and `rid` form one group. Within a group, annotation ids are numbered
    `T1`, `T2`, ... and the document id is `note-<pid>-<rid>`.

    Parameters
    ----------
    annotations
        Passed unchanged to `readlines`.

    Yields
    ------
    list of Annotation
        The annotations of one (pid, rid) group. Nothing is yielded when there are no lines.
    """
    lines = readlines(annotations)
    if len(lines) == 0:
        # Without this guard, indexing the first line below raises an IndexError.
        return

    current_pid, current_rid = lines[0].split(maxsplit=5)[0:2]
    # Deliberately a separate name: the parameter `annotations` is not overwritten.
    group = []
    ann_number = 1

    for line in lines:
        pid, rid, start, end, tag, text = line.strip().split(maxsplit=5)

        if pid != current_pid or rid != current_rid:
            # A new note starts: emit the finished group and restart the numbering.
            yield group
            group = []
            ann_number = 1
            current_pid = pid
            current_rid = rid

        group.append(Annotation(
            text=text,
            start=int(start),
            end=int(end),
            tag=tag,
            ann_id='T{}'.format(ann_number),
            doc_id='note-{}-{}'.format(current_pid, current_rid)
        ))
        ann_number += 1

    # Emit the trailing group.
    yield group
def apply_surrogates(text, annotations, surrogates):
    """Replace each annotated span of `text` with its surrogate.

    Annotations are processed in the order given and paired with `surrogates` by position.
    The slicing logic assumes they are ordered by start offset and do not overlap.

    Parameters
    ----------
    text : str
        The original document text.
    annotations : iterable of Annotation
        Annotations whose `start`/`end` refer to `text`.
    surrogates : iterable of str
        The replacement text for each annotation.

    Returns
    -------
    tuple
        `(text_rewritten, adjusted_annotations)` where the adjusted annotations carry the
        surrogate text and offsets into the rewritten text.
    """
    rewritten_annotations = []
    pieces = []
    # Net number of characters added (positive) or removed (negative) by all
    # replacements made so far.
    offset_delta = 0
    # Position in the original text up to which content has been copied.
    cursor = 0

    for original, replacement in zip(annotations, surrogates):
        new_start = original.start + offset_delta
        new_end = new_start + len(replacement)
        offset_delta += len(replacement) - len(original.text)

        rewritten_annotations.append(
            Annotation(text=replacement,
                       start=new_start,
                       end=new_end,
                       tag=original.tag,
                       doc_id=original.doc_id,
                       ann_id=original.ann_id))

        # Untouched text preceding the annotation, followed by the replacement.
        pieces.append(text[cursor:original.start])
        pieces.append(replacement)
        cursor = original.end

    # Whatever follows the last annotation is copied verbatim.
    pieces.append(text[cursor:])
    return ''.join(pieces), rewritten_annotations
def _bio_to_standoff(bio_tags: List[str], spacy_doc: spacy.tokens.Doc) -> List[Annotation]:
    """Turn the BIO tags of a document into standoff annotations.

    Character offsets are recovered from the spaCy document that was tagged.

    Parameters
    ----------
    bio_tags : List[str]
        One BIO tag per token. `len(bio_tags) == len(spacy_doc)` has to hold.
    spacy_doc : spacy.tokens.Doc
        The spaCy doc the tags belong to.

    Returns
    -------
    List[Annotation]
        One annotation per entity, with ids `T0`, `T1`, ... in the order of the offsets.
    """
    # Dangling entities are repaired first, then the tags are converted to BILUO so that
    # the entity offsets can be computed.
    biluo_tags = _bio_to_biluo(fix_dangling_entities(bio_tags))
    entity_offsets = offsets_from_biluo_tags(spacy_doc, biluo_tags)

    return [
        Annotation(
            text=spacy_doc.char_span(begin, finish).text,
            start=begin,
            end=finish,
            tag=label,
            ann_id='T{}'.format(index),
        )
        for index, (begin, finish, label) in enumerate(entity_offsets)
    ]
def test_token_level():
    """Token-level precision, recall and F-score per tag over two documents."""
    text = 'A B C D.'

    gold = [
        Document(name='doc_a', text=text, annotations=[
            Annotation('B C', 2, 5, 'PER'),
        ]),
        Document(name='doc_b', text=text, annotations=[
            Annotation('A', 0, 1, 'ORG'),
            Annotation('B', 2, 3, 'PER'),
        ]),
    ]
    predicted = [
        Document(name='doc_a', text=text, annotations=[
            Annotation('B', 2, 3, 'PER'),
            Annotation('C', 4, 5, 'PER'),
        ]),
        Document(name='doc_b', text=text, annotations=[
            Annotation('A', 0, 1, 'ORG'),
            Annotation('B', 2, 3, 'ORG'),
        ]),
    ]

    result = Evaluator(gold, predicted).token_level()

    # PER
    assert result.precision('PER') == 1
    assert result.recall('PER') == 0.6667
    assert result.f_score('PER') == 0.8

    # ORG
    assert result.precision('ORG') == 0.5
    assert result.recall('ORG') == 1
    assert result.f_score('ORG') == 0.6667
def test_annotations():
    """The annotator reports all entities with offsets into the original text."""
    actual = ANNOTATOR.annotations()

    # NOTE(review): the e-mail literal below looks like it was redacted by an extraction
    # step -- confirm against the original fixture.
    expected = [
        Annotation('Jan Jansen', 39, 49, 'PERSOON'),
        Annotation('patient J. Jansen', 54, 71, 'PERSOON'),
        Annotation('*****@*****.**', 76, 93, 'URL'),
        Annotation('06-12345678', 98, 109, 'TELEFOONNUMMER'),
        Annotation('64', 114, 116, 'LEEFTIJD'),
        Annotation('Utrecht', 143, 150, 'LOCATIE'),
        Annotation('10 oktober', 164, 174, 'DATUM'),
        Annotation('Peter de Visser', 185, 200, 'PERSOON'),
        # We explicitly check that the following annotation is included in its correct form.
        # Deduce annotates UMCU as umcu. During annotation, we attempt to recover the
        # original text.
        Annotation('UMCU', 234, 238, 'INSTELLING'),
    ]

    assert actual == expected
def annotations(self):
    """
    List of annotated PHI entities with their offset within the original (unannotated) text.
    """
    found = []
    # Deduce denotes entities inline in form of <TYPE text>. We need to take this into account
    # when computing the character positions in the original text: `cursor` tracks how far
    # into `self.text` we have advanced.
    cursor = 0

    for segment in deduce.utility.split_tags(self.annotated_text):
        if not self.is_annotation(segment):
            # Plain text is assumed to appear unchanged in the original.
            cursor += len(segment)
            continue

        tag = self.annotation_tag(segment)
        # Disregard nested annotations. Nested content is considered to be part of the
        # parent annotation.
        entity_text = self.flatten_annotation_content(segment)

        # Deduce randomly removes spaces preceding an annotation. We do a best effort to find
        # back the entity in the original text. Matching is done relative to the current
        # position, so that we do not capture unwanted text.
        #
        # Casing is ignored as deduce sometimes changes the original text.
        # Example: deduce.annotate_text('UMCU') -> "<INSTELLING umcu>"
        remaining = self.text[cursor:].lower()
        try:
            relative_start = remaining.index(entity_text.lower())
        except ValueError:
            # Sometimes, Deduce changes the original annotation text. Example:
            #   gemeld door <PERSOON Jan van Jansen>
            #   gemeld door <PERSOON Jan Jan van Jansen>
            #
            # In those cases, we cannot recover the annotation and skip to the next.
            cursor += len(entity_text)
            continue

        start = cursor + relative_start
        end = start + len(entity_text)
        cursor = end

        # Ids are consecutive over the recovered annotations only, so the next id always
        # equals the number of annotations collected so far.
        found.append(
            Annotation(ann_id='T{}'.format(len(found)),
                       tag=tag,
                       text=self.text[start:end],
                       start=start,
                       end=end))

    return found
def test_annotate():
    """DeduceTagger finds the name and the location in a short sentence."""
    doc = Document(name='', text='Jan Jannsen vanuit het UMCU.', annotations=[])

    annotated_docs = DeduceTagger().annotate([doc])
    found = annotated_docs[0].annotations

    expected = [
        Annotation(text='Jan Jannsen', start=0, end=11, tag='Name', doc_id='', ann_id='T0'),
        Annotation(text='UMCU', start=23, end=27, tag='Named_Location', doc_id='',
                   ann_id='T1'),
    ]
    assert found == expected
def test_annotate():
    """The tagger finds the date and the name in a short sentence."""
    sentence = 'Hij werd op 10 oktober door arts Peter de Visser ontslagen van de kliniek.'
    doc = Document(name='', text=sentence, annotations=[])

    # `tagger` is not defined in this function; it is taken from the enclosing scope.
    annotated_docs = tagger.annotate([doc])
    found = annotated_docs[0].annotations

    expected = [
        Annotation(text='10 oktober', start=12, end=22, tag='Date', doc_id='', ann_id='T0'),
        Annotation(text='Peter de Visser', start=33, end=48, tag='Name', doc_id='',
                   ann_id='T1'),
    ]
    assert found == expected
def main(args):
    """Rewrite a brat dataset by replacing annotations with surrogates from a table.

    Reads the CSV at `args.surrogate_table`, rewrites every document listed in it and writes
    the result to `args.output_path`. Documents in `args.data_path` that do not occur in the
    table are copied over unchanged (both the `.txt` and the `.ann` file).
    """
    surrogate_table = pd.read_csv(args.surrogate_table)
    n_documents = len(surrogate_table.doc_id.unique())
    logger.info('Rewrite {} files.'.format(n_documents))

    # Use manual surrogate if it exists. If not, use the automatically generated one
    surrogate_table['surrogate'] = surrogate_table.manual_surrogate.fillna(
        surrogate_table['surrogate'])

    def row_to_annotation(row):
        # Build an Annotation from one row of the surrogate table.
        return Annotation(text=row['text'],
                          start=row['start'],
                          end=row['end'],
                          tag=row['tag'],
                          doc_id=row['doc_id'],
                          ann_id=row['ann_id'])

    for doc_id, doc_rows in surrogate_table.groupby('doc_id'):
        text = load_brat_text(join(args.data_path, '{}.txt'.format(doc_id)))

        # Process the annotations of a document in order of their start offset.
        doc_rows = doc_rows.sort_values(by='start')
        annotations = doc_rows.apply(row_to_annotation, axis=1)
        surrogates = doc_rows.surrogate.values

        text_rewritten, adjusted_annotations = apply_surrogates(
            text, annotations, surrogates)
        write_brat_document(args.output_path,
                            doc_id,
                            text=text_rewritten,
                            annotations=adjusted_annotations)

    annotated_ids = set(surrogate_table.doc_id.values)
    unannotated = []
    for txt_path in glob.glob(join(args.data_path, '*.txt')):
        stem = splitext(basename(txt_path))[0]
        if stem not in annotated_ids:
            unannotated.append(stem)

    logger.info('Found {} files without any annotations. '
                'Copy them to output_path...'.format(len(unannotated)))

    for stem in unannotated:
        for name_template in ('{}.txt', '{}.ann'):
            shutil.copy2(join(args.data_path, name_template.format(stem)),
                         args.output_path)

    logger.info('Done.')
def test_surrogate_annotations_errors_coerce():
    """With errors='coerce', an annotation without a valid surrogate becomes '[<tag>]'."""
    invalid_date = Annotation(text='INVALID_DATE', start=29, end=41, tag='Date', doc_id='',
                              ann_id='T0')
    original_doc = Document(name='test_doc',
                            text='This document was written on INVALID_DATE.',
                            annotations=[invalid_date])

    surrogate_doc = list(surrogate_annotations([original_doc], errors='coerce'))[0]

    assert surrogate_doc.text == 'This document was written on [Date].'

    expected_annotations = [
        Annotation(text='[Date]', start=29, end=35, tag='Date', doc_id='', ann_id='T0'),
    ]
    assert surrogate_doc.annotations == expected_annotations

    # The annotation that could not be replaced is reported back in its original form.
    assert surrogate_doc.annotations_without_surrogates == original_doc.annotations
def load_brat_annotations(ann_file):
    """Load the text-bound annotations of a brat standoff (.ann) file.

    Only lines starting with `T` are considered. Brat fragment annotations, which are
    inserted when annotating text spanning multiple lines, are not supported: such lines
    are skipped with a warning.

    Example of a fragment annotation that is not supported:
    `T30	Address 3232 3245;3246 3263	Calslaan 11 1234AB Wildervank`

    Parameters
    ----------
    ann_file : str
        Full path to .ann file.

    Returns
    -------
    list of deidentify.base.Annotation
        The annotations. Their `doc_id` is the file name without its extension.
    """
    doc_id = splitext(basename(ann_file))[0]

    with open(ann_file) as file:
        raw_lines = list(file)

    annotations = []
    for raw_line in raw_lines:
        if not raw_line.startswith('T'):
            continue

        # At most five fields; the fifth one holds the annotated text and may contain
        # whitespace itself.
        ann_id, tag, start, end, text = raw_line.split(None, 4)

        try:
            parsed = Annotation(text=text.rstrip('\n'),
                                start=int(start),
                                end=int(end),
                                tag=tag,
                                doc_id=doc_id,
                                ann_id=ann_id)
        except ValueError:
            # For fragment annotations the offsets are not plain integers.
            logger.warning(
                'Brat fragment annotations are not supported, skipping line\n{}'
                .format(raw_line))
        else:
            annotations.append(parsed)

    return annotations
def test_generate_surrogates_without_choices():
    """A single Hospital mention has no alternative and is expected to map to itself."""
    text = 'Patient is being treated at UMCU.'
    start = text.index('UMCU')
    doc = Document([Annotation('UMCU', start, start + 4, 'Hospital')], text)

    surrogate_docs = DatasetDeidentifier().generate_surrogates([doc])
    originals, replacements = surrogate_docs[0].annotation_surrogate_pairs()

    assert len(originals) == 1
    assert len(replacements) == 1
    assert originals[0].text == 'UMCU'
    assert replacements[0] == 'UMCU'
def test_surrogate_annotations_errors_raise():
    """By default, an annotation without a valid surrogate raises a ValueError."""
    invalid_date = Annotation(text='INVALID_DATE', start=29, end=41, tag='Date', doc_id='',
                              ann_id='T0')
    doc = Document(name='test_doc',
                   text='This document was written on INVALID_DATE.',
                   annotations=[invalid_date])

    # The message has to mention the offending annotation.
    expected_message = r'No valid surrogate for Annotation\(.*INVALID_DATE.*\)'
    with pytest.raises(ValueError, match=expected_message):
        _ = list(surrogate_annotations([doc]))[0]
def apply_surrogates(text, annotations, surrogates, errors='raise'):
    """Replace each annotated span of `text` with its surrogate.

    Annotations are processed in the order given and paired with `surrogates` by position.
    The slicing logic assumes they are ordered by start offset and do not overlap.

    Parameters
    ----------
    text : str
        The original document text.
    annotations : iterable of Annotation
        Annotations whose `start`/`end` refer to `text`.
    surrogates : iterable of str or None
        The replacement for each annotation. A falsy entry (`None` or `''`) means that no
        surrogate is available and is handled according to `errors`.
    errors : {'raise', 'ignore', 'coerce'}
        What to do with an annotation that has no surrogate:
        `'raise'` raises a `ValueError`, `'ignore'` keeps the original annotation text,
        `'coerce'` substitutes `[<tag>]`.

    Returns
    -------
    Document
        A document (with empty name) holding the rewritten text and the adjusted
        annotations. Its `annotations_without_surrogates` attribute lists the original
        annotations for which no surrogate was available.

    Raises
    ------
    ValueError
        If a surrogate is missing and `errors='raise'`, or if a surrogate is missing and
        `errors` is not one of the supported values.
    """
    adjusted_annotations = []
    failed_replacements = []
    rewritten_parts = []

    # Amount of characters by which start point of annotation is adjusted
    # Positive shift if surrogates are longer than original annotations
    # Negative shift if surrogates are shorter
    shift = 0
    original_text_pointer = 0

    for annotation, surrogate in zip(annotations, surrogates):
        if not surrogate:
            if errors == 'raise':
                raise ValueError(f'No valid surrogate for {annotation}')
            if errors == 'ignore':
                surrogate = annotation.text
            elif errors == 'coerce':
                surrogate = f'[{annotation.tag}]'
            else:
                # Previously an unknown mode fell through with the falsy surrogate, which
                # failed later with a TypeError (None) or silently deleted the span ('').
                raise ValueError(
                    f"Invalid value for errors: {errors!r}. "
                    "Expected 'raise', 'ignore' or 'coerce'.")
            failed_replacements.append(annotation)

        start = annotation.start + shift
        end = start + len(surrogate)
        shift += len(surrogate) - len(annotation.text)

        adjusted_annotations.append(Annotation(
            text=surrogate,
            start=start,
            end=end,
            tag=annotation.tag,
            doc_id=annotation.doc_id,
            ann_id=annotation.ann_id
        ))

        # Untouched text preceding the annotation, followed by the replacement.
        rewritten_parts.append(text[original_text_pointer:annotation.start])
        rewritten_parts.append(surrogate)
        original_text_pointer = annotation.end

    # Whatever follows the last annotation is copied verbatim.
    rewritten_parts.append(text[original_text_pointer:])

    doc_rewritten = Document(name='', text=''.join(rewritten_parts),
                             annotations=adjusted_annotations)
    doc_rewritten.annotations_without_surrogates = failed_replacements
    return doc_rewritten
def test_apply_surrogates_errors_coerce():
    """With errors='coerce', a missing surrogate is replaced by '[<tag>]' and reported."""
    text = 'ccc cc ccc'
    first = Annotation('ccc', start=0, end=3, tag='A')
    second = Annotation('cc', start=4, end=6, tag='A')
    third = Annotation('ccc', start=7, end=10, tag='B')
    # The second annotation has no surrogate.
    surrogates = ['a', None, 'b']

    surrogate_doc = rewrite_dataset.apply_surrogates(
        text, [first, second, third], surrogates, errors='coerce')

    assert surrogate_doc.text == 'a [A] b'

    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('[A]', start=2, end=5, tag='A'),
        Annotation('b', start=6, end=7, tag='B'),
    ]
    assert surrogate_doc.annotations == expected_annotations

    # The failed replacement is reported with its original text and offsets.
    expected_failures = [
        Annotation('cc', start=4, end=6, tag='A'),
    ]
    assert surrogate_doc.annotations_without_surrogates == expected_failures
def test_mask_annotations():
    """Masking replaces every annotation with its upper-cased tag in square brackets."""
    # NOTE(review): the e-mail address in this fixture (and the matching annotation text)
    # looks like it was redacted by an extraction step -- confirm against the original
    # fixture.
    text = "De patient J. Jansen (e: [email protected], t: 06-12345678)"
    original = Document(
        name='test_doc',
        text=text,
        annotations=[
            Annotation(text='J. Jansen', start=11, end=20, tag='Name', doc_id='', ann_id='T0'),
            Annotation(text='*****@*****.**', start=25, end=42, tag='Email', doc_id='',
                       ann_id='T1'),
            Annotation(text='06-12345678', start=47, end=58, tag='Phone_fax', doc_id='',
                       ann_id='T2'),
        ],
    )

    masked = mask_annotations(original)

    assert masked.text == "De patient [NAME] (e: [EMAIL], t: [PHONE_FAX])"

    # Offsets refer to the masked text; tags and ids are carried over unchanged.
    expected_annotations = [
        Annotation(text='[NAME]', start=11, end=17, tag='Name', doc_id='', ann_id='T0'),
        Annotation(text='[EMAIL]', start=22, end=29, tag='Email', doc_id='', ann_id='T1'),
        Annotation(text='[PHONE_FAX]', start=34, end=45, tag='Phone_fax', doc_id='',
                   ann_id='T2'),
    ]
    assert masked.annotations == expected_annotations
def test_apply_surrogates():
    """Surrogates of different length shift the offsets of the following annotations."""
    text = 'ccc cc ccc c c ccc cccccc cccc'
    originals = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=15, end=18, tag='B'),
    ]
    # Shorter, equally long and longer than the text they replace.
    replacements = ['a', 'dd', 'bbbbb']

    surrogate_doc = rewrite_dataset.apply_surrogates(text, originals, replacements)

    assert surrogate_doc.text == 'a dd ccc c c bbbbb cccccc cccc'

    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B'),
    ]
    assert surrogate_doc.annotations == expected_annotations

    # Every annotation had a surrogate, so nothing is reported as failed.
    assert surrogate_doc.annotations_without_surrogates == []
def xml_to_document(xml_file):
    """Converts an i2b2/UTHealth XML document to a `deidentify.base.Document`.

    XML Structure:
    ```
    <?xml version="1.0" encoding="UTF-8" ?>
    <deIdi2b2>
    <TEXT><![CDATA[
        this is the record content
    ]]></TEXT>
    <TAGS>
    <DATE id="P0" start="16" end="26" text="2067-05-03" TYPE="DATE" comment="" />
    <AGE id="P1" start="50" end="52" text="55" TYPE="AGE" comment="" />
    </TAGS>
    </deIdi2b2>
    ```

    Parameters
    ----------
    xml_file : str
        Path to the XML file.

    Returns
    -------
    Document
        The document is named `doc-<file name without extension>`. Each element below
        `TAGS` becomes one annotation with integer `start`/`end` offsets.

    Raises
    ------
    ValueError
        If a `start` or `end` attribute is not an integer.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()
    text = root.find('TEXT').text
    doc_name = 'doc-' + splitext(basename(xml_file))[0]

    annotations = []
    for tag_element in root.find('TAGS'):
        attributes = tag_element.attrib
        # Example: NAME:DOCTOR
        tag_name = tag_element.tag + ':' + attributes['TYPE']
        annotations.append(Annotation(
            text=attributes['text'],
            # XML attributes are strings; offsets are converted so that they can be used
            # for slicing and compare equal to integer offsets.
            start=int(attributes['start']),
            end=int(attributes['end']),
            # Tags without an entry in the mapping are kept as they are.
            tag=TAG_MAPPING.get(tag_name, tag_name),
            doc_id=doc_name,
            # i2b2 annotations have id prefixed with P. Example: P12
            ann_id='T{}'.format(attributes['id'][1:])
        ))

    return Document(name=doc_name, text=text, annotations=annotations)
def test_rewrite_text():
    """Rewriting returns the new text together with annotations adjusted to it."""
    text = 'ccc cc ccc c c ccc cccccc cccc'
    originals = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=15, end=18, tag='B'),
    ]
    # Shorter, equally long and longer than the text they replace.
    replacements = ['a', 'dd', 'bbbbb']

    text_rewritten, adjusted_annotations = rewrite_dataset.apply_surrogates(
        text, originals, replacements)

    assert text_rewritten == 'a dd ccc c c bbbbb cccccc cccc'

    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B'),
    ]
    assert adjusted_annotations == expected_annotations