def test_apply_surrogates_errors_raise():
    """A `None` surrogate makes `apply_surrogates` raise `ValueError`.

    Checked twice: once with `errors` omitted and once with `errors='raise'`.
    """
    source_text = 'ccc cc ccc'
    phi_annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B'),
    ]
    # The middle annotation deliberately has no surrogate.
    replacements = ['a', None, 'b']

    # First the call without `errors`, then the explicit `errors='raise'` call.
    for extra_kwargs in ({}, {'errors': 'raise'}):
        with pytest.raises(ValueError):
            rewrite_dataset.apply_surrogates(
                source_text, phi_annotations, replacements, **extra_kwargs
            )
def surrogate_annotations(docs: List[Document], seed=42, errors='raise') -> List[Document]:
    """Replace PHI annotations in documents with random surrogates.

    NOTE(review): the body uses `yield`, so calling this function returns a
    generator rather than the `List[Document]` named in the annotation; no
    work is done until the result is iterated. Wrap the call in `list(...)`
    when an actual list is needed. Consider correcting the annotation.

    Parameters
    ----------
    docs : List[Document]
        Input documents. Only their `annotations` and `text` attributes are
        read here.
    seed : int
        Seed handed to `RandomData`. Set it to make the random generation
        deterministic.
    errors : str {'ignore', 'raise', 'coerce'}, default 'raise'
        Forwarded unchanged to `apply_surrogates`.

        - If 'raise', errors during surrogate generation will raise an
          exception.
        - If 'ignore', failing annotations are skipped (the annotation and
          its PHI remain in the text).
        - If 'coerce', failing annotations are replaced with the pattern
          `[annotation.tag]`.

    Yields
    ------
    Document
        One rewritten document per entry returned by
        `DatasetDeidentifier.generate_surrogates`, with text and annotations
        rewritten to their surrogates. If errors is 'ignore' or 'coerce', an
        extra property of type List is added to the yielded documents
        (`Document.annotations_without_surrogates`), which includes
        annotations of the *input document* that could not be replaced with
        a surrogate.
    """
    deidentifier = DatasetDeidentifier(random_data=RandomData(seed=seed))

    # Wrap every input document so that surrogates can be generated for it.
    wrapped_docs = [SurrogateDocument(doc.annotations, doc.text) for doc in docs]

    for wrapped in deidentifier.generate_surrogates(documents=wrapped_docs):
        annotations, surrogates = wrapped.annotation_surrogate_pairs()
        yield apply_surrogates(wrapped.text, annotations, surrogates, errors=errors)
def test_rewrite_text_no_annotations():
    """With no annotations and no surrogates the text comes back unchanged.

    NOTE(review): this test unpacks the result of `apply_surrogates` as a
    2-tuple, while other tests in this file read `.text` / `.annotations`
    attributes from it -- confirm which return shape is current.
    """
    rewritten, adjusted = rewrite_dataset.apply_surrogates(
        'ccc cc ccc', annotations=[], surrogates=[]
    )

    assert rewritten == 'ccc cc ccc'
    assert adjusted == []
def test_apply_surrogates():
    """Surrogates replace the annotated spans and annotation offsets are shifted.

    Every annotation has a surrogate, so nothing is expected in
    `annotations_without_surrogates`.
    """
    source_text = 'ccc cc ccc c c ccc cccccc cccc'
    phi_annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=15, end=18, tag='B'),
    ]
    # Replacements differ in length from the originals, so offsets must move.
    replacements = ['a', 'dd', 'bbbbb']
    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B'),
    ]

    rewritten = rewrite_dataset.apply_surrogates(source_text, phi_annotations, replacements)

    assert rewritten.text == 'a dd ccc c c bbbbb cccccc cccc'
    assert rewritten.annotations == expected_annotations
    assert rewritten.annotations_without_surrogates == []
def test_rewrite_text():
    """Surrogates replace the annotated spans and annotation offsets are shifted.

    NOTE(review): this test unpacks the result of `apply_surrogates` as a
    2-tuple, while other tests in this file read `.text` / `.annotations`
    attributes from it -- confirm which return shape is current.
    """
    source_text = 'ccc cc ccc c c ccc cccccc cccc'
    phi_annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=15, end=18, tag='B'),
    ]
    # Replacements differ in length from the originals, so offsets must move.
    replacements = ['a', 'dd', 'bbbbb']
    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B'),
    ]

    rewritten, adjusted = rewrite_dataset.apply_surrogates(
        source_text, phi_annotations, replacements
    )

    assert rewritten == 'a dd ccc c c bbbbb cccccc cccc'
    assert adjusted == expected_annotations
def test_apply_surrogates_errors_coerce():
    """With `errors='coerce'` a missing surrogate is replaced by `[tag]`.

    The original annotation that had no surrogate is expected in
    `annotations_without_surrogates`, with its input offsets.
    """
    source_text = 'ccc cc ccc'
    phi_annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B'),
    ]
    # The middle annotation deliberately has no surrogate.
    replacements = ['a', None, 'b']
    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('[A]', start=2, end=5, tag='A'),
        Annotation('b', start=6, end=7, tag='B'),
    ]
    expected_unreplaced = [
        Annotation('cc', start=4, end=6, tag='A'),
    ]

    rewritten = rewrite_dataset.apply_surrogates(
        source_text, phi_annotations, replacements, errors='coerce'
    )

    assert rewritten.text == 'a [A] b'
    assert rewritten.annotations == expected_annotations
    assert rewritten.annotations_without_surrogates == expected_unreplaced
def test_apply_surrogates_no_annotations():
    """With no annotations and no surrogates the text is returned unchanged.

    Both annotation lists on the result are expected to be empty.
    """
    rewritten = rewrite_dataset.apply_surrogates(
        'ccc cc ccc', annotations=[], surrogates=[]
    )

    assert rewritten.text == 'ccc cc ccc'
    assert rewritten.annotations == []
    assert rewritten.annotations_without_surrogates == []