Exemple #1
0
def surrogate_annotations(docs: List[Document], seed=42, errors='raise') -> List[Document]:
    """Rewrite the PHI annotations of documents with randomly generated surrogates.

    Parameters
    ----------
    docs : List[Document]
        Input documents. Only their `annotations` and `text` attributes are read here.
    seed : int
        Seed passed to `RandomData`; set it to make the random generation deterministic.
    errors : str {'ignore', 'raise', 'coerce'}, default 'raise'
        Forwarded unchanged to `apply_surrogates`:

        - If 'raise', errors during surrogate generation raise an exception.
        - If 'ignore', failing annotations are skipped (they, and the PHI, remain in the text).
        - If 'coerce', failing annotations are replaced with the pattern `[annotation.tag]`.

    Yields
    ------
    Document
        One rewritten document per surrogate document, with text and annotations
        replaced by their surrogates.

        If errors is 'ignore' or 'coerce', an extra property of type List is added to the
        yielded documents (`Document.annotations_without_surrogates`), which includes the
        annotations of the *input document* that could not be replaced with a surrogate.

    Notes
    -----
    The body contains `yield`, so calling this function returns a generator: no work
    (not even creating the random source) happens until the result is iterated.
    Wrap the call in `list(...)` to obtain an actual list. The `List[Document]` return
    annotation is kept as-is for interface stability.

    """
    deidentifier = DatasetDeidentifier(random_data=RandomData(seed=seed))

    # Wrap every input document in the structure the deidentifier operates on.
    wrapped_docs = [SurrogateDocument(d.annotations, d.text) for d in docs]

    for surrogate_doc in deidentifier.generate_surrogates(documents=wrapped_docs):
        original_annotations, replacements = surrogate_doc.annotation_surrogate_pairs()
        yield apply_surrogates(
            surrogate_doc.text,
            original_annotations,
            replacements,
            errors=errors,
        )
Exemple #2
0
def test_replace_pattern():
    """Check `PhoneFaxSurrogates.replace_pattern` against a table of phone patterns.

    Every table entry pairs a pattern with the positions (index or slice) of the
    replacement that are inspected, and the values accepted at each position.
    For each pattern the replacement must also be exactly as long as the pattern.

    The whole table is replayed 100 times, in order, against one generator seeded
    with 42, so many random draws are exercised.
    """
    generator = PhoneFaxSurrogates(annotations=[], random_data=RandomData(seed=42))

    first_digit = '123456789'  # accepted values for the first subscriber digit
    area_codes_2 = DIAL_CODES_BY_LENGTH[2]
    area_codes_3 = DIAL_CODES_BY_LENGTH[3]
    plus_sign = ('+',)
    double_zero = ('00',)
    country_code = ('31',)
    mobile = ('6',)  # mobile phone

    expectations = [
        ('(0DD) ### ### #', [(slice(2, 4), area_codes_2), (6, first_digit)]),
        ('0DD - ### ### #', [(slice(1, 3), area_codes_2), (6, first_digit)]),
        ('0DD- ### ### #', [(slice(1, 3), area_codes_2), (5, first_digit)]),
        ('0DD#######', [(slice(1, 3), area_codes_2), (3, first_digit)]),
        ('### ### #', [(0, first_digit)]),
        ('+CC DDD ### ###', [
            (0, plus_sign),
            (slice(1, 3), country_code),
            (slice(4, 7), area_codes_3),
            (8, first_digit),
        ]),
        ('+CC D ## ## ## ##', [
            (0, plus_sign),
            (slice(1, 3), country_code),
            (4, mobile),
            (6, first_digit),
        ]),
        ('00CC D ## ## ## ##', [
            (slice(0, 2), double_zero),
            (slice(2, 4), country_code),
            (5, mobile),
            (7, first_digit),
        ]),
    ]

    for _ in range(100):
        for pattern, checks in expectations:
            replacement = generator.replace_pattern(pattern)
            for position, allowed in checks:
                assert replacement[position] in allowed
            assert len(replacement) == len(pattern)
Exemple #3
0
def test_date_surrogate_generator():
    """Check the surrogates `DateSurrogates.replace_all` produces with seed 42.

    Each row pairs an input date string with the surrogate expected for it.
    'marc 2001' is expected to map to ``None`` (presumably because it cannot be
    parsed as a date -- confirm against `DateSurrogates`).
    """
    input_and_expected = [
        ('01 januari 1915', '09 januari 2006'),
        ('01-02', '09-02'),
        ('marc 2001', None),
        ('February 2001', 'February 2086'),
        ('01-02-2010', '09-02-2095'),
    ]
    inputs = [original for original, _ in input_and_expected]
    expected = [surrogate for _, surrogate in input_and_expected]

    generator = DateSurrogates(inputs, random_data=RandomData(42))

    assert generator.replace_all() == expected
Exemple #4
0
def test_replace_phonenumber():
    """Check `PhoneFaxSurrogates.replace_one` on two concrete phone numbers.

    For each number, selected positions (index or slice) of the surrogate must hold
    one of the accepted values, and the surrogate must be exactly as long as the
    original. Both numbers are replaced 100 times, in order, using one generator
    seeded with 42.
    """
    generator = PhoneFaxSurrogates(annotations=[], random_data=RandomData(seed=42))

    first_digit = '123456789'  # accepted values for the first subscriber digit
    cases = (
        ('(026) 123 456 7', (
            (slice(2, 4), DIAL_CODES_BY_LENGTH[2]),
            (6, first_digit),
        )),
        ('+31 (0) 6 11 22 11 11', (
            (0, ('+',)),
            (slice(1, 3), ('31',)),
            (slice(3, 8), (' (0) ',)),
            (8, ('6',)),  # mobile phone
            (10, first_digit),
        )),
    )

    for _ in range(100):
        for number, checks in cases:
            surrogate = generator.replace_one(number)
            for position, allowed in checks:
                assert surrogate[position] in allowed
            assert len(surrogate) == len(number)
 def __init__(self, random_data=None):
     """Store the random-data source used by this object.

     Parameters
     ----------
     random_data : optional
         Source of randomness to use. When omitted (``None``), a new
         ``RandomData(seed=45)`` is created.
     """
     # Compare against None explicitly: a caller-supplied object that happens to
     # be falsy must not be silently replaced by the default.
     if random_data is None:
         random_data = RandomData(seed=45)
     self.random_data = random_data
Exemple #6
0
def test_random_char_mapping():
    """Check that `random_char_mapping` maps the lowercase ASCII alphabet onto itself.

    Both the set of keys and the set of values of the returned mapping must equal
    the set of lowercase ASCII letters.
    """
    mapping = random_char_mapping(random_data=RandomData())
    alphabet = set(string.ascii_lowercase)

    assert set(mapping.keys()) == alphabet
    assert set(mapping.values()) == alphabet