Esempio n. 1
0
 def test_segmenter_no_segmentation_needed(self):
     """Check that text shorter than the segmenter limit comes back as a single segment."""
     text = (
         "Barack Hussein Obama II is an American politician and attorney who served as the "
         "44th president of the United States from 2009 to 2017."
     )
     splitter = Segmenter(5000, overlap_tokens=3)
     pieces = splitter.segment(text)

     # The limit passed above (5000) is far larger than the text, so exactly one piece is expected.
     assert len(pieces) == 1
     assert pieces[0].text == text
     # Merging the pieces back must reproduce the input unchanged.
     assert splitter.de_segment(pieces).text == text
def redact(
        text,
        classification_segmenter: Segmenter,
        detection_segmenter: Segmenter,
        redactor: Redactor,
        comprehend: ComprehendClient,
        redaction_config: RedactionConfig,
        language_code,
) -> Document:
    """
    Return a Document for ``text`` whose ``redacted_text`` has the detected pii entities redacted.

    Flow:
    1. When the module-level ``REDACTION_API_ONLY`` flag is truthy, classification is skipped: the
       whole text is wrapped in one Document and split by ``detection_segmenter``.
    2. Otherwise ``text`` is split by ``classification_segmenter`` and the segments are passed to
       ``comprehend.contains_pii_entities``.
        2.1 only documents for which ``get_interested_pii`` returns a non-empty result are kept
        2.2 if none are kept, the de-segmented text is returned as-is (``redacted_text`` equals the text)
        2.3 each kept document is split further by ``detection_segmenter``
    3. The resulting chunks go through ``comprehend.detect_pii_documents``.
    4. Classified documents and detected chunks are merged with ``classification_segmenter.de_segment``
       and ``redactor.redact`` is applied to the original text using the merged pii entities.

    Raises AssertionError when the merged document's text length differs from the input length.
    """
    if REDACTION_API_ONLY:
        whole_document = Document(text)
        classified_docs = [whole_document]
        detection_inputs = detection_segmenter.segment(whole_document.text, whole_document.char_offset)
    else:
        classification_segments = classification_segmenter.segment(text)
        classified_docs = comprehend.contains_pii_entities(classification_segments, language_code)

        # Keep only the documents that carry at least one pii type selected by the redaction config.
        flagged_docs = []
        for candidate in classified_docs:
            if len(get_interested_pii(candidate, redaction_config)) > 0:
                flagged_docs.append(candidate)

        if len(flagged_docs) == 0:
            LOG.debug("Document doesn't have any pii. Nothing to redact.")
            recovered_text = classification_segmenter.de_segment(classified_docs).text
            return Document(recovered_text, redacted_text=recovered_text)

        # Each flagged document is split again, starting from its own char offset.
        detection_inputs = [
            chunk
            for flagged in flagged_docs
            for chunk in detection_segmenter.segment(flagged.text, flagged.char_offset)
        ]

    detected_docs = comprehend.detect_pii_documents(detection_inputs, language_code)
    merged_doc = classification_segmenter.de_segment(classified_docs + detected_docs)
    assert len(merged_doc.text) == len(text), "Not able to recover original document after segmentation and desegmentation."
    merged_doc.redacted_text = redactor.redact(text, merged_doc.pii_entities)
    return merged_doc
Esempio n. 3
0
 def test_segmenter_basic_text(self):
     """Check the overlapping segments produced for plain text and that de_segment ignores segment order."""
     text = (
         "Barack Hussein Obama II is an American politician and attorney who served as the "
         "44th president of the United States from 2009 to 2017."
     )
     splitter = Segmenter(50, overlap_tokens=3)
     pieces = splitter.segment(text)

     expected_texts = [
         "Barack Hussein Obama II is an American politician ",
         "an American politician and attorney who served as ",
         "who served as the 44th president of the United ",
         "of the United States from 2009 to 2017.",
     ]
     # zip stops at the shorter sequence, so only the paired-up entries are compared here.
     for wanted, produced in zip(expected_texts, pieces):
         assert wanted == produced.text

     # Reassembly is checked on a shuffled list: the result must not depend on segment order.
     shuffle(pieces)
     assert splitter.de_segment(pieces).text == text
def classify(
        text,
        classification_segmenter: Segmenter,
        comprehend: ComprehendClient,
        detection_config: ClassificationConfig,
        language_code,
) -> List[str]:
    """
    Return the distinct pii types of interest found in ``text``.

    Steps:
    1. Split ``text`` with ``classification_segmenter``.
    2. Pass the segments to ``comprehend.contains_pii_entities`` together with ``language_code``.
    3. For every returned document, collect what ``get_interested_pii`` reports for
       ``detection_config``.
    4. Return the union of those results as a list (empty when nothing was reported). The list is
       built from a set, so its ordering is not defined by this function.
    """
    segments = classification_segmenter.segment(text)
    classified_docs = comprehend.contains_pii_entities(segments, language_code)

    found_types = set()
    for classified in classified_docs:
        # De-duplicate per document first, then merge into the running set.
        found_types.update(set(get_interested_pii(classified, detection_config)))
    return list(found_types)
Esempio n. 5
0
 def test_segmenter_unicode_chars(self):
     """Check segmentation and reassembly of text containing non-ASCII characters."""
     # NOTE(review): the literals below look like mis-decoded (mojibake) text - confirm against the
     # upstream test file before relying on them; they are kept exactly as found.
     text = (
         "╩ХтАв╠Бс┤етАв╠А╩ФуБгтЩб Emoticons ЁЯШЬ ╩ХтАв╠Бс┤етАв╠А╩ФуБгтЩб Emoticons ЁЯШЬ сЧ╖сЩУ ├▓┬е┬е┬е┬е┬е┬е┬есЧвсЦЗсУосШРсУ░ямбсЧйсТк тДмтДо ┬втЧО├╕┼В Bс┤З ╩Пс┤Пс┤Ь╩Аsс┤З╩Я╥У рд╡рд┐рдХрд┐рдкреАрдбрд┐рдпрд╛ рд╕рднреА рд╡рд┐рд╖рдпреЛрдВ рдкрд░ рдкреНрд░рд╛рдорд╛рдгрд┐рдХ рдФрд░ рдЙрдкрдпреЛрдЧ, "
         "рдкрд░рд┐рд╡рд░реНрддрди рд╡ рдкреБрдирд░реНрд╡рд┐рддрд░рдг рдХреЗ рд▓рд┐рдП рд╕реНрд╡рддрдиреНрддреНрд░ рдЬреНрдЮрд╛рдирдХреЛрд╢ рдмрдирд╛рдиреЗ h├аnb╟Оob─Бo, h├аnb╟Оo ц▒ЙхабхМЕ/ц╝вхабхМЕ, ц▒Йхаб/ц╝вхаб тАУ hamburger"
     )
     splitter = Segmenter(100, overlap_tokens=3)
     pieces = splitter.segment(text)

     expected_texts = [
         "╩ХтАв╠Бс┤етАв╠А╩ФуБгтЩб Emoticons ЁЯШЬ ╩ХтАв╠Бс┤етАв╠А╩ФуБгтЩб Emoticons ЁЯШЬ сЧ╖сЩУ ",
         "Emoticons ЁЯШЬ сЧ╖сЩУ ├▓┬е┬е┬е┬е┬е┬е┬есЧвсЦЗсУосШРсУ░ямбсЧйсТк тДмтДо ┬втЧО├╕┼В Bс┤З ",
         "тДмтДо ┬втЧО├╕┼В Bс┤З ╩Пс┤Пс┤Ь╩Аsс┤З╩Я╥У рд╡рд┐рдХрд┐рдкреАрдбрд┐рдпрд╛ рд╕рднреА ",
         "╩Пс┤Пс┤Ь╩Аsс┤З╩Я╥У рд╡рд┐рдХрд┐рдкреАрдбрд┐рдпрд╛ рд╕рднреА рд╡рд┐рд╖рдпреЛрдВ рдкрд░ ",
         "рд╕рднреА рд╡рд┐рд╖рдпреЛрдВ рдкрд░ рдкреНрд░рд╛рдорд╛рдгрд┐рдХ рдФрд░ рдЙрдкрдпреЛрдЧ, ",
         "рдкреНрд░рд╛рдорд╛рдгрд┐рдХ рдФрд░ рдЙрдкрдпреЛрдЧ, рдкрд░рд┐рд╡рд░реНрддрди рд╡ ",
         "рдЙрдкрдпреЛрдЧ, рдкрд░рд┐рд╡рд░реНрддрди рд╡ рдкреБрдирд░реНрд╡рд┐рддрд░рдг рдХреЗ рд▓рд┐рдП ",
         "рдкреБрдирд░реНрд╡рд┐рддрд░рдг рдХреЗ рд▓рд┐рдП рд╕реНрд╡рддрдиреНрддреНрд░ ",
         "рдХреЗ рд▓рд┐рдП рд╕реНрд╡рддрдиреНрддреНрд░ рдЬреНрдЮрд╛рдирдХреЛрд╢ рдмрдирд╛рдиреЗ h├аnb╟Оob─Бo, ",
         "рдЬреНрдЮрд╛рдирдХреЛрд╢ рдмрдирд╛рдиреЗ h├аnb╟Оob─Бo, h├аnb╟Оo ц▒ЙхабхМЕ/ц╝вхабхМЕ, ц▒Йхаб/ц╝вхаб ",
         "h├аnb╟Оo ц▒ЙхабхМЕ/ц╝вхабхМЕ, ц▒Йхаб/ц╝вхаб тАУ hamburger",
     ]

     # Guard against zip silently dropping unmatched entries in the comparison below.
     assert len(expected_texts) == len(pieces)
     for wanted, produced in zip(expected_texts, pieces):
         assert wanted == produced.text

     # Merging the pieces back must reproduce the input unchanged.
     assert splitter.de_segment(pieces).text == text