Esempio n. 1
0
def test_remove_key_words():
    """Exercise `remove_key_words` on lawyer and organization offsets.

    Three expectations are pinned:

    * with ``rate=100`` the "Me " title preceding each LAWYER span is dropped
      from the text and the offsets are shifted left accordingly;
    * with ``rate=0`` the text and offsets come back unchanged;
    * with ``rate=100`` a sentence holding only ORGANIZATION spans comes back
      unchanged.
    """
    lawyer_text = ("Ayant pour conseil Me Myriam MASSENGO LACAVE et Me Toto TITI, "
                   "avocat au barreau de PARIS, toque: B1132")
    lawyer_offsets = [Offset(22, 44, "LAWYER"), Offset(51, 60, "LAWYER")]

    # Full removal rate: both "Me " prefixes disappear, so the first span moves
    # 3 characters to the left and the second one 6 characters.
    fully_stripped = remove_key_words(text=lawyer_text, offsets=lawyer_offsets, rate=100)
    assert fully_stripped == (
        'Ayant pour conseil Myriam MASSENGO LACAVE '
        'et Toto TITI, avocat au barreau de PARIS, '
        'toque: B1132',
        [Offset(19, 41, "LAWYER"), Offset(45, 54, "LAWYER")],
    )

    # Zero removal rate: the inputs are expected back as they were given.
    untouched = remove_key_words(text=lawyer_text, offsets=lawyer_offsets, rate=0)
    assert untouched == (lawyer_text, lawyer_offsets)

    # check that no word related to companies is removed
    company_text = "Condamne la SCI CEK PARTICIPATIONS à payer à la SARL CEK LOISIRS la somme de 2.000 euros."
    company_offsets = [Offset(12, 34, 'ORGANIZATION'), Offset(48, 64, 'ORGANIZATION')]
    company_result = remove_key_words(text=company_text, offsets=company_offsets, rate=100)
    assert company_result == (company_text, company_offsets)
                last_doc_with_unknown_entities = doubtful_mwe_matcher. \
                    add_unknown_words_offsets(texts=last_document_texts,
                                              offsets=last_doc_offset_unwanted_words_rmved)

                last_doc_offsets_no_space = [
                    remove_spaces_included_in_offsets(text, off) for text, off
                    in zip(last_document_texts, last_doc_with_unknown_entities)
                ]

                last_doc_offsets_normalized = [
                    normalize_offsets(off) for off in last_doc_offsets_no_space
                ]

                last_doc_remove_keywords = [
                    remove_key_words(text=text,
                                     offsets=off,
                                     rate=remove_keyword_rate) for text, off in
                    zip(last_document_texts, last_doc_offsets_normalized)
                ]

                last_doc_remove_keywords_text, last_doc_remove_keywords_offsets = zip(
                    *last_doc_remove_keywords)

                last_doc_txt_case_updated = [
                    random_case_change(text=text,
                                       offsets=off,
                                       rate=change_case_rate)
                    for text, off in zip(last_doc_remove_keywords_text,
                                         last_doc_remove_keywords_offsets)
                ]