def test_phone_number():
    assert get_phone_number("phone: (06)-42-92 72- 29 et 01 44 23 65 89") == [
        Offset(8, 25, 'PHONE_NUMBER'),
        Offset(28, 42, 'PHONE_NUMBER')
    ]
    assert get_phone_number("phone: (06)-42-92 72- 29 + 12") == []
    assert get_phone_number("phone: (00)-42-92 72- 29 ") == []
def test_get_phrase_matcher():
    text = "Aujourd'hui, Michaël et Jessica écrivent des unit tests dans la joie et la bonne humeur, " \
           "mais où sont donc les enfants ?"
    matcher = FirstName(ignore_case=False)
    assert matcher.get_matches(text=text) == [
        Offset(13, 19, "PERS"),
        Offset(24, 30, "PERS")
    ]
    assert matcher.contain_first_names(text=text) is True
def test_frequent_entities():
    freq_entities = {"benesty": "LAWYER", "jessica": "PERS"}
    frequent_entities_matcher = FrequentEntities.test_builder(content=freq_entities)
    text = "Me Benesty rencontre son client Jessica."
    assert frequent_entities_matcher.get_matches(text=text) == [
        Offset(3, 10, "LAWYER"),
        Offset(32, 39, "PERS")
    ]
def get_juridictions(text: str) -> List[Offset]:
    """
    Extract court names from text
    :param text: original paragraph text
    :return: offsets as a list
    """
    result1 = [Offset(t.start(), t.end(), "COURT_1") for t in juridiction_pattern_1.finditer(text)]
    result2 = [Offset(t.start(), t.end(), "COURT_1") for t in juridiction_pattern_2.finditer(text)]
    return result1 + result2
def get_judge_name(text: str) -> List[Offset]:
    """
    Extract judge names from text
    :param text: original paragraph text
    :return: offsets as a list
    """
    r1 = [Offset(start=t.start(), end=t.end(), type="JUDGE_CLERK_1") for t in extract_judge_pattern_1.finditer(text)]
    r2 = [Offset(start=t.start(), end=t.end(), type="JUDGE_CLERK_1") for t in extract_judge_pattern_2.finditer(text)]
    return r1 + r2
def test_licence_plate():
    assert get_licence_plate("AA111AA") == [Offset(0, 7, 'LICENCE_PLATE')]
    assert get_licence_plate("AA-111-AA") == [Offset(0, 9, 'LICENCE_PLATE')]
    assert get_licence_plate("AA 111 AA") == [Offset(0, 9, 'LICENCE_PLATE')]
    assert get_licence_plate("1 AA111AA") == []
    assert get_licence_plate("AA 111 AA 1") == []
    assert get_licence_plate("1AA11") == [Offset(0, 5, 'LICENCE_PLATE')]
    assert get_licence_plate("9999 ZZZ 99") == [Offset(0, 11, 'LICENCE_PLATE')]
    assert get_licence_plate("9999 ZZZ 999") == []
def test_rg_regex():
    assert get_rg_from_regex(" RG n° 11/17073") == [Offset(7, 15, 'RG')]
    assert get_rg_from_regex(" RG : 13/03625 D") == [Offset(6, 15, 'RG')]
    assert get_rg_from_regex("RG numéro : 12/01503") == [Offset(12, 20, 'RG')]
    assert get_rg_from_regex("N°RG: 13/03409") == [Offset(6, 14, 'RG')]
    assert get_rg_from_regex("N° 13/03409") == []
    assert get_rg_from_regex("Numéro d'inscription au répertoire général : 14/01913") == [
        Offset(45, 53, 'RG')
    ]
def test_rg_from_case_id():
    text1 = "CA-aix-en-provence-20130208-1022871-jurica"
    matcher = MatchRg(case_id=text1)
    assert matcher.get_rg_from_case_id() == "1022871"
    assert matcher.get_rg_offset_from_text(text=text1) == [Offset(28, 35, 'RG')]
    text2 = "Le numéro RG est celui-ci 102 /2871."
    assert matcher.get_rg_offset_from_text(text=text2) == [Offset(26, 35, 'RG')]
def get_addresses(text: str) -> List[Offset]:
    """
    Extract addresses from text
    :param text: original paragraph text
    :return: offsets as a list
    """
    result1 = [Offset(t.start(), t.end(), "ADDRESS_1") for t in extract_address_pattern_1.finditer(text)]
    result2 = [Offset(t.start(), t.end(), "ADDRESS_1") for t in extract_address_pattern_2.finditer(text)]
    result3 = [Offset(t.start(), t.end(), "ADDRESS_1") for t in extract_address_pattern_3.finditer(text)]
    return sorted(remove_duplicates(result1 + result2 + result3), key=lambda o: (o.start, o.end))
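# Usage sketch for get_addresses (not from the original test suite). The three
# address patterns can match the same span, hence the remove_duplicates + sort
# step above. The example string is an illustrative assumption; only properties
# guaranteed by the code (ordering, dedup, type name) are asserted.
def example_get_addresses():
    text = "domicilié 12 rue des Fleurs 75011 Paris"
    offsets = get_addresses(text)
    # results are deduplicated and sorted by (start, end)
    assert offsets == sorted(offsets, key=lambda o: (o.start, o.end))
    assert all(o.type == "ADDRESS_1" for o in offsets)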
def test_bilou_conv():
    doc: Doc = pytest.nlp.make_doc("Ceci est un test.")
    offset1 = [Offset(5, 8, "UNKNOWN")]
    assert convert_unknown_bilou(doc, offsets=offset1).ner == ["O", "-", "O", "O", "O"]
    assert convert_unknown_bilou_bulk([doc], [offset1])[0].ner == ["O", "-", "O", "O", "O"]
    offset2 = [Offset(5, 8, "PERS")]
    assert convert_unknown_bilou(doc, offsets=offset2).ner == ["O", "U-PERS", "O", "O", "O"]
    offset3 = [Offset(0, 4, "UNKNOWN")]
    assert convert_unknown_bilou(doc, offsets=offset3).ner == ["-", "O", "O", "O", "O"]
def test_find_address_in_paragraph_block():
    texts = ["popo",
             "Zone Industrielle de Rossignol",
             "47110 SAINTE LIVRADE SUR LOT",
             "popo",
             "", "", "", "", "", "", "", "", "", ""]
    expected_result = [[],
                       [Offset(0, 30, "ADDRESS")],
                       [Offset(0, 28, "ADDRESS")],
                       [], [], [], [], []]
    offsets1 = [[], [], [], [], [], [], [], []]
    new_offsets = find_address_in_block_of_paragraphs(texts=texts, offsets=offsets1)
    assert new_offsets == expected_result
    new_offsets2 = find_address_in_block_of_paragraphs(texts=texts, offsets=expected_result)
    assert new_offsets2 == expected_result
    offsets3 = [[], [], [Offset(0, 28, "ADDRESS")], [], [], [], [], []]
    new_offsets3 = find_address_in_block_of_paragraphs(texts=texts, offsets=offsets3)
    assert new_offsets3 == expected_result
    offsets4 = [[], [Offset(0, 30, "ADDRESS")], [], [], [], [], [], []]
    new_offsets4 = find_address_in_block_of_paragraphs(texts=texts, offsets=offsets4)
    assert new_offsets4 == expected_result
def remove_spaces_included_in_offsets(text: str, offsets: List[Offset]) -> List[Offset]:
    """
    If an offset doesn't match a token boundary, spaCy drops its type (the token is marked as unknown).
    This function trims a leading or trailing space from each offset, if any.
    More info -> https://spacy.io/usage/linguistic-features

    Test code:
    ----
    import spacy
    from spacy.gold import GoldParse
    from spacy.tokens import Doc
    nlp = spacy.blank('fr')
    doc2 = nlp('Who is Chaka Khan popo?')
    gold2 = GoldParse(doc2, entities=[(7, 18, 'PERSON')])
    print(gold2.ner)
    ----

    :param text: original text
    :param offsets: list of original offsets for this text
    :return: list of fixed offsets
    """
    result = list()
    for offset in offsets:
        if (offset.start >= 0) and (offset.end - 1 < len(text)) and (offset.start != offset.end):
            new_start = offset.start + 1 if text[offset.start].isspace() else offset.start
            # subtract 1 because the end offset is exclusive in Python slicing
            new_end = offset.end - 1 if text[offset.end - 1].isspace() else offset.end
            result.append(Offset(new_start, new_end, offset.type))
    return result
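# Hedged usage example for remove_spaces_included_in_offsets (illustrative, not
# part of the original test suite). The first offset starts on a space, so its
# start is shifted right by one character; an out-of-bounds offset fails the
# boundary check and is dropped entirely.
def example_remove_spaces_included_in_offsets():
    text = "Jean Dupont est avocat."
    offsets = [Offset(4, 11, "PERS")]  # " Dupont" starts with a space
    fixed = remove_spaces_included_in_offsets(text, offsets)
    assert text[fixed[0].start:fixed[0].end] == "Dupont"
    # out-of-bounds offsets are silently filtered out
    assert remove_spaces_included_in_offsets(text, [Offset(0, 100, "PERS")]) == []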
def get_rg_offset_from_text(self, text: str) -> List[Offset]:
    """
    Extract RG number offsets from a text, if any
    :param text: original text
    :return: offsets as a list
    """
    return [Offset(item.start(), item.end(), "RG") for item in self.pattern.finditer(text)]
def test_remove_spaces():
    text = "Je suis ici et je programme."
    offset = [Offset(3, 8, "TEST")]
    span_original = text[offset[0].start:offset[0].end]
    assert span_original == "suis "
    new_offset = remove_spaces_included_in_offsets(text, offset)
    span_new = text[new_offset[0].start:new_offset[0].end]
    assert span_new == span_original.strip()
def parse_offsets(text: str) -> Offset:
    """
    Convert a line of annotation text to an Offset
    :param text: original line, in the format "<start> <end> <type>"
    :return: the parsed Offset
    """
    item = text.split(' ')
    return Offset(int(item[0]), int(item[1]), item[2])
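# Quick illustrative example for parse_offsets, assuming the annotation line
# format "<start> <end> <type>" with single-space separators:
def example_parse_offsets():
    assert parse_offsets("10 20 PERS") == Offset(10, 20, "PERS")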
def test_social_security():
    valid_id = "2 40 09 93 618 017 05"
    invalid_id = "2 40 09 93 618 017 06"
    assert get_social_security_number(valid_id) == [
        Offset(0, 21, 'SOCIAL_SECURITY_NUMBER')
    ]
    assert get_social_security_number("1" + valid_id) == []
    assert get_social_security_number(invalid_id) == []
def test_random_case_change():
    text = "La Banque est fermée"
    offsets = [Offset(3, 9, "PERS")]
    seed(123)
    results = [random_case_change(text, offsets, 100) for _ in range(1, 500)]
    assert "La Banque est fermée" in results
    assert "La banque est fermée" in results
    assert "La BANQUE est fermée" in results
def get_company_names(text: str) -> List[Offset]:
    """
    Extract company names from text
    :param text: original text
    :return: a list of offsets
    """
    return [Offset(start=t.start(), end=t.end(), type="ORGANIZATION_1") for t in find_corp.finditer(text)]
def get_partie_pers(text: str) -> List[Offset]:
    """
    Extract people names from text
    :param text: original paragraph text
    :return: offsets as a list
    """
    result1 = [
        Offset(start=t.start(), end=t.end(), type="PERS")
        for t in extract_partie_pp_pattern_1.finditer(text)
    ]
    result2 = [
        Offset(start=t.start(), end=t.end(), type="PERS")
        for t in extract_partie_pp_pattern_2.finditer(text)
    ]
    result3 = [
        Offset(start=t.start(), end=t.end(), type="PERS")
        for t in extract_partie_pp_pattern_3.finditer(text)
    ]
    return result1 + result2 + result3
def get_lawyer_name(text: str) -> List[Offset]:
    """
    Extract lawyer names from text
    :param text: original paragraph text
    :return: offsets as a list
    """
    return [
        Offset(t.start(), t.end(), "LAWYER")
        for t in extract_lawyer.finditer(text)
    ]
def get_date(text: str) -> List[Offset]:
    """
    Parse text to retrieve offsets mentioning a date
    :param text: original text
    :return: offsets as a list
    """
    r1 = [
        Offset(t.start(), t.end(), "DATE_1")
        for t in date_pattern_in_letters_regex.finditer(text)
    ]
    r2 = [
        Offset(t.start(), t.end(), "DATE_1")
        for t in date_pattern_in_numbers_regex_1.finditer(text)
    ]
    r3 = [
        Offset(t.start(), t.end(), "DATE_1")
        for t in date_pattern_in_numbers_regex_2.finditer(text)
    ]
    return r1 + r2 + r3
def remove_key_words(text: str, offsets: List[Offset], rate: int) -> Tuple[str, List[Offset]]:
    """
    Modify text to remove some keywords, making the learning harder and the model more robust.
    :param text: original paragraph as a string
    :param offsets: list of extracted offsets
    :param rate: chance, as an integer between 1 and 100, that a keyword is removed
    :return: a tuple (new_text, new_offsets)
    """
    words_to_delete_offsets: List[Offset] = key_words_matcher.get_matches(text=text, tag="TO_DELETE")
    if (len(words_to_delete_offsets) == 0) or (len(offsets) == 0):
        return text, offsets
    detected_spans = dict()
    for offset in offsets:
        span_text = text[offset.start:offset.end]
        if len(span_text) > 0:
            detected_spans[span_text] = offset.type
    if len(detected_spans) == 0:
        return text, offsets
    original_content_offsets_matcher = AcoraMatcher(content=list(detected_spans.keys()), ignore_case=False)
    cleaned_text = list()
    start_selection_offset = 0
    for offset in words_to_delete_offsets:
        if randint(1, 99) < rate:
            # subtract 1 to also remove the space preceding the keyword
            cleaned_text.append(text[start_selection_offset:offset.start - 1])
        else:
            cleaned_text.append(text[start_selection_offset:offset.end])
        start_selection_offset = offset.end
    cleaned_text.append(text[start_selection_offset:len(text)])
    cleaned_text = ''.join(cleaned_text)
    updated_offsets = original_content_offsets_matcher.get_matches(text=cleaned_text, tag="UNKNOWN")
    offsets_to_return = list()
    # restore the original offset type names
    for offset in updated_offsets:
        span_text = cleaned_text[offset.start:offset.end]
        type_name = detected_spans[span_text]
        offsets_to_return.append(Offset(offset.start, offset.end, type_name))
    return cleaned_text, offsets_to_return
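# Hedged usage sketch for remove_key_words. The keyword list behind
# key_words_matcher is defined elsewhere, so the augmented output depends on
# it; only the early-return guarantee (no entity offsets means the input is
# returned untouched) is asserted here.
def example_remove_key_words():
    text = "Monsieur Dupont est condamné."
    assert remove_key_words(text=text, offsets=[], rate=100) == (text, [])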
def clean_unknown_offsets(self, offsets: List[Offset]) -> List[Offset]:
    """
    Remove offsets of unknown type span when there is an overlap with a known offset
    :param offsets: list of offsets mixing known types and unknown ones
    :return: cleaned list of offsets
    """
    result = list()
    sorted_offsets = sorted(offsets, key=lambda o: (o.start, o.end))
    for index, offset in enumerate(sorted_offsets):
        start_offset, end_offset, type_name = offset.start, offset.end, offset.type
        if offset.type == self.unknown_type_name:
            # is first token?
            previous_end_offset = sorted_offsets[index - 1].end if index > 0 else None
            # is last token?
            next_start_offset = sorted_offsets[index + 1].start if index < len(sorted_offsets) - 1 else None
            is_start_offset_ok = (previous_end_offset is None) or (start_offset > previous_end_offset)
            is_end_offset_ok = (next_start_offset is None) or (end_offset < next_start_offset)
            # keep the unknown offset only when it doesn't overlap its neighbours
            if is_start_offset_ok and is_end_offset_ok:
                result.append(Offset(start_offset, end_offset, type_name))
        else:
            result.append(Offset(start_offset, end_offset, type_name))
    return result
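# Hedged usage sketch for clean_unknown_offsets, assuming the instance's
# unknown_type_name is "UNKNOWN": the unknown span overlapping the PERS span is
# dropped, while the isolated one is kept.
def example_clean_unknown_offsets(matcher):
    offsets = [Offset(0, 5, "PERS"),
               Offset(3, 8, "UNKNOWN"),    # overlaps the PERS span -> removed
               Offset(20, 25, "UNKNOWN")]  # touches nothing -> kept
    assert matcher.clean_unknown_offsets(offsets) == [
        Offset(0, 5, "PERS"),
        Offset(20, 25, "UNKNOWN")
    ]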
def get_stripped_offsets(text: str, tag: str) -> Offset:
    """
    Get the offset of the actual text, i.e. the text without the surrounding whitespace
    :param text: a line of text
    :param tag: tag to apply to the text
    :return: an Offset (start, end, tag name)
    """
    stripped_text = text.strip()
    start = text.find(stripped_text)
    end = start + len(stripped_text)
    return Offset(start, end, tag)
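# Quick illustrative example for get_stripped_offsets: two leading spaces shift
# the start to 2, and the stripped text is 21 characters long.
def example_get_stripped_offsets():
    assert get_stripped_offsets("  Cour d'appel de Paris ", "COURT") == Offset(2, 23, "COURT")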
def test_credit_card():
    credit_card_valid = "4474 2054 6736 1295"
    credit_card_invalid = "1265 157309414560"
    assert get_credit_card_number("pop " + credit_card_valid + " apres") == [Offset(4, 24, 'CREDIT_CARD')]
    assert get_credit_card_number("pop " + credit_card_invalid + " apres") == []
    assert get_credit_card_number("1234 1234 1234 1234") == []
    assert get_credit_card_number("1- 1234 1234 1234 1234") == []
    assert get_credit_card_number("1- " + credit_card_valid) == []
def get_bar(text: str) -> List[Offset]:
    """
    Extract offsets mentioning a bar and its city location
    French bar list: http://www.conferencedesbatonniers.com/barreaux/userslist/7-liste-des-barreaux
    :param text: original text
    :return: offsets as a list
    """
    return [
        Offset(t.start(), t.end(), "BAR_1")
        for t in barreau_pattern.finditer(text)
    ]
def get_rg_from_regex(text: str) -> List[Offset]:
    """
    Extract RG number from text when some pattern is found
    :param text: original text
    :return: offset as a list
    """
    offsets = extract_rg_from_text_regex.search(text)
    if offsets is not None:
        return [Offset(offsets.start(), offsets.end(), "RG")]
    else:
        return list()
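# Note: get_rg_from_regex uses .search(), so at most one RG offset is returned
# even when several candidates appear in the text. Hedged illustration:
def example_get_rg_from_regex_single_match():
    offsets = get_rg_from_regex(" RG n° 11/17073 et RG n° 12/00001")
    assert len(offsets) <= 1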
def get_all_unknown_words_offsets(self, text: str) -> List[Offset]:
    """
    Find offsets of all uppercase words.
    :param text: original paragraph text
    :return: offsets as a list
    """
    return [
        Offset(t.start(), t.end(), self.unknown_type_name)
        for t in self.upcase_words_regex.finditer(text)
        if self.predicate_keep_unknown_entities(text=text, start=t.start(), end=t.end())
    ]
def test_match_headers_content():
    config_training = get_config_default()
    xml_path = config_training["xml_unittest_file"]
    header_content_all_cases = parse_xml_header(path=xml_path)
    case_id = list(header_content_all_cases.keys())[0]
    header_content = header_content_all_cases[case_id]
    headers_matcher = MatchValuesFromHeaders(current_header=header_content, threshold_size=3)
    matcher_partie_pp = headers_matcher.get_matcher_of_partie_pp_from_headers()
    text1 = "C'est Catherine ***REMOVED*** qui est responsable de ces faits avec M. LEON ***REMOVED***"
    assert matcher_partie_pp.get_matches(text1, "PERS") == [Offset(6, 29, "PERS")]
def get_extended_names(self, text: str) -> List[Offset]:
    """
    Apply the generated regex patterns to the current paragraph text.
    No computation if there is nothing to find.
    :param text: current original text
    :return: offset list
    """
    if self.dont_detect:
        return list()
    result1 = [
        Offset(t.start(), t.end(), self.type_name)
        for t in self.pattern_title.finditer(text)
    ]
    result2 = [
        Offset(t.start(), t.end(), self.type_name)
        for t in self.pattern_extend_right.finditer(text)
    ]
    result = list(remove_duplicates(result1 + result2))
    return sorted(result, key=lambda o: o.start)