from random import randint
from typing import List, Tuple

# AcoraMatcher, Offset and key_words_matcher are project-local helpers defined elsewhere in this repo.


def remove_key_words(text: str, offsets: List[Offset], rate: int) -> Tuple[str, List[Offset]]:
    """
    Modify text to remove some key words, making the learning harder and the model more robust.
    :param text: original paragraph as a string
    :param offsets: list of extracted offsets
    :param rate: chance as an integer between 1 and 100 that a key word is removed
    :return: a tuple (new_text, offsets)
    """
    words_to_delete_offsets: List[Offset] = key_words_matcher.get_matches(text=text, tag="TO_DELETE")

    if (len(words_to_delete_offsets) == 0) or (len(offsets) == 0):
        return text, offsets

    detected_spans = dict()
    for offset in offsets:
        span_text = text[offset.start:offset.end]
        if len(span_text) > 0:
            detected_spans[span_text] = offset.type

    if len(detected_spans) == 0:
        return text, offsets

    original_content_offsets_matcher = AcoraMatcher(content=list(detected_spans.keys()),
                                                    ignore_case=False)

    cleaned_text = list()
    start_selection_offset = 0
    for offset in words_to_delete_offsets:
        if randint(1, 99) < rate:
            # trim one extra character to also drop the space preceding the keyword
            # (max() guards against a keyword located at the very start of the text)
            cleaned_text.append(text[start_selection_offset:max(offset.start - 1, 0)])
            start_selection_offset = offset.end
        else:
            cleaned_text.append(text[start_selection_offset:offset.end])
            start_selection_offset = offset.end

    cleaned_text.append(text[start_selection_offset:len(text)])
    cleaned_text = ''.join(cleaned_text)

    updated_offsets = original_content_offsets_matcher.get_matches(text=cleaned_text, tag="UNKNOWN")

    offsets_to_return = list()
    # restore original offset type name
    for offset in updated_offsets:
        span_text = cleaned_text[offset.start:offset.end]
        type_name = detected_spans[span_text]
        offsets_to_return.append(Offset(offset.start, offset.end, type_name))

    return cleaned_text, offsets_to_return
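A minimal usage sketch of remove_key_words, assuming the project's Offset type shown above; the text and offsets are made up. With rate=100 the matched keyword is always removed and the entity is re-located in the cleaned text:

text = "Le juge entend Monsieur Dupont"
offsets = [Offset(24, 30, "PERS")]  # "Dupont"
new_text, new_offsets = remove_key_words(text=text, offsets=offsets, rate=100)
# new_text    -> "Le juge entend Dupont"
# new_offsets -> [Offset(15, 21, "PERS")]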
from typing import Dict, List

from spacy.tokens import Doc, Span


def complete_case_annotations(spacy_docs: List[Doc], entity_typename: Dict[str, str]) -> List[Doc]:
    """
    Complete/normalize annotations from the spaCy tagger.
    :param spacy_docs: the spaCy annotations
    :param entity_typename: the dictionary with each occurrence type
    :returns: the updated spacy_docs (for convenience only, as the update is done in place)
    """
    if len(spacy_docs) > 0:
        matcher = AcoraMatcher(content=list(entity_typename.keys()), ignore_case=True)

        doc_text, empty_offsets = zip(*[(spacy_doc.text, []) for spacy_doc in spacy_docs])
        document_addresses_offsets = find_address_in_block_of_paragraphs(texts=list(doc_text),
                                                                         offsets=list(empty_offsets))

        for spacy_doc, doc_address_offset in zip(spacy_docs, document_addresses_offsets):
            matches = matcher.get_matches(text=spacy_doc.text, tag="UNKNOWN")

            matcher_offsets = list()
            for offset in matches:
                span_text = spacy_doc.text[offset.start:offset.end]
                logger.debug(span_text)
                offset.type = entity_typename[span_text.lower()]
                matcher_offsets.append(offset)

            matcher_offsets_normalized = normalize_offsets(offsets=matcher_offsets + doc_address_offset)

            spacy_matcher_offset: List[Span] = list()
            for offset in matcher_offsets_normalized:
                # https://spacy.io/usage/linguistic-features#section-named-entities
                span_doc: Span = spacy_doc.char_span(offset.start, offset.end, label=offset.type)
                if span_doc is not None:
                    # span is None when the offsets cut a token (word incomplete)
                    spacy_matcher_offset.append(span_doc)
                else:
                    logger.error(f"ERROR char offset [{spacy_doc.text[offset.start:offset.end]}] "
                                 f"from [{spacy_doc.text}]")

            spacy_doc.ents = spacy_matcher_offset

    return spacy_docs
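The None check above matters because spaCy's Doc.char_span returns None when the character offsets do not align with token boundaries. A minimal, self-contained sketch (assuming spaCy is installed; the sentence is made up):

import spacy

nlp = spacy.blank("fr")
doc = nlp("Jean Dupont habite Paris")
doc.char_span(0, 11, label="PERS")  # aligns with tokens -> Span over "Jean Dupont"
doc.char_span(0, 3, label="PERS")   # cuts the token "Jean" -> None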
class PostalCodeCity:
    matcher = None

    def __init__(self):
        """
        Build a matcher of postal codes and city names based on a French city dictionary
        """
        postal_code_city_list = list()
        config = get_config_default()
        file = config["postal_code_city"]

        with open(file) as f1:
            for line in f1.readlines():
                fields = line.split(";")
                city = fields[1].strip()
                if len(city) >= 3:
                    postal_code = fields[2].strip()
                    postal_code_city_list.append(postal_code + " " + city)
                    postal_code_city_list.append(city + " (" + postal_code + ")")

        assert len(postal_code_city_list) > 1000
        postal_code_city_list.pop(0)

        self.matcher = AcoraMatcher(list(postal_code_city_list), ignore_case=True)

    def get_matches(self, text: str) -> List[Offset]:
        """
        Find matches of postal codes and city names in a text
        :param text: original text
        :return: list of offsets
        """
        return self.matcher.get_matches(text=text, tag="ADDRESS")
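A hypothetical call, assuming the postal_code_city file referenced by the config is present; the exact matches depend on its content:

matcher = PostalCodeCity()
matcher.get_matches(text="domicilié 75001 Paris")
# -> [Offset(10, 21, "ADDRESS")] if "75001 Paris" appears in the dictionary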
class CourtName:
    court_names = set()
    matcher = None

    def __init__(self):
        """
        Build a matcher of French court names based on a list available in open data
        https://www.data.gouv.fr/fr/datasets/les-statistiques-par-juridiction/#_
        (the dataset contains more information; the list stored here is an extraction)
        """
        config = get_config_default()
        file = config["french_court_names"]

        with open(file) as f1:
            for line in f1.readlines():
                clean_text = line.strip()
                if len(clean_text) > 0:
                    self.court_names.add(clean_text)

        assert len(self.court_names) > 1000

        self.matcher = AcoraMatcher(content=list(self.court_names), ignore_case=True)

    def get_matches(self, text: str) -> List[Offset]:
        """
        Find matches of French court names in a text
        :param text: original text
        :return: list of offsets
        """
        return self.matcher.get_matches(text=text, tag="COURT")
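Usage mirrors PostalCodeCity; since the matcher is built with ignore_case=True, casing variants of a court name are matched as well. A hypothetical call, assuming the court appears in the open data list:

matcher = CourtName()
matcher.get_matches(text="par arrêt de la cour d'appel de Paris")
# -> an Offset tagged "COURT" covering "cour d'appel de Paris"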
def __init__(self): """ Build a matcher of French court names based on a list available in open data https://www.data.gouv.fr/fr/datasets/les-statistiques-par-juridiction/#_ (the list has more data, the one store is an extraction) """ config = get_config_default() file = config["french_court_names"] with open(file) as f1: for line in f1.readlines(): clean_text = line.strip() if len(clean_text) > 0: self.court_names.add(clean_text) assert len(self.court_names) > 1000 self.matcher = AcoraMatcher(content=list(self.court_names), ignore_case=True)
def __init__(self): """ Build a matcher of first name based on a French names dictionary """ postal_code_city_list = list() config = get_config_default() file = config["postal_code_city"] with open(file) as f1: for line in f1.readlines(): fields = line.split(";") city = fields[1].strip() if len(city) >= 3: postal_code = fields[2].strip() postal_code_city_list.append(postal_code + " " + city) postal_code_city_list.append(city + " (" + postal_code + ")") assert len(postal_code_city_list) > 1000 postal_code_city_list.pop(0) self.matcher = AcoraMatcher(list(postal_code_city_list), ignore_case=True)
def get_matcher_of_clerks_from_headers(self) -> AcoraMatcher:
    """
    Create variations of items to search
    :return: a case-sensitive string matcher
    """
    header_content = self.current_header['greffier']
    matcher = AcoraMatcher(content=header_content, ignore_case=False)
    # for content in header_content:
    #     first_name, last_name = get_first_last_name(content)
    #     if len(first_name) > self.threshold_size:
    #         matcher.add(first_name)
    #     if len(last_name) > self.threshold_size:
    #         matcher.add(last_name)
    return matcher
def get_matcher_of_partie_pm_from_headers(self) -> AcoraMatcher:
    """
    Create variations of items to search
    :return: a case-sensitive string matcher
    """
    span_text = list()
    for full_content, short_content in zip(
            self.current_header['defendeur_fullname'] + self.current_header['demandeur_fullname'],
            self.current_header['defendeur_hidden'] + self.current_header['demandeur_hidden']):
        if short_content is None:
            span_text.append(full_content)
    matcher = AcoraMatcher(content=span_text, ignore_case=False)
    return matcher
def get_matcher_of_partie_pp_from_headers(self) -> AcoraMatcher:
    """
    Create variations of items to search
    :return: a case-sensitive string matcher
    """
    # building the matcher this way ensures it does not expect binary data,
    # which may happen if empty arrays are loaded through the update function
    span_text = list()
    for full_content, short_content in zip(
            self.current_header['defendeur_fullname'] + self.current_header['demandeur_fullname'],
            self.current_header['defendeur_hidden'] + self.current_header['demandeur_hidden']):
        if short_content is not None:
            span_text.append(full_content)
            # first_name, last_name = get_first_last_name(full_content)
            # if len(first_name) > self.threshold_size:
            #     matcher.add(first_name)
            # if len(last_name) > self.threshold_size:
            #     matcher.add(last_name)
    matcher = AcoraMatcher(content=span_text, ignore_case=False)
    return matcher
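A minimal sketch of the PP/PM split performed by the two methods above, with a hypothetical current_header dict: parties whose "hidden" form is None are treated as legal persons (PM), the others as natural persons (PP).

current_header = {
    "defendeur_fullname": ["Jean Dupont", "SARL Exemple"],
    "defendeur_hidden": ["J... D...", None],
    "demandeur_fullname": [],
    "demandeur_hidden": [],
}
# get_matcher_of_partie_pp_from_headers -> AcoraMatcher over ["Jean Dupont"]
# get_matcher_of_partie_pm_from_headers -> AcoraMatcher over ["SARL Exemple"]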
def get_all_name_variation(texts: list, offsets: list, threshold_span_size: int) -> list:
    """
    Search for any variation of known entities
    :param texts: original texts
    :param offsets: discovered offsets
    :param threshold_span_size: minimum size of a name (first / last) to be added to the list
    :return: discovered offsets
    """
    pp_text_span = list()
    pm_text_span = list()
    for current_offsets, text in zip(offsets, texts):
        for offset in current_offsets:
            start_offset, end_offset, type_name = offset
            text_span = text[start_offset:end_offset].strip()
            if len(text_span) > 0:
                if type_name == "PERS":
                    pp_text_span.append(text_span)
                    first_name, last_name = get_first_last_name(text_span)
                    first_name = first_name.strip()
                    last_name = last_name.strip()
                    if len(first_name) > threshold_span_size:
                        pp_text_span.append(first_name)
                    if len(last_name) > threshold_span_size:
                        pp_text_span.append(last_name)
                if type_name == "ORGANIZATION":
                    pm_text_span.append(text_span)
                    short_org_name = remove_org_type(text_span).strip()
                    if (len(short_org_name) > 0) and (short_org_name != text_span):
                        pm_text_span.append(short_org_name)

    pp_matcher = AcoraMatcher(content=pp_text_span, ignore_case=True)
    pm_matcher = AcoraMatcher(content=pm_text_span, ignore_case=True)

    results = list()
    for text, offset in zip(texts, offsets):
        results.append(pp_matcher.get_matches(text=text, tag="PERS") +
                       pm_matcher.get_matches(text=text, tag="ORGANIZATION") +
                       offset)
    return results
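A worked example with made-up tuple offsets: from a single PERS annotation on "Jean Dupont", the function also learns the sub-spans "Jean" and "Dupont" (both longer than the threshold) and finds their other occurrences. The exact offsets returned depend on how the project's AcoraMatcher reports overlapping matches.

texts = ["Jean Dupont rencontre Dupont"]
offsets = [[(0, 11, "PERS")]]
results = get_all_name_variation(texts=texts, offsets=offsets, threshold_span_size=3)
# results[0] contains the original tuple plus PERS matches for
# "Jean Dupont", "Jean" and both occurrences of "Dupont"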
class FirstName:
    first_name_dict = None
    matcher = None

    def __init__(self, ignore_case: bool):
        """
        Build a matcher of first names based on a French names dictionary
        :param ignore_case: True to ignore case during matching
        :return: Acora matcher
        """
        config = get_config_default()
        file1 = config["first_name_dict_1"]
        file2 = config["first_name_dict_2"]

        first_names = set()
        with open(file1) as f1:
            for line in f1.readlines():
                fields = line.split(";")
                # all names start with an uppercase letter and end with a space
                text = fields[3].strip()
                if len(text) >= 4:
                    first_names.add(text)

        with open(file2, encoding="ISO-8859-1") as f2:
            for line in f2.readlines():
                fields = line.split(";")
                text = fields[0].strip()
                if len(text) >= 4:
                    first_names.add(get_title_case(text))

        # frequent words that also appear as first names and would cause false matches
        to_remove = ["Elle", "France", "Mercedes", "Paris", "Alger", "Oran", "Sans"]

        for item_to_remove in to_remove:
            first_names.remove(item_to_remove)

        self.first_name_dict = first_names

        self.matcher = AcoraMatcher(content=list(self.first_name_dict), ignore_case=ignore_case)

    def get_matches(self, text: str) -> List[Offset]:
        """
        Find matches of first names in a text
        :param text: original text
        :return: list of offsets
        """
        offsets = self.matcher.get_matches(text=text, tag="PERS")
        # dictionary entries end with a space, so shift the end offset back by one
        results = [Offset(offset.start, offset.end - 1, offset.type) for offset in offsets]
        return results

    def contain_first_names(self, text: str) -> bool:
        """
        Check if a text contains a known first name
        :param text: original text
        :return: True if it contains a first name
        """
        matches = self.get_matches(text=text)
        for offset in matches:
            if (offset.end == len(text) - 1) or (not text[offset.end + 1].isalpha()):
                return True
        return False
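A hypothetical use of contain_first_names; the result depends on the configured dictionary files, so "Jean" being listed is an assumption. The boundary test after the match ensures the first name is a whole word, not a prefix of a longer one.

matcher = FirstName(ignore_case=False)
matcher.contain_first_names(text="Jean Dupont")  # True if "Jean " is in the dictionary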
import regex

# tail of the remove_org_type_pattern = regex.compile(...) definition;
# its pattern argument is truncated in this excerpt
                                        flags=regex.VERSION1 | regex.IGNORECASE)


def remove_org_type(original_text: str) -> str:
    """
    Remove the corporation type from a company name
    :param original_text: name of the company, including its type
    :return: the cleaned string
    """
    return remove_org_type_pattern.sub(repl="", string=original_text).strip()


key_words_matcher = AcoraMatcher(content=["Monsieur", "Madame", "Mme", "monsieur",
                                          "madame", "la société", "Me", "Maitre", "Maître"],
                                 ignore_case=False)
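A quick check of the keyword matcher defined above; how the project's AcoraMatcher handles word boundaries is not shown here, so the offsets are indicative only:

matches = key_words_matcher.get_matches(text="la société Exemple et Madame Dupont",
                                        tag="TO_DELETE")
# -> TO_DELETE offsets for "la société" and "Madame"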
import regex


class MatchDoubfulMwe:
    unknown_type_name = "UNKNOWN"
    pattern = r"(?!M\. |\bM\b |Mme |Mlle |(M|m)onsieur |(M|m)adame |(M|m)ademoiselle )" \
              r"[A-ZÉÈ\-]+\w*" \
              r"( [A-ZÉÈ\-]+\w*)*"
    upcase_words_regex = regex.compile(pattern=pattern, flags=regex.VERSION1)
    first_name_matcher = FirstName(ignore_case=False)
    mister_matcher = AcoraMatcher(content=["monsieur", "madame", "Mme ", "Monsieur",
                                           "Madame", " M.", " M ", " mme "],
                                  ignore_case=False)

    def add_unknown_words_offsets(self, texts: list, offsets: list) -> list:
        """
        Add offsets of UNKNOWN words
        :param texts: list of original texts
        :param offsets: list of lists of offsets
        :return: list of lists of offsets, including the offsets of unknown words
        """
        result = list()
        for text, current_offsets in zip(texts, offsets):
            new_offset = self.get_unknown_words_offsets(text=text, offsets=current_offsets)
            result.append(new_offset)
        return result

    def get_unknown_words_offsets(self, text: str, offsets: list) -> list:
        """
        Add offsets of unknown uppercase words to the existing ones
        :param text: original text
        :param offsets: known offsets
        :return: offsets as a list
        """
        unknown_offsets = self.get_all_unknown_words_offsets(text=text)
        all_offsets = offsets + unknown_offsets
        return self.clean_unknown_offsets(offsets=all_offsets)

    def get_all_unknown_words_offsets(self, text: str) -> list:
        """
        Find the offsets of all uppercase words.
        :param text: original paragraph text
        :return: offsets as a list
        """
        return [(t.start(), t.end(), self.unknown_type_name)
                for t in self.upcase_words_regex.finditer(text)
                if self.predicate_keep_unknown_entities(text=text, start=t.start(), end=t.end())]

    def predicate_keep_unknown_entities(self, text: str, start: int, end: int) -> bool:
        """
        Decide if an entity should be kept. Two rules: it contains a first name,
        or it is preceded by Mister / Miss / ...
        :param text: original text
        :param start: offset start
        :param end: offset end
        :return: True if the entity should be kept
        """
        contain_first_name = self.first_name_matcher.contain_first_names(text=text[start:end])
        if start >= 2:
            new_start = max(0, start - 9)
            previous_token = text[new_start:start]
            contain_mister = len(self.mister_matcher.get_matches(text=previous_token,
                                                                 tag="UNKNOWN")) > 0
        else:
            contain_mister = False
        return contain_first_name or contain_mister

    def clean_unknown_offsets(self, offsets: list) -> list:
        """
        Remove offsets of UNKNOWN spans when they overlap with a known offset
        :param offsets: the old known offsets plus the new UNKNOWN ones
        :return: cleaned offsets
        """
        result = list()
        sorted_offsets = sorted(offsets, key=lambda tup: (tup[0], tup[1]))

        for (index, (start_offset, end_offset, type_name)) in enumerate(sorted_offsets):
            if type_name == self.unknown_type_name:
                # is first token?
                if index > 0:
                    previous_start_offset, previous_end_offset, previous_type_name = \
                        sorted_offsets[index - 1]
                else:
                    previous_start_offset, previous_end_offset, previous_type_name = None, None, None

                # is last token?
                if index < len(sorted_offsets) - 1:
                    next_start_offset, next_end_offset, next_type_name = sorted_offsets[index + 1]
                else:
                    next_start_offset, next_end_offset, next_type_name = None, None, None

                is_start_offset_ok = (((previous_end_offset is not None) and
                                       (start_offset > previous_end_offset)) or
                                      (previous_end_offset is None))

                is_end_offset_ok = (((next_start_offset is not None) and
                                     (end_offset < next_start_offset)) or
                                    (next_start_offset is None))

                # keep the UNKNOWN span only when it does not overlap its neighbours
                if is_start_offset_ok and is_end_offset_ok:
                    result.append((start_offset, end_offset, type_name))
            else:
                result.append((start_offset, end_offset, type_name))

        return result
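A worked example of the overlap rule in clean_unknown_offsets (instantiating the class assumes the first-name dictionaries are available, since they are loaded at class-definition time): the UNKNOWN span overlaps the PERS span, so only the PERS span survives.

m = MatchDoubfulMwe()
m.clean_unknown_offsets(offsets=[(0, 11, "PERS"), (5, 11, "UNKNOWN")])
# -> [(0, 11, "PERS")]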
import pickle
import warnings


class FrequentEntities(object):
    matcher = None
    frequent_entities_dict = None

    def __init__(self, path_trainset: str, threshold_occurrences: int,
                 type_name_to_not_load: list, load_data: bool):
        """
        Build an Acora matcher based on the dict of frequent entities
        :param path_trainset: path to a file storing the entities
        :param threshold_occurrences: minimum number of occurrences of an entity
        :param type_name_to_not_load: types of entities that should not be loaded, to avoid fake matches
        :param load_data: boolean to decide if data should be loaded or not
        :return: an Acora matcher
        """
        if load_data:
            self.frequent_entities_dict = self.__read_frequent_entities(
                path_trainset=path_trainset,
                threshold_occurrences=threshold_occurrences,
                type_name_to_not_load=type_name_to_not_load)
            self.matcher = AcoraMatcher(content=list(self.frequent_entities_dict.keys()),
                                        ignore_case=True)
        else:
            # dummy pattern that should never match, so the matcher is never built empty
            self.matcher = AcoraMatcher(content=["!@#$%^&*()"], ignore_case=True)

    @classmethod
    def test_builder(cls, content: dict):
        """
        Build an instance of this object for tests.
        In particular, don't try to read saved data.
        :param content: a dictionary of entities to load
        :return: an instance of the FrequentEntities class
        """
        instance = FrequentEntities(path_trainset="",
                                    threshold_occurrences=0,
                                    load_data=False,
                                    type_name_to_not_load=[])
        instance.frequent_entities_dict = content
        instance.matcher = AcoraMatcher(content=list(content.keys()), ignore_case=True)
        return instance

    @staticmethod
    def __read_frequent_entities(path_trainset: str, threshold_occurrences: int,
                                 type_name_to_not_load: list) -> dict:
        """
        Analyze recognized entities and return those over a defined threshold
        in a dict entity -> type_name
        """
        try:
            with open(path_trainset, 'rb') as f:
                data = pickle.load(file=f)

                def get_default_dict_value() -> dict:
                    default_dict_value = dict([(token, set()) for token in entity_types])
                    # default_dict_value['general_count'] = 0
                    return default_dict_value

                exhaustive_dict = dict()
                for case_id, text, entities in data:
                    for start_offset, end_offset, type_name in entities:
                        entity_span = text[start_offset:end_offset].lower()
                        current_count = exhaustive_dict.get(entity_span, get_default_dict_value())
                        current_count[type_name].add(case_id)
                        exhaustive_dict[entity_span] = current_count

                # keep, for each span, the type seen in the highest number of distinct cases
                final_list = list()
                for entity_span, dict_counts in exhaustive_dict.items():
                    max_count = 0
                    max_type_name = None
                    for type_name, case_ids in dict_counts.items():
                        current_count = len(case_ids)
                        if current_count > max_count:
                            max_type_name = type_name
                            max_count = current_count
                    if (max_count > threshold_occurrences) and \
                            (len(entity_span) > 3) and \
                            (max_type_name not in type_name_to_not_load):
                        final_list.append((entity_span, max_type_name))

                return dict(final_list)
        except Exception:
            warnings.warn("Empty dict of frequent entities", Warning)
            return dict()

    def get_matches(self, text: str) -> list:
        """
        Find matches of frequent entities in the provided text
        :param text: original text
        :return: a list of offsets
        """
        match_results = self.matcher.findall(text)
        entities = list()
        for match_text, start_offset in match_results:
            end_offset = start_offset + len(match_text)
            entity_span = text[start_offset:end_offset]
            # end_offset is one character after the end of the selection,
            # so it can be equal to the offset of the last character of the text + 1
            last_char_ok = (end_offset == len(text)) or (not text[end_offset].isalnum())
            first_char_ok = (start_offset == 0 or not text[start_offset - 1].isalnum()) and \
                            (text[start_offset].isupper() or text[start_offset].isdecimal())
            if first_char_ok and last_char_ok:
                type_name = self.frequent_entities_dict[entity_span.lower()]
                entities.append((start_offset, end_offset, type_name))
        return entities
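The test builder makes it easy to exercise the boundary checks in get_matches without pickled data; "acme corp" here is a made-up entry:

freq = FrequentEntities.test_builder(content={"acme corp": "ORGANIZATION"})
freq.get_matches(text="contrat signé par Acme Corp.")
# -> [(18, 27, "ORGANIZATION")]: the span starts with an uppercase letter
#    and both neighbouring characters are non-alphanumeric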