def remove_key_words(text: str, offsets: List[Offset],
                     rate: int) -> Tuple[str, List[Offset]]:
    """
    Modify text to remove some key words, making the learning harder and the model more robust.
    :param text: original paragraph as a string
    :param offsets: list of extracted offsets
    :param rate: chance as an integer between 1 and 100 that a key word is removed
    :return: a tuple (new_text, offsets)
    """
    words_to_delete_offsets: List[Offset] = key_words_matcher.get_matches(
        text=text, tag="TO_DELETE")

    # nothing to delete or nothing annotated: return the input untouched
    if (len(words_to_delete_offsets) == 0) or (len(offsets) == 0):
        return text, offsets

    # map each annotated span text to its entity type, so the offsets can be
    # rebuilt on the modified text by exact string matching
    detected_spans = dict()
    for offset in offsets:
        span_text = text[offset.start:offset.end]
        if len(span_text) > 0:
            detected_spans[span_text] = offset.type

    if len(detected_spans) == 0:
        return text, offsets

    original_content_offsets_matcher = AcoraMatcher(content=list(
        detected_spans.keys()),
                                                    ignore_case=False)

    cleaned_text = list()
    start_selection_offset = 0
    for offset in words_to_delete_offsets:
        if randint(1, 99) < rate:
            # also drop the character right before the keyword (usually a
            # space); max() clamps the bound to 0 so a keyword starting at
            # position 0 doesn't produce a negative slice index (text[0:-1]),
            # which would wrongly keep almost the whole text
            cleaned_text.append(
                text[start_selection_offset:max(0, offset.start - 1)])
            start_selection_offset = offset.end
        else:
            cleaned_text.append(text[start_selection_offset:offset.end])
            start_selection_offset = offset.end

    cleaned_text.append(text[start_selection_offset:len(text)])

    cleaned_text = ''.join(cleaned_text)

    # locate the annotated spans again inside the shortened text
    updated_offsets = original_content_offsets_matcher.get_matches(
        text=cleaned_text, tag="UNKNOWN")

    offsets_to_return = list()

    # restore original offset type name
    for offset in updated_offsets:
        span_text = cleaned_text[offset.start:offset.end]
        type_name = detected_spans[span_text]
        offsets_to_return.append(Offset(offset.start, offset.end, type_name))

    return cleaned_text, offsets_to_return
Example #2
0
def complete_case_annotations(spacy_docs: List[Doc],
                              entity_typename: Dict[str, str]) -> List[Doc]:
    """
    Complete/Normalize annotations from the spacy tagger.

    :param spacy_docs: the spacy annotations
    :param entity_typename: the dictionary with each occurence type
    :returns: the updated spacy_annotions (for convenience only, as the update is inplace)
    """
    if len(spacy_docs) == 0:
        return spacy_docs

    matcher = AcoraMatcher(content=list(entity_typename.keys()),
                           ignore_case=True)

    texts = [doc.text for doc in spacy_docs]
    document_addresses_offsets = find_address_in_block_of_paragraphs(
        texts=texts, offsets=[[] for _ in spacy_docs])

    for spacy_doc, doc_address_offset in zip(spacy_docs,
                                             document_addresses_offsets):
        matcher_offsets = []
        for match in matcher.get_matches(text=spacy_doc.text, tag="UNKNOWN"):
            matched_text = spacy_doc.text[match.start:match.end]
            logger.debug(matched_text)
            # replace the placeholder tag by the known entity type
            match.type = entity_typename[matched_text.lower()]
            matcher_offsets.append(match)

        normalized_offsets = normalize_offsets(
            offsets=matcher_offsets + doc_address_offset)

        spacy_spans: List[Span] = []
        for item in normalized_offsets:
            # https://spacy.io/usage/linguistic-features#section-named-entities
            span: Span = spacy_doc.char_span(item.start,
                                             item.end,
                                             label=item.type)
            if span is None:
                # char_span returns None when the offsets don't fall on token
                # boundaries (incomplete word)
                logger.error(
                    f"ERROR char offset [{spacy_doc.text[item.start:item.end]}] "
                    f"from [{spacy_doc.text}]")
            else:
                spacy_spans.append(span)

        spacy_doc.ents = spacy_spans

    return spacy_docs
Example #3
0
class PostalCodeCity:
    # AcoraMatcher over "postal_code city" / "city (postal_code)" strings,
    # built by __init__
    matcher = None

    def __init__(self):
        """
        Build a matcher of postal codes and city names from a French
        open-data file (";"-separated records: field 1 is the city,
        field 2 the postal code).
        """
        config = get_config_default()
        entries = list()
        with open(config["postal_code_city"]) as data_file:
            for raw_line in data_file:
                fields = raw_line.split(";")
                city = fields[1].strip()
                if len(city) < 3:
                    continue
                postal_code = fields[2].strip()
                entries.append(postal_code + " " + city)
                entries.append(city + " (" + postal_code + ")")
        assert len(entries) > 1000
        # the first generated entry is discarded — presumably built from the
        # file header line; TODO confirm against the data file
        entries.pop(0)
        self.matcher = AcoraMatcher(list(entries), ignore_case=True)

    def get_matches(self, text: str) -> List[Offset]:
        """
        Find match of postal code and city names in a text
        :param text: original text
        :return: list of offsets
        """
        return self.matcher.get_matches(text=text, tag="ADDRESS")
Example #4
0
class CourtName:
    # kept as class attributes for backward compatibility; each instance
    # rebinds them in __init__
    court_names = set()
    matcher = None

    def __init__(self):
        """
        Build a matcher of French court names based on a list available in open data
        https://www.data.gouv.fr/fr/datasets/les-statistiques-par-juridiction/#_
        (the list has more data, the one store is an extraction)
        """
        config = get_config_default()
        file = config["french_court_names"]

        # bind a fresh per-instance set: the original code mutated the
        # class-level set, so every new instantiation kept accumulating
        # entries shared by all instances
        self.court_names = set()
        with open(file) as f1:
            for line in f1.readlines():
                clean_text = line.strip()
                if len(clean_text) > 0:
                    self.court_names.add(clean_text)
        assert len(self.court_names) > 1000
        self.matcher = AcoraMatcher(content=list(self.court_names),
                                    ignore_case=True)

    def get_matches(self, text: str) -> List[Offset]:
        """
        Find match of French court names in a text
        :param text: original text
        :return: list of offsets
        """
        return self.matcher.get_matches(text=text, tag="COURT")
Example #5
0
    def __init__(self):
        """
        Build a matcher of French court names based on a list available in open data
        https://www.data.gouv.fr/fr/datasets/les-statistiques-par-juridiction/#_
        (the list has more data, the one store is an extraction)
        """
        config = get_config_default()
        file = config["french_court_names"]

        # bind a fresh per-instance set so repeated instantiations don't
        # accumulate entries in a shared class-level set
        self.court_names = set()
        with open(file) as f1:
            for line in f1.readlines():
                clean_text = line.strip()
                if len(clean_text) > 0:
                    self.court_names.add(clean_text)
        assert len(self.court_names) > 1000
        self.matcher = AcoraMatcher(content=list(self.court_names),
                                    ignore_case=True)
Example #6
0
    def __init__(self):
        """
        Build a matcher of postal codes and city names from a French
        open-data file (";"-separated records: field 1 is the city,
        field 2 the postal code).
        """
        config = get_config_default()
        entries = list()
        with open(config["postal_code_city"]) as data_file:
            for raw_line in data_file:
                fields = raw_line.split(";")
                city = fields[1].strip()
                if len(city) < 3:
                    continue
                postal_code = fields[2].strip()
                entries.append(postal_code + " " + city)
                entries.append(city + " (" + postal_code + ")")
        assert len(entries) > 1000
        # the first generated entry is discarded — presumably built from the
        # file header line; TODO confirm against the data file
        entries.pop(0)
        self.matcher = AcoraMatcher(list(entries), ignore_case=True)
 def __init__(self, path_trainset: str, threshold_occurrences: int,
              type_name_to_not_load: list, load_data: bool):
     """
     Build an Acora matcher based on the dict of frequent entities
     :param path_trainset: path to a file storing the entity
     :param threshold_occurrences: minimum number of occurences of the entity
     :param type_name_to_not_load: type of entities that should not be loaded to avoid fake match
     :param load_data: boolean to decide if data should be loaded or not
     :return: an Acora matcher matcher
     """
     if not load_data:
         # placeholder pattern that should never match any real text
         self.matcher = AcoraMatcher(content=["!@#$%^&*()"],
                                     ignore_case=True)
         return

     self.frequent_entities_dict = self.__read_frequent_entities(
         path_trainset=path_trainset,
         threshold_occurrences=threshold_occurrences,
         type_name_to_not_load=type_name_to_not_load)
     self.matcher = AcoraMatcher(
         content=list(self.frequent_entities_dict.keys()),
         ignore_case=True)
Example #8
0
 def get_matcher_of_clerks_from_headers(self) -> AcoraMatcher:
     """
     Build a matcher over the clerk ("greffier") names extracted from the
     current header.
     :return: a case-sensitive AcoraMatcher (ignore_case is False)
     """
     clerk_names = self.current_header['greffier']
     return AcoraMatcher(content=clerk_names, ignore_case=False)
    def __init__(self, ignore_case: bool):
        """
        Build a matcher of first names based on two French name dictionaries.
        :param ignore_case: True to ignore case during matching
        """
        config = get_config_default()

        file1 = config["first_name_dict_1"]
        file2 = config["first_name_dict_2"]

        first_names = set()
        with open(file1) as f1:
            for line in f1.readlines():
                fields = line.split(";")
                # all names start with a Upcase letter and finishes with a space
                text = fields[3].strip()
                if len(text) >= 4:
                    first_names.add(text)

        with open(file2, encoding="ISO-8859-1") as f2:
            for line in f2.readlines():
                fields = line.split(";")
                text = fields[0].strip()
                if len(text) >= 4:
                    first_names.add(get_title_case(text))

        # common words which also exist as first names and would generate
        # false positives
        to_remove = [
            "Elle", "France", "Mercedes", "Paris", "Alger", "Oran", "Sans"
        ]

        for item_to_remove in to_remove:
            # discard() instead of remove(): a dictionary update that no
            # longer contains one of these words must not raise KeyError
            first_names.discard(item_to_remove)

        self.first_name_dict = first_names
        self.matcher = AcoraMatcher(content=list(self.first_name_dict),
                                    ignore_case=ignore_case)
 def test_builder(cls, content: dict):
     """
     Create an instance for unit tests: no stored data is read, the
     provided entity dictionary is injected directly.
     :param content: a dictionary of entities to load
     :return: an instance of FrequentEntities class
     """
     instance = FrequentEntities(path_trainset="",
                                 threshold_occurrences=0,
                                 type_name_to_not_load=[],
                                 load_data=False)
     instance.frequent_entities_dict = content
     instance.matcher = AcoraMatcher(content=list(content.keys()),
                                     ignore_case=True)
     return instance
Example #11
0
    def get_matcher_of_partie_pm_from_headers(self) -> AcoraMatcher:
        """
        Build a matcher over the full names of parties from the current
        header that have no hidden/short form.
        :return: a case-sensitive AcoraMatcher (ignore_case is False)
        """
        full_names = (self.current_header['defendeur_fullname'] +
                      self.current_header['demandeur_fullname'])
        hidden_names = (self.current_header['defendeur_hidden'] +
                        self.current_header['demandeur_hidden'])

        # keep only the names without a hidden counterpart
        span_text = [
            full for full, hidden in zip(full_names, hidden_names)
            if hidden is None
        ]

        return AcoraMatcher(content=span_text, ignore_case=False)
Example #12
0
    def get_matcher_of_partie_pp_from_headers(self) -> AcoraMatcher:
        """
        Build a matcher over the full names of parties from the current
        header that do have a hidden/short form.
        :return: a case-sensitive AcoraMatcher (ignore_case is False)
        """
        # building the matcher from a plain list of str guarantees that it
        # doesn't expect binary data — that may happen if we load empty
        # arrays through the update function for instance
        full_names = (self.current_header['defendeur_fullname'] +
                      self.current_header['demandeur_fullname'])
        hidden_names = (self.current_header['defendeur_hidden'] +
                        self.current_header['demandeur_hidden'])

        # keep only the names which have a hidden counterpart
        span_text = [
            full for full, hidden in zip(full_names, hidden_names)
            if hidden is not None
        ]

        return AcoraMatcher(content=span_text, ignore_case=False)
def get_all_name_variation(texts: list, offsets: list,
                           threshold_span_size: int) -> list:
    """
    Search for any variation of known entities
    :param texts: original text
    :param offsets: discovered offsets
    :param threshold_span_size: minimum size of a name (first / last) to be added to the list
    :return: discovered offsets
    """
    person_spans = list()
    org_spans = list()
    for current_offsets, text in zip(offsets, texts):
        for start_offset, end_offset, type_name in current_offsets:
            text_span = text[start_offset:end_offset].strip()
            if not text_span:
                continue

            if type_name == "PERS":
                person_spans.append(text_span)
                # also index first / last name alone when they are long enough
                first_name, last_name = get_first_last_name(text_span)
                first_name = first_name.strip()
                last_name = last_name.strip()
                if len(first_name) > threshold_span_size:
                    person_spans.append(first_name)
                if len(last_name) > threshold_span_size:
                    person_spans.append(last_name)

            if type_name == "ORGANIZATION":
                org_spans.append(text_span)
                # also index the organization name without its company type
                short_org_name = remove_org_type(text_span).strip()
                if short_org_name and short_org_name != text_span:
                    org_spans.append(short_org_name)

    pp_matcher = AcoraMatcher(content=person_spans, ignore_case=True)
    pm_matcher = AcoraMatcher(content=org_spans, ignore_case=True)

    return [
        pp_matcher.get_matches(text=text, tag="PERS") +
        pm_matcher.get_matches(text=text, tag="ORGANIZATION") + offset
        for text, offset in zip(texts, offsets)
    ]
class FirstName:
    # set of known first names (filled by __init__)
    first_name_dict = None
    # AcoraMatcher over first_name_dict (filled by __init__)
    matcher = None

    def __init__(self, ignore_case: bool):
        """
        Build a matcher of first names based on two French name dictionaries.
        :param ignore_case: True to ignore case during matching
        """
        config = get_config_default()

        file1 = config["first_name_dict_1"]
        file2 = config["first_name_dict_2"]

        first_names = set()
        with open(file1) as f1:
            for line in f1.readlines():
                fields = line.split(";")
                # all names start with a Upcase letter and finishes with a space
                text = fields[3].strip()
                if len(text) >= 4:
                    first_names.add(text)

        with open(file2, encoding="ISO-8859-1") as f2:
            for line in f2.readlines():
                fields = line.split(";")
                text = fields[0].strip()
                if len(text) >= 4:
                    first_names.add(get_title_case(text))

        # common words which also exist as first names and would generate
        # false positives
        to_remove = [
            "Elle", "France", "Mercedes", "Paris", "Alger", "Oran", "Sans"
        ]

        for item_to_remove in to_remove:
            # discard() instead of remove(): a dictionary update that no
            # longer contains one of these words must not raise KeyError
            first_names.discard(item_to_remove)

        self.first_name_dict = first_names
        self.matcher = AcoraMatcher(content=list(self.first_name_dict),
                                    ignore_case=ignore_case)

    def get_matches(self, text: str) -> List[Offset]:
        """
        Find match of first name in a text
        :param text: original text
        :return: list of offsets
        """
        offsets = self.matcher.get_matches(text=text, tag="PERS")
        # names include a space so we fix the point by removing 1 to the offset
        results = [
            Offset(offset.start, offset.end - 1, offset.type)
            for offset in offsets
        ]
        return results

    def contain_first_names(self, text: str) -> bool:
        """
        Check if a text contains a known first name
        :param text: original text
        :return: True if it contains a first name
        """
        matches = self.get_matches(text=text)
        for offset in matches:
            # accept the match only when it reaches the end of the text or is
            # followed by a non-letter character (whole word, not a prefix)
            if (offset.end
                    == len(text) - 1) or (not text[offset.end + 1].isalpha()):
                return True

        return False
Example #15
0
                                        flags=regex.VERSION1
                                        | regex.IGNORECASE)


def remove_org_type(original_text: str) -> str:
    """
    Remove corporation type name
    :param original_text: Name of company included its type
    :return: the cleaned string
    """
    without_type = remove_org_type_pattern.sub(repl="", string=original_text)
    return without_type.strip()


# case-sensitive matcher of French courtesy / role key words; used by
# remove_key_words() (tag "TO_DELETE") to randomly drop them from training text
key_words_matcher = AcoraMatcher(content=[
    "Monsieur", "Madame", "Mme", "monsieur", "madame", "la société", "Me",
    "Maitre", "Maître"
],
                                 ignore_case=False)


# NOTE(review): this duplicate of remove_key_words() appears truncated — the
# visible body stops after the matcher call and never returns the promised
# tuple; confirm against the original source before relying on it.
def remove_key_words(text: str, offsets: list, rate: int) -> tuple:
    """
    Modify text to remove some key words, making the learning harder and the model more robust.
    :param text: original paragraph as a string
    :param offsets: list of extracted offsets
    :param rate: chance as an integer between 1 and 100 that a key word is removed
    :return: a tuple (new_text, offsets)
    """
    words_to_delete_offsets: list = key_words_matcher.get_matches(
        text=text, tag="TO_DELETE")
class MatchDoubfulMwe:
    """
    Heuristic detection of doubtful multi-word expressions: groups of
    upper-case words that look like person names, kept when they contain a
    known first name or are preceded by a courtesy title.
    """
    # type name assigned to candidate spans before they are classified
    unknown_type_name = "UNKNOWN"
    # upper-case word group not preceded by a courtesy title (M., Mme, ...)
    pattern = "(?!M\. |\\bM\\b |Mme |Mlle |(M|m)onsieur |(M|m)adame |(M|m)ademoiselle )" \
              "[A-ZÉÈ\-]+\w*" \
              "( [A-ZÉÈ\-]+\w*)*"
    upcase_words_regex = regex.compile(pattern=pattern, flags=regex.VERSION1)
    # case-sensitive first-name matcher, built once at class definition time
    first_name_matcher = FirstName(ignore_case=False)
    # courtesy titles searched in the characters preceding a candidate span
    mister_matcher = AcoraMatcher(content=[
        "monsieur", "madame", "Mme ", "Monsieur", "Madame", " M.", " M ",
        " mme "
    ],
                                  ignore_case=False)

    def add_unknown_words_offsets(self, texts: list, offsets: list) -> list:
        """
        Add offsets of UNKNOWN words
        :param texts: list of original texts
        :param offsets: list of list of offsets
        :return: list of list of offsets including offset of unknown words
        """
        result = list()
        for text, current_offsets in zip(texts, offsets):
            new_offset = self.get_unknown_words_offsets(
                text=text, offsets=current_offsets)
            result.append(new_offset)
        return result

    def get_unknown_words_offsets(self, text: str, offsets: list) -> list:
        """
        Add unknown upcase words offset to existing ones
        :param text: original text
        :param offsets: known offset
        :return: offsets as a list
        """
        unknown_offsets = self.get_all_unknown_words_offsets(text=text)
        all_offsets = offsets + unknown_offsets
        return self.clean_unknown_offsets(offsets=all_offsets)

    def get_all_unknown_words_offsets(self, text: str) -> list:
        """
        Find offsets of all words in upcase.
        :param text: original paragraph text
        :return: offsets as a list of (start, end, type_name) tuples
        """
        return [(t.start(), t.end(), self.unknown_type_name)
                for t in self.upcase_words_regex.finditer(text)
                if self.predicate_keep_unknown_entities(
                    text=text, start=t.start(), end=t.end())]

    def predicate_keep_unknown_entities(self, text: str, start: int,
                                        end: int) -> bool:
        """
        Decides if an entity should be kept.
        2 rules : contains a first name or preceded by Mister / Miss / ...
        :param text: original text
        :param start: offset start
        :param end: offset end
        :return: True if entity should be kept
        """
        contain_first_name = self.first_name_matcher.contain_first_names(
            text=text[start:end])

        # look for a courtesy title in the (up to 9) characters before the span
        if start >= 2:
            new_start = max(0, start - 9)
            previous_token = text[new_start:start]
            contain_mister = len(
                self.mister_matcher.get_matches(text=previous_token,
                                                tag="UNKNOWN")) > 0
        else:
            contain_mister = False

        return contain_first_name or contain_mister

    def clean_unknown_offsets(self, offsets: list) -> list:
        """
        Remove offsets of unknown type span when there is an overlap with a known offset
        :param offsets: cleaned offsets with old known offsets and the new ones
        :return: filtered offsets as a list of (start, end, type_name) tuples
        """
        result = list()
        # sort by start then end so each offset only needs to be checked
        # against its immediate neighbours
        sorted_offsets = sorted(offsets, key=lambda tup: (tup[0], tup[1]))

        for (index, (start_offset, end_offset,
                     type_name)) in enumerate(sorted_offsets):
            if type_name == self.unknown_type_name:

                # is first token?
                if index > 0:
                    previous_start_offset, previous_end_offset, previous_type_name = sorted_offsets[
                        index - 1]
                else:
                    previous_start_offset, previous_end_offset, previous_type_name = None, None, None

                # is last token?
                if index < len(sorted_offsets) - 1:
                    next_start_offset, next_end_offset, next_type_name = sorted_offsets[
                        index + 1]
                else:
                    next_start_offset, next_end_offset, next_type_name = None, None, None

                # unknown span must start strictly after its predecessor ends
                is_start_offset_ok = (((previous_end_offset is not None) and
                                       (start_offset > previous_end_offset))
                                      or (previous_end_offset is None))

                # and end strictly before its successor starts
                is_end_offset_ok = ((next_start_offset is not None) and
                                    (end_offset < next_start_offset)
                                    or (next_start_offset is None))

                if is_start_offset_ok and is_end_offset_ok:
                    result.append((start_offset, end_offset, type_name))

            else:
                # known offsets are always kept
                result.append((start_offset, end_offset, type_name))
        return result
class FrequentEntities(object):
    # AcoraMatcher over the frequent entity spans (set in __init__)
    matcher = None
    # dict: lower-cased entity span -> most frequent type name (set in __init__)
    frequent_entities_dict = None

    def __init__(self, path_trainset: str, threshold_occurrences: int,
                 type_name_to_not_load: list, load_data: bool):
        """
        Build an Acora matcher based on the dict of frequent entities
        :param path_trainset: path to a file storing the entity
        :param threshold_occurrences: minimum number of occurences of the entity
        :param type_name_to_not_load: type of entities that should not be loaded to avoid fake match
        :param load_data: boolean to decide if data should be loaded or not
        :return: an Acora matcher matcher
        """
        if load_data:
            self.frequent_entities_dict = self.__read_frequent_entities(
                path_trainset=path_trainset,
                threshold_occurrences=threshold_occurrences,
                type_name_to_not_load=type_name_to_not_load)
            self.matcher = AcoraMatcher(content=list(
                self.frequent_entities_dict.keys()),
                                        ignore_case=True)
        else:
            # placeholder pattern that should never match any real text
            self.matcher = AcoraMatcher(content=["!@#$%^&*()"],
                                        ignore_case=True)

    @classmethod
    def test_builder(cls, content: dict):
        """
        Build an instance of this object for tests.
        In particular, don't try to read saved data
        :param content: a dictionary of entities to load
        :return: an instance of FrequentEntities class
        """
        instance = FrequentEntities(path_trainset="",
                                    threshold_occurrences=0,
                                    load_data=False,
                                    type_name_to_not_load=[])
        instance.frequent_entities_dict = content
        instance.matcher = AcoraMatcher(content=list(content.keys()),
                                        ignore_case=True)
        return instance

    @staticmethod
    def __read_frequent_entities(path_trainset: str,
                                 threshold_occurrences: int,
                                 type_name_to_not_load: list) -> dict:
        """
        Analyze recognized entities and return those over a defined threshold in a dict entity -> type_name
        """
        try:
            # NOTE: pickle.load must only be used on the trusted training
            # file, never on untrusted input
            with open(path_trainset, 'rb') as f:
                data = pickle.load(file=f)

            def get_default_dict_value() -> dict:
                # one empty set of case ids per known entity type
                default_dict_value = dict([(token, set())
                                           for token in entity_types])
                return default_dict_value

            exhaustive_dict = dict()

            # count, for each entity span, in how many distinct cases it
            # appears with each type name
            for case_id, text, entities in data:
                for start_offset, end_offset, type_name in entities:
                    entity_span = text[start_offset:end_offset].lower()
                    current_count = exhaustive_dict.get(
                        entity_span, get_default_dict_value())
                    current_count[type_name].add(case_id)
                    exhaustive_dict[entity_span] = current_count

            final_list = list()

            for entity_span, dict_counts in exhaustive_dict.items():

                # keep the most frequent type name for this span
                max_count = 0
                max_type_name = None
                for type_name, case_ids in dict_counts.items():
                    current_count = len(case_ids)
                    if current_count > max_count:
                        max_type_name = type_name
                        max_count = current_count

                if (max_count > threshold_occurrences) and \
                        (len(entity_span) > 3) \
                        and (max_type_name not in type_name_to_not_load):
                    final_list.append((entity_span, max_type_name))

            return dict(final_list)

        # narrowed from a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; loading remains best-effort on any failure
        except Exception:
            warnings.warn("Empty dict of frequent entities", Warning)
            return dict()

    def get_matches(self, text: str) -> list:
        """
        Find matches of frequent entities in the provided text
        :param text: original text
        :return: a list of offsets
        """
        match_results = self.matcher.findall(text)
        entities = list()
        for match_text, start_offset in match_results:
            end_offset = start_offset + len(match_text)
            entity_span = text[start_offset:end_offset]

            # end_offset is one character after the end of the selection,
            # so it can be equal to the last charcter offset of the text + 1
            last_char_ok = (end_offset
                            == len(text)) or (not text[end_offset].isalnum())

            # match must start on a word boundary and begin with an upper-case
            # letter or a digit (matching itself is case-insensitive)
            first_char_ok = (start_offset == 0 or not text[start_offset - 1].isalnum()) and \
                            (text[start_offset].isupper() or text[start_offset].isdecimal())

            if first_char_ok and last_char_ok:
                type_name = self.frequent_entities_dict[entity_span.lower()]
                entities.append((start_offset, end_offset, type_name))

        return entities