def test_header_parser():
    """Check the headers parsed from the unit-test XML file.

    The file is expected to contain exactly one case, whose
    'defendeur_fullname' entry holds a single known name.
    """
    xml_path = get_config_default()["xml_unittest_file"]
    parsed_headers = parse_xml_header(path=xml_path)

    expected_case_id = 'CA-aix-en-provence-20130208-1022871-jurica'
    assert len(parsed_headers) == 1
    defendant_names = parsed_headers[expected_case_id]['defendeur_fullname']
    assert defendant_names == ['Catherine ***REMOVED***']
def test_match_headers_content():
    """Check that a name taken from the XML headers is located in free text.

    The `partie_pp` matcher is built from the headers of the first case of
    the unit-test XML file; only the first name occurrence of the sentence
    (characters 6 to 29) is expected to be reported, tagged "PERS".
    """
    xml_path = get_config_default()["xml_unittest_file"]
    headers_by_case = parse_xml_header(path=xml_path)
    first_case_id = list(headers_by_case.keys())[0]
    headers_matcher = MatchValuesFromHeaders(
        current_header=headers_by_case[first_case_id],
        threshold_size=3)
    matcher_partie_pp = headers_matcher.get_matcher_of_partie_pp_from_headers()

    text1 = "C'est Catherine ***REMOVED*** qui est responsable de ces faits avec M. LEON ***REMOVED***"
    expected_offsets = [(6, 29, "PERS")]

    assert matcher_partie_pp.get_matches(text1, "PERS") == expected_offsets
Example #3
0
    def __init__(self):
        """
        Load the French court names and build a case-insensitive matcher on them.

        The names are an extraction of a list available in open data
        https://www.data.gouv.fr/fr/datasets/les-statistiques-par-juridiction/#_
        (the published list has more data than the stored extraction).
        Every non-blank line of the configured "french_court_names" file is
        stripped and added to self.court_names.
        """
        court_names_path = get_config_default()["french_court_names"]

        with open(court_names_path) as court_file:
            # iterating the file object yields the same lines as readlines()
            stripped_lines = (raw_line.strip() for raw_line in court_file)
            for court_name in stripped_lines:
                if court_name:
                    self.court_names.add(court_name)

        # sanity check: more than 1000 names are expected after loading
        assert len(self.court_names) > 1000
        self.matcher = AcoraMatcher(content=list(self.court_names),
                                    ignore_case=True)
Example #4
0
    def __init__(self):
        """
        Build a case-insensitive matcher of postal code / city combinations.

        Each line of the configured "postal_code_city" file is split on ";":
        the 2nd field is read as the city and the 3rd as the postal code.
        Cities shorter than 3 characters are skipped. For every kept line two
        patterns are generated: "<postal code> <city>" and
        "<city> (<postal code>)".
        """
        source_path = get_config_default()["postal_code_city"]
        patterns = []

        with open(source_path) as source_file:
            for row in source_file:
                columns = row.split(";")
                city = columns[1].strip()
                if len(city) < 3:
                    continue
                postal_code = columns[2].strip()
                patterns.extend((postal_code + " " + city,
                                 city + " (" + postal_code + ")"))

        # sanity check: more than 1000 patterns are expected
        assert len(patterns) > 1000
        # NOTE(review): presumably drops the pattern coming from a header
        # line; only the first of the two patterns generated per line is
        # removed - confirm this is intended.
        patterns.pop(0)
        self.matcher = AcoraMatcher(list(patterns), ignore_case=True)
    def __init__(self, ignore_case: bool):
        """
        Build a matcher of first names based on two French names dictionaries.

        Names shorter than 4 characters are ignored. The resulting set is
        stored in self.first_name_dict and the matcher in self.matcher.

        :param ignore_case: True to ignore case during matching
        """
        config = get_config_default()

        file1 = config["first_name_dict_1"]
        file2 = config["first_name_dict_2"]

        first_names = set()
        with open(file1) as f1:
            for line in f1:
                # the name is the 4th ";"-separated field; names in this file
                # start with an upper-case letter and end with a space
                text = line.split(";")[3].strip()
                if len(text) >= 4:
                    first_names.add(text)

        with open(file2, encoding="ISO-8859-1") as f2:
            for line in f2:
                # the name is the 1st field and is converted to title case
                text = line.split(";")[0].strip()
                if len(text) >= 4:
                    first_names.add(get_title_case(text))

        # Entries excluded from the dictionary (presumably because they are
        # also common words or place names - confirm with the data owner).
        to_remove = [
            "Elle", "France", "Mercedes", "Paris", "Alger", "Oran", "Sans"
        ]

        # difference_update ignores absent items, whereas set.remove() would
        # raise KeyError if a dictionary file stops listing one of them.
        first_names.difference_update(to_remove)

        self.first_name_dict = first_names
        self.matcher = AcoraMatcher(content=list(self.first_name_dict),
                                    ignore_case=ignore_case)
from match_text_unsafe.extend_names import ExtendNames
from match_text_unsafe.find_header_values import parse_xml_headers
from match_text_unsafe.postal_code_dictionary_matcher import PostalCodeCity
from misc.normalize_offset import normalize_offsets, remove_spaces_included_in_offsets, \
    clean_offsets_from_unwanted_words
from modify_text.change_case import random_case_change
from modify_text.modify_strings import remove_key_words
from ner.training_function import train_model
from resources.config_provider import get_config_default
from viewer.spacy_viewer import convert_offsets_to_spacy_docs, view_spacy_docs
from xml_extractions.extract_node_values import get_paragraph_from_file, Paragraph, Offset

# reproducibility: fix the seed before anything else runs
# (`seed` is imported outside this excerpt)
seed(123)

# Read the training settings once from the default configuration.
# Paths are used as returned; the other values are passed through
# int() / float() before use.
config_training = get_config_default()
xml_train_path = config_training["xml_train_path"]
model_dir_path = config_training["model_dir_path"]
n_iter = int(config_training["number_iterations"])
batch_size = int(config_training["batch_size"])
dropout_rate = float(config_training["dropout_rate"])
training_set_export_path = config_training["training_set"]
change_case_rate = int(config_training["change_case_rate"])
remove_keyword_rate = int(config_training["remove_keyword_rate"])
frequent_entity_threshold = int(config_training["frequent_entity_threshold"])
number_of_paragraph_to_display = int(
    config_training["number_of_paragraph_to_display"])

# Print the number of command-line items received (script name included).
print(len(sys.argv))
# At most one argument after the script name is accepted.
# NOTE(review): assert statements are skipped under `python -O`, so this
# check is not enforced in optimized mode.
assert len(sys.argv) <= 2