def __init__(self, config: dict):
    """Configure the criticality-prediction dataset creator."""
    super().__init__(config)
    self.logger = get_logger(__name__)

    self.debug = False  # full-size run by default
    self.split_type = "date-stratified"
    self.dataset_name = "criticality_prediction"
    # TODO wait for section splitting in other courts for facts and considerations to be enabled
    # Candidate feature columns once splitting is available: 'facts', 'considerations', 'text'
    self.feature_cols = ['full_text']
    def __init__(self, config: dict):
        """Store per-language names and law abbreviations for the Federal
        Office for Spatial Development (ARE)."""
        super().__init__(config)
        self.logger = get_logger(__name__)

        # Official agency name in each supported language.
        self.ARE_names = {
            "de": "Bundesamt für Raumentwicklung",
            "fr": "Office fédéral du développement territorial",
            "it": "Ufficio federale dello sviluppo territoriale",
        }
        # Abbreviation of the corresponding law in each language.
        self.law_abbrs = {
            "de": "RPG",
            "fr": "LAT",
            "it": "LPT",
        }
 def init(self):
     """Build bidirectional per-language lookups between law abbreviations and ids.

     IMPORTANT: the same law is named differently in each language but
     refers to the same law, so both mappings are kept per language.
     """
     self.logger = get_logger(__name__)
     # abbreviation -> id, grouped by language
     self.law_abbr_by_lang = self.build_law_abbr_by_lang()
     # id -> abbreviation: invert each language's mapping
     self.law_id_by_lang = {}
     for lang, laws in self.law_abbr_by_lang.items():
         self.law_id_by_lang[lang] = {law_id: abbr for abbr, law_id in laws.items()}
Exemple #4
0
    def __init__(self, config: dict):
        """Configure directories, database handles and file patterns for the SLC corpus."""
        super().__init__(config)
        self.logger = get_logger(__name__)

        # SLC-specific locations and storage targets.
        self.subdir = self.slc_subdir
        self.spacy_subdir = self.slc_spacy_subdir
        self.db = self.db_slc
        self.tables_name = "slc"
        self.tables = ["slc"]
        # Empty column template used when accumulating parsed entries.
        self.entries_template = {
            'sr': [],
            'title': [],
            'lang': [],
            'text': [],
        }
        # Only the German POS-tagged XML files are picked up.
        self.glob_str = "30_XML_POSTagged/DE/*.xml"
    def __init__(self, config: dict):
        """Initialize shared defaults for all dataset creators.

        Subclasses are expected to override ``split_type``, ``dataset_name``
        and ``feature_cols``.

        :param config: project configuration dict passed to the base class
        """
        # NOTE(review): the original assigned ``__metaclass__ = abc.ABCMeta`` as a
        # local variable here; inside __init__ that is dead code (``__metaclass__``
        # was a Python-2 class-level hook and has no effect at method scope), so it
        # was removed. To make the class abstract, derive it from ``abc.ABC`` instead.
        super().__init__(config)
        self.logger = get_logger(__name__)

        self.seed = 42  # fixed seed so dataset splits are reproducible
        self.minFeatureColLength = 100  # characters; shorter feature texts are filtered
        self.debug_chunksize = int(2e2)  # small chunks for quick debug runs
        self.real_chunksize = int(2e5)  # chunk size for full runs

        self.split_type = None  # to be overridden
        self.dataset_name = None  # to be overridden
        self.feature_cols = ["text"]  # to be overridden
Exemple #6
0
    def __init__(self, config: dict):
        """Configure spaCy model names and pipeline settings per language."""
        super().__init__(config)
        self.logger = get_logger(__name__)

        # Large spaCy news models, one per supported language.
        self.models = {
            'de': 'de_core_news_lg',
            'fr': 'fr_core_news_lg',
            'it': 'it_core_news_lg',
        }
        # tag, pos and lemma are enough for now — disable the remaining pipes.
        self.disable_pipes = ['senter', 'ner', 'attribute_ruler', 'textcat']
        # Handles for the currently active language; populated later.
        self.active_spacy_model = None
        self.active_bert_tokenizer = None
    def __init__(self, config: dict):
        """Configure the judgment-prediction dataset creator."""
        super().__init__(config)
        self.logger = get_logger(__name__)

        self.debug = True  # NOTE(review): debug is enabled by default here — confirm intended
        self.split_type = "date-stratified"
        self.dataset_name = "judgment_prediction"
        self.feature_cols = ['facts', 'considerations']

        # Flags controlling which judgment categories are included.
        self.with_partials = False
        self.with_write_off = False
        self.with_unification = False
        self.with_inadmissible = False
        self.make_single_label = True
 def __init__(self,
              config: dict,
              function_name: str,
              col_name: str,
              col_type: str = 'jsonb'):
     """Generic processor that runs spider-specific functions and writes results
     into a database column.

     :param config: project configuration dict
     :param function_name: name identifying the spider-specific functions to load
     :param col_name: target database column name
     :param col_type: SQL type of the target column (default 'jsonb')
     """
     super().__init__(config)
     self.logger = get_logger(__name__)
     self.processing_functions = self.load_functions(config, function_name)
     self.logger.debug(self.processing_functions)
     self.col_name = col_name
     self.col_type = col_type
     # Progress counters; -1 marks the total as not yet known.
     self.processed_amount = 0
     self.total_to_process = -1
     self.spider_specific_dir = self.create_dir(ROOT_DIR, config['dir']['spider_specific_dir'])
Exemple #9
0
 def __init__(self, config: dict):
     """Set up the citation-extraction step on top of the generic processor."""
     super().__init__(config,
                      function_name='citation_extracting_functions',
                      col_name='citations')
     self.logger = get_logger(__name__)
     # Progress file recording which spiders were already processed.
     self.processed_file_path = self.progress_dir / "spiders_citation_extracted.txt"
     # Log messages used by the shared processing loop.
     self.logger_info = {
         'start': 'Started extracting citations',
         'finished': 'Finished extracting citations',
         'start_spider': 'Started extracting citations for spider',
         'finish_spider': 'Finished extracting citations for spider',
         'saving': 'Saving chunk of citations',
         'processing_one': 'Extracting citations from',
         'no_functions': 'Not extracting citations.',
     }
    def __init__(self, config: dict):
        """Configure directories, database handles and file patterns for the Wikipedia source."""
        super().__init__(config)
        self.logger = get_logger(__name__)

        # Wikipedia-specific locations and storage targets.
        self.subdir = self.wikipedia_subdir
        self.spacy_subdir = self.wikipedia_spacy_subdir
        self.db = self.db_wikipedia
        self.tables_name = "wikipedia"
        self.tables = ["wikipedia"]
        # Empty column template used when accumulating parsed entries.
        self.entries_template = {'wiki_id': [], 'title': [], 'url': [], 'text': []}
        self.glob_str = "xml/**/*"
 def __init__(self, config: dict):
     """Define the metadata keys collected for every scraped court decision."""
     super().__init__(config)
     self.logger = get_logger(__name__)
     # Identifying metadata plus the raw html/pdf payloads per decision.
     self.court_keys = [
         "spider",
         "canton",
         "court",
         "chamber",
         "date",
         "file_name",
         "file_number",
         "file_number_additional",
         "html_url",
         "html_raw",
         "pdf_url",
         "pdf_raw",
     ]
 def __init__(self, config: dict):
     """Set up the pattern-extraction step on top of the generic processor."""
     super().__init__(config,
                      function_name="pattern_extracting_functions",
                      col_name='')
     self.logger = get_logger(__name__)
     self.columns = ['keyword', 'totalcount', 'example']
     self.test = Language.DE
     self.df = pd.DataFrame(columns=self.columns)
     self.language = {}
     self.currentLanguage = ''
     # Running totals for the spider currently being processed.
     self.total = 0
     self.counter = 0
     self.spider = ''
     self.limit = 0
     # Per-language accumulator: an occurrence count plus a dict of found patterns.
     self.dict = {
         lang: {'count': 0, 'dict': {}}
         for lang in (Language.DE, Language.FR, Language.IT, Language.EN)
     }
     self.end = {}
     # Log messages used by the shared processing loop.
     self.logger_info = {
         'start': 'Started pattern extraction',
         'finished': 'Finished pattern extraction',
         'start_spider': 'Started pattern extraction for spider',
         'finish_spider': 'Finished pattern extraction for spider',
     }
     # Progress file recording which spiders were already processed.
     self.processed_file_path = self.progress_dir / "pattern_extraction.txt"
    def __init__(self, config: dict):
        """Configure the doc2doc information-retrieval dataset creator."""
        super().__init__(config)
        self.logger = get_logger(__name__)

        self.debug = True
        self.split_type = "date-stratified"
        self.dataset_name = "doc2doc_ir"
        self.feature_cols = ['facts', 'considerations']

        self.dataset_folder = self.create_dir(self.datasets_subdir, self.dataset_name)
        if self.debug:
            # make sure that we don't overwrite progress in the real directory
            self.dataset_folder = self.create_dir(self.tmp_subdir, self.dataset_name)

        self.num_ruling_citations = 1000  # the 1000 most common ruling citations will be included

        # Preload reference data needed when building the dataset.
        self.load_rulings()
        self.load_law_articles()

        # Enable parallel pandas execution and progress bars once per instance.
        pandarallel.initialize(progress_bar=True)
        tqdm.pandas()
Exemple #14
0
    def __init__(self, config: dict):
        """Set up the section-splitting step on top of the generic processor."""
        super().__init__(config,
                         function_name='section_splitting_functions',
                         col_name='sections')
        self.logger = get_logger(__name__)
        # Log messages used by the shared processing loop.
        self.logger_info = {
            'start': 'Started section splitting',
            'finished': 'Finished section splitting',
            'start_spider': 'Started splitting sections for spider',
            'finish_spider': 'Finished splitting sections for spider',
            'saving': 'Saving chunk of recognized sections',
            'processing_one': 'Splitting sections from file',
            'no_functions': 'Not splitting into sections.',
        }

        # Get the tokenizers once at the start so they don't have to be loaded
        # with every chunk. Usage: spacy_tokenizer, bert_tokenizer = tokenizers[Language.DE]
        self.tokenizers: dict[Language, Tuple[any, any]] = {
            lang: self.get_tokenizers(code)
            for lang, code in ((Language.DE, 'de'), (Language.FR, 'fr'), (Language.IT, 'it'))
        }

        # Progress file recording which spiders were already processed.
        self.processed_file_path = self.progress_dir / "spiders_section_split.txt"
 def init(self, model: str = 'compressed'):
     """Download (if necessary) and load the fastText model.

     :param model: which model variant to fetch (default 'compressed')
     """
     self.logger = get_logger(__name__)
     path = self.download_model(model)
     self.model = fasttext.load_model(str(path))
 def __init__(self, ):
     """Create the instance and attach a module-level logger."""
     self.logger = get_logger(__name__)
 def __init__(self, config: dict):
     """Initialize the base class and attach a module-level logger."""
     super().__init__(config)
     self.logger = get_logger(__name__)
 def __init__(self, config: dict):
     """Initialize base state plus the shared language-identification helper."""
     super().__init__(config)
     self.logger = get_logger(__name__)
     # Singleton, so the identification model is shared across users.
     self.lang_id = LanguageIdentificationSingleton()
    def __init__(self, config: dict):
        """Initialize with spaCy-related state left unset until a language is chosen."""
        super().__init__(config)
        self.logger = get_logger(__name__)

        # Populated later once the working language/directory is known.
        self.lang_dir = None
        self.spacy_vocab = None
Exemple #20
0
    def __init__(self, config: dict, negation_detection_type="simple"):
        """Configure detection of "fundamental importance" in court rulings.

        Two alternative detection methods are set up: literal search strings
        per language, or citations of specific law articles (BGG/LTF).

        :param config: project configuration dict
        :param negation_detection_type: strategy used for negation handling
            (default "simple")
        """
        super().__init__(config)
        self.logger = get_logger(__name__)

        self.negation_detection_type = negation_detection_type
        # Two methods: either search with these strings
        self.fundamental_importance_search_strings = {
            "de": [
                "Rechtsfrage von grundsätzlicher Bedeutung",
                "Frage von grundsätzlicher Bedeutung"
            ],
            "fr": [
                "question juridique de principe",
            ],
            "it": [
                "questione di diritto di importanza fondamentale",
                # NOTE(review): the doubled apostrophe below looks like a typo for
                # "d'importanza" — confirm against the source rulings before changing.
                "questione giuridica d''importanza fondamentale"
            ],
        }
        # or search with law articles
        self.articles = {
            # We removed Art. 42 because it is being cited many times without relevance to fundamental importance
            # Citations depend heavily on the individual court clerk.
            "de": [
                "Art. 20 Abs. 2 BGG",
                # "Art. 42 Abs. 2 BGG", # causes too many false positives
                "Art. 74 Abs. 2 lit. a BGG",
                "Art. 83 Abs. 1 lit. f Ziff. 1 BGG",
                "Art. 83 lit. f Ziff. 1 BGG",
                "Art. 83 Abs. 1 lit. m BGG",
                "Art. 83 lit. m BGG",
                "Art. 83 Abs. 1 lit. w BGG",
                "Art. 83 lit. w BGG",
                "Art. 83 Abs. 1 lit. x BGG",
                "Art. 83 lit. x BGG",
                "Art. 84a BGG",
                "Art. 85 Abs. 2 BGG",
                "Art. 109 Abs. 1 BGG"
            ],
            "fr": [
                "art. 20 al. 2 LTF",
                # "art. 42 al. 2 LTF", # causes too many false positives
                "art. 74 al. 2 let. a LTF",
                "art. 83 al. 1 let. f n. 1 LTF",
                "art. 83 let. f n. 1 LTF",
                "art. 83 al. 1 let. m LTF",
                "art. 83 let. m LTF",
                "art. 83 al. 1 let. w LTF",
                "art. 83 let. w LTF",
                "art. 83 al. 1 let. x LTF",
                "art. 83 let. x LTF",
                "art. 84a LTF",
                "art. 85 al. 2 LTF",
                "art. 109 al. 1 LTF"
            ],
            "it": [
                "art. 20 cpv. 2 LTF",
                # "art. 42 cpv. 2 LTF", # causes too many false positives
                "art. 74 cpv. 2 lett. a LTF",
                "art. 83 cpv. 1 lett. f n. 1 LTF",
                "art. 83 lett. f n. 1 LTF",
                "art. 83 cpv. 1 lett. m LTF",
                "art. 83 lett. m LTF",
                "art. 83 cpv. 1 lett. w LTF",
                "art. 83 lett. w LTF",
                "art. 83 cpv. 1 lett. x LTF",
                "art. 83 lett. x LTF",
                "art. 84a LTF",
                "art. 85 cpv. 2 LTF",
                "art. 109 cpv. 1 LTF"
            ]
        }
Exemple #21
0
 def __init__(self, config: dict):
     """Initialize name-to-gender lookup state and a reusable HTTP session."""
     super().__init__(config)
     self.logger = get_logger(__name__)
     # Local cache file mapping first names to genders.
     self.gender_db_file = self.data_dir / "name_to_gender.json"
     # One session reused across requests (connection pooling).
     self.session = requests.Session()