def __init__(self, config: dict):
    super().__init__(config)
    self.logger = get_logger(__name__)
    self.debug = False
    self.split_type = "date-stratified"
    self.dataset_name = "criticality_prediction"
    # TODO wait for section splitting in other courts for facts and considerations to be enabled
    self.feature_cols = ['full_text']  # ['facts', 'considerations', 'text']

def __init__(self, config: dict):
    super().__init__(config)
    self.logger = get_logger(__name__)
    # Official names of the Federal Office for Spatial Development (ARE) in the three languages
    self.ARE_names = {
        "de": "Bundesamt für Raumentwicklung",
        "fr": "Office fédéral du développement territorial",
        "it": "Ufficio federale dello sviluppo territoriale",
    }
    self.law_abbrs = {"de": "RPG", "fr": "LAT", "it": "LPT"}

def init(self):
    self.logger = get_logger(__name__)
    # IMPORTANT: we need to take care of the fact that the laws are named
    # differently in each language but refer to the same law!
    self.law_abbr_by_lang = self.build_law_abbr_by_lang()  # the abbreviations are the keys
    self.law_id_by_lang = {
        lang: {v: k for k, v in laws.items()}
        for lang, laws in self.law_abbr_by_lang.items()
    }  # the ids are the keys

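# A minimal sketch of the inversion above, with hypothetical data; the real
# mapping comes from build_law_abbr_by_lang() and the ids here are assumptions.
law_abbr_by_lang = {
    "de": {"RPG": 700},  # abbreviation -> law id
    "fr": {"LAT": 700},
    "it": {"LPT": 700},
}
law_id_by_lang = {
    lang: {law_id: abbr for abbr, law_id in laws.items()}
    for lang, laws in law_abbr_by_lang.items()
}
assert law_id_by_lang["fr"][700] == "LAT"  # same law, language-specific abbreviation
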
def __init__(self, config: dict):
    super().__init__(config)
    self.logger = get_logger(__name__)
    self.subdir = self.slc_subdir
    self.spacy_subdir = self.slc_spacy_subdir
    self.db = self.db_slc
    self.tables_name = "slc"
    self.tables = ["slc"]
    self.entries_template = {'sr': [], 'title': [], 'lang': [], 'text': []}
    self.glob_str = "30_XML_POSTagged/DE/*.xml"

def __init__(self, config: dict):
    # NOTE: assigning __metaclass__ as a local is a Python 2 idiom and a no-op here;
    # in Python 3 the metaclass belongs on the class statement (metaclass=abc.ABCMeta).
    __metaclass__ = abc.ABCMeta
    super().__init__(config)
    self.logger = get_logger(__name__)
    self.seed = 42
    self.minFeatureColLength = 100  # characters
    self.debug_chunksize = int(2e2)
    self.real_chunksize = int(2e5)
    self.split_type = None  # to be overridden
    self.dataset_name = None  # to be overridden
    self.feature_cols = ["text"]  # to be overridden

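# Hypothetical helper, not in the original class: subclasses set self.debug,
# and the chunk size would then be chosen from the two attributes above.
def get_chunksize_sketch(self) -> int:
    return self.debug_chunksize if self.debug else self.real_chunksize
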
def __init__(self, config: dict):
    super().__init__(config)
    self.logger = get_logger(__name__)
    self.models = {
        'de': 'de_core_news_lg',
        'fr': 'fr_core_news_lg',
        'it': 'it_core_news_lg',
    }
    # tag, pos and lemma are enough for now
    self.disable_pipes = ['senter', 'ner', 'attribute_ruler', 'textcat']
    self.active_spacy_model = None
    self.active_bert_tokenizer = None

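# Sketch of how a model from self.models could be loaded with the pipes above
# disabled; the method name and eager assignment are assumptions, not the
# class's actual API. `disable` skips the listed components entirely, so the
# remaining pipeline still produces tag, pos and lemma.
import spacy

def load_active_spacy_model_sketch(self, lang: str):
    self.active_spacy_model = spacy.load(self.models[lang], disable=self.disable_pipes)
    return self.active_spacy_model
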
def __init__(self, config: dict): super().__init__(config) self.logger = get_logger(__name__) self.debug = True self.split_type = "date-stratified" self.dataset_name = "judgment_prediction" self.feature_cols = ['facts', 'considerations'] self.with_partials = False self.with_write_off = False self.with_unification = False self.with_inadmissible = False self.make_single_label = True
def __init__(self, config: dict, function_name: str, col_name: str, col_type: str = 'jsonb'):
    super().__init__(config)
    self.logger = get_logger(__name__)
    self.processing_functions = self.load_functions(config, function_name)
    self.logger.debug(self.processing_functions)
    self.col_name = col_name
    self.col_type = col_type
    self.processed_amount = 0
    self.total_to_process = -1
    self.spider_specific_dir = self.create_dir(ROOT_DIR, config['dir']['spider_specific_dir'])

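# Illustrative sketch only: load_functions is defined elsewhere in the codebase.
# One plausible resolution strategy is importing a module named after
# function_name and collecting its public callables; the module layout is an
# assumption for illustration.
import importlib

def load_functions_sketch(config: dict, function_name: str) -> dict:
    module = importlib.import_module(function_name)  # e.g. 'citation_extracting_functions'
    return {name: func for name, func in vars(module).items()
            if callable(func) and not name.startswith('_')}
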
def __init__(self, config: dict):
    super().__init__(config,
                     function_name='citation_extracting_functions',
                     col_name='citations')
    self.logger = get_logger(__name__)
    self.processed_file_path = self.progress_dir / "spiders_citation_extracted.txt"
    self.logger_info = {
        'start': 'Started extracting citations',
        'finished': 'Finished extracting citations',
        'start_spider': 'Started extracting citations for spider',
        'finish_spider': 'Finished extracting citations for spider',
        'saving': 'Saving chunk of citations',
        'processing_one': 'Extracting citations from',
        'no_functions': 'Not extracting citations.',
    }

def __init__(self, config: dict):
    super().__init__(config)
    self.logger = get_logger(__name__)
    self.subdir = self.wikipedia_subdir
    self.spacy_subdir = self.wikipedia_spacy_subdir
    self.db = self.db_wikipedia
    self.tables_name = "wikipedia"
    self.tables = ["wikipedia"]
    self.entries_template = {'wiki_id': [], 'title': [], 'url': [], 'text': []}
    self.glob_str = "xml/**/*"

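# Illustrative sketch of how an entries_template is meant to be consumed: each
# parsed file appends one value per column, and the finished chunk becomes a
# DataFrame. The sample values below are hypothetical.
import copy
import pandas as pd

entries_template = {'wiki_id': [], 'title': [], 'url': [], 'text': []}
entries = copy.deepcopy(entries_template)  # fresh empty lists for this chunk
entries['wiki_id'].append('12345')
entries['title'].append('Bundesgericht')
entries['url'].append('https://de.wikipedia.org/wiki/Bundesgericht_(Schweiz)')
entries['text'].append('...')
chunk_df = pd.DataFrame(entries)  # one row per parsed file
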
def __init__(self, config: dict):
    super().__init__(config)
    self.court_keys = [
        "spider",
        "canton",
        "court",
        "chamber",
        "date",
        "file_name",
        "file_number",
        "file_number_additional",
        "html_url",
        "html_raw",
        "pdf_url",
        "pdf_raw",
    ]
    self.logger = get_logger(__name__)

def __init__(self, config: dict): super().__init__(config, function_name="pattern_extracting_functions", col_name='') self.logger = get_logger(__name__) self.columns = ['keyword', 'totalcount', 'example'] self.test = Language.DE self.df = pd.DataFrame(columns=self.columns) self.language = {} self.currentLanguage = '' self.total = 0 self.counter = 0 self.spider = '' self.limit = 0 self.dict = { Language.DE: { 'count': 0, 'dict': {} }, Language.FR: { 'count': 0, 'dict': {} }, Language.IT: { 'count': 0, 'dict': {} }, Language.EN: { 'count': 0, 'dict': {} } } self.end = {} self.logger_info = { 'start': 'Started pattern extraction', 'finished': 'Finished pattern extraction', 'start_spider': 'Started pattern extraction for spider', 'finish_spider': 'Finished pattern extraction for spider', } self.processed_file_path = self.progress_dir / "pattern_extraction.txt"
def __init__(self, config: dict): super().__init__(config) self.logger = get_logger(__name__) self.debug = True self.split_type = "date-stratified" self.dataset_name = "doc2doc_ir" self.feature_cols = ['facts', 'considerations'] self.dataset_folder = self.create_dir(self.datasets_subdir, self.dataset_name) if self.debug: # make sure that we don't overwrite progress in the real directory self.dataset_folder = self.create_dir(self.tmp_subdir, self.dataset_name) self.num_ruling_citations = 1000 # the 1000 most common ruling citations will be included self.load_rulings() self.load_law_articles() pandarallel.initialize(progress_bar=True) tqdm.pandas()
def __init__(self, config: dict):
    super().__init__(config,
                     function_name='section_splitting_functions',
                     col_name='sections')
    self.logger = get_logger(__name__)
    self.logger_info = {
        'start': 'Started section splitting',
        'finished': 'Finished section splitting',
        'start_spider': 'Started splitting sections for spider',
        'finish_spider': 'Finished splitting sections for spider',
        'saving': 'Saving chunk of recognized sections',
        'processing_one': 'Splitting sections from file',
        'no_functions': 'Not splitting into sections.',
    }
    # Get the tokenizers at the start so they don't have to be loaded with every chunk.
    # Return value: spacy_tokenizer, bert_tokenizer = tokenizers['de']
    # (Any and Tuple come from the typing module; the original annotated `any`,
    # which is the builtin function, not a type.)
    self.tokenizers: dict[Language, Tuple[Any, Any]] = {
        Language.DE: self.get_tokenizers('de'),
        Language.FR: self.get_tokenizers('fr'),
        Language.IT: self.get_tokenizers('it'),
    }
    self.processed_file_path = self.progress_dir / "spiders_section_split.txt"

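# get_tokenizers is defined on a parent class; a minimal sketch under the
# assumption that it pairs a blank spaCy tokenizer with a multilingual BERT
# tokenizer (the checkpoint name is a placeholder, not necessarily the one used).
import spacy
from transformers import AutoTokenizer

def get_tokenizers_sketch(lang: str):
    spacy_tokenizer = spacy.blank(lang).tokenizer  # tokenizer only, no full pipeline
    bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
    return spacy_tokenizer, bert_tokenizer
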
def init(self, model: str = 'compressed'):
    # Lazy initializer (called explicitly rather than via __init__), presumably
    # so the fastText model is only downloaded and loaded on first use.
    self.logger = get_logger(__name__)
    model_path = self.download_model(model)
    self.model = fasttext.load_model(str(model_path))

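# Hypothetical usage once init() has run (the instance name is an assumption):
#   labels, probs = lang_id.model.predict("Das Bundesgericht zieht in Erwägung, ...")
#   lang = labels[0].replace('__label__', '')  # fastText lid labels look like '__label__de'
#   confidence = float(probs[0])
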
def __init__(self):
    self.logger = get_logger(__name__)

def __init__(self, config: dict):
    super().__init__(config)
    self.logger = get_logger(__name__)

def __init__(self, config: dict):
    super().__init__(config)
    self.lang_id = LanguageIdentificationSingleton()
    self.logger = get_logger(__name__)

def __init__(self, config: dict):
    super().__init__(config)
    self.logger = get_logger(__name__)
    self.lang_dir = None
    self.spacy_vocab = None

def __init__(self, config: dict, negation_detection_type="simple"): super().__init__(config) self.logger = get_logger(__name__) self.negation_detection_type = negation_detection_type # Two methods: either search with these strings self.fundamental_importance_search_strings = { "de": [ "Rechtsfrage von grundsätzlicher Bedeutung", "Frage von grundsätzlicher Bedeutung" ], "fr": [ "question juridique de principe", ], "it": [ "questione di diritto di importanza fondamentale", "questione giuridica d''importanza fondamentale" ], } # or search with law articles self.articles = { # We removed Art. 42 because it is being cited many times without relevance to fundamental importance # Zitate sehr abhängig vom Gerichtsschreiber "de": [ "Art. 20 Abs. 2 BGG", # "Art. 42 Abs. 2 BGG", # causes too many false positives "Art. 74 Abs. 2 lit. a BGG", "Art. 83 Abs. 1 lit. f Ziff. 1 BGG", "Art. 83 lit. f Ziff. 1 BGG", "Art. 83 Abs. 1 lit. m BGG", "Art. 83 lit. m BGG", "Art. 83 Abs. 1 lit. w BGG", "Art. 83 lit. w BGG", "Art. 83 Abs. 1 lit. x BGG", "Art. 83 lit. x BGG", "Art. 84a BGG", "Art. 85 Abs. 2 BGG", "Art. 109 Abs. 1 BGG" ], "fr": [ "art. 20 al. 2 LTF", # "art. 42 al. 2 LTF", # causes too many false positives "art. 74 al. 2 let. a LTF", "art. 83 al. 1 let. f n. 1 LTF", "art. 83 let. f n. 1 LTF", "art. 83 al. 1 let. m LTF", "art. 83 let. m LTF", "art. 83 al. 1 let. w LTF", "art. 83 let. w LTF", "art. 83 al. 1 let. x LTF", "art. 83 let. x LTF", "art. 84a LTF", "art. 85 al. 2 LTF", "art. 109 al. 1 LTF" ], "it": [ "art. 20 cpv. 2 LTF", # "art. 42 cpv. 2 LTF", # causes too many false positives "art. 74 cpv. 2 lett. a LTF", "art. 83 cpv. 1 lett. f n. 1 LTF", "art. 83 lett. f n. 1 LTF", "art. 83 cpv. 1 lett. m LTF", "art. 83 lett. m LTF", "art. 83 cpv. 1 lett. w LTF", "art. 83 lett. w LTF", "art. 83 cpv. 1 lett. x LTF", "art. 83 lett. x LTF", "art. 84a LTF", "art. 85 cpv. 2 LTF", "art. 109 cpv. 1 LTF" ] }
def __init__(self, config: dict):
    super().__init__(config)
    self.logger = get_logger(__name__)
    self.gender_db_file = self.data_dir / "name_to_gender.json"
    self.session = requests.Session()