def test_replace_content(self):
    """replace_content() wraps the marker span in [ref=<uuid>]...[/ref] tags."""
    marker_text = '§ 123 ABC'
    ref_marker = RefMarker(marker_text, 0, len(marker_text))
    ref_marker.uuid = 'foo'
    ref_marker.references = [
        Ref(ref_type=RefType.LAW, book='abc', section=123)
    ]
    full_content = marker_text + ' and other text...'
    # replace_content returns a tuple; index 0 is the rewritten content
    result = ref_marker.replace_content(full_content, 0)
    self.assertEqual('[ref=foo]§ 123 ABC[/ref] and other text...',
                     result[0],
                     'Invalid content')
def extract_law_ref_markers(self, content: str) -> List[RefMarker]:
    """Extract law-reference markers from the given content.

    Each regex match in *content* is classified as a single reference
    ("Art. ..." / "§ ...") or a multi reference ("§§ ...") and converted
    into a ``RefMarker`` carrying its extracted ``Ref`` objects.

    :param content: Plain-text or even HTML
    :return: List of reference markers found in ``content``
    :raises RefExError: If a match starts with an unsupported prefix.
    """
    # Lazy %-style args: the message is only formatted if DEBUG is enabled
    logger.debug("Extracting from: %s", content)

    if self.law_book_context is not None:
        # Extraction with context available is done in another method
        return self.extract_law_ref_markers_with_context(content)

    markers = []
    # Hoisted: the same book-code list is needed for the regex and both handlers
    book_codes = self.get_law_book_codes()

    # Handle each match separately
    for marker_match in re.finditer(self.get_law_ref_regex(book_codes), content):
        marker_text = str(marker_match.group(0)).strip()
        references: List[Ref] = []

        # Handle single and multi refs separately.
        # Note: "§§ " cannot match the single-ref pattern because that one
        # requires whitespace directly after the first "§".
        if re.match(r"^(Art(\.{,1})|§)\s", marker_text):
            references = self.handle_single_law_ref(book_codes, marker_text,
                                                    references)
        elif re.match(r"^§§\s", marker_text):
            references = self.handle_multiple_law_refs(book_codes, marker_text,
                                                       references)
        else:
            raise RefExError("Unsupported ref beginning: %s" % marker_text)

        marker = RefMarker(
            text=marker_text,
            start=marker_match.start(),
            end=marker_match.end(),
            line=0,
        )  # TODO determine actual line number
        marker.set_uuid()
        marker.set_references(references)

        markers.append(marker)

    return markers
def test_replace_content(self):
    """Marker text inside content is wrapped as [ref=<uuid>]...[/ref]."""
    snippet = "§ 123 ABC"
    m = RefMarker(snippet, 0, len(snippet))
    m.uuid = "foo"
    m.references = [Ref(ref_type=RefType.LAW, book="abc", section=123)]
    self.assertEqual(
        "[ref=foo]§ 123 ABC[/ref] and other text...",
        m.replace_content(snippet + " and other text...", 0)[0],
        "Invalid content",
    )
def extract_case_ref_markers(self, content: str) -> List[RefMarker]:
    """Extract references to court decisions (case law) from content.

    A case reference is detected via its file number (Aktenzeichen); the
    issuing court is then searched for in the surrounding text, e.g.:

        BVerwG, Urteil vom 20. Februar 2013, - 10 C 23.12 -
        OVG Nordrhein-Westfalen, Urteil vom 21.2.2017, - 14 A 2316/16.A -
        VG Minden, Urteil vom 22.12.2016, - 1 K 5137/16.A -
        EuGH Urteil vom 25.07.2002 - C-459/99 -

    TODO all court codes + case types
    TODO sentence tokenizer; strip "special endings" and dates first

    :param content: Plain-text content to search
    :return: List of reference markers (one per file number found)
    """
    refs = []

    # TODO More intelligent by searching only in sentences.
    # Find all file numbers
    for match in re.finditer(self.get_file_number_regex(), content):  # type: Match
        court = None

        # Search in progressively larger surroundings for court names
        for diff in [100, 200, 500]:
            # TODO maybe search left first, then to the right
            start = max(0, match.start(0) - diff)
            end = min(len(content), match.end(0) + diff)
            surrounding = content[start:end]

            # File number position within the surrounding window
            fn_pos = match.start(0) - start

            # distance -> court match; first occurrence wins on a tie
            candidates = collections.OrderedDict()
            for court_match in re.finditer(self.get_court_name_regex(),
                                           surrounding):
                # Candidate position = center of the court-name match
                candidate_pos = round(
                    (court_match.start(0) + court_match.end(0)) / 2)
                # Distance to file number
                candidate_dist = abs(fn_pos - candidate_pos)

                if candidate_dist not in candidates:
                    candidates[candidate_dist] = court_match
                else:
                    logger.warning(
                        'Court candidate with same distance exist already: %s'
                        % court_match)

            # Court is the candidate with smallest distance to file number.
            # Bug fix: previously the first candidate in text order was taken
            # (next(iter(...))), not the closest one.
            if len(candidates) > 0:
                court = candidates[min(candidates)].group('court')
                # Stop searching if court was found with this range
                break

        if court is None:
            court = ''

        file_number = match.group(0)
        ref_ids = [
            Ref(ref_type=RefType.CASE, court=court, file_number=file_number)
            # TODO date field
        ]
        # TODO maintain order for case+law refs
        marker = RefMarker(text=file_number,
                           start=match.start(0),
                           end=match.end(0),
                           line=0)  # TODO line number
        marker.set_uuid()
        marker.set_references(ref_ids)

        refs.append(marker)

    return refs
def extract_law_ref_markers(self, content: str,
                            is_html: bool = False) -> List[RefMarker]:
    """Extract law-reference markers (divide and conquer strategy).

    - only simple regex
    - matched spans are replaced with a mask (``replace_content_with_mask``)
      in ``content`` to avoid multiple matches on the same text

    Multi references ("§§ 1, 2 und 3 BGB") are processed first, then single
    references ("§ 3 BGB", "§ 42 Abs. 1 ... VwGO"). A single ref followed by
    "i.V.m."/"iVm" has no book yet and is queued until the next ref with a
    book resolves it.

    :param content: Plain-text or even HTML
    :param is_html: Use HTML-aware section sign / word delimiters
    :return: List of reference markers
    """
    if self.law_book_context is not None:
        # Extraction with context available is done in another method
        return self.extract_law_ref_markers_with_context(content)

    markers = []

    # Replace special characters if working with html
    if is_html:
        sectionSign = '§'
        self.word_delimiter = '\s|\.|,|;|:|!|\?|\(|\)|\[|\]|”|\‘|\’|<|>|&|”|\‘|\’|<|>|&|"|\'|<|>|&'
    else:
        sectionSign = '§'
        self.word_delimiter = '\s|\.|,|;|:|!|\?|\(|\)|\[|\]|"|\'|<|>|&'

    # Book code should be followed by a word separator, e.g. space.
    book_look_ahead = '(?=' + self.word_delimiter + ')'

    # Single ref
    book_pattern = self.get_law_book_ref_regex(self.get_law_book_codes())

    # Any content allowed between section number and book code
    # (dead duplicate assignment removed; only this variant was ever used)
    any_content = '([0-9]{1,5}|\.|[a-z]|[IXV]{1,3}|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|und|bis|,|;|\s)*'

    multi_pattern = sectionSign + sectionSign + ' (\s|[0-9]+(\.{,1})|[a-z]|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|f\.|ff\.|und|bis|\,|;|\s' + book_pattern + ')+\s(' + book_pattern + ')' + book_look_ahead

    for marker_match in re.finditer(re.compile(multi_pattern), content):  # All matches
        marker_text = marker_match.group(0)
        refs = []

        logger.debug('Multi Match with: %s' % marker_text)

        # Books by position in text
        book_positions = {}  # Can we ensure that book_position is in order?
        for book_match in re.finditer(book_pattern, marker_text):
            book_positions[book_match.start()] = book_match.group(0)

        # We cannot work without knowing the book.
        # Bug fix: was `< 0`, which is unreachable since len() >= 0.
        if len(book_positions) == 0:
            logger.error('No book found in marker text: %s' % marker_text)
            continue

        # Extract references from marker text
        # - find <separator §§|,|..> + <section>
        # - ignore Abs, Nr, ...
        # - corresponding book is the closest to the right
        a = '([0-9]+)\s(?=bis|und)'
        b = '([0-9]+)\s?[a-z]'
        c = '([0-9]+)'
        pattern = '(?P<sep>' + sectionSign + sectionSign + '|,|;|und|bis)\s?(?P<sect>(' + a + '|' + b + '|' + c + '))'

        for ref_match in re.finditer(re.compile(pattern), marker_text):
            sect = ref_match.group('sect')

            logger.debug('Found ref: %s' % ref_match.group())

            if len(book_positions) == 1:
                book = next(iter(book_positions.values()))
            else:
                book = None
                pos = ref_match.start()
                for bp in book_positions:
                    if bp > pos:
                        # Take the first book that is right to section position
                        book = book_positions[bp]
                        break

            if book is None:
                logger.error('No book after reference found: %s - %s'
                             % (ref_match.group(0), marker_text))
                continue

            # Check for 'between' (range sections)
            if ref_match.group('sep') == 'bis' and len(refs) > 0:
                from_sect = refs[-1].section  # last section

                # Both sections should be integers (no a-z sections)
                if sect.isdigit() and from_sect.isdigit():
                    for between_sect in range(int(from_sect) + 1, int(sect)):
                        # Add to queue
                        refs.append(Ref.init_law(book=book,
                                                 section=str(between_sect)))

            refs.append(Ref.init_law(book=book, section=sect))

        # Prepare marker
        marker = RefMarker(text=marker_text,
                           start=marker_match.start(),
                           end=marker_match.end())
        marker.set_uuid()
        marker.set_references(refs)

        # Check if actual references were found in marker text
        if len(refs) > 0:
            markers.append(marker)
            # Update content to avoid double matching
            content = marker.replace_content_with_mask(content)
        else:
            logger.warning('No references found in marker: %s ' % marker_text)

    # Single refs
    sect_pattern = '(?P<sect>([0-9]+)(\s?[a-z]?))'
    patterns = [
        # § 3 BGB, § 3d BGB, § 83 d BGB
        sectionSign + ' ' + sect_pattern + ' (?P<book>' + book_pattern + ')' + book_look_ahead,
        # Abs OR Nr: § 42 Abs. 1 Alt. 1 VwGO
        sectionSign + ' ' + sect_pattern + ' Abs. ([0-9]+) Alt. ([0-9]+) (?P<book>' + book_pattern + ')' + book_look_ahead,
        sectionSign + ' (?P<sect>([0-9]+)(\s?[a-z]?)) ' + any_content + ' (?P<book>(' + book_pattern + '))' + book_look_ahead,
        # "i.V.m."/"iVm": book is supplied by a later match
        sectionSign + ' (?P<sect>([0-9]+)(\s?[a-z]?)) ' + any_content + ' (?P<next_book>(i\.V\.m\.|iVm))' + book_look_ahead,
    ]

    markers_waiting_for_book = []  # type: List[RefMarker]

    for pattern in patterns:  # Iterate over all patterns
        # logger.debug('Pattern: %s' % pattern)
        for marker_match in re.finditer(re.compile(pattern), content):  # All matches
            marker_text = marker_match.group(0)

            if 'book' in marker_match.groupdict():
                book = Ref.clean_book(marker_match.group('book'))
            else:
                book = None

            ref = Ref.init_law(section=marker_match.group('sect'), book=None)
            marker = RefMarker(text=marker_text,
                               start=marker_match.start(),
                               end=marker_match.end())
            marker.set_uuid()

            # Has this marker a book?
            if book is not None:
                ref.book = book
                marker.set_references([ref])

                # Update content to avoid double matching
                content = marker.replace_content_with_mask(content)
                markers.append(marker)

                # Resolve waiting markers with the book just found
                for waiting in markers_waiting_for_book:
                    if len(waiting.references) == 1:
                        waiting.references[0].book = book
                        content = waiting.replace_content_with_mask(content)
                        markers.append(waiting)

                markers_waiting_for_book = []
            else:
                if marker_match.group('next_book') is not None:
                    marker.set_references([ref])
                    markers_waiting_for_book.append(marker)
                else:
                    raise RefExError('next_book and book are None')

    if len(markers_waiting_for_book) > 0:
        logger.warning('Marker could not be assign to book: %s'
                       % markers_waiting_for_book)

    # TODO Art GG

    return markers
def extract_law_ref_markers_with_context(self, content):
    """
    With context = citing law book is known, so bare section references
    can be resolved, e.g.:

        § 343 der Zivilprozessordnung

    Matches are masked out of a working copy of the content (``search_text``)
    so later patterns cannot re-match the same span.

    :param content: Plain-text content to search
    :return: List of RefMarker objects (offsets refer to original content)
    """
    markers = []
    book_code = self.law_book_context

    # content = content.replace('§', '§')

    # Working copy that gets masked as matches are consumed
    search_text = str(content)

    def multi_sect(match):
        # Expand "§§ <start> (bis|und) <end>" into every section in between.
        # NOTE(review): 'und' ("and") is range-expanded exactly like 'bis'
        # ("to") — "§§ 3 und 5" yields 3, 4, 5; confirm this is intended.
        start = int(match.group(1))
        end = int(match.group(3)) + 1

        sects = []
        for sect in range(start, end):
            sects.append(str(sect))

        return sects

    def multi_book(match):
        # One book code per expanded section (same length as multi_sect)
        start = int(match.group(1))
        end = int(match.group(3)) + 1

        return [book_code] * (end - start)

    # Each entry: regex + callables producing book(s) and section(s) from
    # the match. A str result means a single ref; a list means multiple.
    patterns = [
        # §§ 664 bis 670
        {
            'pattern': '§§ ([0-9]+) (bis|und) ([0-9]+)',
            'book': multi_book,
            'sect': multi_sect
        },
        # Anlage 3
        {
            'pattern': 'Anlage ([0-9]+)',
            'book': lambda match: book_code,
            'sect': lambda match: 'anlage-%i' % int(match.group(1))
        },
        # § 1
        {
            'pattern': '§ ([0-9]+)(?:\s(Abs\.|Absatz)\s([0-9]+))?(?:\sSatz\s([0-9]+))?',
            'book': lambda match: book_code,
            'sect': lambda match: match.group(1)
        },
    ]

    for p in patterns:
        regex = p['pattern']

        res = re.finditer(regex, search_text)  # flags

        for ref_m in res:
            ref_text = ref_m.group(0)

            # Build ref with lambda functions
            ref_ids = []
            books = p['book'](ref_m)
            sects = p['sect'](ref_m)

            # Handle multiple ref ids in a single marker
            if not isinstance(books, str):
                for key, book in enumerate(books):
                    ref_ids.append(
                        Ref(ref_type=RefType.LAW,
                            book=book,
                            section=sects[key]))
            else:
                ref_ids.append(
                    Ref(ref_type=RefType.LAW, book=books, section=sects))

            ref = RefMarker(text=ref_text, start=ref_m.start(), end=ref_m.end())
            ref.set_uuid()
            ref.set_references(ref_ids)

            markers.append(ref)

            # Remove from search content to avoid duplicate matches
            # (mask with '_' so offsets into the original content stay valid)
            search_text = search_text[:ref_m.start()] + ('_' * (ref_m.end() - ref_m.start())) \
                + search_text[ref_m.end():]

            # print('-------')

    # Sort by start and replace markers
    # markers.sort(key=lambda r: r.start, reverse=False)
    # marker_offset = 0
    # for key, ref in enumerate(markers):
    #     content, marker_offset = ref.replace_content(content, marker_offset)

    return markers