Example #1
0
    def test_replace_content(self):
        """The marker text must be wrapped in [ref=<uuid>]...[/ref] tags."""
        text = '§ 123 ABC'

        marker = RefMarker(text, 0, len(text))
        marker.uuid = 'foo'
        marker.references = [Ref(ref_type=RefType.LAW, book='abc', section=123)]

        content = text + ' and other text...'
        actual = marker.replace_content(content, 0)[0]

        self.assertEqual('[ref=foo]§ 123 ABC[/ref] and other text...', actual,
                         'Invalid content')
Example #2
0
    def extract_law_ref_markers(self, content: str) -> List[RefMarker]:
        """
        The main extraction method. Takes input content and returns a list of
        extracted reference markers.

        Each regex match is classified as a single ref ("§ 1 ABC" / "Art. 1 GG")
        or a multi ref ("§§ 1, 2 ABC") and parsed by the corresponding handler.

        :param content: Plain-text or even HTML
        :return: List of reference markers
        :raises RefExError: If a match starts with an unsupported prefix.
        """

        # Lazy %-args: the message is only rendered if DEBUG is enabled
        logger.debug("Extracting from: %s", content)

        if self.law_book_context is not None:
            # Extraction with context available is done in another method
            return self.extract_law_ref_markers_with_context(content)

        # Init
        markers = []

        # Handle each match separately
        for marker_match in re.finditer(
                self.get_law_ref_regex(self.get_law_book_codes()), content):

            marker_text = str(marker_match.group(0)).strip()
            references: List[Ref] = []

            # Handle single and multi refs separately
            if re.match(r"^(Art(\.{,1})|§)\s", marker_text):
                references = self.handle_single_law_ref(
                    self.get_law_book_codes(), marker_text, references)

            elif re.match(r"^§§\s", marker_text):
                references = self.handle_multiple_law_refs(
                    self.get_law_book_codes(), marker_text, references)

            else:
                raise RefExError("Unsupported ref beginning: %s" % marker_text)

            marker = RefMarker(
                text=marker_text,
                start=marker_match.start(),
                end=marker_match.end(),
                line=0,  # TODO line number is not tracked yet
            )
            marker.set_uuid()
            marker.set_references(references)

            markers.append(marker)

        return markers
Example #3
0
    def test_replace_content(self):
        """Replacing content wraps the marker span in [ref=...] tags."""
        text = "§ 123 ABC"
        ref = Ref(ref_type=RefType.LAW, book="abc", section=123)

        marker = RefMarker(text, 0, len(text))
        marker.uuid = "foo"
        marker.references = [ref]

        content = text + " and other text..."
        expected = "[ref=foo]§ 123 ABC[/ref] and other text..."

        self.assertEqual(
            expected,
            marker.replace_content(content, 0)[0],
            "Invalid content",
        )
Example #4
0
    def extract_case_ref_markers(self, content: str) -> List[RefMarker]:
        """
        Extract case-law references (court name + file number) from content.

        Citation formats this should handle:

        BVerwG, Urteil vom 20. Februar 2013, - 10 C 23.12 -
        BVerwG, Urteil vom 27. April 2010 - 10 C 5.09 -
        BVerfG, Beschluss vom 10.07.1989, - 2 BvR 502, 1000, 961/86 -
        BVerwG, Urteil vom 20.02.2013, - 10 C 23.12 -
        OVG Nordrhein-Westfalen, Urteil vom 21.2.2017, - 14 A 2316/16.A -
        OVG Nordrhein-Westfalen, Urteil vom 29.10.2012 – 2 A 723/11 -
        OVG NRW, Urteil vom 14.08.2013 – 1 A 1481/10, Rn. 81 –
        OVG Saarland, Urteil vom 2.2.2017, - 2 A 515/16 -
        OVG Rheinland-Pfalz, Urteil vom 16.12.2016, -1A 10922/16 -
        Bayrischer VGH, Urteil vom 12.12.16, - 21 B 16.30364
        OVG Nordrhein-Westfalen, Urteil vom 21.2.2017, - 14 A 2316/16.A -
        Bayrischer VGH, Urteil vom 12.12.2016, - 21 B 16.30372 -
        OVG Saarland, Urteil vom 2.2.2017, - 2 A 515/16 -
        OVG Rheinland-Pfalz, Urteil vom 16.12.2016, -1A 10922/16 -
        VG Minden, Urteil vom 22.12.2016, - 1 K 5137/16.A -
        VG Gießen, Urteil vom 23.11.2016, - 2 K 969/16.GI.A
        VG Düsseldorf, Urteil vom 24.1.2017, - 17 K 9400/16.A
        VG Köln, Beschluss vom 25.03.2013 – 23 L 287/12 -
        OVG Schleswig, Beschluss vom 20.07.2006 – 1 MB 13/06 -
        Schleswig-Holsteinisches Verwaltungsgericht, Urteil vom 05.082014 – 11 A 7/14, Rn. 37 –
        Entscheidung des Bundesverwaltungsgerichts vom 24.01.2012 (2 C 24/10)

        EuGH Urteil vom 25.07.2002 – C-459/99 -

        TODO all court codes + case types

        - look for (Entscheidung|Bechluss|Urteil)
        - +/- 50 chars
        - find VG|OVG|Verwaltungsgericht|BVerwG|...
        - find location
        - find file number - ... - or (...)

        TODO

        Sentence tokenzier
        - remove all "special endings" \s([0-9]+|[a-zA-Z]|sog|Abs)\.
        - remove all dates

        :param content: Plain-text to search for case references
        :return: List of reference markers (one per file number found)
        """

        refs = []

        # TODO More intelligent by search only in sentences.

        # Find all file numbers
        for match in re.finditer(self.get_file_number_regex(),
                                 content):  # type: Match
            court = None

            # Search in growing surroundings for court names
            for diff in [100, 200, 500]:
                # TODO maybe search left first, then to the right

                start = max(0, match.start(0) - diff)
                end = min(len(content), match.end(0) + diff)
                surrounding = content[start:end]

                # File number position in surroundings
                fn_pos = match.start(0) - start

                # Candidate court matches keyed by distance to the file number
                candidates = {}

                for court_match in re.finditer(self.get_court_name_regex(),
                                               surrounding):
                    # Candidate position = center of the court-name match
                    candidate_pos = round(
                        (court_match.start(0) + court_match.end(0)) / 2)
                    # Distance to file number
                    candidate_dist = abs(fn_pos - candidate_pos)

                    if candidate_dist not in candidates:
                        candidates[candidate_dist] = court_match
                    else:
                        logger.warning(
                            'Court candidate with same distance exist already: %s',
                            court_match)

                # Court is the candidate with smallest distance to file number.
                # (Previously the first-inserted candidate was taken, which is
                # the first match in the surroundings, not the closest one.)
                if len(candidates) > 0:
                    court = candidates[min(candidates)].group('court')
                    # Stop searching if court was found with this range
                    break

            if court is None:
                court = ''

            file_number = match.group(0)
            ref_ids = [
                Ref(ref_type=RefType.CASE,
                    court=court,
                    file_number=file_number)  # TODO date field
            ]
            # TODO maintain order for case+law refs
            marker = RefMarker(text=file_number,
                               start=match.start(0),
                               end=match.end(0),
                               line=0)  # TODO line number
            marker.set_uuid()
            marker.set_references(ref_ids)

            refs.append(marker)

        return refs
    def extract_law_ref_markers(self,
                                content: str,
                                is_html: bool = False) -> List[RefMarker]:
        """
        The main extraction method. Takes input content and returns a list of
        extracted reference markers.

        Divide and Conquer
        - only simple regex
        - replace matches with mask (_REF_) to avoid multiple matches

        :param content: Plain-text or even HTML
        :param is_html: If True, word delimiters also cover HTML entities
        :return: List of reference markers
        :raises RefExError: If a single-ref match has neither a book code nor
            an 'i.V.m.' continuation.
        """

        if self.law_book_context is not None:
            # Extraction with context available is done in another method
            return self.extract_law_ref_markers_with_context(content)

        # Init
        markers = []

        # Replace special characters if working with html
        if is_html:
            section_sign = '&#167;'
            self.word_delimiter = '\s|\.|,|;|:|!|\?|\(|\)|\[|\]|&#8221;|\&#8216;|\&#8217;|&#60;|&#62;|&#38;|&rdquo;|\&lsquo;|\&rsquo;|&lt;|&gt;|&amp;|"|\'|<|>|&'
        else:
            section_sign = '§'
            self.word_delimiter = '\s|\.|,|;|:|!|\?|\(|\)|\[|\]|"|\'|<|>|&'

        # Book code should be followed by a word separator, e.g. space.
        book_look_ahead = '(?=' + self.word_delimiter + ')'

        # Single ref
        book_pattern = self.get_law_book_ref_regex(self.get_law_book_codes())

        # Tokens that may appear between section number and book code
        any_content = '([0-9]{1,5}|\.|[a-z]|[IXV]{1,3}|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|und|bis|,|;|\s)*'

        # Multi refs, e.g. '§§ 1, 2 und 3 ABC'
        multi_pattern = section_sign + section_sign + ' (\s|[0-9]+(\.{,1})|[a-z]|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|f\.|ff\.|und|bis|\,|;|\s' + book_pattern + ')+\s(' + book_pattern + ')' + book_look_ahead

        for marker_match in re.finditer(re.compile(multi_pattern),
                                        content):  # All matches
            marker_text = marker_match.group(0)
            refs = []

            logger.debug('Multi Match with: %s', marker_text)

            # Books by position in text
            book_positions = {
            }  # Can we ensure that book_position is in order?
            for book_match in re.finditer(book_pattern, marker_text):
                book_positions[book_match.start()] = book_match.group(0)

            # We cannot work without knowing the book
            # (was `< 0` before, which could never trigger)
            if len(book_positions) == 0:
                logger.error('No book found in marker text: %s', marker_text)
                continue

            # Extract references from marker text
            # - find for <separator §§|,|..> + <section>
            # - ignore Abs, Nr, ...
            # - corresponding book is the closest to right
            a = '([0-9]+)\s(?=bis|und)'
            b = '([0-9]+)\s?[a-z]'
            c = '([0-9]+)'
            pattern = '(?P<sep>' + section_sign + section_sign + '|,|;|und|bis)\s?(?P<sect>(' + a + '|' + b + '|' + c + '))'

            for ref_match in re.finditer(re.compile(pattern), marker_text):
                sect = ref_match.group('sect')

                logger.debug('Found ref: %s', ref_match.group())

                if len(book_positions) == 1:
                    book = next(iter(book_positions.values()))
                else:
                    book = None
                    pos = ref_match.start()

                    for bp in book_positions:
                        if bp > pos:
                            # Take the first book that is right to section position
                            book = book_positions[bp]
                            break

                if book is None:
                    logger.error('No book after reference found: %s - %s',
                                 ref_match.group(0), marker_text)
                    continue

                # Check for 'between' (range sections)
                if ref_match.group('sep') == 'bis' and len(refs) > 0:
                    from_sect = refs[-1].section  # last section

                    # Both sections should be integers (no a-z sections)
                    if sect.isdigit() and from_sect.isdigit():
                        for between_sect in range(
                                int(from_sect) + 1, int(sect)):
                            # Add the in-between sections to the queue
                            refs.append(
                                Ref.init_law(book=book,
                                             section=str(between_sect)))

                refs.append(Ref.init_law(book=book, section=sect))

            # Prepare marker
            marker = RefMarker(text=marker_text,
                               start=marker_match.start(),
                               end=marker_match.end())
            marker.set_uuid()
            marker.set_references(refs)

            # Check if actual references were found in marker text
            if len(refs) > 0:
                markers.append(marker)

                # Update content to avoid double matching
                content = marker.replace_content_with_mask(content)
            else:
                logger.warning('No references found in marker: %s ',
                               marker_text)

        # Single refs
        sect_pattern = '(?P<sect>([0-9]+)(\s?[a-z]?))'
        patterns = [
            # § 3 BGB, § 3d BGB, § 83 d BGB
            section_sign + ' ' + sect_pattern + ' (?P<book>' + book_pattern +
            ')' + book_look_ahead,
            # Abs OR Nr
            # § 42 Abs. 1 Alt. 1 VwGO
            section_sign + ' ' + sect_pattern +
            ' Abs. ([0-9]+) Alt. ([0-9]+) (?P<book>' + book_pattern + ')' +
            book_look_ahead,
            section_sign + ' (?P<sect>([0-9]+)(\s?[a-z]?)) ' + any_content +
            ' (?P<book>(' + book_pattern + '))' + book_look_ahead,
            # '§ 123 ... i.V.m.' -- the book is supplied by a later match
            section_sign + ' (?P<sect>([0-9]+)(\s?[a-z]?)) ' + any_content +
            ' (?P<next_book>(i\.V\.m\.|iVm))' + book_look_ahead,
        ]

        markers_waiting_for_book = []  # type: List[RefMarker]

        for pattern in patterns:  # Iterate over all patterns

            for marker_match in re.finditer(re.compile(pattern),
                                            content):  # All matches
                marker_text = marker_match.group(0)
                if 'book' in marker_match.groupdict():
                    book = Ref.clean_book(marker_match.group('book'))
                else:
                    book = None

                ref = Ref.init_law(section=marker_match.group('sect'),
                                   book=None)

                marker = RefMarker(text=marker_text,
                                   start=marker_match.start(),
                                   end=marker_match.end())
                marker.set_uuid()

                # Has this marker a book?
                if book is not None:
                    ref.book = book

                    marker.set_references([ref])

                    # Update content to avoid double matching
                    content = marker.replace_content_with_mask(content)

                    markers.append(marker)

                    # Resolve waiting 'i.V.m.' markers with this book
                    for waiting in markers_waiting_for_book:
                        if len(waiting.references) == 1:
                            waiting.references[0].book = book

                            content = waiting.replace_content_with_mask(
                                content)

                            markers.append(waiting)
                    markers_waiting_for_book = []
                else:
                    if marker_match.group('next_book') is not None:
                        marker.set_references([ref])
                        markers_waiting_for_book.append(marker)
                    else:
                        raise RefExError('next_book and book are None')

        if len(markers_waiting_for_book) > 0:
            logger.warning('Marker could not be assign to book: %s',
                           markers_waiting_for_book)

        # TODO Art GG

        return markers
    def extract_law_ref_markers_with_context(self, content):
        """
        With context = citing law book is known

        § 343 der Zivilprozessordnung
        :param content:
        :return:
        """
        markers = []

        book_code = self.law_book_context
        search_text = str(content)

        def expand_sections(match):
            # '§§ 664 bis 670' -> ['664', '665', ..., '670']
            first = int(match.group(1))
            last = int(match.group(3))
            return [str(number) for number in range(first, last + 1)]

        def expand_books(match):
            # One book entry per expanded section
            first = int(match.group(1))
            last = int(match.group(3))
            return [book_code] * (last + 1 - first)

        patterns = [
            # §§ 664 bis 670
            {
                'pattern': '§§ ([0-9]+) (bis|und) ([0-9]+)',
                'book': expand_books,
                'sect': expand_sections
            },
            # Anlage 3
            {
                'pattern': 'Anlage ([0-9]+)',
                'book': lambda match: book_code,
                'sect': lambda match: 'anlage-%i' % int(match.group(1))
            },

            # § 1
            {
                'pattern':
                '§ ([0-9]+)(?:\s(Abs\.|Absatz)\s([0-9]+))?(?:\sSatz\s([0-9]+))?',
                'book': lambda match: book_code,
                'sect': lambda match: match.group(1)
            },
        ]

        for spec in patterns:
            for m in re.finditer(spec['pattern'], search_text):
                # Build refs with the pattern's extractor functions
                books = spec['book'](m)
                sects = spec['sect'](m)

                if isinstance(books, str):
                    # Single ref in this marker
                    ref_ids = [
                        Ref(ref_type=RefType.LAW, book=books, section=sects)
                    ]
                else:
                    # Multiple refs in a single marker
                    ref_ids = [
                        Ref(ref_type=RefType.LAW, book=one_book, section=one_sect)
                        for one_book, one_sect in zip(books, sects)
                    ]

                marker = RefMarker(text=m.group(0),
                                   start=m.start(),
                                   end=m.end())
                marker.set_uuid()
                marker.set_references(ref_ids)
                markers.append(marker)

                # Mask the matched span so later patterns cannot re-match it
                mask = '_' * (m.end() - m.start())
                search_text = search_text[:m.start()] + mask \
                              + search_text[m.end():]

        return markers