def test_replace_content(self):
    """replace_content() wraps the marker span in [ref=<uuid>]...[/ref] tags."""
    marker_text = '§ 123 ABC'
    ref_marker = RefMarker(marker_text, 0, len(marker_text))
    ref_marker.uuid = 'foo'
    ref_marker.references = [
        Ref(ref_type=RefType.LAW, book='abc', section=123)
    ]
    full_content = marker_text + ' and other text...'
    # replace_content returns a tuple; index 0 is the rewritten content
    result = ref_marker.replace_content(full_content, 0)
    self.assertEqual('[ref=foo]§ 123 ABC[/ref] and other text...',
                     result[0],
                     'Invalid content')
def extract_law_ref_markers(self, content: str) -> List[RefMarker]:
    """Extract law-reference markers from the given content.

    Each regex match in *content* is classified as a single reference
    ("Art. ..." / "§ ...") or a multi reference ("§§ ...") and converted
    into a ``RefMarker`` carrying its extracted ``Ref`` objects.

    :param content: Plain-text or even HTML
    :return: List of reference markers found in ``content``
    :raises RefExError: If a match starts with an unsupported prefix.
    """
    # Lazy %-style args: the message is only formatted if DEBUG is enabled
    logger.debug("Extracting from: %s", content)

    if self.law_book_context is not None:
        # Extraction with context available is done in another method
        return self.extract_law_ref_markers_with_context(content)

    markers = []
    # Hoisted: the same book-code list is needed for the regex and both handlers
    book_codes = self.get_law_book_codes()

    # Handle each match separately
    for marker_match in re.finditer(self.get_law_ref_regex(book_codes), content):
        marker_text = str(marker_match.group(0)).strip()
        references: List[Ref] = []

        # Handle single and multi refs separately.
        # Note: "§§ " cannot match the single-ref pattern because that one
        # requires whitespace directly after the first "§".
        if re.match(r"^(Art(\.{,1})|§)\s", marker_text):
            references = self.handle_single_law_ref(book_codes, marker_text,
                                                    references)
        elif re.match(r"^§§\s", marker_text):
            references = self.handle_multiple_law_refs(book_codes, marker_text,
                                                       references)
        else:
            raise RefExError("Unsupported ref beginning: %s" % marker_text)

        marker = RefMarker(
            text=marker_text,
            start=marker_match.start(),
            end=marker_match.end(),
            line=0,
        )  # TODO determine actual line number
        marker.set_uuid()
        marker.set_references(references)

        markers.append(marker)

    return markers
def test_replace_content(self):
    """Marker text inside content is wrapped as [ref=<uuid>]...[/ref]."""
    snippet = "§ 123 ABC"
    m = RefMarker(snippet, 0, len(snippet))
    m.uuid = "foo"
    m.references = [Ref(ref_type=RefType.LAW, book="abc", section=123)]
    self.assertEqual(
        "[ref=foo]§ 123 ABC[/ref] and other text...",
        m.replace_content(snippet + " and other text...", 0)[0],
        "Invalid content",
    )
def extract_case_ref_markers(self, content: str) -> List[RefMarker]:
    """Extract references to court decisions (case law) from content.

    A case reference is detected via its file number (Aktenzeichen); the
    issuing court is then searched for in the surrounding text, e.g.:

        BVerwG, Urteil vom 20. Februar 2013, - 10 C 23.12 -
        OVG Nordrhein-Westfalen, Urteil vom 21.2.2017, - 14 A 2316/16.A -
        VG Minden, Urteil vom 22.12.2016, - 1 K 5137/16.A -
        EuGH Urteil vom 25.07.2002 - C-459/99 -

    TODO all court codes + case types
    TODO sentence tokenizer; strip "special endings" and dates first

    :param content: Plain-text content to search
    :return: List of reference markers (one per file number found)
    """
    refs = []

    # TODO More intelligent by searching only in sentences.
    # Find all file numbers
    for match in re.finditer(self.get_file_number_regex(), content):  # type: Match
        court = None

        # Search in progressively larger surroundings for court names
        for diff in [100, 200, 500]:
            # TODO maybe search left first, then to the right
            start = max(0, match.start(0) - diff)
            end = min(len(content), match.end(0) + diff)
            surrounding = content[start:end]

            # File number position within the surrounding window
            fn_pos = match.start(0) - start

            # distance -> court match; first occurrence wins on a tie
            candidates = collections.OrderedDict()
            for court_match in re.finditer(self.get_court_name_regex(),
                                           surrounding):
                # Candidate position = center of the court-name match
                candidate_pos = round(
                    (court_match.start(0) + court_match.end(0)) / 2)
                # Distance to file number
                candidate_dist = abs(fn_pos - candidate_pos)

                if candidate_dist not in candidates:
                    candidates[candidate_dist] = court_match
                else:
                    logger.warning(
                        'Court candidate with same distance exist already: %s'
                        % court_match)

            # Court is the candidate with smallest distance to file number.
            # Bug fix: previously the first candidate in text order was taken
            # (next(iter(...))), not the closest one.
            if len(candidates) > 0:
                court = candidates[min(candidates)].group('court')
                # Stop searching if court was found with this range
                break

        if court is None:
            court = ''

        file_number = match.group(0)
        ref_ids = [
            Ref(ref_type=RefType.CASE, court=court, file_number=file_number)
            # TODO date field
        ]
        # TODO maintain order for case+law refs
        marker = RefMarker(text=file_number,
                           start=match.start(0),
                           end=match.end(0),
                           line=0)  # TODO line number
        marker.set_uuid()
        marker.set_references(ref_ids)

        refs.append(marker)

    return refs
def extract_law_ref_markers(self, content: str,
                            is_html: bool = False) -> List[RefMarker]:
    """Extract law-reference markers (divide and conquer strategy).

    - only simple regex
    - matched spans are replaced with a mask (``replace_content_with_mask``)
      in ``content`` to avoid multiple matches on the same text

    Multi references ("§§ 1, 2 und 3 BGB") are processed first, then single
    references ("§ 3 BGB", "§ 42 Abs. 1 ... VwGO"). A single ref followed by
    "i.V.m."/"iVm" has no book yet and is queued until the next ref with a
    book resolves it.

    :param content: Plain-text or even HTML
    :param is_html: Use HTML-aware section sign / word delimiters
    :return: List of reference markers
    """
    if self.law_book_context is not None:
        # Extraction with context available is done in another method
        return self.extract_law_ref_markers_with_context(content)

    markers = []

    # Replace special characters if working with html
    if is_html:
        sectionSign = '§'
        self.word_delimiter = '\s|\.|,|;|:|!|\?|\(|\)|\[|\]|”|\‘|\’|<|>|&|”|\‘|\’|<|>|&|"|\'|<|>|&'
    else:
        sectionSign = '§'
        self.word_delimiter = '\s|\.|,|;|:|!|\?|\(|\)|\[|\]|"|\'|<|>|&'

    # Book code should be followed by a word separator, e.g. space.
    book_look_ahead = '(?=' + self.word_delimiter + ')'

    # Single ref
    book_pattern = self.get_law_book_ref_regex(self.get_law_book_codes())

    # Any content allowed between section number and book code
    # (dead duplicate assignment removed; only this variant was ever used)
    any_content = '([0-9]{1,5}|\.|[a-z]|[IXV]{1,3}|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|und|bis|,|;|\s)*'

    multi_pattern = sectionSign + sectionSign + ' (\s|[0-9]+(\.{,1})|[a-z]|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|f\.|ff\.|und|bis|\,|;|\s' + book_pattern + ')+\s(' + book_pattern + ')' + book_look_ahead

    for marker_match in re.finditer(re.compile(multi_pattern), content):  # All matches
        marker_text = marker_match.group(0)
        refs = []

        logger.debug('Multi Match with: %s' % marker_text)

        # Books by position in text
        book_positions = {}  # Can we ensure that book_position is in order?
        for book_match in re.finditer(book_pattern, marker_text):
            book_positions[book_match.start()] = book_match.group(0)

        # We cannot work without knowing the book.
        # Bug fix: was `< 0`, which is unreachable since len() >= 0.
        if len(book_positions) == 0:
            logger.error('No book found in marker text: %s' % marker_text)
            continue

        # Extract references from marker text
        # - find <separator §§|,|..> + <section>
        # - ignore Abs, Nr, ...
        # - corresponding book is the closest to the right
        a = '([0-9]+)\s(?=bis|und)'
        b = '([0-9]+)\s?[a-z]'
        c = '([0-9]+)'
        pattern = '(?P<sep>' + sectionSign + sectionSign + '|,|;|und|bis)\s?(?P<sect>(' + a + '|' + b + '|' + c + '))'

        for ref_match in re.finditer(re.compile(pattern), marker_text):
            sect = ref_match.group('sect')

            logger.debug('Found ref: %s' % ref_match.group())

            if len(book_positions) == 1:
                book = next(iter(book_positions.values()))
            else:
                book = None
                pos = ref_match.start()
                for bp in book_positions:
                    if bp > pos:
                        # Take the first book that is right to section position
                        book = book_positions[bp]
                        break

            if book is None:
                logger.error('No book after reference found: %s - %s'
                             % (ref_match.group(0), marker_text))
                continue

            # Check for 'between' (range sections)
            if ref_match.group('sep') == 'bis' and len(refs) > 0:
                from_sect = refs[-1].section  # last section

                # Both sections should be integers (no a-z sections)
                if sect.isdigit() and from_sect.isdigit():
                    for between_sect in range(int(from_sect) + 1, int(sect)):
                        # Add to queue
                        refs.append(Ref.init_law(book=book,
                                                 section=str(between_sect)))

            refs.append(Ref.init_law(book=book, section=sect))

        # Prepare marker
        marker = RefMarker(text=marker_text,
                           start=marker_match.start(),
                           end=marker_match.end())
        marker.set_uuid()
        marker.set_references(refs)

        # Check if actual references were found in marker text
        if len(refs) > 0:
            markers.append(marker)
            # Update content to avoid double matching
            content = marker.replace_content_with_mask(content)
        else:
            logger.warning('No references found in marker: %s ' % marker_text)

    # Single refs
    sect_pattern = '(?P<sect>([0-9]+)(\s?[a-z]?))'
    patterns = [
        # § 3 BGB, § 3d BGB, § 83 d BGB
        sectionSign + ' ' + sect_pattern + ' (?P<book>' + book_pattern + ')' + book_look_ahead,
        # Abs OR Nr: § 42 Abs. 1 Alt. 1 VwGO
        sectionSign + ' ' + sect_pattern + ' Abs. ([0-9]+) Alt. ([0-9]+) (?P<book>' + book_pattern + ')' + book_look_ahead,
        sectionSign + ' (?P<sect>([0-9]+)(\s?[a-z]?)) ' + any_content + ' (?P<book>(' + book_pattern + '))' + book_look_ahead,
        # "i.V.m."/"iVm": book is supplied by a later match
        sectionSign + ' (?P<sect>([0-9]+)(\s?[a-z]?)) ' + any_content + ' (?P<next_book>(i\.V\.m\.|iVm))' + book_look_ahead,
    ]

    markers_waiting_for_book = []  # type: List[RefMarker]

    for pattern in patterns:  # Iterate over all patterns
        # logger.debug('Pattern: %s' % pattern)
        for marker_match in re.finditer(re.compile(pattern), content):  # All matches
            marker_text = marker_match.group(0)

            if 'book' in marker_match.groupdict():
                book = Ref.clean_book(marker_match.group('book'))
            else:
                book = None

            ref = Ref.init_law(section=marker_match.group('sect'), book=None)
            marker = RefMarker(text=marker_text,
                               start=marker_match.start(),
                               end=marker_match.end())
            marker.set_uuid()

            # Has this marker a book?
            if book is not None:
                ref.book = book
                marker.set_references([ref])

                # Update content to avoid double matching
                content = marker.replace_content_with_mask(content)
                markers.append(marker)

                # Resolve waiting markers with the book just found
                for waiting in markers_waiting_for_book:
                    if len(waiting.references) == 1:
                        waiting.references[0].book = book
                        content = waiting.replace_content_with_mask(content)
                        markers.append(waiting)

                markers_waiting_for_book = []
            else:
                if marker_match.group('next_book') is not None:
                    marker.set_references([ref])
                    markers_waiting_for_book.append(marker)
                else:
                    raise RefExError('next_book and book are None')

    if len(markers_waiting_for_book) > 0:
        logger.warning('Marker could not be assign to book: %s'
                       % markers_waiting_for_book)

    # TODO Art GG

    return markers
def extract_law_ref_markers_with_context(self, content):
    """
    With context = citing law book is known, so bare section references
    can be resolved, e.g.:

        § 343 der Zivilprozessordnung

    Matches are masked out of a working copy of the content (``search_text``)
    so later patterns cannot re-match the same span.

    :param content: Plain-text content to search
    :return: List of RefMarker objects (offsets refer to original content)
    """
    markers = []
    book_code = self.law_book_context

    # content = content.replace('§', '§')

    # Working copy that gets masked as matches are consumed
    search_text = str(content)

    def multi_sect(match):
        # Expand "§§ <start> (bis|und) <end>" into every section in between.
        # NOTE(review): 'und' ("and") is range-expanded exactly like 'bis'
        # ("to") — "§§ 3 und 5" yields 3, 4, 5; confirm this is intended.
        start = int(match.group(1))
        end = int(match.group(3)) + 1

        sects = []
        for sect in range(start, end):
            sects.append(str(sect))

        return sects

    def multi_book(match):
        # One book code per expanded section (same length as multi_sect)
        start = int(match.group(1))
        end = int(match.group(3)) + 1

        return [book_code] * (end - start)

    # Each entry: regex + callables producing book(s) and section(s) from
    # the match. A str result means a single ref; a list means multiple.
    patterns = [
        # §§ 664 bis 670
        {
            'pattern': '§§ ([0-9]+) (bis|und) ([0-9]+)',
            'book': multi_book,
            'sect': multi_sect
        },
        # Anlage 3
        {
            'pattern': 'Anlage ([0-9]+)',
            'book': lambda match: book_code,
            'sect': lambda match: 'anlage-%i' % int(match.group(1))
        },
        # § 1
        {
            'pattern': '§ ([0-9]+)(?:\s(Abs\.|Absatz)\s([0-9]+))?(?:\sSatz\s([0-9]+))?',
            'book': lambda match: book_code,
            'sect': lambda match: match.group(1)
        },
    ]

    for p in patterns:
        regex = p['pattern']

        res = re.finditer(regex, search_text)  # flags

        for ref_m in res:
            ref_text = ref_m.group(0)

            # Build ref with lambda functions
            ref_ids = []
            books = p['book'](ref_m)
            sects = p['sect'](ref_m)

            # Handle multiple ref ids in a single marker
            if not isinstance(books, str):
                for key, book in enumerate(books):
                    ref_ids.append(
                        Ref(ref_type=RefType.LAW,
                            book=book,
                            section=sects[key]))
            else:
                ref_ids.append(
                    Ref(ref_type=RefType.LAW, book=books, section=sects))

            ref = RefMarker(text=ref_text, start=ref_m.start(), end=ref_m.end())
            ref.set_uuid()
            ref.set_references(ref_ids)

            markers.append(ref)

            # Remove from search content to avoid duplicate matches
            # (mask with '_' so offsets into the original content stay valid)
            search_text = search_text[:ref_m.start()] + ('_' * (ref_m.end() - ref_m.start())) \
                + search_text[ref_m.end():]

            # print('-------')

    # Sort by start and replace markers
    # markers.sort(key=lambda r: r.start, reverse=False)
    # marker_offset = 0
    # for key, ref in enumerate(markers):
    #     content, marker_offset = ref.replace_content(content, marker_offset)

    return markers