Esempio n. 1
0
    def find_court_by_key_column(
            self, phrase: LineOrPhrase, phrase_finder: PhraseFinder,
            column: str) -> Tuple[MatchFound, List[PhraseMatch]]:
        """
        Look up court records by the first key phrase found in ``phrase.text``.

        :param phrase: the phrase being searched; ``phrase.text`` is scanned and
            ``phrase.start`` is added to the offsets of the found key
        :param phrase_finder: finder whose ``find_word`` is run on the phrase text
        :param column: name of the ``self.courts`` column compared with the key
        :return: ``(match, found_substrings)``; ``None`` (despite the annotation)
            when the finder reports nothing or when no row of ``self.courts``
            holds the found key in ``column``
        """
        hits = phrase_finder.find_word(phrase.text, True)
        if len(hits) < 1:
            return None

        # Only the first reported hit takes part in the lookup.
        first_hit = hits[0]
        rows = self.courts.loc[self.courts[column] == first_hit[0]]
        if len(rows) < 1:
            return None

        # Hit offsets are relative to the phrase, so shift them by phrase.start.
        entry_start = phrase.start + first_hit[1]
        entry_end = phrase.start + first_hit[2]
        return MatchFound(rows, entry_start, entry_end), hits
    def test_abbreviation(self):
        """Check the number of matches PhraseFinder reports for a dotted abbreviation."""
        sample = "In C.D. Ill. we should find"

        # One registered phrase: exactly one match is expected.
        single_phrase_finder = PhraseFinder(['C.D. Ill.'])
        found = single_phrase_finder.find_word(sample, True)
        self.assertEqual(1, len(found))

        # Four registered phrases, three matches expected
        # (presumably 'sh' is not reported because it only occurs inside 'should').
        multi_phrase_finder = PhraseFinder(['C.D. Ill.', 'sh', 'should', 'find'])
        found = multi_phrase_finder.find_word(sample, True)
        self.assertEqual(3, len(found))
class CourtCitationsParser:
    """
    Finds court citation annotations in a text.

    Reference table:
    https://docs.google.com/spreadsheets/d/1_-Hnr46s8JmTYIFSkcI01gbwqwjnVDoAV1dUv3Bz5fM/edit#gid=0

    A citation could be one of the following:

    (lalala; DATE; vom) or (lalala; DATE; Beschluss)

    or § lalala Court Name (abbreviation)

    The text is handled as two kinds of fragments:

    - text in parentheses that contains a run of digits/dots
      (``reg_cite_chunk``) is split by ';' and every part is inspected;
    - the text between such parentheses is split into chunks that start with
      one of the known registry abbreviations (``registries``).

    A chunk becomes an annotation when it contains a trigger word, a registry
    abbreviation or a date.
    """
    # "(<non-digits><2+ digits or dots><anything but ')'>)"
    reg_cite_chunk = re.compile(r"\([^0-9\)]+[0-9\.]{2,}[^\)]+\)", re.UNICODE)
    reg_trigger_words = re.compile('Beschluss|vom', re.UNICODE | re.IGNORECASE)
    # characters that terminate a registry reference
    reg_token_end = re.compile(r"[;,]")

    # registry abbreviation -> full name used as the annotation's name
    registries = {
        'BStBl': 'Bundessteuerblatt (sonstige gebräuchliche Verwendung)',
        'BFH': 'Deutschland, Rechtswesen: Bundesfinanzhof',
        'BFHE': 'Sammlung der Entscheidungen des BFH',
        'GmS-OGB': 'Beschluss des Gemeinsamen Senats der obersten Gerichtshöfe des Bundes',
        'BVerwGE': 'Entscheidungen des Bundesverwaltungsgerichts',
        'GrS': 'Beschluss des Großen Senats des BFH',
        'BFH-Urteile': 'Bundesfinanzhof Urteile',
        'BFH-Beschlüsse': 'Bundesfinanzhof Beschlüsse',
        'DstR': 'Deutsches Steuerrecht - DStR',
        'KStG': 'Körperschaftsteuergesetz',
    }

    # Built once, when the class body is executed.
    registry_finder = PhraseFinder(list(registries))
    # Abbreviations are escaped (they are literals, not patterns) and ordered
    # longest first so that e.g. 'BFH-Urteile' is not cut short by 'BFH'.
    reg_split_by_registry = re.compile(
        "|".join(re.escape(key)
                 for key in sorted(registries, key=len, reverse=True)))

    def __init__(self):
        self.items = []  # List[CourtCitationAnnotation]
        # The locale is supplied per call of parse().
        self.locale = None

    def parse(self, text: str, locale: str = None) -> List[CourtCitationAnnotation]:
        """
        Find all court citations in ``text``.

        :param text: the text to process
        :param locale: value stored in ``locale`` of every produced annotation
        :return: the annotations found, in processing order; the same list is
            kept in ``self.items`` until the next call
        """
        self.items = []
        self.locale = locale
        self.find_citations_in_embraced_text(text)
        return self.items

    def find_citations_in_embraced_text(self, text: str) -> None:
        """
        Walk through the parenthesised citation chunks of ``text``, processing
        both the chunks and the plain text between them. Results are appended
        to ``self.items``.
        """
        fragment_start = 0
        for embraced_text in CourtCitationsParser.reg_cite_chunk.finditer(text):
            start = embraced_text.start()

            # process text before braces
            self.split_chunk_and_find_citations(text[fragment_start:start],
                                                fragment_start)

            # process text in braces
            self.process_chunks_in_embraced_text(embraced_text, start)

            # end() already points right behind the closing brace
            fragment_start = embraced_text.end()

        # the tail after the last braces, up to and including the last character
        self.split_chunk_and_find_citations(text[fragment_start:], fragment_start)

    def process_chunks_in_embraced_text(self, embraced_text: "re.Match",
                                        start) -> None:
        """
        Inspect every ';'-separated part of a parenthesised chunk.

        :param embraced_text: regex match object covering "(...)"
        :param start: offset of the match within the whole text
        """
        for part in embraced_text.group().split(';'):
            self.get_detail_from_chunk(part, start)
            start += len(part) + 1  # +1 for the ';' removed by split()

    def split_chunk_and_find_citations(self, text: str, start: int) -> None:
        """
        Split plain text into registry-led chunks and inspect each of them.

        :param text: a fragment of the whole text
        :param start: offset of the fragment within the whole text
        """
        for chunk_text, chunk_offset in self.split_text_by_keywords(text):
            self.get_detail_from_chunk(chunk_text, chunk_offset + start)

    def get_detail_from_chunk(self, chunk_text: str, chunk_start: int) -> None:
        """
        Append an annotation to ``self.items`` if the chunk contains a trigger
        word, a registry abbreviation or a date.

        :param chunk_text: the chunk, possibly padded with braces or whitespace
        :param chunk_start: offset of ``chunk_text`` within the whole text
        """
        # NB: not a raw string - '\t' has to be the TAB character, otherwise
        # backslashes and the letter 't' would be stripped from the chunk.
        chunk_body = chunk_text.strip('() \t')
        dates = self.get_dates_from_text(chunk_body)
        found_registries = self.get_registries_from_text(chunk_body)
        triggers = CourtCitationsParser.reg_trigger_words.search(chunk_body)

        if not triggers and not found_registries and not dates:
            return

        start = chunk_start + chunk_text.find(chunk_body)
        end = start + len(chunk_body)
        ant = CourtCitationAnnotation(name=chunk_body,
                                      coords=(start, end),
                                      text=chunk_body,
                                      locale=self.locale)
        ant.locale = self.locale
        if found_registries:
            # the first token is the most probable registry
            best = found_registries[0]
            ant.name = CourtCitationsParser.registries[best.value]
            ant.short_name = self.get_reference_from_registry(best, chunk_body)
        self.items.append(ant)

    def get_reference_from_registry(self, registry: PossibleToken,
                                    chunk_body: str) -> str:
        """
        Return the reference that starts at the registry abbreviation and runs
        up to the next ';' or ',' (or to the end of the chunk if there is none).

        :param registry: token whose ``coords[0]`` is the abbreviation's offset
            within ``chunk_body``
        :param chunk_body: the stripped chunk text
        """
        start = registry.coords[0]
        # Searching with ``pos`` keeps the match offset absolute within
        # chunk_body, so it can be used directly as the slice end.
        end_match = CourtCitationsParser.reg_token_end.search(chunk_body, start)
        end = end_match.start() if end_match else len(chunk_body)
        return chunk_body[start:end].strip(' \t.,;()')

    def get_registries_from_text(self, text: str) -> List[PossibleToken]:
        """
        Find registry abbreviations in ``text``.

        :return: 'registry' tokens, the most probable first and, among equally
            probable ones, the leftmost first. Case-sensitive hits get the
            probability 100, case-insensitive hits get 50 (the case-insensitive
            pass presumably reports the exact-case hits once more).
        """
        finder = CourtCitationsParser.registry_finder
        reg_names = [(m, 100) for m in finder.find_word(text, ignore_case=False)]
        # if the case is not the same, the probability is 50%
        reg_names += [(m, 50) for m in finder.find_word(text, ignore_case=True)]
        # A tuple key keeps the probability decisive for any offset.
        reg_names.sort(key=lambda n: (-n[1], n[0][1]))

        return [PossibleToken('registry', match[0], (match[1], match[2]), prob)
                for match, prob in reg_names]

    def get_dates_from_text(self, text: str) -> List[PossibleToken]:
        """
        Find dates in ``text``.

        :return: 'date' tokens with the probability 100 for the dates returned
            by ``get_dates``; if there are none - tokens with the probability 50
            for standalone years
        """
        try:
            date_ents = list(get_dates(text))
        except TypeError:
            # best effort: treat a failed date search as "no dates found"
            date_ents = []
        date_ents.sort(key=lambda d: d['location_start'])

        tokens = [PossibleToken('date', d['value'],
                                (d['location_start'], d['location_end']), 100)
                  for d in date_ents]
        if tokens:
            return tokens

        # try get years only
        return [PossibleToken('date', str(year[0]), (year[1], year[2]), 50)
                for year in
                year_parser.year_parser.get_years_with_coords_from_string(text)]

    def split_text_by_keywords(self, text: str) -> List[Tuple[str, int]]:
        """
        Split ``text`` into chunks, each starting with a registry abbreviation.

        A chunk ends at the first ';' or ',' after the abbreviation, at the
        next abbreviation or at the end of the text - whichever comes first.

        :return: list of (chunk text, chunk offset within ``text``)
        """
        matches = list(CourtCitationsParser.reg_split_by_registry.finditer(text))
        chunks = []
        for i, match in enumerate(matches):
            is_last = i == len(matches) - 1
            ending = len(text) if is_last else matches[i + 1].start()
            # pos / endpos keep the found offset absolute within ``text``
            phrase_break = CourtCitationsParser.reg_token_end.search(
                text, match.end(), ending)
            if phrase_break:
                ending = phrase_break.start()
            chunks.append((text[match.start():ending], match.start()))
        return chunks
Esempio n. 4
0
    def __init__(self, ptrs: ParserInitParams):
        """
        :param ptrs.court_pattern_checker: a compiled regex or None, the parser skips the phrase
               if the pattern doesn't match the phrase
        :param ptrs.column_names['type']: "Court Type", e.g. 'Federal District Court'
        :param ptrs.column_names['name']: "Court Name", e.g. 'Southern Georgia District Court'
        :param ptrs.column_names['jurisdiction']: "Jurisdiction", e.g. 'Federal'
        :param ptrs.column_names['alias']: "Alias", e.g. 'C.D. Cal'; an empty value or None
               turns the search by alias off
        :param ptrs.dataframe_paths: like ['data/us_courts.csv', ...]
        :param ptrs.split_ptrs: phrase splitting processor parameters, see LineProcessor class
        :param ptrs.key_word_preproc_func: a function used to pre-process column values used in text search

        dataframe_paths is a collection of *.CSV files that contain the data like:

        | Jurisdiction || Court Type         || Court Name               || ... |
        | Federal      || Verfassungsgericht || Bundesverfassungsgericht || ... |

        The column 'Court Name' (you may provide another column name instead of Court Name
        in ptrs.column_names['name']) should contain unique values that precisely identify
        each of the courts given.

        The columns 'Court Type' (ptrs.column_names['type']) and 'Jurisdiction'
        (ptrs.column_names['jurisdiction']) together may or may not precisely identify the
        court given.

        At least this parser can identify the court's type and return the annotation that
        specifies neither the court's name nor its jurisdiction.

        The court_pattern_checker parameter speeds up the parsing process:
        - the whole text or the line would be skipped if it doesn't match the court_pattern_checker
        E.g., you can pass re.compile('court', re.IGNORECASE) for searching courts' annotations
        for the En locale

        The split_ptrs specify how the parser splits the text into phrases.
        Each phrase can contain zero or one court annotations. See LineProcessor class.
        For a courts parser phrase bounds usually include punctuation (.,;!?) and conjunctions
        (and, or) or (und, oder)

        The example function for key_word_preproc_func is:
        def preproc_func(text):
             return re.sub('e$', '[e]?', text)
        """

        self.phrase_match_pattern = ptrs.court_pattern_checker
        self.court_type_column = ptrs.column_names['type']
        self.court_name_column = ptrs.column_names['name']
        self.court_alias_column = ptrs.column_names['alias']
        self.jurisdiction_column = ptrs.column_names['jurisdiction']
        self.proc = LineProcessor()
        self.phrase_split_ptrs = ptrs.split_ptrs
        self.annotations = []
        self.courts = None
        self.load_courts(ptrs.dataframe_paths)

        preproc = ptrs.key_word_preproc_func

        # unique columns
        self.finder_court_name = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.court_name_column]),
            preproc)
        # The alias column is optional: a truthiness check treats both an empty
        # name and None as "no alias column" instead of failing on len(None).
        self.finder_court_alias = None
        if self.court_alias_column:
            self.finder_court_alias = PhraseFinder(
                UniversalCourtsParser.get_unique_col_values(
                    self.courts[self.court_alias_column]),
                preproc)

        # non-unique columns
        self.finder_court_type = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.court_type_column]),
            preproc)
        self.finder_jur = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.jurisdiction_column]),
            preproc)
Esempio n. 5
0
class UniversalCourtsParser:
    """
    The class describes a "constructor" for building locale (and region) specific
    parsers that find references to courts within the text.

    Use the parse() method to find all references to courts in the
    text provided.
    Each reference is a dictionary with two keys:
    - "attrs" key leads to the "coordinates" (starting and ending characters) of the
      occurrence within the provided text
    - "tags" key leads to another dictionary, which contains:
      - court official name
      - court's jurisdiction ...

    In order to parse the text you are supposed to create your locale (or region) specific
    instance of UniversalCourtsParser. See the constructor below:
    """
    def __init__(self, ptrs: ParserInitParams):
        """
        :param ptrs.court_pattern_checker: a compiled regex or None, the parser skips the phrase
               if the pattern doesn't match the phrase
        :param ptrs.column_names['type']: "Court Type", e.g. 'Federal District Court'
        :param ptrs.column_names['name']: "Court Name", e.g. 'Southern Georgia District Court'
        :param ptrs.column_names['jurisdiction']: "Jurisdiction", e.g. 'Federal'
        :param ptrs.column_names['alias']: "Alias", e.g. 'C.D. Cal'; an empty value or None
               turns the search by alias off
        :param ptrs.dataframe_paths: like ['data/us_courts.csv', ...]
        :param ptrs.split_ptrs: phrase splitting processor parameters, see LineProcessor class
        :param ptrs.key_word_preproc_func: a function used to pre-process column values used in text search

        dataframe_paths is a collection of *.CSV files that contain the data like:

        | Jurisdiction || Court Type         || Court Name               || ... |
        | Federal      || Verfassungsgericht || Bundesverfassungsgericht || ... |

        The column 'Court Name' (you may provide another column name instead of Court Name
        in ptrs.column_names['name']) should contain unique values that precisely identify
        each of the courts given.

        The columns 'Court Type' (ptrs.column_names['type']) and 'Jurisdiction'
        (ptrs.column_names['jurisdiction']) together may or may not precisely identify the
        court given.

        At least this parser can identify the court's type and return the annotation that
        specifies neither the court's name nor its jurisdiction.

        The court_pattern_checker parameter speeds up the parsing process:
        - the whole text or the line would be skipped if it doesn't match the court_pattern_checker
        E.g., you can pass re.compile('court', re.IGNORECASE) for searching courts' annotations
        for the En locale. Flags have to be compiled into the pattern.

        The split_ptrs specify how the parser splits the text into phrases.
        Each phrase can contain zero or one court annotations. See LineProcessor class.
        For a courts parser phrase bounds usually include punctuation (.,;!?) and conjunctions
        (and, or) or (und, oder)

        The example function for key_word_preproc_func is:
        def preproc_func(text):
             return re.sub('e$', '[e]?', text)
        """

        self.phrase_match_pattern = ptrs.court_pattern_checker
        self.court_type_column = ptrs.column_names['type']
        self.court_name_column = ptrs.column_names['name']
        self.court_alias_column = ptrs.column_names['alias']
        self.jurisdiction_column = ptrs.column_names['jurisdiction']
        self.proc = LineProcessor()
        self.phrase_split_ptrs = ptrs.split_ptrs
        self.annotations = []
        self.courts = None
        self.load_courts(ptrs.dataframe_paths)

        preproc = ptrs.key_word_preproc_func

        # unique columns
        self.finder_court_name = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.court_name_column]),
            preproc)
        # The alias column is optional: the truthiness check (the same one
        # load_courts uses) treats both '' and None as "no alias column".
        self.finder_court_alias = None
        if self.court_alias_column:
            self.finder_court_alias = PhraseFinder(
                UniversalCourtsParser.get_unique_col_values(
                    self.courts[self.court_alias_column]),
                preproc)

        # non-unique columns
        self.finder_court_type = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.court_type_column]),
            preproc)
        self.finder_jur = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.jurisdiction_column]),
            preproc)

    def parse(self, text: str) -> List[dict]:
        """
        :param text: the text being processed
        :return: annotations - List[dict]

        Here is an example of the method's call:
        ret = processor.parse("Bei dir läuft, deine Verfassungsgerichtshof des Freistaates Sachsen rauchen Joints vor der Kamera")

        ret[0]['attrs'] = {'start': 14, 'end': 97}
        ret[0]['tags'] = {'Extracted Entity Type': 'court',
            'Extracted Entity Court Name': 'Verfassungsgerichtshof des Freistaates Sachsen',
            'Extracted Entity Court Type': 'Verfassungsgericht',
            'Extracted Entity Court Jurisdiction': 'Sachsen'}
        """
        self.annotations = []

        # aliases are searched in the whole text, regardless of the key word check
        self.find_courts_by_alias_in_whole_text(text)

        # NB: a compiled pattern's search() takes (string, pos, endpos), not
        # flags - passing re.IGNORECASE here would skip the first 2 characters.
        pattern = self.phrase_match_pattern

        # if the whole text doesn't contain the key word (gericht) - skip all the following
        if pattern is not None and pattern.search(text) is None:
            return self.annotations

        for phrase in self.proc.split_text_on_line_with_endings(
                text, self.phrase_split_ptrs):
            # if the phrase doesn't contain the key word (e.g., gericht for deutsche) - skip the phrase
            if pattern is not None and pattern.search(phrase.text) is None:
                continue
            self.find_court_by_any_key(phrase)

        return self.annotations

    def load_courts(self, dataframe_paths: List[str]):
        """
        Read all CSV files into one dataframe stored in ``self.courts``.

        The key columns are read through the ``str`` converter; malformed lines
        are skipped with a warning.

        :param dataframe_paths: paths to the CSV files
        """
        converters = {
            self.court_type_column: str,
            self.court_name_column: str,
            self.jurisdiction_column: str
        }
        if self.court_alias_column:
            converters[self.court_alias_column] = str

        frames = [self._read_courts_csv(path, converters)
                  for path in dataframe_paths]
        self.courts = pd.concat(frames)

    @staticmethod
    def _read_courts_csv(path: str, converters: dict):
        """Read one courts CSV file, skipping malformed lines with a warning."""
        try:
            # pandas >= 1.3
            return pd.read_csv(path,
                               encoding="utf-8",
                               on_bad_lines='warn',
                               converters=converters)
        except TypeError:
            # older pandas doesn't know on_bad_lines; newer pandas has dropped
            # error_bad_lines - so both spellings have to be supported
            return pd.read_csv(path,
                               encoding="utf-8",
                               error_bad_lines=False,
                               converters=converters)

    def find_courts_by_alias_in_whole_text(self, text: str) -> None:
        """Add an annotation for every court alias found in ``text``."""
        if self.finder_court_alias is None:
            return
        alias_column = self.courts[self.court_alias_column]
        for m in self.finder_court_alias.find_word(text):
            # m is (alias, start, end)
            rows = self.courts.loc[alias_column == m[0]]
            self.add_annotation(MatchFound(rows, m[1], m[2]))

    def find_court_by_any_key(self, phrase: LineOrPhrase):
        """
        Search the phrase by court name and by court type + jurisdiction and
        add one annotation for the best of the matches (if there are any).
        """
        candidates = self.find_court_by_name(phrase) + \
            self.find_court_by_type_and_jurisdiction(phrase)
        candidates = [m for m in candidates if m is not None]
        if not candidates:
            return
        # find the best match: the one with the smallest sort key
        candidates.sort(key=lambda m: m.make_sort_key())
        self.add_annotation(candidates[0])

    def find_court_by_name(self, phrase: LineOrPhrase) -> List[MatchFound]:
        """
        :return: a one-item list with the match found by the court name
            or an empty list
        """
        found = self.find_court_by_key_column(phrase, self.finder_court_name,
                                              self.court_name_column)
        if found is None:
            return []

        match, found_substrings = found
        match.court_name = found_substrings[0][0]
        return [match]

    def find_court_by_key_column(
            self, phrase: LineOrPhrase, phrase_finder: PhraseFinder,
            column: str) -> Tuple[MatchFound, List[PhraseMatch]]:
        """
        Look up court records by the first key phrase found in ``phrase.text``.

        :param phrase: the phrase being searched
        :param phrase_finder: finder whose ``find_word`` is run on the phrase text
        :param column: name of the ``self.courts`` column compared with the key
        :return: ``(match, found_substrings)``; ``None`` (despite the annotation)
            when nothing is found in the text or in the ``column``
        """
        found_substrings = phrase_finder.find_word(phrase.text, True)
        if len(found_substrings) == 0:
            return None

        # only the first substring found takes part in the lookup
        first = found_substrings[0]
        subset = self.courts.loc[self.courts[column] == first[0]]
        if len(subset) == 0:
            return None

        match = MatchFound(subset, phrase.start + first[1],
                           phrase.start + first[2])
        return (match, found_substrings)

    def find_court_by_type_and_jurisdiction(
            self, phrase: LineOrPhrase) -> List[MatchFound]:
        """
        Find courts by the court type and (optionally) the jurisdiction
        mentioned in the phrase.

        :return: an empty list if no court type is mentioned; a match per court
            type (without court records attached) if the phrase is ambiguous;
            otherwise a single match with the fitting court records
        """
        court_types = self.finder_court_type.find_word(phrase.text, True)
        if len(court_types) == 0:
            return []

        court_jurs = self.finder_jur.find_word(phrase.text, True)
        if len(court_types) != 1 or len(court_jurs) > 1:
            # special case: 2 or more courts within the same phrase
            # (without commas or conjunctions)
            matches = []
            for ct in court_types:
                m = MatchFound([], phrase.start + ct[1], phrase.start + ct[2])
                m.court_type = ct[0]
                m.court_name = ct[0]
                matches.append(m)
            return matches

        court_type = court_types[0]
        selector = self.courts[self.court_type_column] == court_type[0]
        if len(court_jurs) != 0:
            selector = selector & \
                (self.courts[self.jurisdiction_column] == court_jurs[0][0])
        subset = self.courts.loc[selector]

        # NOTE(review): the match starts at the phrase's start, not at the
        # court type's start - presumably to cover a preceding jurisdiction;
        # confirm before changing.
        match = MatchFound(subset, phrase.start,
                           phrase.start + court_type[2])
        if len(subset) != 1:
            # the court is not identified unambiguously - report the type only
            match.court_name = court_type[0]
            match.court_type = court_type[0]
        return [match]

    def _resolve_tag(self, match: MatchFound, column: str, attr: str):
        """
        Pick a tag's value for the match: the first court record's ``column``
        for an exact match; otherwise the match's own ``attr`` if it is set,
        else the first record's ``column``, else an empty string.
        """
        if match.is_exact:
            return match.subset[column].values[0]
        own_value = getattr(match, attr)
        if own_value is not None:
            return own_value
        if len(match.subset) > 0:
            return match.subset[column].values[0]
        return ''

    def add_annotation(self, match: MatchFound):
        """Convert the match into an annotation dictionary and store it."""
        name = self._resolve_tag(match, self.court_name_column, 'court_name')
        court_type = self._resolve_tag(match, self.court_type_column,
                                       'court_type')
        jurisdiction = self._resolve_tag(match, self.jurisdiction_column,
                                         'jurisdiction')

        ant = dict(
            attrs={
                'start': match.entry_start,
                'end': match.entry_end
            },
            tags={
                'Extracted Entity Type': 'court',
                'Extracted Entity Court Name': name,
                'Extracted Entity Court Type': court_type,
                'Extracted Entity Court Jurisdiction': jurisdiction
            })
        self.annotations.append(ant)

    @staticmethod
    def get_unique_col_values(col_values):
        """Return the distinct truthy (e.g. non-empty) values of the column."""
        return [c for c in col_values.unique() if c]