Example 1
 def test_line_processor_lines(self):
     text = """
 aaa
 Bb b
 c"""
     proc = LineProcessor()
     lines = [line for line in proc.split_text_on_line_with_endings(text)]
     assert len(lines) == 3
Example 2
 def test_line_processor_phrases_de(self):
     text = """
     (2) Vermögenswerte im Sinne dieses Gesetzes sind bebaute und unbebaute Grundstücke sowie rechtlich selbständige Gebäude und Baulichkeiten (im folgenden Grundstücke und Gebäude genannt), Nutzungsrechte und dingliche Rechte an Grundstücken oder Gebäuden, bewegliche Sachen sowie gewerbliche Schutzrechte, Urheberrechte und verwandte Schutzrechte. Vermögenswerte im Sinne dieses Gesetzes sind auch Kontoguthaben und sonstige auf Geldzahlungen gerichtete Forderungen sowie Eigentum/Beteiligungen an Unternehmen oder an Betriebsstätten/Zweigniederlassungen von Unternehmen mit Sitz außerhalb der Deutschen Demokratischen Republik.
     """
     ptrs = LineSplitParams()
     ptrs.line_breaks = {'\n', '.', ';'}
     proc = LineProcessor(line_split_params=ptrs)
     lines = [line for line in proc.split_text_on_line_with_endings(text)]
     assert len(lines) == 3  # two sentences plus one empty line
Example 3
    def test_split_text_on_words(self):
        text = " While I pounded, weak  and weary. Over "
        proc = LineProcessor()
        all_words = proc.split_text_on_words(text)
        separators = [w for w in all_words if w.is_separator]
        words = [w for w in all_words if not w.is_separator]

        assert len(separators) == 8
        assert len(words) == 7
Example 4
 def __init__(self, parsing_functions: List[Callable[[str], List[DefinitionMatch]]],
              split_params: LineSplitParams):
     """
     :param parsing_functions: a collection of parsing functions from SpanishParsingMethods
     :param split_params: text-to-sentences splitting params
     """
     self.parsing_functions = parsing_functions
     self.annotations = [] # type: List[dict]
     self.split_params = split_params
     self.proc = LineProcessor()
     self.prohibited_words = {}  # words that are not definitions per se
Example 5
 def test_de_linebreaks(self):
     split_params = LineSplitParams()
     split_params.line_breaks = {'.', ';', '!', '?'}
     split_params.abbreviations = {
         'nr.', 'abs.', 'no.', 'act.', 'inc.', 'p.'
     }
     split_params.abbr_ignore_case = True
     text = 'Nach der Allgemeine\nGebührenverordnung'
     proc = LineProcessor(line_split_params=split_params)
     sents = list(proc.split_text_on_line_with_endings(text))
     self.assertEqual(1, len(sents))
Example 6
    def test_line_processor_phrases(self):
        text = """
Once upon a midnight dreary

While I pounded, weak and weary. Over many a quaint and curious volume of forgotten lore,
While I nodded, nearly napping; suddenly there came a tapping,
As of some one gently rapping, rapping at my chamber door."""
        ptrs = LineSplitParams()
        ptrs.line_breaks = {'\n', '.', ';'}
        proc = LineProcessor(line_split_params=ptrs)
        lines = [line for line in proc.split_text_on_line_with_endings(text)]
        assert len(lines) == 6
Example 7
    def split_text_on_lines(self, text: str):
        self.estimate = ParsedTextQualityEstimate()
        proc = LineProcessor()
        self.lines = [
            TypedLineOrPhrase.wrap_line(l)
            for l in proc.split_text_on_line_with_endings(text)
        ]
        proc.determine_line_length(text)
        self.estimate.avg_line_length = proc.line_length

        for line in self.lines:
            self.determine_line_type(line)
Example 8
    def test_de_abbrs(self):
        split_params = LineSplitParams()
        split_params.line_breaks = {'.', ';', '!', '?'}
        split_params.abbreviations = {'nr.', 'abs.', 'no.', 'act.', 'a.D.'}
        split_params.abbr_ignore_case = True

        text = '1000 a.D. und drang'
        proc = LineProcessor(line_split_params=split_params)
        sents = list(proc.split_text_on_line_with_endings(text))
        self.assertEqual(1, len(sents))

        text = '1000 A.d. und drang'
        sents = list(proc.split_text_on_line_with_endings(text))
        self.assertGreater(len(sents), 1)
Example 9
 def init_parser():
     split_params = LineSplitParams()
     split_params.line_breaks = {'\n', '.', ';', '!', '?'}
     split_params.abbreviations = DeLanguageTokens.abbreviations
     split_params.abbr_ignore_case = True
     CopyrightDeParser.line_processor = LineProcessor(
         line_split_params=split_params)
Example 10
    def test_line_processor_phrases_abbr(self):
        text = 'Articolul saisprezece (16) Nr. 2. Textul:'
        proc = LineProcessor()
        ptrs = LineSplitParams()
        ptrs.line_breaks = {'\n', '.', ';'}

        lines = [
            line for line in proc.split_text_on_line_with_endings(text, ptrs)
        ]
        assert len(lines) == 3

        ptrs.abbreviations = {'nr.', 'abs.'}
        ptrs.abbr_ignore_case = True
        lines = [
            line for line in proc.split_text_on_line_with_endings(text, ptrs)
        ]
        assert len(lines) == 2
Example 11
    def __init__(self, gesetze_df: pd.DataFrame, verordnungen_df: pd.DataFrame,
                 concept_df: pd.DataFrame):
        self.locale = ''
        parse_columns = ('Kurztitel', 'Titel', 'Abkürzung')
        dependent_columns = {'Titel': 'External Reference Normalized'}
        preformed_entity = {
            'External Reference Type': 'Laws and Rules',
            'External Reference Source': 'BaFin',
            'External Reference Issuing Country': 'Germany'
        }
        split_params = LineSplitParams()
        split_params.line_breaks = {'.', ';', '!', '?'}
        split_params.abbreviations = DeLanguageTokens.abbreviations
        split_params.abbr_ignore_case = True
        proc = LineProcessor(line_split_params=split_params)

        self.gesetze_parser = DataframeEntityParser(
            gesetze_df,
            parse_columns,
            result_columns=dependent_columns,
            preformed_entity=preformed_entity,
            line_processor=proc)

        self.verordnungen_parser = DataframeEntityParser(
            verordnungen_df,
            parse_columns,
            result_columns=dependent_columns,
            preformed_entity=preformed_entity,
            line_processor=proc)

        parse_columns = ('b', )
        dependent_columns = {
            'b': 'External Reference Normalized',
            'a': 'External Reference Type'
        }
        preformed_entity.pop('External Reference Type')

        self.concept_parser = DataframeEntityParser(
            concept_df,
            parse_columns,
            result_columns=dependent_columns,
            preformed_entity=preformed_entity,
            line_processor=proc)
Example 12
    def test_check_phrase_starts_with_phrase(self):
        text = 'While I pounded, weak and weary. Over many a quaint and curious volume of forgotten lore'
        proc = LineProcessor()
        words = proc.split_text_on_words(text)

        ret = proc.check_phrase_starts_with_phrase(words, 2, ['I', 'goat'])
        assert ret

        ret = proc.check_phrase_starts_with_phrase(words, 3, ['I', 'goat'])
        assert not ret

        ret = proc.check_phrase_starts_with_phrase(words, 6, ['I', 'weak'])
        assert ret

        ret = proc.check_phrase_starts_with_phrase(words, 6,
                                                   ['I', ['weak', 'and']])
        assert ret

        ret = proc.check_phrase_starts_with_phrase(words, 6,
                                                   ['I', ['weak', 'weary']])
        assert not ret
Example 13
    def test_line_processor_phrases_abbr(self):
        text = 'Articolul saisprezece (16) nr. 2. Textul:'
        ptrs = LineSplitParams()
        ptrs.line_breaks = {'\n', '.', ';'}
        proc = LineProcessor(line_split_params=ptrs)

        lines = list(proc.split_text_on_line_with_endings(text))
        assert len(lines) == 3

        ptrs.abbreviations = {'nr.', 'abs.'}
        ptrs.abbr_ignore_case = True
        proc = LineProcessor(line_split_params=ptrs)
        lines = list(proc.split_text_on_line_with_endings(text))
        assert len(lines) == 2
Example 14
class ParsedTextQualityEstimator:
    """
    Estimates the probability that the text passed in is somewhat corrupted
    """

    sentence_break_chars = {'.', ';', '!', '?', ','}
    reg_numered_header = re.compile(
        r'(^[\s]*\(?[a-zA-Z]\)?\s)|(^[\s]*[0-9\.]+[\)]?\s)')
    reg_paragraph_start = re.compile(r'(^\s{2})|(^\t)')
    minimal_paragraph_line_length = 250

    def __init__(self):
        self.estimate = ParsedTextQualityEstimate()
        self.lines = []
        self.proc = LineProcessor()

    def estimate_text(self, text: str) -> ParsedTextQualityEstimate:
        """
        Let's assume the text is:
            Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical
            Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at

            Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a
            Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered

            the undoubtable source.
        :param text: a text containing a number of \n\n sequences, see above
        :return: ParsedTextQualityEstimate: {'avg_line_length': 103, 'extra_line_breaks_prob': 66, 'corrupted_prob': 66}
        """

        self.split_text_on_lines(text)
        # does the text contain unnecessary line breaks?
        self.estimate_extra_line_breaks()
        # wrap up the estimate
        self.estimate.corrupted_prob = self.estimate.extra_line_breaks_prob
        return self.estimate

    def split_text_on_lines(self, text: str):
        self.estimate = ParsedTextQualityEstimate()

        self.lines = [
            TypedLineOrPhrase.wrap_line(l)
            for l in self.proc.split_text_on_line_with_endings(text)
        ]
        self.proc.determine_line_length(text)
        self.estimate.avg_line_length = self.proc.line_length

        for line in self.lines:
            self.determine_line_type(line)

    def estimate_extra_line_breaks(self):
        lines_total = len(self.lines)
        if lines_total == 0:
            return

        longest_seq = 0
        current_seq = 0
        total_extra_breaks = 0

        for indx in range(0, len(self.lines)):
            if self.check_line_followed_by_unnecessary_break(indx):
                total_extra_breaks += 1
                current_seq += 1
                longest_seq = max(current_seq, longest_seq)
                continue
            current_seq = 0

        if total_extra_breaks > 1:
            # p1 grows with the longest run of suspicious breaks,
            # p2 with their total share; take the larger of the two
            p1 = 100 if longest_seq > lines_total / 3 else int(
                100 * longest_seq * 2.5 / lines_total)
            p2 = int(100 * total_extra_breaks * 2 / lines_total)
            self.estimate.extra_line_breaks_prob = min(100, max(p1, p2))

    def check_line_followed_by_unnecessary_break(self,
                                                 line_index: int) -> bool:
        line = self.lines[line_index]
        if line.ending.count('\n') <= 1:
            return False
        if len(line.text) > \
                ParsedTextQualityEstimator.minimal_paragraph_line_length:
            # the whole line could be a paragraph
            return False
        # a break after a header or before a paragraph start is expected
        prob_needs_extra = line.type == LineType.header
        if not prob_needs_extra:
            next_line = self.lines[line_index + 1] \
                if line_index < len(self.lines) - 1 else None
            prob_needs_extra = next_line is not None and \
                next_line.type != LineType.regular
        return not prob_needs_extra

    def determine_line_type(self, line: TypedLineOrPhrase):
        p_head = self.estimate_line_is_header_prob(line.text)
        if p_head > 50:
            line.type = LineType.header
            return
        p_par_start = self.estimate_line_is_paragraph_start_prob(line.text)
        if p_par_start > 50:
            line.type = LineType.paragraph_start

    def estimate_line_is_paragraph_start_prob(self, line: str) -> int:
        if ParsedTextQualityEstimator.reg_paragraph_start.search(line):
            return 100
        return 0

    def estimate_line_is_header_prob(self, line: str) -> int:
        line = line.rstrip(' \t')
        if len(line) == 0:
            return 0
        if line[-1] in ParsedTextQualityEstimator.sentence_break_chars:
            return 0
        if ParsedTextQualityEstimator.reg_numered_header.search(line):
            return 100

        if len(line) < self.estimate.avg_line_length * 0.6:
            return 65  # 65% chance the line is a header

        return 35
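A minimal usage sketch for the estimator above; an illustration only, assuming ParsedTextQualityEstimator and its result object are importable from this package:

def demo_estimate():
    # a short text whose lines end with doubled '\n' - the kind of
    # "extra line break" pattern the estimator is meant to flag
    text = ("Contrary to popular belief, Lorem Ipsum is not\n\n"
            "simply random text. It has roots in a piece of\n\n"
            "classical Latin literature from 45 BC.\n\n")
    estimator = ParsedTextQualityEstimator()
    estimate = estimator.estimate_text(text)
    # per the docstring above, the estimate carries avg_line_length,
    # extra_line_breaks_prob and corrupted_prob (0..100)
    print(estimate.avg_line_length,
          estimate.extra_line_breaks_prob,
          estimate.corrupted_prob)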
Example 15
 def __init__(self):
     self.estimate = ParsedTextQualityEstimate()
     self.lines = []
     self.proc = LineProcessor()
Example 16
class UniversalDefinitionsParser:
    """
    UniversalDefinitionsParser searches for definitions in text according to
    the locale-specific parsing rules supplied (e.g., for Spanish). See the
    "parse" method
    """
    def __init__(self, parsing_functions: List[Callable[[str], List[DefinitionMatch]]],
                 split_params: LineSplitParams):
        """
        :param parsing_functions: a collection of parsing functions from SpanishParsingMethods
        :param split_params: text-to-sentences splitting params
        """
        self.parsing_functions = parsing_functions
        self.annotations = [] # type: List[dict]
        self.split_params = split_params
        self.proc = LineProcessor()
        self.prohibited_words = {}  # words that are not definitions per se

    def parse(self, text: str) -> List[dict]:
        """
        :param text: En este acuerdo, el término "Software" se refiere a: (i) el programa informático
        :return: { "attrs": {"start": 28, "end": 82}, "tags": {"Extracted Entity Type": "definition",
                "Extracted Entity Definition Name": "Software",
                "Extracted Entity Text": '"Software" se refiere a: (i) el programa informático'} }
        """
        for phrase in self.proc.split_text_on_line_with_endings(text, self.split_params):
            matches = []
            for f in self.parsing_functions:
                ml = f(phrase.text)
                matches += ml
            # find synonyms
            # sort and take the most appropriate matches
            matches = self.remove_prohibited_words(matches)
            if len(matches) > 1:
                matches = self.choose_best_matches(matches)
                matches = self.choose_more_precise_matches(matches)
            # trim parts of matches
            for match in matches:
                ant = {
                    "attrs": {
                        "start": phrase.start + match.start,
                        "end": phrase.start + match.end
                    },
                    "tags": {
                        'Extracted Entity Type': 'definition',
                        'Extracted Entity Definition Name': match.name,
                        'Extracted Entity Text': phrase.text[match.start: match.end]
                    }
                }
                self.annotations.append(ant)
        return self.annotations

    def remove_prohibited_words(self, matches: List[DefinitionMatch]) -> List[DefinitionMatch]:
        # words like 'und', 'and' or 'the' are not definitions by themselves
        return [m for m in matches if m.name not in self.prohibited_words]

    def choose_best_matches(self, matches: List[DefinitionMatch]) -> List[DefinitionMatch]:
        resulted = []
        for k, g in groupby(matches, lambda m: m.name.strip(" \t'\"")):
            same_matches = list(g)
            if len(same_matches) > 1:
                same_matches = [sorted(same_matches,
                                       key=UniversalDefinitionsParser.estimate_match_quality, reverse=True)[0]]
            resulted += same_matches
        return resulted

    def choose_more_precise_matches(self, matches: List[DefinitionMatch]) -> List[DefinitionMatch]:
        """
        look for matches that "consume" other matches and keep only the
        consumed (more precise) ones
        """
        resulted = []
        if len(matches) < 2:
            return matches
        for i in range(0, len(matches)):
            a = matches[i]
            is_consuming = False
            for j in range(0, len(matches)):
                if i == j:
                    continue
                b = matches[j]
                if b.name in a.name:
                    is_consuming = True
                    break
            if not is_consuming:
                resulted.append(a)
        return resulted


    @staticmethod
    def estimate_match_quality(match: DefinitionMatch) -> int:
        return 1000 * match.probability - (match.end - match.start)
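A hedged wiring sketch for the parser above. The regex-based parsing function is hypothetical (real ones would come from a collection such as SpanishParsingMethods), and the no-argument DefinitionMatch constructor and the 0..100 probability scale are assumptions:

import re

def find_se_refiere(phrase):
    # hypothetical parsing function for: "<term>" se refiere a ...
    found = []
    for m in re.finditer(r'"([^"]+)"\s+se\s+refiere\s+a', phrase):
        dm = DefinitionMatch()       # assumed no-arg constructor
        dm.name = m.group(1)
        dm.start, dm.end = m.span()  # phrase-relative; parse() shifts them
        dm.probability = 100         # assumed 0..100 scale
        found.append(dm)
    return found

split_params = LineSplitParams()
split_params.line_breaks = {'\n', '.', ';'}
parser = UniversalDefinitionsParser([find_se_refiere], split_params)
annotations = parser.parse(
    'En este acuerdo, el término "Software" se refiere a: (i) el programa informático.')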
Example 17
class UniversalCourtsParser:
    """
    The class describes a "constructor" for building locale (and region) specific
    parsers that find references to courts within the text.

    Use the parse() method to find all references to courts in the
    text provided.
    Each reference is a dictionary with two keys:
    - "attrs" key leads to the "coordinates" (starting and ending characters) of the
      occurrence within the provided text
    - "tags" key leads to another dictionary, which contains:
      - court official name
      - court's jurisdiction ...

    In order to parse the text, create a locale (or region) specific instance of
    UniversalCourtsParser. See the constructor below:
    """
    def __init__(self, ptrs: ParserInitParams):
        """
        :param ptrs.court_pattern_checker: a regex or None, the parser skips the phrase if pattern doesn't match the phrase
        :param ptrs.column_names['type']: "Court Type", e.g. 'Federal District Court'
        :param ptrs.column_names['name']: "Court Name", e.g. 'Southern Georgia District Court'
        :param ptrs.column_names['jurisdiction']: "Jurisdiction", e.g. 'Federal'
        :param ptrs.column_names['alias']: "Alias", e.g. 'C.D. Cal'
        :param ptrs.dataframe_paths: like ['data/us_courts.csv', ...]
        :param ptrs.split_ptrs: phrase splitting processor parameters, see LineProcessor class
        :param ptrs.key_word_preproc_func: a function used to pre-process column values used in text search

        dataframe_paths is a collection of *.CSV files that contain data like:

        | Jurisdiction || Court Type         || Court Name               || ... |
        | Federal      || Verfassungsgericht || Bundesverfassungsgericht || ... |

        The column 'Court Name' (you may provide another column name instead of Court Name
        in param: court_name_column) should contain unique values that precisely identify
        each of the given courts.

        The columns 'Court Type' (param: court_type_column) and 'Jurisdiction'
        (param: jurisdiction_column) taken together may or may not precisely identify
        the given court.

        At minimum, this parser can identify the court's type and return an annotation
        that specifies neither the court's name nor its jurisdiction.

        The court_pattern_checker parameter speeds up the parsing process:
        - the whole text or a single line is skipped if it doesn't match court_pattern_checker
        E.g., you can pass re.compile('court', re.IGNORECASE) when searching for court
        annotations for the En locale

        The split_ptrs specify how the parser splits the text into phrases.
        Each phrase can contain zero or one court annotations. See the LineProcessor class.
        For a courts parser, phrase bounds usually include punctuation (.,;!?) and
        conjunctions: (and, or) or (und, oder)

        An example key_word_preproc_func is:
        def preproc_func(text):
             return re.sub('e$', '[e]?', text)
        """

        self.phrase_match_pattern = ptrs.court_pattern_checker  # may be None
        self.court_type_column = ptrs.column_names['type']
        self.court_name_column = ptrs.column_names['name']
        self.court_alias_column = ptrs.column_names['alias']
        self.jurisdiction_column = ptrs.column_names['jurisdiction']
        self.proc = LineProcessor()
        self.phrase_split_ptrs = ptrs.split_ptrs
        self.annotations = []
        self.courts = None
        self.load_courts(ptrs.dataframe_paths)

        # unique columns
        self.finder_court_name = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.court_name_column]),
            ptrs.key_word_preproc_func)
        self.finder_court_alias = None if len(self.court_alias_column) == 0 else \
            PhraseFinder(UniversalCourtsParser.get_unique_col_values(
                self.courts[self.court_alias_column]), ptrs.key_word_preproc_func)

        # non-unique columns
        self.finder_court_type = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.court_type_column]),
            ptrs.key_word_preproc_func)
        self.finder_jur = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.jurisdiction_column]),
            ptrs.key_word_preproc_func)

    def parse(self, text: str) -> List[dict]:
        """
        :param text: the text being processed
        :return: annotations - List[dict]

        Here is an example of the method's call:
        ret = processor.parse("Bei dir läuft, deine Verfassungsgerichtshof des Freistaates Sachsen rauchen Joints vor der Kamera")

        ret[0]['attrs'] = {'start': 14, 'end': 97}
        ret[0]['tags'] = {'Extracted Entity Type': 'court',
            'Extracted Entity Court Name': 'Verfassungsgerichtshof des Freistaates Sachsen',
            'Extracted Entity Court Type': 'Verfassungsgericht',
            'Extracted Entity Court Jurisdiction': 'Sachsen'}
        """
        self.annotations = []

        self.find_courts_by_alias_in_whole_text(text)

        # if the whole text doesn't contain the key word (e.g., gericht) - skip all the following
        if self.phrase_match_pattern is not None:
            # flags such as re.IGNORECASE belong in re.compile(), not in search()
            if self.phrase_match_pattern.search(text) is None:
                return self.annotations

        for phrase in self.proc.split_text_on_line_with_endings(
                text, self.phrase_split_ptrs):
            # if the phrase doesn't contain the key word (e.g., gericht for German) - skip the phrase
            if self.phrase_match_pattern is not None:
                if self.phrase_match_pattern.search(phrase.text) is None:
                    continue
            self.find_court_by_any_key(phrase)

        return self.annotations

    def load_courts(self, dataframe_paths: List[str]):
        frames = []
        dtypes = {
            self.court_type_column: str,
            self.court_name_column: str,
            self.jurisdiction_column: str
        }
        if self.court_alias_column:
            dtypes[self.court_alias_column] = str

        for path in dataframe_paths:
            frame = pd.read_csv(path,
                                encoding="utf-8",
                                error_bad_lines=False,
                                converters=dtypes)
            frames.append(frame)
        self.courts = pd.concat(frames)

    def find_courts_by_alias_in_whole_text(self, text: str) -> None:
        if self.finder_court_alias is None:
            return
        for m in self.finder_court_alias.find_word(text):
            alias = m[0]
            rows = self.courts.loc[self.courts[self.court_alias_column] ==
                                   alias]
            match_found = MatchFound(rows, m[1], m[2])
            self.add_annotation(match_found)

    def find_court_by_any_key(self, phrase: LineOrPhrase):
        # find by court names
        matches = []
        matches += self.find_court_by_name(phrase)
        matches += self.find_court_by_type_and_jurisdiction(phrase)
        matches = [m for m in matches if m is not None]
        if len(matches) == 0:
            return
        # find the best match
        matches.sort(key=lambda m: m.make_sort_key())
        self.add_annotation(matches[0])

    def find_court_by_name(self, phrase: LineOrPhrase) -> List[MatchFound]:
        match = self.find_court_by_key_column(phrase, self.finder_court_name,
                                              self.court_name_column)
        if match is None:
            return []

        match[0].court_name = match[1][0][0]
        return [match[0]]

    def find_court_by_key_column(
            self, phrase: LineOrPhrase, phrase_finder: PhraseFinder,
            column: str) -> Optional[Tuple[MatchFound, List[PhraseMatch]]]:
        found_substrings = phrase_finder.find_word(phrase.text, True)
        if len(found_substrings) == 0:
            return None
        subset = self.courts.loc[self.courts[column] == found_substrings[0][0]]
        if len(subset) == 0:
            return None

        match = MatchFound(subset, phrase.start + found_substrings[0][1],
                           phrase.start + found_substrings[0][2])
        return (match, found_substrings)

    def find_court_by_type_and_jurisdiction(
            self, phrase: LineOrPhrase) -> List[MatchFound]:
        court_types = self.finder_court_type.find_word(phrase.text, True)
        if len(court_types) == 0:
            return []

        court_jurs = self.finder_jur.find_word(phrase.text, True)
        if len(court_types) != 1 or len(court_jurs) > 1:
            # special case: 2 or more courts within the same phrase
            # (without commas or conjunctions)
            matches = []
            for ct in court_types:
                m = MatchFound([], phrase.start + ct[1], phrase.start + ct[2])
                m.court_type = ct[0]
                m.court_name = ct[0]
                matches.append(m)
            return matches

        if len(court_jurs) == 0:
            subset = self.courts.loc[self.courts[self.court_type_column] ==
                                     court_types[0][0]]
        else:
            subset = self.courts.loc[
                (self.courts[self.court_type_column] == court_types[0][0])
                & (self.courts[self.jurisdiction_column] == court_jurs[0][0])]

        match = MatchFound(subset, phrase.start,
                           phrase.start + court_types[0][2])
        if len(subset) != 1:
            match.court_name = court_types[0][0]
            match.court_type = court_types[0][0]
        return [match]

    def add_annotation(self, match: MatchFound):
        def pick(column: str, match_value):
            # exact matches take the value from the matched dataframe rows;
            # otherwise prefer the value stored on the match itself,
            # falling back to the dataframe or an empty string
            if match.is_exact:
                return match.subset[column].values[0]
            if match_value is not None:
                return match_value
            return match.subset[column].values[0] \
                if len(match.subset) > 0 else ''

        ant = dict(attrs={
            'start': match.entry_start,
            'end': match.entry_end
        },
                   tags={
                       'Extracted Entity Type': 'court',
                       'Extracted Entity Court Name':
                           pick(self.court_name_column, match.court_name),
                       'Extracted Entity Court Type':
                           pick(self.court_type_column, match.court_type),
                       'Extracted Entity Court Jurisdiction':
                           pick(self.jurisdiction_column, match.jurisdiction)
                   })
        self.annotations.append(ant)

    @staticmethod
    def get_unique_col_values(col_values):
        return [c for c in col_values.unique() if c]
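A hedged construction sketch for the courts parser above. ParserInitParams is assumed to be a plain parameter object whose attributes mirror the docstring, and the CSV path is illustrative only:

import re

split_params = LineSplitParams()
split_params.line_breaks = {'.', ';', '!', '?'}

ptrs = ParserInitParams()           # assumed no-arg constructor
ptrs.court_pattern_checker = re.compile('gericht', re.IGNORECASE)
ptrs.column_names = {'type': 'Court Type', 'name': 'Court Name',
                     'jurisdiction': 'Jurisdiction', 'alias': 'Alias'}
ptrs.dataframe_paths = ['data/de_courts.csv']    # illustrative path
ptrs.split_ptrs = split_params
ptrs.key_word_preproc_func = lambda text: text   # no-op preprocessing

parser = UniversalCourtsParser(ptrs)
annotations = parser.parse(
    'Der Verfassungsgerichtshof des Freistaates Sachsen entschied heute.')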
Example 18
    def __init__(self, ptrs: ParserInitParams):
        """
        :param ptrs.court_pattern_checker: a regex or None, the parser skips the phrase if pattern doesn't match the phrase
        :param ptrs.column_names['type']: "Court Type", e.g. 'Federal District Court'
        :param ptrs.column_names['name']: "Court Name", e.g. 'Southern Georgia District Court'
        :param ptrs.column_names['jurisdiction']: "Jurisdiction", e.g. 'Federal'
        :param ptrs.column_names['alias']: "Alias", e.g. 'C.D. Cal'
        :param ptrs.dataframe_paths: like ['data/us_courts.csv', ...]
        :param ptrs.split_ptrs: phrase splitting processor parameters, see LineProcessor class
        :param ptrs.key_word_preproc_func: a function used to pre-process column values used in text search

        dataframe_paths is a collection of *.CSV files that contain data like:

        | Jurisdiction || Court Type         || Court Name               || ... |
        | Federal      || Verfassungsgericht || Bundesverfassungsgericht || ... |

        The column 'Court Name' (you may provide another column name instead of Court Name
        in param: court_name_column) should contain unique values that precisely identify
        each of the given courts.

        The columns 'Court Type' (param: court_type_column) and 'Jurisdiction'
        (param: jurisdiction_column) taken together may or may not precisely identify
        the given court.

        At minimum, this parser can identify the court's type and return an annotation
        that specifies neither the court's name nor its jurisdiction.

        The court_pattern_checker parameter speeds up the parsing process:
        - the whole text or a single line is skipped if it doesn't match court_pattern_checker
        E.g., you can pass re.compile('court', re.IGNORECASE) when searching for court
        annotations for the En locale

        The split_ptrs specify how the parser splits the text into phrases.
        Each phrase can contain zero or one court annotations. See the LineProcessor class.
        For a courts parser, phrase bounds usually include punctuation (.,;!?) and
        conjunctions: (and, or) or (und, oder)

        An example key_word_preproc_func is:
        def preproc_func(text):
             return re.sub('e$', '[e]?', text)
        """

        self.phrase_match_pattern = ptrs.court_pattern_checker  # may be None
        self.court_type_column = ptrs.column_names['type']
        self.court_name_column = ptrs.column_names['name']
        self.court_alias_column = ptrs.column_names['alias']
        self.jurisdiction_column = ptrs.column_names['jurisdiction']
        self.proc = LineProcessor()
        self.phrase_split_ptrs = ptrs.split_ptrs
        self.annotations = []
        self.courts = None
        self.load_courts(ptrs.dataframe_paths)

        # unique columns
        self.finder_court_name = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.court_name_column]),
            ptrs.key_word_preproc_func)
        self.finder_court_alias = None if len(self.court_alias_column) == 0 else \
            PhraseFinder(UniversalCourtsParser.get_unique_col_values(
                self.courts[self.court_alias_column]), ptrs.key_word_preproc_func)

        # non-unique columns
        self.finder_court_type = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.court_type_column]),
            ptrs.key_word_preproc_func)
        self.finder_jur = PhraseFinder(
            UniversalCourtsParser.get_unique_col_values(
                self.courts[self.jurisdiction_column]),
            ptrs.key_word_preproc_func)
Example 19
class TextPatternCollector:
    """
    TextPatternCollector searches for text patterns (definitions etc.) in
    text according to the locale-specific parsing functions supplied.
    See the "parse" method
    """
    basic_line_processor = LineProcessor()
    def __init__(self, parsing_functions: List[Callable[[str],
                                                        List[PatternFound]]],
                 split_params: LineSplitParams):
        """
        :param parsing_functions: a collection of parsing functions from SpanishParsingMethods
        :param split_params: text-to-sentences splitting params
        """
        self.parsing_functions = parsing_functions
        self.annotations = []  # type: List[TextAnnotation]
        self.split_params = split_params
        self.proc = LineProcessor()
        self.prohibited_words = {}  # words that are not definitions per se

    def parse(self, text: str, locale: str = None) -> List[TextAnnotation]:
        """
        :param locale: 'En', 'De', 'Es', ...
        :param text: En este acuerdo, el término "Software" se refiere a: (i) el programa informático
        :return: { "attrs": {"start": 28, "end": 82}, "tags": {"Extracted Entity Type": "definition",
                "Extracted Entity Definition Name": "Software",
                "Extracted Entity Text": '"Software" se refiere a: (i) el programa informático'} }
        """
        self.annotations = []  # type: List[TextAnnotation]
        for phrase in self.proc.split_text_on_line_with_endings(
                text, self.split_params):
            matches = []
            for f in self.parsing_functions:
                ml = f(phrase.text)
                matches += ml
            # find synonyms
            # sort and take the most appropriate matches
            matches = self.remove_prohibited_words(matches)
            if len(matches) > 1:
                matches = self.choose_best_matches(matches)
                matches = self.choose_more_precise_matches(matches)
            # trim parts of matches
            for match in matches:
                ant = self.make_annotation_from_pattrn(locale, match, phrase)
                ant.coords = (ant.coords[0] + phrase.start,
                              ant.coords[1] + phrase.start)
                self.annotations.append(ant)
        return self.annotations

    # pylint: disable=unused-argument
    def make_annotation_from_pattrn(self, locale: str, ptrn: PatternFound,
                                    phrase: LineOrPhrase) -> TextAnnotation:
        # should be overridden in a derived class
        return None

    # pylint: enable=unused-argument

    def remove_prohibited_words(
            self, matches: List[PatternFound]) -> List[PatternFound]:
        # words like 'und', 'and' or 'the' are not definitions by themselves
        return [m for m in matches if m.name not in self.prohibited_words]

    def choose_best_matches(self,
                            matches: List[PatternFound]) -> List[PatternFound]:
        resulted = []
        # pylint: disable=unused-variable
        for _, g in groupby(matches, lambda m: m.name.strip(" \t'\"")):
            # pylint: enable=unused-variable
            same_matches = list(g)
            if len(same_matches) > 1:
                same_matches = [
                    sorted(same_matches,
                           key=TextPatternCollector.estimate_match_quality,
                           reverse=True)[0]
                ]
            resulted += same_matches
        return resulted

    def choose_more_precise_matches(
            self, matches: List[PatternFound]) -> List[PatternFound]:
        """
        drop any match that is "worse" than some other match, keeping only
        the more precise ones
        """
        resulted = []
        if len(matches) < 2:
            return matches
        for i in range(0, len(matches)):
            a = matches[i]
            a_worse_b = False
            for j in range(0, len(matches)):
                if i == j:
                    continue
                b = matches[j]
                if a.pattern_worse_than_target(b):
                    a_worse_b = True
                    break
            if not a_worse_b:
                resulted.append(a)
        return resulted

    @staticmethod
    def estimate_match_quality(match: PatternFound) -> int:
        return 1000 * match.probability - (match.end - match.start)
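A hedged sketch of a concrete collector built on the class above. Only the coords attribute of TextAnnotation is visible in parse(), so the constructor and the other attributes set here are assumptions:

class DefinitionsCollector(TextPatternCollector):
    def make_annotation_from_pattrn(self, locale: str, ptrn: PatternFound,
                                    phrase: LineOrPhrase) -> TextAnnotation:
        ant = TextAnnotation()       # assumed no-arg constructor
        ant.locale = locale          # assumed attribute
        ant.name = ptrn.name         # assumed attribute
        # coords are phrase-relative here; parse() shifts them by phrase.start
        ant.coords = (ptrn.start, ptrn.end)
        return ant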
Example 20
                       "may be used", "is hereby changed to",
                       "is defined", "shall be interpreted"]

WEAK_TRIGGER_LIST = [r"[\(\)]", "in "]
ALL_TRIGGER_LIST = STRONG_TRIGGER_LIST + WEAK_TRIGGER_LIST

STRONG_TRIGGER_LIST.sort(key=len, reverse=True)
WEAK_TRIGGER_LIST.sort(key=len, reverse=True)
ALL_TRIGGER_LIST.sort(key=len, reverse=True)


def join_collection(collection):
    return "|".join([w.replace(" ", r"\s+") for w in collection])


word_processor = LineProcessor()

# Case 1: Term in quotes, is preceded by word|term|phrase or :,.^
# and has item from TRIGGER_LIST after itself.
# Fetch term along with quotes to be able to extract multiple terms,
# e.g.: the words "person" and "whoever" include
TRIGGER_WORDS_PTN = r"""
(?:(?:word|term|phrase)s?\s+|[:,\.]\s*|^)
['"“].{{1,{max_term_chars}}}['"”]\s*
(?:{trigger_list})[\s,]""".format(
    max_term_chars=MAX_TERM_CHARS,
    trigger_list=join_collection(ALL_TRIGGER_LIST))
TRIGGER_WORDS_PTN_RE = re.compile(TRIGGER_WORDS_PTN, re.IGNORECASE | re.UNICODE | re.DOTALL | re.MULTILINE | re.VERBOSE)
EXTRACT_PTN = r"""['"“](.+?)['"”\.]"""
EXTRACT_PTN_RE = re.compile(EXTRACT_PTN, re.UNICODE | re.DOTALL | re.MULTILINE)
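A hedged two-stage extraction sketch: TRIGGER_WORDS_PTN_RE locates a region where a quoted term is followed by a trigger, and EXTRACT_PTN_RE then pulls the term out of that region. It assumes MAX_TERM_CHARS and the full STRONG_TRIGGER_LIST (truncated above) are defined earlier in the module:

sample = 'The word "person" is defined, for the purposes of this act, broadly.'
for m in TRIGGER_WORDS_PTN_RE.finditer(sample):
    for term in EXTRACT_PTN_RE.findall(m.group(0)):
        print(term)  # -> person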