def test_abbreviation(self): text = "In C.D. Ill. we should find" finder = PhraseFinder(['C.D. Ill.']) rst = finder.find_word(text, True) self.assertEqual(1, len(rst)) finder = PhraseFinder(['C.D. Ill.', 'sh', 'should', 'find']) rst = finder.find_word(text, True) self.assertEqual(3, len(rst))
def find_court_by_key_column( self, phrase: LineOrPhrase, phrase_finder: PhraseFinder, column: str) -> Tuple[MatchFound, List[PhraseMatch]]: found_substrings = phrase_finder.find_word(phrase.text, True) if len(found_substrings) == 0: return None subset = self.courts.loc[self.courts[column] == found_substrings[0][0]] if len(subset) == 0: return None match = MatchFound(subset, phrase.start + found_substrings[0][1], phrase.start + found_substrings[0][2]) return (match, found_substrings)
class UniversalCourtsParser: """ The class describes a "constructor" for building locale (and region) specific parsers, that find reference to courts within the text. Use the parse() method to find all reference to courts from the text provided. Each reference is a dictionary with two keys: - "attrs" key leads to the "coordinates" (starting and ending characters) of the occurrence within the provided text - "tags" key leads to another dictionary, which contains: - court official name - court's jurisdiction ... In order to parse the text you are supposed to create your locale (or region) specific instance of UniversalCourtsParser. See the constructor below: """ def __init__(self, ptrs: ParserInitParams): """ :param ptrs.court_pattern_checker: a regex or None, the parser skips the phrase if pattern doesn't match the phrase :param ptrs.column_names['type']: "Court Type", e.g. 'Federal District Court' :param ptrs.column_names['name']: "Court Name", e.g. 'Southern Georgia District Court' :param ptrs.column_names['jurisdiction']: "Jurisdiction", e.g. 'Federal' :param ptrs.column_names['alias']: "Alias", e.g. 'C.D. Cal' :param ptrs.dataframe_paths: like ['data/us_courts.csv', ...] :param ptrs.split_ptrs: phrase splitting processor parameters, see LineProcessor class :param ptrs.key_word_preproc_func: a function used to pre-process column values used in text search dataframe_paths is a collection of *.CSV files that contain the data like: | Jurisdiction || Court Type || Court Name || ... | | Federal || Verfassungsgericht || Bundesverfassungsgericht || ... | The column 'Court Name' (you may provide another column name instead of Court Name in param: court_name_column) should contain unique values that precisely identify each of the court given. The columns 'Court Type' (param: court_type_column) and 'Jurisdiction' (param: jurisdiction_column) in couple may or may not precisely identify the court given. At least this parser can identify the court's type and return the annotation that neither specifies the court's name nor jurisdiction The court_pattern_checker parameter speeds up the parsing process: - the whole text or the line would be skipped if this line doesn't match the court_pattern_checker E.g., you can pass re.compile('court', re.IGNORECASE) for searching courts' annotations for the En locale The split_ptrs specify how the parser splits the text into phrases. Each phrase can contain zero ore one court annotations. See LineProcessor class. For a courts parser phrase bounds usually include punctuation (.,;!?) and conjunctions (and, or) or (und, oder) The example function for key_word_preproc_func is: def preproc_func(text): return re.sub('e$', '[e]?', text) """ self.phrase_match_pattern = None if ptrs.court_pattern_checker is None \ else ptrs.court_pattern_checker self.court_type_column = ptrs.column_names['type'] self.court_name_column = ptrs.column_names['name'] self.court_alias_column = ptrs.column_names['alias'] self.jurisdiction_column = ptrs.column_names['jurisdiction'] self.proc = LineProcessor() self.phrase_split_ptrs = ptrs.split_ptrs self.annotations = [] self.courts = None self.load_courts(ptrs.dataframe_paths) # unique columns self.finder_court_name = PhraseFinder( UniversalCourtsParser.get_unique_col_values( self.courts[self.court_name_column]), ptrs.key_word_preproc_func) self.finder_court_alias = None if len(self.court_alias_column) == 0 else \ PhraseFinder(UniversalCourtsParser.get_unique_col_values( self.courts[self.court_alias_column]), ptrs.key_word_preproc_func) # non-unique columns self.finder_court_type = PhraseFinder( UniversalCourtsParser.get_unique_col_values( self.courts[self.court_type_column]), ptrs.key_word_preproc_func) self.finder_jur = PhraseFinder( UniversalCourtsParser.get_unique_col_values( self.courts[self.jurisdiction_column]), ptrs.key_word_preproc_func) def parse(self, text: str) -> List[dict]: """ :param text: the text being processed :return: annotations - List[dict] Here is an example of the method's call: ret = processor.parse("Bei dir läuft, deine Verfassungsgerichtshof des Freistaates Sachsen rauchen Joints vor der Kamera") ret[0]['attrs'] = {'start': 14, 'end': 97} ret[0]['tags'] = {'Extracted Entity Type': 'court', 'Extracted Entity Court Name': 'Verfassungsgerichtshof des Freistaates Sachsen', 'Extracted Entity Court Type': 'Verfassungsgericht', 'Extracted Entity Court Jurisdiction': 'Sachsen'} """ self.annotations = [] self.find_courts_by_alias_in_whole_text(text) # if the whole text doesn't contain the key word (gericht) - skip all the following if self.phrase_match_pattern is not None: if self.phrase_match_pattern.search(text, re.IGNORECASE) is None: return self.annotations for phrase in self.proc.split_text_on_line_with_endings( text, self.phrase_split_ptrs): # if the phrase doesn't contain the key word (e.g., gericht for deutsche) - skip the phrase if self.phrase_match_pattern is not None: if self.phrase_match_pattern.search(phrase.text, re.IGNORECASE) is None: continue self.find_court_by_any_key(phrase) return self.annotations def load_courts(self, dataframe_paths: List[str]): frames = [] dtypes = { self.court_type_column: str, self.court_name_column: str, self.jurisdiction_column: str } if self.court_alias_column: dtypes[self.court_alias_column] = str for path in dataframe_paths: frame = pd.read_csv(path, encoding="utf-8", error_bad_lines=False, converters=dtypes) frames.append(frame) self.courts = pd.concat(frames) def find_courts_by_alias_in_whole_text(self, text: str) -> None: if self.finder_court_alias is None: return for m in self.finder_court_alias.find_word(text): alias = m[0] rows = self.courts.loc[self.courts[self.court_alias_column] == alias] match_found = MatchFound(rows, m[1], m[2]) self.add_annotation(match_found) def find_court_by_any_key(self, phrase: LineOrPhrase): # find by court names matches = [] matches += self.find_court_by_name(phrase) matches += self.find_court_by_type_and_jurisdiction(phrase) matches = [m for m in matches if m is not None] if len(matches) == 0: return # find the best match matches.sort(key=lambda m: m.make_sort_key()) self.add_annotation(matches[0]) def find_court_by_name(self, phrase: LineOrPhrase) -> List[MatchFound]: match = self.find_court_by_key_column(phrase, self.finder_court_name, self.court_name_column) if match is None: return [] match[0].court_name = match[1][0][0] return [match[0]] def find_court_by_key_column( self, phrase: LineOrPhrase, phrase_finder: PhraseFinder, column: str) -> Tuple[MatchFound, List[PhraseMatch]]: found_substrings = phrase_finder.find_word(phrase.text, True) if len(found_substrings) == 0: return None subset = self.courts.loc[self.courts[column] == found_substrings[0][0]] if len(subset) == 0: return None match = MatchFound(subset, phrase.start + found_substrings[0][1], phrase.start + found_substrings[0][2]) return (match, found_substrings) def find_court_by_type_and_jurisdiction( self, phrase: LineOrPhrase) -> List[MatchFound]: court_types = self.finder_court_type.find_word(phrase.text, True) if len(court_types) == 0: return [] court_jurs = self.finder_jur.find_word(phrase.text, True) if len(court_types) != 1 or len(court_jurs) > 1: # special case: 2 ore more courts within the same phrase # (without commas or conjuctions) matches = [] for ct in court_types: m = MatchFound([], phrase.start + ct[1], phrase.start + ct[2]) m.court_type = ct[0] m.court_name = ct[0] matches.append(m) return matches if len(court_jurs) == 0: subset = self.courts.loc[self.courts[self.court_type_column] == court_types[0][0]] else: subset = self.courts.loc[ (self.courts[self.court_type_column] == court_types[0][0]) & (self.courts[self.jurisdiction_column] == court_jurs[0][0])] match = MatchFound(subset, phrase.start, phrase.start + court_types[0][2]) if len(subset) != 1: match.court_name = court_types[0][0] match.court_type = court_types[0][0] return [match] def add_annotation(self, match: MatchFound): mlen = len(match.subset) name = match.subset[self.court_name_column].values[0] \ if match.is_exact else \ match.court_name if match.court_name is not None else \ match.subset[self.court_name_column].values[0] if mlen > 0 else '' court_type = match.subset[self.court_type_column].values[0] \ if match.is_exact else \ match.court_type if match.court_type is not None else \ match.subset[self.court_type_column].values[0] if mlen > 0 else '' jurisdiction = match.subset[self.jurisdiction_column].values[0] \ if match.is_exact else \ match.jurisdiction if match.jurisdiction is not None else \ match.subset[self.jurisdiction_column].values[0] if mlen > 0 else '' ant = dict(attrs={ 'start': match.entry_start, 'end': match.entry_end }, tags={ 'Extracted Entity Type': 'court', 'Extracted Entity Court Name': name, 'Extracted Entity Court Type': court_type, 'Extracted Entity Court Jurisdiction': jurisdiction }) self.annotations.append(ant) @staticmethod def get_unique_col_values(col_values): return [c for c in col_values.unique() if c]