def test_extract_citations(case_factory, tmpdir, settings, elasticsearch):
    from scripts.extract_cites import EDITIONS as processed_editions

    settings.MISSED_CITATIONS_DIR = str(tmpdir)
    blocked_by_date = set(
        k for k in list(EDITIONS.keys()) + list(VARIATIONS_ONLY.keys())
        if all(c['start_year'] > 2000 for c in processed_editions[k]))
    legitimate_cites = [
        "225 F. Supp. 552",  # correct
        ["125 f supp 152", "125 F. Supp. 152"],  # normalized
        ["125 Burnett (Wis.) 152", "125 Bur. 152"],  # normalized
        ["1 F. 2d 2", "1 F.2d 2"],  # not matched as "1 F. 2"
        "2 1/2 Mass. 1",  # special volume numbers
        "3 Suppl. Mass. 2",  # special volume numbers
        "1 La.App. 5 Cir. 2",  # not matched as "1 La.App. 5"
        "2000 WL 12345",  # vendor cite
    ]
    legitimate_cites += [
        "1 %s 1" % c
        for c in EDITIONS.keys()
        if c not in blocked_by_date
    ]
    legitimate_cites += [
        ["1 %s 1" % k, "1 %s 1" % v]
        for k, vv in VARIATIONS_ONLY.items()
        for v in vv
        if k not in blocked_by_date
    ]
    legitimate_cites_normalized = set(
        normalize_cite(c if type(c) is str else c[1])
        for c in legitimate_cites)
    legitimate_cites = [c if type(c) is str else c[0] for c in legitimate_cites]
    illegitimate_cites = [
        "2 Dogs 3",  # unrecognized reporter
        "3 Dogs 4",  # duplicate unrecognized reporter
        "1 or 2",  # not matched as 1 Or. 2
        "word1 Mass. 2word",  # not matched if part of larger word
        "1 Mass.\n 2",  # no match across newlines
        "1 A.3d 1",  # no match to reporter that started publishing in 2010
    ]
    illegitimate_cites += ["1 %s 1" % c for c in blocked_by_date]
    case = case_factory(
        body_cache__text=", some text, ".join(legitimate_cites + illegitimate_cites),
        decision_date=datetime(2000, 1, 1))

    fabfile.extract_all_citations()
    update_elasticsearch_from_queue()

    # check extracted cites
    cites = list(ExtractedCitation.objects.all())
    cite_set = set(c.cite for c in cites)
    normalized_cite_set = set(c.normalized_cite for c in cites)

    assert cite_set == set(legitimate_cites)
    assert normalized_cite_set == legitimate_cites_normalized
    assert all(c.cited_by_id == case.pk for c in cites)
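# A minimal sketch of the normalization the test above depends on, assuming
# the real normalize_cite (defined elsewhere in the codebase) reduces a cite
# to a lowercase alphanumeric key. normalize_cite_sketch is a hypothetical
# stand-in, not the project's implementation.
import re

def normalize_cite_sketch(cite):
    # Collapse case and drop everything but letters and digits.
    return re.sub(r"[^0-9a-z]", "", cite.lower())

# The "normalized" pairs in legitimate_cites collapse to the same key:
assert normalize_cite_sketch("125 f supp 152") == normalize_cite_sketch("125 F. Supp. 152")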
def tokenize(text):
    """Tokenize text using regular expressions in the following steps:
     - Split the text by the occurrences of patterns which match a federal
       reporter, including the reporter strings as part of the resulting
       list.
     - Perform simple tokenization (whitespace split) on each of the
       non-reporter strings in the list.

    Example:
    >>> tokenize('See Roe v. Wade, 410 U. S. 113 (1973)')
    ['See', 'Roe', 'v.', 'Wade,', '410', 'U. S.', '113', '(1973)']
    """
    # if the text looks like the corner-case 'digit-REPORTER-digit', splitting
    # by spaces doesn't work
    if re.match(r"\d+\-[A-Za-z]+\-\d+", text):
        return text.split("-")
    # otherwise, we just split on spaces to find words
    strings = REPORTER_RE.split(text)
    words = []
    for string in strings:
        if string in list(EDITIONS.keys()) + list(VARIATIONS_ONLY.keys()):
            words.append(string)
        else:
            # Normalize spaces
            words.extend(_tokenize(string))
    return words
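# A self-contained illustration of the 'digit-REPORTER-digit' corner case
# handled above ("FLRA" here is just a hypothetical reporter string): with
# no spaces in the text, only the hyphen split can isolate the reporter.
import re

text = "1-FLRA-1"
if re.match(r"\d+\-[A-Za-z]+\-\d+", text):
    print(text.split("-"))  # -> ['1', 'FLRA', '1']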
def tokenize(text):
    """Tokenize text using regular expressions in the following steps:
     - Split the text by the occurrences of patterns which match a federal
       reporter, including the reporter strings as part of the resulting
       list.
     - Perform simple tokenization (whitespace split) on each of the
       non-reporter strings in the list.

    Example:
    >>> tokenize('See Roe v. Wade, 410 U. S. 113 (1973)')
    ['See', 'Roe', 'v.', 'Wade,', '410', 'U.S.', '113', '(1973)']
    """
    # if the text looks like the corner-case 'digit-REPORTER-digit', splitting
    # by spaces doesn't work
    if re.match(r'\d+\-[A-Za-z]+\-\d+', text):
        return text.split('-')
    # otherwise, we just split on spaces to find words
    strings = REPORTER_RE.split(text)
    words = []
    for string in strings:
        if string in EDITIONS.keys() + VARIATIONS_ONLY.keys():
            words.append(string)
        else:
            # Normalize spaces
            words.extend(_tokenize(string))
    return words
def get_citations(text, html=True, do_post_citation=True, do_defendant=True,
                  disambiguate=True):
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    # Exclude the last token when looking for reporters, because a valid
    # citation must have a page number after the reporter.
    for i in xrange(0, len(words) - 1):
        # Find reporter
        if words[i] in (EDITIONS.keys() + VARIATIONS_ONLY.keys()):
            citation = extract_base_citation(words, i)
            if citation is None:
                # Not a valid citation; continue looking
                continue
            if do_post_citation:
                add_post_citation(citation, words)
            if do_defendant:
                add_defendant(citation, words)
            citations.append(citation)

    if disambiguate:
        # Disambiguate or drop all the reporters
        citations = disambiguate_reporters(citations)

    for citation in citations:
        if not citation.court and is_scotus_reporter(citation):
            citation.court = 'scotus'

    return citations
def get_citations(text, html=True, do_post_citation=True, do_defendant=True):
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    # Exclude first and last tokens when looking for reporters, because valid
    # citations must have a volume before and a page number after the reporter.
    for i in xrange(1, len(words) - 1):
        # Find reporter
        if words[i] in (EDITIONS.keys() + VARIATIONS_ONLY.keys()):
            citation = extract_base_citation(words, i)
            if citation is None:
                # Not a valid citation; continue looking
                continue
            if do_post_citation:
                add_post_citation(citation, words, i)
            if do_defendant:
                add_defendant(citation, words, i)
            citations.append(citation)

    # Disambiguate or drop all the reporters
    citations = disambiguate_reporters(citations)

    for citation in citations:
        if not citation.court and is_scotus_reporter(citation):
            citation.court = "scotus"

    return citations
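# A toy illustration (not the project's extract_base_citation) of why the
# loop above runs over xrange(1, len(words) - 1): a reporter hit at index i
# needs words[i - 1] as a volume and words[i + 1] as a page, so the first
# and last tokens can never be the reporter of a valid citation.
def looks_like_cite(words, i):
    return (words[i - 1].isdigit()                   # volume before reporter
            and words[i + 1].rstrip(",").isdigit())  # page after reporter

words = ["See", "410", "U.S.", "113,", "(1973)"]
print(looks_like_cite(words, 2))  # -> True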
def tokenize(text):
    """Tokenize text using regular expressions in the following steps:
     - Split the text by the occurrences of patterns which match a federal
       reporter, including the reporter strings as part of the resulting
       list.
     - Perform simple tokenization (whitespace split) on each of the
       non-reporter strings in the list.

    Example:
    >>> tokenize('See Roe v. Wade, 410 U. S. 113 (1973)')
    ['See', 'Roe', 'v.', 'Wade,', '410', 'U.S.', '113', '(1973)']
    """
    strings = REPORTER_RE.split(text)
    words = []
    for string in strings:
        if string in EDITIONS.keys() + VARIATIONS_ONLY.keys():
            words.append(string)
        else:
            # Normalize spaces
            words.extend(_tokenize(string))
    return words
#!/usr/bin/env python
# encoding: utf-8

# Loosely adapted from the Natural Language Toolkit: Tokenizers
# URL: <http://nltk.sourceforge.net>
import re

from reporters_db import EDITIONS, VARIATIONS_ONLY

# We need to build a REGEX that has all the variations and the reporters in
# order from longest to shortest.
REGEX_LIST = list(EDITIONS.keys()) + list(VARIATIONS_ONLY.keys())
REGEX_LIST.sort(key=len, reverse=True)
REGEX_STR = "|".join(map(re.escape, REGEX_LIST))
REPORTER_RE = re.compile(r"(^|\s)(%s)(\s|,)" % REGEX_STR)


def normalize_variation(string):
    """Gets the best possible canonicalization of a variant spelling of a
    reporter.

    Variations map to lists of one or more results, and we need to figure
    out which is best. Usually, this can be accomplished using the year of
    the item.
    """
    if string in VARIATIONS_ONLY.keys():
        if len(VARIATIONS_ONLY[string]) == 1:
            # Simple case
            return VARIATIONS_ONLY[string][0]
        else:
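# A self-contained demonstration of why tokenize() sees reporters as
# standalone list items: re.split keeps the contents of capturing groups in
# its output. The toy pattern below merely stands in for the generated
# REPORTER_RE; note the longer alternative comes first, mirroring the
# longest-to-shortest sort above.
import re

toy_re = re.compile(r"(^|\s)(F\. Supp\.|U\.S\.)(\s|,)")
print(toy_re.split("See 410 U.S. 113 (1973)"))
# -> ['See 410', ' ', 'U.S.', ' ', '113 (1973)']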
#!/usr/bin/env python
# encoding: utf-8

# Loosely adapted from the Natural Language Toolkit: Tokenizers
# URL: <http://nltk.sourceforge.net>
import re

from reporters_db import EDITIONS, VARIATIONS_ONLY

# We need to build a REGEX that has all the variations and the reporters in
# order from longest to shortest.
REGEX_LIST = EDITIONS.keys() + VARIATIONS_ONLY.keys()
REGEX_LIST.sort(key=len, reverse=True)
REGEX_STR = '|'.join(map(re.escape, REGEX_LIST))
REPORTER_RE = re.compile(r"(^|\s)(%s)\s" % REGEX_STR)


def normalize_variation(string):
    """Gets the best possible canonicalization of a variant spelling of a
    reporter.

    Variations map to lists of one or more results, and we need to figure
    out which is best. Usually, this can be accomplished using the year of
    the item.
    """
    if string in VARIATIONS_ONLY.keys():
        if len(VARIATIONS_ONLY[string]) == 1:
            # Simple case
            return VARIATIONS_ONLY[string][0]
        else:
#!/usr/bin/env python
# encoding: utf-8

# Loosely adapted from the Natural Language Toolkit: Tokenizers
# URL: <http://nltk.sourceforge.net>
import re

from reporters_db import EDITIONS, VARIATIONS_ONLY

# We need to build a REGEX that has all the variations and the reporters in
# order from longest to shortest.
REGEX_LIST = EDITIONS.keys() + VARIATIONS_ONLY.keys()
REGEX_LIST.sort(key=len, reverse=True)
REGEX_STR = '|'.join(map(re.escape, REGEX_LIST))
REPORTER_RE = re.compile(r"\s(%s)\s" % REGEX_STR)


def normalize_variation(string):
    """Gets the best possible canonicalization of a variant spelling of a
    reporter.

    Variations map to lists of one or more results, and we need to figure
    out which is best. Usually, this can be accomplished using the year of
    the item.
    """
    if string in VARIATIONS_ONLY.keys():
        if len(VARIATIONS_ONLY[string]) == 1:
            # Simple case
            return VARIATIONS_ONLY[string][0]
        else:
def get_citations(
    text,
    html=True,
    do_post_citation=True,
    do_defendant=True,
    disambiguate=True,
):
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []

    for i in xrange(0, len(words) - 1):
        citation_token = words[i]

        # CASE 1: Citation token is a reporter (e.g., "U. S.").
        # In this case, first try extracting it as a standard, full citation,
        # and if that fails try extracting it as a short form citation.
        if citation_token in (EDITIONS.keys() + VARIATIONS_ONLY.keys()):
            citation = extract_full_citation(words, i)
            if citation:
                # CASE 1A: Standard citation found, try to add additional data
                if do_post_citation:
                    add_post_citation(citation, words)
                if do_defendant:
                    add_defendant(citation, words)
            else:
                # CASE 1B: Standard citation not found, so see if this
                # reference to a reporter is a short form citation instead
                citation = extract_shortform_citation(words, i)

                if not citation:
                    # Neither a full nor short form citation
                    continue

        # CASE 2: Citation token is an "Id." or "Ibid." reference.
        # In this case, the citation is simply to the immediately previous
        # document, but for safety we won't make that resolution until the
        # previous citation has been successfully matched to an opinion.
        elif citation_token.lower() in {"id.", "id.,", "ibid."}:
            citation = extract_id_citation(words, i)

        # CASE 3: Citation token is a "supra" reference.
        # In this case, we're not sure yet what the citation's antecedent is.
        # It could be any of the previous citations above. Thus, like an Id.
        # citation, we won't be able to resolve this reference until the
        # previous citations are actually matched to opinions.
        elif strip_punct(citation_token.lower()) == "supra":
            citation = extract_supra_citation(words, i)

        # CASE 4: Citation token is a section marker.
        # In this case, it's likely that this is a reference to a non-
        # opinion document. So we record this marker in order to keep
        # an accurate list of the possible antecedents for id citations.
        elif u"§" in citation_token:
            citation = NonopinionCitation(match_token=citation_token)

        # CASE 5: The token is not a citation.
        else:
            continue

        citations.append(citation)

    # Disambiguate each citation's reporter
    if disambiguate:
        citations = disambiguate_reporters(citations)

    citations = remove_address_citations(citations)

    # Set each citation's court property to "scotus" by default
    for citation in citations:
        if (isinstance(citation, Citation)
                and not citation.court
                and is_scotus_reporter(citation)):
            citation.court = "scotus"

    # Returns a list of citations ordered in the sequence that they appear in
    # the document. The ordering of this list is important because we will
    # later rely on that order to reconstruct the references of the
    # ShortformCitation, SupraCitation, and IdCitation objects.
    return citations
def get_citations(
    text: str,
    html: bool = True,
    do_post_citation: bool = True,
    do_defendant: bool = True,
    disambiguate: bool = True,
) -> List[Union[NonopinionCitation, Citation]]:
    """Extract citations from a text, returning them in document order."""
    if html:
        text = get_visible_text(text)
    words = tokenize(text)
    citations: List[Union[Citation, NonopinionCitation]] = []

    for i in range(0, len(words) - 1):
        citation_token = words[i]
        citation: Union[Citation, NonopinionCitation, None] = None

        # CASE 1: Citation token is a reporter (e.g., "U. S.").
        # In this case, first try extracting it as a standard, full citation,
        # and if that fails try extracting it as a short form citation.
        if citation_token in list(EDITIONS.keys()) + list(
                VARIATIONS_ONLY.keys()):
            citation = extract_full_citation(words, i)
            if citation:
                # CASE 1A: Standard citation found, try to add additional data
                if do_post_citation:
                    add_post_citation(citation, words)
                if do_defendant:
                    add_defendant(citation, words)
            else:
                # CASE 1B: Standard citation not found, so see if this
                # reference to a reporter is a short form citation instead
                citation = extract_shortform_citation(words, i)

                if not citation:
                    # Neither a full nor short form citation
                    continue

        # CASE 2: Citation token is an "Id." or "Ibid." reference.
        # In this case, the citation should simply be to the item cited
        # immediately prior, but for safety we will leave that resolution up
        # to the user.
        elif citation_token.lower() in {"id.", "id.,", "ibid."}:
            citation = extract_id_citation(words, i)

        # CASE 3: Citation token is a "supra" reference.
        # In this case, we're not sure yet what the citation's antecedent is.
        # It could be any of the previous citations above. Thus, like an Id.
        # citation, for safety we won't resolve this reference yet.
        elif strip_punct(citation_token.lower()) == "supra":
            citation = extract_supra_citation(words, i)

        # CASE 4: Citation token is a section marker.
        # In this case, it's likely that this is a reference to a non-
        # opinion document. So we record this marker in order to keep
        # an accurate list of the possible antecedents for id citations.
        elif "§" in citation_token:
            citation = NonopinionCitation(match_token=citation_token)

        # CASE 5: The token is not a citation.
        else:
            continue

        if citation is not None:
            citations.append(citation)

    # Disambiguate each citation's reporter
    if disambiguate:
        citations = disambiguate_reporters(citations)

    citations = remove_address_citations(citations)

    # Set each citation's court property to "scotus" by default
    for citation in citations:
        if (isinstance(citation, Citation)
                and not citation.court
                and is_scotus_reporter(citation)):
            citation.court = "scotus"

    # Returns a list of citations ordered in the sequence that they appear in
    # the document. The ordering of this list is important for reconstructing
    # the references of the ShortformCitation, SupraCitation, and IdCitation
    # objects.
    return citations
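# A minimal, self-contained sketch of why the returned list must preserve
# document order; this toy resolver is illustrative only, not part of the
# project. An "id" entry is meaningful only relative to the citation that
# precedes it in the document.
def resolve_ids_sketch(citations):
    resolved, last_full = [], None
    for c in citations:
        if c == "id":
            resolved.append(last_full)  # points back at the previous cite
        else:
            last_full = c
            resolved.append(c)
    return resolved

print(resolve_ids_sketch(["410 U.S. 113", "id", "347 U.S. 483", "id"]))
# -> ['410 U.S. 113', '410 U.S. 113', '347 U.S. 483', '347 U.S. 483']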