def test_reporter_tokenizer(self):
    """Do we tokenize correctly?"""
    self.assertEqual(tokenize('See Roe v. Wade, 410 U. S. 113 (1973)'),
                     ['See', 'Roe', 'v.', 'Wade,', '410', 'U. S.',
                      '113', '(1973)'])
    self.assertEqual(tokenize('Foo bar eats grue, 232 Vet. App. (2003)'),
                     ['Foo', 'bar', 'eats', 'grue,', '232', 'Vet. App.',
                      '(2003)'])
def test_reporter_tokenizer(self):
    """Do we tokenize correctly?"""
    self.assertEqual(
        tokenize('See Roe v. Wade, 410 U. S. 113 (1973)'),
        ['See', 'Roe', 'v.', 'Wade,', '410', 'U. S.', '113', '(1973)'])
    self.assertEqual(
        tokenize('Foo bar eats grue, 232 Vet. App. (2003)'),
        ['Foo', 'bar', 'eats', 'grue,', '232', 'Vet. App.', '(2003)'])
    # Tests that the tokenizer handles whitespace well. In the past, the
    # capital letter P in 5243-P matched the abbreviation for the Pacific
    # reporter ("P"), and the tokenizing would be wrong.
    self.assertEqual(
        tokenize('Failed to recognize 1993 Ct. Sup. 5243-P'),
        ['Failed', 'to', 'recognize', '1993', 'Ct. Sup.', '5243-P'])
def test_reporter_tokenizer(self):
    """Do we tokenize correctly?"""
    self.assertEqual(tokenize('See Roe v. Wade, 410 U. S. 113 (1973)'),
                     ['See', 'Roe', 'v.', 'Wade,', '410', 'U. S.',
                      '113', '(1973)'])
    self.assertEqual(tokenize('Foo bar eats grue, 232 Vet. App. (2003)'),
                     ['Foo', 'bar', 'eats', 'grue,', '232', 'Vet. App.',
                      '(2003)'])
    # Tests that the tokenizer handles whitespace well. In the past, the
    # capital letter P in 5243-P matched the abbreviation for the Pacific
    # reporter ("P"), and the tokenizing would be wrong.
    self.assertEqual(tokenize('Failed to recognize 1993 Ct. Sup. 5243-P'),
                     ['Failed', 'to', 'recognize', '1993', 'Ct. Sup.',
                      '5243-P'])
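# The tests above pin down two behaviors: multi-word reporter abbreviations
# ("U. S.", "Vet. App.", "Ct. Sup.") must survive as single tokens, and a
# reporter string may only match between whitespace, so the trailing "P" in
# "5243-P" never matches the Pacific reporter. Below is a minimal sketch of
# a tokenizer with both properties; it is not the project's actual
# reporter_tokenizer, and REPORTER_STRINGS is a stand-in for the real
# EDITIONS/VARIATIONS data.
import re

REPORTER_STRINGS = ['U. S.', 'Vet. App.', 'Ct. Sup.', 'P.']  # illustrative

# Longest-first so "Vet. App." beats shorter overlaps; each alternative is
# anchored to whitespace (or string boundaries) on both sides.
REPORTER_RE = re.compile(
    r'(?:^|\s)(%s)(?=\s|$)' % '|'.join(
        re.escape(r) for r in
        sorted(REPORTER_STRINGS, key=len, reverse=True)))


def tokenize_sketch(text):
    """Split on whitespace, keeping whole reporter strings together."""
    tokens, last_end = [], 0
    for match in REPORTER_RE.finditer(text):
        # Everything before the reporter splits on whitespace as usual.
        tokens.extend(text[last_end:match.start(1)].split())
        tokens.append(match.group(1))
        last_end = match.end(1)
    tokens.extend(text[last_end:].split())
    return tokens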
def get_citations(text, html=True, do_post_citation=True, do_defendant=True,
                  disambiguate=True):
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    # Exclude the last token when looking for reporters, because a valid
    # citation must have a page after the reporter.
    for i in xrange(0, len(words) - 1):
        # Find reporter
        if words[i] in (EDITIONS.keys() + VARIATIONS_ONLY.keys()):
            citation = extract_base_citation(words, i)
            if citation is None:
                # Not a valid citation; continue looking
                continue
            if do_post_citation:
                add_post_citation(citation, words)
            if do_defendant:
                add_defendant(citation, words)
            citations.append(citation)

    if disambiguate:
        # Disambiguate or drop all the reporters
        citations = disambiguate_reporters(citations)

    for citation in citations:
        if not citation.court and is_scotus_reporter(citation):
            citation.court = 'scotus'

    return citations
def get_citations(text, html=True, do_post_citation=True, do_defendant=True):
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    # Exclude first and last tokens when looking for reporters, because valid
    # citations must have a volume before and a page number after the
    # reporter.
    for i in xrange(1, len(words) - 1):
        # Find reporter
        if words[i] in (EDITIONS.keys() + VARIATIONS_ONLY.keys()):
            citation = extract_base_citation(words, i)
            if citation is None:
                # Not a valid citation; continue looking
                continue
            if do_post_citation:
                add_post_citation(citation, words)
            if do_defendant:
                add_defendant(citation, words)
            citations.append(citation)

    # Disambiguate or drop all the reporters
    citations = disambiguate_reporters(citations)

    for citation in citations:
        if not citation.court and is_scotus_reporter(citation):
            citation.court = 'scotus'

    return citations
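# get_citations defers the actual volume/page validation to
# extract_base_citation, which is not shown in this section. Below is a
# minimal sketch of the check that makes the loop bounds above safe (a
# volume token before the reporter, a page token after it), using a
# stand-in namedtuple rather than the real Citation class.
from collections import namedtuple

BaseCitation = namedtuple('BaseCitation', 'volume reporter page')  # stand-in


def extract_base_citation_sketch(words, reporter_index):
    """Expect a numeric volume before the reporter and a page after it."""
    volume = words[reporter_index - 1]
    page = words[reporter_index + 1].rstrip(',;:.')
    # NB: real page tokens can be non-numeric (e.g., "5243-P"); this sketch
    # keeps the happy path only.
    if not (volume.isdigit() and page.isdigit()):
        return None  # A reporter-looking token with no citation around it
    return BaseCitation(volume=int(volume), reporter=words[reporter_index],
                        page=page)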
def test_reporter_tokenizer(self):
    """Do we tokenize correctly?"""
    self.assertEqual(
        tokenize("See Roe v. Wade, 410 U. S. 113 (1973)"),
        ["See", "Roe", "v.", "Wade,", "410", "U. S.", "113", "(1973)"],
    )
    self.assertEqual(
        tokenize("Foo bar eats grue, 232 Vet. App. (2003)"),
        ["Foo", "bar", "eats", "grue,", "232", "Vet. App.", "(2003)"],
    )
    # Tests that the tokenizer handles whitespace well. In the past, the
    # capital letter P in 5243-P matched the abbreviation for the Pacific
    # reporter ("P"), and the tokenizing would be wrong.
    self.assertEqual(
        tokenize("Failed to recognize 1993 Ct. Sup. 5243-P"),
        ["Failed", "to", "recognize", "1993", "Ct. Sup.", "5243-P"],
    )
def get_citations(
    text,
    html=True,
    do_post_citation=True,
    do_defendant=True,
    disambiguate=True,
):
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []

    for i in xrange(0, len(words) - 1):
        citation_token = words[i]

        # CASE 1: Citation token is a reporter (e.g., "U. S.").
        # In this case, first try extracting it as a standard, full citation,
        # and if that fails try extracting it as a short form citation.
        if citation_token in (EDITIONS.keys() + VARIATIONS_ONLY.keys()):
            citation = extract_full_citation(words, i)
            if citation:
                # CASE 1A: Standard citation found, try to add additional data
                if do_post_citation:
                    add_post_citation(citation, words)
                if do_defendant:
                    add_defendant(citation, words)
            else:
                # CASE 1B: Standard citation not found, so see if this
                # reference to a reporter is a short form citation instead
                citation = extract_shortform_citation(words, i)

                if not citation:
                    # Neither a full nor short form citation
                    continue

        # CASE 2: Citation token is an "Id." or "Ibid." reference.
        # In this case, the citation is simply to the immediately previous
        # document, but for safety we won't make that resolution until the
        # previous citation has been successfully matched to an opinion.
        elif citation_token.lower() in {"id.", "id.,", "ibid."}:
            citation = extract_id_citation(words, i)

        # CASE 3: Citation token is a "supra" reference.
        # In this case, we're not sure yet what the citation's antecedent is.
        # It could be any of the previous citations above. Thus, like an Id.
        # citation, we won't be able to resolve this reference until the
        # previous citations are actually matched to opinions.
        elif strip_punct(citation_token.lower()) == "supra":
            citation = extract_supra_citation(words, i)

        # CASE 4: Citation token is a section marker.
        # In this case, it's likely that this is a reference to a non-
        # opinion document. So we record this marker in order to keep
        # an accurate list of the possible antecedents for id citations.
        elif u"§" in citation_token:
            citation = NonopinionCitation(match_token=citation_token)

        # CASE 5: The token is not a citation.
        else:
            continue

        citations.append(citation)

    # Disambiguate each citation's reporter
    if disambiguate:
        citations = disambiguate_reporters(citations)

    citations = remove_address_citations(citations)

    # Set each citation's court property to "scotus" by default
    for citation in citations:
        if (isinstance(citation, Citation) and not citation.court and
                is_scotus_reporter(citation)):
            citation.court = "scotus"

    # Returns a list of citations ordered in the sequence that they appear in
    # the document. The ordering of this list is important because we will
    # later rely on that order to reconstruct the references of the
    # ShortformCitation, SupraCitation, and IdCitation objects.
    return citations
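# The ordering comment at the end of get_citations is the hook for a later
# resolution pass. Below is a rough sketch of how such a pass might walk the
# ordered list, reusing the class names from the code above; the antecedent
# attribute and the simplified matching rule are assumptions, and the real
# resolution logic is considerably more involved.
def resolve_citations_sketch(citations):
    """Attach Id./supra citations to an antecedent, in document order."""
    antecedent = None
    for citation in citations:
        if isinstance(citation, (IdCitation, SupraCitation)):
            # "Id." points at whatever was cited immediately before it;
            # supra needs a backwards search, collapsed here to the same.
            citation.antecedent = antecedent  # assumed attribute name
        elif isinstance(citation, NonopinionCitation):
            # A section marker means a following "Id." refers to a
            # non-opinion document, so stop resolving against opinions.
            antecedent = None
        elif isinstance(citation, Citation):
            # Full and short form citations become the new antecedent.
            antecedent = citation
    return citations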