Example #1
 def test_reporter_tokenizer(self):
     """Do we tokenize correctly?

     Multi-word reporter abbreviations ("U. S.", "Vet. App.") are expected
     to come back as a single token each, while the surrounding words are
     split on whitespace.
     """
     self.assertEqual(tokenize('See Roe v. Wade, 410 U. S. 113 (1973)'),
                      ['See', 'Roe', 'v.', 'Wade,', '410', 'U. S.', '113',
                       '(1973)'])
     self.assertEqual(tokenize('Foo bar eats grue, 232 Vet. App. (2003)'),
                      ['Foo', 'bar', 'eats', 'grue,', '232', 'Vet. App.',
                       '(2003)'])
Example #2
 def test_reporter_tokenizer(self):
     """Do we tokenize correctly?

     Multi-word reporter abbreviations ("U. S.", "Vet. App.", "Ct. Sup.")
     are expected to come back as a single token each, while the surrounding
     words are split on whitespace.
     """
     self.assertEqual(
         tokenize('See Roe v. Wade, 410 U. S. 113 (1973)'),
         ['See', 'Roe', 'v.', 'Wade,', '410', 'U. S.', '113', '(1973)'])
     self.assertEqual(
         tokenize('Foo bar eats grue, 232 Vet. App. (2003)'),
         ['Foo', 'bar', 'eats', 'grue,', '232', 'Vet. App.', '(2003)'])
     # Tests that the tokenizer handles whitespace well. In the past, the
     # capital letter P in 5243-P matched the abbreviation for the Pacific
     # reporter ("P"), and the tokenizing would be wrong.
     self.assertEqual(
         tokenize('Failed to recognize 1993 Ct. Sup. 5243-P'),
         ['Failed', 'to', 'recognize', '1993', 'Ct. Sup.', '5243-P'])
Example #3
 def test_reporter_tokenizer(self):
     """Do we tokenize correctly?

     Multi-word reporter abbreviations ("U. S.", "Vet. App.", "Ct. Sup.")
     are expected to come back as a single token each, while the surrounding
     words are split on whitespace.
     """
     self.assertEqual(tokenize('See Roe v. Wade, 410 U. S. 113 (1973)'),
                      ['See', 'Roe', 'v.', 'Wade,', '410', 'U. S.', '113',
                       '(1973)'])
     self.assertEqual(tokenize('Foo bar eats grue, 232 Vet. App. (2003)'),
                      ['Foo', 'bar', 'eats', 'grue,', '232', 'Vet. App.',
                       '(2003)'])
     # Tests that the tokenizer handles whitespace well. In the past, the
     # capital letter P in 5243-P matched the abbreviation for the Pacific
     # reporter ("P"), and the tokenizing would be wrong.
     self.assertEqual(tokenize('Failed to recognize 1993 Ct. Sup. 5243-P'),
                      ['Failed', 'to', 'recognize', '1993', 'Ct. Sup.',
                       '5243-P'])
def get_citations(text, html=True, do_post_citation=True, do_defendant=True,
                  disambiguate=True):
    """Return the citations found in ``text``, in the order they appear.

    NOTE(review): the keyword parameters are the names the body reads; their
    order and ``True`` defaults are assumed -- confirm against callers.

    :param text: The document text to scan.
    :param html: When true, ``text`` is first reduced with
        ``get_visible_text`` before tokenizing.
    :param do_post_citation: When true, ``add_post_citation`` is run on each
        citation that was extracted.
    :param do_defendant: When true, ``add_defendant`` is run on each citation
        that was extracted.
    :param disambiguate: When true, the collected citations are passed
        through ``disambiguate_reporters``.
    :return: A list of citation objects. Any citation with a falsy ``court``
        for which ``is_scotus_reporter`` is true gets ``court = 'scotus'``.
    """
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    # Exclude first and last tokens when looking for reporters, because valid
    # citations must have a volume before and a page after the reporter.
    for i in range(1, len(words) - 1):
        # Find reporter. Membership is tested against each mapping directly
        # rather than concatenating their key lists on every iteration.
        if words[i] in EDITIONS or words[i] in VARIATIONS_ONLY:
            citation = extract_base_citation(words, i)
            if citation is None:
                # Not a valid citation; continue looking
                continue
            if do_post_citation:
                add_post_citation(citation, words)
            if do_defendant:
                add_defendant(citation, words)
            citations.append(citation)

    if disambiguate:
        # Disambiguate or drop all the reporters
        citations = disambiguate_reporters(citations)

    for citation in citations:
        if not citation.court and is_scotus_reporter(citation):
            citation.court = 'scotus'

    return citations
Example #5
def get_citations(text, html=True, do_post_citation=True, do_defendant=True):
    """Return the citations found in ``text``, in the order they appear.

    :param text: The document text to scan.
    :param html: When true, ``text`` is first reduced with
        ``get_visible_text`` before tokenizing.
    :param do_post_citation: When true, ``add_post_citation`` is run on each
        citation that was extracted.
    :param do_defendant: When true, ``add_defendant`` is run on each citation
        that was extracted.
    :return: The list produced by ``disambiguate_reporters``. Any citation
        with a falsy ``court`` for which ``is_scotus_reporter`` is true gets
        ``court = 'scotus'``.
    """
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    # Exclude first and last tokens when looking for reporters, because valid
    # citations must have a volume before and a page number after the reporter.
    for i in range(1, len(words) - 1):
        # Find reporter. Membership is tested against each mapping directly
        # rather than concatenating their key lists on every iteration.
        if words[i] in EDITIONS or words[i] in VARIATIONS_ONLY:
            citation = extract_base_citation(words, i)
            if citation is None:
                # Not a valid citation; continue looking
                continue
            if do_post_citation:
                add_post_citation(citation, words)
            if do_defendant:
                add_defendant(citation, words)
            citations.append(citation)

    # Disambiguate or drop all the reporters
    citations = disambiguate_reporters(citations)

    for citation in citations:
        if not citation.court and is_scotus_reporter(citation):
            citation.court = 'scotus'

    return citations
Example #6
 def test_reporter_tokenizer(self):
     """Do we tokenize correctly?

     Multi-word reporter abbreviations ("U. S.", "Vet. App.", "Ct. Sup.")
     are expected to come back as a single token each, while the surrounding
     words are split on whitespace.
     """
     self.assertEqual(
         tokenize("See Roe v. Wade, 410 U. S. 113 (1973)"),
         ["See", "Roe", "v.", "Wade,", "410", "U. S.", "113", "(1973)"],
     )
     self.assertEqual(
         tokenize("Foo bar eats grue, 232 Vet. App. (2003)"),
         ["Foo", "bar", "eats", "grue,", "232", "Vet. App.", "(2003)"],
     )
     # Tests that the tokenizer handles whitespace well. In the past, the
     # capital letter P in 5243-P matched the abbreviation for the Pacific
     # reporter ("P"), and the tokenizing would be wrong.
     self.assertEqual(
         tokenize("Failed to recognize 1993 Ct. Sup. 5243-P"),
         ["Failed", "to", "recognize", "1993", "Ct. Sup.", "5243-P"],
     )
Example #7
def get_citations(
    text,
    html=True,
    do_post_citation=True,
    do_defendant=True,
    disambiguate=True,
):
    """Return every citation-like reference in ``text``, in document order.

    Each token except the last is classified as a reporter (full or short
    form citation), an "Id."/"Ibid." reference, a "supra" reference, or a
    section marker; anything else is skipped.

    NOTE(review): the parameter list is not visible in the original; the
    names are the ones the body reads, and their order and ``True`` defaults
    are assumed -- confirm against callers.

    :param text: The document text to scan.
    :param html: When true, ``text`` is first reduced with
        ``get_visible_text`` before tokenizing.
    :param do_post_citation: When true, ``add_post_citation`` is run on each
        full citation that was extracted.
    :param do_defendant: When true, ``add_defendant`` is run on each full
        citation that was extracted.
    :param disambiguate: When true, the collected citations are passed
        through ``disambiguate_reporters``.
    :return: A list of citation objects, after ``remove_address_citations``
        has been applied.
    """
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []

    # The final token is never examined as a citation token.
    for i, citation_token in enumerate(words[:-1]):
        # CASE 1: Citation token is a reporter (e.g., "U. S.").
        # In this case, first try extracting it as a standard, full citation,
        # and if that fails try extracting it as a short form citation.
        if citation_token in EDITIONS or citation_token in VARIATIONS_ONLY:
            citation = extract_full_citation(words, i)
            if citation:
                # CASE 1A: Standard citation found, try to add additional data
                if do_post_citation:
                    add_post_citation(citation, words)
                if do_defendant:
                    add_defendant(citation, words)
            else:
                # CASE 1B: Standard citation not found, so see if this
                # reference to a reporter is a short form citation instead
                citation = extract_shortform_citation(words, i)

                if not citation:
                    # Neither a full nor short form citation
                    continue

        # CASE 2: Citation token is an "Id." or "Ibid." reference.
        # In this case, the citation is simply to the immediately previous
        # document, but for safety we won't make that resolution until the
        # previous citation has been successfully matched to an opinion.
        elif citation_token.lower() in {"id.", "id.,", "ibid."}:
            citation = extract_id_citation(words, i)

        # CASE 3: Citation token is a "supra" reference.
        # In this case, we're not sure yet what the citation's antecedent is.
        # It could be any of the previous citations above. Thus, like an Id.
        # citation, we won't be able to resolve this reference until the
        # previous citations are actually matched to opinions.
        elif strip_punct(citation_token.lower()) == "supra":
            citation = extract_supra_citation(words, i)

        # CASE 4: Citation token is a section marker.
        # In this case, it's likely that this is a reference to a non-
        # opinion document. So we record this marker in order to keep
        # an accurate list of the possible antecedents for id citations.
        # The marker is written as an escape (U+00A7, SECTION SIGN) so the
        # literal cannot be corrupted by a source-encoding mismatch.
        elif u"\u00a7" in citation_token:
            citation = NonopinionCitation(match_token=citation_token)

        # CASE 5: The token is not a citation.
        else:
            continue

        citations.append(citation)

    # Disambiguate each citation's reporter
    if disambiguate:
        citations = disambiguate_reporters(citations)

    citations = remove_address_citations(citations)

    # Set each citation's court property to "scotus" by default
    for citation in citations:
        if (isinstance(citation, Citation) and not citation.court
                and is_scotus_reporter(citation)):
            citation.court = "scotus"

    # Returns a list of citations ordered in the sequence that they appear in
    # the document. The ordering of this list is important because we will
    # later rely on that order to reconstruct the references of the
    # ShortformCitation, SupraCitation, and IdCitation objects.
    return citations