Ejemplo n.º 1
0
    def test_make_html(self):
        """Can we make basic HTML conversions properly?"""
        expected = (
            '<pre class="inline">asdf </pre><span class="citation '
            'no-link"><span class="volume">22</span> <span '
            'class="reporter">U.S.</span> <span class="page">33</span>'
            '</span><pre class="inline"> asdf</pre>')

        # Both the canonical reporter string and its spaced variant
        # (Issue #409) must render to the same markup.
        for text in ('asdf 22 U.S. 33 asdf', 'asdf 22 U. S. 33 asdf'):
            opinion = Opinion(plain_text=text)
            found = get_citations(text)
            rendered = create_cited_html(opinion, found)
            self.assertEqual(
                expected,
                rendered,
            )
Ejemplo n.º 2
0
    def test_make_html(self):
        """Can we make basic HTML conversions properly?"""
        good_html = (
            '<pre class="inline">asdf </pre><span class="citation '
            'no-link"><span class="volume">22</span> <span '
            'class="reporter">U.S.</span> <span class="page">33</span>'
            '</span><pre class="inline"> asdf</pre>'
        )

        def check(plain_text):
            # Build an Opinion from the raw text and compare the cited
            # HTML against the expected markup.
            opinion = Opinion(plain_text=plain_text)
            cited = create_cited_html(opinion, get_citations(plain_text))
            self.assertEqual(good_html, cited)

        # Simple example
        check('asdf 22 U.S. 33 asdf')
        # Using a variant format for U.S. (Issue #409)
        check('asdf 22 U. S. 33 asdf')
Ejemplo n.º 3
0
def get_document_citations(opinion):
    """Identify and return citations from the html or plain text of the
    opinion.
    """
    # Prefer the HTML sources in fixed priority order; fall back to
    # plain text (which needs html=False) and finally an empty list.
    for attr in ('html_columbia', 'html_lawbox', 'html'):
        markup = getattr(opinion, attr)
        if markup:
            return find_citations.get_citations(markup)
    if opinion.plain_text:
        return find_citations.get_citations(opinion.plain_text, html=False)
    return []
Ejemplo n.º 4
0
def get_document_citations(opinion):
    """Identify and return citations from the html or plain text of the
    opinion.
    """
    # Guard-clause form: return from the first populated source.
    if opinion.html_columbia:
        return find_citations.get_citations(opinion.html_columbia)
    if opinion.html_lawbox:
        return find_citations.get_citations(opinion.html_lawbox)
    if opinion.html:
        return find_citations.get_citations(opinion.html)
    if opinion.plain_text:
        # Plain text is parsed with HTML handling disabled.
        return find_citations.get_citations(opinion.plain_text, html=False)
    return []
Ejemplo n.º 5
0
 def test_citation_matching_issue621(self):
     """Make sure that a citation like 1 Wheat 9 doesn't match 9 Wheat 1"""
     # The fixture contains a reference to 9 F. 1, so we expect no results.
     cite = get_citations('1 F. 9 (1795)')[0]
     self.assertEqual([], match_citation(cite))
Ejemplo n.º 6
0
 def test_citation_matching_issue621(self):
     """Make sure that a citation like 1 Wheat 9 doesn't match 9 Wheat 1"""
     citation_str = '1 F. 9 (1795)'
     # The fixture contains a reference to 9 F. 1, so we expect no results.
     citations = get_citations(citation_str)
     matches = match_citation(citations[0])
     self.assertEqual([], matches)
Ejemplo n.º 7
0
 def test_find_tc_citations(self):
     """Can we parse tax court citations properly?"""
     # Each pair is (input text, expected list of Citation objects).
     test_pairs = (
         # Test with atypical formatting for Tax Court Memos
         ('the 1 T.C. No. 233',
          [Citation(volume=1, reporter='T.C. No.', page=233,
                    canonical_reporter=u'T.C. No.', lookup_index=0,
                    reporter_index=2, reporter_found='T.C. No.')]),
         ('word T.C. Memo. 2019-233',
          [Citation(volume=2019, reporter='T.C. Memo.', page=233,
                    canonical_reporter=u'T.C. Memo.', lookup_index=0,
                    reporter_index=1, reporter_found='T.C. Memo.')]),
         ('something T.C. Summary Opinion 2019-233',
          [Citation(volume=2019, reporter='T.C. Summary Opinion', page=233,
                    canonical_reporter=u'T.C. Summary Opinion',
                    lookup_index=0,
                    reporter_index=1,
                    reporter_found='T.C. Summary Opinion')]),
         ('T.C. Summary Opinion 2018-133',
          [Citation(volume=2018, reporter='T.C. Summary Opinion', page=133,
                    canonical_reporter=u'T.C. Summary Opinion',
                    lookup_index=0,
                    reporter_index=0,
                    reporter_found='T.C. Summary Opinion')]),
         # The long-form "UNITED STATES TAX COURT REPORT" wording should
         # canonicalize to the T.C. reporter.
         ('1     UNITED STATES TAX COURT REPORT   (2018)',
          [Citation(volume=1, reporter='T.C.', page=2018,
                    canonical_reporter=u'T.C.',
                    lookup_index=0,
                    reporter_index=1,
                    reporter_found='UNITED STATES TAX COURT REPORT')]),
         ('U.S. of A. 1     UNITED STATES TAX COURT REPORT   (2018)',
          [Citation(volume=1, reporter='T.C.', page=2018,
                    canonical_reporter=u'T.C.',
                    lookup_index=0,
                    reporter_index=4,
                    reporter_found='UNITED STATES TAX COURT REPORT')]),
         # Sanity check: an ordinary U.S. citation is untouched.
         ('U.S. 1234 1 U.S. 1',
          [Citation(volume=1, reporter='U.S.', page=1,
                    canonical_reporter=u'U.S.',
                    lookup_index=0,
                    reporter_index=3,
                    court='scotus',
                    reporter_found='U.S.')]),
     )
     # NOTE: Python 2 print statements; this file predates a py3 port.
     for q, a in test_pairs:
         print "Testing citation extraction for %s..." % q,
         cites_found = get_citations(q)
         self.assertEqual(
             cites_found,
             a,
             msg='%s\n%s\n\n    !=\n\n%s' % (
                 q,
                 ",\n".join([str(cite.__dict__) for cite in cites_found]),
                 ",\n".join([str(cite.__dict__) for cite in a]),
             )
         )
         print "✓"
def make_citation(cite_str, cluster, cite_type):
    """Create and return a citation object for the input values."""
    # The first parsed citation from the string is authoritative.
    parsed = get_citations(cite_str)[0]
    return Citation(
        cluster=cluster,
        volume=parsed.volume,
        reporter=parsed.reporter,
        page=parsed.page,
        type=cite_type,
    )
def make_citation(cite_str, cluster, cite_type):
    """Create and return a citation object for the input values."""
    first = get_citations(cite_str)[0]
    # Assemble the constructor arguments explicitly before building.
    fields = {
        'cluster': cluster,
        'volume': first.volume,
        'reporter': first.reporter,
        'page': first.page,
        'type': cite_type,
    }
    return Citation(**fields)
Ejemplo n.º 10
0
def get_citations_from_tree(complete_html_tree, case_path):
    """Pull citations out of a parsed opinion HTML tree.

    Looks in <center> elements first (skipping docket-number lines),
    falls back to the <title>, then to recorded manual fixes, and may
    prompt interactively when debugging is enabled.

    :param complete_html_tree: lxml tree of the full document.
    :param case_path: Path of the source file, used as the fixes key.
    :return: A list of citation objects (possibly empty).
    """
    # Select <center> elements whose text does not start with a docket
    # number marker ("No.", "Case No.", "Record No.").
    path = ('//center[descendant::text()[not('
            'starts-with(normalize-space(.), "No.") or '
            'starts-with(normalize-space(.), "Case No.") or '
            'starts-with(normalize-space(.), "Record No.")'
            ')]]')
    citations = []
    for e in complete_html_tree.xpath(path):
        text = tostring(e, method='text', encoding='unicode')
        citations.extend(get_citations(text, html=False, do_defendant=False))
    if not citations:
        # Fall back to the document <title>.
        path = '//title/text()'
        text = complete_html_tree.xpath(path)[0]
        citations = get_citations(text, html=False, do_post_citation=False,
                                  do_defendant=False)

    if not citations:
        # Last resort: previously recorded manual fixes, or an
        # interactive prompt when the 'input_citations' debug flag is on.
        try:
            citations = fixes[case_path]['citations']
        except KeyError:
            if 'input_citations' in DEBUG:
                # Open the case in a browser so the operator can read it.
                subprocess.Popen(
                    ['firefox', 'file://%s' % case_path],
                    shell=False
                ).communicate()
                # NOTE: raw_input is Python 2; this code predates py3.
                input_citation = raw_input(
                    '  No citations found. What should be here? ')
                citation_objects = get_citations(
                    input_citation,
                    html=False,
                    do_post_citation=False,
                    do_defendant=False
                )
                # Remember the answer for future runs.
                add_fix(case_path, {'citations': citation_objects})
                citations = citation_objects

    if 'citations' in DEBUG and len(citations):
        cite_strs = [str(cite.__dict__) for cite in citations]
        log_print(
            "  Citations found: %s" % ',\n                   '.join(cite_strs))
    elif 'citations' in DEBUG:
        log_print("  No citations found!")
    return citations
Ejemplo n.º 11
0
def get_citations_from_tree(complete_html_tree, case_path):
    """Extract citations from a parsed opinion HTML tree.

    Search order: <center> elements (excluding docket-number lines),
    then the <title> element, then stored manual fixes / an interactive
    prompt under the 'input_citations' debug flag.

    :param complete_html_tree: lxml tree of the full document.
    :param case_path: Path of the source file; keys the fixes store.
    :return: A list of citation objects (possibly empty).
    """
    # Skip <center> blocks that merely carry docket numbers.
    path = ('//center[descendant::text()[not('
            'starts-with(normalize-space(.), "No.") or '
            'starts-with(normalize-space(.), "Case No.") or '
            'starts-with(normalize-space(.), "Record No.")'
            ')]]')
    citations = []
    for e in complete_html_tree.xpath(path):
        text = tostring(e, method='text', encoding='unicode')
        citations.extend(get_citations(text, html=False, do_defendant=False))
    if not citations:
        # Nothing in <center>; try the page title instead.
        path = '//title/text()'
        text = complete_html_tree.xpath(path)[0]
        citations = get_citations(text,
                                  html=False,
                                  do_post_citation=False,
                                  do_defendant=False)

    if not citations:
        try:
            # Reuse a previously recorded manual fix if one exists.
            citations = fixes[case_path]['citations']
        except KeyError:
            if 'input_citations' in DEBUG:
                # Show the operator the case, then ask (py2 raw_input).
                subprocess.Popen(
                    ['firefox', 'file://%s' % case_path],
                    shell=False).communicate()
                input_citation = raw_input(
                    '  No citations found. What should be here? ')
                citation_objects = get_citations(input_citation,
                                                 html=False,
                                                 do_post_citation=False,
                                                 do_defendant=False)
                # Persist the answer so we never ask again.
                add_fix(case_path, {'citations': citation_objects})
                citations = citation_objects

    if 'citations' in DEBUG and len(citations):
        cite_strs = [str(cite.__dict__) for cite in citations]
        log_print("  Citations found: %s" %
                  ',\n                   '.join(cite_strs))
    elif 'citations' in DEBUG:
        log_print("  No citations found!")
    return citations
Ejemplo n.º 12
0
def find_tax_court_citation(opinion_text):
    """
    Returns a dictionary representation of our
    Citation object.

    Return the citation object or nothing.
    Iterates over lines of text because we assume our citations won't wrap.

    :param opinion_text: The plain_text of our opinion from the scrape.
    :return: citation object or None
    """
    # Only the first 250 lines are scanned; citations appear early.
    for line_of_text in opinion_text.split("\n")[:250]:
        cites = find_citations.get_citations(line_of_text, html=False)
        if not cites:
            continue

        if "UNITED STATES TAX COURT REPORT" in opinion_text:
            # Long-form T.C. report: take the first cite that matches.
            for cite in cites:
                if "UNITED STATES TAX COURT REPORT" in cite.reporter_found:
                    cite.type = Citation.SPECIALTY
                    return cite
        else:
            for cite in cites:
                if (
                    "T.C." not in cite.reporter
                    and "T. C." not in cite.reporter
                ):
                    # If not the first cite - Skip
                    # NOTE(review): this returns None on the first
                    # non-T.C. cite rather than continuing — confirm
                    # this early bail-out is intended.
                    return None

                if cite.reporter_index > 2:
                    # If reporter not in first or second term in the line we skip.
                    return None

                # Strip the reporter out and inspect what else is on
                # the line; a real citation line has little besides it.
                alt_cite = line_of_text.replace(
                    cite.reporter_found, ""
                ).strip()
                other_words = alt_cite.split(" ")

                if len([x for x in other_words if x != ""]) > 3:
                    # If line has more than three non reporter components skip.
                    return None

                # Classify: T.C. and T.C. No. are specialty reporters,
                # everything else here is treated as a neutral cite.
                if "T.C." == cite.reporter:
                    cite_type = Citation.SPECIALTY
                elif "T.C. No." == cite.reporter:
                    cite_type = Citation.SPECIALTY
                else:
                    cite_type = Citation.NEUTRAL

                cite.type = cite_type
                return cite
Ejemplo n.º 13
0
def find_cites(case_data: Dict[str, str]) -> List[FoundCitation]:
    """Extract citations from raw string.

    :param case_data: Case information from the anon 2020 db.
    :return: Citation objects found in the raw string.
    """
    # The lexis ids field stores each candidate inside double quotes.
    quoted_tokens = re.findall(
        r"\"(.*?)\"", case_data["lexis_ids_normalized"], re.DOTALL
    )
    results = []
    for token in quoted_tokens:
        parsed = get_citations(token)
        if parsed:
            # Keep only the first citation parsed from each token.
            results.append(parsed[0])
    return results
Ejemplo n.º 14
0
    def do_citations(cluster, scdb_info):
        """
        Handle the citation fields.

        :param cluster: The Cluster to be changed.
        :param scdb_info: A dict with the SCDB information.
        """
        # Map SCDB field name -> (reporter string, Citation type).
        fields = {
            "usCite": ("U.S.", Citation.FEDERAL),
            "sctCite": ("S. Ct.", Citation.FEDERAL),
            "ledCite": ("L. Ed.", Citation.FEDERAL),
            "lexisCite": ("U.S. LEXIS", Citation.LEXIS),
        }
        for scdb_field, reporter_info in fields.items():
            if not scdb_info[scdb_field]:
                # Empty SCDB value: nothing to parse for this reporter.
                continue
            try:
                citation_obj = get_citations(
                    scdb_info[scdb_field],
                    html=False,
                    do_post_citation=False,
                    do_defendant=False,
                    disambiguate=False,
                )[0]
            except IndexError:
                # get_citations returned nothing; log and move on.
                logger.warning("Unable to parse citation for: %s",
                               scdb_info[scdb_field])
            else:
                cites = cluster.citations.filter(reporter=reporter_info[0])
                if cites.count() == 1:
                    # Update the existing citation.
                    cite = cites[0]
                    cite.volume = citation_obj.volume
                    cite.reporter = citation_obj.reporter
                    cite.page = citation_obj.page
                    cite.save()
                else:
                    try:
                        # Create a new citation
                        Citation.objects.create(
                            cluster=cluster,
                            volume=citation_obj.volume,
                            reporter=citation_obj.reporter,
                            page=citation_obj.page,
                            type=reporter_info[1],
                        )
                    except IntegrityError:
                        # Violated unique_together constraint. Fine.
                        pass
Ejemplo n.º 15
0
    def do_citations(cluster, scdb_info):
        """
        Handle the citation fields.

        :param cluster: The Cluster to be changed.
        :param scdb_info: A dict with the SCDB information.
        """
        # Map SCDB field name -> (reporter string, Citation type).
        fields = {
            'usCite': ("U.S.", Citation.FEDERAL),
            'sctCite': ("S. Ct.", Citation.FEDERAL),
            'ledCite': ("L. Ed.", Citation.FEDERAL),
            'lexisCite': ("U.S. LEXIS", Citation.LEXIS),
        }
        for scdb_field, reporter_info in fields.items():
            if not scdb_info[scdb_field]:
                # Empty SCDB value: nothing to parse for this reporter.
                continue
            try:
                citation_obj = get_citations(
                    scdb_info[scdb_field],
                    html=False,
                    do_post_citation=False,
                    do_defendant=False,
                    disambiguate=False,
                )[0]
            except IndexError:
                # get_citations returned nothing; log and move on.
                # (logger.warn is a deprecated alias of warning.)
                logger.warning("Unable to parse citation for: %s",
                               scdb_info[scdb_field])
            else:
                # BUG FIX: the old code assigned the QuerySet itself to
                # `cite` and called .save() on it. Take the single row
                # instead, and only update when exactly one exists.
                cites = cluster.citations.filter(reporter=reporter_info[0])
                if cites.count() == 1:
                    # Update the existing citation.
                    cite = cites[0]
                    cite.volume = citation_obj.volume
                    cite.reporter = citation_obj.reporter
                    cite.page = citation_obj.page
                    cite.save()
                else:
                    try:
                        # Create a new citation
                        Citation.objects.create(
                            cluster=cluster,
                            volume=citation_obj.volume,
                            reporter=citation_obj.reporter,
                            page=citation_obj.page,
                            type=reporter_info[1],
                        )
                    except IntegrityError:
                        # Violated unique_together constraint. Fine.
                        pass
Ejemplo n.º 16
0
def get_query_citation(cd):
    """Extract citations from the query string and return them, or return
    None
    """
    query = cd.get('q')
    if not query:
        return None
    citations = get_citations(query, html=False)

    if len(citations) != 1:
        # Not exactly one citation: the user gets no special help.
        return None
    matches = match_citation(citations[0])
    if len(matches) >= 1:
        # Matches are ordered by relevance; take the top one.
        return matches.result.docs[0]
    return matches
Ejemplo n.º 17
0
def get_query_citation(cd):
    """Extract citations from the query string and return them, or return
    None
    """
    if not cd.get('q'):
        return None
    found = get_citations(cd['q'], html=False)

    result = None
    if len(found) == 1:
        # Exactly one citation in the query: try matching it.
        result = match_citation(found[0])
        if len(result) >= 1:
            # Relevance-ordered; just surface the first document.
            result = result.result.docs[0]
    return result
Ejemplo n.º 18
0
def get_query_citation(cd):
    """Extract citations from the query string and return them, or return
    None
    """
    query = cd.get("q")
    if not query:
        return None
    citations = get_citations(
        query, html=False, do_post_citation=False, do_defendant=False
    )

    matches = None
    if len(citations) == 1:
        # Only a single-citation query earns the special tip.
        matches = match_citation(citations[0])
        if len(matches) == 1:
            # With exactly one match, return its document directly.
            return matches.result.docs[0]
    return matches
Ejemplo n.º 19
0
 def test_identifying_parallel_citations(self):
     """Given a string, can we identify parallel citations"""
     tests = (
         # A pair consisting of a test string and the number of parallel
         # citations that should be identifiable in that string.
         # Simple case
         ("1 U.S. 1 (22 U.S. 33)", 1, 2),
         # Too far apart
         ("1 U.S. 1 too many words 22 U.S. 33", 0, 0),
         # Three citations
         ("1 U.S. 1, (44 U.S. 33, 99 U.S. 100)", 1, 3),
         # Parallel citation after a valid citation too early on
         ("1 U.S. 1 too many words, then 22 U.S. 33, 13 WL 33223", 1, 2),
     )
     # NOTE: Python 2 print statements; this file predates a py3 port.
     for q, citation_group_count, expected_num_parallel_citations in tests:
         print "Testing parallel citation identification for: %s..." % q,
         citations = get_citations(q)
         citation_groups = identify_parallel_citations(citations)
         computed_num_citation_groups = len(citation_groups)
         self.assertEqual(
             computed_num_citation_groups,
             citation_group_count,
             msg="Did not have correct number of citation groups. Got %s, "
             "not %s."
             % (computed_num_citation_groups, citation_group_count),
         )
         if not citation_groups:
             # Add an empty list to make testing easier.
             citation_groups = [[]]
         # Only the first group's size is checked against expectations.
         computed_num_parallel_citation = len(list(citation_groups)[0])
         self.assertEqual(
             computed_num_parallel_citation,
             expected_num_parallel_citations,
             msg="Did not identify correct number of parallel citations in "
             "the group. Got %s, not %s"
             % (
                 computed_num_parallel_citation,
                 expected_num_parallel_citations,
             ),
         )
         print "✓"
Ejemplo n.º 20
0
    def handle(self, *args, **options):
        """Parse each cluster's citation-field strings into Citation rows.

        Walks every OpinionCluster (optionally from a starting pk),
        parses each populated citation field, and creates a Citation
        object for it, ignoring duplicates.
        """
        super(Command, self).handle(*args, **options)
        qs = OpinionCluster.objects.all()
        start_at = options['start_at']
        if start_at:
            # Resume from a given primary key.
            qs = qs.filter(pk__gte=start_at)
        for i, cluster in enumerate(queryset_generator(qs)):
            for field in cluster.citation_fields:
                citation_str = getattr(cluster, field)
                if citation_str:
                    # Split the citation and add it to the DB.
                    try:
                        citation_obj = get_citations(
                            citation_str,
                            html=False,
                            do_post_citation=False,
                            do_defendant=False,
                            disambiguate=False,
                        )[0]
                    except IndexError:
                        # Nothing parsed; record the failure and go on.
                        msg = "Errored out on: %s in %s" % (citation_str,
                                                            cluster.pk)
                        print(msg)
                        logger.info(msg)
                        continue
                    try:
                        Citation.objects.create(
                            cluster=cluster,
                            volume=citation_obj.volume,
                            reporter=citation_obj.reporter,
                            page=citation_obj.page,
                            type=map_model_field_to_citation_type(field)
                        )
                    except IntegrityError:
                        # Violated unique_together constraint. Fine.
                        pass

            # Progress report every 1000 clusters.
            if i % 1000 == 0:
                msg = "Completed %s items (last: %s)"
                print(msg % (i, cluster.pk))
                logger.info(msg, i, cluster.pk)
Ejemplo n.º 21
0
def generate_citation(opinion_text, cluster_id):
    """
    Returns a dictionary representation of our
    Citation object.

    This data will only be returned if found, otherwise none is returned and
    no Citation object is added to the system.  It could be a failed parse
    or the data could simply not be available.

    :param opinion_text: The plain_text of our opinion from the scrape.
    :param cluster_id: The id of the associated Opinion_Cluster related
                        to this opinion
    :return: cite_dict => Returns dictionary of the citation data
    """
    # Scan only the first 250 lines; citations appear near the top.
    for line_of_text in opinion_text.split("\n")[:250]:
        cites = find_citations.get_citations(line_of_text, html=False)
        if not cites:
            continue

        for cite in cites:
            # Only Tax Court reporters are of interest here.
            if "T.C." not in cite.reporter and "T. C." not in cite.reporter:
                continue

            # T.C. and T.C. No. are specialty reporters; anything else
            # that reached this point is treated as neutral.
            if "T.C." == cite.reporter:
                cite_type = Citation.SPECIALTY
            elif "T.C. No." == cite.reporter:
                cite_type = Citation.SPECIALTY
            else:
                cite_type = Citation.NEUTRAL

            # Only return the cite if it isn't already recorded for
            # this cluster.
            if not Citation.objects.filter(
                    volume=cite.volume,
                    reporter=cite.reporter,
                    page=cite.page,
                    cluster_id=cluster_id,
            ):
                cite.type = cite_type
                return cite
            else:
                logger.info("Citation already in the system. Return None.")
Ejemplo n.º 22
0
 def test_identifying_parallel_citations(self):
     """Given a string, can we identify parallel citations"""
     tests = (
         # A pair consisting of a test string and the number of parallel
         # citations that should be identifiable in that string.
         # Simple case
         ("1 U.S. 1 (22 U.S. 33)", 1, 2),
         # Too far apart
         ("1 U.S. 1 too many words 22 U.S. 33", 0, 0),
         # Three citations
         ("1 U.S. 1, (44 U.S. 33, 99 U.S. 100)", 1, 3),
         # Parallel citation after a valid citation too early on
         ("1 U.S. 1 too many words, then 22 U.S. 33, 13 WL 33223", 1, 2),
     )
     # NOTE: Python 2 print statements; this file predates a py3 port.
     for q, citation_group_count, expected_num_parallel_citations in tests:
         print "Testing parallel citation identification for: %s..." % q,
         citations = get_citations(q)
         citation_groups = identify_parallel_citations(citations)
         computed_num_citation_groups = len(citation_groups)
         self.assertEqual(
             computed_num_citation_groups,
             citation_group_count,
             msg="Did not have correct number of citation groups. Got %s, "
                 "not %s." % (computed_num_citation_groups,
                              citation_group_count)
         )
         if not citation_groups:
             # Add an empty list to make testing easier.
             citation_groups = [[]]
         # Only the first group's size is compared to the expectation.
         computed_num_parallel_citation = len(list(citation_groups)[0])
         self.assertEqual(
             computed_num_parallel_citation,
             expected_num_parallel_citations,
             msg="Did not identify correct number of parallel citations in "
                 "the group. Got %s, not %s" % (
                     computed_num_parallel_citation,
                     expected_num_parallel_citations,
                 )
         )
         print '✓'
Ejemplo n.º 23
0
def get_query_citation(cd: Dict[str, Any]) -> Optional[List[Citation]]:
    """Extract citations from the query string and return them, or return
    None
    """
    if not cd.get("q"):
        return None
    found = get_citations(
        cd["q"],
        html=False,
        do_post_citation=False,
        do_defendant=False,
    )

    # Keep only full Citation objects.
    found = [item for item in found if isinstance(item, Citation)]

    matches = None
    if len(found) == 1:
        # Only a single-citation query earns the special tip.
        matches = match_citation(found[0])
        if len(matches) == 1:
            # If more than one match, don't show the tip
            return matches.result.docs[0]
    return matches
Ejemplo n.º 24
0
 def test_disambiguate_citations(self):
     """Can ambiguous reporter strings be resolved to the right reporter?"""
     # Each pair is (input text, expected list of Citation objects).
     test_pairs = [
         # 1. P.R.R --> Correct abbreviation for a reporter.
         ('1 P.R.R. 1',
          [Citation(volume=1, reporter='P.R.R.', page=1,
                    canonical_reporter=u'P.R.R.', lookup_index=0,
                    reporter_index=1, reporter_found='P.R.R.')]),
         # 2. U. S. --> A simple variant to resolve.
         ('1 U. S. 1',
          [Citation(volume=1, reporter='U.S.', page=1,
                    canonical_reporter=u'U.S.', lookup_index=0,
                    court='scotus', reporter_index=1,
                    reporter_found='U. S.')]),
         # 3. A.2d --> Not a variant, but needs to be looked up in the
         #    EDITIONS variable.
         ('1 A.2d 1',
          [Citation(volume=1, reporter='A.2d', page=1,
                    canonical_reporter=u'A.', lookup_index=0,
                    reporter_index=1, reporter_found='A.2d')]),
         # 4. A. 2d --> An unambiguous variant of an edition
         ('1 A. 2d 1',
          [Citation(volume=1, reporter='A.2d', page=1,
                    canonical_reporter=u'A.', lookup_index=0,
                    reporter_index=1, reporter_found='A. 2d')]),
         # 5. P.R. --> A variant of 'Pen. & W.', 'P.R.R.', or 'P.' that's
         #    resolvable by year
         ('1 P.R. 1 (1831)',
          # Of the three, only Pen & W. was being published this year.
          [Citation(volume=1, reporter='Pen. & W.', page=1,
                    canonical_reporter=u'Pen. & W.', lookup_index=0,
                    year=1831, reporter_index=1, reporter_found='P.R.')]),
         # 5.1: W.2d --> A variant of an edition that either resolves to
         #      'Wis. 2d' or 'Wash. 2d' and is resolvable by year.
         ('1 W.2d 1 (1854)',
          # Of the two, only Wis. 2d was being published this year.
          [Citation(volume=1, reporter='Wis. 2d', page=1,
                    canonical_reporter=u'Wis.', lookup_index=0,
                    year=1854, reporter_index=1, reporter_found='W.2d')]),
         # 5.2: Wash. --> A non-variant that has more than one reporter for
         #      the key, but is resolvable by year
         ('1 Wash. 1 (1890)',
          [Citation(volume=1, reporter='Wash.', page=1,
                    canonical_reporter=u'Wash.', lookup_index=1, year=1890,
                    reporter_index=1, reporter_found='Wash.')]),
         # 6. Cr. --> A variant of Cranch, which is ambiguous, except with
         #    paired with this variation.
         ('1 Cra. 1',
          [Citation(volume=1, reporter='Cranch', page=1,
                    canonical_reporter=u'Cranch', lookup_index=0,
                    court='scotus', reporter_index=1,
                    reporter_found='Cra.')]),
         # 7. Cranch. --> Not a variant, but could refer to either Cranch's
         #    Supreme Court cases or his DC ones. In this case, we cannot
         #    disambiguate. Years are not known, and we have no further
         #    clues. We must simply drop Cranch from the results.
         ('1 Cranch 1 1 U.S. 23',
          [Citation(volume=1, reporter='U.S.', page=23,
                    canonical_reporter=u'U.S.', lookup_index=0,
                    court='scotus', reporter_index=4,
                    reporter_found='U.S.')]),
         # 8. Unsolved problem. In theory, we could use parallel citations
         #    to resolve this, because Rob is getting cited next to La., but
         #    we don't currently know the proximity of citations to each
         #    other, so can't use this.
         #  - Rob. --> Either:
         #                8.1: A variant of Robards (1862-1865) or
         #                8.2: Robinson's Louisiana Reports (1841-1846) or
         #                8.3: Robinson's Virgina Reports (1842-1865)
         # ('1 Rob. 1 1 La. 1',
         # [Citation(volume=1, reporter='Rob.', page=1,
         #                          canonical_reporter='Rob.',
         #                          lookup_index=0),
         #  Citation(volume=1, reporter='La.', page=1,
         #                          canonical_reporter='La.',
         #                          lookup_index=0)]),
     ]
     # NOTE: Python 2 print statements; this file predates a py3 port.
     for pair in test_pairs:
         print "Testing disambiguation for %s..." % pair[0],
         citations = get_citations(pair[0], html=False)
         self.assertEqual(
             citations, pair[1],
             msg='%s\n%s != \n%s' %
                 (
                     pair[0],
                     [cite.__dict__ for cite in citations],
                     [cite.__dict__ for cite in pair[1]]
                 )
         )
         print "✓"
Ejemplo n.º 25
0
def parse_harvard_opinions(reporter, volume):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :return: None
    """
    # A volume without a reporter is ambiguous; refuse to guess.
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        # Rebuild the IA download URL from the tail of the local file path.
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML.
        # Flag them for a later crawl by using the placeholder '[[Image]]'.
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe the combined list of judges and authors.
        judges = ", ".join(
            set(itertools.chain.from_iterable(judge_list + author_list))
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        # These strings are identical for every <opinion> tag in the case,
        # so compute them once here instead of once per opinion below.
        joined_by_str = titlecase(
            " ".join(set(itertools.chain.from_iterable(judge_list)))
        )
        author_str = titlecase(
            " ".join(set(itertools.chain.from_iterable(author_list)))
        )

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            # Gather the remaining xml fields in the Harvard data set and
            # save each as a pipe-joined string for later processing.
            json_fields = [
                "attorneys",
                "disposition",
                "syllabus",
                "summary",
                "history",
                "otherdate",
                "seealso",
                "headnotes",
                "correction",
            ]
            data_set = {
                key: "|".join(x.text for x in soup.find_all(key))
                for key in json_fields
            }

            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            for op in soup.find_all("opinion"):
                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())
Ejemplo n.º 26
0
def citation_redirector(request, reporter=None, volume=None, page=None):
    """Redirect a citation URL to the canonical page for that citation.

    Reuses the same machinery that identifies citations inside the text of
    opinions.
    """
    if request.method == 'POST':
        form = CitationRedirectorForm(request.POST)
        if not form.is_valid():
            # Something went wrong with the submitted form.
            return render(request, 'citation_redirect_info_page.html', {
                'show_homepage': True,
                'form': form,
                'private': True
            })
        # Bounce to the GET URL built from the cleaned values.
        return HttpResponseRedirect(
            reverse('citation_redirector', kwargs=form.cleaned_data))

    # GET request from here on.
    if reporter is None and volume is None and page is None:
        # No parameters at all: show the standard landing page.
        return render(request, 'citation_redirect_info_page.html', {
            'show_homepage': True,
            'form': CitationRedirectorForm(),
            'private': False,
        })

    # A citation was provided. Look it up; redirect the user or show
    # disambiguation options.
    citation_str = " ".join([volume, reporter, page])
    try:
        citation = get_citations(citation_str)[0]
        # Normalize typos/variations to the canonical form.
        citation_str = citation.base_citation()
        lookup_fields = [map_citations_to_models([citation]).keys()[0]]
    except IndexError:
        # Unable to disambiguate the citation; fall back to searching
        # *every* citation field.
        lookup_fields = OpinionCluster().citation_fields

    # A single federal/state match expands to its sibling fields.
    if (len(lookup_fields) == 1
            and lookup_fields[0] == 'federal_cite_one'):
        lookup_fields = [
            'federal_cite_one', 'federal_cite_two',
            'federal_cite_three'
        ]
    elif (len(lookup_fields) == 1
          and lookup_fields[0] == 'state_cite_one'):
        lookup_fields = [
            'state_cite_one', 'state_cite_two', 'state_cite_three'
        ]

    query = Q()
    for field_name in lookup_fields:
        query |= Q(**{field_name: citation_str})
    clusters = OpinionCluster.objects.filter(query)

    # Render the page that matches the number of hits.
    if clusters.count() == 0:
        # Valid citation, but nothing in the database matches it.
        return render(
            request, 'citation_redirect_info_page.html', {
                'none_found': True,
                'citation_str': citation_str,
                'private': True,
            })

    elif clusters.count() == 1:
        # Exactly one match: send the user straight there.
        return HttpResponseRedirect(clusters[0].get_absolute_url())

    elif clusters.count() > 1:
        # Several matches: let the user pick.
        return render(
            request, 'citation_redirect_info_page.html', {
                'too_many': True,
                'citation_str': citation_str,
                'clusters': clusters,
                'private': True,
            })
Ejemplo n.º 27
0
def make_and_save(item,
                  skipdupes=False,
                  min_dates=None,
                  start_dates=None,
                  testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    :param item: dict of parsed case data (keys used here include 'dates',
        'citations', 'opinions', 'panel', 'court_id', 'docket', names, etc.)
    :param skipdupes: if True, skip duplicates silently instead of raising.
    :param min_dates: optional {court_id: date}; a case whose main date is on
        or after the court's entry is skipped.
    :param start_dates: optional {court_id: date}; a case whose main date is
        on or before the court's entry (its founding) is skipped.
    :param testing: if True, build the objects but do not save anything.
    """
    # Bucket every tagged (tag, date) pair from the parsed data by what the
    # tag means. Each date_info is a (tag_string_or_None, date) tuple.
    date_filed = date_argued = date_reargued = date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # "decided" is a weaker signal; only fill in when no explicit
                # filed date was found.
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued
                 or date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied
                  or date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    # special rule for Kentucky
    if item['court_id'] == 'kycourtapp' and main_date <= date(1975, 12, 31):
        item['court_id'] = 'kycourtapphigh'

    # Honor the optional per-court date windows; outside them we bail early.
    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return
    if start_dates is not None:
        if start_dates.get(item['court_id']) is not None:
            if main_date <= start_dates[item['court_id']]:
                print(main_date, 'before court founding:',
                      start_dates[item['court_id']], ' -- skipping.')
                return

    docket = Docket(source=Docket.COLUMBIA,
                    date_argued=date_argued,
                    date_reargued=date_reargued,
                    date_cert_granted=date_cert_granted,
                    date_cert_denied=date_cert_denied,
                    date_reargument_denied=date_reargument_denied,
                    court_id=item['court_id'],
                    case_name_short=item['case_name_short'] or '',
                    case_name=item['case_name'] or '',
                    case_name_full=item['case_name_full'] or '',
                    docket_number=item['docket'] or '')

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- citation string, we're likely dealing
            # with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for months and a few
            # trivial words) in the citation, then it's not a citation at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            num_letters = sum(
                non_trivial.count(letter) for letter in string.lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation, then
            # it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'." %
                            (c, item['court_id'], item['docket']))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished'
                             if item['unpublished'] else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map)
    # Resolve panel member names to Person objects; unresolved names drop out.
    panel = [
        find_person(n, item['court_id'], case_date=panel_date)
        for n in item['panel']
    ]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'],
                                 item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        # Only one lead opinion per cluster; demote any later ones.
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            # type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [
            find_person(n, item['court_id'], case_date=panel_date)
            for n in opinion_info['joining']
        ]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" %
                  (domain, cluster.get_absolute_url()))
        # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit;
        # it re-raises after cleanup, so this is best-effort rollback only.
        except:
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except:
                pass
            raise
Ejemplo n.º 28
0
def citation_redirector(request, reporter=None, volume=None, page=None):
    """Take a citation URL and use it to redirect the user to the canonical page
    for that citation.

    This uses the same infrastructure as the thing that identifies citations in
    the text of opinions.

    :param request: the HttpRequest being handled.
    :param reporter: reporter abbreviation from the URL, or None.
    :param volume: volume string from the URL, or None.
    :param page: page string from the URL, or None.
    """
    if request.method == 'POST':
        form = CitationRedirectorForm(request.POST)
        if form.is_valid():
            # Redirect to the page with the right values
            cd = form.cleaned_data
            return HttpResponseRedirect(
                reverse('citation_redirector', kwargs=cd)
            )
        else:
            # Error in form, somehow.
            return render_to_response(
                'citation_redirect_info_page.html',
                {'show_homepage': True,
                 'form': form,
                 'private': True},
                RequestContext(request),
            )
    else:
        if all(_ is None for _ in (reporter, volume, page)):
            # Show the most basic page
            form = CitationRedirectorForm()
            return render_to_response(
                'citation_redirect_info_page.html',
                {
                    'show_homepage': True,
                    'form': form,
                    'private': False,
                },
                RequestContext(request),
            )

        else:
            # Look up the citation, redirect the user or show disambiguation.
            citation_str = " ".join([volume, reporter, page])
            try:
                citation = get_citations(citation_str)[0]
                citation_str = citation.base_citation()  # Corrects typos/variations.
                lookup_fields = [map_citations_to_models([citation]).keys()[0]]
            except IndexError:
                # Unable to disambiguate the citation. Try looking in *all*
                # citation fields.
                lookup_fields = OpinionCluster().citation_fields

            # We were able to get a match, expand it if it's a federal/state
            # match.
            if (len(lookup_fields) == 1 and
                    lookup_fields[0] == 'federal_cite_one'):
                lookup_fields = ['federal_cite_one', 'federal_cite_two',
                                 'federal_cite_three']
            elif (len(lookup_fields) == 1 and
                    lookup_fields[0] == 'state_cite_one'):
                lookup_fields = ['state_cite_one', 'state_cite_two',
                                 'state_cite_three']
            # OR together one filter per candidate field.
            q = Q()
            for lookup_field in lookup_fields:
                q |= Q(**{lookup_field: citation_str})
            clusters = OpinionCluster.objects.filter(q)

            # Show the correct page....
            if clusters.count() == 0:
                # No results for an otherwise valid citation.
                return render_to_response(
                    'citation_redirect_info_page.html',
                    {
                        'none_found': True,
                        'citation_str': citation_str,
                        'private': True,
                    },
                    RequestContext(request),
                    status=404,
                )

            elif clusters.count() == 1:
                # Total success. Redirect to correct location.
                return HttpResponseRedirect(
                    clusters[0].get_absolute_url()
                )

            elif clusters.count() > 1:
                # Multiple results. Show them with HTTP 300 (multiple choices).
                return render_to_response(
                    'citation_redirect_info_page.html',
                    {
                        'too_many': True,
                        'citation_str': citation_str,
                        'clusters': clusters,
                        'private': True,
                    },
                    RequestContext(request),
                    status=300,
                )
Ejemplo n.º 29
0
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.

        :param item: dict of scraped metadata for a single opinion.
        :param court: the Court this item belongs to.
        :param sha1_hash: SHA1 hash of the downloaded binary content.
        :param content: raw binary of the opinion document.
        :return: (docket, opinion, cluster, citations, error) tuple; error is
            True when the binary could not be saved to disk.
        """
        blocked = item['blocked_statuses']
        date_blocked = date.today() if blocked else None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )

        # Build a Citation object for each citation string that is present.
        citations = []
        for cite_str, cite_type in ((west_cite_str, Citation.WEST),
                                    (state_cite_str, Citation.STATE),
                                    (neutral_cite_str, Citation.NEUTRAL)):
            if not cite_str:
                continue
            citation_obj = get_citations(cite_str)[0]
            citations.append(
                Citation(
                    cluster=cluster,
                    volume=citation_obj.volume,
                    reporter=citation_obj.reporter,
                    page=citation_obj.page,
                    type=cite_type,
                ))

        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit are
            # not swallowed; any failure to persist the binary is logged and
            # flagged via the error return value.
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error
Ejemplo n.º 30
0
def make_and_save(item, skipdupes=False, min_dates=None, testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    :param item: dict of parsed case data (keys used here include 'dates',
        'citations', 'opinions', 'panel', 'court_id', 'docket', names, etc.)
    :param skipdupes: if True, skip duplicates silently instead of raising.
    :param min_dates: optional {court_id: date}; a case whose main date is on
        or after the court's entry is skipped.
    :param testing: if True, build the objects but do not save anything.
    """
    # Bucket every tagged (tag, date) pair from the parsed data by what the
    # tag means. Each date_info is a (tag_string_or_None, date) tuple.
    date_filed = date_argued = date_reargued = date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # "decided" is a weaker signal; only fill in when no explicit
                # filed date was found.
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued or
                 date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied or
                  date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    # Honor the optional per-court date cutoff; past it we bail early.
    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or ''
    )

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- citation string, we're likely dealing
            # with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for months and a few
            # trivial words) in the citation, then it's not a citation at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            num_letters = sum(non_trivial.count(letter) for letter in string.lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation, then
            # it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'." % (
                                c, item['court_id'], item['docket']
                            ))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished' if item['unpublished'] else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    # Resolve panel member names to Person objects; unresolved names drop out.
    panel = [find_person(n, item['court_id'], case_date=panel_date) for n in
             item['panel']]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        # Only one lead opinion per cluster; demote any later ones.
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            # type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [find_person(n, item['court_id'], case_date=panel_date) for n in opinion_info['joining']]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster, panel, opinions)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" % (domain, cluster.get_absolute_url()))
        # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit;
        # it re-raises after cleanup, so this is best-effort rollback only.
        except:
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except:
                pass
            raise
Ejemplo n.º 31
0
def make_and_save(item):
    """Associate case data from `parse_opinions` with model objects and save them.

    Builds and persists, in order: a Docket, an OpinionCluster (with its
    mapped citations), the panel memberships, and one Opinion per parsed
    opinion along with any joining judges.
    """
    # Sort every tagged date from the parse into its typed slot.
    date_filed = date_argued = date_reargued = None
    date_reargument_denied = date_cert_granted = date_cert_denied = None
    for date_group in item['dates']:
        for tag, date_value in date_group:
            # Discard values that clearly aren't real dates.
            if date_value.year < 1600 or date_value.year > 2020:
                continue
            # An untagged date defaults to the filing date.
            if tag is None:
                date_filed = date_value
                continue
            # Otherwise classify the date by its tag string.
            if tag in FILED_TAGS:
                date_filed = date_value
            elif tag in DECIDED_TAGS:
                # "Decided" only stands in for date_filed when no filing
                # date has been seen yet.
                if not date_filed:
                    date_filed = date_value
            elif tag in ARGUED_TAGS:
                date_argued = date_value
            elif tag in REARGUE_TAGS:
                date_reargued = date_value
            elif tag in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_value
            elif tag in CERT_GRANTED_TAGS:
                date_cert_granted = date_value
            elif tag in CERT_DENIED_TAGS:
                date_cert_denied = date_value
            else:
                print("Found unknown date tag '%s' with date '%s'." % (tag, date_value))

    docket = Docket(
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or '',
    )
    docket.save()

    # Turn each citation string into exactly one Citation object, then map
    # them onto cluster fields, e.g. {'federal_cite_one': '1 U.S. 1', ...}.
    found_citations = []
    for cite_str in item['citations']:
        parsed = get_citations(cite_str)
        if not parsed:
            raise Exception("Failed to get a citation from the string '%s'." % cite_str)
        elif len(parsed) > 1:
            raise Exception("Got multiple citations from string '%s' when there should have been one." % cite_str)
        found_citations.append(parsed[0])
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        docket=docket,
        precedential_status=('Unpublished' if item['unpublished'] else 'Published'),
        date_filed=date_filed,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    cluster.save()

    # Prefer the argument date when resolving judges; fall back to filing.
    paneldate = date_argued if date_argued is not None else date_filed
    panel = [
        judge
        for judge in (find_person(name, item['court_id'], paneldate)
                      for name in item['panel'])
        if judge is not None
    ]
    for member in panel:
        cluster.panel.add(member)

    for opinion_info in item['opinions']:
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'],
                                 date_filed or date_argued)
        opinion = Opinion(
            cluster=cluster,
            author=author,
            type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=opinion_info['opinion'],
        )
        opinion.save()
        joiners = [
            judge
            for judge in (find_person(name, item['court_id'], paneldate)
                          for name in opinion_info['joining'])
            if judge is not None
        ]
        for joiner in joiners:
            opinion.joined_by.add(joiner)
Ejemplo n.º 32
0
 def test_find_citations(self):
     """Can we find and make Citation objects from strings?"""
     # Each pair is (input string, expected list of Citation objects).
     test_pairs = (
         # Basic test
         ('1 U.S. 1', [
             Citation(volume=1,
                      reporter='U.S.',
                      page=1,
                      canonical_reporter=u'U.S.',
                      lookup_index=0,
                      court='scotus',
                      reporter_index=1,
                      reporter_found='U.S.')
         ]),
         # Basic test of non-case name before citation (should not be found)
         ('lissner test 1 U.S. 1', [
             Citation(volume=1,
                      reporter='U.S.',
                      page=1,
                      canonical_reporter=u'U.S.',
                      lookup_index=0,
                      court='scotus',
                      reporter_index=3,
                      reporter_found='U.S.')
         ]),
         # Test with plaintiff and defendant
         ('lissner v. test 1 U.S. 1', [
             Citation(plaintiff='lissner',
                      defendant='test',
                      volume=1,
                      reporter='U.S.',
                      page=1,
                      canonical_reporter=u'U.S.',
                      lookup_index=0,
                      court='scotus',
                      reporter_index=4,
                      reporter_found='U.S.')
         ]),
         # Test with plaintiff, defendant and year
         ('lissner v. test 1 U.S. 1 (1982)', [
             Citation(plaintiff='lissner',
                      defendant='test',
                      volume=1,
                      reporter='U.S.',
                      page=1,
                      year=1982,
                      canonical_reporter=u'U.S.',
                      lookup_index=0,
                      court='scotus',
                      reporter_index=4,
                      reporter_found='U.S.')
         ]),
         # Test with different reporter than all of above.
         ('bob lissner v. test 1 F.2d 1 (1982)', [
             Citation(plaintiff='lissner',
                      defendant='test',
                      volume=1,
                      reporter='F.2d',
                      page=1,
                      year=1982,
                      canonical_reporter=u'F.',
                      lookup_index=0,
                      reporter_index=5,
                      reporter_found='F.2d')
         ]),
         # Test with court and extra information
         ('bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)', [
             Citation(plaintiff='lissner',
                      defendant='test',
                      volume=1,
                      reporter='U.S.',
                      page=12,
                      year=1982,
                      extra=u'347-348',
                      court='ca4',
                      canonical_reporter=u'U.S.',
                      lookup_index=0,
                      reporter_index=5,
                      reporter_found='U.S.')
         ]),
         # Test with text before and after and a variant reporter
         ('asfd 22 U. S. 332 (1975) asdf', [
             Citation(volume=22,
                      reporter='U.S.',
                      page=332,
                      year=1975,
                      canonical_reporter=u'U.S.',
                      lookup_index=0,
                      court='scotus',
                      reporter_index=2,
                      reporter_found='U. S.')
         ]),
         # Test with finding reporter when it's a second edition
         ('asdf 22 A.2d 332 asdf', [
             Citation(volume=22,
                      reporter='A.2d',
                      page=332,
                      canonical_reporter=u'A.',
                      lookup_index=0,
                      reporter_index=2,
                      reporter_found='A.2d')
         ]),
         # Test finding a variant second edition reporter
         ('asdf 22 A. 2d 332 asdf', [
             Citation(volume=22,
                      reporter='A.2d',
                      page=332,
                      canonical_reporter=u'A.',
                      lookup_index=0,
                      reporter_index=2,
                      reporter_found='A. 2d')
         ]),
         # Test finding a variant of an edition resolvable by variant alone.
         ('171 Wn.2d 1016', [
             Citation(volume=171,
                      reporter='Wash. 2d',
                      page=1016,
                      canonical_reporter=u'Wash.',
                      lookup_index=1,
                      reporter_index=1,
                      reporter_found='Wn.2d')
         ]),
         # Test finding two citations where one of them has abutting
         # punctuation.
         ('2 U.S. 3, 4-5 (3 Atl. 33)', [
             Citation(volume=2,
                      reporter="U.S.",
                      page=3,
                      extra=u'4-5',
                      canonical_reporter=u"U.S.",
                      lookup_index=0,
                      reporter_index=1,
                      reporter_found="U.S.",
                      court='scotus'),
             Citation(volume=3,
                      reporter="A.",
                      page=33,
                      canonical_reporter=u"A.",
                      lookup_index=0,
                      reporter_index=5,
                      reporter_found="Atl.")
         ]),
         # Test with the page number as a Roman numeral
         ('12 Neb. App. lxiv (2004)', [
             Citation(volume=12,
                      reporter='Neb. Ct. App.',
                      page='lxiv',
                      year=2004,
                      canonical_reporter=u'Neb. Ct. App.',
                      lookup_index=0,
                      reporter_index=1,
                      reporter_found='Neb. App.')
         ]),
         # Test with the 'digit-REPORTER-digit' corner-case formatting
         ('2007-NMCERT-008', [
             Citation(volume=2007,
                      reporter='NMCERT',
                      page=8,
                      canonical_reporter=u'NMCERT',
                      lookup_index=0,
                      reporter_index=1,
                      reporter_found='NMCERT')
         ]),
         ('2006-Ohio-2095', [
             Citation(volume=2006,
                      reporter='Ohio',
                      page=2095,
                      canonical_reporter=u'Ohio',
                      lookup_index=0,
                      reporter_index=1,
                      reporter_found='Ohio')
         ]),
     )
     for q, a in test_pairs:
         # Python 3 print function (the old py2 print statement was a
         # syntax error here); end=" " keeps the trailing "✓" on one line.
         print("Testing citation extraction for %s..." % q, end=" ")
         cites_found = get_citations(q)
         self.assertEqual(
             cites_found,
             a,
             msg='%s\n%s\n\n    !=\n\n%s' % (
                 q,
                 ",\n".join([str(cite.__dict__) for cite in cites_found]),
                 ",\n".join([str(cite.__dict__) for cite in a]),
             ))
         print("✓")
Ejemplo n.º 33
0
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download",
             file_path.split("/", 9)[-1]])

        if OpinionCluster.objects.filter(
                filepath_json_harvard=file_path).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info("No citation found for %s." %
                        data["citations"][0]["cite"])
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(itertools.chain.from_iterable(judge_list +
                                                      author_list)))))
        judges = titlecase(judges)
        docket_string = (data["docket_number"].replace(
            "Docket No.", "").replace("Docket Nos.", "").strip())

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                # Docket number too long for the column: truncate it and
                # preserve the full value in the correction field.
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number" %
                        trunc(docket_string, length=5000, ellipsis="..."))
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # Clean the author tag for processing: strip page-number
                # markers so the judge name parses cleanly.  This is
                # particularly useful for identifying Per Curiam opinions.
                auth = op.find("author")
                if auth is not None:
                    for page_number in auth.find_all("page-number"):
                        page_number.extract()
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase("".join(
                        find_judge_names(author_tag_str)))
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
Ejemplo n.º 34
0
 def test_disambiguate_citations(self):
     """Can ambiguous reporter strings be resolved to the right reporter?"""
     # Each pair is (input string, expected list of Citation objects).
     test_pairs = [
         # 1. P.R.R --> Correct abbreviation for a reporter.
         ('1 P.R.R. 1', [
             Citation(volume=1,
                      reporter='P.R.R.',
                      page=1,
                      canonical_reporter=u'P.R.R.',
                      lookup_index=0,
                      reporter_index=1,
                      reporter_found='P.R.R.')
         ]),
         # 2. U. S. --> A simple variant to resolve.
         ('1 U. S. 1', [
             Citation(volume=1,
                      reporter='U.S.',
                      page=1,
                      canonical_reporter=u'U.S.',
                      lookup_index=0,
                      court='scotus',
                      reporter_index=1,
                      reporter_found='U. S.')
         ]),
         # 3. A.2d --> Not a variant, but needs to be looked up in the
         #    EDITIONS variable.
         ('1 A.2d 1', [
             Citation(volume=1,
                      reporter='A.2d',
                      page=1,
                      canonical_reporter=u'A.',
                      lookup_index=0,
                      reporter_index=1,
                      reporter_found='A.2d')
         ]),
         # 4. A. 2d --> An unambiguous variant of an edition
         ('1 A. 2d 1', [
             Citation(volume=1,
                      reporter='A.2d',
                      page=1,
                      canonical_reporter=u'A.',
                      lookup_index=0,
                      reporter_index=1,
                      reporter_found='A. 2d')
         ]),
         # 5. P.R. --> A variant of 'Pen. & W.', 'P.R.R.', or 'P.' that's
         #    resolvable by year
         (
             '1 P.R. 1 (1831)',
             # Of the three, only Pen & W. was being published this year.
             [
                 Citation(volume=1,
                          reporter='Pen. & W.',
                          page=1,
                          canonical_reporter=u'Pen. & W.',
                          lookup_index=0,
                          year=1831,
                          reporter_index=1,
                          reporter_found='P.R.')
             ]),
         # 5.1: W.2d --> A variant of an edition that either resolves to
         #      'Wis. 2d' or 'Wash. 2d' and is resolvable by year.
         (
             '1 W.2d 1 (1854)',
             # Of the two, only Wis. 2d was being published this year.
             [
                 Citation(volume=1,
                          reporter='Wis. 2d',
                          page=1,
                          canonical_reporter=u'Wis.',
                          lookup_index=0,
                          year=1854,
                          reporter_index=1,
                          reporter_found='W.2d')
             ]),
         # 5.2: Wash. --> A non-variant that has more than one reporter for
         #      the key, but is resolvable by year
         ('1 Wash. 1 (1890)', [
             Citation(volume=1,
                      reporter='Wash.',
                      page=1,
                      canonical_reporter=u'Wash.',
                      lookup_index=1,
                      year=1890,
                      reporter_index=1,
                      reporter_found='Wash.')
         ]),
         # 6. Cr. --> A variant of Cranch, which is ambiguous, except with
         #    paired with this variation.
         ('1 Cra. 1', [
             Citation(volume=1,
                      reporter='Cranch',
                      page=1,
                      canonical_reporter=u'Cranch',
                      lookup_index=0,
                      court='scotus',
                      reporter_index=1,
                      reporter_found='Cra.')
         ]),
         # 7. Cranch. --> Not a variant, but could refer to either Cranch's
         #    Supreme Court cases or his DC ones. In this case, we cannot
         #    disambiguate. Years are not known, and we have no further
         #    clues. We must simply drop Cranch from the results.
         ('1 Cranch 1 1 U.S. 23', [
             Citation(volume=1,
                      reporter='U.S.',
                      page=23,
                      canonical_reporter=u'U.S.',
                      lookup_index=0,
                      court='scotus',
                      reporter_index=4,
                      reporter_found='U.S.')
         ]),
         # 8. Unsolved problem. In theory, we could use parallel citations
         #    to resolve this, because Rob is getting cited next to La., but
         #    we don't currently know the proximity of citations to each
         #    other, so can't use this.
         #  - Rob. --> Either:
         #                8.1: A variant of Robards (1862-1865) or
         #                8.2: Robinson's Louisiana Reports (1841-1846) or
         #                8.3: Robinson's Virgina Reports (1842-1865)
         # ('1 Rob. 1 1 La. 1',
         # [Citation(volume=1, reporter='Rob.', page=1,
         #                          canonical_reporter='Rob.',
         #                          lookup_index=0),
         #  Citation(volume=1, reporter='La.', page=1,
         #                          canonical_reporter='La.',
         #                          lookup_index=0)]),
     ]
     for pair in test_pairs:
         # Python 3 print function (the old py2 print statement was a
         # syntax error here); end=" " keeps the trailing "✓" on one line.
         print("Testing disambiguation for %s..." % pair[0], end=" ")
         citations = get_citations(pair[0], html=False)
         self.assertEqual(citations,
                          pair[1],
                          msg='%s\n%s != \n%s' %
                          (pair[0], [cite.__dict__ for cite in citations
                                     ], [cite.__dict__
                                         for cite in pair[1]]))
         print("✓")
Ejemplo n.º 35
0
 def test_find_citations(self):
     """Can we find and make Citation objects from strings?"""
     # Each pair is (input string, expected list of Citation objects).
     test_pairs = (
         # Basic test
         ('1 U.S. 1',
          [Citation(volume=1, reporter='U.S.', page=1,
                    canonical_reporter=u'U.S.', lookup_index=0,
                    court='scotus', reporter_index=1,
                    reporter_found='U.S.')]),
         # Basic test of non-case name before citation (should not be found)
         ('lissner test 1 U.S. 1',
          [Citation(volume=1, reporter='U.S.', page=1,
                    canonical_reporter=u'U.S.', lookup_index=0,
                    court='scotus', reporter_index=3,
                    reporter_found='U.S.')]),
         # Test with plaintiff and defendant
         ('lissner v. test 1 U.S. 1',
          [Citation(plaintiff='lissner', defendant='test', volume=1,
                    reporter='U.S.', page=1, canonical_reporter=u'U.S.',
                    lookup_index=0, court='scotus', reporter_index=4,
                    reporter_found='U.S.')]),
         # Test with plaintiff, defendant and year
         ('lissner v. test 1 U.S. 1 (1982)',
          [Citation(plaintiff='lissner', defendant='test', volume=1,
                    reporter='U.S.', page=1, year=1982,
                    canonical_reporter=u'U.S.', lookup_index=0,
                    court='scotus', reporter_index=4,
                    reporter_found='U.S.')]),
         # Test with different reporter than all of above.
         ('bob lissner v. test 1 F.2d 1 (1982)',
          [Citation(plaintiff='lissner', defendant='test', volume=1,
                    reporter='F.2d', page=1, year=1982,
                    canonical_reporter=u'F.', lookup_index=0,
                    reporter_index=5, reporter_found='F.2d')]),
         # Test with court and extra information
         ('bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)',
          [Citation(plaintiff='lissner', defendant='test', volume=1,
                    reporter='U.S.', page=12, year=1982, extra=u'347-348',
                    court='ca4', canonical_reporter=u'U.S.', lookup_index=0,
                    reporter_index=5, reporter_found='U.S.')]),
         # Test with text before and after and a variant reporter
         ('asfd 22 U. S. 332 (1975) asdf',
          [Citation(volume=22, reporter='U.S.', page=332, year=1975,
                    canonical_reporter=u'U.S.', lookup_index=0,
                    court='scotus', reporter_index=2,
                    reporter_found='U. S.')]),
         # Test with finding reporter when it's a second edition
         ('asdf 22 A.2d 332 asdf',
          [Citation(volume=22, reporter='A.2d', page=332,
                    canonical_reporter=u'A.', lookup_index=0,
                    reporter_index=2, reporter_found='A.2d')]),
         # Test finding a variant second edition reporter
         ('asdf 22 A. 2d 332 asdf',
          [Citation(volume=22, reporter='A.2d', page=332,
                    canonical_reporter=u'A.', lookup_index=0,
                    reporter_index=2, reporter_found='A. 2d')]),
         # Test finding a variant of an edition resolvable by variant alone.
         ('171 Wn.2d 1016',
          [Citation(volume=171, reporter='Wash. 2d', page=1016,
                    canonical_reporter=u'Wash.', lookup_index=1,
                    reporter_index=1, reporter_found='Wn.2d')]),
         # Test finding two citations where one of them has abutting
         # punctuation.
         ('2 U.S. 3, 4-5 (3 Atl. 33)',
          [Citation(volume=2, reporter="U.S.", page=3, extra=u'4-5',
                    canonical_reporter=u"U.S.", lookup_index=0,
                    reporter_index=1, reporter_found="U.S.", court='scotus'),
           Citation(volume=3, reporter="A.", page=33,
                    canonical_reporter=u"A.", lookup_index=0,
                    reporter_index=5, reporter_found="Atl.")]),
         # Test with the page number as a Roman numeral
         ('12 Neb. App. lxiv (2004)',
          [Citation(volume=12, reporter='Neb. Ct. App.', page='lxiv', year=2004,
                    canonical_reporter=u'Neb. Ct. App.', lookup_index=0,
                    reporter_index=1, reporter_found='Neb. App.')]),
         # Test with the 'digit-REPORTER-digit' corner-case formatting
         ('2007-NMCERT-008',
          [Citation(volume=2007, reporter='NMCERT', page=8,
                    canonical_reporter=u'NMCERT', lookup_index=0,
                    reporter_index=1, reporter_found='NMCERT')]),
         ('2006-Ohio-2095',
          [Citation(volume=2006, reporter='Ohio', page=2095,
                    canonical_reporter=u'Ohio', lookup_index=0,
                    reporter_index=1, reporter_found='Ohio')]),
         ('2017 IL App (4th) 160407WC',
          [Citation(volume=2017, reporter='IL App (4th)', page='160407WC',
                    canonical_reporter=u'IL App (4th)', lookup_index=0,
                    reporter_index=1, reporter_found='IL App (4th)')]),
         ('2017 IL App (1st) 143684-B',
          [Citation(volume=2017, reporter='IL App (1st)', page='143684-B',
                    canonical_reporter=u'IL App (1st)', lookup_index=0,
                    reporter_index=1, reporter_found='IL App (1st)')])
     )
     for q, a in test_pairs:
         # Python 3 print function (the old py2 print statement was a
         # syntax error here); end=" " keeps the trailing "✓" on one line.
         print("Testing citation extraction for %s..." % q, end=" ")
         cites_found = get_citations(q)
         self.assertEqual(
             cites_found,
             a,
             msg='%s\n%s\n\n    !=\n\n%s' % (
                 q,
                 ",\n".join([str(cite.__dict__) for cite in cites_found]),
                 ",\n".join([str(cite.__dict__) for cite in a]),
             )
         )
         print("✓")