Ejemplo n.º 1
0
    def test_make_html(self):
        """Can we make basic HTML conversions properly?"""
        good_html = (
            '<pre class="inline">asdf </pre><span class="citation '
            'no-link"><span class="volume">22</span> <span '
            'class="reporter">U.S.</span> <span class="page">33</span>'
            '</span><pre class="inline"> asdf</pre>')

        # Both the plain reporter and its spaced variant "U. S."
        # (Issue #409) must render to the same annotated HTML.
        variants = (
            'asdf 22 U.S. 33 asdf',
            'asdf 22 U. S. 33 asdf',
        )
        for plain_text in variants:
            opinion = Opinion(plain_text=plain_text)
            found_citations = get_citations(plain_text)
            rendered = create_cited_html(opinion, found_citations)
            self.assertEqual(
                good_html,
                rendered,
            )
Ejemplo n.º 2
0
class StaticFilesTest(TestCase):
    """Check that static case files are served with the correct status
    code, MIME type, and Content-Disposition headers."""

    good_mp3_path = 'mp3/2014/06/09/ander_v._leo.mp3'
    good_txt_path = 'txt/2015/12/28/opinion_text.txt'
    good_pdf_path = 'pdf/2013/06/12/' + \
                    'in_re_motion_for_consent_to_disclosure_of_court_records.pdf'

    def setUp(self):
        """Create a docket plus audio and opinion records that point at
        the fixture files named by the class-level path constants."""
        self.court = Court.objects.get(pk='test')
        self.docket = Docket(case_name=u'Docket',
                             court=self.court,
                             source=Docket.DEFAULT)
        self.docket.save()

        self.audio = Audio(local_path_original_file=self.good_mp3_path,
                           local_path_mp3=self.good_mp3_path,
                           docket=self.docket,
                           blocked=False,
                           case_name_full='Ander v. Leo',
                           date_created=datetime.date(2014, 6, 9))
        self.audio.save(index=False)

        self.opinioncluster = OpinionCluster(
            case_name=u'Hotline Bling',
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)

        self.txtopinion = Opinion(cluster=self.opinioncluster,
                                  type='Lead Opinion',
                                  local_path=self.good_txt_path)
        self.txtopinion.save(index=False)

        self.pdfopinion = Opinion(cluster=self.opinioncluster,
                                  type='Lead Opinion',
                                  local_path=self.good_pdf_path)
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        # (Removed an unused ``file_path = self.audio.local_path_mp3``
        # local that was never passed to the view.)
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'audio/mpeg')
        self.assertIn('inline;', response['Content-Disposition'])

    def test_serve_static_file_serves_txt(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'text/plain')
        self.assertIn('inline;', response['Content-Disposition'])
        # Text files are served with their content inline; make sure the
        # fixture's body actually comes through.
        self.assertIn('FOR THE DISTRICT OF COLUMBIA CIRCUIT', response.content)

    def test_serve_static_file_serves_pdf(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/pdf')
        self.assertIn('inline;', response['Content-Disposition'])
Ejemplo n.º 3
0
    def setUp(self):
        """Create the docket, audio, and opinion fixtures used by the tests."""
        self.court = Court.objects.get(pk='test')
        self.docket = Docket(case_name=u'Docket', court=self.court,
                             source=Docket.DEFAULT)
        self.docket.save()

        self.audio = Audio(local_path_original_file=self.good_mp3_path,
                           local_path_mp3=self.good_mp3_path,
                           docket=self.docket,
                           blocked=False,
                           case_name_full='Ander v. Leo',
                           date_created=datetime.date(2014, 6, 9))
        self.audio.save(index=False)

        self.opinioncluster = OpinionCluster(case_name=u'Hotline Bling',
                                             docket=self.docket,
                                             date_filed=datetime.date(2015, 12, 14))
        self.opinioncluster.save(index=False)

        self.txtopinion = Opinion(cluster=self.opinioncluster,
                                  type='Lead Opinion',
                                  local_path=self.good_txt_path)
        self.txtopinion.save(index=False)

        self.pdfopinion = Opinion(cluster=self.opinioncluster,
                                  type='Lead Opinion',
                                  local_path=self.good_pdf_path)
        self.pdfopinion.save(index=False)
Ejemplo n.º 4
0
    def setUp(self):
        """Create the docket, audio, and opinion fixtures used by the tests."""
        self.court = Court.objects.get(pk='test')
        self.docket = Docket(case_name=u'Docket', court=self.court, source=Docket.DEFAULT)
        self.docket.save()

        self.audio = Audio(
            docket=self.docket,
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            blocked=False,
            case_name_full='Ander v. Leo',
            date_created=datetime.date(2014, 6, 9),
        )
        self.audio.save(index=False)

        self.opinioncluster = OpinionCluster(
            docket=self.docket,
            case_name=u'Hotline Bling',
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)

        # One text-backed and one PDF-backed opinion on the same cluster.
        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_txt_path,
        )
        self.txtopinion.save(index=False)

        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_pdf_path,
        )
        self.pdfopinion.save(index=False)
    def add_oc_and_o(self, old_document, old_citation, old_docket, new_docket):
        """Add the OpinionCluster and Opinion, updating existing items if
        present.

        Reusing ``old_document.pk`` as the primary key of both new rows
        makes this idempotent: re-running the migration overwrites rather
        than duplicates.

        :param old_document: The legacy document row being migrated.
        :param old_citation: The legacy citation row tied to the document.
        :param old_docket: The legacy docket (source of case-name fields).
        :param new_docket: The already-migrated Docket to attach to.
        """
        new_opinion_cluster = OpinionClusterNew(
            pk=old_document.pk,
            docket=new_docket,
            judges=self._none_to_blank(old_document.judges),
            date_modified=old_document.date_modified,
            # NOTE(review): date_created is copied from date_modified —
            # presumably the old model has no creation timestamp; confirm.
            date_created=old_document.date_modified,
            date_filed=old_document.date_filed,
            slug=self._none_to_blank(old_citation.slug),
            citation_id=old_document.citation_id,
            case_name_short=old_docket.case_name_short,
            case_name=old_docket.case_name,
            case_name_full=old_docket.case_name_full,
            federal_cite_one=self._none_to_blank(old_citation.federal_cite_one),
            federal_cite_two=self._none_to_blank(old_citation.federal_cite_two),
            federal_cite_three=self._none_to_blank(old_citation.federal_cite_three),
            state_cite_one=self._none_to_blank(old_citation.state_cite_one),
            state_cite_two=self._none_to_blank(old_citation.state_cite_two),
            state_cite_three=self._none_to_blank(old_citation.state_cite_three),
            state_cite_regional=self._none_to_blank(old_citation.state_cite_regional),
            specialty_cite_one=self._none_to_blank(old_citation.specialty_cite_one),
            scotus_early_cite=self._none_to_blank(old_citation.scotus_early_cite),
            lexis_cite=self._none_to_blank(old_citation.lexis_cite),
            westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
            neutral_cite=self._none_to_blank(old_citation.neutral_cite),
            scdb_id=self._none_to_blank(old_document.supreme_court_db_id),
            source=old_document.source,
            nature_of_suit=old_document.nature_of_suit,
            citation_count=old_document.citation_count,
            precedential_status=old_document.precedential_status,
            date_blocked=old_document.date_blocked,
            blocked=old_document.blocked,
        )
        # index=False: skip search-index updates during bulk migration.
        new_opinion_cluster.save(
            using='default',
            index=False,
        )

        new_opinion = OpinionNew(
            pk=old_document.pk,
            cluster=new_opinion_cluster,
            date_modified=old_document.date_modified,
            date_created=old_document.time_retrieved,
            # All legacy opinions are treated as combined opinions.
            type='010combined',
            sha1=old_document.sha1,
            download_url=old_document.download_url,
            local_path=old_document.local_path,
            plain_text=old_document.plain_text,
            html=self._none_to_blank(old_document.html),
            html_lawbox=self._none_to_blank(old_document.html_lawbox),
            html_with_citations=old_document.html_with_citations,
            extracted_by_ocr=old_document.extracted_by_ocr,
        )
        new_opinion.save(
            using='default',
            index=False,
        )
Ejemplo n.º 6
0
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.

        :param item: Dict of scraped metadata for one case.
        :param court: The Court the scraped item belongs to.
        :param sha1_hash: Hash of the downloaded binary.
        :param content: Raw bytes of the downloaded document.
        :return: Tuple of (docket, opinion, cluster, error); ``error`` is
            True when the binary could not be written to disk.
        """
        blocked = item['blocked_statuses']
        # NOTE(review): any non-None value (even False) stamps a block date;
        # confirm scrapers only set this key when the item is truly blocked.
        if blocked is not None:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            federal_cite_one=item.get('west_citations', ''),
            state_cite_one=item.get('west_state_citations', ''),
            neutral_cite=item.get('neutral_citations', ''),
        )
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except Exception:
            # Was a bare ``except:``, which also swallowed SystemExit and
            # KeyboardInterrupt; narrowed to Exception.
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, error
Ejemplo n.º 7
0
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.

        :param item: Dict of scraped metadata for one case.
        :param court: The Court the scraped item belongs to.
        :param sha1_hash: Hash of the downloaded binary.
        :param content: Raw bytes of the downloaded document.
        :return: Tuple of (docket, opinion, cluster, error); ``error`` is
            True when the binary could not be written to disk.
        """
        blocked = item['blocked_statuses']
        # NOTE(review): any non-None value (even False) stamps a block date;
        # confirm scrapers only set this key when the item is truly blocked.
        if blocked is not None:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            federal_cite_one=item.get('west_citations', ''),
            state_cite_one=item.get('west_state_citations', ''),
            neutral_cite=item.get('neutral_citations', ''),
        )
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except Exception:
            # Was a bare ``except:``, which also swallowed SystemExit and
            # KeyboardInterrupt; narrowed to Exception.
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, error
Ejemplo n.º 8
0
    def test_make_html_from_matched_citation_objects(self) -> None:
        """Can we render matched citation objects as HTML?"""
        # This test case is similar to the two above, except it allows us to
        # test the rendering of citation objects that we assert are correctly
        # matched. (No matching is performed in the previous cases.)
        # fmt: off

        test_pairs = [
            # Id. citation with page number ("Id., at 123, 124")
            ('asdf, Id., at 123, 124. Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf, </pre><span class="citation" data-id="'
             'MATCH_ID"><a href="MATCH_URL">Id., at 123, 124</a></span><pre '
             'class="inline">. Lorem ipsum dolor sit amet</pre>'),

            # Id. citation with complex page number ("Id. @ 123:1, ¶¶ 124")
            ('asdf, Id. @ 123:1, ¶¶ 124. Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf, </pre><span class="citation" data-id='
             '"MATCH_ID"><a href="MATCH_URL">Id.</a></span><pre class='
             '"inline"> @ 123:1, ¶¶ 124. Lorem ipsum dolor sit amet</pre>'),

            # Id. citation without page number ("Id. Something else")
            ('asdf, Id. Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf, </pre><span class="citation" data-id="'
             'MATCH_ID"><a href="MATCH_URL">Id.</a></span><pre class="inline">'
             ' Lorem ipsum dolor sit amet</pre>'),
        ]

        # fmt: on
        for s, expected_html in test_pairs:
            with self.subTest(
                    f"Testing object to HTML rendering for {s}...",
                    s=s,
                    expected_html=expected_html,
            ):
                opinion = Opinion(plain_text=s)
                get_and_clean_opinion_text(opinion)
                citations = get_citations(opinion.cleaned_text)

                # Stub out fake output from do_resolve_citations(), since the
                # purpose of this test is not to test that. We just need
                # something that looks like what create_cited_html() expects
                # to receive. Also make sure that the "matched" opinion is
                # mocked appropriately.
                opinion.pk = "MATCH_ID"
                # NOTE(review): Mock(OpinionCluster(id=24601)) passes the
                # instance as the Mock's spec positional argument — confirm
                # this is intended rather than Mock(spec=OpinionCluster).
                opinion.cluster = Mock(OpinionCluster(id=24601))
                opinion.cluster.get_absolute_url.return_value = "MATCH_URL"
                citation_resolutions = {opinion: citations}

                created_html = create_cited_html(opinion, citation_resolutions)

                self.assertEqual(
                    created_html,
                    expected_html,
                    msg=f"\n{created_html}\n\n    !=\n\n{expected_html}",
                )
Ejemplo n.º 9
0
    def setUp(self) -> None:
        """Create a cluster with two linked opinions, then scrape audio."""
        docket = Docket(
            case_name="foo",
            court=Court.objects.get(pk="test"),
            source=Docket.DEFAULT,
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        filed = now().date() - timedelta(days=400)
        self.doc_cluster = OpinionCluster(
            case_name="foo", docket=docket, date_filed=filed
        )
        self.doc_cluster.save(index=False)

        lead = Opinion(cluster=self.doc_cluster, type="Lead Opinion")
        lead.save(index=False)
        concurrence = Opinion(cluster=self.doc_cluster, type="Concurrence")
        concurrence.save(index=False)
        OpinionsCited.objects.create(
            citing_opinion=concurrence, cited_opinion=lead
        )

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)
Ejemplo n.º 10
0
 def setUp(self):
     """Build unsaved docket, cluster, and opinion fixtures."""
     self.court = Court.objects.get(pk="test")
     self.docket = Docket(case_name=u"Docket", court=self.court)
     self.opinioncluster = OpinionCluster(case_name=u"Hotline Bling",
                                          docket=self.docket,
                                          date_filed=datetime.date(2015, 12, 14))
     self.opinion = Opinion(cluster=self.opinioncluster, type="Lead Opinion")
Ejemplo n.º 11
0
    def test_make_html_from_matched_citation_objects(self) -> None:
        """Can we render matched citation objects as HTML?"""
        # This test case is similar to the two above, except it allows us to
        # test the rendering of citation objects that we assert are correctly
        # matched. (No matching is performed in the previous cases.)
        # fmt: off

        test_triples = [
            # Id. citation with page number ("Id., at 123, 124")
            ('asdf, Id., at 123, 124. Lorem ipsum dolor sit amet',
             IdCitation(id_token='Id.,',
                        after_tokens=['at', '123', '124'],
                        has_page=True),
             '<pre class="inline">asdf</pre><span class="citation" data-id="'
             'MATCH_ID">, <a href="MATCH_URL"><span class="id_token">Id.,'
             '</span> at 123, 124</a></span><pre class="inline">. Lorem ipsum'
             ' dolor sit amet</pre>'),

            # Id. citation with complex page number ("Id. @ 123:1, ¶¶ 124")
            ('asdf, Id. @ 123:1, ¶¶ 124. Lorem ipsum dolor sit amet',
             IdCitation(id_token='Id.',
                        after_tokens=['@', '123:1', '¶¶', '124'],
                        has_page=True),
             '<pre class="inline">asdf</pre><span class="citation" data-id="'
             'MATCH_ID">, <a href="MATCH_URL"><span class="id_token">Id.'
             '</span> @ 123:1, ¶¶ 124</a></span><pre class="inline">. Lorem '
             'ipsum dolor sit amet</pre>'),

            # Id. citation without page number ("Id. Something else")
            ('asdf, Id. Lorem ipsum dolor sit amet',
             IdCitation(id_token='Id.',
                        after_tokens=['Lorem', 'ipsum'],
                        has_page=False),
             '<pre class="inline">asdf</pre><span class="citation" data-id='
             '"MATCH_ID">, <a href="MATCH_URL"><span class="id_token">Id.'
             '</span></a> Lorem ipsum </span><pre class="inline">dolor sit '
             'amet</pre>'),
        ]

        # fmt: on
        for plain_text, citation, expected_html in test_triples:
            print(
                "Testing object to HTML rendering for %s..." % plain_text,
                end=" ",
            )
            # Stub the "matched" attributes so the renderer emits stable
            # MATCH_URL/MATCH_ID placeholders instead of real lookups.
            citation.match_url = "MATCH_URL"
            citation.match_id = "MATCH_ID"
            opinion = Opinion(plain_text=plain_text)
            created_html = create_cited_html(opinion, [citation])
            self.assertEqual(
                created_html,
                expected_html,
                msg="\n%s\n\n    !=\n\n%s" % (created_html, expected_html),
            )
            print("✓")
Ejemplo n.º 12
0
    def test_make_html_from_html(self) -> None:
        """Can we convert the HTML of an opinion into modified HTML?"""
        # fmt: off

        test_pairs = [
            # Id. citation with HTML tags
            ('<div><p>the improper views of the Legislature.\" 2 <i>id., at '
             '73.</i></p>\n<p>Nathaniel Gorham of Massachusetts</p></div>',
             '<div><p>the improper views of the Legislature." 2 <i><span '
             'class="citation no-link">id., at 73</span>.</i></p>\n<p>'
             'Nathaniel Gorham of Massachusetts</p></div>'),

            # Id. citation with an intervening HTML tag
            #  (We expect the HTML to be unchanged, since it's too risky to
            #   modify with another tag in the way)
            ('<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> '
             'at <b>73, bolded</b>.</p>\n<p>Nathaniel Gorham of Massachusetts'
             '</p></div>',
             '<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> '
             'at <b>73, bolded</b>.</p>\n<p>Nathaniel Gorham of Massachusetts'
             '</p></div>'),

            # Ibid. citation with HTML tags
            ('<div><p>possess any peculiar knowledge of the mere policy of '
             'public measures.\" <i>Ibid.</i> Gerry of Massachusetts '
             'like</p></div>',
             '<div><p>possess any peculiar knowledge of the mere policy of '
             'public measures." <i><span class="citation no-link">Ibid.'
             '</span></i> Gerry of Massachusetts like</p></div>'
            ),
        ]

        # fmt: on
        for s, expected_html in test_pairs:
            with self.subTest(
                "Testing html to html conversion for %s..." % s,
                s=s,
                expected_html=expected_html,
            ):
                opinion = Opinion(html=s)
                get_and_clean_opinion_text(opinion)
                citations = get_citations(opinion.cleaned_text)

                # Stub out fake output from do_resolve_citations(), since the
                # purpose of this test is not to test that. We just need
                # something that looks like what create_cited_html() expects
                # to receive.
                # NOTE(review): NO_MATCH_RESOURCE presumably marks citations
                # with no matched opinion (rendered as "no-link") — confirm.
                citation_resolutions = {NO_MATCH_RESOURCE: citations}

                created_html = create_cited_html(opinion, citation_resolutions)
                self.assertEqual(
                    created_html,
                    expected_html,
                    msg="\n%s\n\n    !=\n\n%s" % (created_html, expected_html),
                )
Ejemplo n.º 13
0
    def test_save_old_opinion(self):
        """Can we save opinions older than 1900?"""
        docket = Docket(case_name=u"Blah", court_id='test',
                        source=Docket.DEFAULT)
        docket.save()
        oc = OpinionCluster(
            case_name=u"Blah",
            docket=docket,
            date_filed=datetime.date(1899, 1, 1),
        )
        oc.save()
        o = Opinion(cluster=oc, type='Lead Opinion')

        try:
            cf = ContentFile(StringIO.StringIO('blah').read())
            o.file_with_date = datetime.date(1899, 1, 1)
            o.local_path.save('file_name.pdf', cf, save=False)
            o.save(index=False)
        except ValueError as e:
            # Include the original error in the message: previously the
            # caught exception was discarded, hiding the actual cause.
            raise ValueError("Unable to save a case older than 1900. Did you "
                             "try to use `strftime`...again? (%s)" % e)
Ejemplo n.º 14
0
 def setUp(self):
     """Build unsaved docket, cluster, and opinion fixtures."""
     self.court = Court.objects.get(pk='test')
     self.docket = Docket(case_name=u'Docket', court=self.court)
     self.opinioncluster = OpinionCluster(case_name=u'Hotline Bling',
                                          docket=self.docket,
                                          date_filed=datetime.date(2015, 12, 14))
     self.opinion = Opinion(cluster=self.opinioncluster, type='Lead Opinion')
Ejemplo n.º 15
0
    def setUp(self):
        """Create a cluster with two linked opinions, then scrape audio."""
        docket = Docket(case_name=u'foo',
                        court=Court.objects.get(pk='test'),
                        source=Docket.DEFAULT)
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        filed = now().date() - timedelta(days=400)
        self.doc_cluster = OpinionCluster(case_name=u"foo",
                                          docket=docket,
                                          date_filed=filed)
        self.doc_cluster.save(index=False)

        lead = Opinion(cluster=self.doc_cluster, type='Lead Opinion')
        lead.save(index=False)
        concurrence = Opinion(cluster=self.doc_cluster, type='Concurrence')
        concurrence.save(index=False)
        OpinionsCited.objects.create(citing_opinion=concurrence,
                                     cited_opinion=lead)

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)
Ejemplo n.º 16
0
    def test_make_html_from_html(self) -> None:
        """Can we convert the HTML of an opinion into modified HTML?"""
        # fmt: off

        test_pairs = [
            # Id. citation with HTML tags
            ('<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> '
             'at 73.</p>\n<p>Nathaniel Gorham of Massachusetts</p></div>',
             '<div><p>the improper views of the Legislature." 2<span class="'
             'citation no-link"> <i><span class="id_token">id.,</span></i> at '
             '73.</span></p>\n<p>Nathaniel Gorham of Massachusetts</p></div>'),

            # Id. citation with an intervening HTML tag
            #  (We expect the HTML to be unchanged, since it's too risky to
            #   modify with another tag in the way)
            ('<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> '
             'at <b>73, bolded</b>.</p>\n<p>Nathaniel Gorham of Massachusetts'
             '</p></div>',
             '<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> '
             'at <b>73, bolded</b>.</p>\n<p>Nathaniel Gorham of Massachusetts'
             '</p></div>'),

            # Ibid. citation with HTML tags
            ('<div><p>possess any peculiar knowledge of the mere policy of '
             'public measures.\" <i>Ibid.</i> Gerry of Massachusetts '
             'like</p></div>',
             '<div><p>possess any peculiar knowledge of the mere policy of '
             'public measures."<span class="citation no-link"> <i><span class='
             '"id_token">Ibid.</span></i> Gerry of Massachusetts </span>like'
             '</p></div>'),
        ]

        # fmt: on
        for s, expected_html in test_pairs:
            print("Testing html to html conversion for %s..." % s, end=" ")
            opinion = Opinion(html=s)
            # The `clean` steps normalize the HTML and whitespace before
            # citation extraction so offsets line up with the source.
            citations = get_citations(s, clean=("html", "whitespace"))
            created_html = create_cited_html(opinion, citations)
            self.assertEqual(
                created_html,
                expected_html,
                msg="\n%s\n\n    !=\n\n%s" % (created_html, expected_html),
            )
            print("✓")
Ejemplo n.º 17
0
def extract_from_pdf(
    path: str,
    opinion: Opinion,
    ocr_available: bool = False,
) -> ExtractProcessResult:
    """Extract text from pdfs.

    Start with pdftotext. If we enabled OCR - and the content is empty
    or the PDF contains images, use tesseract. This pattern occurs because PDFs
    can be images, text-based and a mix of the two. We check for images to
    make sure we do OCR on mix-type PDFs.

    If a text-based PDF we fix corrupt PDFs from ca9.

    :param path: The path to the PDF
    :param opinion: The Opinion associated with the PDF
    :param ocr_available: Whether we should do OCR stuff
    :return Tuple of the content itself and any errors we received
    """
    process = make_pdftotext_process(path)
    content, err = process.communicate()
    content = content.decode()
    if err is not None:
        err = err.decode()

    if not ocr_available:
        if "e" not in content:
            # It's a corrupt PDF from ca9. Fix it.
            content = fix_mojibake(content)
    else:
        if ocr_needed(path, content):
            success, ocr_content = extract_by_ocr(path)
            if success:
                # Check content length and take the longer of the two
                if len(ocr_content) > len(content):
                    content = ocr_content
                    opinion.extracted_by_ocr = True
            else:
                # OCR failed; fall back to a placeholder. (The previous
                # condition `content == "" or not success` was redundant —
                # `not success` is always true on this branch.)
                content = "Unable to extract document content."

    return content, err
Ejemplo n.º 18
0
def extract_from_wpd(path: str, opinion: Opinion) -> ExtractProcessResult:
    """Extract text from a Word Perfect file

    Yes, courts still use these, so we extract their text using wpd2html. Once
    that's done, we pull out the body of the HTML, and do some minor cleanup
    on it.
    """
    proc = subprocess.Popen(
        ["wpd2html", path],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=DEVNULL,
    )
    raw_html, err = proc.communicate()

    content = get_clean_body_content(raw_html).decode()
    if err is not None:
        err = err.decode()

    # Flag unpublished opinions based on the document's own text.
    if "not for publication" in content.lower():
        opinion.precedential_status = "Unpublished"

    return content, err
Ejemplo n.º 19
0
def get_and_clean_opinion_text(opinion: Opinion) -> None:
    """Memoize useful versions of an opinion's text as additional properties
    on the Opinion object. This should be done before performing citation
    extraction and annotation on an opinion.

    Sets ``source_text``, ``cleaned_text``, and ``source_is_html`` on the
    Opinion, preferring the HTML fields (in priority order) over plain text.

    :param opinion: The Opinion whose text should be parsed
    """
    html_fields = ("html_anon_2020", "html_columbia", "html_lawbox", "html")
    # First non-empty HTML field wins; None means none were populated.
    source = next(
        (getattr(opinion, field) for field in html_fields
         if getattr(opinion, field)),
        None,
    )
    if source is not None:
        opinion.source_text = source
        opinion.cleaned_text = clean_text(source, ["html", "all_whitespace"])
        opinion.source_is_html = True
    else:
        # No HTML available; fall back to plain text.
        plain = opinion.plain_text
        opinion.source_text = plain
        opinion.cleaned_text = clean_text(plain, ["all_whitespace"])
        opinion.source_is_html = False
Ejemplo n.º 20
0
    def test_make_html_from_plain_text(self) -> None:
        """Can we convert the plain text of an opinion into HTML?

        Each pair below maps an input plain-text string to the exact HTML
        expected from create_cited_html(), covering full citations (with
        per-component sub-spans), short forms, supra citations, id./ibid.
        citations, and non-opinion citations.
        """
        # fmt: off

        # Expected HTML shared by the two full-citation inputs below.
        full_citation_html = ('<pre class="inline">asdf </pre><span class="'
                              'citation no-link"><span class="volume">22'
                              '</span> <span class="reporter">U.S.</span> '
                              '<span class="page">33</span> </span><pre class='
                              '"inline">asdf</pre>')
        test_pairs = [
            # Simple example for full citations
            ('asdf 22 U.S. 33 asdf', full_citation_html),

            # Using a variant format for U.S. (Issue #409)
            ('asdf 22 U. S. 33 asdf', full_citation_html),

            # Full citation across line break
            ('asdf John v. Doe, 123\nU.S. 456, upholding foo bar',
             '<pre class="inline">asdf John v. Doe, </pre><span class="'
             'citation no-link"><span class="volume">123</span>\n<span class='
             '"reporter">U.S.</span> <span class="page">456</span></span><pre'
             ' class="inline">, upholding foo bar</pre>'),

            # Basic short form citation
            ('existing text asdf, 515 U.S., at 240. foobar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> <span '
             'class="volume">515</span> <span class="reporter">U.S.</span>, '
             'at <span class="page">240</span></span><pre class="inline">. '
             'foobar</pre>'),

            # Short form citation with no comma after reporter in original
            ('existing text asdf, 1 U. S. at 2. foobar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> <span class'
             '="volume">1</span> <span class="reporter">U.S.</span> at <span '
             'class="page">2</span></span><pre class="inline">. foobar</pre>'),

            # Short form citation across line break
            ('asdf.’ ” 123 \n U.S., at 456. Foo bar foobar',
             '<pre class="inline">asdf.’ </pre><span class="'
             'citation no-link"><span class="antecedent_guess">”'
             '</span> <span class="volume">123</span> \n <span class='
             '"reporter">U.S.</span>, at <span class="page">456</span></span>'
             '<pre class="inline">. Foo bar foobar</pre>'),

            # First kind of supra citation (standard kind)
            ('existing text asdf, supra, at 2. foobar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> supra, at '
             '<span class="page">2</span></span><pre class="inline">. foobar'
             '</pre>'),

            # Second kind of supra citation (with volume)
            ('existing text asdf, 123 supra, at 2. foo bar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> <span '
             'class="volume">123</span> supra, at <span class="page">2</span>'
             '</span><pre class="inline">. foo bar</pre>'),

            # Third kind of supra citation (sans page)
            ('existing text asdf, supra, foo bar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> supra'
             '</span><pre class="inline">, foo bar</pre>'),

            # Fourth kind of supra citation (with period)
            ('existing text asdf, supra. foo bar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> supra'
             '</span><pre class="inline">. foo bar</pre>'),

            # Supra citation across line break
            ('existing text asdf, supra, at\n99 (quoting foo)',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> supra, '
             'at\n<span class="page">99</span> </span><pre class="inline">'
             '(quoting foo)</pre>'),

            # Id. citation ("Id., at 123")
            ('asdf, id., at 123. Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf</pre><span class="citation no-link">, '
             '<span class="id_token">id.,</span> at 123. </span><pre class="'
             'inline">Lorem ipsum dolor sit amet</pre>'),

            # Duplicate Id. citation
            ('asd, id., at 123. Lo rem ip sum. asdf, id., at 123. Lo rem ip.',
             '<pre class="inline">asd</pre><span class="citation no-link">, '
             '<span class="id_token">id.,</span> at 123. </span><pre class="'
             'inline">Lo rem ip sum. asdf</pre><span class="citation '
             'no-link">, <span class="id_token">id.,</span> at 123. </span>'
             '<pre class="inline">Lo rem ip.</pre>'),

            # Id. citation across line break
            ('asdf." Id., at 315.\n       Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf."</pre><span class="citation no-link"> '
             '<span class="id_token">Id.,</span> at 315.\n</span><pre class="'
             'inline">       Lorem ipsum dolor sit amet</pre>'),

            # Ibid. citation ("... Ibid.")
            ('asdf, Ibid. Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf</pre><span class="citation no-link">, '
             '<span class="id_token">Ibid.</span> Lorem ipsum dolor </span>'
             '<pre class="inline">sit amet</pre>'),

            # NonopinionCitation (currently nothing should happen here)
            ('Lorem ipsum dolor sit amet. U.S. Code §3617. Foo bar.',
             '<pre class="inline">Lorem ipsum dolor sit amet. U.S. Code '
             '§3617. Foo bar.</pre>'),
        ]

        # fmt: on
        for s, expected_html in test_pairs:
            # Progress output; the trailing "✓" only prints when the
            # assertion below passes.
            print("Testing plain text to html conversion for %s..." % s,
                  end=" ")
            opinion = Opinion(plain_text=s)
            citations = get_citations(s)
            created_html = create_cited_html(opinion, citations)
            self.assertEqual(
                created_html,
                expected_html,
                msg="\n%s\n\n    !=\n\n%s" % (created_html, expected_html),
            )
            print("✓")
Ejemplo n.º 21
0
    def test_make_html_from_plain_text(self) -> None:
        """Can we convert the plain text of an opinion into HTML?

        In this version of the test, the expected HTML wraps each whole
        citation in a single span (no per-component sub-spans), the
        citations are looked up against the memoized cleaned text, and each
        pair runs inside its own subTest so one failure doesn't hide the
        rest.
        """
        # fmt: off

        test_pairs = [
            # Simple example for full citations
            ('asdf 22 U.S. 33 asdf',
             '<pre class="inline">asdf </pre><span class="'
             'citation no-link">22 U.S. 33</span><pre class="'
             'inline"> asdf</pre>'),

            # Using a variant format for U.S. (Issue #409)
            ('asdf 22 U. S. 33 asdf',
             '<pre class="inline">asdf </pre><span class="'
             'citation no-link">22 U. S. 33</span><pre class="'
             'inline"> asdf</pre>'),

            # Full citation across line break
            ('asdf John v. Doe, 123\nU.S. 456, upholding foo bar',
             '<pre class="inline">asdf John v. Doe, </pre><span class="'
             'citation no-link">123\nU.S. 456</span><pre class="inline">, '
             'upholding foo bar</pre>'),

            # Basic short form citation
            ('existing text asdf, 515 U.S., at 240. foobar',
             '<pre class="inline">existing text asdf, </pre><span class="'
             'citation no-link">515 U.S., at 240</span><pre class="inline">. '
             'foobar</pre>'),

            # Short form citation with no comma after reporter in original
            ('existing text asdf, 1 U. S. at 2. foobar',
             '<pre class="inline">existing text asdf, </pre><span class="'
             'citation no-link">1 U. S. at 2</span><pre class="inline">. '
             'foobar</pre>'),

            # Short form citation across line break
            ('asdf.’ ” 123 \n U.S., at 456. Foo bar foobar',
             '<pre class="inline">asdf.’ ” </pre><span class="citation '
             'no-link">123 \n U.S., at 456</span><pre class="inline">. Foo '
             'bar foobar</pre>'),

            # First kind of supra citation (standard kind)
            ('existing text asdf, supra, at 2. foobar',
             '<pre class="inline">existing text asdf, </pre><span class="'
             'citation no-link">supra, at 2</span><pre class="inline">. '
             'foobar</pre>'),

            # Second kind of supra citation (with volume)
            ('existing text asdf, 123 supra, at 2. foo bar',
             '<pre class="inline">existing text asdf, 123 </pre><span class="'
             'citation no-link">supra, at 2</span><pre class="inline">. foo '
             'bar</pre>'),

            # Third kind of supra citation (sans page)
            ('existing text asdf, supra, foo bar',
             '<pre class="inline">existing text asdf, </pre><span class="'
             'citation no-link">supra,</span><pre class="inline"> foo bar'
             '</pre>'),

            # Fourth kind of supra citation (with period)
            ('existing text asdf, supra. foo bar',
             '<pre class="inline">existing text asdf, </pre><span class="'
             'citation no-link">supra.</span><pre class="inline"> foo bar'
             '</pre>'),

            # Supra citation across line break
            ('existing text asdf, supra, at\n99 (quoting foo)',
             '<pre class="inline">existing text asdf, </pre><span class="'
             'citation no-link">supra, at\n99</span><pre class="inline"> '
             '(quoting foo)</pre>'),

            # Id. citation ("Id., at 123")
            ('asdf, id., at 123. Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf, </pre><span class="citation no-link">'
             'id., at 123</span><pre class="inline">. Lorem ipsum dolor sit '
             'amet</pre>'),

            # Duplicate Id. citation
            ('asd, id., at 123. Lo rem ip sum. asdf, id., at 123. Lo rem ip.',
             '<pre class="inline">asd, </pre><span class="citation no-link">'
             'id., at 123</span><pre class="inline">. Lo rem ip sum. asdf, '
             '</pre><span class="citation no-link">id., at 123</span><pre '
             'class="inline">. Lo rem ip.</pre>'),

            # Id. citation across line break
            ('asdf." Id., at 315.\n       Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf." </pre><span class="citation '
             'no-link">Id., at 315</span><pre class="inline">.\n       Lorem '
             'ipsum dolor sit amet</pre>'),

            # Ibid. citation ("... Ibid.")
            ('asdf, Ibid. Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf, </pre><span class="citation no-link">'
             'Ibid.</span><pre class="inline"> Lorem ipsum dolor sit amet'
             '</pre>'),

            # NonopinionCitation
            ('Lorem ipsum dolor sit amet. U.S. Code §3617. Foo bar.',
             '<pre class="inline">Lorem ipsum dolor sit amet. U.S. Code </pre>'
             '<span class="citation no-link">§3617.</span><pre class="inline">'
             ' Foo bar.</pre>'),
        ]

        # fmt: on
        for s, expected_html in test_pairs:
            with self.subTest(
                    f"Testing plain text to html conversion for {s}...",
                    s=s,
                    expected_html=expected_html,
            ):
                # Memoize cleaned_text/source_text before citation lookup.
                opinion = Opinion(plain_text=s)
                get_and_clean_opinion_text(opinion)
                citations = get_citations(opinion.cleaned_text)

                # Stub out fake output from do_resolve_citations(), since the
                # purpose of this test is not to test that. We just need
                # something that looks like what create_cited_html() expects
                # to receive.
                citation_resolutions = {NO_MATCH_RESOURCE: citations}

                created_html = create_cited_html(opinion, citation_resolutions)
                self.assertEqual(
                    created_html,
                    expected_html,
                    msg=f"\n{created_html}\n\n    !=\n\n{expected_html}",
                )
Ejemplo n.º 22
0
class StaticFilesTest(TestCase):
    """Tests that mp3, txt, and pdf files are served with the correct
    status code, content type, and content disposition."""

    good_mp3_path = 'mp3/2014/06/09/ander_v._leo.mp3'
    good_txt_path = 'txt/2015/12/28/opinion_text.txt'
    good_pdf_path = 'pdf/2013/06/12/' + \
                    'in_re_motion_for_consent_to_disclosure_of_court_records.pdf'

    def setUp(self):
        """Create one docket with an audio file and two opinions (txt, pdf)."""
        self.court = Court.objects.get(pk='test')
        self.docket = Docket(case_name=u'Docket', court=self.court, source=Docket.DEFAULT)
        self.docket.save()

        self.audio = Audio(
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            docket=self.docket,
            blocked=False,
            case_name_full='Ander v. Leo',
            date_created=datetime.date(2014, 6, 9)
        )
        self.audio.save(index=False)  # skip search indexing in tests

        self.opinioncluster = OpinionCluster(
            case_name=u'Hotline Bling',
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)

        # One text-backed and one PDF-backed opinion in the same cluster.
        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_txt_path
        )
        self.txtopinion.save(index=False)

        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_pdf_path
        )
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        """Is the mp3 served inline as audio/mpeg?"""
        request = HttpRequest()
        # An unused `file_path` local that was assigned here has been
        # removed as dead code.
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'audio/mpeg')
        self.assertIn('inline;', response['Content-Disposition'])

    def test_serve_static_file_serves_txt(self):
        """Is the txt file served inline as text/plain with its content?"""
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'text/plain')
        self.assertIn('inline;', response['Content-Disposition'])
        # NOTE(review): on Python 3, response.content is bytes, so this
        # str needle would raise TypeError — confirm which Python runs this.
        self.assertIn(
            'FOR THE DISTRICT OF COLUMBIA CIRCUIT',
            response.content
        )

    def test_serve_static_file_serves_pdf(self):
        """Is the pdf served inline as application/pdf?"""
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/pdf')
        self.assertIn('inline;', response['Content-Disposition'])
Ejemplo n.º 23
0
def make_and_save(item,
                  skipdupes=False,
                  min_dates=None,
                  start_dates=None,
                  testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    :param item: A dict of parsed case data (dates, citations, names,
        opinions, etc.).
    :param skipdupes: If True, silently skip duplicates instead of raising.
    :param min_dates: Optional dict of court_id -> date; cases dated on or
        after the court's entry are skipped.
    :param start_dates: Optional dict of court_id -> founding date; cases
        dated on or before the court's entry are skipped.
    :param testing: If True, build all objects but don't save anything.
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # "decided" only fills in date_filed when nothing better did
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued
                 or date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied
                  or date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    # special rule for Kentucky
    if item['court_id'] == 'kycourtapp' and main_date <= date(1975, 12, 31):
        item['court_id'] = 'kycourtapphigh'

    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return
    if start_dates is not None:
        if start_dates.get(item['court_id']) is not None:
            if main_date <= start_dates[item['court_id']]:
                print(main_date, 'before court founding:',
                      start_dates[item['court_id']], ' -- skipping.')
                return

    docket = Docket(source=Docket.COLUMBIA,
                    date_argued=date_argued,
                    date_reargued=date_reargued,
                    date_cert_granted=date_cert_granted,
                    date_cert_denied=date_cert_denied,
                    date_reargument_denied=date_reargument_denied,
                    court_id=item['court_id'],
                    case_name_short=item['case_name_short'] or '',
                    case_name=item['case_name'] or '',
                    case_name_full=item['case_name_full'] or '',
                    docket_number=item['docket'] or '')

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- citation string, we're likely dealing
            # with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # if there are a trivial number of letters (except for months and
            # a few trivial words) in the citation, then it's not a citation
            # at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            # string.lowercase was Python 2 only (and locale-dependent);
            # string.ascii_lowercase works on both Python 2 and 3.
            num_letters = sum(
                non_trivial.count(letter)
                for letter in string.ascii_lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation, then
            # it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'." %
                            (c, item['court_id'], item['docket']))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished'
                             if item['unpublished'] else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map)
    # find the panel members, dropping any names we couldn't resolve
    panel = [
        find_person(n, item['court_id'], case_date=panel_date)
        for n in item['panel']
    ]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'],
                                 item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        # only the first opinion may be the lead; later "leads" become addenda
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [
            find_person(n, item['court_id'], case_date=panel_date)
            for n in opinion_info['joining']
        ]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" %
                  (domain, cluster.get_absolute_url()))
        except:
            # if anything goes wrong, try to delete everything, then re-raise
            try:
                docket.delete()
            except:
                # best-effort cleanup; don't mask the original error
                pass
            raise
Ejemplo n.º 24
0
def merge_or_add_opinions(
    cluster_id: int,
    html_str: str,
    data: Dict[str, Any],
    date_argued: datetime.date,
    date_filed: datetime.date,
    case_names: Dict[str, str],
    status: str,
    docket_number: str,
    found_citations: List[FoundCitation],
) -> Optional[Docket]:
    """Merge opinions if applicable.

    If opinion not in system, merge or add to cluster.
    If opinion in system came from harvard, add new opinion to cluster, else
    we merge new opinion data into scraped opinion.

    :param cluster_id: Opinion Cluster id.
    :param html_str: HTML opinion to add.
    :param data: Case data to import.
    :param date_argued: Date case was argued.
    :param date_filed: Date case was filed.
    :param case_names: A dict with the three case name types
    :param status: Whether it's precedential
    :param docket_number: The docket number
    :param found_citations: A list of FoundCitation objects.
    :return: The updated docket, or None if the opinion was already in the
        database.
    """
    does_exist = (Opinion.objects.filter(cluster_id=cluster_id).exclude(
        html_anon_2020="").exists())
    if does_exist:
        logger.info(f"Opinion already in database at {cluster_id}")
        return

    logger.info(f"Starting merger of opinions in cluster {cluster_id}.")

    cluster = OpinionCluster.objects.get(pk=cluster_id)
    docket = cluster.docket

    # Dates are uniformly good in our dataset
    # validation and is_approx not needed

    # Merge docket information; existing values win when the new data is
    # empty.
    docket.add_anon_2020_source()
    docket.date_argued = date_argued or docket.date_argued
    docket.docket_number = docket_number or docket.docket_number
    docket.case_name_short = (case_names["case_name_short"]
                              or docket.case_name_short)
    docket.case_name = case_names["case_name"] or docket.case_name
    docket.case_name_full = (case_names["case_name_full"]
                             or docket.case_name_full)

    # Merge cluster information
    cluster.date_filed = date_filed or cluster.date_filed
    cluster.precedential_status = status or cluster.precedential_status
    cluster.attorneys = data["representation"] or cluster.attorneys
    cluster.disposition = data["summary_disposition"] or cluster.disposition
    cluster.summary = data["summary_court"] or cluster.summary
    cluster.history = data["history"] or cluster.history
    cluster.cross_reference = (data["history_docket_numbers"]
                               or cluster.cross_reference)
    cluster.correction = data["publication_status_note"] or cluster.correction
    if data["judges"]:
        cluster.judges = (data["judges"].replace("{", "").replace("}", "")
                          or cluster.judges)
    cluster.case_name_short = (case_names["case_name_short"]
                               or cluster.case_name_short)
    cluster.case_name = case_names["case_name"] or cluster.case_name
    cluster.case_name_full = (case_names["case_name_full"]
                              or cluster.case_name_full)

    docket.save()
    cluster.save()

    # Add citations to cluster if applicable
    for citation in found_citations:
        Citation.objects.get_or_create(
            volume=citation.volume,
            reporter=citation.reporter,
            page=citation.page,
            type=map_reporter_db_cite_type(
                REPORTERS[citation.canonical_reporter][0]["cite_type"]),
            cluster_id=cluster.id,
        )

    # Merge with scrape or add opinion to cluster with harvard.
    # Use the cluster we already fetched above instead of re-querying the
    # database for the same row (its `source` was not modified here).
    if cluster.source == "C":
        opinion = Opinion.objects.get(cluster_id=cluster_id)
        logger.info("Merge with Harvard data")
        opinion.html_anon_2020 = html_str
    else:
        opinion = Opinion(
            cluster_id=cluster.id,
            type=Opinion.COMBINED,
            html_anon_2020=html_str,
            extracted_by_ocr=False,
        )
    opinion.save()
    logger.info(f"Finished merging opinion in cluster {cluster_id}.")
    return docket
Ejemplo n.º 25
0
def add_new_records(
    html_str: str,
    data: Dict[str, Any],
    date_argued: datetime.date,
    date_filed: datetime.date,
    case_names: Dict[str, str],
    status: str,
    docket_number: str,
    found_citations: List[FoundCitation],
    court_id: str,
) -> Docket:
    """Create new records in the DB based on parsed data

    :param html_str: HTML opinion to add
    :param data: Case data to import
    :param date_argued: Date case was argued.
    :param date_filed: Date case was filed.
    :param case_names: A dict with the three case name types
    :param status: Whether it's precedential
    :param docket_number: The docket number
    :param found_citations: A list of FoundCitation objects.
    :param court_id: The CL id of the court
    :return: The newly created docket.
    """
    docket = Docket.objects.create(
        **case_names,
        docket_number=docket_number,
        court_id=court_id,
        source=Docket.ANON_2020,
        ia_needs_upload=False,
        date_argued=date_argued,
    )

    logger.info("Add cluster for: %s", found_citations[0].base_citation())
    judges = data["judges"] or ""
    cluster = OpinionCluster(
        **case_names,
        precedential_status=status,
        docket_id=docket.id,
        # Reference the class constant directly, consistent with the docket
        # creation above (was `docket.ANON_2020` — same value via instance).
        source=Docket.ANON_2020,
        date_filed=date_filed,
        attorneys=data["representation"] or "",
        disposition=data["summary_disposition"] or "",
        summary=data["summary_court"] or "",
        history=data["history"] or "",
        cross_reference=data["history_docket_numbers"] or "",
        correction=data["publication_status_note"] or "",
        judges=judges.replace("{", "").replace("}", "") or "",
    )
    cluster.save(index=False)

    for citation in found_citations:
        logger.info("Adding citation for: %s", citation.base_citation())
        Citation.objects.get_or_create(
            volume=citation.volume,
            reporter=citation.reporter,
            page=citation.page,
            type=map_reporter_db_cite_type(
                REPORTERS[citation.canonical_reporter][0]["cite_type"]),
            cluster_id=cluster.id,
        )

    op = Opinion(
        cluster_id=cluster.id,
        type=Opinion.COMBINED,
        html_anon_2020=html_str,
        extracted_by_ocr=False,
    )
    op.save()
    logger.info(
        f"Finished importing cluster {cluster.id}; {found_citations[0].base_citation()}"
    )
    return docket
Ejemplo n.º 26
0
class StaticFilesTest(TestCase):
    """Verify that stored static assets (mp3, txt, pdf) are served with the
    correct status code, Content-Type, and an inline Content-Disposition."""

    # Paths are relative to the static-file storage root consumed by
    # serve_static_file.
    good_mp3_path = "mp3/2014/06/09/ander_v._leo.mp3"
    good_txt_path = "txt/2015/12/28/opinion_text.txt"
    good_pdf_path = (
        "pdf/2013/06/12/" +
        "in_re_motion_for_consent_to_disclosure_of_court_records.pdf")

    def setUp(self):
        """Create a docket with one audio item and two opinions (txt and pdf)
        whose local paths point at the fixture files above."""
        self.court = Court.objects.get(pk="test")
        self.docket = Docket(case_name=u"Docket",
                             court=self.court,
                             source=Docket.DEFAULT)
        self.docket.save()

        self.audio = Audio(
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            docket=self.docket,
            blocked=False,
            case_name_full="Ander v. Leo",
            date_created=datetime.date(2014, 6, 9),
        )
        # index=False: don't push to the search index during test setup.
        self.audio.save(index=False)

        self.opinioncluster = OpinionCluster(
            case_name=u"Hotline Bling",
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)

        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type="Lead Opinion",
            local_path=self.good_txt_path,
        )
        self.txtopinion.save(index=False)

        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type="Lead Opinion",
            local_path=self.good_pdf_path,
        )
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        request = HttpRequest()
        # Fix: removed an unused local (`file_path = self.audio.local_path_mp3`)
        # that was assigned but never read.
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "audio/mpeg")
        self.assertIn("inline;", response["Content-Disposition"])

    def test_serve_static_file_serves_txt(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "text/plain")
        self.assertIn("inline;", response["Content-Disposition"])
        self.assertIn("FOR THE DISTRICT OF COLUMBIA CIRCUIT", response.content)

    def test_serve_static_file_serves_pdf(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "application/pdf")
        self.assertIn("inline;", response["Content-Disposition"])
Ejemplo n.º 27
0
def make_and_save(item,
                  skipdupes=False,
                  min_dates=None,
                  start_dates=None,
                  testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    :param item: Dict of parsed case data (dates, citations, opinions, etc.).
    :param skipdupes: If True, silently skip duplicates instead of raising.
    :param min_dates: Optional dict of court_id -> date. Cases whose main date
        is on/after their court's date are skipped; when provided, duplicate
        checking is skipped entirely.
    :param start_dates: Optional dict of court_id -> court founding date.
        Cases dated on/before the founding date are skipped.
    :param testing: If True (the default), build the objects but do not save
        anything to the database.
    """
    # Collect every recognized tagged date; untagged dates default to
    # date_filed.
    date_filed = date_argued = date_reargued = None
    date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item["dates"]:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # A "decided" date only fills in date_filed when no explicit
                # filed date has been seen.
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued
                 or date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied
                  or date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item["file"])

    # special rule for Kentucky
    if item["court_id"] == "kycourtapp" and main_date <= date(1975, 12, 31):
        item["court_id"] = "kycourtapphigh"

    if min_dates is not None:
        if min_dates.get(item["court_id"]) is not None:
            if main_date >= min_dates[item["court_id"]]:
                print(
                    main_date,
                    "after",
                    min_dates[item["court_id"]],
                    " -- skipping.",
                )
                return
    if start_dates is not None:
        if start_dates.get(item["court_id"]) is not None:
            if main_date <= start_dates[item["court_id"]]:
                print(
                    main_date,
                    "before court founding:",
                    start_dates[item["court_id"]],
                    " -- skipping.",
                )
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item["court_id"],
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        docket_number=item["docket"] or "",
    )

    # get citation objects in a list for addition to the cluster
    found_citations = []
    for c in item["citations"]:
        found = get_citations(clean_text(c, ["html", "inline_whitespace"]))
        if not found:
            # if the docket number --is-- citation string, we're likely dealing
            # with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item["docket"]:
                docket_no = item["docket"].lower()
                if "claim no." in docket_no:
                    docket_no = docket_no.split("claim no.")[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, "")
                docket_no = docket_no.strip(".").strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for
            # months and a few trivial words) in the citation,
            # then it's not a citation at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, "")
            # Fix: string.lowercase exists only on Python 2;
            # string.ascii_lowercase works on both 2 and 3 and is
            # locale-independent.
            num_letters = sum(
                non_trivial.count(letter)
                for letter in string.ascii_lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate
            # a bad citation, then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'." %
                            (c, item["court_id"], item["docket"]))
        else:
            found_citations.extend(found.to_model())

    cluster = OpinionCluster(
        judges=item.get("judges", "") or "",
        precedential_status=("Unpublished"
                             if item["unpublished"] else "Published"),
        date_filed=main_date,
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        source="Z",
        attorneys=item["attorneys"] or "",
        posture=item["posture"] or "",
    )
    panel = lookup_judges_by_last_name_list(item["panel"], item["court_id"],
                                            panel_date)

    opinions = []
    for i, opinion_info in enumerate(item["opinions"]):
        if opinion_info["author"] is None:
            author = None
        else:
            author = lookup_judge_by_last_name(opinion_info["author"],
                                               item["court_id"], panel_date)

        converted_text = convert_columbia_html(opinion_info["opinion"])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info["type"]]
        # Only the first opinion may be the lead; later "lead" opinions are
        # demoted to addenda.
        if opinion_type == Opinion.LEAD and i > 0:
            opinion_type = Opinion.ADDENDUM

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info["per_curiam"],
            type=opinion_type,
            html_columbia=converted_text,
            sha1=opinion_info["sha1"],
            # This is surely not updated for the new S3 world. If you're
            # reading this, you'll need to update this code.
            local_path=opinion_info["local_path"],
        )
        joined_by = lookup_judges_by_last_name_list(item["joining"],
                                                    item["court_id"],
                                                    panel_date)
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print("Duplicate. skipping.")
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for citation in found_citations:
                citation.cluster = cluster
                citation.save()
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" %
                  (domain, cluster.get_absolute_url()))
        except:
            # if anything goes wrong, try to delete everything, then re-raise.
            # Intentionally bare so cleanup also runs on KeyboardInterrupt.
            try:
                docket.delete()
            except:
                pass
            raise
Ejemplo n.º 28
0
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.

        :param item: Dict of scraped values (case_names, case_dates, etc.).
        :param court: The Court the scraped item belongs to.
        :param sha1_hash: SHA1 hash of the downloaded document content.
        :param content: Raw bytes of the downloaded document.
        :return: (docket, opinion, cluster, citations, error) where ``error``
            is True when the binary could not be written to disk.
        """
        blocked = item["blocked_statuses"]
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        # Fall back to generating a short name when the scraper didn't
        # provide one.
        case_name_short = item.get(
            "case_name_shorts") or self.cnt.make_case_name_short(
                item["case_names"])
        docket = Docket(
            docket_number=item.get("docket_numbers", ""),
            case_name=item["case_names"],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get("west_citations", "")
        state_cite_str = item.get("west_state_citations", "")
        neutral_cite_str = item.get("neutral_citations", "")
        cluster = OpinionCluster(
            judges=item.get("judges", ""),
            date_filed=item["case_dates"],
            date_filed_is_approximate=item["date_filed_is_approximate"],
            case_name=item["case_names"],
            case_name_short=case_name_short,
            source="C",
            precedential_status=item["precedential_statuses"],
            nature_of_suit=item.get("nature_of_suit", ""),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get("summaries", ""),
        )
        # Build Citation objects for whichever citation strings are present.
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citations.append(make_citation(cite_str, cluster, cite_type))
        opinion = Opinion(
            type=Opinion.COMBINED,
            sha1=sha1_hash,
            download_url=item["download_urls"],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item["case_names"].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            # save=False: don't persist yet; the caller saves the objects.
            opinion.local_path.save(file_name, cf, save=False)
        except Exception:
            # Fix: narrowed from a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit and mask shutdown requests.
            msg = ("Unable to save binary to disk. Deleted "
                   "item: %s.\n %s" %
                   (item["case_names"], traceback.format_exc()))
            logger.critical(msg.encode("utf-8"))
            ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error
Ejemplo n.º 29
0
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    # A volume without a reporter is ambiguous, so bail out.
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        # Rebuild the IA download URL from the tail of the local file path.
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        # Skip anything already imported from this exact JSON file.
        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        # NOTE(review): assumes data["citations"] is non-empty — an empty
        # list would raise IndexError here; confirm against corpus schema.
        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        # Strip the "Docket No(s)." prefix so only the number itself remains.
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        # Everything for one case is saved atomically: docket, cluster,
        # citation, and opinions all land or none do.
        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                # Inner atomic block so a failed save can be retried below
                # without poisoning the outer transaction.
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                # Docket number too long for the column: truncate it and
                # preserve the full value in the correction field.
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifiying Per Curiam
                for elem in [op.find("author")]:
                    if elem is not None:
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = True if author_tag_str == "Per Curiam" else False
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        # Solr indexing happens outside the transaction, once the case is
        # safely committed.
        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
Ejemplo n.º 30
0
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.

        :param item: Dict of scraped values (case_names, case_dates, etc.).
        :param court: The Court the scraped item belongs to.
        :param sha1_hash: SHA1 hash of the downloaded document content.
        :param content: Raw bytes of the downloaded document.
        :return: (docket, opinion, cluster, citations, error) where ``error``
            is True when the binary could not be written to disk.
        """
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )
        # Refactor: the three near-identical citation branches collapsed
        # into one loop over (string, type) pairs.
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citation_obj = get_citations(cite_str)[0]
                citations.append(
                    Citation(
                        cluster=cluster,
                        volume=citation_obj.volume,
                        reporter=citation_obj.reporter,
                        page=citation_obj.page,
                        type=cite_type,
                    ))
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            # save=False: don't persist yet; the caller saves the objects.
            opinion.local_path.save(file_name, cf, save=False)
        except Exception:
            # Fix: narrowed from a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit and mask shutdown requests.
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error
Ejemplo n.º 31
0
def make_and_save(item, skipdupes=False, min_dates=None, testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    :param item: Dict of parsed case data (dates, citations, opinions, etc.).
    :param skipdupes: If True, silently skip duplicates instead of raising.
    :param min_dates: Optional dict of court_id -> date. Cases whose main date
        is on/after their court's date are skipped; when provided, duplicate
        checking is skipped entirely.
    :param testing: If True (the default), build the objects but do not save
        anything to the database.
    """
    # Collect every recognized tagged date; untagged dates default to
    # date_filed.
    date_filed = date_argued = date_reargued = None
    date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # A "decided" date only fills in date_filed when no explicit
                # filed date has been seen.
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued or
                 date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied or
                  date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or ''
    )

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- citation string, we're likely dealing
            # with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for months and a few
            # trivial words) in the citation, then it's not a citation at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            # Fix: string.lowercase exists only on Python 2;
            # string.ascii_lowercase works on both 2 and 3 and is
            # locale-independent.
            num_letters = sum(non_trivial.count(letter)
                              for letter in string.ascii_lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation, then
            # it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'." % (
                                c, item['court_id'], item['docket']
                            ))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished' if item['unpublished'] else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    # Resolve panel members, dropping any names we can't match to a person.
    panel = [find_person(n, item['court_id'], case_date=panel_date) for n in
             item['panel']]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        # Only the first opinion may be the lead; later "lead" opinions are
        # demoted to addenda.
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [find_person(n, item['court_id'], case_date=panel_date) for n in opinion_info['joining']]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster, panel, opinions)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" % (domain, cluster.get_absolute_url()))
        except:
            # if anything goes wrong, try to delete everything, then re-raise.
            # Intentionally bare so cleanup also runs on KeyboardInterrupt.
            try:
                docket.delete()
            except:
                pass
            raise
Ejemplo n.º 32
0
def make_and_save(item):
    """Associates case data from `parse_opinions` with objects. Saves these objects."""
    # Bucket each extracted date into the model field its tag indicates.
    date_filed = date_argued = date_reargued = None
    date_reargument_denied = date_cert_granted = date_cert_denied = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            tag, value = date_info[0], date_info[1]
            # Throw out values that clearly cannot be real dates.
            if value.year < 1600 or value.year > 2020:
                continue
            if tag is None:
                # An untagged date is assumed to be the filing date.
                date_filed = value
            elif tag in FILED_TAGS:
                date_filed = value
            elif tag in DECIDED_TAGS:
                # A decision date only stands in when no filing date was found.
                if not date_filed:
                    date_filed = value
            elif tag in ARGUED_TAGS:
                date_argued = value
            elif tag in REARGUE_TAGS:
                date_reargued = value
            elif tag in REARGUE_DENIED_TAGS:
                date_reargument_denied = value
            elif tag in CERT_GRANTED_TAGS:
                date_cert_granted = value
            elif tag in CERT_DENIED_TAGS:
                date_cert_denied = value
            else:
                print("Found unknown date tag '%s' with date '%s'." % (tag, value))

    docket = Docket(
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or '',
    )
    docket.save()

    # Turn each citation string (e.g. '1 U.S. 1') into exactly one parsed
    # citation; anything else is treated as a data error.
    found_citations = []
    for cite_str in item['citations']:
        matches = get_citations(cite_str)
        if not matches:
            raise Exception("Failed to get a citation from the string '%s'." % cite_str)
        elif len(matches) > 1:
            raise Exception("Got multiple citations from string '%s' when there should have been one." % cite_str)
        found_citations.append(matches[0])
    # Yields kwargs of the form {'federal_cite_one': '1 U.S. 1', ...}.
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        docket=docket,
        precedential_status=('Unpublished' if item['unpublished'] else 'Published'),
        date_filed=date_filed,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    cluster.save()

    # Panel membership is dated to the argument date when available,
    # otherwise the filing date.
    panel_date = date_argued if date_argued is not None else date_filed
    candidates = [find_person(n, item['court_id'], panel_date) for n in item['panel']]
    for member in candidates:
        if member is not None:
            cluster.panel.add(member)

    for opinion_info in item['opinions']:
        author = None
        if opinion_info['author'] is not None:
            author = find_person(opinion_info['author'], item['court_id'], date_filed or date_argued)
        opinion = Opinion(
            cluster=cluster,
            author=author,
            type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=opinion_info['opinion'],
        )
        opinion.save()
        joiners = [find_person(n, item['court_id'], panel_date) for n in opinion_info['joining']]
        for joiner in joiners:
            if joiner is not None:
                opinion.joined_by.add(joiner)
Ejemplo n.º 33
0
def set_blocked_status(opinion: Opinion, content: str, extension: str) -> None:
    """Figure out if the case should be blocked from search engines

    Use a number of rules to figure out if a case is better off out of search
    results. Also check the content of the document to see if it has SSNs or
    other sensitive data. If so, strip the sensitive content from those
    attributes of the opinion object. Do not save the object, that's up to the
    caller.

    For a discussion of the rules below, see:
    https://github.com/freelawproject/courtlistener/issues/691

    :param opinion: An opinion to check.
    :param content: The text of that opinion (usually from an extracted PDF).
    :param extension: The extension of the file, without the leading period.
    :return: None
    """
    def set_blocked(opinion: Opinion) -> None:
        # Blocking is recorded on the cluster, not the opinion itself.
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()
        return None

    # Block if there is sensitive content in the document; anonymize() strips
    # it and reports whether any personally identifying info was found.
    if extension in ["html", "wpd"]:
        opinion.html, found_ppi = anonymize(content)
    else:
        opinion.plain_text, found_ppi = anonymize(content)
    if found_ppi:
        set_blocked(opinion)
        return None

    oc = opinion.cluster
    court = oc.docket.court
    # Collapse all whitespace so multi-word phrases match reliably.
    lower_content = " ".join(content.lower().split())

    # Don't block things older than 30 years
    thirty_years_ago = now().date() - timedelta(days=30 * 365)
    from_last_thirty_years = oc.date_filed > thirty_years_ago
    if not from_last_thirty_years:
        return None
    #
    # Block based on rules and terms
    #
    if oc.precedential_status == "Unpublished":
        set_blocked(opinion)
        return None

    # Perf fix: the original ran a Court query per rule via
    # `court in Court.objects.filter(...)`; the `court` object is already
    # loaded, so checking its attributes directly is equivalent and free.
    lower_state_court = court.jurisdiction in (
        Court.STATE_TRIAL, Court.STATE_APPELLATE)
    if lower_state_court and any([
            "divorce" in lower_content,
            "minor" in lower_content and "child" in lower_content,
            "paternity" in lower_content,
            "wrongful termination" in lower_content,
    ]):
        set_blocked(opinion)
        return None

    federal_district_court = court.jurisdiction == Court.FEDERAL_DISTRICT
    if federal_district_court and "asylum" in lower_content:
        set_blocked(opinion)
        return None

    not_appellate_court = court.jurisdiction not in (
        Court.STATE_SUPREME,
        Court.FEDERAL_APPELLATE,
        Court.FEDERAL_BANKRUPTCY_PANEL,
    )
    if not_appellate_court and any([
            "grams of cocaine" in lower_content,
            "grams of crack cocaine" in lower_content,
            "grams of marijuana" in lower_content,
    ]):
        set_blocked(opinion)
        return None

    # fmt: off
    private_court_pks = {
        # Military courts
        "afcca",
        "asbca",
        "armfor",
        "acca",
        "mc",
        "nmcca",
        "cavc",
        "bva",
        # Tax courts
        "tax",
        "bta",
        "ariztaxct",
        "indtc",
        "monttc",
        "njtaxct",
        "ortc",
        # Merit Systems Protection Board
        "mspb",
        # Workers' Comp, etc.
        "arkworkcompcom",
        "connworkcompcom",
        "tennworkcompcl",
        "tennworkcompapp",
    }
    # fmt: on
    if court.pk in private_court_pks:
        set_blocked(opinion)
        return None
Ejemplo n.º 34
0
def make_objects(
    item: Dict[str, Union[str, Any]],
    court: Court,
    sha1_hash: str,
    content: bytes,
) -> Tuple[Docket, Opinion, OpinionCluster, List[Citation]]:
    """Takes the meta data from the scraper and associates it with objects.

    Returns the created objects.
    """
    blocked = item["blocked_statuses"]
    # Stamp the block date only when the item is actually blocked.
    date_blocked = date.today() if blocked else None

    case_name_short = item.get("case_name_shorts") or cnt.make_case_name_short(
        item["case_names"]
    )

    docket = Docket(
        docket_number=item.get("docket_numbers", ""),
        case_name=item["case_names"],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        source=item.get("source") or Docket.SCRAPER,
    )

    cluster = OpinionCluster(
        judges=item.get("judges", ""),
        date_filed=item["case_dates"],
        date_filed_is_approximate=item["date_filed_is_approximate"],
        case_name=item["case_names"],
        case_name_short=case_name_short,
        source=item.get("cluster_source") or "C",
        precedential_status=item["precedential_statuses"],
        nature_of_suit=item.get("nature_of_suit", ""),
        blocked=blocked,
        date_blocked=date_blocked,
        syllabus=item.get("summaries", ""),
    )

    # Build one Citation per non-empty citation string the scraper found.
    citations = []
    for cite_str, cite_type in (
        (item.get("west_citations", ""), Citation.WEST),
        (item.get("west_state_citations", ""), Citation.STATE),
        (item.get("neutral_citations", ""), Citation.NEUTRAL),
    ):
        if cite_str:
            citations.append(make_citation(cite_str, cluster, cite_type))

    opinion = Opinion(
        type=Opinion.COMBINED,
        sha1=sha1_hash,
        download_url=item["download_urls"],
    )

    # Store the downloaded document under a truncated, date-aware path.
    cf = ContentFile(content)
    extension = get_extension(content)
    file_name = trunc(item["case_names"].lower(), 75) + extension
    opinion.file_with_date = cluster.date_filed
    opinion.local_path.save(file_name, cf, save=False)

    return docket, opinion, cluster, citations
Ejemplo n.º 35
0
    def migrate_opinions_oral_args_and_dockets(self):
        """Copy dockets, audio files, and opinions from the old database
        into the new one, preserving primary keys.

        Iterates every old docket, then (when present) migrates its first
        document as an OpinionCluster + Opinion and its first audio file as
        an Audio record.
        """
        self.stdout.write("Migrating dockets, audio files, and opinions to new "
                          "database...")
        q = DocketOld.objects.using('old').all()
        old_dockets = queryset_generator(q)
        num_dockets = q.count()

        progress = 0
        self._print_progress(progress, num_dockets)
        for old_docket in old_dockets:
            # First do the docket, then create the cluster and opinion objects.
            try:
                old_audio = old_docket.audio_files.all()[0]
            except IndexError:
                old_audio = None
            try:
                old_document = old_docket.documents.all()[0]
            except IndexError:
                old_document = None

            # Bug fix: these were previously assigned only inside the
            # `is not None` branches below, so a docket without a document
            # raised NameError (or silently reused stale values from the
            # previous loop iteration) when building DocketNew. Initialize
            # safe defaults before branching.
            old_citation = None
            old_doc_case_name = old_doc_case_name_full = old_doc_case_name_short = ''
            if old_document is not None:
                old_citation = old_document.citation
                old_doc_case_name, old_doc_case_name_full, old_doc_case_name_short = self._get_case_names(old_citation.case_name)
            if old_audio is not None:
                old_audio_case_name, old_audio_case_name_full, old_audio_case_name_short = self._get_case_names(old_audio.case_name)
                if old_document is None:
                    # No document: fall back to the audio-derived case names
                    # so the new docket isn't created blank.
                    old_doc_case_name = old_audio_case_name
                    old_doc_case_name_full = old_audio_case_name_full
                    old_doc_case_name_short = old_audio_case_name_short

            court = CourtNew.objects.get(pk=old_docket.court_id)  # Courts are in place thanks to initial data.

            new_docket = DocketNew(
                pk=old_docket.pk,
                date_modified=old_docket.date_modified,
                date_created=old_docket.date_modified,
                court=court,
                case_name=old_doc_case_name,
                case_name_full=old_doc_case_name_full,
                case_name_short=old_doc_case_name_short,
                slug=self._none_to_blank(old_docket.slug),
                # Guard: there is no citation (and hence no docket number)
                # when the old docket had no document.
                docket_number=self._none_to_blank(
                    old_citation.docket_number if old_citation else None),
                date_blocked=old_docket.date_blocked,
                blocked=old_docket.blocked,
            )
            if old_audio is not None:
                new_docket.date_argued = old_audio.date_argued
            new_docket.save(using='default')

            if old_document is not None:
                new_opinion_cluster = OpinionClusterNew(
                    pk=old_document.pk,
                    docket=new_docket,
                    judges=self._none_to_blank(old_document.judges),
                    date_modified=old_document.date_modified,
                    date_created=old_document.date_modified,
                    date_filed=old_document.date_filed,
                    slug=self._none_to_blank(old_citation.slug),
                    citation_id=old_document.citation_id,
                    case_name_short=old_doc_case_name_short,
                    case_name=old_doc_case_name,
                    case_name_full=old_doc_case_name_full,
                    federal_cite_one=self._none_to_blank(
                        old_citation.federal_cite_one),
                    federal_cite_two=self._none_to_blank(
                        old_citation.federal_cite_two),
                    federal_cite_three=self._none_to_blank(
                        old_citation.federal_cite_three),
                    state_cite_one=self._none_to_blank(
                        old_citation.state_cite_one),
                    state_cite_two=self._none_to_blank(
                        old_citation.state_cite_two),
                    state_cite_three=self._none_to_blank(
                        old_citation.state_cite_three),
                    state_cite_regional=self._none_to_blank(
                        old_citation.state_cite_regional),
                    specialty_cite_one=self._none_to_blank(
                        old_citation.specialty_cite_one),
                    scotus_early_cite=self._none_to_blank(
                        old_citation.scotus_early_cite),
                    lexis_cite=self._none_to_blank(old_citation.lexis_cite),
                    westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
                    neutral_cite=self._none_to_blank(old_citation.neutral_cite),
                    scdb_id=self._none_to_blank(
                        old_document.supreme_court_db_id),
                    source=old_document.source,
                    nature_of_suit=old_document.nature_of_suit,
                    citation_count=old_document.citation_count,
                    precedential_status=old_document.precedential_status,
                    date_blocked=old_document.date_blocked,
                    blocked=old_document.blocked,
                )
                new_opinion_cluster.save(
                    using='default',
                    index=False,
                )

                new_opinion = OpinionNew(
                    pk=old_document.pk,
                    cluster=new_opinion_cluster,
                    date_modified=old_document.date_modified,
                    date_created=old_document.time_retrieved,
                    type='010combined',
                    sha1=old_document.sha1,
                    download_url=old_document.download_url,
                    local_path=old_document.local_path,
                    plain_text=old_document.plain_text,
                    html=self._none_to_blank(old_document.html),
                    html_lawbox=self._none_to_blank(old_document.html_lawbox),
                    html_with_citations=old_document.html_with_citations,
                    extracted_by_ocr=old_document.extracted_by_ocr,
                )
                new_opinion.save(
                    using='default',
                    index=False,
                )

            if old_audio is not None:
                new_audio_file = AudioNew(
                    pk=old_audio.pk,
                    docket=new_docket,
                    source=old_audio.source,
                    case_name=old_audio_case_name,
                    case_name_short=old_audio_case_name_short,
                    case_name_full=old_audio_case_name_full,
                    judges=self._none_to_blank(old_audio.judges),
                    date_created=old_audio.time_retrieved,
                    date_modified=old_audio.date_modified,
                    sha1=old_audio.sha1,
                    download_url=old_audio.download_url,
                    local_path_mp3=old_audio.local_path_mp3,
                    local_path_original_file=old_audio.local_path_original_file,
                    duration=old_audio.duration,
                    processing_complete=old_audio.processing_complete,
                    date_blocked=old_audio.date_blocked,
                    blocked=old_audio.blocked,
                )
                new_audio_file.save(
                    using='default',
                    index=False,
                )

            progress += 1
            self._print_progress(progress, num_dockets)
        self.stdout.write(u'')  # Newline
Ejemplo n.º 36
0
    def add_oc_and_o(self, old_document, old_citation, old_docket, new_docket):
        """Add the OpinionCluster and Opinion, updating existing items if
        present.
        """
        # Mirror the old Document row as a new OpinionCluster, reusing the
        # old pk so cross-references to the document survive the migration.
        # Case names come from the already-migrated old docket here (unlike
        # the bulk migration path, which derives them from the citation).
        new_opinion_cluster = OpinionClusterNew(
            pk=old_document.pk,
            docket=new_docket,
            judges=self._none_to_blank(old_document.judges),
            date_modified=old_document.date_modified,
            # The old schema has no creation timestamp for documents, so the
            # modification date stands in for date_created.
            date_created=old_document.date_modified,
            date_filed=old_document.date_filed,
            slug=self._none_to_blank(old_citation.slug),
            citation_id=old_document.citation_id,
            case_name_short=old_docket.case_name_short,
            case_name=old_docket.case_name,
            case_name_full=old_docket.case_name_full,
            # Citation strings are nullable in the old schema but blank in
            # the new one; _none_to_blank normalizes None -> ''.
            federal_cite_one=self._none_to_blank(
                old_citation.federal_cite_one),
            federal_cite_two=self._none_to_blank(
                old_citation.federal_cite_two),
            federal_cite_three=self._none_to_blank(
                old_citation.federal_cite_three),
            state_cite_one=self._none_to_blank(old_citation.state_cite_one),
            state_cite_two=self._none_to_blank(old_citation.state_cite_two),
            state_cite_three=self._none_to_blank(
                old_citation.state_cite_three),
            state_cite_regional=self._none_to_blank(
                old_citation.state_cite_regional),
            specialty_cite_one=self._none_to_blank(
                old_citation.specialty_cite_one),
            scotus_early_cite=self._none_to_blank(
                old_citation.scotus_early_cite),
            lexis_cite=self._none_to_blank(old_citation.lexis_cite),
            westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
            neutral_cite=self._none_to_blank(old_citation.neutral_cite),
            scdb_id=self._none_to_blank(old_document.supreme_court_db_id),
            source=old_document.source,
            nature_of_suit=old_document.nature_of_suit,
            citation_count=old_document.citation_count,
            precedential_status=old_document.precedential_status,
            date_blocked=old_document.date_blocked,
            blocked=old_document.blocked,
        )
        # NOTE(review): index=False skips search indexing on save —
        # presumably re-indexed in bulk afterwards; confirm with the caller.
        new_opinion_cluster.save(
            using='default',
            index=False,
        )

        # The single old Document maps to a single combined-type Opinion
        # ('010combined'), again keeping the old pk.
        new_opinion = OpinionNew(
            pk=old_document.pk,
            cluster=new_opinion_cluster,
            date_modified=old_document.date_modified,
            date_created=old_document.time_retrieved,
            type='010combined',
            sha1=old_document.sha1,
            download_url=old_document.download_url,
            local_path=old_document.local_path,
            plain_text=old_document.plain_text,
            html=self._none_to_blank(old_document.html),
            html_lawbox=self._none_to_blank(old_document.html_lawbox),
            html_with_citations=old_document.html_with_citations,
            extracted_by_ocr=old_document.extracted_by_ocr,
        )
        new_opinion.save(
            using='default',
            index=False,
        )