Ejemplo n.º 1
0
    def test_juriscraper_docket_number_extraction(self):
        """Can we extract docket number from tax court pdf and add to db?

        The docket reached through opinion 76 must have no docket number
        before extraction runs and the expected comma-separated numbers
        once it has.
        """
        o = Opinion.objects.get(pk=76)
        # Bind the related docket once; the same instance is refreshed and
        # re-read below.
        docket = o.cluster.docket
        self.assertIsNone(
            docket.docket_number,
            msg="Docket number should be none.",
        )

        # Reuse the fetched opinion's pk rather than repeating the literal.
        extract_doc_content(pk=o.pk, do_ocr=False)

        # Reload from the database so the check reads the stored value, not
        # the one loaded before extraction.
        docket.refresh_from_db()
        self.assertEqual(
            "19031-13, 27735-13, 11905-14",
            docket.docket_number,
        )
Ejemplo n.º 2
0
    def test_juriscraper_object_creation(self):
        """Check that extracting the tax court pdf adds a citation to the db.

        The cluster of opinion 76 must have no citations before extraction
        and at least one afterwards.
        """
        opinion = Opinion.objects.get(pk=76)
        cluster = opinion.cluster

        has_citation_before = cluster.citations.exists()
        self.assertFalse(
            has_citation_before,
            msg="Citation should not exist at beginning of test",
        )

        extract_doc_content(pk=opinion.pk, do_ocr=False)

        # Query again: the earlier result predates the extraction.
        has_citation_after = cluster.citations.exists()
        self.assertTrue(
            has_citation_after,
            msg="Expected citation was not created in db",
        )
Ejemplo n.º 3
0
 def test_content_extraction(self):
     """Check that every supported mimetype, OCR included, yields text.

     Opinions are paired by position with a word expected in their
     extracted content; pairing stops at the shorter of the two sequences.
     """
     expected_words = [
         'supreme',
         'intelligence',
         'indiana',
         'reagan',
         'indiana',
         'fidelity',
     ]
     markup_extensions = ['.html', '.wpd']
     for opinion, word in zip(Opinion.objects.all(), expected_words):
         # The extension is derived from the file's contents before the
         # extraction task runs.
         extension = get_extension(opinion.local_path.file.read())
         extract_doc_content(opinion.pk, do_ocr=True)
         opinion.refresh_from_db()
         # html and wpd results are checked in `html`; all others in
         # `plain_text`.
         extracted = (
             opinion.html
             if extension in markup_extensions
             else opinion.plain_text
         )
         self.assertIn(word, extracted.lower())
Ejemplo n.º 4
0
 def test_content_extraction(self):
     """Run extraction with OCR on each opinion and look for a known word.

     Opinions and expected substrings are matched up by position.
     """
     needles = [
         'supreme', 'intelligence', 'indiana',
         'reagan', 'indiana', 'fidelity',
     ]
     for doc, needle in zip(Opinion.objects.all(), needles):
         # Determine the extension first, then extract and reload the row.
         file_ext = get_extension(doc.local_path.file.read())
         extract_doc_content(doc.pk, do_ocr=True)
         doc.refresh_from_db()
         if file_ext not in ['.html', '.wpd']:
             haystack = doc.plain_text
         else:
             haystack = doc.html
         self.assertIn(needle, haystack.lower())
Ejemplo n.º 5
0
 def test_wpd_content_extraction(self):
     """Verify that a wpd file can be ingested.

     After extraction without OCR, opinion 5's html must contain "greene".
     """
     opinion = Opinion.objects.get(pk=5)
     extract_doc_content(opinion.pk, do_ocr=False)
     opinion.refresh_from_db()
     lowered_html = opinion.html.lower()
     self.assertIn("greene", lowered_html)
Ejemplo n.º 6
0
 def test_html_content_extraction(self):
     """Verify that an html file can be ingested.

     After extraction without OCR, opinion 4's html must contain "reagan".
     """
     opinion = Opinion.objects.get(pk=4)
     extract_doc_content(opinion.pk, do_ocr=False)
     opinion.refresh_from_db()
     lowered_html = opinion.html.lower()
     self.assertIn("reagan", lowered_html)
Ejemplo n.º 7
0
 def test_text_based_pdf(self):
     """Verify that a text based pdf file can be ingested.

     After extraction without OCR, opinion 3's plain text must contain
     "tarrant".
     """
     pdf_opinion = Opinion.objects.get(pk=3)
     extract_doc_content(pdf_opinion.pk, do_ocr=False)
     pdf_opinion.refresh_from_db()
     lowered_text = pdf_opinion.plain_text.lower()
     self.assertIn("tarrant", lowered_text)
Ejemplo n.º 8
0
 def test_image_based_pdf(self):
     """Verify that an image based pdf file can be ingested.

     Extraction runs with OCR enabled; opinion 2's plain text must then
     contain "intelligence".
     """
     scanned_opinion = Opinion.objects.get(pk=2)
     extract_doc_content(scanned_opinion.pk, do_ocr=True)
     scanned_opinion.refresh_from_db()
     lowered_text = scanned_opinion.plain_text.lower()
     self.assertIn("intelligence", lowered_text)
Ejemplo n.º 9
0
 def test_doc_content_extraction(self):
     """Verify that a doc file can be ingested.

     After extraction without OCR, opinion 1's plain text must contain
     "indiana".
     """
     doc_opinion = Opinion.objects.get(pk=1)
     extract_doc_content(doc_opinion.pk, do_ocr=False)
     doc_opinion.refresh_from_db()
     lowered_text = doc_opinion.plain_text.lower()
     self.assertIn("indiana", lowered_text)
Ejemplo n.º 10
0
 def test_txt_content_extraction(self):
     """Verify that a txt file can be ingested.

     After extraction without OCR, opinion 6's plain text must contain
     "ideal".
     """
     opinion = Opinion.objects.get(pk=6)
     extract_doc_content(opinion.pk, do_ocr=False)
     opinion.refresh_from_db()
     lowered_text = opinion.plain_text.lower()
     self.assertIn("ideal", lowered_text)