Python extract_doc_content Exemples, cl.scrapers.tasks.extract_doc_content Python Exemples

Exemple #1

0

Afficher le fichier

    def test_juriscraper_docket_number_extraction(self):
        """Check that extraction fills in the docket number from a tax court pdf.

        The docket reached through opinion 76 must have no docket number
        before ``extract_doc_content`` runs (OCR disabled); afterwards the
        docket is re-read from the database and must hold the expected
        comma-separated number string.
        """
        opinion = Opinion.objects.get(pk=76)
        docket = opinion.cluster.docket
        self.assertIsNone(
            docket.docket_number,
            msg="Docket number should be none.",
        )

        extract_doc_content(pk=opinion.pk, do_ocr=False)

        # Re-read the docket so the assertion sees the stored value rather
        # than the attribute loaded before extraction ran.
        docket.refresh_from_db()
        self.assertEqual(
            "19031-13, 27735-13, 11905-14",
            docket.docket_number,
        )

Exemple #2

0

Afficher le fichier

    def test_juriscraper_object_creation(self):
        """Check that extracting a tax court pdf creates a citation.

        The cluster of opinion 76 must have no citations before
        ``extract_doc_content`` runs (OCR disabled) and at least one after.
        """
        opinion = Opinion.objects.get(pk=76)

        had_citation_before = opinion.cluster.citations.exists()
        self.assertFalse(
            had_citation_before,
            msg="Citation should not exist at beginning of test",
        )

        extract_doc_content(pk=opinion.pk, do_ocr=False)

        has_citation_after = opinion.cluster.citations.exists()
        self.assertTrue(
            has_citation_after,
            msg="Expected citation was not created in db",
        )

Exemple #3

0

Afficher le fichier

 def test_content_extraction(self):
     """Run extraction with OCR over the stored opinions and check the text.

     Each opinion is paired positionally with a keyword that must appear,
     case-insensitively, in the content extracted for it.
     """
     expected_keywords = [
         'supreme',
         'intelligence',
         'indiana',
         'reagan',
         'indiana',
         'fidelity',
     ]
     # NOTE(review): the pairing depends on the queryset's iteration order,
     # and zip() stops at the shorter input -- confirm both are intended.
     for opinion, keyword in zip(Opinion.objects.all(), expected_keywords):
         extension = get_extension(opinion.local_path.file.read())
         extract_doc_content(opinion.pk, do_ocr=True)
         opinion.refresh_from_db()

         # .html and .wpd files are checked against ``html``; every other
         # extension is checked against ``plain_text``.
         if extension not in ['.html', '.wpd']:
             extracted = opinion.plain_text
         else:
             extracted = opinion.html
         self.assertIn(keyword, extracted.lower())

Exemple #4

0

Afficher le fichier

Fichier : tests.py Projet : freelawproject/courtlistener

 def test_content_extraction(self):
     """Extract every stored opinion with OCR enabled and verify a keyword.

     Opinions and keywords are matched by position; the keyword must occur
     in the lower-cased extracted content of its opinion.
     """
     keywords = [
         'supreme', 'intelligence', 'indiana',
         'reagan', 'indiana', 'fidelity',
     ]
     all_opinions = Opinion.objects.all()
     # NOTE(review): relies on the queryset yielding opinions in the same
     # order as ``keywords`` -- confirm the ordering is deterministic.
     for record, needle in zip(all_opinions, keywords):
         file_ext = get_extension(record.local_path.file.read())
         extract_doc_content(record.pk, do_ocr=True)
         record.refresh_from_db()

         # .html/.wpd sources are checked via ``html``, the rest via
         # ``plain_text``.
         stored_as_html = file_ext in ['.html', '.wpd']
         content = record.html if stored_as_html else record.plain_text
         self.assertIn(needle, content.lower())

Exemple #5

0

Afficher le fichier

 def test_wpd_content_extraction(self):
     """Check that extracting opinion 5 (a wpd file) yields the expected html."""
     opinion = Opinion.objects.get(pk=5)

     extract_doc_content(opinion.pk, do_ocr=False)
     opinion.refresh_from_db()

     extracted_html = opinion.html.lower()
     self.assertIn("greene", extracted_html)

Exemple #6

0

Afficher le fichier

 def test_html_content_extraction(self):
     """Check that extracting opinion 4 (an html file) yields the expected html."""
     opinion = Opinion.objects.get(pk=4)

     extract_doc_content(opinion.pk, do_ocr=False)
     opinion.refresh_from_db()

     extracted_html = opinion.html.lower()
     self.assertIn("reagan", extracted_html)

Exemple #7

0

Afficher le fichier

 def test_text_based_pdf(self):
     """Check that a text based pdf (opinion 3) is extracted without OCR."""
     opinion = Opinion.objects.get(pk=3)

     extract_doc_content(opinion.pk, do_ocr=False)
     opinion.refresh_from_db()

     extracted_text = opinion.plain_text.lower()
     self.assertIn("tarrant", extracted_text)

Exemple #8

0

Afficher le fichier

 def test_image_based_pdf(self):
     """Check that an image based pdf (opinion 2) is extracted with OCR on."""
     opinion = Opinion.objects.get(pk=2)

     extract_doc_content(opinion.pk, do_ocr=True)
     opinion.refresh_from_db()

     extracted_text = opinion.plain_text.lower()
     self.assertIn("intelligence", extracted_text)

Exemple #9

0

Afficher le fichier

 def test_doc_content_extraction(self):
     """Check that extracting opinion 1 (a doc file) yields the expected text."""
     doc_opinion = Opinion.objects.get(pk=1)

     extract_doc_content(doc_opinion.pk, do_ocr=False)
     doc_opinion.refresh_from_db()

     extracted_text = doc_opinion.plain_text.lower()
     self.assertIn("indiana", extracted_text)

Exemple #10

0

Afficher le fichier

 def test_txt_content_extraction(self):
     """Check that extracting opinion 6 (a txt file) yields the expected text."""
     opinion = Opinion.objects.get(pk=6)

     extract_doc_content(opinion.pk, do_ocr=False)
     opinion.refresh_from_db()

     extracted_text = opinion.plain_text.lower()
     self.assertIn("ideal", extracted_text)