Example #1
0
 def setUpClass(cls):
     """Create the shared TextOutput and load the reference text files.

     Reads ``0.txt`` through ``18.txt`` (located with ``file_in_test_dir``
     under ``mandarin_txts``) as UTF-8 and stores their contents, in that
     order, in ``cls.all_text``.
     """
     cls.text_out = TextOutput(Document(cls.mandarin_pdf))
     cls.all_text = []
     # see FIXME
     for file_no in range(19):
         reference_path = file_in_test_dir('mandarin_txts',
                                           '{0}.txt'.format(file_no))
         with open(reference_path, 'r', encoding='utf-8') as reference:
             cls.all_text.append(reference.read())
Example #2
0
 def test_rawimageoutput(self):
     """Smoke-test RawImageOutput for every PDF / image-mode combination.

     Each output is iterated to exhaustion through ``page_iterator``; the
     yielded items are never inspected, so the test only fails when
     building or iterating an output raises.
     """
     for pdf_path in self.pdfs:
         for img_mode in self.img_modes:
             raw_out = RawImageOutput(
                 Document(pdf_path),
                 img_mode,
                 resolution=74,
                 scale_before_rotation=True,
             )
             rendered = page_iterator(raw_out, scale_pixel_box=(50, 50))
             for _ in rendered:
                 pass
Example #3
0
 def test_extracted_image_properties(self):
     """Compare every extracted image against its recorded properties.

     For each entry in ``self.tests`` the JSON file named by
     ``prop_file`` is loaded and passed through ``self.fix_props``. The
     resulting list is matched positionally against the images yielded,
     group by group, by ``page_iterator`` over a ``PDFImageOutput``.
     """
     for case in self.tests:
         image_out = PDFImageOutput(Document(case["pdf"]))
         # Keep the file open only while it is being parsed, and decode it
         # as UTF-8 explicitly instead of relying on the locale default.
         with open(case["prop_file"], encoding="utf-8") as f:
             img_props = self.fix_props(json.load(f))
         # A unittest assertion is not stripped under ``python -O`` and
         # reports the offending value on failure.
         self.assertIsInstance(img_props, list)
         prop_index = 0
         for imgs in page_iterator(image_out):
             for img in imgs:
                 self.assertPropertyDict(img, img_props[prop_index])
                 prop_index += 1
Example #4
0
    def test_document_page_iter(self):
        """Check the errors raised by invalid Document subscripts.

        ``'abc'`` must raise KeyError, ``-19`` and ``18`` must raise
        IndexError, and a list must raise TypeError. Slicing from 18
        onwards must compare equal to an empty list.
        """
        doc = Document(self.dmca_pdf)
        invalid_keys = (
            (KeyError, 'abc'),
            (IndexError, -19),
            (IndexError, 18),
            (TypeError, []),
        )
        for expected_error, key in invalid_keys:
            with self.assertRaises(expected_error):
                doc[key]

        self.assertEqual([], doc[18:])
Example #5
0
 def test_document_text_raw(self):
     """``doc.text(end=0)`` must equal the text in ``mandarin_first.txt``."""
     doc = Document(self.mandarin_pdf)
     reference_path = file_in_test_dir('mandarin_first.txt')
     with open(reference_path, 'r', encoding='utf-8') as reference:
         expected_text = reference.read()
     self.assertEqual(doc.text(end=0), expected_text)
Example #6
0
 def test_document_page_by_label(self):
     """Integer key ``i`` and string key ``str(i + 1)`` must agree.

     For every ``i`` below ``len(doc)`` the objects returned by both
     subscripts must report the same ``index``.
     """
     doc = Document(self.mandarin_pdf)
     total = len(doc)
     for position in range(total):
         by_position = doc[position]
         by_label = doc[str(position + 1)]
         self.assertEqual(by_position.index, by_label.index)
Example #7
0
 def test_load_pdf_without_xref(self):
     """A Document built from ``self.simple_file`` must have length 1."""
     loaded_length = len(Document(self.simple_file))
     self.assertEqual(1, loaded_length)
Example #8
0
 def test_load_from_path(self):
     """Build a Document from ``self.dmca_pdf`` and check its properties.

     The expected ``filename`` property is set to the same value that was
     handed to ``Document`` before the shared property check runs.
     """
     loaded = Document(self.dmca_pdf)
     expected_filename = self.dmca_pdf
     self.dmca_prop['filename'] = expected_filename
     self._test_doc_properties(loaded)
Example #9
0
 def test_load_from_file_like(self):
     """Build a Document from an open binary file and check its properties.

     The expected ``filename`` property is the empty string in this case.
     """
     with open(self.dmca_pdf, 'rb') as stream:
         from_stream = Document(stream)
         self.dmca_prop['filename'] = ''
         self._test_doc_properties(from_stream)
Example #10
0
 def test_document_page_by_label(self):
     """Integer key ``i`` and string key ``str(i + 1)`` must agree.

     All comparisons run inside a single ``subTest`` context.
     """
     doc = Document(self.mandarin_pdf)
     with self.subTest("Test get page by labels"):
         for position in range(len(doc)):
             by_position = doc[position]
             by_label = doc[str(position + 1)]
             self.assertEqual(by_position.index, by_label.index)
Example #11
0
 def setUp(self):
     """Run the parent ``setUp`` and open ``self.mandarin_pdf`` as ``self.doc``."""
     super(PageTestCase, self).setUp()
     mandarin_doc = Document(self.mandarin_pdf)
     self.doc = mandarin_doc
Example #12
0
 def setUp(self):
     """Run the parent ``setUp``, configure text output, and open the PDF.

     Sets ``Config.text_encoding`` to ``'utf-8'`` and ``Config.text_eol``
     to ``'unix'`` before ``self.doc`` is created.
     """
     super(PageTestCase, self).setUp()
     Config.text_encoding = 'utf-8'
     Config.text_eol = 'unix'
     mandarin_doc = Document(self.mandarin_pdf)
     self.doc = mandarin_doc
Example #13
0
 def test_load_from_Path_object(self):
     """Build a Document from ``Path(self.dmca_pdf)`` and check its properties.

     The expected ``filename`` property is the original ``self.dmca_pdf``
     value, not the ``Path`` wrapper.
     """
     pdf_location = Path(self.dmca_pdf)
     loaded = Document(pdf_location)
     self.dmca_prop["filename"] = self.dmca_pdf
     self._test_doc_properties(loaded)
Example #14
0
 def test_load_from_file_like(self):
     """Build a Document from an open binary file and check its properties.

     The expected ``filename`` property is the empty string in this case.
     """
     with open(self.dmca_pdf, "rb") as stream:
         from_stream = Document(stream)
         self.dmca_prop["filename"] = ""
         self._test_doc_properties(from_stream)
Example #15
0
 def test_document_text_raw(self):
     """``doc.text(end=0)`` must equal the text in ``mandarin_first.txt``."""
     doc = Document(self.mandarin_pdf)
     reference_path = file_in_test_dir("mandarin_first.txt")
     with open(reference_path, "r", encoding="utf-8") as reference:
         expected_text = reference.read()
     self.assertEqual(doc.text(end=0), expected_text)