def setUpClass(cls):
    """Prepare class-level fixtures shared by the tests.

    Sets ``cls.text_out`` to a ``TextOutput`` wrapping
    ``Document(cls.mandarin_pdf)`` and fills ``cls.all_text`` with the
    contents of ``0.txt`` .. ``18.txt`` from the ``mandarin_txts`` test
    directory, read as UTF-8 in ascending numeric order.
    """
    document = Document(cls.mandarin_pdf)
    cls.text_out = TextOutput(document)
    cls.all_text = []  # see FIXME
    for index in range(19):
        txt_name = '{0}.txt'.format(index)
        txt_path = file_in_test_dir('mandarin_txts', txt_name)
        with open(txt_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
        cls.all_text.append(contents)
def test_rawimageoutput(self):
    """Smoke-test ``RawImageOutput`` for every PDF / image-mode pair.

    A fresh ``Document`` and ``RawImageOutput`` are built for each
    combination of ``self.pdfs`` and ``self.img_modes``, and
    ``page_iterator`` is drained with a 50x50 ``scale_pixel_box``.
    Nothing is asserted; the test only fails if an exception is raised.
    """
    pixel_box = (50, 50)
    for pdf_path in self.pdfs:
        for img_mode in self.img_modes:
            renderer = RawImageOutput(
                Document(pdf_path),
                img_mode,
                resolution=74,
                scale_before_rotation=True,
            )
            # Exhaust the iterator; the produced items are discarded.
            for _ in page_iterator(renderer, scale_pixel_box=pixel_box):
                pass
def test_extracted_image_properties(self):
    """Compare every extracted image against its recorded properties.

    For each entry in ``self.tests`` the PDF named by ``t["pdf"]`` is
    opened through ``PDFImageOutput`` and the JSON file named by
    ``t["prop_file"]`` is loaded and passed through ``self.fix_props``.
    Images are visited page by page, in order, and the n-th image is
    checked against the n-th expected entry with
    ``self.assertPropertyDict``.
    """
    for t in self.tests:
        iout = PDFImageOutput(Document(t["pdf"]))
        with open(t["prop_file"]) as f:
            img_props = self.fix_props(json.load(f))
        # A unittest assertion is used instead of a bare ``assert`` so the
        # check still runs under ``python -O`` and reports a useful message.
        self.assertIsInstance(img_props, list)
        # Flatten pages -> images lazily so a single running index lines up
        # with the flat list of expected properties.
        extracted = (img for imgs in page_iterator(iout) for img in imgs)
        for i, img in enumerate(extracted):
            self.assertPropertyDict(img, img_props[i])
        # NOTE(review): nothing verifies that every entry of img_props was
        # consumed; confirm whether a final count comparison is wanted.
def test_document_page_iter(self):
    """Check error handling of ``Document`` indexing and slicing.

    On ``Document(self.dmca_pdf)``: the string key ``'abc'`` must raise
    ``KeyError``, the integer indices ``-19`` and ``18`` must raise
    ``IndexError``, a list key must raise ``TypeError``, and the slice
    ``[18:]`` must compare equal to an empty list.
    """
    doc = Document(self.dmca_pdf)
    failing_lookups = (
        (KeyError, 'abc'),
        (IndexError, -19),
        (IndexError, 18),
        (TypeError, []),
    )
    for expected_error, key in failing_lookups:
        with self.assertRaises(expected_error):
            doc[key]
    tail = doc[18:]
    self.assertEqual([], tail)
def test_document_text_raw(self):
    """``doc.text(end=0)`` must equal the contents of ``mandarin_first.txt``.

    The reference file is read as UTF-8 from the test directory.
    """
    doc = Document(self.mandarin_pdf)
    reference_path = file_in_test_dir('mandarin_first.txt')
    with open(reference_path, 'r', encoding='utf-8') as reference:
        extracted = doc.text(end=0)
        expected = reference.read()
        self.assertEqual(extracted, expected)
def test_document_page_by_label(self):
    """Looking a page up by string key must match lookup by integer index.

    For every integer position ``n`` in ``range(len(doc))``, the page
    obtained with ``doc[n]`` and the page obtained with the string key
    ``str(n + 1)`` must report the same ``index``.
    """
    doc = Document(self.mandarin_pdf)
    total = len(doc)
    for position in range(total):
        by_position = doc[position].index
        by_label = doc[str(position + 1)].index
        self.assertEqual(by_position, by_label)
def test_load_pdf_without_xref(self):
    """``Document(self.simple_file)`` must load and report a length of 1."""
    loaded = Document(self.simple_file)
    length = len(loaded)
    self.assertEqual(1, length)
def test_load_from_path(self):
    """Open a document from a path and verify its properties.

    ``self.dmca_prop['filename']`` is set to the path that was opened
    before delegating to ``self._test_doc_properties``.
    """
    source_path = self.dmca_pdf
    doc = Document(source_path)
    # Record the filename this load is expected to report.
    self.dmca_prop['filename'] = self.dmca_pdf
    self._test_doc_properties(doc)
def test_load_from_file_like(self):
    """Open a document from a binary file object and verify its properties.

    ``self.dmca_prop['filename']`` is set to the empty string before
    delegating to ``self._test_doc_properties``.
    """
    with open(self.dmca_pdf, 'rb') as stream:
        # The property check stays inside the ``with`` block so the file
        # object is still open while the document is inspected.
        doc = Document(stream)
        self.dmca_prop['filename'] = ''
        self._test_doc_properties(doc)
def test_document_page_by_label(self):
    """Looking a page up by string key must match lookup by integer index.

    Inside a single ``subTest``, every integer position ``n`` in
    ``range(len(doc))`` is compared: ``doc[n].index`` must equal
    ``doc[str(n + 1)].index``.
    """
    doc = Document(self.mandarin_pdf)
    with self.subTest("Test get page by labels"):
        total = len(doc)
        for position in range(total):
            by_position = doc[position].index
            by_label = doc[str(position + 1)].index
            self.assertEqual(by_position, by_label)
def setUp(self):
    """Run the parent ``setUp`` and open ``self.mandarin_pdf`` as ``self.doc``."""
    super(PageTestCase, self).setUp()
    document = Document(self.mandarin_pdf)
    self.doc = document
def setUp(self):
    """Run the parent ``setUp``, configure text output, and open the document.

    ``Config.text_encoding`` is set to ``'utf-8'`` and ``Config.text_eol``
    to ``'unix'`` before ``self.mandarin_pdf`` is opened as ``self.doc``.
    """
    super(PageTestCase, self).setUp()
    # Configuration is assigned before the Document is constructed.
    Config.text_encoding = 'utf-8'
    Config.text_eol = 'unix'
    document = Document(self.mandarin_pdf)
    self.doc = document
def test_load_from_Path_object(self):
    """Open a document from a ``Path`` object and verify its properties.

    ``self.dmca_prop["filename"]`` is set to the plain ``self.dmca_pdf``
    value before delegating to ``self._test_doc_properties``.
    """
    source = Path(self.dmca_pdf)
    doc = Document(source)
    # Record the filename this load is expected to report.
    self.dmca_prop["filename"] = self.dmca_pdf
    self._test_doc_properties(doc)
def test_load_from_file_like(self):
    """Open a document from a binary file object and verify its properties.

    ``self.dmca_prop["filename"]`` is set to the empty string before
    delegating to ``self._test_doc_properties``.
    """
    with open(self.dmca_pdf, "rb") as stream:
        # The property check stays inside the ``with`` block so the file
        # object is still open while the document is inspected.
        doc = Document(stream)
        self.dmca_prop["filename"] = ""
        self._test_doc_properties(doc)
def test_document_text_raw(self):
    """``doc.text(end=0)`` must equal the contents of ``mandarin_first.txt``.

    The reference file is read as UTF-8 from the test directory.
    """
    doc = Document(self.mandarin_pdf)
    reference_path = file_in_test_dir("mandarin_first.txt")
    with open(reference_path, "r", encoding="utf-8") as reference:
        extracted = doc.text(end=0)
        expected = reference.read()
        self.assertEqual(extracted, expected)