def test_pdf_open(tmp_path):
    """Construct ``PdfParser`` from each supported source and check its flags.

    Covers an invalid all-zero buffer, an argument-less parser, a filename,
    an in-memory buffer with a leading offset, and an already open file,
    asserting the page count and the ``should_close_buf`` /
    ``should_close_file`` flags in each case.
    """
    # 64 KiB of NUL bytes must be rejected as a malformed PDF.
    with pytest.raises(PdfParser.PdfFormatError):
        PdfParser.PdfParser(buf=bytearray(65536))

    # A parser built from nothing has no pages, no info, and nothing to close.
    with PdfParser.PdfParser() as blank:
        assert len(blank.pages) == 0
        assert len(blank.info) == 0
        assert not blank.should_close_buf
        assert not blank.should_close_file

    # Produce a one-page PDF on disk for the remaining scenarios.
    pdf_filename = helper_save_as_pdf(tmp_path, "RGB")

    # Opened by filename: both the buffer and the file are flagged for closing.
    with PdfParser.PdfParser(filename=pdf_filename) as from_path:
        assert len(from_path.pages) == 1
        assert from_path.should_close_buf
        assert from_path.should_close_file

    # Opened from bytes that carry junk before the PDF data starts.
    junk = b"xyzzy"
    with open(pdf_filename, "rb") as stream:
        shifted = junk + stream.read()
    with PdfParser.PdfParser(buf=shifted, start_offset=len(junk)) as from_bytes:
        assert len(from_bytes.pages) == 1
        assert not from_bytes.should_close_buf
        assert not from_bytes.should_close_file

    # Opened from a caller-supplied file object: only the buffer is flagged.
    with open(pdf_filename, "rb") as stream, PdfParser.PdfParser(
        f=stream
    ) as from_handle:
        assert len(from_handle.pages) == 1
        assert from_handle.should_close_buf
        assert not from_handle.should_close_file
def test_pdf_append(self):
    """Append info entries, then two pages, to a saved PDF and re-check it.

    The file is re-parsed after each modification to confirm the page count,
    the number of info entries and the individual info values.
    """
    # Start from a single-page RGB PDF with a known producer.
    pdf_filename = self.helper_save_as_pdf("RGB", producer="PdfParser")
    stem = os.path.splitext(os.path.basename(pdf_filename))[0]
    date_keys = (b"CreationDate", b"ModDate")

    # First pass: verify the initial state, then write extra info entries.
    with PdfParser.PdfParser(pdf_filename, mode="r+b") as pdf:
        self.assertEqual(len(pdf.pages), 1)
        self.assertEqual(len(pdf.info), 4)
        self.assertEqual(pdf.info.Title, stem)
        self.assertEqual(pdf.info.Producer, "PdfParser")
        for key in date_keys:
            self.assertIn(key, pdf.info)
        self.check_pdf_pages_consistency(pdf)

        # The values deliberately include non-ASCII text, parentheses and a
        # backslash.
        additions = (
            ("Title", "abc"),
            ("Author", "def"),
            ("Subject", u"ghi\uABCD"),
            ("Keywords", "qw)e\\r(ty"),
            ("Creator", "hopper()"),
        )
        for attribute, text in additions:
            setattr(pdf.info, attribute, text)
        pdf.start_writing()
        pdf.write_xref_and_trailer()

    # Second pass: the info dictionary grew, the page count did not.
    with PdfParser.PdfParser(pdf_filename) as pdf:
        self.assertEqual(len(pdf.pages), 1)
        self.assertEqual(len(pdf.info), 8)
        self.assertEqual(pdf.info.Title, "abc")
        for key in date_keys:
            self.assertIn(key, pdf.info)
        self.check_pdf_pages_consistency(pdf)

    # Add two more images to the existing file.
    cmyk_image = hopper("CMYK")
    palette_image = hopper("P")
    cmyk_image.save(
        pdf_filename, append=True, save_all=True, append_images=[palette_image]
    )

    # Third pass: three pages now, with the info entries unchanged.
    with PdfParser.PdfParser(pdf_filename) as pdf:
        self.assertEqual(len(pdf.pages), 3)
        self.assertEqual(len(pdf.info), 8)
        self.assertEqual(PdfParser.decode_text(pdf.info[b"Title"]), "abc")
        self.assertEqual(pdf.info.Title, "abc")
        self.assertEqual(pdf.info.Producer, "PdfParser")
        self.assertEqual(pdf.info.Keywords, "qw)e\\r(ty")
        self.assertEqual(pdf.info.Subject, u"ghi\uABCD")
        for key in date_keys:
            self.assertIn(key, pdf.info)
        self.check_pdf_pages_consistency(pdf)
def test_pdf_append(tmp_path):
    """Check that info entries and extra pages can be appended to a saved PDF.

    A single-page RGB PDF is saved, its info dictionary is extended through a
    ``PdfParser`` opened in ``"r+b"`` mode, and two more images are appended
    with ``save(..., append=True)``. The file is re-parsed after each step.
    """
    # Start from a single-page RGB PDF with a known producer.
    pdf_filename = helper_save_as_pdf(tmp_path, "RGB", producer="PdfParser")
    base_name = os.path.basename(pdf_filename)
    expected_title = os.path.splitext(base_name)[0]

    # Verify the freshly saved file, then write additional info entries.
    with PdfParser.PdfParser(pdf_filename, mode="r+b") as pdf:
        assert len(pdf.pages) == 1
        assert len(pdf.info) == 4
        assert pdf.info.Title == expected_title
        assert pdf.info.Producer == "PdfParser"
        for key in (b"CreationDate", b"ModDate"):
            assert key in pdf.info
        check_pdf_pages_consistency(pdf)

        # The values deliberately include non-ASCII text, parentheses and a
        # backslash.
        new_entries = (
            ("Title", "abc"),
            ("Author", "def"),
            ("Subject", "ghi\uABCD"),
            ("Keywords", "qw)e\\r(ty"),
            ("Creator", "hopper()"),
        )
        for attribute, text in new_entries:
            setattr(pdf.info, attribute, text)
        pdf.start_writing()
        pdf.write_xref_and_trailer()

    # Re-parse: still one page, but the info dictionary has grown to eight.
    with PdfParser.PdfParser(pdf_filename) as pdf:
        assert len(pdf.pages) == 1
        assert len(pdf.info) == 8
        assert pdf.info.Title == "abc"
        for key in (b"CreationDate", b"ModDate"):
            assert key in pdf.info
        check_pdf_pages_consistency(pdf)

    # Append a CMYK image and a palette image to the same file.
    first_extra = hopper("CMYK")
    second_extra = hopper("P")
    first_extra.save(
        pdf_filename, append=True, save_all=True, append_images=[second_extra]
    )

    # Re-parse once more: three pages, info entries preserved.
    with PdfParser.PdfParser(pdf_filename) as pdf:
        assert len(pdf.pages) == 3
        assert len(pdf.info) == 8
        assert PdfParser.decode_text(pdf.info[b"Title"]) == "abc"
        assert pdf.info.Title == "abc"
        assert pdf.info.Producer == "PdfParser"
        assert pdf.info.Keywords == "qw)e\\r(ty"
        assert pdf.info.Subject == "ghi\uABCD"
        for key in (b"CreationDate", b"ModDate"):
            assert key in pdf.info
        check_pdf_pages_consistency(pdf)
def test_pdf_info(self):
    """Save a PDF with every info keyword set and read the values back."""
    # Timestamps handed to the writer; the parser should return equal values.
    created = time.strptime("2000", "%Y")
    modified = time.strptime("2001", "%Y")

    # Write the file with all eight info entries populated.
    pdf_filename = self.helper_save_as_pdf(
        "RGB",
        title="title",
        author="author",
        subject="subject",
        keywords="keywords",
        creator="creator",
        producer="producer",
        creationDate=created,
        modDate=modified,
    )

    # Each info attribute paired with the value expected after parsing.
    expected = (
        ("Title", "title"),
        ("Author", "author"),
        ("Subject", "subject"),
        ("Keywords", "keywords"),
        ("Creator", "creator"),
        ("Producer", "producer"),
        ("CreationDate", created),
        ("ModDate", modified),
    )

    with PdfParser.PdfParser(pdf_filename) as pdf:
        self.assertEqual(len(pdf.info), 8)
        for attribute, value in expected:
            self.assertEqual(getattr(pdf.info, attribute), value)
        self.check_pdf_pages_consistency(pdf)
def helper_save_as_pdf(self, mode, **kwargs):
    """Save the ``hopper`` image for ``mode`` as a PDF and sanity-check it.

    Extra keyword arguments are passed straight to ``save``. The written
    file must exist and be non-empty, must parse with at least one page
    (more than one when ``append`` or ``append_images`` is given), and the
    dimensions in its first ``/MediaBox`` entry must equal the image size.

    Returns the path of the written file.
    """
    # Arrange
    image = hopper(mode)
    target = self.tempfile("".join(("temp_", mode, ".pdf")))

    # Act
    image.save(target, **kwargs)

    # Assert
    self.assertTrue(os.path.isfile(target))
    self.assertGreater(os.path.getsize(target), 0)

    # Appending is expected to leave more than one page in the file.
    expects_many = kwargs.get("append_images", False) or kwargs.get(
        "append", False
    )
    with PdfParser.PdfParser(target) as pdf:
        self.assertGreater(len(pdf.pages), 1 if expects_many else 0)

    # Pull the width and height out of the first MediaBox in the raw bytes.
    with open(target, "rb") as stream:
        raw = stream.read()
    after_marker = raw.split(b"/MediaBox [ 0 0 ")[1]
    box_fields = after_marker.split(b"]")[0].split()
    dimensions = tuple(map(int, box_fields))
    self.assertEqual(image.size, dimensions)

    return target
def test_redos(newline):
    """Parse a trailer followed by thousands of newlines without hanging.

    Regression guard for CVE-2021-25292 (ReDoS). The raised exception is
    incidental; what matters is that parsing finishes promptly.
    """
    padding = newline * 3456
    payload = b" trailer<<>>" + padding

    # The parser is expected to give up with a format error, not time out.
    with pytest.raises(PdfParser.PdfFormatError):
        PdfParser.PdfParser(buf=payload)
def test_pdf_info(tmp_path):
    """Write a PDF with all info keywords supplied and verify them on re-read."""
    # Plain-text entries, keyed by the keyword accepted when saving.
    text_fields = {
        "title": "title",
        "author": "author",
        "subject": "subject",
        "keywords": "keywords",
        "creator": "creator",
        "producer": "producer",
    }

    # Save the file with the text entries plus both timestamps.
    pdf_filename = helper_save_as_pdf(
        tmp_path,
        "RGB",
        creationDate=time.strptime("2000", "%Y"),
        modDate=time.strptime("2001", "%Y"),
        **text_fields
    )

    # Parse it back and compare every entry with what was written.
    with PdfParser.PdfParser(pdf_filename) as pdf:
        info_size = len(pdf.info)
        assert info_size == 8
        assert pdf.info.Title == "title"
        assert pdf.info.Author == "author"
        assert pdf.info.Subject == "subject"
        assert pdf.info.Keywords == "keywords"
        assert pdf.info.Creator == "creator"
        assert pdf.info.Producer == "producer"
        assert pdf.info.CreationDate == time.strptime("2000", "%Y")
        assert pdf.info.ModDate == time.strptime("2001", "%Y")
        check_pdf_pages_consistency(pdf)
def helper_save_as_pdf(self, mode, **kwargs):
    """Write the ``hopper`` image for ``mode`` to a temp PDF and validate it.

    Keyword arguments are forwarded to ``save``. The helper asserts that the
    file exists and is non-empty, that it parses with the expected minimum
    number of pages, and that its first ``/MediaBox`` dimensions match the
    image size.

    Returns the path of the written file.
    """
    # Arrange
    picture = hopper(mode)
    pdf_path = self.tempfile("".join(("temp_", mode, ".pdf")))

    # Act
    picture.save(pdf_path, **kwargs)

    # Assert
    assert os.path.isfile(pdf_path)
    assert os.path.getsize(pdf_path) > 0

    # With append/append_images the file must hold at least two pages.
    appended = kwargs.get("append_images", False) or kwargs.get("append", False)
    minimum_exclusive = 1 if appended else 0
    with PdfParser.PdfParser(pdf_path) as pdf:
        assert len(pdf.pages) > minimum_exclusive

    # Read width and height from the first MediaBox in the raw file bytes.
    with open(pdf_path, "rb") as handle:
        data = handle.read()
    tail = data.split(b"/MediaBox [ 0 0 ")[1]
    numbers = tail.split(b"]")[0].split()
    size = tuple(map(int, numbers))
    assert picture.size == size

    return pdf_path
def test_pdf_append(self):
    """Grow a saved PDF in two stages and validate it after each one.

    Stage one rewrites the info dictionary in place through ``PdfParser``;
    stage two appends two further images with ``save(..., append=True)``.
    """
    # Stage zero: a single-page RGB document.
    path = self.helper_save_as_pdf("RGB", producer="PdfParser")
    file_stem, _extension = os.path.splitext(os.path.basename(path))

    with PdfParser.PdfParser(path, mode="r+b") as parser:
        # The initial file has one page and four info entries.
        self.assertEqual(len(parser.pages), 1)
        self.assertEqual(len(parser.info), 4)
        self.assertEqual(parser.info.Title, file_stem)
        self.assertEqual(parser.info.Producer, "PdfParser")
        self.assertIn(b"CreationDate", parser.info)
        self.assertIn(b"ModDate", parser.info)
        self.check_pdf_pages_consistency(parser)

        # Stage one: overwrite the title and add four new entries.
        parser.info.Title = "abc"
        parser.info.Author = "def"
        parser.info.Subject = u"ghi\uABCD"
        parser.info.Keywords = "qw)e\\r(ty"
        parser.info.Creator = "hopper()"
        parser.start_writing()
        parser.write_xref_and_trailer()

    with PdfParser.PdfParser(path) as parser:
        # Eight info entries now; the page count is unchanged.
        self.assertEqual(len(parser.pages), 1)
        self.assertEqual(len(parser.info), 8)
        self.assertEqual(parser.info.Title, "abc")
        self.assertIn(b"CreationDate", parser.info)
        self.assertIn(b"ModDate", parser.info)
        self.check_pdf_pages_consistency(parser)

    # Stage two: append a CMYK page followed by a palette page.
    extra_pages = [hopper("P")]
    hopper("CMYK").save(
        path, append=True, save_all=True, append_images=extra_pages
    )

    with PdfParser.PdfParser(path) as parser:
        # Three pages in total, and the info entries are still intact.
        self.assertEqual(len(parser.pages), 3)
        self.assertEqual(len(parser.info), 8)
        self.assertEqual(PdfParser.decode_text(parser.info[b"Title"]), "abc")
        self.assertEqual(parser.info.Title, "abc")
        self.assertEqual(parser.info.Producer, "PdfParser")
        self.assertEqual(parser.info.Keywords, "qw)e\\r(ty")
        self.assertEqual(parser.info.Subject, u"ghi\uABCD")
        self.assertIn(b"CreationDate", parser.info)
        self.assertIn(b"ModDate", parser.info)
        self.check_pdf_pages_consistency(parser)
def test_pdf_open(self):
    """Open PDFs from each supported source and check the close flags.

    Sources covered: an invalid all-zero buffer, no source at all, a
    filename, a bytes buffer with a start offset, and an open file object.
    """
    # 64 KiB of NUL bytes is not a PDF and must be refused.
    with self.assertRaises(PdfParser.PdfFormatError):
        PdfParser.PdfParser(buf=bytearray(65536))

    # Without a source the parser is empty and has nothing to close.
    with PdfParser.PdfParser() as blank:
        self.assertEqual(len(blank.pages), 0)
        self.assertEqual(len(blank.info), 0)
        self.assertFalse(blank.should_close_buf)
        self.assertFalse(blank.should_close_file)

    # Write a one-page PDF to use for the remaining scenarios.
    pdf_filename = self.helper_save_as_pdf("RGB")

    # By filename: both buffer and file are flagged for closing.
    with PdfParser.PdfParser(filename=pdf_filename) as from_path:
        self.assertEqual(len(from_path.pages), 1)
        self.assertTrue(from_path.should_close_buf)
        self.assertTrue(from_path.should_close_file)

    # From bytes with junk before the PDF data starts.
    junk = b"xyzzy"
    with open(pdf_filename, "rb") as stream:
        shifted = junk + stream.read()
    with PdfParser.PdfParser(buf=shifted, start_offset=len(junk)) as from_bytes:
        self.assertEqual(len(from_bytes.pages), 1)
        self.assertFalse(from_bytes.should_close_buf)
        self.assertFalse(from_bytes.should_close_file)

    # From a caller-supplied file object: only the buffer is flagged.
    with open(pdf_filename, "rb") as stream, PdfParser.PdfParser(
        f=stream
    ) as from_handle:
        self.assertEqual(len(from_handle.pages), 1)
        self.assertTrue(from_handle.should_close_buf)
        self.assertFalse(from_handle.should_close_file)
def helper_save_as_pdf(self, mode, **kwargs):
    """Save the ``hopper`` image for ``mode`` as a PDF and check the result.

    Keyword arguments are forwarded to ``save``. The file must exist, be
    non-empty and parse with at least one page, or more than one when
    ``append`` or ``append_images`` is supplied.

    Returns the path of the written file.
    """
    # Arrange
    image = hopper(mode)
    destination = self.tempfile("".join(("temp_", mode, ".pdf")))

    # Act
    image.save(destination, **kwargs)

    # Assert
    self.assertTrue(os.path.isfile(destination))
    self.assertGreater(os.path.getsize(destination), 0)

    # Appending is expected to leave more than one page in the file.
    is_append = kwargs.get("append_images", False) or kwargs.get("append", False)
    page_floor = 1 if is_append else 0
    with PdfParser.PdfParser(destination) as pdf:
        self.assertGreater(len(pdf.pages), page_floor)

    return destination