def testJpegImage(self):
    """
    Test loading and parsing of a file. Extract the image of the file and
    compare to expected textual output. Expected outcome: file loads, image
    matches expected.

    The reader is closed in a ``finally`` clause so that it is released
    even when the extraction or the comparison fails.
    """
    with open(join(TEST_DATA_ROOT, 'jpeg.pdf'), 'rb') as inputfile:
        # Load PDF file from file
        r = PdfFileReader(inputfile)

        try:
            # Retrieve the expected hex dump of the image
            with open(join(self.localDataRoot, 'jpeg.txt'), 'r') as pdftextFile:
                imagetext = pdftextFile.read()

            page1 = r.getPage(0)
            xObject = page1['/Resources']['/XObject'].getObject()
            data = xObject['/Im4'].getData()
            # Hex-encode once; the value feeds both the comparison and the
            # failure message
            extracted = binascii.hexlify(data).decode()

            # Compare the hex dump of the image to a known source
            self.assertEqual(
                extracted, imagetext,
                msg='PDF extracted image differs from expected value.'
                    '\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                    % (imagetext, extracted))
        finally:
            r.close()
def testObjectIds(self):
    """
    Checks ``ObjectStream.objectIds`` against known-good ID sequences.

    For each sample file, an indirect reference to an object known to be
    an object stream is resolved through the reader, and the IDs the
    stream reports are compared, in order, with the expected ones.
    """
    # Objects we know to be Object Streams, given as
    # (filename, id, generation number)
    knownStreams = (
        ("crazyones.pdf", 9, 0),
        ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", 645, 0),
    )
    # Expected IDs, in the same order as ``knownStreams``
    expectedIds = (
        (8, 3, 10, 2, 1, 11, 13, 15, 4, 19, 5, 20, 6, 21, 17),
        (644, 642, 646, 647, 648, 122, 119, 120, 121, 124, 179, 232, 327,
         467, 478, 519, 568, 573, 580, 586, 592, 598, 603, 611, 616, 623,
         629, 634),
    )

    for expected, (filename, idnum, generation) in zip(
            expectedIds, knownStreams):
        # NOTE(review): spelled TESTS_DATA_ROOT here while sibling tests
        # use TEST_DATA_ROOT -- confirm this name exists in this module.
        reader = PdfFileReader(join(TESTS_DATA_ROOT, filename))
        objStm = reader.getObject(
            IndirectObject(idnum, generation, reader))
        reader.close()

        self.assertIsInstance(objStm, ObjectStream)
        self.assertTupleEqual(tuple(expected), tuple(objStm.objectIds))
def testDel(self):
    """
    Tests the ``__del__()`` method of ``PdfFileReader`` and
    ``PdfFileWriter`` ensuring that no exceptions are raised.

    Any exception raised by either finalizer is turned into an explicit
    test failure carrying the exception text and the class name.
    """
    r = PdfFileReader(join(TEST_DATA_ROOT, "crazyones.pdf"))
    w = PdfFileWriter()

    try:
        # This may generate some collateral warnings in stderr when del r
        # is performed by the GC
        r.__del__()
    except Exception as e:
        self.fail(
            "Exception '%s' was raised in %s.__del__()"
            % (e, PdfFileReader.__name__))

    try:
        w.__del__()
    except Exception as e:
        self.fail(
            "Exception '%s' was raised in %s.__del__()"
            % (e, PdfFileWriter.__name__))
def testFileLoad(self):
    """
    Test loading and parsing of a file. Extract text of the file and
    compare to expected textual output. Expected outcome: file loads, text
    matches expected.

    The reader is closed in a ``finally`` clause so that it is released
    even when the extraction or the comparison fails.
    """
    with open(join(TEST_DATA_ROOT, 'crazyones.pdf'), 'rb') as inputfile:
        # Load PDF file from file
        r = PdfFileReader(inputfile)

        try:
            page1 = r.getPage(0)

            # Retrieve the expected text of the PDF
            with open(join(self.localDataRoot, 'crazyones.txt'), 'rb') as pdftextFile:
                pdftext = pdftextFile.read()

            # Newlines are stripped before comparing against the
            # reference bytes
            page1Text = page1.extractText().replace('\n', '').encode('utf-8')

            # Compare the text of the PDF to a known source
            self.assertEqual(
                pdftext, page1Text,
                msg='PDF extracted text differs from expected value.'
                    '\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                    % (pdftext, page1Text))
        finally:
            r.close()
def testAttachFiles(self):
    """
    Tests the attachFiles function for attaching multiple files.

    Since the Names array in the EmbeddedFiles dictionary contains both the
    name (string) and indirect object (dictionary) for each file, we have
    to check for two entries per attached file.
    """
    numAttachments = 3
    # mkstemp() returns an already-open OS-level descriptor along with the
    # path; only the path is needed, so release the descriptor right away
    # instead of leaking it for the lifetime of the test run.
    fd, testfile = tempfile.mkstemp()
    os.close(fd)

    try:
        # Make PDF with attachment
        with PdfFileReader(join(TEST_DATA_ROOT, 'jpeg.pdf')) as reader:
            with PdfFileWriter(testfile) as writer:
                writer.appendPagesFromReader(reader)
                writer.attachFiles(
                    [join(TEST_DATA_ROOT, 'attachment_small.png')]
                    * numAttachments)
                writer.write()

        # Check for attachment entries
        with PdfFileReader(testfile) as pdf:
            pdf.numPages  # For caching _cachedObjects data
            for _k, v in pdf._cachedObjects.items():
                if '/Type' in v and v['/Type'] == '/Catalog':
                    self.assertIsNotNone(v['/Names']['/EmbeddedFiles'])
                    real = len(v['/Names']['/EmbeddedFiles']['/Names'])
                    self.assertEqual(numAttachments * 2, real)
    finally:
        os.remove(testfile)
def testXRefTableObjects(self):
    """
    Ensures that after ``PdfFileReader._parsePdfFile()`` all the indirect
    references from the XRef-Table *only* have been loaded as expected.
    Objects from the free entries list are included as well in the test.

    This case tests the part of ``PdfFileReader.objects()`` responsible for
    generating the Cross-Reference Table entries too.
    """
    self.maxDiff = None

    for filename in (
            "jpeg.pdf",
            "Seige_of_Vicksburg_Sample_OCR.pdf",
            "SF424_page2.pdf"):
        reader = PdfFileReader(join(TEST_DATA_ROOT, filename))
        # (id, gen, byte offset) triples as reported by the reader
        actualItems = [
            (ref.idnum, ref.generation,
             reader._xrefTable[ref.generation][ref.idnum][0])
            for ref in reader.objects(PdfFileReader.R_XTABLE, True)
        ]
        reader.close()

        # The same triples, read from the XRef Table dump that we know
        # belongs to this file
        expItems = sorted(self._parseXRefTable(
            join(self.localDataRoot, filename), (0, 1, 2)))
        actualItems.sort()

        self.assertListEqual(
            expItems, actualItems, "Differences found in " + filename)
def testDel(self):
    """
    Tests the ``__del__()`` method of ``PdfFileReader`` and
    ``PdfFileWriter`` ensuring that no exceptions are raised.

    Any exception raised by either finalizer is turned into an explicit
    test failure carrying the exception text and the class name.
    """
    r = PdfFileReader(join(TEST_DATA_ROOT, "crazyones.pdf"))
    w = PdfFileWriter(BytesIO(b""))

    # Reader first, then writer; the first failure aborts the test
    for obj, cls in ((r, PdfFileReader), (w, PdfFileWriter)):
        try:
            obj.__del__()
        except Exception as e:  # pylint: disable=broad-except
            self.fail(
                "Exception '%s' was raised in %s.__del__()"
                % (e, cls.__name__)
            )
def testDecodeStreamData(self):
    """
    Checks ``decodeStreamData()`` against direct calls to the codec
    classes.

    For every entry of the table below, the referenced object is read from
    the given file, verified to be an ``EncodedStreamObject``, and the
    output of the codec's ``decode()`` is compared with what
    ``decodeStreamData()`` returns for the same stream.

    We don't care if we need to open a new file stream for each obj.
    reference -- unit tests don't have to be efficient
    """
    this_dir = join(TEST_DATA_ROOT, self.testDecodeStreamData.__name__)
    filters = (
        # (filter type, filename, id, gen. number)
        (FlateCodec, "FlateDecode.pdf", 4, 0),
        (FlateCodec, "FlateDecode.pdf", 8, 0),
        (FlateCodec, "FlateDecode.pdf", 9, 0),
        # TO-DO No PDF files found with this type of encoding, get them.
        # (ASCIIHexCodec, "ASCIIHexDecode.pdf", ?, ?)
        (LZWCodec, "LZWDecode.pdf", 209, 0),
        (LZWCodec, "LZWDecode.pdf", 210, 0),
        (LZWCodec, "LZWDecode.pdf", 211, 0),
        (ASCII85Codec, "ASCII85Decode.pdf", 5, 0),
        (ASCII85Codec, "ASCII85Decode.pdf", 6, 0),
        (DCTCodec, "DCTDecode.pdf", 4, 0),
        # TO-DO No PDF files found with this type of encoding, get them.
        # (JPXCodec, "JPXDecode.pdf", ?, ?)
        (CCITTFaxCodec, "CCITTFaxDecode.pdf", 46, 0),
    )

    for codec, filename, idnum, generation in filters:
        with open(join(this_dir, filename), "rb") as infile:
            reader = PdfFileReader(infile)
            stream = reader.getObject(
                IndirectObject(idnum, generation, reader))

            # Ensures that the PdfFileReader reads a stream object
            self.assertEqual(EncodedStreamObject, type(stream))

            decodeArgs = [stream._data, stream.get("/DecodeParms")]
            if codec is CCITTFaxCodec:
                # This codec is additionally given the image height
                decodeArgs.append(stream.get("/Height"))

            self.assertEqual(
                codec.decode(*decodeArgs), decodeStreamData(stream))
def testReadXRefStreamCompressedObjects(self):
    """
    Targets the same objects as ``testXRefStreamObjects()``, but instead
    of ensuring an identity between the list of items read and the one
    expected, it verifies that their *contents* are identical.

    This method does **not** test ``PdfFileReader.objects()`` as two of the
    previous test cases did.
    """
    self.maxDiff = None

    for filename in ("crazyones.pdf", ):
        dumpPath = join(self.localDataRoot, filename)
        reader = PdfFileReader(join(TEST_DATA_ROOT, filename))

        # Both lists hold (object ID, PdfObject) tuples; the ID is the
        # sort key.
        expItems = []
        with open(dumpPath, "rb") as instream:
            for line in instream:
                # Blank lines and %-comments carry no data
                if not line or line.isspace() or line.startswith(b"%"):
                    continue

                rawId, rawOffset, rawObj = line.split(b" ", 2)
                # The offset is parsed (so a malformed one still raises)
                # but not compared
                globalId, _offset = int(rawId), int(rawOffset)

                with BytesIO(rawObj) as objStream:
                    expItems.append(
                        (globalId, readObject(objStream, reader)))

        # We deal exclusively with compressed objects (from Table 18 of
        # ISO 32000 reference, 2008) whose generation number is 0
        actualItems = [
            (itemid, IndirectObject(itemid, 0, reader).getObject())
            for itemid, entry in reader._xrefStm.items()
            if entry[0] == 2
        ]
        reader.close()

        expItems.sort(key=lambda pair: pair[0])
        actualItems.sort(key=lambda pair: pair[0])

        self.assertListEqual(expItems, actualItems)
def testXTableAgainstXStream(self):
    """
    In section 7.5.8.4 of ISO 32000, "Compatibility with Applications That
    Do Not Support Compressed Reference Streams", the standard describes a
    means of crafting PDF files designed for versions 1.5+ that can be
    opened nevertheless by readers that support older versions.

    This test case verifies that all the items hidden by the XRef Table in
    non-conforming readers are *all and exactly* loaded into the XRef
    Stream by readers that support PDF 1.5+.
    """
    self.maxDiff = None

    # TO-DO Possibly add a few other files to this test case
    for filename in ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", ):
        # Map each parsed entry's first field to the rest of the entry
        expItems = {}
        for entry in self._parseXRefTable(
                join(self.localDataRoot, filename), (0, 2, 3)):
            expItems[entry[0]] = entry[1:]

        reader = PdfFileReader(join(TEST_DATA_ROOT, filename))
        actualItems = list(reader.objects(PdfFileReader.R_XSTREAM, True))
        reader.close()
        actualItems.sort(key=lambda ref: ref.idnum)

        expKeys = sorted(expItems)
        actualKeys = [ref.idnum for ref in actualItems]
        self.assertListEqual(
            expKeys, actualKeys, "Lists of item IDs are not identical")

        for expId, ref in zip(expKeys, actualItems):
            self.assertEqual(
                expId, ref.idnum, "Items ID does not correspond")

            # If an item is in use in the XRef Stream, ensure then that it
            # is marked free in the XRef Table.
            if reader._xrefStm[ref.idnum][0] == 2:
                self.assertTrue(
                    expItems[expId][-1],
                    "Item %d should be hid by the XRef Table, but it was "
                    "not." % expId,
                )
def testXRefStreamObjects(self):
    """
    Like ``PdfReaderTestCases.testXRefTableObjects()``, except that it
    tests objects referenced by the Cross-Reference Stream.
    ``PdfFileReader.objects()`` second part (dealing with XStream objects)
    is invoked and implicitly tested.
    """
    for filename in ("crazyones.pdf", ):
        dumpPath = join(self.localDataRoot, filename)
        reader = PdfFileReader(join(TEST_DATA_ROOT, filename))

        # Expected entries: three-integer tuples as explained by Table 18
        expItems = []
        with open(dumpPath, "r") as instream:
            for line in instream:
                # Only non-blank, non-comment lines carry an entry
                if line and not line.isspace() \
                        and not line.startswith("%"):
                    this_type, field2, field3 = map(int, line.split())
                    expItems.append((this_type, field2, field3))

        # Entries the reader actually holds for each XStream reference
        actualItems = []
        for item in reader.objects(PdfFileReader.R_XSTREAM, True):
            entry = reader._xrefStm[item.idnum]
            entryType = entry[0]

            if entryType in {0, 1}:
                self.assertEqual(entry[2], item.generation)
            elif entryType == 2:
                self.assertEqual(item.generation, 0)

            actualItems.append(entry)

        reader.close()
        actualItems.sort()
        expItems.sort()

        self.assertListEqual(
            expItems,
            actualItems,
            "Didn't correctly read the Cross-Reference Stream",
        )
def testIsObjectFree(self):
    """
    Tests the ``PdfFileReader.isObjectFree()`` method.

    For every XRef Table reference of each sample file, the value returned
    by ``isObjectFree()`` is compared with the one parsed from the file's
    XRef Table dump.
    """
    # TO-DO Find PDF files that feature free-entry lists. We are checking
    # isObjectFree() only against used items.
    for filename in (
            "jpeg.pdf",
            "Seige_of_Vicksburg_Sample_OCR.pdf",
            "SF424_page2.pdf"):
        reader = PdfFileReader(join(TEST_DATA_ROOT, filename))
        expItems = self._parseXRefTable(
            join(self.localDataRoot, filename), (0, 1, 3))

        # This is where isObjectFree() gets invoked
        actualItems = [
            (ref.idnum, ref.generation, reader.isObjectFree(ref))
            for ref in reader.objects(PdfFileReader.R_XTABLE, True)
        ]
        reader.close()

        self.assertListEqual(sorted(expItems), sorted(actualItems))
def testAddAttachment(self):
    """
    Tests the addAttachment function for attaching a single file.

    Since the Names array in the EmbeddedFiles dictionary contains both the
    name (string) and indirect object (dictionary) for each file, we have
    to check for two entries per attached file.
    """
    # mkstemp() returns an already-open OS-level descriptor along with the
    # path; only the path is needed, so release the descriptor right away
    # instead of leaking it for the lifetime of the test run.
    fd, testfile = tempfile.mkstemp()
    os.close(fd)

    try:
        # Make PDF with attachment
        with PdfFileReader(join(TEST_DATA_ROOT, "jpeg.pdf")) as reader:
            with PdfFileWriter(testfile) as writer:
                writer.appendPagesFromReader(reader)

                attachment_path = join(
                    TEST_DATA_ROOT, "attachment_small.png")
                with open(attachment_path, "rb") as attachment_stream:
                    read_data = attachment_stream.read()

                writer.addAttachment("attachment_small.png", read_data)
                writer.write()

        # Check for attachment entries
        with PdfFileReader(testfile) as pdf:
            # For caching _cachedObjects data
            pdf.numPages  # pylint: disable=pointless-statement
            for _k, v in pdf._cachedObjects.items():
                if "/Type" in v and v["/Type"] == "/Catalog":
                    self.assertIsNotNone(v["/Names"]["/EmbeddedFiles"])
                    real = len(v["/Names"]["/EmbeddedFiles"]["/Names"])
                    self.assertEqual(2, real)
    finally:
        os.remove(testfile)
def testContextManager(self):
    """
    Tests the context manager implementation (the ``with <expr> as
    identifier`` feature) of ``PdfFileReader``: the reader must report
    itself open inside the ``with`` body and closed once the body is left.
    """
    for filename in (
            "jpeg.pdf",
            "Seige_of_Vicksburg_Sample_OCR.pdf",
            "SF424_page2.pdf"):
        with PdfFileReader(join(TEST_DATA_ROOT, filename)) as reader:
            # Test assertions not strictly related to the whole test case
            self.assertEqual(filename, basename(reader.filepath))
            self.assertFalse(reader.isClosed)

        self.assertTrue(reader.isClosed)
def main():
    """
    Demonstrates a few page-level features on a user-supplied PDF.

    The input path is taken from ``argv[1]`` and, optionally, the output
    path from ``argv[2]``. The input must have at least five pages; they
    are copied to the output unchanged, rotated, watermarked and cropped
    respectively, then a print-on-open script and a password are added.

    Prints the usage text and exits when a help flag is present or no
    input path is given; exits with status 1 when the input has too few
    pages.

    Both the input and the watermark files are opened through ``with`` so
    their handles are released on every exit path. They are kept open
    until the writer has been written and closed.
    """
    pagesRequired = 5
    output = "PyPDF-Features-Output.pdf"

    if set(argv) & FLAG_HELP:
        print(USAGE)
        exit(0)
    elif len(argv) < 2:
        print(USAGE)
        exit(1)
    else:
        inputpath = argv[1].strip()
        filename = basename(inputpath)

        if len(argv) > 2:
            output = argv[2].strip()

    # We can instantiate a PdfFileReader/Writer by giving in a stream object
    # or a path string
    with open(inputpath, "rb") as infile:
        reader = PdfFileReader(infile)
        writer = PdfFileWriter(output)

        # Check that the PDF file has the required number of pages
        if reader.numPages < pagesRequired:
            print(
                "We require a document with %d pages at least, %s has %d"
                % (pagesRequired, filename, reader.numPages),
                file=stderr,
            )
            exit(1)
        else:
            print("'%s' has %d pages... OK" % (filename, reader.numPages))

        # Add page 1 from reader to output document, unchanged
        writer.addPage(reader.getPage(0))

        # Add page 2 from reader, but rotated clockwise 90 degrees
        writer.addPage(reader.getPage(1).rotateClockwise(90))

        # Add page 3 from reader, rotated the other way:
        writer.addPage(reader.getPage(2).rotateCounterClockwise(90))
        # Alt.: writer.addPage(reader.getPage(2).rotateClockwise(270))

        # Add page 4 from reader, but first add a watermark from another PDF:
        page4 = reader.getPage(3)
        watermarkpath = join(SAMPLE_PDF_ROOT, "AutoCad_Diagram.pdf")
        with open(watermarkpath, "rb") as watermarkfile:
            watermark = PdfFileReader(watermarkfile)
            page4.mergePage(watermark.getPage(0))
            writer.addPage(page4)

            # Add page 5 from reader, but crop it to half size:
            page5 = reader.getPage(4)
            page5.mediaBox.upperRight = (
                page5.mediaBox.getUpperRight_x() / 2,
                page5.mediaBox.getUpperRight_y() / 2,
            )
            writer.addPage(page5)

            # Add some Javascript to launch the print window on opening this
            # PDF. The password dialog may prevent the print dialog from
            # being shown.
            # Comment the encrypted lines, if that's the case, to try this out
            writer.addJS(
                "this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

            # Encrypt your new PDF and add a password
            password = "******"
            writer.encrypt(password)

            # Finally, write the resulting PDF document to ``output``
            writer.write()

            print("Output successfully written to", output)

            reader.close()
            writer.close()
def setUp(self):
    """
    Prepares ``self.writer``: a ``PdfFileWriter`` over an empty in-memory
    buffer, pre-filled with the pages of ``crazyones.pdf``.
    """
    source = PdfFileReader(join(TEST_DATA_ROOT, 'crazyones.pdf'))
    sink = BytesIO(b"")

    self.writer = PdfFileWriter(sink)
    self.writer.appendPagesFromReader(source)
def setUp(self):
    """
    Prepares ``self.pdfFileWriter``: a fresh ``PdfFileWriter`` pre-filled
    with the pages of ``crazyones.pdf``.
    """
    source = PdfFileReader(join(TEST_DATA_ROOT, 'crazyones.pdf'))

    self.pdfFileWriter = PdfFileWriter()
    self.pdfFileWriter.appendPagesFromReader(source)
def setUp(self):
    """
    Prepares ``self.writer``: a ``PdfFileWriter`` over an empty in-memory
    buffer, pre-filled with the pages of ``crazyones.pdf``.
    """
    source = PdfFileReader(join(TEST_DATA_ROOT, "crazyones.pdf"))
    sink = BytesIO(b"")

    self.writer = PdfFileWriter(sink)
    self.writer.appendPagesFromReader(source)