def testFileLoad(self):
    """
    Load ``crazyones.pdf`` and verify the text extracted from its first page.

    The text returned by ``extractText()`` has its newlines removed and is
    UTF-8 encoded, then compared against the raw bytes of the
    ``crazyones.txt`` fixture. Expected outcome: the file loads and both
    byte strings are equal.
    """
    pdfPath = join(TEST_DATA_ROOT, 'crazyones.pdf')
    expectedPath = join(self.localDataRoot, 'crazyones.txt')

    with open(pdfPath, 'rb') as pdfStream:
        reader = PdfFileReader(pdfStream)
        firstPage = reader.getPage(0)

        # Reference text, read as bytes so it compares with the encoded
        # extraction below
        with open(expectedPath, 'rb') as expectedStream:
            expected = expectedStream.read()

        extracted = firstPage.extractText().replace('\n', '').encode('utf-8')

        failureMsg = (
            'PDF extracted text differs from expected value.'
            '\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
            % (expected, extracted)
        )
        self.assertEqual(expected, extracted, msg=failureMsg)

        reader.close()
def testJpegImage(self):
    """
    Load ``jpeg.pdf`` and verify the data of the image it embeds.

    The ``/Im4`` entry of the first page's ``/XObject`` resources is
    hex-encoded and compared against the contents of the ``jpeg.txt``
    fixture. Expected outcome: the file loads and the image matches.
    """
    with open(join(TEST_DATA_ROOT, 'jpeg.pdf'), 'rb') as pdfStream:
        reader = PdfFileReader(pdfStream)

        # Reference hex dump of the image
        with open(join(self.localDataRoot, 'jpeg.txt'), 'r') as hexStream:
            expectedHex = hexStream.read()

        resources = reader.getPage(0)['/Resources']
        images = resources['/XObject'].getObject()
        actualHex = binascii.hexlify(images['/Im4'].getData()).decode()

        failureMsg = (
            'PDF extracted image differs from expected value.'
            '\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
            % (expectedHex, actualHex)
        )
        self.assertEqual(actualHex, expectedHex, msg=failureMsg)

        reader.close()
def testXRefTableObjects(self):
    """
    Ensures that after ``PdfFileReader._parsePdfFile()`` all the indirect
    references from the XRef-Table *only* have been loaded as expected.
    Objects from the free entries list are included as well in the test.

    This case also exercises the part of ``PdfFileReader.objects()`` that
    generates the Cross-Reference Table entries.
    """
    self.maxDiff = None
    fixtureNames = (
        "jpeg.pdf",
        "Seige_of_Vicksburg_Sample_OCR.pdf",
        "SF424_page2.pdf",
    )

    for filename in fixtureNames:
        pdfPath = join(TEST_DATA_ROOT, filename)
        tablePath = join(self.localDataRoot, filename)
        reader = PdfFileReader(pdfPath)

        # (id, generation, first field of the reader's XRef-Table entry)
        # for every reference the reader yields from the XRef Table
        found = [
            (
                ref.idnum,
                ref.generation,
                reader._xrefTable[ref.generation][ref.idnum][0],
            )
            for ref in reader.objects(PdfFileReader.R_XTABLE, True)
        ]
        reader.close()

        # Entries known to belong to the file, read from the fixture that
        # shares its name under the local data root
        expected = list(self._parseXRefTable(tablePath, (0, 1, 2)))

        expected.sort()
        found.sort()

        self.assertListEqual(
            expected, found, "Differences found in " + filename
        )
def testIsObjectFree(self):
    """
    Tests the ``PdfFileReader.isObjectFree()`` method.

    For each input file, every reference yielded from the XRef Table is
    paired with the value ``isObjectFree()`` returns for it, and the
    resulting list is compared with the entries parsed from the fixture.
    """
    # TO-DO Find PDF files that feature free-entry lists. We are checking
    # isObjectFree() only against used items.
    fixtureNames = (
        "jpeg.pdf",
        "Seige_of_Vicksburg_Sample_OCR.pdf",
        "SF424_page2.pdf",
    )

    for filename in fixtureNames:
        tablePath = join(self.localDataRoot, filename)
        reader = PdfFileReader(join(TEST_DATA_ROOT, filename))
        expected = self._parseXRefTable(tablePath, (0, 1, 3))

        # isObjectFree() is the call under test
        found = [
            (ref.idnum, ref.generation, reader.isObjectFree(ref))
            for ref in reader.objects(PdfFileReader.R_XTABLE, True)
        ]
        reader.close()

        self.assertListEqual(sorted(expected), sorted(found))
def testObjectIds(self):
    """
    Tests ``ObjectStream.objectIds``.

    For each known object stream (identified by file name, object id and
    generation number) the object is fetched through
    ``PdfFileReader.getObject()``, checked to be an ``ObjectStream``, and
    its ``objectIds`` are compared, in order, with the expected ids.
    """
    expResults = (
        (8, 3, 10, 2, 1, 11, 13, 15, 4, 19, 5, 20, 6, 21, 17),
        (
            644, 642, 646, 647, 648, 122, 119, 120, 121, 124, 179, 232, 327,
            467, 478, 519, 568, 573, 580, 586, 592, 598, 603, 611, 616, 623,
            629, 634,
        ),
    )
    # Files we know to have Object Streams within
    inputData = (
        # (filename, id, generation number)
        ("crazyones.pdf", 9, 0),
        ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", 645, 0),
    )

    for expectedIds, (filename, idnum, generation) in zip(
        expResults, inputData
    ):
        # Use the same data-root constant as the sibling test cases, which
        # load these very files from TEST_DATA_ROOT.
        filepath = join(TEST_DATA_ROOT, filename)
        r = PdfFileReader(filepath)
        ref = IndirectObject(idnum, generation, r)
        objStm = r.getObject(ref)
        r.close()

        self.assertIsInstance(objStm, ObjectStream)
        self.assertTupleEqual(tuple(expectedIds), tuple(objStm.objectIds))
def testReadXRefStreamCompressedObjects(self):
    """
    Targets the same objects as ``testXRefStreamObjects()``, but instead of
    ensuring an identity between the list of items read and the one
    expected, it verifies that their *contents* are identical.

    This method does **not** test ``PdfFileReader.objects()`` as two of the
    previous test cases did.
    """
    self.maxDiff = None
    fixtureNames = ("crazyones.pdf", )

    def byObjectId(pair):
        # Both lists hold (object ID, object) tuples; order them by ID
        return pair[0]

    for filename in fixtureNames:
        fixturePath = join(self.localDataRoot, filename)
        reader = PdfFileReader(join(TEST_DATA_ROOT, filename))
        expected = []

        with open(fixturePath, "rb") as fixture:
            for rawLine in fixture:
                # Skip blank lines and lines starting with "%"
                if not rawLine.strip() or rawLine.startswith(b"%"):
                    continue

                rawId, rawOffset, rawObject = rawLine.split(b" ", 2)
                # The offset is converted (so it must be numeric) but is
                # not used afterwards
                objectId, _offset = int(rawId), int(rawOffset)

                with BytesIO(rawObject) as objectStream:
                    parsed = readObject(objectStream, reader)

                expected.append((objectId, parsed))

        # We deal exclusively with compressed objects (type 2 in Table 18
        # of the ISO 32000 reference, 2008) whose generation number is 0
        found = [
            (itemId, IndirectObject(itemId, 0, reader).getObject())
            for itemId, entry in reader._xrefStm.items()
            if entry[0] == 2
        ]
        reader.close()

        expected.sort(key=byObjectId)
        found.sort(key=byObjectId)

        self.assertListEqual(expected, found)
def testXTableAgainstXStream(self):
    """
    In section 7.5.8.4 of ISO 32000, "Compatibility with Applications That
    Do Not Support Compressed Reference Streams", the standard describes a
    means of crafting PDF files designed for versions 1.5+ that can be
    opened nevertheless by readers that support older versions.

    This test case verifies that all the items hidden by the XRef Table in
    non-conforming readers are *all and exactly* loaded into the XRef
    Stream by readers that support PDF 1.5+.
    """
    self.maxDiff = None
    # TO-DO Possibly add a few other files to this test case
    fixtureNames = ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", )

    for filename in fixtureNames:
        tablePath = join(self.localDataRoot, filename)
        # Map each entry's first field (the item ID) to its other fields
        expectedById = {}
        for entry in self._parseXRefTable(tablePath, (0, 2, 3)):
            expectedById[entry[0]] = entry[1:]

        reader = PdfFileReader(join(TEST_DATA_ROOT, filename))
        streamRefs = list(reader.objects(PdfFileReader.R_XSTREAM, True))
        reader.close()

        streamRefs.sort(key=lambda ref: ref.idnum)
        expectedIds = sorted(expectedById.keys())
        foundIds = [ref.idnum for ref in streamRefs]

        self.assertListEqual(
            expectedIds, foundIds, "Lists of item IDs are not identical"
        )

        for expectedId, ref in zip(expectedIds, streamRefs):
            self.assertEqual(
                expectedId, ref.idnum, "Items ID does not correspond"
            )

            # If an item is in use in the XRef Stream, ensure then that it
            # is marked free in the XRef Table.
            if reader._xrefStm[ref.idnum][0] == 2:
                self.assertTrue(
                    expectedById[expectedId][-1],
                    "Item %d should be hid by the XRef Table, but it was "
                    "not." % expectedId,
                )
def testXRefStreamObjects(self):
    """
    Like ``PdfReaderTestCases.testXRefTableObjects()``, except that it
    tests objects referenced by the Cross-Reference Stream.

    The second part of ``PdfFileReader.objects()`` (dealing with XStream
    objects) is invoked and implicitly tested.
    """
    fixtureNames = ("crazyones.pdf", )

    for filename in fixtureNames:
        fixturePath = join(self.localDataRoot, filename)
        reader = PdfFileReader(join(TEST_DATA_ROOT, filename))
        # Two lists of three-field entries, as explained by Table 18
        expected = []
        found = []

        with open(fixturePath, "r") as fixture:
            for textLine in fixture:
                # Skip blank lines and lines starting with "%"
                if not textLine.strip() or textLine.startswith("%"):
                    continue

                entryType, second, third = map(int, textLine.split())
                expected.append((entryType, second, third))

        for ref in reader.objects(PdfFileReader.R_XSTREAM, True):
            entry = reader._xrefStm[ref.idnum]
            entryType = entry[0]

            if entryType == 2:
                # Type 2 entries must be reported with generation 0
                self.assertEqual(ref.generation, 0)
            elif entryType in {0, 1}:
                # For types 0 and 1 the third field must equal the
                # generation of the yielded reference
                self.assertEqual(entry[2], ref.generation)

            found.append(entry)

        reader.close()
        found.sort()
        expected.sort()

        self.assertListEqual(
            expected,
            found,
            "Didn't correctly read the Cross-Reference Stream",
        )
def main():
    """
    Demonstrate basic page-level features on a PDF given on the command line.

    Reads the input path from ``argv[1]`` and, optionally, the output path
    from ``argv[2]`` (default ``PyPDF-Features-Output.pdf``). The input must
    have at least five pages. The output document receives: page 1
    unchanged, page 2 rotated 90 degrees clockwise, page 3 rotated 90
    degrees counter-clockwise, page 4 merged with a watermark page, and
    page 5 with the upper-right corner of its media box halved. A
    print-on-open Javascript action is added and the result is encrypted.

    Exits with status 0 after printing the usage text when a help flag is
    present, and with status 1 when no input path is given or the input has
    too few pages.
    """
    pagesRequired = 5
    output = "PyPDF-Features-Output.pdf"

    if set(argv) & FLAG_HELP:
        print(USAGE)
        exit(0)
    elif len(argv) < 2:
        print(USAGE)
        exit(1)
    else:
        inputpath = argv[1].strip()
        filename = basename(inputpath)

        if len(argv) > 2:
            output = argv[2].strip()

    # The input file is opened in a ``with`` block so that its handle is
    # released on every path, including the early ``exit(1)`` below.
    with open(inputpath, "rb") as inputStream:
        # We can instantiate a PdfFileReader/Writer by giving in a stream
        # object or a path string
        reader = PdfFileReader(inputStream)
        writer = PdfFileWriter(output)

        # Check that the PDF file has the required number of pages
        if reader.numPages < pagesRequired:
            print(
                "We require a document with %d pages at least, %s has %d"
                % (pagesRequired, filename, reader.numPages),
                file=stderr,
            )
            exit(1)
        else:
            print("'%s' has %d pages... OK" % (filename, reader.numPages))

        # Add page 1 from reader to output document, unchanged
        writer.addPage(reader.getPage(0))

        # Add page 2 from reader, but rotated clockwise 90 degrees
        writer.addPage(reader.getPage(1).rotateClockwise(90))

        # Add page 3 from reader, rotated the other way:
        writer.addPage(reader.getPage(2).rotateCounterClockwise(90))
        # Alt.: writer.addPage(reader.getPage(2).rotateClockwise(270))

        # Add page 4 from reader, but first add a watermark from another
        # PDF. The watermark file stays open until the output has been
        # written, in case the merged page still refers to it.
        page4 = reader.getPage(3)
        watermarkPath = join(SAMPLE_PDF_ROOT, "AutoCad_Diagram.pdf")

        with open(watermarkPath, "rb") as watermarkStream:
            watermark = PdfFileReader(watermarkStream)
            page4.mergePage(watermark.getPage(0))
            writer.addPage(page4)

            # Add page 5 from reader, but crop it to half size:
            page5 = reader.getPage(4)
            page5.mediaBox.upperRight = (
                page5.mediaBox.getUpperRight_x() / 2,
                page5.mediaBox.getUpperRight_y() / 2,
            )
            writer.addPage(page5)

            # Add some Javascript to launch the print window on opening
            # this PDF. The password dialog may prevent the print dialog
            # from being shown. Comment out the encryption lines, if that's
            # the case, to try this out
            writer.addJS(
                "this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
            )

            # Encrypt your new PDF and add a password
            password = "******"
            writer.encrypt(password)

            # Finally, write the resulting PDF document to ``output``
            writer.write()
            print("Output successfully written to", output)

            reader.close()
            writer.close()
            # The watermark reader was previously never closed
            watermark.close()