def testIsObjectFree(self):
    """
    Tests the ``PdfFileReader.isObjectFree()`` method.

    For every sample file, each reference yielded by
    ``PdfFileReader.objects(PdfFileReader.R_XTABLE, True)`` is turned into
    an ``(idnum, generation, isObjectFree(ref))`` tuple. The sorted list of
    these tuples must equal the sorted output of
    ``self._parseXRefTable(path, (0, 1, 3))`` for the file of the same name
    found under ``self.localDataRoot``.
    """
    # TO-DO Find PDF files that feature free-entry lists. We are checking
    # isObjectFree() only against used items.
    inputFiles = (
        "jpeg.pdf",
        "Seige_of_Vicksburg_Sample_OCR.pdf",
        "SF424_page2.pdf",
    )

    for filename in inputFiles:
        filepath = join(self.localDataRoot, filename)
        r = PdfFileReader(join(TEST_DATA_ROOT, filename))

        # Release the reader even when parsing or isObjectFree() raises, so
        # that a failure on one file does not leak its handle.
        try:
            # NOTE(review): columns (0, 1, 3) presumably select id,
            # generation and the free/in-use flag -- confirm against
            # _parseXRefTable().
            expItems = sorted(self._parseXRefTable(filepath, (0, 1, 3)))
            actualItems = sorted(
                # This is where isObjectFree() gets invoked
                (ref.idnum, ref.generation, r.isObjectFree(ref))
                for ref in r.objects(PdfFileReader.R_XTABLE, True)
            )
        finally:
            r.close()

        self.assertListEqual(
            expItems, actualItems, "Differences found in " + filename
        )
def testXRefTableObjects(self):
    """
    Ensures that after ``PdfFileReader._parsePdfFile()`` all the indirect
    references from the XRef-Table *only* have been loaded as expected.
    Objects from the free entries list are included as well in the test.

    This case tests the part of ``PdfFileReader.objects()`` responsible for
    generating the Cross-Reference Table entries too.

    Both sides of the comparison are sorted lists of
    ``(id, generation, byte offset)`` tuples: the actual ones are read from
    ``r._xrefTable[generation][id][0]``, the expected ones come from
    ``self._parseXRefTable(path, (0, 1, 2))``.
    """
    self.maxDiff = None
    inputFiles = (
        "jpeg.pdf",
        "Seige_of_Vicksburg_Sample_OCR.pdf",
        "SF424_page2.pdf",
    )

    for filename in inputFiles:
        filepath = join(TEST_DATA_ROOT, filename)
        xtablepath = join(self.localDataRoot, filename)
        r = PdfFileReader(filepath)

        # Close the reader even if an XRef lookup fails midway.
        try:
            actualItems = sorted(
                (
                    ref.idnum,
                    ref.generation,
                    r._xrefTable[ref.generation][ref.idnum][0],
                )
                for ref in r.objects(PdfFileReader.R_XTABLE, True)
            )
        finally:
            r.close()

        # We artificially read the XRef Table entries that we know belong
        # to filepath, and store them into expItems.
        expItems = sorted(self._parseXRefTable(xtablepath, (0, 1, 2)))

        self.assertListEqual(
            expItems, actualItems, "Differences found in " + filename
        )
def testXTableAgainstXStream(self):
    """
    In section 7.5.8.4 of ISO 32000, "Compatibility with Applications That
    Do Not Support Compressed Reference Streams", the standard describes a
    means of crafting PDF files designed for versions 1.5+ that can be
    opened nevertheless by readers that support older versions.

    This test case verifies that all the items hidden by the XRef Table in
    non-conforming readers are *all and exactly* loaded into the XRef
    Stream by readers that support PDF 1.5+.

    The expected data maps each item ID (column 0 of
    ``self._parseXRefTable(path, (0, 2, 3))``) to the remaining two
    columns; the last of them is the flag that must be truthy for every
    item whose XRef Stream entry is of type 2.
    """
    self.maxDiff = None
    # TO-DO Possibly add a few other files to this test case
    inputFiles = ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", )

    for filename in inputFiles:
        filepath = join(self.localDataRoot, filename)
        expItems = {
            e[0]: e[1:]
            for e in self._parseXRefTable(filepath, (0, 2, 3))
        }
        r = PdfFileReader(join(TEST_DATA_ROOT, filename))

        # The reader is still queried (r._xrefStm) inside the checks below,
        # so it is closed only once they are done, and always, even when an
        # assertion fails.
        try:
            actualItems = sorted(
                r.objects(PdfFileReader.R_XSTREAM, True),
                key=lambda ref: ref.idnum,
            )
            expKeys = sorted(expItems)
            actualKeys = [ref.idnum for ref in actualItems]

            self.assertListEqual(
                expKeys, actualKeys, "Lists of item IDs are not identical"
            )

            for e, a in zip(expKeys, actualItems):
                self.assertEqual(e, a.idnum, "Items ID does not correspond")

                # If an item is in use in the XRef Stream, ensure then that
                # it is marked free in the XRef Table.
                if r._xrefStm[a.idnum][0] == 2:
                    self.assertTrue(
                        expItems[e][-1],
                        "Item %d should be hid by the XRef Table, but it was "
                        "not." % e,
                    )
        finally:
            r.close()
def testXRefStreamObjects(self):
    """
    Like ``PdfReaderTestCases.testXRefTableObjects()``, except that it
    tests objects referenced by the Cross-Reference Stream.
    ``PdfFileReader.objects()`` second part (dealing with XStream objects)
    is invoked and implicitly tested.

    The expected entries are read from a text file under
    ``self.localDataRoot``: blank lines and lines starting with ``%`` are
    skipped, every other line must hold exactly three integers. They are
    compared, after sorting, with the ``r._xrefStm`` entries of the
    references yielded by ``objects(PdfFileReader.R_XSTREAM, True)``.
    """
    inputFiles = ("crazyones.pdf", )

    for filename in inputFiles:
        filepath = join(self.localDataRoot, filename)
        r = PdfFileReader(join(TEST_DATA_ROOT, filename))

        # Assertions are made while the reader is open; make sure it gets
        # closed even when one of them (or reading the fixture) fails.
        try:
            # Two lists of tuples as explained by Table 18
            expItems = []
            actualItems = []

            with open(filepath, "r") as instream:
                for line in instream:
                    if line.isspace() or line.startswith("%"):
                        continue

                    # Unpacking enforces exactly three fields per line.
                    this_type, field2, field3 = (
                        int(f) for f in line.split()
                    )
                    expItems.append((this_type, field2, field3))

            for item in r.objects(PdfFileReader.R_XSTREAM, True):
                priv8Item = r._xrefStm[item.idnum]

                if priv8Item[0] in {0, 1}:
                    # Types 0 and 1 carry the generation in the third field
                    self.assertEqual(priv8Item[2], item.generation)
                elif priv8Item[0] == 2:
                    # Type 2 entries are expected to have generation 0
                    self.assertEqual(item.generation, 0)

                actualItems.append(priv8Item)
        finally:
            r.close()

        self.assertListEqual(
            sorted(expItems),
            sorted(actualItems),
            "Didn't correctly read the Cross-Reference Stream",
        )