def testFindEqclassesCircularReferences(self):
    # Input: two /Pages nodes (4 and 5) and two /Page nodes (9 and 10) that
    # reference each other through /Parent and /Kids. Objects 1, 2 and 3 are
    # referenced but never defined. The expected result keeps one /Pages
    # object and one /Page object, renumbered to 1 and 2, with the undefined
    # references written as `null'.
    pdf_data = pdfsizeopt.PdfData()
    # The Rs are needed in the trailer, otherwise objects would be discarded.
    pdf_data.trailer = pdfsizeopt.PdfObj(
        '0 0 obj<<4 0 R 5 0 R 9 0 R 10 0 R>>endobj')
    pdf_data.objs[4] = pdfsizeopt.PdfObj(
        '0 0 obj<</Parent 1 0 R/Type/Pages/Kids[9 0 R]/Count 1>>endobj')
    pdf_data.objs[5] = pdfsizeopt.PdfObj(
        '0 0 obj<</Parent 1 0 R/Type/Pages/Kids[10 0 R]/Count 1>>endobj')
    pdf_data.objs[9] = pdfsizeopt.PdfObj(
        '0 0 obj<</Type/Page/MediaBox[0 0 419 534]/CropBox[0 0 419 534]'
        '/Parent 4 0 R/Resources<</XObject<</S 2 0 R>>/ProcSet[/PDF/ImageB]>>'
        '/Contents 3 0 R>>endobj')
    pdf_data.objs[10] = pdfsizeopt.PdfObj(
        '10 0 obj<</Type/Page/MediaBox[0 0 419 534]/CropBox[0 0 419 534]'
        '/Parent 5 0 R/Resources<</XObject<</S 2 0 R>>/ProcSet[/PDF/ImageB]>>'
        '/Contents 3 0 R>>endobj')
    # The trailer is passed in as a regular entry of the object dict and is
    # removed from the input again once FindEqclasses has returned.
    pdf_data.objs['trailer'] = pdf_data.trailer
    eq_objs = pdfsizeopt.PdfData.FindEqclasses(
        pdf_data.objs, do_remove_unused=True, do_renumber=True)
    del pdf_data.objs['trailer']

    # Reduce each returned object to a (head, stream) pair for comparison.
    actual = dict(
        (obj_num, (obj.head, obj.stream))
        for obj_num, obj in eq_objs.items())
    expected = {
        1: ('<</Parent null/Type/Pages/Kids[2 0 R]/Count 1>>', None),
        2: ('<</Type/Page/MediaBox[0 0 419 534]/CropBox[0 0 419 534]'
            '/Parent 1 0 R/Resources<</XObject<</S null>>'
            '/ProcSet[/PDF/ImageB]>>/Contents null>>', None),
        'trailer': ('<<1 0 R 1 0 R 2 0 R 2 0 R>>', None),
    }
    self.assertEqual(expected, actual)
def testFindEqclassesAllEquivalentAndUndefined(self):
    # Objects 1..4 point at each other pairwise through /P, and each has a
    # /U reference to an object (6..9) that is never defined. The expected
    # result is a single object 1 whose /P points to itself and whose /U is
    # `null'.
    pdf_data = pdfsizeopt.PdfData()
    pdf_data.trailer = pdfsizeopt.PdfObj('0 0 obj<<>>endobj')
    obj_sources = (
        (1, '0 0 obj<</S(q)/P 2 0 R /U 6 0 R>>endobj'),
        (2, '0 0 obj<</S(q)/P 1 0 R /U 7 0 R>>endobj'),
        (3, '0 0 obj<</S(q)/P 4 0 R /U 8 0 R>>endobj'),
        (4, '0 0 obj<</S(q)/P 3 0 R /U 9 0 R>>endobj'),
    )
    for obj_num, source in obj_sources:
        pdf_data.objs[obj_num] = pdfsizeopt.PdfObj(source)

    eq_objs = pdfsizeopt.PdfData.FindEqclasses(pdf_data.objs)

    # Reduce each returned object to a (head, stream) pair for comparison.
    actual = dict(
        (obj_num, (obj.head, obj.stream))
        for obj_num, obj in eq_objs.items())
    self.assertEqual({1: ('<</S(q)/P 1 0 R/U null>>', None)}, actual)
def testFindEqclassesAllEquivalent(self):
    # Objects 5/6 and 3/4 form two pairs that reference each other through
    # /P; apart from the reference targets and some whitespace, all four
    # sources are the same. The expected result is a single object 3 whose
    # /P points to itself.
    pdf_data = pdfsizeopt.PdfData()
    pdf_data.trailer = pdfsizeopt.PdfObj('0 0 obj<<>>endobj')
    # Inserted in the order 5, 6, 3, 4.
    obj_sources = (
        (5, '0 0 obj<</S(q)/P 6 0 R>>endobj'),
        (6, '0 0 obj<</S(q)/P 5 0 R >>endobj'),
        (3, '0 0 obj<</S(q)/P 4 0 R >>endobj'),
        (4, '0 0 obj<</S(q)/P 3 0 R >>endobj'),
    )
    for obj_num, source in obj_sources:
        pdf_data.objs[obj_num] = pdfsizeopt.PdfObj(source)

    eq_objs = pdfsizeopt.PdfData.FindEqclasses(pdf_data.objs)

    # Reduce each returned object to a (head, stream) pair for comparison.
    actual = dict(
        (obj_num, (obj.head, obj.stream))
        for obj_num, obj in eq_objs.items())
    self.assertEqual({3: ('<</S(q)/P 3 0 R>>', None)}, actual)
def testFindEqclassesString(self):
    # A single object holding literal and hex strings, referenced from the
    # trailer. In the expected output the object is renumbered to 1 and the
    # hex strings `<>' and `<3a3A4>' appear as the literal strings `()' and
    # `(::@)'.
    pdf_data = pdfsizeopt.PdfData()
    pdf_data.trailer = pdfsizeopt.PdfObj(
        '0 0 obj<</A[3 0 R]>>endobj')
    pdf_data.objs[3] = pdfsizeopt.PdfObj(
        '0 0 obj<</A()/B<>/C(:)/D<3a3A4>>>endobj')
    # The trailer is passed in as a regular entry of the object dict.
    pdf_data.objs['trailer'] = pdf_data.trailer
    eq_objs = pdfsizeopt.PdfData.FindEqclasses(
        pdf_data.objs, do_remove_unused=True, do_renumber=True)

    # Reduce each returned object to a (head, stream) pair for comparison.
    actual = dict(
        (obj_num, (obj.head, obj.stream))
        for obj_num, obj in eq_objs.items())
    expected = {
        'trailer': ('<</A[1 0 R]>>', None),
        1: ('<</A()/B()/C(:)/D(::@)>>', None),
    }
    self.assertEqual(expected, actual)
def testFindEqclassesTwoGroupsByOrder(self):
    # Objects 1 and 3 list /S before /P; objects 2 and 4 list /P before /S
    # (object 4 also spells the string as hex `<71>'). The expected result
    # keeps two objects, 1 and 2, each with its own key order.
    pdf_data = pdfsizeopt.PdfData()
    pdf_data.trailer = pdfsizeopt.PdfObj('0 0 obj<<>>endobj')
    obj_sources = (
        (1, '0 0 obj<</S(q)/P 2 0 R>>endobj'),
        (2, '0 0 obj<</P 1 0 R/S(q)>>endobj'),
        (3, '0 0 obj<</S(q)/P 4 0 R>>endobj'),
        (4, '0 0 obj<</P 3 0 R /S<71>>>endobj'),
    )
    for obj_num, source in obj_sources:
        pdf_data.objs[obj_num] = pdfsizeopt.PdfObj(source)

    eq_objs = pdfsizeopt.PdfData.FindEqclasses(pdf_data.objs)

    # Reduce each returned object to a (head, stream) pair for comparison.
    actual = dict(
        (obj_num, (obj.head, obj.stream))
        for obj_num, obj in eq_objs.items())
    expected = {
        1: ('<</S(q)/P 2 0 R>>', None),
        2: ('<</P 1 0 R/S(q)>>', None),
    }
    self.assertEqual(expected, actual)
def testFindEqclassesAllDifferentBecauseOfStream(self):
    # Same shape as the all-equivalent case, but objects 2 and 4 carry
    # different streams ('foo' and 'fox'). All four objects are expected to
    # come back unmerged, with their original references.
    pdf_data = pdfsizeopt.PdfData()
    pdf_data.trailer = pdfsizeopt.PdfObj('0 0 obj<<>>endobj')
    objs = pdf_data.objs
    objs[1] = pdfsizeopt.PdfObj('0 0 obj<</S(q)/P 2 0 R>>endobj')
    objs[2] = pdfsizeopt.PdfObj('0 0 obj<</S(q)/P 1 0 R >>endobj')
    objs[2].stream = 'foo'
    objs[3] = pdfsizeopt.PdfObj('0 0 obj<</S(q)/P 4 0 R >>endobj')
    objs[4] = pdfsizeopt.PdfObj('0 0 obj<</S(q)/P 3 0 R >>endobj')
    objs[4].stream = 'fox'

    eq_objs = pdfsizeopt.PdfData.FindEqclasses(pdf_data.objs)

    # Reduce each returned object to a (head, stream) pair for comparison.
    actual = dict(
        (obj_num, (obj.head, obj.stream))
        for obj_num, obj in eq_objs.items())
    expected = {
        1: ('<</S(q)/P 2 0 R>>', None),
        2: ('<</S(q)/P 1 0 R>>', 'foo'),
        3: ('<</S(q)/P 4 0 R>>', None),
        4: ('<</S(q)/P 3 0 R>>', 'fox'),
    }
    self.assertEqual(expected, actual)
def testPdfObjGetSet(self):
    # Walks one PdfObj through a sequence of Get()/Set()/head operations and
    # checks the private _head and _cache attributes after each step. The
    # steps build on one another, so their order matters.
    pdf_obj = pdfsizeopt.PdfObj(
        '42 0 obj<</Foo(hi)>>\t\f\rendobj junk stream\r\n')

    # Freshly parsed: _head is set, _cache is not; reading .head keeps it so.
    self.assertEqual('<</Foo(hi)>>', pdf_obj._head)
    self.assertEqual(None, pdf_obj._cache)
    self.assertEqual('<</Foo(hi)>>', pdf_obj.head)
    self.assertEqual(None, pdf_obj._cache)

    # Get('Bar') is expected to leave _cache unset, while Get('Fo') is
    # expected to leave it populated.
    self.assertEqual(None, pdf_obj.Get('Bar'))
    self.assertEqual(None, pdf_obj._cache)
    self.assertEqual(None, pdf_obj.Get('Fo'))
    self.assertEqual({'Foo': '<6869>'}, pdf_obj._cache)
    self.assertEqual('<6869>', pdf_obj.Get('Foo'))
    self.assertEqual('<</Foo(hi)>>', pdf_obj._head)

    # Setting the same value again, in hex or literal spelling and padded
    # with whitespace, is expected to leave _head untouched.
    pdf_obj.Set('Foo', ' \t<6869>\f \r ')
    self.assertEqual('<6869>', pdf_obj.Get('Foo'))
    self.assertEqual({'Foo': '<6869>'}, pdf_obj._cache)
    self.assertEqual('<</Foo(hi)>>', pdf_obj._head)
    pdf_obj.Set('Foo', ' \t(hi)\f \r ')
    self.assertEqual('<6869>', pdf_obj.Get('Foo'))
    self.assertEqual({'Foo': '<6869>'}, pdf_obj._cache)
    self.assertEqual('<</Foo(hi)>>', pdf_obj._head)  # still valid

    # Setting a different value is expected to reset _head to None.
    pdf_obj.Set('Foo', '(*)')
    self.assertEqual('<2a>', pdf_obj.Get('Foo'))
    self.assertEqual(None, pdf_obj._head)
    self.assertEqual({'Foo': '<2a>'}, pdf_obj._cache)

    # Adding a key: _head stays None until .head is read.
    pdf_obj.Set('Bar', '0042')
    self.assertEqual({'Foo': '<2a>', 'Bar': 42}, pdf_obj._cache)
    self.assertEqual(None, pdf_obj._head)
    self.assertEqual('<</Bar 42/Foo(*)>>', pdf_obj.head)
    self.assertEqual('<</Bar 42/Foo(*)>>', pdf_obj._head)

    # The string 'null' is stored as a value.
    pdf_obj.Set('Bar', 'null')
    self.assertEqual({'Foo': '<2a>', 'Bar': 'null'}, pdf_obj._cache)
    self.assertEqual(None, pdf_obj._head)
    self.assertEqual('<</Bar null/Foo(*)>>', pdf_obj.head)
    self.assertEqual('<</Bar null/Foo(*)>>', pdf_obj._head)

    # None with do_keep_null=True keeps the `null' entry; plain None removes
    # the key.
    pdf_obj.Set('Bar', None, do_keep_null=True)
    self.assertEqual({'Foo': '<2a>', 'Bar': 'null'}, pdf_obj._cache)
    self.assertEqual('<</Bar null/Foo(*)>>', pdf_obj._head)
    pdf_obj.Set('Bar', None)
    self.assertEqual({'Foo': '<2a>'}, pdf_obj._cache)
    self.assertEqual(None, pdf_obj._head)

    # Reading .size is expected to leave _head filled in as well.
    self.assertEqual(len('<</Foo(*)>>') + 40, pdf_obj.size)
    self.assertEqual('<</Foo(*)>>', pdf_obj._head)
    self.assertEqual('<</Foo(*)>>', pdf_obj.head)

    # Assigning an identical head keeps _cache; a different one clears it.
    pdf_obj.head = '<</Foo(*)>>'
    self.assertEqual({'Foo': '<2a>'}, pdf_obj._cache)
    pdf_obj.head = '<</Foo<2a>>>'  # invalidates the cache
    self.assertEqual(None, pdf_obj._cache)
    self.assertEqual(None, pdf_obj.Get('Food'))
    self.assertEqual(None, pdf_obj._cache)
    self.assertEqual('<2a>', pdf_obj.Get('Foo'))
    self.assertEqual({'Foo': '<2a>'}, pdf_obj._cache)
def NewObj(head, stream=None, do_compress=False):
    # Builds a pdfsizeopt.PdfObj from a head string and an optional stream.
    #
    # Args:
    #   head: Head string for the object, or a false value to use the
    #     default ('' without a stream, '<<>>' with one).
    #   stream: None for an object without a stream, otherwise a str.
    #   do_compress: If true, the stream is handed to SetStreamAndCompress();
    #     otherwise it is stored as is and /Length is set to its length.
    # Returns:
    #   The new PdfObj.
    # Raises:
    #   TypeError: If stream is neither None nor a str.
    new_obj = pdfsizeopt.PdfObj(None)

    # No stream: only the head needs to be filled in.
    if stream is None:
        new_obj.head = head or ''
        return new_obj

    if not isinstance(stream, str):
        raise TypeError
    new_obj.head = head or '<<>>'
    if not do_compress:
        new_obj.Set('Length', len(stream))
        new_obj.stream = stream
    else:
        new_obj.SetStreamAndCompress(stream)
    return new_obj
def testFindEqclassesTwoGroupsWithTrailerRenumber(self):
    # The trailer references objects 3..6; objects 10..12 are not referenced
    # from the trailer. With do_remove_unused and do_renumber enabled, the
    # expected result holds the trailer plus two objects numbered 1 and 2,
    # and no entry for 10..12.
    pdf_data = pdfsizeopt.PdfData()
    pdf_data.trailer = pdfsizeopt.PdfObj(
        '0 0 obj<</A[3 0 R 4 0 R 5 0 R 6 0 R 4 0 R]>>endobj')
    # Inserted in the order 5, 6, 3, 4, 10, 11, 12.
    obj_sources = (
        (5, '0 0 obj<</S(q)/P 6 0 R>>endobj'),
        (6, '0 0 obj<</S(q)/Q 5 0 R >>endobj'),
        (3, '0 0 obj<</S(q)/P 4 0 R >>endobj'),
        (4, '0 0 obj<</S(q)/Q 3 0 R >>endobj'),
        (10, '0 0 obj[11 0 R]endobj'),
        (11, '0 0 obj[10 0 R]endobj'),
        (12, '0 0 obj[11 0 R]endobj'),
    )
    for obj_num, source in obj_sources:
        pdf_data.objs[obj_num] = pdfsizeopt.PdfObj(source)
    pdf_data.objs[12].stream = 'blah'
    # The trailer is passed in as a regular entry of the object dict and is
    # removed from the input again once FindEqclasses has returned.
    pdf_data.objs['trailer'] = pdf_data.trailer
    eq_objs = pdfsizeopt.PdfData.FindEqclasses(
        pdf_data.objs, do_remove_unused=True, do_renumber=True)
    del pdf_data.objs['trailer']

    # Reduce each returned object to a (head, stream) pair for comparison.
    actual = dict(
        (obj_num, (obj.head, obj.stream))
        for obj_num, obj in eq_objs.items())
    expected = {
        'trailer': ('<</A[2 0 R 1 0 R 2 0 R 1 0 R 1 0 R]>>', None),
        2: ('<</S(q)/P 1 0 R>>', None),
        1: ('<</S(q)/Q 2 0 R>>', None),
    }
    self.assertEqual(expected, actual)
def testFindEqclassesTwoGroupsWithTrailer(self):
    # Same input shape as the renumbering variant, but FindEqclasses is
    # called with its default flags. The expected result keeps the original
    # numbers 3 and 4 for the trailer-referenced objects, and also keeps
    # objects 10 and 12 (the latter with its stream).
    pdf_data = pdfsizeopt.PdfData()
    pdf_data.trailer = pdfsizeopt.PdfObj(
        '0 0 obj<</A[3 0 R 4 0 R 5 0 R 6 0 R 3 0 R]>>endobj')
    # Inserted in the order 5, 6, 3, 4, 10, 11, 12.
    obj_sources = (
        (5, '0 0 obj<</S(q)/P 6 0 R>>endobj'),
        (6, '0 0 obj<</S(q)/Q 5 0 R >>endobj'),
        (3, '0 0 obj<</S(q)/P 4 0 R >>endobj'),
        (4, '0 0 obj<</S(q)/Q 3 0 R >>endobj'),
        (10, '0 0 obj[11 0 R]endobj'),
        (11, '0 0 obj[10 0 R]endobj'),
        (12, '0 0 obj[11 0 R]endobj'),
    )
    for obj_num, source in obj_sources:
        pdf_data.objs[obj_num] = pdfsizeopt.PdfObj(source)
    pdf_data.objs[12].stream = 'blah'
    # The trailer is passed in as a regular entry of the object dict and is
    # removed from the input again once FindEqclasses has returned.
    pdf_data.objs['trailer'] = pdf_data.trailer
    eq_objs = pdfsizeopt.PdfData.FindEqclasses(pdf_data.objs)
    del pdf_data.objs['trailer']

    # Reduce each returned object to a (head, stream) pair for comparison.
    actual = dict(
        (obj_num, (obj.head, obj.stream))
        for obj_num, obj in eq_objs.items())
    expected = {
        'trailer': ('<</A[3 0 R 4 0 R 3 0 R 4 0 R 3 0 R]>>', None),
        10: ('[10 0 R]', None),
        12: ('[10 0 R]', 'blah'),
        3: ('<</S(q)/P 4 0 R>>', None),
        4: ('<</S(q)/Q 3 0 R>>', None),
    }
    self.assertEqual(expected, actual)
def testPdfObjParse(self):
    # Exercises the PdfObj constructor on a series of object sources and
    # checks the resulting head, stream and reported end offset.

    # /Length selects how many bytes after `stream\r\n' form the stream.
    parsed = pdfsizeopt.PdfObj(
        '42 0 obj<</Length 3>>stream\r\nABC endstream endobj')
    self.assertEqual('<</Length 3>>', parsed.head)
    self.assertEqual('ABC', parsed.stream)
    parsed = pdfsizeopt.PdfObj(
        '42 0 obj<</Length 4>>stream\r\nABC endstream endobj')
    # A /Length of 99 on the same data is expected to raise.
    self.assertRaises(
        pdfsizeopt.PdfTokenParseError,
        pdfsizeopt.PdfObj,
        '42 0 obj<</Length 99>>stream\r\nABC endstream endobj')
    self.assertEqual('<</Length 4>>', parsed.head)
    self.assertEqual('ABC ', parsed.stream)

    # Without a stream section, stream is None.
    parsed = pdfsizeopt.PdfObj(
        '42 0 obj<</Length 4>>endobj')
    self.assertEqual('<</Length 4>>', parsed.head)
    self.assertEqual(None, parsed.stream)

    # A /Length name inside an array value, or an empty string value ahead
    # of the real /Length key, still yields the 3-byte stream.
    parsed = pdfsizeopt.PdfObj(
        '42 0 obj<</T[/Length 99]/Length 3>>stream\r\nABC endstream endobj')
    self.assertEqual('ABC', parsed.stream)
    parsed = pdfsizeopt.PdfObj(
        '42 0 obj<</T()/Length 3>>stream\nABC endstream endobj')
    self.assertEqual('ABC', parsed.stream)

    # src_a has `>>\nendobj\n' inside a string; src_b has `>>endobj' inside
    # a comment.
    src_a = '41 0 obj<</T(>>\nendobj\n)/Length 3>>stream\nABD endstream endobj'
    src_b = '42 0 obj<</T 5%>>endobj\n/Length 3>>stream\nABE endstream endobj'
    end_ofs = []
    parsed = pdfsizeopt.PdfObj(src_a, end_ofs_out=end_ofs)
    self.assertEqual([len(src_a)], end_ofs)
    self.assertEqual('ABD', parsed.stream)
    end_ofs = []
    parsed = pdfsizeopt.PdfObj(src_b + '\r\n\tANYTHING', end_ofs_out=end_ofs)
    self.assertEqual([len(src_b) + 1], end_ofs)

    # Parsing the second of two concatenated objects via start=.
    end_ofs = []
    parsed = pdfsizeopt.PdfObj(
        '%s\n%s' % (src_a, src_b), start=len(src_a) + 1, end_ofs_out=end_ofs)
    self.assertEqual('ABE', parsed.stream)
    self.assertEqual([len(src_a) + 1 + len(src_b)], end_ofs)
    # Exception because start points to '\n', not an `X Y obj'.
    self.assertRaises(
        pdfsizeopt.PdfTokenParseError,
        pdfsizeopt.PdfObj,
        '%s\n%s' % (src_a, src_b),
        start=len(src_a))

    # An object followed directly by the header of the next one.
    src_a = '22 0 obj<</Producer(A)/CreationDate(B)/Creator(C)>>\nendobj '
    src_b = '23 0 obj'
    end_ofs = []
    parsed = pdfsizeopt.PdfObj(src_a + src_b, end_ofs_out=end_ofs)
    self.assertEqual(
        '<</Producer(A)/CreationDate(B)/Creator(C)>>', parsed.head)
    self.assertEqual([len(src_a)], end_ofs)

    parsed = pdfsizeopt.PdfObj(
        '42 0 obj[/Foo%]endobj\n42 43\t]\nendobj')
    # Parses the comment properly, but doesn't replace it with the
    # non-comment version.
    self.assertEqual('[/Foo%]endobj\n42 43\t]', parsed.head)
    parsed = pdfsizeopt.PdfObj(
        '42 0 obj%hello\r \t\f%more\n/Foo%bello\nendobj')
    # Leading comments are removed, but trailing comments aren't.
    self.assertEqual('/Foo%bello', parsed.head)