def getNamedDestRoot(self):
    """
    Return the ``/Names`` array found under ``root -> /Names -> /Dests``,
    creating every level of that chain that does not exist yet.

    An existing ``/Names`` or ``/Dests`` entry is reused only when it is a
    ``DictionaryObject``. A reused dictionary is looked up in
    ``self._objects``; ``list.index`` raises ``ValueError`` if it is not
    stored there. Newly created dictionaries are registered through
    ``_addObject`` and linked into their parent by indirect reference, while
    a newly created array is stored directly in the ``/Dests`` dictionary.

    :return: the array stored under ``/Names`` in the ``/Dests`` dictionary.
    """
    catalog = self.getObject(self._root)

    # Level 1: the /Names dictionary hanging off the root object.
    if '/Names' in catalog and isinstance(catalog['/Names'], DictionaryObject):
        name_dict = catalog['/Names']
        name_ref = IndirectObject(self._objects.index(name_dict) + 1, 0, self)
        # Sanity check: the reference must resolve back to the same object.
        assert name_ref.getObject() == name_dict
    else:
        name_dict = DictionaryObject()
        catalog[NameObject('/Names')] = self._addObject(name_dict)

    # Level 2: the /Dests dictionary inside /Names. A freshly created
    # /Names dictionary is empty, so it always takes the "create" branch.
    if '/Dests' in name_dict and isinstance(name_dict['/Dests'], DictionaryObject):
        dest_dict = name_dict['/Dests']
        dest_ref = IndirectObject(self._objects.index(dest_dict) + 1, 0, self)
        assert dest_ref.getObject() == dest_dict
    else:
        dest_dict = DictionaryObject()
        name_dict[NameObject('/Dests')] = self._addObject(dest_dict)

    # Level 3: the /Names array inside /Dests.
    if '/Names' in dest_dict:
        return dest_dict['/Names']
    entries = ArrayObject()
    dest_dict[NameObject('/Names')] = entries
    return entries
def getOutlineRoot(self):
    """
    Return the ``/Outlines`` entry of the root object, creating it if absent.

    When the root already has ``/Outlines``, that value is looked up in
    ``self._objects`` (``list.index`` raises ``ValueError`` if it is not
    stored there) and returned. Otherwise a new ``TreeObject`` is registered
    through ``_addObject`` and linked from the root by indirect reference.

    :return: the existing or newly created outline object.
    """
    catalog = self.getObject(self._root)

    if '/Outlines' not in catalog:
        tree = TreeObject()
        tree.update({})
        catalog[NameObject('/Outlines')] = self._addObject(tree)
        return tree

    tree = catalog['/Outlines']
    tree_ref = IndirectObject(self._objects.index(tree) + 1, 0, self)
    # Sanity check: the reference must resolve back to the same object.
    assert tree_ref.getObject() == tree
    return tree
def getNamedDestRoot(self):
    """
    Walk ``root -> /Names -> /Dests -> /Names`` and return the final array,
    building whichever levels are missing on the way.

    A ``/Names`` or ``/Dests`` entry that exists and is a
    ``DictionaryObject`` is reused after being located in ``self._objects``
    (``list.index`` raises ``ValueError`` when it is not stored there).
    Any other case creates a fresh ``DictionaryObject``, registers it with
    ``_addObject`` and stores the resulting reference in the parent. A
    missing array is created and stored directly in the ``/Dests``
    dictionary.

    :return: the array stored under ``/Names`` in the ``/Dests`` dictionary.
    """
    root_dict = self.getObject(self._root)

    reuse_names = ('/Names' in root_dict
                   and isinstance(root_dict['/Names'], DictionaryObject))
    if reuse_names:
        names_dict = root_dict['/Names']
        names_pos = self._objects.index(names_dict) + 1
        # Sanity check: the reference must resolve back to the same object.
        assert IndirectObject(names_pos, 0, self).getObject() == names_dict
    else:
        names_dict = DictionaryObject()
        root_dict[NameObject('/Names')] = self._addObject(names_dict)

    # A freshly created /Names dictionary is empty, so it can never satisfy
    # this test and always falls through to the "create" branch.
    reuse_dests = ('/Dests' in names_dict
                   and isinstance(names_dict['/Dests'], DictionaryObject))
    if reuse_dests:
        dests_dict = names_dict['/Dests']
        dests_pos = self._objects.index(dests_dict) + 1
        assert IndirectObject(dests_pos, 0, self).getObject() == dests_dict
    else:
        dests_dict = DictionaryObject()
        names_dict[NameObject('/Dests')] = self._addObject(dests_dict)

    if '/Names' in dests_dict:
        dest_array = dests_dict['/Names']
    else:
        dest_array = ArrayObject()
        dests_dict[NameObject('/Names')] = dest_array
    return dest_array
def _sweepIndirectReferences(self, externMap, data):
    """
    Recursively walk ``data`` and rewrite it so that every indirect
    reference points at an object stored in this writer.

    * Dictionaries and arrays are swept in place, element by element; any
      element that comes back as a ``StreamObject`` is registered with
      ``_addObject`` and replaced by the resulting reference.
    * A reference whose ``pdf`` is this writer is followed once;
      ``self.stack`` holds the object numbers currently being followed so
      that cycles terminate.
    * A reference into another document is looked up in ``externMap``
      (``pdf -> generation -> idnum -> local reference``). On a miss the
      target is fetched, given a slot in ``self._objects``, recorded in
      ``externMap`` and swept itself.
    * Anything else is returned untouched.

    :param externMap: nested dict mapping foreign references to the local
        references that replace them; updated in place.
    :param data: the object to sweep.
    :return: ``data`` itself, or the local reference replacing a foreign one.
    """
    if isinstance(data, DictionaryObject):
        for name, entry in data.items():
            swept = self._sweepIndirectReferences(externMap, entry)
            if isinstance(swept, StreamObject):
                # A dictionary value is a stream. Streams must be indirect
                # objects, so the value is replaced by a reference.
                swept = self._addObject(swept)
            data[name] = swept
        return data

    if isinstance(data, ArrayObject):
        for position in range(len(data)):
            swept = self._sweepIndirectReferences(externMap, data[position])
            if isinstance(swept, StreamObject):
                # An array element is a stream. Streams must be indirect
                # objects, so the element is replaced by a reference.
                swept = self._addObject(swept)
            data[position] = swept
        return data

    if not isinstance(data, IndirectObject):
        return data

    if data.pdf == self:
        # Internal indirect references are fine as they are; only descend
        # into the target if it is not already being swept further up.
        if data.idnum not in self.stack:
            self.stack.append(data.idnum)
            target = self.getObject(data)
            self._sweepIndirectReferences(externMap, target)
            self.stack.pop()
        return data

    known = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None)
    if known is not None:
        return known

    foreign = data.pdf.getObject(data)
    # Reserve the slot and publish the mapping BEFORE sweeping the target,
    # so references back to it found during the sweep resolve to the slot.
    self._objects.append(None)  # placeholder
    slot = len(self._objects)
    local_ref = IndirectObject(slot, 0, self)
    externMap.setdefault(data.pdf, {}).setdefault(data.generation, {})[data.idnum] = local_ref
    self._objects[slot - 1] = self._sweepIndirectReferences(externMap, foreign)
    return local_ref
def _addObject(self, obj):
    """
    Append ``obj`` to ``self._objects`` and return a reference to it.

    The reference's object number is the list length after appending, i.e.
    the 1-based position of ``obj``; its generation is 0.

    :param obj: the object to store.
    :return: an ``IndirectObject`` bound to this writer.
    """
    store = self._objects
    store.append(obj)
    object_number = len(store)
    return IndirectObject(object_number, 0, self)
def getReference(self, obj):
    """
    Return an indirect reference to ``obj``, which must already be stored
    in ``self._objects``.

    :param obj: an object previously added to this writer.
    :return: an ``IndirectObject`` whose number is the 1-based position of
        ``obj`` in ``self._objects`` and whose generation is 0.
    :raises ValueError: from ``list.index`` when ``obj`` is not stored.
    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    # Sanity check: the reference must resolve back to the same object.
    assert reference.getObject() == obj
    return reference
def write(self, stream):
    """
    Serialize the objects collected by this writer to ``stream``.

    The output is, in order: the header, one ``N 0 obj ... endobj`` section
    per entry of ``self._objects``, the ``xref`` table, the ``trailer``
    dictionary and the ``startxref`` / ``%%EOF`` footer.

    :param stream: a writable binary stream. It must also support
        ``tell()``, which supplies the byte offsets recorded in the xref
        table.
    """
    externalReferenceMap = {}

    # PDF objects sometimes have circular references to their /Page objects
    # inside their object tree (for example, annotations). Those will be
    # indirect references to objects that we've recreated in this PDF. To
    # address this problem, PageObject's store their original object
    # reference number, and we add it to the external reference map before
    # we sweep for indirect references. This forces self-page-referencing
    # trees to reference the correct new object location, rather than
    # copying in a new copy of the page object.
    for objIndex, obj in enumerate(self._objects):
        if isinstance(obj, PageObject) and obj.indirectRef is not None:
            data = obj.indirectRef
            by_generation = externalReferenceMap.setdefault(data.pdf, {})
            by_idnum = by_generation.setdefault(data.generation, {})
            by_idnum[data.idnum] = IndirectObject(objIndex + 1, 0, self)

    # self.stack only exists for the duration of the sweep; remove it even
    # if the sweep raises so no stale attribute is left on the writer.
    self.stack = []
    try:
        self._sweepIndirectReferences(externalReferenceMap, self._root)
    finally:
        del self.stack

    # Begin writing:
    object_positions = []
    stream.write(self._header + b_("\n"))
    encrypting = hasattr(self, "_encrypt")
    for idnum, obj in enumerate(self._objects, 1):
        object_positions.append(stream.tell())
        stream.write(b_(str(idnum) + " 0 obj\n"))
        key = None
        # The /Encrypt dictionary itself is written without a key.
        if encrypting and idnum != self._encrypt.idnum:
            # Per-object key: md5 of the file key followed by the low three
            # bytes of the object number and the low two bytes of the
            # generation (always 0 here), truncated to at most 16 bytes.
            pack1 = struct.pack("<i", idnum)[:3]
            pack2 = struct.pack("<i", 0)[:2]
            key = self._encrypt_key + pack1 + pack2
            assert len(key) == (len(self._encrypt_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
        if obj is not None:
            obj.writeToStream(stream, key)
        # Always close the section: the "N 0 obj" opener above is written
        # for every slot, so obj/endobj stay balanced even for a None slot.
        stream.write(b_("\nendobj\n"))

    # xref table
    xref_location = stream.tell()
    stream.write(b_("xref\n"))
    stream.write(b_("0 %s\n" % (len(self._objects) + 1)))
    stream.write(b_("%010d %05d f \n" % (0, 65535)))
    for offset in object_positions:
        stream.write(b_("%010d %05d n \n" % (offset, 0)))

    # trailer
    stream.write(b_("trailer\n"))
    trailer = DictionaryObject()
    trailer.update({
        NameObject("/Size"): NumberObject(len(self._objects) + 1),
        NameObject("/Root"): self._root,
        NameObject("/Info"): self._info
    })
    if hasattr(self, "_ID"):
        trailer[NameObject("/ID")] = self._ID
    if encrypting:
        trailer[NameObject("/Encrypt")] = self._encrypt
    trailer.writeToStream(stream, None)

    # eof
    stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))
def getObject(self, indirectReference):
    """
    Resolve ``indirectReference`` to the object it designates.

    Lookup order:

    1. ``self.resolvedObjects`` (cache keyed by generation, then idnum).
    2. If the reference has generation 0 and its idnum is listed in
       ``self.xref_objStm``, the whole containing object stream is parsed,
       every object in it is cached, and the requested one is returned.
    3. Otherwise the byte offset from ``self.xref`` is used to read the
       object from ``self.stream``; it is decrypted when required and
       cached through ``cacheIndirectObject``.

    :param indirectReference: reference exposing ``idnum`` and
        ``generation``.
    :return: the resolved object, or ``None`` (after emitting a
        ``PdfReadWarning``) when ``self.xref`` has no entry for the idnum.
    :raises utils.PdfReadError: when the object header read from the
        stream carries a different object number than requested, unless
        ``self.xrefIndex`` is set and ``self.strict`` is false.
    :raises Exception: when the file is encrypted and no decryption key
        has been set.
    """
    retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None)
    if retval is not None:
        return retval

    if indirectReference.generation == 0 \
            and indirectReference.idnum in self.xref_objStm:
        # indirect reference to object in object stream
        # read the entire object stream into memory
        stmnum, idx = self.xref_objStm[indirectReference.idnum]
        objStm = IndirectObject(stmnum, 0, self).getObject()
        assert objStm['/Type'] == '/ObjStm'
        assert idx < objStm['/N']
        streamData = StringIO(objStm.getData())
        for _ in range(objStm['/N']):
            # Each header entry is an (object number, offset) pair.
            objnum = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            offset = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            # Remember the header position, jump to the object body
            # (offsets are relative to /First), then come back.
            t = streamData.tell()
            streamData.seek(objStm['/First'] + offset, 0)
            obj = readObject(streamData, self)
            self.resolvedObjects[0][objnum] = obj
            streamData.seek(t, 0)
        return self.resolvedObjects[0][indirectReference.idnum]

    if indirectReference.idnum \
            not in self.xref[indirectReference.generation]:
        warnings.warn(
            "Object %d %d not defined." % (
                indirectReference.idnum, indirectReference.generation),
            utils.PdfReadWarning)
        return None

    start = self.xref[indirectReference.generation][
        indirectReference.idnum]
    self.stream.seek(start, 0)
    idnum, generation = self.readObjectHeader(self.stream)

    # Explicit comparison rather than try/assert/except: assert statements
    # are removed under ``python -O``, which would silently disable this
    # check and let a mismatched object through.
    if idnum != indirectReference.idnum:
        if self.xrefIndex:
            # Xref table probably had bad indexes due to not
            # being zero-indexed
            if self.strict:
                raise utils.PdfReadError(
                    "Expected object ID (%d %d) does "
                    "not match actual (%d %d); xref "
                    "table not zero-indexed." % (
                        indirectReference.idnum,
                        indirectReference.generation,
                        idnum, generation))
            # else: should not happen since the xref table is corrected
            # in non-strict mode
        else:
            # some other problem
            raise utils.PdfReadError(
                "Expected object ID (%d %d) does not "
                " match actual (%d %d)." % (
                    indirectReference.idnum,
                    indirectReference.generation,
                    idnum, generation))

    # NOTE: kept as an assert so callers keep seeing AssertionError here.
    assert generation == indirectReference.generation
    retval = readObject(self.stream, self)

    # override encryption is used for the /Encrypt dictionary
    if not self._override_encryption and self.isEncrypted:
        # if we don't have the encryption key:
        if not hasattr(self, '_decryption_key'):
            raise Exception("file has not been decrypted")
        # otherwise, decrypt here...
        # Per-object key: md5 of the file key followed by the low three
        # bytes of the object number and the low two bytes of the
        # generation, truncated to at most 16 bytes.
        pack1 = struct.pack("<i", indirectReference.idnum)[:3]
        pack2 = struct.pack("<i", indirectReference.generation)[:2]
        key = self._decryption_key + pack1 + pack2
        assert len(key) == (len(self._decryption_key) + 5)
        md5_hash = md5(key).digest()
        key = md5_hash[:min(16, len(self._decryption_key) + 5)]
        retval = self._decryptObject(retval, key)

    self.cacheIndirectObject(generation, idnum, retval)
    return retval
def getObject(self, indirectReference):
    """
    Resolve ``indirectReference`` to the object it designates.

    Lookup order:

    1. ``self.resolvedObjects`` (cache keyed by generation, then idnum).
    2. If the reference has generation 0 and its idnum is listed in
       ``self.xref_objStm``, the whole containing object stream is parsed,
       every object in it is cached, and the requested one is returned.
    3. Otherwise the byte offset from ``self.xref`` is used to read the
       object from ``self.stream``; it is decrypted when required and
       cached through ``cacheIndirectObject``.

    :param indirectReference: reference exposing ``idnum`` and
        ``generation``.
    :return: the resolved object, or ``None`` (after emitting a
        ``PdfReadWarning``) when ``self.xref`` has no entry for the idnum.
    :raises utils.PdfReadError: when the object header read from the
        stream carries a different object number than requested, unless
        ``self.xrefIndex`` is set and ``self.strict`` is false.
    :raises Exception: when the file is encrypted and no decryption key
        has been set.
    """
    retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None)
    if retval is not None:
        return retval

    if indirectReference.generation == 0 \
            and indirectReference.idnum in self.xref_objStm:
        # indirect reference to object in object stream
        # read the entire object stream into memory
        stmnum, idx = self.xref_objStm[indirectReference.idnum]
        objStm = IndirectObject(stmnum, 0, self).getObject()
        assert objStm['/Type'] == '/ObjStm'
        assert idx < objStm['/N']
        streamData = StringIO(objStm.getData())
        for _ in range(objStm['/N']):
            # Each header entry is an (object number, offset) pair.
            objnum = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            offset = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            # Remember the header position, jump to the object body
            # (offsets are relative to /First), then come back.
            t = streamData.tell()
            streamData.seek(objStm['/First'] + offset, 0)
            obj = readObject(streamData, self)
            self.resolvedObjects[0][objnum] = obj
            streamData.seek(t, 0)
        return self.resolvedObjects[0][indirectReference.idnum]

    if indirectReference.idnum \
            not in self.xref[indirectReference.generation]:
        warnings.warn(
            "Object %d %d not defined." %
            (indirectReference.idnum, indirectReference.generation),
            utils.PdfReadWarning)
        return None

    start = self.xref[indirectReference.generation][
        indirectReference.idnum]
    self.stream.seek(start, 0)
    idnum, generation = self.readObjectHeader(self.stream)

    # Explicit comparison rather than try/assert/except: assert statements
    # are removed under ``python -O``, which would silently disable this
    # check and let a mismatched object through.
    if idnum != indirectReference.idnum:
        if self.xrefIndex:
            # Xref table probably had bad indexes due to not
            # being zero-indexed
            if self.strict:
                raise utils.PdfReadError(
                    "Expected object ID (%d %d) does "
                    "not match actual (%d %d); xref "
                    "table not zero-indexed." %
                    (indirectReference.idnum,
                     indirectReference.generation,
                     idnum, generation))
            # else: should not happen since the xref table is corrected
            # in non-strict mode
        else:
            # some other problem
            raise utils.PdfReadError(
                "Expected object ID (%d %d) does not "
                " match actual (%d %d)."
                % (indirectReference.idnum,
                   indirectReference.generation,
                   idnum, generation))

    # NOTE: kept as an assert so callers keep seeing AssertionError here.
    assert generation == indirectReference.generation
    retval = readObject(self.stream, self)

    # override encryption is used for the /Encrypt dictionary
    if not self._override_encryption and self.isEncrypted:
        # if we don't have the encryption key:
        if not hasattr(self, '_decryption_key'):
            raise Exception("file has not been decrypted")
        # otherwise, decrypt here...
        # Per-object key: md5 of the file key followed by the low three
        # bytes of the object number and the low two bytes of the
        # generation, truncated to at most 16 bytes.
        pack1 = struct.pack("<i", indirectReference.idnum)[:3]
        pack2 = struct.pack("<i", indirectReference.generation)[:2]
        key = self._decryption_key + pack1 + pack2
        assert len(key) == (len(self._decryption_key) + 5)
        md5_hash = md5(key).digest()
        key = md5_hash[:min(16, len(self._decryption_key) + 5)]
        retval = self._decryptObject(retval, key)

    self.cacheIndirectObject(generation, idnum, retval)
    return retval