def readFromStream(stream, pdf):
    """
    Read a PDF name token from ``stream`` and wrap it in a NameObject.

    The first byte must equal ``NameObject.surfix``; the rest of the token is
    collected with ``utils.readUntilRegex`` up to
    ``NameObject.delimiterPattern`` (end of file is tolerated).

    :param stream: readable binary stream positioned at the start of the name.
    :param pdf: reader object; only its ``strict`` attribute is consulted.
    :return: a NameObject built from the decoded text, or from the raw bytes
        when decoding fails and ``pdf.strict`` is false.
    :raises PdfReadError: if the leading byte is wrong, or if the name cannot
        be decoded and ``pdf.strict`` is true.
    """
    lead = stream.read(1)
    if lead != NameObject.surfix:
        raise PdfReadError("name read error")
    raw_name = lead + utils.readUntilRegex(
        stream, NameObject.delimiterPattern, ignore_eof=True)
    try:
        # UTF-8 first, GBK as the fallback encoding.
        try:
            text = raw_name.decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            text = raw_name.decode('gbk')
        return NameObject(text)
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Name objects should represent irregular characters
        # with a '#' followed by the symbol's hex number
        if pdf.strict:
            raise PdfReadError("Illegal character in Name Object")
        warnings.warn("Illegal character in Name Object",
                      utils.PdfReadWarning)
        return NameObject(raw_name)
def readFromStream(stream, pdf):
    """
    Parse an indirect reference of the form ``<id> <generation> R``.

    :param stream: readable binary stream positioned at the object number.
    :param pdf: reader object handed through to the IndirectObject.
    :return: ``IndirectObject(int(id), int(generation), pdf)``.
    :raises PdfStreamError: if the stream ends while reading either number.
    :raises PdfReadError: if the token after the numbers is not ``R``.
    """
    # Object number: every byte up to the first whitespace byte.
    id_digits = b_("")
    while True:
        byte = stream.read(1)
        if not byte:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if byte.isspace():
            break
        id_digits += byte

    # Generation number: skip any further leading whitespace, then read up
    # to the next whitespace byte.
    gen_digits = b_("")
    while True:
        byte = stream.read(1)
        if not byte:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if not byte.isspace():
            gen_digits += byte
        elif gen_digits:
            break

    keyword = readNonWhitespace(stream)
    if keyword != b_("R"):
        raise PdfReadError(
            "Error reading indirect object reference at byte %s"
            % utils.hexStr(stream.tell()))
    return IndirectObject(int(id_digits), int(gen_digits), pdf)
def decode(self):
    """
    Decode the LZW code sequence supplied by ``self.nextCode()``.

    algorithm derived from:
    http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
    and the PDFReference

    The output pieces are collected in a list and joined once at the end,
    instead of growing a string with ``+=`` for every code.

    :return: the concatenation of all decoded dictionary entries.
    :raises PdfReadError: if ``nextCode()`` returns -1 before the stop code.
    """
    cW = self.CLEARDICT
    chunks = []
    while True:
        pW = cW
        cW = self.nextCode()
        if cW == -1:
            raise PdfReadError("Missed the stop code in LZWDecode!")
        if cW == self.STOP:
            break
        elif cW == self.CLEARDICT:
            self.resetDict()
        elif pW == self.CLEARDICT:
            # First code after a clear: emit it, nothing to add yet.
            chunks.append(self.dict[cW])
        else:
            if cW < self.dictlen:
                # Known code: emit it; new entry is previous + its first char.
                chunks.append(self.dict[cW])
                p = self.dict[pW] + self.dict[cW][0]
            else:
                # Code not in the table yet: it must be previous + the
                # previous entry's own first char.
                p = self.dict[pW] + self.dict[pW][0]
                chunks.append(p)
            self.dict[self.dictlen] = p
            self.dictlen += 1
            # Widen the code size once the table is about to outgrow it
            # (capped at 12 bits).
            if (self.dictlen >= (1 << self.bitspercode) - 1 and
                    self.bitspercode < 12):
                self.bitspercode += 1
    return "".join(chunks)
def decode(data, decodeParms):
    """
    Inflate ``data`` and undo the predictor named in ``decodeParms``, if any.

    :param data: flate-encoded data.
    :param decodeParms: a dictionary of values (or an array of such
        dictionaries); only the "/Predictor" and "/Columns" entries are used.
    :return: the flate-decoded data.
    :raises PdfReadError: if the predictor is neither 1 nor in 10..15.
    """
    data = decompress(data)
    predictor = 1
    # The /Columns param. has 1 as the default value; see ISO 32000,
    # section 7.4.4.3 LZWDecode and FlateDecode Parameters, Table 8
    columns = 1
    if decodeParms:
        try:
            from PyPDF2.generic import ArrayObject
            if isinstance(decodeParms, ArrayObject):
                # An array has no .get(); take /Columns from the same entry
                # that supplies /Predictor instead of from the array itself.
                for decodeParm in decodeParms:
                    if '/Predictor' in decodeParm:
                        predictor = decodeParm['/Predictor']
                        columns = decodeParm.get(LZW.COLUMNS, 1)
            else:
                predictor = decodeParms.get("/Predictor", 1)
                columns = decodeParms.get(LZW.COLUMNS, 1)
        except AttributeError:
            pass  # Usually an array with a null object was read

    # predictor 1 == no predictor
    if predictor != 1:
        # PNG prediction:
        if 10 <= predictor <= 15:
            data = FlateDecode._decode_png_prediction(data, columns)
        else:
            # unsupported predictor
            raise PdfReadError(
                "Unsupported flatedecode predictor %r" % predictor)
    return data
def _decode_png_prediction(data, columns):
    """
    Reverse PNG row filtering on ``data``.

    Every row is one filter-type byte followed by ``columns`` data bytes.
    Filter types 0 (None), 1 (Sub), 2 (Up), 3 (Average) and 4 (Paeth) are
    handled. The "left" neighbour is always taken one byte back.

    :param data: filtered rows, whose length must be a multiple of
        ``columns + 1``.
    :param columns: number of data bytes per row (excluding the filter byte).
    :return: the unfiltered rows with the filter bytes removed, as produced
        by ``StringIO.getvalue()``.
    :raises PdfReadError: if ``data`` is not a whole number of rows, or a row
        uses an unknown filter type.
    """
    output = StringIO()
    # PNG prediction can vary from row to row
    rowlength = columns + 1
    # Raised explicitly (not asserted) so the check survives ``python -O``.
    if len(data) % rowlength != 0:
        raise PdfReadError("Image data is not rectangular")
    prev_rowdata = (0,) * rowlength
    for start in range(0, len(data), rowlength):
        rowdata = [ord_(x) for x in data[start:start + rowlength]]
        filterByte = rowdata[0]
        if filterByte == 0:
            pass
        elif filterByte == 1:
            # Sub: index 1 has no left neighbour, so start at 2.
            for i in range(2, rowlength):
                rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
        elif filterByte == 2:
            # Up
            for i in range(1, rowlength):
                rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
        elif filterByte == 3:
            # Average: floor of the mean of left and up.
            for i in range(1, rowlength):
                left = rowdata[i - 1] if i > 1 else 0
                average = (left + prev_rowdata[i]) // 2
                rowdata[i] = (rowdata[i] + average) % 256
        elif filterByte == 4:
            # Paeth
            for i in range(1, rowlength):
                left = rowdata[i - 1] if i > 1 else 0
                up = prev_rowdata[i]
                up_left = prev_rowdata[i - 1] if i > 1 else 0
                paeth = paethPredictor(left, up, up_left)
                rowdata[i] = (rowdata[i] + paeth) % 256
        else:
            # unsupported PNG filter
            raise PdfReadError("Unsupported PNG filter %r" % filterByte)
        prev_rowdata = rowdata
        output.write(''.join([chr(x) for x in rowdata[1:]]))
    return output.getvalue()
def decode(data, decodeParms):
    """
    Inflate ``data`` and undo the predictor named in ``decodeParms``, if any.

    :param data: flate-encoded data.
    :param decodeParms: a dictionary of values (or an array of such
        dictionaries); only the "/Predictor" and "/Columns" entries are used.
    :return: the flate-decoded data.
    :raises PdfReadError: if the predictor is neither 1 nor in 10..15.
    """
    data = decompress(data)
    predictor = 1
    # /Columns defaults to 1 when absent (ISO 32000, section 7.4.4.3,
    # Table 8), so a missing key must not raise KeyError.
    columns = 1
    if decodeParms:
        try:
            from PyPDF2.generic import ArrayObject
            if isinstance(decodeParms, ArrayObject):
                # Read /Columns from the entry that carries /Predictor; the
                # array itself cannot be indexed by a name.
                for decodeParm in decodeParms:
                    if '/Predictor' in decodeParm:
                        predictor = decodeParm['/Predictor']
                        columns = decodeParm.get(LZW.COLUMNS, 1)
            else:
                predictor = decodeParms.get("/Predictor", 1)
                columns = decodeParms.get(LZW.COLUMNS, 1)
        except AttributeError:
            pass  # usually an array with a null object was read

    # predictor 1 == no predictor
    if predictor != 1:
        # PNG prediction:
        if 10 <= predictor <= 15:
            data = FlateDecode._decode_png_prediction(data, columns)
        else:
            # unsupported predictor
            raise PdfReadError(
                "Unsupported flatedecode predictor %r" % predictor)
    return data
def readFromStream(stream):
    """
    Read a ``true`` or ``false`` keyword from ``stream``.

    :param stream: readable binary stream positioned at the keyword.
    :return: ``BooleanObject(True)`` or ``BooleanObject(False)``.
    :raises PdfReadError: if the next four bytes are neither ``true`` nor
        ``fals``.
    """
    head = stream.read(4)
    if head == b_("true"):
        return BooleanObject(True)
    if head != b_("fals"):
        raise PdfReadError('Could not read Boolean object')
    # Consume the fifth byte of "false"; its value is not checked.
    stream.read(1)
    return BooleanObject(False)
def __init__(self, stream: ContentStream) -> None:
    """
    Parse the XML carried by ``stream`` and locate its RDF root element.

    :param stream: object whose ``get_data()`` returns the XML to parse.
    :raises PdfReadError: if the XML cannot be parsed; the original
        ``ExpatError`` is attached as the cause.
    """
    self.stream = stream
    data = self.stream.get_data()
    # Only the parse itself is guarded, so unrelated failures are not
    # reported as invalid XML.
    try:
        doc_root: Document = parseString(data)
    except ExpatError as e:
        raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
    # First <RDF> element in the RDF namespace.
    self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS(
        RDF_NAMESPACE, "RDF")[0]
    self.cache: Dict[Any, Any] = {}
def test_DictionaryObject_read_from_stream_stream_stream_valid(
        strict, length, should_fail):
    """
    Parse a small dictionary-plus-stream with a given ``/Length`` value.

    The body always ends by raising a sentinel PdfReadError, so
    ``pytest.raises`` captures either a genuine parse error or the sentinel.
    The final assertion requires exactly one of "``should_fail`` is set" and
    "the sentinel was reached" to hold.
    """
    stream = BytesIO(
        b"<< /S /GoTo /Length %d >>stream\nBT /F1\nendstream\n" % length)

    class Tst:  # to replace pdf
        strict = True

    pdf = Tst()
    pdf.strict = strict
    sentinel = "__ALLGOOD__"
    with pytest.raises(PdfReadError) as exc:
        do = DictionaryObject.read_from_stream(stream, pdf)
        # TODO: What should happen with the stream?
        assert do == {"/S": "/GoTo"}
        if length in (6, 10):
            assert b"BT /F1" in do._StreamObject__data
        raise PdfReadError(sentinel)
    print(exc.value)
    reached_end = exc.value.args[0] == sentinel
    assert should_fail ^ reached_end
def readFromStream(stream, pdf):
    """
    Read a PDF array (``[ ... ]``) from ``stream``.

    :param stream: seekable binary stream positioned at the opening ``[``.
    :param pdf: reader object handed through to ``readObject``.
    :return: an ArrayObject holding every element parsed by ``readObject``.
    :raises PdfReadError: if the first byte is not ``[``.
    """
    items = ArrayObject()
    if stream.read(1) != b_("["):
        raise PdfReadError("Could not read array")
    while True:
        # skip leading whitespace
        ch = stream.read(1)
        while ch.isspace():
            ch = stream.read(1)
        stream.seek(-1, 1)
        # check for array ending
        if stream.read(1) == b_("]"):
            return items
        # not the end: step back and let readObject parse the element
        stream.seek(-1, 1)
        items.append(readObject(stream, pdf))
def decode(self):
    """
    Decode the LZW code sequence supplied by ``self.nextCode()``.

    TIFF 6.0 specification explains in sufficient details the steps to
    implement the LZW encode() and decode() algorithms.

    algorithm derived from:
    http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
    and the PDFReference

    The decoded pieces are gathered in a list and joined once, rather than
    extending a string with ``+=`` on every code.

    :return: the concatenation of all decoded dictionary entries (a ``str``,
        since the pieces are joined with ``""``).
    :raises PdfReadError: if ``nextCode()`` returns -1 before the stop code.
    """
    cW = self.CLEARDICT
    pieces = []
    while True:
        pW = cW
        cW = self.nextCode()
        if cW == -1:
            raise PdfReadError("Missed the stop code in LZWDecode!")
        if cW == self.STOP:
            break
        elif cW == self.CLEARDICT:
            self.resetDict()
        elif pW == self.CLEARDICT:
            # First code after a clear: emit it, no new table entry.
            pieces.append(self.dict[cW])
        else:
            if cW < self.dictlen:
                # Code already in the table.
                pieces.append(self.dict[cW])
                p = self.dict[pW] + self.dict[cW][0]
            else:
                # Code refers to the entry being created right now.
                p = self.dict[pW] + self.dict[pW][0]
                pieces.append(p)
            self.dict[self.dictlen] = p
            self.dictlen += 1
            # Grow the code width (max 12 bits) when the table fills up.
            if (self.dictlen >= (1 << self.bitspercode) - 1 and
                    self.bitspercode < 12):
                self.bitspercode += 1
    return "".join(pieces)
def __init__(self, title, page, typ, *args):
    """
    Build a destination dictionary.

    :param title: stored under ``/Title``.
    :param page: stored under ``/Page``.
    :param typ: fit type, stored under ``/Type``; it decides how many
        positional ``args`` are expected and under which keys they go.
    :param args: fit-type specific values, e.g. left/top/zoom for ``/XYZ``.
    :raises PdfReadError: if ``typ`` is not one of the handled fit types.
    :raises ValueError: if ``args`` has the wrong length for ``typ``
        (raised by the tuple unpacking).
    """
    DictionaryObject.__init__(self)
    self[NameObject("/Title")] = title
    self[NameObject("/Page")] = page
    self[NameObject("/Type")] = typ

    from PyPDF2.constants import TypArguments as TA
    from PyPDF2.constants import TypFitArguments as TF

    # from table 8.2 of the PDF 1.7 reference.
    if typ == "/XYZ":
        left, top, zoom = args
        self[NameObject(TA.LEFT)] = left
        self[NameObject(TA.TOP)] = top
        self[NameObject("/Zoom")] = zoom
    elif typ == TF.FIT_R:
        left, bottom, right, top = args
        self[NameObject(TA.LEFT)] = left
        self[NameObject(TA.BOTTOM)] = bottom
        self[NameObject(TA.RIGHT)] = right
        self[NameObject(TA.TOP)] = top
    elif typ in (TF.FIT_H, TF.FIT_BH):
        (top,) = args
        self[NameObject(TA.TOP)] = top
    elif typ in (TF.FIT_V, TF.FIT_BV):
        (left,) = args
        self[NameObject(TA.LEFT)] = left
    elif typ not in (TF.FIT, TF.FIT_B):
        # FIT and FIT_B take no extra arguments; anything else is unknown.
        raise PdfReadError("Unknown Destination Type: %r" % typ)
def decode(data, decodeParms):
    """
    Inflate ``data`` and undo PNG row prediction when ``decodeParms`` asks
    for it.

    :param data: flate-encoded data.
    :param decodeParms: a dictionary of values (or an array of such
        dictionaries); only the "/Predictor" and "/Columns" entries are used.
    :return: the flate-decoded data; when a PNG predictor (10..15) is
        applied, the unfiltered rows as produced by ``StringIO.getvalue()``.
    :raises PdfReadError: if the predictor is unsupported, the data is not a
        whole number of rows, or a row uses an unknown PNG filter type.
    """
    data = decompress(data)
    predictor = 1
    # /Columns defaults to 1 when absent (ISO 32000, section 7.4.4.3,
    # Table 8), so a missing key must not raise KeyError.
    columns = 1
    if decodeParms:
        try:
            from PyPDF2.generic import ArrayObject
            if isinstance(decodeParms, ArrayObject):
                # Read /Columns from the entry that carries /Predictor; the
                # array itself cannot be indexed by a name.
                for decodeParm in decodeParms:
                    if '/Predictor' in decodeParm:
                        predictor = decodeParm['/Predictor']
                        columns = decodeParm.get(LZW.COLUMNS, 1)
            else:
                predictor = decodeParms.get("/Predictor", 1)
                columns = decodeParms.get(LZW.COLUMNS, 1)
        except AttributeError:
            pass  # usually an array with a null object was read

    # predictor 1 == no predictor
    if predictor == 1:
        return data
    if not 10 <= predictor <= 15:
        # unsupported predictor
        raise PdfReadError(
            "Unsupported flatedecode predictor %r" % predictor)

    # PNG prediction:
    output = StringIO()
    # PNG prediction can vary from row to row
    rowlength = columns + 1
    # Raised explicitly (not asserted) so the check survives ``python -O``.
    if len(data) % rowlength != 0:
        raise PdfReadError("Image data is not rectangular")
    prev_rowdata = (0,) * rowlength
    for start in range(0, len(data), rowlength):
        rowdata = [ord_(x) for x in data[start:start + rowlength]]
        filterByte = rowdata[0]
        if filterByte == 0:
            pass
        elif filterByte == 1:
            # Sub: index 1 has no left neighbour, so start at 2.
            for i in range(2, rowlength):
                rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
        elif filterByte == 2:
            # Up
            for i in range(1, rowlength):
                rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
        elif filterByte == 3:
            # Average: floor of the mean of left and up.
            for i in range(1, rowlength):
                left = rowdata[i - 1] if i > 1 else 0
                average = (left + prev_rowdata[i]) // 2
                rowdata[i] = (rowdata[i] + average) % 256
        elif filterByte == 4:
            # Paeth
            for i in range(1, rowlength):
                left = rowdata[i - 1] if i > 1 else 0
                up = prev_rowdata[i]
                up_left = prev_rowdata[i - 1] if i > 1 else 0
                paeth = paethPredictor(left, up, up_left)
                rowdata[i] = (rowdata[i] + paeth) % 256
        else:
            # unsupported PNG filter
            raise PdfReadError("Unsupported PNG filter %r" % filterByte)
        prev_rowdata = rowdata
        output.write(''.join([chr(x) for x in rowdata[1:]]))
    return output.getvalue()
def setData(self, data):
    """
    Reject any attempt to assign data to this object.

    :param data: ignored.
    :raises PdfReadError: always.
    """
    message = "Creating EncodedStreamObject is not currently supported"
    raise PdfReadError(message)
def readFromStream(stream, pdf):
    """
    Read a ``<< ... >>`` dictionary and, if one follows, its stream payload.

    :param stream: seekable binary stream positioned at the opening ``<<``.
    :param pdf: reader object; supplies ``strict`` and ``getObject`` and is
        handed through to ``readObject``.
    :return: a StreamObject (via ``StreamObject.initializeFromDictionary``)
        when a ``stream`` keyword follows the dictionary, otherwise a
        DictionaryObject.
    :raises PdfReadError: if the input does not start with ``<<``, if a key
        is defined twice in strict mode, or if ``endstream`` cannot be found.
    :raises PdfStreamError: if the input ends inside the dictionary.
    """
    if stream.read(2) != b_("<<"):
        raise PdfReadError(
            "Dictionary read error at byte %s: stream must begin with '<<'"
            % utils.hexStr(stream.tell()))

    entries = {}
    while True:
        tok = readNonWhitespace(stream)
        if tok == b_('\x00'):
            continue
        if tok == b_('%'):
            stream.seek(-1, 1)
            skipOverComment(stream)
            continue
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b_(">"):
            # consume the second '>' of the closing '>>'
            stream.read(1)
            break
        stream.seek(-1, 1)
        key = readObject(stream, pdf)
        # skip whitespace before the value, then step back onto its first byte
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, pdf)
        if not entries.get(key):
            entries[key] = value
            continue
        # multiple definitions of key not permitted
        duplicate_msg = (
            "Multiple definitions in dictionary at byte %s for key %s"
            % (utils.hexStr(stream.tell()), key))
        if pdf.strict:
            raise PdfReadError(duplicate_msg)
        warnings.warn(duplicate_msg, PdfReadWarning)

    dict_end = stream.tell()
    marker = readNonWhitespace(stream)
    if marker == b_('s') and stream.read(5) == b_('tream'):
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream' keyword but before EOL.
        # patch provided by Danial Sandler
        while eol == b_(' '):
            eol = stream.read(1)
        assert eol in (b_("\n"), b_("\r"))
        if eol == b_("\r"):
            # read \n after
            if stream.read(1) != b_('\n'):
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        assert SA.LENGTH in entries
        length = entries[SA.LENGTH]
        if isinstance(length, IndirectObject):
            # resolving the reference may move the stream; restore afterwards
            resume_at = stream.tell()
            length = pdf.getObject(length)
            stream.seek(resume_at, 0)
        entries["__streamdata__"] = stream.read(length)
        first = readNonWhitespace(stream)
        rest = stream.read(8)
        if (first + rest) != b_("endstream"):
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # and Python users into PDF files tend to be our audience.
            # we need to do this to correct the streamdata and chop off
            # an extra character.
            after_marker = stream.tell()
            stream.seek(-10, 1)
            if stream.read(9) == b_("endstream"):
                # we found it by looking back one character further.
                entries["__streamdata__"] = entries["__streamdata__"][:-1]
            else:
                stream.seek(after_marker, 0)
                raise PdfReadError(
                    "Unable to find 'endstream' marker after stream at byte %s."
                    % utils.hexStr(stream.tell()))
    else:
        # no stream keyword: rewind to just after the dictionary
        stream.seek(dict_end, 0)

    if "__streamdata__" not in entries:
        plain = DictionaryObject()
        plain.update(entries)
        return plain
    return StreamObject.initializeFromDictionary(entries)
def readStringFromStream(stream):
    """
    Read a parenthesised PDF literal string from ``stream``.

    The first byte is consumed without being checked (it is expected to be
    the opening parenthesis). Nested parentheses are kept in the result,
    backslash escapes are translated, and a backslash followed by a line
    break contributes nothing.

    :param stream: seekable binary stream positioned at the opening ``(``.
    :return: the result of ``createStringObject`` on the collected bytes.
    :raises PdfStreamError: if the stream ends before the closing ``)``.
    :raises PdfReadError: if a backslash is followed by an unsupported
        character.
    """
    stream.read(1)
    parens = 1
    txt = b_("")
    escapes = None  # built on the first backslash, then reused
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b_("("):
            parens += 1
        elif tok == b_(")"):
            parens -= 1
            if parens == 0:
                break
        elif tok == b_("\\"):
            tok = stream.read(1)
            if escapes is None:
                escapes = {
                    b_("n"): b_("\n"),
                    b_("r"): b_("\r"),
                    b_("t"): b_("\t"),
                    b_("b"): b_("\b"),
                    b_("f"): b_("\f"),
                    b_("c"): b_(r"\c"),
                    b_("("): b_("("),
                    b_(")"): b_(")"),
                    b_("/"): b_("/"),
                    b_("\\"): b_("\\"),
                    b_(" "): b_(" "),
                    b_("%"): b_("%"),
                    b_("<"): b_("<"),
                    b_(">"): b_(">"),
                    b_("["): b_("["),
                    b_("]"): b_("]"),
                    b_("#"): b_("#"),
                    b_("_"): b_("_"),
                    b_("&"): b_("&"),
                    b_('$'): b_('$'),
                }
            try:
                tok = escapes[tok]
            except KeyError:
                if tok.isdigit():
                    # "The number ddd may consist of one, two, or three
                    # octal digits; high-order overflow shall be ignored.
                    # Three octal digits shall be used, with leading zeros
                    # as needed, if the next character of the string is also
                    # a digit." (PDF reference 7.3.4.2, p 16)
                    for _ in range(2):
                        ntok = stream.read(1)
                        if ntok.isdigit():
                            tok += ntok
                        else:
                            # The byte after a short escape belongs to the
                            # string (or closes it): put it back rather than
                            # dropping it. Nothing to put back at end of file.
                            if ntok:
                                stream.seek(-1, 1)
                            break
                    tok = b_(chr(int(tok, base=8)))
                elif tok in b_("\n\r"):
                    # This case is hit when a backslash followed by a line
                    # break occurs.  If it's a multi-char EOL, consume the
                    # second character:
                    tok = stream.read(1)
                    if tok not in b_("\n\r"):
                        stream.seek(-1, 1)
                    # Then don't add anything to the actual string, since this
                    # line break was escaped:
                    tok = b_('')
                else:
                    raise PdfReadError(r"Unexpected escaped string: %s" % tok)
        txt += tok
    return createStringObject(txt)
def readFromStream(stream):
    """
    Read the ``null`` keyword from ``stream``.

    :param stream: readable binary stream positioned at the keyword.
    :return: a NullObject.
    :raises PdfReadError: if the next four bytes are not ``null``.
    """
    if stream.read(4) == b_("null"):
        return NullObject()
    raise PdfReadError("Could not read Null object")