def test_dict(self): self.assertEqual(PDFParser.parse("<< /Foo /Bar >>"), {PDFName("/Foo"): PDFName("/Bar")}) self.assertEqual(PDFParser.parse("<< /Foo << /Bar /Koo>> >>"), {PDFName("/Foo"): { PDFName("/Bar"): PDFName("/Koo") }}) self.assertEqual(PDFParser.parse("<< /Foobar13478 123 >>"), {PDFName("/Foobar13478"): 123})
def test_comments(self): data = """<< /ID [<5b><5b>] % ReportLab generated PDF document -- digest (http://www.reportlab.com) /Info 13 0 R /Root 12 0 R /Size 17 >>""" PDFParser.parse(data)
def test_pdf_comment(self): self.assertEqual( PDFParser.parse("""<< /Length 213 0 R % Foobar >>"""), { PDFName("/Length"): PDFXRef(213, 0), })
def test_realworld_pdf(self): self.assertEqual( PDFParser.parse("""<< /Length 213 0 R /PatternType 1 /BBox [0 0 2596 37] /XStep 8243 /YStep 8243 /TilingType 1 /PaintType 1 /Matrix [ 0.333679 0 0 0.333468 78.832642 172.074584 ] /Resources << /XObject << /x211 211 0 R >> >> >>"""), { PDFName("/Length"): PDFXRef(213, 0), PDFName("/PatternType"): 1, PDFName("/BBox"): [0, 0, 2596, 37], PDFName("/XStep"): 8243, PDFName("/YStep"): 8243, PDFName("/TilingType"): 1, PDFName("/PaintType"): 1, PDFName("/Matrix"): [0.333679, 0, 0, 0.333468, 78.832642, 172.074584], PDFName("/Resources"): { PDFName("/XObject"): { PDFName("/x211"): PDFXRef(211, 0), }, }, }) self.assertEqual( PDFParser.parse( "<< /XHeight 0 /CharSet (/F) /FontFile 2992 0 R >>"), { PDFName("/XHeight"): 0, PDFName("/CharSet"): b"/F", PDFName("/FontFile"): PDFXRef(2992, 0), })
def test_string_simple(self): self.assertEqual(PDFParser.parse("(Foo)"), b"Foo") self.assertEqual(PDFParser.parse("(Foo Bar)"), b"Foo Bar") self.assertEqual(PDFParser.parse("(Foo Bar)"), b"Foo Bar") self.assertEqual(PDFParser.parse("(Foo Bar )"), b"Foo Bar ") self.assertEqual(PDFParser.parse("( Foo)"), b" Foo") self.assertEqual(PDFParser.parse("(Foo )"), b"Foo ") self.assertEqual(PDFParser.parse("( Foo )"), b" Foo ")
def test_string_escape_sequences(self): self.assertEqual(PDFParser.parse(r"(Foo\n)"), b"Foo\n") self.assertEqual(PDFParser.parse(r"(Foo\t\r\n)"), b"Foo\t\r\n") self.assertEqual(PDFParser.parse(r"(Foo\(\(\()"), b"Foo(((") self.assertEqual(PDFParser.parse(r"(Foo\)\)\)\(\(\()"), b"Foo)))(((") self.assertEqual(PDFParser.parse(r"(Foo\)\)\) \\\\\\ \(\(\()"), b"Foo))) \\\\\\ (((") self.assertEqual(PDFParser.parse(r"(Foo \\ Bar)"), b"Foo \\ Bar") self.assertEqual(PDFParser.parse(r"(Foo \\ \) Bar)"), b"Foo \\ ) Bar")
def test_string_nested(self): self.assertEqual(PDFParser.parse("(Foo (Bar))"), b"Foo (Bar)") self.assertEqual(PDFParser.parse("(Foo( Bar))"), b"Foo( Bar)") self.assertEqual(PDFParser.parse("(Foo (Bar) )"), b"Foo (Bar) ") self.assertEqual(PDFParser.parse("(Foo ( Bar ) )"), b"Foo ( Bar ) ") self.assertEqual(PDFParser.parse("(Foo (Klammer) Bar)"), b"Foo (Klammer) Bar") self.assertEqual( PDFParser.parse("(Foo (Klammer (Klammer2) Yeah) Bar)"), b"Foo (Klammer (Klammer2) Yeah) Bar") self.assertEqual( PDFParser.parse("(Foo (Space) Yes)"), b"Foo (Space) Yes")
def __init__(self, objid, gennum, rawdata): assert (objid is not None) assert (gennum is not None) assert (isinstance(objid, int)) assert (isinstance(gennum, int)) self._objid = objid self._gennum = gennum if rawdata is not None: strm = StreamRepr(rawdata) stream_begin = strm.read_until_token(b"stream") if stream_begin is not None: stream_data = strm.read_until_token(b"endstream") if stream_data is not None: content = stream_begin self._stream = stream_data else: # Probably erroneous stream data ("stream" maybe in dict, # but no "endstream") content = rawdata self._stream = None else: # No stream in this object found, just content content = rawdata self._stream = None content = content.decode("latin1") # Remove line continuations content = content.replace("\\\r\n", "") content = content.replace("\\\n", "") content = content.replace("\\\r", "") self._content = PDFParser.parse(content) if (self._stream is not None) and ( PDFName("/Length") in self._content) and isinstance( self._content[PDFName("/Length")], int): # When direct length field is given, then truncate the stream # according to it. For indirect streams, we don't do this (yet) self._stream = self._stream[:self._content[PDFName("/Length")]] else: self._stream = None self._content = None
def test_array(self): self.assertEqual(PDFParser.parse("[ 1 2 /Foo 3 4 ]"), [1, 2, PDFName("/Foo"), 3, 4]) self.assertEqual(PDFParser.parse("[ /Foobar13478 /Barfoo999 ]"), [PDFName("/Foobar13478"), PDFName("/Barfoo999")]) self.assertEqual( PDFParser.parse("[ 12345 9999 48 489 8473 << /foo 3939 >>]"), [12345, 9999, 48, 489, 8473, { PDFName("/foo"): 3939 }]) self.assertEqual( PDFParser.parse( "[ 12345 9999 48 489 R 8473 3.43984 << /foo 3939 >>]"), [ 12345, 9999, PDFXRef(48, 489), 8473, 3.43984, { PDFName("/foo"): 3939 } ]) self.assertEqual( PDFParser.parse("[ 0.333679 0 0 0.333468 78.832642 172.074584 ]"), [0.333679, 0, 0, 0.333468, 78.832642, 172.074584]) self.assertEqual(PDFParser.parse("[ 1.2345 1.2345 ]"), [1.2345, 1.2345])
def test_string_non_ascii(self): self.assertEqual(PDFParser.parse(r"(Foo \1 Bar)"), b"Foo \x01 Bar") self.assertEqual(PDFParser.parse(r"(Foo \12 Bar)"), b"Foo \x0a Bar") self.assertEqual(PDFParser.parse(r"(Foo \123 Bar)"), b"Foo S Bar") self.assertEqual(PDFParser.parse(r"(Foo \333 Bar)"), b"Foo \xdb Bar") self.assertEqual(PDFParser.parse("(Foo \xc4 Bar)"), b"Foo \xc4 Bar")
def test_name(self): self.assertEqual(PDFParser.parse("/Foo"), PDFName("/Foo")) self.assertEqual(PDFParser.parse("/Foo#20Bar"), PDFName("/Foo Bar"))
def test_float(self): self.assertEqual(PDFParser.parse("-9.13478"), -9.13478) self.assertEqual(PDFParser.parse("13.478"), 13.478) self.assertEqual(PDFParser.parse(".13478"), 0.13478) self.assertEqual(PDFParser.parse("000.33478"), 0.33478) self.assertEqual(PDFParser.parse("10.000"), 10.0)
def test_int(self): self.assertEqual(PDFParser.parse("013478"), 13478) self.assertEqual(PDFParser.parse("13478"), 13478) self.assertEqual(PDFParser.parse("-913478"), -913478)
def test_xref(self): self.assertEqual(PDFParser.parse("123 456 R"), PDFXRef(123, 456)) self.assertEqual( PDFParser.parse("[ 123 456 R 999 888 R ]"), [PDFXRef(123, 456), PDFXRef(999, 888)])
def _read_trailer(self): self._log.debug("Started reading trailer at 0x%x.", self._f.tell()) (trailer_data, delimiter) = self._f.read_until_token(b"startxref") trailer_data = trailer_data.decode("latin1") self._trailer = PDFParser.parse(trailer_data) self._f.seek(self._f.tell() - len(delimiter))
def _read_trailer(self, f): self._log.debug("Started reading trailer at 0x%x.", f.tell()) trailer_data = f.read_until_token(b"startxref", rewind=True) trailer_data = trailer_data.decode("latin1") trailer = PDFParser.parse(trailer_data) return trailer