/
pdfg.py
201 lines (164 loc) · 4.73 KB
/
pdfg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python
# -*- python -*-
__description__ = 'Tool to create a PDF object graph'
__author__ = 'backspace____'
__version__ = '0.0.1'
__date__ = '2012/09/17'
"""
Tool to create a PDF object graph
no Copyright
Use at your own risk
History:
2012/09/17: start
Todo:
"""
import pickle
import argparse
import re
import sys
from collections import deque
import gv
#from pygraph.classes.graph import graph
from pygraph.classes.digraph import digraph
from pygraph.algorithms.searching import breadth_first_search
from pygraph.readwrite.dot import write
# Token consisting only of decimal digits (used for object and generation
# numbers).  Raw strings keep the backslash escape from being interpreted by
# the string literal itself.
isint = re.compile(r'^\d+$')
# Token that contains a '/' followed by at least one more character, e.g.
# "/Kids" or "R/Next".
isname = re.compile(r'^.*/.+?')
def isWhiteSpace(code):
    """Tell whether a character code is treated as white space.

    code: decimal character code (None is accepted and yields False).
    Returns True for NUL (0), TAB (9), LF (10), FF (12), CR (13) and
    SPACE (32); False for anything else.
    """
    return code in (0, 9, 10, 12, 13, 32)
def isDelimiter(code):
    """Tell whether a character code is treated as a token delimiter.

    code: decimal character code (None is accepted and yields False).
    Returns True for ( ) < > [ ] { } and %; False for anything else.
    """
    # 0x2F ('/') is not in this set (it was commented out of the original
    # condition), so a '/' stays inside the surrounding token.
    return code in (
        0x28,  # (
        0x29,  # )
        0x3C,  # <
        0x3E,  # >
        0x5B,  # [
        0x5D,  # ]
        0x7B,  # {
        0x7D,  # }
        0x25,  # %
    )
def allNone(list):
    """Tell whether the first three entries of the token window are all None.

    list: indexable sequence with at least three entries (Main() passes a
          deque of three tokens).
    Returns True when entries 0, 1 and 2 are None, False otherwise.
    """
    # Identity test instead of "== None": None is a singleton and "is" cannot
    # be redirected by a custom __eq__.
    return all(list[index] is None for index in range(3))
# Parser state updated by isObjToken() (getObjNum() reads `trailer`):
#   endobj  -- True after an "endobj" keyword, until the next object header.
#   trailer -- True while the tokens being scanned follow a "trailer" keyword.
endobj = False
trailer = False


def isObjToken(list):
    """Tell whether the token window starts a new graph node.

    list: window of three tokens (entries may be None at end of file).
    Returns True when the window is an object header ("<num> <gen> obj") or
    when its last token is the "trailer" keyword following an "endobj";
    False otherwise.
    Side effects: updates the module-level flags `endobj` and `trailer`.
    """
    global endobj
    global trailer
    first, second, keyword = list[0], list[1], list[2]
    # Tokens are None once the input is exhausted; a regex match on None
    # raises TypeError, so only match real strings.
    if (keyword == "obj"
            and first is not None and second is not None
            and isint.match(first) and isint.match(second)):
        endobj = False
        # A new object header ends any trailer section seen earlier;
        # without this reset every later object would be reported as
        # "trailer" by getObjNum().
        trailer = False
        return True
    if keyword == "endobj":
        endobj = True
    if endobj and keyword == "trailer":
        trailer = True
        return True
    return False
def getObjNum(list):
    """Return the node name for the object the token window introduces.

    list: window of three tokens.
    Returns the string "trailer" while the module-level `trailer` flag is
    set, otherwise the first token of the window.
    """
    return "trailer" if trailer else list[0]
def isIndirect(list):
    """Tell whether the token window is an indirect reference.

    list: window of three tokens (entries may be None at end of file).
    Returns True for "<num> <gen> R", or when the third token starts with
    "R/" (a name glued to the R keyword, since '/' does not split tokens);
    False otherwise.
    """
    number, generation, keyword = list[0], list[1], list[2]
    # Tokens are None once the input is exhausted; a regex match on None
    # raises TypeError, so a window with a missing token is never a reference.
    if number is None or generation is None or keyword is None:
        return False
    if not (isint.match(number) and isint.match(generation)):
        return False
    return keyword == "R" or keyword.startswith("R/")
def getIndirectNum(list):
    """Return the first token of the window.

    list: window of three tokens; Main() calls this only when isIndirect()
          accepted the window, so the first token is the referenced object
          number.
    """
    referenced = list[0]
    return referenced
def getAttrName(list, default):
    """Extract an attribute name from the last token of the window.

    list:    window of three tokens.
    default: value returned when the last token carries no name.
    Returns the part of the last token after its final '/' when the token
    matches `isname`; otherwise `default` (also when the token is empty or
    None).
    """
    candidate = list[2]
    if candidate and isname.match(candidate):
        # Drop everything up to and including the last '/'.
        return re.sub(r'^.*/', '', candidate)
    return default
def debug(msg, flag):
    """Print msg to standard output when flag is true.

    msg:  value to print.
    flag: truthy to enable the output, falsy to stay silent.
    """
    if flag:
        # Call form instead of the print statement: for a single argument it
        # prints the same text on Python 2 and is valid syntax on Python 3.
        print(msg)
class Tokenizer:
    """Split a file into tokens separated by white space or delimiters.

    a part of this class is from pdf-parser.py
    http://didierstevens.com/files/software/pdf-parser_V0_3_9.zip
    """

    def __init__(self, file):
        """Open the input for binary reading.

        file: path of the file to tokenize.
        """
        self.file = file
        self.infile = open(file, 'rb')
        # Not read by any method of this class.
        self.ungetted = []
        self.position = -1

    def close(self):
        """Close the input file; byte() and token() return None afterwards."""
        self.infile.close()

    def byte(self):
        """Return the next byte as an int, or None at end of input.

        The file is closed as soon as the end is reached; later calls keep
        returning None.
        """
        if self.infile.closed:
            return None
        inbyte = self.infile.read(1)
        if not inbyte:
            self.infile.close()
            return None
        return ord(inbyte)

    def token(self):
        """Return the next token as a string, or None when no token is left.

        White space and delimiter bytes are skipped, including the one that
        ends the token.  The result is also stored in self.token_str (an
        empty string when None is returned).
        """
        self.token_str = ''
        code = self.byte()
        # Skip the separators in front of the token.
        while code is not None and (isWhiteSpace(code) or isDelimiter(code)):
            code = self.byte()
        if code is None:
            return None
        # Collect characters and join once instead of growing a string.
        chars = []
        while not (code is None or isWhiteSpace(code) or isDelimiter(code)):
            chars.append(chr(code))
            code = self.byte()
        self.token_str = ''.join(chars)
        return self.token_str
def Main():
    """Build the object graph of a PDF file and render it as a PNG image.

    Reads the command line (input path, -T/--debug-with-token,
    -o/--output-png), slides a three-token window over the file, adds a
    graph node for every object header / trailer and an edge for every
    indirect reference found inside it, then renders the graph with the
    'dot' layout.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("input_pdf", type=str,
                            help="pdf file that will be analysed.")
    arg_parser.add_argument("-T", "--debug-with-token", action="store_true",
                            help="print token strings.")
    arg_parser.add_argument("-o", "--output-png", default="out.png",
                            help="path to a png file to be written.",
                            type=str)
    options = arg_parser.parse_args()

    tokenizer = Tokenizer(options.input_pdf)
    # Sliding window over three consecutive tokens.
    window = deque([tokenizer.token(), tokenizer.token(), tokenizer.token()])
    # NOTE(review): current_obj is "" until the first object header; "" is
    # never added as a node, so confirm how add_edge behaves if a reference
    # shows up before any header.
    current_obj = ""
    edge_label = ""
    graph = digraph()

    while not allNone(window):
        if isObjToken(window):
            # A new object begins: forget the attribute name seen so far.
            edge_label = ""
            current_obj = getObjNum(window)
            if not graph.has_node(current_obj):
                graph.add_node(current_obj)
        elif isIndirect(window):
            target = getIndirectNum(window)
            if not graph.has_node(target):
                graph.add_node(target)
            link = (current_obj, target)
            if not graph.has_edge(link):
                graph.add_edge(link, label=edge_label)
        # Updated after the edge is added, so a name glued to the reference
        # keyword ("R/Next") labels the next edge, not this one.
        edge_label = getAttrName(window, edge_label)
        debug(window.popleft(), options.debug_with_token)
        window.append(tokenizer.token())

    dot_source = write(graph)
    rendered = gv.readstring(dot_source)
    gv.layout(rendered, 'dot')
    gv.render(rendered, 'png', options.output_png)
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    Main()