/
validate.py
executable file
·513 lines (449 loc) · 18.3 KB
/
validate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
#!/usr/bin/env python2.7
"""
Validate html files for techint group website.
"""
import glob
import os
import pdb
import re
import sys
import HTMLParser
import urllib2
import xml.sax.saxutils as xml
from optparse import *
# ---------------------------------------------------------------------------
def main(args):
    """
    Command-line entry point: parse options, process each file, exit.

    `args` is the full argument vector with the program name first (the
    script passes sys.argv). Positional arguments after the program name
    are the files to process; when there are none, every *.html file in
    the current directory is processed. With -w/--w3c each file is handed
    to w3c_validate(), otherwise to check_file(). The function never
    returns normally: it ends with sys.exit(exit_value()).

    The -r/--rm flag is accepted and stored as `passrm`, but nothing in
    this function reads it.
    """
    parser = OptionParser()
    # (short flag, long flag, destination attribute, help text)
    flag_table = (
        ('-d', '--debug', 'debug', 'debug'),
        ('-w', '--w3c', 'w3c', 'send file to validator.w3.org'),
        ('-r', '--rm', 'passrm', 'rm validation output on pass'),
        ('-v', '--verbose', 'verbose', 'more output'),
    )
    for (short_opt, long_opt, dest, help_text) in flag_table:
        parser.add_option(short_opt, long_opt,
                          action='store_true', default=False,
                          dest=dest, help=help_text)

    (opts, positional) = parser.parse_args(args)
    if opts.debug:
        pdb.set_trace()
    verbose(opts.verbose)

    # positional[0] is the program name, so file arguments start at index 1.
    if len(positional) > 1:
        targets = positional[1:]
    else:
        targets = glob.glob("*.html")

    handler = w3c_validate if opts.w3c else check_file
    for filename in targets:
        if verbose():
            print(filename)
        handler(filename)

    sys.exit(exit_value())
# ---------------------------------------------------------------------------
def check_file(filename):
    """
    Validate one HTML file locally.

    Reads the whole file, builds a TIParser for it, feeds the parser the
    text, and then calls finish() so that required elements which never
    appeared are reported. Problems are reported by the parser itself;
    nothing is returned.
    """
    # 'with' guarantees the handle is closed even if read() raises.
    with open(filename, 'r') as f:
        html = f.read()
    tip = TIParser(filename, html)
    tip.feed(html)
    tip.finish()
# ---------------------------------------------------------------------------
def exit_value(val=0):
    """
    Record and return the exit value to be used when the process terminates.

    The value lives in the module-level global `xval`, created as 0 on
    first use. A non-zero `val` overwrites the stored value; a zero `val`
    (the default) leaves it alone, so a warning reported with 0 never
    clears an earlier error. Returns the stored value after any update.
    """
    global xval
    try:
        xval
    except NameError:
        # First call: the global has not been created yet.
        xval = 0
    if val != 0:
        xval = val
    return xval
# ---------------------------------------------------------------------------
def verbose(value=None):
    """
    Cache and return the value of the -v option.

    Called with an argument other than None, stores it in the
    module-level global `verbosity`. Called with no argument, just
    reports the stored setting, which defaults to False if it was never
    set. Returns the current setting in both cases.
    """
    global verbosity
    if value is not None:
        verbosity = value
    try:
        return verbosity
    except NameError:
        # Never set: fall back to quiet operation and remember that.
        verbosity = False
        return verbosity
# ---------------------------------------------------------------------------
def w3c_validate(filename):
    """
    Ask validator.w3.org to check the published copy of `filename`.

    The validator is given the public URL of the file (host and path are
    hard-coded below), not the local contents. Its HTML response is saved
    next to the input file as validation_<basename> and then passed to
    assess_validation().

    For index.html, the validation URL looks like
    http://validator.w3.org/check?uri=http%3A%2F%2Fusers.nccs.gov%2F~tpb%2Ftechint_olcf%2Findex.html&charset=%28detect+automatically%29&doctype=Inline&group=0&user-agent=W3C_Validator%2F1.3
    """
    # xml.escape() is used here as a simple search-and-replace: these are
    # the only characters in the query values that get percent-encoded.
    entities = {':': '%3a',
                '/': '%2F',
                '(': '%28',
                ')': '%29'}
    validator = "http://validator.w3.org"
    host = "http://users.nccs.gov/"
    path = "~tpb/techint"

    uri = xml.escape("uri=%s%s/%s" % (host, path, filename), entities)
    charset = xml.escape("charset=(detect+automatically)", entities)
    doctype = "doctype=Inline"
    group = "group=0"
    agent = xml.escape("user-agent=W3C_Validator/1.3", entities)
    url = "%s/check?%s&%s&%s&%s&%s" % (validator, uri, charset, doctype,
                                       group, agent)

    page = urllib2.urlopen(url)
    try:
        text = page.readlines()
    finally:
        # Release the connection even if reading the response fails.
        page.close()

    # Prefix only the basename so that a filename with a directory
    # component ("sub/index.html") yields "sub/validation_index.html"
    # rather than a path into a nonexistent "validation_sub" directory.
    # For a bare filename this is the same name as before.
    vname = os.path.join(os.path.dirname(filename),
                         "validation_%s" % os.path.basename(filename))
    with open(vname, 'w') as h:
        h.writelines(text)
    print("Validation output is in %s" % vname)
    assess_validation(vname)
# ---------------------------------------------------------------------------
def assess_validation(filename):
    """
    Summarize a saved validator response and clean up on success.

    Prints every line of `filename` that contains "Passed" and every
    line that contains class="msg". If any line contained "Passed", the
    file is deleted; otherwise it is left in place for inspection.

    NOTE(review): deletion on pass is unconditional here; the -r/--rm
    command-line option is not consulted by this function.
    """
    passed = False
    # 'with' closes the handle even if reading fails part-way through.
    with open(filename) as f:
        for line in f:
            if "Passed" in line:
                print(line)
                passed = True
            if 'class="msg"' in line:
                print(line)
    if passed:
        os.unlink(filename)
# ---------------------------------------------------------------------------
class TIParser(HTMLParser.HTMLParser):
    """
    HTMLParser subclass that applies the site's validation rules to one
    HTML file.

    Problems are printed through errmsg() as they are found. After
    feed() has been given the text, finish() must be called to report
    required elements that never appeared.
    """

    # Methods named handle_<x> that are parser callbacks or internal
    # helpers, not per-tag handlers. handle_named_tag() must never
    # dispatch to them: an HTML5 <data> element, for instance, would
    # otherwise call handle_data(tag, attrs) with the wrong arguments.
    _NOT_TAG_HANDLERS = frozenset(['handle_starttag',
                                   'handle_startendtag',
                                   'handle_endtag',
                                   'handle_data',
                                   'handle_decl',
                                   'handle_charref',
                                   'handle_entityref',
                                   'handle_comment',
                                   'handle_pi',
                                   'handle_named_tag'])

    # -----------------------------------------------------------------------
    def __init__(self, filename="", text=""):
        """
        Initialize the object with the data and flags we'll need to
        validate the input.

        `filename` is used only to label messages; `text` is the file
        contents, kept as a list of lines.

        The general strategy for checking for required elements is to
        create a flag here that starts out with the value 'missing',
        indicating the element has not yet been seen. When/if we come
        across the element, we'll update the flag to 'present' or
        whatever is appropriate. Then in finish(), we can check the
        flag and if it still says 'missing', we know we never found
        the required element.

        TAB characters are reported immediately, from here.
        """
        if verbose():
            print("TIParser.__init__")
        HTMLParser.HTMLParser.__init__(self)
        self.filename = filename
        self.text = text.split("\n")
        # deprecated tag -> suggested replacement
        self.deprecations = {'applet': '<object>',
                             'basefont': 'CSS',
                             'blackface': 'CSS',
                             'center': 'CSS',
                             'dir': '<ul>',
                             'embed': '<object>',
                             'font': 'CSS',
                             'strike': 'CSS',
                             }
        self.doctype = "missing"
        self.head = 'missing'
        self.body = 'missing'
        self.css = 'missing'
        self.filetype = 'missing'
        self.charset = 'missing'
        self.description = 'missing'
        self.title = 'missing'
        # Tags that are never pushed on (or popped from) the open-tag stack.
        self.nostack = ['p', 'br', 'meta', 'li', 'dd', 'dt']
        self.stack = []
        self.catch_tabs()

    # -----------------------------------------------------------------------
    def handle_startendtag(self, tag, attrs):
        """
        Called for self-closing tags such as <br />. Here we catch
        <script/>, which doesn't work for loading javascripts, and then
        run the standard tag checks. The tag is not pushed on the stack.
        """
        if verbose():
            print("TIParser.handle_startendtag(self, %s, %s)"
                  % (tag, attrs))
        if tag == 'script':
            self.errmsg("<script/> does not load javascript effectively."
                        + " Please use '<script ... > </script>' instead.")
        self.standard_tag_checks(tag, attrs)

    # -----------------------------------------------------------------------
    def handle_starttag(self, tag, attrs):
        """
        Called for each start tag. Run the standard tag checks (see
        standard_tag_checks()), then push the tag on the open-tag stack
        unless it is listed in self.nostack.
        """
        if verbose():
            print("TIParser.handle_starttag(self, %s, %s)"
                  % (tag, attrs))
        self.standard_tag_checks(tag, attrs)
        if tag not in self.nostack:
            self.stack.append(tag)

    # -----------------------------------------------------------------------
    def handle_endtag(self, tag):
        """
        This gets called when the parser sees an end tag. Records the
        closing of <head> and <body>, and checks that the end tag
        matches the most recently opened tag on the stack.
        """
        if verbose():
            print("TIParser.handle_endtag(self, %s)" % (tag))
        if tag == 'head':
            self.head = 'closed'
        if tag == 'body':
            self.body = 'closed'
        if tag in self.nostack:
            return
        if not self.stack:
            # A stray end tag with nothing open: report it instead of
            # crashing on pop() from an empty list.
            self.errmsg("</%s> has no matching start tag" % (tag))
            return
        pop = self.stack.pop()
        if tag != pop:
            self.errmsg("</%s> does not match <%s>" % (tag, pop))

    # -----------------------------------------------------------------------
    def handle_data(self, data):
        """
        This gets called when the parser sees data between tags. No
        checks are made on the data; it is only echoed in verbose mode.
        """
        if verbose():
            print("TIParser.handle_data(self, '%s')" % (data))

    # -----------------------------------------------------------------------
    def handle_decl(self, decl):
        """
        This gets called when the parser sees a declaration, like
        <!doctype ...>. Any declaration containing 'doctype'
        (case-insensitive) marks the doctype as present.
        """
        if verbose():
            print("TIParser.handle_decl(self, '%s')" % (decl))
        if 'doctype' in decl.lower():
            self.doctype = "present"

    # -----------------------------------------------------------------------
    def handle_a(self, tag, attrs):
        """
        This gets called when we see an <a> tag. An absolute http(s)
        link must carry a target attribute.
        """
        ad = dict(attrs)
        # href is None for a valueless attribute (<a href>), so guard it.
        href = ad.get('href')
        if href is not None \
                and href.startswith(('http:', 'https:')) \
                and 'target' not in ad:
            self.errmsg("External link with no target attribute")

    # -----------------------------------------------------------------------
    def handle_div(self, tag, attrs):
        """
        This gets called on <div> tags. Nested <div> detection is
        currently disabled, so this does nothing.
        """
        if 'div' in self.stack:
            # self.errmsg('warning: nested <div> tags detected', 0)
            pass

    # -----------------------------------------------------------------------
    def handle_head(self, tag, attrs):
        """
        This gets called when the parser sees a <head> tag.
        """
        self.head = 'open'

    # -----------------------------------------------------------------------
    def handle_body(self, tag, attrs):
        """
        This gets called when the parser sees a <body> tag.
        """
        self.body = 'open'

    # -----------------------------------------------------------------------
    def handle_img(self, tag, attrs):
        """
        This gets called on <img> tags, which must have an 'alt'
        attribute.
        """
        if 'alt' not in [n for (n, v) in attrs]:
            self.errmsg("<img> tag needs an 'alt' attribute")

    # -----------------------------------------------------------------------
    def handle_input(self, tag, attrs):
        """
        This gets called on <input> tags, which must have an 'alt'
        attribute.
        """
        if 'alt' not in [n for (n, v) in attrs]:
            self.errmsg("<input> tag needs an 'alt' attribute")

    # -----------------------------------------------------------------------
    def handle_link(self, tag, attrs):
        """
        This gets called when the parser sees a <link> tag. At least
        one is required to specify the CSS file: it must have both
        rel="stylesheet" and type="text/css".
        """
        if ('rel', 'stylesheet') in attrs and ('type', 'text/css') in attrs:
            self.css = "present"

    # -----------------------------------------------------------------------
    def handle_meta(self, tag, attrs):
        """
        Handling for meta tags. Three things are picked up from meta
        tags: filetype (e.g., <meta name="keywords" content="index" />),
        the page description (<meta name="description" ...>), and
        charset (e.g., <meta charset="utf-8" />). They can be specified
        in a single meta tag or split up.
        """
        ad = dict(attrs)
        name = ad.get('name')
        if name == 'keywords' and 'content' in ad:
            self.filetype = ad['content']
        if name == 'description':
            self.description = 'present'
        if 'charset' in ad:
            self.charset = 'present'

    # -----------------------------------------------------------------------
    def handle_script(self, tag, attrs):
        """
        Handle <script> tags: warn (without affecting the exit value)
        when one appears inside <head>.
        """
        if 'head' in self.stack:
            self.errmsg('Please put your <script> tags at the end of '
                        + '<body> rather than in <head>', 0)

    # -----------------------------------------------------------------------
    def handle_title(self, tag, attrs):
        """
        Note that we've seen a title tag.
        """
        self.title = 'present'

    # -----------------------------------------------------------------------
    def handle_named_tag(self, tag, attrs):
        """
        Look for a method named 'handle_<tagname>'. If it exists, call
        it with the tag and attribute list as arguments. This makes it
        easy to add handlers for specific tags.

        Names in _NOT_TAG_HANDLERS are skipped: they are parser
        callbacks with different signatures, not tag handlers.
        """
        mname = 'handle_%s' % tag
        if mname in self._NOT_TAG_HANDLERS:
            return
        handler = getattr(self, mname, None)
        if callable(handler):
            handler(tag, attrs)

    # -----------------------------------------------------------------------
    def catch_tabs(self):
        """
        Scan the input text and report the location of the first TAB
        character on each line that contains one.
        """
        for (lnum, line) in enumerate(self.text, 1):
            cnum = line.find("\t")
            if 0 <= cnum:
                self.errmsg("TAB detected in input. Please use spaces.",
                            pos=(lnum, cnum))

    # -----------------------------------------------------------------------
    def catch_unquoted_attrs(self, text, attrlist):
        """
        Here we check to make sure attributes inside HTML tags are
        quoted. `text` is the raw start tag and `attrlist` the parsed
        (name, value) pairs; each pair must appear in the raw text as
        name = 'value' or name = "value".
        """
        if not attrlist:
            return
        unescaped = self.unescape(text)
        for (an, av) in attrlist:
            if av is None:
                # Valueless attribute (e.g. <input disabled>): there is
                # nothing to quote, and re.escape(None) would raise.
                continue
            rgx = r"""%s\s*=\s*['"]%s["']""" % (re.escape(an), re.escape(av))
            # The parser lowercases attribute names, so match the raw
            # text case-insensitively to avoid false reports on HREF="x".
            if re.search(rgx, unescaped, re.IGNORECASE) is None:
                self.errmsg("unquoted attribute in '%s'" % (text))

    # -----------------------------------------------------------------------
    def catch_deprecated_tags(self, tag):
        """
        Here we report any deprecated tags we encounter, as a warning
        that does not affect the exit value.
        """
        if tag in self.deprecations:
            self.errmsg("Tag '<%s>' is deprecated. Consider using %s instead"
                        % (tag, self.deprecations[tag]),
                        0)

    # -----------------------------------------------------------------------
    def catch_uppercase_tags(self, tag):
        """
        Here we report uppercase tags. The parser hands us the tag name
        already lowercased, so the name is re-read from the raw start
        tag text.
        """
        raw = self.get_starttag_text()
        q = re.search(r"<\s*(\w+)\s*", raw or "")
        if q is None:
            return
        txt = q.group(1)
        if any(c.isupper() for c in txt):
            self.errmsg("Tags like '<%s>' containing uppercase letters are deprecated"
                        % (txt))

    # -----------------------------------------------------------------------
    def errmsg(self, msg, exit_val = 1, pos = None):
        """
        Format and print an error message -- filename, location in
        file, and message. Optionally, the caller can set the exit
        value to be used when the process terminates. By default that's
        1, indicating an issue, but the caller can set exit_val to 0 for
        a warning that won't prevent downstream processing.

        `pos` is an optional (line, offset) pair; when omitted, the
        parser's current position is used.
        """
        if pos is None:
            (line, offset) = self.getpos()
        else:
            (line, offset) = pos
        fmsg = "%s[%d,%d]: %s" % (self.filename, line, offset, msg)
        print(fmsg)
        exit_value(exit_val)

    # -----------------------------------------------------------------------
    def finish(self):
        """
        This is the last method called for an HTML file so we can
        report required tags that were never seen, etc.
        """
        if verbose():
            print("TIParser.finish()")
        for tag in ['head', 'body']:
            state = getattr(self, tag)
            if state == 'missing':
                self.errmsg('%s tag not found' % (tag))
            elif state != 'closed':
                self.errmsg('%s tag not complete' % (tag))
        if self.filetype == 'missing':
            self.errmsg("Filetype missing. Please add "
                        + "'<meta name=\"keywords\" content=\"[ft]\" /> "
                        + "where 'ft' is one of 'about', 'proj', 'member', "
                        + "'contact', 'jobs', 'nav', 'pub', or 'software' "
                        + "in the <head> section.")
        # Checked independently of filetype: a file can lack both.
        if self.title == 'missing':
            self.errmsg("A <title> tag is needed for this file.", 0)
        if self.charset == 'missing':
            self.errmsg("Charset not specified. Please add "
                        + "<meta charset='utf-8' /> "
                        + "in the <head> section.")
        if self.css == 'missing':
            self.errmsg("No CSS link found in <head>. Please add at least "
                        + "<link rel='stylesheet' type='text/css' "
                        + "href='techint_f.css' />")
        if self.description == 'missing':
            self.errmsg("File description not found. Please add at least "
                        + '<meta name="description" content="page description"> '
                        + 'in the <head> section.')

    # -----------------------------------------------------------------------
    def standard_tag_checks(self, tag, attrs):
        """
        This method is called by both handle_starttag() and
        handle_startendtag(). It represents the common code that needs
        to be run for each start tag, whether it contains text before
        the end tag or the end is included with the start tag.

        At the very first tag, we should have already seen a
        <!doctype...>, so if not, we complain. Once we've complained
        once, we change the string in self.doctype so we won't report
        the missing <!doctype> again.

        The open-tag stack is then used to check document structure:
        the outermost tag must be <html>, and only <head> and <body>
        may appear directly inside it. Inline styling (a <style> tag or
        a style attribute) draws a warning.

        The method handle_named_tag() looks to see whether a method
        named handle_<tagname>() exists or not. If it does,
        handle_named_tag() will call it with the tag and attrs list as
        arguments. This makes it quick and easy to add handling for a
        specific tag.

        Next, we check for unquoted attributes, deprecated tags, and
        uppercase tags.
        """
        if self.doctype == "missing":
            self.errmsg("A <!doctype ...> is required at the top of the file")
            self.doctype = "reported"

        depth = len(self.stack)
        if depth == 0 and tag != 'html':
            self.errmsg("The top level tag should be <html>")
        elif depth == 1 and tag != 'head' and tag != 'body':
            self.errmsg("stray '%s' tag found" % tag)

        if tag == 'style' or 'style' in [n for (n, v) in attrs]:
            self.errmsg('warning: external styling is prefered', 0)

        self.handle_named_tag(tag, attrs)
        self.catch_unquoted_attrs(self.get_starttag_text(), attrs)
        self.catch_deprecated_tags(tag)
        self.catch_uppercase_tags(tag)
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Run as a script: main() receives the full argument vector,
    # program name included, and terminates the process itself.
    main(sys.argv)