-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
489 lines (405 loc) · 15.6 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
from bs4 import BeautifulSoup
from spellchecker import SpellChecker
from urllib.parse import urlparse
import urllib.request
import cssutils
import requests
import logging
import re
import colorsys
import webcolors
from time import sleep
import string
# When True, every result payload is also emitted to the client over
# socketio (see create_*_json); when False, results are only printed.
PRODUCTION = True
# Browser-like request headers so target sites don't reject us as a bot.
headers = {
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"accept-encoding": "gzip, deflate, sdch, br"
}
# Distinct inaccessible color values collected across all stylesheets;
# filled by find_inaccessible_colors, summarized by css_parse.
INACCESSIBLE_COLORS_FOUND = []
# Links ending in these extensions are binary assets, not crawlable pages.
BAD_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.pdf', '.svg', '.gif', '.webm']
# Example:
# response = requests.get(url, headers=headers)
def get_domain(URL):
    """Return the scheme + host of *URL* with a trailing slash,
    e.g. 'https://example.com/'."""
    parts = urlparse(URL)
    return f"{parts.scheme}://{parts.netloc}/"
# THIS ASSUMES THAT URL IS VALID, FRONT END VALIDATES IT
# IT ALSO ASSUMES IT'S IN HTTP OR HTTPS
def recieve_front_end_link(URL, socketio):
    """Crawl every same-domain page reachable from *URL* (depth-first) and
    run the analysis suite on each one, emitting results through *socketio*.

    Pages are visited at most once; links whose path contains an extension
    in BAD_EXTENSIONS (images, PDFs, ...) are skipped.
    """
    try:
        # Reachability probe only -- the response body itself is unused.
        requests.get(URL, headers=headers)
    except requests.RequestException:
        create_error_json("Couldn't reach provided URL", "error", URL, socketio,
                          text="Couldn't reach provided URL", meta="")
        return  # nothing to crawl if the entry point is unreachable
    base_url = urlparse(URL).netloc
    visited_set = set()
    stack = [URL]
    while stack:
        curr_url = stack.pop()
        if curr_url in visited_set:
            continue
        visited_set.add(curr_url)
        try:
            resource = urllib.request.urlopen(curr_url)
            soup = BeautifulSoup(resource, from_encoding=resource.info().get_param('charset'), features="lxml")
            if PRODUCTION:
                run_prod(soup, curr_url, socketio)
            else:
                run_debug(soup, curr_url, socketio)
            # Relative links: any href that does not contain "http".
            for link in soup.findAll('a', attrs={'href': re.compile("^((?!http).)*$")}):
                linked_page = "http://" + base_url + "/" + link.get('href')
                if (linked_page not in visited_set
                        and urlparse(linked_page).netloc == base_url
                        and not any(ext in linked_page for ext in BAD_EXTENSIONS)):
                    stack.append(linked_page)
            # Absolute links: keep only those that stay on the same domain.
            for link in soup.findAll('a', attrs={'href': re.compile("^https?://")}):
                linked_page = link.get('href')
                if (linked_page not in visited_set
                        and urlparse(linked_page).netloc == base_url
                        and not any(ext in linked_page for ext in BAD_EXTENSIONS)):
                    stack.append(linked_page)
        except Exception:
            # Best effort: a page that cannot be fetched or parsed is skipped.
            pass
def run_prod(soup, URL, socketio):
    """Run the full production check suite against one parsed page, then
    announce completion to the client."""
    checks = (
        find_too_many_h1s,
        find_broken_links,
        find_broken_buttons,
        find_inline_styles,
        css_parse,
        find_spelling_errors,
    )
    for check in checks:
        check(soup, URL, socketio)
    socketio.emit('data', {"severity": "info", "type": "text", "text": "Completed testing."})
def run_debug(soup, URL, socketio):
    """Run the same checks as run_prod (spell check before CSS here),
    without the final completion emit."""
    checks = (
        find_too_many_h1s,
        find_broken_links,
        find_broken_buttons,
        find_inline_styles,
        find_spelling_errors,
        css_parse,
    )
    for check in checks:
        check(soup, URL, socketio)
def css_parse(soup, URL, socketio):
    """Download each first-party stylesheet linked from the page and run the
    CSS checks (contrast, small text, inaccessible colors) over every rule.

    Per-issue errors are emitted as they are found; summary success/failure
    messages are emitted once all stylesheets have been scanned.
    """
    DOMAIN = get_domain(URL)
    # Stylesheets may be declared by type or by rel; collect both forms.
    cssLinkElements1 = soup.findAll("link", type="text/css")
    cssLinkElements2 = soup.findAll("link", rel="stylesheet")
    cssLinkLists = cssLinkElements1 + cssLinkElements2
    finalCSSLinks = []
    for cssLink in cssLinkLists:
        # Skip vendored/third-party sheets (bootstrap, vendor bundles,
        # absolute http(s) URLs) -- only the site's own CSS is analyzed.
        if "bootstrap" in str(cssLink) or "vendor" in str(cssLink) or "http" in str(cssLink):
            pass
        elif cssLink not in finalCSSLinks:
            finalCSSLinks.append(cssLink)
    first_bool = True  # each check announces itself only on the first rule
    smt_success_bool = True # assume successs
    inaccess_success_bool = True # assume success
    contrast_success_bool = True # assume success
    if finalCSSLinks != []:
        for item in finalCSSLinks:
            stylesheetName = item['href']
            create_print_json("css stylesheet: " + str(stylesheetName), socketio)
            fullCSSStyleLink = DOMAIN + item['href']
            # Silence cssutils' noisy parse warnings.
            cssutils.log.setLevel(logging.CRITICAL)
            sheet = cssutils.parseUrl(fullCSSStyleLink)
            for rule in sheet:
                if rule.type == rule.STYLE_RULE:
                    try:
                        contrast_bool = find_contrast(soup, URL, first_bool, stylesheetName, fullCSSStyleLink, rule, socketio)
                        if contrast_bool == False:
                            contrast_success_bool = False # once it turns false, it's not going back to True
                    except:
                        pass
                    # HERE WE LOOP OVER THE PROPERTIES
                    # THIS IS WHERE WE CALL ALL THE FUNCTIONS TO CHECK AT THE SAME TIME
                    # SO WE ONLY HAVE TO LOOP OVER THE STYLE SHEET ONCE
                    for cssProperty in rule.style:
                        try:
                            smt_bool = find_small_text(soup, URL, cssProperty, first_bool, stylesheetName, fullCSSStyleLink, rule, socketio)
                            if smt_bool == False: # error
                                smt_success_bool = False # once it turns false, it's not going back to True
                        except:
                            pass
                        try:
                            inaccess_bool = find_inaccessible_colors(soup, URL, cssProperty, first_bool, stylesheetName, fullCSSStyleLink, rule, socketio)
                            if inaccess_bool == False:
                                inaccess_success_bool = False # once it turns false, it's not going back to True
                        except:
                            pass
            first_bool = False
        if smt_success_bool: # if this stayed true the whole time it's a success
            create_success_json("small text", URL, socketio)
        if inaccess_success_bool:
            create_success_json("inaccessible colors", URL, socketio)
        else:
            TYPE = "inaccessible colors"
            SEVERITY = "warning"
            text = "We found " + str(len(INACCESSIBLE_COLORS_FOUND)) + " inaccessible colors."
            # NOTE(review): fullCSSStyleLink here is whichever stylesheet the
            # loop visited last -- confirm that's the intended report target.
            create_error_json(TYPE, SEVERITY, fullCSSStyleLink, text=text, meta=str(INACCESSIBLE_COLORS_FOUND), socketio=socketio)
        if contrast_success_bool:
            create_success_json("accessibility for colorblind users", URL, socketio)
        else:
            # NOTE(review): a contrast FAILURE emits a "CSS design" success
            # message -- this looks inverted; verify intended behavior.
            create_success_json("CSS design", URL, socketio)
def find_contrast(soup, URL, first_bool, stylesheetName, fullCSSStyleLink, rule, socketio):
    """Check one CSS rule's color / background-color pair for adequate
    lightness contrast (important for colorblind users).

    Returns True when the pair is acceptable (or the rule does not set both
    properties), False when a contrast error was reported.
    """
    TYPE = "accessibility for colorblind users"
    SEVERITY = "warning"
    if first_bool:
        create_print_json(TYPE, socketio)
    if not (rule.style['color'] and rule.style['background-color']):
        return True  # nothing to compare
    color = str(rule.style['color'])
    backgroundColor = str(rule.style['background-color'])

    def _to_hex(value):
        # Normalize a CSS color (name, #rgb, #rrggbb, or rgb(...)) to #rrggbb.
        if value.startswith("#"):
            if len(value) == 4:  # shorthand like #fff -> #ffffff
                value = "#" + "".join(ch * 2 for ch in value[1:])
            return value
        if value.startswith("rgb"):
            # webcolors.rgb_to_hex expects an integer triplet, not the raw
            # "rgb(r, g, b)" string (passing the string always raised).
            r, g, b = (int(v) for v in re.findall(r"\d+", value)[:3])
            return webcolors.rgb_to_hex((r, g, b))
        return webcolors.name_to_hex(value)

    colorHex = _to_hex(color)
    backgroundHex = _to_hex(backgroundColor)
    val = distinguish_hex(colorHex, backgroundHex)
    if val == []:  # lightness difference is already sufficient
        return True
    text = ("Bad contrast ratio between: " + color + " and " + backgroundColor
            + ". Consider changing them to similar colors: " + str(val))
    create_error_json(TYPE, SEVERITY, fullCSSStyleLink, text=text, meta=rule.cssText, socketio=socketio)
    return False
# searches for red, green
def find_inaccessible_colors(soup, URL, cssProperty, first_bool, stylesheetName, fullCSSStyleLink, rule, socketio):
    """Record pure red/green text colors, which colorblind users cannot
    distinguish.  Returns False when a bad color was found."""
    TYPE = "inaccessible colors"
    inaccessible_colors = ["red", "green", "#ff0000", "#00ff00"]
    if first_bool:
        create_print_json(TYPE, socketio)
    # Only the 'color' property is inspected here.
    if cssProperty.name != "color":
        return None
    our_color_value = cssProperty.value
    if not if_bad_color(str(our_color_value), inaccessible_colors, 1):
        return None
    # Keep the global report list free of duplicates.
    if our_color_value not in INACCESSIBLE_COLORS_FOUND:
        INACCESSIBLE_COLORS_FOUND.append(str(our_color_value))
    return False  # error found
def if_bad_color(color, bad_colors_list, issue):
    """Return True when *color* (a CSS name, '#rrggbb' hex, or 'rgb(...)'
    string) is listed in *bad_colors_list* or fails the per-channel RGB
    test for *issue*."""
    # Direct match on a named color such as "red".
    if color in bad_colors_list:
        return True
    # Hex form: compare the bare digits, else fall back to the RGB test.
    if color.startswith("#") and len(color) == 7:
        bare = color[1:]
        if bare in bad_colors_list:
            return True
        return test_if_bad_rgb(convert_hex_to_rgb(bare), issue)
    # rgb(...) form goes straight to the channel test.
    if color.startswith("rgb"):
        return test_if_bad_rgb(color, issue)
    # Anything else is considered accessible.
    return False
def test_if_bad_rgb(rgb_string, issue=1):
rgbTuple = eval((rgb_string[3:]))
r = rgbTuple[0]
g = rgbTuple[1]
b = rgbTuple[2]
# then we can't have too much green or too much red
if issue == 1:
MAX_BAD = 125
MIN_OTHER = 90
if g > MAX_BAD and r < MIN_OTHER and b < MIN_OTHER:
return True
elif r > MAX_BAD and g < MIN_OTHER and b < MIN_OTHER:
return True
return False
def convert_hex_to_rgb(h):
    """Convert a bare 6-digit hex string ('ff0000') to 'rgb(255, 0, 0)'."""
    channels = tuple(int(h[pos:pos + 2], 16) for pos in range(0, 6, 2))
    return "rgb" + str(channels)
# loops over properties
def find_small_text(soup, URL, cssProperty, first_bool, stylesheetName, fullCSSStyleLink, rule, socketio):
    """Flag font-size declarations below 12px.

    Returns False when a too-small size was reported; None otherwise."""
    MINIMUM_SIZE_FONT = 12
    TYPE = "small text"
    SEVERITY = "warning"
    if first_bool:
        create_print_json(TYPE, socketio)
    # Only pixel-valued font-size declarations are checked.
    if cssProperty.name != "font-size" or str(cssProperty.value)[-2:] != "px":
        return None
    size = int(str(cssProperty.value[:-2]))  # drop the trailing "px"
    if size < MINIMUM_SIZE_FONT:
        text = "You have a font size of " + str(size) + "px on stylesheet: " + stylesheetName
        create_error_json(TYPE, SEVERITY, fullCSSStyleLink, text=text, meta=rule.cssText, socketio=socketio)
        return False  # error found
def find_broken_links(soup, URL, socketio):
    """Request every anchor target on the page and report any that answer
    with status >= 301 (LinkedIn's 999 is tolerated).

    Anchors with no href at all, and mailto:/tel: links, are skipped --
    previously a missing href made `"mailto:" in None` raise, aborting the
    whole check and falsely reporting success.
    """
    TYPE = "possible broken link"
    SEVERITY = "error"
    create_print_json(TYPE, socketio)
    success = True
    try:
        DOMAIN = get_domain(URL)
        for htmlAnchor in soup.find_all('a'):
            link = htmlAnchor.get('href')
            # Unfetchable targets: no href, or non-HTTP schemes.
            if not link or link.startswith("mailto:") or link.startswith("tel:"):
                continue
            try:
                response = requests.get(link, headers=headers)
            except requests.RequestException:
                # Probably a relative link; retry against the page's domain.
                link = DOMAIN + link
                response = requests.get(link, headers=headers)
            if int(response.status_code) >= 301 and int(response.status_code) != 999:
                text = "You have a link which returned a bad " + str(response.status_code) + " response code."
                create_error_json(TYPE, SEVERITY, URL, text=text, meta=htmlAnchor, socketio=socketio)
                success = False
        if success:
            create_success_json(TYPE, URL, socketio)
    except Exception:
        # Best effort: an unexpected failure is treated as a pass rather
        # than crashing the crawl (preserves original behavior).
        create_success_json(TYPE, URL, socketio)
def find_too_many_h1s(soup, URL, socketio):
    """Warn when a page carries more than one <h1> element."""
    TYPE = "too many header elements"
    SEVERITY = "warning"
    create_print_json(TYPE, socketio)
    try:
        h1TagsList = soup.find_all('h1')
        if len(h1TagsList) <= 1:
            create_success_json(TYPE, URL, socketio)
        else:
            text = "You have " + str(len(h1TagsList)) + " h1 elements on " + str(URL)
            create_error_json(TYPE, SEVERITY, URL, text=text, meta=h1TagsList, socketio=socketio)
    except:
        # Best effort: a parse failure counts as a pass.
        create_success_json(TYPE, URL, socketio)
# severity types: warning, error
def create_error_json(type, severity, URL, socketio, lineNumber=-1, text="", meta=""):
    """Build one error payload, print it, and (in production) emit it to
    the client.  Returns the payload dict."""
    payload = {
        "type": type,
        "severity": severity,
        "URL": URL,
        "lineNumber": lineNumber,
        "text": text,
        "meta": str(meta),
    }
    print(payload)
    if PRODUCTION:
        socketio.emit('data', payload)
    return payload
# severity types: info
def create_print_json(TYPE, socketio):
    """Announce that the *TYPE* analysis is starting.  Returns the payload."""
    payload = {"severity": "info", "text": ("Running analysis of " + str(TYPE) + "... ")}
    print(payload)
    if PRODUCTION:
        socketio.emit('data', payload)
    return payload
# severity type: success
def create_success_json(TYPE, URL, socketio):
    """Report that the *TYPE* check passed for *URL*.  Returns the payload."""
    payload = {
        "severity": "success",
        "URL": URL,
        "type": str(TYPE),
        "text": "Success, " + (str(TYPE)) + " test passed!",
    }
    print(payload)
    if PRODUCTION:
        socketio.emit('data', payload)
    return payload
def find_inline_styles(soup, URL, socketio):
    """Report every element carrying an inline style="" attribute."""
    TYPE = 'inline styles'
    SEVERITY = 'warning'
    create_print_json(TYPE, socketio)
    try:
        styled_elements = soup.find_all(style=True)
        if not styled_elements:
            create_success_json(TYPE, URL, socketio)
            return
        for element in styled_elements:
            text = "You have an inline styled elements on " + str(URL)
            create_error_json(TYPE, SEVERITY, URL, text=text, meta=element, socketio=socketio)
    except:
        # Best effort: a parse failure counts as a pass.
        create_success_json(TYPE, URL, socketio)
def find_spelling_errors(soup, URL, socketio):
    """Spell-check the page's visible text, reporting at most six
    misspelled words before giving up."""
    TYPE = 'spell check'
    SEVERITY = 'warning'
    MIN_LENGTH = 14
    create_print_json(TYPE, socketio)
    try:
        counter = 0
        # Remove <script> bodies so JavaScript isn't spell-checked.
        for script in soup('script'):
            script.extract()
        page_text = re.sub(r'[\n]', '', soup.get_text())
        spell = SpellChecker()
        misspelled_word = False
        for word in page_text.split(' '):
            word = word.translate(str.maketrans('', '', string.punctuation))
            # Skip very long tokens, empty strings, and capitalized words
            # (likely proper nouns).
            if len(word) < MIN_LENGTH and word != '' and word[0] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
                if spell.correction(word) != word:
                    misspelled_word = True
                    counter += 1
                    text = "You have a misspelled word at " + str(URL)
                    create_error_json(TYPE, SEVERITY, URL, text=text, meta=word, socketio=socketio)
                    if counter > 5:
                        return
        if not misspelled_word:
            create_success_json(TYPE, URL, socketio)
    except:
        # Best effort: any failure counts as a pass.
        create_success_json(TYPE, URL, socketio)
def find_broken_buttons(soup, URL, socketio):
    """Report buttons and button-wrapped anchors that have no href target
    (and so do nothing when clicked).

    Buttons driven by data-target (e.g. Bootstrap modals) are tolerated.
    Fix: the div scan previously contained a duplicated inner loop that
    reported every broken anchor twice.
    """
    TYPE = 'broken button'
    SEVERITY = 'warning'
    create_print_json(TYPE, socketio)
    broken_button = False
    # <button> elements with no href attribute at all.
    button_href = soup.find_all('button', {"href": False})
    if "data-target" not in str(button_href):
        if len(button_href) != 0:
            text = "You have a button without an href at " + str(URL)
            create_error_json(TYPE, SEVERITY, URL, text=text, meta=button_href, socketio=socketio)
            broken_button = True
    # href-less anchors nested inside buttons or divs.
    for container in ('button', 'div'):
        for tag in soup.find_all(container):
            for broken_tag in tag.findAll('a', {'href': False}):
                text = "You have a button without an href at " + str(URL)
                create_error_json(TYPE, SEVERITY, URL, text=text, meta=broken_tag, socketio=socketio)
                broken_button = True
    if not broken_button:
        create_success_json(TYPE, URL, socketio)
def rgb2hex(r, g, b):
    """Format three 0-255 channel values as a lowercase '#rrggbb' string."""
    return '#{:02x}{:02x}{:02x}'.format(r, g, b)
def hex2rgb(hex_str):
    """Parse '#rrggbb' (leading '#' optional) into an (r, g, b) tuple of ints."""
    m = re.match(
        r'^\#?([0-9a-fA-F]{2})([0-9a-fA-F]{2})([0-9a-fA-F]{2})$', hex_str)
    return tuple(int(m.group(idx), 16) for idx in (1, 2, 3))
def distinguish_hex(hex1, hex2, mindiff=50):
    """
    mindiff is the minimal
    difference in lightness.
    RETURNS [] IF NO ISSUE
    RETURN [CHANGED COLORS] IF RECOMMENDATION TO CHANGE
    """
    rgb1 = hex2rgb(hex1)
    rgb2 = hex2rgb(hex2)
    # NOTE(review): colorsys expects channel values in 0..1, but these tuples
    # are 0..255, so the lightness values below are on a non-standard scale --
    # confirm mindiff=50 was tuned against that scale.
    hls1 = colorsys.rgb_to_hls(*rgb1)
    hls2 = colorsys.rgb_to_hls(*rgb2)
    l1 = hls1[1]  # lightness channel of each color
    l2 = hls2[1]
    if abs(l1 - l2) >= mindiff: # ok already
        return []
    # Push the two lightness values apart until they differ by mindiff,
    # clamping to the 0..255 range.
    restdiff = abs(l1 - l2) - mindiff
    if l1 >= l2:
        l1 = min(255, l1 + restdiff / 2)
        l2 = max(0, l1 - mindiff)
        l1 = min(255, l2 + mindiff)
    else:
        l2 = min(255, l2 + restdiff / 2)
        l1 = max(0, l2 - mindiff)
        l2 = min(255, l1 + mindiff)
    # Rebuild each color with its adjusted lightness, keeping hue/saturation.
    hsl1 = (hls1[0], l1, hls1[2])
    hsl2 = (hls2[0], l2, hls2[2])
    rgb1 = colorsys.hls_to_rgb(*hsl1)
    rgb2 = colorsys.hls_to_rgb(*hsl2)
    # Suggested replacement colors as "rgb(r, g, b)" strings.
    f1 = "rgb" + str((int(rgb1[0]), int(rgb1[1]), int(rgb1[2])))
    f2 = "rgb" + str((int(rgb2[0]), int(rgb2[1]), int(rgb2[2])))
    return [f1, f2]
######## DRIVER ############
if __name__ == "__main__":
    # Manual test entry point: crawl a known site. "DEBUG_SOCKET" is a
    # placeholder -- with PRODUCTION=True any emit attempt will fail, since
    # a string has no .emit(); run with a real socketio instance in prod.
    FRONT_END_URL = "https://www.alexanderdanilowicz.com/second"
    #FRONT_END_URL = "https://avalonbagelstoburgers.com/"
    recieve_front_end_link(FRONT_END_URL, "DEBUG_SOCKET")