forked from gumho/antplanner
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
88 lines (68 loc) · 2.42 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from lib.BeautifulSoup import BeautifulSoup
import re
import logging
def strip_search(html):
    """Extract the WebSoc search form and adapt it for local use.

    Finds the form whose action is http://websoc.reg.uci.edu/, points its
    action at '/schedules', replaces every element carrying the
    'banner-width' class with a submit-button snippet, and returns the
    resulting form markup as a string.
    """
    soup = BeautifulSoup(html)
    search_form = soup.find('form', action='http://websoc.reg.uci.edu/')

    # Submit to our own endpoint instead of the upstream site.
    search_form['action'] = '/schedules'

    # Swap the 'Display Text Results' button for our own submit control.
    submit_markup = '<p id=\"submit-container\"><input type="submit" value="Display Results" name="Submit"></p>'
    for banner in search_form.findAll(attrs={"class": "banner-width"}):
        banner.replaceWith(submit_markup)

    return str(search_form)
def strip_schedule(html):
    """Return the markup of the 'course-list' div found in *html*.

    When the page contains no such div, an error paragraph is returned
    instead.
    """
    course_list = BeautifulSoup(html).find('div', 'course-list')
    if course_list is not None:
        return str(course_list)
    return "<p id=\"error\">No results were found.</p>"
def strip_websoc_version(html):
version_matches = re.findall('version.{,8}', html)
if not version_matches:
return 'Couldn\'t find a match'
else:
return version_matches[0]
# Matches the class attribute of a result row ("entry odd" / "entry even").
_ENTRY_ROW_CLASS = re.compile(r"entry (odd|even)")
# Matches any class attribute that contains the word "profHot".
_PROF_HOT_CLASS = re.compile(r".*\bprofHot\b.*")


def _div_text(row, css_class):
    """Return the stripped inner markup of the first matching div, or ''."""
    div = row.find('div', {'class': css_class})
    if div is None:
        return ''
    return div.renderContents().strip()


def strip_professors(html, name):
    """Returns list of professor matches.

    Parses the 'ratingTable' div out of *html* and returns one dict per
    row whose professor name matches *name*.

    *name* is given as 'LAST,FIRST'; its final character is discarded
    before matching (presumably a trailing '.' after an initial -- confirm
    against the caller). A row matches when its upper-cased name starts
    with the queried last name and the queried first-name part occurs in
    the text after the row's first comma.

    Each result dict has the keys 'name', 'href', 'dept', 'ratings',
    'quality', 'easiness' and 'hot'. An empty list is returned when the
    rating table is missing. Rows without a name link are skipped, and
    detail cells that are absent yield empty strings.
    """
    profs = []
    table = BeautifulSoup(html).find('div', {'id': 'ratingTable'})
    if table is None:
        # NOTE(review): this logs everything *after* the first 500
        # characters; if a short preview was intended it should be
        # html[:500] -- confirm before changing.
        logging.debug(html[500:])
        return profs

    query_parts = name[:-1].upper().split(',')
    q_last = query_parts[0]
    q_first = query_parts[1] if len(query_parts) > 1 else ''

    for row in table.findAll('div', {'class': _ENTRY_ROW_CLASS}):
        name_div = row.find('div', {'class': 'profName'})
        anchor = name_div.find('a') if name_div is not None else None
        if anchor is None:
            # Malformed row: nothing to match against or link to.
            continue

        prof_name = unicode(anchor.renderContents().strip(), 'utf-8', 'ignore').upper()
        name_parts = prof_name.split(',')
        first_name = name_parts[1] if len(name_parts) > 1 else ''

        if not (prof_name.startswith(q_last) and q_first in first_name):
            continue

        if _div_text(row, _PROF_HOT_CLASS) == 'Hot':
            hot = '✓'
        else:
            hot = ' '

        profs.append({
            'name': prof_name,
            'href': 'http://www.ratemyprofessors.com/' + anchor['href'].strip(),
            'dept': _div_text(row, 'profDept'),
            'ratings': _div_text(row, 'profRatings'),
            'quality': _div_text(row, 'profAvg'),
            'easiness': _div_text(row, 'profEasy'),
            'hot': hot
        })
    return profs