forked from OpenCourseAPI/OwlAPI
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_scraper.py
126 lines (98 loc) · 3.89 KB
/
data_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# standard library
from collections import defaultdict
from os import makedirs, remove, rename, replace
from os.path import exists, join
from re import match

# 3rd party
import requests
from bs4 import BeautifulSoup
from tinydb import TinyDB
# Filename used to cache the raw schedule HTML when mine(..., write=True).
SCHEDULE = 'schedule.html'
# Campus abbreviation -> Banner term code POSTed to the schedule endpoint
# (presumably 'fh' = Foothill, 'da' = De Anza -- confirm against banssb).
TERM_CODES = {'fh': '201911', 'da': '201912'}
# Column order of the <td> cells in each CourseRow; zipped against the cell
# text in parse() to build each row dict.
HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end',
           'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap')
# Directory that holds one TinyDB JSON file per term.
DB_ROOT = 'db/'
# Raw string: the original non-raw literal relied on '\d'/'\w' passing through
# unescaped, which raises DeprecationWarning (SyntaxWarning in newer Pythons).
# Groups: (course number with optional letter suffix, optional section letter).
COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?'
def main():
    '''
    Scrape each term's schedule into its own TinyDB JSON database.

    For every term in TERM_CODES: download the schedule HTML, parse it into a
    temporary TinyDB file, then move the temp file into place as
    '<term>_database.json', overwriting any previous run's output.
    '''
    # exist_ok=True already tolerates a pre-existing directory, so no
    # exists() pre-check is needed (and the check/create pair was racy).
    makedirs(DB_ROOT, exist_ok=True)
    for term in TERM_CODES.values():
        temp_path = join(DB_ROOT, 'temp.json')
        temp = TinyDB(temp_path)
        content = mine(term)
        parse(content, db=temp)
        # replace() atomically overwrites an existing destination on every
        # platform.  The original `rename(...) and remove(temp_path)` was
        # dead code: rename() returns None (falsy), so remove() never ran --
        # and had it run, it would have raised FileNotFoundError because the
        # temp file had just been moved away.
        db_path = join(DB_ROOT, f'{term}_database.json')
        replace(temp_path, db_path)
        db = TinyDB(db_path)
        print(term, db.tables())
def mine(term, write=False, timeout=30):
    '''
    Fetch the raw schedule HTML for a term from the FHDA Banner endpoint.

    :param term: (str) Banner term code, e.g. '201911'
    :param write: (bool) if True, also stream the response body to SCHEDULE
    :param timeout: (int|float) seconds before the HTTP request is aborted;
        the original call had no timeout and could hang indefinitely
    :return: (bytes) the response body (HTML)
    :raises requests.HTTPError: on a non-2xx response
    :raises requests.Timeout: if the server does not respond within `timeout`
    '''
    headers = {
        'Origin': 'https://banssb.fhda.edu',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'FoothillAPI',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html, */*; q=0.01',
        'Referer': 'https://banssb.fhda.edu/PROD/fhda_opencourses.P_Application',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
    }
    data = [('termcode', str(term))]
    res = requests.post(
        'https://banssb.fhda.edu/PROD/fhda_opencourses.P_GetCourseList',
        headers=headers, data=data, timeout=timeout)
    res.raise_for_status()
    if write:
        with open(SCHEDULE, 'wb') as file:
            for chunk in res.iter_content(chunk_size=512):
                if chunk:  # skip keep-alive chunks
                    file.write(chunk)
    return res.content
def parse(content, db):
    '''
    Parse the schedule HTML and populate the database with one table per dept.

    Each department table receives a single document shaped as
    {course_key: {CRN: [row_dict, ...]}}, where row_dict maps HEADERS to the
    corresponding <td> cell text.

    :param content: (bytes|str) HTML containing the course tables
    :param db: (TinyDB) the database to populate
    '''
    soup = BeautifulSoup(content, 'html5lib')
    tables = soup.find_all('table', {'class': 'TblCourses'})
    for t in tables:
        dept = t['dept'].replace(' ', '')
        rows = t.find_all('tr', {'class': 'CourseRow'})
        s = defaultdict(lambda: defaultdict(list))  # course key -> CRN -> rows
        for r in rows:
            cols = r.find_all(
                lambda tag: tag.name == 'td' and not tag.get_text().isspace())
            if not cols:
                continue
            # Prefer the <a> text inside a cell when present (linked cells
            # wrap their value in an anchor).
            for i, c in enumerate(cols):
                a = c.find('a')
                cols[i] = a.get_text() if a else cols[i].get_text()
            try:
                # First non-empty of the two leading cells holds the course name.
                key = get_key(cols[0] or cols[1])[0]
                data = dict(zip(HEADERS, cols))
                # Normalize 'units' BEFORE the duplicate check.  The original
                # lstripped only after comparing, so a stored (normalized) row
                # never set-matched an identical incoming raw row whose units
                # cell had leading whitespace, creating spurious duplicates.
                data['units'] = data['units'].lstrip()
                crn = data['CRN']
                if s[key][crn]:
                    # Symmetric difference is empty iff the rows are identical.
                    if not (set(s[key][crn][0].items()) ^ set(data.items())):
                        continue
                s[key][crn].append(data)
            except KeyError:
                # Row had fewer cells than HEADERS; skip it.
                continue
        db.table(dept).insert(dict(s))
def get_key(course):
    '''
    Extract the course-number key parts from a raw course-name string.

    :param course: (str) unparsed course name, e.g. 'ACCT F001A01Z'
    :return: (tuple) COURSE_PATTERN regex groups for the section token
    '''
    words = course.split(' ')
    # Short names keep the section token in the 2nd word; longer names
    # (3+ words) keep it in the 3rd.
    token = words[2] if len(words) >= 3 else words[1]
    return match(COURSE_PATTERN, token).groups()
# Script entry point: scrape and persist all configured terms.
if __name__ == "__main__":
    main()