forked from mpescimoro/stripp3r
/
Stripp3r.py
155 lines (139 loc) · 6.31 KB
/
Stripp3r.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python
"""
Author : pescimoro.mattia@gmail.com
Licence : GPL v3 or any later version
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
from html.parser import HTMLParser
from urllib.request import urlopen
import datetime
import time
import locale
# external libraries
import requests
from lessonEntity import Lesson
from bs4 import BeautifulSoup # html5 smart parser
# Switch the process locale to Italian. The timetable pages of the University of Brescia write their
# dates with Italian weekday and month names, and the strptime() calls below (directives %A and %B)
# only match those names while an Italian locale is active. Setting it explicitly makes date parsing
# independent of the machine's default locale.
# NOTE(review): the "it_IT" locale must be installed on the host, otherwise setlocale() raises
# locale.Error at import time.
locale.setlocale(locale.LC_ALL, "it_IT")
class MLStripper(HTMLParser):
    """Streaming HTML parser that turns a timetable page into Lesson records.

    The parser reacts to two kinds of markup:

    * ``<table id="...">`` elements: the id is split on ``_`` and fields 1 and 2
      are read as the day and the hour offset of a lesson. While such a table is
      open, every text node is consumed as one of the four lesson fields
      (subject, teacher, rooms, address); after the fourth field the lesson is
      persisted and a fresh ``Lesson`` is started.
    * ``<td>`` elements having an attribute whose value is ``ttTitleTD``: they
      start a counter that is then advanced by every text node; the text node on
      which the counter reaches 9 is parsed as the semester start/end dates.
    """

    def __init__(self):
        """Initialise the parser state; no input is consumed here."""
        super().__init__()
        # True while the parser is inside a lesson table (see handle_starttag)
        self.immolation = False
        # line counter used to locate the semester dates inside the page header;
        # 0 means "header not seen yet / already consumed"
        self.is_header = 0
        # time of the first lesson slot of the day; hour offsets are added to it
        self.firstHour = datetime.datetime.strptime("08:30", "%H:%M")
        # the lesson database entity currently being filled
        self.lesson = Lesson()
        # index (0..3) of the next lesson field expected by handle_data
        self.dataLine = 0
        # NOTE(review): until the header has been parsed these hold the
        # datetime.datetime class itself, not a date; a lesson persisted before
        # then receives that placeholder.
        self.semesterStartDate = datetime.datetime
        self.semesterEndDate = datetime.datetime

    def handle_starttag(self, tag, attrs):
        """Detect the start of a lesson table or of a header cell.

        :param tag: lower-cased tag name supplied by HTMLParser
        :param attrs: list of ``(name, value)`` pairs supplied by HTMLParser
        """
        if tag == 'table':
            for name, value in attrs:
                # stop scanning as soon as a class="cellTabs" attribute is met
                if name == 'class' and value == 'cellTabs':
                    break
                if name == 'id':
                    # The id is expected to look like <prefix>_<day>_<hour offset>.
                    # A table whose id does not have that shape carries no usable
                    # coordinates, so it is ignored instead of aborting the parse.
                    coords = (value or '').split('_')
                    try:
                        day = coords[1]
                        hour_offset = int(coords[2])
                    except (IndexError, ValueError):
                        continue
                    self.immolation = True
                    self.lesson.day = day
                    # the start time is the first slot plus the offset in hours
                    start = self.firstHour + datetime.timedelta(hours=hour_offset)
                    self.lesson.hour = start.time()
        elif tag == 'td':
            for _name, value in attrs:
                # header cell carrying the information about the semester
                if value == 'ttTitleTD':
                    # count the lines of the header
                    self.is_header += 1

    def handle_endtag(self, tag):
        """Leave lesson-capture mode when a table is closed."""
        if tag == 'table':
            self.immolation = False

    def handle_data(self, data):
        """Consume one text node as lesson data and/or header data.

        :param data: raw text supplied by HTMLParser; it is stripped before use
        """
        if self.is_header > 0:
            self.is_header += 1
        data = data.strip()
        # A lone '/' separates several teachers of the same lesson: it is glued
        # to the teacher field and must not advance the field counter.
        is_separator = data == '/'
        if self.immolation:
            # the 4 fields of a lesson: subject, teacher, rooms, address
            if self.dataLine <= 3:
                if is_separator:
                    # step back so that the separator lands on the teacher field
                    self.dataLine -= 1
                if self.dataLine == 0:
                    self.lesson.subject = data.upper()
                    print(self.lesson.subject)  # DEBUG
                elif self.dataLine == 1:
                    self.lesson.teacher = self.lesson.teacher + data.upper()
                    if is_separator:
                        # step back once more: together with the increment below
                        # this leaves the counter on the teacher field, so the
                        # next text node is appended as a further teacher
                        self.dataLine -= 1
                    print(self.lesson.teacher)  # DEBUG
                elif self.dataLine == 2:
                    self.lesson.rooms = data.upper()
                    print(self.lesson.rooms)  # DEBUG
                elif self.dataLine == 3:
                    self.lesson.address = data.upper()
                    print(self.lesson.address)  # DEBUG
                self.dataLine += 1
            # all four fields collected: store the lesson and start a new one
            if self.dataLine == 4:
                print("--")  # DEBUG
                self.lesson.semesterStartDate = self.semesterStartDate
                self.lesson.semesterEndDate = self.semesterEndDate
                self.dataLine = 0
                self.lesson.persist()
                # reset the entity
                self.lesson = Lesson()
        if self.is_header == 9:  # the header line holding the semester dates
            # The line has the form ": <weekday> <day> <month> <year> - <weekday>
            # <day> <month> <year>"; each half becomes a datetime object. Weekday
            # and month names are matched according to the active locale.
            line = data.split(sep="-")
            self.semesterStartDate = datetime.datetime.strptime(line[0], ": %A %d %B %Y ")
            self.semesterEndDate = datetime.datetime.strptime(line[1], " %A %d %B %Y")
            self.is_header = 0
# Components of the timetable URL: the academic year and a further path segment.
# NOTE(review): the meaning of the '160' segment is not documented in this file.
AA = '2015-2016'
unknownCode = '160'
# home page of the calendars
url = 'https://calendari.unibs.it/EasyCourse/Orario/Area_di_Scienze_Ingegneristiche/'+AA+'/'+unknownCode+'/'
html_doc = requests.get(url+'index.html')
htmlSoup = BeautifulSoup(html_doc.text, 'html.parser')
courses = []
parser = MLStripper()
# Collect the links to the course calendars from the menu of the index page:
# they are the relative links whose target starts with 'Curricula'.
for link in htmlSoup.find_all('a'):
    href = link.get('href')
    # anchors without an href attribute yield None and are skipped
    if href and href.startswith('Curricula'):
        courses.append(url + href)
for calendar in courses:
    # close each HTTP response as soon as its body has been read
    with urlopen(calendar) as fetch:
        data = fetch.read().decode('iso-8859-1')  # Welcome back to 1987!
    # data = data.replace(' / ', ' ')
    parser.feed(data)  # OM NOM NOM
parser.close()
# DISCLAIMER: this is random coding that relies on HTML formatting of a specific webpage.
# Thus the only thing that keeps it working is the belief in lazy people.