def get_teilar(self):
    '''
    Grab announcements from the following websites and put them in
    separate custom RSS files:
    - http://teilar.gr/news.php?cid=1
    - http://teilar.gr/news.php?cid=2
    - http://teilar.gr/news.php?cid=5
    - http://teilar.gr/news.php?cid=6
    - http://teilar.gr/tmimatanews.php

    For every listing page, at most the first ten announcement links are
    followed. Each announcement page is parsed for its date, title,
    description, enclosure and (departments feed only) its creator.

    Failures are logged through logger_syslog / logger_mail and the
    affected feed or announcement is skipped, so a broken page can never
    inject undefined or stale data into a feed. A feed whose listing page
    cannot be parsed is not written at all.
    '''
    rss_filenames = {
        1: 'general.rss',
        2: 'teilar_ann.rss',
        5: 'council.rss',
        6: 'committee.rss',
        'tmimatanews.php': 'departments.rss',
    }
    for cid, rss_name in rss_filenames.iteritems():
        custom_rss = self.initialize_rss_file()
        # Integer keys are news.php categories; the only non-integer key
        # is the departments page, whose details live under tmimata/
        is_department_feed = not isinstance(cid, int)
        if is_department_feed:
            output = teilar_anon_login('http://www.teilar.gr/%s' % cid)
        else:
            output = teilar_anon_login('http://www.teilar.gr/news.php?cid=%s' % cid)
        soup = BeautifulSoup(output)
        try:
            announcements_all = soup.find_all('table')[17].find_all('a', 'BlackText11')[:10]
        except Exception as error:
            logger_syslog.error(error, extra=log_extra_data())
            logger_mail.exception(error)
            # Without the listing there is nothing to publish for this feed
            continue
        for item in announcements_all:
            try:
                ann_url = 'news_detail.php?nid=' + item['href'].split('nid=')[1]
            except (KeyError, IndexError) as error:
                # Anchor without a usable "nid=" link
                logger_syslog.error(error, extra=log_extra_data())
                logger_mail.exception(error)
                continue
            if is_department_feed:
                ann_url = 'tmimata/' + ann_url
            ann_link = 'http://teilar.gr/' + ann_url
            # Get inside the announcement to get the rest of the info
            ann_output = teilar_anon_login('http://www.teilar.gr/%s' % ann_url)
            ann_soup = BeautifulSoup(ann_output)
            try:
                if is_department_feed:
                    # The replace keeps the department name in the long
                    # form used for departments elsewhere
                    creator = ann_soup.find('span', 'OraTextBold').contents[0].split(' >')[0].replace(u'Τεχν.', u'Τεχνολογίας')
                else:
                    creator = None
                temp_td_oratext = ann_soup.find_all('td', 'OraText')
                # The date is published as day/month/year
                day, month, year = temp_td_oratext[0].contents[0].split('/')[:3]
                pubdate = date(int(year), int(month), int(day))
                title = temp_td_oratext[1].contents[0]
                description = unicode(ann_soup.find('td', 'BlackText11'))
                enclosure = self.get_enclosure(ann_soup)
            except Exception as error:
                logger_syslog.error(error, extra=log_extra_data(ann_link))
                logger_mail.exception(error)
                # Skip the item instead of publishing undefined/stale fields
                continue
            self.add_rss_item(custom_rss, title, ann_link, pubdate, description, creator, enclosure)
        self.write_rss_file(custom_rss, rss_name)
    return
def get_teachers(self):
    '''
    Collect the teachers published on teilar.gr

    The profile pages person.php?pid=N are requested for every N in
    range(400) and scraped for name, email and department. Pids whose
    page holds no teacher name are left out.

    The output is a dictionary with the following structure:
    teachers_from_teilar = {'url': ['name', 'email', 'department']}
    where email and department are None when they cannot be located.
    '''
    # Dirty workaround: this pid is a teacher with no matching department,
    # probably a test entry
    skipped_pid = 386
    teachers_from_teilar = {}
    for pid in range(400):
        if pid == skipped_pid:
            continue
        url = 'http://www.teilar.gr/person.php?pid=%s' % pid
        soup = BeautifulSoup(teilar_anon_login(url))

        try:
            name = soup.findAll('td', 'BlackText11Bold')[1].contents[0].strip()
        except IndexError:
            # No teacher found, continue with the next pid
            continue

        info_cells = soup.findAll('td', 'BlackText11')

        email = None
        try:
            # Preferred form: the address is wrapped in a link
            email = info_cells[5].a.contents[0].split(' ')[0].strip()
        except AttributeError:
            # No link in the cell, fall back to its plain text
            try:
                email = info_cells[5].contents[0].strip()
            except IndexError:
                pass
        except IndexError:
            pass

        department = None
        try:
            # The string replace in the end is to keep it in track with
            # dionysos.teilar.gr
            department = info_cells[2].contents[0].strip().replace(u'Τεχν.', u'Τεχνολογίας')
        except IndexError:
            pass

        teachers_from_teilar[url] = [name, email, department]
    return teachers_from_teilar
def get_faculties(self):
    '''
    Collect the faculties listed on openclass.teilar.gr

    Every <td> of the first table of the faculty listing page is expected
    to carry a link (faculty name and relative URL) and a <small> element
    holding the faculty code in parentheses.

    The output is a dictionary with the following structure:
    faculties_from_eclass = {'url': ['name', 'code']}
    '''
    page = teilar_anon_login('http://openclass.teilar.gr/modules/auth/listfaculte.php')
    listing = BeautifulSoup(page)
    faculties_from_eclass = {}
    # Calling a tag is shorthand for searching its descendants
    for cell in listing.table('td'):
        link = 'http://openclass.teilar.gr/modules/auth/' + cell.a.get('href')
        title = cell.a.contents[0].strip()
        # "(CODE) ..." -> "CODE"
        raw_code = cell.small.contents[0]
        code = raw_code.split(')')[0].replace('(', '').strip()
        faculties_from_eclass[link] = [title, code]
    return faculties_from_eclass
def get_departments(self):
    """
    Collect the departments listed on teilar.gr/schools.php

    Every link of class "BlueText" on that page is taken as a department.

    The output is a dictionary with the following structure:
    departments_from_teilar = {'url': 'name'}
    """
    page = teilar_anon_login("http://www.teilar.gr/schools.php")
    anchors = BeautifulSoup(page).find_all("a", "BlueText")
    # The string replace on the name is to keep it in track with
    # dionysos.teilar.gr
    departments_from_teilar = dict(
        (
            "http://www.teilar.gr/" + anchor.get("href"),
            anchor.contents[0].replace(u"Τεχν.", u"Τεχνολογίας"),
        )
        for anchor in anchors
    )
    return departments_from_teilar
def get_lessons(self, faculties_from_db_q):
    '''
    Retrieves the lessons from openclass.teilar.gr

    faculties_from_db_q is an iterable of faculty objects; only their
    `url` and `name` attributes are read.

    The output is a dictionary with the following structure:
    lessons_from_eclass = {'url': ['name', 'teacher', 'faculty', 'ltype']}
    `teacher` is None when the lesson row has no teacher column (it is no
    longer turned into the text u'None').
    '''
    # EclassLessons are grouped in three types, found in the tables with
    # ids t0, t1 and t2: Undergraduate, Graduate, Other
    lesson_types = (u'Προπτυχιακό', u'Μεταπτυχιακό', u'Άλλο')
    lessons_from_eclass = {}
    for faculty in faculties_from_db_q:
        output = teilar_anon_login(faculty.url)
        soup = BeautifulSoup(output)
        for i, ltype in enumerate(lesson_types):
            tables = soup.find_all('table', id='t%s' % i)
            if not tables:
                # If the lesson type does not exist, then move on
                continue
            # Lesson rows alternate between the "even" and "odd" classes
            all_lessons = []
            for row_class in ('even', 'odd'):
                for table in tables:
                    all_lessons.extend(table.find_all('tr', row_class))
            for lesson in all_lessons:
                # The <small> text is the course code wrapped in one
                # character on each side, which is stripped off
                code = lesson.small.contents[0][1:-1]
                url = u'http://openclass.teilar.gr/courses/%s/' % code
                try:
                    name = lesson.a.contents[0].strip()
                except AttributeError:
                    # Row without a link: use the plain text of the
                    # second column instead
                    name = lesson.find_all('td')[1].contents[0].strip()
                try:
                    teacher = unicode(lesson.find_all('td')[2].contents[0].strip())
                except IndexError:
                    teacher = None
                lessons_from_eclass[url] = [unicode(name), teacher, faculty.name, ltype]
    return lessons_from_eclass
def library(request):
    '''
    Perform search in library.teilar.gr and print the results

    On a GET request with a valid LibraryForm, the `search` parameter is
    sent to the library catalogue and the result page is scraped into
    rows of [title, authors, editor, city], rendered by library.html.
    When the search yields no rows, notification['info'] carries a
    "no results" message.
    '''
    try:
        from urllib import quote_plus  # Python 2
    except ImportError:
        from urllib.parse import quote_plus  # Python 3

    notification = {}
    results = []
    if request.method == 'GET':
        form = LibraryForm(request.GET)
        if form.is_valid():
            # The term is user input: encode it and escape it, so that
            # non-ASCII (e.g. Greek) terms do not raise and characters
            # such as '&' or spaces cannot alter the query string.
            # NOTE(review): assumes the catalogue expects UTF-8 - confirm
            term = request.GET.get('search') or u''
            if not isinstance(term, bytes):
                term = term.encode('utf-8')
            # NOTE(review): the session id in this URL is hard-coded
            url = 'http://hermes.lib.teilar.gr/ipac20/ipac.jsp?session=A26772NR74250.24315&menu=search&aspect=subtab22&npp=40&ipp=20&spp=20&profile=multbl--1&ri=&term=%s&index=.GEN&x=0&y=0&aspect=subtab22' % quote_plus(term)
            output = teilar_anon_login(url, request)
            tables = BeautifulSoup(output).find_all('table')
            # The results live in the 25th table of the page; a page
            # without it is treated as an empty result set
            if len(tables) > 24:
                soup = tables[24]
                temp_a_mediumboldanchor = soup.find_all('a', 'mediumBoldAnchor')
                temp_td = soup.find_all('td')
            else:
                temp_a_mediumboldanchor = []
                temp_td = []
            for position, item in enumerate(temp_a_mediumboldanchor):
                # The cells of the first result start at index 5 and
                # every further result is ten cells later
                i = 5 + 10 * position
                title = item.contents[0]
                # The authors are in <i> tags. Take the list of them by
                # taking a list of the contents of <i> tags, and then
                # join the list with commas for prettier output
                authors = ', '.join(
                    author.contents[0].replace(',', '').strip()
                    for author in temp_td[i].find_all('i')
                )
                # The publication text has the form "city : editor"
                publication = temp_td[i + 1].contents[0].contents[0].split(' : ')
                city = publication[0]
                editor = publication[1] if len(publication) > 1 else u''
                results.append([title, authors, editor, city])
            if not results:
                notification['info'] = 'Δεν υπάρχουν αποτελέσματα'
    else:
        # Unbound form of the same class that the GET branch validates
        form = LibraryForm()
    return render_to_response('library.html', {
            'form': form,
            'notification': notification,
            'results': results,
        }, context_instance=RequestContext(request))
def get_teachers(self):
    '''
    Grab announcements from all the teachers, and put them in a custom
    RSS file (teachers.rss).

    The combined page profannnews.php links to a teacher's page once per
    announcement. For each teacher, the same number of announcements is
    taken from the top of that teacher's own page.

    Failures are logged through logger_syslog / logger_mail and the
    affected teacher or announcement is skipped, so a broken page can
    never inject undefined or stale data into the feed.
    '''
    custom_rss = self.initialize_rss_file()
    output = teilar_anon_login('http://www.teilar.gr/profannnews.php')
    soup = BeautifulSoup(output)
    try:
        announcements_all = soup.find_all('a', 'BlackText11')
    except Exception as error:
        logger_syslog.error(error, extra=log_extra_data())
        logger_mail.exception(error)
        # Nothing to publish; leave any existing teachers.rss untouched
        return

    # The teacher's announcements are all under one page instead of being
    # each one in separate page. We count in the combined page how many
    # times a teacher's page is linked, and we parse the same number of
    # the teacher's top announcements. The results are kept in a
    # dictionary with the following structure:
    # authors = {'url': number_of_announcements}
    authors = {}
    for item in announcements_all:
        url = item['href']
        authors[url] = authors.get(url, 0) + 1

    for url, number in authors.iteritems():
        # Get inside the teacher's page which contains all the announcements
        output = teilar_anon_login('http://www.teilar.gr/%s' % url)
        soup = BeautifulSoup(output)
        try:
            author_name = soup.find('td', 'BlueTextBold').i.contents[0]
        except Exception as error:
            logger_syslog.error(error, extra=log_extra_data(url))
            logger_mail.exception(error)
            # Without the author's name the page is not usable
            continue
        # Select only the number of announcements we want
        try:
            announcements_all = soup.find_all('td', 'LineDownDots')[0:number]
        except Exception as error:
            logger_syslog.error(error, extra=log_extra_data(url))
            logger_mail.exception(error)
            continue
        for announcement in announcements_all:
            # Parse data from each announcement
            try:
                temp_td_blacktext11 = announcement.find_all('td', 'BlackText11')
                title = temp_td_blacktext11[0].b.contents[0]
                # The date is published as day/month/year
                day, month, year = announcement.find('td', 'OraText').contents[0].split('/')[:3]
                pubdate = date(int(year), int(month), int(day))
                description = temp_td_blacktext11[1]
                # NOTE(review): the enclosure is looked up in the whole
                # page rather than in this announcement - confirm that
                # this is what get_enclosure expects
                enclosure = self.get_enclosure(soup)
            except Exception as error:
                logger_syslog.error(error, extra=log_extra_data(author_name))
                logger_mail.exception(error)
                # Skip the item instead of publishing undefined/stale fields
                continue
            self.add_rss_item(custom_rss, title, 'http://teilar.gr/' + url, pubdate, description, author_name, enclosure)
    self.write_rss_file(custom_rss, 'teachers.rss')
    return