def scrape_adsu():
    """Scrape general information from the ADSU homepage.

    Fetches the ADSU site, extracts the text of the element with
    id "AutoNumber5", strips layout whitespace, and writes the result
    to ../json/adsu.json as {"info": <text>}.
    """
    scraped_info = {}
    adsu_url = "http://www.adsuaq.org/"
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "accept-encoding": "gzip,deflate,sdch",
        "accept-language": "en-US,en;q=0.8",
    }
    request = requests.get(adsu_url, headers=headers)
    if request.status_code != 200:
        # BUG FIX: status_code is an int; "str" + int raised TypeError here.
        print("Error! Status {}".format(request.status_code))
        return
    # The page markup is a mess of layout tables; grab one cell's text and
    # strip the formatting noise.
    info = BeautifulSoup(request.text, "html.parser").find(id="AutoNumber5").text.replace(" ", "")\
        .replace("\t", "").replace("\r", "").replace("\n\n", "")
    scraped_info.update({
        "info": info
    })
    utils.write_json(scraped_info, "../json/adsu.json")
def newson_command(bot, update):
    """Bot command: enable news notifications for the requesting chat."""
    chat_id = update.message.chat_id
    if chat_id in utils.SUBSCRIBERS:
        # already subscribed: just tell the user
        bot.sendMessage(chat_id, text='Le notifiche sono già abilitate!')
    else:
        utils.SUBSCRIBERS.append(chat_id)
        bot.sendMessage(chat_id, text='Notifiche Abilitate!')
        # persist the subscriber list so it survives restarts
        utils.write_json(utils.SUBSCRIBERS, "json/subscribers.json")
def newsoff_command(bot, update):
    """Bot command: disable news notifications for the requesting chat."""
    chat_id = update.message.chat_id
    if chat_id not in utils.SUBSCRIBERS:
        # nothing to disable for this chat
        bot.sendMessage(chat_id,
                        text='Per disattivare le notifiche dovresti prima attivarle.')
        return
    utils.SUBSCRIBERS.remove(chat_id)
    bot.sendMessage(chat_id, text='Notifiche Disattivate!')
    # persist the subscriber list so it survives restarts
    utils.write_json(utils.SUBSCRIBERS, "json/subscribers.json")
def newsoff_command(bot, update):
    """Bot command: disable news notifications for the requesting chat."""
    if update.message.chat_id in utils.SUBSCRIBERS:
        utils.SUBSCRIBERS.remove(update.message.chat_id)
        bot.sendMessage(update.message.chat_id, text='Notifiche Disattivate!')
        # persist the subscriber list so it survives restarts
        utils.write_json(utils.SUBSCRIBERS, "json/subscribers.json")
    else:
        # BUG FIX: the message was split with a backslash continuation inside
        # the string literal, which embedded the next line's leading
        # whitespace in the user-facing text. Use one clean literal.
        bot.sendMessage(update.message.chat_id,
                        text='Per disattivare le notifiche dovresti prima attivarle.')
def scrape_professors():
    """Scrape name, phone, e-mail and courses of every DISIM professor.

    Parses the teaching-staff table and writes a list of dicts to
    ../json/professors.json; empty fields fall back to "non disponibile".
    """
    scraped_professors = []
    professors_url = "http://www.disim.univaq.it/didattica/" \
        "content.php?tipo=3&ordine=1&chiave=0&pid=25&did=8&lid=it&" \
        "frmRicercaNome=&frmRicercaCognome=&frmRicercaLaurea=1&action_search=Filtra"
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "accept-encoding": "gzip,deflate,sdch",
        "accept-language": "en-US,en;q=0.8",
    }
    request = requests.get(professors_url, headers=headers)
    if request.status_code != 200:
        # BUG FIX: status_code is an int; "str" + int raised TypeError here.
        print("Error! Status {}".format(request.status_code))
        return
    professors_table = BeautifulSoup(request.text, "html.parser").find("table")
    # each professor row starts with a name cell spanning two columns
    firsts_td = professors_table.find_all(colspan='2')
    for name_cell in firsts_td:
        name = name_cell.find("a").text
        phone = name_cell.find_next_sibling().text
        # The e-mail is obfuscated with <img> tags standing in for '.' and
        # '@'; undo that by string-replacing the rendered tags.
        email = str(name_cell.find_next_sibling().find_next_sibling().a) \
            .replace('<a href="#">', '').replace('</a>', '') \
            .replace('<img alt="dot" height="2" src="img/dot.gif" width="3"/>', '.') \
            .replace('<img alt="at" height="10" src="img/at.gif" width="12"/>', '@')
        # Strip layout junk and degree-code tags from the courses cell.
        courses = name_cell.find_next_sibling().find_next_sibling().find_next_sibling() \
            .text.replace('\n', '').replace('\u00a0', '').replace('[F3I]', '') \
            .replace('[F4I]', '').replace('[F3M]', '').replace('[I3N]', '') \
            .replace('[I4T]', '')
        scraped_professors.append({
            "nome": name if name != "" else "non disponibile",
            "telefono": phone if phone != "" else "non disponibile",
            "e-mail": email if email != "" else "non disponibile",
            "corsi": courses if courses != "" else "non disponibile",
            "ufficio": "0"
        })
    utils.write_json(scraped_professors, "../json/professors.json")
def notify_news(bot):
    """Periodic job: push any unread news items to every subscribed chat."""
    unread_news = news.check_news()
    if len(unread_news) > 0:
        # refresh the local news cache with the latest ten items
        utils.write_json(news.pull_news(10), "json/news.json")
        lines = []
        for item in unread_news:
            description = item['description']
            if len(description) > 75:
                # keep the notification short: truncate long descriptions
                description = description[:75] + '...'
            lines.append("- [" + item['title'] + "](" + item['link'] + ")\n"
                         + description + "\n")
        message = "".join(lines)
        for chat_id in utils.SUBSCRIBERS:
            bot.sendMessage(chat_id, parse_mode='Markdown', text=message)


# schedule this job to run every 40 seconds
JOB_QUEUE.put(notify_news, 40, repeat=True)
def scrape_student_office():
    """Scrape address, phone, e-mail and opening hours of the student
    service office (scientific area) and write them to
    ../json/student_office.json.
    """
    scraped_info = {}
    student_office_url = "http://www.univaq.it/section.php?id=607"
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "accept-encoding": "gzip,deflate,sdch",
        "accept-language": "en-US,en;q=0.8",
    }
    request = requests.get(student_office_url, headers=headers)
    if request.status_code != 200:
        # BUG FIX: status_code is an int; "str" + int raised TypeError here.
        print("Error! Status {}".format(request.status_code))
        return
    # navigate from the "AREA SCIENTIFICA" heading to the first data row
    first_row = BeautifulSoup(request.text, "html.parser").find(string="AREA SCIENTIFICA")\
        .parent.parent.find_next_sibling().find("tr")
    address = first_row.find(class_="address_table_description").text
    phone = first_row.find_next_sibling().find(
        class_="address_table_description").text
    email = first_row.find_next_sibling().find_next_sibling()\
        .find(class_="address_table_description").text
    # add a separator after the closing hour for readability
    hours = first_row.find_next_sibling().find_next_sibling().find_next_sibling()\
        .find(class_="address_table_description").text.replace('\n', '')\
        .replace("13", "13, ")
    scraped_info.update({
        "indirizzo": address,
        "telefono": phone,
        "e-mail": email,
        "orari": hours
    })
    utils.write_json(scraped_info, "../json/student_office.json")
def notify_news(bot):
    """Periodic job: push unread news to all subscribers.

    Chat ids that Telegram rejects (e.g. the user blocked the bot) are
    collected and pruned from the subscriber list afterwards.
    """
    unread_news = news.check_news()
    invalid_chatid = []
    if unread_news:
        # refresh the local news cache with the latest ten items
        utils.write_json(news.pull_news(10), "json/news.json")
        bulletin = ""
        for item in unread_news:
            bulletin += "- [{title}]({link})\n{description}\n".format(**item)
        for chat_id in utils.SUBSCRIBERS:
            try:
                bot.sendMessage(chat_id, parse_mode='Markdown', text=bulletin)
            except TelegramError:
                # chat is gone or the bot was blocked; remember it for pruning
                invalid_chatid.append(chat_id)
    for chat_id in invalid_chatid:
        utils.SUBSCRIBERS.remove(chat_id)
    utils.write_json(utils.SUBSCRIBERS, "json/subscribers.json")
def scrape_student_office():
    """Scrape address, phone, e-mail and opening hours of the student
    service office (scientific area) and write them to
    ../json/student_office.json.
    """
    scraped_info = {}
    student_office_url = "http://www.univaq.it/section.php?id=607"
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "accept-encoding": "gzip,deflate,sdch",
        "accept-language": "en-US,en;q=0.8",
    }
    request = requests.get(student_office_url, headers=headers)
    if request.status_code != 200:
        # BUG FIX: status_code is an int; "str" + int raised TypeError here.
        print("Error! Status {}".format(request.status_code))
        return
    # navigate from the "AREA SCIENTIFICA" heading to the first data row
    first_row = BeautifulSoup(request.text, "html.parser").find(string="AREA SCIENTIFICA")\
        .parent.parent.find_next_sibling().find("tr")
    address = first_row.find(class_="address_table_description").text
    phone = first_row.find_next_sibling().find(class_="address_table_description").text
    email = first_row.find_next_sibling().find_next_sibling()\
        .find(class_="address_table_description").text
    # add a separator after the closing hour for readability
    hours = first_row.find_next_sibling().find_next_sibling().find_next_sibling()\
        .find(class_="address_table_description").text.replace('\n', '')\
        .replace("13", "13, ")
    scraped_info.update({
        "indirizzo": address,
        "telefono": phone,
        "e-mail": email,
        "orari": hours
    })
    utils.write_json(scraped_info, "../json/student_office.json")
# NOTE(review): this chunk begins mid-function — the lines below are the tail
# of phone_cleanup(); its `def` line and `if not s:` guard precede this view.
        return ''
    # keep only digits and the leading '+'
    s = ''.join([c for c in s if c.isdigit() or c == '+'])
    if s and s[0] != '+' and len(s) == 10:
        s = '+39' + s  # if not already internationalized, make it Italian
    # hyphen-group +39 numbers; leave any other format untouched
    return '-'.join([s[:3], s[3:7], s[7:]]) if s.startswith('+39') else s


def scrape_professors(url=PROFESSORS_URL):
    """Get information about professors.

    Scrapes the teaching-staff table at *url* and returns a list of dicts
    with keys "nome", "telefono", "e-mail" and "corsi"; empty fields fall
    back to "non disponibile".
    """
    scraped_professors = []
    soup = utils.get_soup_from_url(url)
    # each professor row starts with a name cell spanning two columns
    professor_names = soup.find("table").find_all(colspan='2')
    for name_cell in professor_names:
        # the row holds five <td>s: name, phone, e-mail, courses, (unused)
        name, phone, email, courses, _ = name_cell.parent.find_all('td')
        scraped_professors.append({
            "nome": name.text or "non disponibile",
            "telefono": phone_cleanup(phone.text) or "non disponibile",
            "e-mail": email_soup_cleanup(email) or "non disponibile",
            "corsi": courses_cleanup(courses.text) or "non disponibile",
        })
    return scraped_professors


if __name__ == "__main__":
    utils.write_json(scrape_professors(), "../json/professors.json")
# NOTE(review): this chunk begins mid-function — the lines below are the tail
# of email_soup_cleanup(); its `def` line and the loop over the obfuscating
# <img> tags precede this view.
        img.replace_with('.')
    return email_soup.text.strip()  # .lower() # ?


def phone_cleanup(s):
    """Clean the phones' output.

    Strips everything except digits and '+', prefixes bare 10-digit
    numbers with the Italian +39 country code, and hyphen-groups +39
    numbers; any other format is returned unchanged.
    """
    if not s:
        return ''
    # keep only digits and the leading '+'
    s = ''.join([c for c in s if c.isdigit() or c == '+'])
    if s and s[0] != '+' and len(s) == 10:
        s = '+39' + s  # if not already internationalized, make it Italian
    return '-'.join([s[:3], s[3:7], s[7:]]) if s.startswith('+39') else s


def scrape_professors(url=PROFESSORS_URL):
    """Get information about professors.

    Scrapes the teaching-staff table at *url* and returns a list of dicts
    with keys "nome", "telefono", "e-mail" and "corsi"; empty fields fall
    back to "non disponibile".
    """
    scraped_professors = []
    soup = utils.get_soup_from_url(url)
    # each professor row starts with a name cell spanning two columns
    professor_names = soup.find("table").find_all(colspan='2')
    for name_cell in professor_names:
        # the row holds five <td>s: name, phone, e-mail, courses, (unused)
        name, phone, email, courses, _ = name_cell.parent.find_all('td')
        scraped_professors.append({
            "nome": name.text or "non disponibile",
            "telefono": phone_cleanup(phone.text) or "non disponibile",
            "e-mail": email_soup_cleanup(email) or "non disponibile",
            "corsi": courses_cleanup(courses.text) or "non disponibile",
        })
    return scraped_professors


if __name__ == "__main__":
    utils.write_json(scrape_professors(), "../json/professors.json")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""This script scrapes information about the student service office
from the univaq website."""
import sys
sys.path.insert(0, '../')

from libs.utils import utils

STUDENT_OFFICE_URL = "http://www.univaq.it/section.php?id=607"


def scrape_student_office(url=STUDENT_OFFICE_URL):
    """Get info about the student service office.

    Returns a dict with address, phone, e-mail and opening hours of the
    scientific-area office, parsed from the page at *url*.
    """
    soup = utils.get_soup_from_url(url)
    # Navigate from the "AREA SCIENTIFICA" heading to its sibling table.
    # CHANGED: find(text=...) -> find(string=...): the `text` argument is
    # deprecated in BeautifulSoup 4, and the sibling scrapers use `string`.
    area = soup.find(string='AREA SCIENTIFICA').parent.parent.find_next_sibling()
    address, phone, email, hours = area.find_all(class_='address_table_description')
    return {
        'indirizzo': address.text,
        'telefono': phone.text,
        'e-mail': email.text,
        # add a separator after the closing hour for readability
        'orari': hours.text.strip().replace('13', '13, ')
    }


if __name__ == "__main__":
    utils.write_json(scrape_student_office(), "../json/student_office.json")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""This script scrapes all the info about the adsu of the university's city."""
import sys
sys.path.insert(0, '../')

from libs.utils import utils

ADSU_URL = "http://www.adsuaq.org/"


def scrape_adsu(url=ADSU_URL):
    """Scrape general information from the ADSU page.

    The page markup is a mess of layout tables, so the text is taken from
    the single element with id "AutoNumber5" and stripped of formatting
    noise. Returns {"info": <text>}.
    """
    cell = utils.get_soup_from_url(url).find(id="AutoNumber5")
    text = cell.text
    # remove layout whitespace in the same order as chained .replace calls
    for needle in (" ", "\t", "\r", "\n\n"):
        text = text.replace(needle, "")
    return {"info": text}


if __name__ == "__main__":
    utils.write_json(scrape_adsu(), "../json/adsu.json")
def create_news_json():
    """Ensure json/news.json exists, seeding it with the ten latest news."""
    if os.path.isfile("json/news.json"):
        return
    utils.write_json(pull_news(10), "json/news.json")