def scrape_libraries_services():
    """Scrape the library locations table and export libraries and services.

    Fetches ``base_url``, walks every anchor inside the ``<dl>`` whose id is
    ``locations-table`` and builds either a ``Library`` or a
    ``LibraryService`` from each one. Entries are appended to the
    module-level ``libraries`` and ``services`` collections, given ids via
    ``generate_id`` and written to ``library.xml`` and
    ``library_service.xml``.
    """
    print("*************** Scraping Libraries and Services *********************")
    page = get_soup(base_url)
    locations = page.find("dl", {'id': 'locations-table'})

    # Most recently seen library; services that follow are attached to it.
    current_library = None
    for anchor in locations.find_all("a"):
        name = anchor.find_next("dt").get_text()
        link = anchor.get('href')

        # A name starting with a space marks a service of the preceding
        # library; anything else is a library in its own right.
        # NOTE(review): whitespace in this file appears collapsed -- confirm
        # the exact prefix being tested here.
        if not name.startswith(' '):
            current_library = Library(name, link)
            populate_details(current_library)
            libraries.append(current_library)
            continue

        service = LibraryService(name, link)
        service.library = current_library
        populate_details(service)
        services.append(service)

    generate_id(libraries)
    generate_id(services)

    XmlList().from_list(libraries).save("library.xml")
    XmlList().from_list(services).save("library_service.xml")
def scrapeCampusServices():
    """Scrape campus services and their related web links, then export both.

    Fetches ``https://www.dal.ca/faculty_staff.html`` and visits every
    ``<h4 class="c-title">`` heading. Each heading yields one
    ``CampusService`` (ids are assigned sequentially starting at 1) and one
    ``WebLinks`` entry per ``<li>`` of the ``<ul>`` found after the heading.
    Services are saved to ``campus_service.xml`` and links to
    ``web_links.xml``.
    """
    print("*************** Scraping Campus Services *********************")
    page = get_soup('https://www.dal.ca/faculty_staff.html')
    headings = page.find_all("h4", class_="c-title")

    campus_services: List[CampusService] = []
    web_links: List[WebLinks] = []

    for service_id, heading in enumerate(headings, start=1):
        list_items = heading.find_next("ul").find_all("li")

        # NOTE(review): find_next("h4") searches *after* `heading`, so the
        # anchor may belong to a later <h4> -- confirm against the page markup.
        service_anchor = heading.find_next("h4").find_next("a")
        service_url = dal_prefix(service_anchor.get("href"))
        service_name = service_anchor.get_text()

        campus_service = CampusService(service_name, service_url)
        campus_service.id = service_id
        campus_services.append(campus_service)

        for item in list_items:
            anchor = item.find_next("a")
            target = dal_prefix(anchor.get('href'))
            label = anchor.get_text()

            entry = WebLinks(label, target, service_name)
            # Tie the link back to the service it was listed under.
            entry.service_id = service_id
            web_links.append(entry)

    services_xml = XmlList()
    services_xml.from_list(campus_services)
    services_xml.save("campus_service.xml")

    links_xml = XmlList()
    links_xml.from_list(web_links)
    links_xml.save("web_links.xml")
def scrape_buildings():
    """Collect buildings and export campus and building XML files.

    Calls ``get_buildings`` with an empty dict that is afterwards written out
    as ``campus.xml``, assigns ids to the returned buildings, and saves their
    XML objects to ``building.xml``. The amenity keys seen across all
    buildings are gathered and passed to ``build_ids_from_dict``; they are
    not written to disk here.
    """
    campus_ids = {}
    buildings = get_buildings(campus_ids)
    buildings_xml = XmlList()
    # Used as a set of amenity names: every value is the placeholder 1.
    amenities: Dict[str, int] = {}

    generate_id(buildings)
    for building in buildings:
        buildings_xml.add(building.to_xml_obj())
        amenities.update(dict.fromkeys(building.amenities.keys(), 1))

    build_ids_from_dict(amenities)
    # Amenity export is currently disabled:
    # save_to_file(dict_to_xml_rows(amenities, "amenity"), "amenity.xml")
    save_to_file(dict_to_xml_rows(campus_ids, "campus"), "campus.xml")
    buildings_xml.save("building.xml")