def crawl_term_medians_for_url(url):
    """Scrape the grade-median table at *url*.

    Returns a list of median dicts (one per <tr> in the table body),
    sorted with the module's ``_median_dict_sorter`` comparator.
    """
    from functools import cmp_to_key  # stdlib, available on 2.7 and 3.x

    soup = retrieve_soup(url)
    table_rows = soup.find("table").find("tbody").find_all("tr")
    medians = [_convert_table_row_to_dict(table_row) for table_row in table_rows]
    # list.sort(cmp=...) was removed in Python 3; cmp_to_key wraps the
    # existing comparator with identical ordering semantics.
    medians.sort(key=cmp_to_key(_median_dict_sorter))
    return medians
def _get_department_urls_from_url(url):
    """Return the set of department-page URLs linked from *url*."""
    soup = retrieve_soup(url)
    # Resolve every anchor's href against the site root.
    hrefs = (anchor["href"] for anchor in soup.find_all("a", href=True))
    absolute_urls = [urlparse.urljoin(BASE_URL, href) for href in hrefs]
    return set(
        candidate for candidate in absolute_urls
        if _is_department_url(candidate, url)
    )
def crawl_courses_from_program_page_url(url, program_code):
    """Crawl every course linked from a program page.

    Fetches *url*, collects all linked course-page URLs (deduplicated and
    sorted for deterministic order), crawls each one, and returns a list
    of course-data dicts, dropping courses whose crawl yielded nothing.
    """
    soup = retrieve_soup(url)
    linked_urls = [
        urlparse.urljoin(BASE_URL, a["href"])
        for a in soup.find_all("a", href=True)
    ]
    # NOTE: the original genexp shadowed the `url` parameter; renamed.
    course_urls = sorted(set(
        linked_url for linked_url in linked_urls if _is_course_url(linked_url)
    ))
    # Explicit comprehension instead of filter(None, [...]): same truthiness
    # filtering, and always a list (filter() is lazy under Python 3).
    crawled = [_crawl_course_data(course_url, program_code)
               for course_url in course_urls]
    return [course for course in crawled if course]
def _get_program_urls_from_department_url(url):
    """Collect the program-page URLs reachable from a department page.

    Course links are first normalized to their parent (program) URL by
    stripping the final path segment; anything that then qualifies as a
    program URL is kept.
    """
    soup = retrieve_soup(url)
    program_urls = set()
    for anchor in soup.find_all("a", href=True):
        candidate = urlparse.urljoin(BASE_URL, anchor["href"])
        if _is_course_url(candidate):
            # A course URL lives one path level below its program page.
            candidate = "/".join(candidate.split("/")[:-1])
        if _is_program_url(candidate, url):
            program_urls.add(candidate)
    return program_urls
def _crawl_course_data(course_url, program_code):
    """Scrape one course page into a dict, or return None.

    The <h1> heading is whitespace-normalized, passed through the
    per-program COURSE_HEADING_CORRECTIONS table, then split into
    "<DEPT> <NUMBER[.SUB]> <title...>". Returns None when the (corrected)
    heading is empty.
    """
    soup = retrieve_soup(course_url)
    raw_heading = " ".join(soup.find("h1").get_text().split())
    corrections = COURSE_HEADING_CORRECTIONS.get(program_code, {})
    heading = corrections.get(raw_heading, raw_heading)
    if not heading:
        return None
    parts = heading.split()
    number, subnumber = parse_number_and_subnumber(parts[1])
    return {
        "department": parts[0],
        "description": soup.find(class_="desc").get_text(strip=True),
        "number": number,
        "subnumber": subnumber,
        "title": " ".join(parts[2:]),
        "url": course_url,
    }
def crawl_median_page_urls():
    """Fetch the median index page and return the term-median URLs on it."""
    index_soup = retrieve_soup(MEDIAN_PAGE_INDEX_URL)
    return _retrieve_term_medians_urls_from_soup(index_soup)
def crawl_timetable(term):
    """
    Crawl the timetable for *term* and return a list of section dicts.

    Timetable HTML is malformed. All table rows except the head do not
    have a proper starting <tr>, which requires us to:

    1. Iterate over <td></td> in chunks rather than by <tr></tr>
    2. Remove all </tr> in the table, which otherwise breaks
       BeautifulSoup into not allowing us to iterate over all the
       <td></td>

    To iterate over the <td></td> in chunks, we slice the flat list of
    <td> cells into fixed-width rows of ``num_columns`` cells each.
    """
    course_data = []
    request_data = DATA_TO_SEND.format(term=_get_timetable_term_code(term))
    soup = retrieve_soup(
        TIMETABLE_URL,
        data=request_data,
        # Strip the stray closing tags before parsing (see docstring).
        preprocess=lambda x: re.sub("</tr>", "", x),
    )
    # Hoist the table lookup: the original called find() twice.
    table = soup.find(class_="data-table")
    num_columns = len(table.find_all("th"))
    assert num_columns == 18
    tds = table.find_all("td")
    assert len(tds) % num_columns == 0
    # Stride-slice the flat cell list into rows; avoids the generator
    # plus py2-only integer-division chunking of the original.
    for start in xrange(0, len(tds), num_columns):
        row = tds[start:start + num_columns]
        number, subnumber = parse_number_and_subnumber(row[3].get_text())
        crosslisted_courses = _parse_crosslisted_courses(
            row[7].get_text(strip=True))
        course_data.append({
            "term": _convert_timetable_term_to_term(
                row[0].get_text(strip=True)),
            # row[1] (CRN) is intentionally skipped:
            # "crn": int(row[1].get_text(strip=True)),
            "program": row[2].get_text(strip=True),
            "number": number,
            "subnumber": subnumber,
            "section": int(row[4].get_text(strip=True)),
            # Drop non-ASCII characters from titles.
            "title": row[5].get_text(strip=True).encode(
                'ascii', 'ignore').decode('ascii'),
            "crosslisted": crosslisted_courses,
            "period": row[8].get_text(strip=True),
            "room": row[9].get_text(strip=True),
            "building": row[10].get_text(strip=True),
            "instructor": _parse_instructors(row[11].get_text(strip=True)),
            "world_culture": row[12].get_text(strip=True),
            "distribs": _parse_distribs(row[13].get_text(strip=True)),
            "limit": int_or_none(row[14].get_text(strip=True)),
            # row[15] (enrollment) is intentionally skipped:
            # "enrollment": int_or_none(row[15].get_text(strip=True)),
            "status": row[16].get_text(strip=True),
        })
    return course_data