def _parse_crosslisted_courses(xlist_text):
    """Parse a comma-separated crosslisting string into course dicts.

    Each comma-separated entry is expected to be whitespace-separated as
    "<program> <number[.subnumber]> <section>". Returns a list of dicts
    with keys: program, number, subnumber, section. An empty or falsy
    input yields an empty list.
    """
    if not xlist_text:
        return []

    courses = []
    for entry in xlist_text.split(","):
        program, raw_number, raw_section = entry.split()
        number, subnumber = parse_number_and_subnumber(raw_number)
        courses.append({
            "program": program,
            "number": number,
            "subnumber": subnumber,
            "section": int(raw_section),
        })
    return courses
def _crawl_course_data(course_url, program_code):
    """Fetch one course page and extract its catalog data.

    Downloads the page at *course_url*, normalizes the <h1> heading
    (applying any per-program correction from COURSE_HEADING_CORRECTIONS),
    and splits it into department, number/subnumber, and title.

    Returns a dict with keys department, description, number, subnumber,
    title, url — or None when the (possibly corrected) heading is empty.
    """
    soup = retrieve_soup(course_url)

    # Collapse all whitespace in the heading to single spaces.
    heading = " ".join(soup.find("h1").get_text().split())
    corrections = COURSE_HEADING_CORRECTIONS.get(program_code, {})
    heading = corrections.get(heading, heading)
    if not heading:
        return None

    # Heading layout: "<DEPT> <number[.subnumber]> <title words...>"
    parts = heading.split()
    number, subnumber = parse_number_and_subnumber(parts[1])
    return {
        "department": parts[0],
        "description": soup.find(class_="desc").get_text(strip=True),
        "number": number,
        "subnumber": subnumber,
        "title": " ".join(parts[2:]),
        "url": course_url,
    }
def _convert_table_row_to_dict(table_row):
    """Convert one median-table <tr> into a structured median dict.

    Expects four <td> cells: term, course code ("DEPT-NUMBER-SECTION"),
    enrollment, and median grade. The course code is split once and its
    pieces parsed via the module helpers.

    Returns a dict with keys: course (department/number/subnumber),
    enrollment, median, section, term.
    """
    cells = table_row.find_all("td")
    term = cells[0].get_text(strip=True)
    course = cells[1].get_text(strip=True)

    # Split the course code once instead of re-splitting per field.
    course_parts = course.split("-")
    department = clean_department_code(course_parts[0])
    number, subnumber = parse_number_and_subnumber(course_parts[1])
    section = int(course_parts[2])

    enrollment = int(cells[2].get_text(strip=True))
    median = cells[3].get_text(strip=True)

    return {
        "course": {
            "department": department,
            "number": number,
            "subnumber": subnumber,
        },
        "enrollment": enrollment,
        "median": median,
        "section": section,
        "term": term,
    }
def crawl_timetable(term):
    """Crawl the timetable for *term* and return a list of course dicts.

    The timetable HTML is malformed: every data row except the header
    lacks its opening <tr>. We work around this by stripping all </tr>
    tags before parsing (otherwise BeautifulSoup refuses to expose every
    <td>), then walking the flat list of <td> cells in fixed-width
    chunks of one row each.
    """
    course_data = []
    request_data = DATA_TO_SEND.format(term=_get_timetable_term_code(term))
    soup = retrieve_soup(
        TIMETABLE_URL,
        data=request_data,
        preprocess=lambda html: re.sub("</tr>", "", html),
    )

    table = soup.find(class_="data-table")
    num_columns = len(table.find_all("th"))
    assert num_columns == 18
    cells = table.find_all("td")
    # The flat cell list must be an exact multiple of the column count.
    assert len(cells) % num_columns == 0

    # Slice the flat <td> list into one row per course offering.
    for start in xrange(0, len(cells), num_columns):
        row = cells[start:start + num_columns]
        number, subnumber = parse_number_and_subnumber(row[3].get_text())
        course_data.append({
            "term": _convert_timetable_term_to_term(row[0].get_text(strip=True)),
            # "crn": int(row[1].get_text(strip=True)),
            "program": row[2].get_text(strip=True),
            "number": number,
            "subnumber": subnumber,
            "section": int(row[4].get_text(strip=True)),
            "title": row[5].get_text(strip=True).encode('ascii', 'ignore').decode('ascii'),
            "crosslisted": _parse_crosslisted_courses(row[7].get_text(strip=True)),
            "period": row[8].get_text(strip=True),
            "room": row[9].get_text(strip=True),
            "building": row[10].get_text(strip=True),
            "instructor": _parse_instructors(row[11].get_text(strip=True)),
            "world_culture": row[12].get_text(strip=True),
            "distribs": _parse_distribs(row[13].get_text(strip=True)),
            "limit": int_or_none(row[14].get_text(strip=True)),
            # "enrollment": int_or_none(row[15].get_text(strip=True)),
            "status": row[16].get_text(strip=True),
        })
    return course_data