Esempio n. 1
0
def _parse_crosslisted_courses(xlist_text):
    crosslisted_courses = []
    for course_text in (xlist_text.split(",") if xlist_text else []):
        program, numbers, section = course_text.split()
        number, subnumber = parse_number_and_subnumber(numbers)
        section = int(section)
        crosslisted_courses.append({
            "program": program,
            "number": number,
            "subnumber": subnumber,
            "section": section,
        })
    return crosslisted_courses
Esempio n. 2
0
def _crawl_course_data(course_url, program_code):
    """Fetch a course page and extract its heading fields and description.

    The text of the page's first <h1> is whitespace-normalized, then replaced
    by the matching entry in ``COURSE_HEADING_CORRECTIONS[program_code]`` when
    one exists. The heading is split on whitespace: the first token is the
    department, the second is passed to ``parse_number_and_subnumber``, and
    the remaining tokens form the title.

    Returns a dict with the keys "department", "description", "number",
    "subnumber", "title" and "url", or ``None`` when the heading is empty.
    """
    soup = retrieve_soup(course_url)
    raw_heading = " ".join(soup.find("h1").get_text().split())
    corrections = COURSE_HEADING_CORRECTIONS.get(program_code, {})
    heading = corrections.get(raw_heading, raw_heading)
    if not heading:
        return None

    heading_tokens = heading.split()
    department = heading_tokens[0]
    number, subnumber = parse_number_and_subnumber(heading_tokens[1])
    title = " ".join(heading_tokens[2:])
    # The description is the text of the first element with class "desc".
    description = soup.find(class_="desc").get_text(strip=True)
    return {
        "department": department,
        "description": description,
        "number": number,
        "subnumber": subnumber,
        "title": title,
        "url": course_url,
    }
Esempio n. 3
0
def _convert_table_row_to_dict(table_row):
    """Convert one table row of median data into a nested dict.

    The first four <td> cells of ``table_row`` are read as term, course,
    enrollment and median. The course cell is split on "-": piece 0 goes
    through ``clean_department_code``, piece 1 through
    ``parse_number_and_subnumber``, and piece 2 is the integer section.

    Returns a dict with the keys "course" (itself a dict of "department",
    "number", "subnumber"), "enrollment", "median", "section" and "term".
    """
    cells = table_row.find_all("td")
    term = cells[0].get_text(strip=True)
    course_parts = cells[1].get_text(strip=True).split("-")
    department = clean_department_code(course_parts[0])
    enrollment = int(cells[2].get_text(strip=True))
    section = int(course_parts[2])
    median = cells[3].get_text(strip=True)
    number, subnumber = parse_number_and_subnumber(course_parts[1])
    return {
        "course": {
            "department": department,
            "number": number,
            "subnumber": subnumber,
        },
        "enrollment": enrollment,
        "median": median,
        "section": section,
        "term": term,
    }
Esempio n. 4
0
def crawl_timetable(term):
    """
    Crawl the timetable for ``term`` and return a list of course dicts.

    Timetable HTML is malformed. All table rows except the head do not have
    a proper starting <tr>, which requires us to:

    1. Iterate over <td></td> in chunks rather than by <tr></tr>
    2. Remove all </tr> in the table, which otherwise breaks BeautifulSoup into
       not allowing us to iterate over all the <td></td>

    To iterate over the <td></td> in chunks, we get the number of columns
    from the <th> cells, collect every <td></td> into one flat list, and
    slice that list in steps of the column count; each slice is one row.

    Returns:
        A list with one dict per timetable row, with the keys "term",
        "program", "number", "subnumber", "section", "title", "crosslisted",
        "period", "room", "building", "instructor", "world_culture",
        "distribs", "limit" and "status".

    Raises:
        AssertionError: if the table does not have exactly 18 header cells,
            or the number of <td> cells is not a multiple of the column
            count. Raised explicitly (not via ``assert``) so the check still
            runs under ``python -O``.
    """
    course_data = []
    request_data = DATA_TO_SEND.format(term=_get_timetable_term_code(term))
    soup = retrieve_soup(
        TIMETABLE_URL,
        data=request_data,
        preprocess=lambda x: re.sub("</tr>", "", x),
    )
    data_table = soup.find(class_="data-table")
    num_columns = len(data_table.find_all("th"))
    if num_columns != 18:
        # The column indices used below are only valid for this layout.
        raise AssertionError(
            "expected 18 timetable columns, found {}".format(num_columns))

    all_cells = data_table.find_all("td")
    if len(all_cells) % num_columns != 0:
        # A partial trailing row would otherwise be silently dropped.
        raise AssertionError(
            "{} cells is not a multiple of {} columns".format(
                len(all_cells), num_columns))

    # Stepped slicing avoids xrange and "/" division, both of which behave
    # differently (or do not exist) on Python 3.
    for row_start in range(0, len(all_cells), num_columns):
        tds = all_cells[row_start:row_start + num_columns]

        number, subnumber = parse_number_and_subnumber(tds[3].get_text())
        crosslisted_courses = _parse_crosslisted_courses(
            tds[7].get_text(strip=True))
        # Drop any non-ASCII characters from the title.
        title = tds[5].get_text(strip=True).encode(
            'ascii', 'ignore').decode('ascii')

        course_data.append({
            "term": _convert_timetable_term_to_term(
                tds[0].get_text(strip=True)),
            # "crn": int(tds[1].get_text(strip=True)),
            "program": tds[2].get_text(strip=True),
            "number": number,
            "subnumber": subnumber,
            "section": int(tds[4].get_text(strip=True)),
            "title": title,
            "crosslisted": crosslisted_courses,
            "period": tds[8].get_text(strip=True),
            "room": tds[9].get_text(strip=True),
            "building": tds[10].get_text(strip=True),
            "instructor": _parse_instructors(tds[11].get_text(strip=True)),
            "world_culture": tds[12].get_text(strip=True),
            "distribs": _parse_distribs(tds[13].get_text(strip=True)),
            "limit": int_or_none(tds[14].get_text(strip=True)),
            # "enrollment": int_or_none(tds[15].get_text(strip=True)),
            "status": tds[16].get_text(strip=True),
        })
    return course_data