Exemple #1
0
def lingk_csv_data_to_course_descriptions(data):
    """
    Given the parsed rows of the Lingk CSV (an iterable whose first
    item is the header row), return a dictionary mapping tuples of
    course information (`with_section` false; see
    `shared.course_info_as_list`) to course descriptions whose runs of
    whitespace have been collapsed to single spaces.

    Rows that are empty or whitespace-only, rows whose course code
    cannot be parsed, and rows with an empty description are skipped.

    Throw ScrapeError if the data is empty, if the header lacks the
    "courseNumber" or "description" column, if a non-blank row has a
    different number of fields than the header, or if fewer than 100
    descriptions were collected.
    """
    all_rows = list(data)
    if not all_rows:
        # Without this guard the unpacking below would fail with a bare
        # ValueError instead of the ScrapeError used for bad input.
        raise ScrapeError("Lingk CSV is empty (no header row)")
    header, *rows = all_rows
    try:
        course_code_idx = header.index("courseNumber")
        desc_idx = header.index("description")
    except ValueError:
        raise ScrapeError(f"unexpected header: {header!r}") from None
    desc_map = {}
    for row in rows:
        # We have some rows that are completely empty and some that
        # are just whitespace.
        if not row or "".join(row).isspace():
            continue
        if len(row) != len(header):
            raise ScrapeError(f"malformed row: {row!r}")
        course_code = row[course_code_idx]
        try:
            course_info = shared.parse_course_code(course_code,
                                                   with_section=False)
        except ScrapeError:
            # Unparseable course codes are dropped rather than treated
            # as fatal.
            continue
        index_key = tuple(
            shared.course_info_as_list(course_info, with_section=False))
        description = row[desc_idx]
        if not description:
            continue
        # Collapse every run of whitespace into a single space.
        description = " ".join(description.split())
        # If two conflicting descriptions for the same course code
        # (yep, it happens), pick the one that comes later :/
        desc_map[index_key] = description
    if len(desc_map) < 100:
        raise ScrapeError(
            f"Not enough course descriptions: {len(desc_map)}")
    return desc_map
Exemple #2
0
def course_to_key(course):
    """
    Build the lookup key for a course object.

    The key is the tuple form of the section-less course information
    parsed from `course["courseCode"]`, suitable for indexing into the
    course description dictionary returned by
    `lingk.get_course_descriptions`.
    """
    parsed = shared.parse_course_code(course["courseCode"],
                                      with_section=False)
    key_parts = shared.course_info_as_list(parsed, with_section=False)
    return tuple(key_parts)
def lingk_api_data_to_course_descriptions(data):
    """
    Build the course description index from the decoded Lingk API JSON.

    The result maps tuples of course information (`with_section`
    false; see `shared.course_info_as_list`) to description strings.
    Entries without a "description" field, and the special
    "ABROAD   HM" course number, are ignored.

    Throw ScrapeError if the top level is not a map, if the "data"
    field is missing, if a description is not a string, if an entry
    with a description has no "courseNumber", or if the same course
    appears twice with different descriptions.
    """
    if not isinstance(data, dict):
        raise ScrapeError("Lingk JSON is not map")
    if "data" not in data:
        raise ScrapeError("Lingk JSON is missing 'data' field")
    descriptions_by_key = {}
    for position, entry in enumerate(data["data"]):
        # Entries without a description contribute nothing.
        if "description" not in entry:
            continue
        text = entry["description"]
        if not isinstance(text, str):
            raise ScrapeError(
                "'description' at index {} is not string".format(position))
        if "courseNumber" not in entry:
            raise ScrapeError(
                "Lingk JSON at index {} is missing 'courseNumber' field".
                format(position))
        raw_code = entry["courseNumber"]
        # Special case that doesn't show up on Portal.
        if raw_code == "ABROAD   HM":
            continue
        parsed = shared.parse_course_code(raw_code, with_section=False)
        key = tuple(shared.course_info_as_list(parsed, with_section=False))
        # Record the description on first sight; an exact repeat is
        # tolerated, a differing repeat is an error.
        recorded = descriptions_by_key.setdefault(key, text)
        if recorded != text:
            raise ScrapeError("Lingk JSON has duplicate course: {}".format(
                repr(key)))
    return descriptions_by_key
def get_courses(desc_index):
    """
    Return a tuple containing the list of course objects and the
    current term. Takes `desc_index` as returned by
    `lingk.get_course_descriptions`.

    Throw ScrapeError if fewer than 100 courses received a
    description, if 10 or more courses could not be processed, or if
    fewer than 500 courses were processed successfully.
    """
    browser = get_browser()
    html, term = get_portal_html(browser)
    # Save on memory.
    scraper.kill_google_chrome()
    # Count how many courses we add descriptions to, so we can fail if
    # there aren't enough.
    num_descs_added = 0
    # Count how many courses we fail to parse, so we can fail if there
    # are too many.
    num_failed = 0
    # Add course descriptions to the raw Portal courses, using the raw
    # course codes. Also group the courses by their parsed course info
    # so that we can deduplicate them.
    course_info_map = collections.defaultdict(list)
    for raw_course in parse_portal_html(html):
        try:
            course_code = raw_course["course_code"].strip()
            course_info = shared.parse_course_code(course_code,
                                                   with_section=True)
            # Descriptions are indexed without the section.
            desc_key = tuple(
                shared.course_info_as_list(course_info, with_section=False))
            desc = desc_index.get(desc_key)
            if desc:
                num_descs_added += 1
            raw_course["course_description"] = desc
            course_info_map[frozendict.frozendict(course_info)].append(
                raw_course)
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
    if num_descs_added < 100:
        raise ScrapeError(
            f"not enough course descriptions added: {num_descs_added}")
    # Deduplicate course codes: copies sharing the same course info are
    # told apart by assigning each a letter suffix, unless the course
    # already has a suffix or there are more copies than letters.
    raw_courses = []
    for course_info, copies in course_info_map.items():
        if len(copies) > 1:
            if course_info["course_code_suffix"]:
                util.log_verbose(
                    f"Duplicate course with suffix ({len(copies)} copies): "
                    f"{format_raw_course(copies[0])!r}")
                num_failed += len(copies)
                continue
            if len(copies) > len(string.ascii_uppercase):
                util.log_verbose(
                    f"Duplicate course with too many copies ({len(copies)}): "
                    f"{format_raw_course(copies[0])!r}")
                num_failed += len(copies)
                continue
            # NOTE(review): the suffix is stored on the raw course dict;
            # confirm that the downstream processing actually reads it.
            for course, letter in zip(copies, string.ascii_uppercase):
                course["course_code_suffix"] = letter
        raw_courses.extend(copies)
    courses = []
    for raw_course in raw_courses:
        try:
            courses.append(process_course(raw_course, term))
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
    if num_failed >= 10:
        raise ScrapeError(f"Too many malformed courses: {num_failed}")
    # Count what was actually produced. Subtracting `num_failed` from
    # `len(raw_courses)` would under-count, because failures from the
    # parsing and deduplication stages never made it into `raw_courses`.
    num_succeeded = len(courses)
    if num_succeeded < 500:
        raise ScrapeError(f"Not enough courses: {num_succeeded}")
    util.log_verbose(
        f"Added descriptions to {num_descs_added} out of {num_succeeded} courses"
    )
    return courses, term
def process_course(raw_course, term):
    """
    Turn a raw course object into something that the frontend can use.
    Return a dictionary.

    `raw_course` is read through the keys "course_code",
    "course_name", "faculty", "seats", "status", "begin_date",
    "end_date", "schedule", "credits" and "course_description".
    `term` is copied unchanged into the result as "courseTerm".

    If the raw course object has invalid data, raise ScrapeError.
    """
    course_code = raw_course["course_code"].strip()
    course_info = shared.parse_course_code(course_code, with_section=True)
    # From here on `course_code` is the normalized string form.
    course_code = shared.course_info_as_string(course_info)
    sort_key = shared.course_info_as_list(course_info, with_section=True)
    mutual_exclusion_key = shared.course_info_as_list(course_info,
                                                      with_section=False)
    course_name = raw_course["course_name"].strip()
    if not course_name:
        raise ScrapeError("empty string for course name")
    # Deduplicate and sort the stripped instructor names.
    faculty = sorted(set(f.strip() for f in raw_course["faculty"]))
    if not faculty:
        raise ScrapeError("no faculty")
    for faculty_name in faculty:
        if not faculty_name:
            raise ScrapeError("faculty with empty name")
    try:
        # careful: the separator is DIVISION SLASH (`chr(8725)`), which
        # is not the same character as "/" (`chr(47)`)
        filled_seats, total_seats = map(int,
                                        raw_course["seats"].split("\u2215"))
    except ValueError as err:
        raise ScrapeError(
            f"malformed seat count: {raw_course['seats']!r} ({err})")
    if filled_seats < 0:
        raise ScrapeError(f"negative filled seat count: {filled_seats}")
    if total_seats < 0:
        raise ScrapeError(f"negative total seat count: {total_seats}")
    course_status = raw_course["status"].lower().strip()
    if course_status not in ("open", "closed", "reopened"):
        raise ScrapeError(f"unknown course status: {course_status!r}")
    begin_date = dateutil.parser.parse(raw_course["begin_date"]).date()
    end_date = dateutil.parser.parse(raw_course["end_date"]).date()
    # First half-semester courses start (spring) January 1 through
    # January 31 or (fall) July 15 through September 15. (For some
    # reason, MATH 30B in Fall 2017 is listed as starting August 8.)
    # Note that the comparisons are strict, so the boundary dates
    # themselves do not count.
    first_half = (
        datetime.date(begin_date.year, 1, 1) < begin_date <
        datetime.date(begin_date.year, 1, 31)
        or datetime.date(begin_date.year, 7, 15) < begin_date <
        datetime.date(begin_date.year, 9, 15))
    # Second half-semester courses for the spring end May 1 through
    # May 31, but there's also frosh chem pt.II which just *has* to be
    # different by ending 2/3 of the way through the semester. So we
    # also count that by allowing April 1 through April 30. Sigh. Fall
    # courses end December 1 through December 31.
    second_half = (
        datetime.date(end_date.year, 4, 1) < end_date <
        datetime.date(end_date.year, 5, 31)
        or datetime.date(end_date.year, 12, 1) < end_date <
        datetime.date(end_date.year, 12, 31))
    if first_half and second_half:
        # Starts in the first half and ends in the second: full term.
        term_count = 1
        terms = [0]
    elif first_half and not second_half:
        term_count = 2
        terms = [0]
    elif second_half and not first_half:
        term_count = 2
        terms = [1]
    else:
        # Use explicit %Y-%m-%d rather than the platform-specific %F.
        raise ScrapeError(
            f"weird date range "
            f"{begin_date.strftime('%Y-%m-%d')}-"
            f"{end_date.strftime('%Y-%m-%d')}")
    schedule = []
    for slot in raw_course["schedule"]:
        # Placeholder slots for unscheduled courses carry no meeting
        # time; skip them.
        if re.match(r"To Be Arranged\xa00?0:00 ?- ?0?0:00 ?AM", slot):
            continue
        match = re.match(SCHEDULE_REGEX, slot)
        if not match:
            raise ScrapeError(f"malformed schedule slot: {slot!r}")
        days, start, end, location = match.groups()
        for day in days:
            if day not in DAYS_OF_WEEK:
                raise ScrapeError(
                    f"unknown day of week {day!r} in schedule slot {slot!r}")
        # Deduplicate the days and put them in DAYS_OF_WEEK order.
        days = "".join(sorted(set(days), key=DAYS_OF_WEEK.index))
        if not days:
            raise ScrapeError(f"no days in schedule slot {slot!r}")
        # When the start time has no AM/PM marker, borrow the last two
        # characters of the end time.
        if not (start.endswith("AM") or start.endswith("PM")):
            start += end[-2:]
        try:
            start = dateutil.parser.parse(start).time()
        except ValueError:
            raise ScrapeError(
                f"malformed start time {start!r} in schedule slot {slot!r}")
        try:
            end = dateutil.parser.parse(end).time()
        except ValueError:
            raise ScrapeError(
                f"malformed end time {end!r} in schedule slot {slot!r}")
        # Collapse runs of whitespace in the location.
        location = " ".join(location.strip().split())
        if not location:
            raise ScrapeError("empty string for location")
        # Start using camelCase here because we are constructing
        # objects that will be returned from the API as JSON -- no
        # longer just intermediate objects private to this module.
        schedule.append({
            "scheduleDays": days,
            "scheduleStartTime": start.strftime("%H:%M"),
            "scheduleEndTime": end.strftime("%H:%M"),
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": location,
        })
    if not schedule:
        # Courses without any real meeting slot still get one
        # placeholder entry carrying the dates and term information.
        schedule.append({
            "scheduleDays": "",
            "scheduleStartTime": "00:00",
            "scheduleEndTime": "00:00",
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": "N/A",
        })
    schedule = unique_preserve_order(schedule)
    num_credits = raw_course["credits"]
    try:
        num_credits = float(num_credits)
    except ValueError:
        raise ScrapeError(f"malformed credit count: {num_credits!r}")
    if num_credits < 0.0:
        raise ScrapeError(f"negative credit count: {raw_course['credits']}")
    # Credit adjustments, applied in priority order. Courses whose code
    # does not contain "HM-" have their credit count multiplied by 3.
    if "Colloquium" in course_name and num_credits == 0.0:
        num_credits = 0.5
    elif re.match("PE ", course_code) and num_credits == 0.0:
        num_credits = 1.0
    elif num_credits == 0.25:
        num_credits = 1.0
    elif not re.search(r"HM-", course_code):
        num_credits *= 3.0
    if num_credits == 9.0:
        num_credits = 3.0
    num_credits = str(num_credits)
    course_description = raw_course["course_description"]
    # just urls for now - we could add ratings or would take again percentages later
    urls = []
    for prof in faculty:
        a = RateMyProfAPI(teacher=prof)
        # scrape the info from RateMyProfessors site
        a.fetch_info()
        urls.append(a.get_url())
    return {
        "courseCode": course_code,
        "courseName": course_name,
        "courseSortKey": sort_key,
        "courseMutualExclusionKey": mutual_exclusion_key,
        "courseDescription": course_description,
        "courseInstructors": faculty,
        "courseInstructorRMPpages": urls,
        "courseTerm": term,
        "courseSchedule": schedule,
        "courseCredits": num_credits,
        "courseSeatsTotal": total_seats,
        "courseSeatsFilled": filled_seats,
        "courseWaitlistLength": None,
        "courseEnrollmentStatus": course_status,
    }
def process_course(raw_course, term):
    """
    Convert one raw course object into the dictionary handed to the
    frontend.

    The course code, name, instructors, seat counts, status, dates,
    schedule and credits are validated in that order, and the first
    problem found is reported.

    If the raw course object has invalid data, raise ScrapeError.
    """
    parsed_code = shared.parse_course_code(raw_course["course_code"].strip(),
                                           with_section=True)
    course_code = shared.course_info_as_string(parsed_code)
    sort_key = shared.course_info_as_list(parsed_code, with_section=True)
    mutual_exclusion_key = shared.course_info_as_list(parsed_code,
                                                      with_section=False)

    course_name = raw_course["course_name"].strip()
    if not course_name:
        raise ScrapeError("empty string for course name")

    # Instructors arrive as one newline-separated string.
    instructor_names = re.split(r"\s*\n\s*", raw_course["faculty"].strip())
    faculty = sorted(set(instructor_names))
    if not faculty:
        raise ScrapeError("no faculty")
    if any(not name for name in faculty):
        raise ScrapeError("faculty with empty name")

    seats_match = re.match(r"([0-9]+)/([0-9]+)", raw_course["seats"])
    if not seats_match:
        raise ScrapeError("malformed seat count: {}".format(
            repr(raw_course["seats"])))
    filled_seats = int(seats_match.group(1))
    total_seats = int(seats_match.group(2))
    if filled_seats < 0:
        raise ScrapeError(
            "negative filled seat count: {}".format(filled_seats))
    if total_seats < 0:
        raise ScrapeError("negative total seat count: {}".format(total_seats))

    course_status = raw_course["status"].lower()
    if course_status not in ("open", "closed", "reopened"):
        raise ScrapeError("unknown course status: {}".format(
            repr(course_status)))

    begin_date = dateutil.parser.parse(raw_course["begin_date"]).date()
    end_date = dateutil.parser.parse(raw_course["end_date"]).date()

    # A course belongs to the first half-semester when it starts in
    # the spring window (January) or the fall window (July 15 to
    # September 15). All bounds are exclusive.
    begin_year = begin_date.year
    starts_in_spring = (datetime.date(begin_year, 1, 1) < begin_date <
                        datetime.date(begin_year, 1, 31))
    starts_in_fall = (datetime.date(begin_year, 7, 15) < begin_date <
                      datetime.date(begin_year, 9, 15))
    first_half = starts_in_spring or starts_in_fall

    # A course belongs to the second half-semester when it ends in the
    # spring window (April 1 to May 31, which also covers courses that
    # finish early in April) or the fall window (December). All bounds
    # are exclusive.
    end_year = end_date.year
    ends_in_spring = (datetime.date(end_year, 4, 1) < end_date <
                      datetime.date(end_year, 5, 31))
    ends_in_fall = (datetime.date(end_year, 12, 1) < end_date <
                    datetime.date(end_year, 12, 31))
    second_half = ends_in_spring or ends_in_fall

    if not (first_half or second_half):
        raise ScrapeError("weird date range {}-{}".format(
            begin_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")))
    # Covering both halves means a single full term; otherwise the
    # course occupies one of two half-terms.
    term_count = 1 if (first_half and second_half) else 2
    terms = [0] if first_half else [1]

    start_date_text = begin_date.strftime("%Y-%m-%d")
    end_date_text = end_date.strftime("%Y-%m-%d")

    def make_slot(days, start_time, end_time, location):
        # Start using camelCase here because we are constructing
        # objects that will be returned from the API as JSON -- no
        # longer just intermediate objects private to this module.
        return {
            "scheduleDays": days,
            "scheduleStartTime": start_time,
            "scheduleEndTime": end_time,
            "scheduleStartDate": start_date_text,
            "scheduleEndDate": end_date_text,
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": location,
        }

    schedule = []
    for slot in raw_course["schedule"]:
        # Placeholder slots with no real meeting time are ignored.
        if slot.startswith("0:00 - 0:00 AM"):
            continue
        slot_match = re.match(SCHEDULE_REGEX, slot)
        if not slot_match:
            raise ScrapeError("malformed schedule slot: {}".format(repr(slot)))
        day_letters, start_text, end_text, place = slot_match.groups()
        for letter in day_letters:
            if letter not in DAYS_OF_WEEK:
                raise ScrapeError(
                    "unknown day of week {} in schedule slot {}".format(
                        repr(letter), repr(slot)))
        # Deduplicate the days and put them in DAYS_OF_WEEK order.
        days = "".join(sorted(set(day_letters), key=DAYS_OF_WEEK.index))
        if not days:
            raise ScrapeError("no days in schedule slot {}".format(repr(slot)))
        # A start time without an AM/PM marker borrows the last two
        # characters of the end time.
        if not start_text.endswith(("AM", "PM")):
            start_text += end_text[-2:]
        try:
            start_time = dateutil.parser.parse(start_text).time()
        except ValueError:
            raise ScrapeError(
                "malformed start time {} in schedule slot {}".format(
                    repr(start_text), repr(slot)))
        try:
            end_time = dateutil.parser.parse(end_text).time()
        except ValueError:
            raise ScrapeError(
                "malformed end time {} in schedule slot {}".format(
                    repr(end_text), repr(slot)))
        place = " ".join(place.strip().split())
        if not place:
            raise ScrapeError("empty string for location")
        schedule.append(
            make_slot(days, start_time.strftime("%H:%M"),
                      end_time.strftime("%H:%M"), place))
    if not schedule:
        # No usable slot: emit a single placeholder entry.
        schedule.append(make_slot("", "00:00", "00:00", "N/A"))
    schedule = unique_preserve_order(schedule)

    raw_credits = raw_course["credits"]
    try:
        num_credits = float(raw_credits)
    except ValueError:
        raise ScrapeError("malformed credit count: {}".format(
            repr(raw_credits)))
    if num_credits < 0.0:
        raise ScrapeError("negative credit count: {}".format(raw_credits))
    # Credit adjustments, applied in priority order.
    if "Colloquium" in course_name and num_credits == 0.0:
        num_credits = 0.5
    elif re.match("PE ", course_code) and num_credits == 0.0:
        num_credits = 1.0
    elif num_credits == 0.25:
        num_credits = 1.0
    elif not re.search(r"HM-", course_code):
        num_credits *= 3.0
    if num_credits == 9.0:
        num_credits = 3.0

    return {
        "courseCode": course_code,
        "courseName": course_name,
        "courseSortKey": sort_key,
        "courseMutualExclusionKey": mutual_exclusion_key,
        "courseDescription": raw_course["course_description"],
        "courseInstructors": faculty,
        "courseTerm": term,
        "courseSchedule": schedule,
        "courseCredits": str(num_credits),
        "courseSeatsTotal": total_seats,
        "courseSeatsFilled": filled_seats,
        "courseWaitlistLength": None,
        "courseEnrollmentStatus": course_status,
    }