def get_course_descriptions():
    """
    Given a Lingk API key and secret for authentication, return a
    dictionary mapping course codes (in the form used on the
    frontend) to course descriptions.

    Raise ScrapeError if the API is not available or returns bad data.
    """
    if util.get_env_boolean("lingk"):
        key = os.environ.get("HYPERSCHEDULE_LINGK_KEY")
        secret = os.environ.get("HYPERSCHEDULE_LINGK_SECRET")
        if not key or not secret:
            util.log("Skipping Lingk as key and secret are not set")
            return {}
        util.log_verbose("Scraping Lingk API")
        data = get_lingk_api_data(key, secret)
        desc_index = lingk_api_data_to_course_descriptions(data)
    else:
        util.log_verbose("Scraping Lingk CSV")
        data = get_lingk_csv_data()
        desc_index = lingk_csv_data_to_course_descriptions(data)
    if len(desc_index) < 100:
        raise ScrapeError("Not enough course descriptions: {}".format(
            len(desc_index)))
    return desc_index
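
# A usage sketch (not part of the original module). Note that the keys of
# the returned index are course-info tuples rather than plain strings; see
# how get_courses builds desc_key below.
def example_course_descriptions():
    desc_index = get_course_descriptions()
    util.log_verbose(f"Fetched {len(desc_index)} course descriptions")
    return desc_index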
def get_portal_html(browser):
    """
    Given a Selenium browser object, use it to scrape Portal.
    Return a tuple (html, term) with the HTML of the course search
    results page as a string and the current term (for which courses
    were retrieved) also as a string.

    Raise ScrapeError if something goes wrong with the browser or
    Portal.
    """
    util.log_verbose(f"Current scraper IP is {get_ip()}")
    util.log_verbose("Scraping Portal")

    browser.set_window_size(1920, 1200)

    url = ("https://portal.hmc.edu/ICS/Portal_Homepage.jnz?"
           "portlet=Course_Schedules&screen=Advanced+Course+Search"
           "&screenType=next")
    browser.get(url)

    term_dropdown = selenium.webdriver.support.ui.Select(
        browser.find_element_by_id("pg0_V_ddlTerm"))
    term_names = [option.text for option in term_dropdown.options]

    terms_info = []
    for term_name in term_names:
        match = re.match(r"\s*(FA|SP)\s*([0-9]{4})\s*", term_name)
        if match:
            fall_or_spring, year_str = match.groups()
            terms_info.append(
                (int(year_str), fall_or_spring == "FA", term_name))

    if not terms_info:
        raise ScrapeError(
            f"couldn't parse any term names (from: {term_names!r})")

    # Pick the most recent term: the tuples compare by year first, then by
    # fall-vs-spring (True sorts after False, so FA wins within a year).
    _, _, term = max(terms_info)
    term_dropdown.select_by_visible_text(term)

    # Searching for "?" in the title field appears to act as a wildcard,
    # matching every course in the selected term.
    title_input = browser.find_element_by_id("pg0_V_txtTitleRestrictor")
    title_input.clear()
    title_input.send_keys("?")

    search_button = browser.find_element_by_id("pg0_V_btnSearch")
    search_button.click()

    show_all_link = browser.find_element_by_id("pg0_V_lnkShowAll")
    show_all_link.click()

    # Collapse any extra whitespace in the term name before returning it.
    return browser.page_source, " ".join(term.split())
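
# A usage sketch (not part of the original module): drive get_portal_html
# with a headless Chrome. Assumes a Selenium 3-era setup, to match the
# find_element_by_id calls above, and that chromedriver is on the PATH.
def example_portal_scrape():
    options = selenium.webdriver.ChromeOptions()
    options.add_argument("--headless")
    browser = selenium.webdriver.Chrome(options=options)
    try:
        return get_portal_html(browser)
    finally:
        browser.quit()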
def get_lingk_api_data(key, secret):
    """
    Return the decoded JSON response from the Lingk API, using the
    given key and secret for authentication.

    Raise ScrapeError if the API is not available or returns bad data.
    """
    # For some bizarre reason the Lingk API sometimes returns 401
    # Unauthorized even when you are authenticated correctly. Asking
    # again a few times fixes the issue. I don't even want to know
    # why.
    last_error = None
    fails = 0
    for _ in range(LINGK_RETRY_COUNT):
        now = datetime.datetime.utcnow()
        date = now.strftime("%a, %d %b %Y %H:%M:%S UTC")
        response = requests.get(
            LINGK_URL,
            headers={
                "Date": date,
                "Authorization": get_auth_header(key, secret, date),
            },
        )
        try:
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            fails += 1
            util.log_verbose(
                "Got auth error from Lingk API ({} of {} allowed)".format(
                    fails, LINGK_RETRY_COUNT))
            time.sleep(1)
            last_error = e
            continue
        except json.decoder.JSONDecodeError:
            # JSONDecodeError subclasses ValueError, so it must be caught
            # first; otherwise the handler below would shadow it.
            raise ScrapeError("Lingk API did not return valid JSON")
        except ValueError:
            raise ScrapeError("Lingk API returned no data")
    raise ScrapeError(
        "Lingk API returned error response: {}".format(last_error))
def get_courses(desc_index):
    """
    Return a tuple containing the list of course objects and the
    current term. Takes `desc_index` as returned by
    `lingk.get_course_descriptions`.
    """
    browser = get_browser()
    html, term = get_portal_html(browser)
    # Save on memory.
    scraper.kill_google_chrome()
    # Count how many courses we add descriptions to, so we can fail if
    # there aren't enough.
    num_descs_added = 0
    # Count how many courses we fail to parse, so we can fail if there
    # are too many.
    num_failed = 0
    # Get the first round of raw courses from Portal.
    raw_courses_1 = parse_portal_html(html)
    # Add course descriptions to them, using the raw course codes.
    # Also collect the course codes into a dictionary so that we can
    # deduplicate them.
    raw_courses_2 = []
    course_info_map = collections.defaultdict(list)
    for raw_course in raw_courses_1:
        try:
            course_code = raw_course["course_code"].strip()
            course_info = shared.parse_course_code(course_code,
                                                   with_section=True)
            desc_key = tuple(
                shared.course_info_as_list(course_info, with_section=False))
            desc = desc_index.get(desc_key)
            if desc:
                num_descs_added += 1
            raw_course["course_description"] = desc
            course_info_map[frozendict.frozendict(course_info)].append(
                raw_course)
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
            continue
        raw_courses_2.append(raw_course)
    if num_descs_added < 100:
        raise ScrapeError(
            f"not enough course descriptions added: {num_descs_added}")
    # Deduplicate course codes.
    raw_courses_3 = []
    for course_info, courses in course_info_map.items():
        if len(courses) > 1:
            if course_info["course_code_suffix"]:
                util.log_verbose(
                    f"Duplicate course with suffix ({len(courses)} copies): "
                    f"{format_raw_course(courses[0])!r}")
                num_failed += len(courses)
                continue
            if len(courses) > len(string.ascii_uppercase):
                util.log_verbose(
                    f"Duplicate course with too many copies ({len(courses)}): "
                    f"{format_raw_course(courses[0])!r}")
                num_failed += len(courses)
                continue
            # Disambiguate the copies by assigning suffix letters
            # A, B, C, ... in the order Portal listed them.
            for course, letter in zip(courses, string.ascii_uppercase):
                course["course_code_suffix"] = letter
        raw_courses_3.extend(courses)
    raw_courses = raw_courses_3
    courses = []
    for raw_course in raw_courses:
        try:
            courses.append(process_course(raw_course, term))
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
    if num_failed >= 10:
        raise ScrapeError(f"Too many malformed courses: {num_failed}")
    num_succeeded = len(raw_courses) - num_failed
    if num_succeeded < 500:
        raise ScrapeError(f"Not enough courses: {num_succeeded}")
    util.log_verbose(
        f"Added descriptions to {num_descs_added} out of {num_succeeded} courses"
    )
    return courses, term
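
# An end-to-end usage sketch (hypothetical driver code; `lingk` is the
# module that defines get_course_descriptions above).
def example_scrape():
    desc_index = lingk.get_course_descriptions()
    courses, term = get_courses(desc_index)
    util.log_verbose(f"Scraped {len(courses)} courses for term {term!r}")
    return courses, term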