Ejemplo n.º 1
0
    # NOTE(review): fragment — the enclosing function header above and the body
    # of the final `with` block below are outside this excerpt.
    # set up parallel processing pool
    pool = Pool(processes=64)

    for season in seasons:

        print(f"Retrieving demand by subject for season {season}")

        # dates relevant to this season — exact semantics defined by get_dates (not visible here)
        dates = get_dates(season)

        # one task tuple per subject; each worker also receives the full
        # subject list and the season's dates
        pool_args = [(season, subject_code, subject_codes, dates)
                     for subject_code in subject_codes]

        season_courses = []

        # use imap_unordered to report to tqdm
        with tqdm(total=len(pool_args), desc="Subjects retrieved") as pbar:
            for i, result in enumerate(
                    pool.imap_unordered(handle_season_subject_demand,
                                        pool_args)):
                pbar.update()

                # each result is a per-subject list; flattened below
                season_courses.append(result)

        # flatten season courses
        season_courses = [x for y in season_courses for x in y]

        # sort courses by title (for consistency with ferry-data)
        season_courses = sorted(season_courses, key=lambda x: x["title"])

        # write per-season demand stats (the dump call is truncated in this excerpt)
        with open(f"{config.DATA_DIR}/demand_stats/{season}_demand.json",
                  "w") as f:
Ejemplo n.º 2
0
            filename.split("_")[0]  # remove the _demand.json suffix
            for filename in os.listdir(f"{config.DATA_DIR}/demand_stats/")
            if filename[0] != "." and filename != "subjects.json"
        ]
    )

    # NOTE(review): fragment — the opening of the comprehension above and the
    # rest of the season loop below are outside this excerpt.

    # ----------------------
    # Import course listings
    # ----------------------

    print("[Importing courses]")
    print(f"Season(s): {', '.join(course_seasons)}")

    # per-season DataFrames, presumably concatenated later — the merge step is
    # not visible in this excerpt
    merged_course_info_: List[pd.DataFrame] = []

    for season in tqdm(course_seasons, desc="Loading course JSONs"):
        # Read the course listings, giving preference to freshly parsed over migrated ones.
        parsed_courses_file = Path(f"{config.DATA_DIR}/parsed_courses/{season}.json")

        if parsed_courses_file.is_file():
            parsed_course_info = pd.DataFrame(pd.read_json(str(parsed_courses_file)))
        else:
            # check migrated courses as a fallback
            migrated_courses_file = Path(
                f"{config.DATA_DIR}/migrated_courses/{season}.json"
            )

            # neither parsed nor migrated data exists: report and skip — the
            # continue/early-exit presumably follows, outside this excerpt
            if not migrated_courses_file.is_file():
                print(
                    f"Skipping season {season}: not found in parsed or migrated courses."
                )
Ejemplo n.º 3
0
    # NOTE(review): fragment — the enclosing scope starts above this excerpt.
    # cache list of classes
    with open(f"{config.DATA_DIR}/season_courses/{season}_fysem.json",
              "w") as f:
        ujson.dump(season_courses, f, indent=4)

# fetch detailed info for each class in each season
for season in seasons:

    # read the cached list of classes for this season
    with open(f"{config.DATA_DIR}/season_courses/{season}.json", "r") as infile:
        season_courses = ujson.load(infile)

    # track progress for each season
    tqdm.write(f"Fetching class information for season {season}")

    # collect the per-course JSON payloads for the whole season
    aggregate_season_json = [
        fetch_course_json(entry["code"], entry["crn"], entry["srcdb"])
        for entry in tqdm(season_courses)
    ]

    # cache to JSON for entire season
    with open(f"{config.DATA_DIR}/course_json_cache/{season}.json", "w") as outfile:
        ujson.dump(aggregate_season_json, outfile, indent=4)

    print()
Ejemplo n.º 4
0
    prev = fetch_course_lists(connection, limit=None)
    # NOTE(review): the string literal below is a commented-out test fixture
    # kept for manual debugging — it replaces `prev` with three known courses.
    """
    prev = [
        # Test with ACCT 270 from 201903.
        ("201903", "11970", {}),
        # Test with ACCT 170 from 200903.
        ("200903", "11256", {}),
        # Test with ECON 466 01 from 201003.
        # Compare with https://dougmckee.net/aging-evals-fall-2010.pdf.
        ("201003", "12089", {}),
    ]
    """

    # process the fetched list in reverse order — presumably newest first;
    # confirm against fetch_course_lists' ordering
    prev = list(reversed(prev))
    for course_season, course_crn, course_extras in tqdm(prev):
        identifier = f"{course_season}-{course_crn}"

        # skip work that is already cached on disk
        output_path = f"{config.DATA_DIR}/previous_evals/{identifier}.json"
        if isfile(output_path):
            tqdm.write(f"Skipping {identifier} - already exists")
            continue

        try:
            tqdm.write(f"Processing {identifier}")
            course_eval = fetch_legacy_ratings(connection, course_season,
                                               course_crn, course_extras)

            with open(output_path, "w") as file:
                ujson.dump(course_eval, file, indent=4)
        except KeyError as err:
Ejemplo n.º 5
0
# load list of classes per season
for season in seasons:

    print(f"Parsing courses for season {season}")

    fysem_file = Path(f"{config.DATA_DIR}/season_courses/{season}_fysem.json")

    # build the set of first-year-seminar CRNs when the filter file is present;
    # fall back to an empty set otherwise
    if not fysem_file.is_file():
        print("First-year seminars filter missing")
        fysem = set()
    else:
        with open(fysem_file, "r") as handle:
            fysem = {entry["crn"] for entry in ujson.load(handle)}
        print("Loaded first-year seminars")

    # load raw responses for season
    with open(f"{config.DATA_DIR}/course_json_cache/{season}.json", "r") as handle:
        aggregate_term_json = ujson.load(handle)

    # parse course JSON in season
    parsed_course_info = [
        extract_course_info(raw, season, fysem)
        for raw in tqdm(aggregate_term_json, ncols=96)
    ]

    # write output
    with open(f"{config.DATA_DIR}/parsed_courses/{season}.json", "w") as handle:
        ujson.dump(parsed_course_info, handle, indent=4)
Ejemplo n.º 6
0
            for filename in os.listdir(f"{config.DATA_DIR}/previous_json/")
            if filename[-4:] == "json" and filename[:5] != "evals"
        ]

        seasons = sorted(seasons)

    # NOTE(review): fragment — the opening of the comprehension above and the
    # rest of the per-course migration loop below are outside this excerpt.
    for season in seasons:

        migrated_courses = []

        with open(f"{config.DATA_DIR}/previous_json/{season}.json", "r") as f:

            previous_json = ujson.load(f)

            tqdm.write(f"Processing courses in season {season}")
            for course in tqdm(previous_json):

                # build the migrated record field by field
                migrated_course = {}

                migrated_course["season_code"] = str(season)

                migrated_course["description"] = convert_old_description(
                    course["description"])
                migrated_course["requirements"] = course["requirements"]

                # NOTE(review): defining truncate_title inside the loop
                # re-creates the function on every iteration; consider hoisting.
                def truncate_title(title: str) -> str:
                    """
                    Shorten a title if it is over 32 characters long.

                    Parameters
                    ----------
Ejemplo n.º 7
0
        # NOTE(review): fragment — the enclosing function header and the
        # opening `try:` are above this excerpt.
        handler_session = create_session()
        course_eval = fetch_course_eval(handler_session, course_crn,
                                        course_season_code)

        # persist the fetched evaluation to its per-course JSON file
        with open(output_path, "w") as file:
            ujson.dump(course_eval, file, indent=4)

        tqdm.write("Dumped in JSON")
    # except SeasonMissingEvalsError:
    #     tqdm.write(f"Skipping season {course_season_code} - missing evals")
    # except CrawlerError:
    #     tqdm.write(f"skipped - missing evals")

    # pylint: disable=broad-except
    except Exception as error:

        # traceback.print_exc()
        # best-effort crawl: log the failure and move on rather than aborting
        tqdm.write(f"Skipped - unknown error {error}")


# fetch ratings in parallel
pool = Pool(processes=16)

# use imap_unordered so the progress bar advances as each task finishes,
# regardless of submission order; the results themselves are unused here
# (handle_course_evals does its own output), so enumerate was dead weight
with tqdm(total=len(queue), desc="Subjects retrieved") as pbar:
    for _ in pool.imap_unordered(handle_course_evals, queue):
        pbar.update()

# every result has been consumed, so stopping the workers here is safe
pool.terminate()
Ejemplo n.º 8
0
def resolve_historical_courses(
    courses: pd.DataFrame, listings: pd.DataFrame
) -> Tuple[Dict[int, int], Dict[int, List[int]], Dict[int, int], Dict[
        int, List[int]]]:
    """
    Among courses, identify historical offerings of a course.

    This is equivalent to constructing a partition of course_ids such that each
    partition contains the same courses, offered over different terms.

    Parameters
    ----------
    courses:
        'courses' table.
    listings:
        'listings' table.

    Returns
    -------
    course_to_same_course:
        Mapping from course_id to resolved same_course id.
    same_course_to_courses:
        Mapping from resolved same_course id to group of identical courses.
    course_to_same_course_filtered:
        Mapping from course_id to resolved same_course id, with title/description filtering.
    same_course_to_courses_filtered:
        Mapping from resolved same_course id to group of identical courses, with
        title/description filtering.
    """
    # map course to codes and code to courses
    course_to_codes, code_to_courses = map_to_groups(listings, "course_id",
                                                     "course_code")

    # map course_id to course codes
    courses_codes = courses.set_index("course_id", drop=False)[  # type: ignore
        "course_id"].apply(course_to_codes.get)

    # map course_id to all other courses with overlapping codes
    courses_shared_code = courses_codes.apply(
        lambda x: [code_to_courses[code] for code in x])
    # flatten courses with overlapping codes, deduplicating
    courses_shared_code = courses_shared_code.apply(
        lambda x: list(set(flatten_list_of_lists(x))))

    # keep only sufficiently long titles and descriptions for matching
    long_titles = courses.loc[courses["title"].fillna("").apply(len) >=
                              MIN_TITLE_MATCH_LEN  # type: ignore
                              ]
    long_descriptions = courses.loc[
        courses["description"].fillna("").apply(len)  # type: ignore
        >= MIN_DESCRIPTION_MATCH_LEN]

    # construct initial graph of courses:
    # each node is a unique course from the 'courses' table, and two courses are
    # linked if they share a common course code
    #
    # edges are then pruned for title/description match

    same_courses = networkx.Graph()

    # fill in the nodes first to keep courses with no same-code edges
    for course in courses["course_id"]:
        same_courses.add_node(course)

    # Series.iteritems() was removed in pandas 2.0; Series.items() is the
    # long-supported, behaviorally identical replacement.
    for course, shared_code_courses in tqdm(
            courses_shared_code.items(),  # type: ignore
            total=len(courses_shared_code),
            desc="Populating initial same-courses graph",
    ):
        for other_course_id in shared_code_courses:
            same_courses.add_edge(course, other_course_id)

    # filtered same-courses graph:
    # we iterate over edges of the same_courses graph and keep the ones that satisfy
    # our title/description matching criteria
    same_courses_filtered = networkx.Graph()

    # fill in the nodes first to keep courses with no same-code edges
    for course_id in courses["course_id"]:
        same_courses_filtered.add_node(course_id)

    # course_id to title and description for graph pruning
    course_to_title, _ = map_to_groups(long_titles, "course_id", "title")
    course_to_description, _ = map_to_groups(long_descriptions, "course_id",
                                             "description")

    for course_1, course_2 in tqdm(
            same_courses.edges(data=False),
            desc="Building filtered same-courses graph"):

        # take the first mapped value, defaulting to "" for courses whose
        # title/description was filtered out above
        title_1 = course_to_title.get(course_1, [""])[0]
        title_2 = course_to_title.get(course_2, [""])[0]

        description_1 = course_to_description.get(course_1, [""])[0]
        description_2 = course_to_description.get(course_2, [""])[0]

        # if title and description are similar enough, keep the edge
        if is_same_course(title_1, title_2, description_1, description_2):

            same_courses_filtered.add_edge(course_1, course_2)

    print(f"Original shared-code edges: {same_courses.number_of_edges()}")
    print(
        f"Pruned shared-code edges: {same_courses_filtered.number_of_edges()}")

    print("Identifying same courses by connected components")

    # get same-course mappings from connected components
    course_to_same_course, same_course_to_courses = get_connected_courses(
        same_courses)
    (
        course_to_same_course_filtered,
        same_course_to_courses_filtered,
    ) = get_connected_courses(same_courses_filtered)

    return (
        course_to_same_course,
        same_course_to_courses,
        course_to_same_course_filtered,
        same_course_to_courses_filtered,
    )