# set up parallel processing pool
pool = Pool(processes=64)

for season in seasons:
    print(f"Retrieving demand by subject for season {season}")

    dates = get_dates(season)

    pool_args = [
        (season, subject_code, subject_codes, dates)
        for subject_code in subject_codes
    ]

    season_courses = []

    # use imap_unordered to report to tqdm
    with tqdm(total=len(pool_args), desc="Subjects retrieved") as pbar:
        for result in pool.imap_unordered(handle_season_subject_demand, pool_args):
            pbar.update()
            season_courses.append(result)

    # flatten season courses
    season_courses = [x for y in season_courses for x in y]

    # sort courses by title (for consistency with ferry-data)
    season_courses = sorted(season_courses, key=lambda x: x["title"])

    with open(f"{config.DATA_DIR}/demand_stats/{season}_demand.json", "w") as f:
        ujson.dump(season_courses, f, indent=4)
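
# For reference, the imap_unordered + tqdm progress pattern above in isolation.
# A minimal, self-contained sketch; the worker and inputs here are hypothetical
# placeholders rather than part of this pipeline.
from multiprocessing import Pool

from tqdm import tqdm


def demo_worker(n: int) -> int:
    # stand-in for a real task such as handle_season_subject_demand
    return n * n


if __name__ == "__main__":
    with Pool(processes=4) as demo_pool:
        results = []
        # imap_unordered yields results as workers finish, so the bar advances
        # per completed task rather than in submission order
        with tqdm(total=100, desc="Tasks completed") as pbar:
            for result in demo_pool.imap_unordered(demo_worker, range(100)):
                pbar.update()
                results.append(result)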
        filename.split("_")[0]  # remove the _demand.json suffix
        for filename in os.listdir(f"{config.DATA_DIR}/demand_stats/")
        if filename[0] != "." and filename != "subjects.json"
    ]
)

# ----------------------
# Import course listings
# ----------------------

print("[Importing courses]")
print(f"Season(s): {', '.join(course_seasons)}")

merged_course_info_: List[pd.DataFrame] = []

for season in tqdm(course_seasons, desc="Loading course JSONs"):
    # Read the course listings, giving preference to freshly parsed over
    # migrated ones.
    parsed_courses_file = Path(f"{config.DATA_DIR}/parsed_courses/{season}.json")

    if parsed_courses_file.is_file():
        parsed_course_info = pd.DataFrame(pd.read_json(str(parsed_courses_file)))
    else:
        # check migrated courses as a fallback
        migrated_courses_file = Path(
            f"{config.DATA_DIR}/migrated_courses/{season}.json"
        )
        if not migrated_courses_file.is_file():
            print(
                f"Skipping season {season}: not found in parsed or migrated courses."
            )
# cache list of classes
with open(f"{config.DATA_DIR}/season_courses/{season}_fysem.json", "w") as f:
    ujson.dump(season_courses, f, indent=4)

# fetch detailed info for each class in each season
for season in seasons:
    with open(f"{config.DATA_DIR}/season_courses/{season}.json", "r") as f:
        season_courses = ujson.load(f)

    # track progress for each season
    tqdm.write(f"Fetching class information for season {season}")

    # merge all the JSON results per season
    aggregate_season_json = []

    for course in tqdm(season_courses):
        course_json = fetch_course_json(
            course["code"], course["crn"], course["srcdb"]
        )
        aggregate_season_json.append(course_json)

    # cache to JSON for entire season
    with open(f"{config.DATA_DIR}/course_json_cache/{season}.json", "w") as f:
        ujson.dump(aggregate_season_json, f, indent=4)

    print()
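
# Note: caching the raw per-season responses means the parsing pass (which
# reads back from course_json_cache below) can be re-run without re-fetching
# anything from the API.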
prev = fetch_course_lists(connection, limit=None)

"""
prev = [
    # Test with ACCT 270 from 201903.
    ("201903", "11970", {}),
    # Test with ACCT 170 from 200903.
    ("200903", "11256", {}),
    # Test with ECON 466 01 from 201003.
    # Compare with https://dougmckee.net/aging-evals-fall-2010.pdf.
    ("201003", "12089", {}),
]
"""

prev = list(reversed(prev))

for course_season, course_crn, course_extras in tqdm(prev):
    identifier = f"{course_season}-{course_crn}"

    output_path = f"{config.DATA_DIR}/previous_evals/{identifier}.json"
    if isfile(output_path):
        tqdm.write(f"Skipping {identifier} - already exists")
        continue

    try:
        tqdm.write(f"Processing {identifier}")
        course_eval = fetch_legacy_ratings(
            connection, course_season, course_crn, course_extras
        )

        with open(output_path, "w") as file:
            ujson.dump(course_eval, file, indent=4)
    except KeyError as err:
# load list of classes per season
for season in seasons:
    print(f"Parsing courses for season {season}")

    fysem_file = Path(f"{config.DATA_DIR}/season_courses/{season}_fysem.json")

    if fysem_file.is_file():
        with open(fysem_file, "r") as f:
            fysem = ujson.load(f)
        fysem = {x["crn"] for x in fysem}
        print("Loaded first-year seminars")
    else:
        print("First-year seminars filter missing")
        fysem = set()

    # load raw responses for season
    with open(f"{config.DATA_DIR}/course_json_cache/{season}.json", "r") as file:
        aggregate_term_json = ujson.load(file)

    # parse course JSON in season
    parsed_course_info = [
        extract_course_info(x, season, fysem)
        for x in tqdm(aggregate_term_json, ncols=96)
    ]

    # write output
    with open(f"{config.DATA_DIR}/parsed_courses/{season}.json", "w") as file:
        ujson.dump(parsed_course_info, file, indent=4)
    for filename in os.listdir(f"{config.DATA_DIR}/previous_json/")
    if filename.endswith("json") and not filename.startswith("evals")
]

seasons = sorted(seasons)

for season in seasons:
    migrated_courses = []

    with open(f"{config.DATA_DIR}/previous_json/{season}.json", "r") as f:
        previous_json = ujson.load(f)

    tqdm.write(f"Processing courses in season {season}")
    for course in tqdm(previous_json):
        migrated_course = {}

        migrated_course["season_code"] = str(season)
        migrated_course["description"] = convert_old_description(
            course["description"]
        )
        migrated_course["requirements"] = course["requirements"]


def truncate_title(title: str) -> str:
    """
    Shorten a title if it is over 32 characters long.

    Parameters
    ----------
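    title:
        The course title to shorten.

    Returns
    -------
    The possibly-shortened title.
    """
    # Sketch body only: the original implementation is not shown in this
    # excerpt, and the exact cutoff handling ("..." suffix) is an assumption
    # based on the docstring's 32-character rule.
    if len(title) > 32:
        return title[:29] + "..."
    return title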
        handler_session = create_session()

        course_eval = fetch_course_eval(handler_session, course_crn, course_season_code)

        with open(output_path, "w") as file:
            ujson.dump(course_eval, file, indent=4)

        tqdm.write("Dumped in JSON")

    # except SeasonMissingEvalsError:
    #     tqdm.write(f"Skipping season {course_season_code} - missing evals")
    # except CrawlerError:
    #     tqdm.write("skipped - missing evals")
    # pylint: disable=broad-except
    except Exception as error:
        # traceback.print_exc()
        tqdm.write(f"Skipped - unknown error {error}")


# fetch ratings in parallel
pool = Pool(processes=16)

# use imap_unordered to report to tqdm
with tqdm(total=len(queue), desc="Evaluations retrieved") as pbar:
    for _ in pool.imap_unordered(handle_course_evals, queue):
        pbar.update()

pool.terminate()
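
# Note: pool.terminate() stops the workers immediately; that is safe here only
# because the loop above has already drained every result. pool.close()
# followed by pool.join() would be the gentler shutdown if the workers held
# resources that still needed flushing.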
def resolve_historical_courses(
    courses: pd.DataFrame, listings: pd.DataFrame
) -> Tuple[
    Dict[int, int], Dict[int, List[int]], Dict[int, int], Dict[int, List[int]]
]:
    """
    Among courses, identify historical offerings of a course.

    This is equivalent to constructing a partition of course_ids such that each
    partition contains the same courses, offered over different terms.

    Parameters
    ----------
    courses:
        'courses' table.
    listings:
        'listings' table.

    Returns
    -------
    course_to_same_course:
        Mapping from course_id to resolved same_course id.
    same_course_to_courses:
        Mapping from resolved same_course id to group of identical courses.
    course_to_same_course_filtered:
        Mapping from course_id to resolved same_course id, with
        title/description filtering.
    same_course_to_courses_filtered:
        Mapping from resolved same_course id to group of identical courses,
        with title/description filtering.
    """
    # map course to codes and code to courses
    course_to_codes, code_to_courses = map_to_groups(
        listings, "course_id", "course_code"
    )

    # map course_id to course codes
    courses_codes = courses.set_index("course_id", drop=False)[  # type: ignore
        "course_id"
    ].apply(course_to_codes.get)

    # map course_id to all other courses with overlapping codes
    courses_shared_code = courses_codes.apply(
        lambda x: [code_to_courses[code] for code in x]
    )

    # flatten courses with overlapping codes
    courses_shared_code = courses_shared_code.apply(
        lambda x: list(set(flatten_list_of_lists(x)))
    )

    # keep only titles and descriptions long enough for reliable matching
    long_titles = courses.loc[
        courses["title"].fillna("").apply(len) >= MIN_TITLE_MATCH_LEN  # type: ignore
    ]
    long_descriptions = courses.loc[
        courses["description"].fillna("").apply(len)  # type: ignore
        >= MIN_DESCRIPTION_MATCH_LEN
    ]

    # construct initial graph of courses:
    # each node is a unique course from the 'courses' table, and two courses
    # are linked if they share a common course code
    #
    # edges are then pruned for title/description match
    same_courses = networkx.Graph()

    # fill in the nodes first to keep courses with no same-code edges
    for course in courses["course_id"]:
        same_courses.add_node(course)

    for course, shared_code_courses in tqdm(
        courses_shared_code.items(),  # Series.iteritems() was removed in pandas 2.0
        total=len(courses_shared_code),
        desc="Populating initial same-courses graph",
    ):
        for other_course_id in shared_code_courses:
            same_courses.add_edge(course, other_course_id)

    # filtered same-courses graph:
    # we iterate over edges of the same_courses graph and keep the ones that
    # satisfy our title/description matching criteria
    same_courses_filtered = networkx.Graph()

    # fill in the nodes first to keep courses with no same-code edges
    for course_id in courses["course_id"]:
        same_courses_filtered.add_node(course_id)

    # course_id to title and description for graph pruning
    course_to_title, _ = map_to_groups(long_titles, "course_id", "title")
    course_to_description, _ = map_to_groups(
        long_descriptions, "course_id", "description"
    )

    for course_1, course_2 in tqdm(
        same_courses.edges(data=False),
        desc="Building filtered same-courses graph",
    ):
        title_1 = course_to_title.get(course_1, [""])[0]
        title_2 = course_to_title.get(course_2, [""])[0]

        description_1 = course_to_description.get(course_1, [""])[0]
        description_2 = course_to_description.get(course_2, [""])[0]

        # if title and description are similar enough, keep the edge
        if is_same_course(title_1, title_2, description_1, description_2):
            same_courses_filtered.add_edge(course_1, course_2)

    print(f"Original shared-code edges: {same_courses.number_of_edges()}")
    print(f"Pruned shared-code edges: {same_courses_filtered.number_of_edges()}")

    print("Identifying same courses by connected components")

    # get same-course mappings from connected components
    course_to_same_course, same_course_to_courses = get_connected_courses(
        same_courses
    )
    (
        course_to_same_course_filtered,
        same_course_to_courses_filtered,
    ) = get_connected_courses(same_courses_filtered)

    return (
        course_to_same_course,
        same_course_to_courses,
        course_to_same_course_filtered,
        same_course_to_courses_filtered,
    )
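
# For reference, a minimal sketch of the connected-components step that
# get_connected_courses (defined elsewhere) performs. Using the smallest
# course_id in each component as the representative same_course id is an
# assumption made for illustration; the real implementation may choose its
# representative differently.
def sketch_connected_courses(
    graph: networkx.Graph,
) -> Tuple[Dict[int, int], Dict[int, List[int]]]:
    course_to_same_course: Dict[int, int] = {}
    same_course_to_courses: Dict[int, List[int]] = {}

    # each connected component is one group of cross-listed/renumbered offerings
    for component in networkx.connected_components(graph):
        same_course_id = min(component)  # assumed representative: smallest id
        same_course_to_courses[same_course_id] = sorted(component)
        for course_id in component:
            course_to_same_course[course_id] = same_course_id

    return course_to_same_course, same_course_to_courses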