def fetch_ftp_files(out_dir):
    """Download FTP files from the UT Austin FTP server

    Args:
        out_dir (str): directory to download files to
    """
    __url = 'reg-it.austin.utexas.edu'
    __username = '******'
    logger.info(f"Downloading FTP data files to {out_dir}")
    cur_dir = getcwd()
    ftp = FTP(__url)
    ftp.login(user=__username)
    chdir(out_dir)
    for filename in (filename_current, filename_next, filename_future):
        logger.debug(f'FTP: downloading {filename}')
        with open(filename, 'wb') as local_file:
            ftp.retrbinary('RETR ' + filename, local_file.write, 1024)
    ftp.quit()
    chdir(cur_dir)

def refresh_ecis():
    """ Set course and prof ecis_avg and ecis_students by iterating through ecis_scores """
    logger.info("Refreshing course and professor ecis fields with respective data")
    query_tuple = (Course.query.all(), Prof.query.all())
    # iterate over both Course and Prof results since the logic is identical
    for queries in query_tuple:
        for query in queries:
            if type(query) is Course:
                logger.debug(f"Refreshing ecis for Course: {query.dept} {query.num}")
            elif type(query) is Prof:
                logger.debug(f"Refreshing ecis for Prof: {query.first_name} {query.last_name}")
            ecis = 0
            students = 0
            # iterate through ecis scores specific to the course/prof
            for prof_course in query.prof_course:
                for prof_course_sem in prof_course.prof_course_sem:
                    for ecis_child in prof_course_sem.ecis:
                        ecis += ecis_child.course_avg * ecis_child.num_students
                        students += ecis_child.num_students
            # average will be None if there are no students
            query.ecis_avg = (ecis / students) if students > 0 else None
            query.ecis_students = students
    db.session.commit()

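# Worked example of the weighted average computed above (values invented): two
# ECIS entries with (course_avg, num_students) of (4.0, 10) and (3.0, 30)
# combine to (4.0 * 10 + 3.0 * 30) / (10 + 30) = 3.25 rather than the naive
# mean of 3.5, so larger sections weigh proportionally more.
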
def populate_dept(dept_info, override=False):
    """ Populate the database with departments

    :param dept_info: list of tuples with: (abbreviation, name)
    :type dept_info: tuple(str, str)
    :param override: override current department with same abbreviation if found in database
    :type override: bool
    """
    logger.info("Populating database with departments")
    for abr, name in dept_info:
        # strip before querying so stray whitespace cannot defeat the lookup
        abr = abr.strip()
        name = name.strip()
        cur_dept = Dept.query.filter_by(abr=abr).first()
        if cur_dept is None:
            # add department to database
            logger.debug(f"Adding dept {name} ({abr}) to database")
            dept = Dept(abr=abr, name=name)
            db.session.add(dept)
        elif override:
            # override current department
            logger.debug(f"Overriding dept {name} ({abr}) in database")
            cur_dept.abr = abr
            cur_dept.name = name
        else:
            # department already exists and not overriding
            logger.debug(f"Already exists: dept {name} ({abr})")
    db.session.commit()

def parse_prof_csv(file_path):
    """ Parse .csv file containing prof data

    :param file_path: path to prof file
    :type file_path: str
    :return: sorted list of prof data
    :rtype: list[tuple(int, str, str)]
    """
    __key_sem = 'CCYYS'
    __key_prof_name = 'INSTR_NAME'
    __key_prof_eid = 'INSTR_EID'
    logger.info(f'Parsing prof csv file: {file_path}')
    df = pd.read_csv(file_path)
    profs = set()
    for index, row in df.iterrows():
        semester, name, eid = row[__key_sem], row[__key_prof_name], row[__key_prof_eid]
        try:
            semester = int(semester)
        except ValueError:
            logger.debug(f'Unable to parse semester {semester}. Defaulting to 0...')
            semester = 0
        profs.add((semester, name.lower(), eid.lower()))
    profs = sorted(list(profs), key=lambda x: x[0])
    return profs

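# Hedged sample of the CSV shape parse_prof_csv expects (values invented; only
# the CCYYS / INSTR_NAME / INSTR_EID column names come from the code above):
#   CCYYS,INSTR_NAME,INSTR_EID
#   20192,"DOE, JOHN",jdoe123
# which would yield the deduplicated tuple (20192, 'doe, john', 'jdoe123').
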
def populate_sem(start_yr=2010, end_yr=2020):
    """ Populate database with semesters for the given year range.
    Will populate for spring, summer, fall semesters.

    :param start_yr: starting year for the populate (inclusive)
    :type start_yr: int
    :param end_yr: ending year for the populate (exclusive)
    :type end_yr: int
    """
    logger.info(f"Populating database with semesters from {start_yr} to {end_yr}")
    for yr in range(start_yr, end_yr):
        for sem in (2, 6, 9):
            # only add the semester when it is missing from the database
            if Semester.query.filter_by(year=yr, semester=sem).first() is None:
                check_or_add_semester(yr, sem)

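# Hedged note on the semester codes above: the (2, 6, 9) values are the
# spring/summer/fall digits of the CCYYS encoding used throughout this module,
# where e.g. 20192 packs year 2019 and semester 2 (spring) into one integer.
# The helper below is illustrative only, not part of the production code path.
def _split_ccyys(ccyys):
    """Split a CCYYS integer such as 20192 into (year, semester) -> (2019, 2)."""
    return ccyys // 10, ccyys % 10
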
def automate_backend(run_once):
    """ Automate recurring backend tasks:
    1. fetch ftp files and update scheduled course info
    2. read maintenance.txt and perform tasks as necessary
    3. organize log files

    :param run_once: if True, run the tasks immediately instead of waiting for the next 1 AM start time
    :type run_once: bool
    """
    while True:
        logger.info("Automation task in backend")
        dt_today = datetime.datetime.now(pytz.timezone('America/Chicago'))
        dt_tmr = dt_today + datetime.timedelta(days=1)
        dt_tmr = dt_tmr.replace(hour=1, minute=0)
        if run_once:
            run_once = False
            logger.info('Running once for automation')
        else:
            __a_day_secs = 24 * 3600
            until_start = int((dt_tmr - dt_today).total_seconds())
            if until_start > __a_day_secs:
                until_start -= __a_day_secs
            logger.info(f"Waiting {until_start} seconds until start time for automation")
            for _ in range(until_start):
                time.sleep(1)

        # task 1: fetch ftp files and update scheduled course info
        logger.info("Fetching new ftp files")
        fetch_ftp_files('input_data')
        semester_path = fetch_sem_values("input_data", "input_data")
        update_sem_vals(semester_path)
        logger.info("Updating scheduled course database info")
        ftp_info = parse_ftp("input_data")
        reset_scheduled_info()
        populate_scheduled_course(ftp_info)

        # task 2: read maintenance.txt and perform tasks as necessary
        run_maintenance()

        # task 3: organize log files
        organize_log_files()
        logger.info("Finished automation")

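# Hedged usage sketch: the original docstring notes this function is passed to
# a thread, so the caller presumably does something along the lines of
#   threading.Thread(target=automate_backend, args=(True,), daemon=True).start()
# (the threading import and the daemon flag are assumptions, not from this module).
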
def populate_dept_info(dept_info):
    """ Populate department with additional information (college and department name)

    :param dept_info: list of tuples containing: (abbreviation, department name, college name)
    :type dept_info: list[tuple(str, str, str)]
    """
    logger.info('Populating departments with additional info')
    for abr, dept, college in dept_info:
        cur_dept = Dept.query.filter_by(abr=abr).first()
        if cur_dept is None:
            logger.debug(f"Cannot find dept {abr}")
        else:
            logger.debug(f"Updating dept: {abr} with dept={dept}, college={college}")
            cur_dept.dept = dept
            cur_dept.college = college
    db.session.commit()

def maintenance_course_task(path, pages):
    """ Run maintenance task for a course request

    Will:
    1. update department information with respect to the given file
    2. update course information with respect to the given file

    :param path: path to the input file
    :type path: str
    :param pages: pages of the file to parse
    :type pages: list[int]
    """
    logger.info("Updating department info")
    departments = fetch_depts()
    populate_dept(departments, override=True)
    dept_info = fetch_dept_info(path, pages)
    populate_dept_info(dept_info)

    logger.info("Updating course info")
    courses = fetch_courses(path, pages)
    populate_course(courses, cur_sem=int(sem_current))

def organize_log_files():
    """ Organize the log files into the following folder structure:
    1. /log
    2. /year_<year num>
    3. /week_<week start date>_to_<week end date>
    4. <log files corresponding to the week>

    Assumption: files without an extension after .log will not be organized
    (e.g. file.log vs file.log.20200830, where the extension marks the date of the log)
    """
    logger.info("Organizing log files")
    files = [f for f in os.listdir(DEFAULT_LOG_FOLDER)
             if os.path.isfile(os.path.join(DEFAULT_LOG_FOLDER, f))]
    for f in files:
        log_path = get_log_file_path(f)
        if log_path is not None:
            dir_path = os.path.split(log_path)[0]
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            original_path = os.path.join(DEFAULT_LOG_FOLDER, f)
            shutil.move(original_path, log_path)

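# Hedged example of the resulting layout (dates and exact folder-name formatting
# are invented; only the year_/week_ nesting comes from the docstring above):
#   log/year_2020/week_2020-08-24_to_2020-08-30/app.log.20200830
# while app.log itself (no date suffix) is left in place per the stated assumption.
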
def parse_ftp(in_dir):
    """Parse FTP files from the UT Austin FTP server

    Args:
        in_dir (str): directory containing the ftp files
    """
    logger.info(f"Parsing FTP files from {in_dir}")
    courses = []
    for filename in (filename_current, filename_next, filename_future):
        filepath = join(in_dir, filename)
        if isfile(filepath):
            logger.debug(f'FTP: parsing {filename}')
            with open(filepath) as f:
                lines = f.readlines()
            categories, lines = __parse_categories(lines)
            if categories is not None:
                for line in lines:
                    # standardizing the lines
                    line = line.lower()
                    data = line.split("\t")
                    data = [d.strip() for d in data]
                    if len(line) > 0 and len(data) >= len(categories):
                        # separating data by category list
                        course = {categories[i]: data[i] for i in range(len(categories))}
                        courses.append(course)
        else:
            logger.debug(f'FTP: {filename} does not exist in {in_dir}')
    return courses

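# Hedged sketch of the row format parse_ftp consumes: a tab-separated data line
# matched positionally against the header categories that __parse_categories
# returns. The sample categories below are invented for illustration and this
# helper is not part of the production code path.
def _demo_parse_ftp_row():
    categories = ['ccyys', 'dept', 'course number']  # hypothetical header row
    line = '20199\tC S\t311\n'.lower()
    data = [d.strip() for d in line.split('\t')]
    return {categories[i]: data[i] for i in range(len(categories))}
    # -> {'ccyys': '20199', 'dept': 'c s', 'course number': '311'}
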
def fetch_sem_values(ftp_dir, out_dir):
    """ Fetch semester values from the FTP data files in the given directory

    :param ftp_dir: the directory containing the ftp data files
    :param out_dir: the directory to output a file containing the semester data
    """
    files = (filename_current, filename_next, filename_future)
    keys = (key_current, key_next, key_future)
    out_path = join(out_dir, sem_file)
    sem_dict = {}
    logger.info(f"Fetching semester values from dir={ftp_dir}, to file={out_path}")
    for i in range(len(files)):
        sem = None
        m_file = files[i]
        filepath = join(ftp_dir, m_file)
        if isfile(filepath):
            with open(filepath, 'r') as f:
                lines = f.readlines()
            for line in lines:
                if __sem_label in line:
                    m = re.search(r'[A-Za-z ]+(\d{5}) (.*)?', line)
                    # guard against a label line that doesn't match the pattern
                    if m is not None:
                        sem = m.group(1)
        else:
            logger.debug(f"Fetch Sem: cannot find file: {m_file} in {ftp_dir}")
        sem_dict[keys[i]] = sem
    with open(out_path, 'w') as f:
        json.dump(sem_dict, f)
    return out_path

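# The file written above is a flat JSON object mapping key_current / key_next /
# key_future to a 5-digit CCYYS string, or None when the source file was
# missing. Hedged example (key names and values invented for illustration):
#   {"current": "20192", "next": "20196", "future": "20199"}
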
def run_maintenance():
    """ Check the maintenance txt file (default="maintenance.txt") for maintenance tasks

    Potential tasks:
    1. 'course <insert path to file> <insert comma separated page numbers>'
       update Course rows reading in Excel file (semester basis)
    2. 'ecis <insert path to file> <insert comma separated page numbers>'
       update ECIS info (semester basis)
    3. 'prof_course <insert path to file> <insert comma separated page numbers>'
       update ProfCourse relationships (should receive most of NEW info from FTP)
    4. 'prof <insert path to file>'
       update Professor info (unlikely)
    """
    __maintenance_txt_file = "maintenance.txt"
    logger.info(f"Checking {__maintenance_txt_file} for maintenance tasks")
    if os.path.isfile(__maintenance_txt_file):
        with open(__maintenance_txt_file, 'r') as f:
            commands = f.readlines()
        while len(commands) > 0:
            command = commands[0]
            command_parts = command.split(' ')
            if len(command_parts) >= 2:
                cmd, path = command_parts[0].strip(), command_parts[1].strip()
                logger.info(f"Executing {cmd} {path}")
                if len(command_parts) >= 3:
                    pages = [int(page.strip().replace('\'', "").replace("\"", ""))
                             for page in command_parts[2].split(',')]
                    if cmd == 'course':
                        maintenance_course_task(path, pages)
                    elif cmd == 'ecis':
                        populate_ecis(path, pages)
                else:
                    if cmd == 'prof_course':
                        populate_prof_course(path)
                    elif cmd == 'prof':
                        profs = parse_prof_csv(path)
                        populate_prof_eid(profs)
                    elif cmd == 'ftp':
                        logger.info("Updating scheduled course database info")
                        ftp_info = parse_ftp("input_data")
                        reset_scheduled_info()
                        populate_scheduled_course(ftp_info)
            commands = commands[1:]
            with open(__maintenance_txt_file, 'w') as f:
                f.writelines(commands)

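# Hedged example of a maintenance.txt the loop above would consume (paths and
# page numbers invented; the command names come from the docstring and branches):
#   course input_data/course_catalog.xlsx 1,2,3
#   ecis input_data/ecis_scores.xlsx 1
#   prof_course input_data/prof_course.txt
#   prof input_data/profs.csv
# Processed commands are popped from the front and the remainder is written
# back out, so a partially processed file picks up where it left off.
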
def reset_scheduled_info():
    """ Reset the mark_deletion flag on every scheduled course before an update pass """
    logger.info("Resetting scheduled info")
    for scheduled_course in ScheduledCourse.query.all():
        scheduled_course.mark_deletion = False
    db.session.commit()

def populate_course(course_info, cur_sem=None):
    """ Populate database with courses

    :param course_info: list of dictionaries containing course data
    :type course_info: list[dict]
    :param cur_sem: the current semester. if set to None, data will be replaced with most recent value
    :type cur_sem: int or None
    """
    __inherit = "(See Base Topic for inherited information.)"
    null_depts = set()
    logger.info("Populating database with courses")
    for course in course_info:
        # fetch values from dictionary
        semester = course[KEY_SEM]
        dept = course[KEY_DEPT]
        num = course[KEY_NUM]
        title = course[KEY_TITLE]
        cs_title = course[KEY_CS_TITLE]
        description = course[KEY_DESCRIPTION]
        restrictions = course[KEY_RESTRICTION]
        t_num = course[KEY_TOPIC_NUM]
        pre_req = course[KEY_PRE_REQ]

        # check to see if dept exists --> else skip
        dept_obj = Dept.query.filter_by(abr=dept).first()
        if dept_obj is None:
            null_depts.add(dept)
            continue

        # if topic number > 0, then title = modified cs title
        if t_num > 0:
            cs_title = __parse_title(cs_title)
            title = title if cs_title is None else cs_title

        # None if course doesn't currently exist
        old_course = None

        # define new base course variable
        new_course = Course(
            num=num,
            title=title,
            description=description,
            restrictions=restrictions,
            pre_req=pre_req,
            dept_id=dept_obj.id,
            topic_num=t_num
        )

        # condition on topic number
        if t_num >= 0:
            # all courses with same topic number --> should be unique topics
            # if len 0 --> new topic
            topic_courses_flask = Course.query.filter_by(dept_id=dept_obj.id, num=num)
            topic_courses = topic_courses_flask.all()

            # set topic number --> will create new topic if it doesn't exist
            new_course.topic_id = __check_new_topic(topic_courses_flask)

            # assumption: unique based on topic number
            t_course_flask = topic_courses_flask.filter_by(topic_num=t_num)
            if t_num == 0:
                if len(t_course_flask.all()) > 0:
                    old_course = t_course_flask.first()
                __populate_child_topics(new_course, topic_courses, __inherit)
            else:
                topic_zero = __get_topic_zero(topic_courses)
                if len(t_course_flask.all()) > 0:
                    old_course = t_course_flask.first()
                __populate_child_topics(topic_zero, [new_course], __inherit)
        else:
            # course doesn't have a topic number
            old_course = Course.query.filter_by(dept_id=dept_obj.id, num=num).first()

        # create new or replace old
        if old_course is None:
            # new course
            logger.debug(f"Creating new course {dept_obj.abr} {new_course.num}")
            db.session.add(new_course)
        elif cur_sem is None or semester == cur_sem:
            # course existed but replacing
            logger.debug(f"Replacing previous {old_course.dept.abr} {old_course.num}")
            __replace_course(old_course, new_course)
        else:
            # course existed and skipping
            logger.debug(f"Already existed: {old_course.dept.abr} {old_course.num}")
    db.session.commit()
    null_depts = list(null_depts)
    null_depts.sort()
    for dept in null_depts:
        logger.debug(f"Unexpected Error: department {dept} cannot be found in the database")

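# Hedged note on topic numbers above: topic_num 0 appears to act as the base
# topic whose fields child topics (topic_num > 0) inherit when their own text
# is the literal "(See Base Topic for inherited information.)" placeholder,
# while a negative topic_num marks a course with no topics at all.
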
def populate_ecis(file_path, pages):
    """ Populate database with ECIS information

    :param file_path: path to file containing data
    :type file_path: str
    :param pages: pages of file to parse
    :type pages: list[int] or list[str]
    """
    # FOR FUTURE UPDATES, PLEASE READ:
    # remember to update Course and Prof ECIS fields when inputting new ECIS scores: ecis_avg and ecis_students
    logger.info(f'Populating ecis database with data from: {file_path}')
    ecis_lst = parse_ecis_excel(file_path, pages)
    for ecis in ecis_lst:
        # separate values from dictionary
        unique, c_avg, p_avg, students, yr, sem = (
            ecis[KEY_UNIQUE_NUM],
            ecis[KEY_COURSE_AVG],
            ecis[KEY_PROF_AVG],
            ecis[KEY_NUM_STUDENTS],
            ecis[KEY_YR],
            ecis[KEY_SEMESTER]
        )

        # check for existence of specified Semester, ProfCourseSemester in database
        logger.debug(f'Adding ecis for: unique={unique}, sem={yr}{sem}')
        sem_obj = Semester.query.filter_by(year=yr, semester=sem).first()
        if sem_obj is None:
            logger.debug(f"Cannot find semester for: {yr}{sem}. Skipping...")
            continue
        pcs_obj = ProfCourseSemester.query.filter_by(unique_num=unique, sem_id=sem_obj.id).first()
        if pcs_obj is None:
            logger.debug(
                f"Failed to find ProfCourseSemester for: unique={unique}, sem={yr}{sem}. Skipping..."
            )
            continue

        # assumption: only one ecis score per prof_course_semester instance -> else skip
        existing_ecis = pcs_obj.ecis
        if len(existing_ecis) >= 1:
            # ecis already exists
            continue

        # creating the ecis object
        ecis_obj = EcisScore(
            course_avg=c_avg,
            prof_avg=p_avg,
            num_students=students,
            prof_course_sem_id=pcs_obj.id)
        db.session.add(ecis_obj)
        db.session.commit()

        # updating course and prof ecis fields
        logger.debug("Updating prof and course ecis fields")
        pc_obj = pcs_obj.prof_course
        course_obj = pc_obj.course
        prof_obj = pc_obj.prof
        queries = ((course_obj, c_avg), (prof_obj, p_avg))
        for query, avg in queries:
            total_students = query.ecis_students + students
            total_avg = ((query.ecis_avg * query.ecis_students) if query.ecis_avg is not None else 0) + \
                        ((avg * students) if avg is not None else 0)
            query.ecis_avg = (total_avg / total_students) if total_students > 0 else None
            query.ecis_students = total_students
        db.session.commit()

def update_scheduled_courses(s_course_queue):
    """ Update scheduled_course data in the database with the new information listed in the queue.

    NOTE: minimize add/delete operations to prevent database updates from taking too long.
    Details: this function will set "extra" scheduled_course entries' mark_deletion to True.
    DO NOT use scheduled_course entries with mark_deletion set to True (no need to actually delete).

    :param s_course_queue: data to populate the database with
    :type s_course_queue: list(
        dict{
            "scheduled": ScheduledCourse object containing the data,
            "prof": Prof object containing related prof data,
            "course": Course object containing related course data,
            "semester": Semester object containing related semester data,
            "unique": str or int containing the unique number for the scheduled_course,
            "xlist": list of str or int of the unique numbers for scheduled_course entries that are crosslisted.
        }
    )
    """
    logger.info("Updating scheduled course information")
    semesters = {
        'current': {'courses': {}, 'profs': {}},
        'next': {'courses': {}, 'profs': {}},
        'future': {'courses': {}, 'profs': {}}
    }
    cur_s_courses = ScheduledCourse.query.all()
    for i in range(min(len(cur_s_courses), len(s_course_queue))):
        cur_s_course = cur_s_courses[i]
        s_course = s_course_queue[i]
        cur_schedule = s_course['scheduled']
        cur_prof = s_course['prof']
        cur_course = s_course['course']
        semester = s_course['semester']
        unique_no = s_course['unique']
        xlist_str = s_course['xlist']

        # check to see if cross_listings exist else create new
        x_list = check_or_add_xlist(xlist_str, semester)
        update_scheduled_course(cur_s_course, cur_schedule, x_list)
        db.session.commit()

        # add prof course and prof course semester relationship if it doesn't exist
        if cur_prof:
            _, prof_course = check_or_add_prof_course(cur_prof, cur_course)
            check_or_add_prof_course_semester(unique_no, prof_course, semester)

        # update course and prof semester fields (whether they are teaching the respective semesters)
        full_semester = int(str(semester.year) + str(semester.semester))
        if full_semester == sem_current:
            if cur_course:
                semesters['current']['courses'][cur_course.id] = True
            if cur_prof:
                semesters['current']['profs'][cur_prof.id] = True
        elif full_semester == sem_next:
            if cur_course:
                semesters['next']['courses'][cur_course.id] = True
            if cur_prof:
                semesters['next']['profs'][cur_prof.id] = True
        elif full_semester == sem_future:
            if cur_course:
                semesters['future']['courses'][cur_course.id] = True
            if cur_prof:
                semesters['future']['profs'][cur_prof.id] = True

    logger.info("Checking scheduled data for uneven sizings")
    if len(s_course_queue) > len(cur_s_courses):
        logger.info("Have additional new scheduled courses")
        for s_course in s_course_queue[len(cur_s_courses):]:
            cur_schedule = s_course['scheduled']
            cur_prof = s_course['prof']
            cur_course = s_course['course']
            semester = s_course['semester']
            unique_no = s_course['unique']
            xlist_str = s_course['xlist']

            # check to see if cross_listings exist else create new
            x_list = check_or_add_xlist(xlist_str, semester)
            cur_schedule.cross_listed = x_list.id
            db.session.add(cur_schedule)
            db.session.commit()

            # add prof course and prof course semester relationship if it doesn't exist
            if cur_prof:
                _, prof_course = check_or_add_prof_course(cur_prof, cur_course)
                check_or_add_prof_course_semester(unique_no, prof_course, semester)

            # update course and prof semester fields (whether they are teaching the respective semesters)
            full_semester = int(str(semester.year) + str(semester.semester))
            if full_semester == sem_current:
                if cur_course:
                    semesters['current']['courses'][cur_course.id] = True
                if cur_prof:
                    semesters['current']['profs'][cur_prof.id] = True
            elif full_semester == sem_next:
                if cur_course:
                    semesters['next']['courses'][cur_course.id] = True
                if cur_prof:
                    semesters['next']['profs'][cur_prof.id] = True
            elif full_semester == sem_future:
                if cur_course:
                    semesters['future']['courses'][cur_course.id] = True
                if cur_prof:
                    semesters['future']['profs'][cur_prof.id] = True

    for s_course in ScheduledCourse.query.all():
        if s_course.mark_deletion is not None:
            s_course.mark_deletion = True
    db.session.commit()

    logger.info("Updating course and professor semesters")
    all_profs = Prof.query.all()
    all_courses = Course.query.all()
    for prof in all_profs:
        if (
            (prof.current_sem != semesters['current']['profs'].get(prof.id, False)) or
            (prof.next_sem != semesters['next']['profs'].get(prof.id, False)) or
            (prof.future_sem != semesters['future']['profs'].get(prof.id, False))
        ):
            prof.current_sem = semesters['current']['profs'].get(prof.id, False)
            prof.next_sem = semesters['next']['profs'].get(prof.id, False)
            prof.future_sem = semesters['future']['profs'].get(prof.id, False)
    db.session.commit()
    for course in all_courses:
        if (
            (course.current_sem != semesters['current']['courses'].get(course.id, False)) or
            (course.next_sem != semesters['next']['courses'].get(course.id, False)) or
            (course.future_sem != semesters['future']['courses'].get(course.id, False))
        ):
            course.current_sem = semesters['current']['courses'].get(course.id, False)
            course.next_sem = semesters['next']['courses'].get(course.id, False)
            course.future_sem = semesters['future']['courses'].get(course.id, False)
    db.session.commit()

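# Hedged sketch of the mark_deletion lifecycle implied above:
#   1. reset_scheduled_info() sets every row's mark_deletion to False
#   2. update_scheduled_course(...) presumably clears it to None for rows it touches
#   3. the sweep above flips anything still non-None to True
# so consumers should treat mark_deletion == True as "stale" rather than deleting rows.
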
def parse_ecis_excel(file_path, sheet_lst):
    """Parse the ecis excel document for ecis information on courses and professors

    Args:
        file_path (str): file path to ecis excel documents
        sheet_lst (list[str]): list of sheet names to parse

    Returns:
        list[dict[str, int]]: dictionary containing course and prof ecis information
        Structure: [
            {
                KEY_UNIQUE_NUM: int,
                KEY_COURSE_AVG: float,
                KEY_PROF_AVG: float,
                KEY_NUM_STUDENTS: int,
                KEY_YR: int,
                KEY_SEMESTER: int,
            },
            ...
        ]
    """
    __unique_num_digits = 5
    __sem_key = 'SEMESTER_CCYYS'
    __unique_key = 'UNIQUE'
    __num_students_key = 'NBR_SURVEY_FORMS_RETURNED'
    __course_avg_key = 'AVG_COURSE_RATING'
    __prof_avg_key = 'AVG_INSTRUCTOR_RATING'
    ecis_lst = []
    for sheet in sheet_lst:
        rows_skipped = 0
        ecis_df = pd.read_excel(file_path, sheet_name=sheet)
        for index, row in ecis_df.iterrows():
            # check for valid year semester string. If invalid, skip
            yr_sem = str(row[__sem_key])
            if len(yr_sem) < 5:
                rows_skipped += 1
                continue
            yr_sem = yr_sem[0:5]
            yr = yr_sem[0:-1]
            sem = yr_sem[-1]

            # convert everything to int or float --> if N/A then fail and skip
            try:
                unique_str = str(row[__unique_key])
                unique_str = unique_str.split('.')[0] if '.' in unique_str else unique_str
                num_students_str = str(row[__num_students_key])
                num_students_str = num_students_str.split('.')[0] if '.' in num_students_str else num_students_str
                yr = int(yr)
                sem = int(sem)
                unique_num = int(unique_str)
                num_students = int(num_students_str)
                course_avg = float(row[__course_avg_key])
                prof_avg = float(row[__prof_avg_key])
            except (ValueError, IndexError):
                rows_skipped += 1
                continue

            # TODO: add course and prof relationship once available
            # create ecis dictionary
            ecis = {
                KEY_UNIQUE_NUM: unique_num,
                KEY_COURSE_AVG: course_avg,
                KEY_PROF_AVG: prof_avg,
                KEY_NUM_STUDENTS: num_students,
                KEY_YR: yr,
                KEY_SEMESTER: sem
            }
            ecis_lst.append(ecis)
        logger.info(f'Finished parsing {sheet} sheet: num_rows_skipped={rows_skipped}')
    return ecis_lst

def populate_prof_course(in_file):
    """ Populate database with Professor and Course relationship using data fetched from the web
    (utreview.services.fetch_web.fetch_prof_course_info only)

    :param in_file: file the data was fetched to
    :type in_file: str
    """
    __sem_fall = "Fall"
    __sem_spring = "Spring"
    __sem_summer = "Summer"
    logger.info(f"Populating database with prof_course info using {in_file}")

    # creating list of prof-course relationships from the given file
    prof_courses = []
    with open(in_file, 'r') as f:
        for line in f:
            prof_courses.append(json.loads(line))
    cur_profs = Prof.query.all()

    # add each prof-course relationship to the database if appropriate
    for prof_course in prof_courses:
        # check for existence of professor -> add if does not exist
        prof_name = [name.strip() for name in prof_course[KEY_PROF].lower().split(",")]
        last, first = prof_name[0].strip(), prof_name[1].strip()
        last_words = [word.strip() for word in last.split(' ') if len(word.strip()) > 0]
        first_words = [word.strip() for word in first.split(' ') if len(word.strip()) > 0]
        target_prof = None
        for cur_prof in cur_profs:
            found = True
            cur_last, cur_first = cur_prof.last_name.lower(), cur_prof.first_name.lower()
            cur_last_words = [word.strip() for word in cur_last.split(' ') if len(word.strip()) > 0]
            cur_first_words = [word.strip() for word in cur_first.split(' ') if len(word.strip()) > 0]
            for word in last_words:
                if word not in cur_last_words:
                    found = False
                    break
            if found:
                for word in first_words:
                    if word not in cur_first_words:
                        found = False
                        break
            if found:
                target_prof = cur_prof
                break
        if target_prof is None:
            logger.debug(f"Cannot find prof: {prof_course[KEY_PROF]}. Skipping...")
            continue

        # check for existence of department -> skip if does not exist
        abr = prof_course[KEY_DEPT].strip().upper()
        dept = Dept.query.filter_by(abr=abr).first()
        if dept is None:
            logger.debug(f"Cannot find dept: {abr}. Skipping...")
            continue

        # check if course exists -> add if does not exist
        # TODO: choosing topic 0 by default. Update when topic info available.
        num_results, course = check_or_add_course(dept, prof_course[KEY_CNUM], prof_course[KEY_TITLE])
        if num_results > 1:
            courses = Course.query.filter_by(dept_id=dept.id, num=prof_course[KEY_CNUM])
            for c in courses:
                if c.topic_num <= 0:
                    course = c
        db.session.commit()

        # check if prof_course exists -> add if it doesn't
        _, prof_course_obj = check_or_add_prof_course(target_prof, course)
        db.session.commit()

        # parse semester to integer representation
        sem_lst = [s.strip() for s in prof_course[KEY_SEM].split(",")]
        if sem_lst[1] == __sem_spring:
            sem = SPRING_SEM
        elif sem_lst[1] == __sem_summer:
            sem = SUMMER_SEM
        elif sem_lst[1] == __sem_fall:
            sem = FALL_SEM
        else:
            logger.debug(f"Invalid semester: {sem_lst[1]}. Skipping...")
            continue
        yr = int(sem_lst[0].strip())

        # check for semester existence -> if it doesn't exist, add to database
        _, sem_obj = check_or_add_semester(yr, sem)

        # check for prof_course_semester existence -> if it doesn't exist, add to database
        check_or_add_prof_course_semester(prof_course[KEY_UNIQUE], prof_course_obj, sem_obj)
        db.session.commit()

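# Hedged example of the word-subset name match above (names invented): a file
# entry "de la cruz, maria j" matches a stored Prof with last_name "De La Cruz"
# and first_name "Maria J", since every word of each parsed name part must
# appear among the stored part's words; word order is not considered.
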
def fetch_prof_course_info(out_file, sems, depts):
    """ Parse prof course info from the site -> for the relationship

    :param out_file: file to output the relationships/data
    :type out_file: str
    :param sems: semesters to fetch data for
    :type sems: list[str]
    :param depts: departments to fetch data for
    :type depts: list[str]
    """
    __sem_header = 'SEMESTER'
    __dept_header = 'DEPT'
    __title_header = 'TITLE'
    __course_num_header = 'COURSENUMBER'
    __unique_header = 'UNIQUE'
    __instr_header = 'INSTRUCTOR(S)*'
    logger.info(f"Fetching prof_course info. Output={out_file}. Semesters={sems}. Departments={depts}")
    for sem in sems:
        for dept in depts:
            # get BeautifulSoup object for the parameters
            html = fetch_html(get_prof_course_url(sem, dept))
            html_soup = BSoup(html, "html.parser")

            # look for headers on page -> headers for the table
            headers = html_soup.find("tr", {"class": "tbh header"})
            if headers is None:
                logger.debug("Cannot find headers for prof_course search: "
                             f"Semester={sem}, Department={dept}. Skipping...")
                continue
            headers = [header.text.replace("\n", "").strip() for header in headers.findAll("th")]
            logger.debug(f"Fetched headers from profcourse site with headers: {headers}")

            # parse out indices for each of the headers
            sem_index, dept_index, title_index, cnum_index, unique_index, instr_index = get_header_indices(
                headers, __sem_header, __dept_header, __title_header,
                __course_num_header, __unique_header, __instr_header
            )

            # iterate through each row in the web table and parse out data
            rows = html_soup.findAll("tr", {"class": ["tboff", "tbon"]})
            for row in rows:
                cols = row.findAll("td")
                cols = [col.text.replace("\n", "").strip() for col in cols]

                # get data via the indices for the headers
                for i in range(len(cols)):
                    if 'CV' in cols[i]:
                        cols[i] = cols[i].split('CV')[0].strip()

                # create dictionary containing the data
                prof_course = {
                    KEY_SEM: cols[sem_index] if sem_index is not None else None,
                    KEY_DEPT: cols[dept_index] if dept_index is not None else None,
                    KEY_TITLE: cols[title_index].strip()[:-1] if title_index is not None else None,
                    KEY_CNUM: cols[cnum_index] if cnum_index is not None else None,
                    KEY_UNIQUE: cols[unique_index] if unique_index is not None else None,
                    KEY_PROF: cols[instr_index] if instr_index is not None else None
                }

                # write dictionary to file (one JSON object per line)
                with open(out_file, "a") as f:
                    json.dump(prof_course, f)
                    f.write("\n")

def populate_scheduled_course(course_info):
    """ Populate the database with scheduled course info as parsed from FTP

    :param course_info: list of course data
    :type course_info: list[dict]
    """
    logger.info("Populating database with scheduled course info")
    s_course_queue = []
    for s_course in course_info:
        # create ScheduledCourseInfo object using the s_course dictionary
        try:
            scheduled = ScheduledCourseInfo(s_course)
        except ValueError as err:
            logger.warning(f"Populate scheduled course error: {err}. Skipping...")
            continue

        # check to see if dept exists
        dept_obj = Dept.query.filter_by(abr=scheduled.dept).first()
        if dept_obj is None:
            logger.debug(f"Populate scheduled course: cannot find department {scheduled.dept}. Skipping...")
            continue

        # check to see if course exists
        cur_courses = Course.query.filter_by(dept_id=dept_obj.id, num=scheduled.c_num)
        if len(cur_courses.all()) > 1:
            cur_courses = cur_courses.filter_by(topic_num=scheduled.topic)
        cur_course = cur_courses.first()
        if cur_course is None:
            course_log_description = f"{scheduled.dept} {scheduled.c_num} w/ topic num {scheduled.topic}"
            logger.debug(f"Populate scheduled course: cannot find course {course_log_description}. Skipping...")
            continue

        # check to see if prof exists --> if not then leave empty
        cur_prof = Prof.query.filter_by(eid=scheduled.prof_eid).first()
        if cur_prof is None:
            logger.warning(f"Could not find professor w/ EID={scheduled.prof_eid}. Leaving empty...")

        # check to see if semester exists else add semester
        _, semester = check_or_add_semester(yr=scheduled.yr, sem=scheduled.sem)

        # check to see if scheduled course exists else create new
        num_results, cur_schedule = check_or_add_scheduled_course(
            scheduled, cur_course, cur_prof, None, semester, add=False)
        if num_results > 0:
            logger.debug(f"Updating scheduled course. Unique={scheduled.unique_no} "
                         f"semester={repr(semester)} course={repr(cur_course)} prof={repr(cur_prof)}")
            cur_schedule = scheduled.to_scheduled_course(cur_schedule, semester, cur_course, cur_prof, None)
        s_course_queue.append({
            'scheduled': cur_schedule,
            'prof': cur_prof,
            'course': cur_course,
            'semester': semester,
            'unique': scheduled.unique_no,
            'xlist': scheduled.x_listings,
        })
    update_scheduled_courses(s_course_queue)