def merge_course_if_fullyear(self):
    """Merge an "A"/"B" half-year course with its other half, if scraped.

    Courses whose number ends in "A" or "B" are the two halves of a
    full-year course.  If the complementary half is already present in
    self.courses_dict, both halves are removed and replaced by a single
    merged SolusCourse; otherwise the merge happens when the other half
    is scraped later.
    """
    # [-1:] instead of [-1] so an empty course number falls through to
    # the "not a full-year course" branch instead of raising IndexError.
    suffix = self.course.num[-1:]
    if suffix == "A":
        other_half_letter = "B"
    elif suffix == "B":
        other_half_letter = "A"
    else:
        # Not a full year course
        return

    other_half_key = self.course.get_key()[:-1] + other_half_letter
    if other_half_key not in self.courses_dict:
        # Haven't scraped the other course yet
        print("Haven't found other half yet.")
        return

    other_half = self.courses_dict[other_half_key]
    print("Merging with other half: %s" % (other_half.get_key()))

    # Remove both halves from the dict so we can re-add the full course.
    del self.courses_dict[other_half_key]
    del self.courses_dict[self.course.get_key()]

    merged = SolusModels.SolusCourse()
    merged.add_merged_info(other_half, self.course)
    self.add_course(merged)
def read_from_file(self):
    """Load previously scraped courses from the JSON dump file.

    Reads self.read_file_name (a JSON list of course dicts), rebuilds a
    SolusCourse for each entry keyed into self.courses_dict, and prints
    the resulting course count.
    """
    # json.load streams straight from the file object instead of
    # reading the whole file into an intermediate string first.
    with open(self.read_file_name) as f:
        all_list = json.load(f)
    for course_dict in all_list:
        course = SolusModels.SolusCourse(course_dict)
        self.courses_dict[course.get_key()] = course
    print(len(self.courses_dict))
def scrape_single_section_component(self, piece_array):
    """Parse one section component from the tail of piece_array.

    Pops, from the end of piece_array in order: the date range
    ("start - end"), instructor, room, end time, start time, and the
    day string (a run of two-letter abbreviations, e.g. "MoTuWeSaSu").

    Returns:
        A list of SolusModels.SectionComponent, one per day in the
        day string, all sharing the same dates/instructor/room/times.

    Raises:
        ValueError: if piece_array has fewer than the 6 expected fields.
    """
    if len(piece_array) < 6:
        # Was a leftover `pdb.set_trace()` debugging hook, which blocks
        # on stdin in unattended runs; fail loudly instead.
        raise ValueError(
            "Expected at least 6 section-component fields, got %d: %r"
            % (len(piece_array), piece_array))

    components = []

    # Date range, e.g. "01/06/2014 - 04/04/2014"
    m = re.search(r'^([\S]+)\s*-\s*([\S]+)$', piece_array.pop())
    start_date = m.group(1)
    end_date = m.group(2)

    instructor = piece_array.pop()
    room = piece_array.pop()

    # Timeslot (end time appears before start time in the popped order)
    end = piece_array.pop()
    start = piece_array.pop()

    # Sometimes day is e.g. "MoTuWeSaSu"; consume two letters at a time
    # from the right, building one component per day.
    all_days = piece_array.pop()
    if all_days == "TBA":
        # Normalize to a single 2-character token so the loop below
        # consumes it as one pseudo-day.
        all_days = "TB"
    while len(all_days) > 0:
        day = SolusModels.index_of_day_abbr(all_days[-2:])
        all_days = all_days[:-2]
        section_component = SolusModels.SectionComponent()
        components.append(section_component)
        section_component.start_date = start_date
        section_component.end_date = end_date
        section_component.instructor = instructor
        section_component.room = room
        section_component.timeslot = SolusModels.timeslot_index_by_components(day, start, end)
    return components
def scrape_section_header(self, piece_array, section):
    """Pop pieces until the section-header line is found, then parse it.

    The header looks like "<index>-<type> (<id>)"; non-matching pieces
    are discarded.  Fills section.index, section.type, and section.id.
    Raises IndexError if piece_array is exhausted without a match
    (same as the original behaviour).
    """
    # Compile once and match in a single loop instead of duplicating
    # the pattern literal before and inside the loop.
    header_re = re.compile(r'^([\S]+)-([\S]+)\s+\((\S+)\)$')
    m = None
    while m is None:
        m = header_re.search(piece_array.pop())
    section.index = m.group(1)
    section.type = SolusModels.section_type_index_by_key(m.group(2))
    section.id = m.group(3)
def scrape_section_page(self):
    """Create and populate a Section for every section on the page."""
    remaining_pieces = self.section_pieces_from_page()
    # Each scrape_single_section call consumes one section's worth of
    # pieces from the list, so the loop runs until it is drained.
    while remaining_pieces:
        new_section = SolusModels.Section()
        self.course.sections.append(new_section)
        new_section.term = self.current_term
        self.scrape_single_section(remaining_pieces, new_section)
def scrape_single_section_component(self, piece_array):
    """Parse one section component from the tail of piece_array.

    NOTE(review): this is a duplicate definition of
    scrape_single_section_component; at class-creation time this later
    definition shadows the earlier one.  Consider deleting one copy.

    Pops, from the end of piece_array in order: the date range
    ("start - end"), instructor, room, end time, start time, and the
    day string (a run of two-letter abbreviations, e.g. "MoTuWeSaSu").

    Returns:
        A list of SolusModels.SectionComponent, one per day.

    Raises:
        ValueError: if piece_array has fewer than the 6 expected fields.
    """
    if len(piece_array) < 6:
        # Was a leftover `pdb.set_trace()` debugging hook, which blocks
        # on stdin in unattended runs; fail loudly instead.
        raise ValueError(
            "Expected at least 6 section-component fields, got %d: %r"
            % (len(piece_array), piece_array))

    components = []

    # Date range, e.g. "01/06/2014 - 04/04/2014"
    m = re.search(r'^([\S]+)\s*-\s*([\S]+)$', piece_array.pop())
    start_date = m.group(1)
    end_date = m.group(2)

    instructor = piece_array.pop()
    room = piece_array.pop()

    # Timeslot (end time appears before start time in the popped order)
    end = piece_array.pop()
    start = piece_array.pop()

    # Sometimes day is e.g. "MoTuWeSaSu"; consume two letters at a time
    # from the right, building one component per day.
    all_days = piece_array.pop()
    if all_days == "TBA":
        # Normalize to a single 2-character token so the loop below
        # consumes it as one pseudo-day.
        all_days = "TB"
    while len(all_days) > 0:
        day = SolusModels.index_of_day_abbr(all_days[-2:])
        all_days = all_days[:-2]
        section_component = SolusModels.SectionComponent()
        components.append(section_component)
        section_component.start_date = start_date
        section_component.end_date = end_date
        section_component.instructor = instructor
        section_component.room = room
        section_component.timeslot = SolusModels.timeslot_index_by_components(day, start, end)
    return components
def scrape_sections(self):
    """Scrape every available term's sections for the current course."""
    sel = self.selenium
    term_options = sel.get_select_options("id=DERIVED_SAA_CRS_TERM_ALT")
    # Invariant for the whole loop: with exactly one term the page is
    # already showing it, so no re-selection round-trip is performed.
    single_term = len(term_options) == 1
    for option in term_options:
        if not single_term:
            sel.select("id=DERIVED_SAA_CRS_TERM_ALT", "label=%s" % option)
            sel.click("id=DERIVED_SAA_CRS_SSR_PB_GO$92$")
            sel.wait_for_page_to_load(self.timeout_milliseconds)
        self.current_term = SolusModels.term_index_by_key(option)
        self.scrape_term()
def scrape_subjects_for_alphanum(self, alphanum):
    """Scrape every subject listed under one letter/number tab.

    Clicks the alphanum tab, then walks the numbered subject links,
    opening each non-ignored subject's dropdown and scraping its
    courses.  Honors self.starting_subject_index and
    self.max_subjects_per_letter.
    """
    sel = self.selenium
    sel.click("id=DERIVED_SSS_BCC_SSR_ALPHANUM_" + alphanum)
    sel.wait_for_page_to_load(self.timeout_milliseconds)

    # Prepare to traverse all links; they are numbered sequentially.
    link_number = self.starting_subject_index
    link_name_base = "name=DERIVED_SSS_BCC_GROUP_BOX_1$84$$%d"
    link_name = link_name_base % (link_number,)
    while sel.is_element_present(link_name):
        # Store subject title; link text looks like "<KEY> - <Title>".
        m = re.search(r"^([^-]*) - (.*)$", sel.get_text(link_name).strip())
        subject_key = m.group(1)
        subject_title = m.group(2)
        print("\nSubject: %s: %s" % (subject_key, subject_title))

        if subject_key not in self.ignored_subjects:
            self.subject_index = SolusModels.subject_index_by_key(subject_key)
            SolusModels.Subject.subjects[self.subject_index].title = subject_title

            # Open the dropdown
            sel.click(link_name)
            sel.wait_for_page_to_load(self.timeout_milliseconds)

            # Traverses all course links in the dropdown
            self.scrape_single_dropdown()

            # Close the dropdown.  Was a bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit; narrowed to
            # Exception while keeping the same log-and-wait recovery.
            try:
                sel.click(link_name)
            except Exception:
                print("FAILURE %s" % link_name)
                time.sleep(100)
            sel.wait_for_page_to_load(self.timeout_milliseconds)
        else:
            print("Ignored")

        # Go to next link
        link_number += 1
        if self.max_subjects_per_letter and link_number >= self.max_subjects_per_letter + self.starting_subject_index:
            break
        link_name = link_name_base % (link_number,)
def scrape_subjects_for_alphanum(self, alphanum):
    """Scrape every subject listed under one letter/number tab.

    NOTE(review): this is a duplicate definition of
    scrape_subjects_for_alphanum; at class-creation time this later
    definition shadows the earlier one.  Consider deleting one copy.

    Clicks the alphanum tab, then walks the numbered subject links,
    opening each non-ignored subject's dropdown and scraping its
    courses.  Honors self.starting_subject_index and
    self.max_subjects_per_letter.
    """
    sel = self.selenium
    sel.click("id=DERIVED_SSS_BCC_SSR_ALPHANUM_" + alphanum)
    sel.wait_for_page_to_load(self.timeout_milliseconds)

    # Prepare to traverse all links; they are numbered sequentially.
    link_number = self.starting_subject_index
    link_name_base = "name=DERIVED_SSS_BCC_GROUP_BOX_1$84$$%d"
    link_name = link_name_base % (link_number,)
    while sel.is_element_present(link_name):
        # Store subject title; link text looks like "<KEY> - <Title>".
        m = re.search(r"^([^-]*) - (.*)$", sel.get_text(link_name).strip())
        subject_key = m.group(1)
        subject_title = m.group(2)
        print("\nSubject: %s: %s" % (subject_key, subject_title))

        if subject_key not in self.ignored_subjects:
            self.subject_index = SolusModels.subject_index_by_key(subject_key)
            SolusModels.Subject.subjects[self.subject_index].title = subject_title

            # Open the dropdown
            sel.click(link_name)
            sel.wait_for_page_to_load(self.timeout_milliseconds)

            # Traverses all course links in the dropdown
            self.scrape_single_dropdown()

            # Close the dropdown.  Was a bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit; narrowed to
            # Exception while keeping the same log-and-wait recovery.
            try:
                sel.click(link_name)
            except Exception:
                print("FAILURE %s" % link_name)
                time.sleep(100)
            sel.wait_for_page_to_load(self.timeout_milliseconds)
        else:
            print("Ignored")

        # Go to next link
        link_number += 1
        if self.max_subjects_per_letter and link_number >= self.max_subjects_per_letter + self.starting_subject_index:
            break
        link_name = link_name_base % (link_number,)
def scrape_single_dropdown(self):
    """Scrape every course link in the currently open subject dropdown.

    Walks the numbered CRSE_TITLE links, scraping each course, cleaning
    it, adding it, and merging full-year halves.  Courses flagged as
    useless are skipped and the running course count rolled back.
    Honors self.starting_course_index and self.max_courses_per_subject.
    """
    sel = self.selenium

    # Prepare to traverse all links; they are numbered sequentially.
    link_number = self.starting_course_index
    link_name_base = "id=CRSE_TITLE$%d"
    link_name = link_name_base % (link_number,)
    while sel.is_element_present(link_name):
        # Go into the course
        sel.click(link_name)
        sel.wait_for_page_to_load(self.timeout_milliseconds)

        self.course = SolusModels.SolusCourse()
        SolusModels.SolusCourse.num_courses += 1
        self.course.subject = self.subject_index

        # Scrape info from course; dropped the unused `as e` binding.
        try:
            self.scrape_single_course()
            self.course.clean()
            self.add_course(self.course)
            self.merge_course_if_fullyear()
        except SolusModels.UselessCourseException:
            print("Ignored")
            SolusModels.SolusCourse.num_courses -= 1

        # Back out from course page
        sel.click("id=DERIVED_SAA_CRS_RETURN_PB")
        sel.wait_for_page_to_load(self.timeout_milliseconds)

        # Go to next course
        link_number += 1
        if self.max_courses_per_subject and link_number >= self.max_courses_per_subject + self.starting_course_index:
            break
        link_name = link_name_base % (link_number,)
def scrape_title(self):
    """Parse the course's subject description, number, and title.

    Reads the page header text ("<SUBJ> <NUM> - <Title>") and fills
    self.course.subject_description / num / title.  The numeric subject
    index itself is assigned earlier, in scrape_single_dropdown.

    Raises:
        SolusModels.UselessCourseException: when the course number
            matches the UNSP/UNS pattern below.
    """
    sel = self.selenium
    raw_title = sel.get_text("css=span.PALEVEL0SECONDARY").strip()
    m = re.search(r'^([\S]+)\s+([\S]+)\s+-\s+(.*)$', raw_title)

    self.course.subject_description = m.group(1)
    self.course.num = m.group(2)
    self.course.title = m.group(3)

    print("")
    print("%s/%s %s - %s" % (self.course.subject_description, self.course.subject, self.course.num, self.course.title))

    # NOTE(review): alternation binds loosely here, so this matches
    # "starts with UNSP" OR "ends with UNS".  If a single anchored
    # group was intended it would be r'^((UNSP)|(.*UNS))$'; kept as-is
    # to preserve current behaviour.
    if re.search(r'^(UNSP)|(.*UNS)$', self.course.num):
        raise SolusModels.UselessCourseException(
            "%s %s" % (self.course.subject, self.course.num))