def get_prof_page(self, extension):
    try:
        prof_page = scraper_base.get_soup(f"{self.TOP_LINK}{extension}")
    except requests.exceptions.RequestException as e:
        print(WARNING, f"Failed scraping {self.TOP_LINK}{extension}: {e}")
        return None
    else:
        print(DEBUG, f"Retrieved professor page from {self.TOP_LINK}{extension}")

    prof_name = prof_page.title.text.strip()
    # Why is all relevant data in a button block? I have no idea.
    main_block = prof_page.findAll('button')[2]
    prof_rating = main_block.find("span", {"class": "teacher-rating"}).text
    prof_difficulty = main_block.find("span", {"class": "evals-span"}).text
    if prof_difficulty:
        prof_difficulty = prof_difficulty.split()[1]
    p = {
        "NAME": prof_name,
        "RATING": prof_rating,
        "DIFFICULTY": prof_difficulty
    }
    return p
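# Every scraper in this section goes through scraper_base.get_soup, which is
# defined elsewhere in the repo. A minimal sketch of what the call sites imply
# it does -- assuming it wraps requests and BeautifulSoup; the real helper may
# differ (the `ver` keyword name is taken from calls like get_soup(url, ver=False)):

import requests
from bs4 import BeautifulSoup


def get_soup(url, ver=True):
    # raise_for_status() raises requests.exceptions.HTTPError with messages
    # like "404 Client Error: ...", which is what the ratings scraper's
    # startswith('404 Client Error') check relies on.
    response = requests.get(url, verify=ver)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')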
def scrape(self):
    data = []
    print(
        DEBUG,
        f"Starting calpolyratings scrape: TOP_LINK={self.TOP_LINK}, REST_TIME={self.REST_TIME}ms"
    )
    page_num = 1
    while True:
        try:
            soup = scraper_base.get_soup(f"{self.TOP_LINK}/?page={page_num}")
        except requests.exceptions.RequestException as e:
            # Keep trying to get new pages until 404. On 404, return existing data.
            if str(e).startswith('404 Client Error'):
                print(NOTICE, f"Page {page_num} not found. Ending scrape")
                print(SUCCESS, f"Done! Scraped ratings for {len(data)} professors")
                return pd.DataFrame(data).to_csv(None, index=False)
            print(ALERT, e)
            return None
        else:
            print(SUCCESS, f"Retrieved page {page_num}")
            page_num += 1

        links = (a['href'] for a in soup.find_all('a', href=True))
        prof_links = [
            a for a in links if a.startswith('/') and not a.endswith('/')
        ]
        for prof in prof_links:
            sleep(self.REST_TIME / 1000)
            page = self.get_prof_page(prof)
            if page:
                data.append(page)
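# Hypothetical usage sketch: scrape() returns a CSV string (or None on
# failure), so a caller can round-trip it through pandas for inspection.
# RatingsScraper is an assumed name for illustration; the actual class name
# is not shown in this section.

import io

import pandas as pd

csv_data = RatingsScraper().scrape()  # hypothetical class name
if csv_data is not None:
    df = pd.read_csv(io.StringIO(csv_data))
    print(df[['NAME', 'RATING', 'DIFFICULTY']].head())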
def scrape_schedules_from_url(self, url, verify=True, preprocess=None):
    try:
        soup = scraper_base.get_soup(url, verify)
    except requests.exceptions.RequestException as e:
        print(ALERT, e)
        return None
    print(SUCCESS, "Retrieved schedules page")
    return self.scrape_schedules_from_html(str(soup), preprocess)
def scrape(self): """ Scrapes academic calendar data to CSV returns: str: A CSV string of scraped data """ starting_year = self.CALENDAR_EPOCH calendar = dict() while True: ending_year = starting_year + 1 current_year = starting_year calendar_url = f'{self.TOP_LINK}/{starting_year}-{ending_year - 2000}-academic-calendar' try: calendar_soup = scraper_base.get_soup(calendar_url) # Returns on an invalid school year; should always return. except requests.exceptions.HTTPError: dates = list(calendar.values()) return pd.DataFrame(dates).to_csv(None, index=False) else: # Finds all tables on the page (summer/fall/winter/spring quarters) # Excludes the last summary table. # Note: summary table id has a space at the end. All years are like this. tables = calendar_soup.find_all( lambda tag: tag.name == 'table' and tag.has_attr( 'id') and tag['id'] != "SUMMARY OF CALENDAR DAYS ") for table in tables: for row in table.find_all('tr'): cols = row.find_all('td') dates = cols[0].text parsed_dates = self.parse_dates(dates, current_year) # Ugly solution to change the calendar year during the school year. # Assumes there will always be an event in January. if 'January' in parsed_dates and current_year != ending_year: current_year = ending_year # Second column is just the days of the week; ignore events = [t.strip() for t in cols[2].text.splitlines()] for month, days in parsed_dates.items(): for day in days: date = self.make_date(month, day, current_year) if date in calendar: calendar[date]['EVENTS'].extend(events) else: entry = dict() entry['DATE'] = date entry['DAY'] = day entry['MONTH'] = month entry['YEAR'] = current_year entry['EVENTS'] = events calendar[date] = entry starting_year += 1
def scrape(self): """ Scrapes data from all CPE and CSC employees args: no_upload (Bool): Used by scraper.py to control uploading vs just returning scraped data returns: str: A CSV string of scraped data """ scraped_faculty = [] # Verification turned off; read main note in self.parse_single_employee soup = scraper_base.get_soup(self.CSC_TOP_LINK, ver=False) for link in soup.find_all("a", href=True): nav = link["href"] if (nav.startswith("/faculty/") or nav.startswith("/staff")) and (nav != "/faculty/" and nav != "/staff/"): info = self.parse_single_employee("https://csc.calpoly.edu" + nav) scraped_faculty.append(info) sleep(self.REST_TIME / 1000) soup = scraper_base.get_soup(self.CPE_TOP_LINK) for link in soup.find_all("a", href=True): nav = link["href"] if (nav.startswith("/faculty/") or nav.startswith("/staff")) and (nav != "/faculty/" and nav != "/staff/"): info = self.parse_single_employee("https://cpe.calpoly.edu" + nav) scraped_faculty.append(info) sleep(self.REST_TIME / 1000) return pd.DataFrame(scraped_faculty).to_csv(None, index=False)
def scrape(self): """ Scrapes club information to CSV returns: str: A CSV string of scraped data """ top = scraper_base.get_soup(self.TOP_LINK) raw = [l.text.strip() for l in top.find_all('span')] # Filters out some info we don't need info = [x for x in raw if x and x != "Website" and x != "Homepage:"] info.pop(0) # Don't need first line current_club = None info_len = len(info) club_info = dict() scraped_clubs = [] i = 0 while i < info_len: line = info[i] if line in self.INFO_ENTRY_PAIRS: next_line = info[i + 1] entry_name = self.INFO_ENTRY_PAIRS[line] if next_line.endswith(':'): club_info[entry_name] = 'NA' i += 1 else: club_info[entry_name] = next_line i += 2 elif line == "Contact Email:": next_line = info[i + 1] try: bool(club_info['CONTACT_EMAIL'] ) # Checks if there's already a contact_email entry. except KeyError: if next_line.endswith(':'): club_info['CONTACT_EMAIL'] = 'NA' i += 1 else: club_info['CONTACT_EMAIL'] = next_line i += 2 else: # Two fields are called "Contact Email"--the email of the main contact for the club, and the club's # official email to contact them. For now, the official email is called "contact_email_2" if '@' in next_line: # Next line could be the name of another club; doesn't end in ':' club_info['CONTACT_EMAIL_2'] = next_line i += 2 else: club_info['CONTACT_EMAIL_2'] = 'NA' i += 1 else: # Checking len(club_info) filters unused addresses and stuff that get parsed as club names. if current_club and len(club_info) != 0: club_info['NAME'] = current_club scraped_clubs.append(club_info) club_info = dict() current_club = line i += 1 return pd.DataFrame(scraped_clubs).to_csv(None, index=False)
def scrape(self, all_departments=False):
    """
    Scrapes course information and requirements to CSV

    args:
        all_departments (bool): Scrapes all departments if True, or just CPE and CSC if False (default False)

    returns:
        str: A CSV string of scraped data
    """
    # Retrieves department list from Cal Poly
    print(
        DEBUG,
        f"Starting course scrape: all_departments={all_departments}, REST_TIME={self.REST_TIME}"
    )
    print(INFO, "Starting course scrape")
    if all_departments:
        top_link = "http://catalog.calpoly.edu/coursesaz/"
        print(INFO, f"Starting scrape on {top_link}")
        try:
            top_soup = scraper_base.get_soup(top_link, ver=False)
        except requests.exceptions.RequestException as e:
            print(ALERT, e)
            return None
        print(SUCCESS, "Retrieved top-level courses page")
        # Changed scraping method because source for visible links changed, but
        # old links are still in the source and cause some 404 errors
        departments_az = top_soup.find('table')
        department_urls = [
            department.get('href')
            for department in departments_az.find_all('a')
            if department.get('href')
        ]
        if not department_urls:
            print(ALERT, "Couldn't find departments list. Aborting scrape.")
            return None
        print(INFO, f"Found URLs for {len(department_urls)} departments")
    else:
        print(INFO, "Just scraping CSC and CPE courses")
        department_urls = ['/coursesaz/csc/', '/coursesaz/cpe/']

    scraped_courses = []
    # Retrieves course info for each department
    for department in department_urls:
        # Extracts the department name from the URL
        dep_name = (department.rsplit('/', 2)[1]).upper()
        # Gets raw list of courses and info for department
        dep_link = 'http://catalog.calpoly.edu' + department
        try:
            dep_soup = scraper_base.get_soup(dep_link)
            sleep(self.REST_TIME / 1000)
        except requests.exceptions.RequestException as e:
            print(ALERT, e)
            return None
        print(SUCCESS, f"Retrieved {dep_name} courses from {dep_link}")
        courses = dep_soup.findAll("div", {"class": "courseblock"})
        print(DEBUG, f"Found {len(courses)} courses")
        for course in courses:
            course_name_and_units = (course.find(
                "p", {"class": "courseblocktitle"})).get_text()
            course_name, course_units = course_name_and_units.splitlines()
            print(DEBUG, f"Found {course_name}")
            course_units = course_units.split(' ', 1)[0]
            paragraphs = course.findAll('p')
            if len(paragraphs) == 5:
                ge_areas = re.findall(r'Area (\w+)', paragraphs[1].text)
            else:
                ge_areas = None
            course_terms_and_reqs = (course.find(
                "div", {"class": "noindent courseextendedwrap"})).get_text()
            section = None
            course_prereqs, course_coreqs, course_conc, course_rec, course_terms = [], [], [], [], []
            for word in course_terms_and_reqs.split():
                if word.endswith(':'):
                    if word == 'Offered:':
                        section = 'terms'
                    # Last term (F,W,SP, etc) will be appended to the front of "Prerequisite:"
                    # or whatever category comes immediately after terms offered, so
                    # "str.endswith(blah)" has to be done instead of "str == blah"
                    elif word.endswith('Prerequisite:'):
                        try:
                            course_terms.append((word.split('Pre'))[0])
                            print(DEBUG, "Found prerequisites")
                        except IndexError:
                            pass
                        section = 'prereq'
                    elif word.endswith('Corequisite:'):
                        try:
                            course_terms.append((word.split('Cor'))[0])
                            print(DEBUG, "Found corequisites")
                        except IndexError:
                            pass
                        section = 'coreq'
                    elif word.endswith('Concurrent:'):
                        try:
                            course_terms.append((word.split('Con'))[0])
                            print(DEBUG, "Found concurrent courses")
                        except IndexError:
                            pass
                        section = 'conc'
                    elif word.endswith('Recommended:'):
                        try:
                            course_terms.append((word.split('Rec'))[0])
                            print(DEBUG, "Found recommended courses")
                        except IndexError:
                            pass
                        section = 'rec'
                    else:
                        pass
                else:
                    if section == 'prereq':
                        course_prereqs.append(word)
                    elif section == 'coreq':
                        course_coreqs.append(word)
                    elif section == 'conc':
                        course_conc.append(word)
                    elif section == 'rec':
                        course_rec.append(word)
                    elif section == 'terms':
                        course_terms.append(word)
                    else:
                        pass
            # Update: Now joined with a comma
            maybe_join = (lambda x: ','.join(x) if x else 'NA')
            course_prereqs = maybe_join(course_prereqs)
            course_coreqs = maybe_join(course_coreqs)
            course_conc = maybe_join(course_conc)
            course_rec = maybe_join(course_rec)
            course_terms = maybe_join(course_terms)
            ge_areas = maybe_join(ge_areas)
            document = {
                "DEPARTMENT": dep_name,
                "COURSE_NAME": course_name,
                "UNITS": course_units,
                "PREREQUISITES": course_prereqs,
                "COREQUISITES": course_coreqs,
                "CONCURRENT": course_conc,
                "RECOMMENDED": course_rec,
                "TERMS_TYPICALLY_OFFERED": course_terms,
                "GE_AREAS": ge_areas
            }
            scraped_courses.append(document)
    print(SUCCESS, f"Done! Scraped {len(scraped_courses)} courses")
    return pd.DataFrame(scraped_courses).to_csv(None, index=False)
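# A quick check of the GE-area regex used above; the sample string is made up
# but mirrors the catalog's "Area X" phrasing:

import re

sample = "Fulfills GE Area B1 and GE Area C2"
print(re.findall(r'Area (\w+)', sample))  # ['B1', 'C2']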
def scrape_schedules_from_url(self, url, verify=True, preprocess=None):
    soup = scraper_base.get_soup(url, verify)
    return self.scrape_schedules_from_html(str(soup), preprocess)
def scrape(self): """ Scrapes club information to CSV returns: str: A CSV string of scraped data """ print(INFO, f'Starting scrape on {self.TOP_LINK}') try: top = scraper_base.get_soup(self.TOP_LINK) except requests.exceptions.RequestException as e: print(ALERT, e) return None print(SUCCESS, f'Retrieved club list') raw = [l.text.strip() for l in top.find_all('span')] # Filters out some info we don't need info = [x for x in raw if x and x != "Website" and x != "Homepage:"] info.pop(0) # Don't need first line current_club = None info_len = len(info) club_info = dict() scraped_clubs = [] i = 0 while i < info_len: line = info[i] if line in self.INFO_ENTRY_PAIRS: next_line = info[i + 1] entry_name = self.INFO_ENTRY_PAIRS[line] if next_line.endswith(':'): club_info[entry_name] = 'NA' i += 1 else: club_info[entry_name] = next_line i += 2 elif line == "Contact Email:": next_line = info[i + 1] try: bool(club_info['CONTACT_EMAIL'] ) # Checks if there's already a contact_email entry. except KeyError: if next_line.endswith(':'): club_info['CONTACT_EMAIL'] = 'NA' i += 1 else: club_info['CONTACT_EMAIL'] = next_line i += 2 else: # Two fields are called "Contact Email"--the email of the main contact for the club, and the club's # official email to contact them. For now, the official email is called "contact_email_2" if '@' in next_line: # Next line could be the name of another club; doesn't end in ':' club_info['CONTACT_EMAIL_2'] = next_line i += 2 else: club_info['CONTACT_EMAIL_2'] = 'NA' i += 1 else: # Checking len(club_info) filters unused addresses and stuff that get parsed as club names. if current_club and len(club_info) != 0: club_info['NAME'] = current_club scraped_clubs.append(club_info) print(DEBUG, f'Scraped {current_club}') else: print(DEBUG, f'Discarding non-club {current_club}') club_info = dict() current_club = line i += 1 clubs_request = json.dumps({ 'clubs': [self.transform_club_to_db(club) for club in scraped_clubs] }) requests.post(url=self.CLUBS_API, json=clubs_request) print(SUCCESS, f'Done! Scraped {len(scraped_clubs)} clubs') return pd.DataFrame(scraped_clubs).to_csv(None, index=False)
def scrape(self): """ Scrapes academic calendar data to CSV returns: str: A CSV string of scraped data """ print( DEBUG, f"Starting calendar scrape: CALENDAR_EPOCH={self.CALENDAR_EPOCH}, TOP_LINK={self.TOP_LINK}" ) starting_year = self.CALENDAR_EPOCH calendar = dict() while True: ending_year = starting_year + 1 current_year = starting_year calendar_url = f'{self.TOP_LINK}/{starting_year}-{ending_year - 2000}-academic-calendar' print( DEBUG, f"Attempting to retrieve {starting_year}-{ending_year} calendar from {calendar_url}" ) try: calendar_soup = scraper_base.get_soup(calendar_url) # Returns on an invalid school year; should always return. except requests.exceptions.HTTPError: print( NOTICE, f"{starting_year}-{ending_year} calendar doesn't exist. Ending scrape." ) dates = list(calendar.values()) if len(dates) > 0: print( SUCCESS, f"Done! Scraped {ending_year-self.CALENDAR_EPOCH-1} calendar(s)" ) else: print(ERR, "Did not successfully scrape any dates") return None return pd.DataFrame(dates).to_csv(None, index=False) except requests.exceptions.RequestException as e: print(ALERT, e) return None else: print( SUCCESS, f"Successfully retrieved {starting_year}-{ending_year} calendar" ) # Finds all tables on the page (summer/fall/winter/spring quarters) # Excludes the last summary table. # Note: summary table id has a space at the end. All years are like this. tables = calendar_soup.find_all( lambda tag: tag.name == 'table' and tag.has_attr( 'id') and tag['id'] != "SUMMARY OF CALENDAR DAYS ") print(DEBUG, f"Found {len(tables)} tables") for table in tables: for row in table.find_all('tr'): cols = row.find_all('td') dates = cols[0].text parsed_dates = self.parse_dates(dates, current_year) # Ugly solution to change the calendar year during the school year. # Assumes there will always be an event in January. if 'January' in parsed_dates and current_year != ending_year: print( DEBUG, f"Switching current year from {current_year} to {ending_year}" ) current_year = ending_year # Second column is just the days of the week; ignore events = [t.strip() for t in cols[2].text.splitlines()] for month, days in parsed_dates.items(): for day in days: date = self.make_date(month, day, current_year) if date in calendar: print(DEBUG, f"Adding event to {date}") calendar[date]['EVENTS'].extend(events) else: print(DEBUG, f"Making calendar entry for {date}") entry = dict() entry['DATE'] = date entry['DAY'] = day entry['MONTH'] = month entry['YEAR'] = current_year entry['EVENTS'] = events calendar[date] = entry starting_year += 1 scraped_calendar_entries = [] for date, full_entry in calendar.items(): for event in full_entry['EVENTS']: scraped_calendar_entries.append({ 'date': full_entry['DATE'], 'day': full_entry['DAY'], 'month': full_entry['MONTH'], 'year': full_entry['YEAR'], 'raw_events_text': event, }) calendars_request = json.dumps( {'calendars': scraped_calendar_entries}) requests.post(url=self.CALENDARS_API, json=calendars_request)
def parse_single_employee(self, url):
    """
    Scrapes data from a single Cal Poly employee.

    args:
        url (str)

    returns:
        dict(str:str)
    """
    # Due to certificate issues with CSC employee pages, verification
    # is turned off for requests in the scraper module. This leads to
    # lots of warnings during runtime but unaffected data.
    soup = scraper_base.get_soup(url, ver=False)
    name = soup.find("h1").text
    office = 'NA'
    email = 'NA'
    phone = 'NA'
    research_interests = 'NA'
    # Information is stored in different blocks for staff and faculty
    if url.rsplit("/", 3)[1] == "staff":
        main_info_text = [l.text for l in soup.find_all("span")]
        table = soup.find("table")
    else:
        faculty_main_info = soup.find(id="facultyMainBlock")
        table = faculty_main_info.find("table")
        main_info_text = faculty_main_info.text.splitlines()
    # Getting the research interests for every professor
    # Doesn't work for Hasmik Gharibyan and other people who have a biography
    faculty_additional_info = soup.find(class_="facultyBlock")
    if faculty_additional_info is not None and faculty_additional_info.span:
        research_interests = []
        for string in faculty_additional_info.stripped_strings:
            research_interests.append(string)

    found_office = found_phone = found_email = False
    for line in main_info_text:
        if not found_office:
            if line.startswith("Office:"):
                office = line.rsplit(" ", 1)[1]
                found_office = True
        if not found_phone:
            if line.startswith("Phone "):
                phone = line.rsplit(" ", 1)[1]
                found_phone = True
        if not found_email:
            if line.startswith("Email:"):
                # For faculty pages, the character following "Email:" is not a space but \xa0
                try:
                    first_split = line.split("\xa0", 1)[1]
                except IndexError:
                    first_split = line.split(" ", 1)[1]
                email = first_split.split("(", 1)[0] + "@calpoly.edu"
                found_email = True
        if found_office and found_phone and found_email:
            break

    # office_hours = dict()
    # Office hour parsing is currently disabled. The previous solution only worked for the CPE department
    # and stored office hours in a dictionary, which is not represented in a CSV file. A separate source
    # of office hours should replace this.
    #
    # # This method DOES NOT parse office hours for staff, whose office hours are single-line formatted
    # # like: "Mon - Fri: 10:00 am to 5:00 pm". Additional parsing will have to be added for them. For now,
    # # their office hours will show up as 'NA'
    # try:
    #     rows = table.find_all("tr")
    # except AttributeError:
    #     # Best way to represent no office hours? Currently using 'NA' as a string
    #     office_hours = "NA"
    # else:
    #     rows.pop(0)  # Don't need header row
    #     for row in rows:
    #         col = row.find_all("td")
    #         # Turns "Monday, Wednesday" into ['Monday', 'Wednesday']
    #         days = col[0].text.replace(",", "").split()
    #         time = col[1].text
    #         room = col[2].text
    #         for day in days:
    #             office_hours[day] = dict()
    #             office_hours[day]["time"] = time
    #             office_hours[day]["room"] = room

    faculty_info = {
        "NAME": name,
        "OFFICE": office,
        "EMAIL": email,
        "PHONE": phone,
        "RESEARCH_INTERESTS": research_interests,
    }
    return faculty_info
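# The disabled block above notes that staff office hours arrive as one line,
# e.g. "Mon - Fri: 10:00 am to 5:00 pm". If that parsing is ever re-enabled,
# here is a sketch of splitting that format (an untested guess, not repo code):

def parse_staff_hours(line):
    # partition(':') splits only at the first colon, so the times keep theirs.
    days_part, _, time_part = line.partition(':')
    start_day, _, end_day = days_part.partition('-')
    return {
        'days': (start_day.strip(), end_day.strip()),  # e.g. ('Mon', 'Fri')
        'time': time_part.strip(),                     # e.g. '10:00 am to 5:00 pm'
    }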
def scrape(self, all_departments=False):
    """
    Scrapes course information and requirements to CSV

    args:
        all_departments (bool): Scrapes all departments if True, or just CPE and CSC if False (default False)

    returns:
        str: A CSV string of scraped data
    """
    # Retrieves department list from Cal Poly
    if all_departments:
        top_link = "http://catalog.calpoly.edu/coursesaz/"
        top_soup = scraper_base.get_soup(top_link, ver=False)
        # Changed scraping method because source for visible links changed, but
        # old links are still in the source and cause some 404 errors
        departments_az = top_soup.find('table')
        department_urls = [
            department.get('href')
            for department in departments_az.find_all('a')
            if department.get('href')
        ]
    else:
        department_urls = ['/coursesaz/csc/', '/coursesaz/cpe/']

    scraped_courses = []
    # Retrieves course info for each department
    for department in department_urls:
        # Extracts the department name from the URL
        dep_name = (department.rsplit('/', 2)[1]).upper()
        # Gets raw list of courses and info for department
        dep_link = 'http://catalog.calpoly.edu' + department
        dep_soup = scraper_base.get_soup(dep_link)
        courses = dep_soup.findAll("div", {"class": "courseblock"})
        for course in courses:
            course_name_and_units = (course.find(
                "p", {"class": "courseblocktitle"})).get_text()
            course_name, course_units = course_name_and_units.splitlines()
            course_units = course_units.split(' ', 1)[0]
            course_terms_and_reqs = (course.find(
                "div", {"class": "noindent courseextendedwrap"})).get_text()
            section = None
            course_prereqs, course_coreqs, course_conc, course_rec, course_terms = [], [], [], [], []
            for word in course_terms_and_reqs.split():
                if word.endswith(':'):
                    if word == 'Offered:':
                        section = 'terms'
                    # Last term (F,W,SP, etc) will be appended to the front of "Prerequisite:"
                    # or whatever category comes immediately after terms offered, so
                    # "str.endswith(blah)" has to be done instead of "str == blah"
                    elif word.endswith('Prerequisite:'):
                        try:
                            course_terms.append((word.split('Pre'))[0])
                        except IndexError:
                            pass
                        section = 'prereq'
                    elif word.endswith('Corequisite:'):
                        try:
                            course_terms.append((word.split('Cor'))[0])
                        except IndexError:
                            pass
                        section = 'coreq'
                    elif word.endswith('Concurrent:'):
                        try:
                            course_terms.append((word.split('Con'))[0])
                        except IndexError:
                            pass
                        section = 'conc'
                    elif word.endswith('Recommended:'):
                        try:
                            course_terms.append((word.split('Rec'))[0])
                        except IndexError:
                            pass
                        section = 'rec'
                    else:
                        pass
                else:
                    if section == 'prereq':
                        course_prereqs.append(word)
                    elif section == 'coreq':
                        course_coreqs.append(word)
                    elif section == 'conc':
                        course_conc.append(word)
                    elif section == 'rec':
                        course_rec.append(word)
                    elif section == 'terms':
                        course_terms.append(word)
                    else:
                        pass
            maybe_join = (lambda x: ' '.join(x) if x else 'NA')
            course_prereqs = maybe_join(course_prereqs)
            course_coreqs = maybe_join(course_coreqs)
            course_conc = maybe_join(course_conc)
            course_rec = maybe_join(course_rec)
            course_terms = maybe_join(course_terms)
            document = {
                "DEPARTMENT": dep_name,
                "COURSE_NAME": course_name,
                "UNITS": course_units,
                "PREREQUISITES": course_prereqs,
                "COREQUISITES": course_coreqs,
                "CONCURRENT": course_conc,
                "RECOMMENDED": course_rec,
                "TERMS_TYPICALLY_OFFERED": course_terms
            }
            scraped_courses.append(document)
    return pd.DataFrame(scraped_courses).to_csv(None, index=False)