def scrape_courses(self):
    """
    Scrape the course table on the current browser page, then request
    each course's detail page to scrape its description.
    """
    # Skip the first row because it's a header row.
    rows = self.browser.find_elements_by_xpath('//table/tbody/tr')[1:]
    # Each row has 4 columns:
    # Course (link to detail) | Category | Title | Credits
    for row in rows:
        abbrev, category, title, credits = row.find_elements_by_tag_name('td')
        # Filter out fake rows.
        if abbrev.text == 'category':
            continue
        course_link = abbrev.find_element_by_tag_name('a')
        resp = requests.get(course_link.get_attribute('href'))
        soup = bs4.BeautifulSoup(resp.text, 'lxml')
        desc = soup.select('div.location-result p')[0]
        course = {
            'course': abbrev.text.upper(),
            'title': title.text.title(),
            'category': category.text.title(),
            'credits': credits.text,
            'description': desc.text,
        }
        final = utils.clean_course(course)
        print final['course'], '-', final['title']
        # Append the cleaned course once; it was previously cleaned twice.
        self.courses.append(final)
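# A minimal, standalone sketch of the detail-page fetch above, assuming the
# same 'div.location-result p' structure; the URL here is illustrative, not
# a real course page.
import bs4
import requests

resp = requests.get('http://example.edu/courses/abc101')  # hypothetical URL
soup = bs4.BeautifulSoup(resp.text, 'lxml')
paragraphs = soup.select('div.location-result p')
description = paragraphs[0].text if paragraphs else None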
def run(self):
    """
    There's only one page for Lasell College with all the info on it,
    so this function does all the work.
    """
    url = ('http://www.lasell.edu/academics/academic-catalog'
           '/undergraduate-catalog/course-descriptions.html')
    web_page = urllib2.urlopen(url).read()
    soup = bs4.BeautifulSoup(web_page, 'lxml')
    # All the courses are in the #tab-3 element. The element is
    # structured very neatly:
    # <h4> -> title
    # <p>  -> description
    course_titles = soup.select('#tab-3 h4')
    for title in course_titles:
        course = {}
        course['title'] = title.text.strip()
        # Find the department abbreviation at the start of the title.
        department = re.search(r'([A-Z]{2,4})[0-9]', course['title'])
        if department:
            abbrev = department.group(1)
            course['department'] = self.department_map.get(abbrev)
        else:
            course['department'] = None
        # The description is the element immediately after the <h4>.
        desc = title.find_next_sibling()
        if desc:
            course['description'] = desc.text.strip()
        else:
            course['description'] = None
        self.courses.append(utils.clean_course(course))
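# A quick illustration of the department regex above; the course title is
# made up, but the pattern is the same one run() uses.
import re

m = re.search(r'([A-Z]{2,4})[0-9]', 'ACCT101 - Intro to Accounting')
if m:
    print m.group(1)  # 'ACCT'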
def scrape_courses(self, soup):
    """
    The site is laid out the following way:
    <p class="Heading-6-Courses"> -> course title and credits
    <p class="Course-Body">       -> description (sometimes more than one)
    """
    category = soup.select('#contentBody h1')[0].text.encode('ascii', 'ignore')
    category = re.search(r'Course Catalog: (.*?) Courses', category).group(1)
    div = soup.select('#contentLeft')[0]
    course = {}
    for child in div.children:
        if isinstance(child, bs4.element.NavigableString):
            continue
        if child.attrs == {'class': ['Heading-6-Courses']}:
            # New course: close out the old course, then parse the new one.
            if course and course['title'] not in self.course_set:
                course = utils.clean_course(course)
                self.courses.append(course)
                self.course_set.add(course['title'])
            # This element should look like:
            # course abbreviation<br>title<br>credits
            contents = list(child.children)[0]
            contents = [a.string.encode('ascii', 'ignore') for a in contents
                        if isinstance(a, bs4.element.NavigableString)]
            assert len(contents) == 3
            title = '{}: {}'.format(contents[0], contents[1])
            # Strip the word 'credit(s)' and turn ranges like '3-4'
            # into '3 to 4'.
            creds = re.sub(r' [cC]redits?(.*?)$', '', contents[2])
            creds = re.sub(r' ?- ?', ' to ', creds)
            course = {
                'category': category,
                'title': title,
                'credits': creds,
            }
        elif (child.attrs in ({'class': ['Course-Body']},
                              {'class': ['Body-Text']}) and course):
            desc = child.text.encode('ascii', 'ignore')
            if 'description' in course:
                course['description'] += ' ' + desc
            else:
                course['description'] = desc
    # Close out the last course on the page; it never hits the
    # "new course" branch above, so it was previously dropped.
    if course and course['title'] not in self.course_set:
        course = utils.clean_course(course)
        self.courses.append(course)
        self.course_set.add(course['title'])
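# How the two credit-normalization substitutions above behave on a made-up
# range value; the regexes are the same ones scrape_courses() applies.
import re

raw = '3 - 4 Credits'
creds = re.sub(r' [cC]redits?(.*?)$', '', raw)  # -> '3 - 4'
creds = re.sub(r' ?- ?', ' to ', creds)         # -> '3 to 4'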
def parse_course(self, response):
    course = {}
    course['title'] = response.xpath(
        '//span[@id="detail_title"]/text()').extract()[0]
    course['description'] = response.xpath(
        '//p[@id="detail_description"]/text()').extract()[0]
    # There are 5 tables on every course page;
    # the interesting data is in the second row of each table.
    tables = response.xpath('//div[@id="detail"]/table')
    # Table 1: School, Department, Faculty
    row1_cols = tables[0].xpath('tr[2]/td')
    course['school'] = row1_cols[0].xpath('text()').extract()[0]
    course['department'] = row1_cols[1].xpath('text()').extract()[0]
    course['faculty'] = row1_cols[2].xpath('span/text()').extract()[0]
    # Table 2: Term, Day and Time
    row2_cols = tables[1].xpath('tr[2]/td')
    course['term'] = row2_cols[0].xpath('text()').extract()[0]
    # Day and time has some weird tab spacing; collapse it.
    day_and_time_raw = row2_cols[1].xpath('text()').extract()[0]
    day_and_time_raw = [
        s.encode('ascii', 'ignore') for s in day_and_time_raw.split('\t') if s
    ]
    course['day_and_time'] = ' '.join(day_and_time_raw)
    # Table 3: Credits, Credit Level
    row3_cols = tables[2].xpath('tr/td')
    course['credits'] = row3_cols[0].xpath('text()').extract()[0]
    course['credit_level'] = row3_cols[1].xpath('text()').extract()[0]
    final = utils.clean_course(course)
    # Use the public response.url instead of the private _url attribute.
    final['url'] = response.url
    yield final
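# The table-extraction pattern above, demonstrated against an inline HTML
# snippet with a Scrapy Selector; the markup is a stand-in for the real page.
from scrapy.selector import Selector

sel = Selector(text='<div id="detail"><table>'
                    '<tr><td>Header</td></tr>'
                    '<tr><td>School of Arts</td></tr>'
                    '</table></div>')
tables = sel.xpath('//div[@id="detail"]/table')
print tables[0].xpath('tr[2]/td/text()').extract()[0]  # 'School of Arts'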
def scrape_courses(self, soup):
    """
    This site is a mess! The course info is buried in nested tags, so
    flatten each <p> into a list of text fragments first.
    """
    section = soup.find('div', {'id': 'MainContent_0_0_pnlDiv'})
    if section is None:
        return
    items = section.find_all('p')
    for item in items:
        all_text = []
        # Walk up to three levels deep, collecting bare strings.
        for s in item.children:
            if isinstance(s, bs4.element.NavigableString):
                all_text.append(s.string.encode('ascii', 'ignore').strip())
            else:
                for a in s.contents:
                    if isinstance(a, bs4.element.NavigableString):
                        all_text.append(
                            a.string.encode('ascii', 'ignore').strip())
                    else:
                        text = [
                            c.string.encode('ascii', 'ignore').strip()
                            for c in a.contents
                            if isinstance(c, bs4.element.NavigableString)
                        ]
                        all_text.append(': '.join(text))
        all_text = [a for a in all_text if a]
        if not all_text:
            continue
        course = {}
        # The first two fragments are usually the course code and title.
        if len(all_text) > 2:
            t1, t2 = all_text[:2]
            course['title'] = '{}: {}'.format(t1, t2)
            all_text = all_text[2:]
        else:
            course['title'] = all_text.pop(0)
        course['desc'] = ''.join(all_text)
        self.courses.append(utils.clean_course(course))
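# A tiny demo of the NavigableString check that drives the flattening above;
# the <p> markup here is invented for illustration.
import bs4

p = bs4.BeautifulSoup(
    '<p><b>ACC101: Accounting</b> An introductory course.</p>',
    'lxml').find('p')
for child in p.children:
    if isinstance(child, bs4.element.NavigableString):
        print 'text:', child.string.strip()
    else:
        print 'tag: ', child.get_text(strip=True)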
def scrape_course(self, soup, college, department):
    course = {}
    section = soup.select('#main')[0]
    title = section.find('h1')
    description = section.find('div', {'class': 'desc'})
    credits = section.find('div', {'class': 'credits'})
    # The remaining <h3> headers label extra fields; collect the content
    # that sits between each header and the next one.
    others = section.find_all('h3')
    others.pop(0)
    if others:
        indices = [(h3, section.contents.index(h3)) for h3 in others
                   if h3 in section.contents]
        for i, (h3, h3_index) in enumerate(indices):
            if i == len(indices) - 1:
                end = len(section.contents)
            else:
                end = indices[i + 1][1]
            # Guard against indexing past the end when the header is the
            # last element in the section.
            if h3_index == end - 1 and end < len(section.contents):
                contents = [section.contents[end]]
            else:
                contents = section.contents[h3_index + 1:end]
            final_contents = []
            for c in contents:
                if isinstance(c, bs4.element.NavigableString):
                    final_contents.append(c.string.strip())
                elif c.name == 'div':
                    # Skip divs; they hold layout, not field content.
                    pass
                else:
                    final_contents.append(c.text.strip())
            field = h3.text.strip().encode('ascii', 'ignore')
            self.fields.add(field)
            course[field] = ' '.join(final_contents)
    course['college'] = college
    course['department'] = department
    course['title'] = title.text.strip()
    course['description'] = description.text.strip()
    course['credits'] = credits.text.strip() if credits else None
    print course['title']
    self.courses.append(utils.clean_course(course))
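# The slice-between-headers idea above, shown on a small invented section:
# find each <h3>'s index in .contents and take everything up to the next one.
import bs4

html = ('<div id="main"><h3>Goals</h3><p>Learn X.</p>'
        '<h3>Topics</h3><p>Y and Z.</p></div>')
section = bs4.BeautifulSoup(html, 'lxml').find(id='main')
headers = section.find_all('h3')
for i, h3 in enumerate(headers):
    start = section.contents.index(h3) + 1
    end = (section.contents.index(headers[i + 1])
           if i + 1 < len(headers) else len(section.contents))
    body = ' '.join(c.get_text(strip=True)
                    for c in section.contents[start:end])
    print h3.text, '->', body  # e.g. Goals -> Learn X.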
def scrape_course(self, row, category):
    course = {}
    info = row.find('a')
    title = info.text
    # Credits are embedded in the title, e.g. '3cr' or '3 cr'.
    credits = re.search(r'(\d)\s?cr', title)
    if credits:
        course['credits'] = credits.group(1)
        course['title'] = title.replace(credits.group(0), '').strip()
    else:
        course['credits'] = None
        course['title'] = title
    course['category'] = category
    course['link'] = '{}/{}'.format(self.base_url, info.attrs['href'])
    desc_page = urllib2.urlopen(course['link']).read()
    soup = bs4.BeautifulSoup(desc_page, 'lxml')
    td = soup.find('td', {'class': 'block_content'})
    # We need to be smart about how we grab the description because it's
    # not very organized: keep only elements that contain description
    # text, and stop at the first double <br>.
    desc = []
    for i, c in enumerate(td.contents):
        if c.name in ('h1', 'table', 'div'):
            continue
        if c.name == 'br' and td.contents[i + 1].name == 'br':
            break
        if isinstance(c, bs4.element.NavigableString):
            text = c.string
        else:
            text = c.text
        desc.append(text.strip().encode('ascii', 'ignore'))
    course['description'] = ' '.join(desc).replace(' ,', ',').strip()
    print course['title'], ':', course['credits']
    self.courses.append(utils.clean_course(course))
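# The credit-extraction regex above on a made-up listing title; group(1)
# captures the digit and group(0) is the full '3cr' text stripped from
# the title.
import re

title = 'ACC101 Financial Accounting 3cr'
m = re.search(r'(\d)\s?cr', title)
if m:
    print m.group(1)                              # '3'
    print title.replace(m.group(0), '').strip()   # 'ACC101 Financial Accounting'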
def parse_course(self, response):
    """ Scrape the contents of an individual course page. """
    course = {}
    course['title'] = response.xpath(
        '//h1/a[@class="title"]/text()').extract()[0]
    course['category'] = response.xpath(
        '//div[@class="Breads"]/span/text()').extract()[0]
    # The description element manifests differently on every course page,
    # so grab every text node under the span and filter.
    desc_all = response.xpath(
        '//span[@class="text"]/descendant-or-self::*/text()')
    # Filter line breaks and other random artifacts.
    desc_extracted = [
        c.extract().strip().replace('\r\n', '').encode('ascii', 'ignore')
        for c in desc_all
    ]
    # Filter out known unnecessary information.
    desc_filtered = [
        c for c in desc_extracted[:-1]
        if 'Credit Hours' not in c
        and 'Course Descriptions' not in c
        and c != course['title']
        and c != ''
    ]
    # Separate out prerequisites, if there are any.
    prerequisites = [c for c in desc_filtered if c.startswith('Prerequisite')]
    if prerequisites:
        course['prerequisite'] = prerequisites[0]
        desc_filtered.remove(course['prerequisite'])
    else:
        course['prerequisite'] = None
    course['description'] = '; '.join(desc_filtered)
    print course['title']
    yield utils.clean_course(course)
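# What the descendant-or-self::*/text() XPath above collects, shown on an
# invented span; every text node under the element comes back separately.
from scrapy.selector import Selector

sel = Selector(text='<span class="text">Intro to <em>micro</em>economics.'
                    '<br>3 Credit Hours</span>')
parts = sel.xpath(
    '//span[@class="text"]/descendant-or-self::*/text()').extract()
print parts  # ['Intro to ', 'micro', 'economics.', '3 Credit Hours']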
def scrape():
    with open('necc_spring_2017.html') as html:
        soup = bs4.BeautifulSoup(html, 'html.parser')
    url = 'https://ssb.necc.mass.edu:9030'
    rows = soup.select('table.datadisplaytable tr')
    # Every odd row is the title of the course, and every even row is
    # the info about the course, so we parse them as pairs.
    courses = []
    for i in range(0, len(rows) - 1, 2):
        course = {}
        # TITLE
        course['title'] = rows[i].text.strip().encode('ascii', 'ignore')
        course['link'] = url + rows[i].find('a').attrs['href']
        # DESCRIPTION: text in the wrapper <td> element, before the <br>.
        td = rows[i + 1].find('td')
        desc = td.next_element
        course['description'] = desc.strip()
        if desc.next_element.name == 'b':
            # PREREQUISITES: looks like "Prerequisite(s): bunch of stuff";
            # we just want the stuff after the first colon. In some cases
            # there is a newline with non-prereq info; cut that out too.
            all_prereq_text = desc.next_element.text.strip()
            just_prereqs = ''.join(
                all_prereq_text.split(': ')[1:]).split('\n')[0]
            course['prerequisites'] = just_prereqs
        else:
            course['prerequisites'] = None
        # CREDITS
        # ???
        # LEVELS
        spans = td.select('.fieldlabeltext')
        try:
            course['levels'] = spans[0].next_sibling.strip()
        except IndexError:
            course['levels'] = None
        # SCHEDULE TYPE: the first text after the second span;
        # sometimes the values are wrapped in <a>, sometimes not.
        try:
            if spans[1].next_sibling.name == 'a':
                items = spans[1].find_next_siblings('a')
                course['schedule_type'] = ', '.join(
                    s.string.strip() for s in items)
            else:
                course['schedule_type'] = spans[1].next_sibling.string.strip()
        except IndexError:
            course['schedule_type'] = None
        # DEPARTMENT: usually after a newline, always contains the word
        # 'Department'; extract just the important bit.
        department_matches = re.search(r'\n[ a-zA-Z&]+Department', td.text)
        if department_matches:
            department = re.sub(r'\n +', '', department_matches.group(0))
            course['department'] = department.replace(' Department', '')
        else:
            course['department'] = None
        print course['title']
        courses.append(utils.clean_course(course))
    return courses
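# The department regex above against an invented blob of row text; the
# newline-anchored match is trimmed down to just the department name.
import re

text = 'CIS110 Computer Applications\n    Business Department'
m = re.search(r'\n[ a-zA-Z&]+Department', text)
if m:
    dept = re.sub(r'\n +', '', m.group(0)).replace(' Department', '')
    print dept  # 'Business'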
def scrape_courses(self, soup):
    """
    Parses a page of courses under a particular letter and extracts the
    course information.

    :param soup: BeautifulSoup of a web page under a letter.
    """
    # Page layout resembles the following:
    # <h2> -> course category; one of these has multiple courses under it
    # <h3> -> course title
    # <p>  -> course description
    # <div class="tablewrap"> -> course details
    # <div class="separator"> -> separates courses
    # We start with the first h2 element, then traverse the section by
    # moving on to the next sibling and reacting based on the type of
    # element.
    course = {}
    cur_category = None
    for element in soup.find(id='simmonsmainBody').children:
        if element.name == 'h2':
            cur_category = element.text
            course['category'] = cur_category
        elif element.name == 'h3':
            course['title'] = element.text
        elif element.name == 'p':
            course['description'] = element.text
        elif (element.name == 'div'
              and element.attrs == {'class': ['tablewrap']}):
            # Within this element is a table with the following columns:
            # Section, Dates, Days, Times, Room, Instructor,
            # Section Status, Avail Seats, Requires Consent, Credits.
            # Sometimes this table has multiple rows for different
            # sections of the course; treat them as separate courses
            # for now.
            rows = element.find_all('tr')
            if not rows:
                continue
            # The first row holds the table headers -- skip it.
            for row in rows[1:]:
                cols = row.find_all('td')
                # Unpack every column except Instructor (index 5), which
                # needs special handling below. Enumerate instead of
                # cols.index(c), which misfires on duplicate cells.
                (course['section'], course['dates'], course['days'],
                 course['times'], course['room'], course['section status'],
                 course['avail seats'], course['requires consent'],
                 course['credits']) = [
                    c.text.strip() for j, c in enumerate(cols) if j != 5
                ]
                # If the Instructor column has <br> elements, join the
                # names with a comma.
                course['instructor'] = ', '.join(
                    c.string.strip() for c in cols[5].children
                    if isinstance(c, bs4.element.NavigableString))
                self.courses.append(utils.clean_course(course))
        elif (element.name == 'div'
              and element.attrs == {'class': ['separator']} and course):
            # Reset for the next course, keeping the current category.
            course = {'category': cur_category}
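# The instructor-joining trick above on an invented cell: names separated
# by <br> come through as sibling NavigableStrings, joined with a comma.
import bs4

cell = bs4.BeautifulSoup(
    '<table><tr><td>Jane Doe<br>John Smith</td></tr></table>',
    'lxml').find('td')
names = ', '.join(c.string.strip() for c in cell.children
                  if isinstance(c, bs4.element.NavigableString))
print names  # 'Jane Doe, John Smith'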