def scrape_coursera_webrootpage_into_courses_subset(html_text):
    """Scrape one Coursera course listing fragment into CourseSubset objects.

    The start date and duration are read from the first
    ``./div/div/span`` text node, then every
    ``div.coursera-course-listing-main`` below the root is inspected for the
    course title and the university name.

    Args:
        html_text: HTML markup handed to ``lxml.html.fromstring``.

    Returns:
        A list of CourseSubset objects. The list is empty when the date text
        is missing or when the date helper reports either value as None.

    NOTE(review): the indentation of this function was lost in the source;
    the append is placed inside the listing-main branch so that entries
    skipped by ``continue`` are not recorded -- confirm against history.
    """
    courses_subset = []
    xhtml_root = lxml.html.fromstring(html_text)

    date_texts = xhtml_root.xpath('./div/div/span/text()')
    if not date_texts:
        # No date/duration text at all: nothing to scrape.
        return courses_subset

    # start_date_and_duration_str example ==>>> Aug 5th (7 weeks long)
    start_date_and_duration_str = date_texts[0]
    start_date, duration_in_weeks = (
        timeutils.parse_start_date_with_duration_in_weeks_within_parentheses(
            start_date_and_duration_str))
    if start_date is None or duration_in_weeks is None:
        # The original used `continue` here, outside of any loop, which is
        # a SyntaxError; returning the (empty) result is the equivalent skip.
        return courses_subset

    course_subset = CourseSubset()
    course_subset.start_date = start_date
    course_subset.duration_in_weeks = duration_in_weeks

    for subdiv in xhtml_root.findall(".//div[@class]"):
        if subdiv.get('class') != 'coursera-course-listing-main':
            continue

        # Introspecting the course's title.
        title_texts = subdiv.xpath('./h3/a/text()')
        if not title_texts:
            continue
        course_subset.title = title_texts[0].strip(' \t\r\n')

        # Introspecting the course's university. The original author notes
        # the second child div is
        # <div class="coursera-course-listing-statement">.
        child_divs = subdiv.xpath('./div')
        if len(child_divs) < 2:
            continue
        # University's inner div with its enclosing a[@href].
        university_texts = child_divs[1].xpath('./div/a/text()')
        if not university_texts:
            continue
        course_subset.university = university_texts[0]

        courses_subset.append(course_subset)

    return courses_subset
def scrape_coursera_webrootpage_into_courses_subset(self):
    """Scrape the page text into CourseSubset objects on ``self.courses_subset``.

    Every ``div`` in the document returned by ``self.get_xhtml_text()`` is
    treated as a candidate course block. A candidate is used only when its
    ``./div/span`` text parses into a start date and/or a duration. The
    ``div.coursera-course-listing-main`` inside it then supplies the course
    id, sequence number, title and (when present) the university name.

    Returns:
        None. Results are appended to ``self.courses_subset``.

    Any IndexError raised while processing one candidate (missing nodes, or
    raised inside the helper functions) skips that candidate.

    NOTE(review): the indentation of this method was lost in the source; the
    append is placed at the end of the listing-main branch so that the
    ``continue`` guards (missing cid / n_seq / title) really skip the
    course -- confirm against history.
    """
    xhtml_root = lxml.html.fromstring(self.get_xhtml_text())
    for xml_course in xhtml_root.xpath('.//div'):
        try:
            # start_date_and_duration_str example ==>>> Aug 5th (7 weeks long)
            start_date_and_duration_str = xml_course.xpath('./div/span/text()')[0]
            start_date, duration_in_weeks = (
                timeutils.parse_start_date_with_duration_in_weeks_within_parentheses(
                    start_date_and_duration_str))
            if start_date is None and duration_in_weeks is None:
                continue

            course_subset = CourseSubset()
            if start_date is not None:
                course_subset.start_date = start_date
            # Guard against None: only one of the two values has to be set to
            # get here, and None cannot be ordered against an int on Python 3.
            if duration_in_weeks is not None:
                if duration_in_weeks > 0:
                    course_subset.duration_in_weeks = duration_in_weeks
                elif duration_in_weeks == -1:
                    # Self study
                    course_subset.is_self_study = True
                elif duration_in_weeks == -2:
                    # Date TBA (To Be Announced)
                    course_subset.is_start_date_TBA = True

            for subdiv in xml_course.findall(".//div[@class]"):
                if subdiv.get('class') != 'coursera-course-listing-main':
                    continue

                # Introspecting the course's id and n_seq from the title link.
                # A missing <a> raises IndexError and skips the whole course.
                course_a_tag = subdiv.xpath('./h3/a')[0]
                href = course_a_tag.get('href')
                cid, n_seq = derive_cid_and_n_seq_from_href(href)
                if cid is None or n_seq is None:
                    continue
                course_subset.cid = cid
                course_subset.n_seq = n_seq

                # Introspecting the course's title.
                the_courses_title = course_a_tag.text
                if the_courses_title is None:
                    continue
                course_subset.title = the_courses_title.strip(' \t\r\n')

                # Introspecting the course's university; it is optional, so a
                # missing link leaves course_subset.university untouched.
                for inner_div in subdiv.xpath('./div'):
                    if inner_div.get('class') != 'coursera-course-listing-more coursera-course-my-listing-more':
                        continue
                    # University's inner div with its enclosing a[@href].
                    university_texts = inner_div.xpath('./a/text()')
                    if not university_texts:
                        continue
                    university_name = university_texts[0]
                    # Parenthesised so the statement is valid on Python 2 and 3.
                    print(university_name)
                    course_subset.university = university_name

                self.courses_subset.append(course_subset)
        except IndexError:
            continue