コード例 #1
0
def scrape_coursera_webrootpage_into_courses_subset(html_text):
  '''
  Parse html_text with lxml and collect course data into CourseSubset objects.

  The start date and the duration are parsed from the first text node matched
  by './div/div/span/text()' (example: "Aug 5th (7 weeks long)"). The title
  and the university are read from every div whose class attribute is exactly
  'coursera-course-listing-main'.

  Returns the list of collected CourseSubset objects. The list is empty when
  the date/duration text is missing, when it cannot be parsed, or when no
  listing div carries a title.

  Raises IndexError when a listing div has fewer than two child divs or when
  its second child div has no './div/a' text (the university lookup).
  '''
  courses_subset = []

  xhtml_root = lxml.html.fromstring(html_text)
  date_and_duration_texts = xhtml_root.xpath('./div/div/span/text()')
  if not date_and_duration_texts:
    # no date/duration span at all: nothing can be scraped from this markup
    return courses_subset
  # start_date_and_duration_str example ==>>> Aug 5th (7 weeks long)
  start_date_and_duration_str = date_and_duration_texts[0]
  start_date, duration_in_weeks = timeutils.parse_start_date_with_duration_in_weeks_within_parentheses(start_date_and_duration_str)
  if start_date is None or duration_in_weeks is None:
    # this point is not inside a loop, so bail out with the (empty) result
    return courses_subset
  course_subset = CourseSubset()
  course_subset.start_date = start_date
  course_subset.duration_in_weeks = duration_in_weeks

  # NOTE(review): a single CourseSubset instance is filled and appended for
  # every matching listing div, so several listings would share one object --
  # confirm whether html_text is expected to hold exactly one course.
  for subdiv in xhtml_root.findall(".//div[@class]"):
    if subdiv.get('class') != 'coursera-course-listing-main':
      continue

    # introspecting the course's title; a listing without one is skipped
    title_texts = subdiv.xpath('./h3/a/text()')
    if not title_texts:
      continue
    course_subset.title = title_texts[0].strip(' \t\r\n')

    # introspecting the course's university listed
    listing_statement_div = subdiv.xpath('./div')[1] # <div class="coursera-course-listing-statement">
    university_name = listing_statement_div.xpath('./div/a/text()')[0] # university's inner div with its enclosing a[@href]
    course_subset.university = university_name
    courses_subset.append(course_subset)

  return courses_subset
コード例 #2
0
 def scrape_coursera_webrootpage_into_courses_subset(self):
   '''
   Scrape the markup returned by self.get_xhtml_text() and append the courses
   found to self.courses_subset.

   Every div of the document is tried as a candidate course container. A
   candidate is skipped when it has no './div/span' text, when neither a start
   date nor a duration can be parsed from that text, or when an IndexError is
   raised while it is being processed.

   For each listing div (class 'coursera-course-listing-main') inside an
   accepted candidate, the course id, sequence number, title and (when
   present) university are stored on the candidate's CourseSubset, which is
   then appended to self.courses_subset.

   Returns None; the result is the side effect on self.courses_subset.
   '''
   xhtml_root = lxml.html.fromstring(self.get_xhtml_text())
   for xml_course in xhtml_root.xpath('.//div'):
     try:
       # start_date_and_duration_str example ==>>> Aug 5th (7 weeks long)
       start_date_and_duration_str = xml_course.xpath('./div/span/text()')[0]
       start_date, duration_in_weeks = timeutils.parse_start_date_with_duration_in_weeks_within_parentheses(start_date_and_duration_str)
       if start_date is None and duration_in_weeks is None:
         continue
       course_subset = CourseSubset()
       if start_date is not None:
         course_subset.start_date = start_date
       # duration may still be None here (only start_date was parsed); an
       # ordering comparison against None raises TypeError on Python 3
       if duration_in_weeks is not None:
         if duration_in_weeks > 0:
           course_subset.duration_in_weeks = duration_in_weeks
         elif duration_in_weeks == -1: # Self study
           course_subset.is_self_study = True
         elif duration_in_weeks == -2: # Date TBA (To Be Announced)
           course_subset.is_start_date_TBA = True
       for subdiv in xml_course.findall(".//div[@class]"):
         if subdiv.get('class') != 'coursera-course-listing-main':
           continue
         # introspecting the course's title and its id and n_seq
         course_a_tags = subdiv.xpath('./h3/a')
         if not course_a_tags:
           # listing without a title link: skip just this listing
           continue
         course_a_tag = course_a_tags[0]
         cid, n_seq = derive_cid_and_n_seq_from_href(course_a_tag.get('href'))
         if cid is None or n_seq is None:
           continue
         course_subset.cid   = cid
         course_subset.n_seq = n_seq
         the_courses_title = course_a_tag.text
         if the_courses_title is None:
           continue
         course_subset.title = the_courses_title.strip(' \t\r\n')

         # introspecting the course's university listed; it is optional, a
         # listing without it is still appended
         for inner_div in subdiv.xpath('./div'):
           if inner_div.get('class') != 'coursera-course-listing-more coursera-course-my-listing-more':
             continue
           university_names = inner_div.xpath('./a/text()') # university's inner div with its enclosing a[@href]
           if not university_names:
             continue
           university_name = university_names[0]
           # single-argument call form: same output on Python 2, valid on Python 3
           print(university_name)
           course_subset.university = university_name
         self.courses_subset.append(course_subset)
     except IndexError:
       # best-effort scan: a div that does not look like a course is skipped
       continue