def extract(self, url):
    """Scrape the featured-course listing page at *url* and upsert each course.

    For every featured course, extracts name, instructor, school, link,
    description and start date, then inserts a new database row or updates
    the existing one keyed by the course url.

    Side effects: sets self.req / self.html / self.soup; writes to the
    database through courseDAO.
    """
    self.req = urlopen(url)
    self.html = self.req.read()
    self.soup = BeautifulSoup(self.html)
    courseLinks = self.soup.findAll('a', {'class': 'featured-course-live'})
    courseNames = self.soup.findAll('span', {'class': 'featured-course-title'})
    schoolNames = self.soup.findAll('div', {'class': 'featured-course-school'})
    instructorNames = self.soup.findAll('h5', {'class': 'last emboss-light instructor-name'})
    courseDetails = self.soup.findAll('p', {'class': 'last fineprint pad-box-mini top-rule-box featured-course-desc'})
    for (itemname, iteminstructor, itemschool, itemlink, itemdetail) in zip(
            courseNames, instructorNames, schoolNames, courseLinks, courseDetails):
        detail = itemdetail.contents[0].encode('utf-8')
        # keep only the instructor name, dropping any trailing title after the comma
        instructor = iteminstructor.contents[0].split(',')[0]
        school = itemschool.div.img['title']
        name = itemname.span.contents[0]
        url = itemlink['href']
        date = itemlink.span.contents[2]  # format like: ' Feb 25, 2012\n '
        # hoist the repeated strip() (was recomputed for year, month and day)
        stripped = date.strip()
        year = int(stripped.split(',')[1])
        month = self.MONTHDIC[stripped.split()[0]]
        day = int(stripped.split(',')[0].split()[1])
        lasttime = ''  # duration is not published on this page
        record = [name, instructor, school, url, detail, year, month, day, lasttime]
        # insert or update the database
        dao = courseDAO()
        if not dao.exist(url):
            dao.insert(record)
        else:
            # update() expects the key url appended after the record fields
            record.append(url)
            dao.update(record)
def extract(self, url):
    """Scrape the class-list page at *url* and upsert every course.

    Collects each course's url, instructors and description by fetching the
    individual course index pages, builds the course names from the listing,
    then inserts or updates one database record per course.  This site does
    not publish start dates, so they are stored as 0/0/0.

    Side effects: sets self.req / self.html / self.soup; appends to
    self.url, self.instructor, self.detail, self.courseName; writes to the
    database through courseDAO.
    """
    self.req = urlopen(url)
    self.html = self.req.read()
    self.soup = BeautifulSoup(self.html)
    # url list
    courselist = self.soup.find('ul', id='unfiltered-class-list')
    # Get a list of urls of all the courses here; parsing each url may
    # throw exceptions.  TODO: exception handling for malformed pages.
    for linkitem in courselist.findAll('a'):
        # get the course url
        url = self.baseUrl + linkitem['href']
        self.url.append(url)
        # parse the course index page
        request = urlopen(url)
        try:
            tmpHtml = request.read()
            tmpSoup = BeautifulSoup(tmpHtml)
            # get detail and instructors information
            detail = tmpSoup.find('div', {'class': 'span3'}).findAll('p')[1].contents[0]
            instructors = ','.join([item.contents[0].encode('utf-8')
                                    for item in tmpSoup.findAll('span', {'class': 'oview-side-instr'})])
        finally:
            # BUG FIX: was closed only on the success path; close even when
            # parsing raises so a bad page does not leak the connection
            request.close()
        self.instructor.append(instructors)
        self.detail.append(detail)
    for titleitem, subitem in zip(courselist.findAll('div', {'class': 'crs-li-title'}),
                                  courselist.findAll('div', {'class': 'crs-li-sub'})):
        # get course name and subname
        name = titleitem.contents[0].strip() + '-' + subitem.contents[0].strip()
        self.courseName.append(name)
    for (name, url, instructor, detail) in zip(self.courseName, self.url,
                                               self.instructor, self.detail):
        # start date unknown on this site — store zeros
        start_year = 0
        start_month = 0
        start_day = 0
        lasttime = ''
        record = [name, instructor, self.schoolName, url, detail,
                  start_year, start_month, start_day, lasttime]
        # insert or update the database
        dao = courseDAO()
        if not dao.exist(url):
            dao.insert(record)
        else:
            # update() expects the key url appended after the record fields
            record.append(url)
            dao.update(record)
def extract(self, url):
    """Fetch the JSON course catalogue at *url* and upsert every topic.

    Populates self.cats, self.courses (available courses only), self.insts,
    self.unis and self.topics from the decoded JSON, then builds one record
    per topic (name, instructors, university, url, description, start date,
    duration) and inserts or updates it in the database through courseDAO.
    """
    self.req = urlopen(url)
    self.html = self.req.read()
    # read from json format data
    jsondata = json.loads(self.html)
    # parse the top-level json sections
    jsoncourses = jsondata["courses"]
    jsoncats = jsondata["cats"]
    jsoninsts = jsondata["insts"]
    jsontopics = jsondata["topics"]
    jsonunis = jsondata["unis"]
    for item in jsoncats:
        self.cats[item["id"]] = item
    for item in jsoncourses:
        if item["status"] == 1:
            # current available courses
            self.courses[int(item["topic_id"])] = item
    for item in jsoninsts:
        self.insts[item["id"]] = item
    for item in jsonunis:
        self.unis[item["id"]] = item
    for id in jsontopics:
        info = jsontopics[id]
        topic_id = int(id)
        self.topics[topic_id] = info
        university = ",".join([self.unis[uni]["name"].encode("utf-8")
                               for uni in info.get("unis", [])])
        name = info["name"].encode("utf-8")
        url = self.baseUrl + info["short_name"].encode("utf-8")
        insts = ",".join(
            [" ".join([self.insts[inst]["first_name"].encode("utf-8"),
                       self.insts[inst]["last_name"].encode("utf-8")])
             for inst in info.get("insts", [])]
        )
        insts = insts.strip().strip(",")
        if topic_id not in self.courses:
            # courses which are not available yet
            start_year = 0
            start_month = 0
            start_day = 0
            duration_time = ""
        else:
            course = self.courses[topic_id]  # hoist the repeated lookup
            start_year = course["start_year"]
            start_month = course["start_month"]
            start_day = course["start_day"]
            duration_string = course["duration_string"]
            # date to be announced: the feed uses null for unknown fields
            if start_year is None:
                start_year = 0
            if start_month is None:
                start_month = 0
            if start_day is None:
                start_day = 0
            # BUG FIX: duration_time was previously left unassigned when
            # duration_string was empty, so it silently kept the stale value
            # from the previous loop iteration (or raised NameError on the
            # first one).
            if duration_string != "":
                duration_time = duration_string.encode("utf-8")
            else:
                duration_time = ""
        detail = ""  # the catalogue feed carries no long description
        record = [name, insts, university, url, detail,
                  start_year, start_month, start_day, duration_time]
        # insert or update the database
        dao = courseDAO()
        if not dao.exist(url):
            dao.insert(record)
        else:
            # update() expects the key url appended after the record fields
            record.append(url)
            dao.update(record)
def extract(self, url):
    """Scrape the course catalogue page at *url* and upsert every course.

    Collects per-course link, duration (in weeks, from start/final dates on
    each course's own page), instructors, description, name, university and
    start date, then inserts a new database row or updates the existing one
    keyed by the course url.

    Side effects: sets self.req / self.html / self.soup; writes to the
    database through courseDAO.
    """
    self.req = urlopen(url)
    self.html = self.req.read()
    self.soup = BeautifulSoup(self.html)
    detaillist = []
    linklist = []
    namelist = []
    schoollist = []
    yearlist = []
    monthlist = []
    daylist = []
    lasttimelist = []
    instructorlist = []
    # link of each course
    for linkitem in self.soup.findAll("article", {"class": "course"}):
        url = self.baseUrl + linkitem.a["href"]
        linklist.append(url)
        # open each link and get duration time of this course and the instructors
        request = urlopen(url)
        try:
            tmpHtml = request.read()
            tmpSoup = BeautifulSoup(tmpHtml)
            # when the finish date doesn't exist, this section raises
            # (find() returns None -> AttributeError, or IndexError on contents)
            try:
                startdate = self.dateformat(tmpSoup.find("span", {"class": "start-date"}).contents[0])
                finishdate = self.dateformat(tmpSoup.find("span", {"class": "final-date"}).contents[0])
                lasttime = (
                    datetime.date(finishdate[0], finishdate[1], finishdate[2])
                    - datetime.date(startdate[0], startdate[1], startdate[2])
                ).days / 7  # number of weeks is the unit
                if lasttime < 0:
                    # something wrong in the website
                    lasttime = ""
                else:
                    lasttime = str(lasttime) + " weeks"
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt.  Any parsing failure just
                # means the duration is unknown.
                lasttime = ""
            lasttimelist.append(lasttime)
            instructors = ",".join(
                [teacheritem.h3.contents[0]
                 for teacheritem in tmpSoup.findAll("article", {"class": "teacher"})]
            )
            instructorlist.append(instructors)
        finally:
            # BUG FIX: close even when parsing raises, so a bad page does
            # not leak the connection
            request.close()
    # extract desc of the course
    for detailitem in self.soup.findAll("div", {"class": "desc"}):
        contents = ""
        for chunk in detailitem.p.contents:
            # drop boilerplate fragments matching self.filterstring
            if self.filterstring not in chunk.strip():
                contents += chunk.strip()
        detaillist.append(contents)
    # extract courseName of the course (number + title text next to it)
    for courseNum in self.soup.findAll("span", {"class": "course-number"}):
        namelist.append(courseNum.contents[0] + courseNum.parent.contents[1])
    # extract university name of the course
    for universityitem in self.soup.findAll("a", {"class": "university"}):
        # map to the abbreviation when known, otherwise keep the name as-is
        if universityitem.contents[0] in self.universityDic:
            schoollist.append(self.universityDic[universityitem.contents[0]])
        else:
            schoollist.append(universityitem.contents[0])
    # extract and format the start date of the course
    for dateitem in self.soup.findAll("span", {"class": "start-date"}):
        year, month, day = self.dateformat(dateitem.contents[0])
        yearlist.append(year)
        monthlist.append(month)
        daylist.append(day)
    # zip the information of the course and update the database
    for (name, url, instructor, school, detail, year, month, day, lasttime) in zip(
            namelist, linklist, instructorlist, schoollist, detaillist,
            yearlist, monthlist, daylist, lasttimelist):
        record = [name, instructor, school, url, detail, year, month, day, lasttime]
        # insert or update the database
        dao = courseDAO()
        if not dao.exist(url):
            dao.insert(record)
        else:
            # update() expects the key url appended after the record fields
            record.append(url)
            dao.update(record)