Example #1
0
	def extract(self,url):
		"""Scrape the featured-courses page at *url* and upsert each course.

		Pulls the parallel lists of links, names, schools, instructors and
		descriptions out of the page markup and writes one record per
		course via courseDAO, keyed on the course url.
		"""
		self.req = urlopen(url)
		try:
			self.html = self.req.read()
		finally:
			#fix: the HTTP response was never closed
			self.req.close()
		self.soup = BeautifulSoup(self.html)
		courseLinks = self.soup.findAll('a',{'class':'featured-course-live'})
		courseNames = self.soup.findAll('span',{'class':'featured-course-title'})
		schoolNames = self.soup.findAll('div',{'class':'featured-course-school'})
		instructorNames = self.soup.findAll('h5',{'class':'last emboss-light instructor-name'})
		courseDetails = self.soup.findAll('p',{'class':'last fineprint pad-box-mini top-rule-box featured-course-desc'})

		for (itemname,iteminstructor,itemschool,itemlink,itemdetail) in zip(courseNames,instructorNames,schoolNames,courseLinks,courseDetails):
			detail = itemdetail.contents[0].encode('utf-8')
			#keep only the first instructor when several are comma-separated
			instructor = iteminstructor.contents[0].split(',')[0]
			school = itemschool.div.img['title']
			name = itemname.span.contents[0]
			url = itemlink['href']
			#date string looks like " Feb 25, 2012\n" -- strip and split it once
			date = itemlink.span.contents[2].strip()
			month_day, _, year_part = date.partition(',')
			year = int(year_part)
			month = self.MONTHDIC[month_day.split()[0]]
			day = int(month_day.split()[1])
			lasttime = ''
			record = [name,instructor,school,url,detail,year,month,day,lasttime]
			#insert or update the database, keyed on the course url
			dao = courseDAO()
			if not dao.exist(url):
				dao.insert(record)
			else:
				record.append(url)
				dao.update(record)
	def extract(self,url):
		"""Scrape the unfiltered class-list page at *url* and upsert each course.

		Collects every course url from the listing, fetches each course
		page for its description and instructors, builds the course names,
		then inserts (or updates, keyed on url) one record per course.
		"""
		self.req = urlopen(url)
		try:
			self.html = self.req.read()
		finally:
			#fix: the listing response was never closed
			self.req.close()
		self.soup = BeautifulSoup(self.html)
		#url list
		courselist = self.soup.find('ul',id = 'unfiltered-class-list')

		#get a list of urls of all the courses here;
		#parsing each course page may still raise (Todo: exception handling)
		for linkitem in courselist.findAll('a'):
			#get the course url
			url = self.baseUrl + linkitem['href']
			self.url.append(url)
			#parse the course index page
			request = urlopen(url)
			try:
				tmpHtml = request.read()
				tmpSoup = BeautifulSoup(tmpHtml)
				#get detail and instructors information
				detail = tmpSoup.find('div',{'class':'span3'}).findAll('p')[1].contents[0]
				instructors = ','.join([item.contents[0].encode('utf-8') for item in tmpSoup.findAll('span',{'class':'oview-side-instr'})])
			finally:
				#fix: close the per-course response even if parsing raises
				request.close()
			self.instructor.append(instructors)
			self.detail.append(detail)

		for titleitem,subitem in zip(courselist.findAll('div',{'class':'crs-li-title'}),courselist.findAll('div',{'class':'crs-li-sub'})):
			#course name is "<title>-<subtitle>"
			name = titleitem.contents[0].strip() + '-' + subitem.contents[0].strip()
			self.courseName.append(name)

		for (name,url,instructor,detail) in zip(self.courseName,self.url,self.instructor,self.detail):
			#no scheduling information is available on this listing page
			start_year = 0
			start_month = 0
			start_day = 0
			lasttime = ''
			record = [name,instructor,self.schoolName,url,detail,start_year,start_month,start_day,lasttime]
			#insert or update the database, keyed on the course url
			dao = courseDAO()
			if not dao.exist(url):
				dao.insert(record)
			else:
				record.append(url)
				dao.update(record)
    def extract(self, url):
        """Parse the JSON course catalogue at *url* and upsert every topic.

        Indexes the cats/courses/insts/unis lookup tables by id, then for
        each topic builds one record (name, instructors, university, url,
        start date, duration) and inserts or updates it via courseDAO,
        keyed on the course url.
        """
        self.req = urlopen(url)
        try:
            self.html = self.req.read()  # raw json payload
        finally:
            # fix: the HTTP response was never closed
            self.req.close()
        jsondata = json.loads(self.html)  # parse json data
        jsoncourses = jsondata["courses"]
        jsoncats = jsondata["cats"]
        jsoninsts = jsondata["insts"]
        jsontopics = jsondata["topics"]
        jsonunis = jsondata["unis"]

        # index the lookup tables by their ids
        for item in jsoncats:
            self.cats[item["id"]] = item
        for item in jsoncourses:
            if item["status"] == 1:  # status 1 == currently available course
                self.courses[int(item["topic_id"])] = item
        for item in jsoninsts:
            self.insts[item["id"]] = item
        for item in jsonunis:
            self.unis[item["id"]] = item

        for topic_id in jsontopics:  # renamed from `id`: don't shadow the builtin
            info = jsontopics[topic_id]
            tid = int(topic_id)
            self.topics[tid] = info
            university = ",".join([self.unis[uni]["name"].encode("utf-8") for uni in info.get("unis", [])])
            name = info["name"].encode("utf-8")
            url = self.baseUrl + info["short_name"].encode("utf-8")
            insts = ",".join(
                [
                    " ".join(
                        [self.insts[inst]["first_name"].encode("utf-8"), self.insts[inst]["last_name"].encode("utf-8")]
                    )
                    for inst in info.get("insts", [])
                ]
            )
            insts = insts.strip().strip(",")
            # defaults cover courses that are not (yet) scheduled
            start_year = 0
            start_month = 0
            start_day = 0
            # bug fix: duration_time was unbound (NameError) when the course
            # existed but duration_string was "", and otherwise leaked the
            # previous iteration's value
            duration_time = ""
            if tid in self.courses:
                course = self.courses[tid]
                start_year = course["start_year"]
                start_month = course["start_month"]
                start_day = course["start_day"]
                duration_string = course["duration_string"]
                # None means the date is still to be announced
                if start_year is None:
                    start_year = 0
                if start_month is None:
                    start_month = 0
                if start_day is None:
                    start_day = 0
                if duration_string != "":
                    duration_time = duration_string.encode("utf-8")
            detail = ""
            record = [name, insts, university, url, detail, start_year, start_month, start_day, duration_time]
            # insert or update the database, keyed on the course url
            dao = courseDAO()
            if not dao.exist(url):
                dao.insert(record)
            else:
                record.append(url)
                dao.update(record)
Example #4
0
 def extract(self, url):
     """Scrape the course-catalogue page at *url* and upsert each course.

     Builds parallel lists of link/duration/instructors (fetched from each
     course page), description, name, university and start date for every
     course on the listing, then inserts (or updates, keyed on url) one
     record per course via courseDAO.
     """
     self.req = urlopen(url)
     try:
         self.html = self.req.read()
     finally:
         # fix: the listing response was never closed
         self.req.close()
     self.soup = BeautifulSoup(self.html)
     detaillist = []
     linklist = []
     namelist = []
     schoollist = []
     yearlist = []
     monthlist = []
     daylist = []
     lasttimelist = []
     instructorlist = []
     # link of each course
     for linkitem in self.soup.findAll("article", {"class": "course"}):
         url = self.baseUrl + linkitem.a["href"]
         linklist.append(url)
         # open each link and get duration time of this course and the instructors
         request = urlopen(url)
         try:
             tmpHtml = request.read()
             tmpSoup = BeautifulSoup(tmpHtml)
             # when finishdate doesn't exist, locating/parsing it raises
             try:
                 startdate = self.dateformat(tmpSoup.find("span", {"class": "start-date"}).contents[0])
                 finishdate = self.dateformat(tmpSoup.find("span", {"class": "final-date"}).contents[0])
                 # duration in whole weeks; // keeps Py2 integer-division
                 # semantics and is also correct on Py3
                 lasttime = (
                     datetime.date(finishdate[0], finishdate[1], finishdate[2])
                     - datetime.date(startdate[0], startdate[1], startdate[2])
                 ).days // 7
                 if lasttime < 0:  # inconsistent dates on the website
                     lasttime = ""
                 else:
                     lasttime = str(lasttime) + " weeks"
             except Exception:  # fix: was a bare except (also swallowed SystemExit etc.)
                 lasttime = ""
             lasttimelist.append(lasttime)
             instructors = ",".join(
                 [teacheritem.h3.contents[0] for teacheritem in tmpSoup.findAll("article", {"class": "teacher"})]
             )
             instructorlist.append(instructors)
         finally:
             # fix: close the per-course response even if parsing raises
             request.close()
     # extract desc of the course
     for detailitem in self.soup.findAll("div", {"class": "desc"}):
         contents = ""
         for piece in detailitem.p.contents:
             # drop the boilerplate fragments marked by self.filterstring
             if self.filterstring not in piece.strip():
                 contents += piece.strip()
         detaillist.append(contents)
     # extract courseName of the course
     for courseNum in self.soup.findAll("span", {"class": "course-number"}):
         namelist.append(courseNum.contents[0] + courseNum.parent.contents[1])
     # extract university name of the course
     for universityitem in self.soup.findAll("a", {"class": "university"}):
         # map to the abbreviation when known, otherwise keep the name as-is
         if universityitem.contents[0] in self.universityDic:
             schoollist.append(self.universityDic[universityitem.contents[0]])
         else:
             schoollist.append(universityitem.contents[0])
     # extract and format the start date of the course
     for dateitem in self.soup.findAll("span", {"class": "start-date"}):
         year, month, day = self.dateformat(dateitem.contents[0])
         yearlist.append(year)
         monthlist.append(month)
         daylist.append(day)
     # zip the information of the course and update the database
     for (name, url, instructor, school, detail, year, month, day, lasttime) in zip(
         namelist, linklist, instructorlist, schoollist, detaillist, yearlist, monthlist, daylist, lasttimelist
     ):
         record = [name, instructor, school, url, detail, year, month, day, lasttime]
         # insert or update the database, keyed on the course url
         dao = courseDAO()
         if not dao.exist(url):
             dao.insert(record)
         else:
             record.append(url)
             dao.update(record)