Exemple #1
0
	def get(self):
		"""Fetch the page for ``self.url``, store its profile fields, return found links.

		Requests ``SITE + self.url``, reads the ``title`` attribute of the
		bio/location/business/education ``<span>`` elements, inserts them as
		one row into ``User.__table__``, and prints the HTTP status code.

		Returns:
			A set holding every match of the ``PEOPLE`` and ``QUESTION``
			patterns found in the raw response body.
		"""
		page = requests.get(SITE + self.url, headers=HEAD, proxies=proxies)
		dom = etree.HTML(page.content)

		# Row key -> value of the span's class attribute that carries the data.
		span_classes = (
			('bio', 'bio'),
			('location', 'location item'),
			('business', 'business item'),
			('education', 'education item'),
		)
		people = {'name': self.url}
		for field, span_class in span_classes:
			titles = dom.xpath("//span[@class='%s']/@title" % span_class)
			# A single space is stored when the page has no such span.
			people[field] = (titles[0] if titles else " ").encode("utf-8")

		# Open the session only once the row is ready, and always close it,
		# so a failure while parsing or inserting cannot leak the session.
		session = DB_Session()
		try:
			session.execute(User.__table__.insert(), people)
			session.commit()
		finally:
			session.close()

		# One pre-formatted argument: parses as both the Python 2 print
		# statement and the print function, and emits the same text.
		print("%s got url %s !" % (page.status_code, self.url))
		return set(re.findall(PEOPLE, page.content) + re.findall(QUESTION, page.content))