def get(self):
    """Fetch this object's page, store its profile fields, return found links.

    Requests ``SITE + self.url`` (using the ``HEAD`` headers and ``proxies``
    visible to this block), parses the response body as HTML, reads the
    ``title`` attribute of the bio / location / business / education
    ``<span>`` elements, and inserts them as one row into ``User``'s table.

    Returns:
        A set containing every match of the ``PEOPLE`` and ``QUESTION``
        patterns found in the raw response body.

    Raises:
        Whatever ``requests.get``, the HTML parser or the database layer
        raise; nothing is swallowed here. The database session is closed
        even when the insert or commit fails.
    """
    # NOTE(review): no timeout is passed, so a stalled server can block this
    # call indefinitely -- consider adding one once callers are checked.
    page = requests.get(SITE + self.url, headers=HEAD, proxies=proxies)
    dom = etree.HTML(page.content)

    # Column name -> value of the span's ``class`` attribute on the page.
    span_classes = (
        ('bio', 'bio'),
        ('location', 'location item'),
        ('business', 'business item'),
        ('education', 'education item'),
    )

    people = {'name': self.url}
    for column, span_class in span_classes:
        titles = dom.xpath("//span[@class='%s']/@title" % span_class)
        # A missing span is stored as a single space, not as an empty value.
        people[column] = (titles[0] if titles else " ").encode("utf-8")

    # Open the session only once the row is ready, and always release it:
    # previously any failure between open and close leaked the session.
    session = DB_Session()
    try:
        session.execute(User.__table__.insert(), people)
        session.commit()
    finally:
        session.close()

    # Single pre-formatted argument: same output under Python 2 and 3.
    print("%s got url %s !" % (page.status_code, self.url))

    return set(
        re.findall(PEOPLE, page.content) + re.findall(QUESTION, page.content)
    )