def get(self):
    """Fetch this profile page, store its summary row, and return its links.

    Requests ``SITE + self.url``, reads the ``title`` attribute of the bio,
    location, business and education ``<span>`` elements, inserts those
    values (plus ``self.url`` as ``name``) as one row through
    ``User.__table__.insert()``, and finally collects every match of the
    ``PEOPLE`` and ``QUESTION`` patterns in the response body.

    Returns:
        set: the de-duplicated ``PEOPLE`` and ``QUESTION`` matches.

    Raises:
        Whatever ``requests``, the parser or the database layer raise; the
        database session is closed before the exception propagates.
    """
    page = requests.get(SITE + self.url, headers=HEAD, proxies=proxies)
    dom = etree.HTML(page.content)

    # (row key, value of the class attribute on the <span> carrying the title)
    span_classes = (
        ("bio", "bio"),
        ("location", "location item"),
        ("business", "business item"),
        ("education", "education item"),
    )
    people = {"name": self.url}
    for column, css_class in span_classes:
        titles = dom.xpath("//span[@class='%s']/@title" % css_class)
        # A span that is absent from the page is stored as a single space.
        people[column] = (titles[0] if titles else " ").encode("utf-8")

    # Open the session only once there is something to write, and always
    # release it, even when the insert or the commit fails.
    session = DB_Session()
    try:
        session.execute(User.__table__.insert(), people)
        session.commit()
    finally:
        session.close()

    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("%s got url %s !" % (page.status_code, self.url))
    return set(re.findall(PEOPLE, page.content) + re.findall(QUESTION, page.content))
def partition(pages):
    """Store article/user pairs and group the articles by user key.

    For every ``(article, ukey_lst)`` pair, one row per user key is inserted
    through ``Comment.__table__.insert()`` and committed, then the article is
    appended to that user's list in the returned mapping.

    Args:
        pages: iterable of ``(article, ukey_lst)`` pairs; ``ukey_lst`` may be
            any iterable of user keys, including a one-shot iterator.

    Returns:
        defaultdict(list): user key -> articles it was seen with, in the
        order the pages were consumed.

    Raises:
        Whatever the database layer raises; the session for the failing
        article is closed before the exception propagates.
    """
    user_dct = defaultdict(list)
    for article, ukey_lst in pages:
        # The keys are walked twice (insert rows, then grouping), so pin them
        # down once in case the caller handed over a one-shot iterator.
        ukeys = list(ukey_lst)

        # Nothing to insert for an article without users. An empty parameter
        # list is not a no-op for the INSERT (presumably it would run once
        # with default values -- confirm against the SQLAlchemy version used).
        if ukeys:
            session = DB_Session()
            try:
                session.execute(
                    Comment.__table__.insert(),
                    [{'article': article, 'user': ukey} for ukey in ukeys],
                )
                session.commit()
            finally:
                session.close()

        # defaultdict creates the list on first access; no fallback needed.
        for ukey in ukeys:
            user_dct[ukey].append(article)
    return user_dct
def main():
    """Run the two crawl stages and store the resulting user rows.

    Steps:
      1. Load the URL list from the JSON file ``urls.txt``.
      2. Map ``foo_map`` over the URLs with ``pool.imap``, drop falsy
         results, and hand the rest to ``partition``.
      3. Map ``foo_reduce`` over the user keys returned by ``partition``,
         drop falsy results, and split them with ``chunks(users, 1000)``
         (presumably batches of at most 1000 -- confirm against ``chunks``).
      4. Insert every chunk through ``User.__table__.insert()`` and commit
         once at the end.

    Raises:
        Whatever file I/O, the pool or the database layer raise; the file
        handle and the database session are released in every case.
    """
    # Close the file as soon as the list is parsed instead of leaking it.
    with open("urls.txt", 'r') as url_file:
        urls = json.load(url_file)

    page_set = pool.imap(foo_map, urls) | where(lambda x: x)
    ukeys = partition(page_set)
    print("finish articles !")

    # Iterating the mapping yields its keys; list() keeps that explicit and
    # works the same on Python 2 and 3.
    users = pool.map(foo_reduce, list(ukeys)) | where(lambda x: x) | as_tuple
    users_lst = chunks(users, 1000)

    session = DB_Session()
    try:
        for us in users_lst:
            session.execute(User.__table__.insert(), us)
        # One commit for all chunks, as before.
        session.commit()
    finally:
        session.close()
# Web front end: keyword lookup of people and their bios.
import sys

from bottle import *
from redis import StrictRedis

sys.path.append("..")  # the ``model`` module is imported from the parent directory
from model import DB_Session, User

Session = DB_Session()
query = Session.query(User)
urlprefix = ""
rdb = StrictRedis(db=2)


@route("%s/" % urlprefix)
def index():
    """Render the ``index.html`` template."""
    return template("index.html")


@route("%s/name" % urlprefix)
def find_people():
    """Render the people stored under the requested keyword.

    Reads the ``keyword`` query parameter. Without one, the client is
    redirected to the index route. Otherwise the members of the Redis set
    stored under that keyword are looked up one by one in the ``peoples``
    table and ``result.tpl`` is rendered with a ``name -> bio`` mapping
    (bios UTF-8 encoded).

    Members without a matching row, or whose bio is NULL, are shown with an
    empty bio instead of failing the whole request.
    """
    key = request.GET.get('keyword')
    if not key:
        return redirect("%s/" % urlprefix)

    users = {}
    for user in rdb.smembers(key):
        # Bound parameter instead of string formatting: names containing
        # quotes no longer break (or inject into) the statement.
        rows = Session.execute(
            "SELECT bio FROM peoples WHERE name=:name", {"name": user}
        ).fetchall()
        bio = rows[0][0] if rows else None
        users[user] = (bio if bio is not None else u"").encode("utf-8")
    return template("result.tpl", peoples=users)