コード例 #1
0
ファイル: zhihu_fetch.py プロジェクト: luyajie/zhihu
	def get(self):
		"""Fetch this profile page, store its profile fields, and return its links.

		Requests ``SITE + self.url``, reads the ``title`` attribute of the
		bio/location/business/education spans, inserts one row into the
		``User`` table, and prints the HTTP status code.

		Returns:
			The set of all ``PEOPLE`` and ``QUESTION`` regex matches found in
			the raw page body.
		"""
		page = requests.get(SITE + self.url, headers=HEAD, proxies=proxies)
		dom = etree.HTML(page.content)

		# Column name -> class attribute of the <span> whose title holds the value.
		span_classes = (
			('bio', 'bio'),
			('location', 'location item'),
			('business', 'business item'),
			('education', 'education item'),
		)
		people = {'name': self.url}
		for column, css_class in span_classes:
			titles = dom.xpath("//span[@class='%s']/@title" % css_class)
			# A missing span falls back to a single space.
			people[column] = (titles[0] if titles else " ").encode("utf-8")

		# Open the session only once the row is ready, and always release it,
		# so a failed insert or commit cannot leak the session.
		session = DB_Session()
		try:
			session.execute(User.__table__.insert(), people)
			session.commit()
		finally:
			session.close()

		# Single-argument parenthesised print: same output as the old
		# two-item print statement, and valid on both Python 2 and 3.
		print("%s got url %s !" % (page.status_code, self.url))
		return set(re.findall(PEOPLE, page.content) + re.findall(QUESTION, page.content))
コード例 #2
0
ファイル: pip.py プロジェクト: edisonqkj/gk_mining
def partition(pages):
    """Store article/user comment pairs and group the articles by user.

    Args:
        pages: iterable of ``(article, ukey_lst)`` pairs, where ``ukey_lst``
            is an iterable of user keys seen on that article.

    Returns:
        A ``defaultdict(list)`` mapping each user key to the articles it
        appeared with, in the order the pages were consumed.
    """
    user_dct = defaultdict(list)
    for article, ukey_lst in pages:
        # Materialise once so the keys can be walked twice even if the
        # caller hands over a one-shot iterator.
        ukeys = list(ukey_lst)
        rows = [{'article': article, 'user': ukey} for ukey in ukeys]
        # An empty parameter list is not a meaningful bulk insert, so an
        # article without users is skipped instead of hitting the database.
        if rows:
            session = DB_Session()
            try:
                session.execute(Comment.__table__.insert(), rows)
                session.commit()
            finally:
                # Release the session even when the insert or commit raises.
                session.close()
        for ukey in ukeys:
            # defaultdict(list) creates the list on first access, so no
            # KeyError fallback is needed here.
            user_dct[ukey].append(article)
    return user_dct
コード例 #3
0
ファイル: mapredu.py プロジェクト: edisonqkj/gk_mining
def main():
    """Run the pipeline: fetch pages, group them by user, then store users.

    Loads the URL list from ``urls.txt``, maps ``foo_map`` over it through
    ``pool``, passes the truthy results to ``partition``, maps ``foo_reduce``
    over the resulting user keys, and inserts the truthy results into the
    ``User`` table in chunks of 1000 rows, committing after each chunk.
    """
    # Context manager so the file handle is closed as soon as it is parsed.
    with open("urls.txt", 'r') as url_file:
        urls = json.load(url_file)

    page_set = pool.imap(foo_map, urls) | where(lambda x: x)
    ukeys = partition(page_set)
    # Single-argument parenthesised print is valid on Python 2 and 3 alike.
    print("finish articles !")

    users = pool.map(foo_reduce, ukeys.iterkeys()) | where(lambda x: x) | as_tuple
    session = DB_Session()
    try:
        for us in chunks(users, 1000):
            session.execute(User.__table__.insert(), us)
            # Commit per chunk so earlier chunks survive a later failure.
            session.commit()
    finally:
        # Release the session even when an insert or commit raises.
        session.close()
コード例 #4
0
ファイル: zhihu_search.py プロジェクト: jude90/zhihu
#
from bottle import * 
from redis import StrictRedis
import sys
sys.path.append("..")
from model import  DB_Session, User

# Module-level database session shared by the request handlers below.
Session = DB_Session()
# Query over User, built once at import time.
query = Session.query(User)
# Prefix prepended to every route path; empty serves the routes from "/".
urlprefix = ""
# Redis client on database 2; find_people reads the set stored under the
# search keyword from it.
rdb = StrictRedis(db=2)

@route("%s/" % urlprefix)
def index():
	"""Serve the landing page by rendering the ``index.html`` template."""
	return template("index.html")



@route("%s/name" % urlprefix)
def find_people():
	"""Render the users indexed under the ``keyword`` query parameter.

	Reads the set stored in Redis under the keyword, looks up each member's
	``bio`` in the ``peoples`` table, and renders ``result.tpl`` with a
	``{name: bio}`` mapping. Without a keyword the request is redirected to
	the landing page.
	"""
	key = request.GET.get('keyword')
	if not key:
		# No keyword supplied: send the visitor back to the landing page.
		return redirect("%s/" % urlprefix)

	users = {}
	for user in rdb.smembers(key):
		# Bind the name as a parameter instead of formatting it into the SQL
		# text, so a quote inside a name cannot alter the statement.
		rows = Session.execute(
			"SELECT bio FROM peoples WHERE name=:name", {"name": user}
		).fetchall()
		# An indexed name without a row (or with a NULL bio) gets an empty
		# bio rather than failing the whole request.
		bio = rows[0][0] if rows else None
		users[user] = bio.encode("utf-8") if bio is not None else ""
	return template("result.tpl", peoples=users)