def getCommentById(id):
    """Fetch every comment page for POI *id* and append the rows to comments.csv.

    The CSV writer is created lazily from the keys of the first comment dict,
    matching the column layout produced by getCommentDetails.
    """
    url = getAbsPath('/poi/__pagelet__/pagelet/poiCommentListApi')
    total_pages = -1
    page = 1
    out = open('comments.csv', 'a', encoding='utf-8')
    writer = None
    try:
        while True:
            print(f'Comment page {page}')
            payload = geneCommData(id, page)
            html = requestByGet(url, isJson=True, data=payload)['data']['html']
            dom = loadDom(html)
            if total_pages < 0:
                # The first response tells us how many pages exist in total.
                total_pages = getPageCount(dom)
            for item in dom.find_all('li', class_='rev-item comment-item clearfix'):
                row = getCommentDetails(item, id)
                if writer is None:
                    writer = getCsvWriter(out, list(row.keys()))
                writer.writerow(row)
            page += 1
            if page > total_pages:
                break
    finally:
        out.close()
def getAround(node):
    """Scrape the 'nearby POIs' pagelet referenced by *node* and append one
    CSV row per neighbouring POI to around.csv.

    *node* is expected to carry a ``data-params`` attribute containing the
    JSON request parameters (including ``poi_id``).
    """
    if not node:
        return
    params = node['data-params']
    poi_id = json.loads(params)['poi_id']
    url = getAbsPath('/poi/__pagelet__/pagelet/poiLocationApi')
    html = requestByGet(url, isJson=True, data={'params': params})['data']['html']
    dom = loadDom(html)
    out = open('around.csv', 'a', encoding='utf-8')
    writer = None
    try:
        for item in dom.find_all('li'):
            row = dict()
            row['id'] = poi_id
            row['aid'] = item['data-id']
            row['aname'] = item['data-name']
            row['atype'] = item['data-type']
            row['dist'] = getValue(item.find('span'))
            if writer is None:
                writer = getCsvWriter(out, row.keys())
            writer.writerow(row)
    finally:
        out.close()
def getPoiDescription(url, dic):
    """Load a POI detail page and store its summary text in dic['description'].

    When a non-empty description is found, the overview section of the same
    page is also scraped into *dic* via getOverview.
    """
    dom = loadDom(requestByGet(getAbsPath(url)))
    dic['description'] = getValue(findByClass(dom, 'summary'))
    if dic['description'] == '':
        return
    overview = dom.find(attrs={'data-anchor': 'overview'})
    getOverview(overview, dic)
def getReview(uid):
    """Fetch up to 500 reviews for user *uid* from the ajax_review endpoint.

    Bug fix: the original fetched the JSON payload and then discarded it,
    implicitly returning None — the parsed response is now returned so
    callers can actually use it (backward-compatible: callers that ignored
    the None return are unaffected).
    """
    url = getAbsPath('/home/ajax_review.php')
    data = {
        'act': 'loadList',
        'filter': 0,
        'offset': 0,
        'limit': 500,
        'uid': uid,
        'sort': 1,
    }
    return requestByGet(url, isJson=True, data=data)
def getDetail(url):
    """Scrape a single POI detail page and return its collected fields.

    Fills the result dict from the 'overview' section (getOverview) and the
    'commentlist' section (getComments).
    """
    dom = loadDom(requestByGet(getAbsPath(url)))
    info = dict()
    getOverview(dom.find(attrs={'data-anchor': 'overview'}), info)
    getComments(dom.find(attrs={'data-anchor': 'commentlist'}), info)
    return info
def collectComment():
    """Worker loop: drain the shared `comDic` queue and write every comment
    of every queued POI to comments.csv.

    Bug fixes vs. the original:
    - `lock.acquire()` / `lock.release()` around `comDic.popitem()` was not
      exception-safe: popping an empty dict raises KeyError with the lock
      still held, deadlocking all other worker threads. The lock is now a
      `with` block and the empty-queue case breaks the loop cleanly instead
      of escaping via KeyError.
    - `dic` passed to getCommentDevide was never defined in this scope
      (NameError on the very first page); a fresh per-POI dict is created.
    - the output file is opened before the `try`, so a failed open no longer
      triggers a NameError in `finally`.
    """
    file = open('comments.csv', 'w', encoding='utf-8')
    writer = None
    try:
        while True:
            # Pop the next (scenic, params) pair under the lock; stop when empty.
            with lock:
                if not comDic:
                    break
                scenic, params = comDic.popitem()
            id = json.loads(params)['poi_id']
            print(scenic, ' ', id)
            url = getAbsPath('/poi/__pagelet__/pagelet/poiCommentListApi')
            pagecount = -1
            pagenum = 1
            dic = {'id': id}  # receives the rating breakdown from getCommentDevide
            while True:
                print('Comment page ' + str(pagenum))
                data = geneCommentPageData(params, pagenum)
                html = requestByGet(url, isJson=True, data=data)['data']['html']
                dom = loadDom(html)
                if pagecount < 0:  # first request: learn the total page count
                    pagecount = getPageCount(dom)
                    getCommentDevide(dom, dic)
                for comment in dom.find_all('li', class_='rev-item comment-item clearfix'):
                    com = getCommentDetails(comment, id)
                    if writer is None:
                        writer = getCsvWriter(file, list(com.keys()))
                    writer.writerow(com)
                pagenum = pagenum + 1
                if pagenum > pagecount:
                    break
    finally:
        file.close()
def getUsersThread(writer, uid):
    """Page through user *uid*'s POI reviews (40 per request) and write one
    row per review via the supplied CSV *writer*.

    Each row is enriched with the POI's description/overview fields through
    getPoiDescription before being written.
    """
    url = getAbsPath('/home/ajax_review.php')
    offset = 0
    hasmore = True
    while hasmore:
        reply = requestByGet(url, isJson=True, data=geneUserData(uid, offset))['data']
        # The server reports pagination state as the string 'true'/'false'.
        hasmore = reply['hasmore'] == 'true'
        dom = loadDom(reply['html'])
        for node in dom.find_all(class_=re.compile('poi-item')):
            row = dict()
            row['uid'] = uid
            row['poi_href'] = findByClass(node, 'cover').find('a')['href']
            row['poi_id'] = extractNum(row['poi_href'])
            row['name'] = getValue(findByClass(node, 'title'))
            row['star'] = findByClass(node, 'rating')['data-star']
            row['content'] = getValue(findByClass(node, 'poi-rev _j_comment'))
            row['datetime'] = getValue(findByClass(node, 'time'))
            getPoiDescription(row['poi_href'], row)
            writer.writerow(row)
        offset += 40
def getComments(node, dic):
    """Fetch the first comment-list page for the POI referenced by *node*
    and record its rating breakdown into *dic*.

    Sets dic['id'] to the POI id and lets getCommentDevide extract the
    score distribution from the first page.

    Cleanup vs. the original: the `while True` loop unconditionally broke
    after the first request, a comments.csv handle was opened in append
    mode but never written to (stray side effect: the file was created and
    held open for the duration of the call), and a large commented-out
    per-comment writer remained. All of that is removed — a single request
    with no file handle.
    """
    if not node:
        return
    params = node.find('div')['data-params']
    id = json.loads(params)['poi_id']
    dic['id'] = id
    url = getAbsPath('/poi/__pagelet__/pagelet/poiCommentListApi')
    data = geneCommentPageData(params, 1)
    html = requestByGet(url, isJson=True, data=data)['data']['html']
    dom = loadDom(html)
    # Only the first page is needed: it carries the rating distribution.
    getCommentDevide(dom, dic)
def getUsers():
    """Read user ids from leftusers.txt and dump every user's POI reviews
    into userpois.csv, 40 reviews per server request.

    The CSV writer is created lazily from the keys of the first review row.
    """
    src = open('leftusers.txt', 'rt', encoding='utf-8')
    dest = open('userpois.csv', 'wt', encoding='utf-8')
    try:
        writer = None
        url = getAbsPath('/home/ajax_review.php')
        for user in list(src.readlines()):
            uid = extractNum(user)
            print('user ', user)
            offset = 0
            hasmore = True
            while hasmore:
                reply = requestByGet(url, isJson=True, data=geneUserData(uid, offset))['data']
                # Pagination flag comes back as the string 'true'/'false'.
                hasmore = reply['hasmore'] == 'true'
                dom = loadDom(reply['html'])
                for node in dom.find_all(class_=re.compile('poi-item')):
                    row = dict()
                    row['uid'] = uid
                    row['poi_href'] = findByClass(node, 'cover').find('a')['href']
                    row['poi_id'] = extractNum(row['poi_href'])
                    row['name'] = getValue(findByClass(node, 'title'))
                    row['star'] = findByClass(node, 'rating')['data-star']
                    row['content'] = getValue(findByClass(node, 'poi-rev _j_comment'))
                    row['datetime'] = getValue(findByClass(node, 'time'))
                    getPoiDescription(row['poi_href'], row)
                    if writer is None:
                        writer = getCsvWriter(dest, row.keys())
                    writer.writerow(row)
                offset += 40
    finally:
        src.close()
        dest.close()
def geneDomFromUrl(url):
    """Download *url* and parse the response body into a DOM via geneDom."""
    return geneDom(requestByGet(url))