def getOnePostSmart(self, post, from_page=1):
    '''Fetch the replies of ``post`` page by page and store the new ones.

    Paging continues while the page shows a "[>]" (next page) link, which is
    more accurate than deriving the page range from the reply count shown in
    the topic list.

    Args:
        post: post record to crawl.  Its ``url``, ``sector.base_url``,
            ``title``, ``id`` and ``locate_id`` are read; ``title`` and
            ``nr_reply`` may be updated and the record is saved at the end.
        from_page: number of the first page to fetch.

    Raises:
        IndexError, AttributeError: a row could not be parsed and does not
            contain the keyword-filter notice.
    '''
    # Notice the site puts in a row when its keyword filter blocked the content.
    filter_notice = '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤'

    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = post.sector.base_url + post.url
    else:
        posturl = post.url
    posturl += '&TopicPage=%d'
    posturl = posturl.replace('topicdisplay.asp', 'topicdisplay_safe.asp')

    parser = etree.HTMLParser()
    pg = from_page
    while True:
        if self.exitevent.is_set():
            self.logger.info('got exit signal!')
            break

        self.logger.debug('post %s %s ...', post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        tree = etree.fromstring(data, parser)
        posts = tree.xpath('//table[@class="maintable"][1]/tr[position()>1]')

        haspostinpage = False
        saw_filter_notice = False
        for item in posts:
            haspostinpage = True
            try:
                author = item[0].xpath('a/text()')[0]
            except IndexError:
                self.logger.debug('no author info?')
                author = ''
                if filter_notice in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains words! %s ', htmlentitydecode(etree.tostring(item)))
                    saw_filter_notice = True
                    continue

            try:
                crt_date = item[1].xpath('font/text()')[0][6:-1]
                # Detect the opening (topic) post: it carries no ReplyID link
                # to take an id from.
                head = item[1].xpath('*[position()<5]')
                if len(head) == 4 and [node.tag for node in head] == ['b', 'hr', 'font', 'hr']:
                    # topic post
                    replyid = 0
                    try:
                        realtitle = item[1].xpath('b/text()')[0]
                    except IndexError:
                        try:
                            realtitle = item[1].xpath('b/font/b/text()')[0]
                        except IndexError:
                            realtitle = item[1].xpath('b/font/text()')[0]
                    if post.title != realtitle:
                        self.logger.debug('post %s realtitle %s', post.locate_id, realtitle)
                        post.title = realtitle
                    # Drop the non-content header nodes (title / separators /
                    # timestamp) so only the body is left in the cell.
                    for node in item[1].xpath('*[position()<5]'):
                        item[1].remove(node)
                    item[1].text = ''
                else:
                    # regular reply
                    replyurl = item[1].xpath('font/a[1]/@href')[0]
                    replyid = re.search(r'ReplyID=(\d+)', replyurl).group(1)
                    # Drop the non-content header nodes (separator / timestamp).
                    for node in item[1].xpath('*[position()<3]'):
                        item[1].remove(node)

                replycontent = self.exclude_first_td_tag.match(
                    htmlentitydecode(etree.tostring(item[1])).strip()).group(1)
                if replycontent.startswith('<br/>'):
                    replycontent = replycontent[5:]
                if author == '':
                    author = 'usr for %d-%s' % (post.id, replyid)

                try:
                    r = Reply.objects.filter(post=post, locate_id=replyid)[0:1].get()
                except Reply.DoesNotExist:
                    r = Reply(post=post, locate_id=replyid, crt_date=crt_date,
                              author=author, content=replycontent)
                    try:
                        r.save()
                    except _mysql_exceptions.Warning:
                        self.logger.debug('got _mysql_exceptions.Warning!')
                        # Retry with the markup that was not entity-decoded.
                        r.content = self.exclude_first_td_tag.match(
                            etree.tostring(item[1]).strip()[:-5]).group(1)
                        # Strip the leading <br/> from the value that is
                        # actually saved (the old code stripped the unused
                        # local instead).
                        if r.content.startswith('<br/>'):
                            r.content = r.content[5:]
                        r.save()
                    # Only newly created replies are counted.
                    cnt += 1
            except (IndexError, AttributeError):
                # A parse failure is expected for rows replaced by the filter
                # notice; anything else is a real error.
                if filter_notice in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains words! %s ', htmlentitydecode(etree.tostring(item)))
                    saw_filter_notice = True
                    continue
                raise

        # check page next
        nav_links = tree.xpath('//td[@class="outtd"][1]/table[2]/tr[1]/td[2]/a')
        if any(link.text == '[>]' for link in nav_links) and haspostinpage:
            pg += 1
        elif saw_filter_notice:
            # A filtered row was seen on this page: still try the next page.
            pg += 1
        else:
            break

    self.logger.debug('post %s %+d reply', post.locate_id, cnt)
    # Re-count what is actually stored for this post.
    actualcnt = Reply.objects.filter(post=post).count()
    self.logger.debug('post %s actual %d reply in DB', post.locate_id, actualcnt)
    post.nr_reply = max(actualcnt - 1, 0)
    post.save()
def getPost(self, post, new_nr=0):
    """Fetch the pages of ``post`` and store replies that are not in the DB yet.

    The page range comes from ``self.getPageRange`` and is iterated as the
    half-open range ``[startpage, stoppage)``.

    Args:
        post: post record to crawl.  Its ``url``, ``sector.base_url``,
            ``title``, ``locate_id`` and ``nr_reply`` are read; ``nr_reply``
            is updated and the record is saved at the end.
        new_nr: reply count the caller expects after this fetch; ``0`` is
            treated as an initial fetch.

    Raises:
        IndexError: the post URL does not match the ``thread-<id>-<page>-...``
            pattern, or a reply block lacks an expected node.
    """
    empty_data_msg = "-=-=- !!!!!!!!!!!!!! got None or '', skip !!!!!!!!!!!!!!!!!!! -=-=-"
    locked_xpath = './table//div[@class="pct"]/div[@class="pcb"]/div[@class="locked"]'

    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = post.sector.base_url + post.url
    else:
        posturl = post.url
    # Turn the page number inside the URL into a %d placeholder.
    posturl = "%d".join(re.findall(r"(.*?thread-\d+-)\d+(-.*?.html)", posturl)[0])

    startpage, stoppage = self.getPageRange(self.NR_REPLY_PER_PAGE, post.nr_reply, new_nr)
    self.logger.debug("post %s page range [%d,%d)", post.locate_id, startpage, stoppage)

    parser = etree.HTMLParser()
    for pg in xrange(startpage, stoppage):
        self.logger.debug("post %s %s ...", post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        if not data:
            self.logger.debug(empty_data_msg)
            continue

        # check authentication. some posts need high permission to view.
        if data.find("超级大本营军事论坛 提示信息") != -1 and data.find("您无权进行当前操作,原因如下:") != -1:
            err = re.search('<div class="alert_error">(.*?)</div>', data, re.M | re.S | re.I)
            # The error block may be missing; do not let a log line crash the crawl.
            self.logger.debug("got err %s", err.group(1) if err else None)
            if self.login():
                data = self._getData(posturl % pg, None, post.title)
                if not data:
                    self.logger.debug(empty_data_msg)
                    continue

        tree = etree.fromstring(data, parser)
        posts = tree.xpath('//div[@id="postlist"]/div[starts-with(@id,"post_")=true()]')
        for item in posts:
            # The element id has the form "post_<replyid>".
            replyid = item.attrib["id"][5:]
            assert replyid

            try:
                author = item.xpath('./table//td[@class="pls"]/div[@class="pi"]/div[@class="authi"]/a/text()')[0]
            except IndexError:
                author = item.xpath('./table//td[@class="pls"]/div[@class="pi"]/text()')[0].strip()
            assert author is not None

            crt_date = item.xpath('./table//div[@class="pti"]/div[@class="authi"]/em/text()')[0][4:]

            try:
                content_node = item.xpath('//td[@id="postmessage_%s"]' % replyid)[0]
            except IndexError:
                locked = item.xpath(locked_xpath)
                if not locked:
                    raise
                # No message cell: store the "locked" placeholder block.
                wrapped_in_td = False
                rawcontent = etree.tostring(locked[0])
                replycontent = htmlentitydecode(rawcontent).strip()
            else:
                assert content_node is not None
                # remove element 'ad' and 'poststatus'
                # Iterate over a snapshot: removing a child while iterating
                # the live element would skip the sibling that follows it.
                for child in list(content_node):
                    if "class" in child.attrib and child.attrib["class"] in ("a_pr", "pstatus"):
                        # Text after the closing tag belongs to the reply and
                        # would be lost together with the element.
                        textlist = re.findall(
                            r"\A<%s.*?>.*?</%s>(.*?)\Z" % (child.tag, child.tag),
                            etree.tostring(child),
                            re.M | re.S | re.U,
                        )
                        textlist = [x for x in textlist if x.strip() != ""]
                        if textlist:
                            newelement = item.makeelement("br")
                            newelement.text = "<br />".join(textlist)
                            content_node.replace(child, newelement)
                        else:
                            content_node.remove(child)
                wrapped_in_td = True
                rawcontent = etree.tostring(content_node)
                replycontent = self.exclude_first_td_tag.match(
                    htmlentitydecode(rawcontent).strip()
                ).group(1)

            try:
                r = Reply.objects.filter(post=post, locate_id=replyid)[0:1].get()
            except Reply.DoesNotExist:
                r = Reply(post=post, locate_id=replyid, crt_date=crt_date, author=author, content=replycontent)
                try:
                    r.save()
                except _mysql_exceptions.Warning:
                    self.logger.debug("got _mysql_exceptions.Warning!")
                    # Retry with the markup that was not entity-decoded.
                    # NOTE(review): the previous fallback serialized item[1],
                    # which is not the content node in this page layout.
                    if wrapped_in_td:
                        r.content = self.exclude_first_td_tag.match(rawcontent.strip()).group(1)
                    else:
                        r.content = rawcontent.strip()
                    r.save()
                # Only newly created replies are counted.
                cnt += 1

    if new_nr != 0:
        if post.nr_reply + cnt == new_nr:
            self.logger.info("post %s %+d reply. now %d", post.locate_id, cnt, new_nr + 1)
            post.nr_reply += cnt  # add the number of replies actually added
        else:
            self.logger.debug(
                "post %s %+d reply, %d != expect %d (no right new_nr info?)",
                post.locate_id,
                cnt,
                post.nr_reply + cnt,
                new_nr,
            )
            # Re-count what is actually stored for this post.
            actualcnt = Reply.objects.filter(post=post).count()
            self.logger.info("post %s actual %d reply in DB", post.locate_id, actualcnt)
            post.nr_reply = max(actualcnt - 1, 0)
    else:
        if post.nr_reply + 1 == cnt:
            self.logger.info("post %s init %+d reply.", post.locate_id, cnt)
        else:
            self.logger.info("post %s init %+d reply, != expect %d", post.locate_id, cnt, post.nr_reply + 1)
        # Store (fetched count - 1) so the next run looks for new replies again.
        post.nr_reply = max(cnt - 1, 0)
    post.save()
def getPost(self, post, new_nr=0):
    '''Fetch the replies of ``post`` and store the new ones.

    The page range is derived from the reply count shown in the topic list
    (via ``self.getPageRange``) and iterated as ``[startpage, stoppage)``.

    Args:
        post: post record to crawl.  Its ``url``, ``sector.base_url``,
            ``title``, ``id``, ``locate_id`` and ``nr_reply`` are read;
            ``title`` and ``nr_reply`` may be updated and the record is saved.
        new_nr: reply count the caller expects after this fetch; ``0`` is
            treated as an initial fetch.

    Raises:
        IndexError, AttributeError: a row could not be parsed and does not
            contain the keyword-filter notice.
    '''
    # Notice the site puts in a row when its keyword filter blocked the content.
    filter_notice = '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤'

    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = post.sector.base_url + post.url
    else:
        posturl = post.url
    posturl += '&TopicPage=%d'
    posturl = posturl.replace('topicdisplay.asp', 'topicdisplay_safe.asp')

    startpage, stoppage = self.getPageRange(
        self.NR_REPLY_PER_PAGE,
        max(post.nr_reply - 1, 0),
        0 if new_nr == 0 else new_nr - 1)
    self.logger.debug('post %s page range [%d,%d)', post.locate_id, startpage, stoppage)

    parser = etree.HTMLParser()
    for pg in xrange(startpage, stoppage):
        if self.exitevent.is_set():
            self.logger.info('got exit signal!')
            break

        self.logger.debug('post %s %s ...', post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        tree = etree.fromstring(data, parser)
        posts = tree.xpath('//table[@class="maintable"][1]/tr[position()>1]')
        for item in posts:
            try:
                author = item[0].xpath('a/text()')[0]
            except IndexError:
                self.logger.debug('no author info?')
                author = ''
                if filter_notice in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains words! %s ', htmlentitydecode(etree.tostring(item)))
                    continue

            try:
                crt_date = item[1].xpath('font/text()')[0][6:-1]
                # Detect the opening (topic) post: it carries no ReplyID link
                # to take an id from.
                head = item[1].xpath('*[position()<5]')
                if len(head) == 4 and [node.tag for node in head] == ['b', 'hr', 'font', 'hr']:
                    # topic post
                    replyid = 0
                    try:
                        realtitle = item[1].xpath('b/text()')[0]
                    except IndexError:
                        try:
                            realtitle = item[1].xpath('b/font/b/text()')[0]
                        except IndexError:
                            realtitle = item[1].xpath('b/font/text()')[0]
                    if post.title != realtitle:
                        self.logger.info('post %s realtitle %s', post.locate_id, realtitle)
                        post.title = realtitle
                        post.save()
                    # Drop the non-content header nodes (title / separators /
                    # timestamp) so only the body is left in the cell.
                    for node in item[1].xpath('*[position()<5]'):
                        item[1].remove(node)
                    item[1].text = ''
                else:
                    # regular reply
                    replyurl = item[1].xpath('font/a[1]/@href')[0]
                    replyid = re.search(r'ReplyID=(\d+)', replyurl).group(1)
                    # Drop the non-content header nodes (separator / timestamp).
                    for node in item[1].xpath('*[position()<3]'):
                        item[1].remove(node)

                replycontent = self.exclude_first_td_tag.match(
                    htmlentitydecode(etree.tostring(item[1])).strip()).group(1)
                if replycontent.startswith('<br/>'):
                    replycontent = replycontent[5:]
                if author == '':
                    author = 'usr for %d-%s' % (post.id, replyid)

                try:
                    r = Reply.objects.filter(post=post, locate_id=replyid)[0:1].get()
                except Reply.DoesNotExist:
                    r = Reply(post=post, locate_id=replyid, crt_date=crt_date,
                              author=author, content=replycontent)
                    try:
                        r.save()
                    except _mysql_exceptions.Warning:
                        self.logger.debug('got _mysql_exceptions.Warning!')
                        # Retry with the markup that was not entity-decoded.
                        r.content = self.exclude_first_td_tag.match(
                            etree.tostring(item[1]).strip()[:-5]).group(1)
                        # Strip the leading <br/> from the value that is
                        # actually saved (the old code stripped the unused
                        # local instead).
                        if r.content.startswith('<br/>'):
                            r.content = r.content[5:]
                        r.save()
                    # Only newly created replies are counted.
                    cnt += 1
            except (IndexError, AttributeError):
                # A parse failure is expected for rows replaced by the filter
                # notice; anything else is a real error.
                if filter_notice in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains words! %s ', htmlentitydecode(etree.tostring(item)))
                    continue
                raise

    if new_nr != 0:
        if post.nr_reply + cnt == new_nr:
            self.logger.info('post %s %+d reply. now %d', post.locate_id, cnt, new_nr + 1)
            post.nr_reply += cnt  # add the number of replies actually added
        else:
            self.logger.debug('post %s %+d reply, %d != expect %d (no right new_nr info?)',
                              post.locate_id, cnt, post.nr_reply + cnt, new_nr)
            # Re-count what is actually stored for this post.
            actualcnt = Reply.objects.filter(post=post).count()
            self.logger.info('post %s actual %d reply in DB', post.locate_id, actualcnt)
            post.nr_reply = max(actualcnt - 1, 0)
    else:
        if post.nr_reply + 1 == cnt:
            self.logger.info('post %s init %+d reply.', post.locate_id, cnt)
        else:
            self.logger.info('post %s init %+d reply, != expect %d', post.locate_id, cnt, post.nr_reply + 1)
        # Store (fetched count - 1) so the next run looks for new replies again.
        post.nr_reply = max(cnt - 1, 0)
    post.save()
    self.stat_reply_add += cnt