def getPostList(self,forum_name,sector_name,time_since,page_start,page_stop):
    self.forum_name=forum_name # for login purpose
    upd=None
    cnt=0
    try:
        forum=Forum.objects.filter(name=forum_name)[0:1].get()
    except Forum.DoesNotExist:
        self.logger.debug('can\'t find forum info of %s!',forum_name)
        raise
    try:
        sector=Sector.objects.filter(forum=forum,name=sector_name)[0:1].get()
    except Sector.DoesNotExist:
        self.logger.debug('can\'t find sector info of %s!',sector_name)
        raise
    for i in xrange(page_start,page_stop): # max page number
        gotnew_this_page=False # whether this page yielded a new post or an updated reply, so we can stop early when a page has nothing new
        if self.exitevent.is_set():
            self.logger.info('got exit signal!')
            break
        pageurl=sector.url%(i,)
##        self.logger.info('get %s ...',pageurl)
        data=self._getData(pageurl,None,'%s-page %02d'%(sector.name,i))
        if len(data)<100: # forum is down again?
            self.logger.info('bbs down? %s',data)
            break
        # use lxml xpath to select the div containing the topic list
        parser=etree.HTMLParser()
        tree=etree.fromstring(data,parser)
        el=tree.xpath(sector.url_pattern)
        for item in el: # each topic in the list
            if self.exitevent.is_set():
                self.logger.info('got exit signal!')
                break
##            self.logger.debug('-='*10)
            keeptop='folderstate3' in item[0].xpath('img/@src')[0] # the icon name tells whether this is a sticky topic
            if keeptop:
                gotnew_this_page=True # avoid exiting too early when a page is all sticky topics with no updates
            try:
                try:
                    title=item[3].xpath('table/tr/td/a/text()')[0]
                except IndexError:
                    try:
                        title=item[3].xpath('table/tr/td/a/font/b/text()')[0]
                    except IndexError:
                        title=item[3].xpath('table/tr/td/a/font/text()')[0]
                # extract topic info
                title_url=item[3].xpath('table/tr/td/a/@href')[0]
                m=re.search(r'TopicID=(\d+)',title_url)
                if m:
                    lid=m.group(1)
                else:
                    self.logger.debug('can\'t find locate_id for post %s',title)
                    lid=''
                try:
                    author=item[6].xpath('a/text()')[0]
                except IndexError:
                    author=''
                crt_date=item[6].xpath('@title')[0][5:]
                nr_reply=int(item[5].text)
                pages=int(ceil(nr_reply/float(self.NR_REPLY_PER_PAGE)))
                upd_date=item[8].text.strip()
##                self.logger.info('(%02d) (%02d) %s |%s|%s|%s|%s|%s',cnt,pages,title,author,crt_date,upd_date,nr_reply,title_url)
                cnt+=1
##                self.logger.info('%d---%02d)%s|%s|%d-%d|%s|%s',item.sourceline,cnt,lid,upd_date,nr_reply,pages,author,title)
                self.logger.info('%s\n%02d)\t%s|%s|%d-%d|%s|%s', '-='*10, cnt, lid, upd_date, nr_reply, pages, author, title.strip())
                try:
                    upd=datetime.datetime.strptime(upd_date,'%Y-%m-%d %H:%M:%S')
                except ValueError:
                    self.logger.debug('upd_date=%s,no H:M:S?',upd_date)
                    upd=datetime.datetime.strptime(upd_date,'%Y-%m-%d')
                if time_since and upd<time_since and (not keeptop):
                    self.logger.info('%s\ngot time stop in page list.','-~'*20)
                    return
                if author=='':
                    author='usr for post %s'%(lid,)
                # check DB, insert if not exist, check update otherwise
                try:
                    p=Post.objects.filter(sector=sector,locate_id=lid)[0:1].get()
                except Post.DoesNotExist:
                    # store nr_reply as 0 in the DB first, so that if fetching the post fails later it can be re-fetched by re-running
                    p=Post(sector=sector,locate_id=lid,url=title_url,title=title,author=author, crt_date=crt_date,upd_date=upd_date,nr_reply=0)
                    p.save()
                    self.stat_post_add+=1
                    p.nr_reply=nr_reply
                    self.logger.debug('post %s created.',lid)
                    self.getPost(p)
                    gotnew_this_page=True
                else:
                    if p.upd_date!=upd:
                        p.upd_date=upd_date
                        p.save()
                    if p.nr_reply!=nr_reply:
                        self.logger.info('post %s nr_reply changed. %+d',lid,nr_reply-p.nr_reply)
                        self.getPost(p,nr_reply)
                        gotnew_this_page=True
            except IndexError:
                # the topic row only contains the forum's "filtered for sensitive keywords" notice
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains words! %s ',htmlentitydecode(etree.tostring(item)))
                    continue
                else:
                    raise
        if not gotnew_this_page:
            # nothing new or updated on this page, so later pages are only older; stop early
            break
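# --- Usage sketch (illustrative, not taken from the source) ---
# getPostList() returns as soon as it reaches a topic whose last update is older
# than time_since (sticky topics excepted), so an incremental crawl can simply
# pass the time of the previous successful run. The spider class name, how the
# last run time is obtained, and the page range below are all assumptions:
#
#   spider=SomeBbsSpider()                   # hypothetical spider class
#   last_run=datetime.datetime(2011,7,1)     # e.g. loaded from the DB
#   spider.getPostList('someforum','somesector',last_run,1,10)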
def getPostList(self, forum_name, sector_name, time_since, page_start, page_stop):
    self.forum_name = forum_name
    upd = None
    cnt = 0
    try:
        forum = Forum.objects.filter(name=forum_name)[0:1].get()
    except Forum.DoesNotExist:
        self.logger.debug("can't find forum info of %s!", forum_name)
        raise
    try:
        sector = Sector.objects.filter(forum=forum, name=sector_name)[0:1].get()
    except Sector.DoesNotExist:
        self.logger.debug("can't find sector info of %s!", sector_name)
        raise
    for i in xrange(page_start, page_stop):  # max page number
        pageurl = sector.url % (i,)
        ## self.logger.info('get %s ...',pageurl)
        data = self._getData(pageurl, None, "%s-page %02d" % (sector.name, i))
        if not data:
            self.logger.debug("got nothing, skip!")
            break
        # use lxml xpath to select the div containing the topic list
        parser = etree.HTMLParser()
        tree = etree.fromstring(data, parser)
        ## el=tree.xpath(sector.url_pattern)
        ## el=tree.xpath('//div[@id="threadlist"]//form[@id="moderate"]/table/*[not(@id="separatorline")]')
        el = tree.xpath('//div[@id="threadlist"]//form[@id="moderate"]/table/*[@id]')
        sticktopic = None
        for item in el:  # each topic in the list
            if item.attrib["id"] == "separatorline":
                continue
            ## self.logger.debug('-='*10)
            if "stickthread" in item.xpath("@id")[0]:
                sticktopic = True
            else:
                sticktopic = False
            title = item.xpath("./tr/th/a/text()")[0]
            if item.xpath('./tr/th/span[@class="tps"]'):
                pages = int(item.xpath('./tr/th/span[@class="tps"]/a[position()=last()]/text()')[0])
            else:
                pages = 1
            # extract topic info
            ## title_url=item.xpath('./tr/th/span[1]/a[1]/@href')[0]
            title_url = item.xpath("./tr/th/a/@href")[0]
            m = re.search(r"thread-(\d+)-.*?\.html", title_url)
            if m:
                lid = m.group(1)
            else:
                self.logger.debug("can't find locate_id for post %s", title)
                lid = ""
            author = item.xpath('./tr/td[@class="by"][1]/cite/a/text()')[0]
            assert author
            crt_date = item.xpath('./tr/td[@class="by"][1]/em/span/text()')[0]
            unknownreply = True
            try:
                nr_reply = int(item.xpath('./tr/td[@class="num"]/a/text()')[0])
                unknownreply = False
            except ValueError:
                ## self.logger.debug('post %s has no reply number!',lid)
                # moved/locked topic
                nr_reply = 0
            upd_date = item.xpath('./tr/td[@class="by"][2]/em/a/text()')[0]
            ## self.logger.info('(%02d) (%02d) %s |%s|%s|%s|%s|%s',cnt,pages,title,author,crt_date,upd_date,nr_reply,title_url)
            cnt += 1
            ## self.logger.info('%d---%02d)%s|%s|%d-%d|%s|%s',item.sourceline,cnt,lid,upd_date,nr_reply,pages,author,title)
            self.logger.info(
                "%s\n%02d)\t%s|%s|%s-%d|%s|%s",
                "-=" * 10,
                cnt,
                lid,
                upd_date,
                str(nr_reply) if not unknownreply else "?",
                pages,
                author,
                title.strip(),
            )
            upd = datetime.datetime.strptime(upd_date, "%Y-%m-%d %H:%M")
            if time_since and (not sticktopic) and upd < time_since:
                self.logger.info("%s\ngot time stop in page list.", "-~" * 20)
                return
            # check DB, insert if not exist, else check update
            try:
                p = Post.objects.filter(sector=sector, locate_id=lid)[0:1].get()
            except Post.DoesNotExist:
                p = Post(
                    sector=sector,
                    locate_id=lid,
                    url=title_url,
                    title=title,
                    author=author,
                    crt_date=crt_date,
                    upd_date=upd_date,
                    nr_reply=0,
                )
                p.save()
                p.nr_reply = nr_reply
                self.logger.debug("post %s created.", lid)
                self.getPost(p)
            else:
                if p.upd_date != upd:
                    p.upd_date = upd_date
                    p.save()
                if unknownreply:
                    # reply count is hidden, so check every time; each run can fetch at most one page of new replies
                    self.logger.info(
                        "post %s nr_reply may changed(unknown reply cnt). now %d in DB", lid, p.nr_reply + 1
                    )
                    nr_reply = p.nr_reply + 1  # set it larger than the stored value to trigger a fetch
                if p.nr_reply != nr_reply:
                    self.logger.info("post %s nr_reply changed. %+d", lid, nr_reply - p.nr_reply)
                    self.getPost(p, nr_reply)
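# --- Quick check of the locate_id extraction above (illustrative) ---
# The Discuz-style list links parsed above look roughly like
# "thread-<topic id>-<page>-<type>.html"; the sample URL below is made up,
# only the regex comes from the code above:
#
#   >>> import re
#   >>> re.search(r"thread-(\d+)-.*?\.html", "thread-1234567-1-1.html").group(1)
#   '1234567'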
def getPostList(self,forum_name,sector_name,time_since,page_start,page_stop):
##    self.forum_name=forum_name
    ptoppic=re.compile('.+?t_top\d+.gif|announcement.gif'.decode(self.dft_html_encoding),re.S|re.I|re.U)
    info, debug= self.logger.info, self.logger.debug
    upd=None
    cnt=0
    try:
        forum=Forum.objects.filter(name=forum_name)[0:1].get()
    except Forum.DoesNotExist:
        debug('can\'t find forum info of %s!',forum_name)
        raise
    try:
        sector=Sector.objects.filter(forum=forum,name=sector_name)[0:1].get()
    except Sector.DoesNotExist:
        debug('can\'t find sector info of %s!',sector_name)
        raise
    for i in xrange(page_start,page_stop): # max page number
        pageurl=sector.url%(i,)
##        self.logger.info('get %s ...',pageurl)
        data=self._getData(pageurl,None,'%s-page %02d'%(sector.name,i))
        # use lxml xpath to select the div containing the topic list
        parser=etree.HTMLParser()
        tree=etree.fromstring(data,parser)
        el=tree.xpath('/html/body/div[@class="wrap cl"]/div[@class="main thread"]/form[@id="moderate"]/div[@class="threadlist"]/table/tbody/*')
        for item in el: # each topic in the list
##            self.logger.debug('-='*10)
            if item.xpath('../@class') and item.xpath('../@class')[0]=='separation': # skip separation line
                continue
            tdlist= item.xpath('*')
            assert len(tdlist)==6
            try:
                title=tdlist[2].xpath('a/text()')[0]
            except IndexError:
                try:
                    title=tdlist[2].xpath('a/font/text()')[0]
                except IndexError:
                    title=tdlist[2].xpath('a/span/text()')[0]
            # determine whether this is a sticky topic
            istoptopic=False
            try:
                toppic=tdlist[0].xpath('a/img/@src')[0]
            except IndexError:
                toppic=tdlist[0].xpath('img/@src')[0]
            if ptoppic.search(toppic):
                istoptopic=True
            # extract topic info
            title_url=tdlist[2].xpath('a/@href')[0]
            m=re.search(r'topicid=(\d+)&?',title_url)
            if m:
                lid=m.group(1)
            else:
                debug('can\'t find locate_id for post %s',title)
                lid=''
            author=tdlist[3].xpath('cite/a/text()')[0]
            crt_date=tdlist[3].xpath('em/text()')[0]
            if crt_date.find('前')!=-1 or crt_date.find('天')!=-1:
##                debug('%s not real crt_date: %s', lid, crt_date)
                crt_date=self.str2time(crt_date).strftime('%Y-%m-%d %H:%M:%S')
            if crt_date.find('刚才')!=-1:
                crt_date=datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
            if crt_date.find('/')!=-1: # in form 2011/7/24 9:16:34
                crt_date=datetime.datetime.strptime(crt_date,'%Y/%m/%d %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
            try:
                nr_reply=int(tdlist[4].xpath('a[1]/text()')[0])
            except IndexError:
                nr_reply=0
            try:
                upd_date=tdlist[5].xpath('em/a/text()')[0].strip()
            except IndexError:
                upd_date=crt_date # fall back to the creation time when no last-update time is found
            if upd_date.find('前')!=-1 or upd_date.find('天')!=-1:
##                debug('%s not real upd_date: %s', lid, upd_date)
                upd_date=self.str2time(upd_date).strftime('%Y-%m-%d %H:%M:%S')
            if upd_date.find('刚才')!=-1:
                upd_date=datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
            pages=int(ceil((nr_reply+1)/float(self.NR_REPLY_PER_PAGE)))
            info('%s\n%02d)\t%s|%s|%d-%d|%s|%s', '-='*10, cnt, lid, upd_date, nr_reply, pages, author, title.strip())
            cnt+=1
            try:
                upd=datetime.datetime.strptime(upd_date,'%Y-%m-%d %H:%M:%S')
            except ValueError:
                try:
                    upd=datetime.datetime.strptime(upd_date,'%Y-%m-%d %H:%M')
                except ValueError:
                    upd=datetime.datetime.strptime(upd_date,'%Y/%m/%d %H:%M:%S')
#-#            upd_date=upd.strftime('%Y-%m-%d %H:%M:%S')
##            info('(%02d) (%02d) %s |%s|%s|%s|%s|%s',cnt,pages,title,author,crt_date,upd_date,nr_reply,title_url)
#-#            info('%d---%02d)%s|%s|%d-%d|%s|%s',item.sourceline,cnt,lid,upd_date,nr_reply,pages,author,title)
            if upd<time_since and (not istoptopic):
                info('%s\ngot time stop in page list.','-~'*20)
                return
            # check DB, insert if not exist, else check update
            try:
                p=Post.objects.filter(sector=sector,locate_id=lid)[0:1].get()
            except Post.DoesNotExist:
                p=Post(sector=sector, locate_id=lid, url=title_url, title=title, author=author, crt_date=crt_date, upd_date=upd_date, nr_reply=0)
                p.save()
                p.nr_reply=nr_reply
                debug('post %s created.',lid)
                self.getPost(p)
            else:
                if p.upd_date!=upd:
                    p.upd_date=upd
                    p.save()
                if p.nr_reply<nr_reply:
                    info('post %s nr_reply changed. %+d',lid,nr_reply-p.nr_reply)
                    self.getPost(p,nr_reply)
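# --- Sketch of a relative-time helper (assumption, not the source's str2time) ---
# The method above relies on self.str2time() to turn relative Chinese
# timestamps such as u'5 分钟前' / u'3 天前' into datetimes. That helper is not
# shown in this excerpt; a minimal standalone version might look like:
#
#   def str2time(s, now=None):
#       # hypothetical sketch: parse 'N 秒/分钟/小时/天前' into a datetime
#       now=now or datetime.datetime.now()
#       m=re.search(u'(\\d+)\\s*(秒|分钟|小时|天)前', s)
#       if not m:
#           return now
#       n,unit=int(m.group(1)),m.group(2)
#       seconds={u'秒':1, u'分钟':60, u'小时':3600, u'天':86400}[unit]
#       return now-datetime.timedelta(seconds=n*seconds)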