Ejemplo n.º 1
0
	def getPostList(self,forum_name,sector_name,time_since,page_start,page_stop):
		"""Walk the topic-list pages of a forum sector and sync posts into the DB.

		Parameters:
			forum_name -- Forum.name to scan (also kept on self for login purposes).
			sector_name -- Sector.name inside that forum.
			time_since -- datetime cutoff; stop as soon as a non-sticky topic older
				than this shows up in the list (falsy disables the cutoff).
			page_start, page_stop -- page-number range fed to xrange().

		Raises:
			Forum.DoesNotExist / Sector.DoesNotExist when the DB lookups fail.
		"""
		self.forum_name=forum_name # for login purpose

		upd=None
		cnt=0
		try:
			forum=Forum.objects.filter(name=forum_name)[0:1].get()
		except Forum.DoesNotExist:
			self.logger.debug('can\'t find forum info of %s!',forum_name)
			raise

		try:
			sector=Sector.objects.filter(forum=forum,name=sector_name)[0:1].get()
		except Sector.DoesNotExist:
			self.logger.debug('can\'t find sector info of %s!',sector_name)
			raise

		for i in xrange(page_start,page_stop): # max page number
			gotnew_this_page=False # set when this page yields a new post / new replies, so empty pages end the scan early

			if self.exitevent.is_set():
				self.logger.info('got exit signal!')
				break

			pageurl=sector.url%(i,)
			data=self._getData(pageurl,None,'%s-page %02d'%(sector.name,i))
			if len(data)<100: # suspiciously short page: the bbs is probably down
				self.logger.info('bbs down? %s',data)
				break

			# select the topic rows with lxml xpath (pattern stored per sector)
			parser=etree.HTMLParser()
			tree=etree.fromstring(data,parser)
			el=tree.xpath(sector.url_pattern)

			for item in el: # each topic in the list
				if self.exitevent.is_set():
					self.logger.info('got exit signal!')
					break
				# sticky topics are recognized by their folder icon name
				keeptop='folderstate3' in item[0].xpath('img/@src')[0]
				if keeptop:
					gotnew_this_page=True # don't quit early when the first page is all unchanged stickies

				try:
					# the title anchor appears in three markup variants
					try:
						title=item[3].xpath('table/tr/td/a/text()')[0]
					except IndexError:
						try:
							title=item[3].xpath('table/tr/td/a/font/b/text()')[0]
						except IndexError:
							title=item[3].xpath('table/tr/td/a/font/text()')[0]

					# topic metadata
					title_url=item[3].xpath('table/tr/td/a/@href')[0]
					m=re.search(r'TopicID=(\d+)',title_url)
					if m:
						lid=m.group(1)
					else:
						self.logger.debug('can\'t find locate_id for post %s',title)
						lid=''

					try:
						author=item[6].xpath('a/text()')[0]
					except IndexError:
						author='' # BUGFIX: was `author==''`, a no-op comparison that left author unbound
					crt_date=item[6].xpath('@title')[0][5:]
					nr_reply=int(item[5].text)

					pages=int(ceil(nr_reply/float(self.NR_REPLY_PER_PAGE)))

					upd_date=item[8].text.strip()
					cnt+=1
					self.logger.info('%s\n%02d)\t%s|%s|%d-%d|%s|%s', '-='*10, cnt, lid, upd_date, nr_reply, pages, author, title.strip())
					# the list sometimes omits H:M:S on old topics
					try:
						upd=datetime.datetime.strptime(upd_date,'%Y-%m-%d %H:%M:%S')
					except ValueError:
						self.logger.debug('upd_date=%s,no H:M:S?',upd_date)
						upd=datetime.datetime.strptime(upd_date,'%Y-%m-%d')
					if time_since and upd<time_since and (not keeptop):
						self.logger.info('%s\ngot time stop in page list.','-~'*20)
						return

					if author=='':
						author='usr for post %s'%(lid,) # BUGFIX: referenced undefined `post.lid`

					# check DB, insert if not exist, check update otherwise
					try:
						p=Post.objects.filter(sector=sector,locate_id=lid)[0:1].get()
					except Post.DoesNotExist:
						# store nr_reply=0 first so a failed fetch can be retried by rerunning
						p=Post(sector=sector,locate_id=lid,url=title_url,title=title,author=author,
							crt_date=crt_date,upd_date=upd_date,nr_reply=0)
						p.save()
						self.stat_post_add+=1
						p.nr_reply=nr_reply
						self.logger.debug('post %s created.',lid)
						self.getPost(p)
						gotnew_this_page=True
					else:
						if p.upd_date!=upd:
							p.upd_date=upd_date
							p.save()
						if p.nr_reply!=nr_reply:
							self.logger.info('post %s nr_reply changed. %+d',lid,nr_reply-p.nr_reply)
							self.getPost(p,nr_reply)
							gotnew_this_page=True
				except IndexError:
					# filtered-keyword placeholder rows have no anchors; skip them
					if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in  htmlentitydecode(etree.tostring(item)):
						self.logger.info('got page contains words! %s ',htmlentitydecode(etree.tostring(item)))
						continue
					else:
						raise

			# BUGFIX: the flag was computed but never used; implement its stated
			# purpose — stop paging once a whole page brings nothing new.
			if not gotnew_this_page:
				self.logger.info('no new content on page %d, stop.',i)
				break
Ejemplo n.º 2
0
    def getPostList(self, forum_name, sector_name, time_since, page_start, page_stop):
        """Walk Discuz-style topic-list pages of a sector and sync posts into the DB.

        Parameters:
            forum_name -- Forum.name to scan (kept on self as well).
            sector_name -- Sector.name inside that forum.
            time_since -- datetime cutoff; stop as soon as a non-sticky topic
                older than this shows up (falsy disables the cutoff).
            page_start, page_stop -- page-number range fed to xrange().

        Raises:
            Forum.DoesNotExist / Sector.DoesNotExist when the DB lookups fail.
        """
        self.forum_name = forum_name

        upd = None
        cnt = 0
        try:
            forum = Forum.objects.filter(name=forum_name)[0:1].get()
        except Forum.DoesNotExist:
            self.logger.debug("can't find forum info of %s!", forum_name)
            raise

        try:
            sector = Sector.objects.filter(forum=forum, name=sector_name)[0:1].get()
        except Sector.DoesNotExist:
            self.logger.debug("can't find sector info of %s!", sector_name)
            raise

        for i in xrange(page_start, page_stop):  # max page number
            pageurl = sector.url % (i,)
            data = self._getData(pageurl, None, "%s-page %02d" % (sector.name, i))
            if not data:
                self.logger.debug("got nothing, skip!")
                break

            # select the topic rows (every child of the list table that has an id)
            parser = etree.HTMLParser()
            tree = etree.fromstring(data, parser)
            el = tree.xpath('//div[@id="threadlist"]//form[@id="moderate"]/table/*[@id]')

            sticktopic = None
            for item in el:  # each topic in the list
                if item.attrib["id"] == "separatorline":
                    continue

                # sticky topics carry a "stickthread" id prefix
                sticktopic = "stickthread" in item.xpath("@id")[0]

                title = item.xpath("./tr/th/a/text()")[0]

                # topic page count, taken from the trailing page-link list if present
                if item.xpath('./tr/th/span[@class="tps"]'):
                    pages = int(item.xpath('./tr/th/span[@class="tps"]/a[position()=last()]/text()')[0])
                else:
                    pages = 1

                # topic metadata
                title_url = item.xpath("./tr/th/a/@href")[0]
                # BUGFIX: raw string and escaped '.' — the old pattern's bare '.'
                # matched any character before 'html'
                m = re.search(r"thread-(\d+)-.*?\.html", title_url)
                if m:
                    lid = m.group(1)
                else:
                    self.logger.debug("can't find locate_id for post %s", title)
                    lid = ""

                author = item.xpath('./tr/td[@class="by"][1]/cite/a/text()')[0]
                assert author

                crt_date = item.xpath('./tr/td[@class="by"][1]/em/span/text()')[0]
                unknownreply = True
                try:
                    nr_reply = int(item.xpath('./tr/td[@class="num"]/a/text()')[0])
                    unknownreply = False
                except ValueError:
                    # moved/locked topics show no numeric reply count
                    nr_reply = 0

                upd_date = item.xpath('./tr/td[@class="by"][2]/em/a/text()')[0]
                cnt += 1
                self.logger.info(
                    "%s\n%02d)\t%s|%s|%s-%d|%s|%s",
                    "-=" * 10,
                    cnt,
                    lid,
                    upd_date,
                    str(nr_reply) if not unknownreply else "?",
                    pages,
                    author,
                    title.strip(),
                )
                upd = datetime.datetime.strptime(upd_date, "%Y-%m-%d %H:%M")
                if time_since and (not sticktopic) and upd < time_since:
                    self.logger.info("%s\ngot time stop in page list.", "-~" * 20)
                    return

                # check DB, insert if not exist, else check update
                try:
                    p = Post.objects.filter(sector=sector, locate_id=lid)[0:1].get()
                except Post.DoesNotExist:
                    # store nr_reply=0 first so a failed fetch can be retried by rerunning
                    p = Post(
                        sector=sector,
                        locate_id=lid,
                        url=title_url,
                        title=title,
                        author=author,
                        crt_date=crt_date,
                        upd_date=upd_date,
                        nr_reply=0,
                    )
                    p.save()
                    p.nr_reply = nr_reply
                    self.logger.debug("post %s created.", lid)
                    self.getPost(p)
                else:
                    if p.upd_date != upd:
                        p.upd_date = upd_date
                        p.save()
                    if unknownreply:  # count hidden: probe every time (fetches at most one new page per run)
                        self.logger.info(
                            "post %s nr_reply may changed(unknown reply cnt). now %d in DB", lid, p.nr_reply + 1
                        )
                        nr_reply = p.nr_reply + 1  # force it above the stored value to trigger a fetch

                    if p.nr_reply != nr_reply:
                        self.logger.info("post %s nr_reply changed. %+d", lid, nr_reply - p.nr_reply)
                        self.getPost(p, nr_reply)
Ejemplo n.º 3
0
	def getPostList(self,forum_name,sector_name,time_since,page_start,page_stop):
		"""Walk the topic-list pages of a sector and sync posts into the DB.

		Parameters:
			forum_name -- Forum.name to scan.
			sector_name -- Sector.name inside that forum.
			time_since -- datetime cutoff; stop as soon as a non-sticky topic
				older than this appears in the list.
			page_start, page_stop -- page-number range fed to xrange().

		Raises:
			Forum.DoesNotExist / Sector.DoesNotExist when the DB lookups fail.
		"""
##		self.forum_name=forum_name
		# icon filenames that mark sticky/announcement topics; pattern decoded to
		# unicode to match the page's encoding (Python 2)
		ptoppic=re.compile('.+?t_top\d+.gif|announcement.gif'.decode(self.dft_html_encoding),re.S|re.I|re.U)
		info, debug= self.logger.info, self.logger.debug

		upd=None
		cnt=0
		try:
			forum=Forum.objects.filter(name=forum_name)[0:1].get()
		except Forum.DoesNotExist:
			debug('can\'t find forum info of %s!',forum_name)
			raise

		try:
			sector=Sector.objects.filter(forum=forum,name=sector_name)[0:1].get()
		except Sector.DoesNotExist:
			debug('can\'t find sector info of %s!',sector_name)
			raise

		for i in xrange(page_start,page_stop): # max page number
			pageurl=sector.url%(i,)
##			self.logger.info('get %s ...',pageurl)
			data=self._getData(pageurl,None,'%s-page %02d'%(sector.name,i))


			# select the topic rows with lxml xpath
			parser=etree.HTMLParser()
			tree=etree.fromstring(data,parser)
			el=tree.xpath('/html/body/div[@class="wrap cl"]/div[@class="main thread"]/form[@id="moderate"]/div[@class="threadlist"]/table/tbody/*')


			for item in el: # each topic in the list
##				self.logger.debug('-='*10)
				if item.xpath('../@class') and item.xpath('../@class')[0]=='separation': # skip separation line
					continue

				tdlist= item.xpath('*')
				assert len(tdlist)==6
				# the title anchor appears in three markup variants
				try:
					title=tdlist[2].xpath('a/text()')[0]
				except IndexError:
					try:
						title=tdlist[2].xpath('a/font/text()')[0]
					except IndexError:
						title=tdlist[2].xpath('a/span/text()')[0]

				# sticky-topic detection via the first cell's icon filename
				istoptopic=False
				try:
					toppic=tdlist[0].xpath('a/img/@src')[0]
				except IndexError:
					toppic=tdlist[0].xpath('img/@src')[0]
				if ptoppic.search(toppic):
					istoptopic=True

				# topic metadata
				title_url=tdlist[2].xpath('a/@href')[0]
				m=re.search('topicid=(\d+)&?',title_url)
				if m:
					lid=m.group(1)
				else:
					debug('can\'t find locate_id for post %s',title)
					lid=''

				author=tdlist[3].xpath('cite/a/text()')[0]
				crt_date=tdlist[3].xpath('em/text()')[0]
				# relative timestamps ("... ago" / "... days") are converted to absolute
				if crt_date.find('前')!=-1 or crt_date.find('天')!=-1:
##					debug('%s not real crt_date: %s', lid, crt_date)
					crt_date=self.str2time(crt_date).strftime('%Y-%m-%d %H:%M:%S')
				if crt_date.find('刚才')!=-1: # "just now" -> current time
					crt_date=datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
				if crt_date.find('/')!=-1: # in form 2011/7/24 9:16:34
					crt_date=datetime.datetime.strptime(crt_date,'%Y/%m/%d %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')


				try:
					nr_reply=int(tdlist[4].xpath('a[1]/text()')[0])
				except IndexError:
					nr_reply=0

				try:
					upd_date=tdlist[5].xpath('em/a/text()')[0].strip()
				except IndexError:
					upd_date=crt_date # no last-update time found: fall back to the creation time

				# same relative-timestamp normalization as crt_date above
				if upd_date.find('前')!=-1 or upd_date.find('天')!=-1:
##					debug('%s not real upd_date: %s', lid, upd_date)
					upd_date=self.str2time(upd_date).strftime('%Y-%m-%d %H:%M:%S')
				if upd_date.find('刚才')!=-1:
					upd_date=datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
				pages= int(ceil((nr_reply+1)/float(self.NR_REPLY_PER_PAGE)) )
				info('%s\n%02d)\t%s|%s|%d-%d|%s|%s', '-='*10, cnt, lid, upd_date, nr_reply, pages, author, title.strip())
				cnt+=1
				# upd_date may come in three formats; try them in turn
				try:
					upd=datetime.datetime.strptime(upd_date,'%Y-%m-%d %H:%M:%S')
				except ValueError:
					try:
						upd=datetime.datetime.strptime(upd_date,'%Y-%m-%d %H:%M')
					except ValueError:
						upd=datetime.datetime.strptime(upd_date,'%Y/%m/%d %H:%M:%S')
#-#						upd_date=upd.strftime('%Y-%m-%d %H:%M:%S')
##				info('(%02d) (%02d) %s |%s|%s|%s|%s|%s',cnt,pages,title,author,crt_date,upd_date,nr_reply,title_url)
#-#				info('%d---%02d)%s|%s|%d-%d|%s|%s',item.sourceline,cnt,lid,upd_date,nr_reply,pages,author,title)
				if upd<time_since and (not istoptopic):
					info('%s\ngot time stop in page list.','-~'*20)
					return

				# check DB, insert if not exist, else check update
				try:
					p=Post.objects.filter(sector=sector,locate_id=lid)[0:1].get()
				except Post.DoesNotExist:
					# nr_reply starts at 0 so a failed fetch can be retried by rerunning
					p=Post(sector=sector, locate_id=lid, url=title_url, title=title, author=author,
						crt_date=crt_date, upd_date=upd_date, nr_reply=0)
					p.save()
					p.nr_reply=nr_reply
					debug('post %s created.',lid)
					self.getPost(p)
				else:
					if p.upd_date!=upd:
						p.upd_date=upd
						p.save()
					if p.nr_reply<nr_reply:
						info('post %s nr_reply changed. %+d',lid,nr_reply-p.nr_reply)
						self.getPost(p,nr_reply)