def getPost(self, post, new_nr=0):
    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = post.sector.base_url + post.url
    else:
        posturl = post.url
    # rewrite "thread-<id>-<page>-<x>.html" into a template with a %d page slot
    posturl = "%d".join(re.findall(r"(.*?thread-\d+-)\d+(-.*?\.html)", posturl)[0])
    startpage, stoppage = self.getPageRange(self.NR_REPLY_PER_PAGE, post.nr_reply, new_nr)
    self.logger.debug("post %s page range [%d,%d)", post.locate_id, startpage, stoppage)
    parser = etree.HTMLParser()
    for pg in xrange(startpage, stoppage):
        self.logger.debug("post %s %s ...", post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        if not data:
            self.logger.debug("got None or '', skip!")
            continue
        # check authentication. some posts need high permission to view.
        if data.find("超级大本营军事论坛 提示信息") != -1 and data.find("您无权进行当前操作,原因如下:") != -1:
            self.logger.debug(
                "got err %s",
                re.search('<div class="alert_error">(.*?)</div>', data, re.M | re.S | re.I).group(1),
            )
            if self.login():
                data = self._getData(posturl % pg, None, post.title)
                if not data:
                    self.logger.debug("got None or '', skip!")
                    continue
        tree = etree.fromstring(data, parser)
        posts = tree.xpath('//div[@id="postlist"]/div[starts-with(@id,"post_")]')
        for item in posts:
            replyid = item.attrib["id"][5:]  # drop the "post_" prefix
            assert replyid
            try:
                author = item.xpath('./table//td[@class="pls"]/div[@class="pi"]/div[@class="authi"]/a/text()')[0]
            except IndexError:
                author = item.xpath('./table//td[@class="pls"]/div[@class="pi"]/text()')[0].strip()
            assert author is not None
            # drop the fixed-width prefix before the date text
            crt_date = item.xpath('./table//div[@class="pti"]/div[@class="authi"]/em/text()')[0][4:]
            try:
                replycontent = item.xpath('//td[@id="postmessage_%s"]' % replyid)[0]
            except IndexError:
                # locked posts have no postmessage cell
                locked = item.xpath('./table//div[@class="pct"]/div[@class="pcb"]/div[@class="locked"]')
                if locked:
                    replycontent = htmlentitydecode(etree.tostring(locked[0])).strip()
                else:
                    raise
            else:
                assert replycontent is not None
                # remove element 'ad' and 'poststatus'
                for i in replycontent:
                    if "class" in i.attrib and i.attrib["class"] in ("a_pr", "pstatus"):
                        textlist = re.findall(
                            r"\A<%s.*?>.*?</%s>(.*?)\Z" % (i.tag, i.tag),
                            etree.tostring(i),
                            re.M | re.S | re.U,
                        )
                        textlist = [x for x in textlist if x.strip() != ""]
                        if len(textlist) > 0:
                            # body text trails the ad/status element; keep it behind a <br/>
                            remaintext = "<br />".join(textlist)
                            newelement = item.makeelement("br")
                            newelement.text = remaintext
                            replycontent.replace(i, newelement)
                        else:
                            replycontent.remove(i)
                replycontent = self.exclude_first_td_tag.match(
                    htmlentitydecode(etree.tostring(replycontent)).strip()
                ).group(1)
            try:
                r = Reply.objects.filter(post=post, locate_id=replyid)[0:1].get()
            except Reply.DoesNotExist:
                r = Reply(post=post, locate_id=replyid, crt_date=crt_date, author=author, content=replycontent)
                try:
                    r.save()
                except _mysql_exceptions.Warning:
                    self.logger.debug("got _mysql_exceptions.Warning!")
                    r.content = self.exclude_first_td_tag.match(etree.tostring(item[1]).strip()[:-5]).group(1)
                    if replycontent.startswith("<br/>"):
                        replycontent = replycontent[5:]
                    r.save()
                cnt += 1
    if new_nr != 0:
        if post.nr_reply + cnt == new_nr:
            self.logger.info("post %s %+d reply. now %d", post.locate_id, cnt, new_nr + 1)
            post.nr_reply += cnt  # add only the actual number of new replies
        else:
            self.logger.debug(
                "post %s %+d reply, %d != expect %d (no right new_nr info?)",
                post.locate_id, cnt, post.nr_reply + cnt, new_nr,
            )
            # verify how many replies we actually have in the DB
            actualcnt = Reply.objects.filter(post=post).count()
            self.logger.info("post %s actual %d reply in DB", post.locate_id, actualcnt)
            post.nr_reply = actualcnt - 1 if actualcnt - 1 >= 0 else 0
    else:
        if post.nr_reply + 1 == cnt:
            self.logger.info("post %s init %+d reply.", post.locate_id, cnt)
        else:
            self.logger.info("post %s init %+d reply, != expect %d", post.locate_id, cnt, post.nr_reply + 1)
        # store actual count - 1 so the next run looks for new replies again
        post.nr_reply = cnt - 1 if cnt - 1 >= 0 else 0
    post.save()
def getPostList(self, forum_name, sector_name, time_since, page_start, page_stop):
    self.forum_name = forum_name  # for login purpose
    upd = None
    cnt = 0
    try:
        forum = Forum.objects.filter(name=forum_name)[0:1].get()
    except Forum.DoesNotExist:
        self.logger.debug("can't find forum info of %s!", forum_name)
        raise
    try:
        sector = Sector.objects.filter(forum=forum, name=sector_name)[0:1].get()
    except Sector.DoesNotExist:
        self.logger.debug("can't find sector info of %s!", sector_name)
        raise
    for i in xrange(page_start, page_stop):  # max page number
        # whether this page yielded a new post or updated replies, so the loop
        # can stop early when a page contains nothing new
        gotnew_this_page = False
        if self.exitevent.is_set():
            self.logger.info('got exit signal!')
            break
        pageurl = sector.url % (i,)
        data = self._getData(pageurl, None, '%s-page %02d' % (sector.name, i))
        if not data or len(data) < 100:  # the forum is probably down
            self.logger.info('bbs down? %s', data)
            break
        # select the content div with an lxml xpath
        parser = etree.HTMLParser()
        tree = etree.fromstring(data, parser)
        el = tree.xpath(sector.url_pattern)
        for item in el:  # each topic in the list
            if self.exitevent.is_set():
                self.logger.info('got exit signal!')
                break
            # the icon name tells whether this is a sticky topic
            keeptop = 'folderstate3' in item[0].xpath('img/@src')[0]
            if keeptop:
                # avoid quitting too early when the first pages are all
                # sticky topics without updates
                gotnew_this_page = True
            try:
                try:
                    title = item[3].xpath('table/tr/td/a/text()')[0]
                except IndexError:
                    try:
                        title = item[3].xpath('table/tr/td/a/font/b/text()')[0]
                    except IndexError:
                        title = item[3].xpath('table/tr/td/a/font/text()')[0]
                # topic info
                title_url = item[3].xpath('table/tr/td/a/@href')[0]
                m = re.search(r'TopicID=(\d+)', title_url)
                if m:
                    lid = m.group(1)
                else:
                    self.logger.debug("can't find locate_id for post %s", title)
                    lid = ''
                try:
                    author = item[6].xpath('a/text()')[0]
                except IndexError:
                    author = ''
                crt_date = item[6].xpath('@title')[0][5:]  # drop the fixed prefix
                nr_reply = int(item[5].text)
                pages = int(ceil(nr_reply / float(self.NR_REPLY_PER_PAGE)))
                upd_date = item[8].text.strip()
                cnt += 1
                self.logger.info('%s\n%02d)\t%s|%s|%d-%d|%s|%s', '-=' * 10, cnt, lid,
                                 upd_date, nr_reply, pages, author, title.strip())
                try:
                    upd = datetime.datetime.strptime(upd_date, '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    self.logger.debug('upd_date=%s, no H:M:S?', upd_date)
                    upd = datetime.datetime.strptime(upd_date, '%Y-%m-%d')
                if time_since and upd < time_since and (not keeptop):
                    self.logger.info('%s\ngot time stop in page list.', '-~' * 20)
                    return
                if author == '':
                    author = 'usr for post %s' % (lid,)
                # check DB: insert if missing, otherwise check for updates
                try:
                    p = Post.objects.filter(sector=sector, locate_id=lid)[0:1].get()
                except Post.DoesNotExist:
                    # store nr_reply=0 in the DB first, so a failed fetch can
                    # simply be retried on the next run
                    p = Post(sector=sector, locate_id=lid, url=title_url, title=title, author=author,
                             crt_date=crt_date, upd_date=upd_date, nr_reply=0)
                    p.save()
                    self.stat_post_add += 1
                    p.nr_reply = nr_reply
                    self.logger.debug('post %s created.', lid)
                    self.getPost(p)
                    gotnew_this_page = True
                else:
                    if p.upd_date != upd:
                        p.upd_date = upd_date
                        p.save()
                        if p.nr_reply != nr_reply:
                            self.logger.info('post %s nr_reply changed. %+d', lid, nr_reply - p.nr_reply)
                            self.getPost(p, nr_reply)
                        gotnew_this_page = True
            except IndexError:
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                    continue
                else:
                    raise
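# The Forum / Sector / Post / Reply models referenced above live in the app's
# models.py, which is not part of this section. A minimal sketch of the fields
# this code relies on (field names come from usage here; types and lengths are
# assumptions):
#
#     from django.db import models
#
#     class Forum(models.Model):
#         name = models.CharField(max_length=100)
#
#     class Sector(models.Model):
#         forum = models.ForeignKey(Forum)
#         name = models.CharField(max_length=100)
#         base_url = models.CharField(max_length=255)
#         url = models.CharField(max_length=255)          # list-page URL template with a %d page slot
#         url_pattern = models.CharField(max_length=255)  # xpath selecting the topic rows
#
#     class Post(models.Model):
#         sector = models.ForeignKey(Sector)
#         locate_id = models.CharField(max_length=20)     # topic id on the remote forum
#         url = models.CharField(max_length=255)
#         title = models.CharField(max_length=255)
#         author = models.CharField(max_length=100)
#         crt_date = models.DateTimeField()
#         upd_date = models.DateTimeField()
#         nr_reply = models.IntegerField(default=0)
#
#     class Reply(models.Model):
#         post = models.ForeignKey(Post)
#         locate_id = models.CharField(max_length=20)     # reply id on the remote forum
#         crt_date = models.DateTimeField()
#         author = models.CharField(max_length=100)
#         content = models.TextField()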
def getOnePostSmart(self, post, from_page=1):
    '''Fetch the replies of a post. Paging stops when the page has no
    "next page" link, which is more reliable than deriving the page range
    from the reply count shown in the topic list.'''
    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = post.sector.base_url + post.url
    else:
        posturl = post.url
    posturl += '&TopicPage=%d'
    posturl = posturl.replace('topicdisplay.asp', 'topicdisplay_safe.asp')
    parser = etree.HTMLParser()
    pg = from_page
    while True:
        if self.exitevent.is_set():
            self.logger.info('got exit signal!')
            break
        self.logger.debug('post %s %s ...', post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        tree = etree.fromstring(data, parser)
        posts = tree.xpath('//table[@class="maintable"][1]/tr[position()>1]')
        haspostinpage = False
        gotfuckingword = False
        for item in posts:
            haspostinpage = True
            try:
                author = item[0].xpath('a/text()')[0]
            except IndexError:
                self.logger.debug('no author info?')
                author = ''
            if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                gotfuckingword = True
                continue
            try:
                crt_date = item[1].xpath('font/text()')[0][6:-1]  # strip prefix/suffix around the date
                # decide whether this row is the topic post; topic posts have no replyid
                tmp = item[1].xpath('*[position()<5]')
                if len(tmp) == 4 and [x.tag for x in tmp] == ['b', 'hr', 'font', 'hr']:
                    # topic post
                    replyid = 0
                    try:
                        realtitle = item[1].xpath('b/text()')[0]
                    except IndexError:
                        try:
                            realtitle = item[1].xpath('b/font/b/text()')[0]
                        except IndexError:
                            realtitle = item[1].xpath('b/font/text()')[0]
                    if post.title != realtitle:
                        self.logger.debug('post %s realtitle %s', post.locate_id, realtitle)
                        post.title = realtitle
                    # strip non-reply parts (title / separators / post time)
                    # before extracting the reply content
                    for x in item[1].xpath('*[position()<5]'):
                        item[1].remove(x)
                    item[1].text = ''
                else:
                    # reply post
                    replyurl = item[1].xpath('font/a[1]/@href')[0]
                    replyid = re.search(r'ReplyID=(\d+)', replyurl).group(1)
                    # strip non-reply parts (separators / post time)
                    for x in item[1].xpath('*[position()<3]'):
                        item[1].remove(x)
                replycontent = self.exclude_first_td_tag.match(htmlentitydecode(etree.tostring(item[1])).strip()).group(1)
                if replycontent.startswith('<br/>'):
                    replycontent = replycontent[5:]
                if author == '':
                    author = 'usr for %d-%s' % (post.id, replyid)
                try:
                    r = Reply.objects.filter(post=post, locate_id=replyid)[0:1].get()
                except Reply.DoesNotExist:
                    r = Reply(post=post, locate_id=replyid, crt_date=crt_date, author=author, content=replycontent)
                    try:
                        r.save()
                    except _mysql_exceptions.Warning:
                        self.logger.debug('got _mysql_exceptions.Warning!')
                        r.content = self.exclude_first_td_tag.match(etree.tostring(item[1]).strip()[:-5]).group(1)
                        if replycontent.startswith('<br/>'):
                            replycontent = replycontent[5:]
                        r.save()
                    cnt += 1
            except IndexError:
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                    gotfuckingword = True
                    continue
                else:
                    raise
            except AttributeError:
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                    gotfuckingword = True
                    continue
                else:
                    raise
        # check for a "next page" link
        x = tree.xpath('//td[@class="outtd"][1]/table[2]/tr[1]/td[2]/a')
        if ('[>]' in [t.text for t in x]) and haspostinpage:
            pg += 1
        elif gotfuckingword:
            # the page was mangled by the keyword filter; try the next one anyway
            pg += 1
        else:
            break
    self.logger.debug('post %s %+d reply', post.locate_id, cnt)
    # verify how many replies we actually have in the DB
    actualcnt = Reply.objects.filter(post=post).count()
    self.logger.debug('post %s actual %d reply in DB', post.locate_id, actualcnt)
    post.nr_reply = actualcnt - 1 if actualcnt - 1 >= 0 else 0
    post.save()
def getPost(self, post, new_nr=0):
    '''Fetch the replies of a post. The page range is derived from the
    reply count shown in the topic list.'''
    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = post.sector.base_url + post.url
    else:
        posturl = post.url
    posturl += '&TopicPage=%d'
    posturl = posturl.replace('topicdisplay.asp', 'topicdisplay_safe.asp')
    startpage, stoppage = self.getPageRange(self.NR_REPLY_PER_PAGE,
                                            0 if post.nr_reply - 1 < 0 else post.nr_reply - 1,
                                            0 if new_nr == 0 else new_nr - 1)
    self.logger.debug('post %s page range [%d,%d)', post.locate_id, startpage, stoppage)
    parser = etree.HTMLParser()
    for pg in xrange(startpage, stoppage):
        if self.exitevent.is_set():
            self.logger.info('got exit signal!')
            break
        self.logger.debug('post %s %s ...', post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        tree = etree.fromstring(data, parser)
        posts = tree.xpath('//table[@class="maintable"][1]/tr[position()>1]')
        for item in posts:
            try:
                author = item[0].xpath('a/text()')[0]
            except IndexError:
                self.logger.debug('no author info?')
                author = ''
            if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                continue
            try:
                crt_date = item[1].xpath('font/text()')[0][6:-1]  # strip prefix/suffix around the date
                # decide whether this row is the topic post; topic posts have no replyid
                tmp = item[1].xpath('*[position()<5]')
                if len(tmp) == 4 and [x.tag for x in tmp] == ['b', 'hr', 'font', 'hr']:
                    # topic post
                    replyid = 0
                    try:
                        realtitle = item[1].xpath('b/text()')[0]
                    except IndexError:
                        try:
                            realtitle = item[1].xpath('b/font/b/text()')[0]
                        except IndexError:
                            realtitle = item[1].xpath('b/font/text()')[0]
                    if post.title != realtitle:
                        self.logger.info('post %s realtitle %s', post.locate_id, realtitle)
                        post.title = realtitle
                        post.save()
                    # strip non-reply parts (title / separators / post time)
                    # before extracting the reply content
                    for x in item[1].xpath('*[position()<5]'):
                        item[1].remove(x)
                    item[1].text = ''
                else:
                    # reply post
                    replyurl = item[1].xpath('font/a[1]/@href')[0]
                    replyid = re.search(r'ReplyID=(\d+)', replyurl).group(1)
                    # strip non-reply parts (separators / post time)
                    for x in item[1].xpath('*[position()<3]'):
                        item[1].remove(x)
                replycontent = self.exclude_first_td_tag.match(htmlentitydecode(etree.tostring(item[1])).strip()).group(1)
                if replycontent.startswith('<br/>'):
                    replycontent = replycontent[5:]
                if author == '':
                    author = 'usr for %d-%s' % (post.id, replyid)
                try:
                    r = Reply.objects.filter(post=post, locate_id=replyid)[0:1].get()
                except Reply.DoesNotExist:
                    r = Reply(post=post, locate_id=replyid, crt_date=crt_date, author=author, content=replycontent)
                    try:
                        r.save()
                    except _mysql_exceptions.Warning:
                        self.logger.debug('got _mysql_exceptions.Warning!')
                        r.content = self.exclude_first_td_tag.match(etree.tostring(item[1]).strip()[:-5]).group(1)
                        if replycontent.startswith('<br/>'):
                            replycontent = replycontent[5:]
                        r.save()
                    cnt += 1
            except IndexError:
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                    continue
                else:
                    raise
            except AttributeError:
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                    continue
                else:
                    raise
    if new_nr != 0:
        if post.nr_reply + cnt == new_nr:
            self.logger.info('post %s %+d reply. now %d', post.locate_id, cnt, new_nr + 1)
            post.nr_reply += cnt  # add only the actual number of new replies
        else:
            self.logger.debug('post %s %+d reply, %d != expect %d (no right new_nr info?)',
                              post.locate_id, cnt, post.nr_reply + cnt, new_nr)
            # verify how many replies we actually have in the DB
            actualcnt = Reply.objects.filter(post=post).count()
            self.logger.info('post %s actual %d reply in DB', post.locate_id, actualcnt)
            post.nr_reply = actualcnt - 1 if actualcnt - 1 >= 0 else 0
    else:
        if post.nr_reply + 1 == cnt:
            self.logger.info('post %s init %+d reply.', post.locate_id, cnt)
        else:
            self.logger.info('post %s init %+d reply, != expect %d', post.locate_id, cnt, post.nr_reply + 1)
        # store actual count - 1 so the next run looks for new replies again
        post.nr_reply = cnt - 1 if cnt - 1 >= 0 else 0
    post.save()
    self.stat_reply_add += cnt
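# _getData() is used by every fetcher above but defined elsewhere. A rough
# sketch of the expected contract (retry count, timeout and error handling are
# assumptions, not the original implementation): GET `url` (or POST when
# `postdata` is given), tagging log lines with `tag`, and return the page body
# as a string, or '' on failure:
#
#     import urllib2
#
#     def _getData(self, url, postdata, tag, retries=3):
#         for _ in xrange(retries):
#             try:
#                 return urllib2.urlopen(url, postdata, 30).read()  # 30s timeout
#             except (urllib2.URLError, IOError) as e:
#                 self.logger.debug('%s fetch failed: %s', tag, e)
#         return ''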
def getPost(self, post, new_nr=0):
    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = urlparse.urljoin(post.sector.base_url,
                                   'showtopic.aspx?topicid=%s&page=%%d' % post.locate_id)
    else:
        posturl = post.url
    startpage, stoppage = self.getPageRange(self.NR_REPLY_PER_PAGE, post.nr_reply, new_nr)
    self.logger.debug('page range [%d,%d) for post %s', startpage, stoppage, post.locate_id)
    parser = etree.HTMLParser()
    for pg in xrange(startpage, stoppage):
        self.logger.debug('post %s %s ...', post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        tree = etree.fromstring(data, parser)
        pl = tree.xpath('//div[@id="postsContainer"]/table')
        for p in pl:
            if p.xpath('@class') and p.xpath('@class')[0] == 'plh':
                # the "plh" header table carries the real title, not a reply
                realtitle = p.xpath('./tbody/tr/td[@class="posttopic"]/h1[@class="ts z"]/span/text()')[0]
                if realtitle and post.title != realtitle:
                    self.logger.debug('realtitle for post %s|%s', post.locate_id, realtitle)
                    post.title = realtitle
                continue
            author = p.xpath('./tbody[1]/tr[1]/td[@class="postauthor"]/div[@class="poster"]/span/text()')[0]
            replyid = p.xpath('./@id')[0]
            if p.xpath('//div[@id="message%s"]/div[@id="firstpost"]' % replyid):
                replycontent = htmlentitydecode(etree.tostring(
                    p.xpath('//div[@id="message%s"]/div[@id="firstpost"]' % replyid)[0])).strip()
            else:
                replycontent = htmlentitydecode(etree.tostring(
                    p.xpath('//div[@id="message%s"]' % replyid)[0])).strip()
            replycontent = self.exclude_first_div_tag.match(replycontent).group(1).strip()
            crt_date = p.xpath('//div[@class="postinfo"]/em/span/@title')[0]
            try:
                r = Reply.objects.filter(post=post, author=author, locate_id=replyid)[0:1].get()
            except Reply.DoesNotExist:
                r = Reply(post=post, locate_id=replyid, crt_date=crt_date, author=author, content=replycontent)
                r.save()
                cnt += 1
    if new_nr != 0:
        if post.nr_reply + cnt == new_nr:
            self.logger.info('post %s %+d reply. now %d', post.locate_id, cnt, new_nr + 1)
            post.nr_reply += cnt  # add only the actual number of new replies
        else:
            self.logger.debug('post %s %+d reply, %d != expect %d', post.locate_id, cnt, post.nr_reply + cnt, new_nr)
            # verify how many replies we actually have in the DB
            actualcnt = Reply.objects.filter(post=post).count()
            self.logger.info('post %s actual %d reply in DB', post.locate_id, actualcnt)
            post.nr_reply = actualcnt - 1 if actualcnt - 1 >= 0 else 0
    else:
        if post.nr_reply + 1 == cnt:
            self.logger.info('post %s init %+d reply.', post.locate_id, cnt)
        else:
            self.logger.info('post %s init %+d reply, != expect %d', post.locate_id, cnt, post.nr_reply + 1)
        # store actual count - 1 so the next run looks for new replies again
        post.nr_reply = cnt - 1 if cnt - 1 >= 0 else 0
    post.save()
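# The exclude_first_td_tag / exclude_first_div_tag / exclude_first_a_tag
# patterns used throughout strip the outermost wrapper element from a
# serialized fragment, keeping only its inner HTML in group(1). They are not
# defined in this section; a plausible sketch (an assumption, not the original
# definitions):
#
#     import re
#
#     exclude_first_td_tag = re.compile(r'\A<td[^>]*>(.*)</td>\s*\Z', re.M | re.S | re.U)
#     exclude_first_div_tag = re.compile(r'\A<div[^>]*>(.*)</div>\s*\Z', re.M | re.S | re.U)
#     exclude_first_a_tag = re.compile(r'\A<a[^>]*>(.*)</a>\s*\Z', re.M | re.S | re.U)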
def parse(self, url, data):
    info, debug = self.logger.info, self.logger.debug
    pg, rurl, code = data
    rslt = None
    if url != rurl:
        self.url_scheduler.updUrl(rurl)
    sr = urlparse.urlsplit(rurl)
    rurl = unicode(rurl)
    # keep the url only if its host is allowed AND it either matches an
    # allow pattern or matches no skip pattern
    if unicode(sr.netloc) in self.allowHost:
        if any(x.search(rurl) for x in self.pAllowPattern) or not any(x.search(rurl) for x in self.pSkipPattern):
            pass
        else:
            return rslt
    else:
        return rslt
    tree = etree.fromstring(pg, self.htmlparser)
    # get charset
    charset = 'utf-8'
    element = tree.xpath('/html/head/meta[@http-equiv="Content-Type"]')
    if element:
        charset = element[0].xpath('@content')[0]
        m = re.search(r'charset=([a-z0-9-]+)', charset, re.I)
        if m:
            charset = m.group(1)
        else:
            info('no charset found in Content-Type: %s', etree.tostring(element[0]))
    elif tree.xpath('/html/head/meta[@lang]'):
        charset = tree.xpath('/html/head/meta[@lang]/@lang')[0]
    # get title
    element = tree.xpath('/html/head/title')
    if element:
        title = None
        try:
            title = element[0].xpath('./text()')[0]
        except UnicodeDecodeError as e:
            # the declared charset was wrong; retry with common CJK encodings
            for x in ('utf-8', 'gb18030', 'gbk', 'gb2312', 'big5'):
                if charset != x:
                    tmptree = etree.fromstring(pg, etree.HTMLParser(encoding=x))
                    try:
                        title = tmptree.xpath('/html/head/title/text()')[0]
                        break
                    except UnicodeDecodeError:
                        pass
            if not title:
                info('charset=%s, UnicodeDecodeError! %s', charset, e)
                open('/home/kevin/tmp.html', 'w').write(pg)
                self.shutdown.set()
                return rslt
            else:
                info('%s|%s get title using %s instead of %s!', title, rurl, x, charset)
    else:
        title = None
    al = tree.xpath('//a[@href]')
    # determine html type: xhtml (div-based) or html (table-based)
    isXhtml = False
    xmlns = tree.attrib.get('xmlns', None)
    if xmlns and xmlns.find('xhtml') != -1:
        isXhtml = True
    # collect page metadata
    element = tree.xpath('/html/head/meta[@name="keywords"]')
    if element:
        keywords = [x.strip() for x in element[0].xpath('@content')[0].split(',')]
    element = tree.xpath('/html/head/meta[@name="description"]')
    if element:
        desc = element[0].xpath('@content')[0]
    # robots meta values:
    #   all:      index the page and follow its links
    #   none:     same as "noindex, nofollow"
    #   index:    allow the page to be indexed
    #   follow:   links on the page may be followed
    #   noindex:  don't index the page, but its links may still be followed
    #   nofollow: don't follow links from this page
    element = tree.xpath('/html/head/meta[@name="robots"]')
    if element:
        robot_opts = [x.strip() for x in element[0].xpath('@content')[0].split(',')]
    nr_a = len(al)
    textlist = tree.xpath('//*[local-name()!="script" and local-name()!="style" and local-name()!="a" and local-name()!="input"]//text()')
    nr_text = len('\n'.join(x.strip() for x in textlist))
    v_page = float(nr_a) / nr_text if nr_text else 0.0  # link-to-text density
    # try to find body text
    if isXhtml:
        tag = 'div'
    else:
        if len(tree.xpath('/html/body//table')) > len(tree.xpath('/html/body//div')):
            tag = 'table'
        else:
            tag = 'div'
    elem = tree.xpath('/html/body')[0]
    info('\n\n%s', '--' * 30)
    self.besttag, self.bestvalue = None, 0
    self.getBestBodyText(elem, tag)
    if self.besttag is not None:
        bodytext = self.exclude_first_div_tag.match(htmlentitydecode(etree.tostring(self.besttag)).strip()).group(1)
        nr_bodytext = len(bodytext)
        info('%-5d|%s|%s|%.3f|%s|%s\n%s\n\n……………………\n%s\n%s', nr_bodytext, title, isXhtml, v_page, tag, rurl,
             bodytext[:int(0.2 * nr_bodytext)], bodytext[int(nr_bodytext * 0.8):], '-=' * 30)
    rslt = set()
    for item in al:
        a = item.xpath('@href')[0]
        try:
            atext = item.xpath('text()')[0]
        except IndexError:
            if item.xpath('./img[@alt]'):
                # use the image alt text as the anchor text
                atext = item.xpath('./img[@alt]/@alt')[0]
            else:
                try:
                    atext = self.exclude_first_a_tag.match(htmlentitydecode(etree.tostring(item)).strip()).group(1)
                except AttributeError:
                    el = item.xpath('*')
                    atext = ''.join(htmlentitydecode(etree.tostring(i)) for i in el)
        atext = atext.strip()
        sr = urlparse.urlsplit(a)
        scheme, netloc = sr.scheme, sr.netloc
        if not scheme:
            scheme = 'http'
        if scheme in ('javascript', 'mailto', 'ftp', 'file', 'hthttp'):
            continue
        if scheme not in ('http', 'https'):
            info('scheme %s unknown!', scheme)
        if not netloc:
            element = tree.xpath('/html/head/base/@href')
            if element:
                # the page declares a base url
                baseurl = element[0]
                netloc = baseurl
            else:
                bsr = urlparse.urlsplit(rurl)
                if bsr.netloc:
                    netloc = bsr.netloc
                else:
                    raise StandardError('no netloc found!')
        # rebuild the url, dropping the fragment
        fullurl = urlparse.urlunsplit((scheme, netloc, sr.path, sr.query, ''))
        fullurl = unicode(fullurl)
        # host is allowed AND the url matches an allow pattern or no skip pattern
        if unicode(netloc) in self.allowHost:
            if any(x.search(fullurl) for x in self.pAllowPattern) or not any(x.search(fullurl) for x in self.pSkipPattern):
                rslt.add((atext, fullurl))
    return rslt
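# htmlentitydecode() is imported from elsewhere; a minimal sketch of a
# compatible helper (an assumption, not the original), turning named and
# numeric HTML entities back into unicode characters:
#
#     import re
#     from htmlentitydefs import name2codepoint
#
#     def htmlentitydecode(s):
#         def _sub(m):
#             ent = m.group(1)
#             if ent.startswith('#x') or ent.startswith('#X'):
#                 return unichr(int(ent[2:], 16))  # hex numeric entity
#             if ent.startswith('#'):
#                 return unichr(int(ent[1:]))      # decimal numeric entity
#             if ent in name2codepoint:            # named entity, e.g. &nbsp;
#                 return unichr(name2codepoint[ent])
#             return m.group(0)                    # leave unknown entities alone
#         return re.sub(r'&(#?[xX]?[0-9a-zA-Z]+);', _sub, s)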