def getPost(self, post, new_nr=0):
    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = post.sector.base_url + post.url
    else:
        posturl = post.url
    # rewrite "thread-<id>-<page>-<x>.html" into a template with a %d page slot
    posturl = "%d".join(re.findall(r"(.*?thread-\d+-)\d+(-.*?\.html)", posturl)[0])
    startpage, stoppage = self.getPageRange(self.NR_REPLY_PER_PAGE, post.nr_reply, new_nr)
    self.logger.debug("post %s page range [%d,%d)", post.locate_id, startpage, stoppage)
    parser = etree.HTMLParser()
    for pg in xrange(startpage, stoppage):
        self.logger.debug("post %s %s ...", post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        if not data:
            self.logger.debug("got None or '', skip!")
            continue
        # check authentication. some posts need high permission to view.
        if data.find("超级大本营军事论坛 提示信息") != -1 and data.find("您无权进行当前操作,原因如下:") != -1:
            self.logger.debug(
                "got err %s",
                re.search('<div class="alert_error">(.*?)</div>', data, re.M | re.S | re.I).group(1),
            )
            if self.login():
                data = self._getData(posturl % pg, None, post.title)
                if not data:
                    self.logger.debug("got None or '', skip!")
                    continue
        tree = etree.fromstring(data, parser)
        posts = tree.xpath('//div[@id="postlist"]/div[starts-with(@id,"post_")]')
        for item in posts:
            replyid = item.attrib["id"][5:]  # drop the "post_" prefix
            assert replyid
            try:
                author = item.xpath('./table//td[@class="pls"]/div[@class="pi"]/div[@class="authi"]/a/text()')[0]
            except IndexError:
                author = item.xpath('./table//td[@class="pls"]/div[@class="pi"]/text()')[0].strip()
            assert author is not None
            # drop the fixed-width prefix before the date text
            crt_date = item.xpath('./table//div[@class="pti"]/div[@class="authi"]/em/text()')[0][4:]
            try:
                replycontent = item.xpath('//td[@id="postmessage_%s"]' % replyid)[0]
            except IndexError:
                # locked posts have no postmessage cell
                locked = item.xpath('./table//div[@class="pct"]/div[@class="pcb"]/div[@class="locked"]')
                if locked:
                    replycontent = htmlentitydecode(etree.tostring(locked[0])).strip()
                else:
                    raise
            else:
                assert replycontent is not None
                # remove element 'ad' and 'poststatus'
                for i in replycontent:
                    if "class" in i.attrib and i.attrib["class"] in ("a_pr", "pstatus"):
                        textlist = re.findall(
                            r"\A<%s.*?>.*?</%s>(.*?)\Z" % (i.tag, i.tag),
                            etree.tostring(i),
                            re.M | re.S | re.U,
                        )
                        textlist = [x for x in textlist if x.strip() != ""]
                        if len(textlist) > 0:
                            # body text trails the ad/status element; keep it behind a <br/>
                            remaintext = "<br />".join(textlist)
                            newelement = item.makeelement("br")
                            newelement.text = remaintext
                            replycontent.replace(i, newelement)
                        else:
                            replycontent.remove(i)
                replycontent = self.exclude_first_td_tag.match(
                    htmlentitydecode(etree.tostring(replycontent)).strip()
                ).group(1)
            try:
                r = Reply.objects.filter(post=post, locate_id=replyid)[0:1].get()
            except Reply.DoesNotExist:
                r = Reply(post=post, locate_id=replyid, crt_date=crt_date, author=author, content=replycontent)
                try:
                    r.save()
                except _mysql_exceptions.Warning:
                    self.logger.debug("got _mysql_exceptions.Warning!")
                    r.content = self.exclude_first_td_tag.match(etree.tostring(item[1]).strip()[:-5]).group(1)
                    if replycontent.startswith("<br/>"):
                        replycontent = replycontent[5:]
                    r.save()
                cnt += 1
    if new_nr != 0:
        if post.nr_reply + cnt == new_nr:
            self.logger.info("post %s %+d reply. now %d", post.locate_id, cnt, new_nr + 1)
            post.nr_reply += cnt  # add only the actual number of new replies
        else:
            self.logger.debug(
                "post %s %+d reply, %d != expect %d (no right new_nr info?)",
                post.locate_id, cnt, post.nr_reply + cnt, new_nr,
            )
            # verify how many replies we actually have in the DB
            actualcnt = Reply.objects.filter(post=post).count()
            self.logger.info("post %s actual %d reply in DB", post.locate_id, actualcnt)
            post.nr_reply = actualcnt - 1 if actualcnt - 1 >= 0 else 0
    else:
        if post.nr_reply + 1 == cnt:
            self.logger.info("post %s init %+d reply.", post.locate_id, cnt)
        else:
            self.logger.info("post %s init %+d reply, != expect %d", post.locate_id, cnt, post.nr_reply + 1)
        # store actual count - 1 so the next run looks for new replies again
        post.nr_reply = cnt - 1 if cnt - 1 >= 0 else 0
    post.save()
def getPostList(self, forum_name, sector_name, time_since, page_start, page_stop):
    self.forum_name = forum_name  # for login purpose
    upd = None
    cnt = 0
    try:
        forum = Forum.objects.filter(name=forum_name)[0:1].get()
    except Forum.DoesNotExist:
        self.logger.debug("can't find forum info of %s!", forum_name)
        raise
    try:
        sector = Sector.objects.filter(forum=forum, name=sector_name)[0:1].get()
    except Sector.DoesNotExist:
        self.logger.debug("can't find sector info of %s!", sector_name)
        raise
    for i in xrange(page_start, page_stop):  # max page number
        # whether this page yielded a new post or updated replies, so the loop
        # can stop early when a page contains nothing new
        gotnew_this_page = False
        if self.exitevent.is_set():
            self.logger.info('got exit signal!')
            break
        pageurl = sector.url % (i,)
        data = self._getData(pageurl, None, '%s-page %02d' % (sector.name, i))
        if not data or len(data) < 100:  # the forum is probably down
            self.logger.info('bbs down? %s', data)
            break
        # select the content div with an lxml xpath
        parser = etree.HTMLParser()
        tree = etree.fromstring(data, parser)
        el = tree.xpath(sector.url_pattern)
        for item in el:  # each topic in the list
            if self.exitevent.is_set():
                self.logger.info('got exit signal!')
                break
            # the icon name tells whether this is a sticky topic
            keeptop = 'folderstate3' in item[0].xpath('img/@src')[0]
            if keeptop:
                # avoid quitting too early when the first pages are all
                # sticky topics without updates
                gotnew_this_page = True
            try:
                try:
                    title = item[3].xpath('table/tr/td/a/text()')[0]
                except IndexError:
                    try:
                        title = item[3].xpath('table/tr/td/a/font/b/text()')[0]
                    except IndexError:
                        title = item[3].xpath('table/tr/td/a/font/text()')[0]
                # topic info
                title_url = item[3].xpath('table/tr/td/a/@href')[0]
                m = re.search(r'TopicID=(\d+)', title_url)
                if m:
                    lid = m.group(1)
                else:
                    self.logger.debug("can't find locate_id for post %s", title)
                    lid = ''
                try:
                    author = item[6].xpath('a/text()')[0]
                except IndexError:
                    author = ''
                crt_date = item[6].xpath('@title')[0][5:]  # drop the fixed prefix
                nr_reply = int(item[5].text)
                pages = int(ceil(nr_reply / float(self.NR_REPLY_PER_PAGE)))
                upd_date = item[8].text.strip()
                cnt += 1
                self.logger.info('%s\n%02d)\t%s|%s|%d-%d|%s|%s', '-=' * 10, cnt, lid,
                                 upd_date, nr_reply, pages, author, title.strip())
                try:
                    upd = datetime.datetime.strptime(upd_date, '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    self.logger.debug('upd_date=%s, no H:M:S?', upd_date)
                    upd = datetime.datetime.strptime(upd_date, '%Y-%m-%d')
                if time_since and upd < time_since and (not keeptop):
                    self.logger.info('%s\ngot time stop in page list.', '-~' * 20)
                    return
                if author == '':
                    author = 'usr for post %s' % (lid,)
                # check DB: insert if missing, otherwise check for updates
                try:
                    p = Post.objects.filter(sector=sector, locate_id=lid)[0:1].get()
                except Post.DoesNotExist:
                    # store nr_reply=0 in the DB first, so a failed fetch can
                    # simply be retried on the next run
                    p = Post(sector=sector, locate_id=lid, url=title_url, title=title, author=author,
                             crt_date=crt_date, upd_date=upd_date, nr_reply=0)
                    p.save()
                    self.stat_post_add += 1
                    p.nr_reply = nr_reply
                    self.logger.debug('post %s created.', lid)
                    self.getPost(p)
                    gotnew_this_page = True
                else:
                    if p.upd_date != upd:
                        p.upd_date = upd_date
                        p.save()
                        if p.nr_reply != nr_reply:
                            self.logger.info('post %s nr_reply changed. %+d', lid, nr_reply - p.nr_reply)
                            self.getPost(p, nr_reply)
                        gotnew_this_page = True
            except IndexError:
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                    continue
                else:
                    raise
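# The Forum / Sector / Post / Reply models referenced above live in the app's
# models.py, which is not part of this section. A minimal sketch of the fields
# this code relies on (field names come from usage here; types and lengths are
# assumptions):
#
#     from django.db import models
#
#     class Forum(models.Model):
#         name = models.CharField(max_length=100)
#
#     class Sector(models.Model):
#         forum = models.ForeignKey(Forum)
#         name = models.CharField(max_length=100)
#         base_url = models.CharField(max_length=255)
#         url = models.CharField(max_length=255)          # list-page URL template with a %d page slot
#         url_pattern = models.CharField(max_length=255)  # xpath selecting the topic rows
#
#     class Post(models.Model):
#         sector = models.ForeignKey(Sector)
#         locate_id = models.CharField(max_length=20)     # topic id on the remote forum
#         url = models.CharField(max_length=255)
#         title = models.CharField(max_length=255)
#         author = models.CharField(max_length=100)
#         crt_date = models.DateTimeField()
#         upd_date = models.DateTimeField()
#         nr_reply = models.IntegerField(default=0)
#
#     class Reply(models.Model):
#         post = models.ForeignKey(Post)
#         locate_id = models.CharField(max_length=20)     # reply id on the remote forum
#         crt_date = models.DateTimeField()
#         author = models.CharField(max_length=100)
#         content = models.TextField()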
def getOnePostSmart(self, post, from_page=1):
    '''Fetch the replies of a post. Paging stops when the page has no
    "next page" link, which is more reliable than deriving the page range
    from the reply count shown in the topic list.'''
    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = post.sector.base_url + post.url
    else:
        posturl = post.url
    posturl += '&TopicPage=%d'
    posturl = posturl.replace('topicdisplay.asp', 'topicdisplay_safe.asp')
    parser = etree.HTMLParser()
    pg = from_page
    while True:
        if self.exitevent.is_set():
            self.logger.info('got exit signal!')
            break
        self.logger.debug('post %s %s ...', post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        tree = etree.fromstring(data, parser)
        posts = tree.xpath('//table[@class="maintable"][1]/tr[position()>1]')
        haspostinpage = False
        gotfuckingword = False
        for item in posts:
            haspostinpage = True
            try:
                author = item[0].xpath('a/text()')[0]
            except IndexError:
                self.logger.debug('no author info?')
                author = ''
            if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                gotfuckingword = True
                continue
            try:
                crt_date = item[1].xpath('font/text()')[0][6:-1]  # strip prefix/suffix around the date
                # decide whether this row is the topic post; topic posts have no replyid
                tmp = item[1].xpath('*[position()<5]')
                if len(tmp) == 4 and [x.tag for x in tmp] == ['b', 'hr', 'font', 'hr']:
                    # topic post
                    replyid = 0
                    try:
                        realtitle = item[1].xpath('b/text()')[0]
                    except IndexError:
                        try:
                            realtitle = item[1].xpath('b/font/b/text()')[0]
                        except IndexError:
                            realtitle = item[1].xpath('b/font/text()')[0]
                    if post.title != realtitle:
                        self.logger.debug('post %s realtitle %s', post.locate_id, realtitle)
                        post.title = realtitle
                    # strip non-reply parts (title / separators / post time)
                    # before extracting the reply content
                    for x in item[1].xpath('*[position()<5]'):
                        item[1].remove(x)
                    item[1].text = ''
                else:
                    # reply post
                    replyurl = item[1].xpath('font/a[1]/@href')[0]
                    replyid = re.search(r'ReplyID=(\d+)', replyurl).group(1)
                    # strip non-reply parts (separators / post time)
                    for x in item[1].xpath('*[position()<3]'):
                        item[1].remove(x)
                replycontent = self.exclude_first_td_tag.match(htmlentitydecode(etree.tostring(item[1])).strip()).group(1)
                if replycontent.startswith('<br/>'):
                    replycontent = replycontent[5:]
                if author == '':
                    author = 'usr for %d-%s' % (post.id, replyid)
                try:
                    r = Reply.objects.filter(post=post, locate_id=replyid)[0:1].get()
                except Reply.DoesNotExist:
                    r = Reply(post=post, locate_id=replyid, crt_date=crt_date, author=author, content=replycontent)
                    try:
                        r.save()
                    except _mysql_exceptions.Warning:
                        self.logger.debug('got _mysql_exceptions.Warning!')
                        r.content = self.exclude_first_td_tag.match(etree.tostring(item[1]).strip()[:-5]).group(1)
                        if replycontent.startswith('<br/>'):
                            replycontent = replycontent[5:]
                        r.save()
                    cnt += 1
            except IndexError:
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                    gotfuckingword = True
                    continue
                else:
                    raise
            except AttributeError:
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                    gotfuckingword = True
                    continue
                else:
                    raise
        # check for a "next page" link
        x = tree.xpath('//td[@class="outtd"][1]/table[2]/tr[1]/td[2]/a')
        if ('[>]' in [t.text for t in x]) and haspostinpage:
            pg += 1
        elif gotfuckingword:
            # the page was mangled by the keyword filter; try the next one anyway
            pg += 1
        else:
            break
    self.logger.debug('post %s %+d reply', post.locate_id, cnt)
    # verify how many replies we actually have in the DB
    actualcnt = Reply.objects.filter(post=post).count()
    self.logger.debug('post %s actual %d reply in DB', post.locate_id, actualcnt)
    post.nr_reply = actualcnt - 1 if actualcnt - 1 >= 0 else 0
    post.save()
def getPost(self, post, new_nr=0):
    '''Fetch the replies of a post. The page range is derived from the
    reply count shown in the topic list.'''
    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = post.sector.base_url + post.url
    else:
        posturl = post.url
    posturl += '&TopicPage=%d'
    posturl = posturl.replace('topicdisplay.asp', 'topicdisplay_safe.asp')
    startpage, stoppage = self.getPageRange(self.NR_REPLY_PER_PAGE,
                                            0 if post.nr_reply - 1 < 0 else post.nr_reply - 1,
                                            0 if new_nr == 0 else new_nr - 1)
    self.logger.debug('post %s page range [%d,%d)', post.locate_id, startpage, stoppage)
    parser = etree.HTMLParser()
    for pg in xrange(startpage, stoppage):
        if self.exitevent.is_set():
            self.logger.info('got exit signal!')
            break
        self.logger.debug('post %s %s ...', post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        tree = etree.fromstring(data, parser)
        posts = tree.xpath('//table[@class="maintable"][1]/tr[position()>1]')
        for item in posts:
            try:
                author = item[0].xpath('a/text()')[0]
            except IndexError:
                self.logger.debug('no author info?')
                author = ''
            if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                continue
            try:
                crt_date = item[1].xpath('font/text()')[0][6:-1]  # strip prefix/suffix around the date
                # decide whether this row is the topic post; topic posts have no replyid
                tmp = item[1].xpath('*[position()<5]')
                if len(tmp) == 4 and [x.tag for x in tmp] == ['b', 'hr', 'font', 'hr']:
                    # topic post
                    replyid = 0
                    try:
                        realtitle = item[1].xpath('b/text()')[0]
                    except IndexError:
                        try:
                            realtitle = item[1].xpath('b/font/b/text()')[0]
                        except IndexError:
                            realtitle = item[1].xpath('b/font/text()')[0]
                    if post.title != realtitle:
                        self.logger.info('post %s realtitle %s', post.locate_id, realtitle)
                        post.title = realtitle
                        post.save()
                    # strip non-reply parts (title / separators / post time)
                    # before extracting the reply content
                    for x in item[1].xpath('*[position()<5]'):
                        item[1].remove(x)
                    item[1].text = ''
                else:
                    # reply post
                    replyurl = item[1].xpath('font/a[1]/@href')[0]
                    replyid = re.search(r'ReplyID=(\d+)', replyurl).group(1)
                    # strip non-reply parts (separators / post time)
                    for x in item[1].xpath('*[position()<3]'):
                        item[1].remove(x)
                replycontent = self.exclude_first_td_tag.match(htmlentitydecode(etree.tostring(item[1])).strip()).group(1)
                if replycontent.startswith('<br/>'):
                    replycontent = replycontent[5:]
                if author == '':
                    author = 'usr for %d-%s' % (post.id, replyid)
                try:
                    r = Reply.objects.filter(post=post, locate_id=replyid)[0:1].get()
                except Reply.DoesNotExist:
                    r = Reply(post=post, locate_id=replyid, crt_date=crt_date, author=author, content=replycontent)
                    try:
                        r.save()
                    except _mysql_exceptions.Warning:
                        self.logger.debug('got _mysql_exceptions.Warning!')
                        r.content = self.exclude_first_td_tag.match(etree.tostring(item[1]).strip()[:-5]).group(1)
                        if replycontent.startswith('<br/>'):
                            replycontent = replycontent[5:]
                        r.save()
                    cnt += 1
            except IndexError:
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                    continue
                else:
                    raise
            except AttributeError:
                if '浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤' in htmlentitydecode(etree.tostring(item)):
                    self.logger.info('got page contains filtered words! %s', htmlentitydecode(etree.tostring(item)))
                    continue
                else:
                    raise
    if new_nr != 0:
        if post.nr_reply + cnt == new_nr:
            self.logger.info('post %s %+d reply. now %d', post.locate_id, cnt, new_nr + 1)
            post.nr_reply += cnt  # add only the actual number of new replies
        else:
            self.logger.debug('post %s %+d reply, %d != expect %d (no right new_nr info?)',
                              post.locate_id, cnt, post.nr_reply + cnt, new_nr)
            # verify how many replies we actually have in the DB
            actualcnt = Reply.objects.filter(post=post).count()
            self.logger.info('post %s actual %d reply in DB', post.locate_id, actualcnt)
            post.nr_reply = actualcnt - 1 if actualcnt - 1 >= 0 else 0
    else:
        if post.nr_reply + 1 == cnt:
            self.logger.info('post %s init %+d reply.', post.locate_id, cnt)
        else:
            self.logger.info('post %s init %+d reply, != expect %d', post.locate_id, cnt, post.nr_reply + 1)
        # store actual count - 1 so the next run looks for new replies again
        post.nr_reply = cnt - 1 if cnt - 1 >= 0 else 0
    post.save()
    self.stat_reply_add += cnt
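# _getData() is used by every fetcher above but defined elsewhere. A rough
# sketch of the expected contract (retry count, timeout and error handling are
# assumptions, not the original implementation): GET `url` (or POST when
# `postdata` is given), tagging log lines with `tag`, and return the page body
# as a string, or '' on failure:
#
#     import urllib2
#
#     def _getData(self, url, postdata, tag, retries=3):
#         for _ in xrange(retries):
#             try:
#                 return urllib2.urlopen(url, postdata, 30).read()  # 30s timeout
#             except (urllib2.URLError, IOError) as e:
#                 self.logger.debug('%s fetch failed: %s', tag, e)
#         return ''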
def getPost(self, post, new_nr=0):
    cnt = 0
    sr = urlparse.urlsplit(post.url)
    if not sr.scheme:
        posturl = urlparse.urljoin(post.sector.base_url,
                                   'showtopic.aspx?topicid=%s&page=%%d' % post.locate_id)
    else:
        posturl = post.url
    startpage, stoppage = self.getPageRange(self.NR_REPLY_PER_PAGE, post.nr_reply, new_nr)
    self.logger.debug('page range [%d,%d) for post %s', startpage, stoppage, post.locate_id)
    parser = etree.HTMLParser()
    for pg in xrange(startpage, stoppage):
        self.logger.debug('post %s %s ...', post.locate_id, posturl % pg)
        data = self._getData(posturl % pg, None, post.title)
        tree = etree.fromstring(data, parser)
        pl = tree.xpath('//div[@id="postsContainer"]/table')
        for p in pl:
            if p.xpath('@class') and p.xpath('@class')[0] == 'plh':
                # the "plh" header table carries the real title, not a reply
                realtitle = p.xpath('./tbody/tr/td[@class="posttopic"]/h1[@class="ts z"]/span/text()')[0]
                if realtitle and post.title != realtitle:
                    self.logger.debug('realtitle for post %s|%s', post.locate_id, realtitle)
                    post.title = realtitle
                continue
            author = p.xpath('./tbody[1]/tr[1]/td[@class="postauthor"]/div[@class="poster"]/span/text()')[0]
            replyid = p.xpath('./@id')[0]
            if p.xpath('//div[@id="message%s"]/div[@id="firstpost"]' % replyid):
                replycontent = htmlentitydecode(etree.tostring(
                    p.xpath('//div[@id="message%s"]/div[@id="firstpost"]' % replyid)[0])).strip()
            else:
                replycontent = htmlentitydecode(etree.tostring(
                    p.xpath('//div[@id="message%s"]' % replyid)[0])).strip()
            replycontent = self.exclude_first_div_tag.match(replycontent).group(1).strip()
            crt_date = p.xpath('//div[@class="postinfo"]/em/span/@title')[0]
            try:
                r = Reply.objects.filter(post=post, author=author, locate_id=replyid)[0:1].get()
            except Reply.DoesNotExist:
                r = Reply(post=post, locate_id=replyid, crt_date=crt_date, author=author, content=replycontent)
                r.save()
                cnt += 1
    if new_nr != 0:
        if post.nr_reply + cnt == new_nr:
            self.logger.info('post %s %+d reply. now %d', post.locate_id, cnt, new_nr + 1)
            post.nr_reply += cnt  # add only the actual number of new replies
        else:
            self.logger.debug('post %s %+d reply, %d != expect %d', post.locate_id, cnt, post.nr_reply + cnt, new_nr)
            # verify how many replies we actually have in the DB
            actualcnt = Reply.objects.filter(post=post).count()
            self.logger.info('post %s actual %d reply in DB', post.locate_id, actualcnt)
            post.nr_reply = actualcnt - 1 if actualcnt - 1 >= 0 else 0
    else:
        if post.nr_reply + 1 == cnt:
            self.logger.info('post %s init %+d reply.', post.locate_id, cnt)
        else:
            self.logger.info('post %s init %+d reply, != expect %d', post.locate_id, cnt, post.nr_reply + 1)
        # store actual count - 1 so the next run looks for new replies again
        post.nr_reply = cnt - 1 if cnt - 1 >= 0 else 0
    post.save()
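# The exclude_first_td_tag / exclude_first_div_tag / exclude_first_a_tag
# patterns used throughout strip the outermost wrapper element from a
# serialized fragment, keeping only its inner HTML in group(1). They are not
# defined in this section; a plausible sketch (an assumption, not the original
# definitions):
#
#     import re
#
#     exclude_first_td_tag = re.compile(r'\A<td[^>]*>(.*)</td>\s*\Z', re.M | re.S | re.U)
#     exclude_first_div_tag = re.compile(r'\A<div[^>]*>(.*)</div>\s*\Z', re.M | re.S | re.U)
#     exclude_first_a_tag = re.compile(r'\A<a[^>]*>(.*)</a>\s*\Z', re.M | re.S | re.U)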
def parse(self, url, data):
    info, debug = self.logger.info, self.logger.debug
    pg, rurl, code = data
    rslt = None
    if url != rurl:
        self.url_scheduler.updUrl(rurl)
    sr = urlparse.urlsplit(rurl)
    rurl = unicode(rurl)
    # keep the url only if its host is allowed AND it either matches an
    # allow pattern or matches no skip pattern
    if unicode(sr.netloc) in self.allowHost:
        if any(x.search(rurl) for x in self.pAllowPattern) or not any(x.search(rurl) for x in self.pSkipPattern):
            pass
        else:
            return rslt
    else:
        return rslt
    tree = etree.fromstring(pg, self.htmlparser)
    # get charset
    charset = 'utf-8'
    element = tree.xpath('/html/head/meta[@http-equiv="Content-Type"]')
    if element:
        charset = element[0].xpath('@content')[0]
        m = re.search(r'charset=([a-z0-9-]+)', charset, re.I)
        if m:
            charset = m.group(1)
        else:
            info('no charset found in Content-Type: %s', etree.tostring(element[0]))
    elif tree.xpath('/html/head/meta[@lang]'):
        charset = tree.xpath('/html/head/meta[@lang]/@lang')[0]
    # get title
    element = tree.xpath('/html/head/title')
    if element:
        title = None
        try:
            title = element[0].xpath('./text()')[0]
        except UnicodeDecodeError as e:
            # the declared charset was wrong; retry with common CJK encodings
            for x in ('utf-8', 'gb18030', 'gbk', 'gb2312', 'big5'):
                if charset != x:
                    tmptree = etree.fromstring(pg, etree.HTMLParser(encoding=x))
                    try:
                        title = tmptree.xpath('/html/head/title/text()')[0]
                        break
                    except UnicodeDecodeError:
                        pass
            if not title:
                info('charset=%s, UnicodeDecodeError! %s', charset, e)
                open('/home/kevin/tmp.html', 'w').write(pg)
                self.shutdown.set()
                return rslt
            else:
                info('%s|%s get title using %s instead of %s!', title, rurl, x, charset)
    else:
        title = None
    al = tree.xpath('//a[@href]')
    # determine html type: xhtml (div-based) or html (table-based)
    isXhtml = False
    xmlns = tree.attrib.get('xmlns', None)
    if xmlns and xmlns.find('xhtml') != -1:
        isXhtml = True
    # collect page metadata
    element = tree.xpath('/html/head/meta[@name="keywords"]')
    if element:
        keywords = [x.strip() for x in element[0].xpath('@content')[0].split(',')]
    element = tree.xpath('/html/head/meta[@name="description"]')
    if element:
        desc = element[0].xpath('@content')[0]
    # robots meta values:
    #   all:      index the page and follow its links
    #   none:     same as "noindex, nofollow"
    #   index:    allow the page to be indexed
    #   follow:   links on the page may be followed
    #   noindex:  don't index the page, but its links may still be followed
    #   nofollow: don't follow links from this page
    element = tree.xpath('/html/head/meta[@name="robots"]')
    if element:
        robot_opts = [x.strip() for x in element[0].xpath('@content')[0].split(',')]
    nr_a = len(al)
    textlist = tree.xpath('//*[local-name()!="script" and local-name()!="style" and local-name()!="a" and local-name()!="input"]//text()')
    nr_text = len('\n'.join(x.strip() for x in textlist))
    v_page = float(nr_a) / nr_text if nr_text else 0.0  # link-to-text density
    # try to find body text
    if isXhtml:
        tag = 'div'
    else:
        if len(tree.xpath('/html/body//table')) > len(tree.xpath('/html/body//div')):
            tag = 'table'
        else:
            tag = 'div'
    elem = tree.xpath('/html/body')[0]
    info('\n\n%s', '--' * 30)
    self.besttag, self.bestvalue = None, 0
    self.getBestBodyText(elem, tag)
    if self.besttag is not None:
        bodytext = self.exclude_first_div_tag.match(htmlentitydecode(etree.tostring(self.besttag)).strip()).group(1)
        nr_bodytext = len(bodytext)
        info('%-5d|%s|%s|%.3f|%s|%s\n%s\n\n……………………\n%s\n%s', nr_bodytext, title, isXhtml, v_page, tag, rurl,
             bodytext[:int(0.2 * nr_bodytext)], bodytext[int(nr_bodytext * 0.8):], '-=' * 30)
    rslt = set()
    for item in al:
        a = item.xpath('@href')[0]
        try:
            atext = item.xpath('text()')[0]
        except IndexError:
            if item.xpath('./img[@alt]'):
                # use the image alt text as the anchor text
                atext = item.xpath('./img[@alt]/@alt')[0]
            else:
                try:
                    atext = self.exclude_first_a_tag.match(htmlentitydecode(etree.tostring(item)).strip()).group(1)
                except AttributeError:
                    el = item.xpath('*')
                    atext = ''.join(htmlentitydecode(etree.tostring(i)) for i in el)
        atext = atext.strip()
        sr = urlparse.urlsplit(a)
        scheme, netloc = sr.scheme, sr.netloc
        if not scheme:
            scheme = 'http'
        if scheme in ('javascript', 'mailto', 'ftp', 'file', 'hthttp'):
            continue
        if scheme not in ('http', 'https'):
            info('scheme %s unknown!', scheme)
        if not netloc:
            element = tree.xpath('/html/head/base/@href')
            if element:
                # the page declares a base url
                baseurl = element[0]
                netloc = baseurl
            else:
                bsr = urlparse.urlsplit(rurl)
                if bsr.netloc:
                    netloc = bsr.netloc
                else:
                    raise StandardError('no netloc found!')
        # rebuild the url, dropping the fragment
        fullurl = urlparse.urlunsplit((scheme, netloc, sr.path, sr.query, ''))
        fullurl = unicode(fullurl)
        # host is allowed AND the url matches an allow pattern or no skip pattern
        if unicode(netloc) in self.allowHost:
            if any(x.search(fullurl) for x in self.pAllowPattern) or not any(x.search(fullurl) for x in self.pSkipPattern):
                rslt.add((atext, fullurl))
    return rslt
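# htmlentitydecode() is imported from elsewhere; a minimal sketch of a
# compatible helper (an assumption, not the original), turning named and
# numeric HTML entities back into unicode characters:
#
#     import re
#     from htmlentitydefs import name2codepoint
#
#     def htmlentitydecode(s):
#         def _sub(m):
#             ent = m.group(1)
#             if ent.startswith('#x') or ent.startswith('#X'):
#                 return unichr(int(ent[2:], 16))  # hex numeric entity
#             if ent.startswith('#'):
#                 return unichr(int(ent[1:]))      # decimal numeric entity
#             if ent in name2codepoint:            # named entity, e.g. &nbsp;
#                 return unichr(name2codepoint[ent])
#             return m.group(0)                    # leave unknown entities alone
#         return re.sub(r'&(#?[xX]?[0-9a-zA-Z]+);', _sub, s)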