def __init__(self, depth, startUrls, keyword, htmlQueue, dataQueue, urlQueue, exitEvent):
    """Store the crawl configuration and shared queues, and build the filters.

    depth      -- maximum crawl depth
    startUrls  -- seed URLs; also used to scope the URL filter
    keyword    -- search keyword driving the page filter
    htmlQueue/dataQueue/urlQueue -- queues shared with the worker threads
    exitEvent  -- event used to signal shutdown
    """
    # Shared inter-thread queues.
    self.__htmlQueue = htmlQueue
    self.__dataQueue = dataQueue
    self.__urlQueue = urlQueue
    # Crawl parameters.
    self.__keyword = keyword
    self.__depth = depth
    self.__startUrls = startUrls
    self.__exitEvent = exitEvent
    # PageFilter decides whether a fetched page is worth storing.
    self.__myPageFilter = PageFilter(keyword)
    # UrlFilter decides whether a URL should still be downloaded.
    self.__myUrlFilter = UrlFilter(self.__startUrls)
from downLoadPage import DownLoadPage
from urlFilter import UrlFilter
from urlparse import urljoin, urlparse
from bs4 import BeautifulSoup
import time

# Module-level filter shared by every GetLinks instance: deduplicates and
# drops near-duplicate ("similar") links across the whole crawl.
urlfilter = UrlFilter()


class GetLinks(object):
    """Extract outbound links from one fetched HTML page.

    Parsed links are accumulated on ``self.links`` by ``getLinks``.
    """

    def __init__(self, html, currentUrl):
        """Parse *html* fetched from *currentUrl*.

        html       -- raw page markup (str)
        currentUrl -- URL the page was fetched from; base for relative links
        """
        self.html = html
        self.links = []
        self.currentUrl = currentUrl
        # FIX: name the parser explicitly. A bare BeautifulSoup(html) emits a
        # warning and picks whichever parser happens to be installed, so the
        # extracted link set could differ between environments.
        self.soup = BeautifulSoup(html, 'html.parser')

    def getLinks(self, originalUrl, get='get'):
        """Collect filtered, absolute <a href> links into ``self.links``.

        originalUrl -- reference URL passed to the format check
        get         -- only the default 'get' mode is implemented; any other
                       value is a no-op (matches the original behaviour)
        """
        if get != 'get':
            return
        # GET mode: harvest the href attribute of every anchor tag.
        for anchor in self.soup.find_all('a', href=True):
            href = anchor.get('href')
            # Resolve relative links against the page they appeared on.
            if not href.startswith('http'):
                href = urljoin(self.currentUrl, href)
            # Guard clauses replace the original nested-if pyramid; the
            # trailing dead `else: continue` was removed (it was the last
            # statement of the loop body, so it had no effect).
            if not urlfilter.judgeUrlFormat(href, originalUrl):
                continue
            if not urlfilter.filterSameLink(href):  # drop exact duplicates
                continue
            if urlfilter.filterSimilarLink(href):   # drop near-duplicates
                self.links.append(href)