Example #1
def __init__(self, depth, startUrls, keyword, htmlQueue, dataQueue,
             urlQueue, exitEvent):
    self.__htmlQueue = htmlQueue
    self.__dataQueue = dataQueue
    self.__urlQueue = urlQueue
    self.__keyword = keyword
    self.__depth = depth
    self.__startUrls = startUrls
    self.__exitEvent = exitEvent
    # pageFilter filters pages: decides whether this page should be stored
    self.__myPageFilter = PageFilter(keyword)
    # urlFilter filters URLs: decides whether a URL should be downloaded further
    self.__myUrlFilter = UrlFilter(self.__startUrls)
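The constructor above only wires up the queues and the two filters; PageFilter and UrlFilter come from other modules of the same project and are not shown here. As a rough illustration of the keyword-based page filter described in the comment, a minimal sketch might look like the following (the judge() method name and its logic are assumptions, not the project's actual code):

class PageFilter(object):
    def __init__(self, keyword):
        self.keyword = keyword

    def judge(self, html):
        # Assumed behavior: keep the page only if the keyword appears in its content.
        return self.keyword in html


if __name__ == '__main__':
    f = PageFilter('python')
    print(f.judge('<p>python crawler example</p>'))  # True
    print(f.judge('<p>nothing relevant here</p>'))   # False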
Example #2
from downLoadPage import DownLoadPage
from urlFilter import UrlFilter
from urlparse import urljoin, urlparse
from bs4 import BeautifulSoup
import time
urlfilter = UrlFilter()


class GetLinks(object):
    def __init__(self, html, currentUrl):

        self.html = html
        self.links = []
        self.currentUrl = currentUrl
        # Specify a parser explicitly so bs4 does not pick one at random and warn
        self.soup = BeautifulSoup(html, 'html.parser')

    def getLinks(self, originalUrl, get='get'):
        # print "[INFO]:", self.currentUrl  # debug output
        if get == 'get':
            results = self.soup.find_all('a', href=True)  # GET type: take the href attribute of <a> tags
            for i in results:
                href = i.get('href')
                if not href.startswith('http'):
                    href = urljoin(self.currentUrl, href)
                if urlfilter.judgeUrlFormat(href, originalUrl):
                    if urlfilter.filterSameLink(href):  # de-duplicate
                        if urlfilter.filterSimilarLink(href):  # drop similar links
                            self.links.append(href)
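getLinks() relies on three UrlFilter methods: judgeUrlFormat(), filterSameLink() and filterSimilarLink(). The project's urlFilter module is not shown, so the sketch below only illustrates that interface under assumed semantics (same-host check, exact de-duplication, and "similar" meaning same host and path). It uses Python 3's urllib.parse, whereas the snippet above imports the Python 2 urlparse module.

from urllib.parse import urlparse


class UrlFilter(object):
    def __init__(self):
        self.seen = set()        # exact URLs already collected
        self.seenPaths = set()   # (netloc, path) pairs, used for "similar" links

    def judgeUrlFormat(self, url, originalUrl):
        # Assumed check: keep only http(s) links on the same host as the start URL.
        parsed, origin = urlparse(url), urlparse(originalUrl)
        return parsed.scheme in ('http', 'https') and parsed.netloc == origin.netloc

    def filterSameLink(self, url):
        # De-duplicate: return True only the first time a URL is seen.
        if url in self.seen:
            return False
        self.seen.add(url)
        return True

    def filterSimilarLink(self, url):
        # Assumed notion of "similar": same host and path, differing only in the query string.
        parsed = urlparse(url)
        key = (parsed.netloc, parsed.path)
        if key in self.seenPaths:
            return False
        self.seenPaths.add(key)
        return True


if __name__ == '__main__':
    uf = UrlFilter()
    origin = 'http://example.com/'
    for link in ('http://example.com/a?x=1',
                 'http://example.com/a?x=2',
                 'http://other.com/b'):
        ok = (uf.judgeUrlFormat(link, origin)
              and uf.filterSameLink(link)
              and uf.filterSimilarLink(link))
        print(link, ok)
    # Only the first example.com link passes all three filters under these assumptions.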