Example #1
    def __init__(self, start_urls, domains=None, regexRules=None,
                 downloader=None, scheduler=None, saver=None, pageProcessor=None):
        super(Spider, self).__init__()

        self.start_urls = start_urls
        self.domains = domains
        self.regexRules = regexRules

        self.downloader = downloader or SyncDownloader()
        self.scheduler = scheduler or redisScheduler()
        # By default, write results to the console
        self.saver = saver or ConsoleSaver()
        self.pageProcessor = pageProcessor or Processor()

        # Normalize start URLs (add a missing scheme prefix)
        self.start_urls = [self.__fix_urls(url) for url in self.start_urls]

        # Validate start URLs
        for url in self.start_urls:
            if self.__judge_urls(url):
                # Seed the scheduler with each valid start URL
                self.scheduler.pushUrl(pUrl(url), 0)

        self.config = Config()
        # Store thread of this Spider
        self.threads = {}
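A minimal usage sketch of this constructor follows; the module path and the run() entry point are assumptions, and the components fall back to the defaults named above.

# Hypothetical usage; 'spider' module path and run() are assumptions
from spider import Spider

crawler = Spider(
    start_urls=['news.nwsuaf.edu.cn'],  # __fix_urls adds the scheme prefix
    domains=['news.nwsuaf.edu.cn'],     # assumed host whitelist for __judge_urls
)
crawler.run()  # assumed entry point that starts the worker threads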
Example #2
        def __run():
            while True:
                # Check the current thread's status
                thread = self.threads[threading.currentThread().getName()]
                if thread['status'] == 'Stopping':
                    # Finish current thread
                    break
                elif thread['status'] == 'Suspending':
                    # Suspended: sleep briefly to avoid a busy-wait, then re-check
                    time.sleep(1)
                    continue

                URL, _ = self.scheduler.popUrl()
                if URL is None and threading.activeCount() == 2:
                    # Only the main thread and the monitor thread remain
                    break
                elif URL is None:
                    # Sleep one second
                    time.sleep(1)
                else:
                    # step1: download URL
                    document = self.downloader.download(URL)
                    if document:
                        # step2: Get URLs in document
                        urls, document = document.parserLinks()
                        # step3: handle URLs from step2
                        for url in urls:
                            if self.regexRules is None:
                                if self.__judge_urls(url.getUrl):
                                    self.scheduler.pushUrl(pUrl(self.__fix_urls(url.getUrl)), 0)
                            else:
                                if self.regexRules.isMatched(url.getUrl) and self.__judge_urls(url.getUrl):
                                    self.scheduler.pushUrl(pUrl(self.__fix_urls(url.getUrl)), 0)
                        # step4: handle document
                        document = self.pageProcessor.pageParser(document)
                        # step5: save the result from document
                        self.saver.save(document.getItems())
                        document.clear()
                self.scheduler.done()
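The listing shows only the nested __run closure; a plausible enclosing run() method, sketched under the assumption that workers are plain threads keyed by name in self.threads (the module already imports threading, as the loop above suggests):

    def run(self, worker_count=4):
        """Hypothetical wrapper: spawn worker threads around __run above."""
        def __run():
            pass  # the worker loop shown above

        for index in range(worker_count):
            thread = threading.Thread(target=__run, name='Spider-%d' % index)
            # Each worker polls self.threads[<its name>]['status'] on every pass
            self.threads[thread.getName()] = {'status': 'Running'}
            thread.start()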
Example #3
    def popUrl(self):
        """Pop the URL from redisQueue"""
        url = None
        score = 0
        try:
            if self.qtype == 'q':
                url = self.__Queue.pop()
            elif self.qtype == 'p':
                url, score = self.__Queue.pop(withscores=True)
        except Exception as error:
            log.info('redisScheduler.RedisScheduler.popUrl ERROR(reason: %s)',
                     error)

        if url is None:
            # Empty queue: return None so callers can detect exhaustion
            return None, None
        return pUrl(url, score), None
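popUrl's counterpart pushUrl and the __Queue wrapper it drives are not in the listing; a minimal sketch assuming redis-py 3.x backs the two queue types ('q' a plain list, 'p' a sorted set ordered by score):

import redis

class RedisQueue(object):
    """Hypothetical sketch of the __Queue wrapper used by popUrl above."""

    def __init__(self, name='spider:urls', qtype='q', host='localhost'):
        self.name = name
        self.qtype = qtype
        self.conn = redis.StrictRedis(host=host)

    def push(self, url, score=0):
        if self.qtype == 'q':
            self.conn.rpush(self.name, url)           # FIFO list
        elif self.qtype == 'p':
            self.conn.zadd(self.name, {url: score})   # priority sorted set

    def pop(self, withscores=False):
        if self.qtype == 'p':
            items = self.conn.zpopmin(self.name)      # lowest score first
            url, score = items[0] if items else (None, 0)
            return (url, score) if withscores else url
        return self.conn.lpop(self.name)              # None when empty

    def getSize(self):
        if self.qtype == 'q':
            return self.conn.llen(self.name)
        return self.conn.zcard(self.name)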
Example #4
    def parserLinks(self):
        links = []
        __Set = set()
        try:
            tagsOfa = self.parserDoc.select('a')
            for info in tagsOfa:
                if 'href' not in info.attrs:
                    continue
                url, sign = self.__fixUrl(info.attrs['href'])
                if sign:
                    if url not in __Set:
                        __Set.add(url)
                        links.append(pUrl(url))
        except Exception as error:
            log.error('parser htmlDoc(%r) ERROR:%r', self.pUrl.getUrl, error)

        return links, self
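__fixUrl is called above but not listed; a plausible reconstruction that resolves relative hrefs against the page's own URL and returns an (absolute_url, keep) pair:

    def __fixUrl(self, href):
        """Hypothetical helper: absolutize an href, or flag it for dropping."""
        from urlparse import urljoin  # Python 2; urllib.parse.urljoin in Python 3
        href = (href or '').strip()
        # Drop empty links, in-page anchors and non-HTTP pseudo-schemes
        if not href or href.startswith(('#', 'javascript:', 'mailto:')):
            return None, False
        # Resolve relative links against the URL this document was fetched from
        return urljoin(self.pUrl.getUrl, href), True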
Example #5
    def popUrl(self):
        """Pop the URL from redisQueue"""
        url = None
        score = 0
        try:
            if self.qtype == 'q':
                url = self.__Queue.pop()
            elif self.qtype == 'p':
                url, score = self.__Queue.pop(withscores=True)
        except Exception as error:
            log.info('redisScheduler.RedisScheduler.popUrl ERROR(reason: %s)',
                     error)

        if url is None:
            # Empty queue: return None so callers can detect exhaustion
            return None, None
        return pUrl(url, score), None

    def done(self):
        pass

    def getSize(self):
        """Get redisQueue size from redis server"""
        return self.__Queue.getSize()

    def isEmpty(self):
        """Judge redisQueue whether is empty"""
        return self.__Queue.getSize() == 0

    def join(self):
        pass


if __name__ == '__main__':
    r = redisScheduler()
    r.pushUrl(pUrl('www'), 0)
    print r.popUrl()[0].getUrl
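pushUrl, exercised by this smoke test, is not in the listing either; assuming it forwards to the queue sketched after Example #3, it could look like:

    def pushUrl(self, purl, score):
        """Hypothetical: enqueue a pUrl at the given priority/score."""
        try:
            self.__Queue.push(purl.getUrl, score)
        except Exception as error:
            log.info('redisScheduler.RedisScheduler.pushUrl ERROR(reason: %s)',
                     error)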
Example #6
    def setNewUrls(self, newUrls):
        self.newUrls = newUrls
        return self

    def getItems(self):
        return self.items

    def setItems(self, key, value):
        try:
            self.items[key] = value
        except TypeError:
            # Ignore unhashable keys instead of crashing the crawl
            pass

    def getParserDoc(self):
        return self.parserDoc

    def clear(self):
        self.items = {}
        self.newUrls = []
        self.parserDoc = None


if __name__ == '__main__':
    import requests

    url = "http://news.nwsuaf.edu.cn/yxxw/75964.htm"
    htmlDoc = requests.get(url).text
    links, _ = Document(pUrl(url), htmlDoc).parserLinks()
    for URL in links:
        print(URL.getUrl)
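These accessors are what a page processor drives: pageParser (step 4 of the worker loop in Example #2) fills items, and the saver reads them back in step 5. A minimal sketch, assuming parserDoc is a BeautifulSoup tree as the select('a') call in parserLinks suggests:

class TitleProcessor(object):
    """Hypothetical pageProcessor: pull the <title> text into items."""

    def pageParser(self, document):
        doc = document.getParserDoc()
        if doc is not None and doc.title is not None:
            document.setItems('title', doc.title.get_text(strip=True))
            document.setItems('url', document.pUrl.getUrl)
        return document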
Example #7
# coding: utf-8
"""
Synchronous downloader
"""

from core.downloader import Downloader
from core.downloader.Net import net
from parserDoc.parserDoc import Document
from defines.pUrl import pUrl


class SyncDownloader(Downloader):
    def __init__(self):
        super(SyncDownloader, self).__init__()

    def download(self, purl):
        # 'purl' avoids shadowing the imported pUrl class
        htmlDoc = net.getInstance().get(purl.getUrl)
        document = None
        if htmlDoc is not None:
            document = Document(purl, htmlDoc)

        return document


if __name__ == '__main__':
    print SyncDownloader().download(pUrl('http://www.baidu.com'))
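The net singleton that download() calls is outside the listing; a minimal reconstruction built on requests, keeping only the getInstance()/get() names used above and treating everything else as an assumption:

import requests

class net(object):
    """Hypothetical sketch of the net singleton used by SyncDownloader."""
    __instance = None

    @classmethod
    def getInstance(cls):
        if cls.__instance is None:
            cls.__instance = cls()
        return cls.__instance

    def get(self, url, timeout=10):
        # Return the page body, or None so download() can skip this URL
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            return None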