コード例 #1
0
ファイル: Spider.py プロジェクト: kungyutucheng/Spider
class Spider(object):
    """Drive a simple crawl loop over four collaborating components.

    The collaborators are created in ``__init__`` and stored as attributes:

    * ``urlManager`` -- queue of URLs; must provide ``addUrl``, ``addUrls``,
      ``hasNewUrl`` and ``getNewUrl``.
    * ``downloader`` -- must provide ``download(url)``.
    * ``praser`` -- must provide ``praser(url, content)`` returning a
      ``(newUrls, newData)`` pair.
    * ``outputer`` -- must provide ``collect(data)`` and ``output()``.
    """

    def __init__(self):
        """Instantiate the URL manager, downloader, parser and outputer."""
        # A single-argument print(...) parses both as a Python 2 print
        # statement and as a Python 3 function call, with identical output.
        print('init')
        self.urlManager = UrlManager()
        self.downloader = Downloader()
        self.praser = HtmlPraser()
        self.outputer = Output()

    def craw(self, rootUrl, maxCount=10):
        """Crawl starting from ``rootUrl`` and then emit the collected data.

        Each iteration takes one URL from the URL manager, downloads it,
        parses the result, feeds the discovered URLs back to the URL manager
        and hands the parsed data to the outputer.  ``outputer.output()`` is
        called exactly once after the loop ends.

        Args:
            rootUrl: URL used to seed the URL manager.
            maxCount: Maximum number of URLs to process.  Defaults to 10,
                the limit that was previously hard-coded.  Pass ``None`` to
                keep going until the URL manager has no new URLs.

        Returns:
            None.
        """
        self.urlManager.addUrl(rootUrl)
        count = 0

        # The cap is tested first so that, once it is reached, hasNewUrl()
        # is not queried again (same call sequence as the original loop).
        while ((maxCount is None or count < maxCount)
               and self.urlManager.hasNewUrl()):
            count += 1
            newUrl = self.urlManager.getNewUrl()
            # Produces the same text as the former comma-separated print
            # statement (items joined by single spaces).
            print('爬取第 {0} 个url,url是: {1}'.format(count, newUrl))
            htmlContent = self.downloader.download(newUrl)
            newUrls, newData = self.praser.praser(newUrl, htmlContent)
            self.urlManager.addUrls(newUrls)
            self.outputer.collect(newData)

        self.outputer.output()