Example #1
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        """
        程序主逻辑
        :param root_url: 入口 url
        :return:
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 20:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.output_txt(data)
                print(data)
                print("爬取了{}条链接".format(self.manager.old_url_size()))
            except Exception as e:
                print("爬取失败", e)
Example #2
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)

        while (self.manager.has_new_urls()
               and self.manager.old_url_size() < 100):
            try:
                # Fetch the next URL from the manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.download(new_url)
                # Parse the page into outgoing links and extracted data
                new_urls, data = self.parser.parser(new_url, html)
                # Feed the new links back into the URL manager
                self.manager.add_new_urls(new_urls)
                # Persist the extracted data
                self.output.store_data(data)
                print("Already crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed", e)
        self.output.out_put_html()
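
Neither example shows how the spider is started. A minimal driver would look like the following; the entry URL is a purely hypothetical placeholder:

if __name__ == "__main__":
    spider = SpiderMan()
    # Hypothetical entry point; substitute the site you actually want to crawl.
    spider.crawl("https://example.com/")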