Example #1
import logging

logger = logging.getLogger(__name__)

# UrlManager, HtmlDownloader, HtmlParser, Validator and SqliteHandle are
# assumed to be provided by the project's own modules.


class SpiderMan(object):
    def __init__(self):
        self.url_manager = UrlManager()          # de-duplicating URL queue
        self.html_downloader = HtmlDownloader()  # fetches raw page content
        self.html_parser = HtmlParser()          # extracts proxy candidates
        self.validator = Validator()             # keeps only working proxies
        self.sqlite_handle = SqliteHandle()      # persists validated proxies

    def crawl(self, root_url):
        self.url_manager.add_new_url(root_url)
        # Keep crawling until the queue is empty or 100 URLs have been processed.
        while (self.url_manager.has_new_url()
               and self.url_manager.old_url_size() < 100):
            try:
                new_url = self.url_manager.get_new_url()
                html_cont = self.html_downloader.download(new_url)
                proxy_list = self.html_parser.parser(html_cont)

                valid_proxy_list = self.validator.check_proxy(proxy_list)
                total_count = len(proxy_list)
                valid_count = len(valid_proxy_list)
                # Avoid ZeroDivisionError when a page yields no proxies.
                valid_rate = (float(valid_count) / total_count * 100
                              if total_count else 0.0)
                logger.info(
                    'total_count:%d, valid_count:%d, valid_rate:%.2f%%',
                    total_count, valid_count, valid_rate)

                self.sqlite_handle.insert_data(valid_proxy_list)
                logger.info('Crawl→Download→Parse→Validate→Save: succeeded')
            except Exception as e:
                # logger.warn() is deprecated in favour of logger.warning().
                logger.warning('Crawl→Download→Parse→Validate→Save: failed')
                logger.error(e)
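
A minimal usage sketch, assuming the class is run as a script; the seed URL below is a placeholder, not a real proxy-list page, so substitute whatever root URL your project actually crawls:

if __name__ == '__main__':
    # Route the crawl-loop log lines to the console.
    logging.basicConfig(level=logging.INFO)
    spider = SpiderMan()
    # Placeholder seed URL; replace with the proxy-list page your project targets.
    spider.crawl('http://example.com/free-proxy-list')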