Example no. 1
class SpiderMan(object):

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        while(self.manager.has_new_url() and
              self.manager.old_urls_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                # print(new_url, '.......')
                html = self.downloader.download(new_url)
                # print(html)
                new_urls, data = self.parser.parse(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_urls_size())
            except Exception as e:
                print(e)
                # print('crawl failed')
        self.output.output_html()
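UrlManager, HtmlDownloader, HtmlParser and DataOutput come from the surrounding project and are not shown in this example. For orientation only, a minimal in-memory UrlManager exposing the interface SpiderMan relies on (add_new_url, add_new_urls, has_new_url, get_new_url, old_urls_size) could be sketched as follows; the project's real implementation may well differ.

class UrlManager(object):
    """Minimal sketch: tracks URLs still to crawl and URLs already crawled."""

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        # Ignore empty URLs and URLs that are already known
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_urls_size(self):
        return len(self.old_urls)

The SpiderWork class below is the worker node of a distributed crawler and needs BaseManager from the standard library's multiprocessing.managers module.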
from multiprocessing.managers import BaseManager

class SpiderWork(object):

    def __init__(self):
        # Initialise the worker node's connection in the distributed setup
        # Step 1: register with BaseManager the names of the methods used to fetch the queues
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        # The port and authkey must exactly match what was configured on the server:
        self.m = BaseManager(address=(server_addr, 8001), authkey='baike'.encode('utf-8'))
        # Connect over the network
        self.m.connect()
        # Step 3: obtain the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialise the page downloader and the parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()

                    if url == 'end':
                        print('The control node told this crawler node to stop working...')
                        # Then notify the other nodes to stop as well
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('The crawler node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parse(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:
                print('Connection to the worker node failed')
                return
            except Exception as e:
                print(e)
                print('crawl failed')
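SpiderWork is only the worker side: it connects to a control node that exposes a task queue and a result queue through a BaseManager listening on port 8001 with authkey 'baike'. That control node is not part of this example. A minimal sketch of such a server, following the standard multiprocessing.managers pattern, is shown below; the QueueManager name and the placeholder root URL are assumptions, not taken from the original project.

from multiprocessing.managers import BaseManager
import queue

# The two queues shared with the worker nodes.
task_q = queue.Queue()
result_q = queue.Queue()

class QueueManager(BaseManager):
    pass

# Register callables under the same names the worker looks up.
QueueManager.register('get_task_queue', callable=lambda: task_q)
QueueManager.register('get_result_queue', callable=lambda: result_q)

if __name__ == '__main__':
    # Seed the task queue before serving; a real control node would keep
    # feeding it and drain result_q, and would eventually put the string
    # 'end' on task_q to tell the workers to stop.
    task_q.put('http://example.com/start-page')   # placeholder root URL

    # Address, port and authkey must match the worker's BaseManager settings.
    manager = QueueManager(address=('127.0.0.1', 8001),
                           authkey='baike'.encode('utf-8'))
    server = manager.get_server()
    server.serve_forever()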
Example no. 3
class SpideMan(object):
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)

        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parse(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print(e)
        self.output.output_html()
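Both SpiderMan classes above delegate fetching to an HtmlDownloader whose download(url) method returns the page's HTML, and that class is likewise not included. A minimal requests-based stand-in consistent with that interface might look like this (the User-Agent header and the timeout are assumptions):

import requests

class HtmlDownloader(object):
    """Minimal sketch: fetch a page and return its HTML as text, or None on failure."""

    def download(self, url):
        if url is None:
            return None
        # A desktop User-Agent makes some sites less likely to reject the request.
        headers = {'User-Agent': 'Mozilla/5.0'}
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        # Guess the encoding from the body rather than trusting the headers alone.
        response.encoding = response.apparent_encoding
        return response.text

The save_tokens function that follows takes a different approach: instead of downloading pages directly, it drives a real browser through the webbrowser module and picks the downloaded files up from the download folder.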
import os
import time
import webbrowser
from shutil import move   # shutil.move matches how move() is used below

# HtmlExtractor and HtmlParser are assumed to come from the surrounding project.
def save_tokens(webpage, dir_name, dst_PATH, dwl_dir, sleep=5, verbose=True):

    os.makedirs(dst_PATH + dir_name, exist_ok=True)

    if verbose:
        print('============================')
        print(dir_name)

    # The extraction lambda keeps the text between the first '>' and the first '</'.
    imgNameExtractor = HtmlExtractor(
        [r'\<em>', r'\</em>'], ['<div class='],
        lambda x: x[x.index('>') + 1:x.index('</')])
    # The extraction lambda slices the URL out of the matched anchor tag.
    urlExtractor = HtmlExtractor(['<a class="lightly" href="h'], None,
                                 lambda x: x[25:-2])
    roll20Parser = HtmlParser([urlExtractor, imgNameExtractor])
    urlAndNames = roll20Parser.parse(webpage)
    links = urlAndNames[urlExtractor]
    names = urlAndNames[imgNameExtractor]

    assert (len(links) == len(names))

    if verbose:
        print('============================')
        print('Found', len(links), 'extracted images')
        print('============================')

    # Open each link in the default browser, give the download time to finish,
    # then move the freshly downloaded file into the destination folder.
    for j in range(len(links)):
        url = links[j]
        webbrowser.get('windows-default').open_new_tab(url)
        time.sleep(sleep)
        _name = names[j]
        if verbose:
            print(j, '--', _name, '--', url)
        for _img in os.listdir(dwl_dir):
            if (str(_img)[:3] == 'max') and (str(_img)[-1] == 'g'):
                move(dwl_dir + _img,
                     dst_PATH + dir_name + '/' + _name + '.png')

    if len(os.listdir(dst_PATH + dir_name)) != len(links):
        print('WARNING: the number of URLs differs from the final number of images')
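A hypothetical call to save_tokens could look like the lines below; every path, the file name and the sleep value are placeholders, and the page source is assumed to have been fetched or saved beforehand. Note that dst_PATH and dwl_dir are concatenated with plain string operations, so both need a trailing slash.

# Hypothetical invocation; 'gallery.html' is a previously saved page source.
with open('gallery.html', encoding='utf-8') as f:
    page_source = f.read()

save_tokens(page_source,
            dir_name='goblins',                  # subfolder created under dst_PATH
            dst_PATH='C:/tokens/',               # destination root, trailing slash required
            dwl_dir='C:/Users/me/Downloads/',    # browser download folder, trailing slash required
            sleep=8,                             # give slow downloads more time
            verbose=True)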
Example no. 5
class Crawler(object):
    """
    Purpose: the main crawler program
    """
    def __init__(self, bind_domain):

        # Create the object that manages the crawl URLs and records which ones have already been crawled
        self.urlManager = UrlManager(enable_external_link=False,
                                     bind_domain=bind_domain)

        # Create the object that requests the pages
        self.downloader = HtmlDownloader()

        # Create the object that converts the HTML source into an lxml.html object so new links can be extracted
        self.parser = HtmlParser()

    def craw(self, url):

        # Add the root page
        self.urlManager.add_new_url(url)

        # Check the set of URLs stored in the manager
        while self.urlManager.has_new_url():

            # Get a new URL
            request_url = self.urlManager.get_new_url()
            print("Currently requesting {0}".format(request_url))

            # Download the page
            html_content = self.downloader.download(request_url)

            # Parse the HTML, keep all <a> records and collect the new links
            new_urls = self.parser.parse(request_url, html_content)

            # Add the links to the manager
            self.urlManager.add_new_urls(new_urls)
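The comments in this last example say that HtmlParser turns the HTML source into an lxml.html object and collects the links of all <a> tags. A minimal parser consistent with that description and with the parse(request_url, html_content) call above could be sketched as follows (this is an assumption based on the comments, not the project's actual parser):

import lxml.html
from urllib.parse import urljoin

class HtmlParser(object):
    """Minimal sketch: collect the href of every <a> tag, resolved against the page URL."""

    def parse(self, page_url, html_content):
        if not html_content:
            return set()
        doc = lxml.html.fromstring(html_content)
        new_urls = set()
        for href in doc.xpath('//a/@href'):
            # Resolve relative links against the page that contained them.
            new_urls.add(urljoin(page_url, href))
        return new_urls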