Example #1
import time


def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            # Fetch a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to the crawler (worker) nodes
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            # Stop condition: shut down after 2000 links have been crawled
            # and save the progress
            if url_manager.old_url_size() > 2000:
                # Notify the crawler nodes that the job is finished
                url_q.put('end')
                print('Control node issued the end notification!')
                # Shut down the manager node and persist both URL sets
                url_manager.save_progress('new_urls.txt',
                                          url_manager.new_urls)
                url_manager.save_progress('old_urls.txt',
                                          url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            # Back off briefly before checking the queue again
            time.sleep(0.1)
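
For context, url_manager_proc talks to the crawler (worker) nodes only through the two queues: it pushes new URLs and finally the 'end' sentinel onto url_q, and reads extracted URLs back from conn_q. The sketch below shows the worker side of that handshake; the function name crawler_node_proc and the injected fetch_and_parse callable are illustrative assumptions, not code from the original project.

def crawler_node_proc(url_q, conn_q, fetch_and_parse):
    # Worker-side loop (a sketch): consume URLs until the 'end' sentinel arrives.
    while True:
        new_url = url_q.get()                  # blocking read from the control node
        if new_url == 'end':                   # sentinel pushed by url_manager_proc
            break
        child_urls = fetch_and_parse(new_url)  # download and parse one page
        conn_q.put(child_urls)                 # send extracted URLs back to the manager
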
def crawl():
    # url, downloader, parser, mutex and count are shared module-level objects
    global count
    try:
        while url.has_new_url():
            # Protect the shared counter while taking the next URL
            mutex.acquire()
            count += 1
            new_url = url.get_new_url()
            print('Crawling link No.' + str(count) + ': ' + new_url)
            mutex.release()
            # Download the page, parse it, and queue the extracted links
            html = downloader.download(new_url)
            url_list = parser.parser(html)
            url.add_new_urls(url_list)
    except Exception:
        print('Unknown exception')


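Both functions above and the __main__ block below drive a UrlManager object through the same small interface (has_new_url, get_new_url, add_new_url, add_new_urls, add_old_url, old_url_size, save_progress). The following is a minimal sketch of that interface, assuming set-backed storage and pickle persistence; the original implementation may differ.

import pickle


class UrlManager:
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Move one URL from the "new" set to the "old" set and return it
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def add_old_url(self, url):
        self.old_urls.add(url)

    def old_url_size(self):
        return len(self.old_urls)

    def save_progress(self, path, data):
        # Persist a URL set with pickle (an assumption; the original may differ)
        with open(path, 'wb') as f:
            pickle.dump(data, f)
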
if __name__ == '__main__':

    # Add the seed URL and start one round of crawling
    url.add_new_url(
        'https://baike.baidu.com/item/%E6%99%BA%E6%85%A7%E5%9C%B0%E7%90%83/1071533'
    )
    crawl()

    # Add the old (already crawled) URLs; they are loaded only after one round of crawling
    print('Old URLs have been added')
    all_url_path = './all_url.txt'
    with open(all_url_path, mode='r', encoding='utf-8') as all_url_file:
        url_list = all_url_file.readlines()
    for url_1 in url_list:
        if url_1.startswith('http'):
            url.add_old_url(url_1.rstrip('\n'))

    # Start the crawler threads
    while url.has_new_url():
        t = None