Example #1
 def crawl_items(self, data):
     '''
     :param data: data passed in from the main program,
                  in the form {'title': xxxx, 'url': [xxxx, xxxx, xxxx]}
     :return: None
     '''
     manager = UrlManager()
     # Get the article title
     title = data.get('title')
     # Deduplication check
     if manager.remove_duplication(title):
         manager.add_new_urls(data.get('url'))
         # Download the image files
         while manager.has_new_url():
             print('Download started ==>', title)
             image_urls = manager.get_new_urls()
             # Use the index to number the file names
             for index, url in enumerate(image_urls):
                 print('Downloading ==> image %s' % (index + 1))
                 data = self.downloader.download(url)
                 self.output.save_2_binary(title, index + 1, data)
         # Everything downloaded: add the deduplication flag
         if not manager.has_new_url():
             manager.add_duplication(title)
             print('Download finished ==>')
     else:
         print('Duplicate | no download needed ==>', title)
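Example #1 hands each downloaded image to self.output.save_2_binary(title, index, data), but the DataOutput class itself is not part of the snippet. Below is a minimal sketch of what such a helper could look like, assuming data is the raw image bytes returned by the downloader; the downloads/ directory layout and the .jpg extension are illustrative assumptions, not the original implementation.

import os


class DataOutput:
    """Illustrative sketch of the binary-save helper used in Example #1."""

    def __init__(self, base_dir='downloads'):
        self.base_dir = base_dir

    def save_2_binary(self, title, index, data):
        """Write raw image bytes to <base_dir>/<title>/<index>.jpg."""
        folder = os.path.join(self.base_dir, title)
        os.makedirs(folder, exist_ok=True)
        with open(os.path.join(folder, '%s.jpg' % index), 'wb') as f:
            f.write(data)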
Example #2
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.dataoutput = DataOutput()
     self.mongoengine = Use_MongoEngine()
     self.urloutput = Url_info_Output()
Example #3
def url_manager_proc(url_q, conn_q, root_url, num=6):
    """
    :param url_q:里面放的是url集合单个url
    :param conn_q:里面放的是url集合
    :param root_url:
    :param num:
    :return:
    """
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:

        while url_manager.has_new_url():
            print("# url_manager_proc将要爬取的url放入url_q中")
            new_url = url_manager.get_new_url()
            print(new_url)
            url_q.put(new_url)
            if url_manager.old_url_size() > num:
                # 通知爬行节点工作结束
                url_q.put('end')
                print('控制节点发起结束通知!')
                # 关闭管理节点,同时存储 set 状态
                url_manager.save_progress()
                break
        try:
            if not conn_q.empty():
                print("# url_manager_proc从conn_q中拿取urls")
                urls = conn_q.get()
                print(urls)
                url_manager.add_new_urls(urls)
            else:
                # 延时休息
                time.sleep(0.1)
        except Exception as e:
            print(e)
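Example #3 (and the later variants) follows a simple control-node protocol: URLs for the crawler nodes go out through url_q, batches of newly discovered URLs come back through conn_q, and the string 'end' is the shutdown sentinel. The sketch below shows one way such a process might be wired up with multiprocessing queues, assuming the module-level url_manager_proc above and the UrlManager it uses are importable; the crawler_proc stub, the example.com entry URL, and the single made-up child link it reports are illustrative assumptions, not part of the original project.

import multiprocessing


def crawler_proc(url_q, conn_q):
    # Illustrative worker: consume URLs until the 'end' sentinel arrives.
    while True:
        url = url_q.get()
        if url == 'end':
            print('Crawler node received the stop notice')
            break
        # A real worker would download and parse `url` here; this stub just
        # reports one made-up child link back to the control node.
        conn_q.put([url + 'next/'])


if __name__ == '__main__':
    url_q = multiprocessing.Queue()   # control node -> crawler nodes
    conn_q = multiprocessing.Queue()  # crawler nodes -> control node
    root_url = 'http://example.com/'  # illustrative entry URL

    manager = multiprocessing.Process(target=url_manager_proc,
                                      args=(url_q, conn_q, root_url))
    worker = multiprocessing.Process(target=crawler_proc,
                                     args=(url_q, conn_q))
    manager.start()
    worker.start()
    worker.join()        # returns once the worker sees the 'end' sentinel
    manager.terminate()  # this variant of url_manager_proc never returns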
Example #4
    def url_manager_proc(self, url_que, conn_que, root_url):
        url_manager = UrlManager()
        url_manager.add_new_url(root_url)
        while True:
            while url_manager.has_new_url():
                new_url = url_manager.get_new_url()
                # Send the new URL to a worker node
                url_que.put(new_url)
                print('old_url=', url_manager.old_urls_size())
                if url_manager.old_urls_size() > 2000:
                    url_que.put('end')
                    print('Control node issued the stop notice')
                    # Shut down the manager node and persist the set state
                    url_manager.save_progress('new_urls.txt',
                                              url_manager.new_urls)
                    url_manager.save_progress('old_urls.txt',
                                              url_manager.old_urls)
                    return
            # Add the urls received from result_solve_proc to the URL manager
            try:
                if not conn_que.empty():
                    urls = conn_que.get()
                    for url in urls:
                        url_manager.add_new_url(url)

            except BaseException:
                time.sleep(0.1)
Example #5
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
Example #6
 def url_manager_proc(self, task_queue, url_queue, root_url):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         if url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             print('url: %s put into the task queue' % new_url)
             task_queue.put(new_url)
         if not url_queue.empty():
             next_url = url_queue.get()
             url_manager.add_new_url(next_url)
Example #7
 def crawl_image(self, start_url, total_page, __page=2):
     '''
     Crawl the galleries in the Fengniao "Masters" and "Technique" sections.
     :param start_url: URL of the article to download
     :param total_page: number of pages to download
     :param __page: paging start parameter used internally; callers should not set it
     :return: None
     '''
     manager = UrlManager()
     # Add the entry URL
     if 'image' in start_url or 'academy' in start_url:
         manager.add_new_url(start_url)
         # Check whether the URL manager still holds new URLs
         while manager.has_new_url():
             try:
                 # Fetch a new URL from the URL manager
                 new_url = manager.get_new_url()
                 # Download the page with the HTML downloader
                 html = self.downloader.download(new_url)
                 # Use the keyword to decide whether this is a second-level page
                 if 'slide' in new_url:
                     # Extract the second-level page data with the HTML parser
                     data = self.parser.parse_data(html)
                     self.crawl_items(data)
                 else:
                     # Extract the first-level page URLs with the HTML parser
                     data = self.parser.parse_urls(html)
                     manager.add_new_urls(data)
             except Exception as e:
                 print('Crawl failed ==>', e)
         # Crawl the remaining pages
         if __page <= total_page:
             if 'image' in start_url:
                 next_url = '%s/index.php?action=getList&class_id=192&sub_classid=0&page=%s&not_in_id=' % (
                     start_url, str(__page))
             elif 'academy' in start_url:
                 next_url = '%s/index.php?action=getList&class_id=190&sub_classid=0&page=%s&not_in_id=' % (
                     start_url, str(__page))
             print('Crawling page ==>', str(__page))
             return self.crawl_image(next_url, total_page, __page + 1)
     else:
         print('The URL is wrong, please check it')
Example #8
    def url_manager_proc(self, url_q, conn_q, root_url):
        url_manager = UrlManager()
        url_manager.add_new_url(root_url)
        while True:
            while url_manager.has_new_url():
                new_url = url_manager.get_new_url()
                url_q.put(new_url)
                print('old_url=', url_manager.old_url_size())
                if url_manager.old_url_size() > 30:
                    url_q.put('end')
                    print('Control node issued the stop notice!')
                    url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                    url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                    return
                try:
                    urls = conn_q.get()
                    url_manager.add_new_urls(urls)
                except Exception:
                    time.sleep(0.1)
Example #9
 def url_manager_proc(self, url_q, conn_q, root_url):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             url_q.put(new_url)
             print('old_url=', url_manager.old_url_size())
             if url_manager.old_url_size() > 2000:
                 url_q.put('end')
                 print('Control node issued the stop notice')
                 url_manager.save_progress('new_urls.txt',
                                           url_manager.new_urls)
                 url_manager.save_progress('old_urls.txt',
                                           url_manager.old_urls)
                 return
             try:
                 if not conn_q.empty():
                     urls = conn_q.get()
                     url_manager.add_new_urls(urls)
             except BaseException as e:
                 print(e)
                 time.sleep(0.1)
Example #10
 def url_manager_proc(self, url_q, conn_q, root_url, num=200):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             url_q.put(new_url)
             if url_manager.old_url_size() > num:
                 # Notify the crawler nodes that the work is finished
                 url_q.put('end')
                 print('Control node issued the stop notice!')
                 # Shut down the manager node and persist the set state
                 url_manager.save_progress()
                 return
         # When there are no URLs left, take some from conn_q
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 url_manager.add_new_urls(urls)
             else:
                 # Sleep briefly
                 time.sleep(0.1)
         except Exception as e:
             print(e)
Example #11
    def url_manager_proc(self, url_q, conn_q, root_url):
        """从conn_q队列获取新URL到URL管理器, 取URL放入url_q供爬虫节点获取"""
        url_manager = UrlManager()
        url_manager.add_new_url(root_url)
        while True:
            while url_manager.has_new_url():
                new_url = url_manager.get_new_url()
                url_q.put(new_url)
                logging.info("old_url_size = %s " % url_manager.old_url_size())

                if url_manager.old_url_size() > 50:
                    url_q.put("end")
                    logging.info("控制节点发起结束通知")
                    url_manager.save_process("new_urls.txt",
                                             url_manager.new_urls)
                    url_manager.save_process("old_urls.txt",
                                             url_manager.old_urls)
                    return
            try:
                if not conn_q.empty():
                    urls = conn_q.get()
                    url_manager.add_new_urls(urls)
            except BaseException as e:
                time.sleep(0.1)
Example #12
 def url_manager_proc(self, url_q, conn_q, root_url):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             print("url " + new_url)
             url_q.put(new_url)
             # print("old_url=",url_manager.old_url_size())
             if url_manager.old_url_size() > 2000:
                 url_q.put("end")
                 print("控制节点发起结束通知!")
                 url_manager.save_progress("new_urls.txt",
                                           url_manager.new_urls)
                 url_manager.save_progress("old_urls.txt",
                                           url_manager.old_urls)
                 return
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 # print(urls)
                 url_manager.add_new_urls(urls)
         except BaseException:
             time.sleep(0.1)
Example #13
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
Example #14
 def __init__(self):
     self.urlmanager = UrlManager()
Example #15
from URLManager import UrlManager
import pickle
import hashlib

print("has_new_url", UrlManager.has_new_url.__doc__)
print("add_new_url", UrlManager.add_new_url.__doc__)
print("add_new_urls", UrlManager.add_new_urls.__doc__)
print("get_new_url", UrlManager.get_new_url.__doc__)
print("new_url_size", UrlManager.new_url_size.__doc__)
print("old_url_size", UrlManager.old_url_size.__doc__)
print("save_progress", UrlManager.save_progress.__doc__)
print("load_progress", UrlManager.load_progress.__doc__)

urls = set([
    "http://qq.ip138.com/tianqi/", "http://qq.ip138.com/shenfenzheng/",
    "http://qq.ip138.com/huoche/",
    "http://qq.ip138.com/daishoudian/mobile.htm",
    "http://www.miitbeian.gov.cn/"
])
urlmanager = UrlManager()
print(type(urls))
# Hand the urlmanager a new set of URLs
urlmanager.add_new_urls(urls)
print(urlmanager.has_new_url())
# Have the urlmanager hand out one uncrawled URL;
# it is moved to the crawled set as it is handed out
new_url = urlmanager.get_new_url()
# Returns None when there are no uncrawled URLs left
print(new_url)
print(urlmanager.old_url_size())
# Save the progress
urlmanager.save_progress()
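All of the examples here code against the same small UrlManager interface: has_new_url, add_new_url, add_new_urls, get_new_url, new_url_size, old_url_size, save_progress and load_progress, backed by a set of unvisited URLs (new_urls) and a set of visited ones (old_urls). Below is a minimal sketch of a class with that shape; the pickle-based persistence, the default file name, and the zero-argument save_progress signature (several examples instead pass a file name and a set) are assumptions rather than the original implementation.

import pickle


class UrlManager:
    """Minimal sketch: tracks unvisited (new) and visited (old) URLs."""

    def __init__(self, progress_file='progress.pkl'):
        self.progress_file = progress_file
        self.new_urls = self.load_progress() or set()  # URLs still to crawl
        self.old_urls = set()                          # URLs already crawled

    def has_new_url(self):
        """Return True while there are URLs waiting to be crawled."""
        return self.new_url_size() != 0

    def add_new_url(self, url):
        """Add a single URL, skipping anything already queued or crawled."""
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Add a batch of URLs."""
        for url in urls or []:
            self.add_new_url(url)

    def get_new_url(self):
        """Hand out an uncrawled URL and move it to the crawled set;
        return None when nothing is left."""
        if not self.new_urls:
            return None
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def new_url_size(self):
        return len(self.new_urls)

    def old_url_size(self):
        return len(self.old_urls)

    def save_progress(self):
        """Persist the uncrawled set so a later run can resume."""
        with open(self.progress_file, 'wb') as f:
            pickle.dump(self.new_urls, f)

    def load_progress(self):
        """Restore a previously saved uncrawled set, if any."""
        try:
            with open(self.progress_file, 'rb') as f:
                return pickle.load(f)
        except (OSError, EOFError):
            return None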
Example #16
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()  # connects to the database when instantiated
     self.s = Settings().setting
Example #17
 def __init__(self):
     self.manager = UrlManager()  # where I went wrong while practicing: leaving out the () caused an error
     self.downloader = HtmlDownloader()
     self.parser = HtmlParse()
     self.output = DataOutput()
Example #18
 def __init__(self):
     self.urlManager = UrlManager()
     self.downloader = HtmlDownloader()
     self.dataStore = SimpleHtmlDataStore()
     self.parser = BikeParser()