def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            # Fetch a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to a worker node
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            # Stop once 2000 links have been crawled, saving progress first
            if url_manager.old_url_size() > 2000:
                # Notify the crawler nodes that the job is finished
                url_q.put('end')
                print('Control node sent the shutdown notice!')
                # Shut down the manager node and persist both URL sets
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():  # avoid blocking on an empty queue
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(0.1)  # back off briefly
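For reference, a minimal sketch of the UrlManager interface these url_manager_proc variants assume; the method names come from the calls in the snippets, while the set-backed storage and pickle persistence are assumptions, not taken from any one repo.

import pickle

class UrlManager:
    def __init__(self):
        # Uncrawled and crawled URL sets, restored from disk if present
        self.new_urls = self.load_progress('new_urls.txt')
        self.old_urls = self.load_progress('old_urls.txt')

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        # Deduplicate against both sets before accepting a URL
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def old_url_size(self):
        return len(self.old_urls)

    def save_progress(self, path, data):
        # Persist a URL set so a crawl can be resumed later
        with open(path, 'wb') as f:
            pickle.dump(data, f)

    def load_progress(self, path):
        try:
            with open(path, 'rb') as f:
                return pickle.load(f)
        except (OSError, EOFError):
            return set()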
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            # Fetch a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to a worker node
            url_q.put(new_url)
            print("[*]The number of crawled urls is:", url_manager.old_url_size())
            # Stop once 500 links have been crawled, saving progress first
            if url_manager.old_url_size() > 500:
                # Notify the crawler nodes by sending the 'end' marker
                url_q.put('end')
                print("\n[*]Control node told the crawler nodes to stop...")
                # Shut down the manager node and persist both URL sets
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(5)  # back off before retrying
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_url =', url_manager.old_url_size())
            if url_manager.old_url_size() > 100:
                url_q.put('end')
                print('Control node sent the shutdown notice')
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(0.1)
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            # Fetch a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to a worker node
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size(), new_url)
            # Stop once 2000 links have been crawled, saving progress first
            if url_manager.old_url_size() > 2000:
                # Notify the crawler nodes that the job is finished
                url_q.put('end')
                print('Control node sent the shutdown notice!')
                # Shut down the manager node and persist both URL sets
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            pass
def url_manager_proc(self, url_q, conn_q, root_url):
    '''
    The URL-manager process feeds new URLs from the conn_q queue into the
    URL manager; after deduplication it takes URLs out and puts them on
    the url_q queue for the crawler nodes
    :param url_q: queue of URLs for the crawler nodes
    :param conn_q: queue of URLs coming back from result_solve_proc
    :param root_url: the seed URL
    :return:
    '''
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            # Fetch a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to a crawler node
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            # Stop once 2000 links have been crawled, saving progress first
            if url_manager.old_url_size() > 2000:
                # Notify the crawler nodes that the job is finished
                url_q.put('end')
                print('Control node sent the shutdown notice')
                # Shut down the manager node and persist both URL sets
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(1)
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            if url_manager.old_url_size() > 10:
                # Notify the crawler nodes that the job is finished
                url_q.put('end')
                print('Control node sent the shutdown notice!')
                # Shut down the manager node and persist both URL sets
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get(True)
                url_manager.add_new_urls(urls)
        except Exception:
            time.sleep(0.1)
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            # Fetch a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to a worker node
            url_q.put(new_url)
            print("old_url=", url_manager.old_url_size())
            # Stop once 20 URLs have been crawled, saving progress first
            if url_manager.old_url_size() > 20:
                # Notify the crawler nodes that the job is finished
                url_q.put('end')
                print("Crawl finished")
                url_manager.save_progress("new_urls.txt", url_manager.new_urls)
                url_manager.save_progress("old_urls.txt", url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(0.1)  # back off briefly
def url_manager_proc(self, url_q, conn_q, root_url): """ functions: 1. get new url into conn_q and give to UrlManager 2. UrlManager process dereplication 3. pull url out and send to url_queue to spider node """ url_manager = UrlManager() url_manager.add_new_url(root_url) while True: while (url_manager.has_new_urls()): # get a uncrawled url in UrlManager new_url = url_manager.get_new_url() # send url to work node url_q.put(new_url) print 'old_url = ', url_manager.old_url_size() # set conditions if (url_manager.old_url_size() > 2000): url_q.put('end') print('[!] scheduler send information [END]') # close node and store set() url_manager.save_progress('new_urls.txt', url_manager.new_urls) url_manager.save_progress('old_urls.txt', url_manager.old_urls) return # get url into result_solve_proc # and send it into UrlManager try: if not conn_q.empty(): urls = conn_q.get() url_manager.add_new_urls(urls) except BaseException, e: time.sleep(0.1)
def __init__(self):
    self.G_STATE_OK = 200
    self.crawMaxNum = -1
    self.crawCountNum = 0
    self.urlManager = UrlManager()
    self.dispatch = Dispatch()
    self.htmlParser = HtmlParser("http://baike.baidu.com")
    self.applicationShow = ApplicationShow()
def main():
    idi = 1405150114
    urlmanager = UrlManager()
    pageur = urlmanager.url_login(idi)
    infourl = urlmanager.url_userinfo(idi)
    htmldownloader = HtmlDownloader()
    htmldownloader.download(
        'http://ygcp.njtech.edu.cn/User/LoginInSampleAction.aspx',
        idi, pageur, infourl)
def __init__(self):
    # Create a URL manager
    self.urlManager = UrlManager()
    # Create an HTML downloader
    self.downloader = htmlDownloader()
    # Create an HTML parser
    self.htmlparser = htmlParser()
    # Create an HTML store
    self.htmlSave = htmlSave()
def __init__(self, sort, sort_url, sortFilename):
    threading.Thread.__init__(self)
    self.sort = sort
    self.sort_url = sort_url
    self.sortFilename = sortFilename
    self.manager = UrlManager(self.sort)
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
def __init__(self, bind_domain):
    # Create the object that manages crawl URLs and records the ones already visited
    self.urlManager = UrlManager(enable_external_link=False, bind_domain=bind_domain)
    # Create the object that requests the links
    self.downloader = HtmlDownloader()
    # Create the object that converts HTML source into an lxml.html object and extracts new links
    self.parser = HtmlParser()
def __init__(self, url_argv):
    """
    Wire up the database interface and bring in the initializer,
    scheduler, crawler, and analyzer.
    """
    sys.setrecursionlimit(10000000)
    self.db = DbManager.DbManager(db_config).mysql_connect()
    self.config = spider_config.spider_config()
    self.initialization = Initialization.Initialization(
        self.db, self.config, url_argv)
    self.manager = UrlManager.UrlManager(self.db, self.config)
    self.craw = UrlCraw.UrlCraw(self.db, self.config)
    self.analyse = UrlAnalyse.UrlAnalyse(self.db, self.config)
    self.sprint = SpiderPrint.SpiderPrint()
    self.initialize_spider()
def url_manager_proc(self, url_q):
    '''
    The URL-manager process hands the pending cities in url_q to the crawler nodes
    :param url_q: channel the manager process uses to pass urls to the crawler nodes
    :return:
    '''
    url_manager = UrlManager()
    while True:
        while url_manager.has_new_url():
            # Fetch a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to a worker node
            url_q.put(new_url)
        # Notify the crawler nodes to stop working
        url_q.put('end')
        # Shut down the manager node and persist both URL sets
        url_manager.save_progress('new_city.txt', url_manager.new_urls)
        url_manager.save_progress('old_city.txt', url_manager.old_urls)
        return
def url_manager_proc(self, url_q, conn_q, root_url):
    '''
    URL-manager process
    :param url_q: queue this process puts urls on
    :param conn_q: queue this process gets new urls from
    '''
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            # Fetch a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to a worker node
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            # Stop once 100 links have been crawled, saving progress first
            if url_manager.old_url_size() > 100:
                # Notify the crawler nodes that the job is finished
                url_q.put('end')
                print('Control node sent the shutdown notice!')
                # Shut down the manager node and persist both URL sets
                url_manager.save_progress(
                    r'C:\Users\1\Desktop\python_code\distributionCrawler\ControlNode\new_urls.txt',
                    url_manager.new_urls)
                url_manager.save_progress(
                    r'C:\Users\1\Desktop\python_code\distributionCrawler\ControlNode\old_urls.txt',
                    url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():  # avoid blocking on an empty queue
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(0.1)  # back off briefly
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_urls=', url_manager.old_urls_size())
            if url_manager.old_urls_size() > 2000:
                url_q.put('end')
                print('Control node sent the shutdown notice')
                url_manager.save_process("new_urls.txt", url_manager.new_urls)
                url_manager.save_process("old_urls.txt", url_manager.old_urls)
                return
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except Exception:
            time.sleep(0.1)
def url_manager_process(self, task_queue):
    '''
    URL-manager process: reads books without a Douban link from the
    database and dispatches them as tasks to the worker nodes
    :param task_queue: queue of tasks for the worker nodes
    :return:
    '''
    sql = 'SELECT id,bname FROM ' + TABLE_NAME + ' WHERE bdoubanlink IS NULL OR bdoubanlink=""'
    url_manager = UrlManager()
    db = MysqlHelper(DATABASE_NAME)
    while True:
        if not url_manager.has_new_url():
            datas = db.select(sql)
            if datas:
                for data in datas:
                    task_data = str(data[0]) + '$$' + data[1].strip()
                    url_manager.add_new_url(task_data)
                print('[√] Data has been read from database!')
            else:
                print('[!] Fetch database null.')
                exit(-1)
        # Stop condition for the crawl
        if not url_manager.has_new_url():
            # Notify the worker nodes to stop working
            task_queue.put('end')
            print('[·] Controller sent "end" command.')
            return
        while task_queue.qsize() < _config.QUEUE_NUM and url_manager.has_new_url():
            # Fetch a new url from the URL manager
            new_url = url_manager.get_new_url()
            # Dispatch the url to a worker
            task_queue.put(new_url)
            print('[+] >>> %s' % new_url)
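A matching sketch of the worker side for this database-driven variant: tasks arrive as 'id$$name' strings and 'end' is the stop sentinel; the lookup and write-back step is a placeholder assumption.

def spider_process(task_queue):
    # Hypothetical worker loop consuming tasks produced by url_manager_process
    while True:
        task = task_queue.get()
        if task == 'end':  # stop sentinel from the control node
            print('[·] Worker received "end" command.')
            break
        book_id, book_name = task.split('$$', 1)
        # ... look up book_name and write its bdoubanlink back for book_id ...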
def __init__(self):
    self.manager = UrlManager()
    self.downloader = FileDownLoader()
    self.parser = FileParser()
    self.output = DataOutput()
def __init__(self):
    self.data_store = DataStore()
    self.url_manager = UrlManager(self.data_store)
    self.strategy_container = ParserStrategyContainer()
    self.downloader = HtmlDownloader()
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
def __init__(self):
    self.manage = UrlManager()
def __init__(self):
    self.manger = UrlManager()
    self.download = HtmlDownload()
    self.parse = HtmlParse()
    self.outpu = DataOuput()
def __init__(self):
    print('init')
    self.urlManager = UrlManager()
    self.downloader = Downloader()
    self.praser = HtmlPraser()
    self.outputer = Output()
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HttpDownloader()
    self.parser = ContentParser()
def __init__(self):
    self.urlManager = UrlManager()
    self.htmlDownloader = HtmlDownloader()
    self.htmlParser = HtmlParser()
    self.htmlOutput = DataOutput()
from UrlManager import UrlManager

objs = {
    "https://www.gutenberg.org/files/1342/1342-0.txt": "Pride And Prejudice.txt",
    "https://www.gutenberg.org/files/11/11-0.txt": "Alice's Adventures in Wonderland.txt",
    "http://www.gutenberg.org/cache/epub/16328/pg16328.txt": "Beowulf.txt",
    "https://www.gutenberg.org/files/1661/1661-0.txt": "The Adventures of Sherlock Holmes.txt",
    "https://www.gutenberg.org/files/1952/1952-0.txt": "The Yellow Wallpaper.txt",
    "https://www.gutenberg.org/files/98/98-0.txt": "A Tale of Two Cities.txt",
    "https://www.gutenberg.org/files/2701/2701-0.txt": "Moby Dick.txt",
    "https://www.gutenberg.org/files/84/84-0.txt": "Frankenstein; Or, The Modern Prometheus.txt",
    "http://www.gutenberg.org/cache/epub/5200/pg5200.txt": "Metamorphosis.txt",
    "http://www.gutenberg.org/cache/epub/1497/pg1497.txt": "The Republic.txt"
}

manager = UrlManager(objs)
manager.multi_download()
for s in manager.iter():
    print(s)
def __init__(self):
    self.downloader = HtmlDownloader()
    self.urlmanager = UrlManager()
def __init__(self):
    self.urlManager = UrlManager()
    self.htmlDownloader = HtmlDownloader()
    self.parser = Parser()
    self.outputer = Outputer()