def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            # Get a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to the worker nodes
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            # Stop once 2000 links have been crawled, then save progress
            if url_manager.old_url_size() > 2000:
                # Notify the crawler nodes that work is finished
                url_q.put('end')
                print('Control node issued the end notification!')
                # Shut down the control node and persist the URL sets
                url_manager.save_progress('new_urls.txt',
                                          url_manager.new_urls)
                url_manager.save_progress('old_urls.txt',
                                          url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            urls = conn_q.get()
            url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(0.1)  # Back off briefly
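
Every url_manager_proc variant in these examples leans on the same small UrlManager interface: two URL sets plus helpers for adding, fetching, counting, and saving URLs. The following is a minimal sketch of that interface, inferred only from the calls made in the snippets; the pickle-based save_progress persistence is an assumption, not taken from any of the original projects.

import pickle


class UrlManager:
    """Minimal sketch of the URL manager assumed by these examples."""

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already handed to a crawler node

    def has_new_url(self):
        return len(self.new_urls) != 0

    def add_new_url(self, url):
        # De-duplicate against both sets before queueing
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)

    def save_progress(self, path, data):
        # Persist a URL set so a later run can resume (assumed to use pickle)
        with open(path, 'wb') as f:
            pickle.dump(data, f)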
Example 2
    def url_manager_proc(self, url_q, conn_q, root_url):
        url_manager = UrlManager()
        url_manager.add_new_url(root_url)
        while True:
            while url_manager.has_new_url():
                # Get a new URL from the URL manager
                new_url = url_manager.get_new_url()
                # Send the new URL to the worker nodes
                url_q.put(new_url)
                print "[*]The number of crawled url is: ", url_manager.old_url_size()

                # Stop once enough links have been crawled, then save progress
                if url_manager.old_url_size() > 500:
                    # Notify the crawler nodes to finish by sending the 'end' marker
                    url_q.put('end')
                    print "\n[*]Control node notified the crawler nodes to stop..."
                    # Shut down the control node and persist the URL sets
                    url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                    url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                    return
            # Add the URLs received from result_solve_proc to the URL manager
            try:
                if not conn_q.empty():
                    urls = conn_q.get()
                    url_manager.add_new_urls(urls)
            except BaseException, e:
                # Back off briefly
                time.sleep(5)
Example 3
  def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    n = 0
    while True:

      while url_manager.has_new_url():

        new_url = url_manager.get_new_url()
        url_q.put(new_url)
        print 'old_url =', url_manager.old_url_size()

        if url_manager.old_url_size() > 100:
          url_q.put('end')
          print 'Control node issued the end notification'
          url_manager.save_progress('new_urls.txt', url_manager.new_urls)
          # Use the instance's old_urls set (the original passed the class attribute)
          url_manager.save_progress('old_urls.txt', url_manager.old_urls)
          return
      try:
        if not conn_q.empty():
          urls = conn_q.get()
          url_manager.add_new_urls(urls)
      except BaseException, e:
        time.sleep(0.1)
Example 4
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

Example 5
 def url_manager_proc(self, url_q, conn_q, root_url):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_url():
             # Get a new URL from the URL manager
             new_url = url_manager.get_new_url()
             # Send the new URL to the worker nodes
             url_q.put(new_url)
             print('old_url=', url_manager.old_url_size(), new_url)
             # Stop once 2000 links have been crawled, then save progress
             if url_manager.old_url_size() > 2000:
                 # Notify the crawler nodes that work is finished
                 url_q.put('end')
                 print('Control node issued the end notification!')
                 # Shut down the control node and persist the URL sets
                 url_manager.save_progress('new_urls.txt',
                                           url_manager.new_urls)
                 url_manager.save_progress('old_urls.txt',
                                           url_manager.old_urls)
                 return
         # Add the URLs received from result_solve_proc to the URL manager
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 url_manager.add_new_urls(urls)
         except BaseException:
             pass
Example 6
 def url_manager_proc(self, url_q, conn_q, root_url):
     '''
     The URL manager process hands new URLs taken from the conn_q queue to the
     URL manager; after de-duplication it pulls URLs out and puts them on the
     url_queue queue for the crawler nodes.
     :param url_q:
     :param conn_q:
     :param root_url:
     :return:
     '''
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_url():
             # Get a new URL from the URL manager
             new_url = url_manager.get_new_url()
             # Send the new URL to the crawler nodes
             url_q.put(new_url)
             print 'old_url=', url_manager.old_url_size()
             # Stop once 2000 links have been crawled, then save progress
             if url_manager.old_url_size() > 2000:
                 # Notify the crawler nodes that work is finished
                 url_q.put('end')
                 print 'Control node issued the end notification'
                 # Shut down the control node and persist the URL sets
                 url_manager.save_progress('new_urls.txt',
                                           url_manager.new_urls)
                 url_manager.save_progress('old_urls.txt',
                                           url_manager.old_urls)
                 return
         # Add the URLs received from result_solve_proc to the URL manager
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 url_manager.add_new_urls(urls)
         except BaseException, e:
             time.sleep(1)
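
The docstring above describes the flow these snippets assume: conn_q carries URLs back from the result handler, while url_q feeds the crawler nodes. Below is a minimal sketch of how such a process is typically started; the multiprocessing queues, the NodeManager class name, and the root URL are assumptions for illustration, since none of the snippets show this wiring. A real distributed setup would usually expose the queues through a multiprocessing.managers.BaseManager so that crawler nodes on other machines can reach them.

from multiprocessing import Process, Queue


class NodeManager:
    def url_manager_proc(self, url_q, conn_q, root_url):
        ...  # any of the implementations shown in these examples

    def run(self, root_url):
        url_q = Queue()    # control node -> crawler nodes: URLs to fetch
        conn_q = Queue()   # result handler -> control node: newly parsed URLs
        manager_proc = Process(target=self.url_manager_proc,
                               args=(url_q, conn_q, root_url))
        manager_proc.start()
        manager_proc.join()


if __name__ == '__main__':
    NodeManager().run('http://example.com/')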
Example 7
 def url_manager_proc(self, url_q, conn_q, root_url):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             url_q.put(new_url)
             print('old_url=', url_manager.old_url_size())
             if url_manager.old_url_size() > 10:
                 # Notify the crawler nodes that work is finished
                 url_q.put('end')
                 print('Control node issued the end notification!')
                 # Shut down the control node and persist the URL sets
                 url_manager.save_progress('new_urls.txt',
                                           url_manager.new_urls)
                 url_manager.save_progress('old_urls.txt',
                                           url_manager.old_urls)
                 return
         # Add the URLs received from result_solve_proc to the URL manager
         try:
             if not conn_q.empty():
                 urls = conn_q.get(True)
                 url_manager.add_new_urls(urls)
         except Exception as e:
             time.sleep(0.1)
 def url_manager_proc(self, url_q, conn_q, root_url):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_url():
             # Get a new URL from the URL manager
             new_url = url_manager.get_new_url()
             # Send the new URL to the worker nodes
             url_q.put(new_url)
             print("old_url=", url_manager.old_url_size())
             # Stop once enough URLs have been crawled, then save progress
             if url_manager.old_url_size() > 20:
                 # Notify the crawler nodes that work is finished
                 print("Crawling finished")
                 url_manager.save_progress("new_urls.txt",
                                           url_manager.new_urls)
                 url_manager.save_progress("old_urls.txt",
                                           url_manager.old_urls)
                 return
         # Add the URLs received from result_solve_proc to the URL manager
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 url_manager.add_new_urls(urls)
         except BaseException as e:
             time.sleep(0.1)  # Back off briefly
Example 9
 def url_manager_proc(self, url_q, conn_q, root_url):
     """
     Responsibilities:
         1. take new URLs from conn_q and hand them to the UrlManager
         2. let the UrlManager de-duplicate them
         3. pull URLs out and send them through url_queue to the spider nodes
     """
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_urls():
             # Get an uncrawled URL from the UrlManager
             new_url = url_manager.get_new_url()
             # Send the URL to a worker node
             url_q.put(new_url)
             print 'old_url = ', url_manager.old_url_size()
             # Stop condition
             if url_manager.old_url_size() > 2000:
                 url_q.put('end')
                 print('[!] scheduler send information [END]')
                 # Close the node and store the URL sets
                 url_manager.save_progress('new_urls.txt',
                                           url_manager.new_urls)
                 url_manager.save_progress('old_urls.txt',
                                           url_manager.old_urls)
                 return
         # Take the URLs collected by result_solve_proc
         # and hand them to the UrlManager
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 url_manager.add_new_urls(urls)
         except BaseException, e:
             time.sleep(0.1)
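
Every variant above pushes the string 'end' onto url_q as a shutdown signal, which implies a matching check on the crawler side. None of the snippets here show that consumer, so the following is a minimal sketch of what it might look like; the crawl_proc name, the result_q queue, and the downloader/parser calls are assumptions for illustration.

def crawl_proc(self, url_q, result_q):
    downloader = HtmlDownloader()
    parser = HtmlParser()
    while True:
        url = url_q.get()          # blocks until the control node sends a URL
        if url == 'end':           # sentinel pushed by url_manager_proc
            # Propagate the shutdown signal to the result handler and exit
            result_q.put({'new_urls': 'end', 'data': 'end'})
            return
        content = downloader.download(url)
        new_urls, data = parser.parse(url, content)
        # Hand the extracted links and data back to the control node
        result_q.put({'new_urls': new_urls, 'data': data})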
Example 10
    def __init__(self):
        self.G_STATE_OK = 200
        self.crawMaxNum = -1
        self.crawCountNum = 0

        self.urlManager = UrlManager()
        self.dispatch = Dispatch()
        self.htmlParser = HtmlParser("http://baike.baidu.com")
        self.applicationShow = ApplicationShow()
Example 11
def main():
    idi = 1405150114
    urlmanager = UrlManager()
    pageur = urlmanager.url_login(idi)
    infourl = urlmanager.url_userinfo(idi)
    htmldownloader = HtmlDownloader()
    htmldownloader.download(
        'http://ygcp.njtech.edu.cn/User/LoginInSampleAction.aspx', idi, pageur,
        infourl)
Example 12
 def __init__(self):
     # Create a URL manager
     self.urlManager = UrlManager()
     # Create an HTML downloader
     self.downloader = htmlDownloader()
     # Create an HTML parser
     self.htmlparser = htmlParser()
     # Create an HTML storage component
     self.htmlSave = htmlSave()
Example 13
    def __init__(self, sort, sort_url, sortFilename):
        threading.Thread.__init__(self)
        self.sort = sort
        self.sort_url = sort_url
        self.sortFilename = sortFilename

        self.manager = UrlManager(self.sort)
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
Example 14
    def __init__(self, bind_domain):

        # Object that manages crawl URLs and records which URLs have already been crawled
        self.urlManager = UrlManager(enable_external_link=False,
                                     bind_domain=bind_domain)

        # Object that performs the HTTP requests
        self.downloader = HtmlDownloader()

        # Object that turns HTML source into an lxml.html document and extracts new links
        self.parser = HtmlParser()
Example 15
 def __init__(self, url_argv):
     sys.setrecursionlimit(10000000)
     """ 调度数据库接口, 引入初始化, 调度器, 爬取器, 分析器 """
     self.db = DbManager.DbManager(db_config).mysql_connect()
     self.config = spider_config.spider_config()
     self.initialization = Initialization.Initialization(
         self.db, self.config, url_argv)
     self.manager = UrlManager.UrlManager(self.db, self.config)
     self.craw = UrlCraw.UrlCraw(self.db, self.config)
     self.analyse = UrlAnalyse.UrlAnalyse(self.db, self.config)
     self.sprint = SpiderPrint.SpiderPrint()
     self.initialize_spider()
Example 16
 def url_manager_proc(self, url_q):
     '''
     The URL manager process passes the cities waiting to be crawled in url_q to the crawler nodes.
     :param url_q: channel through which the manager process passes URLs to the crawler nodes
     :return:
     '''
     url_manager = UrlManager()
     while True:
         while url_manager.has_new_url():
             # Get a new URL from the URL manager
             new_url = url_manager.get_new_url()
             # Send the new URL to the worker nodes
             url_q.put(new_url)
         # Notify the crawler nodes to stop working
         url_q.put('end')
         # Shut down the control node and persist the URL sets
         url_manager.save_progress('new_city.txt', url_manager.new_urls)
         url_manager.save_progress('old_city.txt', url_manager.old_urls)
         return
Example 17
    def url_manager_proc(self, url_q, conn_q, root_url):
        '''
        URL manager process
        :param url_q: queue that URLs are put on for the worker nodes
        :param conn_q: queue from which new URLs are taken
        '''
        url_manager = UrlManager()
        url_manager.add_new_url(root_url)

        while True:
            while url_manager.has_new_url():

                # Get a new URL from the URL manager
                new_url = url_manager.get_new_url()

                # Send the new URL to the worker nodes
                url_q.put(new_url)
                print('old_url=', url_manager.old_url_size())

                # Stop once enough links have been crawled, then save progress
                if url_manager.old_url_size() > 100:

                    # Notify the crawler nodes that work is finished
                    url_q.put('end')
                    print 'Control node issued the end notification!'

                    # Shut down the control node and persist the URL sets
                    url_manager.save_progress(
                        r'C:\Users\1\Desktop\python_code\distributionCrawler\ControlNode\new_urls.txt',
                        url_manager.new_urls)
                    url_manager.save_progress(
                        r'C:\Users\1\Desktop\python_code\distributionCrawler\ControlNode\old_urls.txt',
                        url_manager.old_urls)
                    return

            # Add the URLs received from result_solve_proc to the URL manager
            try:
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
            except BaseException, e:
                time.sleep(0.1)  # Back off briefly
 def url_manager_proc(self, url_q, conn_q, root_url):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             url_q.put(new_url)
             print 'old_urls=', url_manager.old_urls_size()
             if url_manager.old_urls_size() > 2000:
                 url_q.put('end')
                 print 'Control node issued the end notification'
                 url_manager.save_process("new_urls.txt",
                                          url_manager.new_urls)
                 url_manager.save_process("old_urls.txt",
                                          url_manager.old_urls)
                 return
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 url_manager.add_new_urls(urls)
         except Exception:
             time.sleep(0.1)
Example 19
    def url_manager_process(self, task_queue):
        '''
        URL manager process
        :param task_queue: queue of tasks handed out to the worker nodes
        :return:
        '''
        sql = 'SELECT id,bname FROM ' + TABLE_NAME + ' WHERE bdoubanlink IS NULL OR bdoubanlink=""'
        url_manager = UrlManager()
        db = MysqlHelper(DATABASE_NAME)

        while True:
            if not url_manager.has_new_url():
                datas = db.select(sql)
                if datas:
                    for data in datas:
                        task_data = str(data[0]) + '$$' + data[1].strip()
                        url_manager.add_new_url(task_data)
                    print('[√]  Data has been read from the database!')
                else:
                    print('[!]  Fetch database null.')
                    exit(-1)

            # Stop condition for the crawler
            if not url_manager.has_new_url():
                # Notify the worker nodes to stop working
                task_queue.put('end')
                print('[·]  Controller sent the "end" command.')
                return

            while task_queue.qsize() < _config.QUEUE_NUM and url_manager.has_new_url():
                # Get a new URL from the URL manager
                new_url = url_manager.get_new_url()
                # Dispatch the URL to the worker nodes
                task_queue.put(new_url)
                print('[+]  >>> %s' % new_url)
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = FileDownLoader()
     self.parser = FileParser()
     self.output = DataOutput()
Example 21
 def __init__(self):
     self.data_store = DataStore()
     self.url_manager = UrlManager(self.data_store)
     self.strategy_container = ParserStrategyContainer()
     self.downloader = HtmlDownloader()
Example 22
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
Example 23
 def __init__(self):
     self.manage = UrlManager()
Example 24
 def __init__(self):
     self.manger = UrlManager()
     self.download = HtmlDownload()
     self.parse = HtmlParse()
     self.outpu = DataOuput()
Example 25
 def __init__(self):
     print 'init'
     self.urlManager = UrlManager()
     self.downloader = Downloader()
     self.praser = HtmlPraser()
     self.outputer = Output()
Example 26
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HttpDownloader()
     self.parser = ContentParser()
Example 27
 def __init__(self):
     self.urlManager = UrlManager()
     self.htmlDownloader = HtmlDownloader()
     self.htmlParser = HtmlParser()
     self.htmlOutput = DataOutput()
Example 28
from UrlManager import UrlManager

objs = {
    "https://www.gutenberg.org/files/1342/1342-0.txt":
    "Pride And Prejudice.txt",
    "https://www.gutenberg.org/files/11/11-0.txt":
    "Alice's Adventures in Wonderland.txt",
    "http://www.gutenberg.org/cache/epub/16328/pg16328.txt": "Beowulf.txt",
    "https://www.gutenberg.org/files/1661/1661-0.txt":
    "The Adventures of Sherlock Holmes.txt",
    "https://www.gutenberg.org/files/1952/1952-0.txt":
    "The Yellow Wallpaper.txt",
    "https://www.gutenberg.org/files/98/98-0.txt": "A Tale of Two Cities.txt",
    "https://www.gutenberg.org/files/2701/2701-0.txt": "Moby Dick.txt",
    "https://www.gutenberg.org/files/84/84-0.txt":
    "Frankenstein; Or, The Modern Prometheus.txt",
    "http://www.gutenberg.org/cache/epub/5200/pg5200.txt": "Metamorphosis.txt",
    "http://www.gutenberg.org/cache/epub/1497/pg1497.txt": "The Republic.txt"
}

manager = UrlManager(objs)
manager.multi_download()

for s in manager.iter():
    print(s)
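
The UrlManager in Example 28 is a different design from the crawl-scheduling managers above: it is built from a URL-to-filename mapping and exposes multi_download() plus an iter() over the results. A minimal sketch of what such a class might look like follows; the thread-pool download, the downloaded list, and yielding saved filenames from iter() are assumptions based only on the calls in the example.

import concurrent.futures
import urllib.request


class UrlManager:
    """Sketch of the mapping-based manager used in Example 28."""

    def __init__(self, url_to_filename):
        self.url_to_filename = url_to_filename
        self.downloaded = []

    def _download_one(self, url, filename):
        # Fetch one URL and write the response body to the given file
        with urllib.request.urlopen(url) as response:
            data = response.read()
        with open(filename, 'wb') as f:
            f.write(data)
        return filename

    def multi_download(self, max_workers=4):
        # Fetch every URL concurrently and remember which files were written
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(self._download_one, url, name)
                       for url, name in self.url_to_filename.items()]
            for future in concurrent.futures.as_completed(futures):
                self.downloaded.append(future.result())

    def iter(self):
        # Yield the saved filenames so callers can report progress
        for name in self.downloaded:
            yield name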
Example 29
 def __init__(self):
     self.downloader = HtmlDownloader()
     self.urlmanager = UrlManager()
Example 30
 def __init__(self):
     self.urlManager = UrlManager()
     self.htmlDownloader = HtmlDownloader()
     self.parser = Parser()
     self.outputer = Outputer()