Example #1
 def crawl_items(self, data):
     '''
     :parameter:
     :data the data handed over by the main program,
     in the form {'title':xxxx,'url':[xxxx,xxxx,xxxx]}
     :return: None
     '''
     manager = UrlManager()
     # Get the article title
     title = data.get('title')
     # Deduplicate
     if manager.remove_duplication(title):
         manager.add_new_urls(data.get('url'))
         # Download the image files
         while (manager.has_new_url()):
             print('Download started ==>', title)
             image_urls = manager.get_new_urls()
             # Use the running index to name the files
             for index, url in enumerate(image_urls):
                 print('Downloading ==> image %s' % (index + 1))
                 data = self.downloader.download(url)
                 self.output.save_2_binary(title, index + 1, data)
         # All downloads finished, add the dedup flag
         if not manager.has_new_url():
             manager.add_duplication(title)
             print('Download finished ==>')
     else:
         print('Duplicate | nothing to download ==>', title)
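The docstring above pins down the shape of the data dict that crawl_items expects. A hedged call sketch with placeholder values follows; the spider instance and the URLs are illustrative, not taken from the original project.

# Hypothetical invocation matching the documented dict shape
spider.crawl_items({
    'title': 'sample-gallery',
    'url': ['http://example.com/1.jpg', 'http://example.com/2.jpg'],
})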
Example #2
class EySpider(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def urlsCrawl(self, root_url):
        # Mainly used to collect links
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs; the number of URLs to crawl can also be capped here
        # self.manager.old_url_size() < ***
        while (self.manager.has_new_url()):
            try:
                # Take an uncrawled link from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract the new URLs
                urls = self.parser.urlsparser(html)
                self.manager.add_new_urls_to_old(new_url)
            except Exception:
                print("Failed to crawl the link")

    def keywordsCrawl(self):
        while (self.manager.has_new_url()):
            try:
                # Take an uncrawled link from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract the keywords
                keywords = self.parser.Parser(html)
                self.manager.add_new_urls_to_old(new_url)
            except Exception:
                print("Failed to crawl keywords")
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()  # connects to the database on instantiation
        self.s = Settings().setting

    def crawl(self):
        self.output.create_table()  # create the table
        total_page = self.s["Index"][1] - self.s["Index"][0]
        total_data = total_page * self.s["Page"]
        total_errors = 0
        total_duplicates = 0
        old_total = self.output.get_total()

        for Index in range(self.s["Index"][0], self.s["Index"][1]):
            duplicates = self.manager.add_urls(Index, self.output)
            urls = self.manager.get_urls()
            bar = pyprind.ProgBar(self.s["Page"] - duplicates,
                                  title="Crawling " + "Page " + str(Index) +
                                  " ......")  # progress bar
            for url in urls:
                try:
                    bar.update()
                    html = self.downloader.download(url)
                    data = self.parser.parse(html)
                    self.output.insert_into_db(data)  # insert into the database
                except Exception:
                    continue
        new_total = self.output.get_total()
        self.output.close_cursor()  # close the database connection

        print("Crawled", new_total - old_total, "records this run")
Example #4
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()  # connects to the database on instantiation

    def crawl(self):
        self.output.create_table()      # create the table
        self.manager.add_new_urls()     # build the URLs
        total = self.manager.new_urls_size()
        bar = pyprind.ProgBar(30, title="Crawling......")   # progress bar
        while (self.manager.new_urls_size()):
            url = self.manager.get_new_url()
            html = self.downloader.download(url)
            data = self.parser.parse(html)
            errors, errors_messages = self.output.insert_into_db(
                data)        # insert into the database
            bar.update()
            '''
            sys.stdout.write(
                str(self.manager.old_urls_size() / total * 100) + "%")
            sys.stdout.flush()
            # print('Crawled', self.manager.old_urls_size(), 'records.')
            '''
        self.output.close_cursor()  # close the database connection
        print("Crawled", total, "records in total this run")
        if errors:
            print("Of these,", errors, "records had errors")
            print("Errors: " + str(errors_messages))
Example #5
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
Example #6
class Spider(object):
    def __init__(self):
        self.urlManager = UrlManager()
        self.downloader = HtmlDownloader()
        self.dataStore = SimpleHtmlDataStore()
        self.parser = BikeParser()

    def start(self, url):

        if self.urlManager.has_new_url():
            url = self.urlManager.get_new_url()

        for _ in range(20):
            urls = []
            data = {}
            print('now crawling: %s' % url)
            cont = self.downloader.download(url)
            urls, data = self.parser.parser(url, cont)
            if urls is not None:
                self.urlManager.add_new_urls(urls)
            print('have %d urls to go' % self.urlManager.new_url_size())
            if data is not None:
                self.dataStore.store_data(data)
            if self.urlManager.has_new_url():
                url = self.urlManager.get_new_url()
            time.sleep(1)
        self.urlManager.save()
Example #7
class SpiderMan(object):

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been fetched
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print(self.manager.old_url_size())
                print(data)
            except Exception as e:
                print('crawl failed')

        self.output.output_question()
        self.output.output_answer()
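The listings on this page stop at the class definition; how the scheduler is started is not shown. A minimal driver sketch for a crawl(root_url)-style scheduler like the one above would be something along these lines, with the entry URL as a placeholder.

if __name__ == '__main__':
    spider = SpiderMan()
    # Placeholder entry URL; each example on this page targets a different site.
    spider.crawl('http://example.com/start-page')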
Example #8
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 200):
            try:
                # Get a URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Put the extracted URLs into the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print " Extracted %s links so far" % self.manager.old_url_size()
            except Exception, e:
                print "crawl failed", e

        self.output.output_html()
Example #9
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                new_th, new_td, new_urls = self.parser.parser(new_url, html, "th", "时间", "td")
                # Add the extracted URLs to the URL manager
                if new_urls != "meiyou":
                    self.manager.add_new_urls(new_urls)
                    print "Crawled %s links so far" % self.manager.old_url_size()
                # Store the data with the data store
                if new_th != "meiyou" and new_td != "meiyou":
                    self.output.store_data(new_th, new_td)
                    self.output.output_html()
            except Exception as e:
                print "Crawl failed!"
Example #10
class SpiderMan(object):

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        while(self.manager.has_new_url() and
              self.manager.old_urls_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                # print(new_url, '.......')
                html = self.downloader.download(new_url)
                # print(html)
                new_urls, data = self.parser.parse(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_urls_size())
            except Exception as e:
                print(e)
                # print('crawl failed')
        self.output.output_html()
Example #11
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                # self.manager.add_new_url(new_urls) raises "unhashable type",
                # because an iterable collection cannot be hashed, so add them one by one
                for new_url in new_urls:
                    self.manager.add_new_url(new_url)
                # Store the data to a file
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed with ' + str(e))
Example #12
class SpiderMan(object):
    '''Spider scheduler
    Attributes:
        manager: URL manager
        downloader: HTML downloader
        parser: HTML parser
        output: data store
    '''
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        '''Spider scheduling function
        Args:
            root_url: entry URL for the spider
        Raises:
            Exception: 'NoneType' object has no attribute
        '''
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('Crawl failed: %s' % e)
        self.output.output_html()
Example #13
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # print(html)
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data store
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
Example #14
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)

        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 50):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                if html is None:
                    print('failed to get pages')
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has scraped %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
        self.output.output_html()
Example #15
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data store
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
                #print(new_url)
            except Exception as e:
                print("crawl failed")
            # The data store writes the file out in the specified format
        self.output.output_html()
Example #16
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data to a file
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # The data store writes the file out in the specified format
        self.output.output_html()
class Url_info_Output(object):
    def __init__(self):
        self.urlmanager = UrlManager()

    def output_url_info(self, initial_url):
        patents_divisor = self.urlmanager.get_all_patents_num(initial_url)[0]
        patents_remainder = self.urlmanager.get_all_patents_num(initial_url)[1]
        items = [str(i) for i in range(1, patents_divisor + 2)]
        htt = 'http://appft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p='
        laa = '&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=0&f=S&l=50&TERM1=seal structure&FIELD1=&co1=AND&TERM2=&FIELD2=&d=PG01'
        urls = [htt + items[i] + laa for i in range(patents_divisor + 1)]
        pool = threadpool.ThreadPool(8)
        tasks = threadpool.makeRequests(self.urlmanager.get_all_item_info,
                                        urls)
        [pool.putRequest(task) for task in tasks]
        pool.wait()
Example #18
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlOneTag(self, book_tag):
        page_num = 0
        book_list = []
        while page_num <= 2:
            try:
                new_url = self.manager.get_new_url(page_num, book_tag)
                html = self.downloader.download(new_url)
                book_list += self.parser.parser(html)
            except Exception as e:
                print("crawl failed")
            page_num += 1
        return book_list

    def crawlAllTags(self, book_tag_lists, topath):
        book_lists = []
        for book_tag in book_tag_lists:
            book_list = self.crawlOneTag(book_tag)
            book_list = sorted(book_list, key=lambda x: x[1], reverse=True)
            book_lists.append(book_list)
        self.output.output(book_lists, book_tag_lists, topath)
Example #19
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.dataoutput = DataOutput()
     self.mongoengine = Use_MongoEngine()
     self.urloutput = Url_info_Output()
Example #20
 def url_manager_proc(self, task_queue, url_queue, root_url):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         if url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             print('url: %s put into the task queue' % new_url)
             task_queue.put(new_url)
         if not url_queue.empty():
             next_url = url_queue.get()
             url_manager.add_new_url(next_url)
Example #21
 def crawl_image(self, start_url, total_page, __page=2):
     '''
     Crawl the galleries from the Fengniao "masters" and "techniques" sections
     :parameter:
     :start_url URL of the article to download
     :total_page number of pages to download
     :__page starting value for the extra-page counter; users should not set it
     :return: None
     '''
     manager = UrlManager()
     # Add the entry URL
     if 'image' in start_url or 'academy' in start_url:
         manager.add_new_url(start_url)
         # Check whether the URL manager has new URLs
         while (manager.has_new_url()):
             try:
                 # Get a new URL from the URL manager
                 new_url = manager.get_new_url()
                 # Download the page with the HTML downloader
                 html = self.downloader.download(new_url)
                 # Use a keyword to decide whether this is a second-level page
                 if 'slide' in new_url:
                     # Extract the second-level page data with the HTML parser
                     data = self.parser.parse_data(html)
                     self.crawl_items(data)
                 else:
                     # Extract the first-level page data with the HTML parser
                     data = self.parser.parse_urls(html)
                     manager.add_new_urls(data)
             except Exception as e:
                 print('Crawl failed ==>', e)
         # Crawl the remaining pages
         if __page <= total_page:
             if 'image' in start_url:
                 next_url = '%s/index.php?action=getList&class_id=192&sub_classid=0&page=%s&not_in_id=' % (
                     start_url, str(__page))
             elif 'academy' in start_url:
                 next_url = '%s/index.php?action=getList&class_id=190&sub_classid=0&page=%s&not_in_id=' % (
                     start_url, str(__page))
             print('Starting to crawl ==> page', str(__page))
             return self.crawl_image(next_url, total_page, __page + 1)
     else:
         print('The URL is wrong, please check it')
Example #22
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print "already fetched %s urls" % self.manager.old_url_size()
            except Exception, e:
                print "crawl failed"
        self.output.output_html()
Example #23
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)  # add the first link to the unprocessed list
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (
                self.manager.has_new_url()
                and self.manager.old_url_size() < 100
        ):  # The links scraped from the first page already use up 70+ of the quota, so links collected later in the loop are never used. With n iterations and m links per page the total content is n*m; this needs optimizing: after scraping all links from one page, crawl the content of those links in an inner loop before moving on, i.e. a nested loop is needed
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                print(new_url)
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)  # download the HTML of the whole listing
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(
                    new_url, html)  # parse each HTML page, collecting all of its links plus one piece of content
                print(new_urls)
                print(
                    len(new_urls)
                )  # every parsed page yields many URLs, all inserted into the unprocessed set, but the loop only runs 100 times, so URLs found later are never used
                print(data)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)  # insert the new URL set into the unprocessed URLs
                # Store the data with the data store
                self.output.store_data(data)  # append data to the output file

                print("Crawled %s links so far" % self.manager.old_url_size())

            except Exception as e:
                print(e)
                print("crawl failed")
            # The data store writes the file out in the specified format
        self.output.output_html()
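The inline comments in this example point out that the 100-URL budget is mostly consumed by links scraped from the first page, and suggest a two-level loop instead: collect the links of one listing page first, then crawl those links before moving on. Below is a method-level sketch of that restructuring, under the same manager/downloader/parser assumptions as the class above; it illustrates the comment's idea and is not code from the original project.

    def crawl_two_level(self, root_url):
        # Level 1: download the listing page and collect its links.
        html = self.downloader.download(root_url)
        page_urls, _ = self.parser.parser(root_url, html)
        # Level 2: crawl every collected link before moving to the next listing page.
        for url in page_urls:
            try:
                page_html = self.downloader.download(url)
                _, data = self.parser.parser(url, page_html)
                self.output.store_data(data)
            except Exception as e:
                print(e)
        self.output.output_html()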
Example #24
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawler(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("已抓取{0}个链接".format(self.manager.old_url_size()))
            except Exception as e:
                print("crawler failed {0}".format(e))
        self.output.output_html()
Example #25
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)

        n = 0
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:

                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()

                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)

                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                if n == 0:
                    # Add the extracted URLs to the URL manager
                    self.manager.add_new_urls(new_urls)

                # Store the data with the data store
                self.output.store_data(data)
                n += 1

                print('Crawled %s links so far' % self.manager.old_url_size())

            except Exception as e:
                print(e)
        # The data store writes the file out in the specified format
        self.output.output_html(self.output.filepath)
        self.output.output_end(self.output.filepath)
Example #26
class SpiderMan(object):

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = Dataoutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print "Crawled %s links so far" % self.manager.old_url_size()
            except Exception as e:
                # print 'crawl failed'
                print e
        self.output.output_html()
Example #27
 def __init__(self):
     self.manager = UrlManager()  # where I went wrong while practising: the () was missing, which raised an error
     self.downloader = HtmlDownloader()
     self.parser = HtmlParse()
     self.output = DataOutput()
 def __init__(self):
     self.urlmanager = UrlManager()
Example #29
def url_manager_proc(url_q, conn_q, root_url, num=6):
    """
    :param url_q:里面放的是url集合单个url
    :param conn_q:里面放的是url集合
    :param root_url:
    :param num:
    :return:
    """
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:

        while url_manager.has_new_url():
            print("# url_manager_proc将要爬取的url放入url_q中")
            new_url = url_manager.get_new_url()
            print(new_url)
            url_q.put(new_url)
            if url_manager.old_url_size() > num:
                # 通知爬行节点工作结束
                url_q.put('end')
                print('控制节点发起结束通知!')
                # 关闭管理节点,同时存储 set 状态
                url_manager.save_progress()
                break
        try:
            if not conn_q.empty():
                print("# url_manager_proc从conn_q中拿取urls")
                urls = conn_q.get()
                print(urls)
                url_manager.add_new_urls(urls)
            else:
                # 延时休息
                time.sleep(0.1)
        except Exception as e:
            print(e)
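Examples #20 and #29 only show the URL-manager side of a multi-process crawler; the queues and the crawler process on the other end are not part of the listing. Below is one minimal way the function from Example #29 could be wired up with multiprocessing. The crawler_proc body, the queue names, and the entry URL are illustrative assumptions, not code from the original project.

import multiprocessing
import time

def crawler_proc(url_q, conn_q):
    # Hypothetical worker: takes URLs from url_q, pretends to crawl them,
    # and reports any newly discovered URLs back through conn_q.
    while True:
        url = url_q.get()
        if url == 'end':              # sentinel sent by url_manager_proc
            break
        print('crawling %s' % url)
        conn_q.put(set())             # a real worker would put parsed URLs here
        time.sleep(0.1)

if __name__ == '__main__':
    url_q = multiprocessing.Queue()   # manager -> workers
    conn_q = multiprocessing.Queue()  # workers -> manager
    root_url = 'http://example.com/'  # placeholder entry URL
    manager = multiprocessing.Process(target=url_manager_proc,
                                      args=(url_q, conn_q, root_url, 6))
    worker = multiprocessing.Process(target=crawler_proc, args=(url_q, conn_q))
    manager.start()
    worker.start()
    worker.join()
    manager.terminate()               # the manager loops forever, so stop it explicitly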
Example #30
from URLManager import UrlManager
import pickle
import hashlib

print("has_new_url", UrlManager.has_new_url.__doc__)
print("add_new_url", UrlManager.add_new_url.__doc__)
print("add_new_urls", UrlManager.add_new_urls.__doc__)
print("get_new_url", UrlManager.get_new_url.__doc__)
print("new_url_size", UrlManager.new_url_size.__doc__)
print("old_url_size", UrlManager.old_url_size.__doc__)
print("save_progress", UrlManager.save_progress.__doc__)
print("load_progress", UrlManager.load_progress.__doc__)

urls = set([
    "http://qq.ip138.com/tianqi/", "http://qq.ip138.com/shenfenzheng/",
    "http://qq.ip138.com/huoche/",
    "http://qq.ip138.com/daishoudian/mobile.htm",
    "http://www.miitbeian.gov.cn/"
])
urlmanager = UrlManager()
print(type(urls))
# urlmanager takes in the new set of URLs
urlmanager.add_new_urls(urls)
print(urlmanager.has_new_url())
# urlmanager hands out one uncrawled URL
new_url = urlmanager.get_new_url()  # taking it out also moves it into the crawled URL set
# returns None when there are no uncrawled URLs left
print(new_url)
print(urlmanager.old_url_size())
# Save progress
urlmanager.save_progress()
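Every example on this page assumes a UrlManager with roughly the interface exercised by the script above (has_new_url, add_new_url, add_new_urls, get_new_url, new_url_size, old_url_size, save_progress, load_progress). The actual implementations are not included in this listing; the sketch below is one common two-set version, and the pickle-based persistence and file names are assumptions rather than the behaviour of any particular example.

import pickle

class UrlManager(object):
    """Minimal sketch of the URL manager interface assumed by the examples above."""

    def __init__(self):
        self.new_urls = self.load_progress('new_urls.pkl') or set()  # not yet crawled
        self.old_urls = self.load_progress('old_urls.pkl') or set()  # already crawled

    def has_new_url(self):
        """Return True if there are uncrawled URLs left."""
        return self.new_url_size() != 0

    def add_new_url(self, url):
        """Add a single URL if it has not been seen before."""
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Add a collection of URLs."""
        for url in urls or []:
            self.add_new_url(url)

    def get_new_url(self):
        """Pop an uncrawled URL and mark it as crawled; None if empty."""
        if not self.has_new_url():
            return None
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def new_url_size(self):
        """Number of uncrawled URLs."""
        return len(self.new_urls)

    def old_url_size(self):
        """Number of crawled URLs."""
        return len(self.old_urls)

    def save_progress(self):
        """Persist both sets (illustrative pickle-based persistence)."""
        with open('new_urls.pkl', 'wb') as f:
            pickle.dump(self.new_urls, f)
        with open('old_urls.pkl', 'wb') as f:
            pickle.dump(self.old_urls, f)

    def load_progress(self, path):
        """Load a previously saved set; None if the file does not exist."""
        try:
            with open(path, 'rb') as f:
                return pickle.load(f)
        except (IOError, OSError):
            return None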
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()  # connects to the database on instantiation
     self.s = Settings().setting
Example #32
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()