Example 1
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # print(html)
                # Extract new links and page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print("Crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
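The schedulers in these examples only coordinate four collaborators (UrlManager, HtmlDownloader, HtmlParser, DataOutput) that the snippets never show. The following is a minimal, hypothetical sketch of the interfaces Example 1 relies on, using requests and BeautifulSoup; the real classes in each project differ in their deduplication rules, request headers, parsing logic, and output format.

# Hypothetical stand-ins for the collaborators used by SpiderMan above.
# They only illustrate the expected interface, not any project's real code.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


class UrlManager(object):
    def __init__(self):
        self.new_urls, self.old_urls = set(), set()

    def add_new_url(self, url):
        # Ignore duplicates and URLs that were already crawled
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)


class HtmlDownloader(object):
    def download(self, url):
        response = requests.get(url, timeout=10)
        return response.text if response.status_code == 200 else None


class HtmlParser(object):
    def parser(self, page_url, html):
        soup = BeautifulSoup(html, 'html.parser')
        new_urls = {urljoin(page_url, a['href']) for a in soup.find_all('a', href=True)}
        data = {'url': page_url, 'title': soup.title.get_text() if soup.title else ''}
        return new_urls, data


class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        self.datas.append(data)

    def output_html(self):
        with open('output.html', 'w', encoding='utf-8') as f:
            for data in self.datas:
                f.write('<p>%s - %s</p>\n' % (data['url'], data['title']))

With stand-ins like these, SpiderMan().crawl('https://example.com') runs end to end; the later examples assume richer versions of the same four roles.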
Example 2
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)

        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 50):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                if html is None:
                    print('failed to get page')
                    continue
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has scraped %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
        self.output.output_html()
Example 3
class SpiderWorker():
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_addr = '127.0.0.1'
        print('connect to %s...' % server_addr)

        self.m = BaseManager(address=(server_addr, 8001), authkey=b'qiye')

        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.htmlparser = HtmlParser()
        self.dataoutput = DataOutput()

    def crawl(self):

        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        return None
                    print('Parsing %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.htmlparser.parser(url, content)
                    self.result.put({'new_urls': new_urls})
                    self.dataoutput.output_mongo({'data': data})
            except Exception as e:
                print(e)
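The worker above is only one half of a distributed setup: it connects to a control node that owns the task and result queues registered under 'get_task_queue' and 'get_result_queue'. The sketch below shows what such a control node might look like; the port 8001 and authkey b'qiye' mirror this example, while the seed URL and shutdown handling are assumptions rather than the original author's code.

# Hypothetical control-node sketch for the worker above: it serves the two
# queues the worker registers for, seeds one URL, and then signals shutdown.
from multiprocessing.managers import BaseManager
from queue import Queue

task_queue = Queue()
result_queue = Queue()


def get_task_queue():
    return task_queue


def get_result_queue():
    return result_queue


BaseManager.register('get_task_queue', callable=get_task_queue)
BaseManager.register('get_result_queue', callable=get_result_queue)

if __name__ == '__main__':
    manager = BaseManager(address=('127.0.0.1', 8001), authkey=b'qiye')
    manager.start()
    task = manager.get_task_queue()
    result = manager.get_result_queue()

    task.put('https://example.com')  # assumed seed URL
    print(result.get())              # block until a worker returns something

    task.put('end')                  # tell the workers to stop
    manager.shutdown()

Examples 17 through 20 and 27 through 31 follow the same pattern with different addresses and authkeys, so the server side has to be adjusted to match each one.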
Example 4
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def start(self, url, numMax=50):
        self.manager.addUrl(url)
        num = 0
        errorsNum = 0
        while self.manager.sizeofNew() != 0 and num < numMax:
            try:
                num = num + 1
                url = self.manager.getUrl()
                print('%d\n %s' % (num, url))
                html = self.downloader.download(url)
                newUrls, data = self.parser.parser(url, html)
                self.output.addData(data)
                if self.manager.sizeofNew() + self.manager.sizeofOld() < numMax:
                    self.manager.addUrls(newUrls)
                print(data['title'])
            except Exception:
                num = num - 1
                errorsNum = errorsNum + 1
                print('crawl failed %d' % errorsNum)
        self.output.outputData()
Example 5
class SpiderMan(object):

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print(self.manager.old_url_size())
                print(data)
            except Exception as e:
                print('crawl failed')

        self.output.output_question()
        self.output.output_answer()
Example 6
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlOneTag(self, book_tag):
        page_num = 0
        book_list = []
        while page_num <= 2:
            try:
                new_url = self.manager.get_new_url(page_num, book_tag)
                html = self.downloader.download(new_url)
                book_list += self.parser.parser(html)
            except Exception as e:
                print("crawl failed")
            page_num += 1
        return book_list

    def crawlAllTags(self, book_tag_lists, topath):
        book_lists = []
        for book_tag in book_tag_lists:
            book_list = self.crawlOneTag(book_tag)
            book_list = sorted(book_list, key=lambda x: x[1], reverse=True)
            book_lists.append(book_list)
        self.output.output(book_lists, book_tag_lists, topath)
Example 7
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManger()
        self.downloader = HtmlDownload()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        # print(self.manager.new_url_size())
        # print(self.manager.old_urls_size())
        while (self.manager.has_new_url()
               and self.manager.old_urls_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                # print(html)
                # print('new url:', new_url)
                new_urls, data = self.parser.parser(new_url, html)
                # print("number of new_urls:", len(new_urls))
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links' % self.manager.old_urls_size())
            except Exception as e:
                print(e, 'Crawl failed')
        self.output.output_html()
        print("Saved to baike.html")
Example 8
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the table cells and new links with the HTML parser
                new_th, new_td, new_urls = self.parser.parser(new_url, html, "th", "时间", "td")
                # Add the extracted URLs to the URL manager
                if new_urls != "meiyou":
                    self.manager.add_new_urls(new_urls)
                    print("Crawled %s links" % self.manager.old_url_size())
                # Store the data with the data output component
                if new_th != "meiyou" and new_td != "meiyou":
                    self.output.store_data(new_th, new_td)
                    self.output.output_html()
            except Exception as e:
                print("crawl failed!")
Example 9
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while there are new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                print('1-------->new_url', new_url)
                # Download the page
                html = self.downloader.download(new_url)
                print('2-------->html')
                # Parse the page and extract new links
                new_urls, data = self.parser.parser(new_url, html)
                print('3-------->new_urls, data', new_urls, data)
                # Add the extracted URLs to the manager
                self.manager.add_new_urls(new_urls)
                print('4-------->new_urls', new_urls)
                # Store the data
                self.output.store_data(data)
                print('Crawled %d links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed %s' % e)
        # Write the stored data out in the target format
        self.output.output_html()
Example 10
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract new links and page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager one by one;
                # self.manager.add_new_url(new_urls) raised "unhashable type: set"
                # because the whole iterable was passed as a single entry
                for new_url in new_urls:
                    self.manager.add_new_url(new_url)
                # Store the data
                self.output.store_data(data)
                print('Crawled %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed with' + str(e))
Example 11
class SpiderMan(object):
    """Spider scheduler."""
    def __init__(self):
        self.urlManager = UrlManager()
        self.htmlDownloader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.htmlOutput = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.urlManager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.urlManager.has_new_url()
               and self.urlManager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlManager.get_new_url()
                # Download the page with the HTML downloader
                html = self.htmlDownloader.download(new_url)
                # Extract new links and page data with the HTML parser
                new_urls, data = self.htmlParser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlManager.add_new_urls(new_urls)
                # Store the data
                self.htmlOutput.store_data(data)
            except Exception as e:
                print(traceback.format_exc())
        # Write the stored data out in the target format
        self.htmlOutput.output_html()
Example 12
class SpiderMan(object):

    def __init__(self):
        self.manger = UrlManger()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        print('crawl %s' % root_url)
        self.manger.add_new_url(root_url)

        #pdb.set_trace()
        while (self.manger.has_new_url() and self.manger.old_urls_size() < 100):
            try:
                new_url = self.manger.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manger.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Has crawled %s links' % self.manger.old_urls_size())
            except Exception as e:
                print("crawl failed %s" % e)
                break

        self.output.output_html()
Example 13
class Spider_Scheduler(object):
    def __init__(self):
        self.urlmanager = UrlQueue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL
        self.urlmanager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.urlmanager.has_new_url()
               and self.urlmanager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlmanager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract new links and page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlmanager.add_new_urls(new_urls)
                # Serialize the data with the output component
                self.output.data_to_list(data)
                print("Crawled %s links" % self.urlmanager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # Write the output in the target format
        self.output.output_html()
Example 14
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract new links and page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data
                self.output.store_data(data)
                print("Crawled %s links" % self.manager.old_url_size())
                #print(new_url)
            except Exception as e:
                print("crawl failed")
        # Write the stored data out in the target format
        self.output.output_html()
Example 15
class SpiderSchedule(object):
    '''
    Spider scheduler: initializes each module, receives the entry url through
    crawl(), and then drives the modules through the crawl workflow.
    '''
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry url
        self.manager.add_new_url(root_url)
        # Loop while there are new URLs and check how many have been crawled
        while self.manager.has_new_url() and self.manager.old_urls_size() < 10:
            try:
                # 1. Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # 2. Hand the URL to HtmlDownloader to download
                html = self.downloader.download(new_url)
                # 3. Hand the downloaded page to HtmlParser to parse
                urls, data = self.parser.parser(new_url, html)
                # 4. Store the parsed data and hand the newly extracted URLs to the URLManager
                self.output.store_data(data)
                for url in urls:
                    self.manager.add_new_url(url)
                print('Crawled {0} links:'.format(self.manager.old_urls_size()),
                      new_url)
            except Exception as e:
                print(e.args)
                print('crawl failed:', url)
        self.output.output_html()
Example 16
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry url
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 200 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 200):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract new links and page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print("Crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed", e)

        self.output.output_html()
Example 17
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001),authkey=('baike'.encode('utf-8')))
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')
    def crawl(self):
        while(True):
            try:
                if not self.task.empty():
                    url = self.task.get()
                    print(url)
                    if url == 'end':
                        print('Control node told the spider node to stop working...')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({"new_urls": new_urls, "data": data})
            except EOFError as e:
                print("Failed to connect to the worker node")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
Example 18
class SpiderWork():
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('Control node told the spider to stop working')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Spider is parsing %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
Example 19
class SpiderWork(object):
    def __init__(self):
        # Set up the worker node's connection in the distributed process
        class QueueManager(BaseManager):
            pass

        # Step 1: register the names used to fetch the Queues with BaseManager
        QueueManager.register('get_task_queue')
        QueueManager.register('get_result_queue')

        # Step 2: connect to the server
        server_addr = ('192.168.10.128', 8004)
        print('Connect to server {}...'.format(server_addr))

        # The port and authkey must match the server process exactly
        self.m = QueueManager(address=server_addr, authkey='janson'.encode())

        # Connect over the network
        self.m.connect()

        # Step 3: get the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        # Initialize the downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        '''
        Scheduler loop for the distributed spider node.
        :return:
        '''
        while True:
            try:
                # When the task queue is not empty
                if not self.task.empty():
                    # Fetch a URL
                    url = self.task.get()
                    # A url of 'end' means the control node asked us to shut down
                    if url == 'end':
                        print('Control node told the spider node to stop working...')
                        # Propagate the stop signal to the other nodes
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    else:
                        # Otherwise parse the page
                        print('Spider node is parsing: %s' % url)
                        # Download the page
                        content = self.downloader.download(url)
                        # Parse the page
                        new_urls, data = self.parser.parser(url, content)
                        # Put the result on the queue
                        self.result.put({"new_urls": new_urls, "data": data})
            except Exception as error:
                print('error-------->', error)
                print("Failed to connect to the worker node")
                return
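As a brief usage note for the worker variants above: each one is meant to run as its own process once the control node is listening. A hypothetical entry point (the address and authkey must match whatever the control node actually uses) is simply:

# Hypothetical entry point for a single worker node; it assumes a control
# node is already serving the task and result queues at the configured address.
if __name__ == '__main__':
    spider = SpiderWork()
    spider.crawl()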
Example 20
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('connect to server %s ...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

        self.lock = 0
        self.start = time.time()
        print('init finish')

    def work_thread(self, url):
        # self.lock is used as a simple in-flight request counter
        self.lock += 1
        print('Spider node is parsing: %s' % url)
        content = self.downloader.download(url)
        new_urls, data = self.parser.parser(url, content)
        self.result.put({'new_urls': new_urls, 'data': data})
        print('Spider node finished parsing: %s' % url)
        self.lock -= 1

    def gevent_print(self, msg):
        print(msg)

    def crawl(self):
        while True:
            try:
                if self.lock > 10:
                    # Too many in-flight downloads; back off briefly
                    time.sleep(1)
                elif not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        # Wait for in-flight threads to finish
                        while self.lock:
                            time.sleep(0.1)
                        print('Control node told the spider node to stop working...')
                        print("using time:", time.time() - self.start)
                        self.task.put('end')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    thread = threading.Thread(target=self.work_thread,
                                              args=(url, ))
                    thread.start()
                    #time.sleep(0.1)
            except EOFError as e:
                print("Failed to connect to the worker node")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
Example 21
class SpiderMan(object):
    def __init__(self):
        self.address = Get_Address()
        self.download = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self):
        info = self.address.get_address()
        html_lists = self.download.download(info)
        tickets = self.parser.parser(html_lists)
Example 22
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('connect to server %s ...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

        self.lock = 0
        print('init finish')

    def work_gevent(self, url):
        monkey.patch_all()
        # self.lock is used as a simple in-flight request counter
        self.lock += 1
        print('Spider node is parsing: %s' % url)
        content = self.downloader.download(url)
        new_urls, data = self.parser.parser(url, content)
        self.result.put({'new_urls': new_urls, 'data': data})
        print('Spider node finished parsing: %s' % url)
        self.lock -= 1

    def gevent_print(self, msg):
        print(msg)

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        # Wait for in-flight greenlets to finish
                        while self.lock:
                            time.sleep(0.1)
                        print('Control node told the spider node to stop working...')
                        self.task.put('end')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    g = gevent.spawn(self.work_gevent, url)
                    g1 = gevent.spawn(self.gevent_print, 1)
                    g2 = gevent.spawn(self.gevent_print, 2)
                    g.join()
                    time.sleep(0.1)
            except EOFError as e:
                print("Failed to connect to the worker node")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
Example 23
class CrawlerMain(object):
    def __init__(self):
        self.G_STATE_OK = 200
        self.crawMaxNum = -1
        self.crawCountNum = 0

        self.urlManager = UrlManager()
        self.dispatch = Dispatch()
        self.htmlParser = HtmlParser("http://baike.baidu.com")
        self.applicationShow = ApplicationShow()

    def __crawl(self, url):
        """
        Maintain a counter:
        if crawMaxNum > 0, counting is enabled; keep a count of how many
        pages have been crawled and stop once it exceeds crawMaxNum.

        Otherwise, do not count.
        """
        try:
            self.dispatch.launch_request(url)
            if self.dispatch.get_status() != self.G_STATE_OK:
                return

            context = self.dispatch.get_content()

            self.htmlParser.set_content(context)
            self.htmlParser.parser()

            summary = self.htmlParser.get_summary()
            title = self.htmlParser.get_title()

            urls = self.htmlParser.get_new_urls()

        except Exception as e:
            print("Error " + url + " " + str(e))
            return

        self.applicationShow.add(url, title, summary)
        self.urlManager.add_url(urls)
Example 24
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()):
            # try:
            new_url = self.manager.get_new_url()
            html = self.downloader.download(new_url)
            new_urls, data = self.parser.parser(new_url, html)
            self.manager.add_new_urls(new_urls)
Example 25
class Spider(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.paser = HtmlParser()

    def crawl(self, idi):
        rootloginurl = 'http://ygcp.njtech.edu.cn/User/LoginInSampleAction.aspx'
        pageurl = self.manager.url_login(idi)
        infourl = self.manager.url_userinfo(idi)
        htmlf, htmli = self.downloader.download(rootloginurl, idi, pageurl,
                                                infourl)
        xuehao, xingming, changpao, chenpao = self.paser.parser(
            infourl, pageurl, htmli, htmlf)
        print("Student ID: " + xuehao[0], "Name: " + xingming[0], changpao, chenpao)
Example 26
class SpiderManager:
    def __init__(self):
        self.parser = HtmlParser()
        self.save = SaveData()

    def process(self):
        self.save.connect()
        for page in range(10):
            url = "http://maoyan.com/board/4?offset=" + str(page * 10)
            res = requests.get(url)
            if res.status_code == 200:
                movies = self.parser.parser(res.text)
                for name, star, releasetime, nation in movies:
                    self.save.save(name, star, releasetime, nation)
        self.save.close()
Example 27
class SpiderWork(object):
    def __init__(self):
        # Set up the worker node's connection in the distributed process
        # Step 1: register the names used to fetch the Queues with BaseManager
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)

        # Keep the port and authkey exactly the same as the server process
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')

        # Connect over the network
        self.m.connect()

        # Step 3: get the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        # Initialize the downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print("Control node told the spider node to stop working...")
                        # Propagate the stop signal to the other nodes
                        self.result.put({'new_urls': "end", 'data': 'end'})
                        return
                    print('Spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:  # EOFError: raised when the manager connection is closed
                print("Failed to connect to the worker node")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
Example 28
class SpiderWork(object):
    def __init__(self):
        """
        initialization:
            1. initial distribution process then connect to control node
            2. register basemanager for get Queue
            3. sync with control node 
        """
        # 1. register
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # 2. connect to server
        server_addr = '192.168.65.176'
        print("connect to server %s..." % server_addr)
        # 3. match port & authkey with control node
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        # 4. get Queue object (url_q, result_q)
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # 5. initialize downloader, parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while (True):
            try:
                if not self.task.empty():
                    url = self.task.get()

                    if url == 'end':
                        print('control node informs spider node to stop working')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({"new_urls": new_urls, "data": data})
            except EOFError as e:
                print("work node connect failed")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
Example 29
class SpiderWorker2(object):
    def __init__(self):
        # Set up the worker node's connection in the distributed process
        # Step 1: register the names used to fetch the Queues with BaseManager
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print("[*]Connect to server %s..." % server_addr)
        # Keep the port and authkey exactly the same as the server process
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'test')
        # Connect over the network
        self.m.connect()
        # Step 3: get the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print("[*]Init finished.")

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()

                    if url == 'end':
                        print("[*]Control Node informs all the Spider Nodes to stop working.")
                        # Propagate the stop signal to the other nodes
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print("[*]The Spider Node is parsing: %s" % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:
                print("[-]Failed to connect to the Worker Node.")
                return
            except Exception as e:
                print(e)
                print("[-]Crawl failed.")
Example 30
class SpiderMain(object):
    def __init__(self):
        self.manager = urlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutPut()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('have got %s urls:' % self.manager.old_url_size())
            except Exception:
                print('crawl failed')
        self.output.output_html()
Example 31
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)

        self.m = BaseManager(address=(server_addr, 8001), authkey='qiye'.encode('utf-8'))
        print('connecting...')
        self.m.connect()
        print('connected')

        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('spider init finish')


    def crawl(self):
        while True:
            try:
                # print self.task
                if not self.task.empty():
                    url = self.task.get()

                    if url == 'end':
                        print('stop...')
                        # Tell the other nodes to stop
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('spider is working on %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:
                print('cannot connect to the other node')
                return
            except Exception as e:
                print(e)
                print('crawl fail')
Example 32
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("already got %s urls" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()