Example #1
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

Example #2
    def __init__(self):
        # Initialize the connection for this worker node in the distributed setup
        class QueueManager(BaseManager):
            pass

        # Step 1: register the names of the Queue-getter methods with BaseManager
        QueueManager.register('get_task_queue')
        QueueManager.register('get_result_queue')

        # Step 2: connect to the server:
        server_addr = ('192.168.10.128', 8004)
        print('Connect to server {}...'.format(server_addr))

        # The port and authkey must exactly match the settings of the server process:
        self.m = QueueManager(address=server_addr, authkey='janson'.encode())

        # Connect over the network:
        self.m.connect()

        # Step 3: obtain the Queue objects:
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        # Initialize the HTML downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')
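
The worker snippets on this page only make sense next to the control-node process they connect to. A minimal sketch of that server side, assuming the same address, port 8004 and authkey 'janson' as above (the queue objects and helper names are illustrative, not taken from the original project):

from multiprocessing.managers import BaseManager
from queue import Queue

task_queue = Queue()
result_queue = Queue()

def return_task_queue():
    return task_queue

def return_result_queue():
    return result_queue

class QueueManager(BaseManager):
    pass

# Expose the shared queues under the same names the workers look up.
QueueManager.register('get_task_queue', callable=return_task_queue)
QueueManager.register('get_result_queue', callable=return_result_queue)

manager = QueueManager(address=('', 8004), authkey='janson'.encode())
server = manager.get_server()
server.serve_forever()  # blocks; workers connect with QueueManager(...).connect()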
Example #3
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, my_root_url):
        count = 1
        self.urls.add_new_url(my_root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                # Download the page
                html_cont = self.downloader.download(new_url)
                # Parse the page
                self.parser.parse_test(new_url, html_cont)
                """
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                # Collect the data with the outputer
                self.outputer.collect_data(new_data)
                if count == 10:
                    break
                count += 1
                """

            except:
                print("craw failed")

        self.outputer.output_html()
Example #4
class SpiderMan(object):  
  def __init__(self):
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()

  def crawl(self,root_url):
    content = self.downloader.download(root_url)
    urls = self.parser.parser_url(root_url,content)
    for url in urls:
      try:
        time.sleep(0.1)
        t = time.strftime("%Y%m%d%H%M%S",time.localtime())
        rank_url ='http://service.library.mtime.com/Movie.api'\
        '?Ajax_CallBack=true'\
        '&Ajax_CallBackType=Mtime.Library.Services'\
        '&Ajax_CallBackMethod=GetMovieOverviewRating'\
        '&Ajax_CrossDomain=1'\
        '&Ajax_RequestUrl=%s'\
        '&t=%s'\
        '&Ajax_CallBackArgument0=%s'% (url[0],t,url[1])
        rank_content = self.downloader.download(rank_url)
        data = self.parser.parser_json(rank_url,rank_content)
        self.output.store_data(data)
      except Exception as e:
        print('Crawl failed')
    self.output.output_end()
    print "Crawl finish"
Example #5
class EySpider(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def urlsCrawl(self, root_url):
        # Mainly used to collect links
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs; the crawl can be capped
        # with a condition such as self.manager.old_url_size() < ***
        while (self.manager.has_new_url()):
            try:
                # Take an uncrawled link from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract the new URLs
                urls = self.parser.urlsparser(html)
                self.manager.add_new_urls_to_old(new_url)
            except:
                print("Failed to crawl links")

    def keywordsCrawl(self):
        while (self.manager.has_new_url()):
            try:
                # Take an uncrawled link from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract the keywords
                keywords = self.parser.Parser(html)
                self.manager.add_new_urls_to_old(new_url)
            except:
                print("Failed to crawl keywords")
Example #6
 def __init__(self, conf):
     self.url1 = "https://www.mk.co.kr/news/economy"
     self.conf = conf
     self.html = HtmlParser(conf)
     self.news = CrawlerNewspaper(conf)
     self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])
     self.util = Utils()
Example #7
    def get_params(self, url, content):
        urlparams = urlparse(url)
        html = HtmlParser(content, urlparams.scheme+'://'+urlparams.netloc)
        errors = []
        results = {}
        for i in range(len(self.search_params)):
            param = self.search_params[i]
            if 'eval' in param:
                eval_param = param['eval'].split('.')
                if len(eval_param)==2:
                    param[eval_param[0]][eval_param[1]] = eval(param['eval_string'])
            res = html.get(param['possible'], param['params'])
            results[param['name']] = res
            if res is None:
                errors.append(param['name'])
                if 'critical' in param and param['critical']:
                    return {
                        'parsed': False,
                        'errors': errors,
                    }

        results['media'] = {}
        results['media']['image'] = html.get_image()
        return {
            'parsed': True,
            'results': results,
            'errors': errors,
        }
Example #8
 def __init__(self, conf):
     self.url1 = "http://www.sisanews.kr/news/articleList.html?sc_section_code=S1N17&view_type=sm"
     self.url2 = "http://www.sisanews.kr/news/articleList.html?sc_section_code=S1N16&view_type=sm"
     self.burl = "http://www.sisanews.kr"
     self.html = HtmlParser(conf)
     self.news = CrawlerNewspaper(conf)
     self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])
Example #9
 def __init__(self, conf):
     self.url1 = "https://www.hankyung.com/finance/0104"
     self.url2 = "https://www.hankyung.com/finance/0103"
     self.url3 = "https://www.hankyung.com/finance/0102"
     self.conf = conf
     self.html = HtmlParser(conf)
     self.news = CrawlerNewspaper(conf)
     self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])
Example #10
    def __init__(self):
        self.G_STATE_OK = 200
        self.crawMaxNum = -1
        self.crawCountNum = 0

        self.urlManager = UrlManager()
        self.dispatch = Dispatch()
        self.htmlParser = HtmlParser("http://baike.baidu.com")
        self.applicationShow = ApplicationShow()
Example #11
 def __init__(self):
     self.manager = URLManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
     self.pageUrl = []
     for num in range(1, 29):
         self.pageUrl.append(
             f'https://cl.887x.xyz/thread0806.php?fid=20&search=&page={num}'
         )
Example #12
    def __init__(self, sort, sort_url, sortFilename):
        threading.Thread.__init__(self)
        self.sort = sort
        self.sort_url = sort_url
        self.sortFilename = sortFilename

        self.manager = UrlManager(self.sort)
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
Example #13
    def __init__(self, bind_domain):

        # Create the object that manages crawl URLs and records the URLs already crawled
        self.urlManager = UrlManager(enable_external_link=False,
                                     bind_domain=bind_domain)

        # Create the object that requests the links
        self.downloader = HtmlDownloader()

        # Create the object that converts the HTML source into an lxml.html object and extracts new links
        self.parser = HtmlParser()
Example #14
 def __init__(self):
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     server_addr = '127.0.0.1'
     print('Connect to server %s'%server_addr)
     self.m = BaseManager(address = (server_addr,8001),authkey = b'baike')
     self.m.connect()
     self.task = self.m.get_task_queue()
     self.result = self.m.get_result_queue()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish')
Example #15
class SpiderWork(object):
    def __init__(self):
        # Initialize the connection for this worker node in the distributed setup
        # Step 1: register the names of the Queue-getter methods with BaseManager
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        # The port and authkey must exactly match the settings of the server process:
        self.m = BaseManager(address=(server_addr, 8002),
                             authkey='lagou'.encode('utf-8'))
        # Connect over the network
        self.m.connect()
        # Step 3: obtain the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the HTML downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('The control node told the spider nodes to stop...')
                        # Then notify the other nodes to stop as well
                        self.result.put('end')
                        return
                    # print('Got task %d successfully' % (316 - self.task.qsize()))
                    print('This spider node is parsing: %s' % url)
                    # Download the first page to get the total page count
                    html = self.downloader.download_job(url, 1)
                    tal_page = self.parser.get_page(html)
                    print("共%d页职位信息" % tal_page)
                    for page in range(1, tal_page + 1):
                        print("正在爬取第%d页" % page + "共%d页" % tal_page)
                        html = self.downloader.download_job(url, page)
                        data = self.parser.get_job(html)
                        self.result.put(data)

            except EOFError as e:
                print("连接工作节点失败")
                return
            except Exception as e:
                print(e)
                print('crawl fail')
Example #16
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_adrr = '127.0.0.1'
        print('connect to %s...' % server_adrr)

        self.m = BaseManager(address=(server_adrr, 8001), authkey=b'qiye')

        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.htmlparser = HtmlParser()
        self.dataoutput = DataOutput()
Example #17
 def __init__(self):
     # The spider scheduler first connects to the control node, then fetches URLs from the url_q
     # queue, downloads and parses the pages, and submits the extracted data to the result_q
     # queue, which returns it to the control node
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     # Connect to the server
     server_addr = '127.0.0.1'
     print('connect to server %s....' % server_addr)
     self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
     self.m.connect()
     # Obtain the Queue objects
     self.task = self.m.get_task_queue()
     self.result = self.m.get_result_queue()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
Example #18
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def start(self, url, numMax=50):
        self.manager.addUrl(url)
        num = 0
        errorsNum = 0
        while self.manager.sizeofNew() != 0 and num < numMax:
            try:
                num = num + 1
                url = self.manager.getUrl()
                print('%d\n %s' % (num, url))
                html = self.downloader.download(url)
                newUrls, data = self.parser.parser(url, html)
                self.output.addData(data)
                if self.manager.sizeofNew() + self.manager.sizeofOld(
                ) < numMax:
                    self.manager.addUrls(newUrls)
                print(data['title'])
            except:
                num = num - 1
                errorsNum = errorsNum + 1
                print('crawl failed %d' % errorsNum)
        self.output.outputData()
Example #19
class SpiderSchedule(object):
    '''
    Spider scheduler: initializes each module, then receives the entry URL through crawl();
    inside that method the crawling workflow coordinates the work of each module.
    '''
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check for new URLs and cap the number of URLs crawled
        while self.manager.has_new_url() and self.manager.old_urls_size() < 10:
            try:
                # 1. Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # 2. Hand the URL to HtmlDownloader to download
                html = self.downloader.download(new_url)
                # 3. Hand the downloaded page to HtmlParser to parse
                urls, data = self.parser.parser(new_url, html)
                # 4. Store the parsed data and hand the newly extracted URLs to URLManager
                self.output.store_data(data)
                for url in urls:
                    self.manager.add_new_url(url)
                print('{0} links crawled so far:'.format(self.manager.old_urls_size()),
                      new_url)
            except Exception as e:
                print(e.args)
                print('crawl failed:', new_url)
        self.output.output_html()
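
Most schedulers on this page rely on the same small UrlManager interface (add_new_url/add_new_urls, has_new_url, get_new_url, old_urls_size; some examples spell the last one old_url_size). A minimal sketch of such a manager backed by two sets, assuming that interface and a simple deduplication rule:

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        # Skip URLs that are already queued or already crawled.
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set.
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_urls_size(self):
        return len(self.old_urls)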
Example #20
class SpiderMan(object):

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many have been fetched
        while(self.manager.has_new_url() and self.manager.old_url_size()<100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print(self.manager.old_url_size())
                print(data)
            except Exception as e:
                print('crawl failed')

        self.output.output_question()
        self.output.output_answer()
Example #21
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)

        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 50):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                if html is None:
                    print('failed to get the page')
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has scraped %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
        self.output.output_html()
Example #22
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001),authkey=('baike'.encode('utf-8')))
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')
    def crawl(self):
        while(True):
            try:
                if not self.task.empty():
                    url = self.task.get()
                    print(url)
                    if url =='end':
                        print('The control node told the spider nodes to stop...')
                        self.result.put({'new_urls':'end','data':'end'})
                        return
                    print('The spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls,data = self.parser.parser(url,content)
                    self.result.put({"new_urls":new_urls,"data":data})
            except EOFError as e:
                print("连接工作节点失败")
                return
            except Exception as e:
                print(e)
                print('Crawl failed')
Example #23
class SpiderWorker():
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_adrr = '127.0.0.1'
        print('connect to %s...' % server_adrr)

        self.m = BaseManager(address=(server_adrr, 8001), authkey=b'qiye')

        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.htmlparser = HtmlParser()
        self.dataoutput = DataOutput()

    def crawl(self):

        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        return None
                    print('Parsing %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.htmlparser.parser(url, content)
                    self.result.put({'new_urls': new_urls})
                    self.dataoutput.output_mongo({'data': data})
            except Exception as e:
                print(e)
Example #24
class SpiderMan(object):
    """爬虫调度器"""
    def __init__(self):
        self.urlManager = UrlManager()
        self.htmlDownloader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.htmlOutput = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.urlManager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.urlManager.has_new_url()
               and self.urlManager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlManager.get_new_url()
                # Download the page with the HTML downloader
                html = self.htmlDownloader.download(new_url)
                # Extract the page data with the HTML parser
                new_urls, data = self.htmlParser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlManager.add_new_urls(new_urls)
                # Store the data with the data store
                self.htmlOutput.store_data(data)
            except Exception as e:
                print(traceback.format_exc())
        # The data store writes the output file in the specified format
        self.htmlOutput.output_html()
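
The DataOutput used by these schedulers is only ever called through store_data and output_html. A minimal sketch of such a store, assuming data is a dict and that the url/title/summary field names and the output path are illustrative assumptions:

import codecs

class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        # Buffer one parsed record.
        if data is not None:
            self.datas.append(data)

    def output_html(self, path='output.html'):
        # Write the buffered records out as a simple HTML table.
        with codecs.open(path, 'w', encoding='utf-8') as fout:
            fout.write('<html><body><table>\n')
            for data in self.datas:
                fout.write('<tr>')
                fout.write('<td>%s</td>' % data.get('url', ''))
                fout.write('<td>%s</td>' % data.get('title', ''))
                fout.write('<td>%s</td>' % data.get('summary', ''))
                fout.write('</tr>\n')
            fout.write('</table></body></html>\n')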
Example #25
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # print(html)
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data store
                self.output.store_data(data)
                print("%s links crawled so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
Example #26
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlOneTag(self, book_tag):
        page_num = 0
        book_list = []
        while page_num <= 2:
            try:
                new_url = self.manager.get_new_url(page_num, book_tag)
                html = self.downloader.download(new_url)
                book_list += self.parser.parser(html)
            except Exception as e:
                print("crawl failed")
            page_num += 1
        return book_list

    def crawlAllTags(self, book_tag_lists, topath):
        book_lists = []
        for book_tag in book_tag_lists:
            book_list = self.crawlOneTag(book_tag)
            book_list = sorted(book_list, key=lambda x: x[1], reverse=True)
            book_lists.append(book_list)
        self.output.output(book_lists, book_tag_lists, topath)
Example #27
class SpiderWork():
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s'%server_addr)
        self.m = BaseManager(address = (server_addr,8001),authkey = b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('The control node told the spider to stop')
                        self.result.put({'new_urls':'end','data':'end'})
                        return
                    print('The spider is parsing %s' % url)
                    content = self.downloader.download(url)
                    new_urls,data = self.parser.parser(url,content)
                    self.result.put({'new_urls':new_urls,'data':data})
            except EOFError:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                print(e)
                print('Crawl failed')
Example #28
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data store
                self.output.store_data(data)
                print("%s links crawled so far" % self.manager.old_url_size())
                # print(new_url)
            except Exception as e:
                print("crawl failed")
        # The data store writes the output file in the specified format
        self.output.output_html()
Example #29
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.dataoutput = DataOutput()
     self.mongoengine = Use_MongoEngine()
     self.urloutput = Url_info_Output()
Example #30
class SpiderMan(object):
    def __init__(self):
        self.downloader=HtmlDownloader()
        self.parser=HtmlParser()
        self.output=HtmlOutput()

    def crawl(self,root_url):
        album_response = self.downloader.download(root_url)
        self.output.output_head()
        for album in self.parser.get_kw_album(album_response):
            self.output.output_album(album)
            track_url = 'http://mobile.ximalaya.com/mobile/v1/album/ts-1552364593682?ac=WIFI&albumId=%d&device=android&isAsc=true&isQueryInvitationBrand=true&pageId=1&pageSize=20&pre_page=0&source=0&supportWebp=true' %album['albumId']
            track_response = self.downloader.download(track_url)
            track_info = self.parser.get_kw_track(track_response)
            self.output.output_track(track_info)
        self.output.output_end()
Example #31
class Spider_Scheduler(object):
    def __init__(self):
        self.urlmanager = UrlQueue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the entry URL here
        self.urlmanager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.urlmanager.has_new_url()
               and self.urlmanager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlmanager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlmanager.add_new_urls(new_urls)
                # The store serializes the data
                self.output.data_to_list(data)
                print("%s links crawled so far" % self.urlmanager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # The store writes the output in the specified format
        self.output.output_html()
Example #32
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_addr = '127.0.0.1'
        print ('Connect to server %s...' % server_addr)

        self.m=BaseManager(address=(server_addr,8001),authkey='qiye'.encode('utf-8'))
        print('connecting...')
        self.m.connect()
        print('connected')

        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('spider init finish')
Example #33
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_addr = '127.0.0.1'
        print ('Connect to server %s...' % server_addr)

        self.m=BaseManager(address=(server_addr,8001),authkey='qiye'.encode('utf-8'))
        print('connecting...')
        self.m.connect()
        print('connected')

        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('spider init finish')


    def crawl(self):
        while True:
            try:
                # print self.task
                if not self.task.empty():
                    url = self.task.get()

                    if url == 'end':
                        print ('stop...')
                        # Notify the other nodes to stop
                        self.result.put({'new_urls':'end','data':'end'})
                        return
                    print ('spider is working on %s'%url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls':new_urls,'data':data})
            except EOFError as e:
                print('cannot connect to the other nodes')
                return
            except Exception as e:
                print(e)
                print('crawl failed')
Example #34
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while(self.manager.has_new_url() and self.manager.old_url_size()<100):
            try:
                new_url = self.manager.get_new_url()
                html=self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url,html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print "already get %s url" % self.manager.old_url_size()
            except Exception,e:
                print "crawl failed"
        self.output.output_html()
Example #35
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
Example #36
class collector():
    '''
    Extract the relevant tag content from the HTML.
    '''
    def __init__(self,html):
        self.html=html
        self.d=pq(html)
        self.d('script').remove()
        self.d('style').remove()
        self.html_parser=HtmlParser(self.html)
       
        
    def clear_other_node(self):
        '''
        Remove tags that are not needed.
        '''
        self.d('head').remove()
        self.d('h1').remove()
        self.d('h2').remove()
        self.d('h3').remove()
        self.d('b').remove()
        self.d('a').remove()
        
    def get_title(self):
        '''
        Extract the title.
        '''
        return self.d('title').text()
    
    def get_node(self,node):
        '''
        Extract the text of string-type nodes.
        '''
        nodes=self.html_parser.get_node(node)
        text=''
        for i in nodes:
            text+=i
        return text
    
    def get_urls(self):
        '''
        Return the URLs; intended to pair with get_as.
        '''
    

    def xml(self,docID):
        'Return the XML source.'
        # Use docID to locate tem_home_url in sortedurls
        self.transurl.setTemHomeUrl(docID)  # determine tem_home_url
        str='<html></html>'
        titleText=self.d('title').text()
        self.dd=dom.parseString(str)
        #print self.dd
        html=self.dd.firstChild
        # Build the title element
        htmlCtrl=htmlctrl(self.d.html())
        title=self.dd.createElement('title')
        html.appendChild(title)
        title.setAttribute('text',titleText)
        # Build the b element
        bb=htmlCtrl.gNode('b')
        b=self.dd.createElement('b')
        for i in bb:
            ii=self.dd.createElement('item')
            ii.setAttribute('text',i)
            b.appendChild(ii)
        html.appendChild(b)
        # Build the h1 element
        bb=htmlCtrl.gNode('h1')
        b=self.dd.createElement('h1')
        for i in bb:
            ii=self.dd.createElement('item')
            ii.setAttribute('text',i)
            b.appendChild(ii)
        html.appendChild(b)
        # Build the h2 element
        bb=htmlCtrl.gNode('h2')
        b=self.dd.createElement('h2')
        for i in bb:
            ii=self.dd.createElement('item')
            ii.setAttribute('text',i)
            b.appendChild(ii)
        html.appendChild(b)
        # Build the h3 element
        bb=htmlCtrl.gNode('h3')
        b=self.dd.createElement('h3')
        for i in bb:
            ii=self.dd.createElement('item')
            ii.setAttribute('text',i)
            b.appendChild(ii)
        html.appendChild(b)
        # Build the a element
        aa=htmlCtrl.gA()
        a=self.dd.createElement('a')
        for i in aa:
            # i = self.transurl.trans_d(i)  # convert the URL into a standard absolute address
            aindex=self.dd.createElement('item')
            aindex.setAttribute('name',i)
            #aindex.setAttribute('href',self.a_trav(aa[i]))
            aindex.setAttribute('href',self.transurl.trans_d(aa[i]))
            a.appendChild(aindex)
        html.appendChild(a)
        # Add the content
        htmltext=self.d.html().decode('gbk','ignore').encode('utf-8')
        ht=pq(htmltext)
        # Bug note:
        # be careful here: the HTML may contain special character entities such as &# etc.;
        # this is handled separately during tokenization
        content=ht.text()
        cc=self.dd.createElement('content')
        ctext=self.dd.createTextNode(content)
        cc.appendChild(ctext)
        html.appendChild(cc)
        #print self.dd.toprettyxml()
        return self.dd
Example #37
 def __init__(self,html):
     self.html=html
     self.d=pq(html)
     self.d('script').remove()
     self.d('style').remove()
     self.html_parser=HtmlParser(self.html)