Example #1
class SpiderWork(object):
    def __init__(self):
        self.dataout = DataOut()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)

        for url in urls:
            t = time.strftime('%Y%m%d%H%M', time.localtime())
            try:
                movie_id = ''
                mobj = re.match(r'.*?/(\d+)/.*?', url)
                if mobj:
                    movie_id = mobj.group(1)
                # print(movie_id)
                # Build the Ajax URL; keep it as one continuous string so no whitespace leaks into it
                ajax_url = ('http://service.library.mtime.com/Movie.api'
                            '?Ajax_CallBack=true'
                            '&Ajax_CallBackType=Mtime.Library.Services'
                            '&Ajax_CallBackMethod=GetMovieOverviewRating'
                            '&Ajax_CrossDomain=1'
                            '&Ajax_RequestUrl={0}&t={1}&Ajax_CallBackArgument0={2}').format(
                    url, t, movie_id)
                ajax_content = self.downloader.download(ajax_url)  # fetch the Ajax response
                data = self.parser.parse_ajax(ajax_url, ajax_content)  # parse out the data
                self.dataout.store_data(data)
                print('crawling: ', ajax_url)
            except Exception as e:
                print('crawl failed: ', url, e)
        self.dataout.output_end()
        print('crawl finish!')
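Every example in this listing drives an HtmlDownloader whose download(url) method returns the page body as text, but the class itself never appears here. For reference, a minimal sketch of such a downloader follows; it assumes the third-party requests library, and the User-Agent string is a placeholder rather than anything taken from the original projects.

# Minimal sketch of the HtmlDownloader the examples assume (not the original implementation).
# Assumes the `requests` library; the User-Agent value is a placeholder.
import requests


class HtmlDownloader(object):
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def download(self, url):
        # Return the response body as text, or None if the request fails
        if url is None:
            return None
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                return response.text
        except requests.RequestException:
            pass
        return None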
Example #2
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParse()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.downloader(root_url)
        urls = self.parser.parser_url(root_url, content)
        # Build the URL used to fetch the box-office data
        for url in urls:
            print('---------->URL', url, url[0], url[1])
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.downloader(rank_url)
                print('ajax response content, Chinese characters display correctly -------->', rank_content)
                print('ajax response content type -------->', type(rank_content))
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('failed to fetch ajax dynamic data', e)
        self.output.output_end()
        print('=======end=========')
Example #3
class SpiderMan(object):  
  def __init__(self):
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()

  def crawl(self,root_url):
    content = self.downloader.download(root_url)
    urls = self.parser.parser_url(root_url,content)
    for url in urls:
      try:
        time.sleep(0.1)
        t = time.strftime("%Y%m%d%H%M%S",time.localtime())
        rank_url ='http://service.library.mtime.com/Movie.api'\
        '?Ajax_CallBack=true'\
        '&Ajax_CallBackType=Mtime.Library.Services'\
        '&Ajax_CallBackMethod=GetMovieOverviewRating'\
        '&Ajax_CrossDomain=1'\
        '&Ajax_RequestUrl=%s'\
        '&t=%s'\
        '&Ajax_CallBackArgument0=%s'% (url[0],t,url[1])
        rank_content = self.downloader.download(rank_url)
        data = self.parser.parser_json(rank_url,rank_content)
        self.output.store_data(data)
      except Exception as e:
        print('Crawl failed')
    self.output.output_end()
    print("Crawl finish")
Example #4
class EySpider(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def urlsCrawl(self, root_url):
        # Mainly used to collect links
        self.manager.add_new_url(root_url)
        # Check whether the URL manager still has new URLs; a cap on how many URLs to crawl can also be added here
        # self.manager.old_url_size()<***
        while (self.manager.has_new_url()):
            try:
                # Take an uncrawled link from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract the new urls
                urls = self.parser.urlsparser(html)
                self.manager.add_new_urls_to_old(new_url)
            except Exception:
                print("failed to crawl link")

    def keywordsCrawl(self):
        while (self.manager.has_new_url()):
            try:
                # Take an uncrawled link from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract the keywords
                keywords = self.parser.Parser(html)
                self.manager.add_new_urls_to_old(new_url)
            except Exception:
                print("failed to crawl keywords")
Example #5
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

        pass
    def __init__(self):
        # Initialize the connection for the worker node in the distributed process
        class QueueManager(BaseManager):
            pass

        # Step 1: register the method names for getting the Queues with BaseManager
        QueueManager.register('get_task_queue')
        QueueManager.register('get_result_queue')

        # Step 2: connect to the server:
        server_addr = ('192.168.10.128', 8004)
        print('Connect to server {}...'.format(server_addr))

        # Keep the port and authkey exactly the same as in the server process:
        self.m = QueueManager(address=server_addr, authkey='janson'.encode())

        # Connect over the network:
        self.m.connect()

        # Step 3: get the Queue objects:
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        # Initialize the HTML downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')
Example #7
 def __init__(self):
     self.manager = URLManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
     self.pageUrl = []
     for num in range(1, 29):
         self.pageUrl.append(
             f'https://cl.887x.xyz/thread0806.php?fid=20&search=&page={num}'
         )
Example #8
    def __init__(self, sort, sort_url, sortFilename):
        threading.Thread.__init__(self)
        self.sort = sort
        self.sort_url = sort_url
        self.sortFilename = sortFilename

        self.manager = UrlManager(self.sort)
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
Example #9
    def __init__(self, bind_domain):

        # Create the object that manages crawl URLs and records which URLs have already been crawled
        self.urlManager = UrlManager(enable_external_link=False,
                                     bind_domain=bind_domain)

        # Create the object that performs the HTTP requests
        self.downloader = HtmlDownloader()

        # Create the object that turns the HTML source into an lxml.html object and extracts new links
        self.parser = HtmlParser()
Example #10
def main():
    idi = 1405150114
    urlmanager = UrlManager()
    pageurl = urlmanager.url_login(idi)
    infourl = urlmanager.url_userinfo(idi)
    htmldownloader = HtmlDownloader()
    htmlf, htmli = htmldownloader.download(
        'http://ygcp.njtech.edu.cn/User/LoginInSampleAction.aspx', idi,
        pageurl, infourl)
    parse = HtmlParser()
    parse.parser(infourl, pageurl, htmli, htmlf)
Example #11
 def __init__(self):
     BaseManager.register("get_task_queue")
     BaseManager.register("get_result_queue")
     server_addr = "127.0.0.1"
     logging.info('Connect to server %s ...' % server_addr)
     self.m = BaseManager(address=(server_addr, 8001), authkey="baike".encode())
     self.m.connect()
     self.task = self.m.get_task_queue()
     self.result = self.m.get_result_queue()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParparser()
     logging.info("init finish")
Example #12
 def __init__(self):
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     server_addr = '127.0.0.1'
     print('Connect to server %s'%server_addr)
     self.m = BaseManager(address = (server_addr,8001),authkey = b'baike')
     self.m.connect()
     self.task = self.m.get_task_queue()
     self.result = self.m.get_result_queue()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish')
Example #13
File: SpliderWork.py Project: MumumuZi/-
class SpiderWork(object):
    def __init__(self):
        # Initialize the connection for this worker node in the distributed setup
        # Step 1: register the method names for getting the Queues with BaseManager
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        # The port and authkey must match the server process settings exactly:
        self.m = BaseManager(address=(server_addr, 8002),
                             authkey='lagou'.encode('utf-8'))
        # Connect over the network
        self.m.connect()
        # Step 3: get the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the HTML downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('Control node told the spider node to stop working...')
                        # Then notify the other nodes to stop as well
                        self.result.put('end')
                        return
                    #print('Successfully fetched task %d' % (316 - self.task.qsize()))
                    print('This spider node is parsing: %s' % url)
                    # Download the first page to get the total page count
                    html = self.downloader.download_job(url, 1)
                    tal_page = self.parser.get_page(html)
                    print("%d pages of job listings in total" % tal_page)
                    for page in range(1, tal_page + 1):
                        print("Crawling page %d" % page + " of %d pages" % tal_page)
                        html = self.downloader.download_job(url, page)
                        data = self.parser.get_job(html)
                        self.result.put(data)

            except EOFError as e:
                print("连接工作节点失败")
                return
            except Exception as e:
                print(e)
                print('crawl fail')
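Worker nodes like the one in Example #13 only make sense next to a control node that serves the get_task_queue and get_result_queue they register. That server side is not part of this listing, so the sketch below is an assumption reconstructed from the address, port 8002, and authkey 'lagou' used above; the seed URL is a placeholder.

# Minimal sketch of the control node a worker like Example #13 connects to
# (an assumption based on the queue names, port and authkey above, not the original project code).
import queue
from multiprocessing.managers import BaseManager

task_q = queue.Queue()    # URLs waiting to be crawled
result_q = queue.Queue()  # data pushed back by the worker nodes


class QueueManager(BaseManager):
    pass


# Expose the two queues under the names the workers look up
QueueManager.register('get_task_queue', callable=lambda: task_q)
QueueManager.register('get_result_queue', callable=lambda: result_q)

if __name__ == '__main__':
    # Seed one illustrative task plus the 'end' sentinel the workers check for
    task_q.put('https://www.lagou.com/')  # placeholder seed URL
    task_q.put('end')
    # Address, port and authkey must match the worker side exactly
    manager = QueueManager(address=('127.0.0.1', 8002), authkey='lagou'.encode('utf-8'))
    server = manager.get_server()
    server.serve_forever()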
Example #14
class HtmlParser(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.page_num = self._get_page_num()

    def _get_page_num(self):
        data = self.downloader.get_page(1)
        if data and 'value' in data.keys():
            count = re.search(r',"wareCount":(\d*?),"',
                              data['value']).groups()[0]
            if count:
                count = int(count)
                # 10 items per page, so round the page count up
                if count % 10 == 0:
                    return count // 10
                else:
                    return count // 10 + 1

    def get_page_urls(self, page):
        urls = []
        url = ''
        data = self.downloader.get_page(page)
        pattern = re.compile(r',\"eBookFlag\":(.*?),\".*?,"wareId":"(\d*?)"')
        result = re.findall(pattern, data['value'])
        if result:
            for item in result:
                if item[0] == 'true':
                    url = 'https://e.m.jd.com/ebook/' + item[1] + '.html'
                if item[0] == 'false':
                    url = 'https://item.m.jd.com/product/' + item[1] + '.html'
                urls.append(url)
        return urls

    def get_data(self, url):
        html = self.downloader.get_page_data(url)
        soup = BeautifulSoup(html, 'html.parser')
        title = ''
        price = '0'
        if url[0:12] == 'https://e.m.':
            title = soup.find('p', class_='db-title').text
            price = soup.find('span', class_='db-price-num').text
        if url[0:12] == 'https://item':
            title = soup.find('span', class_='title-text').text
            price = float(soup.find('span', class_='big-price').text) + float(
                soup.find('span', class_='small-price').text)
        if price is None:
            price = '0'
        data = {'title': title, 'price': price, 'url': url}
        return data
Example #15
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_addr = '127.0.0.1'
        print('connect to %s...' % server_addr)

        self.m = BaseManager(address=(server_addr, 8001), authkey=b'qiye')

        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.htmlparser = HtmlParser()
        self.dataoutput = DataOutput()
Example #16
 def __init__(self):
     # The spider scheduler first connects to the control node, then takes URLs from the url_q queue,
     # downloads and parses the pages, and pushes the extracted data to the result_q queue
     # to return it to the control node
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     # Connect to the server
     server_addr = '127.0.0.1'
     print('connect to server %s....' % server_addr)
     self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
     self.m.connect()
     # Get the Queue objects
     self.task = self.m.get_task_queue()
     self.result = self.m.get_result_queue()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
Example #17
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlOneTag(self, book_tag):
        page_num = 0
        book_list = []
        while page_num <= 2:
            try:
                new_url = self.manager.get_new_url(page_num, book_tag)
                html = self.downloader.download(new_url)
                book_list += self.parser.parser(html)
            except Exception as e:
                print("crawl failed")
            page_num += 1
        return book_list

    def crawlAllTags(self, book_tag_lists, topath):
        book_lists = []
        for book_tag in book_tag_lists:
            book_list = self.crawlOneTag(book_tag)
            book_list = sorted(book_list, key=lambda x: x[1], reverse=True)
            book_lists.append(book_list)
        self.output.output(book_lists, book_tag_lists, topath)
Example #18
class Spiderman(object):
    def __init__(self):
        self.manager = Urlmanager()
        self.downloader = HtmlDownloader()
        self.parser = Htmlparser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # The HTML downloader downloads the page
                html = self.downloader.download(new_url)
                # The HTML parser extracts the data
                new_urls, data = self.parser.parse(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # The data store saves the data
                self.output.store_data(data)
                print('Already crawled %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
                raise
            # The data store writes the file in the specified format
        self.output.output_html()
Example #19
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # The HTML downloader downloads the page
                html = self.downloader.download(new_url)
                # The HTML parser extracts the page data
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # The data store saves the data
                self.output.store_data(data)
                print("Already crawled %s links" % self.manager.old_url_size())
                #print(new_url)
            except Exception as e:
                print("crawl failed")
            # The data store writes the file in the specified format
        self.output.output_html()
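The single-machine schedulers (Examples #18, #19, #21, and others) all rely on a UrlManager with the same small interface: add_new_url/add_new_urls, has_new_url, get_new_url, and old_url_size. That class is not included in this listing, so the sketch below is only a plausible set-based implementation of the interface they assume, not the original code.

# Minimal sketch of the UrlManager interface the schedulers above assume
# (a plausible set-based implementation, not the original class).
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs not crawled yet
        self.old_urls = set()  # URLs already crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def add_new_url(self, url):
        if url is not None and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls:
            for url in urls:
                self.add_new_url(url)

    def get_new_url(self):
        # Move one URL from the "new" set to the "old" set and return it
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)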
Example #20
class SpiderWork():
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s'%server_addr)
        self.m = BaseManager(address = (server_addr,8001),authkey = b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('Control node told the spider to stop working')
                        self.result.put({'new_urls':'end','data':'end'})
                        return
                    print('The spider is parsing %s' % url)
                    content = self.downloader.download(url)
                    new_urls,data = self.parser.parser(url,content)
                    self.result.put({'new_urls':new_urls,'data':data})
            except EOFError:
                print('failed to connect to the work node')
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
Example #21
class Spider_Scheduler(object):
    def __init__(self):
        self.urlmanager = UrlQueue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the entry URL
        self.urlmanager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.urlmanager.has_new_url()
               and self.urlmanager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlmanager.get_new_url()
                # The HTML downloader downloads the page
                html = self.downloader.download(new_url)
                # The HTML parser extracts the page data
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlmanager.add_new_urls(new_urls)
                # The data store serializes the data
                self.output.data_to_list(data)
                print("Already crawled %s links" % self.urlmanager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # The data store writes the output in the specified format
        self.output.output_html()
Example #22
class SpiderSchedule(object):
    '''
    Spider scheduler: initializes each module, then receives the entry url via crawl.
    Inside that method, the crawl flow drives the work of each module.
    '''
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry url
        self.manager.add_new_url(root_url)
        # Check whether there are new urls and how many urls have been crawled
        while self.manager.has_new_url() and self.manager.old_urls_size() < 10:
            try:
                # 1. Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # 2. Hand the URL to HtmlDownloader to download
                html = self.downloader.download(new_url)
                # 3. Hand the downloaded page to HtmlParser to parse
                urls, data = self.parser.parser(new_url, html)
                # 4. Store the parsed data and hand the newly extracted URLs back to URLManager
                self.output.store_data(data)
                for url in urls:
                    self.manager.add_new_url(url)
                print('Already crawled {0} links:'.format(self.manager.old_urls_size()),
                      new_url)
            except Exception as e:
                print(e.args)
                print('crawl failed:', url)
        self.output.output_html()
Example #23
class SpiderMan(object):
    def __init__(self):
        self.downloader=HtmlDownloader()
        self.parser=HtmlParser()
        self.output=HtmlOutput()

    def crawl(self,root_url):
        album_response = self.downloader.download(root_url)
        self.output.output_head()
        for album in self.parser.get_kw_album(album_response):
            self.output.output_album(album)
            track_url = 'http://mobile.ximalaya.com/mobile/v1/album/ts-1552364593682?ac=WIFI&albumId=%d&device=android&isAsc=true&isQueryInvitationBrand=true&pageId=1&pageSize=20&pre_page=0&source=0&supportWebp=true' %album['albumId']
            track_response = self.downloader.download(track_url)
            track_info = self.parser.get_kw_track(track_response)
            self.output.output_track(track_info)
        self.output.output_end()
Example #24
class HtmlParser(object):
    def __init__(self):
        self.downloader = HtmlDownloader()

    def get_all_patents_info(self, url_cont):
        html = self.downloader.download(url_cont)
        text = pq(html).text()
        soup = BeautifulSoup(html, 'html5lib')
        Patent_name = soup.find('font', size="+1").text.strip().lower()
        Patent_num = int(
            soup.select(
                'body > table:nth-of-type(1) > tbody > tr:nth-of-type(1) > td:nth-of-type(2) > b'
            )[0].text.strip())
        Description = soup.find_all('i', text="Description")[0].text
        Patent_description = re.findall(
            r"(?<=%s)[\w\W]*?(?=\* \* \* \* \*)" % Description, text)[0]
        # Patent_abstract = soup.select('body > p:nth-of-type(2)')[0].text.strip()
        # Patent_claim = re.findall(r"(?<=Claims)[\w\W]*?(?=Description)", text)[0].replace("\n", "").replace('  ', '')
        pat_dict = {
            'Patent_name': Patent_name,
            'Patent_num': Patent_num,
            # 'Patent_text':text,
            # 'Patent_claim':Patent_claim,
            'Patent_description': Patent_description,
            'Patent_url': url_cont,
        }
        print(pat_dict)
        return pat_dict
Example #25
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # The HTML downloader downloads the page and the parser extracts the data
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data to a file
                self.output.store_data(data)
                print("Already crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # The data store writes the file in the specified format
        self.output.output_html()
Example #26
class SpiderMan(object):
    """爬虫调度器"""
    def __init__(self):
        self.urlManager = UrlManager()
        self.htmlDownloader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.htmlOutput = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.urlManager.add_new_url(root_url)
        # Check whether the URL manager has new URLs and how many URLs have been crawled
        while (self.urlManager.has_new_url()
               and self.urlManager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlManager.get_new_url()
                # The HTML downloader downloads the page
                html = self.htmlDownloader.download(new_url)
                # The HTML parser extracts the page data
                new_urls, data = self.htmlParser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlManager.add_new_urls(new_urls)
                # The data store saves the data
                self.htmlOutput.store_data(data)
            except Exception as e:
                print(traceback.format_exc())
        # The data store writes the file in the specified format
        self.htmlOutput.output_html()
Example #27
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001),authkey=('baike'.encode('utf-8')))
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')
    def crawl(self):
        while(True):
            try:
                if not self.task.empty():
                    url = self.task.get()
                    print(url)
                    if url =='end':
                        print('Control node told the spider node to stop working...')
                        self.result.put({'new_urls':'end','data':'end'})
                        return
                    print('Spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls,data = self.parser.parser(url,content)
                    self.result.put({"new_urls":new_urls,"data":data})
            except EOFError as e:
                print("failed to connect to the work node")
                return
            except Exception as e:
                print(e)
                print('Crawl  fail')
Example #28
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def start(self, url, numMax=50):
        self.manager.addUrl(url)
        num = 0
        errorsNum = 0
        while self.manager.sizeofNew() != 0 and num < numMax:
            try:
                num = num + 1
                url = self.manager.getUrl()
                print('%d\n %s' % (num, url))
                html = self.downloader.download(url)
                newUrls, data = self.parser.parser(url, html)
                self.output.addData(data)
                if self.manager.sizeofNew() + self.manager.sizeofOld(
                ) < numMax:
                    self.manager.addUrls(newUrls)
                print(data['title'])
            except:
                num = num - 1
                errorsNum = errorsNum + 1
                print('crawl failed %d' % errorsNum)
        self.output.outputData()
Example #29
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.dataoutput = DataOutput()
     self.mongoengine = Use_MongoEngine()
     self.urloutput = Url_info_Output()
Example #30
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry url
        self.manager.add_new_url(root_url)
        # Check whether the URL manager has new urls and how many urls have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new url from the URL manager
                new_url = self.manager.get_new_url()
                # The html downloader downloads the page
                html = self.downloader.download(new_url)
                # print(html)
                # The html parser extracts the page data
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted urls to the URL manager
                self.manager.add_new_urls(new_urls)
                # The data store saves the data to a file
                self.output.store_data(data)
                print("Already crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
Example #31
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_addr = '127.0.0.1'
        print ('Connect to server %s...' % server_addr)

        self.m = BaseManager(address=(server_addr, 8001), authkey='qiye'.encode('utf-8'))
        print('connecting...')
        self.m.connect()
        print('connected')

        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('spider init finish')
Example #32
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        server_addr = '127.0.0.1'
        print ('Connect to server %s...' % server_addr)

        self.m = BaseManager(address=(server_addr, 8001), authkey='qiye'.encode('utf-8'))
        print('connecting...')
        self.m.connect()
        print('connected')

        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('spider init finish')


    def crawl(self):
        while True:
            try:
                # print self.task
                if not self.task.empty():
                    url = self.task.get()

                    if url == 'end':
                        print ('stop...')
                        # Notify the other nodes to stop
                        self.result.put({'new_urls':'end','data':'end'})
                        return
                    print ('spider is working on %s'%url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls':new_urls,'data':data})
            except EOFError as e:
                print('cannot connect to the other node')
                return
            except Exception as e:
                print(e)
                print('crawl fail')
Example #33
File: SpiderMan.py Project: cawind2/TestTu
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while(self.manager.has_new_url() and self.manager.old_url_size()<100):
            try:
                new_url = self.manager.get_new_url()
                html=self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url,html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print "already get %s url" % self.manager.old_url_size()
            except Exception,e:
                print "crawl failed"
        self.output.output_html()
Example #34
class DataHandler():
    searchData = None
    downloader = None
    failureList = []

    def __init__(self, user, passwd):
        self.searchData = SearchData()
        self.downloader = HtmlDownloader(user, passwd)
    
    def isOffLine(self):
        return self.downloader.isOffLine()

    def download(self, url):
        return self.downloader.downLoad(url)

    def printLog(self):
        print "failure records: "
        for failure in self.failureList:
            print "  " + failure
        self.failureList = []
        
    def addSearchItem(self, alias, name, category, quality):
        newItem = SearchItem()
        newItem.alias = alias
        newItem.name = name
        newItem.category = category
        newItem.quality = quality
        for item in self.searchData.searchItems:
            if newItem.alias == item.alias or (newItem.name == item.name and newItem.category == item.category and newItem.quality == item.quality):
                print("exist")
                return item
        self.searchData.searchItems.append(newItem)
        return newItem

    def getSearchItemURL(self, searchItem, page):
        url = "http://www.zhaoonline.com/search/"
        # URL-encode the path segments (urllib.parse.quote replaces the Python 2-only urllib.pathname2url)
        url += urllib.parse.quote(searchItem.name)
        url += "-8-3-trade-"
        url += urllib.parse.quote(categoryDic[searchItem.category])
        url += "-"
        url += urllib.parse.quote(qualityDic[searchItem.quality])
        url += "-00-N-0-N-1-"
        url += str(page)
        url += ".htm"
        return  url

    def getHistoryItemURL(self, ref):
        url = "http://www.zhaoonline.com"
        url += ref
        return url

    def updateSearchItem(self, searchItem):
        page = 1
        historyItemListParser = HistoryItemListParser()
        while historyItemListParser.hasNextPage(page):
            url = self.getSearchItemURL(searchItem, page)
            print "parsing search list: " + url
            html = self.downloader.getHtml(url)
            if html == None:
                break
            #html = self.downloader.download(url)
            if historyItemListParser.parse(html) == False:
                self.failureList.append(url)
            self.saveToListFile(searchItem.name+"_"+str(page), html)
            page += 1
        historyItemList = historyItemListParser.getHistoryItemList()
        historyItemListParser.clean()
        searchItem.historyItems = historyItemList
        # now every HistoryItem has id only
        for i in range(0, len(searchItem.historyItems)):
            historyItem = searchItem.historyItems[i]
            url = self.getHistoryItemURL(historyItem.ref)
            print "(" + str(i) + "/" + str(len(searchItem.historyItems))+ ") downloading page: " + url
            html = self.downloader.getHtml(url)
            if html == None:
                continue
            #html = self.downloader.download(url)
            historyItemParser = HistoryItemParser()
            historyItemParser.parse(html)
            tmpItem = historyItemParser.getHistoryItem()
            #historyItem.ref = tmpItem.ref
            historyItem.id = tmpItem.id
            historyItem.name = tmpItem.name
            historyItem.comments = tmpItem.comments
            historyItem.quality = tmpItem.quality
            historyItem.date = tmpItem.date
            historyItem.price = tmpItem.price
            historyItem.auctionText = tmpItem.auctionText
            historyItem.auctionData = tmpItem.auctionData
            # save the html content to tmp directory
            self.saveToTmpFile(historyItem, html)
        return

    def loadAllSearchItemsFromXml(self):
        searchResultXmlLoader = SearchResultXmlLoader()
        self.searchData = searchResultXmlLoader.loadAllXmlFiles()
        # debug
        #for searchItem in self.searchData.searchItems:
        #    self.dumpSearchItem(searchItem)
        return

    def saveAllSearchItemsToXml(self):
        for searchItem in self.searchData.searchItems:
            self.saveSearchItemToXml(searchItem)
        return

    def saveSearchItemToXml(self, searchItem):
        searchResultXmlGenerator = SearchResultXmlGenerator(searchItem)
        searchResultXmlGenerator.generateXml()
        return

    def getSearchItemByAlias(self, alias):
        if alias is None:
            return None
        for searchItem in self.searchData.searchItems:
            if searchItem.alias == alias:
                return searchItem
        return None

    def getAllSearchItems(self):
        return self.searchData.searchItems

    def saveToTmpFile(self, historyItem, html):
        fileName = 'tmp/' + historyItem.id + ".shtml"
        f= open(fileName, 'w')
        f.write(html)
        f.close()

    def saveToListFile(self, name, html):
        fileName = 'list/' + name + ".html"
        f= open(fileName, 'w')
        f.write(html)
        f.close()
        
    # debug function
    def dumpSearchItem(self, searchItem):
        print "Dumping SearchItem: " + searchItem.name
        print "  alias: " + searchItem.alias
        print "  category: " + searchItem.category
        print "  quality: " + searchItem.quality
        for historyItem in searchItem.historyItems:
            print "    name =     " + historyItem.name
            print "    comments = " + historyItem.comments
Example #35
 def __init__(self, user, passwd):
     self.searchData = SearchData()
     self.downloader = HtmlDownloader(user, passwd)
Example #36
File: SpiderMan.py Project: cawind2/TestTu
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
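None of the snippets above show how the schedulers are launched. A typical entry point, assuming a SpiderMan class like the ones above and a placeholder seed URL, would be no more than:

# Illustrative entry point (not from the original projects); the seed URL is a placeholder.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('https://baike.baidu.com/item/Python')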