Example #1
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data output component
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
                # print(new_url)
            except Exception as e:
                print("crawl failed: %s" % e)
            # The data output component writes the file in the target format
        self.output.output_html()
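
All of these schedulers depend on a UrlManager that tracks which URLs are still to be crawled and which have been crawled already; the class itself is not shown on this page. A minimal sketch of the interface the examples call (has_new_url, add_new_url, add_new_urls, get_new_url, new_url_size, old_url_size) might look like the following; the set-based deduplication is an assumption, not the original implementation:

class UrlManager(object):
    """Minimal sketch of the URL manager interface used by the schedulers above."""

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def has_new_url(self):
        return self.new_url_size() != 0

    def add_new_url(self, url):
        # Ignore URLs we have already seen in either set
        if url is not None and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls:
            for url in urls:
                self.add_new_url(url)

    def get_new_url(self):
        # Pop an uncrawled URL and remember it as crawled
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def new_url_size(self):
        return len(self.new_urls)

    def old_url_size(self):
        return len(self.old_urls)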
Example #2
class SpiderMan(object):
    '''Spider scheduler.
    Attributes:
        manager: URL manager
        downloader: HTML downloader
        parser: HTML parser
        output: data output / storage
    '''
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        '''Spider scheduling entry point.
        Args:
            root_url: entry URL for the crawl
        Raises:
            Exception: 'NoneType' object has no attribute
        '''
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('Crawl failed: %s' % e)
        self.output.output_html()
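
The HtmlDownloader is likewise left out of these listings. A possible sketch built on the requests library, returning the page text or None on failure (the choice of requests, the User-Agent header, and the encoding handling are assumptions, not the original downloader):

import requests


class HtmlDownloader(object):
    """Sketch of a downloader: fetch a URL and return its HTML text, or None on failure."""

    def download(self, url):
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0'}  # present a browser-like User-Agent
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            response.encoding = response.apparent_encoding  # guess the page encoding
            return response.text
        return None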
Example #3
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)

        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 50):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                if html is None:
                    print('failed to get page')
                    continue
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has scraped %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed: %s' % e)
        self.output.output_html()
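
HtmlParser.parser(page_url, html) is expected to return a tuple (new_urls, data). What it extracts depends entirely on the target site; a generic sketch with BeautifulSoup that collects absolute links and the page title could look like this (the library choice and the extracted fields are illustrative assumptions, not the original parser):

from urllib.parse import urljoin

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Sketch of a parser returning (new_urls, data) as the schedulers expect."""

    def parser(self, page_url, html):
        if page_url is None or html is None:
            return set(), None
        soup = BeautifulSoup(html, 'html.parser')
        # Collect outgoing links, resolved to absolute URLs
        new_urls = {urljoin(page_url, a['href']) for a in soup.find_all('a', href=True)}
        # Keep whatever data the project needs; here just the URL and the title
        data = {
            'url': page_url,
            'title': soup.title.string.strip() if soup.title and soup.title.string else '',
        }
        return new_urls, data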
Example #4
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                # self.manager.add_new_url(new_urls)  raises TypeError: a set is unhashable, so an iterable cannot be passed to add_new_url
                for new_url in new_urls:
                    self.manager.add_new_url(new_url)
                # Store the data with the data output component
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed: ' + str(e))
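
The commented-out call above fails because add_new_url stores its argument in a set, and a set argument is itself unhashable. A tiny standalone demonstration of the mechanism (hypothetical values, just to show the error):

seen = set()
seen.add("http://example.com/a")      # fine: strings are hashable
try:
    seen.add({"http://example.com/b", "http://example.com/c"})
except TypeError as e:
    print(e)  # unhashable type: 'set'
# The fix is to add the URLs one by one, as the loop in Example #4 does,
# or to use an add_new_urls() helper that loops internally.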
Example #5
class SpiderMan(object):

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print(self.manager.old_url_size())
                print(data)
            except Exception as e:
                print('crawl failed: %s' % e)

        self.output.output_question()
        self.output.output_answer()
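
DataOutput collects records via store_data and writes them out at the end, as output_html in most examples or output_question/output_answer here. A minimal sketch of the HTML variant, assuming each stored record is a dict like the parser sketch above returns (the file name and table layout are made up for illustration):

import codecs


class DataOutput(object):
    """Sketch of a data store: buffer records in memory, then dump them as an HTML table."""

    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_html(self, path='output.html'):
        with codecs.open(path, 'w', encoding='utf-8') as fout:
            fout.write('<html><body><table>\n')
            for data in self.datas:
                fout.write('<tr><td>%s</td><td>%s</td></tr>\n'
                           % (data.get('url', ''), data.get('title', '')))
            fout.write('</table></body></html>\n')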
Example #6
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 200 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 200):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print("Extracted %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed", e)

        self.output.output_html()
Example #7
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data to a file
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed: %s" % e)
        # The data output component writes the file in the target format
        self.output.output_html()
Example #8
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_th, new_td, new_urls = self.parser.parser(new_url, html, "th", "时间", "td")
                # Add the extracted URLs to the URL manager ("meiyou" is the parser's sentinel for "nothing found")
                if new_urls != "meiyou":
                    self.manager.add_new_urls(new_urls)
                    print("Crawled %s links so far" % self.manager.old_url_size())
                # Store the data with the data output component
                if new_th != "meiyou" and new_td != "meiyou":
                    self.output.store_data(new_th, new_td)
                    self.output.output_html()
            except Exception as e:
                print("Crawl failed: %s" % e)
Example #9
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # print(html)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data output component
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed: %s" % e)
        self.output.output_html()
Example #10
def url_manager_proc(url_q, conn_q, root_url, num=6):
    """
    :param url_q:里面放的是url集合单个url
    :param conn_q:里面放的是url集合
    :param root_url:
    :param num:
    :return:
    """
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:

        while url_manager.has_new_url():
            print("# url_manager_proc将要爬取的url放入url_q中")
            new_url = url_manager.get_new_url()
            print(new_url)
            url_q.put(new_url)
            if url_manager.old_url_size() > num:
                # 通知爬行节点工作结束
                url_q.put('end')
                print('控制节点发起结束通知!')
                # 关闭管理节点,同时存储 set 状态
                url_manager.save_progress()
                break
        try:
            if not conn_q.empty():
                print("# url_manager_proc从conn_q中拿取urls")
                urls = conn_q.get()
                print(urls)
                url_manager.add_new_urls(urls)
            else:
                # 延时休息
                time.sleep(0.1)
        except Exception as e:
            print(e)
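
Examples #10, #11 and #16 to #18 are the control-node side of a distributed crawler: url_q carries URLs out to the crawler nodes and conn_q brings newly discovered URL sets back. How the queues and processes are created is not shown on this page; a minimal single-machine wiring sketch with multiprocessing follows (fake_crawler_proc and the queue setup are illustrative assumptions; a real deployment would more likely expose the queues over the network to separate crawler nodes):

import multiprocessing


def fake_crawler_proc(url_q, conn_q):
    """Stand-in for a crawler node: read URLs from url_q, push back a made-up URL set."""
    while True:
        url = url_q.get()
        if url == 'end':
            break
        print('crawler node got', url)
        conn_q.put({url + '/child1', url + '/child2'})


if __name__ == '__main__':
    url_q = multiprocessing.Queue()   # control node -> crawler nodes
    conn_q = multiprocessing.Queue()  # crawler nodes -> control node
    manager = multiprocessing.Process(target=url_manager_proc,  # the function from Example #10
                                      args=(url_q, conn_q, 'http://example.com', 6))
    manager.start()
    fake_crawler_proc(url_q, conn_q)  # runs until the control node sends 'end'
    manager.terminate()               # Example #10's outer loop never returns on its own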
Example #11
	def url_manager_proc(self,url_q,conn_q,root_url):
		url_manager = UrlManager()
		url_manager.add_new_url(root_url)
		while True:
			while(url_manager.has_new_url()):
				new_url = url_manager.get_new_url()
				url_q.put(new_url)
				print('old_url=',url_manager.old_url_size())				
				if(url_manager.old_url_size()>30):
					url_q.put('end')
					print('Control node sent the end notification!')
					url_manager.save_progress('new_urls.txt',url_manager.new_urls)
					url_manager.save_progress('old_urls.txt',url_manager.old_urls)
					return
				try:				
					urls = conn_q.get()
					url_manager.add_new_urls(urls)
				except:
					time.sleep(0.1)
Example #12
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("already got %s urls" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed: %s" % e)
        self.output.output_html()
Example #13
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.download = HtmlDownload()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self,root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()  and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.download.download(new_url)
                new_urls,data = self.parser.parser(new_url,html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print(e)
        self.output.output_html()
Example #14
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)  # first add the entry link to the set of unprocessed URLs
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (
                self.manager.has_new_url()
                and self.manager.old_url_size() < 100
        ):  # Note: the first page already yields 70-odd links, so with n iterations and m links per page the total found is n*m and most links discovered later are never crawled within the 100-URL cap; a better scheme would crawl all links of one page before moving on, i.e. a two-level loop.
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                print(new_url)
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)  # download the page's HTML
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(
                    new_url, html)  # parse the page: collect every link plus a piece of content
                print(new_urls)
                print(
                    len(new_urls)
                )  # each page yields many URLs, all added to the unprocessed set, but the loop only runs 100 times, so most of them are never crawled
                print(data)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)  # insert the new URL set into the unprocessed set
                # Store the data with the data output component
                self.output.store_data(data)  # append the data to the output file

                print("Crawled %s links so far" % self.manager.old_url_size())

            except Exception as e:
                print(e)
                print("crawl failed")
            # The data output component writes the file in the target format
        self.output.output_html()
Example #15
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)

        n = 0
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:

                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()

                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)

                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                if n == 0:
                    # Only the URLs extracted from the first page are added to the URL manager
                    self.manager.add_new_urls(new_urls)

                # Store the data with the data output component
                self.output.store_data(data)
                n += 1

                print('Crawled %s links so far' % self.manager.old_url_size())

            except Exception as e:
                print(e)
        # The data output component writes the file in the target format
        self.output.output_html(self.output.filepath)
        self.output.output_end(self.output.filepath)
Example #16
 def url_manager_proc(self, url_q, conn_q, root_url):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while (url_manager.has_new_url()):
             new_url = url_manager.get_new_url()
             url_q.put(new_url)
             print('old_url=', url_manager.old_url_size())
             if (url_manager.old_url_size() > 2000):
                 url_q.put('end')
                 print('Control node sent the end notification')
                 url_manager.save_progress('new_urls.txt',
                                           url_manager.new_urls)
                 url_manager.save_progress('old_urls.txt',
                                           url_manager.old_urls)
                 return
             try:
                 if not conn_q.empty():
                     urls = conn_q.get()
                     url_manager.add_new_urls(urls)
             except BaseException as e:
                 print(e)
                 time.sleep(0.1)
Example #17
    def url_manager_proc(self, url_q, conn_q, root_url):
        """从conn_q队列获取新URL到URL管理器, 取URL放入url_q供爬虫节点获取"""
        url_manager = UrlManager()
        url_manager.add_new_url(root_url)
        while True:
            while (url_manager.has_new_url()):
                new_url = url_manager.get_new_url()
                url_q.put(new_url)
                logging.info("old_url_size = %s " % url_manager.old_url_size())

                if url_manager.old_url_size() > 50:
                    url_q.put("end")
                    logging.info("控制节点发起结束通知")
                    url_manager.save_process("new_urls.txt",
                                             url_manager.new_urls)
                    url_manager.save_process("old_urls.txt",
                                             url_manager.old_urls)
                    return
            try:
                if not conn_q.empty():
                    urls = conn_q.get()
                    url_manager.add_new_urls(urls)
            except BaseException as e:
                time.sleep(0.1)
Example #18
 def url_manager_proc(self, url_q, conn_q, root_url):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             print("url " + new_url)
             url_q.put(new_url)
             # print("old_url=",url_manager.old_url_size())
             if url_manager.old_url_size() > 2000:
                 url_q.put("end")
                 print("控制节点发起结束通知!")
                 url_manager.save_progress("new_urls.txt",
                                           url_manager.new_urls)
                 url_manager.save_progress("old_urls.txt",
                                           url_manager.old_urls)
                 return
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 # print(urls)
                 url_manager.add_new_urls(urls)
         except BaseException:
             time.sleep(0.1)
Example #19
 def url_manager_proc(self, url_q, conn_q, root_url, num=200):
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         while url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             url_q.put(new_url)
             if url_manager.old_url_size() > num:
                 # Notify the crawler nodes that the work is finished
                 url_q.put('end')
                 print('Control node sent the end notification!')
                 # Shut down the manager node and persist the URL sets
                 url_manager.save_progress()
                 return
         # When there are no new URLs, fetch more from conn_q
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 url_manager.add_new_urls(urls)
             else:
                 # Sleep briefly
                 time.sleep(0.1)
         except Exception as e:
             print(e)
Example #20
class SxsIntern(object):
    def __init__(self):
        self.manger = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def get_collect_intern_urls(self, username, password):
        collect_url = '{0}/my/collect'.format(self.main_url)

        self.downloader.login(username, password)
        response = self.downloader.get(collect_url,
                                       session=self.downloader.session)
        page_num = self.parser.get_collect_page_num(response)
        print(f'Expecting to download {page_num} pages')

        for i in range(1, int(page_num) + 1):
            page_url = '{0}?p={1}'.format(collect_url, i)
            response = self.downloader.get(page_url,
                                           session=self.downloader.session)
            urls = self.parser.get_intern_urls(response, 'collect')
            self.manger.add_new_urls(urls)

    def get_job_urls(self,
                     job='数据',
                     city='北京',
                     pages=100,
                     release_time='ft-wek'):
        """
        爬取指定的job信息
        
        Parameters
        ----------
        job: 职位信息,搜索关键字
        city: 所在城市,默认'北京'
        pages: 设定爬取多少页信息,默认为100,如果页面不足则以实际页面为准
        release_time: 发布时间,默认为'ft-wek',即获取一周内发布的职位,具体参数为:
                        'ft-day': 一天内
                        'ft-wek': 一周内
                        'ft-mon': 一月内
        """
        # ft-day, ft-wek, ft-mon
        city_code = self.parser.get_city_code(city)

        if release_time not in ['ft-day', 'ft-wek', 'ft-mon']:
            raise ValueError(
                'release_time must be one of ["ft-day", "ft-wek", "ft-mon"]')

        page = 1
        url = '{url}/interns/st-intern_c-{c}_{r}?k={k}&p={p}'.format(
            url=self.main_url, r=release_time, c=city_code, k=job, p=page)
        response = requests.get(url, headers=self.headers)

        # Get the total number of result pages
        page_num = re.search(r'<a href=\".*?p=(.*?)\">尾页',
                             response.text).group(1)
        page_num = min(int(page_num), int(pages))
        print(f'Expecting to download {page_num} pages')
        response.close()

        # Process the result pages one by one
        for page in range(1, page_num + 1):
            url = '{url}/interns/st-intern_c-{c}_{r}?k={k}&p={p}'.format(
                url=self.main_url, r=release_time, c=city_code, k=job, p=page)
            response = requests.get(url, headers=self.headers)
            links = self._get_internlinks(response, 'jobs')
            self._links_parse(links)
            response.close()

    def crawl(self, root_url, save_path=None, max_amount=100):
        self.manger.add_new_url(root_url)
        while (self.manger.has_new_url()
               and self.manger.old_url_size() < max_amount):
            try:
                new_url = self.manger.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manger.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Already found {} urls'.format(self.manger.old_url_size()))
            except Exception as e:
                print('Crawl failed: {}'.format(e))
        self.output.write(save_path)
Example #21
from URLManager import UrlManager
import pickle
import hashlib

print("has_new_url", UrlManager.has_new_url.__doc__)
print("add_new_url", UrlManager.add_new_url.__doc__)
print("add_new_urls", UrlManager.add_new_urls.__doc__)
print("get_new_url", UrlManager.get_new_url.__doc__)
print("new_url_size", UrlManager.new_url_size.__doc__)
print("old_url_size", UrlManager.old_url_size.__doc__)
print("save_progress", UrlManager.save_progress.__doc__)
print("load_progress", UrlManager.load_progress.__doc__)

urls = set([
    "http://qq.ip138.com/tianqi/", "http://qq.ip138.com/shenfenzheng/",
    "http://qq.ip138.com/huoche/",
    "http://qq.ip138.com/daishoudian/mobile.htm",
    "http://www.miitbeian.gov.cn/"
])
urlmanager = UrlManager()
print(type(urls))
# Add the new URL set to the manager
urlmanager.add_new_urls(urls)
print(urlmanager.has_new_url())
# Have the manager hand out one uncrawled URL
new_url = urlmanager.get_new_url()  # the URL is moved into the crawled set at the same time
# Returns None when there are no uncrawled URLs left
print(new_url)
print(urlmanager.old_url_size())
# Save progress
urlmanager.save_progress()
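
Example #21 also exercises save_progress and load_progress, which persist the URL sets between runs (hence the pickle import). Their signatures differ across the examples on this page, some pass a file name and a set while others pass nothing, so the following is only a sketch of the two-argument style, written as standalone functions rather than the original methods:

import pickle


def save_progress(path, data):
    """Pickle a URL set (or any object) to the given file."""
    with open(path, 'wb') as f:
        pickle.dump(data, f)


def load_progress(path):
    """Load a previously pickled URL set; return an empty set if the file is missing."""
    print('[+] loading progress from %s' % path)
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        print('[!] no progress file, starting from scratch')
        return set()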