Example #1
 def __init__(self):
     """构造函数,初始化属性"""
     self.urls = UrlManager()
     self.log = MyLog("spider_main", "logs")
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
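
Every example on this page wires an HtmlDownloader together with a URL manager, a parser and an outputer, but none of them show those collaborators. The sketch below is a minimal, assumed version of the two pieces most examples rely on (a UrlManager and an HtmlDownloader built on urllib); each real project defines its own, so treat the names and behaviour here as placeholders rather than the actual implementations.

import urllib.request


class UrlManager:
    """Minimal URL manager: tracks pending and visited URLs (assumed interface)."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return bool(self.new_urls)

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url


class HtmlDownloader:
    """Minimal downloader: returns the page HTML as text, or None on failure."""

    def download(self, url):
        if url is None:
            return None
        try:
            with urllib.request.urlopen(url, timeout=10) as resp:
                return resp.read().decode('utf-8', errors='replace')
        except Exception:
            return None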
Example #2
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0],t,url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl failed')
        self.output.output_end()
        print('Crawl finish')
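
The rank_url above is assembled with manual '%' formatting. As a hedged alternative (not the original project's code), the same query string can be built with urllib.parse.urlencode, which takes care of the escaping; the parameter names are taken from the example and the timestamp suffix is kept as-is.

import time
from urllib.parse import urlencode


def build_rank_url(movie_url, movie_id):
    # Same fields as the hand-built query string above; urlencode escapes the values.
    t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
    params = {
        'Ajax_CallBack': 'true',
        'Ajax_CallBackType': 'Mtime.Library.Services',
        'Ajax_CallBackMethod': 'GetMovieOverviewRating',
        'Ajax_CrossDomain': 1,
        'Ajax_RequestUrl': movie_url,
        't': t,
        'Ajax_CallBackArgument0': movie_id,
    }
    return 'http://service.library.mtime.com/Movie.api?' + urlencode(params)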
Example #3
class SpiderMain:

    def __init__(self):
        """
        Initializer: instantiate the other components.
        """
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        Main method that starts the spider.
        :return:
        """
        """ 页码 """
        title = set()
        for a in range(2, 10):
            html = self.html_downloader.download(
                'http://ggzy.foshan.gov.cn/jyxx/fss/zfcg_1108551/zbxx/index_'+str(a)+'.html?1')
            _title = self.html_parser.titleParer(html)
            for i in _title:
                title.add(i)
        for i in title:
            print(i)
            html = self.html_downloader.download(i)
            _product = self.html_parser.contextParer(html)
            self.data_storage.storage(_product)
Example #4
 def __init__(self):
     """
     Initializer: instantiate the other components.
     """
     self.url_manager = UrlManager()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     self.data_storage = DataStorage()
Example #5
 def __init__(self):
     # URL manager
     # self.urls = UrlManager.UrlManager()
     self.urls = UrlManager()
     # URL downloader
     # self.downloader = HtmlDownloader.HtmlDownloader()
     self.downloader = HtmlDownloader()
     # URL parser
     # self.parser = html_parser.HtmlParser()
     self.parser = HtmlParser()
     # self.outputer = html_outputer.HtmlOutputer()
     self.outputer = HtmlOutputer()
Example #6
 def __init__(self):
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     server_addr = '127.0.0.1'
     print('Connect to server %s...' % server_addr)
     self.m = BaseManager(address=(server_addr,8001),authkey=b'baike')
     self.m.connect()
     self.task = self.m.get_task_queue()
     print(self.task.qsize())
     self.result = self.m.get_result_queue()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish')
Example #7
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr,8001),authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        print(self.task.qsize())
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        import time
        while(True):
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('控制节点通知爬虫节点停止工作...')
                        self.result.put({'new_urls':'end', 'data':'end'})
                        return
                    print('爬虫节点正在解析:%s' % url.encode('utf-8'))
                    content = self.downloader.download(url)
                    new_urls,data = self.parser.parser(url,content)
                    self.result.put({'new_urls':new_urls, 'data':data})
            except EOFError as e:
                print('连接工作节点失败')
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
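
The worker above only makes sense next to a control node that actually serves get_task_queue and get_result_queue. Below is a minimal sketch of that side, assuming the same address, port 8001 and authkey b'baike' as the worker; the seed URL and the scheduling loop are placeholders, not the original project's code.

from multiprocessing.managers import BaseManager
import queue

# The two queues the workers will see through their proxies.
task_queue = queue.Queue()
result_queue = queue.Queue()


def get_task_queue():
    return task_queue


def get_result_queue():
    return result_queue


if __name__ == '__main__':
    # Register the same names the workers look up, then start serving.
    BaseManager.register('get_task_queue', callable=get_task_queue)
    BaseManager.register('get_result_queue', callable=get_result_queue)
    manager = BaseManager(address=('127.0.0.1', 8001), authkey=b'baike')
    manager.start()
    task_q = manager.get_task_queue()
    result_q = manager.get_result_queue()
    task_q.put('https://baike.baidu.com/item/Python')  # placeholder seed URL
    # ... read parsed results from result_q and push new URLs back into task_q ...
    task_q.put('end')  # matches the stop signal the worker checks for
    manager.shutdown()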
Example #8
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutputer()

    def craw(self, root_url, page_amount=5, time_sleep=None):
        count = 1
        # Seed the manager with the first URL to crawl
        self.urls.add_new_url(root_url)
        # If the set still holds URLs, take one and request it; stop when there are none left
        while self.urls.has_new_url():
            try:
                # Start crawling
                new_url = self.urls.get_new_url()
                print(f'craw{count}:{new_url}')
                # Request the URL and get its HTML back
                html_content = self.downloader.download(new_url)
                # Parse the HTML (XPath) to get the data we need
                new_urls, new_data = self.parser.parse(html_content)
                # Add the entry links found on this page to the URL manager for later crawling
                self.urls.add_new_urls(new_urls)
                self.output.collect_data(new_url, new_data)
                count += 1
                if count > page_amount:
                    break

                time.sleep(2)
            except Exception as e:
                print(e)
                print(f'抓取失败:{new_url}')
        self.output.output_html()
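
A short usage sketch for the class above; the Baidu Baike entry URL is only an assumed starting point, not part of the original example.

if __name__ == '__main__':
    spider = SpiderMain()
    # Crawl up to 5 linked entry pages starting from one seed article (assumed URL).
    spider.craw('https://baike.baidu.com/item/Python', page_amount=5)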
Example #9
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, html_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(html_data)
                print "%d craw success : %s" % (count, new_url)
                if count >= 10:
                    break
                count = count + 1
            except Exception as e:
                print(str(e))
                print("%d craw failed : %s" % (count, new_url))
        self.outputer.output()
Example #10
 def __init__(self):
     # Instantiate the other module classes
     self.mysql_handler = MysqlHandler()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     # Starting URL for the crawl
     self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
     # Base URL used to build the follow-up page URLs
     self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
     # Province page list
     self.province_url_list = []
     # City page list
     self.city_url_list = []
     # County/district page list
     self.county_url_list = []
     # Town and sub-district page list
     self.town_url_list = []
Example #11
 def __init__(self, address='127.0.0.1', port=8001, authkey=b'baike'):
     """初始化分布式进程中工作节点的连接工作"""
     # 注册用于获取Queue的方法名称
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     # 连接到服务器
     print('Connect to server %s:%s...' % (address, port))
     self.manager = BaseManager(address=(address, port), authkey=authkey)
     # 开始连接
     self.manager.connect()
     # 获取Queue对象
     self.task_q = self.manager.get_task_queue()
     self.result_q = self.manager.get_result_queue()
     # 初始化下载器和解析器
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish')
Example #12
 def __init__(self):
     args = ArgumentParser()
     index_start = 1
     try:
         with open(args.index_end_path, 'r', encoding='utf-8') as f:
             index_end = int(f.readline().strip('\n'))
     except Exception as e:
         print(e)
         sys.exit(-1)
     self.new_urls = set()
     print("Adding all urls ...")
     for index in range(index_start, index_end):
         url = "https://baike.baidu.com/view/" + str(index)
         self.new_urls.add(url)
     print("Done.")
     self.old_urls = set()
     self.fail_urls = set()
     self.fail_url_mark = True
     self.downloader = HtmlDownloader()
Example #13
    def craw(self):
        # Download
        downloader = HtmlDownloader()

        root_cont = downloader.download(self.url)
        parser = HtmlParser()
        urls, data = parser.parse(self.url, root_cont, True)
        result = ""
        for url in urls:
            cont = downloader.download(url)
            newurls, month = parser.parse(url, cont, False)
            if month != None:
                result += month.getMonthly()
            month = None
            #print(month.getMonthly())

        f = open("阿里巴巴数据库内核组月报.md", "w+", encoding='utf-8')
        result = "## 阿里巴巴数据库内核月报\n\n" + result
        f.write(result)
        f.close()

        pass
Example #14
class Spider:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)
        for url in urls:
            try:
                # http://service.library.mtime.com/Movie.api
                # ?Ajax_CallBack=true
                # &Ajax_CallBackType=Mtime.Library.Services
                # &Ajax_CallBackMethod=GetMovieOverviewRating
                # &Ajax_CrossDomain=1
                # &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F246526%2F&t=201710117174393728&Ajax_CallBackArgument0=246526
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallbackArgument0=%s' % (url[0].replace('://', '%3A%2F%2F')[:-1], t, url[1])
                rank_content = self.downloader.download(rank_url)
                if rank_content is None:
                    print('None')
                data = self.parser.parse_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                raise e
                # print(e)
                # print('Crawl failed')

        self.output.output_end()
        print('Crawl finish')
Example #15
class SpiderWorker:
    def __init__(self, address='127.0.0.1', port=8001, authkey=b'baike'):
        """初始化分布式进程中工作节点的连接工作"""
        # 注册用于获取Queue的方法名称
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # 连接到服务器
        print('Connect to server %s:%s...' % (address, port))
        self.manager = BaseManager(address=(address, port), authkey=authkey)
        # 开始连接
        self.manager.connect()
        # 获取Queue对象
        self.task_q = self.manager.get_task_queue()
        self.result_q = self.manager.get_result_queue()
        # 初始化下载器和解析器
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):

        while True:
            try:
                if not self.task_q.empty():
                    url = self.task_q.get()

                    if url == 'end':
                        print('控制节点通知爬虫节点停止工作...')
                        # Pass the stop signal on to the other nodes
                        self.result_q.put({'new_urls': 'end', 'data': 'end'})
                        return

                    print('爬虫节点正在解析: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parse(url, content)
                    self.result_q.put({'new_urls': new_urls, 'data': data})

                else:
                    print('task queue is empty', self.task_q.empty())
            except EOFError:
                print('连接工作节点失败')
                return
            except Exception as e:
                print(e)
                print('crawl fail')
Example #16
class SpiderMain():
    def __init__(self):
        # URL manager
        # self.urls = UrlManager.UrlManager()
        self.urls = UrlManager()
        # URL downloader
        # self.downloader = HtmlDownloader.HtmlDownloader()
        self.downloader = HtmlDownloader()
        # URL parser
        # self.parser = html_parser.HtmlParser()
        self.parser = HtmlParser()
        # self.outputer = html_outputer.HtmlOutputer()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        originSet = set()
        originSet.add(root_url)
        self.urls.add_new_urls(originSet)
        while self.urls.has_new_rul():
            try:
                new_url = self.urls.get_new_url()
                print "craw %d : %s" % (count, new_url)
                html_cont = self.downloader.downloader(new_url)

                # Report the download status
                downStat = "ERROR"
                if html_cont is not None:
                    downStat = "SUCCESS"
                    print("[Page ID : %d downloader %s!]" % (count, downStat))

                new_urls, new_data = self.parser.parser(new_url, html_cont)
                # print "\nnew_urls[%s], new_data[%s]" % (new_urls, new_data)

                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 15:
                    break
                count = count + 1
            except Exception as err:
                print "craw failed! ERROR infomation : %s" % err
        self.outputer.output_html()
Example #17
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        #self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.path = "/Users/spike/python_项目/get_cd_school/"
        # # Starting URL for the crawl
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # Base URL used to build the follow-up page URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # school info
        # self.school_infos = []

    def craw(self, downloading_url):
        try:
            # Keep track of the URL being downloaded/parsed, to help diagnose errors
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used for joining relative URLs
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            #exit()
            if (len(self.school_infos) != 20):
                print(downloading_url + "解析成功")
                print("当前页面数据:" + str(len(self.school_infos)))
            #print(self.province_url_list)
            with open(self.path + "school.txt", "a") as f:
                # print("writting")
                for mc, xd, qy, xz, dh, dz in self.school_infos:
                    f.write(mc + "\t" + xd + "\t" + qy + "\t" + xz + "\t" +
                            dh + "\t" + dz)
            f.close()
            return len(self.school_infos)

        except Exception as e:
            print('[ERROR] Craw Field!Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
Example #18
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Starting URL for the crawl
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # Base URL used to build the follow-up page URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # # school info
        # self.school_infos = []
        # Adjust the log file path below for your own machine
        # self.last_log_path = "d:\\log.txt"
        # self.last_log_path = "/Users/spike/spider_log.txt"
    def craw(self,downloading_url):
        try:
            # Keep track of the URL being downloaded/parsed, to help diagnose errors
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            if (len(self.school_infos)!=20):
                print(downloading_url+"解析成功")
                print("当前页面数据:"+str(len(self.school_infos)))
            for mc,xd,qy,xz,dh,dz in self.school_infos:
                # print(mc+xd+qy+xz+dh+dz)
                province_id = self.mysql_handler.insert(mc,xd,qy,xz,dh,dz)     
                # print(province_id)
                # exit()
                # Keep track of the URL being downloaded/parsed, to help diagnose errors
            # self.mysql_handler.close()
            return len(self.school_infos)
        except Exception as e:
            print('[ERROR] Craw Field!Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
            time.sleep(60)            
Example #19
class SpiderMain:
    def __init__(self):
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        Main method that starts the spider.
        :return:
        """
        self.url_manager.add_new_url(
            "http://127.0.0.1:8848/xiaomi-master/index.html")
        # Get a URL from the URL manager
        url = self.url_manager.get_new_url()
        # Download the page at that URL with the downloader
        html = self.html_downloader.download(url)
        # Parse the HTML
        res = self.html_parser.parser(html)
        # Store the data
        self.data_storage.storage(res)
Example #20
class SpiderMain(object):
    """docstring for SpiderMain"""
    def __init__(self):
        self.urlManage = UrlManage()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
    def craw(self,url):
        self.urlManage.add_new_url(url)
        
        count = 1
        while self.urlManage.has_new_url():
            url = self.urlManage.get_new_url()
            print('%dth page,address:%s' % (count, url))
            html_content = self.downloader.downloadPage(url)
            new_urls,new_data = self.parser.parse(html_content,url)
            self.urlManage.add_new_urls(new_urls)
            self.outputer.collect_data(new_data)

            if count == 10:
                break

            count = count + 1 
        self.outputer.output_html()
Example #21
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()  # get the next URL
                html_cont = self.downloader.download(new_url)  # download the page content
                new_urls, new_data = self.parser.parse(new_url,
                                                       html_cont)  # parse the page content
                self.urls.add_new_urls(new_urls)  # add the newly found URLs to the URL manager
                self.outputer.collect_data(new_data)  # collect the parsed data
                if count == 200:
                    break
                count = count + 1
            except:
                print("craw failed")
        self.outputer.output_html()
Example #22
class UrlManager(object):
    def __init__(self):
        args = ArgumentParser()
        index_start = 1
        try:
            with open(args.index_end_path, 'r', encoding='utf-8') as f:
                index_end = int(f.readline().strip('\n'))
        except Exception as e:
            print(e)
            sys.exit(-1)
        self.new_urls = set()
        print("Adding all urls ...")
        for index in range(index_start, index_end):
            url = "https://baike.baidu.com/view/" + str(index)
            self.new_urls.add(url)
        print("Done.")
        self.old_urls = set()
        self.fail_urls = set()
        self.fail_url_mark = True
        self.downloader = HtmlDownloader()
        # self.update_new_url(index_end, args)

    def update_new_url(self, index_end, args):
        err_cnt = 0
        start = index_end
        end = start
        while True:
            if err_cnt > 10:
                break
            url = "https://baike.baidu.com/view/" + str(start)
            response, response_url = self.downloader.download_update(url)
            if not response_url or response_url == 'https://baike.baidu.com/error.html':
                err_cnt += 1
            else:
                err_cnt = 0
                self.new_urls.add(url)
                end = start
            start += 1
        with open(args.index_end_path, 'w', encoding='utf-8') as f:
            f.write(str(end))

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_old_url(self, url):
        if url is None:
            return
        if url not in self.new_urls:
            self.old_urls.add(url)

    def add_fail_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.fail_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            if url not in self.old_urls:
                self.new_urls.add(url)

    def has_new_url(self):
        if len(self.new_urls) != 0:
            return True
        elif self.fail_url_mark:
            self.new_urls = self.fail_urls.copy()
            self.fail_urls.clear()
            self.fail_url_mark = False
            return True
        else:
            return False

    def get_new_url(self):
        new_url = self.new_urls.pop()
        return new_url
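
One detail worth calling out in this UrlManager is has_new_url(): when new_urls runs dry it promotes the failed URLs exactly once, giving every failed page a single retry pass. The loop below is only a sketch of that behaviour, assuming url_manager is an already-constructed instance of the class above; the None/error checks around download_update() are assumptions for illustration, not code from the project.

while url_manager.has_new_url():            # becomes True one extra time when fail_urls is promoted
    url = url_manager.get_new_url()
    response, final_url = url_manager.downloader.download_update(url)
    if not final_url or final_url == 'https://baike.baidu.com/error.html':
        url_manager.add_fail_url(url)       # retried on the single fallback pass
    else:
        url_manager.add_old_url(url)        # mark as done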
Example #23
class SpiderMain():
    """爬虫程序主模块"""
    def __init__(self):
        """构造函数,初始化属性"""
        self.urls = UrlManager()
        self.log = MyLog("spider_main", "logs")
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
        #self.util=utill.DBConn()

    def craw(self, root_url):
        """爬虫入口函数"""
        areas = {
            "gulou": 100,
            "jianye": 72,
            "qinhuai": 100,
            "xuanwu": 67,
            "yuhuatai": 32,
            "qixia": 62,
            "baijiahu": 33,
            "chalukou1": 26,
            "jiangningqita11": 3,
            "dongshanzhen": 29,
            "jiangningdaxuecheng": 15,
            "jiulonghu": 12,
            "jiangjundadao11": 22,
            "kexueyuan": 9,
            "qilinzhen": 42,
            "tiexinqiao": 9,
            "pukou": 100,
            "liuhe": 1,
        }

        #areas = {"gulou":1}

        # 1. Collect the links to all second-hand-housing detail pages and put them into the URL manager
        for area, pg_sum in areas.items():
            for num in range(1, pg_sum + 1):
                # 1.1 Build the listing-page URL, e.g. https://nj.lianjia.com/ershoufang/gulou/pg2/
                pg_url = root_url + area + "/pg" + str(num) + "/"
                self.log.logger.info("1.1 拼接页面地址:" + pg_url)
                print("1.1 拼接页面地址:" + pg_url)
                # 1.2 Run the downloader and fetch the page
                try:
                    html_cont = self.downloader.download(pg_url)
                except Exception as e:
                    self.log.logger.error("1.2 下载页面出现异常:" + repr(e))
                    time.sleep(60 * 30)
                else:
                    # 1.3 Parse the listing page, get the detail-page links and add them all to the URL manager
                    try:
                        ershoufang_urls = self.parser.get_erhoufang_urls(
                            html_cont)
                    except Exception as e:
                        self.log.logger.error("1.3 页面解析出现异常:" + repr(e))
                    else:
                        self.urls.add_new_urls(ershoufang_urls)
                        # Sleep for a random whole number of seconds in [0, 3]
                        time.sleep(random.randint(0, 3))

        time.sleep(60 * 20)
        # 2. Parse the individual detail pages
        id = 1
        stop = 1
        while self.urls.has_new_url():
            # 2.1 Get a URL
            try:
                detail_url = self.urls.get_new_url()
                self.log.logger.info("2.1 二手房页面地址:" + detail_url)
                print("2.1 二手房页面地址:" + detail_url)
            except Exception as e:
                print("2.1 拼接地址出现异常")
                self.log.logger.error("2.1 拼接地址出现异常:" + detail_url)

            # 2.2 Download the page
            try:
                detail_html = self.downloader.download(detail_url)
            except Exception as e:
                self.log.logger.error("2.2 下载页面出现异常:" + repr(e))
                self.urls.add_new_url(detail_url)
                time.sleep(60 * 30)
            else:
                # 2.3 Parse the page
                try:
                    ershoufang_data = self.parser.get_ershoufang_data(
                        detail_html, id)
                except Exception as e:
                    self.log.logger.error("2.3 解析页面出现异常:" + repr(e))
                else:
                    # 2.4 Output the data
                    try:
                        self.outputer.collect_data(ershoufang_data)
                    except Exception as e:
                        self.log.logger.error("2.4 输出数据出现异常:" + repr(e))
                    else:
                        print(id)
                        id = id + 1
                        stop = stop + 1
                        # Sleep for a random whole number of seconds in [0, 3]
                        time.sleep(random.randint(0, 3))
                        if stop == 2500:
                            stop = 1
                            time.sleep(60 * 20)
Example #24
 def __init__(self):
     self.urls = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
Example #25
 def __init__(self):
     # Instantiate the other module classes
     self.mysql_handler = MysqlHandler()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
Example #26
class LinkExtractor(object):
    def __init__(self):
        self.counter = 0
        self.k_count = 0
        self.downloader = HtmlDownloader()

    def get_menu_page_info(self, menu_page_url):
        if menu_page_url is None:
            return None

        html_text = self.downloader.download(menu_page_url)

        if html_text == None:
            return None

        self.counter = (self.counter + 1)%100
        if self.counter == 0:
            self.k_count += 1
            print('Get Manu Pages: %d00'%(self.k_count))

        return self.parse_menu_page_info(html_text)

    def parse_menu_page_info(self, html_text):
        if html_text is None:
            return None

        soup = BeautifulSoup(html_text, 'lxml')

        menu_page_data = []
        for entry in soup.select('.r-ent'):
            data = {
                'title': entry.select('.title')[0].text.strip(),
                'post_url': PTT_HOST_URL + entry.select('.title > a')[0].get('href') if entry.select('.title > a') else None,
                'date': entry.select('.date')[0].text.strip(),
                'author': entry.select('.author')[0].text.strip(),
                'visited': 0
            }
            menu_page_data.append(data)
        return menu_page_data

    # Fetch post links into the post_url_infos table
    def fetch_menu_page_links(self, menu_page_url):
        menu_page_data = self.get_menu_page_info(menu_page_url)
        if menu_page_data != None:
            url_manager.add_new_url_infos(menu_page_data)

    def next_page(self, html_text):
        soup = BeautifulSoup(html_text, 'lxml')
        if soup.find_all('a', class_='btn wide', text='下頁 ›'):
            return PTT_HOST_URL + soup.find_all('a', class_='btn wide', text='下頁 ›')[0].get('href')
        return None

    def run(self, root_menu_page, min_menu_page_index=1, max_menu_page_index=6000, threadNum=5):
        print('===================== start run extractor() ========================')
        try:
            pool = threadpool.ThreadPool(threadNum) 
            
            menu_page_urls = [root_menu_page.format(i) for i in range(min_menu_page_index, max_menu_page_index)]
            requests = threadpool.makeRequests(self.fetch_menu_page_links, menu_page_urls) 
            [pool.putRequest(req) for req in requests] 
            pool.wait()
            print('link extractor done.')
        except:
            print('link_extractor excepttion')
            raise
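
A hedged usage sketch for the extractor above; PTT_HOST_URL and url_manager come from the surrounding project, and the board index template and page range here are assumptions for illustration only.

if __name__ == '__main__':
    extractor = LinkExtractor()
    # Walk menu pages index1.html .. index49.html of one board with 5 worker threads (assumed template).
    extractor.run(PTT_HOST_URL + '/bbs/Gossiping/index{}.html',
                  min_menu_page_index=1, max_menu_page_index=50, threadNum=5)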
Example #27
 def __init__(self):
     self.counter = 0
     self.k_count = 0
     self.downloader = HtmlDownloader()
Example #28
 def __init__(self):
     self.urlManage = UrlManage()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
Example #29
 def __init__(self):
     self.url_manager = UrlManager()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     self.data_storage = DataStorage()
Example #30
 def __init__(self):
     # Instantiate the other module classes
     #self.mysql_handler = MysqlHandler()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     self.path = "/Users/spike/python_项目/get_cd_school/"
Example #31
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Starting URL for the crawl
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # Base URL used to build the follow-up page URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # Province page list
        self.province_url_list = []
        # City page list
        self.city_url_list = []
        # County/district page list
        self.county_url_list = []
        # Town and sub-district page list
        self.town_url_list = []
        self.last_log_path = "d:\\log.txt"

    def craw(self):
        try:
            # Keep track of the URL being downloaded/parsed, to help diagnose errors
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used for joining relative URLs
            self.province_url_list = self.html_parser.province_parser(
                html_content, self.split_url)
            #print(self.province_url_list)
            pro = self.province_url_list
            #print(self.province_url_list[0][0])
            with open(self.last_log_path, "r") as r:
                last_log = r.read()
            #print(last_log)
            if last_log != "":
                last_log_index = pro.index(tuple(last_log.split(';')))
                #print("inde:"+str(last_log_index))
                for i in range(last_log_index):
                    del self.province_url_list[0]

                print("删除已下载元素后还剩余:" + str(len(self.province_url_list)) +
                      "共计:31")
                #print(self.province_url_list)
                #exit()
            #else:
            #  print("下载开始,共计:"+str(len(pro))
            #print(last_log_index)
            #exit()
            for province_name, province_url, province_code in self.province_url_list:
                #print(province_code)
                # Record the last province being downloaded
                last_record = (province_name, province_url, province_code)
                #print(last_record)
                with open(self.last_log_path, "w") as l:
                    #last_name = province_name.encode('utf8')
                    l.write(last_record[0] + ";" + last_record[1] + ";" +
                            last_record[2])
                #exit()
                province_id = self.mysql_handler.insert(
                    province_code + '0000000000', province_name)
                #print(province_id)
                # Keep track of the URL being downloaded/parsed, to help diagnose errors
                downloading_url = province_url
                html_content = self.html_downloader.download(downloading_url)
                self.city_url_list = self.html_parser.city_parser(
                    html_content, self.split_url)
                for city_name, city_url, city_code in self.city_url_list:
                    city_id = self.mysql_handler.insert(city_code, city_name)
                    # Some entries, e.g. municipalities, have no lower-level page
                    if city_url is None:
                        continue
                    # Keep track of the URL being downloaded/parsed, to help diagnose errors
                    downloading_url = city_url
                    html_content = self.html_downloader.download(
                        downloading_url)
                    self.county_url_list = self.html_parser.county_parser(
                        html_content, self.split_url + province_code + "/")
                    for county_name, county_url, county_code in self.county_url_list:
                        county_id = self.mysql_handler.insert(
                            county_code, county_name)
                        if county_url is None:
                            continue
                        # Keep track of the URL being downloaded/parsed, to help diagnose errors
                        downloading_url = county_url
                        html_content = self.html_downloader.download(
                            downloading_url)
                        self.town_url_list = self.html_parser.town_parser(
                            html_content, self.split_url)
                        for town_name, town_url, town_code in self.town_url_list:
                            # Print the scraped town/sub-district name, link (not actually needed) and code
                            print(town_name, town_url, town_code)
                            self.mysql_handler.insert(town_code, town_name)
            self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Craw Field!Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
            time.sleep(60)
            return self.craw()
Example #32
from html_downloader import HtmlDownloader
from html_paraser import HtmlParser
import pymysql
from date_provider import getAllDayPerYear
import time

conn = pymysql.connect(host='192.168.64.135',
                       port=3306,
                       user='******',
                       passwd='123456',
                       db='comp')
cursor = conn.cursor()

if __name__ == '__main__':
    hd = HtmlDownloader()
    hp = HtmlParser()

    province = 'zhejiang'
    for year in range(2019, 1949, -1):
        print(year)
        year_date_list = getAllDayPerYear(year)
        # print(year_date_list)
        for comregdate in year_date_list:
            print(comregdate)
            errcnt = 0
            pagecnt_tmp = 0
            for pagecnt in range(0, 1000):

                url = r'https://gongshang.mingluji.com/' + province + r'/riqi/' + comregdate + r'?page=' + str(
                    pagecnt)
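
The listing stops right after the page URL is built. As a hedged sketch of how a parsed row could be written through the connection above, pymysql's parameterized execute() looks like this; the table and column names are placeholders, not taken from the project.

def save_company(cursor, name, regdate, province):
    # %s placeholders let pymysql escape the values safely.
    sql = "INSERT INTO company (name, regdate, province) VALUES (%s, %s, %s)"
    cursor.execute(sql, (name, regdate, province))

# e.g. for each record parsed from the downloaded page:
# save_company(cursor, company_name, comregdate, province)
# conn.commit()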
Example #33
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        #self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.path = "D:\\python_work\\get_diqu_dm\\"
        # Starting URL for the crawl
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # Base URL used to build the follow-up page URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # Province page list
        self.province_url_list = []
        # City page list
        self.city_url_list = []
        # County/district page list
        self.county_url_list = []
        # Town and sub-district page list
        self.town_url_list = []

    def craw(self):
        try:
            # Keep track of the URL being downloaded/parsed, to help diagnose errors
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used for joining relative URLs
            self.province_url_list = self.html_parser.province_parser(html_content, self.split_url)
            #print(self.province_url_list)
            with open(self.path+"shen_daima.txt", "a") as f:
                for province_name, province_url, province_code in self.province_url_list:
                    province_code = province_code+'0000000000'
                    
                    f.write(province_code+"\t"+province_name+"\n")
                    
                    # First argument: 1 - insert a province row; 2 - a city; 3 - a county/district; 4 - a town/sub-district
                    # Second argument: the province/city/district/street name
                    # Third argument: the parent id (note that a province has no parent id)
                    # Fourth argument: the administrative division code of the city/district/street
                    #province_id = self.mysql_handler.insert(1, province_name, None, None)
                    
                    # Keep track of the URL being downloaded/parsed, to help diagnose errors
                    downloading_url = province_url
                    html_content = self.html_downloader.download(downloading_url)
                    self.city_url_list = self.html_parser.city_parser(html_content, self.split_url)
                    with open(self.path+"other_daima.txt","a") as o:
                        for city_name, city_url, city_code in self.city_url_list:
                            o.write(city_code+"\t"+city_name+"\n")
                            #city_id = self.mysql_handler.insert(2, city_name, province_id, city_code)
                            # Some entries, e.g. municipalities, have no lower-level page
                            if city_url is None:
                                continue
                            # Keep track of the URL being downloaded/parsed, to help diagnose errors
                            
                            downloading_url = city_url
                            html_content = self.html_downloader.download(downloading_url)
                            self.county_url_list = self.html_parser.county_parser(html_content, self.split_url + province_code + "/")
                            for county_name, county_url, county_code in self.county_url_list:
                                o.write(county_code+"\t"+county_name+"\n")
                                #county_id = self.mysql_handler.insert(3, county_name, city_id, county_code)
                                if county_url is None:
                                    continue
                                # Keep track of the URL being downloaded/parsed, to help diagnose errors
                                print('To deal with county')
                                downloading_url = county_url
                                html_content = self.html_downloader.download(downloading_url)
                                self.town_url_list = self.html_parser.town_parser(html_content, self.split_url)
                                for town_name, town_url, town_code in self.town_url_list:
                                    # Write out the scraped town/sub-district name, link (not actually needed) and code
                                    o.write(town_code+"\t"+town_name+"\n")
                                    print(town_name, town_url, town_code)
                                    #self.mysql_handler.insert(4, town_name, county_id, town_code)
            #self.mysql_handler.close()
            f.close()
            o.close()
        except Exception as e:
            print('[ERROR] Craw Field!Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
Example #34
 def __init__(self):
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()