class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0],t,url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl failed')
        self.output.output_end()
        print('Crawl finish')
Example #2
class SpiderMain:

    def __init__(self):
        """
        Initialization: instantiate the other components.
        """
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        The crawler's main entry point.
        :return:
        """
        # listing-page numbers
        title = set()
        for a in range(2, 10):
            html = self.html_downloader.download(
                'http://ggzy.foshan.gov.cn/jyxx/fss/zfcg_1108551/zbxx/index_'+str(a)+'.html?1')
            _title = self.html_parser.titleParer(html)
            for i in _title:
                title.add(i)
        for i in title:
            print(i)
            html = self.html_downloader.download(i)
            _product = self.html_parser.contextParer(html)
            self.data_storage.storage(_product)
Example #3
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutputer()

    def craw(self, root_url, page_amount=5, time_sleep=None):
        count = 1
        # add the first URL to crawl
        self.urls.add_new_url(root_url)
        # while the set still contains URLs, take one and request it; stop when none are left
        while self.urls.has_new_url():
            try:
                # start crawling
                new_url = self.urls.get_new_url()
                print(f'craw{count}:{new_url}')
                # request the URL and get the HTML back
                html_content = self.downloader.download(new_url)
                # parse the HTML with XPath to get the data we need
                new_urls, new_data = self.parser.parse(html_content)
                # add the <a> links found on this entry page to the URL manager for later crawling
                self.urls.add_new_urls(new_urls)
                self.output.collect_data(new_url, new_data)
                count += 1
                if count > page_amount:
                    break

                time.sleep(2)
            except Exception as e:
                print(e)
                print(f'Crawl failed: {new_url}')
        self.output.output_html()
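
The UrlManager used by the examples above and below is not included in these snippets. The following is a minimal sketch of the interface they call (add_new_url, add_new_urls, has_new_url, get_new_url); the set-based implementation is an assumption, not the original class:

class UrlManager(object):
    """Minimal sketch of the URL manager interface assumed by these examples."""

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs that have already been handed out

    def add_new_url(self, url):
        # ignore None and anything we have already seen
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # pop an arbitrary pending URL and remember it as visited
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url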
Example #4
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr,8001),authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        print(self.task.qsize())
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        import time
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('Control node told the worker to stop...')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Worker is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls,data = self.parser.parser(url,content)
                    self.result.put({'new_urls':new_urls, 'data':data})
            except EOFError as e:
                print('Lost connection to the control node')
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
Example #5
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, html_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(html_data)
                print "%d craw success : %s" % (count, new_url)
                if count >= 10:
                    break
                count = count + 1
            except Exception as e:
                print(str(e))
                print("%d craw failed : %s" % (count, new_url))
        self.outputer.output()
Example #6
    def craw(self):
        # downloader
        downloader = HtmlDownloader()

        root_cont = downloader.download(self.url)
        parser = HtmlParser()
        urls, data = parser.parse(self.url, root_cont, True)
        result = ""
        for url in urls:
            cont = downloader.download(url)
            newurls, month = parser.parse(url, cont, False)
            if month is not None:
                result += month.getMonthly()
            month = None
            #print(month.getMonthly())

        f = open("阿里巴巴数据库内核组月报.md", "w+", encoding='utf-8')
        result = "## 阿里巴巴数据库内核月报\n\n" + result
        f.write(result)
        f.close()

        pass
Example #7
class Spider:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)
        for url in urls:
            try:
                # http://service.library.mtime.com/Movie.api
                # ?Ajax_CallBack=true
                # &Ajax_CallBackType=Mtime.Library.Services
                # &Ajax_CallBackMethod=GetMovieOverviewRating
                # &Ajax_CrossDomain=1
                # &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F246526%2F&t=201710117174393728&Ajax_CallBackArgument0=246526
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallbackArgument0=%s' % (url[0].replace('://', '%3A%2F%2F')[:-1], t, url[1])
                rank_content = self.downloader.download(rank_url)
                if rank_content is None:
                    print('None')
                data = self.parser.parse_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                raise e
                # print(e)
                # print('Crawl failed')

        self.output.output_end()
        print('Crawl finish')
Example #8
class SpiderWorker:
    def __init__(self, address='127.0.0.1', port=8001, authkey=b'baike'):
        """初始化分布式进程中工作节点的连接工作"""
        # 注册用于获取Queue的方法名称
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # 连接到服务器
        print('Connect to server %s:%s...' % (address, port))
        self.manager = BaseManager(address=(address, port), authkey=authkey)
        # 开始连接
        self.manager.connect()
        # 获取Queue对象
        self.task_q = self.manager.get_task_queue()
        self.result_q = self.manager.get_result_queue()
        # 初始化下载器和解析器
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):

        while True:
            try:
                if not self.task_q.empty():
                    url = self.task_q.get()

                    if url == 'end':
                        print('Control node told the worker to stop...')
                        # pass the stop signal on so the other nodes stop as well
                        self.result_q.put({'new_urls': 'end', 'data': 'end'})
                        return

                    print('Worker is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parse(url, content)
                    self.result_q.put({'new_urls': new_urls, 'data': data})

                else:
                    print('task queue is empty', self.task_q.empty())
            except EOFError:
                print('Lost connection to the control node')
                return
            except Exception as e:
                print(e)
                print('crawl fail')
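
The worker above connects to a control node that registers the same two queue accessors, but the control node itself is not shown in these examples. Here is a minimal sketch of what such a node might look like; the queue names and the seed URL are illustrative assumptions, and only the registered method names, the address/port, and the authkey have to match the worker:

from multiprocessing.managers import BaseManager
import queue

# the two queues that the registered accessors expose
task_queue = queue.Queue()
result_queue = queue.Queue()

def get_task_queue():
    return task_queue

def get_result_queue():
    return result_queue

class QueueManager(BaseManager):
    pass

QueueManager.register('get_task_queue', callable=get_task_queue)
QueueManager.register('get_result_queue', callable=get_result_queue)

if __name__ == '__main__':
    manager = QueueManager(address=('127.0.0.1', 8001), authkey=b'baike')
    manager.start()

    task = manager.get_task_queue()
    result = manager.get_result_queue()

    # seed one URL, then tell the worker(s) to stop
    task.put('https://baike.baidu.com/item/Python/407313')
    task.put('end')

    print(result.get())   # first item sent back by a worker
    manager.shutdown()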
Example #9
class CodeSpider(object):
    def __init__(self):
        # instantiate the other module classes
        #self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.path = "/Users/spike/python_项目/get_cd_school/"
        # # root URL to start crawling from
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # base used for joining subsequent URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # school info
        # self.school_infos = []

    def craw(self, downloading_url):
        try:
            # keep track of the URL being downloaded/parsed to make errors easier to trace
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # first argument: the HTML to parse
            # second argument: the base URL used for joining
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            #exit()
            if len(self.school_infos) != 20:
                print(downloading_url + " parsed successfully")
                print("Records on this page: " + str(len(self.school_infos)))
            #print(self.province_url_list)
            with open(self.path + "school.txt", "a") as f:
                # print("writing")
                for mc, xd, qy, xz, dh, dz in self.school_infos:
                    f.write(mc + "\t" + xd + "\t" + qy + "\t" + xz + "\t" +
                            dh + "\t" + dz)
            # the with block closes the file, so no explicit f.close() is needed
            return len(self.school_infos)

        except Exception as e:
            print('[ERROR] Crawl failed! Url:', downloading_url, 'Info:', e)
            # use traceback to locate the exception
            traceback.print_exc()
Example #10
class CodeSpider(object):
    def __init__(self):
        # instantiate the other module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # root URL to start crawling from
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # base used for joining subsequent URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # # school info
        # self.school_infos = []
        # adjust the log file path for your environment
        # self.last_log_path = "d:\\log.txt"
        # self.last_log_path = "/Users/spike/spider_log.txt"
    def craw(self, downloading_url):
        try:
            # keep track of the URL being downloaded/parsed to make errors easier to trace
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # first argument: the HTML to parse
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            if len(self.school_infos) != 20:
                print(downloading_url + " parsed successfully")
                print("Records on this page: " + str(len(self.school_infos)))
            for mc, xd, qy, xz, dh, dz in self.school_infos:
                # print(mc+xd+qy+xz+dh+dz)
                province_id = self.mysql_handler.insert(mc, xd, qy, xz, dh, dz)
                # print(province_id)
                # exit()
                # keep track of the URL being downloaded/parsed to make errors easier to trace
            # self.mysql_handler.close()
            return len(self.school_infos)
        except Exception as e:
            print('[ERROR] Crawl failed! Url:', downloading_url, 'Info:', e)
            # use traceback to locate the exception
            traceback.print_exc()
            time.sleep(60)
Example #11
class SpiderMain:
    def __init__(self):
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        爬虫的主启动方法
        :return:
        """
        self.url_manager.add_new_url(
            "http://127.0.0.1:8848/xiaomi-master/index.html")
        # 从url管理器获取url
        url = self.url_manager.get_new_url()
        # 将获取到的url使用下载器进行下载
        html = self.html_downloader.download(url)
        # 将html进行解析
        res = self.html_parser.parser(html)
        # 数据存储
        self.data_storage.storage(res)
Example #12
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()  # get a new URL
                html_cont = self.downloader.download(new_url)  # download the page
                new_urls, new_data = self.parser.parse(new_url,
                                                       html_cont)  # parse the page
                self.urls.add_new_urls(new_urls)  # add the newly found URLs to the URL manager
                self.outputer.collect_data(new_data)  # collect the parsed data
                if count == 200:
                    break
                count = count + 1
            except:
                print("craw failed")
        self.outputer.output_html()
Example #13
class CodeSpider(object):
    def __init__(self):
        # instantiate the other module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # root URL to start crawling from
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # base used for joining subsequent URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # list of province pages
        self.province_url_list = []
        # list of city pages
        self.city_url_list = []
        # list of county/district pages
        self.county_url_list = []
        # list of town/street pages
        self.town_url_list = []
        self.last_log_path = "d:\\log.txt"

    def craw(self):
        try:
            # keep track of the URL being downloaded/parsed to make errors easier to trace
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # first argument: the HTML to parse
            # second argument: the base URL used for joining
            self.province_url_list = self.html_parser.province_parser(
                html_content, self.split_url)
            #print(self.province_url_list)
            pro = self.province_url_list
            #print(self.province_url_list[0][0])
            with open(self.last_log_path, "r") as r:
                last_log = r.read()
            #print(last_log)
            if last_log != "":
                last_log_index = pro.index(tuple(last_log.split(';')))
                #print("inde:"+str(last_log_index))
                for i in range(last_log_index):
                    del self.province_url_list[0]

                print("删除已下载元素后还剩余:" + str(len(self.province_url_list)) +
                      "共计:31")
                #print(self.province_url_list)
                #exit()
            #else:
            #  print("下载开始,共计:"+str(len(pro))
            #print(last_log_index)
            #exit()
            for province_name, province_url, province_code in self.province_url_list:
                #print(province_code)
                # record the last item downloaded
                last_record = (province_name, province_url, province_code)
                #print(last_record)
                with open(self.last_log_path, "w") as l:
                    #last_name = province_name.encode('utf8')
                    l.write(last_record[0] + ";" + last_record[1] + ";" +
                            last_record[2])
                #exit()
                province_id = self.mysql_handler.insert(
                    province_code + '0000000000', province_name)
                #print(province_id)
                # keep track of the URL being downloaded/parsed to make errors easier to trace
                downloading_url = province_url
                html_content = self.html_downloader.download(downloading_url)
                self.city_url_list = self.html_parser.city_parser(
                    html_content, self.split_url)
                for city_name, city_url, city_code in self.city_url_list:
                    city_id = self.mysql_handler.insert(city_code, city_name)
                    # e.g. municipalities have no lower-level page
                    if city_url is None:
                        continue
                    # keep track of the URL being downloaded/parsed to make errors easier to trace
                    downloading_url = city_url
                    html_content = self.html_downloader.download(
                        downloading_url)
                    self.county_url_list = self.html_parser.county_parser(
                        html_content, self.split_url + province_code + "/")
                    for county_name, county_url, county_code in self.county_url_list:
                        county_id = self.mysql_handler.insert(
                            county_code, county_name)
                        if county_url is None:
                            continue
                        # keep track of the URL being downloaded/parsed to make errors easier to trace
                        downloading_url = county_url
                        html_content = self.html_downloader.download(
                            downloading_url)
                        self.town_url_list = self.html_parser.town_parser(
                            html_content, self.split_url)
                        for town_name, town_url, town_code in self.town_url_list:
                            # print the town/street name, link (not actually needed) and code
                            print(town_name, town_url, town_code)
                            self.mysql_handler.insert(town_code, town_name)
            self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Crawl failed! Url:', downloading_url, 'Info:', e)
            # use traceback to locate the exception
            traceback.print_exc()
            time.sleep(60)
            return self.craw()
Example #14
class CodeSpider(object):
    def __init__(self):
        # instantiate the other module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # root URL to start crawling from
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # base used for joining subsequent URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # list of province pages
        self.province_url_list = []
        # list of city pages
        self.city_url_list = []
        # list of county/district pages
        self.county_url_list = []
        # list of town/street pages
        self.town_url_list = []

    def craw(self):
        try:
            # keep track of the URL being downloaded/parsed to make errors easier to trace
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # first argument: the HTML to parse
            # second argument: the base URL used for joining
            self.province_url_list = self.html_parser.province_parser(
                html_content, self.split_url)
            for province_name, province_url, province_code in self.province_url_list:
                # first argument: 1 = province record; 2 = city; 3 = county/district; 4 = town/street
                # second argument: the name of the province/city/district/street
                # third argument: the parent id (note that provinces have no parent id)
                # fourth argument: the administrative-division code
                province_id = self.mysql_handler.insert(
                    1, province_name, None, None)
                if province_id == 0:
                    continue
                sleep(5)
                # keep track of the URL being downloaded/parsed to make errors easier to trace
                downloading_url = province_url
                try:
                    html_content = self.html_downloader.download(
                        downloading_url)
                except Exception as e:
                    sleep(10)
                    print(e, "re-downloading province page")
                    html_content = self.html_downloader.download(
                        downloading_url)
                self.city_url_list = self.html_parser.city_parser(
                    html_content, self.split_url)
                for city_name, city_url, city_code in self.city_url_list:
                    city_id = self.mysql_handler.insert(
                        2, city_name, province_id, city_code)
                    # e.g. municipalities have no lower-level page
                    if city_url is None:
                        continue
                    # keep track of the URL being downloaded/parsed to make errors easier to trace
                    downloading_url = city_url
                    try:
                        html_content = self.html_downloader.download(
                            downloading_url)
                    except Exception as e:
                        sleep(10)
                        print(e, "re-downloading city page")
                        html_content = self.html_downloader.download(
                            downloading_url)

                    self.county_url_list = self.html_parser.county_parser(
                        html_content, self.split_url + province_code + "/")
                    for county_name, county_url, county_code in self.county_url_list:
                        county_id = self.mysql_handler.insert(
                            3, county_name, city_id, county_code)
                        if county_url is None:
                            continue
                        # keep track of the URL being downloaded/parsed to make errors easier to trace
                        downloading_url = county_url
                        try:
                            html_content = self.html_downloader.download(
                                downloading_url)
                        except Exception as e:
                            sleep(10)
                            print(e, "re-downloading town page")
                            html_content = self.html_downloader.download(
                                downloading_url)
                        self.town_url_list = self.html_parser.town_parser(
                            html_content, self.split_url)
                        for town_name, town_url, town_code in self.town_url_list:
                            # print the town/street name, link (not actually needed) and code
                            if town_code == "130408100000":
                                print(town_url)
                            print(town_name, town_url, town_code)
                            self.mysql_handler.insert(4, town_name, county_id,
                                                      town_code)
            self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Crawl failed! Url:', downloading_url, 'Info:', e)
            # use traceback to locate the exception
            traceback.print_exc()
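
The MysqlHandler these CodeSpider examples call is not included in the snippets, and the examples use it with different signatures. Here is a minimal sketch matching the four-argument insert() described in the comments of Example #14, using pymysql; the connection settings, table name, and column names are assumptions:

import pymysql

class MysqlHandler(object):
    """Minimal sketch of the insert()/close() interface Example #14 calls."""

    def __init__(self):
        # connection settings are placeholders; adjust for your environment
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='root', database='region',
                                    charset='utf8mb4')

    def insert(self, level, name, parent_id, code):
        # level: 1 = province, 2 = city, 3 = county/district, 4 = town/street
        # parent_id: id of the parent row (None for provinces)
        # code: administrative-division code
        with self.conn.cursor() as cursor:
            cursor.execute(
                'INSERT INTO region (level, name, parent_id, code) '
                'VALUES (%s, %s, %s, %s)',
                (level, name, parent_id, code))
        self.conn.commit()
        # the returned id is used as parent_id for the next level down
        return cursor.lastrowid

    def close(self):
        self.conn.close()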
Example #15
class LinkExtractor(object):
    def __init__(self):
        self.counter = 0
        self.k_count = 0
        self.downloader = HtmlDownloader()

    def get_menu_page_info(self, menu_page_url):
        if menu_page_url is None:
            return None

        html_text = self.downloader.download(menu_page_url)

        if html_text is None:
            return None

        self.counter = (self.counter + 1)%100
        if self.counter == 0:
            self.k_count += 1
            print('Got menu pages: %d00' % self.k_count)

        return self.parse_menu_page_info(html_text)

    def parse_menu_page_info(self, html_text):
        if html_text is None:
            return None

        soup = BeautifulSoup(html_text, 'lxml')

        menu_page_data = []
        for entry in soup.select('.r-ent'):
            data = {
                'title': entry.select('.title')[0].text.strip(),
                'post_url': PTT_HOST_URL + entry.select('.title > a')[0].get('href') if entry.select('.title > a') else None,
                'date': entry.select('.date')[0].text.strip(),
                'author': entry.select('.author')[0].text.strip(),
                'visited': 0
            }
            menu_page_data.append(data)
        return menu_page_data

    # fetch post_links into the post_url_infos table
    def fetch_menu_page_links(self, menu_page_url):
        menu_page_data = self.get_menu_page_info(menu_page_url)
        if menu_page_data is not None:
            url_manager.add_new_url_infos(menu_page_data)

    def next_page(self, html_text):
        soup = BeautifulSoup(html_text, 'lxml')
        if soup.find_all('a', class_='btn wide', text='下頁 ›'):
            return PTT_HOST_URL + soup.find_all('a', class_='btn wide', text='下頁 ›')[0].get('href')
        return None

    def run(self, root_menu_page, min_menu_page_index=1, max_menu_page_index=6000, threadNum=5):
        print('===================== start run extractor() ========================')
        try:
            pool = threadpool.ThreadPool(threadNum)

            menu_page_urls = [root_menu_page.format(i) for i in range(min_menu_page_index, max_menu_page_index)]
            requests = threadpool.makeRequests(self.fetch_menu_page_links, menu_page_urls)
            for req in requests:
                pool.putRequest(req)
            pool.wait()
            print('link extractor done.')
        except:
            print('link_extractor exception')
            raise
Example #16
        print(year)
        year_date_list = getAllDayPerYear(year)
        # print(year_date_list)
        for comregdate in year_date_list:
            print(comregdate)
            errcnt = 0
            pagecnt_tmp = 0
            for pagecnt in range(0, 1000):

                url = r'https://gongshang.mingluji.com/' + province + r'/riqi/' + comregdate + r'?page=' + str(
                    pagecnt)
                # print(url)
                time.sleep(1)
                pagecnt_tmp = pagecnt
                try:
                    html_content = hd.download(url)
                    hp.cityparase(html_content, cursor, province, comregdate)
                    conn.commit()
                    print(province, comregdate, pagecnt)
                except Exception as e:
                    print(e)
                    with open('download.err', 'a') as f:
                        f.write(url + '\n')
                    if (pagecnt - pagecnt_tmp == 0):
                        errcnt += 1
                    # print(pagecnt)
                    # print(pagecnt_tmp)
                    # print(errcnt)
                    if (errcnt > 10):
                        break
Example #17
class CodeSpider(object):
    def __init__(self):
        # instantiate the other module classes
        #self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.path = "D:\\python_work\\get_diqu_dm\\"
        # root URL to start crawling from
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # base used for joining subsequent URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # list of province pages
        self.province_url_list = []
        # list of city pages
        self.city_url_list = []
        # list of county/district pages
        self.county_url_list = []
        # list of town/street pages
        self.town_url_list = []

    def craw(self):
        try:
            # keep track of the URL being downloaded/parsed to make errors easier to trace
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # first argument: the HTML to parse
            # second argument: the base URL used for joining
            self.province_url_list = self.html_parser.province_parser(html_content, self.split_url)
            #print(self.province_url_list)
            with open(self.path+"shen_daima.txt", "a") as f:
                for province_name, province_url, province_code in self.province_url_list:
                    province_code = province_code+'0000000000'
                    
                    f.write(province_code+"\t"+province_name+"\n")
                    
                    # first argument: 1 = province record; 2 = city; 3 = county/district; 4 = town/street
                    # second argument: the name of the province/city/district/street
                    # third argument: the parent id (note that provinces have no parent id)
                    # fourth argument: the administrative-division code
                    #province_id = self.mysql_handler.insert(1, province_name, None, None)
                    
                    # keep track of the URL being downloaded/parsed to make errors easier to trace
                    downloading_url = province_url
                    html_content = self.html_downloader.download(downloading_url)
                    self.city_url_list = self.html_parser.city_parser(html_content, self.split_url)
                    with open(self.path+"other_daima.txt","a") as o:
                        for city_name, city_url, city_code in self.city_url_list:
                            o.write(city_code+"\t"+city_name+"\n")
                            #city_id = self.mysql_handler.insert(2, city_name, province_id, city_code)
                            # e.g. municipalities have no lower-level page
                            if city_url is None:
                                continue
                            # keep track of the URL being downloaded/parsed to make errors easier to trace
                            
                            downloading_url = city_url
                            html_content = self.html_downloader.download(downloading_url)
                            self.county_url_list = self.html_parser.county_parser(html_content, self.split_url + province_code + "/")
                            for county_name, county_url, county_code in self.county_url_list:
                                o.write(county_code+"\t"+county_name+"\n")
                                #county_id = self.mysql_handler.insert(3, county_name, city_id, county_code)
                                if county_url is None:
                                    continue
                                # keep track of the URL being downloaded/parsed to make errors easier to trace
                                print('Processing county page')
                                downloading_url = county_url
                                html_content = self.html_downloader.download(downloading_url)
                                self.town_url_list = self.html_parser.town_parser(html_content, self.split_url)
                                for town_name, town_url, town_code in self.town_url_list:
                                    # print the town/street name, link (not actually needed) and code
                                    o.write(town_code+"\t"+town_name+"\n")
                                    print(town_name, town_url, town_code)
                                    #self.mysql_handler.insert(4, town_name, county_id, town_code)
            #self.mysql_handler.close()
            # both files are closed by their with blocks, so no explicit close() is needed
        except Exception as e:
            print('[ERROR] Crawl failed! Url:', downloading_url, 'Info:', e)
            # use traceback to locate the exception
            traceback.print_exc()
Example #18
class SpiderMain():
    """爬虫程序主模块"""
    def __init__(self):
        """构造函数,初始化属性"""
        self.urls = UrlManager()
        self.log = MyLog("spider_main", "logs")
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
        #self.util=utill.DBConn()

    def craw(self, root_url):
        """爬虫入口函数"""
        areas = {
            "gulou": 100,
            "jianye": 72,
            "qinhuai": 100,
            "xuanwu": 67,
            "yuhuatai": 32,
            "qixia": 62,
            "baijiahu": 33,
            "chalukou1": 26,
            "jiangningqita11": 3,
            "dongshanzhen": 29,
            "jiangningdaxuecheng": 15,
            "jiulonghu": 12,
            "jiangjundadao11": 22,
            "kexueyuan": 9,
            "qilinzhen": 42,
            "tiexinqiao": 9,
            "pukou": 100,
            "liuhe": 1,
        }

        #areas = {"gulou":1}

        # 1. Collect the links of all second-hand-housing detail pages and add them to the URL manager
        for area, pg_sum in areas.items():
            for num in range(1, pg_sum + 1):
                # 1.1 Build the listing-page URL, e.g. https://nj.lianjia.com/ershoufang/gulou/pg2/
                pg_url = root_url + area + "/pg" + str(num) + "/"
                self.log.logger.info("1.1 Listing-page URL: " + pg_url)
                print("1.1 Listing-page URL: " + pg_url)
                # 1.2 Start the downloader and download the page
                try:
                    html_cont = self.downloader.download(pg_url)
                except Exception as e:
                    self.log.logger.error("1.2 Exception while downloading the page: " + repr(e))
                    time.sleep(60 * 30)
                else:
                    # 1.3 Parse the listing page, get the detail-page links and add them all to the URL manager
                    try:
                        ershoufang_urls = self.parser.get_erhoufang_urls(
                            html_cont)
                    except Exception as e:
                        self.log.logger.error("1.3 Exception while parsing the page: " + repr(e))
                    else:
                        self.urls.add_new_urls(ershoufang_urls)
                        # pause for a random whole number of seconds in [0, 3]
                        time.sleep(random.randint(0, 3))

        time.sleep(60 * 20)
        # 2. Parse the individual detail pages
        id = 1
        stop = 1
        while self.urls.has_new_url():
            # 2.1 Get a URL
            try:
                detail_url = self.urls.get_new_url()
                self.log.logger.info("2.1 Detail-page URL: " + detail_url)
                print("2.1 Detail-page URL: " + detail_url)
            except Exception as e:
                print("2.1 Exception while getting the URL")
                self.log.logger.error("2.1 Exception while getting the URL: " + detail_url)

            # 2.2 Download the page
            try:
                detail_html = self.downloader.download(detail_url)
            except Exception as e:
                self.log.logger.error("2.2 Exception while downloading the page: " + repr(e))
                self.urls.add_new_url(detail_url)
                time.sleep(60 * 30)
            else:
                # 2.3 Parse the page
                try:
                    ershoufang_data = self.parser.get_ershoufang_data(
                        detail_html, id)
                except Exception as e:
                    self.log.logger.error("2.3 解析页面出现异常:" + repr(e))
                else:
                    #2.4 输出数据
                    try:
                        self.outputer.collect_data(ershoufang_data)
                    except Exception as e:
                        self.log.logger.error("2.4 输出数据出现异常:" + repr(e))
                    else:
                        print(id)
                        id = id + 1
                        stop = stop + 1
                        # pause for a random whole number of seconds in [0, 3]
                        time.sleep(random.randint(0, 3))
                        if stop == 2500:
                            stop = 1
                            time.sleep(60 * 20)
# -*- coding: utf-8 -*-
# @Author: cyb

from html_downloader import HtmlDownloader

downloader = HtmlDownloader()
html_content = downloader.download(
    url='https://baike.baidu.com/item/Python/407313')
print(html_content)

# Test: the html_content returned for this URL is a purely static page and, on inspection, already contains all the data we want.
# # resp.text's automatic decoding occasionally produces mojibake, so resp.content.decode() is used instead.
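
The HtmlDownloader exercised by this test is also not part of the snippets. Below is a minimal sketch of a download() along the lines the comment describes, using requests and decoding the raw bytes instead of relying on resp.text; the User-Agent string, timeout, and error handling are assumptions:

import requests

class HtmlDownloader(object):
    """Minimal sketch of the downloader interface used throughout these examples."""

    def download(self, url):
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0'}
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code != 200:
            return None
        # decode the raw bytes explicitly; resp.text sometimes guesses the
        # wrong encoding and produces mojibake
        return resp.content.decode('utf-8', errors='ignore')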