Code Example #1
import os
import time
import traceback
# MysqlHandler, HtmlDownloader, and HtmlParser come from the project's
# other modules (not shown in this example).
class CodeSpider(object):
    def __init__(self):
        # Instantiate the helper module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Starting URL for the crawl
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # Base URL for building subsequent page URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # Province page list
        self.province_url_list = []
        # City page list
        self.city_url_list = []
        # County/district page list
        self.county_url_list = []
        # Town/street page list
        self.town_url_list = []
        # Resume log: records the last province being processed
        self.last_log_path = "d:\\log.txt"

    def craw(self):
        try:
            # Track the URL currently being downloaded/parsed, to aid error analysis
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used for joining relative links
            self.province_url_list = self.html_parser.province_parser(
                html_content, self.split_url)
            # Resume support: read the last province recorded in the log, if any
            last_log = ""
            if os.path.exists(self.last_log_path):
                with open(self.last_log_path, "r") as r:
                    last_log = r.read()
            if last_log != "":
                last_log_index = self.province_url_list.index(
                    tuple(last_log.split(';')))
                # Drop the provinces that have already been crawled
                del self.province_url_list[:last_log_index]
                print("Remaining after removing downloaded entries: " +
                      str(len(self.province_url_list)) + " (out of 31)")
            for province_name, province_url, province_code in self.province_url_list:
                # Record the province being downloaded so a restart can resume from it
                last_record = (province_name, province_url, province_code)
                with open(self.last_log_path, "w") as l:
                    l.write(last_record[0] + ";" + last_record[1] + ";" +
                            last_record[2])
                province_id = self.mysql_handler.insert(
                    province_code + '0000000000', province_name)
                # Track the URL currently being downloaded/parsed, to aid error analysis
                downloading_url = province_url
                html_content = self.html_downloader.download(downloading_url)
                self.city_url_list = self.html_parser.city_parser(
                    html_content, self.split_url)
                for city_name, city_url, city_code in self.city_url_list:
                    city_id = self.mysql_handler.insert(city_code, city_name)
                    # Some entries, e.g. municipalities, have no lower-level page
                    if city_url is None:
                        continue
                    # Track the URL currently being downloaded/parsed, to aid error analysis
                    downloading_url = city_url
                    html_content = self.html_downloader.download(
                        downloading_url)
                    self.county_url_list = self.html_parser.county_parser(
                        html_content, self.split_url + province_code + "/")
                    for county_name, county_url, county_code in self.county_url_list:
                        county_id = self.mysql_handler.insert(
                            county_code, county_name)
                        if county_url is None:
                            continue
                        # Track the URL currently being downloaded/parsed, to aid error analysis
                        downloading_url = county_url
                        html_content = self.html_downloader.download(
                            downloading_url)
                        self.town_url_list = self.html_parser.town_parser(
                            html_content, self.split_url)
                        for town_name, town_url, town_code in self.town_url_list:
                            # Print the scraped town/street name, link (not actually needed), and code
                            print(town_name, town_url, town_code)
                            self.mysql_handler.insert(town_code, town_name)
            self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Crawl failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
            # Wait a minute, then retry; the resume log skips finished provinces
            time.sleep(60)
            return self.craw()
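
The HtmlDownloader class these examples call is never shown. A minimal sketch, assuming the requests library and the GBK/GB2312 encoding the stats.gov.cn pages are typically served in, might look like this:

import requests

class HtmlDownloader(object):
    def download(self, url):
        # These pages are usually GBK/GB2312 encoded, so set the encoding
        # explicitly rather than trusting the library's guess (an assumption,
        # since the real class is not in the source).
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        response.encoding = 'gbk'
        return response.text

Raising on HTTP errors makes a failed download surface as an exception, which is what the retry logic in Example #1 (sleep, then re-crawl) and Example #3 (inner try/except blocks) relies on.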
Code Example #2
import traceback
# HtmlDownloader and HtmlParser come from the project's other modules
# (not shown in this example).
File: spider_main.py  Project: vebys/address_spider
class CodeSpider(object):
    def __init__(self):
        # Instantiate the helper module classes
        # This variant writes to text files instead of MySQL
        #self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Output directory for the code files
        self.path = "D:\\python_work\\get_diqu_dm\\"
        # Starting URL for the crawl
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # Base URL for building subsequent page URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # Province page list
        self.province_url_list = []
        # City page list
        self.city_url_list = []
        # County/district page list
        self.county_url_list = []
        # Town/street page list
        self.town_url_list = []

    def craw(self):
        try:
            # Track the URL currently being downloaded/parsed, to aid error analysis
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used for joining relative links
            self.province_url_list = self.html_parser.province_parser(html_content, self.split_url)
            # Province codes go to shen_daima.txt; lower levels go to other_daima.txt
            with open(self.path+"shen_daima.txt", "a") as f:
                for province_name, province_url, province_code in self.province_url_list:
                    province_code = province_code+'0000000000'
                    
                    f.write(province_code+"\t"+province_name+"\n")
                    
                    # First argument: 1 = province row; 2 = city; 3 = county; 4 = town/street
                    # Second argument: the region name
                    # Third argument: the parent id (note: provinces have no parent id)
                    # Fourth argument: the administrative division code
                    #province_id = self.mysql_handler.insert(1, province_name, None, None)

                    # Track the URL currently being downloaded/parsed, to aid error analysis
                    downloading_url = province_url
                    html_content = self.html_downloader.download(downloading_url)
                    self.city_url_list = self.html_parser.city_parser(html_content, self.split_url)
                    with open(self.path+"other_daima.txt","a") as o:
                        for city_name, city_url, city_code in self.city_url_list:
                            o.write(city_code+"\t"+city_name+"\n")
                            #city_id = self.mysql_handler.insert(2, city_name, province_id, city_code)
                            # Some entries, e.g. municipalities, have no lower-level page
                            if city_url is None:
                                continue
                            # Track the URL currently being downloaded/parsed, to aid error analysis
                            downloading_url = city_url
                            html_content = self.html_downloader.download(downloading_url)
                            self.county_url_list = self.html_parser.county_parser(html_content, self.split_url + province_code + "/")
                            for county_name, county_url, county_code in self.county_url_list:
                                o.write(county_code+"\t"+county_name+"\n")
                                #county_id = self.mysql_handler.insert(3, county_name, city_id, county_code)
                                if county_url is None:
                                    continue
                                # Track the URL currently being downloaded/parsed, to aid error analysis
                                print('Processing county:', county_name)
                                downloading_url = county_url
                                html_content = self.html_downloader.download(downloading_url)
                                self.town_url_list = self.html_parser.town_parser(html_content, self.split_url)
                                for town_name, town_url, town_code in self.town_url_list:
                                    # Print the scraped town/street name, link (not actually needed), and code
                                    o.write(town_code+"\t"+town_name+"\n")
                                    print(town_name, town_url, town_code)
                                    #self.mysql_handler.insert(4, town_name, county_id, town_code)
            #self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Crawl failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
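
The HtmlParser is also external to these examples. Below is a sketch of what province_parser might look like, assuming BeautifulSoup and the tr.provincetr row class used by the 2017 code tables; the (name, url, code) tuple shape is taken from the loops above:

from bs4 import BeautifulSoup

class HtmlParser(object):
    def province_parser(self, html_content, split_url):
        # Each province cell holds a link such as <a href="11.html">北京市</a>;
        # the two-digit file name doubles as the province code.
        soup = BeautifulSoup(html_content, 'html.parser')
        results = []
        for link in soup.select('tr.provincetr a'):
            href = link.get('href')            # e.g. '11.html'
            code = href.split('.')[0]          # e.g. '11'
            results.append((link.get_text(), split_url + href, code))
        return results

city_parser, county_parser, and town_parser would presumably follow the same pattern against the citytr, countytr, and towntr rows, returning None as the URL when a row carries no link, which is the case the `if ... is None: continue` checks guard against.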
Code Example #3
import traceback
from time import sleep
# MysqlHandler, HtmlDownloader, and HtmlParser come from the project's
# other modules (not shown in this example).
class CodeSpider(object):
    def __init__(self):
        # Instantiate the helper module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Starting URL for the crawl
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # Base URL for building subsequent page URLs
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # Province page list
        self.province_url_list = []
        # City page list
        self.city_url_list = []
        # County/district page list
        self.county_url_list = []
        # Town/street page list
        self.town_url_list = []

    def craw(self):
        try:
            # Track the URL currently being downloaded/parsed, to aid error analysis
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used for joining relative links
            self.province_url_list = self.html_parser.province_parser(
                html_content, self.split_url)
            for province_name, province_url, province_code in self.province_url_list:
                # First argument: 1 = province row; 2 = city; 3 = county; 4 = town/street
                # Second argument: the region name
                # Third argument: the parent id (note: provinces have no parent id)
                # Fourth argument: the administrative division code
                province_id = self.mysql_handler.insert(
                    1, province_name, None, None)
                if province_id == 0:
                    continue
                sleep(5)
                # Track the URL currently being downloaded/parsed, to aid error analysis
                downloading_url = province_url
                try:
                    html_content = self.html_downloader.download(
                        downloading_url)
                except Exception as e:
                    sleep(10)
                    print(e, "retrying province page download")
                    html_content = self.html_downloader.download(
                        downloading_url)
                self.city_url_list = self.html_parser.city_parser(
                    html_content, self.split_url)
                for city_name, city_url, city_code in self.city_url_list:
                    city_id = self.mysql_handler.insert(
                        2, city_name, province_id, city_code)
                    # Some entries, e.g. municipalities, have no lower-level page
                    if city_url is None:
                        continue
                    # Track the URL currently being downloaded/parsed, to aid error analysis
                    downloading_url = city_url
                    try:
                        html_content = self.html_downloader.download(
                            downloading_url)
                    except Exception as e:
                        sleep(10)
                        print(e, "retrying city page download")
                        html_content = self.html_downloader.download(
                            downloading_url)

                    self.county_url_list = self.html_parser.county_parser(
                        html_content, self.split_url + province_code + "/")
                    for county_name, county_url, county_code in self.county_url_list:
                        county_id = self.mysql_handler.insert(
                            3, county_name, city_id, county_code)
                        if county_url is None:
                            continue
                        # Track the URL currently being downloaded/parsed, to aid error analysis
                        downloading_url = county_url
                        try:
                            html_content = self.html_downloader.download(
                                downloading_url)
                        except Exception as e:
                            sleep(10)
                            print(e, "retrying town page download")
                            html_content = self.html_downloader.download(
                                downloading_url)
                        self.town_url_list = self.html_parser.town_parser(
                            html_content, self.split_url)
                        for town_name, town_url, town_code in self.town_url_list:
                            # Print the scraped town/street name, link (not actually needed), and code
                            if town_code == "130408100000":
                                print(town_url)
                            print(town_name, town_url, town_code)
                            self.mysql_handler.insert(4, town_name, county_id,
                                                      town_code)
            self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Crawl failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
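
Finally, the MysqlHandler is never shown either. Here is a minimal sketch matching Example #3's four-argument insert(level, name, parent_id, code), assuming pymysql and a placeholder region table; the real schema, and the condition under which insert returns 0 (which Example #3 checks for provinces), are not in the source:

import pymysql

class MysqlHandler(object):
    def __init__(self):
        # Connection settings and table layout are placeholders, not the real schema.
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='', database='region_db',
                                    charset='utf8mb4')

    def insert(self, level, name, parent_id, code):
        # Return the new row's auto-increment id so child rows can reference it.
        with self.conn.cursor() as cursor:
            cursor.execute(
                'INSERT INTO region (level, name, parent_id, code) '
                'VALUES (%s, %s, %s, %s)',
                (level, name, parent_id, code))
            new_id = cursor.lastrowid
        self.conn.commit()
        return new_id

    def close(self):
        self.conn.close()

A typical entry point, not shown in any of the three examples, would simply be:

if __name__ == '__main__':
    CodeSpider().craw()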