def tingshou_parse(self, response):
    # Extract data from each row
    result = response.css('.dis_proboxul a').extract()
    for part in result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '农银人寿'
        name = shan.str_extract('>·(.*?)</a>', part)
        if "条款" in name:
            item['product_name'] = shan.str_extract('(.*?)条款', name)
        elif "产品说明书" in name:
            item['product_name'] = shan.str_extract('(.*?)产品', name)
        else:
            item['product_name'] = name
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = "http://www.abchinalife.cn" + shan.str_extract('href="(.*)" target', part)
        # Yield the item
        yield item
    # Follow each value in the page's <option> list to the remaining pages
    a = response.css('option').extract()
    b = shan.str_extract('value="(.*?)">', a)
    b = b[1:len(b)]
    for part in b:
        yield response.follow("http://www.abchinalife.cn" + part,
                              callback=self.tingshou_parse)
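# All of the callbacks in this file assume module-level imports of `re`, `scrapy`,
# the project's `ProjectInsuranceScrapItem`, and a small `shan` string-helper
# module. The real `shan` helpers are not reproduced here; the sketch below is
# an assumption inferred from how they are called (stringr-style extract / keep /
# drop / detect / which), not the actual implementation, and the `_sketch_`
# names are hypothetical.
import re

def _sketch_str_extract(pattern, text):
    # First capture group for a single string; one result per matching element for a list.
    if isinstance(text, str):
        match = re.search(pattern, text)
        return match.group(1) if match else ""
    return [m.group(1) for m in (re.search(pattern, s) for s in text) if m]

def _sketch_str_keep(pattern, text):
    # Keep only the elements that contain `pattern`; a single string is returned
    # unchanged when it matches, otherwise an empty string.
    if isinstance(text, str):
        return text if re.search(pattern, text) else ""
    return [s for s in text if re.search(pattern, s)]

def _sketch_str_drop(pattern, text):
    # Drop the elements that contain `pattern`.
    return [s for s in text if not re.search(pattern, s)]

def _sketch_str_detect(pattern, text):
    # One boolean per element: does it contain `pattern`?
    return [bool(re.search(pattern, s)) for s in text]

def _sketch_which(flags):
    # Indices of the True entries (R-style `which`, 0-based).
    return [i for i, flag in enumerate(flags) if flag]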
def parse(self, response):
    # Extract data from each row
    result = response.css('#a2 font , .aproName').extract()
    # Split the rows into on-sale and discontinued blocks around the "人寿保险" marker
    zs_result = result[0:shan.which(shan.str_detect("人寿保险", result))[0]]
    ts_result = result[shan.which(shan.str_detect("人寿保险", result))[0]:(
        shan.which(shan.str_detect("zip", result))[0] - 1)]
    zs_result = shan.str_keep('国泰', zs_result)
    ts_result = shan.str_keep('国泰', ts_result)
    for part in zs_result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '陆家嘴国泰'
        item['product_name'] = shan.str_extract('>(.*?)</a>', part)
        item['product_sale_status'] = "在售"
        item['product_contract_link'] = "http://www.cathaylife.cn" + shan.str_extract('href="(.*)"', part)
        # Yield the item
        yield item
    for part in ts_result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '陆家嘴国泰'
        item['product_name'] = shan.str_extract('>(.*?)</a>', part)
        item['product_sale_status'] = "停售"
        item['product_contract_link'] = "http://www.cathaylife.cn" + shan.str_extract('href="(.*)"', part)
        # Yield the item
        yield item
def tingshou_parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = shan.str_drop('停售', result)
    result = shan.str_keep('险', result)
    for part in result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '长生人寿'
        name = shan.str_extract('">(.*?)</td>', part)
        if "<" in name:
            item['product_name'] = shan.str_extract('>(.*?)<', name)
        else:
            item['product_name'] = name
        link = shan.str_extract('href="(.*)pdf"', part)
        if "http" in link:
            item['product_contract_link'] = link + "pdf"
        else:
            item['product_contract_link'] = "http://www.gwcslife.com" + shan.str_extract('href="(.*)pdf"', part) + "pdf"
        item['product_sale_status'] = '停售'
        # Yield the item
        yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('.news_list a').extract()
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['product_id'] = shan.str_extract('title="(.*?)平安', part)
        item['company_name'] = '平安养老'
        name = shan.str_extract('平安(.*?)</a>', part)
        if "条款" in name:
            item['product_name'] = "平安" + shan.str_extract('(.*?)条款', name)
        else:
            item['product_name'] = "平安" + name
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = shan.str_extract('href="(.*?)" title', part)
        # Yield the item
        yield item
    # Find the next pages
    page = response.css('li.page').extract()
    next_pages = shan.str_extract(
        r"/px/informationDisclosure/insuranceProductList_\d+[.]shtml", page)
    for next_page in next_pages:
        yield response.follow("http://yl.pingan.com" + next_page,
                              callback=self.parse)
def parse(self, response): # 从每一行抽取数据 result = response.css(".ts_product") zs_result = result[0].css("tr").getall() zs_result = shan.str_keep("条款PDF文档", zs_result) for part in zs_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '太平人寿' item['product_name'] = shan.str_extract('<td>(.*)</td>', part) item['product_sale_status'] = '在售' item['product_contract_link'] = shan.str_extract( 'href="(.*)?">', part) # 输出数据 yield item ts_result = result[1].css("tr").getall() ts_result = shan.str_keep("条款PDF文档", ts_result) for part in ts_result: # 停售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '太平人寿' item['product_name'] = shan.str_extract('<td>(.*)</td>', part) item['product_sale_status'] = '停售' item['product_contract_link'] = shan.str_extract( 'href="(.*)?">', part) # 输出数据 yield item
def parse(self, response): # 从每一行抽取数据 result = response.css("tr").extract() zs_result = shan.str_keep('class="STYLE14"', result) ts_result = shan.str_keep('class="STYLE15"', result) for part in zs_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() part = re.findall('<td>(.*?)</td>', part) item['company_name'] = '富德生命' item['product_name'] = part[1] item['product_sale_status'] = '在售' item[ 'product_contract_link'] = "https://www.sino-life.com" + shan.str_keep( 'upload', shan.str_extract('href="(.*)pdf', part[4])) + "pdf" item['product_start_date'] = part[2] # 输出数据 yield item for part in ts_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() part = re.findall('<td>(.*?)</td>', part) item['company_name'] = '富德生命' item['product_name'] = part[1] item['product_sale_status'] = '停售' item[ 'product_contract_link'] = "https://www.sino-life.com" + shan.str_keep( 'upload', shan.str_extract('href="(.*)pdf', part[4])) + "pdf" item['product_start_date'] = part[2] # 输出数据 yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('.bxContent').extract()
    result = re.split('查看', result[0])
    result = shan.str_keep('国华', result)
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '国华人寿'
        item['product_name'] = "国华" + shan.str_extract('国华(.*?)</p>', part)
        item['product_sale_status'] = ''
        link = shan.str_extract('href="(.*?)" target', part)
        if "content" not in link:
            # Indirect link: follow the intro page and extract the contract there
            contract_link = "http://www.95549.cn/pages/intro/" + link
            yield response.follow(contract_link,
                                  callback=self.contract_parse,
                                  meta={'item': item})
        else:
            # Direct link: rebuild the absolute URL from the relative "../" path
            link1 = link + "z"
            item['product_contract_link'] = "http://www.95549.cn/pages/" + shan.str_extract('../(.*?)z', link1)
            # Yield the item
            yield item
def tingshou_parse(self, response): # 从每一行抽取数据 result = response.css(".pi-pubinfo") result1 = result.css("li").extract() for part in result1: # 停售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '民生人寿' a = shan.str_keep('class="pi-pls-prodname off-float-left"', part) item['product_type'] = '' item['product_id'] = '' item['product_name'] = shan.str_extract('>(.*?)<', a) item['product_sale_status'] = '停售' b = shan.str_keep('class="dsm-choise-zoon dsm-none"', part) item[ 'product_contract_link'] = "http://www.minshenglife.com" + shan.str_extract( 'href="(.*?)">', b) item['product_price_link'] = '' item['product_start_date'] = '' item['product_end_date'] = '' # 输出数据 yield item # 找到下一页的代码 next_pages = re.findall("index_\d+[.]shtml", response.text) for next_page in next_pages: yield response.follow(next_page, callback=self.tingshou_parse)
def parse(self, response):
    # Extract data from each row
    result = response.css('.list-content a').extract()
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '中法人寿'
        name = shan.str_extract('>·(.*?)<span>', part)
        if "目录" in name:
            item['product_name'] = name
            item['product_sale_status'] = '停售'
        elif "产品说明书" in name:
            item['product_name'] = shan.str_extract('(.*?)产品说明书', name)
            item['product_sale_status'] = '在售'
        else:
            item['product_name'] = name
            item['product_sale_status'] = '在售'
        item['product_contract_link'] = "http://www.sfli.com.cn" + shan.str_extract('href="(.*?)" target', part)
        # Yield the item
        yield item
    # Follow each value in the page's <option> list to the remaining pages
    a = response.css('option').extract()
    b = shan.str_extract('value="(.*?)">', a)
    b = b[1:len(b)]
    for part in b:
        yield response.follow("http://www.sfli.com.cn" + part,
                              callback=self.parse)
def tingshou_parse(self, response):
    # Extract data from each row
    result = response.css('p').extract()
    result = shan.str_keep('信美', result)
    for part in result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '信美相互'
        item['product_name'] = shan.str_extract(
            '\r\n\t\t\t\t\t\t\t\t\t\t\t(.*?)\r\n\t\t\t\t\t\t\t\t\t\t\t</span>', part)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = shan.str_extract('href="(.*)" target', part)
        # Yield the item
        yield item
    # Follow the pagination buttons to the remaining discontinued-product pages
    a = response.css('button').extract()
    b = shan.str_extract('value="(.*?)" onclick', a)
    b = b[2:(len(b) - 1)]
    for part in b:
        yield response.follow(
            "https://www.trustlife.com/cms/html/productClauseStop/index_" + part + ".html",
            callback=self.tingshou_parse)
def contract_parse(self, response): result = response.css(".articleShowText a").extract() item = response.meta['item'] a = shan.str_keep('产品',result) if 'pdf' in a: item['product_official_report_list'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',a) else: item['product_official_report_list'] = '' b = shan.str_keep('费率',result) if 'pdf' in b: item['product_price_link'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',b) else: item['product_price_link'] ='' c = shan.str_keep('条款',result) if 'pdf' in c: item['product_contract_link'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',c) else: item['product_contract_link'] = '' d = shan.str_keep('价值表',result) if 'pdf' in d: item['product_pv_full_list_link'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',d) else: item['product_pv_full_list_link'] = '' f = shan.str_keep('总精算师',result) if 'pdf' in f: item['product_chief_actuary_claim_link'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',f) else: item['product_chief_actuary_claim_link'] ='' g = shan.str_keep('法律责任人',result) if 'pdf' in g: item['prodcct_law_response_link'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',g) else: item['prodcct_law_response_link'] = '' yield item
def parse(self, response): # 从每一行抽取数据 result = response.css('tr').extract() result = result[1:len(result)] a = shan.str_detect("健康保险", result) zs_result = result[0:shan.which(a)[len(shan.which(a)) - 1]] ts_result = result[shan.which(a)[len(shan.which(a)) - 1]:(len(result))] zs_result = shan.str_keep('太平', zs_result) ts_result = shan.str_keep('太平', ts_result) for part in zs_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '太平养老' item['product_name'] = shan.str_extract('<td>(.*?)</td>', part) item['product_sale_status'] = "在售" item['product_contract_link'] = shan.str_extract( 'href="(.*)"', part) # 输出数据 yield item for part in ts_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '太平养老' item['product_name'] = shan.str_extract('<td>(.*?)</td>', part) item['product_sale_status'] = "停售" item['product_contract_link'] = shan.str_extract( 'href="(.*)"', part) # 输出数据 yield item
def contract_parse(self, response): result = response.css("tr") result = result[1:len(result)].extract() item = response.meta['item'] a = shan.str_keep('材料清单', result) if 'pdf' in a: item[ 'product_official_report_list'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', a) else: item['product_official_report_list'] = '' b = shan.str_keep('费率', result) if 'pdf' in b: item[ 'product_price_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', b) else: item['product_price_link'] = '' c = shan.str_keep('条款', result) if 'pdf' in c: item[ 'product_contract_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', c) else: item['product_contract_link'] = '' d = shan.str_keep('价值表(全表)', result) if 'xlsx' in d: item[ 'product_pv_full_list_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', d) else: item['product_pv_full_list_link'] = '' e = shan.str_keep('价值表(示例)', result) if 'pdf' in e: item[ 'product_pv_example_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', e) else: item['product_pv_example_link'] = '' f = shan.str_keep('总精算师', result) if 'pdf' in f: item[ 'product_chief_actuary_claim_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', f) else: item['product_chief_actuary_claim_link'] = '' g = shan.str_keep('法律责任人', result) if 'pdf' in g: item[ 'prodcct_law_response_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', g) else: item['prodcct_law_response_link'] = '' sale_status_url = [ 'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/jydbxcpmljtk/index.html', ] yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('.product_right_content').extract()
    result = result[2]
    result = re.split('<tr>', result)
    # Rows between the "在售" and "停售" headers are on sale; the rest are discontinued
    zs_result = result[shan.which(shan.str_detect("在售", result))[0]:(
        shan.which(shan.str_detect("停售", result))[0] + 1)]
    ts_result = result[(shan.which(shan.str_detect("停售", result))[0] + 1):len(result)]
    zs_result = shan.str_keep('德华', zs_result)
    ts_result = shan.str_keep('德华', ts_result)
    for part in zs_result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '德华安顾'
        item['product_name'] = shan.str_extract('<td>(.*?)</td>', part)
        item['product_sale_status'] = "在售"
        item['product_contract_link'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)pdf", part) + "pdf"
        if "rar" in part:
            item['product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)rar", part) + "rar"
        else:
            item['product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)zip", part) + "zip"
        # Yield the item
        yield item
    for part in ts_result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '德华安顾'
        item['product_name'] = shan.str_extract('<td>(.*?)</td>', part)
        item['product_sale_status'] = "停售"
        item['product_contract_link'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)pdf", part) + "pdf"
        if "rar" in part:
            item['product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)rar", part) + "rar"
        else:
            item['product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)zip", part) + "zip"
        # Yield the item
        yield item
def weishou_parse(self, response):
    # Extract data from each row
    result = response.css('.grey2').extract()
    result = shan.str_keep('险', result)
    for part in result:
        # Fill in the not-yet-on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '中邮人寿'
        item['product_name'] = shan.str_extract('\r\n\t\t\t\t(.*?)\r\n\t\t\t', part)
        item['product_sale_status'] = '未售'
        # Follow the detail page to collect the document links
        contract_link = "http://www.chinapost-life.com" + shan.str_extract('href="(.*)" class', part)
        yield response.follow(contract_link,
                              callback=self.contract_parse,
                              meta={'item': item})
def contract_parse(self, response): result = response.css("tr") result = result[1:len(result)].extract() for part in result: item = response.meta['item'] part = re.findall('<td>(.*)</td>', part) item['product_name'] = shan.str_extract('>(.*?)</a>', part[1]) item['product_special_status'] = part[2] item['product_contract_link'] = shan.str_extract( 'href="(.*?)"', part[1]) # 输出数据 yield item
def zaishou_parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = result[1:len(result)]
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '前海人寿'
        item['product_name'] = shan.str_extract('\t\t\t\t\t(.*?)\t\t\t\t</td>', part)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = "https://www.foresealife.com" + shan.str_extract(
            'href="(.*)">点击查看</a> \t\t\t\t</td>\t\t\t\t<td style="text-align:center;">', part)
        # Yield the item
        yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = result[1:len(result)]
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '北京人寿'
        item['product_name'] = "北京" + shan.str_extract('北京(.*?)</', part)
        item['product_sale_status'] = ''
        item['product_contract_link'] = shan.str_extract('href="(.*?)" target', part)
        # Yield the item
        yield item
def parse(self, response): # 从每一行抽取数据 result = response.css("p , #content a , .h2_title1").extract() result = result[2:len(result)] zs_result = result[shan.which(shan.str_detect("在售", result)[0]):shan. which(shan.str_detect("停售", result))[0]] ts_result = result[shan.which(shan.str_detect("停售", result))[0]:shan. which(shan.str_detect("在售", result))[1]] zs_result = shan.str_keep('style="color:#626263;"', zs_result) ts_result = shan.str_keep('style="color:#626263;"', ts_result) for part in zs_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '工银安盛' item['product_type'] = '' item['product_name'] = shan.str_extract(">(.*?)</a>", part) item['product_sale_status'] = '在售' item[ 'product_contract_link'] = "www.icbc-axa.com" + shan.str_extract( 'href="(.*?)pdf', part) + "pdf" item['product_price_link'] = '' item['product_start_date'] = '' item['product_end_date'] = '' # 输出数据 yield item for part in ts_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '工银安盛' item['product_type'] = '' item['product_name'] = shan.str_extract(">(.*?)</a>", part) item['product_sale_status'] = '停售' item[ 'product_contract_link'] = "www.icbc-axa.com" + shan.str_extract( 'href="(.*?)pdf', part) + "pdf" item['product_price_link'] = '' item['product_start_date'] = '' item['product_end_date'] = '' # 输出数据 yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('li').extract()
    result = shan.str_keep('条款', result)
    result = result[1:len(result)]
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '国联人寿'
        item['product_name'] = shan.str_extract('target="_blank">(.*?)</a>', part)
        item['product_sale_status'] = ''
        item['product_contract_link'] = shan.str_extract('href="(.*?)" target=', part)
        # Yield the item
        yield item
def zaishou_parse(self, response):
    # Extract data from each row
    result = response.css('.width-100-authored a').extract()
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '大都会人寿'
        item['product_name'] = shan.str_extract('公司(.*?)</a>', part)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = "https://www.metlife.com.cn" + shan.str_extract('href="(.*)zip', part) + "zip"
        # Yield the item
        yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = result[1:len(result)]
    for part in result:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '渤海人寿'
        item['product_name'] = shan.str_extract("渤海(.*?)</p>", part)
        item['product_sale_status'] = shan.str_extract(r">(\S+)售", part) + "售"
        item['product_contract_link'] = "http://www.bohailife.net" + shan.str_extract('href="(.*?)pdf', part) + "pdf"
        # Yield the item
        yield item
def tingshou_parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = result[1:len(result)]
    for part in result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '合众人寿'
        item['product_name'] = "合众" + shan.str_extract('合众(.*?)<', part)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = "http://www.unionlife.com.cn" + shan.str_extract('href="(.*)" style', part)
        # Yield the item
        yield item
def tingshou_parse(self, response):
    # Extract data from each row
    result = response.css('#ess_contentpane a').extract()
    result = result[1:len(result)]
    for part in result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '人保健康'
        item['product_name'] = shan.str_extract('>(.*?)</a>', part)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = "http://www.picchealth.com" + shan.str_extract('href="(.*)" id', part)
        # Yield the item
        yield item
def zaishou_parse(self, response):
    # Extract data from each row
    result = response.css('li').extract()
    result = shan.str_keep('条款', result)
    result = result[1:len(result)]
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '泰康养老'
        item['product_name'] = shan.str_extract('《(.*?)》', part)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = "http://tkyl.pension.taikang.com" + shan.str_extract('href="(.*)" target', part)
        # Yield the item
        yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = shan.str_keep('爱心', result)
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '爱心人寿'
        item['product_name'] = shan.str_extract('>(.*?)</a>', part)
        item['product_sale_status'] = shan.str_extract('\n(.*?)售', part) + "售"
        item['product_contract_link'] = "http://www.aixin-ins.com" + shan.str_extract('href="(.*?)pdf', part) + "pdf"
        # Yield the item
        yield item
def second_parse(self, response):
    # Extract data from each row
    result = response.css('.news_list a').extract()
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '光大永明'
        item['product_name'] = shan.str_extract('title="(.*?)"', part)
        # NOTE: this stores a Request object in the field; the request is never
        # yielded, so zts_parse is not actually scheduled from here
        item['product_sale_status'] = scrapy.Request(
            'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/jydbxcpmljtk/index.html',
            callback=self.zts_parse,
            meta={'item': item})
        # Follow the detail page to collect the document links
        contract_link = re.findall('href="(.*?)" ', part)[0]
        contract_link = "http://www.sunlife-everbright.com" + contract_link
        yield response.follow(contract_link,
                              callback=self.contract_parse,
                              meta={'item': item})
    # Find the next pages
    a = str(response.css('.pagingNormal').extract())
    next_pages = re.findall(
        r"/sleb/info/jbxx/cpjbxx/cpxxp/468a89fa-\d+[.]html", a)
    for next_page in next_pages:
        yield response.follow("http://www.sunlife-everbright.com" + next_page,
                              callback=self.second_parse)
def parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = result[1:len(result)]
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '安邦人寿'
        item['product_name'] = "安邦" + shan.str_extract('安邦(.*?)</span>', part)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = "http://www.anbang-life.com" + shan.str_extract('href="../../..(.*?)">', part)
        # Yield the item
        yield item
def zaishou_parse(self, response):
    # Extract data from each row
    result = response.css('#jigou_right_k a').extract()
    result = shan.str_keep('条款', result)
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '人保人寿'
        item['product_name'] = shan.str_extract('>(.*?)条款', part)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = "http://www.picclife.com" + shan.str_extract('href="(.*)" title', part)
        # Yield the item
        yield item
    # Follow the pagination links to the remaining pages
    a = response.css('.yeshu_icon').extract()
    b = shan.str_extract("'(.*?)'", a)
    for part in b:
        yield response.follow("http://www.picclife.com/IndividualLongrisk/" + part,
                              callback=self.zaishou_parse)
def tx_parse(self, response):
    # Extract data from each row
    result = response.css('.li_content').extract()
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '中英人寿'
        item['product_name'] = "中英" + shan.str_extract('中英(.*?)</a>', part)
        item['product_sale_status'] = ""
        item['product_contract_link'] = "http://www.aviva-cofco.com.cn" + shan.str_extract('href="(.*)"', part)
        # Yield the item
        yield item
    # Find the next pages
    next_pages = re.findall(r"list-\d+[.]shtml", response.text)
    next_pages = next_pages[0:(len(next_pages) - 1)]
    for next_page in next_pages:
        yield response.follow(next_page, callback=self.tx_parse)
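# Each of the callbacks above belongs to a per-company scrapy.Spider class that
# is not included in this excerpt. A minimal wrapper showing how one of them is
# wired up (a sketch only: the class name, spider name, and start URL below are
# illustrative placeholders, not values taken from the project; `shan` and
# `ProjectInsuranceScrapItem` are assumed to be importable as in the rest of
# the file):
import scrapy

class ExampleCompanySpider(scrapy.Spider):
    name = 'example_company'  # hypothetical spider name
    start_urls = ['http://example.com/product-list/index.html']  # hypothetical listing page

    def parse(self, response):
        # One ProjectInsuranceScrapItem per product row, as in the callbacks above.
        for row in response.css('tr').extract():
            item = ProjectInsuranceScrapItem()
            item['company_name'] = 'Example'  # hypothetical value
            item['product_name'] = shan.str_extract('<td>(.*?)</td>', row)
            yield item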