def parse(self, response):
        """Parse the Taiping Pension product list.

        Rows before the LAST row matching "健康保险" are on-sale
        products; rows from that marker onward are discontinued.
        """
        # Extract every table row, dropping the header row.
        result = response.css('tr').extract()[1:]
        # Split index computed ONCE (the original recomputed
        # shan.which(a) three times) and with an idiomatic [-1].
        a = shan.str_detect("健康保险", result)
        split_at = shan.which(a)[-1]
        zs_result = result[:split_at]
        ts_result = result[split_at:]

        # Keep only rows mentioning the company name.
        zs_result = shan.str_keep('太平', zs_result)
        ts_result = shan.str_keep('太平', ts_result)

        # One shared loop replaces the two duplicated loop bodies.
        for status, rows in (("在售", zs_result), ("停售", ts_result)):
            for part in rows:
                item = ProjectInsuranceScrapItem()
                item['company_name'] = '太平养老'
                item['product_name'] = shan.str_extract('<td>(.*?)</td>', part)
                item['product_sale_status'] = status
                item['product_contract_link'] = shan.str_extract(
                    'href="(.*)"', part)
                # Emit the item.
                yield item
Ejemplo n.º 2
0
    def tingshou_parse(self, response):
        """Parse a Minsheng Life discontinued-product listing page and
        follow its pagination links."""
        # Each <li> under .pi-pubinfo is one product entry.
        result = response.css(".pi-pubinfo")
        result1 = result.css("li").extract()
        for part in result1:
            # Populate one discontinued-product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '民生人寿'
            a = shan.str_keep('class="pi-pls-prodname off-float-left"', part)
            item['product_type'] = ''
            item['product_id'] = ''
            item['product_name'] = shan.str_extract('>(.*?)<', a)
            item['product_sale_status'] = '停售'
            b = shan.str_keep('class="dsm-choise-zoon dsm-none"', part)
            item[
                'product_contract_link'] = "http://www.minshenglife.com" + shan.str_extract(
                    'href="(.*?)">', b)
            item['product_price_link'] = ''

            item['product_start_date'] = ''
            item['product_end_date'] = ''
            # Emit the item.
            yield item
        # Follow pagination links. FIX: raw string literal avoids the
        # invalid "\d" escape the non-raw pattern triggered.
        next_pages = re.findall(r"index_\d+[.]shtml", response.text)
        for next_page in next_pages:
            yield response.follow(next_page, callback=self.tingshou_parse)
    def parse(self, response):
        """Parse the Taiping Life page: the first .ts_product table holds
        on-sale products, the second holds discontinued ones."""
        # Extract data row by row.

        result = response.css(".ts_product")
        zs_result = result[0].css("tr").getall()
        # Keep only rows that link a clause PDF document.
        zs_result = shan.str_keep("条款PDF文档", zs_result)

        for part in zs_result:
            # Populate one on-sale product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '太平人寿'
            item['product_name'] = shan.str_extract('<td>(.*)</td>', part)
            item['product_sale_status'] = '在售'

            # NOTE(review): '(.*)?">' makes the greedy group optional;
            # the non-greedy '(.*?)">' was probably intended — confirm.
            item['product_contract_link'] = shan.str_extract(
                'href="(.*)?">', part)
            # Emit the item.
            yield item

        ts_result = result[1].css("tr").getall()
        ts_result = shan.str_keep("条款PDF文档", ts_result)

        for part in ts_result:
            # Populate one discontinued product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '太平人寿'
            item['product_name'] = shan.str_extract('<td>(.*)</td>', part)
            item['product_sale_status'] = '停售'
            item['product_contract_link'] = shan.str_extract(
                'href="(.*)?">', part)
            # Emit the item.
            yield item
Ejemplo n.º 4
0
    def parse(self, response):
        """Parse the Cathay (陆家嘴国泰) list: entries before the first
        人寿保险 marker are on-sale; entries from there up to just before
        the first zip link are discontinued."""
        # Extract data from each matched element.
        result = response.css('#a2 font , .aproName').extract()
        zs_result = result[0:shan.which(shan.str_detect("人寿保险", result))[0]]
        ts_result = result[shan.which(shan.str_detect("人寿保险", result))[0]:(
            shan.which(shan.str_detect("zip", result))[0] - 1)]

        # Keep only entries mentioning the company name.
        zs_result = shan.str_keep('国泰', zs_result)
        ts_result = shan.str_keep('国泰', ts_result)

        for part in zs_result:
            # Populate one on-sale product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '陆家嘴国泰'
            item['product_name'] = shan.str_extract('>(.*?)</a>', part)
            item['product_sale_status'] = "在售"
            item[
                'product_contract_link'] = "http://www.cathaylife.cn" + shan.str_extract(
                    'href="(.*)"', part)
            # Emit the item.
            yield item

        for part in ts_result:
            # Populate one discontinued product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '陆家嘴国泰'
            item['product_name'] = shan.str_extract('>(.*?)</a>', part)
            item['product_sale_status'] = "停售"
            item[
                'product_contract_link'] = "http://www.cathaylife.cn" + shan.str_extract(
                    'href="(.*)"', part)
            # Emit the item.
            yield item
Ejemplo n.º 5
0
    def parse(self, response):
        """Parse the Funde Sino Life table: STYLE14 rows are on-sale
        products, STYLE15 rows are discontinued ones."""
        # Extract data from each table row.
        result = response.css("tr").extract()
        zs_result = shan.str_keep('class="STYLE14"', result)
        ts_result = shan.str_keep('class="STYLE15"', result)

        for part in zs_result:
            # Populate one on-sale product item.
            item = ProjectInsuranceScrapItem()
            # Re-binds part to the row's cell texts; indexing below
            # assumes at least 5 cells — TODO confirm table layout.
            part = re.findall('<td>(.*?)</td>', part)
            item['company_name'] = '富德生命'
            item['product_name'] = part[1]
            item['product_sale_status'] = '在售'
            item[
                'product_contract_link'] = "https://www.sino-life.com" + shan.str_keep(
                    'upload', shan.str_extract('href="(.*)pdf',
                                               part[4])) + "pdf"
            item['product_start_date'] = part[2]
            # Emit the item.
            yield item

        for part in ts_result:
            # Populate one discontinued product item.
            item = ProjectInsuranceScrapItem()
            part = re.findall('<td>(.*?)</td>', part)
            item['company_name'] = '富德生命'
            item['product_name'] = part[1]
            item['product_sale_status'] = '停售'
            item[
                'product_contract_link'] = "https://www.sino-life.com" + shan.str_keep(
                    'upload', shan.str_extract('href="(.*)pdf',
                                               part[4])) + "pdf"
            item['product_start_date'] = part[2]
            # Emit the item.
            yield item
Ejemplo n.º 6
0
    def parse(self, response):
        """Parse the ERGO (德华安顾) product page: the third
        .product_right_content block is split at the "停售" marker row
        into on-sale and discontinued products."""
        # Extract rows from the third content block.
        result = response.css('.product_right_content').extract()
        result = result[2]
        result = re.split('<tr>', result)

        # Slice between the "在售" and "停售" marker rows.
        zs_result = result[shan.which(shan.str_detect("在售", result))[0]:(
            shan.which(shan.str_detect("停售", result))[0] + 1)]
        ts_result = result[(shan.which(shan.str_detect("停售", result))[0] +
                            1):len(result)]

        # Keep only rows mentioning the company name.
        zs_result = shan.str_keep('德华', zs_result)
        ts_result = shan.str_keep('德华', ts_result)

        def _build(part, status):
            """Build one item from a row (replaces the duplicated
            18-line loop bodies of the original)."""
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '德华安顾'
            item['product_name'] = shan.str_extract('<td>(.*?)</td>', part)
            item['product_sale_status'] = status
            item[
                'product_contract_link'] = "https://www.ergo-life.cn/dhag" + shan.str_extract(
                    "dhag(.*)pdf", part) + "pdf"
            # The report archive may be .rar or .zip depending on the row.
            if "rar" in part:
                item[
                    'product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract(
                        "dhag(.*)rar", part) + "rar"
            else:
                item[
                    'product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract(
                        "dhag(.*)zip", part) + "zip"
            return item

        for part in zs_result:
            yield _build(part, "在售")
        for part in ts_result:
            yield _build(part, "停售")
Ejemplo n.º 7
0
    def parse(self, response):
        """Parse the ICBC-AXA list: entries between the "在售" and "停售"
        markers are on-sale; entries from "停售" up to the second "在售"
        marker are discontinued."""
        # Extract data, dropping the first two entries.
        result = response.css("p , #content a , .h2_title1").extract()
        result = result[2:len(result)]
        # BUG FIX: the first slice applied [0] to str_detect's result
        # instead of to which()'s index list; now consistent with every
        # other which(str_detect(...))[0] usage in this file.
        zs_result = result[shan.which(shan.str_detect("在售", result))[0]:shan.
                           which(shan.str_detect("停售", result))[0]]
        ts_result = result[shan.which(shan.str_detect("停售", result))[0]:shan.
                           which(shan.str_detect("在售", result))[1]]

        zs_result = shan.str_keep('style="color:#626263;"', zs_result)
        ts_result = shan.str_keep('style="color:#626263;"', ts_result)

        # One shared loop replaces the two duplicated loop bodies.
        for status, rows in (('在售', zs_result), ('停售', ts_result)):
            for part in rows:
                item = ProjectInsuranceScrapItem()
                item['company_name'] = '工银安盛'
                item['product_type'] = ''

                item['product_name'] = shan.str_extract(">(.*?)</a>", part)
                item['product_sale_status'] = status

                # NOTE(review): link lacks an http(s):// scheme — confirm
                # downstream consumers expect a bare host prefix.
                item[
                    'product_contract_link'] = "www.icbc-axa.com" + shan.str_extract(
                        'href="(.*?)pdf', part) + "pdf"
                item['product_price_link'] = ''

                item['product_start_date'] = ''
                item['product_end_date'] = ''
                # Emit the item.
                yield item
Ejemplo n.º 8
0
    def parse(self, response):
        """Parse the Ping An Health list; a product whose title carries a
        (停售) or (自 marker counts as discontinued, otherwise on-sale.
        Follows the .next pagination links."""
        # Extract data from each list entry.
        result = response.css('li').extract()
        result = shan.str_keep('time_r', result)
        for part in result:
            # Populate one product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '平安健康'
            name = shan.str_extract('title="(.*?)"', part)
            if "(停售)" in name:
                # Title marked discontinued: keep the text before the marker.
                item['product_name'] = shan.str_extract("(.*?)(停", name)
                item['product_sale_status'] = '停售'
            elif "(自" in name:
                item['product_name'] = shan.str_extract("(.*?)(自", name)
                item['product_sale_status'] = '停售'
            else:
                item['product_name'] = name
                item['product_sale_status'] = '在售'
            item['product_contract_link'] = shan.str_extract(
                'href="(.*?)">', part)
            # Emit the item.
            yield item

        # Pagination: pull hrefs from the "next" control. NOTE(review):
        # next_pages is iterated, so str_extract is assumed to return a
        # list when given a list input — confirm.
        a = response.css('.next').extract()
        next_pages = shan.str_extract('href="(.*?)">', a)
        for next_page in next_pages:
            yield response.follow("https://health.pingan.com" + next_page,
                                  callback=self.parse)
    def tingshou_parse(self, response):
        """Parse a Trust Life (信美相互) discontinued-product page and
        queue the remaining pages derived from the pager buttons."""
        # Extract data from each paragraph.
        result = response.css('p').extract()
        result = shan.str_keep('信美', result)
        for part in result:
            # Populate one discontinued product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '信美相互'
            item['product_name'] = shan.str_extract(
                '\r\n\t\t\t\t\t\t\t\t\t\t\t(.*?)\r\n\t\t\t\t\t\t\t\t\t\t\t</span>',
                part)
            item['product_sale_status'] = '停售'
            item['product_contract_link'] = shan.str_extract(
                'href="(.*)" target', part)

            # Emit the item.
            yield item

        # Page numbers come from the pager buttons; drop the first two
        # and the last entries (non-page controls).
        a = response.css('button').extract()
        b = shan.str_extract('value="(.*?)" onclick', a)
        b = b[2:(len(b) - 1)]
        for part in b:
            yield response.follow(
                "https://www.trustlife.com/cms/html/productClauseStop/index_" +
                part + ".html",
                callback=self.tingshou_parse)
Ejemplo n.º 10
0
 def parse(self, response):
     """Parse the Guohua Life (国华人寿) content block: entries are split
     on the 查看 ("view") label; entries with a direct content link are
     emitted immediately, others go through contract_parse."""
     # Extract data from each entry.
     result = response.css('.bxContent').extract()
     result = re.split('查看', result[0])
     result = shan.str_keep('国华', result)
     for part in result:
         # Populate one product item (sale status unknown on this page).
         item = ProjectInsuranceScrapItem()
         item['company_name'] = '国华人寿'
         item['product_name'] = "国华" + shan.str_extract('国华(.*?)</p>', part)
         item['product_sale_status'] = ''
         link = shan.str_extract('href="(.*?)" target', part)
         if "content" not in link:
             # Detail page: delegate link extraction to contract_parse.
             contract_link = "http://www.95549.cn/pages/intro/" + link
             yield response.follow(contract_link,
                                   callback=self.contract_parse,
                                   meta=({
                                       'item': item
                                   }))
         else:
             # Direct document link: normalise the relative ../ prefix
             # (the appended "z" anchors the extraction pattern).
             link1 = link + "z"
             item[
                 'product_contract_link'] = "http://www.95549.cn/pages/" + shan.str_extract(
                     '../(.*?)z', link1)
             # Emit the item.
             yield item
Ejemplo n.º 11
0
    def tingshou_parse(self, response):
        """Parse the Changsheng Life (长生人寿) discontinued list: rows
        mentioning 险, excluding the 停售 header row itself."""
        # Extract data from each table row.
        result = response.css('tr').extract()
        result = shan.str_drop('停售', result)
        result = shan.str_keep('险', result)
        for part in result:
            # Populate one discontinued product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '长生人寿'
            name = shan.str_extract('">(.*?)</td>', part)
            if "<" in name:
                # Cell still contains markup: strip the inner tag.
                item['product_name'] = shan.str_extract('>(.*?)<', name)
            else:
                item['product_name'] = name
            link = shan.str_extract('href="(.*)pdf"', part)
            if "http" in link:
                # Link is already absolute.
                item['product_contract_link'] = link + "pdf"
            else:
                item[
                    'product_contract_link'] = "http://www.gwcslife.com" + shan.str_extract(
                        'href="(.*)pdf"', part) + "pdf"
            item['product_sale_status'] = '停售'

            # Emit the item.
            yield item
    def parse(self, response):
        """Parse the BoComm Life (交银康联) list: rows between the "在售"
        and "停售" markers are on-sale; rows from "停售" onward are
        discontinued."""
        # Keep only rows mentioning 险, then slice at the status markers.
        result = shan.str_keep("险", response.css("tr").extract())
        zs_result = result[shan.which(shan.str_detect("在售", result))[0]:shan.
                           which(shan.str_detect("停售", result))[0]]
        ts_result = result[shan.which(shan.str_detect("停售", result)
                                      )[0]:len(result)]

        zs_result = shan.str_keep('交银', zs_result)
        ts_result = shan.str_keep('交银', ts_result)

        # One shared loop replaces the two duplicated loop bodies.
        # CONSISTENCY FIX: the discontinued loop extracted rider names
        # with '附加交银(.*?)<' while the on-sale loop used
        # '附加交银(.*?)</'; both now use the '</' form.
        for status, rows in (('在售', zs_result), ('停售', ts_result)):
            for part in rows:
                item = ProjectInsuranceScrapItem()
                item['company_name'] = '交银康联'
                # Rider products carry the 附加交银 prefix, base
                # products plain 交银.
                if "附加交银" in part:
                    item['product_name'] = "附加交银" + shan.str_extract(
                        '附加交银(.*?)</', part)
                else:
                    item['product_name'] = "交银" + shan.str_extract(
                        '交银(.*?)</', part)
                item['product_sale_status'] = status
                item[
                    'product_contract_link'] = "www.bocommlife.com" + shan.str_extract(
                        'href="(.*?)">', part)
                # Emit the item.
                yield item
Ejemplo n.º 13
0
    def parse(self, response):
        """Yield one item per product row on the CPIC Allianz Health
        page, then follow every index_* pagination link."""
        rows = shan.str_keep('险', response.css('tr').extract())
        for row in rows:
            # Build the item for this product row.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '太保安联'
            item['product_name'] = shan.str_extract('target="_blank">(.*?)</a>', row)
            item['product_sale_status'] = ''
            item['product_contract_link'] = "http://health.cpic.com.cn" + shan.str_extract('href="(.*?)" target', row)
            # Emit the item.
            yield item

        # Pagination: collect hrefs from the page-number widget and keep
        # only the index pages.
        pager = response.css('.z_num').extract()
        hrefs = shan.str_extract('href="(.*?)">', pager)
        for page in shan.str_keep('index', hrefs):
            yield response.follow("http://health.cpic.com.cn/jkx/gkxxpl/jbxx/bxcpmljtk/" + page, callback=self.parse)
Ejemplo n.º 14
0
 def zaishou_parse(self, response):
     """Parse the Three Gorges Life (三峡人寿) on-sale table and follow
     the single "next page" link."""
     # Extract data from each table row.
     result = response.css('tr').extract()
     result = shan.str_keep('三峡',result)
     for part in result:
              # Populate one on-sale product item.
             item = ProjectInsuranceScrapItem()
             item['company_name'] = '三峡人寿'
             item['product_name']  = "三峡" + shan.str_extract('三峡(.*?)</td>',part)
             item['product_sale_status'] = '在售'
             item['product_contract_link'] = "http://www.tg-life.com.cn"+ shan.str_extract('href="(.*)" target',part)
             # The rate link sits in the cell right after the 条款 cell.
             item['product_price_link'] = "http://www.tg-life.com.cn"+ shan.str_extract('条款</a></td>\r\n\t\t\t\t\t\t\t\t<td align="center"><a href="(.*)" target',part)
             # Emit the item.
             yield item

     # Pagination: the third consecutive anchor labelled 下一页.
     # NOTE(review): b is concatenated to a URL, so str_extract is
     # assumed to return a string here — confirm.
     a = response.css('a~ a+ a').extract()
     a = shan.str_keep('下一页',a)
     b = shan.str_extract('href="(.*?)">',a)
     yield response.follow("http://www.tg-life.com.cn" + b, callback=self.zaishou_parse)
Ejemplo n.º 15
0
 def parse(self, response):
     """Parse the Junlong Life (君龙人寿) table; sale status is rebuilt
     from the row's own ...售 text."""
     # Extract data from each table row.
     result = response.css('tr').extract()
     result = shan.str_keep('君龙', result)
     result = shan.str_keep('bgcolor="#F5F2EF"', result)
     # Skip the first five header/filler rows.
     result = result[5:len(result)]
     for part in result:
         # Populate one product item.
         item = ProjectInsuranceScrapItem()
         item['company_name'] = '君龙人寿'
         item['product_name'] = "君龙" + shan.str_extract(
             '君龙(.*?)\r\n\t\t\t\t', part)
         # Status text (e.g. 在售/停售) reconstructed from the 售 suffix.
         item['product_sale_status'] = shan.str_extract('\t(.*?)售',
                                                        part) + "售"
         item[
             'product_contract_link'] = "http://www.kdlins.com.cn/" + shan.str_extract(
                 'href="(.*?)" target', part)
         # Emit the item.
         yield item
Ejemplo n.º 16
0
    def parse(self, response):
        """Parse the Happy Life (幸福人寿) page: paragraphs between the
        表一 and 表二 headings are on-sale products; 表三 onward are
        discontinued."""
        # Extract data from each paragraph.
        result = response.css('P').extract()

        zs_result = result[shan.which(shan.str_detect("表一", result))[0]:shan.
                           which(shan.str_detect("表二", result))[0]]
        ts_result = result[shan.which(shan.str_detect("表三", result)
                                      )[0]:len(result)]

        zs_result = shan.str_keep('险', zs_result)
        ts_result = shan.str_keep('险', ts_result)

        # Drop the heading entry itself from each slice.
        zs_result = zs_result[1:len(zs_result)]
        ts_result = ts_result[1:len(ts_result)]

        for part in zs_result:
            # Populate one on-sale product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '幸福人寿'
            item['product_sale_status'] = '在售'
            item['product_contract_link'] = shan.str_extract(
                'href="(.*?)">', part)
            name = shan.str_extract('幸福(.*?)</a>', part)
            if "(<" in name:
                # Name contains nested markup: re-extract the clean text.
                item['product_name'] = "幸福" + shan.str_extract(
                    ')(.*?)</font>', name)
            else:
                item['product_name'] = "幸福" + name

                # Emit the item.
            yield item

        for part in ts_result:
            # Populate one discontinued product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '幸福人寿'
            item['product_name'] = "幸福" + shan.str_extract('幸福(.*?)</a>', part)
            item['product_sale_status'] = '停售'
            item['product_contract_link'] = shan.str_extract(
                'href="(.*?)">', part)
            # Emit the item.
            yield item
 def contract_parse(self, response):
     """Fill document links on the item carried in response.meta from a
     Sun Life Everbright detail page: each labelled row contributes one
     link field."""
     result = response.css("tr")
     result = result[1:len(result)].extract()
     item = response.meta['item']

     def _link(keyword, ext, field):
         """Set item[field] from the rows matching keyword when they
         reference a file with extension ext, else '' (replaces the
         seven copied if/else lookups of the original)."""
         rows = shan.str_keep(keyword, result)
         if ext in rows:
             item[field] = "http://www.sunlife-everbright.com" + shan.str_extract(
                 'href="(.*?)"', rows)
         else:
             item[field] = ''

     _link('材料清单', 'pdf', 'product_official_report_list')
     _link('费率', 'pdf', 'product_price_link')
     _link('条款', 'pdf', 'product_contract_link')
     _link('价值表(全表)', 'xlsx', 'product_pv_full_list_link')
     _link('价值表(示例)', 'pdf', 'product_pv_example_link')
     _link('总精算师', 'pdf', 'product_chief_actuary_claim_link')
     # NOTE: 'prodcct' typo preserved — it is the field's declared name.
     _link('法律责任人', 'pdf', 'prodcct_law_response_link')
     # The original also built an unused sale_status_url list; removed.
     yield item
    def first_parse(self, response):
        """Request the Sun Life Everbright product index page with
        browser-like headers and hand the response to second_parse."""
        result = response.css("tr").extract()
        # Slice at the 在售/停售 markers as the sibling parsers do.
        # NOTE(review): zs_result/ts_result are never used afterwards;
        # kept because which(...)[0] raising on a marker-less page may be
        # relied upon — confirm before removing.
        zs_result = result[shan.which(shan.str_detect("在售", result))[0]:shan.
                           which(shan.str_detect("停售", result))[0]]
        zs_result = shan.str_keep("(寿|保)险", zs_result)
        ts_result = result[shan.which(shan.str_detect("停售", result)
                                      )[0]:len(result)]
        ts_result = shan.str_keep("(寿|保)险", ts_result)

        urls = [
            'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/cpxxp/468a89fa-1.html'
        ]

        header = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Cookie':
            'UM_distinctid=16bf88a1698404-01cafbdd88510b-37647e05-13c680-16bf88a169975c; CNZZDATA1274208563=1695324027-1563242927-%7C1563352631',
            'Host':
            'www.sunlife-everbright.com',
            'If-Modified-Since':
            'Wed, 03 Jul 2019 07:18:28 GMT',
            'If-None-Match':
            "4a89-58cc1aaf03900-gzip",
            'Referer':
            'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/cpxxp/468a89fa-10.html',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        }
        # BUG FIX: scrapy.Request takes a single URL string; the original
        # passed the whole urls list as url=. Iterate instead.
        for url in urls:
            yield scrapy.Request(url=url,
                                 headers=header,
                                 callback=self.second_parse)
Ejemplo n.º 19
0
 def parse(self, response):
     """Parse the Jixiang Life table: data-isstate 1/2/3 mark on-sale,
     discontinued and upcoming products respectively."""
     # Extract table rows, dropping the two header rows.
     result = response.css('tr').extract()
     result = result[2:len(result)]

     zs_result = shan.str_keep('data-isstate="1"', result)
     ts_result = shan.str_keep('data-isstate="2"', result)
     ds_result = shan.str_keep('data-isstate="3"', result)

     # One shared loop replaces the three duplicated loop bodies.
     for status, rows in (("在售", zs_result), ("停售", ts_result),
                          ("待售", ds_result)):
         for part in rows:
             item = ProjectInsuranceScrapItem()
             item['company_name'] = '吉祥人寿'
             item['product_name'] = shan.str_extract('class="td_body">(.*?)</td>', part)
             item['product_sale_status'] = status
             item['product_contract_link'] = "http://www.jxlife.com.cn" + shan.str_extract('href="(.*)pdf"', part) + "pdf"
             item['product_official_report_list'] = "http://www.jxlife.com.cn" + shan.str_extract('href="(.*)" target="_blank" style="cursor: hand; text-decoration: underline;">\n\t\t\t\t\t\t\t\t\t\t\t\t<font color="blue">其他备案资料</font>', part)
             # Emit the item.
             yield item
Ejemplo n.º 20
0
 def contract_parse(self, response):
     """Attach the clause-document (条款) link to the item carried in
     response.meta; when two clause anchors exist, the second one wins."""
     result = response.css(".detail a").extract()
     item = response.meta['item']
     c = shan.str_keep('条款', result)
     # BUG FIX: the original `if len(c) == 1: c = c` branch was a no-op
     # and has been removed; behaviour is unchanged.
     if len(c) == 2:
         c = c[1]
     item[
         'product_contract_link'] = "https://www.e-guofu.com" + shan.str_extract(
             'href="(.*?)" target', c)
     yield item
Ejemplo n.º 21
0
 def weishou_parse(self, response):
     """Parse China Post Life (中邮人寿) not-yet-sold products; document
     links are resolved on the detail page by contract_parse."""
     # Extract data from each entry.
     result = response.css('.grey2').extract()
     result = shan.str_keep('险',result)
     for part in result:
         # Populate one not-yet-sold product item.
         item = ProjectInsuranceScrapItem()
         item['company_name'] = '中邮人寿'
         item['product_name'] = shan.str_extract('\r\n\t\t\t\t(.*?)\r\n\t\t\t',part)
         item['product_sale_status'] = '未售'
         # Detail page holds the document links; pass the item along.
         contract_link = "http://www.chinapost-life.com"+ shan.str_extract('href="(.*)" class',part)
         yield response.follow(contract_link, callback= self.contract_parse , meta=({'item': item}) )
Ejemplo n.º 22
0
 def contract_parse(self, response):
     """Fill China Post Life document links on the item carried in
     response.meta, one field per labelled anchor group."""
     result = response.css(".articleShowText a").extract()
     item = response.meta['item']

     def _link(keyword, field):
         """Set item[field] from the anchors matching keyword when they
         reference a pdf, else '' (replaces the six copied if/else
         blocks of the original)."""
         anchors = shan.str_keep(keyword, result)
         if 'pdf' in anchors:
             item[field] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"', anchors)
         else:
             item[field] = ''

     _link('产品', 'product_official_report_list')
     _link('费率', 'product_price_link')
     _link('条款', 'product_contract_link')
     _link('价值表', 'product_pv_full_list_link')
     _link('总精算师', 'product_chief_actuary_claim_link')
     # NOTE: 'prodcct' typo preserved — it is the field's declared name.
     _link('法律责任人', 'prodcct_law_response_link')
     yield item
 def zts_parse(self, response):
     """Mark the item from response.meta as on-sale or discontinued by
     checking which slice of the table its pdf link falls in."""
     result = response.css('tr').extract()
     # Drop the header row.
     result = result[1:len(result)]
     result1 = shan.str_keep('险', result)
     # Rows between the 在售 and 停售 markers count as on-sale.
     zs_result = result[shan.which(shan.str_detect("在售", result))[0]:shan.
                        which(shan.str_detect("停售", result))[0]]
     item = response.meta['item']
     for part in result1:
         # NOTE(review): this tests an extracted href fragment for
         # membership (equality) against full row strings in zs_result,
         # which looks like it can never match — confirm the intended
         # semantics of shan.str_extract before relying on this.
         if shan.str_extract('href="(.*?)pdf', part) in zs_result:
             item['product_sale_status'] = '在售'
         else:
             item['product_sale_status'] = '停售'
         yield item
    def parse(self, response):
        """Parse the Manulife-Sinochem (中宏人寿) FAQ-style lists: one
        container holds on-sale products, another discontinued ones."""
        # Extract data from each list container.
        result = response.css(".faq-container-list").extract()
        zs_result = shan.str_keep("在售", result)
        ts_result = shan.str_keep("停售", result)
        # Split each container into its individual item divs.
        zs_result1 = []
        for part in zs_result:
            zs_result1.extend(re.split('div class="item"', part))
        zs_result = shan.str_keep("(寿|保)险", zs_result1)

        ts_result1 = []
        for part in ts_result:
            ts_result1.extend(re.split('div class="item"', part))
        # NOTE(review): this filter is "(寿|保)" while the on-sale filter
        # above is "(寿|保)险" — possibly an unintended inconsistency.
        ts_result = shan.str_keep("(寿|保)", ts_result1)

        for part in zs_result:
            # Populate one on-sale product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '中宏人寿'
            item['product_name'] = shan.str_extract("中宏.*?险", part)
            item['product_sale_status'] = '在售'
            item[
                'product_contract_link'] = "www.manulife-sinochem.com" + shan.str_extract(
                    '<a href="(.*)target', part)
            # Emit the item.
            yield item

        for part in ts_result:
            # Populate one discontinued product item.
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '中宏人寿'
            item['product_name'] = shan.str_extract("中宏.*?险", part)
            item['product_sale_status'] = '停售'
            item[
                'product_contract_link'] = "www.manulife-sinochem.com" + shan.str_extract(
                    '<a href="(.*)target', part)
            # Emit the item.
            yield item
 def parse(self, response):
     """Yield one insurance item per clause (条款) link row for 国联人寿."""
     # Keep only <li> rows mentioning policy clauses, skipping the first row.
     rows = shan.str_keep('条款', response.css('li').extract())[1:]
     for row in rows:
         item = ProjectInsuranceScrapItem()
         item['company_name'] = '国联人寿'
         item['product_name'] = shan.str_extract('target="_blank">(.*?)</a>', row)
         # Sale status is not available on this listing page.
         item['product_sale_status'] = ''
         item['product_contract_link'] = shan.str_extract('href="(.*?)" target=', row)
         # Emit the populated item.
         yield item
Ejemplo n.º 26
0
 def zaishou_parse(self, response):
     """Yield in-sale (在售) insurance items scraped from 泰康养老 pages."""
     rows = shan.str_keep('条款', response.css('li').extract())
     # The first matching row is skipped (header/noise entry).
     for row in rows[1:]:
         item = ProjectInsuranceScrapItem()
         item['company_name'] = '泰康养老'
         # Product name sits inside 《...》 book-title quotes.
         item['product_name'] = shan.str_extract('《(.*?)》', row)
         item['product_sale_status'] = '在售'
         link = shan.str_extract('href="(.*)" target', row)
         item['product_contract_link'] = "http://tkyl.pension.taikang.com" + link
         # Emit the populated item.
         yield item
Ejemplo n.º 27
0
 def parse(self, response):
     """Yield insurance items for 爱心人寿; sale status comes from row text."""
     for row in shan.str_keep('爱心', response.css('tr').extract()):
         item = ProjectInsuranceScrapItem()
         item['company_name'] = '爱心人寿'
         item['product_name'] = shan.str_extract('>(.*?)</a>', row)
         # Status text ends with 售 (e.g. 在售/停售); the regex strips the
         # suffix, so it is re-appended here.
         # NOTE(review): assumes str_extract never returns None here — verify.
         item['product_sale_status'] = shan.str_extract('\n(.*?)售', row) + "售"
         pdf_path = shan.str_extract('href="(.*?)pdf', row) + "pdf"
         item['product_contract_link'] = "http://www.aixin-ins.com" + pdf_path
         # Emit the populated item.
         yield item
    def tingshou_parse(self, response):
        """Yield discontinued (停售) insurance items for 大都会人寿."""
        anchors = shan.str_keep('险', response.css('.width-80-authored a').extract())
        for anchor in anchors:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '大都会人寿'
            # Product name follows the 公司 (company) prefix inside the anchor.
            item['product_name'] = shan.str_extract('公司(.*?)</a>', anchor)
            item['product_sale_status'] = '停售'
            # Regex stops before "pdf", so re-append the extension.
            pdf_path = shan.str_extract('href="(.*)pdf', anchor) + "pdf"
            item['product_contract_link'] = "https://www.metlife.com.cn" + pdf_path
            # Emit the populated item.
            yield item
 def parse(self, response):
     """Yield insurance items for 复星联合; status inferred from link text."""
     for row in shan.str_keep('条款', response.css('p').extract()):
         item = ProjectInsuranceScrapItem()
         item['company_name'] = '复星联合'
         # Full anchor text; contains 停售 when the product is discontinued.
         name = shan.str_extract('blank">(.*?)</a>', row)
         # Both original branches assigned the identical product_name and
         # differed only in status, so the duplication is collapsed into
         # one assignment plus a conditional (behavior unchanged).
         # NOTE(review): assumes `name` is never None here — verify shan.str_extract.
         item['product_name'] = shan.str_extract('(.*?)条款', name)
         item['product_sale_status'] = '停售' if "停售" in name else '在售'
         item['product_contract_link'] = shan.str_extract('href="(.*?)" target', row)
         # Emit the populated item.
         yield item
Ejemplo n.º 30
0
 def zaishou_parse(self, response):
     """Yield in-sale (在售) items for 人保人寿, then follow pagination links."""
     for anchor in shan.str_keep('条款', response.css('#jigou_right_k a').extract()):
         item = ProjectInsuranceScrapItem()
         item['company_name'] = '人保人寿'
         item['product_name'] = shan.str_extract('>(.*?)条款', anchor)
         item['product_sale_status'] = '在售'
         item['product_contract_link'] = "http://www.picclife.com"+ shan.str_extract('href="(.*)" title', anchor)
         # Emit the populated item.
         yield item

     # Pager icons carry quoted relative page paths; recurse into each
     # page with this same callback (Scrapy's dupefilter prevents loops).
     pager = response.css('.yeshu_icon').extract()
     for page in shan.str_extract("\'(.*?)\'", pager):
         yield response.follow("http://www.picclife.com/IndividualLongrisk/" + page, callback=self.zaishou_parse)