def parse(self, response):
    """Parse Taiping Pension's product table.

    Rows before the last "健康保险" marker are on-sale products; rows from
    that marker to the end are discontinued.  Yields one item per row.
    """
    # Extract every table row as raw HTML, dropping the header row.
    result = response.css('tr').extract()
    result = result[1:]
    # The last row matching "健康保险" splits on-sale from discontinued.
    a = shan.str_detect("健康保险", result)
    split_at = shan.which(a)[-1]
    zs_result = shan.str_keep('太平', result[:split_at])
    ts_result = shan.str_keep('太平', result[split_at:])
    # Same field extraction for both groups; only the status differs.
    for rows, status in ((zs_result, "在售"), (ts_result, "停售")):
        for part in rows:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '太平养老'
            item['product_name'] = shan.str_extract('<td>(.*?)</td>', part)
            item['product_sale_status'] = status
            item['product_contract_link'] = shan.str_extract(
                'href="(.*)"', part)
            # Emit the item.
            yield item
def tingshou_parse(self, response):
    """Scrape Minsheng Life (民生人寿) discontinued products and follow
    every "index_<n>.shtml" pagination link found in the page body."""
    rows = response.css(".pi-pubinfo").css("li").extract()
    for row in rows:
        # Fill in fields for one discontinued product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '民生人寿'
        name_cell = shan.str_keep('class="pi-pls-prodname off-float-left"', row)
        item['product_type'] = ''
        item['product_id'] = ''
        item['product_name'] = shan.str_extract('>(.*?)<', name_cell)
        item['product_sale_status'] = '停售'
        link_cell = shan.str_keep('class="dsm-choise-zoon dsm-none"', row)
        item['product_contract_link'] = (
            "http://www.minshenglife.com"
            + shan.str_extract('href="(.*?)">', link_cell))
        item['product_price_link'] = ''
        item['product_start_date'] = ''
        item['product_end_date'] = ''
        # Emit the item.
        yield item
    # Follow pagination.
    for next_page in re.findall("index_\d+[.]shtml", response.text):
        yield response.follow(next_page, callback=self.tingshou_parse)
def parse(self, response):
    """Parse Taiping Life's product page.

    The first .ts_product table holds on-sale products, the second holds
    discontinued ones; only rows linking a clause PDF ("条款PDF文档") are kept.
    """
    # Extract data from each row of the on-sale table.
    result = response.css(".ts_product")
    zs_result = result[0].css("tr").getall()
    zs_result = shan.str_keep("条款PDF文档", zs_result)
    for part in zs_result:
        # Fill in fields for an on-sale product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '太平人寿'
        item['product_name'] = shan.str_extract('<td>(.*)</td>', part)
        item['product_sale_status'] = '在售'
        # NOTE(review): '(.*)?">' makes the greedy group optional — a
        # non-greedy '(.*?)">' may have been intended; verify against the
        # page HTML before changing.
        item['product_contract_link'] = shan.str_extract(
            'href="(.*)?">', part)
        # Emit the item.
        yield item
    # Extract data from each row of the discontinued table.
    ts_result = result[1].css("tr").getall()
    ts_result = shan.str_keep("条款PDF文档", ts_result)
    for part in ts_result:
        # Fill in fields for a discontinued product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '太平人寿'
        item['product_name'] = shan.str_extract('<td>(.*)</td>', part)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = shan.str_extract(
            'href="(.*)?">', part)
        yield item
def parse(self, response):
    """Parse Lujiazui Cathay's product list.

    Entries before the first "人寿保险" marker are on-sale; entries from that
    marker to just before the first "zip" marker are discontinued.
    """
    # Extract data from each row.
    result = response.css('#a2 font , .aproName').extract()
    zs_result = result[0:shan.which(shan.str_detect("人寿保险", result))[0]]
    ts_result = result[shan.which(shan.str_detect("人寿保险", result))[0]:(
        shan.which(shan.str_detect("zip", result))[0] - 1)]
    # Keep only rows belonging to 国泰 products.
    zs_result = shan.str_keep('国泰', zs_result)
    ts_result = shan.str_keep('国泰', ts_result)
    for part in zs_result:
        # Fill in fields for an on-sale product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '陆家嘴国泰'
        item['product_name'] = shan.str_extract('>(.*?)</a>', part)
        item['product_sale_status'] = "在售"
        item[
            'product_contract_link'] = "http://www.cathaylife.cn" + shan.str_extract(
                'href="(.*)"', part)
        # Emit the item.
        yield item
    for part in ts_result:
        # Fill in fields for a discontinued product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '陆家嘴国泰'
        item['product_name'] = shan.str_extract('>(.*?)</a>', part)
        item['product_sale_status'] = "停售"
        item[
            'product_contract_link'] = "http://www.cathaylife.cn" + shan.str_extract(
                'href="(.*)"', part)
        yield item
def parse(self, response):
    """Scrape Sino-Life (富德生命) products; the row's CSS class encodes the
    sale status (STYLE14 = on sale, STYLE15 = discontinued)."""
    all_rows = response.css("tr").extract()
    groups = (
        (shan.str_keep('class="STYLE14"', all_rows), '在售'),
        (shan.str_keep('class="STYLE15"', all_rows), '停售'),
    )
    for rows, status in groups:
        for row in rows:
            # Cell layout: [_, name, start-date, _, link-cell, ...]
            cells = re.findall('<td>(.*?)</td>', row)
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '富德生命'
            item['product_name'] = cells[1]
            item['product_sale_status'] = status
            item['product_contract_link'] = (
                "https://www.sino-life.com"
                + shan.str_keep('upload',
                                shan.str_extract('href="(.*)pdf', cells[4]))
                + "pdf")
            item['product_start_date'] = cells[2]
            # Emit the item.
            yield item
def parse(self, response):
    """Parse ERGO China Life (德华安顾) products.

    The third .product_right_content block is split on '<tr>'; rows from the
    first "在售" marker through the first "停售" marker are on-sale, the rest
    are discontinued.
    """
    # Extract data from each row.
    result = response.css('.product_right_content').extract()
    result = result[2]
    result = re.split('<tr>', result)
    zs_result = result[shan.which(shan.str_detect("在售", result))[0]:(
        shan.which(shan.str_detect("停售", result))[0] + 1)]
    ts_result = result[(shan.which(shan.str_detect("停售", result))[0] +
                        1):len(result)]
    zs_result = shan.str_keep('德华', zs_result)
    ts_result = shan.str_keep('德华', ts_result)
    for part in zs_result:
        # Fill in fields for an on-sale product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '德华安顾'
        item['product_name'] = shan.str_extract('<td>(.*?)</td>', part)
        item['product_sale_status'] = "在售"
        item[
            'product_contract_link'] = "https://www.ergo-life.cn/dhag" + shan.str_extract(
                "dhag(.*)pdf", part) + "pdf"
        # The filing-report attachment is either a .rar or a .zip archive.
        if "rar" in part:
            item[
                'product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract(
                    "dhag(.*)rar", part) + "rar"
        else:
            item[
                'product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract(
                    "dhag(.*)zip", part) + "zip"
        # Emit the item.
        yield item
    for part in ts_result:
        # Fill in fields for a discontinued product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '德华安顾'
        item['product_name'] = shan.str_extract('<td>(.*?)</td>', part)
        item['product_sale_status'] = "停售"
        item[
            'product_contract_link'] = "https://www.ergo-life.cn/dhag" + shan.str_extract(
                "dhag(.*)pdf", part) + "pdf"
        if "rar" in part:
            item[
                'product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract(
                    "dhag(.*)rar", part) + "rar"
        else:
            item[
                'product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract(
                    "dhag(.*)zip", part) + "zip"
        yield item
def parse(self, response):
    """Parse ICBC-AXA (工银安盛) products split by 在售/停售 section headings."""
    # Extract data from each row, dropping the first two elements.
    result = response.css("p , #content a , .h2_title1").extract()
    result = result[2:len(result)]
    # NOTE(review): the zs slice applies [0] to str_detect's result *inside*
    # which(), unlike every sibling spider which writes
    # shan.which(shan.str_detect(...))[0] — possibly a misplaced bracket;
    # verify before changing.
    zs_result = result[shan.which(shan.str_detect("在售", result)[0]):shan.
                       which(shan.str_detect("停售", result))[0]]
    ts_result = result[shan.which(shan.str_detect("停售", result))[0]:shan.
                       which(shan.str_detect("在售", result))[1]]
    zs_result = shan.str_keep('style="color:#626263;"', zs_result)
    ts_result = shan.str_keep('style="color:#626263;"', ts_result)
    for part in zs_result:
        # Fill in fields for an on-sale product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '工银安盛'
        item['product_type'] = ''
        item['product_name'] = shan.str_extract(">(.*?)</a>", part)
        item['product_sale_status'] = '在售'
        item[
            'product_contract_link'] = "www.icbc-axa.com" + shan.str_extract(
                'href="(.*?)pdf', part) + "pdf"
        item['product_price_link'] = ''
        item['product_start_date'] = ''
        item['product_end_date'] = ''
        # Emit the item.
        yield item
    for part in ts_result:
        # Fill in fields for a discontinued product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '工银安盛'
        item['product_type'] = ''
        item['product_name'] = shan.str_extract(">(.*?)</a>", part)
        item['product_sale_status'] = '停售'
        item[
            'product_contract_link'] = "www.icbc-axa.com" + shan.str_extract(
                'href="(.*?)pdf', part) + "pdf"
        item['product_price_link'] = ''
        item['product_start_date'] = ''
        item['product_end_date'] = ''
        yield item
def parse(self, response):
    """Scrape Ping An Health (平安健康) products; a "(停售)" or "(自" suffix
    in the title marks a discontinued product.  Follows pagination links."""
    rows = shan.str_keep('time_r', response.css('li').extract())
    for row in rows:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '平安健康'
        name = shan.str_extract('title="(.*?)"', row)
        if "(停售)" in name:
            item['product_name'] = shan.str_extract("(.*?)(停", name)
            item['product_sale_status'] = '停售'
        elif "(自" in name:
            item['product_name'] = shan.str_extract("(.*?)(自", name)
            item['product_sale_status'] = '停售'
        else:
            item['product_name'] = name
            item['product_sale_status'] = '在售'
        item['product_contract_link'] = shan.str_extract(
            'href="(.*?)">', row)
        # Emit the item.
        yield item
    # Follow pagination: the .next element holds the next page's href.
    pager = response.css('.next').extract()
    for next_page in shan.str_extract('href="(.*?)">', pager):
        yield response.follow("https://health.pingan.com" + next_page,
                              callback=self.parse)
def tingshou_parse(self, response):
    """Scrape Trust Life (信美相互) discontinued products and follow the
    numbered pagination buttons."""
    paragraphs = shan.str_keep('信美', response.css('p').extract())
    for part in paragraphs:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '信美相互'
        item['product_name'] = shan.str_extract(
            '\r\n\t\t\t\t\t\t\t\t\t\t\t(.*?)\r\n\t\t\t\t\t\t\t\t\t\t\t</span>',
            part)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = shan.str_extract(
            'href="(.*)" target', part)
        # Emit the item.
        yield item
    # Pagination: button values carry the page ids; skip the first two and
    # the trailing button (non-page controls).
    buttons = response.css('button').extract()
    page_ids = shan.str_extract('value="(.*?)" onclick', buttons)
    for page_id in page_ids[2:-1]:
        yield response.follow(
            "https://www.trustlife.com/cms/html/productClauseStop/index_"
            + page_id + ".html",
            callback=self.tingshou_parse)
def parse(self, response):
    """Scrape Guohua Life (国华人寿); rows without a direct content link are
    followed to a detail page handled by contract_parse."""
    blocks = re.split('查看', response.css('.bxContent').extract()[0])
    blocks = shan.str_keep('国华', blocks)
    for block in blocks:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '国华人寿'
        item['product_name'] = "国华" + shan.str_extract('国华(.*?)</p>', block)
        item['product_sale_status'] = ''
        link = shan.str_extract('href="(.*?)" target', block)
        if "content" not in link:
            # Intro link: fetch the detail page to resolve the contract URL.
            yield response.follow("http://www.95549.cn/pages/intro/" + link,
                                  callback=self.contract_parse,
                                  meta={'item': item})
        else:
            # Direct content link: strip the relative "../" prefix.
            padded = link + "z"
            item['product_contract_link'] = (
                "http://www.95549.cn/pages/"
                + shan.str_extract('../(.*?)z', padded))
            # Emit the item.
            yield item
def tingshou_parse(self, response):
    """Scrape Great Wall Changsheng (长生人寿) discontinued products."""
    rows = response.css('tr').extract()
    # Drop rows containing the 停售 label itself, then keep insurance rows.
    rows = shan.str_keep('险', shan.str_drop('停售', rows))
    for row in rows:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '长生人寿'
        name = shan.str_extract('">(.*?)</td>', row)
        # Strip an inner tag from the name cell when one is present.
        if "<" in name:
            item['product_name'] = shan.str_extract('>(.*?)<', name)
        else:
            item['product_name'] = name
        link = shan.str_extract('href="(.*)pdf"', row)
        # Absolute links are used as-is; relative ones get the site prefix.
        if "http" in link:
            item['product_contract_link'] = link + "pdf"
        else:
            item['product_contract_link'] = (
                "http://www.gwcslife.com"
                + shan.str_extract('href="(.*)pdf"', row) + "pdf")
        item['product_sale_status'] = '停售'
        # Emit the item.
        yield item
def parse(self, response):
    """Parse BoComm Life (交银康联) products split by 在售/停售 markers."""
    # Keep only insurance rows, then split at the status markers.
    result = shan.str_keep("险", response.css("tr").extract())
    zs_result = result[shan.which(shan.str_detect("在售", result))[0]:shan.
                       which(shan.str_detect("停售", result))[0]]
    ts_result = result[shan.which(shan.str_detect("停售", result)
                                  )[0]:len(result)]
    zs_result = shan.str_keep('交银', zs_result)
    ts_result = shan.str_keep('交银', ts_result)
    for part in zs_result:
        # Fill in fields for an on-sale product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '交银康联'
        # Rider products are prefixed 附加交银; base products just 交银.
        if "附加交银" in part:
            item['product_name'] = "附加交银" + shan.str_extract(
                '附加交银(.*?)</', part)
        else:
            item['product_name'] = "交银" + shan.str_extract(
                '交银(.*?)</', part)
        item['product_sale_status'] = '在售'
        item[
            'product_contract_link'] = "www.bocommlife.com" + shan.str_extract(
                'href="(.*?)">', part)
        # Emit the item.
        yield item
    for part in ts_result:
        # Fill in fields for a discontinued product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '交银康联'
        # NOTE(review): this branch uses pattern '附加交银(.*?)<' (no slash),
        # unlike the on-sale branch's '附加交银(.*?)</' — confirm intentional.
        if "附加交银" in part:
            item['product_name'] = "附加交银" + shan.str_extract(
                '附加交银(.*?)<', part)
        else:
            item['product_name'] = "交银" + shan.str_extract(
                '交银(.*?)</', part)
        item['product_sale_status'] = '停售'
        item[
            'product_contract_link'] = "www.bocommlife.com" + shan.str_extract(
                'href="(.*?)">', part)
        yield item
def parse(self, response):
    """Scrape CPIC Allianz Health (太保安联) products and follow index pages."""
    rows = shan.str_keep('险', response.css('tr').extract())
    for row in rows:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '太保安联'
        item['product_name'] = shan.str_extract('target="_blank">(.*?)</a>', row)
        item['product_sale_status'] = ''
        item['product_contract_link'] = (
            "http://health.cpic.com.cn"
            + shan.str_extract('href="(.*?)" target', row))
        # Emit the item.
        yield item
    # Pagination: follow every index link in the page-number bar.
    pager = response.css('.z_num').extract()
    hrefs = shan.str_extract('href="(.*?)">', pager)
    for page in shan.str_keep('index', hrefs):
        yield response.follow(
            "http://health.cpic.com.cn/jkx/gkxxpl/jbxx/bxcpmljtk/" + page,
            callback=self.parse)
def zaishou_parse(self, response):
    """Scrape Three Gorges Life (三峡人寿) on-sale products and follow the
    "next page" (下一页) link."""
    rows = shan.str_keep('三峡', response.css('tr').extract())
    for row in rows:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '三峡人寿'
        item['product_name'] = "三峡" + shan.str_extract('三峡(.*?)</td>', row)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = (
            "http://www.tg-life.com.cn"
            + shan.str_extract('href="(.*)" target', row))
        item['product_price_link'] = (
            "http://www.tg-life.com.cn"
            + shan.str_extract(
                '条款</a></td>\r\n\t\t\t\t\t\t\t\t<td align="center"><a href="(.*)" target',
                row))
        # Emit the item.
        yield item
    pager = shan.str_keep('下一页', response.css('a~ a+ a').extract())
    next_href = shan.str_extract('href="(.*?)">', pager)
    yield response.follow("http://www.tg-life.com.cn" + next_href,
                          callback=self.zaishou_parse)
def parse(self, response):
    """Scrape Junlong Life (君龙人寿) products; the status text (…售) is read
    straight from the row."""
    rows = response.css('tr').extract()
    rows = shan.str_keep('bgcolor="#F5F2EF"', shan.str_keep('君龙', rows))
    rows = rows[5:]  # skip leading non-product rows
    for row in rows:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '君龙人寿'
        item['product_name'] = "君龙" + shan.str_extract(
            '君龙(.*?)\r\n\t\t\t\t', row)
        item['product_sale_status'] = shan.str_extract('\t(.*?)售', row) + "售"
        item['product_contract_link'] = (
            "http://www.kdlins.com.cn/"
            + shan.str_extract('href="(.*?)" target', row))
        # Emit the item.
        yield item
def parse(self, response):
    """Parse Happy Life (幸福人寿) products.

    Rows between the "表一" and "表二" headings are on-sale; rows from "表三"
    to the end are discontinued.  NOTE(review): rows between 表二 and 表三
    are skipped entirely — confirm that section holds nothing of interest.
    """
    # Extract data from each row.
    result = response.css('P').extract()
    zs_result = result[shan.which(shan.str_detect("表一", result))[0]:shan.
                       which(shan.str_detect("表二", result))[0]]
    ts_result = result[shan.which(shan.str_detect("表三", result)
                                  )[0]:len(result)]
    zs_result = shan.str_keep('险', zs_result)
    ts_result = shan.str_keep('险', ts_result)
    # Drop the heading row kept by the filters.
    zs_result = zs_result[1:len(zs_result)]
    ts_result = ts_result[1:len(ts_result)]
    for part in zs_result:
        # Fill in fields for an on-sale product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '幸福人寿'
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = shan.str_extract(
            'href="(.*?)">', part)
        name = shan.str_extract('幸福(.*?)</a>', part)
        # Names wrapping a tag (contain "(<") take the inner <font> text.
        if "(<" in name:
            item['product_name'] = "幸福" + shan.str_extract(
                ')(.*?)</font>', name)
        else:
            item['product_name'] = "幸福" + name
        # Emit the item.
        yield item
    for part in ts_result:
        # Fill in fields for a discontinued product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '幸福人寿'
        item['product_name'] = "幸福" + shan.str_extract('幸福(.*?)</a>', part)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = shan.str_extract(
            'href="(.*?)">', part)
        yield item
def contract_parse(self, response):
    """Fill a Sun Life Everbright item's document links from its detail page.

    Each document type is located by a keyword in the table rows; a link is
    recorded only when the matched row has the expected file extension,
    otherwise the field is left empty.  Yields the completed item.
    """
    rows = response.css("tr")
    rows = rows[1:len(rows)].extract()  # drop the header row
    item = response.meta['item']
    base = "http://www.sunlife-everbright.com"
    # (item_field, keyword in row, required extension) — deduplicates the
    # original copy-pasted if/else blocks; assignment order is preserved.
    fields = (
        ('product_official_report_list', '材料清单', 'pdf'),
        ('product_price_link', '费率', 'pdf'),
        ('product_contract_link', '条款', 'pdf'),
        ('product_pv_full_list_link', '价值表(全表)', 'xlsx'),
        ('product_pv_example_link', '价值表(示例)', 'pdf'),
        ('product_chief_actuary_claim_link', '总精算师', 'pdf'),
        ('prodcct_law_response_link', '法律责任人', 'pdf'),  # sic: key typo kept for compatibility
    )
    for key, keyword, ext in fields:
        match = shan.str_keep(keyword, rows)
        if ext in match:
            item[key] = base + shan.str_extract('href="(.*?)"', match)
        else:
            item[key] = ''
    # The unused local `sale_status_url` from the original was dead code and
    # has been removed.
    yield item
def first_parse(self, response):
    """Request the Sun Life Everbright product-listing page with browser-like
    headers and hand the response to second_parse.

    Bug fix: the original passed the whole ``urls`` list as ``url=`` to
    scrapy.Request, which expects a single URL string; we now issue one
    request per URL.
    """
    result = response.css("tr").extract()
    # NOTE(review): zs_result/ts_result are computed but never used here —
    # kept because shan.which(...)[0] raises when a marker is missing, so
    # removal would silently change behavior on malformed pages.
    zs_result = result[shan.which(shan.str_detect("在售", result))[0]:shan.
                       which(shan.str_detect("停售", result))[0]]
    zs_result = shan.str_keep("(寿|保)险", zs_result)
    ts_result = result[shan.which(shan.str_detect("停售", result)
                                  )[0]:len(result)]
    ts_result = shan.str_keep("(寿|保)险", ts_result)
    urls = [
        'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/cpxxp/468a89fa-1.html'
    ]
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'UM_distinctid=16bf88a1698404-01cafbdd88510b-37647e05-13c680-16bf88a169975c; CNZZDATA1274208563=1695324027-1563242927-%7C1563352631',
        'Host': 'www.sunlife-everbright.com',
        'If-Modified-Since': 'Wed, 03 Jul 2019 07:18:28 GMT',
        'If-None-Match': "4a89-58cc1aaf03900-gzip",
        'Referer': 'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/cpxxp/468a89fa-10.html',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    # One Request per URL string (scrapy.Request does not accept a list).
    for url in urls:
        yield scrapy.Request(url=url, headers=header, callback=self.second_parse)
def parse(self, response):
    """Parse Jixiang Life (吉祥人寿); the row's data-isstate attribute encodes
    the sale status (1 = on sale, 2 = discontinued, 3 = pending sale).

    The three previously copy-pasted loops are deduplicated into one
    status-driven loop; yield order (all 在售, then 停售, then 待售) is kept.
    """
    rows = response.css('tr').extract()
    rows = rows[2:]  # skip the two header rows
    status_map = (
        ('data-isstate="1"', "在售"),
        ('data-isstate="2"', "停售"),
        ('data-isstate="3"', "待售"),
    )
    # The filing-report link is identified by its exact anchor markup.
    report_pattern = 'href="(.*)" target="_blank" style="cursor: hand; text-decoration: underline;">\n\t\t\t\t\t\t\t\t\t\t\t\t<font color="blue">其他备案资料</font>'
    for marker, status in status_map:
        for part in shan.str_keep(marker, rows):
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '吉祥人寿'
            item['product_name'] = shan.str_extract(
                'class="td_body">(.*?)</td>', part)
            item['product_sale_status'] = status
            item['product_contract_link'] = (
                "http://www.jxlife.com.cn"
                + shan.str_extract('href="(.*)pdf"', part) + "pdf")
            item['product_official_report_list'] = (
                "http://www.jxlife.com.cn"
                + shan.str_extract(report_pattern, part))
            # Emit the item.
            yield item
def contract_parse(self, response):
    """Attach the clause (条款) PDF link to the item from response.meta.

    When two links mention 条款, the second is the actual clause document;
    the original's no-op branch ``if len(c) == 1: c = c`` has been removed
    (behavior unchanged: a single match is passed through as-is).
    """
    result = response.css(".detail a").extract()
    item = response.meta['item']
    c = shan.str_keep('条款', result)
    if len(c) == 2:
        c = c[1]
    item['product_contract_link'] = (
        "https://www.e-guofu.com"
        + shan.str_extract('href="(.*?)" target', c))
    yield item
def weishou_parse(self, response):
    """Scrape China Post Life (中邮人寿) not-yet-sold products; each product's
    detail page is fetched for its document links via contract_parse."""
    rows = shan.str_keep('险', response.css('.grey2').extract())
    for row in rows:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '中邮人寿'
        item['product_name'] = shan.str_extract(
            '\r\n\t\t\t\t(.*?)\r\n\t\t\t', row)
        item['product_sale_status'] = '未售'
        detail_url = ("http://www.chinapost-life.com"
                      + shan.str_extract('href="(.*)" class', row))
        # Hand the partially-filled item to the detail-page parser.
        yield response.follow(detail_url,
                              callback=self.contract_parse,
                              meta={'item': item})
def contract_parse(self, response):
    """Fill a China Post Life item's document links from its detail page.

    Each document type is located by a keyword in the link text; the link is
    recorded only when it points at a PDF, otherwise the field is left
    empty.  Yields the completed item.
    """
    links = response.css(".articleShowText a").extract()
    item = response.meta['item']
    base = "http://www.chinapost-life.com"
    # (item_field, keyword in link text) — deduplicates the original
    # copy-pasted if/else blocks; assignment order is preserved.
    fields = (
        ('product_official_report_list', '产品'),
        ('product_price_link', '费率'),
        ('product_contract_link', '条款'),
        ('product_pv_full_list_link', '价值表'),
        ('product_chief_actuary_claim_link', '总精算师'),
        ('prodcct_law_response_link', '法律责任人'),  # sic: key typo kept for compatibility
    )
    for key, keyword in fields:
        match = shan.str_keep(keyword, links)
        if 'pdf' in match:
            item[key] = base + shan.str_extract('href="(.*?)"', match)
        else:
            item[key] = ''
    yield item
def zts_parse(self, response):
    """Set the sale status on the item carried in response.meta.

    NOTE(review): the membership test compares an extracted href fragment
    against whole-row HTML strings (list membership, not substring search),
    so it may never match and every row would be marked 停售; also one shared
    item is re-yielded once per row with its status overwritten each time.
    Verify intent before relying on this output.
    """
    result = response.css('tr').extract()
    result = result[1:len(result)]
    # Insurance rows to classify.
    result1 = shan.str_keep('险', result)
    # Rows between the first "在售" marker and the first "停售" marker.
    zs_result = result[shan.which(shan.str_detect("在售", result))[0]:shan.
                       which(shan.str_detect("停售", result))[0]]
    item = response.meta['item']
    for part in result1:
        if shan.str_extract('href="(.*?)pdf', part) in zs_result:
            item['product_sale_status'] = '在售'
        else:
            item['product_sale_status'] = '停售'
        yield item
def parse(self, response):
    """Parse Manulife-Sinochem (中宏人寿) products from the FAQ-style lists."""
    # Containers mentioning 在售 hold on-sale products, 停售 discontinued ones.
    result = response.css(".faq-container-list").extract()
    zs_result = shan.str_keep("在售", result)
    ts_result = shan.str_keep("停售", result)
    # Split each container into per-product fragments.
    zs_result1 = []
    for part in zs_result:
        zs_result1.extend(re.split('div class="item"', part))
    # NOTE(review): these filter patterns use full-width parentheses/pipe,
    # which a regex matches literally; also the on-sale filter is
    # "(寿|保)险" while the discontinued one is "(寿|保)" — confirm both
    # are intended.
    zs_result = shan.str_keep("(寿|保)险", zs_result1)
    ts_result1 = []
    for part in ts_result:
        ts_result1.extend(re.split('div class="item"', part))
    ts_result = shan.str_keep("(寿|保)", ts_result1)
    for part in zs_result:
        # Fill in fields for an on-sale product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '中宏人寿'
        item['product_name'] = shan.str_extract("中宏.*?险", part)
        item['product_sale_status'] = '在售'
        item[
            'product_contract_link'] = "www.manulife-sinochem.com" + shan.str_extract(
                '<a href="(.*)target', part)
        # Emit the item.
        yield item
    for part in ts_result:
        # Fill in fields for a discontinued product.
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '中宏人寿'
        item['product_name'] = shan.str_extract("中宏.*?险", part)
        item['product_sale_status'] = '停售'
        item[
            'product_contract_link'] = "www.manulife-sinochem.com" + shan.str_extract(
                '<a href="(.*)target', part)
        yield item
def parse(self, response):
    """Scrape Guolian Life (国联人寿) product clause links."""
    entries = shan.str_keep('条款', response.css('li').extract())
    # Skip the first kept entry (section heading).
    for entry in entries[1:]:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '国联人寿'
        item['product_name'] = shan.str_extract(
            'target="_blank">(.*?)</a>', entry)
        item['product_sale_status'] = ''
        item['product_contract_link'] = shan.str_extract(
            'href="(.*?)" target=', entry)
        # Emit the item.
        yield item
def zaishou_parse(self, response):
    """Scrape Taikang Pension (泰康养老) on-sale product clauses."""
    entries = shan.str_keep('条款', response.css('li').extract())
    # Skip the first kept entry (section heading).
    for entry in entries[1:]:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '泰康养老'
        item['product_name'] = shan.str_extract('《(.*?)》', entry)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = (
            "http://tkyl.pension.taikang.com"
            + shan.str_extract('href="(.*)" target', entry))
        # Emit the item.
        yield item
def parse(self, response):
    """Scrape Aixin Life (爱心人寿); the sale status text (…售) is read
    straight from the row."""
    rows = shan.str_keep('爱心', response.css('tr').extract())
    for row in rows:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '爱心人寿'
        item['product_name'] = shan.str_extract('>(.*?)</a>', row)
        item['product_sale_status'] = shan.str_extract('\n(.*?)售', row) + "售"
        item['product_contract_link'] = (
            "http://www.aixin-ins.com"
            + shan.str_extract('href="(.*?)pdf', row) + "pdf")
        # Emit the item.
        yield item
def tingshou_parse(self, response):
    """Scrape MetLife China (大都会人寿) discontinued products."""
    links = shan.str_keep('险', response.css('.width-80-authored a').extract())
    for link in links:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '大都会人寿'
        item['product_name'] = shan.str_extract('公司(.*?)</a>', link)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = (
            "https://www.metlife.com.cn"
            + shan.str_extract('href="(.*)pdf', link) + "pdf")
        # Emit the item.
        yield item
def parse(self, response):
    """Scrape Fosun United (复星联合); a 停售 marker in the link text flags a
    discontinued product."""
    rows = shan.str_keep('条款', response.css('p').extract())
    for row in rows:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '复星联合'
        name = shan.str_extract('blank">(.*?)</a>', row)
        # Both original branches trimmed the trailing 条款 suffix the same
        # way; only the status differed.
        item['product_name'] = shan.str_extract('(.*?)条款', name)
        item['product_sale_status'] = '停售' if "停售" in name else '在售'
        item['product_contract_link'] = shan.str_extract(
            'href="(.*?)" target', row)
        # Emit the item.
        yield item
def zaishou_parse(self, response):
    """Scrape PICC Life (人保人寿) on-sale products and follow pagination."""
    links = shan.str_keep('条款', response.css('#jigou_right_k a').extract())
    for link in links:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '人保人寿'
        item['product_name'] = shan.str_extract('>(.*?)条款', link)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = (
            "http://www.picclife.com"
            + shan.str_extract('href="(.*)" title', link))
        # Emit the item.
        yield item
    # Pagination: page ids are quoted inside the page-number icons.
    pager = response.css('.yeshu_icon').extract()
    for page in shan.str_extract("\'(.*?)\'", pager):
        yield response.follow(
            "http://www.picclife.com/IndividualLongrisk/" + page,
            callback=self.zaishou_parse)