def tingshou_parse(self, response):
    # Extract data from each row
    result = response.css('.dis_proboxul a').extract()
    for part in result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '农银人寿'
        name = shan.str_extract('>·(.*?)</a>', part)
        if "条款" in name:
            item['product_name'] = shan.str_extract('(.*?)条款', name)
        elif "产品说明书" in name:
            item['product_name'] = shan.str_extract('(.*?)产品', name)
        else:
            item['product_name'] = name
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = "http://www.abchinalife.cn" + shan.str_extract('href="(.*)" target', part)
        # Yield the item
        yield item
    # Follow each value in the page's <option> list to the remaining pages
    a = response.css('option').extract()
    b = shan.str_extract('value="(.*?)">', a)
    b = b[1:len(b)]
    for part in b:
        yield response.follow("http://www.abchinalife.cn" + part,
                              callback=self.tingshou_parse)
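# All of the callbacks in this file assume module-level imports of `re`, `scrapy`,
# the project's `ProjectInsuranceScrapItem`, and a small `shan` string-helper
# module. The real `shan` helpers are not reproduced here; the sketch below is
# an assumption inferred from how they are called (stringr-style extract / keep /
# drop / detect / which), not the actual implementation, and the `_sketch_`
# names are hypothetical.
import re

def _sketch_str_extract(pattern, text):
    # First capture group for a single string; one result per matching element for a list.
    if isinstance(text, str):
        match = re.search(pattern, text)
        return match.group(1) if match else ""
    return [m.group(1) for m in (re.search(pattern, s) for s in text) if m]

def _sketch_str_keep(pattern, text):
    # Keep only the elements that contain `pattern`; a single string is returned
    # unchanged when it matches, otherwise an empty string.
    if isinstance(text, str):
        return text if re.search(pattern, text) else ""
    return [s for s in text if re.search(pattern, s)]

def _sketch_str_drop(pattern, text):
    # Drop the elements that contain `pattern`.
    return [s for s in text if not re.search(pattern, s)]

def _sketch_str_detect(pattern, text):
    # One boolean per element: does it contain `pattern`?
    return [bool(re.search(pattern, s)) for s in text]

def _sketch_which(flags):
    # Indices of the True entries (R-style `which`, 0-based).
    return [i for i, flag in enumerate(flags) if flag]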
def parse(self, response):
    # Extract data from each row
    result = response.css('#a2 font , .aproName').extract()
    # Split the rows into on-sale and discontinued blocks around the "人寿保险" marker
    zs_result = result[0:shan.which(shan.str_detect("人寿保险", result))[0]]
    ts_result = result[shan.which(shan.str_detect("人寿保险", result))[0]:(
        shan.which(shan.str_detect("zip", result))[0] - 1)]
    zs_result = shan.str_keep('国泰', zs_result)
    ts_result = shan.str_keep('国泰', ts_result)
    for part in zs_result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '陆家嘴国泰'
        item['product_name'] = shan.str_extract('>(.*?)</a>', part)
        item['product_sale_status'] = "在售"
        item['product_contract_link'] = "http://www.cathaylife.cn" + shan.str_extract('href="(.*)"', part)
        # Yield the item
        yield item
    for part in ts_result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '陆家嘴国泰'
        item['product_name'] = shan.str_extract('>(.*?)</a>', part)
        item['product_sale_status'] = "停售"
        item['product_contract_link'] = "http://www.cathaylife.cn" + shan.str_extract('href="(.*)"', part)
        # Yield the item
        yield item
def tingshou_parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = shan.str_drop('停售', result)
    result = shan.str_keep('险', result)
    for part in result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '长生人寿'
        name = shan.str_extract('">(.*?)</td>', part)
        if "<" in name:
            item['product_name'] = shan.str_extract('>(.*?)<', name)
        else:
            item['product_name'] = name
        link = shan.str_extract('href="(.*)pdf"', part)
        if "http" in link:
            item['product_contract_link'] = link + "pdf"
        else:
            item['product_contract_link'] = "http://www.gwcslife.com" + shan.str_extract('href="(.*)pdf"', part) + "pdf"
        item['product_sale_status'] = '停售'
        # Yield the item
        yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('.news_list a').extract()
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['product_id'] = shan.str_extract('title="(.*?)平安', part)
        item['company_name'] = '平安养老'
        name = shan.str_extract('平安(.*?)</a>', part)
        if "条款" in name:
            item['product_name'] = "平安" + shan.str_extract('(.*?)条款', name)
        else:
            item['product_name'] = "平安" + name
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = shan.str_extract('href="(.*?)" title', part)
        # Yield the item
        yield item
    # Find the next pages
    page = response.css('li.page').extract()
    next_pages = shan.str_extract(
        r"/px/informationDisclosure/insuranceProductList_\d+[.]shtml", page)
    for next_page in next_pages:
        yield response.follow("http://yl.pingan.com" + next_page,
                              callback=self.parse)
def parse(self, response): # 从每一行抽取数据 result = response.css(".ts_product") zs_result = result[0].css("tr").getall() zs_result = shan.str_keep("条款PDF文档", zs_result) for part in zs_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '太平人寿' item['product_name'] = shan.str_extract('<td>(.*)</td>', part) item['product_sale_status'] = '在售' item['product_contract_link'] = shan.str_extract( 'href="(.*)?">', part) # 输出数据 yield item ts_result = result[1].css("tr").getall() ts_result = shan.str_keep("条款PDF文档", ts_result) for part in ts_result: # 停售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '太平人寿' item['product_name'] = shan.str_extract('<td>(.*)</td>', part) item['product_sale_status'] = '停售' item['product_contract_link'] = shan.str_extract( 'href="(.*)?">', part) # 输出数据 yield item
def parse(self, response): # 从每一行抽取数据 result = response.css("tr").extract() zs_result = shan.str_keep('class="STYLE14"', result) ts_result = shan.str_keep('class="STYLE15"', result) for part in zs_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() part = re.findall('<td>(.*?)</td>', part) item['company_name'] = '富德生命' item['product_name'] = part[1] item['product_sale_status'] = '在售' item[ 'product_contract_link'] = "https://www.sino-life.com" + shan.str_keep( 'upload', shan.str_extract('href="(.*)pdf', part[4])) + "pdf" item['product_start_date'] = part[2] # 输出数据 yield item for part in ts_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() part = re.findall('<td>(.*?)</td>', part) item['company_name'] = '富德生命' item['product_name'] = part[1] item['product_sale_status'] = '停售' item[ 'product_contract_link'] = "https://www.sino-life.com" + shan.str_keep( 'upload', shan.str_extract('href="(.*)pdf', part[4])) + "pdf" item['product_start_date'] = part[2] # 输出数据 yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('.bxContent').extract()
    result = re.split('查看', result[0])
    result = shan.str_keep('国华', result)
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '国华人寿'
        item['product_name'] = "国华" + shan.str_extract('国华(.*?)</p>', part)
        item['product_sale_status'] = ''
        link = shan.str_extract('href="(.*?)" target', part)
        if "content" not in link:
            # Indirect link: follow the intro page and extract the contract there
            contract_link = "http://www.95549.cn/pages/intro/" + link
            yield response.follow(contract_link,
                                  callback=self.contract_parse,
                                  meta={'item': item})
        else:
            # Direct link: rebuild the absolute URL from the relative "../" path
            link1 = link + "z"
            item['product_contract_link'] = "http://www.95549.cn/pages/" + shan.str_extract('../(.*?)z', link1)
            # Yield the item
            yield item
def tingshou_parse(self, response): # 从每一行抽取数据 result = response.css(".pi-pubinfo") result1 = result.css("li").extract() for part in result1: # 停售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '民生人寿' a = shan.str_keep('class="pi-pls-prodname off-float-left"', part) item['product_type'] = '' item['product_id'] = '' item['product_name'] = shan.str_extract('>(.*?)<', a) item['product_sale_status'] = '停售' b = shan.str_keep('class="dsm-choise-zoon dsm-none"', part) item[ 'product_contract_link'] = "http://www.minshenglife.com" + shan.str_extract( 'href="(.*?)">', b) item['product_price_link'] = '' item['product_start_date'] = '' item['product_end_date'] = '' # 输出数据 yield item # 找到下一页的代码 next_pages = re.findall("index_\d+[.]shtml", response.text) for next_page in next_pages: yield response.follow(next_page, callback=self.tingshou_parse)
def parse(self, response):
    # Extract data from each row
    result = response.css('.list-content a').extract()
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '中法人寿'
        name = shan.str_extract('>·(.*?)<span>', part)
        if "目录" in name:
            item['product_name'] = name
            item['product_sale_status'] = '停售'
        elif "产品说明书" in name:
            item['product_name'] = shan.str_extract('(.*?)产品说明书', name)
            item['product_sale_status'] = '在售'
        else:
            item['product_name'] = name
            item['product_sale_status'] = '在售'
        item['product_contract_link'] = "http://www.sfli.com.cn" + shan.str_extract('href="(.*?)" target', part)
        # Yield the item
        yield item
    # Follow each value in the page's <option> list to the remaining pages
    a = response.css('option').extract()
    b = shan.str_extract('value="(.*?)">', a)
    b = b[1:len(b)]
    for part in b:
        yield response.follow("http://www.sfli.com.cn" + part,
                              callback=self.parse)
def tingshou_parse(self, response):
    # Extract data from each row
    result = response.css('p').extract()
    result = shan.str_keep('信美', result)
    for part in result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '信美相互'
        item['product_name'] = shan.str_extract(
            '\r\n\t\t\t\t\t\t\t\t\t\t\t(.*?)\r\n\t\t\t\t\t\t\t\t\t\t\t</span>', part)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = shan.str_extract('href="(.*)" target', part)
        # Yield the item
        yield item
    # Follow the pagination buttons to the remaining discontinued-product pages
    a = response.css('button').extract()
    b = shan.str_extract('value="(.*?)" onclick', a)
    b = b[2:(len(b) - 1)]
    for part in b:
        yield response.follow(
            "https://www.trustlife.com/cms/html/productClauseStop/index_" + part + ".html",
            callback=self.tingshou_parse)
def contract_parse(self, response): result = response.css(".articleShowText a").extract() item = response.meta['item'] a = shan.str_keep('产品',result) if 'pdf' in a: item['product_official_report_list'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',a) else: item['product_official_report_list'] = '' b = shan.str_keep('费率',result) if 'pdf' in b: item['product_price_link'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',b) else: item['product_price_link'] ='' c = shan.str_keep('条款',result) if 'pdf' in c: item['product_contract_link'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',c) else: item['product_contract_link'] = '' d = shan.str_keep('价值表',result) if 'pdf' in d: item['product_pv_full_list_link'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',d) else: item['product_pv_full_list_link'] = '' f = shan.str_keep('总精算师',result) if 'pdf' in f: item['product_chief_actuary_claim_link'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',f) else: item['product_chief_actuary_claim_link'] ='' g = shan.str_keep('法律责任人',result) if 'pdf' in g: item['prodcct_law_response_link'] = "http://www.chinapost-life.com" + shan.str_extract('href="(.*?)"',g) else: item['prodcct_law_response_link'] = '' yield item
def parse(self, response): # 从每一行抽取数据 result = response.css('tr').extract() result = result[1:len(result)] a = shan.str_detect("健康保险", result) zs_result = result[0:shan.which(a)[len(shan.which(a)) - 1]] ts_result = result[shan.which(a)[len(shan.which(a)) - 1]:(len(result))] zs_result = shan.str_keep('太平', zs_result) ts_result = shan.str_keep('太平', ts_result) for part in zs_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '太平养老' item['product_name'] = shan.str_extract('<td>(.*?)</td>', part) item['product_sale_status'] = "在售" item['product_contract_link'] = shan.str_extract( 'href="(.*)"', part) # 输出数据 yield item for part in ts_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '太平养老' item['product_name'] = shan.str_extract('<td>(.*?)</td>', part) item['product_sale_status'] = "停售" item['product_contract_link'] = shan.str_extract( 'href="(.*)"', part) # 输出数据 yield item
def contract_parse(self, response): result = response.css("tr") result = result[1:len(result)].extract() item = response.meta['item'] a = shan.str_keep('材料清单', result) if 'pdf' in a: item[ 'product_official_report_list'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', a) else: item['product_official_report_list'] = '' b = shan.str_keep('费率', result) if 'pdf' in b: item[ 'product_price_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', b) else: item['product_price_link'] = '' c = shan.str_keep('条款', result) if 'pdf' in c: item[ 'product_contract_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', c) else: item['product_contract_link'] = '' d = shan.str_keep('价值表(全表)', result) if 'xlsx' in d: item[ 'product_pv_full_list_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', d) else: item['product_pv_full_list_link'] = '' e = shan.str_keep('价值表(示例)', result) if 'pdf' in e: item[ 'product_pv_example_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', e) else: item['product_pv_example_link'] = '' f = shan.str_keep('总精算师', result) if 'pdf' in f: item[ 'product_chief_actuary_claim_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', f) else: item['product_chief_actuary_claim_link'] = '' g = shan.str_keep('法律责任人', result) if 'pdf' in g: item[ 'prodcct_law_response_link'] = "http://www.sunlife-everbright.com" + shan.str_extract( 'href="(.*?)"', g) else: item['prodcct_law_response_link'] = '' sale_status_url = [ 'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/jydbxcpmljtk/index.html', ] yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('.product_right_content').extract()
    result = result[2]
    result = re.split('<tr>', result)
    # Rows between the "在售" and "停售" headers are on sale; the rest are discontinued
    zs_result = result[shan.which(shan.str_detect("在售", result))[0]:(
        shan.which(shan.str_detect("停售", result))[0] + 1)]
    ts_result = result[(shan.which(shan.str_detect("停售", result))[0] + 1):len(result)]
    zs_result = shan.str_keep('德华', zs_result)
    ts_result = shan.str_keep('德华', ts_result)
    for part in zs_result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '德华安顾'
        item['product_name'] = shan.str_extract('<td>(.*?)</td>', part)
        item['product_sale_status'] = "在售"
        item['product_contract_link'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)pdf", part) + "pdf"
        if "rar" in part:
            item['product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)rar", part) + "rar"
        else:
            item['product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)zip", part) + "zip"
        # Yield the item
        yield item
    for part in ts_result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '德华安顾'
        item['product_name'] = shan.str_extract('<td>(.*?)</td>', part)
        item['product_sale_status'] = "停售"
        item['product_contract_link'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)pdf", part) + "pdf"
        if "rar" in part:
            item['product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)rar", part) + "rar"
        else:
            item['product_official_report_list'] = "https://www.ergo-life.cn/dhag" + shan.str_extract("dhag(.*)zip", part) + "zip"
        # Yield the item
        yield item
def weishou_parse(self, response):
    # Extract data from each row
    result = response.css('.grey2').extract()
    result = shan.str_keep('险', result)
    for part in result:
        # Fill in the not-yet-on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '中邮人寿'
        item['product_name'] = shan.str_extract('\r\n\t\t\t\t(.*?)\r\n\t\t\t', part)
        item['product_sale_status'] = '未售'
        # Follow the detail page to collect the document links
        contract_link = "http://www.chinapost-life.com" + shan.str_extract('href="(.*)" class', part)
        yield response.follow(contract_link,
                              callback=self.contract_parse,
                              meta={'item': item})
def contract_parse(self, response): result = response.css("tr") result = result[1:len(result)].extract() for part in result: item = response.meta['item'] part = re.findall('<td>(.*)</td>', part) item['product_name'] = shan.str_extract('>(.*?)</a>', part[1]) item['product_special_status'] = part[2] item['product_contract_link'] = shan.str_extract( 'href="(.*?)"', part[1]) # 输出数据 yield item
def zaishou_parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = result[1:len(result)]
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '前海人寿'
        item['product_name'] = shan.str_extract('\t\t\t\t\t(.*?)\t\t\t\t</td>', part)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = "https://www.foresealife.com" + shan.str_extract(
            'href="(.*)">点击查看</a> \t\t\t\t</td>\t\t\t\t<td style="text-align:center;">', part)
        # Yield the item
        yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = result[1:len(result)]
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '北京人寿'
        item['product_name'] = "北京" + shan.str_extract('北京(.*?)</', part)
        item['product_sale_status'] = ''
        item['product_contract_link'] = shan.str_extract('href="(.*?)" target', part)
        # Yield the item
        yield item
def parse(self, response): # 从每一行抽取数据 result = response.css("p , #content a , .h2_title1").extract() result = result[2:len(result)] zs_result = result[shan.which(shan.str_detect("在售", result)[0]):shan. which(shan.str_detect("停售", result))[0]] ts_result = result[shan.which(shan.str_detect("停售", result))[0]:shan. which(shan.str_detect("在售", result))[1]] zs_result = shan.str_keep('style="color:#626263;"', zs_result) ts_result = shan.str_keep('style="color:#626263;"', ts_result) for part in zs_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '工银安盛' item['product_type'] = '' item['product_name'] = shan.str_extract(">(.*?)</a>", part) item['product_sale_status'] = '在售' item[ 'product_contract_link'] = "www.icbc-axa.com" + shan.str_extract( 'href="(.*?)pdf', part) + "pdf" item['product_price_link'] = '' item['product_start_date'] = '' item['product_end_date'] = '' # 输出数据 yield item for part in ts_result: # 在售保险的内容输入 item = ProjectInsuranceScrapItem() item['company_name'] = '工银安盛' item['product_type'] = '' item['product_name'] = shan.str_extract(">(.*?)</a>", part) item['product_sale_status'] = '停售' item[ 'product_contract_link'] = "www.icbc-axa.com" + shan.str_extract( 'href="(.*?)pdf', part) + "pdf" item['product_price_link'] = '' item['product_start_date'] = '' item['product_end_date'] = '' # 输出数据 yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('li').extract()
    result = shan.str_keep('条款', result)
    result = result[1:len(result)]
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '国联人寿'
        item['product_name'] = shan.str_extract('target="_blank">(.*?)</a>', part)
        item['product_sale_status'] = ''
        item['product_contract_link'] = shan.str_extract('href="(.*?)" target=', part)
        # Yield the item
        yield item
def zaishou_parse(self, response):
    # Extract data from each row
    result = response.css('.width-100-authored a').extract()
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '大都会人寿'
        item['product_name'] = shan.str_extract('公司(.*?)</a>', part)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = "https://www.metlife.com.cn" + shan.str_extract('href="(.*)zip', part) + "zip"
        # Yield the item
        yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = result[1:len(result)]
    for part in result:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '渤海人寿'
        item['product_name'] = shan.str_extract("渤海(.*?)</p>", part)
        item['product_sale_status'] = shan.str_extract(r">(\S+)售", part) + "售"
        item['product_contract_link'] = "http://www.bohailife.net" + shan.str_extract('href="(.*?)pdf', part) + "pdf"
        # Yield the item
        yield item
def tingshou_parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = result[1:len(result)]
    for part in result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '合众人寿'
        item['product_name'] = "合众" + shan.str_extract('合众(.*?)<', part)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = "http://www.unionlife.com.cn" + shan.str_extract('href="(.*)" style', part)
        # Yield the item
        yield item
def tingshou_parse(self, response):
    # Extract data from each row
    result = response.css('#ess_contentpane a').extract()
    result = result[1:len(result)]
    for part in result:
        # Fill in the discontinued-product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '人保健康'
        item['product_name'] = shan.str_extract('>(.*?)</a>', part)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = "http://www.picchealth.com" + shan.str_extract('href="(.*)" id', part)
        # Yield the item
        yield item
def zaishou_parse(self, response):
    # Extract data from each row
    result = response.css('li').extract()
    result = shan.str_keep('条款', result)
    result = result[1:len(result)]
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '泰康养老'
        item['product_name'] = shan.str_extract('《(.*?)》', part)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = "http://tkyl.pension.taikang.com" + shan.str_extract('href="(.*)" target', part)
        # Yield the item
        yield item
def parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = shan.str_keep('爱心', result)
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '爱心人寿'
        item['product_name'] = shan.str_extract('>(.*?)</a>', part)
        item['product_sale_status'] = shan.str_extract('\n(.*?)售', part) + "售"
        item['product_contract_link'] = "http://www.aixin-ins.com" + shan.str_extract('href="(.*?)pdf', part) + "pdf"
        # Yield the item
        yield item
def second_parse(self, response):
    # Extract data from each row
    result = response.css('.news_list a').extract()
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '光大永明'
        item['product_name'] = shan.str_extract('title="(.*?)"', part)
        # NOTE: this stores a Request object in the field; the request is never
        # yielded, so zts_parse is not actually scheduled from here
        item['product_sale_status'] = scrapy.Request(
            'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/jydbxcpmljtk/index.html',
            callback=self.zts_parse,
            meta={'item': item})
        # Follow the detail page to collect the document links
        contract_link = re.findall('href="(.*?)" ', part)[0]
        contract_link = "http://www.sunlife-everbright.com" + contract_link
        yield response.follow(contract_link,
                              callback=self.contract_parse,
                              meta={'item': item})
    # Find the next pages
    a = str(response.css('.pagingNormal').extract())
    next_pages = re.findall(
        r"/sleb/info/jbxx/cpjbxx/cpxxp/468a89fa-\d+[.]html", a)
    for next_page in next_pages:
        yield response.follow("http://www.sunlife-everbright.com" + next_page,
                              callback=self.second_parse)
def parse(self, response):
    # Extract data from each row
    result = response.css('tr').extract()
    result = result[1:len(result)]
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '安邦人寿'
        item['product_name'] = "安邦" + shan.str_extract('安邦(.*?)</span>', part)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = "http://www.anbang-life.com" + shan.str_extract('href="../../..(.*?)">', part)
        # Yield the item
        yield item
def zaishou_parse(self, response):
    # Extract data from each row
    result = response.css('#jigou_right_k a').extract()
    result = shan.str_keep('条款', result)
    for part in result:
        # Fill in the on-sale product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '人保人寿'
        item['product_name'] = shan.str_extract('>(.*?)条款', part)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = "http://www.picclife.com" + shan.str_extract('href="(.*)" title', part)
        # Yield the item
        yield item
    # Follow the pagination links to the remaining pages
    a = response.css('.yeshu_icon').extract()
    b = shan.str_extract("'(.*?)'", a)
    for part in b:
        yield response.follow("http://www.picclife.com/IndividualLongrisk/" + part,
                              callback=self.zaishou_parse)
def tx_parse(self, response):
    # Extract data from each row
    result = response.css('.li_content').extract()
    for part in result:
        # Fill in the product fields
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '中英人寿'
        item['product_name'] = "中英" + shan.str_extract('中英(.*?)</a>', part)
        item['product_sale_status'] = ""
        item['product_contract_link'] = "http://www.aviva-cofco.com.cn" + shan.str_extract('href="(.*)"', part)
        # Yield the item
        yield item
    # Find the next pages
    next_pages = re.findall(r"list-\d+[.]shtml", response.text)
    next_pages = next_pages[0:(len(next_pages) - 1)]
    for next_page in next_pages:
        yield response.follow(next_page, callback=self.tx_parse)
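# Each of the callbacks above belongs to a per-company scrapy.Spider class that
# is not included in this excerpt. A minimal wrapper showing how one of them is
# wired up (a sketch only: the class name, spider name, and start URL below are
# illustrative placeholders, not values taken from the project; `shan` and
# `ProjectInsuranceScrapItem` are assumed to be importable as in the rest of
# the file):
import scrapy

class ExampleCompanySpider(scrapy.Spider):
    name = 'example_company'  # hypothetical spider name
    start_urls = ['http://example.com/product-list/index.html']  # hypothetical listing page

    def parse(self, response):
        # One ProjectInsuranceScrapItem per product row, as in the callbacks above.
        for row in response.css('tr').extract():
            item = ProjectInsuranceScrapItem()
            item['company_name'] = 'Example'  # hypothetical value
            item['product_name'] = shan.str_extract('<td>(.*?)</td>', row)
            yield item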