def parse(self, response):
    """Parse the Cathay Lujiazui (陆家嘴国泰) product listing page.

    The extracted cells before the first "人寿保险" marker are the on-sale
    section; cells from that marker up to one before the first "zip" marker
    are the off-sale section. Yields one ProjectInsuranceScrapItem per
    product cell containing "国泰".
    """
    cells = response.css('#a2 font , .aproName').extract()
    # First "人寿保险" cell separates on-sale from off-sale; the "zip"
    # marker (minus one) ends the off-sale section.
    split_at = shan.which(shan.str_detect("人寿保险", cells))[0]
    end_at = shan.which(shan.str_detect("zip", cells))[0] - 1
    on_sale = shan.str_keep('国泰', cells[:split_at])
    off_sale = shan.str_keep('国泰', cells[split_at:end_at])
    for section, status in ((on_sale, "在售"), (off_sale, "停售")):
        for cell in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '陆家嘴国泰'
            item['product_name'] = shan.str_extract('>(.*?)</a>', cell)
            item['product_sale_status'] = status
            item['product_contract_link'] = (
                "http://www.cathaylife.cn"
                + shan.str_extract('href="(.*)"', cell))
            yield item
def parse(self, response):
    """Parse the Heng An Standard (恒安标准) product listing page.

    Anchors between the "在售" header and the "停售" header are on-sale;
    anchors after the "停售" header are off-sale. Yields one
    ProjectInsuranceScrapItem per product link.
    """
    rows = response.css('.list_ul a , .list_title').extract()
    on_start = shan.which(shan.str_detect("在售", rows))[0] + 1
    off_header = shan.which(shan.str_detect("停售", rows))[0]
    sections = (
        (rows[on_start:off_header], "在售"),
        (rows[off_header + 1:], "停售"),
    )
    for section, status in sections:
        for row in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '恒安标准'
            item['product_name'] = shan.str_extract('>(.*?)</a>', row)
            item['product_sale_status'] = status
            item['product_contract_link'] = shan.str_extract(
                'href="(.*)" target=', row)
            yield item
def parse(self, response):
    """Parse the Taiping Pension (太平养老) product table.

    Table rows before the *last* row matching "健康保险" are on-sale; that
    row and all later rows are off-sale. Yields one
    ProjectInsuranceScrapItem per row containing "太平".
    """
    rows = response.css('tr').extract()[1:]  # drop the header row
    hits = shan.which(shan.str_detect("健康保险", rows))
    boundary = hits[len(hits) - 1]  # last hit splits the two sections
    on_sale = shan.str_keep('太平', rows[:boundary])
    off_sale = shan.str_keep('太平', rows[boundary:])
    for section, status in ((on_sale, "在售"), (off_sale, "停售")):
        for row in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '太平养老'
            item['product_name'] = shan.str_extract('<td>(.*?)</td>', row)
            item['product_sale_status'] = status
            item['product_contract_link'] = shan.str_extract(
                'href="(.*)"', row)
            yield item
def parse(self, response):
    """Parse the ERGO China (德华安顾) product table.

    The third '.product_right_content' node is split on '<tr>'. Rows from
    the "在售" marker through the "停售" marker are on-sale; later rows are
    off-sale. Contract PDFs and official-report archives (.rar or .zip)
    are rebuilt from "dhag…" path fragments.
    """
    block = response.css('.product_right_content').extract()[2]
    rows = re.split('<tr>', block)
    on_at = shan.which(shan.str_detect("在售", rows))[0]
    off_at = shan.which(shan.str_detect("停售", rows))[0]
    on_sale = shan.str_keep('德华', rows[on_at:off_at + 1])
    off_sale = shan.str_keep('德华', rows[off_at + 1:])
    base = "https://www.ergo-life.cn/dhag"
    for section, status in ((on_sale, "在售"), (off_sale, "停售")):
        for row in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '德华安顾'
            item['product_name'] = shan.str_extract('<td>(.*?)</td>', row)
            item['product_sale_status'] = status
            item['product_contract_link'] = (
                base + shan.str_extract("dhag(.*)pdf", row) + "pdf")
            # Reports are shipped as either .rar or .zip archives.
            ext = "rar" if "rar" in row else "zip"
            item['product_official_report_list'] = (
                base + shan.str_extract("dhag(.*)" + ext, row) + ext)
            yield item
def parse(self, response):
    """Parse the ICBC-AXA (工银安盛) product listing page.

    Cells between the first "在售" marker and the first "停售" marker are
    on-sale; cells from the "停售" marker to the *second* "在售" marker are
    off-sale. Yields one ProjectInsuranceScrapItem per product anchor.

    Bug fix: the on-sale slice start was written as
    ``shan.which(shan.str_detect("在售", result)[0])`` — the ``[0]`` indexed
    the detect vector *before* ``which`` ran, unlike every sibling spider
    in this file. Corrected to ``shan.which(shan.str_detect(...))[0]``.
    """
    cells = response.css("p , #content a , .h2_title1").extract()[2:]
    on_hits = shan.which(shan.str_detect("在售", cells))
    off_hits = shan.which(shan.str_detect("停售", cells))
    # Product anchors carry this inline style; headers/prose do not.
    on_sale = shan.str_keep('style="color:#626263;"',
                            cells[on_hits[0]:off_hits[0]])
    off_sale = shan.str_keep('style="color:#626263;"',
                             cells[off_hits[0]:on_hits[1]])
    for section, status in ((on_sale, '在售'), (off_sale, '停售')):
        for cell in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '工银安盛'
            item['product_type'] = ''
            item['product_name'] = shan.str_extract(">(.*?)</a>", cell)
            item['product_sale_status'] = status
            item['product_contract_link'] = (
                "www.icbc-axa.com"
                + shan.str_extract('href="(.*?)pdf', cell) + "pdf")
            item['product_price_link'] = ''
            item['product_start_date'] = ''
            item['product_end_date'] = ''
            yield item
def parse(self, response):
    """Parse the BoComm Life (交银康联) product table.

    Keeps only rows mentioning "险", splits them at the "在售"/"停售"
    markers, and yields one ProjectInsuranceScrapItem per "交银" row.
    Rider products are prefixed "附加交银", plain products "交银".

    Bug fix: the off-sale loop extracted rider names with the pattern
    ``'附加交银(.*?)<'`` while the on-sale loop used ``'附加交银(.*?)</'``;
    the lone ``<`` stops at the first inline tag and can truncate the
    name. Both branches now use the ``</`` pattern.
    """
    rows = shan.str_keep("险", response.css("tr").extract())
    on_at = shan.which(shan.str_detect("在售", rows))[0]
    off_at = shan.which(shan.str_detect("停售", rows))[0]
    on_sale = shan.str_keep('交银', rows[on_at:off_at])
    off_sale = shan.str_keep('交银', rows[off_at:])
    for section, status in ((on_sale, '在售'), (off_sale, '停售')):
        for row in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '交银康联'
            # Riders carry the longer "附加交银" prefix; match it first.
            if "附加交银" in row:
                item['product_name'] = "附加交银" + shan.str_extract(
                    '附加交银(.*?)</', row)
            else:
                item['product_name'] = "交银" + shan.str_extract(
                    '交银(.*?)</', row)
            item['product_sale_status'] = status
            item['product_contract_link'] = (
                "www.bocommlife.com"
                + shan.str_extract('href="(.*?)">', row))
            yield item
def parse(self, response):
    """Parse the Hexie Health (和谐健康) on-sale product catalogue.

    The raw page text is split on "tr"; the chunks between the first and
    second "在售产品目录及条款" markers form the catalogue. Every listed
    product is on-sale.
    """
    chunks = re.split("tr", response.text)
    marks = shan.which(shan.str_detect("在售产品目录及条款", chunks))
    catalogue = shan.str_keep('和谐', chunks[marks[0]:marks[1]])
    for chunk in catalogue[1:]:  # skip the section heading itself
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '和谐健康'
        item['product_name'] = "和谐" + shan.str_extract(
            '和谐(.*?)</span>', chunk)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = (
            "http://www.hexiehealth.com/docs"
            + shan.str_extract('/docs(.*?)pdf"', chunk) + "pdf")
        yield item
def parse(self, response):
    """Parse the Happy Life (幸福人寿) product listing.

    Paragraphs between the "表一" and "表二" markers are on-sale; those
    from "表三" onward are off-sale. Some on-sale names embed an extra
    "(<font …>)" fragment which is stripped before the "幸福" prefix is
    re-attached.
    """
    paragraphs = response.css('P').extract()
    on_start = shan.which(shan.str_detect("表一", paragraphs))[0]
    on_stop = shan.which(shan.str_detect("表二", paragraphs))[0]
    off_start = shan.which(shan.str_detect("表三", paragraphs))[0]
    # Keep only rows mentioning "险" and drop each section's heading row.
    on_sale = shan.str_keep('险', paragraphs[on_start:on_stop])[1:]
    off_sale = shan.str_keep('险', paragraphs[off_start:])[1:]
    for para in on_sale:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '幸福人寿'
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = shan.str_extract(
            'href="(.*?)">', para)
        raw_name = shan.str_extract('幸福(.*?)</a>', para)
        # Strip the embedded "(<font …>)" annotation when present.
        if "(<" in raw_name:
            item['product_name'] = "幸福" + shan.str_extract(
                ')(.*?)</font>', raw_name)
        else:
            item['product_name'] = "幸福" + raw_name
        yield item
    for para in off_sale:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '幸福人寿'
        item['product_name'] = "幸福" + shan.str_extract(
            '幸福(.*?)</a>', para)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = shan.str_extract(
            'href="(.*?)">', para)
        yield item
def first_parse(self, response):
    """Split rows into on-/off-sale sections and queue the next page.

    Bug fix: ``scrapy.Request`` requires ``url`` to be a single string;
    the original passed the whole ``urls`` list, which raises TypeError
    when the request is constructed. Now passes ``urls[0]``.
    """
    result = response.css("tr").extract()
    # NOTE(review): zs_result/ts_result are computed but never used in this
    # method — presumably meant to be forwarded to second_parse via
    # Request.meta; confirm against second_parse before removing.
    zs_result = result[shan.which(shan.str_detect("在售", result))[0]:
                       shan.which(shan.str_detect("停售", result))[0]]
    zs_result = shan.str_keep("(寿|保)险", zs_result)
    ts_result = result[shan.which(shan.str_detect("停售", result))[0]:
                       len(result)]
    ts_result = shan.str_keep("(寿|保)险", ts_result)
    urls = [
        'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/cpxxp/468a89fa-1.html'
    ]
    # Browser-like headers copied from a real session; presumably the site
    # rejects bare requests — TODO confirm which headers are required.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'UM_distinctid=16bf88a1698404-01cafbdd88510b-37647e05-13c680-16bf88a169975c; CNZZDATA1274208563=1695324027-1563242927-%7C1563352631',
        'Host': 'www.sunlife-everbright.com',
        'If-Modified-Since': 'Wed, 03 Jul 2019 07:18:28 GMT',
        'If-None-Match': "4a89-58cc1aaf03900-gzip",
        'Referer': 'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/cpxxp/468a89fa-10.html',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    # url must be a single string, not the list itself.
    yield scrapy.Request(url=urls[0], headers=header,
                         callback=self.second_parse)
def zts_parse(self, response):
    """Mark the item carried in ``response.meta`` as on-sale or off-sale.

    Bug fix: the original condition was
    ``shan.str_extract('href="(.*?)pdf', part) in zs_result`` — ``in`` on a
    list tests element *equality*, and an extracted href fragment can never
    equal a full ``<tr>…</tr>`` row string, so the branch was always False
    and every item was marked "停售". Each ``part`` is itself an element of
    the extracted row list, so the correct membership test is
    ``part in zs_result``.
    """
    rows = response.css('tr').extract()[1:]  # drop the header row
    product_rows = shan.str_keep('险', rows)
    # Rows between the "在售" and "停售" markers form the on-sale section.
    on_sale = rows[shan.which(shan.str_detect("在售", rows))[0]:
                   shan.which(shan.str_detect("停售", rows))[0]]
    item = response.meta['item']
    # NOTE(review): the same item instance is mutated and yielded once per
    # row, so downstream consumers see the last-written status repeated —
    # confirm whether a fresh/copied item per row was intended.
    for row in product_rows:
        if row in on_sale:
            item['product_sale_status'] = '在售'
        else:
            item['product_sale_status'] = '停售'
        yield item
def parse(self, response):
    """Parse the Zhongrong Life (中融人寿) product table.

    Rows between the "在售" and "停售" markers are on-sale; later rows are
    off-sale. Each row is split on '</td>' and, depending on whether it has
    7 or 6 cells, four document links are pulled from fixed cell positions.
    Rows with any other cell count get no link fields.
    """
    rows = response.css('tr').extract()
    on_at = shan.which(shan.str_detect("在售", rows))[0]
    off_at = shan.which(shan.str_detect("停售", rows))[0]
    on_sale = shan.str_keep('中融', rows[on_at:off_at])
    off_sale = shan.str_keep('中融', rows[off_at:])
    # NOTE: 'prodcct_law_response_link' is the field name declared for this
    # item (typo and all) — do not "fix" it here.
    link_keys = ('product_official_report_list', 'product_contract_link',
                 'product_chief_actuary_claim_link',
                 'prodcct_law_response_link')
    for section, status in ((on_sale, "在售"), (off_sale, "停售")):
        for row in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '中融人寿'
            item['product_name'] = '中融' + shan.str_extract(
                '中融(.*?)</td>', row)
            item['product_sale_status'] = status
            cells = re.split('</td>', row)
            # 7-cell rows carry an extra column between the contract and
            # actuary-claim links; 6-cell rows do not.
            if len(cells) == 7:
                positions = (1, 2, 4, 5)
            elif len(cells) == 6:
                positions = (1, 2, 3, 4)
            else:
                positions = ()
            for key, pos in zip(link_keys, positions):
                item[key] = shan.str_extract(
                    'href="(.*?)" target', cells[pos])
            yield item