def parse(self, response):
    """Scrape STO Express type/service names and service detail sections.

    For the start URL, each navigation group yields one ``TypeItem`` per
    service link, and every link collected in ``self.links`` is requested
    with this method as the callback.  The ``product_send`` block of the
    response is then converted into ``ServiceItem`` objects.

    NOTE(review): the source reached review with its indentation stripped.
    The nesting here is reconstructed from statement order; in particular
    the detail extraction is not inside an ``else`` and therefore also runs
    for the start URL.  Confirm against the upstream file.
    """
    prefix = '申通快递-'
    excluded_service = prefix + '开放平台'

    if response.url == self.start_urls[0]:
        groups = response.xpath(
            '//div[@class = "main_part nav_product_service clearfix"]/div')
        for group in groups:
            typeItem = TypeItem()
            group_title = group.xpath('./label/text()').extract()[0]
            typeItem['typeName'] = prefix + group_title
            for anchor in group.xpath('.//div/a'):
                # The link is queued even when the entry itself is skipped.
                self.links.append(anchor.xpath('./@href').extract()[0])
                typeItem['serviceName'] = prefix + anchor.xpath(
                    './text()').extract()[0]
                if typeItem['serviceName'] != excluded_service:
                    yield typeItem
        for href in self.links:
            target = urllib.parse.urljoin('http://www.sto.cn', href)
            yield scrapy.Request(target, callback=self.parse)

    item = ServiceItem()
    content = response.xpath('//div[@class = "product_send"]')
    heading = content.xpath(
        './div[@class = "cont_title"]/text()').extract()[0]

    # Delivery-time headings get a descriptive suffix appended.
    suffix_by_heading = {'24小时': '次日达', '48小时': '隔日达', '72小时': '件'}
    suffix = suffix_by_heading.get(heading, '')
    if heading == '申通打印专家':
        heading = '打印专家'

    item['serviceName'] = prefix + heading + suffix
    item['serviceItemName'] = content.xpath('./h4/text()').extract()[0]
    item['serviceItemDesc'] = Extract.extractNodeText(content.xpath('./p'))
    yield item

    # The first nested div is skipped; every later div with an <h4> title
    # and non-empty paragraph text becomes a further item.
    for section in content.xpath('.//div')[1:]:
        heading_node = section.xpath('./h4')
        if not heading_node:
            continue
        item['serviceItemName'] = heading_node.xpath('./text()').extract()[0]
        description = ''.join(
            Extract.extractNodeText(p) for p in section.xpath('.//p'))
        if description == '':
            continue
        item['serviceItemDesc'] = description
        yield item
def parse(self, response):
    """Scrape EMS type/service names and per-service description sections.

    Start URL: yields a ``TypeItem`` for every linked menu entry plus two
    hard-coded logistics services, then requests every collected link.
    Any other URL: walks the children of the content container and yields
    a ``ServiceItem`` per titled section.

    NOTE(review): indentation was lost in the source; nesting is
    reconstructed from statement order and should be confirmed upstream.
    """
    if response.url == self.start_urls[0]:
        menu_entries = response.xpath('//ul[@class = "list_menu"]/li')
        typeItem = TypeItem()
        for menu in menu_entries:
            typeItem['typeName'] = self.prefix + menu.xpath(
                './div/span/text()').extract()[0]
            for entry in menu.xpath('./ul/li'):
                anchor = entry.xpath('./div/a')
                if not anchor:
                    continue
                self.links.append(anchor.xpath('./@href').extract()[0])
                typeItem['serviceName'] = self.prefix + anchor.xpath(
                    './text()').extract()[0]
                if typeItem['serviceName'] == self.prefix + '鲜花礼仪':
                    typeItem['serviceName'] = self.prefix + '国内特快专递礼仪业务'
                yield typeItem

        # Two logistics services that are emitted without a menu link.
        typeItem['typeName'] = self.prefix + '物流业务'
        for fixed_service in ('合同物流', '国际货代'):
            typeItem['serviceName'] = self.prefix + fixed_service
            yield typeItem

        for href in self.links:
            target = urllib.parse.urljoin(
                'http://www.ems.com.cn/mainservice/ems/', href)
            yield scrapy.Request(target, callback=self.parse)
        return

    serviceItem = ServiceItem()
    nodes = response.xpath('/html/body/div[2]/div[2]/*')
    # Drop the two leading nodes, and a trailing node containing 'script'.
    if 'script' in nodes[-1].extract():
        nodes = nodes[2:-1]
    else:
        nodes = nodes[2:]

    buffered = ''
    last_index = len(nodes) - 1
    for index, node in enumerate(nodes):
        node_text = Extract.extractNodeText(node)
        markup = node.extract()
        if index == 0:
            # First node carries the service name.
            serviceItem['serviceName'] = self.prefix + node_text
            serviceItem['serviceItemName'] = '业务简介'
        elif index == last_index:
            buffered = buffered + node_text
            serviceItem['serviceItemDesc'] = buffered
            yield serviceItem
        elif 'title' in markup and index != 1:
            # A new title closes the section collected so far.
            serviceItem['serviceItemDesc'] = buffered
            yield serviceItem
            buffered = ''
            serviceItem['serviceItemName'] = node_text
        else:
            buffered = buffered + node_text
def parse(self, response):
    """Scrape value-added services from a title/description div sequence.

    Every ``div`` with class ``smallTitle`` yields a ``TypeItem`` (type
    name ``self.prefix + '增值服务'``) and starts a new service; every
    ``div`` with class ``smalldesc`` yields a ``ServiceItem`` carrying the
    description of the most recently seen service.

    Divs without a ``class`` attribute, with another class, or without a
    direct text node are skipped instead of raising ``IndexError``.

    NOTE(review): indentation was lost in the source; nesting is
    reconstructed from statement order.
    """
    typeItem = TypeItem()
    serviceItem = ServiceItem()
    typeItem['typeName'] = self.prefix + '增值服务'

    for node in response.xpath('/html/body/div[1]/div[5]/div/div[1]/div'):
        # Evaluate the class once; it may be missing entirely.
        class_values = node.xpath('./@class').extract()
        css_class = class_values[0] if class_values else None
        if css_class not in ('smallTitle', 'smalldesc'):
            continue

        text_values = node.xpath('./text()').extract()
        if not text_values:
            continue

        if css_class == 'smallTitle':
            typeItem['serviceName'] = self.prefix + text_values[0]
            yield typeItem
            serviceItem['serviceName'] = typeItem['serviceName']
            serviceItem['serviceItemName'] = '业务介绍'
        else:
            serviceItem['serviceItemDesc'] = text_values[0]
            yield serviceItem
def parse_item(self, response):
    """Scrape SF Express type/service names and service detail sections.

    Start URL: reads the header navigation table and yields one
    ``TypeItem`` per service link.  The service title and each
    ``content-editor`` block of the response are then turned into
    ``ServiceItem`` objects.

    NOTE(review): indentation was lost in the source.  The nesting is
    reconstructed from statement order; the detail extraction is not in an
    ``else`` and so also runs for the start URL.  Confirm upstream.
    """
    prefix = '顺丰速运' + '-'
    print(response.url)

    # Deletes newlines, non-breaking spaces, spaces and tabs in one pass.
    strip_whitespace = str.maketrans('', '', '\n\xa0 \t')
    skipped_services = ('前往国际网站', '垫付货款')
    renamed_services = {'大件入戶': '大件入户', '派件地址变更': '派件地址变更服务'}

    # ---- SF types and the service names that belong to each type ----
    if response.url == self.start_urls[0]:
        typeItem = TypeItem()
        tables = response.xpath(
            '//*[@id="header"]/div/ul[1]/li[2]/div/div/div[1]/table')
        for table in tables:
            # The first table row is skipped.
            for row in table.xpath('./tr')[1:]:
                for cell in row.xpath('.//td'):
                    raw_type = cell.xpath('./p/text()').extract()[0]
                    if raw_type == '\xa0':
                        # A cell whose title is a lone non-breaking space
                        # is filed under the value-added type.
                        typeItem['typeName'] = prefix + '增值服务'
                    else:
                        typeItem['typeName'] = prefix + raw_type.translate(
                            strip_whitespace)
                    for raw_service in cell.xpath(
                            './ul//li/a/text()').extract():
                        service = raw_service.translate(strip_whitespace)
                        if service in skipped_services:
                            continue
                        service = renamed_services.get(service, service)
                        typeItem['serviceName'] = prefix + service
                        print(typeItem)
                        yield typeItem

    item = ServiceItem()
    page_title = response.xpath(
        '//*[@id="express_service_list"]/div/div[1]/h1/text()').extract()[0]
    item['serviceName'] = prefix + page_title
    for block in response.xpath('//div[@class="content-editor"]'):
        item['serviceItemName'] = block.xpath('./h2/text()').extract()[0]
        # The first three descendant nodes are skipped.
        item['serviceItemDesc'] = ''.join(
            Extract.extractNodeText(node) for node in block.xpath('.//*')[3:])
        yield item
def parse(self, response):
    """Scrape Deppon Express type/service names and service descriptions.

    * Start URL: yields a ``TypeItem`` per menu entry plus one hard-coded
      value-added entry, then requests every collected link.
    * URLs listed in ``self.ValueAddedServicesrUrls``: yields
      ``ServiceItem`` objects for the value-added service pages.
    * Any other URL: yields the introduction and the "advantage" cards of
      a regular product page.

    Only item objects and requests are yielded.  The original additionally
    yielded the bare service-name string on product pages, which Scrapy
    rejects as a callback result.

    NOTE(review): indentation was lost in the source; nesting is
    reconstructed from statement order and should be confirmed upstream.
    """

    def _titled_sections(paragraphs, item, separator):
        """Yield ``item`` once per titled section found in ``paragraphs``.

        The first paragraph is a section title, a paragraph whose markup
        contains '28' starts a new section, the last paragraph closes the
        final section, and all others are appended to the running text
        preceded by ``separator``.
        """
        text = ''
        last_index = len(paragraphs) - 1
        for index, p in enumerate(paragraphs):
            t = Extract.extractNodeText(p)
            markup = p.extract()
            if index == last_index:
                item['serviceItemDesc'] = text + t
                text = ''
                yield item
            elif index == 0:
                item['serviceItemName'] = t
                text = ''
            elif '28' in markup:
                item['serviceItemDesc'] = text
                yield item
                item['serviceItemName'] = t
                text = ''
            else:
                text = text + separator + t

    if response.url == self.start_urls[0]:
        ulNodes = response.xpath(
            '//div[@class="row no-gutters align-content-center white"]'
        )[1].xpath('.//ul')
        typeItem = TypeItem()
        for ul in ulNodes:
            for i, aNode in enumerate(ul.xpath('./li/a')):
                if i == 0:
                    # The first link of each list names the type.
                    typeItem['typeName'] = self.prefix + aNode.xpath(
                        './text()').extract()[0]
                    continue
                # Value-added pages are not queued from the menu.
                if typeItem['typeName'] != '德邦快递-增值服务':
                    self.links.append(aNode.xpath('./@href').extract()[0])
                typeItem['serviceName'] = self.prefix + aNode.xpath(
                    './text()').extract()[0]
                yield typeItem

        typeItem['typeName'] = self.prefix + '增值服务'
        typeItem['serviceName'] = self.prefix + '超重货操作费'
        yield typeItem

        for link in self.links:
            link = link.replace(
                '{{baseUrl}}', 'https://www.deppon.com/newwebsite')
            yield scrapy.Request(link, callback=self.parse)

    elif response.url in self.ValueAddedServicesrUrls:
        serviceItem = ServiceItem()

        if response.url == self.ValueAddedServicesrUrls[0]:
            nodes = response.xpath(
                '//section[@class="component fs14 lh24 border_line"]')
            for node in nodes:
                text = ''
                ps = node.xpath('.//p')
                last_index = len(ps) - 1
                for index, p in enumerate(ps):
                    t = Extract.extractNodeText(p)
                    ptext = p.extract()
                    if index == last_index:
                        text = text + t
                        serviceItem['serviceItemDesc'] = text
                        text = ''
                        yield serviceItem
                    elif index == 0:
                        serviceItem['serviceName'] = self.prefix + t
                        serviceItem['serviceItemName'] = '服务介绍'
                        text = ''
                    elif 'fs18 lh28' in ptext or '18px' in ptext:
                        # A service heading closes the previous service,
                        # unless it directly follows the first paragraph.
                        if index != 1:
                            serviceItem['serviceItemDesc'] = text
                            yield serviceItem
                        serviceItem['serviceName'] = self.prefix + t
                        serviceItem['serviceItemName'] = '服务介绍'
                        text = ''
                    elif '24' in ptext or '15px' in ptext:
                        # A sub-heading closes the previous section.
                        serviceItem['serviceItemDesc'] = text
                        yield serviceItem
                        serviceItem['serviceItemName'] = t
                        text = ''
                    else:
                        text = text + t

            serviceItem['serviceName'] = '德邦快递-超重货操作费'
            serviceItem['serviceItemName'] = '服务介绍'
            serviceItem['serviceItemDesc'] = '单件货物重量大于500KG且小于等于1000KG范围内,收取超重货操作服务费100元/件;单件货物重量大于1000KG且小于等于2000KG范围内,收取超重货操作服务费200元/件;若一票货中多件货物满足超重货操作费收取标准,则这一票货收取的重货操作服务费为各件超重货操作费总和。'
            yield serviceItem

        elif response.url == self.ValueAddedServicesrUrls[1]:
            ps = response.xpath(
                '//section[@class="component fs14 lh24 border_line"]/p')
            serviceItem['serviceName'] = self.prefix + '代收货款'
            yield from _titled_sections(ps, serviceItem, '')
            serviceItem['serviceItemName'] = '服务介绍'
            serviceItem['serviceItemDesc'] = '提供“即日退”和“三日退”两种代收货款服务。替您收回货款后,在承诺的退款时效内将货款汇出,让您安全、及时地回笼资金'
            yield serviceItem

        elif response.url == self.ValueAddedServicesrUrls[2]:
            ps = response.xpath('//section[@class="fs14 lh24 border_line"]/p')
            serviceItem['serviceName'] = self.prefix + '保价运输'
            yield from _titled_sections(ps, serviceItem, '\n')
            serviceItem['serviceItemName'] = '服务介绍'
            serviceItem['serviceItemDesc'] = '保价运输是指德邦与您共同确定的以托运人申明货物价值为基础的一种特殊运输方式。您向德邦声明托运货物的实际价值,若货物出险,即可获得我司的相应赔偿'
            yield serviceItem

        else:
            serviceItem['serviceName'] = self.prefix + '安全包装服务'
            serviceItem['serviceItemName'] = '服务介绍'
            serviceItem['serviceItemDesc'] = '德邦将为您的货物量身定制安全放心的包装解决方案,让您更安心'
            yield serviceItem
            serviceItem['serviceItemName'] = '服务区域'
            serviceItem['serviceItemDesc'] = '中国大陆地区、香港地区'
            yield serviceItem

            for card in response.xpath('//div[@class = "card-body"]'):
                if card.xpath('./h4'):
                    serviceItem['serviceItemName'] = card.xpath(
                        './h4/text()').extract()[0]
                    serviceItem['serviceItemDesc'] = card.xpath(
                        './p/text()').extract()[0]
                elif card.xpath('./p[2]'):
                    serviceItem['serviceItemName'] = '包装材料介绍-' + card.xpath(
                        './p[1]/text()').extract()[0]
                    serviceItem['serviceItemDesc'] = card.xpath(
                        './p[2]/text()').extract()[0]
                else:
                    # Card with a single paragraph: fixed description.
                    serviceItem['serviceItemDesc'] = '新型塑料缓冲材料,质地轻、透明性好,良好的减震性、抗冲击性,是易碎易损货物包装的首选良材'
                    serviceItem['serviceItemName'] = '包装材料介绍-' + card.xpath(
                        './p[1]/text()').extract()[0]
                yield serviceItem

    else:
        # Fields filled below: serviceName, serviceItemName, serviceItemDesc.
        serviceItem = ServiceItem()
        pNodes = response.xpath(
            '//section[@class = "content_wrapper h-100"]/section/section/p')
        for i, p in enumerate(pNodes):
            if i == 0:
                # Only stored here; a bare string must not be yielded
                # from a spider callback.
                serviceItem['serviceName'] = (
                    self.prefix + Extract.extractNodeText(p))
            if i == 1:
                serviceItem['serviceItemName'] = '服务介绍'
                serviceItem['serviceItemDesc'] = Extract.extractNodeText(p)
                yield serviceItem

        for card in response.xpath('//div[@class = "card-body"]'):
            if not card.xpath('./h4'):
                continue
            serviceItem['serviceItemName'] = '产品优势-' + card.xpath(
                './h4/text()').extract()[0]
            if not card.xpath('./p'):
                serviceItem['serviceItemDesc'] = card.xpath(
                    './ul/li/text()').extract()[0]
            else:
                serviceItem['serviceItemDesc'] = card.xpath(
                    './p/text()').extract()[0]
            yield serviceItem
def parse(self, response):
    """Scrape Yunda Express type/service names and service descriptions.

    * Start URL: yields ``TypeItem`` objects for the navigation boxes, the
      international-express submenu and the prohibited-items sections
      (the latter also as ``ServiceItem`` objects), then requests every
      collected link.
    * URLs containing ``product_export``: sections are split on paragraph
      markers.
    * Any other URL: sections are split on ``h4`` children.

    NOTE(review): indentation was lost in the source; nesting is
    reconstructed from statement order and should be confirmed upstream.
    """
    content_xpath = '//div[@class="main_box_content_left"]'
    heading_xpath = '/html/body/div[2]/div[2]/div[3]/h2/text()'
    # Deletes tab and carriage-return characters.
    drop_tab_cr = str.maketrans('', '', '\t\r')

    if response.url == self.start_urls[0]:
        boxes = response.xpath(
            '/html/body/div[1]/div/div[3]/ul/li[2]/div/dl[2]/div')[0:2]
        typeItem = TypeItem()
        for box in boxes:
            typeItem['typeName'] = self.prefix + box.xpath(
                './div[@class="title"]/text()').extract()[0]
            for dd in box.xpath('.//dd'):
                label = dd.xpath('./a/text()').extract()[0]
                if label in ('禁寄物品范围', '分拨中心招商信息'):
                    continue
                self.links.append(dd.xpath('./a/@href').extract()[0])
                if label == '国际快递服务':
                    # Link is queued, but no type item is emitted here.
                    continue
                if label == '当天件快递':
                    label = label + '服务'
                elif label == '项目快递管理综合服务':
                    label = '项目客户快递管理综合服务'
                typeItem['serviceName'] = self.prefix + label
                yield typeItem

        # The international express entry of the left column is emitted
        # as a type of its own.
        typeItem['typeName'] = self.prefix + '国际快递服务'
        submenu = response.xpath(
            '/html/body/div[2]/div[1]/div[1]/ul[1]/li[10]/ul/li/ul/li')
        typeItem['serviceName'] = self.prefix + '国际快递业务'
        yield typeItem
        for entry in submenu:
            anchor = entry.xpath('./a')
            self.links.append(anchor.xpath('./@href').extract()[0])
            typeItem['serviceName'] = self.prefix + anchor.xpath(
                './text()').extract()[0]
            yield typeItem

        # Prohibited-items information page of the online services.
        typeItem['typeName'] = self.prefix + '禁寄物品'
        principle = response.xpath('/html/body/div[2]/div[2]/div[3]/div[3]')
        typeItem['serviceName'] = self.prefix + '收寄原则'
        yield typeItem

        serviceItem = ServiceItem()
        serviceItem['serviceName'] = self.prefix + '收寄原则'
        serviceItem['serviceItemName'] = '详细介绍'
        serviceItem['serviceItemDesc'] = Extract.extractNodeText(
            principle.xpath('./p[1]')).translate(drop_tab_cr)
        yield serviceItem

        for section in response.xpath(
                '/html/body/div[2]/div[2]/div[3]/div[2]/div'):
            heading = section.xpath('./h4/text()').extract()
            if not heading:
                continue
            typeItem['serviceName'] = self.prefix + heading[0]
            yield typeItem
            serviceItem['serviceName'] = self.prefix + heading[0]
            serviceItem['serviceItemName'] = '禁止寄递物品名录'
            # Use the second paragraph when present, else the first.
            source = './p[2]' if section.xpath('./p[2]') else './p[1]'
            serviceItem['serviceItemDesc'] = Extract.extractNodeText(
                section.xpath(source)).translate(drop_tab_cr)
            yield serviceItem

        for href in self.links:
            target = urllib.parse.urljoin('http://www.yundaex.com/cn/', href)
            yield scrapy.Request(target, callback=self.parse)
        return

    serviceItem = ServiceItem()
    serviceItem['serviceName'] = self.prefix + response.xpath(
        heading_xpath).extract()[0]

    if 'product_export' in response.url:
        for container in response.xpath(content_xpath):
            buffered = ''
            for index, p in enumerate(container.xpath('./p')):
                markup = p.extract()
                t = Extract.extractNodeText(p)
                if index == 0:
                    serviceItem['serviceItemName'] = t
                elif '<p> *' in markup:
                    serviceItem['serviceItemDesc'] = buffered
                    yield serviceItem
                elif '<p>*' in markup or '26' in markup:
                    buffered = buffered + t
                elif ':</p>' in markup or ': </p>' in markup:
                    # A paragraph ending in a colon starts a new section.
                    serviceItem['serviceItemDesc'] = buffered
                    yield serviceItem
                    serviceItem['serviceItemName'] = t.split(':')[0].replace(
                        ' ', '')
                    buffered = ''
        return

    if serviceItem['serviceName'] == self.prefix + '国际快递服务':
        serviceItem['serviceName'] = self.prefix + '国际快递业务'
    for container in response.xpath(content_xpath):
        buffered = ''
        children = container.xpath('./*')
        last_index = len(children) - 1
        for index, child in enumerate(children):
            t = Extract.extractNodeText(child)
            if index == last_index:
                buffered = buffered + t
                serviceItem['serviceItemDesc'] = buffered
                yield serviceItem
            elif index == 0:
                # NOTE(review): the original comment calls this "the
                # Chinese colon"; verify the literal was not normalised.
                serviceItem['serviceItemName'] = t.split(':')[0].replace(
                    ' ', '')
            elif 'h4' in child.extract():
                section_title = t.split(':')[0].replace(' ', '')
                serviceItem['serviceItemDesc'] = buffered
                yield serviceItem
                serviceItem['serviceItemName'] = section_title
                buffered = ''
            else:
                buffered = buffered + t
def parse(self, response):
    """Scrape YTO Express type/service names and service descriptions.

    * First five start URLs: yields a ``TypeItem`` per product box and
      requests every link found under the parent of the ``h4`` nodes.
    * ``start_urls[5]``: service-support entries, yielded both as
      ``TypeItem`` and ``ServiceItem``.
    * ``start_urls[6]``: the special-logistics contact page.
    * Any other URL: a service detail page.

    NOTE(review): indentation was lost in the source.  The nesting is
    reconstructed from statement order.  The placement of
    ``print(self.newlinks)`` (after the link loop) and of the hard-coded
    block at the end (end of the detail-page branch) are best guesses;
    confirm against the upstream file.
    """
    prefix = '圆通速递-'

    if response.url in self.start_urls[:5]:
        typeNode = response.xpath('//h4')
        typeItem = TypeItem()
        if response.url == self.start_urls[3]:
            typeItem['typeName'] = prefix + '国际服务'
        elif response.url == self.start_urls[4]:
            typeItem['typeName'] = prefix + '特种物流'
        else:
            typeItem['typeName'] = prefix + typeNode.xpath(
                './text()')[0].extract()

        skipped_names = ('通关服务', '融合案例')
        renamed = {
            '仓配一体': '仓配一体服务',
            '到付件': '到付件业务',
            '代取件': '代取件业务',
        }
        for box in response.xpath('//div[@class = "fl product-text"]'):
            # The first child holds the service name and short intro.
            first_child = box.xpath('./*')[0]
            name = first_child.xpath('./span/text()').extract()[0]
            if name in skipped_names:
                continue
            if name in renamed:
                name = renamed[name]
            elif response.url == self.start_urls[4]:
                name = '特种物流' + name
            typeItem['serviceName'] = prefix + name
            yield typeItem

        for href in typeNode.xpath('..//a/@href').extract():
            target = urllib.parse.urljoin('http://www.yto.net.cn', href)
            self.newlinks.append(target)
            yield scrapy.Request(target, callback=self.parse)
        print(self.newlinks)

    elif response.url == self.start_urls[5]:
        typeItem = TypeItem()
        item = ServiceItem()
        typeItem['typeName'] = prefix + '服务支持'
        for entry in response.xpath('//div[@class ="tc"]'):
            typeItem['serviceName'] = prefix + entry.xpath(
                './span/text()').extract()[0]
            yield typeItem
            item['serviceName'] = typeItem['serviceName']
            item['serviceItemName'] = '介绍'
            item['serviceItemDesc'] = entry.xpath('./p/text()').extract()[0]
            yield item

    elif response.url == self.start_urls[6]:
        typeItem = TypeItem()
        item = ServiceItem()
        typeItem['typeName'] = prefix + '特种物流'
        typeItem['serviceName'] = prefix + '特种物流' + '联系方式'
        yield typeItem
        item['serviceName'] = typeItem['serviceName']
        item['serviceItemName'] = response.xpath(
            '//p[@class = "subhead-name"]/text()').extract()[0]
        item['serviceItemDesc'] = response.xpath(
            '//p[@class = "passages"]/text()').extract()[0]
        yield item

    else:
        print('#############item页面', response.url)
        item = ServiceItem()
        if 'specialtraffic/about/' in response.url:
            item['serviceName'] = prefix + '特种物流' + response.xpath(
                '//p[@class = "subhead-name"]/text()').extract()[0]
            item['serviceItemName'] = '关于我们'
            passages = response.xpath(
                '//p[@class = "passages"]/text()').extract()
            item['serviceItemDesc'] = ''.join(passages)
            yield item
        else:
            item['serviceName'] = prefix + response.xpath(
                './/h4/text()').extract()[0]
            sections = response.xpath('//div[@class = "service-item"]')
            if 'product/teseservice/tesejinji.html' in response.url:
                # Here the first paragraph is the section title and the
                # remaining paragraphs form the description.
                for section in sections:
                    paragraphs = section.xpath('.//p')
                    if paragraphs:
                        item['serviceItemName'] = Extract.extractNodeText(
                            paragraphs[0])
                    description = ''.join(
                        Extract.extractNodeText(p) for p in paragraphs[1:])
                    if description == '':
                        continue
                    item['serviceItemDesc'] = description
                    yield item
            else:
                for section in sections:
                    titles = section.xpath('./span/text()').extract()
                    if not titles:
                        continue
                    item['serviceItemName'] = titles[0]
                    description = ''.join(
                        Extract.extractNodeText(p)
                        for p in section.xpath('.//p'))
                    if description == '':
                        continue
                    item['serviceItemDesc'] = description
                    yield item

        # Workaround for a bug on the YTO site concerning the
        # "欧洲海外仓服务" page: its items are emitted from fixed text.
        item = ServiceItem()
        item['serviceName'] = prefix + '欧洲海外仓服务'
        item['serviceItemName'] = '业务介绍'
        item['serviceItemDesc'] = '针对地区特色经济产品推出全新服务——特色经济产品个性化解决方案,通过“快递+电商”模式,打造“销售”、“运输”、“鲜配”一站式销售配送服务体系,整合圆通空运、陆运、冷链、仓储资源,利用国家工程实验室研发优势,为客户提供安全、高效、智能的快递运输服务'
        yield item
        item['serviceItemName'] = '服务品类'
        item['serviceItemDesc'] = '''1、生鲜产品:肉类、海鲜类; 2、特殊包装产品:酒类、蛋类、鲜花类; 3、水果产品; 4、特产礼盒、节日礼盒产品; 5、初级农产品:红薯、土豆、大蒜、药材等。'''
        yield item
        item['serviceItemName'] = '咨询方式'
        item['serviceItemDesc'] = '联系邮箱:[email protected]'
        yield item
def parse(self, response):
    """Scrape ZTO Express type/service names and service descriptions.

    * ``self.home_page``: yields ``TypeItem`` objects from the business
      menu, then requests ``self.link``.
    * ``self.link``: requests every ``/business/...html`` path found in
      the response body.
    * URLs in ``self.cloudChamberurls``: the cloud-warehouse pages.
    * Any other URL: a regular business page, one ``ServiceItem`` per
      ``business-box`` that has non-empty text.

    Fixes relative to the previous version:

    * the paragraph class was compared as ``list == str`` (always False),
      so the hotline and address items were never produced;
    * paragraphs without a direct text node no longer raise
      ``IndexError`` while the introduction text is collected;
    * the link pattern is a raw string without invalid escapes (it
      matches the same text as before).

    NOTE(review): indentation was lost in the source; nesting is
    reconstructed from statement order and should be confirmed upstream.
    """
    prefix = '中通快递-'

    # Home page: collect the types.
    if response.url == self.home_page:
        typeitem = TypeItem()
        # Business groups of the menu (div box-1 .. box-4).
        typeNodes = response.xpath('//li[@class = "business"]/div/div')
        for typeNode in typeNodes:
            for service_node in typeNode.xpath('./div/div'):
                temp = service_node.xpath('./em/text()').extract()[0]
                if temp == '\xa0\xa0':
                    typeitem['typeName'] = prefix + '国际件'
                elif temp == '仓储业务':
                    # Warehousing has a single fixed service; the rest of
                    # this group is not processed.
                    typeitem['typeName'] = prefix + temp
                    typeitem['serviceName'] = prefix + '中通云仓'
                    yield typeitem
                    break
                else:
                    typeitem['typeName'] = prefix + temp

                for h in service_node.xpath('.//a'):
                    temp = h.xpath('./text()').extract()[0].strip()
                    if temp[:2] == 'To':
                        typeitem['serviceName'] = (
                            prefix + 'Toll Global Express(DPEX)')
                    elif temp == "开放平台" or temp == "快递管家":
                        # These entries end the link list of this node.
                        break
                    else:
                        typeitem['serviceName'] = prefix + temp
                    yield typeitem
        yield scrapy.Request(self.link, callback=self.parse)

    elif response.url == self.link:
        body = str(response.body, 'utf-8')
        regex = re.compile(r'/business/.*?html')
        for link in regex.findall(body):
            new_full_url = urllib.parse.urljoin('https://www.zto.com', link)
            yield scrapy.Request(new_full_url, callback=self.parse)

    elif response.url in self.cloudChamberurls:
        print('提取信息:', response.url)
        serviceItem = ServiceItem()
        # Every cloud-warehouse page belongs to the same service.
        serviceItem['serviceName'] = prefix + '中通云仓'

        if response.url == self.cloudChamberurls[0]:
            pnodes = response.xpath('//div[@class = "business-content"]//p')
            text = ''
            for p in pnodes:
                # extract() returns a list; compare its first value.
                class_values = p.xpath('./@class').extract()
                p_class = class_values[0] if class_values else ''
                if p_class == 'phone-number':
                    serviceItem['serviceItemName'] = '咨询热线'
                    serviceItem['serviceItemDesc'] = p.xpath(
                        './em/text()').extract()[0]
                    yield serviceItem
                elif p_class == 'address':
                    serviceItem['serviceItemName'] = '云仓地址'
                    serviceItem['serviceItemDesc'] = p.xpath(
                        './span/text()').extract()[0]
                    yield serviceItem
                else:
                    own_text = p.xpath('./text()').extract()
                    if own_text:
                        text = text + own_text[0]
            serviceItem['serviceItemName'] = '云仓介绍'
            serviceItem['serviceItemDesc'] = text
            yield serviceItem

        elif response.url == self.cloudChamberurls[1]:
            div1 = response.xpath('//div[@class ="business-box-detail"]')[0]
            div2 = response.xpath('//div[@class ="our-service-value"]')[0]
            for node in div1.xpath('.//dd'):
                serviceItem['serviceItemName'] = node.xpath(
                    './strong/text()').extract()[0]
                serviceItem['serviceItemDesc'] = node.xpath(
                    './p/text()').extract()[0]
                yield serviceItem
            for node in div2.xpath('.//dd'):
                serviceItem['serviceItemName'] = '服务价值-' + node.xpath(
                    './strong/text()').extract()[0]
                serviceItem['serviceItemDesc'] = node.xpath(
                    './p/text()').extract()[0]
                yield serviceItem

        elif response.url == self.cloudChamberurls[2]:
            serviceItem['serviceItemName'] = '服务范围'
            serviceItem['serviceItemDesc'] = response.xpath(
                '//*[@id="content"]/div/div[2]/div/div/div/div[1]/p/text()'
            ).extract()[0]
            yield serviceItem

        else:
            serviceItem['serviceItemName'] = '退仓保障'
            serviceItem['serviceItemDesc'] = response.xpath(
                '//*[@id="content"]/div/div[2]/div/div[1]/div[2]/div/p/text()'
            ).extract()[0]
            yield serviceItem
            for node in response.xpath('.//dd'):
                serviceItem['serviceItemName'] = '优势-' + node.xpath(
                    './strong/text()').extract()[0]
                serviceItem['serviceItemDesc'] = node.xpath(
                    './p/text()').extract()[0]
                yield serviceItem

    else:
        print('提取信息:', response.url)
        serviceItem = ServiceItem()
        serviceItem['serviceName'] = prefix + response.xpath(
            '//h2[@class = "business-title"]/span/text()').extract()[0]
        for s in response.xpath('//div[@class = "business-box"]'):
            # Section title.
            serviceItem['serviceItemName'] = s.xpath(
                './strong/text()').extract()[0]
            textNodes = s.xpath('.//div[@class="business-box-text"]/*')
            summary_text = ''
            for node in textNodes:
                text = self.extractNodeText(node)
                if text != '':
                    summary_text = summary_text + text + ' '
            # Sections without any text are not emitted.
            if summary_text != '':
                serviceItem['serviceItemDesc'] = summary_text
                yield serviceItem