コード例 #1
0
    def parse(self, response):
        """Parse the STO Express catalogue page and its service detail pages.

        On the first start URL this yields a ``TypeItem`` (category name +
        service name) for every service link and then schedules every link
        collected in ``self.links`` with this same callback.  For every
        response it then reads the ``product_send`` section and yields
        ``ServiceItem`` objects: one for the section's own heading and
        description, and one per titled sub-section with non-empty text.

        NOTE(review): the start page is not excluded from the detail
        extraction below, so that code also runs on it -- confirm this is
        intended.
        """
        prefix = '申通快递-'
        if response.url == self.start_urls[0]:
            groups = response.xpath(
                '//div[@class = "main_part nav_product_service clearfix"]/div')
            for group in groups:
                typeItem = TypeItem()
                typeItem['typeName'] = (
                    prefix + group.xpath('./label/text()').extract()[0])
                for anchor in group.xpath('.//div/a'):
                    # The link is queued even for the entry skipped below.
                    self.links.append(anchor.xpath('./@href').extract()[0])
                    label = anchor.xpath('./text()').extract()[0]
                    typeItem['serviceName'] = prefix + label
                    if label == '开放平台':
                        continue
                    yield typeItem

            for link in self.links:
                yield scrapy.Request(
                    urllib.parse.urljoin('http://www.sto.cn', link),
                    callback=self.parse)

        item = ServiceItem()

        content = response.xpath('//div[@class = "product_send"]')
        heading = content.xpath(
            './div[@class = "cont_title"]/text()').extract()[0]
        # Time-based headings get a suffix appended to form the service name.
        suffixes = {'24小时': '次日达', '48小时': '隔日达', '72小时': '件'}
        suffix = suffixes.get(heading, '')
        if heading == '申通打印专家':
            heading = '打印专家'
        item['serviceName'] = prefix + heading + suffix
        item['serviceItemName'] = content.xpath('./h4/text()').extract()[0]
        item['serviceItemDesc'] = Extract.extractNodeText(
            content.xpath('./p'))
        yield item

        # The same item object is re-filled and re-yielded per sub-section.
        for position, section in enumerate(content.xpath('.//div')):
            # The first nested div is skipped.
            if position == 0:
                continue
            heading_node = section.xpath('./h4')
            if not heading_node:
                continue
            item['serviceItemName'] = heading_node.xpath(
                './text()').extract()[0]
            description = ''.join(
                Extract.extractNodeText(p) for p in section.xpath('.//p'))
            if not description:
                continue
            item['serviceItemDesc'] = description
            yield item
コード例 #2
0
ファイル: ems.py プロジェクト: ZPS233/Baidubaidke-spider
 def parse(self, response):
     """Parse the EMS service menu (first start URL) or one service page.

     Start page: yields a ``TypeItem`` for every menu entry that has a link,
     adds two hand-written logistics services, then requests every link in
     ``self.links`` with this callback.

     Any other page: walks the children of the content container and yields
     a ``ServiceItem`` each time a titled node closes the text gathered so
     far, and once more at the last node.
     """
     if response.url == self.start_urls[0]:
         menus = response.xpath('//ul[@class = "list_menu"]/li')
         typeItem = TypeItem()
         for menu in menus:
             typeItem['typeName'] = self.prefix + menu.xpath(
                 './div/span/text()').extract()[0]
             for entry in menu.xpath('./ul/li'):
                 anchor = entry.xpath('./div/a')
                 if not anchor:
                     continue
                 self.links.append(anchor.xpath('./@href').extract()[0])
                 label = anchor.xpath('./text()').extract()[0]
                 # This one menu label is replaced by a longer service name.
                 if label == '鲜花礼仪':
                     label = '国内特快专递礼仪业务'
                 typeItem['serviceName'] = self.prefix + label
                 yield typeItem
         # Two services are emitted by hand under the logistics type.
         typeItem['typeName'] = self.prefix + '物流业务'
         for extra in ('合同物流', '国际货代'):
             typeItem['serviceName'] = self.prefix + extra
             yield typeItem
         base_url = 'http://www.ems.com.cn/mainservice/ems/'
         for link in self.links:
             yield scrapy.Request(
                 urllib.parse.urljoin(base_url, link), callback=self.parse)
     else:
         serviceItem = ServiceItem()
         nodes = response.xpath('/html/body/div[2]/div[2]/*')
         # Drop the first two children, and the last one as well when its
         # markup mentions 'script'.
         nodes = nodes[2:-1] if 'script' in nodes[-1].extract() else nodes[2:]
         buffered = ''
         for node in nodes:
             plain = Extract.extractNodeText(node)
             markup = node.extract()
             if node == nodes[0]:
                 # First node: service name; following text is its summary.
                 serviceItem['serviceName'] = self.prefix + plain
                 serviceItem['serviceItemName'] = '业务简介'
             elif node == nodes[-1]:
                 # Last node: flush whatever has been gathered.
                 buffered += plain
                 serviceItem['serviceItemDesc'] = buffered
                 yield serviceItem
             elif 'title' in markup and node != nodes[1]:
                 # A titled node closes the previous section and opens a new one.
                 serviceItem['serviceItemDesc'] = buffered
                 yield serviceItem
                 buffered = ''
                 serviceItem['serviceItemName'] = plain
             else:
                 buffered += plain
コード例 #3
0
ファイル: tiantian.py プロジェクト: ZPS233/Baidubaidke-spider
 def parse(self, response):
     """Yield type and service items from a flat list of title/description divs.

     Every div with class ``smallTitle`` produces a ``TypeItem`` under the
     fixed type "增值服务" and starts a new service; every div with class
     ``smalldesc`` supplies that service's description and produces a
     ``ServiceItem``.  Divs with any other class are ignored.
     """
     blocks = response.xpath('/html/body/div[1]/div[5]/div/div[1]/div')
     typeItem = TypeItem()
     serviceItem = ServiceItem()
     typeItem['typeName'] = self.prefix + '增值服务'
     for block in blocks:
         css_class = block.xpath('./@class').extract()[0]
         if css_class == 'smallTitle':
             title = block.xpath('./text()').extract()[0]
             typeItem['serviceName'] = self.prefix + title
             yield typeItem
             # The description arrives in a later 'smalldesc' div.
             serviceItem['serviceName'] = typeItem['serviceName']
             serviceItem['serviceItemName'] = '业务介绍'
         elif css_class == 'smalldesc':
             serviceItem['serviceItemDesc'] = block.xpath(
                 './text()').extract()[0]
             yield serviceItem
コード例 #4
0
ファイル: shunfeng.py プロジェクト: ZPS233/Baidubaidke-spider
    def parse_item(self, response):
        """Parse the SF Express navigation table and service detail content.

        On the first start URL, yields a ``TypeItem`` for each service name
        found in the header table (after whitespace clean-up, renames and
        skips).  For every response, yields one ``ServiceItem`` per
        ``content-editor`` div, named after the page's ``h1`` title.

        NOTE(review): the start page also runs the detail extraction below --
        confirm this is intended.
        """
        prefix = '顺丰速运-'
        print(response.url)
        # ---- Scrape the SF types and their corresponding service names ----
        if response.url == self.start_urls[0]:
            # Characters removed from every scraped label.
            junk = str.maketrans('', '', '\n\xa0 \t')
            renames = {'大件入戶': '大件入户', '派件地址变更': '派件地址变更服务'}
            skipped = ('前往国际网站', '垫付货款')
            typeItem = TypeItem()
            tables = response.xpath(
                '//*[@id="header"]/div/ul[1]/li[2]/div/div/div[1]/table')
            for table in tables:
                # The first row of each table is skipped.
                for row in table.xpath('./tr')[1:]:
                    for cell in row.xpath('.//td'):
                        heading = cell.xpath('./p/text()').extract()[0]
                        if heading == '\xa0':
                            # A blank heading is filed under value-added services.
                            typeItem['typeName'] = prefix + '增值服务'
                        else:
                            typeItem['typeName'] = (
                                prefix + heading.translate(junk))
                        for raw in cell.xpath('./ul//li/a/text()').extract():
                            label = raw.translate(junk)
                            if label in skipped:
                                continue
                            typeItem['serviceName'] = (
                                prefix + renames.get(label, label))
                            print(typeItem)
                            yield typeItem

        item = ServiceItem()
        page_title = response.xpath(
            '//*[@id="express_service_list"]/div/div[1]/h1/text()').extract()[0]
        item['serviceName'] = prefix + page_title
        for section in response.xpath('//div[@class="content-editor"]'):
            item['serviceItemName'] = section.xpath('./h2/text()').extract()[0]
            # The first three descendant nodes are left out of the description.
            descendants = section.xpath('.//*')
            item['serviceItemDesc'] = ''.join(
                Extract.extractNodeText(node) for node in descendants[3:])
            yield item
コード例 #5
0
ファイル: debang.py プロジェクト: ZPS233/Baidubaidke-spider
    def parse(self, response):
        """Parse Deppon Express navigation, value-added and product pages.

        Three kinds of response are handled:

        * the first start URL: yields a ``TypeItem`` for every service in the
          navigation lists, adds one hand-written value-added service, and
          requests every collected link with this callback;
        * a URL listed in ``self.ValueAddedServicesrUrls``: yields
          ``ServiceItem`` objects built from the page paragraphs, plus a few
          hard-coded descriptions;
        * any other URL: yields a "服务介绍" item from the second paragraph
          and one item per titled card.

        Only items and requests are yielded; a bare string is not a valid
        spider output, so the service name is never yielded on its own.

        Within each branch a single item object is re-filled and re-yielded.
        """
        if response.url == self.start_urls[0]:
            ulNodes = response.xpath('//div[@class="row no-gutters align-content-center white"]')[1].xpath('.//ul')
            typeItem = TypeItem()
            for ul in ulNodes:
                aNodes = ul.xpath('./li/a')
                for i, aNode in enumerate(aNodes):
                    if i == 0:
                        # The first link of each list names the type.
                        typeItem['typeName'] = self.prefix + aNode.xpath('./text()').extract()[0]
                    else:
                        # Links under the value-added type are not followed.
                        if typeItem['typeName'] != '德邦快递-增值服务':
                            self.links.append(aNode.xpath('./@href').extract()[0])
                        typeItem['serviceName'] = self.prefix + aNode.xpath('./text()').extract()[0]
                        yield typeItem
            # One value-added service is added by hand.
            typeItem['typeName'] = self.prefix + '增值服务'
            typeItem['serviceName'] = self.prefix + '超重货操作费'
            yield typeItem
            for link in self.links:
                # Collected hrefs carry a template placeholder for the base URL.
                link = link.replace('{{baseUrl}}', 'https://www.deppon.com/newwebsite')
                yield scrapy.Request(link, callback=self.parse)
        elif response.url in self.ValueAddedServicesrUrls:
            serviceItem = ServiceItem()
            if response.url == self.ValueAddedServicesrUrls[0]:
                nodes = response.xpath('//section[@class="component fs14 lh24 border_line"]')
                for node in nodes:
                    text = ''
                    ps = node.xpath('.//p')
                    for p in ps:
                        t = Extract.extractNodeText(p)
                        ptext = p.extract()
                        if p == ps[-1]:
                            # Last paragraph: flush the gathered text.
                            text = text + t
                            serviceItem['serviceItemDesc'] = text
                            text = ''
                            yield serviceItem
                        elif p == ps[0]:
                            serviceItem['serviceName'] = self.prefix + t
                            serviceItem['serviceItemName'] = '服务介绍'
                            text = ''
                        elif 'fs18 lh28' in ptext or '18px' in ptext:
                            # This markup starts a new service; flush the
                            # previous one unless this is the second paragraph.
                            if p != ps[1]:
                                serviceItem['serviceItemDesc'] = text
                                yield serviceItem
                            serviceItem['serviceName'] = self.prefix + t
                            serviceItem['serviceItemName'] = '服务介绍'
                            text = ''
                        elif '24' in ptext or '15px' in ptext:
                            # This markup starts a new sub-item of the service.
                            serviceItem['serviceItemDesc'] = text
                            yield serviceItem
                            serviceItem['serviceItemName'] = t
                            text = ''
                        else:
                            text = text + t
                serviceItem['serviceName'] = '德邦快递-超重货操作费'
                serviceItem['serviceItemName'] = '服务介绍'
                serviceItem['serviceItemDesc'] = '单件货物重量大于500KG且小于等于1000KG范围内,收取超重货操作服务费100元/件;单件货物重量大于1000KG且小于等于2000KG范围内,收取超重货操作服务费200元/件;若一票货中多件货物满足超重货操作费收取标准,则这一票货收取的重货操作服务费为各件超重货操作费总和。'
                yield serviceItem
            elif response.url == self.ValueAddedServicesrUrls[1]:
                ps = response.xpath('//section[@class="component fs14 lh24 border_line"]/p')
                text = ''
                serviceItem['serviceName'] = self.prefix + '代收货款'
                for p in ps:
                    t = Extract.extractNodeText(p)
                    ptext = p.extract()
                    if p == ps[-1]:
                        text = text + t
                        serviceItem['serviceItemDesc'] = text
                        text = ''
                        yield serviceItem
                    elif p == ps[0]:
                        serviceItem['serviceItemName'] = t
                        text = ''
                    elif '28' in ptext:
                        # This markup starts a new sub-item.
                        serviceItem['serviceItemDesc'] = text
                        yield serviceItem
                        serviceItem['serviceItemName'] = t
                        text = ''
                    else:
                        text = text + t
                serviceItem['serviceItemName'] = '服务介绍'
                serviceItem['serviceItemDesc'] = '提供“即日退”和“三日退”两种代收货款服务。替您收回货款后,在承诺的退款时效内将货款汇出,让您安全、及时地回笼资金'
                yield serviceItem
            elif response.url == self.ValueAddedServicesrUrls[2]:
                ps = response.xpath('//section[@class="fs14 lh24 border_line"]/p')
                text = ''
                serviceItem['serviceName'] = self.prefix + '保价运输'
                for p in ps:
                    t = Extract.extractNodeText(p)
                    ptext = p.extract()
                    if p == ps[-1]:
                        text = text + t
                        serviceItem['serviceItemDesc'] = text
                        text = ''
                        yield serviceItem
                    elif p == ps[0]:
                        serviceItem['serviceItemName'] = t
                        text = ''
                    elif '28' in ptext:
                        serviceItem['serviceItemDesc'] = text
                        yield serviceItem
                        serviceItem['serviceItemName'] = t
                        text = ''
                    else:
                        # On this page paragraphs are joined with newlines.
                        text = text + '\n' + t
                serviceItem['serviceItemName'] = '服务介绍'
                serviceItem['serviceItemDesc'] = '保价运输是指德邦与您共同确定的以托运人申明货物价值为基础的一种特殊运输方式。您向德邦声明托运货物的实际价值,若货物出险,即可获得我司的相应赔偿'
                yield serviceItem
            else:
                serviceItem['serviceName'] = self.prefix + '安全包装服务'
                serviceItem['serviceItemName'] = '服务介绍'
                serviceItem['serviceItemDesc'] = '德邦将为您的货物量身定制安全放心的包装解决方案,让您更安心'
                yield serviceItem
                serviceItem['serviceItemName'] = '服务区域'
                serviceItem['serviceItemDesc'] = '中国大陆地区、香港地区'
                yield serviceItem
                cardNodes = response.xpath('//div[@class = "card-body"]')
                for card in cardNodes:
                    if card.xpath('./h4') != []:
                        serviceItem['serviceItemName'] = card.xpath('./h4/text()').extract()[0]
                        serviceItem['serviceItemDesc'] = card.xpath('./p/text()').extract()[0]
                    elif card.xpath('./p[2]') != []:
                        serviceItem['serviceItemName'] = '包装材料介绍-' + card.xpath('./p[1]/text()').extract()[0]
                        serviceItem['serviceItemDesc'] = card.xpath('./p[2]/text()').extract()[0]
                    else:
                        # Cards with a single paragraph get a fixed description.
                        serviceItem['serviceItemDesc'] = '新型塑料缓冲材料,质地轻、透明性好,良好的减震性、抗冲击性,是易碎易损货物包装的首选良材'
                        serviceItem['serviceItemName'] = '包装材料介绍-' + card.xpath('./p[1]/text()').extract()[0]
                    yield serviceItem
        else:
            serviceItem = ServiceItem()
            # Fields filled below: serviceName, serviceItemName, serviceItemDesc.
            pNodes = response.xpath('//section[@class = "content_wrapper h-100"]/section/section/p')
            for i, p in enumerate(pNodes):
                if i == 0:
                    # First paragraph is the service name.  It is stored only:
                    # a spider may yield items and requests, not bare strings.
                    serviceItem['serviceName'] = self.prefix + Extract.extractNodeText(p)
                elif i == 1:
                    serviceItem['serviceItemName'] = '服务介绍'
                    serviceItem['serviceItemDesc'] = Extract.extractNodeText(p)
                    yield serviceItem
            cardNodes = response.xpath('//div[@class = "card-body"]')
            for card in cardNodes:
                if card.xpath('./h4') == []:
                    continue
                serviceItem['serviceItemName'] = '产品优势-' + card.xpath('./h4/text()').extract()[0]
                if card.xpath('./p') == []:
                    # No paragraph: fall back to the first list entry.
                    serviceItem['serviceItemDesc'] = card.xpath('./ul/li/text()').extract()[0]
                else:
                    serviceItem['serviceItemDesc'] = card.xpath('./p/text()').extract()[0]
                yield serviceItem
コード例 #6
0
 def parse(self, response):
     """Parse pages reached from the yundaex.com start page.

     Three kinds of response are handled:

     * the first start URL: yields ``TypeItem`` objects for the menu entries,
       for the international express sub-menu and for the prohibited-items
       section (with matching ``ServiceItem`` objects), then requests every
       link collected in ``self.links`` with this callback;
     * a URL containing ``product_export``: yields ``ServiceItem`` objects
       split on paragraph markup;
     * any other URL: yields ``ServiceItem`` objects split on ``h4`` nodes.

     Within each branch a single item object is re-filled and re-yielded.
     """
     if response.url == self.start_urls[0]:
         # Only the first two boxes of the menu are scanned.
         boxNodes =  response.xpath('/html/body/div[1]/div/div[3]/ul/li[2]/div/dl[2]/div')[0:2]
         typeItem = TypeItem()
         for box in boxNodes :
             typeItem['typeName'] = self.prefix + box.xpath('./div[@class="title"]/text()').extract()[0]
             childDds = box.xpath('.//dd')
             for dd in childDds:
                 name = dd.xpath('./a/text()').extract()[0]
                 # These two entries are neither followed nor emitted.
                 if name == '禁寄物品范围' or name == '分拨中心招商信息':
                     continue
                 else:
                     self.links.append(dd.xpath('./a/@href').extract()[0])
                     # This entry is followed but not emitted here; it is
                     # emitted as its own type further down.
                     if name == '国际快递服务':
                         continue
                     elif name == '当天件快递':
                         name = name + '服务'
                     elif name == '项目快递管理综合服务':
                         name = '项目客户快递管理综合服务'
                 typeItem['serviceName'] = self.prefix + name
                 yield typeItem

         # Treat the international express service from the left-hand menu as a type of its own
         typeItem['typeName'] = self.prefix + '国际快递服务'
         linodes = response.xpath('/html/body/div[2]/div[1]/div[1]/ul[1]/li[10]/ul/li/ul/li')
         typeItem['serviceName'] = self.prefix + '国际快递业务'
         yield typeItem
         for li in linodes:
             a = li.xpath('./a')
             self.links.append(a.xpath('./@href').extract()[0])
             typeItem['serviceName'] = self.prefix + a.xpath('./text()').extract()[0]
             yield typeItem

         # Handle the prohibited-items lookup content from the online services section
         typeItem['typeName'] = self.prefix + '禁寄物品'
         expressPrinciple = response.xpath('/html/body/div[2]/div[2]/div[3]/div[3]')
         typeItem['serviceName'] = self.prefix + '收寄原则'
         yield typeItem

         serviceItem = ServiceItem()
         serviceItem['serviceName'] = self.prefix + '收寄原则'
         serviceItem['serviceItemName'] = '详细介绍'
         serviceItem['serviceItemDesc'] = Extract.extractNodeText(expressPrinciple.xpath('./p[1]')).replace('\t','').replace('\r','')
         yield serviceItem

         wjcontents = response.xpath('/html/body/div[2]/div[2]/div[3]/div[2]/div')
         for content in wjcontents:
             title = content.xpath('./h4/text()').extract()
             # Blocks without an h4 title are ignored.
             if title != []:
                 typeItem['serviceName'] = self.prefix + title[0]
                 yield typeItem
                 serviceItem['serviceName'] = self.prefix + title[0]
                 serviceItem['serviceItemName'] = '禁止寄递物品名录'
                 # Prefer the second paragraph; fall back to the first.
                 if content.xpath('./p[2]')!= []:
                     serviceItem['serviceItemDesc'] = Extract.extractNodeText(content.xpath('./p[2]')).replace('\t','').replace('\r','')
                 else:
                     serviceItem['serviceItemDesc'] = Extract.extractNodeText(content.xpath('./p[1]')).replace('\t','').replace('\r','')
                 yield serviceItem
         # Follow every collected link with this same callback.
         for link in self.links:  
             new_full_url = urllib.parse.urljoin('http://www.yundaex.com/cn/', link)
             yield scrapy.Request(new_full_url, callback=self.parse)

     elif 'product_export' in response.url:
         serviceItem = ServiceItem()
         serviceItem['serviceName'] = self.prefix + response.xpath('/html/body/div[2]/div[2]/div[3]/h2/text()').extract()[0]
         contents = response.xpath('//div[@class="main_box_content_left"]')
         for c in contents:
             text =''
             ps = c.xpath('./p')
             for p in ps:
                 ptext = p.extract()
                 t = Extract.extractNodeText(p)
                 # The branches are tried in order; a paragraph matching none
                 # of them is dropped.
                 if p == ps[0]:
                     serviceItem['serviceItemName'] = t
                 elif '<p> *' in ptext:
                     # Flushes the gathered text; the paragraph's own text is
                     # not added.
                     serviceItem['serviceItemDesc'] = text
                     yield serviceItem
                 elif '<p>*' in ptext:
                     text = text+t
                 elif '26' in ptext:
                     text = text + t 
                 elif ':</p>' in ptext or ': </p>' in ptext:
                     # A paragraph ending in a colon closes the previous
                     # section and names the next one.
                     serviceItem['serviceItemDesc'] = text
                     yield serviceItem
                     serviceItem['serviceItemName'] = t.split(':')[0].replace(' ','')
                     text = ''
     else:
         serviceItem = ServiceItem()
         serviceItem['serviceName'] = self.prefix + response.xpath('/html/body/div[2]/div[2]/div[3]/h2/text()').extract()[0]
         # Use the same service name that the start page emitted for this entry.
         if serviceItem['serviceName'] ==  self.prefix + '国际快递服务':
             serviceItem['serviceName'] = self.prefix + '国际快递业务'
         contents = response.xpath('//div[@class="main_box_content_left"]')
         for c in contents:
             text =''
             ps = c.xpath('./*')
             for p in ps:
                 t = Extract.extractNodeText(p)
                 if p == ps[-1]:
                     # Last node: flush the gathered text.
                     text = text+t
                     serviceItem['serviceItemDesc'] = text
                     yield serviceItem
                 elif p == ps[0]:
                     # NOTE(review): the original comment calls this the Chinese
                     # (full-width) colon -- confirm the literal below is one.
                     t = t.split(':')[0].replace(' ','')
                     serviceItem['serviceItemName'] = t
                 elif 'h4' in p.extract():
                     # A node whose markup contains 'h4' closes the previous
                     # section and names the next one.
                     t = t.split(':')[0].replace(' ','')
                     serviceItem['serviceItemDesc'] = text
                     yield serviceItem
                     serviceItem['serviceItemName'] = t
                     text = ''
                 else:
                     text = text + t    
コード例 #7
0
    def parse(self, response):
        """Parse YTO Express listing pages, support pages and service pages.

        The branch taken depends on the position of ``response.url`` in
        ``self.start_urls``:

        * one of the first five start URLs: yields a ``TypeItem`` per listed
          service and requests the links found next to the ``h4`` heading;
        * the sixth start URL: yields a ``TypeItem`` and a ``ServiceItem`` per
          support entry;
        * the seventh start URL: yields the contact details entry;
        * any other URL: yields ``ServiceItem`` objects for the page.

        After the branch, three hard-coded ``ServiceItem`` objects are yielded
        on every call, whatever the response was.
        """
        prefix = '圆通速递-'
        if response.url in self.start_urls[:5]:
            typeNode = response.xpath('//h4')
            typeItem = TypeItem()
            # The fourth and fifth start URLs get a fixed type name; the
            # others take it from the first h4 on the page.
            if response.url == self.start_urls[3]:
                typeItem['typeName'] = prefix + '国际服务'
            elif response.url == self.start_urls[4]:
                typeItem['typeName'] = prefix + '特种物流'
            else:
                typeItem['typeName'] = prefix + typeNode.xpath(
                    './text()')[0].extract()

            serviceNodes = response.xpath('//div[@class = "fl product-text"]')
            for serviceNode in serviceNodes:
                s = serviceNode.xpath('./*')[0]
                # Service name (the service summary is not extracted)
                name = s.xpath('./span/text()').extract()[0]
                # Two entries are skipped, three are renamed, and entries on
                # the fifth start URL get a fixed prefix in front of the name.
                if name == '通关服务' or name == '融合案例':
                    continue
                elif name == '仓配一体':
                    name = '仓配一体服务'
                elif name == '到付件':
                    name = '到付件业务'
                elif name == '代取件':
                    name = '代取件业务'
                elif response.url == self.start_urls[4]:
                    name = '特种物流' + name
                typeItem['serviceName'] = prefix + name
                #                desc = s.xpath('./div/text()').extract()[0]
                yield typeItem

            # Follow every link found under the parent of the h4 heading(s).
            links = typeNode.xpath('..//a/@href').extract()
            for link in links:
                new_full_url = urllib.parse.urljoin('http://www.yto.net.cn',
                                                    link)
                self.newlinks.append(new_full_url)
                yield scrapy.Request(new_full_url, callback=self.parse)
            print(self.newlinks)
        elif response.url == self.start_urls[5]:
            typeItem = TypeItem()
            item = ServiceItem()
            typeItem['typeName'] = prefix + '服务支持'
            anodes = response.xpath('//div[@class ="tc"]')
            for a in anodes:
                # Each entry yields a type item, then a service item with its
                # introduction text.
                typeItem['serviceName'] = prefix + a.xpath(
                    './span/text()').extract()[0]
                yield typeItem
                item['serviceName'] = typeItem['serviceName']
                item['serviceItemName'] = '介绍'
                item['serviceItemDesc'] = a.xpath('./p/text()').extract()[0]
                yield item
        elif response.url == self.start_urls[6]:
            typeItem = TypeItem()
            item = ServiceItem()
            typeItem['typeName'] = prefix + '特种物流'
            typeItem['serviceName'] = prefix + '特种物流' + '联系方式'
            yield typeItem
            item['serviceName'] = typeItem['serviceName']
            item['serviceItemName'] = response.xpath(
                '//p[@class = "subhead-name"]/text()').extract()[0]
            item['serviceItemDesc'] = response.xpath(
                '//p[@class = "passages"]/text()').extract()[0]
            yield item
        else:
            print('#############item页面', response.url)
            item = ServiceItem()
            if 'specialtraffic/about/' in response.url:
                item['serviceName'] = prefix + '特种物流' + response.xpath(
                    '//p[@class = "subhead-name"]/text()').extract()[0]
                item['serviceItemName'] = '关于我们'
                # Join the text of all "passages" paragraphs into one description.
                ps = response.xpath(
                    '//p[@class = "passages"]/text()').extract()
                text = ''
                for p in ps:
                    text += p
                item['serviceItemDesc'] = text
                yield item
            else:
                item['serviceName'] = prefix + response.xpath(
                    './/h4/text()').extract()[0]
                itemNodes = response.xpath('//div[@class = "service-item"]')
                if 'product/teseservice/tesejinji.html' in response.url:
                    # On this page the first paragraph of each block is the
                    # title and the remaining paragraphs form the description.
                    for itemNode in itemNodes:
                        nodes = itemNode.xpath('.//p')
                        des = ''
                        for p in nodes:
                            if p == nodes[0]:
                                item[
                                    'serviceItemName'] = Extract.extractNodeText(
                                        p)
                            else:
                                des = des + Extract.extractNodeText(p)
                        # Blocks without description text are not emitted.
                        if '' == des:
                            continue
                        item['serviceItemDesc'] = des
                        yield item

                else:
                    # Elsewhere the title is the block's span text and all
                    # paragraphs form the description.
                    for itemNode in itemNodes:
                        titleNode = itemNode.xpath('./span/text()').extract()
                        if titleNode == []:
                            continue
                        else:
                            item['serviceItemName'] = titleNode[0]
                        desnodes = itemNode.xpath('.//p')
                        des = ''
                        for p in desnodes:
                            des = des + Extract.extractNodeText(p)
                        if '' == des:
                            continue
                        item['serviceItemDesc'] = des
                        yield item
        # YTO page bug: "欧洲海外仓服务" (Europe overseas-warehouse service).
        # NOTE(review): presumably the site serves wrong content for that
        # service, hence these hard-coded items -- confirm.  They are yielded
        # on every call, not only once.
        item = ServiceItem()
        item['serviceName'] = prefix + '欧洲海外仓服务'
        item['serviceItemName'] = '业务介绍'
        item[
            'serviceItemDesc'] = '针对地区特色经济产品推出全新服务——特色经济产品个性化解决方案,通过“快递+电商”模式,打造“销售”、“运输”、“鲜配”一站式销售配送服务体系,整合圆通空运、陆运、冷链、仓储资源,利用国家工程实验室研发优势,为客户提供安全、高效、智能的快递运输服务'
        yield item
        item['serviceItemName'] = '服务品类'
        # The continuation lines' leading spaces are part of this string value.
        item['serviceItemDesc'] = '''1、生鲜产品:肉类、海鲜类;
                                    2、特殊包装产品:酒类、蛋类、鲜花类;
                                    3、水果产品;
                                    4、特产礼盒、节日礼盒产品;
                                    5、初级农产品:红薯、土豆、大蒜、药材等。'''
        yield item
        item['serviceItemName'] = '咨询方式'
        item['serviceItemDesc'] = '联系邮箱:[email protected]'
        yield item
コード例 #8
0
    def parse(self, response):
        """Parse ZTO Express home, link-index, cloud-warehouse and service pages.

        Four kinds of response are handled:

        * ``self.home_page``: yields a ``TypeItem`` per service found in the
          business menu, then requests ``self.link``;
        * ``self.link``: requests every ``/business/...html`` path found in
          the raw page body;
        * a URL in ``self.cloudChamberurls``: yields ``ServiceItem`` objects
          for the cloud-warehouse service, with one layout per URL;
        * any other URL: yields one ``ServiceItem`` per ``business-box`` div
          that has non-empty text.

        Within each branch a single item object is re-filled and re-yielded.
        """
        prefix = '中通快递-'
        # Home page: find the types.
        if response.url == self.home_page:
            typeitem = TypeItem()
            # The business menu's top-level groups.
            typeNodes = response.xpath('//li[@class = "business"]/div/div')
            for typeNode in typeNodes:
                service_nodes = typeNode.xpath('./div/div')
                for service_node in service_nodes:
                    temp = service_node.xpath('./em/text()').extract()[0]
                    if temp == '\xa0\xa0':
                        # A blank heading is filed under international parcels.
                        typeitem['typeName'] = prefix + '国际件'
                    elif temp == '仓储业务':
                        # Warehousing gets a single fixed service; the rest of
                        # this group is not scanned.
                        typeitem['typeName'] = prefix + temp
                        typeitem['serviceName'] = prefix + '中通云仓'
                        yield typeitem
                        break
                    else:
                        typeitem['typeName'] = prefix + temp

                    herfs = service_node.xpath('.//a')
                    for h in herfs:
                        temp = h.xpath('./text()').extract()[0].strip()
                        if temp[:2] == 'To':
                            typeitem['serviceName'] = prefix + 'Toll Global Express(DPEX)'
                        elif temp == "开放平台" or temp == "快递管家":
                            # Stops scanning the remaining links of this node.
                            break
                        else:
                            typeitem['serviceName'] = prefix + temp
                        yield typeitem
            yield scrapy.Request(self.link, callback=self.parse)
        elif response.url == self.link:
            body = str(response.body, 'utf-8')
            # Raw string: '/' needs no escaping, and '\/' is an invalid
            # escape sequence in a normal string literal.
            regex = re.compile(r'/business/.*?html')
            links = regex.findall(body)
            for link in links:
                new_full_url = urllib.parse.urljoin('https://www.zto.com', link)
                yield scrapy.Request(new_full_url, callback=self.parse)
        elif response.url in self.cloudChamberurls:
            print('提取信息:', response.url)
            serviceItem = ServiceItem()
            if response.url == self.cloudChamberurls[0]:
                serviceItem['serviceName'] = prefix + '中通云仓'
                pnodes = response.xpath('//div[@class = "business-content"]//p')
                text = ''
                for p in pnodes:
                    # extract() returns a list, which never equals a string;
                    # compare the first (or missing) class value instead.
                    p_class = p.xpath('./@class').extract_first()
                    if p_class == 'phone-number':
                        serviceItem['serviceItemName'] = '咨询热线'
                        serviceItem['serviceItemDesc'] = p.xpath('./em/text()').extract()[0]
                        yield serviceItem
                    elif p_class == 'address':
                        serviceItem['serviceItemName'] = '云仓地址'
                        serviceItem['serviceItemDesc'] = p.xpath('./span/text()').extract()[0]
                        yield serviceItem
                    else:
                        text = text + p.xpath('./text()').extract()[0]
                # Remaining paragraphs together form the introduction.
                serviceItem['serviceItemName'] = '云仓介绍'
                serviceItem['serviceItemDesc'] = text
                yield serviceItem
            elif response.url == self.cloudChamberurls[1]:
                serviceItem['serviceName'] = prefix + '中通云仓'
                div1 = response.xpath('//div[@class ="business-box-detail"]')[0]
                div2 = response.xpath('//div[@class ="our-service-value"]')[0]
                for node in div1.xpath('.//dd'):
                    serviceItem['serviceItemName'] = node.xpath('./strong/text()').extract()[0]
                    serviceItem['serviceItemDesc'] = node.xpath('./p/text()').extract()[0]
                    yield serviceItem
                for node in div2.xpath('.//dd'):
                    serviceItem['serviceItemName'] = '服务价值-' + node.xpath('./strong/text()').extract()[0]
                    serviceItem['serviceItemDesc'] = node.xpath('./p/text()').extract()[0]
                    yield serviceItem
            elif response.url == self.cloudChamberurls[2]:
                serviceItem['serviceName'] = prefix + '中通云仓'
                serviceItem['serviceItemName'] = '服务范围'
                serviceItem['serviceItemDesc'] = response.xpath('//*[@id="content"]/div/div[2]/div/div/div/div[1]/p/text()').extract()[0]
                yield serviceItem
            else:
                serviceItem['serviceName'] = prefix + '中通云仓'
                serviceItem['serviceItemName'] = '退仓保障'
                serviceItem['serviceItemDesc'] = response.xpath('//*[@id="content"]/div/div[2]/div/div[1]/div[2]/div/p/text()').extract()[0]
                yield serviceItem
                nodes = response.xpath('.//dd')
                for node in nodes:
                    serviceItem['serviceItemName'] = '优势-' + node.xpath('./strong/text()').extract()[0]
                    serviceItem['serviceItemDesc'] = node.xpath('./p/text()').extract()[0]
                    yield serviceItem
        else:
            print('提取信息:', response.url)
            serviceItem = ServiceItem()
            serviceItem['serviceName'] = prefix + response.xpath('//h2[@class = "business-title"]/span/text()').extract()[0]
            serviceItemNodes = response.xpath('//div[@class = "business-box"]')
            for s in serviceItemNodes:
                # Name of this service item.
                serviceItem['serviceItemName'] = s.xpath('./strong/text()').extract()[0]
                textNodes = s.xpath('.//div[@class="business-box-text"]/*')
                summary_text = ''
                for node in textNodes:
                    text = self.extractNodeText(node)
                    if text != '':
                        summary_text = summary_text + text + ' '
                # Boxes without any text are not emitted.
                if summary_text != '':
                    serviceItem['serviceItemDesc'] = summary_text
                    yield serviceItem