def parse_book_list(self, response):
    """Yield an item and a detail-page request for each book on a list page.

    Every ``div`` with class ``book`` found in ``response`` produces a
    ``YunqiBookListItem`` followed by a ``scrapy.Request`` for the book's
    link.  The request is handled by ``self.parse_book_detail`` and carries
    the book id in ``meta['novelId']``.
    """
    for entry in response.xpath(".//div[@class='book']"):
        cover_src = entry.xpath("./a/img/@src").extract_first()
        book_id = entry.xpath(
            "./div[@class='book_info']/h3/a/@id").extract_first()
        title = entry.xpath(
            "./div[@class='book_info']/h3/a/text()").extract_first()
        detail_url = entry.xpath(
            "./div[@class='book_info']/h3/a/@href").extract_first()

        # Defaults that stay in place unless the entry has more than four
        # ``w_auth`` cells to read the details from.
        author = ''
        category = ''
        status = ''
        updated = ''
        word_count = 0

        cells = entry.xpath(
            "./div[@class='book_info']/dl/dd[@class='w_auth']")
        if len(cells) > 4:
            # The first two cells hold their text inside a link; the
            # remaining three hold it directly.
            author = cells[0].xpath('./a/text()').extract_first()
            category = cells[1].xpath('./a/text()').extract_first()
            status = cells[2].xpath('./text()').extract_first()
            updated = cells[3].xpath('./text()').extract_first()
            word_count = cells[4].xpath('./text()').extract_first()

        yield YunqiBookListItem(
            novelId=book_id,
            novelName=title,
            novelLink=detail_url,
            novelAuthor=author,
            novelType=category,
            novelStatus=status,
            novelUpdateTime=updated,
            novelWords=word_count,
            novelImageUrl=cover_src,
        )

        # Follow the book's own page, passing its id along in the meta dict.
        detail_request = scrapy.Request(
            url=detail_url, callback=self.parse_book_detail)
        detail_request.meta['novelId'] = book_id
        yield detail_request
Example #2
0
    def parse_book_list(self, response):
        """Yield an item and a detail-page request for each listed book.

        Each ``div`` with class ``book`` in ``response`` is turned into a
        ``YunqiBookListItem``.  Fields are read with
        ``extract_first(default=u'')`` so that a book lacking one of the
        expected nodes gets an empty string for that field instead of
        aborting the whole page with ``IndexError``.  A ``scrapy.Request``
        for ``self.parse_book_detail`` follows every item that has a link.
        """
        # (item field, XPath relative to the book node), in assignment order.
        text_fields = (
            ('novelName', './/div[@class="book_info"]/h3/a/text()'),
            ('novelLink', './/div[@class="book_info"]/h3/a/@href'),
            ('novelAuthor',
             './/div[@class="book_info"]/dl[1]/dd[1]/a/text()'),
            ('novelType',
             './/div[@class="book_info"]/dl[1]/dd[2]/a/text()'),
            ('novelStatus',
             './/div[@class="book_info"]/dl[1]/dd[3]/text()'),
            ('novelUpdateTime',
             './/div[@class="book_info"]/dl[2]/dd[1]/text()'),
            ('novelWords',
             './/div[@class="book_info"]/dl[2]/dd[2]/text()'),
        )

        for node in response.xpath('//div[@class="book"]'):
            bookListItem = YunqiBookListItem()

            # Only the part after the last underscore of the id attribute
            # is stored.
            raw_id = node.xpath(
                './/div[@class="book_info"]/h3/a/@id').extract_first(
                    default=u'')
            bookListItem['novelId'] = raw_id.split("_")[-1]

            for field, query in text_fields:
                bookListItem[field] = node.xpath(query).extract_first(
                    default=u'')

            # The src value gets an explicit scheme prefix; without an
            # image the field is left empty rather than set to a bare
            # "http:".
            image_src = node.xpath('./a/img/@src').extract_first()
            if image_src is not None:
                bookListItem['novelImageUrl'] = u'http:' + image_src
            else:
                bookListItem['novelImageUrl'] = u''

            yield bookListItem

            novel_link = bookListItem['novelLink']
            if not novel_link:
                # No link was found, so there is no detail page to request.
                continue

            request = scrapy.Request(url=str(novel_link),
                                     callback=self.parse_book_detail)
            # Handed to parse_book_detail through response.meta.
            request.meta['novelId'] = bookListItem['novelId']
            yield request
    def parse_book_list(self, response):
        """Yield an item and a detail-page request for each book on the page.

        For every ``div`` with class ``book`` a ``YunqiBookListItem`` holding
        the book's basic data is yielded, followed by a ``scrapy.Request``
        for the book's link.  The request is handled by
        ``self.parse_book_detail`` and carries the id in ``meta['novelId']``.

        Author's note: a detailed walk-through of the parsing approach is in
        the ch00 supplementary notes, item 04 "web page parsing
        verification" (yunqishuyuan1_Spider.py).
        """
        # Per the original notes, ".//div[@class='book']" and
        # ".//*[@class='book']" select the same nodes on this page: "." is
        # the current node, "//" matches at any depth, and "*" stands for
        # any tag name.
        for entry in response.xpath(".//div[@class='book']"):
            # "./" starts at the current book node; every further "/" steps
            # one level down.
            cover_src = entry.xpath("./a/img/@src").extract_first()
            book_id = entry.xpath(
                "./div[@class='book_info']/h3/a/@id").extract_first()
            title = entry.xpath(
                "./div[@class='book_info']/h3/a/text()").extract_first()
            detail_url = entry.xpath(
                "./div[@class='book_info']/h3/a/@href").extract_first()

            # The tag is "dl" (letter l), not "d1" (digit one).  Per the
            # original notes, class 'w_auth' is unique to these cells, so
            # "./div[@class='book_info']//dl//dd[@class='w_auth']" would
            # give the same result.
            cells = entry.xpath(
                "./div[@class='book_info']/dl/dd[@class='w_auth']")

            # Some books have no author/status data, so the details are only
            # read when all of the cells are present.
            if len(cells) <= 4:
                author, category, status, updated = '', '', '', ''
                word_count = 0
            else:
                author = cells[0].xpath("./a/text()").extract_first()
                category = cells[1].xpath("./a/text()").extract_first()
                status = cells[2].xpath("./text()").extract_first()
                updated = cells[3].xpath("./text()").extract_first()
                word_count = cells[4].xpath("./text()").extract_first()

            # Basic information about one book of the list.
            yield YunqiBookListItem(
                novelId=book_id,
                novelName=title,
                novelLink=detail_url,
                novelAuthor=author,
                novelType=category,
                novelStatus=status,
                novelUpdateTime=updated,
                novelWords=word_count,
                novelImageUrl=cover_src,
            )

            # Follow the book's own page to parse its detailed information.
            yield scrapy.Request(
                url=detail_url,
                meta={'novelId': book_id},
                callback=self.parse_book_detail,
            )
Example #4
0
 def parse_book_list(self, response):
     """Yield an item and a detail-page request for each book on the page.

     For every ``div`` with class ``book`` a ``YunqiBookListItem`` is
     yielded, followed by a ``scrapy.Request`` for the book's link.  The
     request is handled by ``self.parse_book_detail`` and carries the id
     in ``meta['novelId']``.  The link of each request is also printed to
     standard output.
     """
     books = response.xpath('.//div[@class="book"]')
     for book in books:
         novelId = book.xpath(
             './div[@class="book_info"]/h3/a/@id').extract_first()
         novelImageUrl = book.xpath('./a/img/@src').extract_first()
         novelLink = book.xpath(
             './div[@class="book_info"]/h3/a/@href').extract_first()
         novelTitle = book.xpath(
             './div[@class="book_info"]/h3/a/text()').extract_first()
         novelInfos = book.xpath(
             './div[@class="book_info"]/dl/dd[@class="w_auth"]')

         if len(novelInfos) > 4:
             novelAuthor = novelInfos[0].xpath('./a/text()').extract_first()
             novelTypeB = novelInfos[1].xpath('./a/text()').extract_first()
             novelStatus = novelInfos[2].xpath('./text()').extract_first()
             novelUpdateTime = novelInfos[3].xpath(
                 './text()').extract_first()
             novelWordsCount = novelInfos[4].xpath(
                 './text()').extract_first()
         else:
             # Fewer cells than expected: leave every detail empty.
             novelAuthor = ''
             novelTypeB = ''
             novelStatus = ''
             novelUpdateTime = ''
             novelWordsCount = ''

         bookItem = YunqiBookListItem(novelId=novelId,
                                      title=novelTitle,
                                      link=novelLink,
                                      author=novelAuthor,
                                      status=novelStatus,
                                      updateTime=novelUpdateTime,
                                      wordsCount=novelWordsCount,
                                      novelType=novelTypeB,
                                      imageUrl=novelImageUrl)
         yield bookItem

         newRequest = scrapy.Request(url=novelLink,
                                     callback=self.parse_book_detail)
         # A single pre-formatted argument keeps the printed text the same
         # whether print is the Python 2 statement or the Python 3 function.
         print('send request %s' % novelLink)
         newRequest.meta['novelId'] = novelId
         yield newRequest
Example #5
0
    def parse_book_list(self, response):
        """Yield an item and a detail-page request for each book on the page.

        For every ``div`` with class ``book`` a ``YunqiBookListItem`` is
        yielded, followed by a ``scrapy.Request`` for the book's link.  The
        request is handled by ``self.parse_book_detail`` and carries the id
        in ``meta['novelId']``.  The id and name of each book are printed to
        standard output and the number of detail cells is logged.
        """
        books = response.xpath('.//div[@class="book"]')
        for book in books:
            novelImageUrl = book.xpath('./a/img/@src').extract_first()
            novelId = book.xpath(
                'div[@class="book_info"]/h3/em/a[2]/@bid').extract_first()
            novelName = book.xpath(
                'div[@class="book_info"]/h3/a/text()').extract_first()
            # A single pre-formatted argument keeps the printed text the
            # same whether print is the Python 2 statement or the Python 3
            # function.
            print('%s %s' % (novelId, novelName))

            novelLink = book.xpath('a/@href').extract_first()
            novelInfos = book.xpath(
                './div[@class="book_info"]/dl/dd[@class="w_auth"]')
            # Let logging do the formatting, and only when the record is
            # actually emitted.
            logging.info("novelInfos lenth:%s", len(novelInfos))

            if len(novelInfos) > 4:
                novelAuthor = novelInfos[0].xpath('./a/text()').extract_first()
                novelType = novelInfos[1].xpath('./a/text()').extract_first()
                novelStatus = novelInfos[2].xpath('./text()').extract_first()
                novelUpdateTime = novelInfos[3].xpath(
                    './text()').extract_first()
                novelWords = novelInfos[4].xpath('./text()').extract_first()
            else:
                # Fewer cells than expected: leave every detail empty.
                novelAuthor = ""
                novelType = ""
                novelStatus = ""
                novelUpdateTime = ""
                novelWords = ""

            booklistItem = YunqiBookListItem(novelId=novelId,
                                             novelName=novelName,
                                             novelLink=novelLink,
                                             novelAuthor=novelAuthor,
                                             novelType=novelType,
                                             novelStatus=novelStatus,
                                             novelUpdateTime=novelUpdateTime,
                                             novelWords=novelWords,
                                             novelImageUrl=novelImageUrl)
            yield booklistItem
            yield scrapy.Request(url=novelLink,
                                 meta={'novelId': novelId},
                                 callback=self.parse_book_detail)
Example #6
0
    def parse_book_list(self, response):
        """Yield an item and a detail-page request for each book on the page.

        Every element with class ``book`` in ``response`` produces a
        ``YunqiBookListItem`` followed by a ``scrapy.Request`` for the book's
        link.  The request is handled by ``self.parse_book_detail`` and
        carries the id in ``meta['novelId']``.  A field whose node is absent
        is passed on as ``None``, the ``extract_first()`` default.
        """
        # (item field, XPath relative to the book element), in the order the
        # values are extracted and handed to the item.
        field_queries = (
            ("novelId", ".//*[@class='book_info']/h3/a/@id"),
            ("novelName", ".//*[@class='book_info']/h3/a/text()"),
            ("novelLink", ".//*[@class='book_info']/h3/a/@href"),
            ("novelAuthor", ".//*[@class='book_info']/dl[1]/dd[1]/a/text()"),
            ("novelType", ".//*[@class='book_info']/dl[1]/dd[2]/a/text()"),
            ("novelStatus", ".//*[@class='book_info']/dl[1]/dd[3]/text()"),
            ("novelUpdateTime",
             ".//*[@class='book_info']/dl[2]/dd[1]/text()"),
            ("novelWords", ".//*[@class='book_info']/dl[2]/dd[2]/text()"),
            ("novelImgUrl", "./a/img/@src"),
        )

        for entry in response.xpath(".//*[@class = 'book']"):
            values = {}
            for field, query in field_queries:
                values[field] = entry.xpath(query).extract_first()

            yield YunqiBookListItem(**values)

            # Follow the book's own page, passing its id along in the meta
            # dict.
            detail_request = scrapy.Request(url=values["novelLink"],
                                            callback=self.parse_book_detail)
            detail_request.meta['novelId'] = values["novelId"]
            yield detail_request