def parse_book_list(self, response):
    """Parse a book-list page into list items and detail-page requests.

    For each ``div`` with class ``book`` a ``YunqiBookListItem`` is yielded,
    followed by a ``scrapy.Request`` for the book's link whose callback is
    ``self.parse_book_detail``; the book id is passed along in
    ``request.meta['novelId']``.

    Args:
        response: page response; only its ``xpath()`` method is used.

    Yields:
        ``YunqiBookListItem`` objects and ``scrapy.Request`` objects.
    """
    for book in response.xpath(".//div[@class='book']"):
        novelImageUrl = book.xpath("./a/img/@src").extract_first()
        novelId = book.xpath(
            "./div[@class='book_info']/h3/a/@id").extract_first()
        novelName = book.xpath(
            "./div[@class='book_info']/h3/a/text()").extract_first()
        novelLink = book.xpath(
            "./div[@class='book_info']/h3/a/@href").extract_first()

        # The metadata cells are read by position, so they are only used
        # when at least five are present; otherwise placeholders are stored.
        novelInfos = book.xpath(
            "./div[@class='book_info']/dl/dd[@class='w_auth']")
        if len(novelInfos) > 4:
            novelAuthor = novelInfos[0].xpath('./a/text()').extract_first()
            novelType = novelInfos[1].xpath('./a/text()').extract_first()
            novelStatus = novelInfos[2].xpath('./text()').extract_first()
            novelUpdateTime = novelInfos[3].xpath('./text()').extract_first()
            novelWords = novelInfos[4].xpath('./text()').extract_first()
        else:
            novelAuthor = ''
            novelType = ''
            novelStatus = ''
            novelUpdateTime = ''
            novelWords = 0

        bookListItem = YunqiBookListItem(
            novelId=novelId,
            novelName=novelName,
            novelLink=novelLink,
            novelAuthor=novelAuthor,
            novelType=novelType,
            novelStatus=novelStatus,
            novelUpdateTime=novelUpdateTime,
            novelWords=novelWords,
            novelImageUrl=novelImageUrl,
        )
        yield bookListItem

        # extract_first() yields None when the title link is absent and
        # scrapy.Request rejects such a url; skip only the detail request
        # so the remaining books on the page are still processed.
        if not novelLink:
            continue
        request = scrapy.Request(url=novelLink,
                                 callback=self.parse_book_detail)
        request.meta['novelId'] = novelId
        yield request
def parse_book_list(self, response):
    """Parse a book-list page into list items and detail-page requests.

    Each ``div`` with class ``book`` becomes one ``YunqiBookListItem``.
    Fields that cannot be found on the page are stored as empty strings
    instead of raising ``IndexError``, so one incomplete entry no longer
    aborts parsing of the rest of the page.

    Args:
        response: page response; only its ``xpath()`` method is used.

    Yields:
        ``YunqiBookListItem`` objects, each followed (when the book has a
        link) by a ``scrapy.Request`` handled by ``self.parse_book_detail``.
    """
    for node in response.xpath('//div[@class="book"]'):
        bookListItem = YunqiBookListItem()

        # The id attribute is split on "_" and only the last piece is kept.
        raw_id = node.xpath(
            './/div[@class="book_info"]/h3/a/@id').extract_first(default=u'')
        bookListItem['novelId'] = raw_id.split("_")[-1]

        bookListItem['novelName'] = node.xpath(
            './/div[@class="book_info"]/h3/a/text()'
        ).extract_first(default=u'')
        bookListItem['novelLink'] = node.xpath(
            './/div[@class="book_info"]/h3/a/@href'
        ).extract_first(default=u'')
        bookListItem['novelAuthor'] = node.xpath(
            './/div[@class="book_info"]/dl[1]/dd[1]/a/text()'
        ).extract_first(default=u'')
        bookListItem['novelType'] = node.xpath(
            './/div[@class="book_info"]/dl[1]/dd[2]/a/text()'
        ).extract_first(default=u'')
        bookListItem['novelStatus'] = node.xpath(
            './/div[@class="book_info"]/dl[1]/dd[3]/text()'
        ).extract_first(default=u'')
        bookListItem['novelUpdateTime'] = node.xpath(
            './/div[@class="book_info"]/dl[2]/dd[1]/text()'
        ).extract_first(default=u'')
        bookListItem['novelWords'] = node.xpath(
            './/div[@class="book_info"]/dl[2]/dd[2]/text()'
        ).extract_first(default=u'')

        # The scheme is prepended to the image src; a missing image stays
        # empty rather than becoming a bare "http:".
        image_src = node.xpath('./a/img/@src').extract_first()
        bookListItem['novelImageUrl'] = (
            u'http:' + image_src if image_src else u'')

        yield bookListItem

        # Without a link there is no detail page to request.
        if not bookListItem['novelLink']:
            continue
        request = scrapy.Request(url=str(bookListItem['novelLink']),
                                 callback=self.parse_book_detail)
        # Handed to the callback, which can read it from response.meta.
        request.meta['novelId'] = bookListItem['novelId']
        yield request
def parse_book_list(self, response):
    """Yield one list item and one detail request per book on the page.

    Args:
        response: page response; only its ``xpath()`` method is used.

    Yields:
        A ``YunqiBookListItem`` holding the basic information of each book,
        then a ``scrapy.Request`` for that book's link (callback
        ``self.parse_book_detail``, book id in ``meta['novelId']``).
    """
    # "." starts at the current node and "//" matches at any depth, so this
    # selects every div whose class is "book"; ".//*[@class='book']" selects
    # the same nodes on this page.
    books = response.xpath(".//div[@class='book']")
    for book in books:
        # Paths starting with "./" descend level by level from the book node.
        novelImageUrl = book.xpath("./a/img/@src").extract_first()
        novelId = book.xpath(
            "./div[@class='book_info']/h3/a/@id").extract_first()
        novelName = book.xpath(
            "./div[@class='book_info']/h3/a/text()").extract_first()
        novelLink = book.xpath(
            "./div[@class='book_info']/h3/a/@href").extract_first()

        # NOTE: the tag is "dl" (letter l), not "d1" (digit one).
        # Some books lack author/status/etc., so the positional cells are
        # only read when at least five are present.
        novelInfos = book.xpath(
            "./div[@class='book_info']/dl/dd[@class='w_auth']")
        if len(novelInfos) > 4:
            novelAuthor = novelInfos[0].xpath("./a/text()").extract_first()
            novelType = novelInfos[1].xpath("./a/text()").extract_first()
            novelStatus = novelInfos[2].xpath("./text()").extract_first()
            novelUpdateTime = novelInfos[3].xpath("./text()").extract_first()
            novelWords = novelInfos[4].xpath("./text()").extract_first()
        else:
            novelAuthor = ''
            novelType = ''
            novelStatus = ''
            novelUpdateTime = ''
            novelWords = 0

        # Basic information of one book from the list.
        yield YunqiBookListItem(
            novelId=novelId,
            novelName=novelName,
            novelLink=novelLink,
            novelAuthor=novelAuthor,
            novelType=novelType,
            novelStatus=novelStatus,
            novelUpdateTime=novelUpdateTime,
            novelWords=novelWords,
            novelImageUrl=novelImageUrl,
        )

        # extract_first() yields None when the link is absent and
        # scrapy.Request rejects such a url; skip only the detail request
        # so later books on the page are not lost.
        if not novelLink:
            continue
        # Follow the book's link to parse its detail page.
        yield scrapy.Request(url=novelLink,
                             meta={'novelId': novelId},
                             callback=self.parse_book_detail)
def parse_book_list(self, response):
    """Parse a book-list page into list items and detail-page requests.

    Args:
        response: page response; only its ``xpath()`` method is used.

    Yields:
        A ``YunqiBookListItem`` per ``div`` with class ``book``, followed
        (when the book has a link) by a ``scrapy.Request`` whose callback is
        ``self.parse_book_detail`` and whose ``meta['novelId']`` carries
        the book id.
    """
    for book in response.xpath('.//div[@class="book"]'):
        novelId = book.xpath(
            './div[@class="book_info"]/h3/a/@id').extract_first()
        novelImageUrl = book.xpath('./a/img/@src').extract_first()
        novelLink = book.xpath(
            './div[@class="book_info"]/h3/a/@href').extract_first()
        novelTitle = book.xpath(
            './div[@class="book_info"]/h3/a/text()').extract_first()

        # Metadata cells are read by position, so they are only used when
        # at least five are present; otherwise empty strings are stored.
        novelInfos = book.xpath(
            './div[@class="book_info"]/dl/dd[@class="w_auth"]')
        if len(novelInfos) > 4:
            novelAuthor = novelInfos[0].xpath('./a/text()').extract_first()
            novelTypeB = novelInfos[1].xpath('./a/text()').extract_first()
            novelStatus = novelInfos[2].xpath('./text()').extract_first()
            novelUpdateTime = novelInfos[3].xpath('./text()').extract_first()
            novelWordsCount = novelInfos[4].xpath('./text()').extract_first()
        else:
            novelAuthor = ''
            novelTypeB = ''
            novelStatus = ''
            novelUpdateTime = ''
            novelWordsCount = ''

        yield YunqiBookListItem(
            novelId=novelId,
            title=novelTitle,
            link=novelLink,
            author=novelAuthor,
            status=novelStatus,
            updateTime=novelUpdateTime,
            wordsCount=novelWordsCount,
            novelType=novelTypeB,
            imageUrl=novelImageUrl,
        )

        # extract_first() yields None when the link is absent and
        # scrapy.Request rejects such a url; skip only the detail request
        # so the remaining books on the page are still processed.
        if not novelLink:
            continue
        newRequest = scrapy.Request(url=novelLink,
                                    callback=self.parse_book_detail)
        # Single pre-formatted argument: prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print('send request %s' % novelLink)
        newRequest.meta['novelId'] = novelId
        yield newRequest
def parse_book_list(self, response):
    """Parse a book-list page into list items and detail-page requests.

    In this variant the book id comes from the ``bid`` attribute of the
    second link inside ``h3/em`` and the detail link from the book node's
    own ``a/@href``.

    Args:
        response: page response; only its ``xpath()`` method is used.

    Yields:
        A ``YunqiBookListItem`` per ``div`` with class ``book``, followed
        (when the book has a link) by a ``scrapy.Request`` whose callback is
        ``self.parse_book_detail`` and whose ``meta['novelId']`` carries
        the book id.
    """
    for book in response.xpath('.//div[@class="book"]'):
        novelImageUrl = book.xpath('./a/img/@src').extract_first()
        novelId = book.xpath(
            'div[@class="book_info"]/h3/em/a[2]/@bid').extract_first()
        novelName = book.xpath(
            'div[@class="book_info"]/h3/a/text()').extract_first()
        # Single pre-formatted argument: prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print('%s %s' % (novelId, novelName))
        novelLink = book.xpath('a/@href').extract_first()

        novelInfos = book.xpath(
            './div[@class="book_info"]/dl/dd[@class="w_auth"]')
        # Arguments are passed separately so formatting is left to logging.
        logging.info("novelInfos lenth:%s", len(novelInfos))

        # Metadata cells are read by position, so they are only used when
        # at least five are present; otherwise empty strings are stored.
        if len(novelInfos) > 4:
            novelAuthor = novelInfos[0].xpath('./a/text()').extract_first()
            novelType = novelInfos[1].xpath('./a/text()').extract_first()
            novelStatus = novelInfos[2].xpath('./text()').extract_first()
            novelUpdateTime = novelInfos[3].xpath('./text()').extract_first()
            novelWords = novelInfos[4].xpath('./text()').extract_first()
        else:
            novelAuthor = ""
            novelType = ""
            novelStatus = ""
            novelUpdateTime = ""
            novelWords = ""

        yield YunqiBookListItem(
            novelId=novelId,
            novelName=novelName,
            novelLink=novelLink,
            novelAuthor=novelAuthor,
            novelType=novelType,
            novelStatus=novelStatus,
            novelUpdateTime=novelUpdateTime,
            novelWords=novelWords,
            novelImageUrl=novelImageUrl,
        )

        # extract_first() yields None when the link is absent and
        # scrapy.Request rejects such a url; skip only the detail request
        # so the remaining books on the page are still processed.
        if not novelLink:
            continue
        yield scrapy.Request(url=novelLink,
                             meta={'novelId': novelId},
                             callback=self.parse_book_detail)
def parse_book_list(self, response):
    """Parse a book-list page into list items and detail-page requests.

    Metadata is read by fixed position (``dl[1]/dd[1]`` and so on) with
    ``extract_first()``, so a field that is not found is stored as ``None``.

    Args:
        response: page response; only its ``xpath()`` method is used.

    Yields:
        A ``YunqiBookListItem`` per element with class ``book``, followed
        (when the book has a link) by a ``scrapy.Request`` whose callback is
        ``self.parse_book_detail`` and whose ``meta['novelId']`` carries
        the book id.
    """
    for book in response.xpath(".//*[@class = 'book']"):
        novelId = book.xpath(
            ".//*[@class='book_info']/h3/a/@id").extract_first()
        novelName = book.xpath(
            ".//*[@class='book_info']/h3/a/text()").extract_first()
        novelLink = book.xpath(
            ".//*[@class='book_info']/h3/a/@href").extract_first()
        novelAuthor = book.xpath(
            ".//*[@class='book_info']/dl[1]/dd[1]/a/text()").extract_first()
        novelType = book.xpath(
            ".//*[@class='book_info']/dl[1]/dd[2]/a/text()").extract_first()
        novelStatus = book.xpath(
            ".//*[@class='book_info']/dl[1]/dd[3]/text()").extract_first()
        novelUpdateTime = book.xpath(
            ".//*[@class='book_info']/dl[2]/dd[1]/text()").extract_first()
        novelWords = book.xpath(
            ".//*[@class='book_info']/dl[2]/dd[2]/text()").extract_first()
        novelImgUrl = book.xpath("./a/img/@src").extract_first()

        yield YunqiBookListItem(
            novelId=novelId,
            novelName=novelName,
            novelLink=novelLink,
            novelAuthor=novelAuthor,
            novelType=novelType,
            novelStatus=novelStatus,
            novelUpdateTime=novelUpdateTime,
            novelWords=novelWords,
            novelImgUrl=novelImgUrl,
        )

        # scrapy.Request rejects a None url; skip only the detail request
        # so the remaining books on the page are still processed.
        if not novelLink:
            continue
        request = scrapy.Request(url=novelLink,
                                 callback=self.parse_book_detail)
        request.meta['novelId'] = novelId
        yield request