def parse_book_list(self, response):
    """Parse a book-list page and yield one detail-page request per match.

    Only <li> entries whose <p> text contains '龙马' are kept (earlier
    selector variants are preserved in the comments below).  For each
    match a partially filled BookscrapyItem is forwarded to
    parse_book_info through the request meta.
    """
    # Alternative selectors from earlier iterations, kept for reference:
    #   //*[@class='man_first']/ul/li                         -> all books
    #   //*[@class='man_first']/ul/li[contains(h1/a/@color,'#FF0000')]
    #                                                         -> red "hot" books
    pages = response.xpath(
        u"//*[@class='man_first']/ul/li[contains(p/text(),'龙马')]")
    # No emptiness check needed: iterating an empty selector list is a no-op.
    for page in pages:
        # Create the item inside the loop.  A single shared item would be
        # overwritten by later iterations before the detail responses
        # come back (original note: 注意item的生成时机,不可过早).
        item = BookscrapyItem()
        item['topclass'] = response.xpath(
            "normalize-space(//*[@class='listcd']/a[last()-1]/text())"
        ).extract_first()
        item['bookclass'] = response.xpath(
            "normalize-space(//*[@class='listcd']/a[last()]/text())"
        ).extract_first()
        item['bookname'] = page.xpath(
            "normalize-space(./h1/a/@title)").extract_first()
        item['author'] = page.xpath(
            "normalize-space(./h3/text())").extract_first()
        # Extract the link once; the original called extract_first() twice
        # and crashed in re.sub(None) when the href was missing.
        href = page.xpath("./h1/a/@href").extract_first()
        if href is None:
            continue  # malformed entry without a detail link
        item['downid'] = re.sub(r'\D', "", href)
        item['datasize'] = page.xpath(
            "normalize-space(./h4/text())").extract_first()
        book_url = self.start_urls[0] + href
        yield scrapy.Request(book_url,
                             callback=self.parse_book_info,
                             meta={'item': item})
def parse_book_list(self, response):
    """Parse a book-list page and follow every book's detail link.

    NOTE(review): this file defines parse_book_list twice; Python keeps
    only the later definition, so one of the two variants is dead code —
    confirm which is intended and remove the other.

    Unlike the filtered variant, this version takes every <li> entry and
    pre-fills only topclass/author; parse_book_info fills in the rest via
    the item carried in the request meta.
    """
    pages = response.xpath("//*[@class='man_first']/ul/li")
    for page in pages:
        # Fresh item per book so concurrently returning responses do not
        # clobber each other's data.
        item = BookscrapyItem()
        item['topclass'] = response.xpath(
            "//*[@class='listcd']/a[last()-1]/text()").extract_first()
        item['author'] = page.xpath("./h3/text()").extract_first()
        # The original wrapped extract_first() in str(), which turns a
        # missing href into the bogus URL ".../None"; skip such entries.
        href = page.xpath("./h1/a/@href").extract_first()
        if href is None:
            continue
        # NOTE(review): base URL is hard-coded here but the sibling
        # variant uses self.start_urls[0] — confirm they agree.
        book_url = "http://m.bookbao.cc" + href
        yield scrapy.Request(book_url,
                             callback=self.parse_book_info,
                             meta={'item': item})
def parse_book_info(self, response):
    """Parse a book detail page and yield the completed item.

    Continues filling the BookscrapyItem carried over from
    parse_book_list via response.meta (topclass and author were already
    set on the list page).
    """
    # BUG FIX: the original immediately re-assigned
    # item = BookscrapyItem() after reading it from meta, discarding the
    # fields collected on the list page and then raising KeyError on the
    # never-set 'author' key in the debug print below.
    item = response.meta['item']
    item['bookname'] = response.xpath(
        "//*[@class='mlist']/h1/text()").extract_first()
    # Author comes from the list page; the detail-page field
    # (//*[@class='mlist']/ul/li[1]) did not always extract cleanly
    # (original note: 作者名称有些导不出).
    item['introduction'] = response.xpath(
        "//*[@class='conten']/p").xpath('string(.)').extract_first()
    item['remarks'] = response.xpath(
        "//*[@class='conten']/p/span[1]/text()").extract_first()
    item['datasize'] = response.xpath(
        "//*[@class='mlist']/ul/li[3]/text()").extract_first()
    item['bookclass'] = response.xpath(
        "normalize-space(//*[@class='mlist']/ul/li[2]/text())"
    ).extract_first()
    # str() guards against a missing href (None) before stripping
    # non-digits to obtain the numeric download id.
    item['downid'] = re.sub(
        r"\D", "",
        str(response.xpath(
            "//*[@class='mlist']/a[1]/@href").extract_first()))
    # Debug trace; %s-formatting tolerates None fields, whereas the
    # original '+' concatenation raised TypeError on any missing value.
    # (Single-argument print() parses under both Python 2 and 3.)
    print("%s\n\r%s\n\r%s\n\r" % (
        item['bookname'], item['author'], item['introduction']))
    yield item