Example 1
0
 def parse_book_list(self, response):  # book-title list page
     """Parse a book-list page: build one item per matching <li> entry
     and request its detail page (parse_book_info) with the item
     attached via request.meta.

     Only list entries whose <p> text contains '龙马' are kept.
     """
     # Alternative selectors kept for reference:
     #   all entries:       //*[@class='man_first']/ul/li
     #   red "hot" novels:  //*[@class='man_first']/ul/li[contains(h1/a/@color,'#FF0000')]
     pages = response.xpath(
         u"//*[@class='man_first']/ul/li[contains(p/text(),'龙马')]")
     # The category breadcrumb is page-level, not per-entry: hoist the
     # two XPath queries out of the loop so they run once per response.
     topclass = response.xpath(
         "normalize-space(//*[@class='listcd']/a[last()-1]/text())"
     ).extract_first()
     bookclass = response.xpath(
         "normalize-space(//*[@class='listcd']/a[last()]/text())"
     ).extract_first()
     # No emptiness guard needed: an empty selection simply iterates
     # zero times.
     for page in pages:
         # Create the item inside the loop — a single shared item would
         # be overwritten by later iterations before the detail-page
         # callbacks fire.
         item = BookscrapyItem()
         item['topclass'] = topclass
         item['bookclass'] = bookclass
         item['bookname'] = page.xpath(
             "normalize-space(./h1/a/@title)").extract_first()
         item['author'] = page.xpath(
             "normalize-space(./h3/text())").extract_first()
         # The numeric download id is embedded in the link href; strip
         # every non-digit character.
         item['downid'] = re.sub(
             r'\D', "",
             page.xpath("./h1/a/@href").extract_first())
         item['datasize'] = page.xpath(
             "normalize-space(./h4/text())").extract_first()
         book_url = self.start_urls[0] + page.xpath(
             "./h1/a/@href").extract_first()
         yield scrapy.Request(book_url,
                              callback=self.parse_book_info,
                              meta={'item': item})
Example 2
0
 def parse_book_list(self, response):
     """Parse a book-list page: for every <li> entry, create an item
     carrying the page-level category and the entry's author, then
     request the book's detail page (parse_book_info) with the item
     attached via request.meta.
     """
     pages = response.xpath("//*[@class='man_first']/ul/li")
     # The category is page-level and identical for every entry —
     # query it once instead of once per loop iteration.
     topclass = response.xpath(
         "//*[@class='listcd']/a[last()-1]/text()").extract_first()
     for page in pages:
         # One fresh item per entry; a shared item would be clobbered
         # by later iterations before the detail-page callbacks run.
         item = BookscrapyItem()
         item['topclass'] = topclass
         item['author'] = page.xpath("./h3/text()").extract_first()
         # str() guards against extract_first() returning None.
         book_url = "http://m.bookbao.cc" + str(
             page.xpath("./h1/a/@href").extract_first())
         yield scrapy.Request(book_url,
                              callback=self.parse_book_info,
                              meta={'item': item})
Example 3
0
 def parse_book_info(self, response):
     """Parse a book detail page and yield the completed item.

     Fills detail-page fields into the item passed along in
     request.meta by parse_book_list (which already set e.g. 'author').
     """
     # BUG FIX: the original immediately re-assigned
     # ``item = BookscrapyItem()`` after fetching the item from meta,
     # discarding every field collected on the list page and making the
     # ``item['author']`` lookup below raise KeyError.
     item = response.meta['item']
     item['bookname'] = response.xpath(
         "//*[@class='mlist']/h1/text()").extract_first()
     # 'author' comes from the list page; the detail-page selector
     # (//*[@class='mlist']/ul/li[1]/text()) fails to export for some
     # books, so it is deliberately not used here.
     item['introduction'] = response.xpath("//*[@class='conten']/p").xpath(
         'string(.)').extract_first()
     item['remarks'] = response.xpath(
         "//*[@class='conten']/p/span[1]/text()").extract_first()
     item['datasize'] = response.xpath(
         "//*[@class='mlist']/ul/li[3]/text()").extract_first()
     item['bookclass'] = response.xpath(
         "normalize-space(//*[@class='mlist']/ul/li[2]/text())"
     ).extract_first()
     # Download id: keep only the digits of the first download link;
     # str() guards against extract_first() returning None.
     item['downid'] = re.sub(
         r"\D", "",
         str(
             response.xpath(
                 "//*[@class='mlist']/a[1]/@href").extract_first()))
     # Parenthesized form prints identically under Python 2 and 3.
     print(item['bookname'] + "\n\r" + item['author'] + "\n\r" +
           item['introduction'] + "\n\r")
     yield item