def deal_page(self, link):
    """Parse a novel-listing page and yield one NovelItem per book row.

    Parameters
    ----------
    link : scrapy response/selector for the listing page (``.xpath`` is the
        only interface used here).

    Yields
    ------
    NovelItem with fields: name, url, type, author, pub_date, statue.
    """
    rows = link.xpath('//ul[@class="item-con"]//li')
    print('len:', len(rows))
    for row in rows:
        # Create a fresh item per row: the original reused one NovelItem
        # instance across iterations, so every yielded item aliased the
        # same object and fields were overwritten between yields.
        item = NovelItem()
        item['name'] = row.xpath(
            './/span[@class="s2"]/a/text()').extract()[0]
        print('name:', item['name'])
        # Site-relative href; prefix with the site root to get an
        # absolute URL. NOTE(review): '.ne' looks like a truncated
        # '.net' — confirm against the actual domain.
        item['url'] = 'https://www.quanben.ne' + row.xpath(
            './/span[@class="s2"]/a/@href').extract()[0]
        print('url:', item['url'])
        item['type'] = row.xpath(
            './/span[@class="s1"]/text()').extract()[0]
        print('type:', item['type'])
        item['author'] = row.xpath(
            './/span[@class="s3"]/text()').extract()[0]
        # Bug fix: original printed item['type'] under the 'author:' label.
        print('author:', item['author'])
        item['pub_date'] = row.xpath(
            './/span[@class="s4"]/text()').extract()[0]
        print('pub_date:', item['pub_date'])
        item['statue'] = row.xpath(
            './/span[@class="s5"]/text()').extract()[0]
        print('statue:', item['statue'])
        yield item
    # Removed dead `return link`: in a generator it only set an unused
    # StopIteration value (and returned the shadowed loop variable).
def parse(self, response):
    """Crawl chapter pages of a novel, stitching split chapters together.

    Follows the page's "next" link. A chapter split across pages is
    handled via the module-level ``flag`` global: the first page's text
    is stashed there, then prepended when the chapter-final page is
    reached. Yields NovelItem objects (name, title, texts) and follow-up
    Requests. Per-novel progress lives in ``self.count`` (chapters
    emitted, capped at 20) and ``self.link`` (index into ``self.links``).
    """
    # Text and href of the "next" anchor drive the branching below.
    next_text = response.xpath("//a[@id='linkNext']/text()").extract()[0]
    base_URL = response.xpath("//a[@id='linkNext']/@href").extract()[0]
    # First <link> href is used only to recover the site/novel base path.
    head_URL = response.xpath("//link[1]/@href").extract()[0]
    # Keep the first 5 '/'-separated components (scheme//host/novel-dir),
    # then append the relative next-page href to form an absolute URL.
    url = head_URL.split('/')[:5]
    s = '/'.join(url)
    next_URL = s + '/' + base_URL
    print(next_URL)
    if next_text == '下一页':  # "next page": chapter continues on another page
        # Stash this page's body text in a module-global so the
        # '下一章' branch below (on a later response) can prepend it.
        # NOTE(review): a single global can interleave wrongly if
        # multiple novels are crawled concurrently — verify.
        global flag
        flag = response.xpath(
            "//div[@class='panel-body']/text()").extract()
        yield scrapy.Request(next_URL, callback=self.parse)
    if next_text == '下一章':  # "next chapter": this page ends the chapter
        if self.count < 20:  # emit at most 20 chapters per novel
            self.count += 1
            item = NovelItem()
            item['name'] = response.xpath(
                "//a[@class='blue'][1]/text()").extract()[0]
            item['title'] = response.xpath("//h1/text()").extract()[0]
            # Prepend the stashed first-page text to this page's text.
            # NOTE(review): if the first page seen is already a
            # chapter-final page, `flag` is unset -> NameError; confirm
            # the site always starts on a '下一页' page.
            item['texts'] = flag + response.xpath(
                "//div[@class='panel-body']/text()").extract()
            print(self.count)
            yield item
            yield scrapy.Request(next_URL, callback=self.parse)
        else:
            # Chapter cap reached: move on to the next novel URL
            # (up to 100) and reset the per-novel chapter counter.
            # for i in range(1, len(self.links)):
            if self.link < 100:
                self.start_urls = self.links[self.link]
                self.link += 1
                self.count = 0
                yield scrapy.Request(self.start_urls, callback=self.parse)
            else:
                pass  # all novels exhausted; stop crawling
def parse(self, response):
    """Parse a novel-listing page; yield a detail-page Request per book.

    For each ``<li>`` row, builds a NovelItem (name, url, type, author,
    pub_date, statue) and schedules ``self.item_parse`` for the book's
    detail URL, carrying the partially-filled item in ``request.meta``.

    Parameters
    ----------
    response : scrapy Response for the listing page.

    Yields
    ------
    scrapy.Request with ``meta={'item': item}``.
    """
    rows = response.xpath('//ul[@class="item-con"]//li')
    print('len:', len(rows))
    # Loop variable renamed: the original shadowed `response`, and a
    # single NovelItem was shared/mutated across all iterations.
    for row in rows:
        item = NovelItem()
        item['name'] = row.xpath(
            './/span[@class="s2"]/a/text()').extract()[0]
        print('name:', item['name'])
        # NOTE(review): '.ne' looks like a truncated '.net' — confirm.
        item['url'] = 'https://www.quanben.ne' + row.xpath(
            './/span[@class="s2"]/a/@href').extract()[0]
        print('url:', item['url'])
        item['type'] = row.xpath(
            './/span[@class="s1"]/text()').extract()[0]
        print('type:', item['type'])
        item['author'] = row.xpath(
            './/span[@class="s3"]/text()').extract()[0]
        # Bug fix: original printed item['type'] under the 'author:' label.
        print('author:', item['author'])
        item['pub_date'] = row.xpath(
            './/span[@class="s4"]/text()').extract()[0]
        print('pub_date:', item['pub_date'])
        item['statue'] = row.xpath(
            './/span[@class="s5"]/text()').extract()[0]
        print('statue:', item['statue'])
        # Bug fix: `req['item'] = item` fails — scrapy.Request does not
        # support item assignment; per-request data goes through `meta`.
        yield scrapy.Request(item['url'],
                             callback=self.item_parse,
                             meta={'item': item})