Example #1
    def deal_page(self, link):
        # Each <li> under the "item-con" list is one novel row.
        ls = link.xpath('//ul[@class="item-con"]//li')
        for row in ls:
            # Build a fresh item for each row so earlier yields are not overwritten.
            item = NovelItem()
            item['name'] = row.xpath(
                './/span[@class="s2"]/a/text()').extract()[0]
            item['url'] = 'https://www.quanben.ne' + row.xpath(
                './/span[@class="s2"]/a/@href').extract()[0]
            item['type'] = row.xpath(
                './/span[@class="s1"]/text()').extract()[0]
            item['author'] = row.xpath(
                './/span[@class="s3"]/text()').extract()[0]
            item['pub_date'] = row.xpath(
                './/span[@class="s4"]/text()').extract()[0]
            item['statue'] = row.xpath(
                './/span[@class="s5"]/text()').extract()[0]
            yield item
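The snippets here populate a NovelItem but never show its definition. A minimal items.py sketch covering only the fields the three examples touch (name, url, type, author, pub_date, statue, plus title and texts from Example #2); the file name and project layout are assumptions:

# items.py -- minimal sketch; only the field names come from the snippets,
# the rest (file name, project layout) is assumed.
import scrapy

class NovelItem(scrapy.Item):
    name = scrapy.Field()      # novel title in the listing
    url = scrapy.Field()       # absolute link to the novel's page
    type = scrapy.Field()      # category / genre column
    author = scrapy.Field()
    pub_date = scrapy.Field()
    statue = scrapy.Field()    # completion status (field name as spelled in the snippets)
    title = scrapy.Field()     # chapter title (Example #2)
    texts = scrapy.Field()     # chapter paragraphs (Examples #2 and #3)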
Example #2
    def parse(self, response):
        next_text = response.xpath("//a[@id='linkNext']/text()").extract()[0]
        next_href = response.xpath("//a[@id='linkNext']/@href").extract()[0]
        head_URL = response.xpath("//link[1]/@href").extract()[0]
        # Rebuild an absolute URL for the "next" link from the page's first <link> href.
        base = '/'.join(head_URL.split('/')[:5])
        next_URL = base + '/' + next_href

        if next_text == '下一页':  # "next page": the chapter continues on another page
            # Remember this page's paragraphs so they can be prepended later.
            global flag
            flag = response.xpath(
                "//div[@class='panel-body']/text()").extract()
            yield scrapy.Request(next_URL, callback=self.parse)
        if next_text == '下一章':  # "next chapter": this page finishes the chapter
            if self.count < 20:
                self.count += 1
                item = NovelItem()
                item['name'] = response.xpath(
                    "//a[@class='blue'][1]/text()").extract()[0]
                item['title'] = response.xpath("//h1/text()").extract()[0]
                # Chapter text = paragraphs carried over in `flag` + this page's paragraphs.
                item['texts'] = flag + response.xpath(
                    "//div[@class='panel-body']/text()").extract()
                yield item
                yield scrapy.Request(next_URL, callback=self.parse)
            elif self.link < 100:
                # After 20 chapters, move on to the next book in self.links.
                next_book = self.links[self.link]
                self.link += 1
                self.count = 0
                yield scrapy.Request(next_book, callback=self.parse)
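The module-level global `flag` is what carries the first half of a split chapter into the item, which only works while requests are handled one at a time. A hedged alternative sketch that passes the partial text along with the request through Request.meta; everything other than the XPaths and the '下一页'/'下一章' markers is an assumption:

    # Sketch only: same "next page" hand-off, but the partial chapter text
    # travels inside the request instead of a global. Names are assumed.
    def parse(self, response):
        next_text = response.xpath("//a[@id='linkNext']/text()").extract_first()
        next_URL = response.urljoin(
            response.xpath("//a[@id='linkNext']/@href").extract_first())
        body = response.xpath("//div[@class='panel-body']/text()").extract()

        if next_text == '下一页':  # chapter continues on the next page
            yield scrapy.Request(next_URL, callback=self.parse,
                                 meta={'partial': body})
        elif next_text == '下一章':  # chapter ends on this page
            item = NovelItem()
            item['title'] = response.xpath("//h1/text()").extract_first()
            item['texts'] = response.meta.get('partial', []) + body
            yield item
            yield scrapy.Request(next_URL, callback=self.parse)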
Example #3
    def parse(self, response):
        # One <li> per novel in the "item-con" listing page.
        ls = response.xpath('//ul[@class="item-con"]//li')
        for row in ls:
            # A fresh item per row; the loop variable must not shadow `response`.
            item = NovelItem()
            item['name'] = row.xpath('.//span[@class="s2"]/a/text()').extract()[0]
            item['url'] = 'https://www.quanben.ne' + row.xpath('.//span[@class="s2"]/a/@href').extract()[0]
            item['type'] = row.xpath('.//span[@class="s1"]/text()').extract()[0]
            item['author'] = row.xpath('.//span[@class="s3"]/text()').extract()[0]
            item['pub_date'] = row.xpath('.//span[@class="s4"]/text()').extract()[0]
            item['statue'] = row.xpath('.//span[@class="s5"]/text()').extract()[0]

            # Requests cannot be indexed like dicts; hand the partly filled item
            # to the detail-page callback through meta instead.
            yield scrapy.Request(item['url'], callback=self.item_parse,
                                 meta={'item': item})
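Example #3 defers the detail page to self.item_parse and ships the half-filled item in meta, but the callback itself is not part of the snippet. A minimal sketch of what it could look like; the panel-body XPath and the texts field are assumptions borrowed from Example #2:

    # Sketch of the detail-page callback referenced above; only the meta
    # hand-off mirrors the snippet, the XPath and field are assumptions.
    def item_parse(self, response):
        item = response.meta['item']  # item started in parse()
        item['texts'] = response.xpath(
            "//div[@class='panel-body']/text()").extract()
        yield item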