Ejemplo n.º 1
0
    def parse(self, response):
        """Extract ranking rows from the page and schedule further pages.

        Yields one ``FictionItem`` per row matched by the regular expression,
        then one ``Request`` per URL in ``self.top_urls`` with this method as
        the callback.
        """
        # Raw string: the pattern is full of regex character classes written
        # with a backslash, which are invalid escape sequences in a normal
        # string literal (SyntaxWarning on recent Python versions).
        # NOTE(review): the pattern consumes a trailing <td width="30">; if
        # that cell is the first cell of the next row, every other row is
        # skipped by findall -- confirm against the real page markup.
        pattern = re.compile(
            r'<td width="30">([\s\S]*?)</td><td width="60">[\s\S]*?target="_blank">([\s\S]*?)</a></td><td><a class="red" href="([\s\S]*?)" title="([\s\S]*?)" target="_blank">[\s\S]*?</td>[\s\S]*?</td><td>([\s\S]*?)</td><td>[\s\S]*?" title="([\s\S]*?)"[\s\S]*?<td width="30">'
        )
        html = response.body.decode('utf-8')
        # Capture groups, in order: rank, type, url, name, update time, author.
        for top, type_, url, name, update_time, author in pattern.findall(html):
            # Store the captured fields in the item container.
            item = FictionItem()
            item['top'] = top
            item['url'] = url
            item['type_'] = type_
            item['name'] = name
            item['update_time'] = update_time
            item['author'] = author
            yield item
        for top_url in self.top_urls:
            yield Request(top_url, callback=self.parse)
Ejemplo n.º 2
0
    def parse_content(self, response):
        """Parse one chapter page into a ``FictionItem``.

        Yields a single item carrying the novel name, the chapter name, the
        cleaned chapter text and an ``order_id`` computed from the chapter
        number found in the chapter name.

        Raises:
            TypeError: if the chapter-title node is missing, i.e.
                ``extract_first`` returned ``None``.
            IndexError: if the title or the page body does not match the
                expected patterns.
        """
        result = response.text
        # Novel name.
        name = response.xpath(
            '//div[@class="main-index"]/a[@class="article_title"]/text()'
        ).extract_first()
        # Raw chapter title, possibly prefixed with a volume marker.
        chapter_name_tmp = response.xpath(
            '//strong[@class="l jieqi_title"]/text()').extract_first()
        # The volume marker is a regular expression, so it has to be matched
        # with the re module; a plain ``in`` test only looks for the literal
        # pattern text and never detects a volume prefix. Try the
        # volume-aware pattern first and fall back to the plain chapter
        # pattern when it finds nothing.
        volume_chapter_reg = r'第.*?卷.*?(第.*?章[\s][\u4e00-\u9fa5]{2,20})'
        plain_chapter_reg = r'(第.*?章[\s][\u4e00-\u9fa5]{2,20})'
        chapter_matches = (
            re.findall(volume_chapter_reg, chapter_name_tmp, re.S)
            or re.findall(plain_chapter_reg, chapter_name_tmp, re.S)
        )
        chapter_name = chapter_matches[0]
        # Chapter number: the text between the chapter marker characters.
        chapter_name_id_reg = r'第(.*?)章'
        chapter_name_id = re.findall(chapter_name_id_reg, chapter_name)[0]
        # Chapter text: everything between the two script tags.
        chapter_content_reg = r'style5\(\);</script>(.*?)<script type="text/javascript">'
        chapter_content_2 = re.findall(chapter_content_reg, result, re.S)[0]
        # Drop the indentation entities and the line-break tags.
        chapter_content_1 = chapter_content_2.replace(
            '&nbsp;&nbsp;&nbsp;&nbsp;', '')
        chapter_content = chapter_content_1.replace('<br />', '')

        print('正在爬取的小说: ' + name + '\t' + '章节: ' + chapter_name + '\t' +
              '入库成功!')
        item = FictionItem()
        item['name'] = name
        item['chapter_name'] = chapter_name
        item['chapter_content'] = chapter_content
        item['order_id'] = Cn2An(get_tit_num(chapter_name_id))
        yield item
Ejemplo n.º 3
0
    def parse_chapter(self, response):
        """Build a ``FictionItem`` for one chapter page.

        The chapter index and the novel name come from ``response.meta``; the
        chapter title and the content markup are read from the page via XPath.
        Yields the populated item.
        """
        chapter_idx = response.meta['idx']
        raw_novel_name = response.meta['novel_name']

        heading = response.xpath(
            '//h3[@class="j_chapterName"]/span[1]/text()').extract_first()
        heading = heading.strip()
        body_markup = response.xpath(
            '//div[@class="main-text-wrap "]//div[@class="read-content j_readContent"]'
        ).extract_first()
        body_markup = body_markup.strip()

        print(raw_novel_name, type(raw_novel_name))
        print(heading)
        # Novel name without spaces.
        novel_name = raw_novel_name.replace(" ", "")
        # NOTE(review): the first three pairs replace a character with itself
        # as written; presumably they were meant to normalise full-width
        # punctuation -- confirm against the original file.
        for old, new in (('!', '!'), ('?', '?'), ('!', '!'), ('*', '/*')):
            heading = heading.replace(old, new)

        chapter_item = FictionItem()
        chapter_item['idx'] = chapter_idx
        chapter_item['title'] = heading
        chapter_item['content'] = body_markup
        chapter_item['novel'] = novel_name
        print(novel_name, type(novel_name))
        yield chapter_item
Ejemplo n.º 4
0
 def parse_read(self, response):
     """Follow the "read now" link of a novel page.

     Stores the image URL found on the page in ``self.img_url`` and yields a
     request for the reader page, handled by ``self.parse_chapter``.

     Raises:
         IndexError: if the page has no reader link or no matching image.
     """
     # URL of the "read now" link. urljoin resolves a relative href against
     # the page URL and leaves an absolute href unchanged, so the request
     # below is not rejected for a missing scheme.
     read_url = response.urljoin(
         response.xpath('//a[@class="reader"]/@href').extract()[0])
     # URL of the novel's image.
     # NOTE(review): stored on the spider instance, so the value is shared by
     # every response this spider handles -- confirm that is intended.
     self.img_url = response.xpath('//div/a/img/@src').extract()[0]

     yield scrapy.Request(read_url, callback=self.parse_chapter)
Ejemplo n.º 5
0
    def parse_content(self, response):
        """Parse one chapter page and yield it as a ``FictionItem``.

        The novel name and chapter name are read via XPath; the chapter text
        is cut out of the raw page source with a regular expression and then
        stripped of indentation runs and line-break tags.

        Raises:
            IndexError: if the page source does not contain the expected
                script markers around the chapter text.
        """
        # Novel name.
        novel_name = response.xpath(
            '//div[@class="main-index"]/a[@class="article_title"]/text()'
        ).extract_first()

        page_source = response.text
        # Chapter name.
        chapter_title = response.xpath(
            '//strong[@class="l jieqi_title"]/text()'
        ).extract_first()
        # Chapter text: everything between the two script tags.
        body_pattern = re.compile(
            r'style5\(\);</script>(.*?)<script type="text/javascript">', re.S)
        raw_body = body_pattern.findall(page_source)[0]
        cleaned_body = raw_body.replace('    ', '').replace('<br />', '')

        chapter_item = FictionItem()
        chapter_item['name'] = novel_name
        chapter_item['chapter_name'] = chapter_title
        chapter_item['chapter_content'] = cleaned_body
        yield chapter_item
Ejemplo n.º 6
0
    def parse_chapter(self, response):
        """Build a ``FictionItem`` for one chapter page.

        The chapter index and the novel name come from ``response.meta``; the
        chapter title and the content markup are read from the page via XPath
        and lightly normalised before being stored. Yields the populated item.
        """
        chapter_idx = response.meta['idx']
        raw_novel_name = response.meta['novel_name']

        heading = response.xpath('//*[@id="main"]/h1/text()').extract_first()
        heading = heading.strip()
        body_markup = response.xpath(
            '//div[@id="main"]//*[@id="content"]').extract_first()
        body_markup = body_markup.strip()

        # Novel name without spaces.
        novel_name = raw_novel_name.replace(" ", "")
        # NOTE(review): the first three pairs replace a character with itself
        # as written; presumably they were meant to normalise full-width
        # punctuation -- confirm against the original file.
        for old, new in (('!', '!'), ('?', '?'), ('!', '!'), ('*', '/*')):
            heading = heading.replace(old, new)
        # Shrink runs of four non-breaking spaces and turn <br> tags into
        # newlines.
        for old, new in (('\xa0\xa0\xa0\xa0', '  '), ('<br>', '\n')):
            body_markup = body_markup.replace(old, new)

        chapter_item = FictionItem()
        chapter_item['idx'] = chapter_idx
        chapter_item['title'] = heading
        chapter_item['content'] = body_markup
        chapter_item['novel'] = novel_name
        yield chapter_item
Ejemplo n.º 7
0
    def parse_content(self, response):
        """Parse one chapter page and yield it as a ``FictionItem``.

        The novel name and chapter name are the first XPath matches; the
        chapter text is cut out of the raw page source with a regular
        expression and stripped of indentation runs, line-break tags and
        non-breaking-space entities.

        Raises:
            IndexError: if either XPath query matches nothing, or the page
                source lacks the expected script markers.
        """
        # Novel name.
        name_matches = response.xpath(
            '//div[@class="main-index"]/a[@class="article_title"]/text()'
        ).extract()
        novel_name = name_matches[0]
        # Chapter name.
        title_matches = response.xpath(
            '//strong[@class="l jieqi_title"]/text()'
        ).extract()
        chapter_title = title_matches[0]

        # Chapter text: everything between the two script tags.
        body_pattern = re.compile(
            r'style5\(\);</script>(.*?)<script type="text/javascript">', re.S)

        page_source = response.text
        chapter_body = body_pattern.findall(page_source)[0]
        for unwanted in ('    ', '<br />', '&nbsp;'):
            chapter_body = chapter_body.replace(unwanted, '')

        chapter_item = FictionItem()
        chapter_item['name'] = novel_name
        chapter_item['chapter_name'] = chapter_title
        chapter_item['chapter_content'] = chapter_body
        yield chapter_item