def parse(self, response):
    """Parse a ranking/listing page.

    Scrapes one fiction row per table entry via a regex over the raw HTML
    and yields a populated ``FictionItem`` for each, then queues every URL
    in ``self.top_urls`` for the same parsing.

    Captured groups, in order: rank, type, detail URL, name, update time, author.
    """
    # Raw-string literal fixes the invalid '\s' escape sequences of the original
    # non-raw string (same bytes, no DeprecationWarning).
    pattern = re.compile(
        r'<td width="30">([\s\S]*?)</td><td width="60">[\s\S]*?target="_blank">([\s\S]*?)</a></td><td><a class="red" href="([\s\S]*?)" title="([\s\S]*?)" target="_blank">[\s\S]*?</td>[\s\S]*?</td><td>([\s\S]*?)</td><td>[\s\S]*?" title="([\s\S]*?)"[\s\S]*?<td width="30">'
    )
    rows = re.findall(pattern, response.body.decode('utf-8'))
    # BUG FIX: the original iterated `for item in data` and then rebound `item`
    # to a FictionItem() inside the same loop body — unpack the tuple instead
    # so the loop variable is never shadowed.
    for top, type_, url, name, update_time, author in rows:
        item = FictionItem()
        item['top'] = top
        item['url'] = url
        item['type_'] = type_
        item['name'] = name
        item['update_time'] = update_time
        item['author'] = author
        yield item
    # Follow every configured top-list page with this same callback.
    for top_url in self.top_urls:
        yield Request(top_url, callback=self.parse)
def parse_content(self, response):
    """Parse a chapter page: extract the novel name, chapter title, chapter
    number and chapter body, and yield them as one ``FictionItem``.

    Raises IndexError if the page layout does not match the regexes
    (same behavior as the original ``[0]`` indexing).
    """
    result = response.text
    # Novel title.
    name = response.xpath(
        '//div[@class="main-index"]/a[@class="article_title"]/text()'
    ).extract_first()
    # Raw chapter heading, e.g. "第三卷 第12章 xxxx" or "第12章 xxxx".
    chapter_name_tmp = response.xpath(
        '//strong[@class="l jieqi_title"]/text()').extract_first()
    # BUG FIX: the original wrote `if '第.*?卷' in chapter_name_tmp`, which tests
    # for the *literal* characters "第.*?卷" inside the heading (virtually never
    # present), so headings with a volume prefix took the wrong branch. Use a
    # real regex match instead.
    if re.search(r'第.*?卷', chapter_name_tmp):
        # Heading carries a volume prefix — skip it and capture the chapter part.
        chapter_name_tmp_reg = r'第.*?卷.*?(第.*?章[\s][\u4e00-\u9fa5]{2,20})'
    else:
        chapter_name_tmp_reg = r'(第.*?章[\s][\u4e00-\u9fa5]{2,20})'
    chapter_name = re.findall(chapter_name_tmp_reg, chapter_name_tmp, re.S)[0]
    # Chapter number: the text between 第 and 章 (may be Chinese numerals).
    chapter_name_id_reg = r'第(.*?)章'
    chapter_name_id = re.findall(chapter_name_id_reg, chapter_name)[0]
    # Chapter body sits between the style5() script and the next script tag;
    # strip the &nbsp; placeholders and <br /> tags.
    chapter_content_reg = r'style5\(\);</script>(.*?)<script type="text/javascript">'
    chapter_content_2 = re.findall(chapter_content_reg, result, re.S)[0]
    chapter_content_1 = chapter_content_2.replace(' ', '')
    chapter_content = chapter_content_1.replace('<br />', '')
    print('正在爬取的小说: ' + name + '\t' + '章节: ' + chapter_name + '\t' + '入库成功!')
    item = FictionItem()
    item['name'] = name
    item['chapter_name'] = chapter_name
    item['chapter_content'] = chapter_content
    # Convert the (possibly Chinese-numeral) chapter id to a sortable number.
    item['order_id'] = Cn2An(get_tit_num(chapter_name_id))
    yield item
def parse_chapter(self, response):
    """Parse a Qidian-style chapter page and yield a ``FictionItem`` carrying
    the chapter index, cleaned title, raw content HTML and novel name.

    Expects ``response.meta`` to supply ``idx`` and ``novel_name``
    (set by the requesting callback).
    """
    idx = response.meta['idx']
    novel_name = response.meta['novel_name']
    title = response.xpath('//h3[@class="j_chapterName"]/span[1]/text()'
                           ).extract_first().strip()
    content = response.xpath(
        '//div[@class="main-text-wrap "]//div[@class="read-content j_readContent"]'
    ).extract_first().strip()
    # Novel name with spaces removed (used as a path/key downstream).
    novel = novel_name.replace(" ", "")
    # BUG FIX: the original replaced '!' -> '!' and then immediately undid it
    # with '!' -> '!', leaving fullwidth '!' in titles while '?' was normalized
    # to ASCII. Normalize all three consistently to their ASCII forms; '*' is
    # escaped as '/*' exactly as before.
    title = title.replace('!', '!').replace('?', '?').replace('*', '/*')
    item = FictionItem()
    item['idx'] = idx
    item['title'] = title
    item['content'] = content
    item['novel'] = novel
    yield item
def parse_read(self, response):
    """Follow the "read now" (马上阅读) link on a novel's landing page to its
    chapter page, remembering the cover image URL on the spider instance.

    Raises IndexError if either xpath matches nothing (same as the
    original ``extract()[0]`` behavior).
    """
    # BUG FIX: the original built `item = FictionItem()` and never used it —
    # dead code removed.
    # 马上阅读的URL
    read_url = response.xpath('//a[@class="reader"]/@href').extract()[0]
    # 小说图片的URL
    # NOTE(review): storing this on `self` is shared mutable state — with
    # concurrent responses, later pages overwrite it before parse_chapter
    # runs. Consider passing it through Request meta instead; left as-is to
    # preserve the existing contract with whoever reads self.img_url.
    self.img_url = response.xpath('//div/a/img/@src').extract()[0]
    yield scrapy.Request(read_url, callback=self.parse_chapter)
def parse_content(self, response):
    """Extract the novel name, chapter title and chapter body from a chapter
    page and yield them as a single ``FictionItem``."""
    html = response.text
    item = FictionItem()
    # Novel title.
    item['name'] = response.xpath(
        '//div[@class="main-index"]/a[@class="article_title"]/text()'
    ).extract_first()
    # Chapter heading.
    item['chapter_name'] = response.xpath(
        '//strong[@class="l jieqi_title"]/text()'
    ).extract_first()
    # Chapter body lives between the style5() script and the next script tag.
    body_pattern = r'style5\(\);</script>(.*?)<script type="text/javascript">'
    raw_body = re.findall(body_pattern, html, re.S)[0]
    # Strip placeholder spacing and <br /> tags.
    item['chapter_content'] = raw_body.replace(' ', '').replace('<br />', '')
    yield item
def parse_chapter(self, response):
    """Parse a chapter page and yield a ``FictionItem`` with the chapter
    index, cleaned title, plain-text content and novel name.

    Expects ``response.meta`` to supply ``idx`` and ``novel_name``.
    """
    idx = response.meta['idx']
    novel_name = response.meta['novel_name']
    title = response.xpath('//*[@id="main"]/h1/text()').extract_first().strip()
    content = response.xpath(
        '//div[@id="main"]//*[@id="content"]').extract_first().strip()
    # Novel name with spaces removed (used as a path/key downstream).
    novel = novel_name.replace(" ", "")
    # BUG FIX: the original replaced '!' -> '!' then immediately reversed it
    # with '!' -> '!', so '!' ended up fullwidth while '?' was normalized to
    # ASCII. Normalize all of them consistently to ASCII; '*' is escaped as
    # '/*' exactly as before.
    title = title.replace('!', '!').replace('?', '?').replace('*', '/*')
    # Turn the indentation placeholders into a plain space and <br> into
    # newlines so the stored text reads naturally.
    content = content.replace('\xa0\xa0\xa0\xa0', ' ').replace('<br>', '\n')
    item = FictionItem()
    item['idx'] = idx
    item['title'] = title
    item['content'] = content
    item['novel'] = novel
    yield item
def parse_content(self, response):
    """Scrape one chapter page into a ``FictionItem``: novel name, chapter
    title and cleaned chapter body."""
    item = FictionItem()
    # 小说名字
    item['name'] = response.xpath(
        '//div[@class="main-index"]/a[@class="article_title"]/text()'
    ).extract()[0]
    # 小说章节名字
    item['chapter_name'] = response.xpath(
        '//strong[@class="l jieqi_title"]/text()'
    ).extract()[0]
    # 小说内容: the body sits between the style5() script and the next
    # script tag.
    pattern = r'style5\(\);</script>(.*?)<script type="text/javascript">'
    chapter_body = re.findall(pattern, response.text, re.S)[0]
    # Drop placeholder entities, <br /> tags and stray spacing characters,
    # in the same order as before.
    for junk in ('&nbsp;', '<br />', ' '):
        chapter_body = chapter_body.replace(junk, '')
    item['chapter_content'] = chapter_body
    yield item