import scrapy
# imports assumed by this snippet; adjust the items/pipelines module path to your own project
from xbiquge.items import XbiqugeItem
from xbiquge.pipelines import XbiqugePipeline


class SancunSpider(scrapy.Spider):
    name = 'tsxsy_bbsgx'
    allowed_domains = ['www.bbsgx.com']
    #start_urls = ['http://www.xbiquge.la/10/10489/']
    url_ori = "http://www.bbsgx.com"
    url_firstchapter = "http://www.bbsgx.com/book_211549/65039517.html"
    name_txt = "./novels/贴身小神医bbsgx"
    pipeline = XbiqugePipeline()
    pipeline.createtable(name)
    item = XbiqugeItem()
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = ['http://www.bbsgx.com/book_211549/']
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        dl = response.css('#list dl dd')  # extract the chapter-link entries
        for dd in dl:
            self.url_c = self.url_ori + '/book_211549/' + dd.css('a::attr(href)').extract()[0]  # build the full URL of each chapter
            #print(self.url_c)
            #yield scrapy.Request(self.url_c, callback=self.parse_c, dont_filter=True)
            yield scrapy.Request(self.url_c, callback=self.parse_c)  # yield a Request so parse_c fetches each chapter's URL, previous/next-page links and content
            #print(self.url_c)

    def parse_c(self, response):
        #item = XbiqugeItem()
        #item['name'] = self.name
        #item['url_firstchapter'] = self.url_firstchapter
        #item['name_txt'] = self.name_txt
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css(
            '#wrapper > div.content_read > div > div.bookname > div.bottem1 > a:nth-child(2)::attr(href)').extract()[0]
        self.item['next_page'] = self.url_ori + response.css(
            '#wrapper > div.content_read > div > div.bookname > div.bottem1 > a:nth-child(4)::attr(href)').extract()[0]
        title = response.css('#wrapper > div.content_read > div > div.bookname > h1::text').extract()[0]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace('\15', '\n')  # combine the chapter title and body into content; '\15' is the octal escape for ^M (carriage return) and must be replaced with a newline
        yield self.item  # yield the Item to the pipelines module
        if self.item['url'][33:41] == self.item['next_page'][33:41]:  # handle a chapter that is split across multiple pages
            self.url_c = self.item['next_page']
            yield scrapy.Request(self.url_c, callback=self.parse_c)
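The spiders in this section all populate the same set of Item fields. For reference, the item definition they assume looks roughly like the sketch below; the field list is inferred from the assignments in the spiders, and the layout (a standard Scrapy items.py) is an assumption rather than the project's actual code.

import scrapy


class XbiqugeItem(scrapy.Item):
    # Sketch inferred from the fields the spiders assign; the real items.py may differ.
    id = scrapy.Field()                # running chapter counter (used by the MongoDB variants)
    name = scrapy.Field()              # spider/novel name, used as the table or collection name
    url_firstchapter = scrapy.Field()  # URL of the first chapter
    name_txt = scrapy.Field()          # path of the exported .txt file
    url = scrapy.Field()               # URL of the current chapter page
    preview_page = scrapy.Field()      # previous-page link
    next_page = scrapy.Field()         # next-page link
    content = scrapy.Field()           # chapter title plus body text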
class SancunSpider(scrapy.Spider):
    name = 'kjtsg'
    allowed_domains = ['www.xbiquge.la']
    #start_urls = ['http://www.xbiquge.la/10/10489/']
    url_ori = "http://www.xbiquge.la"
    url_firstchapter = "http://www.xbiquge.la/15/15480/8186557.html"
    name_txt = "./novels/科技图书馆"
    pipeline = XbiqugePipeline()
    pipeline.clearcollection(name)  # clear the novel's collection; a MongoDB collection is the equivalent of a MySQL table
    item = XbiqugeItem()
    item['id'] = 0  # add an id field to make querying easier
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = ['http://www.xbiquge.la/15/15480/']
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        dl = response.css('#list dl dd')  # extract the chapter-link entries
        for dd in dl:
            self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[0]  # build the full URL of each chapter
            #print(self.url_c)
            #yield scrapy.Request(self.url_c, callback=self.parse_c, dont_filter=True)
            yield scrapy.Request(self.url_c, callback=self.parse_c)  # yield a Request so parse_c fetches each chapter's URL, previous/next-page links and content
            #print(self.url_c)

    def parse_c(self, response):
        #item = XbiqugeItem()
        #item['name'] = self.name
        #item['url_firstchapter'] = self.url_firstchapter
        #item['name_txt'] = self.name_txt
        self.item['id'] += 1
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[1]
        self.item['next_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace('\15', '\n')  # combine the chapter title and body into content; '\15' is the octal escape for ^M (carriage return) and must be replaced with a newline
        yield self.item  # yield the Item to the pipelines module
def parse_c(self, response): item = XbiqugeItem() item['url'] = response.url item['preview_page'] = "http://www.xbiquge.la" + response.css('div .bottem1 a::attr(href)').extract()[1] item['next_page'] = "http://www.xbiquge.la" + response.css('div .bottem1 a::attr(href)').extract()[3] title = response.css('.con_top::text').extract()[4] contents = response.css('#content::text').extract() text='' for content in contents: text = text + content #print(text) item['content'] = title + "\n" + text.replace('\15', '\n') #各章节标题和内容组合成content数据,\15是^M的八进制表示,需要替换为换行符。 yield item #以生成器模式(yield)输出Item对象的内容给pipelines模块。
def parse_c(self, response): item = XbiqugeItem() item['url'] = response.url item['preview_page'] = "http://www.778buy.com/659_659168/" + response.css('#bgdiv > table.border_l_r > tbody > tr:nth-child(2) > td > a:nth-child(1)::attr(href)').extract()[0] item['next_page'] = "http://www.778buy.com/659_659168/"+ response.css('#bgdiv > table.border_l_r > tbody > tr:nth-child(2) > td > a:nth-child(3)::attr(href)').extract()[0] title = response.css('#bgdiv > table.border_l_r > tbody > tr:nth-child(1) > td > div > h1::text').extract()[0] contents = response.css('div #content::text').extract() text='' for content in contents: text = text + content #print(text) item['content'] = title + "\n" + text.replace('\15', '\n') #各章节标题和内容组合成content数据,\15是^M的八进制表示,需要替换为换行符。 yield item #以生成器模式(yield)输出Item对象的内容给pipelines模块。
def parse_c(self, response): item = XbiqugeItem() item['url'] = response.url item['preview_page'] = "https://www.86zw.cc" + response.css('div.main_content div#nr_content.nr_content div.nr_page a::attr(href)').extract()[0] item['next_page'] = "https://www.86zw.cc" + response.css('div.main_content div#nr_content.nr_content div.nr_page a::attr(href)').extract()[3] title = response.css('div.main_content div#nr_content.nr_content div.nr_title h3::text').extract()[0] contents = response.css('div.main_content div#nr_content.nr_content div.novelcontent p#articlecontent.articlecontent::text').extract() text='' for content in contents: text = text + content #print(text) item['content'] = title + "\n" + text.replace('\15', '\n') #各章节标题和内容组合成content数据,\15是^M的八进制表示,需要替换为换行符。 yield item #以生成器模式(yield)输出Item对象的内容给pipelines模块。
class SancunSpider(scrapy.Spider):
    name = 'sancun'
    allowed_domains = ['www.xbiquge.la']
    #start_urls = ['http://www.xbiquge.la/10/10489/']
    url_ori = "https://www.xbiquge.la"
    url_firstchapter = "https://www.xbiquge.la/10/10489/4534454.html"
    name_txt = "./novels/三寸人间"
    url_chapters = url_firstchapter[0:32]
    pipeline = XbiqugePipeline()
    novelcollection = pipeline.get_collection(name)  # get the novel's MongoDB collection object; a collection is the equivalent of a MySQL table
    #--------------------------------------------
    # If a record's next_page is the novel's table-of-contents URL, delete that record.
    # Otherwise a re-crawl would leave multiple records pointing at the contents page
    # and the newest chapters could never be fetched.
    if novelcollection.find({"next_page": url_chapters}).count() != 0:
        print("Record containing the contents-page url:",
              novelcollection.find({"next_page": url_chapters},
                                   {"_id": 0, "id": 1, "url": 1, "next_page": 1}).next())
        novelcollection.remove({"next_page": url_chapters})
        print("Deleted the record containing the contents-page url.")
    #--------------------------------------------
    novelcounts = novelcollection.find().count()
    novelurls = novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})
    item = XbiqugeItem()
    item['id'] = novelcounts  # initialize id to the total number of records already in the collection
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = [self.url_chapters]
        print("Novel contents url:", start_urls)
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Extract chapter URLs from the page, compare them with the MongoDB collection,
        # and only crawl URLs that are not already stored.
        f = open("/root/xbiquge_w/url_list.txt", "w")  # open a file to log the URLs that get crawled
        count_bingo = 0  # number of chapters already present in the collection
        dl = response.css('#list dl dd')  # extract the chapter-link entries
        for dd in dl:
            count_iterator = 0
            self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[0]  # build the full URL of each chapter
            #print("url extracted from the page:", self.url_c)
            self.novelurls = self.novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})  # re-assign the cursor to reset it, so the for loop can traverse it from the beginning
            for url in self.novelurls:
                #print("url from mongodb:", url)
                if url["url"] == self.url_c:  # if the URL extracted from the page already exists in the collection, break
                    count_bingo += 1
                    count_iterator += 1
                    break
            if count_iterator != 0:  # if there was a hit, move on to the next chapter without crawling
                continue
            #print("crawling url:", self.url_c)
            f.write("crawling url:" + self.url_c + "\n")
            #yield scrapy.Request(self.url_c, callback=self.parse_c, dont_filter=True)
            yield scrapy.Request(self.url_c, callback=self.parse_c)  # yield a Request so parse_c fetches each chapter's URL, previous/next-page links and content
            #print(self.url_c)
        f.close()
        print("records already in the collection, count_bingo:", count_bingo)

    def parse_c(self, response):
        self.item['id'] += 1
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[1]
        self.item['next_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace('\15', '\n')  # combine the chapter title and body into content; '\15' is the octal escape for ^M (carriage return) and must be replaced with a newline
        yield self.item  # yield the Item to the pipelines module
        if self.item['url'][32:39] == self.item['next_page'][32:39]:  # handle a chapter that is split across multiple pages
            self.url_c = self.item['next_page']
            yield scrapy.Request(self.url_c, callback=self.parse_c)
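The sancun spider above obtains its collection through pipeline.get_collection(name) and then relies on cursor.count() and collection.remove(), both of which exist only in pymongo 3.x. A hypothetical sketch of such a helper is shown below; the MongoDB URI and the database name "novels" are assumptions for illustration, not values taken from the project's pipelines.py.

import pymongo


class XbiqugePipeline:
    # Hypothetical sketch of the get_collection() helper the spider calls.
    # Assumes a local MongoDB instance and a database named "novels";
    # the spider's cursor.count()/collection.remove() calls additionally assume pymongo 3.x.
    def __init__(self):
        self.client = pymongo.MongoClient("mongodb://localhost:27017/")
        self.db = self.client["novels"]

    def get_collection(self, name):
        # one collection per novel, keyed by the spider's name attribute
        return self.db[name]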