Example #1
import scrapy

# relative imports assume the standard Scrapy project layout (items.py and pipelines.py in the same package)
from ..items import XbiqugeItem
from ..pipelines import XbiqugePipeline


class SancunSpider(scrapy.Spider):
    name = 'tsxsy_bbsgx'
    allowed_domains = ['www.bbsgx.com']
    #start_urls = ['http://www.xbiquge.la/10/10489/']
    url_ori = "http://www.bbsgx.com"
    url_firstchapter = "http://www.bbsgx.com/book_211549/65039517.html"
    name_txt = "./novels/贴身小神医bbsgx"

    pipeline = XbiqugePipeline()
    pipeline.createtable(name)
    item = XbiqugeItem()
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = ['http://www.bbsgx.com/book_211549/']
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        dl = response.css('#list dl dd')  # extract the chapter link entries
        for dd in dl:
            self.url_c = self.url_ori + '/book_211549/' + dd.css(
                'a::attr(href)').extract()[0]  # build the full URL of each chapter
            #print(self.url_c)
            #yield scrapy.Request(self.url_c, callback=self.parse_c,dont_filter=True)
            yield scrapy.Request(
                self.url_c, callback=self.parse_c
            )  # yield a Request so parse_c extracts the chapter URL, previous/next page links and the chapter content
            #print(self.url_c)

    def parse_c(self, response):
        #item = XbiqugeItem()
        #item['name'] = self.name
        #item['url_firstchapter'] = self.url_firstchapter
        #item['name_txt'] = self.name_txt
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css(
            '#wrapper > div.content_read > div > div.bookname > div.bottem1 > a:nth-child(2)::attr(href)'
        ).extract()[0]
        self.item['next_page'] = self.url_ori + response.css(
            '#wrapper > div.content_read > div > div.bookname > div.bottem1 > a:nth-child(4)::attr(href)'
        ).extract()[0]
        title = response.css(
            '#wrapper > div.content_read > div > div.bookname > h1::text'
        ).extract()[0]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace(
            '\15', '\n')  # combine the chapter title and body; '\15' is the octal escape for ^M (carriage return) and is replaced with a newline
        yield self.item  # yield the Item to the pipelines module

        if self.item['url'][33:41] == self.item['next_page'][33:41]:  # same chapter split across several pages
            self.url_c = self.item['next_page']
            yield scrapy.Request(self.url_c, callback=self.parse_c)
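All of the examples on this page fill an XbiqugeItem, but the item class itself is not shown. A minimal sketch covering just the fields these spiders reference (the real project may define more) could look like this:

import scrapy


class XbiqugeItem(scrapy.Item):
    # fields referenced by the spiders in these examples
    id = scrapy.Field()                # running chapter number (Examples #2 and #6)
    name = scrapy.Field()              # spider/novel name, also used as the table/collection name
    url_firstchapter = scrapy.Field()  # URL of the first chapter
    name_txt = scrapy.Field()          # output path of the assembled .txt file
    url = scrapy.Field()               # URL of the current chapter page
    preview_page = scrapy.Field()      # link to the previous page
    next_page = scrapy.Field()         # link to the next page
    content = scrapy.Field()           # chapter title plus body text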
Example #2
class SancunSpider(scrapy.Spider):
    name = 'kjtsg'
    allowed_domains = ['www.xbiquge.la']
    #start_urls = ['http://www.xbiquge.la/10/10489/']
    url_ori = "http://www.xbiquge.la"
    url_firstchapter = "http://www.xbiquge.la/15/15480/8186557.html"
    name_txt = "./novels/科技图书馆"

    pipeline = XbiqugePipeline()
    pipeline.clearcollection(
        name)  # clear the novel's data collection; a MongoDB collection is the equivalent of a MySQL table
    item = XbiqugeItem()
    item['id'] = 0  # add an id field to make querying easier
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = ['http://www.xbiquge.la/15/15480/']
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        dl = response.css('#list dl dd')  # extract the chapter link entries
        for dd in dl:
            self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[
                0]  # build the full URL of each chapter
            #print(self.url_c)
            #yield scrapy.Request(self.url_c, callback=self.parse_c,dont_filter=True)
            yield scrapy.Request(
                self.url_c, callback=self.parse_c
            )  # yield a Request so parse_c extracts the chapter URL, previous/next page links and the chapter content
            #print(self.url_c)

    def parse_c(self, response):
        #item = XbiqugeItem()
        #item['name'] = self.name
        #item['url_firstchapter'] = self.url_firstchapter
        #item['name_txt'] = self.name_txt
        self.item['id'] += 1
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css(
            'div .bottem1 a::attr(href)').extract()[1]
        self.item['next_page'] = self.url_ori + response.css(
            'div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace(
            '\15', '\n')  # combine the chapter title and body; '\15' is the octal escape for ^M (carriage return) and is replaced with a newline
        yield self.item  # yield the Item to the pipelines module
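Example #2 calls pipeline.clearcollection(name) at class level and relies on the pipeline to store every yielded item, but the pipeline itself is not shown. A minimal sketch using pymongo, with a placeholder connection string and a hypothetical database name, might be:

import pymongo


class XbiqugePipeline:
    def __init__(self):
        # connection string and database name are placeholders
        self.client = pymongo.MongoClient("mongodb://localhost:27017/")
        self.db = self.client["xbiquge"]

    def clearcollection(self, name):
        # drop all previously stored chapters of this novel
        self.db[name].delete_many({})

    def get_collection(self, name):
        # used by Example #6 to compare already-stored chapter URLs
        return self.db[name]

    def process_item(self, item, spider):
        # store one document per yielded chapter item
        self.db[item['name']].insert_one(dict(item))
        return item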
Example #3
    def parse_c(self, response):
        item = XbiqugeItem()
        item['url'] = response.url
        item['preview_page'] = "http://www.xbiquge.la" + response.css('div .bottem1 a::attr(href)').extract()[1]
        item['next_page'] = "http://www.xbiquge.la" + response.css('div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        item['content'] = title + "\n" + text.replace('\15', '\n')  # combine the chapter title and body; '\15' is the octal escape for ^M (carriage return) and is replaced with a newline
        yield item  # yield the Item to the pipelines module
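The yielded item only reaches the pipeline if the pipeline is enabled in the project settings. A sketch of the relevant settings.py entry, assuming the project package is called xbiquge (the actual package name may differ):

# settings.py (excerpt) -- the package name 'xbiquge' is an assumption
ITEM_PIPELINES = {
    'xbiquge.pipelines.XbiqugePipeline': 300,
}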
Example #4
    def parse_c(self, response):
        item = XbiqugeItem()
        item['url'] = response.url
        item['preview_page'] = "http://www.778buy.com/659_659168/" + response.css('#bgdiv > table.border_l_r > tbody > tr:nth-child(2) > td > a:nth-child(1)::attr(href)').extract()[0]
        item['next_page'] = "http://www.778buy.com/659_659168/" + response.css('#bgdiv > table.border_l_r > tbody > tr:nth-child(2) > td > a:nth-child(3)::attr(href)').extract()[0]
        title = response.css('#bgdiv > table.border_l_r > tbody > tr:nth-child(1) > td > div > h1::text').extract()[0]
        contents = response.css('div #content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        item['content'] = title + "\n" + text.replace('\15', '\n')  # combine the chapter title and body; '\15' is the octal escape for ^M (carriage return) and is replaced with a newline
        yield item  # yield the Item to the pipelines module
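Example #4 prefixes the relative href values with a hard-coded directory URL. Scrapy responses also offer response.urljoin(), which resolves relative links against the current page URL; a sketch of the same two assignments in that style (equivalent only when the hrefs are plain file names relative to the chapter directory):

        # inside parse_c, assuming the relative hrefs resolve against the current page URL
        item['preview_page'] = response.urljoin(
            response.css('#bgdiv > table.border_l_r > tbody > tr:nth-child(2) > td > a:nth-child(1)::attr(href)').extract()[0])
        item['next_page'] = response.urljoin(
            response.css('#bgdiv > table.border_l_r > tbody > tr:nth-child(2) > td > a:nth-child(3)::attr(href)').extract()[0])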
Example #5
    def parse_c(self, response):
        item = XbiqugeItem()
        item['url'] = response.url
        item['preview_page'] = "https://www.86zw.cc" + response.css('div.main_content div#nr_content.nr_content div.nr_page a::attr(href)').extract()[0]
        item['next_page'] = "https://www.86zw.cc" + response.css('div.main_content div#nr_content.nr_content div.nr_page a::attr(href)').extract()[3]
        title = response.css('div.main_content div#nr_content.nr_content div.nr_title h3::text').extract()[0]
        contents = response.css('div.main_content div#nr_content.nr_content div.novelcontent p#articlecontent.articlecontent::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        item['content'] = title + "\n" + text.replace('\15', '\n')  # combine the chapter title and body; '\15' is the octal escape for ^M (carriage return) and is replaced with a newline
        yield item  # yield the Item to the pipelines module
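The extract()[0] and manual string-concatenation patterns used in these parse_c variants can be written more compactly with the newer selector API (get() / getall(), available since Scrapy 1.8) and str.join. A sketch of Example #5's body in that style; note that get() returns None instead of raising IndexError when nothing matches:

    def parse_c(self, response):
        item = XbiqugeItem()
        item['url'] = response.url
        links = response.css('div.main_content div#nr_content.nr_content div.nr_page a::attr(href)').getall()
        item['preview_page'] = "https://www.86zw.cc" + links[0]
        item['next_page'] = "https://www.86zw.cc" + links[3]
        title = response.css('div.main_content div#nr_content.nr_content div.nr_title h3::text').get()
        contents = response.css('div.main_content div#nr_content.nr_content div.novelcontent p#articlecontent.articlecontent::text').getall()
        item['content'] = title + "\n" + ''.join(contents).replace('\15', '\n')
        yield item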
Example #6
class SancunSpider(scrapy.Spider):
    name = 'sancun'
    allowed_domains = ['www.xbiquge.la']
    #start_urls = ['http://www.xbiquge.la/10/10489/']
    url_ori = "https://www.xbiquge.la"
    url_firstchapter = "https://www.xbiquge.la/10/10489/4534454.html"
    name_txt = "./novels/三寸人间"
    url_chapters = url_firstchapter[0:32]
    pipeline = XbiqugePipeline()
    novelcollection = pipeline.get_collection(
        name)  # get the novel's collection object; a MongoDB collection is the equivalent of a MySQL table
    #--------------------------------------------
    # If a record's next_page points at the novel's table-of-contents URL, delete it; otherwise a
    # re-crawl would leave several records pointing at the contents page and the latest chapters
    # could no longer be fetched.
    if novelcollection.find({"next_page": url_chapters}).count() != 0:
        print(
            "Record containing the contents-page url:",
            novelcollection.find({
                "next_page": url_chapters
            }, {
                "_id": 0,
                "id": 1,
                "url": 1,
                "next_page": 1
            }).next())
        novelcollection.remove({"next_page": url_chapters})
        print("Deleted the record containing the contents-page url.")
    #--------------------------------------------
    novelcounts = novelcollection.find().count()
    novelurls = novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})
    item = XbiqugeItem()
    item['id'] = novelcounts  # initialize id to the number of records already in the collection
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = [self.url_chapters]
        print("小说目录url:", start_urls)
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):  # extract data from the page and compare against the MongoDB collection; only chapters not already stored get crawled
        f = open("/root/xbiquge_w/url_list.txt", "w")  # open a file to record the URLs that will be crawled
        count_bingo = 0  # number of chapters already present in the collection
        dl = response.css('#list dl dd')  # extract the chapter link entries
        for dd in dl:
            count_iterator = 0
            self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[
                0]  # build the full URL of each chapter
            #print("url extracted from page:", self.url_c)
            self.novelurls = self.novelcollection.find({}, {
                "_id": 0,
                "id": 1,
                "url": 1
            })  # re-query to get a fresh cursor so the inner for loop can iterate from the beginning
            for url in self.novelurls:
                #print("url from mongodb:", url)
                if url["url"] == self.url_c:  # the chapter url is already in the collection, so stop searching
                    count_bingo += 1
                    count_iterator += 1
                    break
            if count_iterator != 0:  # if there was a hit, skip this chapter and continue with the next one
                continue
            #print("crawl url:", self.url_c)
            f.write("crawl url: " + self.url_c + "\n")
            #yield scrapy.Request(self.url_c, callback=self.parse_c,dont_filter=True)
            yield scrapy.Request(
                self.url_c, callback=self.parse_c
            )  # yield a Request so parse_c extracts the chapter URL, previous/next page links and the chapter content
            #print(self.url_c)
        f.close()
        print("Records already in the collection (count_bingo):", count_bingo)

    def parse_c(self, response):
        self.item['id'] += 1
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css(
            'div .bottem1 a::attr(href)').extract()[1]
        self.item['next_page'] = self.url_ori + response.css(
            'div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace(
            '\15', '\n')  # combine the chapter title and body; '\15' is the octal escape for ^M (carriage return) and is replaced with a newline
        yield self.item  # yield the Item to the pipelines module

        if self.item['url'][32:39] == self.item['next_page'][
                32:39]:  # same chapter split across several pages
            self.url_c = self.item['next_page']
            yield scrapy.Request(self.url_c, callback=self.parse_c)
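Example #6 relies on Cursor.count() and Collection.remove(), which were deprecated and later removed in pymongo 4. A sketch of the equivalent class-level checks with the current pymongo API, assuming the same collection object:

    # pymongo 4.x equivalents of the deprecated calls used above
    if novelcollection.count_documents({"next_page": url_chapters}) != 0:
        print("Record containing the contents-page url:",
              novelcollection.find_one({"next_page": url_chapters},
                                       {"_id": 0, "id": 1, "url": 1, "next_page": 1}))
        novelcollection.delete_many({"next_page": url_chapters})
        print("Deleted the record containing the contents-page url.")

    novelcounts = novelcollection.count_documents({})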