class spider_detail_list(scrapy.Spider):
    name = "spider_detail_list" #要调用的名字
    allowed_domains = ["qidian.com"] #分一个域
    red=getredis()
    urls=red.lrange('novel_list',0,-1)
    start_urls = []
    ids = []   # approach 2
    dict = {}  # approach 1
    for url in urls:
        url = str(url, encoding="utf-8")
        url = url.split(',')
        start_urls.append(url[1])
        # ids.append(url[0])  # approach 2
        dict[url[1]] = url[0]  # approach 1: map url -> MongoDB id
    # parse() is called back after each crawled page
    def parse(self, response):
        # approach 1: requests are handled concurrently, so the id cannot be matched
        # to its url by response order; look it up through the url -> id mapping instead
        Pid=self.dict[response.url]
        bcollection =getMongodb()
        links = response.xpath('//div[@class="sub-type"]/dl[@class=""]/dd/a')
        for link in links:
            print("***************")
            print(Pid)
            print(link.select("text()").extract()[0])
            print(link.select('@href').extract()[0])
            print("***************")
            # store the sub-category with its parent id, then queue "id,url" for the next spider
            id = bcollection.insert({'list_child_name': link.select("text()").extract()[0], 'pid': Pid})
            self.red.lpush('bnovel_all_list', str(id) + "," + "https:" + link.select('@href').extract()[0])
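These spiders all depend on two project helpers, getredis() and getMongodb(), which are not shown on this page. A minimal sketch of what they might look like follows; it assumes a local Redis server and a pymongo MongoClient, and the host/port defaults are guesses rather than the project's actual configuration.

# Hypothetical helpers assumed by the examples on this page (not the original code).
import redis
from pymongo import MongoClient

def getredis():
    # assumption: a local Redis instance on the default port
    return redis.StrictRedis(host='localhost', port=6379, db=0)

def getMongodb(db_name='novel', collection_name='novel_list'):
    # assumption: a local MongoDB instance; returns a collection handle so that
    # calls such as getMongodb('novel', 'novels') and getMongodb() both work
    client = MongoClient('localhost', 27017)
    return client[db_name][collection_name]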
Example #2
class spider_list_novel(scrapy.Spider):
    name = "spider_list_novel" #要调用的名字
    allowed_domains = ["qidian.com"] #分一个域
    start_urls = []
    dict = {}
    red = getredis()
    mongodb=getMongodb('novel','novels')
    def __init__(self):
        urls = self.red.lrange('bnovel_all_list', 0, -1)
        for url in urls:
            url = str(url, encoding="utf-8")
            url = url.split(',')
            spider_list_novel.start_urls.append(url[2])
            # each Redis entry is "classId,listId,url"; keep per-url state plus a page counter
            spider_list_novel.dict[url[2]] = {'classId': url[0], 'listId': url[1], 'sum': 0}
            # break
    # parse() is called back after each crawled page
    def parse(self, response):
        print(response.url)
        Pid = self.dict[response.url]
        Pid['sum'] += 1
        print(Pid['sum'])
        # stop following pagination after three pages of this category
        if Pid['sum'] > 3:
            return
        links = response.xpath('//div[@class="book-mid-info"]/h4/a')
        for link in links:
            novel_name = link.select("text()").extract()[0]
            novel_id = self.mongodb.insert({'name': novel_name, 'total_list': Pid['classId'], 'list': Pid['listId']})
            href = link.select("@href").extract()[0]
            href = str(novel_id) + ',' + 'https:' + href
            print(href)
            self.red.lpush('all_novel_href',href)
        sleep(0.3)  # throttle between pages
        href = self.find_next(response)
        if href is None:
            # no next page found: record the url in the log file ("日志" = log)
            f = open('file/%s.txt' % ("日志"), 'a', encoding='utf-8')
            f.write(response.url)
            f.write('++++++++++++++')
            f.close()
        else:
            href = "https:" + href
            if href.find('javascript:;') < 0:  # skip the disabled "javascript:;" link on the last page
                self.dict[href] = Pid  # carry the category state over to the next page
                request = Request(href, callback=self.parse)
                yield request
    def find_next(self, response):
        # return the href of the last pagination link (the "next page" link), or None
        try:
            hrefs = response.xpath('//li[@class="lbf-pagination-item"]/a')
            i = len(hrefs)
            href = hrefs[i - 1].select("@href").extract()[0]
            return href
        except Exception as err:
            # href may be undefined when the xpath lookup fails, so log the page url instead
            f = open('file/%s.txt' % ("日志"), 'a', encoding='utf-8')
            f.write(str(err) + ':' + response.url)
            f.close()
            return None
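These classes are ordinary scrapy.Spider subclasses, so they can be started with scrapy crawl spider_list_novel from the project directory or launched programmatically. A minimal launcher sketch, assuming spider_list_novel can be imported from the project:

# Minimal launcher sketch (assumes spider_list_novel is importable from the project).
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(spider_list_novel)
process.start()  # blocks until the crawl has finished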
Example #3
 def parse(self, response):
     red = getredis()
     bcollection = getMongodb()
     # links in the category filter block
     hx = response.xpath('//div[@class="work-filter type-filter"]/ul/li/a')
     for i in range(1, len(hx)):  # skip the first filter entry
         print(hx[i].select("@href").extract()[0])   # category href
         print(hx[i].select("text()").extract()[0])  # category name
         # store the category name, then queue "id,url" for the next spider
         id = bcollection.insert(
             {'list_name': str(hx[i].select("text()").extract()[0])})
         red.lpush(
             'novel_list',
             str(id) + "," + "https:" +
             str(hx[i].select("@href").extract()[0]))
Example #4
class spider_type_list(scrapy.Spider):
    name = "spider_type_list" #要调用的名字
    allowed_domains = ["qidian.com"] #分一个域
    start_urls = []
    red = getredis()
    urls = red.lrange('bnovel_all_list', 0, -1)
    dict={}
    for url in urls:
        url = str(url, encoding="utf-8")
        url = url.split(',')
        start_urls.append(url[1])
        dict[url[1]] = url[0]
    # parse() is called back after each crawled page
    def parse(self, response):
        print("**********")
        # only prints each novel's name and href (nothing is stored in this example)
        links = response.xpath('//div[@class="book-mid-info"]/h4/a')
        for link in links:
            print(link.select("text()").extract()[0])
            print(link.select("@href").extract()[0])
        print("++++++++++++")
Example #5
class spider_list_novel(scrapy.Spider):
    name = "spider_list_novel"  #要调用的名字
    allowed_domains = ["qidian.com"]  #分一个域
    start_urls = []
    dict = {}
    red = getredis()
    mongodb = getMongodb('novel', 'novels')

    def __init__(self):

        urls = self.red.lrange('all_novel_href', 0, 5)
        for url in urls:
            url = str(url, encoding="utf-8")
            url = url.split(',')
            spider_list_novel.start_urls.append(url[1])
            spider_list_novel.dict[url[1]] = url[0]
            # break

    # parse() is called back after each crawled page
    def parse(self, response):
        print(response.url)
        Pid = self.dict[response.url]
        print(Pid)
        links = response.xpath('//div[@class="book-mid-info"]/h4/a')
Example #6
class spider_detail_novel(scrapy.Spider):
    name = "spider_detail_novel"  #要调用的名字
    allowed_domains = ["qidian.com"]  #分一个域
    start_urls = []
    dict = {}
    red = getredis()
    mongodb = getMongodb('novel', 'novels')

    def __init__(self):
        urls = self.red.lrange('all_novel_href', 0, -1)
        for url in urls:
            url = str(url, encoding="utf-8")
            url = url.split(',')
            spider_detail_novel.start_urls.append(url[1])
            spider_detail_novel.dict[url[1]] = url[0]

    # parse() is called back after each crawled page
    def parse(self, response):
        global status_flag
        id = self.dict[response.url]
        Pid = (ObjectId(id))
        detail_messages = response.xpath('//div[@class="book-info "]')
        # extract the novel's detail information
        for detail_message in detail_messages:
            author = detail_message.select('//h1/span/a/text()').extract()[0]
            status = detail_message.select('p/span/text()').extract()[0]
            if status == "连载":
                status_flag = 0
            else:
                status_flag = 1
            # update MongoDB with the author and serialization status
            self.mongodb.update(
                {"_id": Pid},
                {"$set": {
                    'author': author,
                    'status': status_flag
                }})
            novel_href = "https:" + detail_message.select(
                'p/a/@href').extract()[2]
            # crawl the novel's chapters
            request = Request(
                novel_href,
                callback=lambda response, id=id, status_flag=status_flag:
                    self.spider_one_novel(response, id, status_flag))
            yield request

    def spider_one_novel(self, response, id, status_flag):
        chapter_mongodb = getMongodb('novel', 'chapters')
        chapter = response.xpath(
            '//h3[@class="j_chapterName"]/text()').extract()[0]
        print('******** processing chapter content *******')
        contents = response.xpath(
            '//div[@class="read-content j_readContent"]/p/text()').extract()
        novel_names = response.xpath(
            '//div[@class="book-cover-wrap"]/h1/text()').extract()
        novel_name = response.xpath(
            '//div[@class="crumbs-nav"]/a[@class="act"]/text()').extract()[0]
        if len(novel_names) != 0:
            # create the novel's folder; exist_ok avoids a crash when it already exists
            os.makedirs('D:/all_novels/%s' % novel_names[0], exist_ok=True)
        f = open('D:/all_novels/%s/%s.html' % (novel_name, chapter),
                 'w',
                 encoding='utf-8')
        file_path = 'D:/all_novels/%s/%s.html' % (novel_name, chapter)
        # record the chapter -> file path mapping in MongoDB
        chapter_mongodb.insert({chapter: file_path, 'pid': id})
        for content in contents:
            f.write(content)
            f.write('<br>')
        f.close()
        print('+++++++++++++++++++++')
        next_chapter = "https:" + response.xpath(
            '//a[@id="j_chapterNext"]/@href').extract()[0]
        if next_chapter.find('lastpage') > 0:
            if status_flag == 0:
                self.red.lpush('serialize_list', id + ',' + response.url)
            return None
        print('+++++++++++++++++++++')
        request = Request(
            next_chapter,
            callback=lambda response, id=id, status_flag=status_flag:
                self.spider_one_novel(response, id, status_flag))
        yield request
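spider_detail_novel carries id and status_flag into the chapter callback by binding them as lambda default arguments. Scrapy's Request.meta is another way to move per-request state between callbacks; the hypothetical spider below only illustrates that pattern and is not part of the original project.

# Hypothetical sketch: passing state through Request.meta instead of lambda defaults.
import scrapy
from scrapy import Request

class MetaPassingSpider(scrapy.Spider):
    name = "meta_passing_example"  # invented name, for illustration only
    start_urls = ["https://www.qidian.com"]

    def parse(self, response):
        # attach the values to the request and read them back in the callback
        yield Request(response.url, callback=self.parse_chapter,
                      meta={'id': 'some-object-id', 'status_flag': 0},
                      dont_filter=True)

    def parse_chapter(self, response):
        id = response.meta['id']
        status_flag = response.meta['status_flag']
        self.logger.info('id=%s status_flag=%s', id, status_flag)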
Example #7
 def pushRedis(self, pid, cid, href):
     # encode the parent id, category id and href as one comma-separated Redis entry
     red = getredis()
     href = "%s,%s,%s" % (pid, cid, href)
     red.lpush('bnovel_all_list', href)
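pushRedis stores each entry as a single "pid,cid,href" string; the spiders' __init__ methods read the list back with lrange and recover the fields with split(','). A small round-trip sketch, assuming the getredis() helper sketched earlier (the example values are made up):

# Round-trip sketch for the "pid,cid,href" encoding (values are illustrative only).
red = getredis()
red.lpush('bnovel_all_list', "%s,%s,%s" % ("5c9f0b2e", "12", "//www.qidian.com/xuanhuan"))

for raw in red.lrange('bnovel_all_list', 0, -1):
    raw = str(raw, encoding="utf-8")    # redis-py returns bytes
    pid, cid, href = raw.split(',', 2)  # maxsplit guards against commas inside the url
    print(pid, cid, "https:" + href)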