Ejemplo n.º 1
0
    def parse_item(self, response: HtmlResponse):
        """Extract book summaries from a listing page.

        For every ``div.book-info`` node this yields a cover-image request
        (handled by ``self.parse_img``) followed by the populated item.

        Args:
            response: The downloaded listing page.

        Yields:
            ``scrapy.Request`` objects for cover images and ``DushuItem``
            objects, interleaved per book.
        """
        print('--------获取图书概要信息----------')
        print(response.url)
        # extract_first() gives None for a page without <title> instead of
        # raising IndexError and aborting the whole callback.
        print(response.xpath('//title/text()').extract_first())

        # Collect the book information found on the current page.
        for book in response.xpath('//div[@class="book-info"]'):
            # Build a fresh item per book: reusing one instance across
            # iterations would make every yielded item the same object, so
            # later books would overwrite the fields of earlier ones.
            item = DushuItem()
            item['name'] = book.xpath('./h3/a/text()').extract_first()
            item['book_url'] = book.xpath('./h3/a/@href').extract_first()
            item['author'] = book.xpath('./p/a/text()').extract_first()
            item['summary'] = book.xpath('./p[last()-1]/text()').extract_first()
            item['img'] = book.xpath('.//a/img/@data-original').extract_first()

            img = item['img']
            if img:
                # Only request a cover when one was found; a None URL makes
                # scrapy.Request raise.
                print('----发起{}图片下载-----'.format(img))
                # meta carries data from this request to its response
                # callback. Pass plain values rather than a reference to the
                # item object.
                yield scrapy.Request(url=response.urljoin(img),
                                     meta={'name': item['name']},
                                     callback=self.parse_img)

            yield item
Ejemplo n.º 2
0
    def parse_item(self, response):
        """Extract book summaries from a listing page.

        For every ``div.book-info`` node this yields a cover-image request
        (handled by ``self.parse_img``) followed by the populated item.

        Args:
            response: The downloaded listing page.

        Yields:
            ``scrapy.Request`` objects for cover images and ``DushuItem``
            objects, interleaved per book.
        """
        # Collect the book information found on the current page.
        for book in response.xpath('//div[@class="book-info"]'):
            # Build a fresh item per book: reusing one instance across
            # iterations would make every yielded item the same object, so
            # later books would overwrite the fields of earlier ones.
            item = DushuItem()
            item['name'] = book.xpath('./h3/a/text()').extract_first()
            item['book_url'] = book.xpath('./h3/a/@href').extract_first()

            # The author text is looked up in three places, in order: linked
            # text, plain paragraph text, then <em> text. The first
            # non-empty result wins.
            authors = (book.xpath('./p/a/text()').extract()
                       or book.xpath('./p/text()').extract()
                       or book.xpath('./p/em/text()').extract())
            item['author'] = ','.join(authors)

            item['summary'] = book.xpath('./p[last()-1]/text()').extract_first()
            item['img'] = book.xpath('.//a/img/@data-original').extract_first()

            img = item['img']
            if img:
                # Only request a cover when one was found; str(None) would
                # produce the bogus URL 'None', which scrapy.Request rejects.
                # meta carries data from this request to its response
                # callback. Pass plain values rather than a reference to the
                # item object.
                yield scrapy.Request(url=response.urljoin(img),
                                     meta={'name': item['name']},
                                     callback=self.parse_img)

            yield item
Ejemplo n.º 3
0
 def parse(self, response):
     """Yield one detail-page request per book entry on the listing page.

     Each request targets the entry's ``/book...`` link, is handled by
     ``self.parse2``, and carries the partially filled item in
     ``meta['item']``.

     Args:
         response: The downloaded listing page.

     Yields:
         ``scrapy.Request`` objects for the book detail pages.
     """
     li_list = response.xpath('/html/body/div[5]/div/div[2]/div[3]/ul/li')
     for li in li_list:
         # re_first() returns None when no href matches, whereas indexing
         # the result of re() raises IndexError and aborts the remaining
         # entries on the page.
         href = li.css('a[href]::attr("href")').re_first('/book.*')
         if href is None:
             continue

         item = DushuItem()
         item['name'] = li.xpath('div/h3/a/text()').extract_first()
         item['author'] = li.xpath('div/p[1]/a[1]/text()').extract_first()
         url = response.urljoin(href)
         item['url'] = url
         yield scrapy.Request(url=url,
                              callback=self.parse2,
                              meta={'item': item})
Ejemplo n.º 4
0
 def parse(self, response):
     """Handle both the table-of-contents page and the chapter pages.

     When the response is the first start URL, every chapter link under
     ``div.mulu ul li a`` is requested (with the default callback).
     Any other response is treated as a chapter page and turned into an item.

     Args:
         response: The downloaded page.

     Yields:
         ``scrapy.Request`` objects for chapter links, or a single
         ``DushuItem`` holding the chapter name and text.
     """
     if response.url == self.start_urls[0]:
         self.logger.info('访问小说目录' + response.url)
         for anchor in response.css("div.mulu ul li a"):
             link = anchor.css('a::attr(href)').extract_first()
             if not link:
                 # An anchor without an href yields None, and concatenating
                 # that below would raise TypeError and drop all remaining
                 # chapter links.
                 continue
             # NOTE(review): links are appended to the start URL as-is;
             # this presumably relies on them being relative to it - confirm
             # before switching to response.urljoin().
             yield scrapy.Request(self.start_urls[0] + link)
     else:
         self.logger.info('访问小说内容' + response.url)
         novel = response.css('div.novel')
         item = DushuItem()
         item['chapterName'] = novel.css('h1::text').extract_first()
         item['text'] = novel.css('div.yd_text2::text').extract()
         yield item
Ejemplo n.º 5
0
 def parse_item(self, response):
     """Yield one detail-page request per book on the listing page.

     The cover URL, title and author are read from the listing entry. The
     partially filled item travels to ``self.parse_info`` in
     ``meta['item']``.

     Args:
         response: The downloaded listing page.

     Yields:
         ``scrapy.Request`` objects for the book detail pages.
     """
     book_li_list = response.xpath('//div[@class="bookslist"]/ul/li')
     for book in book_li_list:
         item = DushuItem()
         item['book_image_url'] = book.xpath(
             './/div[@class="book-info"]/div/a/img/@data-original'
         ).extract_first()
         item['book_name'] = book.xpath(
             './/div[@class="book-info"]/div/a/img/@alt').extract_first()
         item['book_author'] = book.xpath(
             './/div[@class="book-info"]/p/a/text()').extract_first()
         print(item['book_name'])
         print(item['book_image_url'])

         detail_href = book.xpath(
             './/div[@class="book-info"]/h3/a/@href').extract_first()
         if detail_href is None:
             # No title link on this entry: concatenating None to the host
             # would raise TypeError and abort the remaining entries.
             continue

         # Follow up with a request to the detail page for the other fields.
         book_url = "http://www.dushu.com" + detail_href
         yield scrapy.Request(url=book_url,
                              callback=self.parse_info,
                              meta={'item': item})
Ejemplo n.º 6
0
    def parse_item(self, response: HtmlResponse):
        """Extract book summaries from a listing page.

        For every ``div.book-info`` node this yields a cover-image request
        (handled by ``self.parse_img``) followed by the populated item.

        Args:
            response: The downloaded listing page.

        Yields:
            ``scrapy.Request`` objects for cover images and ``DushuItem``
            objects, interleaved per book.
        """
        print('-----获取图书概要信息----')

        # Collect the book information found on the current page.
        for book in response.xpath('//div[@class="book-info"]'):
            # Build a fresh item per book: reusing one instance across
            # iterations would make every yielded item the same object, so
            # later books would overwrite the fields of earlier ones.
            item = DushuItem()
            item['name'] = book.xpath('./h3/a/text()').extract_first()
            # Path of the book's detail page.
            item['book_url'] = book.xpath('./h3/a/@href').extract_first()
            # All linked author names, comma-separated (empty string if none).
            item['author'] = ','.join(book.xpath('./p/a/text()').extract())
            item['summary'] = book.xpath('./p[last()-1]/text()').extract_first()
            item['img'] = book.xpath('.//a/img/@data-original').extract_first()

            img = item['img']
            if img:
                # Only request a cover when one was found; str(None) would
                # produce the bogus URL 'None', which scrapy.Request rejects.
                # meta carries data from this request to its response
                # callback. Pass plain values rather than a reference to the
                # item object.
                yield scrapy.Request(url=response.urljoin(img),
                                     meta={'name': item['name']},
                                     callback=self.parse_img)

            yield item