Example #1
0
 def parse(self, response):
     """Scrape a single book detail page into one BookItem.

     Most fields are the text of the first (or an indexed) xpath match,
     UTF-8 encoded; date/price are left unencoded and image_urls keeps
     every match, exactly as the site layout requires.
     """
     def text_at(xpath, index=0):
         # Extract the index-th text node and encode it as UTF-8 bytes.
         return response.xpath(xpath)[index].extract().encode("utf-8")

     item = BookItem()
     item['url'] = response.url
     item['title'] = text_at('//div[@class = "padding"]//h1/text()')
     item['author'] = text_at('//span[@itemprop = "name"]/text()')
     # Same selector as author: the second "name" span is the publisher.
     item['yayinevi'] = text_at('//span[@itemprop = "name"]/text()', 1)
     item['summary'] = text_at('//span[@itemprop = "description"]/text()')
     item['language'] = text_at(
         '//table[@class = "attribute"]//tr//td//span/text()', 1)
     item['date'] = response.xpath(
         '//table[@class = "attribute"]//tr//td[@itemprop = "datePublished"]/text()'
     )[0].extract()
     item['genre'] = text_at(
         '//div[@class = "grid_6 omega alpha section"]//a/text()', 2)
     item['image_urls'] = response.xpath(
         '//div[@class = "image"]//a//img[@itemprop = "image"]/@src'
     ).extract()
     item['price'] = response.xpath(
         '//div[@class = "price-sales column-box mg-b-20"]//span[@class = "value"]/text()'
     )[0].extract()
     return [item]
Example #2
0
    def detail_parse(self, response):
        """Parse a book detail page into a BookItem.

        Raw xpath hits are run through the spider's private normalizers
        (_author, _pubdate, _isbn, _description) where cleanup is needed.
        """
        book = BookItem()
        book['image_url'] = response.xpath(
            "//img[@id='largePic']/@src").extract_first()
        book['title'] = response.xpath(
            "//div[@class='name_info']/h1[1]/@title").extract_first()
        book['author'] = self._author(
            response.xpath("//span[@id='author']/a[1]/text()").extract())
        book['publisher'] = response.xpath(
            "//div[@class='messbox_info']/span[2]/a[1]/text()").extract_first()
        book['pubdate'] = self._pubdate(response.xpath(
            "//div[@class='messbox_info']/span[3]/text()").extract_first())
        book['isbn'] = self._isbn(response.xpath(
            "//ul[@class='key clearfix']/li[5]/text()").extract_first())
        # Drop the first category crumb, join the rest comma-separated.
        book['types'] = ','.join(response.xpath(
            "//span[@class='lie']/a/text()").extract()[1:])
        book['description'] = self._description(response.xpath(
            "//div[@class='name_info']/h2/span[1]/text()").extract_first())
        yield book
Example #3
0
    def parse_item(self, response):
        """Return a BookItem holding the chapter heading and its full text."""
        item = BookItem()
        # First h3 on the page is the chapter title.
        item['chapter'] = response.xpath('//h3/text()').extract()[0]
        paragraphs = response.xpath(
            "//div[@class='read-content j_readContent']//p/text()").extract()
        item['text'] = ','.join(paragraphs)
        return item
Example #4
0
File: book.py Project: LSY0901/Book
 def u_start(self, response):
     """Queue a request (via u_parse) for every sub-category link on the page.

     Fixes over the original: the `for i in range(0, len(ss))` index loop is
     replaced by direct iteration, and a throwaway BookItem — whose 'mhref'
     field was only used as a temporary variable and never yielded — is
     removed. The emitted requests are unchanged.
     """
     for url in response.xpath('//div[@class="info"]/h2/a/@href').extract():
         # NOTE(review): a blocking sleep stalls the whole Scrapy reactor;
         # DOWNLOAD_DELAY is the idiomatic throttle — kept here to preserve
         # the existing one-request-per-second pacing.
         time.sleep(1)
         yield scrapy.Request(url, callback=self.u_parse)
Example #5
0
 def parse_item(self, response):
     """Yield one BookItem per listing entry on the page.

     Bug fix: the original created a single BookItem before the loop and
     yielded that same object on every iteration. Because Scrapy processes
     yielded items asynchronously, every yielded reference aliased one
     mutating object, so earlier yields could end up carrying later
     entries' data. A fresh item is now built for each entry.
     """
     for entry in response.xpath('//div[@class="mod-list-item"]'):
         item = BookItem()
         item['title'] = ''.join(
             entry.xpath('.//a[@class="title-link"]/text()').extract())
         item['description'] = ''.join(
             entry.xpath('.//p[@class="smaller pb15"]/text()').extract())
         item['price'] = ''.join(
             entry.xpath('.//p[@class="currentPrice"]/text()').extract())
         yield item
Example #6
0
    def parse(self, response):
        """Walk the JD book-sort box and request every sub-category page."""
        # Log which URL this callback is handling.
        print('我是请求的url地址:', response.url)
        # dt elements are the big-category headings.
        for dt in response.xpath('//*[@id="booksort"]/div[2]/dl/dt'):
            item = BookItem()
            item['big_name'] = dt.xpath('./a/text()').extract_first()
            # The dd immediately following each dt holds its sub-categories.
            for em in dt.xpath('./following-sibling::*[1]/em'):
                item['small_name'] = em.xpath('./a/text()').extract_first()
                # hrefs are protocol-relative; prefix the scheme.
                item['small_url'] = 'https:' + em.xpath(
                    './a/@href').extract_first()
                # deepcopy snapshots the item so later loop mutations
                # cannot clobber the copy riding along in meta.
                yield scrapy.Request(item['small_url'],
                                     callback=self.parse_small_list,
                                     meta={'bigkey': deepcopy(item)})
Example #7
0
 def parse_item(self, response):
     """Extract a chapter title and body text from a novel reader page.

     Fixes over the original: the nested duplicate `if len(title) > 1:`
     check is collapsed to one, and the dunder calls
     `titletext.__contains__(...)` are replaced by the idiomatic `in`
     operator. Split separators and outputs are unchanged.
     """
     selector = etree.HTML(response.body)
     book = BookItem()
     titletext = selector.xpath("//h1/text()")[0].strip()
     # The heading is "<number>章:<title>" (or with a ';' variant);
     # split keeps the title in element [1].
     if '章:' in titletext:
         title = titletext.split("章:")
     elif '章;' in titletext:
         title = titletext.split("章;")
     else:
         # No separator found: keep the whole heading as the title.
         title = ['', titletext]
     if len(title) > 1:
         book["content"] = selector.xpath("//div[@id='content']/text()")
         book["title"] = title[1]
         return book
     # Unexpected heading shape — log it for inspection (returns None).
     print("-----------------------", title)
Example #8
0
    def parse(self, response):
        """First-level parse: request the book list page of each category.

        Currently restricted to the first big category and its first
        sub-category (dt[1] and [:1]) — a development-time limit.
        """
        for dt in response.xpath('//*[@id="booksort"]/div[2]/dl/dt[1]'):
            item = BookItem()
            item['big_name'] = dt.xpath('a/text()').extract_first()
            # following-sibling::*[1] is the dd right after this dt; its
            # em children are the concrete sub-categories.
            sub_categories = dt.xpath('./following-sibling::*[1]/em')
            for em in sub_categories[:1]:
                item['small_name'] = em.xpath('a/text()').extract_first()
                # The href is protocol-relative; prepend the scheme.
                small_link = 'https:' + em.xpath('a/@href').extract_first()
                # Send the book list page request (second layer); deepcopy
                # snapshots the item so later mutation can't touch it.
                yield scrapy.Request(small_link,
                                     callback=self.parse_book,
                                     meta={'book': deepcopy(item)})
Example #9
0
File: book.py Project: LSY0901/Book
 def m_parse(self, response):
     """Interactive entry point: browse Douban tags or search by book title.

     Choice 1 lists the sub-tags of a picked genre column and follows the
     chosen tag page (u_start); choice 2 queries the Douban search API and
     follows the first hit's subject page (u_parse); anything else retries
     this page.

     Fixes over the original: the final branch passed the Response object
     itself to scrapy.Request, which requires a URL string — now uses
     response.url; the urlopen handle is closed via a context manager
     instead of being leaked; the `n`/`m` shadowing in branch 2 is removed.
     """
     print('*********' + "1、类型选择  2、自己搜寻" + '*********')
     n = int(input())
     if n == 1:
         print("1、文学   2、流行   3、文化   4、生活   5、经管   6、科技")
         m = int(input("请选择您喜欢的类型:"))
         # Column m of the genre table holds this genre's sub-tags.
         m_title = ('//div[@class="article"]/div[2]/div[' + str(m) +
                    ']/table/tbody/tr/td/a/text()')
         titles = response.xpath(m_title).extract()
         print(titles)
         item = BookItem()
         a = []
         for i, value in enumerate(titles):
             item['title'] = value  # sub-tag name
             fulurl = 'https://book.douban.com/tag/' + item['title']
             item['href'] = fulurl
             a.append(fulurl)
             print(i + 1, fulurl)  # menu is 1-based for the user
             time.sleep(1)
         m = int(input("请继续选择您喜欢的种类:"))
         url = a[m - 1]
         yield scrapy.Request(url, callback=self.u_start)
     elif n == 2:
         query = input("请输入你要查询的书名:")
         # https://api.douban.com/v2/book/search?q=
         url = 'https://api.douban.com/v2/book/search?q=' + quote(query)
         # Context manager closes the HTTP response (was leaked before).
         with urllib.request.urlopen(url) as resp:
             rs = json.loads(resp.read())
         ids = [book['id'] for book in rs['books']]
         # https://book.douban.com/subject/1008145/
         ull = 'https://book.douban.com/subject/' + ids[0]
         print(ull)
         yield scrapy.Request(ull, callback=self.u_parse)
     else:
         # BUG FIX: scrapy.Request takes a URL string, not a Response.
         # NOTE(review): the dupe filter may drop this re-request of the
         # same URL; dont_filter=True may be needed — confirm intent.
         yield scrapy.Request(response.url, callback=self.m_parse)
Example #10
0
File: book.py Project: LSY0901/Book
 def u_parse(self, response):
     """Scrape title/author/score/count/publisher/summary from a Douban
     book page, then request the book's comments page with the item in meta.

     The n_* xpath variants are fallbacks tried when the primary m_* form
     matches nothing (page layout varies between books).
     """
     # NOTE(review): a blocking sleep stalls the Scrapy reactor; presumably
     # meant as request throttling — confirm intent.
     time.sleep(1)
     s = response.url
     fulurl = s + '/comments/'  # comments page for this book
     m_ltitle = '//div[@id="wrapper"]/h1/span/text()'
     # //div[@id="info"]/span/a/text()
     m_lauthor = '//div[@id="info"]/a[1]/text()'
     n_lauthor = '//div[@id="info"]/span/a/text()'  # fallback author xpath
     m_lscore = '//strong/text()'
     m_lnumber = '//div[@class="rating_sum"]/span/a/span/text()'
     m_lbs = '//div[@id="info"]/text()'
     m_lcontent = '//div[@id="link-report"]/div/div/p[1]/text()'
     # //div[@id="link-report"]/*/div/div/p[1]/text()
     n_lcontent = '//div[@id="link-report"]/*/div/div/p[1]/text()'  # fallback
     ltitles = response.xpath(m_ltitle).extract()  # title
     lauthors = response.xpath(m_lauthor).extract()  # author
     lscores = response.xpath(m_lscore).extract()  # rating score
     lnumbers = response.xpath(m_lnumber).extract()  # rating count
     lbss = response.xpath(m_lbs).extract()  # publisher text nodes
     lcontents = response.xpath(m_lcontent).extract()  # summary
     if len(lauthors) == 0:
         lauthors = response.xpath(n_lauthor).extract()
     if len(lcontents) == 0:
         lcontents = response.xpath(n_lcontent).extract()  # summary fallback
     if len(lcontents) == 0:
         lcontents = ['null']  # placeholder when no summary exists
     item = BookItem()
     # NOTE(review): assumes lauthors/lscores/lnumbers/lcontents are each at
     # least as long as ltitles, and lbss has at least i+5 entries —
     # IndexError otherwise; verify against the live page layout.
     for i in range(0, len(ltitles)):
         item['ltitle'] = ltitles[i]
         item['lauthor'] = lauthors[i]
         item['lscore'] = lscores[i]
         item['lnumber'] = lnumbers[i]
         # i+4 skips leading label/whitespace text nodes of div#info;
         # presumably entry i+4 is the publisher — TODO confirm.
         item['lbs'] = lbss[i + 4]
         item['lcontent'] = lcontents[i]
         print('**********' + item['ltitle'] + item['lauthor'] +
               item['lscore'] + item['lnumber'] + item['lbs'] +
               item['lcontent'])
         yield scrapy.Request(fulurl,
                              meta={'item': item},
                              callback=self.PLk)