Ejemplo n.º 1
0
    def parse(self, response):
        """Parse a book listing page and yield one item per book entry.

        Every ``<li>`` under ``//div[@class='bd']/ul`` is treated as one book.
        Each field is read with ``extract_first()``, so a field whose node is
        absent is stored as ``None`` rather than raising.

        Args:
            response: the downloaded listing page.

        Yields:
            DoubanBookItem: one freshly created item per list entry.
        """
        ul_xpath = "//div[@class='bd']/ul/li"
        info_xpath = "./div[@class='info']"

        # (item field, XPath suffix appended to ``info_xpath``), in the order
        # the fields are filled in.
        info_fields = (
            ('book_name', "/div[@class='title']/a/text()"),
            ('subtitle', "/div[@class='title']/p/text()"),
            ('price', "//div[@class='action-buttons']/span/text()"),
            ('author', "//a[@class='author-item']/text()"),
            ('category',
             "//span[@class='category']/span[@class='labeled-text']/span/text()"),
            ('average', "//span[@class='rating-average']/text()"),
            ('evaluate', "//span[@class='rating-amount']/a/span/text()"),
            ('desc', "/div[@class='article-desc-brief']/text()"),
        )

        for entry in response.xpath(ul_xpath):
            # Build a new item for every entry. Re-using one item object and
            # mutating it between yields makes every previously yielded
            # reference show the data of the latest row.
            book_item = DoubanBookItem()
            for field, suffix in info_fields:
                book_item[field] = entry.xpath(info_xpath + suffix).extract_first()
            # The cover image sits outside the info block, in the first <div>.
            book_item['cover'] = entry.xpath(
                "./div[1]/a/img/@src").extract_first()
            yield book_item

        next_link = response.xpath(".//li[@class='next']/a/@href").extract_first()
        if next_link:
            # NOTE(review): the absolute URL is built here but nothing in the
            # visible code schedules a request for it -- confirm whether a
            # follow-up request is meant to be yielded.
            next_link = "%s%s" % (self.start_urls[0], next_link)
Ejemplo n.º 2
0
 def parse_next(self, response):
     """Yield one ``DoubanBookItem`` per ``<tr class="item">`` row of the page.

     A field whose XPath matches nothing is stored as ``None``. Indexing the
     empty result with ``[0]`` would raise ``IndexError``, which stops the
     generator and loses every remaining row on the page.

     Args:
         response: the downloaded listing page.

     Yields:
         DoubanBookItem: item with ``name``, ``price`` and ``ratings`` set.
     """
     # (item field, XPath relative to the row), in assignment order.
     field_paths = (
         ('name', 'td[2]/div[1]/a/@title'),
         ('price', 'td[2]/p/text()'),
         ('ratings', 'td[2]/div[2]/span[2]/text()'),
     )
     for row in response.xpath('//tr[@class="item"]'):
         book = DoubanBookItem()
         for field, path in field_paths:
             matches = row.xpath(path).extract()
             book[field] = matches[0] if matches else None
         yield book
Ejemplo n.º 3
0
    def parse(self, response):
        """Yield one item per book row, then request the next page if any.

        Text fields take the first XPath match, or ``''`` when nothing
        matches. The same default is applied to all of them, so a row that
        lacks e.g. a rating cannot raise ``IndexError`` and stop the generator
        before the next-page request is yielded. ``img_url`` is stored as the
        full list of matches.

        Args:
            response: the downloaded listing page.

        Yields:
            DoubanBookItem: one per ``<tr class="item">`` row.
            Request: for the "next" link, handled by this same callback.
        """
        selector = Selector(response)

        # The two cells of a row are told apart by the ``width`` attribute:
        # the cell without it is queried for the text fields, the cell with it
        # for the cover image.
        text_cell = 'td[@valign="top"  and not(@width)]'
        image_cell = 'td[@valign="top"  and @width]'

        # (item field, XPath relative to the row), in assignment order.
        text_fields = (
            ('title', text_cell + '/div[@class="pl2"]/a/text()'),
            ('title2', text_cell + '/div[@class="pl2"]/span/text()'),
            ('info', text_cell + '/p[@class="pl"]/text()'),
            ('rate', text_cell
             + '/div[@class="star clearfix"]/span[@class="rating_nums"]/text()'),
            ('hot', text_cell
             + '/div[@class="star clearfix"]/span[@class="pl"]/text()'),
        )
        image_path = image_cell + '/a[@class="nbg"]/img/@src'

        for row in selector.xpath('//tr[@class="item"]'):
            item = DoubanBookItem()
            for field, path in text_fields:
                matches = row.xpath(path).extract()
                item[field] = matches[0] if matches else ''
            item['img_url'] = row.xpath(image_path).extract()
            yield item

        nextlink = selector.xpath('//span[@class="next"]/a/@href').extract()
        if nextlink:
            yield Request(nextlink[0], callback=self.parse)
Ejemplo n.º 4
0
    def parse_item(self, response):
        """Extract the details of a single book page into a ``DoubanBookItem``.

        Extraction is best-effort: if any lookup fails, the error and the
        partially filled item are logged at DEBUG level and the item is
        returned with whatever fields were set before the failure.

        Args:
            response: the downloaded book detail page.

        Returns:
            DoubanBookItem: the (possibly partially) populated item.
        """
        sel = Selector(response)
        item = DoubanBookItem()

        # Keep only the text nodes of the info block that do not start with a
        # newline. This must be a real list: ``filter()`` returns a lazy
        # iterator on Python 3, and indexing it raises TypeError, which the
        # broad ``except`` below would swallow.
        infos = [info
                 for info in sel.xpath('//div[@id="info"]/text()').extract()
                 if not info.startswith('\n')]

        # Sample ``infos`` content recorded for one book page:
        #   [0] 湖南文艺出版社    -> bookpress
        #   [1] 2015-7-1          -> pressdate
        #   [2] 192               -> bookpages
        #   [3] 48.00元           -> bookprice
        #   [4] 平装              (not stored)
        #   [5] 9787540471699     -> bookisbn
        try:
            item['bookname'] = sel.xpath(
                '//span[@property="v:itemreviewed"]/text()').extract()[0]
            item['bookauthor'] = sel.xpath(
                '//a[@class=""]/text()').extract()[0]
            item['bookpress'] = infos[0]
            item['pressdate'] = infos[1]
            item['bookpages'] = infos[2]
            item['bookprice'] = infos[3]
            item['bookisbn'] = infos[5]
            item['bookcode'] = sel.xpath(
                '//strong[@property="v:average"]/text()').extract()[0]

            # Keep only the text before the first '.' of the price.
            item['bookprice'] = item['bookprice'].split('.')[0]
            item['bookcode'] = item['bookcode'].strip()
        except Exception as e:
            log.msg('Error Info: %s' % e, level=log.DEBUG)
            log.msg('Item: %s' % item, level=log.DEBUG)

        return item