def parse(self, response):
    """Build one book item per list entry and follow the next-page link.

    Every ``<li>`` under ``//div[@class='bd']/ul`` yields a separate
    ``DoubanBookItem``. A field whose node is missing is set to ``None``
    (the ``extract_first()`` default). When the page has a
    ``li.next`` link, a request for that page is yielded with this same
    method as its callback.

    :param response: the downloaded listing page.
    :returns: generator of ``DoubanBookItem`` objects, optionally followed
        by one request for the next page.
    """
    info_xpath = "./div[@class='info']"
    # (item field, XPath relative to the <li>) pairs, in assignment order.
    field_xpaths = (
        ('book_name', "%s/div[@class='title']/a/text()" % info_xpath),
        ('subtitle', "%s/div[@class='title']/p/text()" % info_xpath),
        ('price',
         "%s//div[@class='action-buttons']/span/text()" % info_xpath),
        ('author', "%s//a[@class='author-item']/text()" % info_xpath),
        ('category',
         "%s//span[@class='category']/span[@class='labeled-text']/span/text()"
         % info_xpath),
        ('average',
         "%s//span[@class='rating-average']/text()" % info_xpath),
        ('evaluate',
         "%s//span[@class='rating-amount']/a/span/text()" % info_xpath),
        ('desc',
         "%s/div[@class='article-desc-brief']/text()" % info_xpath),
        ('cover', "./div[1]/a/img/@src"),
    )

    for entry in response.xpath("//div[@class='bd']/ul/li"):
        # A fresh item per entry: reusing one instance would let later
        # iterations overwrite the fields of items that were already yielded.
        book_item = DoubanBookItem()
        for field, xpath in field_xpaths:
            book_item[field] = entry.xpath(xpath).extract_first()
        yield book_item

    next_link = response.xpath(".//li[@class='next']/a/@href").extract_first()
    if next_link:
        # response.follow resolves a relative href against the current page
        # URL, so no manual prefixing with the start URL is required.
        yield response.follow(next_link, callback=self.parse)
def parse_next(self, response):
    """Yield a ``DoubanBookItem`` for each ``<tr class="item">`` row.

    For every row the first match of each XPath below is stored under the
    corresponding item field. A row with no match for one of the paths
    raises ``IndexError``.

    :param response: the downloaded page to scan for rows.
    :returns: generator of populated ``DoubanBookItem`` objects.
    """
    # (item field, XPath relative to the row), applied in this order.
    field_paths = (
        ('name', 'td[2]/div[1]/a/@title'),
        ('price', 'td[2]/p/text()'),
        ('ratings', 'td[2]/div[2]/span[2]/text()'),
    )
    rows = response.xpath('//tr[@class="item"]')
    for row in rows:
        record = DoubanBookItem()
        for field, path in field_paths:
            matches = row.xpath(path).extract()
            record[field] = matches[0]
        yield record
def parse(self, response):
    """Yield one item per ``<tr class="item">`` row, then the next page.

    The scalar fields (``title``, ``title2``, ``info``, ``rate``, ``hot``)
    hold the first matching text node, or ``''`` when the row has no such
    node. ``img_url`` keeps the full list of matching ``src`` values.
    If a ``span.next`` link is present, a ``Request`` for its first href is
    yielded with this method as the callback.

    :param response: the downloaded listing page.
    :returns: generator of ``DoubanBookItem`` objects, optionally followed
        by one ``Request`` for the next page.
    """
    def first_or_blank(values):
        # extract() returns a (possibly empty) list. Falling back to ''
        # keeps a single incomplete row from raising IndexError and
        # aborting the rest of the page, including the next-page request.
        return values[0] if values else ''

    selector = Selector(response)
    # The table cell (no width attribute) that carries all textual fields.
    text_cell = 'td[@valign="top" and not(@width)]'
    star_box = text_cell + '/div[@class="star clearfix"]'

    for row in selector.xpath('//tr[@class="item"]'):
        item = DoubanBookItem()
        item['title'] = first_or_blank(
            row.xpath(text_cell + '/div[@class="pl2"]/a/text()').extract())
        item['title2'] = first_or_blank(
            row.xpath(text_cell + '/div[@class="pl2"]/span/text()').extract())
        item['info'] = first_or_blank(
            row.xpath(text_cell + '/p[@class="pl"]/text()').extract())
        item['rate'] = first_or_blank(
            row.xpath(star_box + '/span[@class="rating_nums"]/text()').extract())
        item['hot'] = first_or_blank(
            row.xpath(star_box + '/span[@class="pl"]/text()').extract())
        # Stored as a list, exactly as returned by extract().
        item['img_url'] = row.xpath(
            'td[@valign="top" and @width]/a[@class="nbg"]/img/@src'
        ).extract()
        yield item

    nextlink = selector.xpath('//span[@class="next"]/a/@href').extract()
    if nextlink:
        yield Request(nextlink[0], callback=self.parse)
def parse_item(self, response):
    """Populate a ``DoubanBookItem`` from a single detail page.

    The text nodes directly under ``div#info`` that do not start with a
    newline are read positionally: index 0 -> ``bookpress``,
    1 -> ``pressdate``, 2 -> ``bookpages``, 3 -> ``bookprice``,
    5 -> ``bookisbn``. ``bookprice`` is then cut at the first ``'.'`` and
    ``bookcode`` is stripped of surrounding whitespace.

    Extraction is best effort: any exception is logged at DEBUG level and
    the item is returned with whatever fields were set before the failure.

    :param response: the downloaded detail page.
    :returns: the (possibly partially filled) ``DoubanBookItem``.
    """
    sel = Selector(response)
    item = DoubanBookItem()
    # Build a real list: on Python 3 ``filter`` returns a lazy iterator,
    # and indexing it below would raise TypeError for every page.
    infos = [
        info
        for info in sel.xpath('//div[@id="info"]/text()').extract()
        if not info.startswith('\n')
    ]
    # Sample ``infos`` from an interactive session, in order:
    #   publisher name, '2015-7-1', '192', '48.00' plus a currency suffix,
    #   binding type, '9787540471699'
    try:
        item['bookname'] = sel.xpath(
            '//span[@property="v:itemreviewed"]/text()').extract()[0]
        item['bookauthor'] = sel.xpath(
            '//a[@class=""]/text()').extract()[0]
        item['bookpress'] = infos[0]
        item['pressdate'] = infos[1]
        item['bookpages'] = infos[2]
        item['bookprice'] = infos[3]
        item['bookisbn'] = infos[5]
        # NOTE(review): 'bookcode' is filled from the v:average node, which
        # looks like a rating value -- confirm the intended meaning.
        item['bookcode'] = sel.xpath(
            '//strong[@property="v:average"]/text()').extract()[0]
        # Keep only the part of the price before the first '.'.
        item['bookprice'] = item['bookprice'].split('.')[0]
        item['bookcode'] = item['bookcode'].strip()
    except Exception as e:
        # Deliberately broad: a malformed page must not abort the crawl.
        log.msg('Error Info: %s' % e, level=log.DEBUG)
        log.msg('Item: %s' % item, level=log.DEBUG)
    return item