def parse(self, response):
        """Yield one loaded ``MovieItem`` per data row of the ranking table.

        The rows are taken from the ``<table class="left rt_table">`` element
        of the response.  Rows containing a ``<th>`` cell are treated as the
        header and skipped.  For every remaining row the category, rank,
        tomato rating, title, review count and year are collected through an
        ``ItemLoader`` scoped to that row.

        If no rows are found an error is logged and nothing is yielded.

        Raises:
            Exception: any error raised while parsing is logged through the
                spider's ``log`` method and then re-raised unchanged.
        """
        try:
            sel = Selector(response)

            # All rows of the ranking table (the header row is filtered below).
            movies = sel.xpath('//table[@class="left rt_table"]/tbody/tr')

            # Make sure we have a list of movies; nothing to do otherwise.
            if not movies:
                self.log("Unable to find list of movies in {:s}.".format(response.request.url), level=log.ERROR)
                return

            for movie in movies:
                # Ignore the header row, which is the first row returned
                if movie.xpath('th'):
                    continue

                # Gather information about the movie; relative XPaths below
                # are evaluated against this row.
                loader = ItemLoader(MovieItem(), response=response, selector=movie)

                # Absolute XPath: the category comes from the selected option
                # of the page-level form, not from the row.
                loader.add_xpath('category', '//form[@action="/top/bestofrt/"]/p/select/option[@selected="selected"]/text()')

                loader.add_xpath('rank', 'td[1]/text()', re=r'\d+')  # ignore the '.'
                loader.add_xpath('rating_tomatoes', 'td[2]/span/span[2]/text()', re=r'\d+')
                # The link text has the form "Title (YYYY)": the title is
                # everything before the year, the year is the trailing digits.
                loader.add_xpath('title', 'td[3]/a/text()', re=r'.*(?= \([0-9]{4}\))')
                loader.add_xpath('review_count', 'td[4]/text()')
                loader.add_xpath('year', 'td[3]/a/text()', re=r'\d{4}(?=\)$)')

                yield loader.load_item()

                # TODO: follow the details page once a details callback is
                # available on the spider, e.g.
                #   href = movie.xpath('td[3]/a/@href').extract()[0]
                #   details_href = 'http://' + response.url.split('/')[2] + str(href)
                #   yield Request(url=details_href, callback=self.parse_movie_details)

        except Exception:
            # Log the exception then reraise it.  ``log`` is a module here
            # (it supplies ``log.ERROR``), so the message must go through the
            # spider's own ``log`` method rather than calling the module.
            self.log("Could not parse URL '{:s}'".format(response.request.url), level=log.ERROR)
            raise

        # NOTE(review): this function is defined inside ``parse`` and is never
        # called; it looks like it was meant to be a spider method used as the
        # details-page callback -- confirm and move it if so.
        def parse_movie_details(self, response, loader):
            """Placeholder for details-page processing; yields the loaded item."""

            # Process further details here

            yield loader.load_item()
Ejemplo n.º 2
0
	* Scrapy提供5层logging级别
		scrapy.log.CRITICAL	严重错误的Log级别

		scrapy.log.ERROR	错误的Log级别 Log level for errors

		scrapy.log.WARNING	警告的Log级别 Log level for warnings

		scrapy.log.INFO		记录信息的Log级别(生产部署时推荐的Log级别)

		scrapy.log.DEBUG	调试信息的Log级别(开发时推荐的Log级别)
	
	* 使用
		from scrapy import log
		log.msg("This is a warning", level=log.WARNING)
	
	* 在spider中添加log的推荐方式是使用Spider的 log() 方法
	* 该方法会自动在调用 scrapy.log.msg() 时赋值 spider 参数,其他的参数则直接传递给 msg() 方法

	* '以上都是被标识为过时(废弃)的了... ...'

----------------------------
logging						|
----------------------------
	import logging
	logging.warning("This is a warning")
	logging.debug("This is a debug message")

	logging.log(logging.WARNING, "This is a warning")

	* 在scrapy中使用
		self.logger.warning('警告信息')
Ejemplo n.º 3
0
 def process_response(self, request, response, spider):
     """Log the URL of the request that produced ``response`` and pass it on.

     Returns:
         The ``response`` object, unchanged.  A downloader-middleware
         ``process_response`` hook has to return a response (or a request);
         falling off the end would return ``None`` instead.
     """
     log('Response received from request url %s ' % (request.url))
     return response
Ejemplo n.º 4
0
 def process_request(self, request, spider):
     """Log the URL of the outgoing ``request``; returns ``None``."""
     message = 'Requesting url %s with ' % request.url
     log(message)