def parse_movie_item(self, response):
    """Populate a ``MovieItem`` from ``response``.

    The response URL is stored unchanged. ``name``, ``summary`` and ``score``
    each receive the first text node matched by an XPath that selects on the
    element's ``property`` attribute.

    Returns:
        The populated ``MovieItem``.
    """
    # (item field, XPath) pairs, applied in this order.
    property_queries = (
        ('name', '//span[@property="v:itemreviewed"]/text()'),
        ('summary', '//span[@property="v:summary"]/text()'),
        ('score', '//strong[@property="v:average"]/text()'),
    )

    item = MovieItem()
    item['url'] = response.url
    for field_name, query in property_queries:
        item[field_name] = response.xpath(query).extract_first()
    return item
def parse_movie_item(self, response):
    """Build a ``MovieItem`` from ``response``.

    The score is read first (CSS ``strong.rating_num``), followed by the URL,
    the name (first ``span`` text under ``div#content h1``) and the summary
    (``span`` with ``property="v:summary"``). The score is stored as the raw
    extracted text; no threshold is applied to it here.

    Returns:
        The populated ``MovieItem``.
    """
    item = MovieItem()

    rating_text = response.css('strong.rating_num::text').extract_first()
    item['score'] = rating_text

    item['url'] = response.url

    title_text = response.css('div#content h1 span::text').extract_first()
    item['name'] = title_text

    summary_text = response.xpath(
        '//span[@property="v:summary"]/text()').extract_first()
    item['summary'] = summary_text

    return item
def parse_item(self, response):
    """Build a ``MovieItem`` from ``response``.

    ``name`` is the first ``span`` text of the ``h1`` inside
    ``div#content``; ``summary`` is the text of the ``span`` with
    ``property="v:summary"``. Both are whitespace-stripped. ``score`` is the
    text of the ``strong`` whose class contains ``rating_num``.

    Any field whose selector matches nothing is stored as ``None`` instead of
    raising ``AttributeError`` from calling ``.strip()`` on ``None``.

    Returns:
        The populated ``MovieItem``.
    """
    item = MovieItem()
    item['url'] = response.url

    name = response.xpath(
        '//div[@id="content"]//h1/span[1]/text()').extract_first()
    # extract_first() gives None when nothing matched; only strip real text.
    item['name'] = name.strip() if name is not None else None

    summary = response.xpath(
        '//span[@property="v:summary"]/text()').extract_first()
    item['summary'] = summary.strip() if summary is not None else None

    item['score'] = response.xpath(
        '//strong[contains(@class, "rating_num")]/text()').extract_first()
    return item
def parse_movie_item(self, response):
    """Build a ``MovieItem`` from ``response``.

    * ``name``: text of the ``v:itemreviewed`` span directly under the
      ``h1`` of ``div#content``.
    * ``summary``: first regex capture taken from the ``v:summary`` span
      text; the pattern skips leading newlines and whitespace before the
      captured group.
    * ``score``: first ``strong`` text found anywhere under
      ``div#interest_sectl``.

    Returns:
        The populated ``MovieItem``.
    """
    item = MovieItem()
    item['url'] = response.url

    name_nodes = response.xpath(
        '//div[@id="content"]/h1/span[@property="v:itemreviewed"]/text()')
    item['name'] = name_nodes.extract_first()

    summary_nodes = response.xpath('//span[@property="v:summary"]/text()')
    item['summary'] = summary_nodes.re_first(r'\n*\s*(.*)\n*')

    rating_section = response.css('div#interest_sectl')
    item['score'] = rating_section.xpath('.//strong/text()').extract_first()

    return item
def parse_movie_item(self, response):
    """Build a ``MovieItem`` from ``response``.

    All three text fields are looked up relative to the ``div`` whose id is
    ``content``: the first ``h1/span`` text for ``name``, the ``v:summary``
    span for ``summary``, and the ``strong`` with class exactly
    ``"ll rating_num"`` for ``score``.

    Returns:
        The populated ``MovieItem``.
    """
    # Every field lives under the same container, so select it once.
    content = response.xpath('//div[@id="content"]')

    item = MovieItem()
    item['url'] = response.url
    item['name'] = content.xpath('.//h1/span/text()').extract_first()
    item['summary'] = content.xpath(
        './/span[@property="v:summary"]/text()').extract_first()
    item['score'] = content.xpath(
        './/strong[@class="ll rating_num"]/text()').extract_first()
    return item
def parse_movie_item(self, response):
    """Parse ``response`` into a ``MovieItem``.

    * ``name``: the page ``<title>`` text with its last five characters
      removed and then whitespace-stripped.
    * ``summary``: the ``v:summary`` span text with every whitespace
      character removed.
    * ``score``: the ``v:average`` strong text.

    A field whose selector matches nothing is stored as ``None``; the
    previous implementation raised ``IndexError`` / ``TypeError`` instead.

    Returns:
        The populated ``MovieItem``.
    """
    item = MovieItem()
    item['url'] = response.url

    title_text = response.css('title::text').extract_first()
    # The slice drops a fixed five-character tail before trimming
    # (presumably a constant site suffix in the title -- TODO confirm).
    item['name'] = (
        title_text[0:-5].strip() if title_text is not None else None
    )

    summary_text = response.css(
        'span[property="v:summary"]::text').extract_first()
    # Raw string: a plain '\s' literal is an invalid escape sequence.
    item['summary'] = (
        re.sub(r'\s', '', summary_text) if summary_text is not None else None
    )

    item['score'] = response.css(
        'strong[property="v:average"]::text').extract_first()
    return item
def parse_movie_item(self, response):
    """Build a ``MovieItem`` from ``response``.

    ``name`` and ``summary`` come from the spans carrying
    ``property="v:itemreviewed"`` and ``property="v:summary"``; ``score`` is
    the text of a ``strong`` that is a direct child of a ``div`` whose class
    contains ``rating_self``.

    Returns:
        The populated ``MovieItem``.
    """
    def first_text(query):
        # First text node selected by ``query`` on this response.
        return response.xpath(query).extract_first()

    item = MovieItem()
    item['url'] = response.url
    item['name'] = first_text('//span[@property="v:itemreviewed"]/text()')
    item['summary'] = first_text('//span[@property="v:summary"]/text()')
    item['score'] = first_text(
        '//div[contains(@class,"rating_self")]/strong/text()')
    return item
def parse_movie_item(self, response):
    """Yield a single ``MovieItem`` built from ``response``.

    ``name`` and ``summary`` are taken from the ``v:itemreviewed`` and
    ``v:summary`` spans; ``score`` from the ``strong`` whose class attribute
    is exactly ``"ll rating_num"``.

    Yields:
        The populated ``MovieItem``.
    """
    name_text = response.xpath(
        '//span[@property="v:itemreviewed"]/text()').extract_first()
    summary_text = response.xpath(
        '//span[@property="v:summary"]/text()').extract_first()
    score_text = response.xpath(
        '//strong[@class="ll rating_num"]/text()').extract_first()

    item = MovieItem()
    item['url'] = response.url
    item['name'] = name_text
    item['summary'] = summary_text
    item['score'] = score_text
    yield item
def parse_movie_item(self, response):
    """Build a ``MovieItem`` from ``response``.

    ``name`` is the first ``span`` text of the ``h1`` directly under the
    element with id ``content``; ``summary`` is the whitespace-stripped text
    of the ``v:summary`` span; ``score`` is read through a positional path
    below the element with id ``interest_sectl``.

    A missing summary is stored as ``None`` instead of raising
    ``AttributeError`` from calling ``.strip()`` on ``None``.

    Returns:
        The populated ``MovieItem``.
    """
    item = MovieItem()
    item['url'] = response.url
    item['name'] = response.xpath(
        '//*[@id="content"]/h1/span[1]/text()').extract_first()

    summary = response.xpath(
        '//span[@property="v:summary"]/text()').extract_first()
    # extract_first() gives None when nothing matched; only strip real text.
    item['summary'] = summary.strip() if summary is not None else None

    item['score'] = response.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()'
    ).extract_first()
    return item
def parse_movie_item(self, response):
    """Yield a ``MovieItem`` for ``response`` when its score is at least 8.

    ``name`` and ``summary`` come from the ``v:itemreviewed`` and
    ``v:summary`` spans (summary is whitespace-stripped); ``score`` is the
    ``strong.rating_num`` text converted to ``float``.

    A page with no score text yields nothing, since it cannot satisfy the
    threshold; previously it raised ``AttributeError``. A missing summary is
    stored as ``None``.

    Yields:
        The populated ``MovieItem`` if ``score >= 8``; otherwise nothing.

    Raises:
        ValueError: if the score text is present but not a valid number.
    """
    score_text = response.css('strong.rating_num::text').extract_first()
    if score_text is None or not score_text.strip():
        # No score on the page: the item can never pass the >= 8 filter.
        return

    summary = response.xpath(
        '//span[@property="v:summary"]/text()').extract_first()

    item = MovieItem()
    item['url'] = response.url
    item['name'] = response.xpath(
        '//span[@property="v:itemreviewed"]/text()').extract_first()
    item['summary'] = summary.strip() if summary is not None else None
    item['score'] = float(score_text.strip())
    if item['score'] >= 8:
        yield item
def parse_movie_page(self, response):
    """Yield one ``MovieItem`` for ``response``.

    Nothing is yielded once ``DoubanMoviePipeline.count`` is greater than 40.
    Otherwise the item is constructed from a dict holding the response URL,
    the first ``h1/span`` text (name), the first ``span`` text under
    ``div#link-report`` (summary) and the ``strong`` whose class contains
    ``rating_num`` (score).
    """
    if DoubanMoviePipeline.count > 40:
        return

    def first_text(query):
        # First text node selected by ``query`` on this response.
        return response.xpath(query).extract_first()

    fields = {
        'url': response.url,
        'name': first_text('//h1/span[1]/text()'),
        'summary': first_text('//div[@id="link-report"]/span[1]/text()'),
        'score': first_text('//strong[contains(@class,"rating_num")]/text()'),
    }
    yield MovieItem(fields)
def parse_page(self, response):
    """Yield one ``MovieItem`` per entry of a JSON page, then the next page.

    The response body is decoded as JSON and its ``'data'`` list is walked.
    For every entry a *new* ``MovieItem`` is created, given the ``area``
    carried in ``response.meta`` and then filled with each declared item
    field that is present in the entry. Creating the item inside the loop
    keeps yielded items independent: a shared instance would be overwritten
    by later entries and would carry stale fields across entries.

    After a non-empty page, a follow-up ``Request`` for the next batch is
    yielded with a deep copy of the meta dict. An empty ``'data'`` list ends
    pagination.

    Yields:
        ``MovieItem`` objects, followed by at most one ``Request``.
    """
    meta = response.meta
    # Advance the paging offset in place; it is encoded into the next URL.
    meta['data']['start'] += 20

    movie_datas = json.loads(response.text)
    if not movie_datas['data']:
        return

    for movie_data in movie_datas['data']:
        item = MovieItem()
        item['area'] = response.meta['area']
        for field in item.fields:
            if field in movie_data:
                item[field] = movie_data[field]
        yield item

    url = self.header_url + urlencode(meta['data'])
    yield Request(url=url, meta=deepcopy(meta), callback=self.parse_page)