def parse(self, response):
    """Schedule detail-page requests for the first ten listed movies.

    Selects every ``div`` whose class attribute is exactly
    ``channel-detail movie-item-title``, reads the ``href`` of its direct
    ``<a>`` child and requests that page with ``self.parse2`` as callback.

    Args:
        response: The listing-page response to scan.

    Yields:
        scrapy.Request: One request per selected node that carries a link.
        Each request has a fresh ``MaoyanItem`` under ``meta['item']``.
    """
    title_nodes = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    # Only the first ten matching nodes are followed.
    for node in title_nodes[:10]:
        href = node.xpath('./a/@href').extract_first()
        if href is None:
            # extract_first() returns None when the node has no link.
            # Concatenating None onto the host raises TypeError, which would
            # end this generator and drop the remaining entries.
            continue
        yield scrapy.Request(
            url='https://maoyan.com' + href,
            meta={'item': MaoyanItem()},
            callback=self.parse2,
        )
def parse_page(self, response):
    """Extract movie records from the page text with a regular expression.

    Each regex match provides six captured groups: image path, title,
    star text, release-time text, and the integer and fraction parts of
    the score.

    Every field is stored as a plain string. The earlier version left
    trailing commas after four of the assignments, which wrapped those
    values in 1-tuples. It also reused a single item for all matches and
    returned only once, so at most one record per page was emitted.

    Args:
        response: Response whose ``text`` is searched.

    Yields:
        MaoyanItem: One freshly created item per regex match. Nothing is
        yielded when the pattern does not match.
    """
    # The pattern text is unchanged; it is only split across lines.
    # re.S lets ``.`` cross line breaks in the markup.
    pattern = re.compile(
        '<img data-src="http://p0.meituan.net/movie/(.*?)" .*?/>.*?'
        '<a href=".*?".*?>(.*?)</a>.*?'
        '<p class=".*?">(.*?)</p>.*?'
        '<p class="releasetime">(.*?)</p>.*?'
        '<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>',
        re.S)
    for match in pattern.findall(response.text):
        img_path, title, star_text, show_time, integer, fraction = match
        # A new item per match, so earlier results are not overwritten.
        item = MaoyanItem()
        # The capture excludes the host prefix, so it is prepended again.
        item['img_url'] = 'http://p0.meituan.net/movie/' + img_path
        item['title'] = title
        item['star_name'] = star_text.strip()
        item['show_time'] = show_time
        # The score is captured as two text pieces, joined as-is.
        item['score'] = integer + fraction
        yield item
def parse(self, response):
    """Yield one item per board entry, then requests for further pages.

    Every ``dd`` directly under ``dl.board-wrapper`` becomes a
    ``MaoyanItem`` with ``name``, ``star`` and ``time`` fields. After the
    entries, requests for offsets 10, 20, ..., 90 are yielded with this
    same method as their callback.

    Args:
        response: The board-page response to parse.

    Yields:
        MaoyanItem for each entry, followed by scrapy.Request objects.

    Raises:
        IndexError: If an entry lacks one of the three selected values.
    """
    for entry in response.xpath('//dl[@class="board-wrapper"]/dd'):
        movie = MaoyanItem()

        titles = entry.xpath('./a/@title').extract()
        movie['name'] = titles[0]

        stars = entry.xpath('.//p[@class="star"]/text()').extract()
        movie['star'] = stars[0].strip()

        release_times = entry.xpath(
            './/p[@class="releasetime"]/text()').extract()
        movie['time'] = release_times[0]

        yield movie

    # Hand each follow-up URL to the scheduler to be enqueued.
    for page in range(1, 10):
        next_url = 'http://maoyan.com/board/4?offset={}'.format(page * 10)
        yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Yield one ``MaoyanItem`` per ``dd`` under ``dl.board-wrapper``.

    A new item is created for every node. The earlier version created a
    single item before the loop and yielded it repeatedly, so every
    yielded reference pointed at the same object and ended up holding the
    values of the last node.

    Args:
        response: The board-page response to parse.

    Yields:
        MaoyanItem: ``title`` and ``star`` each hold the list returned by
        ``extract()`` (all matching text nodes, possibly empty), not a
        single string.
    """
    # NOTE(review): prints the whole page body to stdout; this looks like
    # leftover debugging output - confirm before removing.
    print(response.text)
    for node in response.xpath("//dl[@class='board-wrapper']//dd"):
        item = MaoyanItem()
        item['title'] = node.xpath(".//p[@class='name']/a/text()").extract()
        item['star'] = node.xpath(".//p[@class='star']/text()").extract()
        yield item
def parse2(self, response):
    """Build a ``MaoyanItem`` from a movie detail page.

    Args:
        response: The detail-page response to parse.

    Returns:
        MaoyanItem with three fields:
          * ``Movie_Name`` - text of the first ``h1.name``, or None.
          * ``Movie_Categories`` - stripped text of every ``a.text-link``
            joined with ``/`` (empty string when there are none).
          * ``Release_Date`` - first ten characters of the text of the
            third ``li.ellipsis``, or None when that node is missing.
    """
    item = MaoyanItem()
    # Build the selector once instead of once per query.
    selector = Selector(response=response)

    # Movie name
    movie_name = selector.xpath('//h1[@class="name"]/text()').extract_first()

    # Movie categories
    categories = [
        text.strip()
        for text in selector.xpath('//a[@class="text-link"]/text()').extract()
    ]

    # Release date
    raw_release = selector.xpath(
        '//li[@class="ellipsis"][3]/text()').extract_first()
    # extract_first() returns None when nothing matches; slicing None
    # raised TypeError and lost the whole item. Store None instead, the
    # same way a missing name is stored.
    release_date = raw_release[:10] if raw_release is not None else None

    item['Movie_Name'] = movie_name
    item['Movie_Categories'] = '/'.join(categories)
    item['Release_Date'] = release_date
    return item
def parse2(self, response):
    """Collect name, categories and release date from a detail page.

    Args:
        response: The detail-page response to parse.

    Returns:
        MaoyanItem with three fields:
          * ``movie_name`` - text of the first ``h1.name``, or None.
          * ``movie_categories`` - stripped text of every ``a.text-link``
            joined with ``/`` (empty string when there are none).
          * ``release_date`` - first ten characters of the text of the
            third ``li.ellipsis``, or None when that node is missing.
    """
    item = MaoyanItem()
    # One selector serves all three queries.
    page = Selector(response=response)

    # Movie name
    item['movie_name'] = page.xpath(
        '//h1[@class="name"]/text()').extract_first()

    # Movie categories
    category_texts = page.xpath('//a[@class="text-link"]/text()').extract()
    item['movie_categories'] = '/'.join(
        text.strip() for text in category_texts)

    # Release date
    release_text = page.xpath(
        '//li[@class="ellipsis"][3]/text()').extract_first()
    if release_text is None:
        # No third li.ellipsis on the page: slicing None would raise
        # TypeError and discard the item, so keep None instead.
        item['release_date'] = None
    else:
        item['release_date'] = release_text[:10]
    return item