def parse_item(self, response):
    """Build one ``MovieprojectItem`` from a movie detail page and yield it.

    Table fields are read by row position from the table inside
    ``div.col-xs-8``; the poster, title and introduction come from their
    own nodes.  ``actor``, ``long_time`` and ``introduce`` fall back to
    ``''`` when their node is missing; the remaining fields are ``None``
    in that case.

    Args:
        response: Detail-page response exposing ``xpath()``.

    Yields:
        The populated ``MovieprojectItem``.
    """
    table = '//div[@class="col-xs-8"]/table/tbody/'

    def cell_text(row):
        """Return the whole text of the value cell (``td[2]``) of *row*, or None."""
        cell = response.xpath(table + row + '/td[2]')
        return cell.xpath('string(.)').extract_first()

    item = MovieprojectItem()
    # Poster image URL.
    item['post'] = response.xpath(
        '//a[@class="movie-post"]/img/@src').extract_first()
    # Movie title.
    item['name'] = response.xpath('//h1').xpath('string(.)').extract_first()
    # Score: value cell of the last table row.
    item['score'] = cell_text('tr[last()]')
    # Genre.  The original read tr[3] here, which is the same row used for
    # the cast below, so the genre field received the cast text.
    # NOTE(review): tr[4] is assumed to be the genre row - confirm against
    # a live page.
    item['_type'] = cell_text('tr[4]')
    # Director.
    item['director'] = response.xpath(
        table + 'tr[1]/td[2]/a/text()').extract_first()
    # Screenwriter.
    item['editor'] = response.xpath(
        table + 'tr[2]/td[2]/a/text()').extract_first()
    # Cast, e.g. 'A / B / C 显示全部'; spaces and the trailing
    # "show all" link text are removed.  Guard against a missing row so
    # that ``.replace`` is never called on None.
    actor = cell_text('tr[3]') or ''
    item['actor'] = actor.replace(' ', '').replace('显示全部', '')
    # Running time: only kept when the cell actually mentions minutes.
    duration = response.xpath(table + 'tr[8]/td[2]/text()').extract_first()
    item['long_time'] = duration if duration and '分钟' in duration else ''
    # Introduction text, without ideographic spaces and the "expand" label.
    introduce = response.xpath(
        '//div[@class="col-xs-12 movie-introduce"]'
    ).xpath('string(.)').extract_first()
    if introduce is None:
        item['introduce'] = ''
    else:
        item['introduce'] = introduce.replace('\u3000', '').replace('展开全部', '')
    # TODO: the download link is not extracted yet.
    yield item
def parse_item(self, response):
    """Build one ``MovieprojectItem`` from a movie detail page and yield it.

    Most fields are the full text of the value cell (``td[2]``) of a
    fixed table row.  ``actor`` and ``introduce`` are post-processed with
    ``str.replace`` and therefore default to ``''`` when their node is
    missing; the other fields are ``None`` in that case.

    Args:
        response: Detail-page response exposing ``xpath()``.

    Yields:
        The populated ``MovieprojectItem``.
    """
    def cell_text(row_number, default=None):
        """Return the whole text of ``td[2]`` in table row *row_number*."""
        cell = response.xpath('//tbody/tr[%d]/td[2]' % row_number)
        return cell.xpath('string(.)').extract_first(default=default)

    item = MovieprojectItem()
    item['name'] = response.xpath('//h1/text()').xpath(
        'string(.)').extract_first()
    item['director'] = response.xpath(
        '//tbody/tr[1]/td[2]/a/text()').extract_first()
    item['writer'] = cell_text(2)
    # Default to '' so a page without a cast row does not raise
    # AttributeError on None; the "show all" link text is dropped.
    item['actor'] = cell_text(3, default='').replace('显示全部', '')
    item['_type'] = cell_text(4)
    item['area'] = cell_text(5)
    item['language'] = cell_text(6)
    item['release_date'] = cell_text(7)
    item['duration'] = cell_text(8)
    item['grade'] = cell_text(10)
    # Same None guard for the introduction; ideographic spaces are removed.
    item['introduce'] = response.xpath(
        '//div[@class="col-xs-12 movie-introduce"]/p/text()'
    ).extract_first(default='').replace('\u3000', '')
    yield item
def parse_detail(self, response):
    """Yield a ``MovieprojectItem`` holding the page's title and description.

    The title is the first text node of the ``h1`` at a fixed absolute
    path; the description is every text node under the second ``span`` of
    the fifth ``p`` in the same container, concatenated without separator.

    Args:
        response: Detail-page response exposing ``xpath()``.

    Yields:
        The populated ``MovieprojectItem``.
    """
    movie = MovieprojectItem()

    title_nodes = response.xpath(
        '/html/body/div[1]/div/div/div/div[2]/h1/text()')
    movie['name'] = title_nodes.extract_first()

    desc_nodes = response.xpath(
        '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()')
    # The field briefly holds the list of fragments before being replaced
    # by the joined string.
    movie['desc'] = desc_nodes.extract()
    movie['desc'] = ''.join(movie['desc'])

    yield movie
def parse_item(self, response):
    """Yield one ``MovieprojectItem`` per movie card found on a listing page.

    A card is any ``div`` whose class starts with ``col-xs-1-5``.  Each
    field takes the first match of its XPath, so a card lacking one of the
    nodes raises ``IndexError``.

    Args:
        response: Listing-page response exposing ``xpath()``.

    Yields:
        A ``MovieprojectItem`` with ``poster``, ``name``, ``score`` and
        ``movie_type`` set.
    """
    # Fields whose value is simply the first match of an XPath, in the
    # order they are filled in.
    first_match_fields = (
        ('poster', './/img/@data-original'),
        ('name', './/h1/a/text()'),
        ('score', './/h1/em/text()'),
    )
    for card in response.xpath('//div[starts-with(@class,"col-xs-1-5")]'):
        movie = MovieprojectItem()
        for field, query in first_match_fields:
            movie[field] = card.xpath(query).extract()[0]
        # The type is the complete text content of the "otherinfo" div.
        other_info = card.xpath('.//div[@class = "otherinfo"]')
        movie['movie_type'] = other_info.xpath('string(.)').extract()[0]
        yield movie
def parse_item(self, response):
    """Extract every movie field from a detail page and yield the item.

    Most table values are located through the label text of their row
    rather than by row position.  The screenwriter and cast rows are
    optional and become ``''`` when absent; any other missing row raises
    ``IndexError``, exactly as before.

    Args:
        response: Detail-page response exposing ``xpath()``.

    Yields:
        The populated ``MovieprojectItem``.

    Raises:
        IndexError: if the title or a mandatory table row is missing.
    """
    def labelled_cell(label):
        """Return the text of ``td[2]`` in the row whose label span contains *label*."""
        cells = response.xpath(
            '//span[contains(text(),"%s")]/../../td[2]' % label)
        return cells[0].xpath('string(.)').extract_first()

    def optional_cell(label):
        """Like ``labelled_cell`` but return '' when the row does not exist."""
        try:
            return labelled_cell(label)
        except IndexError:
            # Only the "row not found" case is tolerated; the original bare
            # ``except`` would also have hidden unrelated errors.
            return ''

    item = MovieprojectItem()
    # Fields are assigned directly instead of being looked up by name with
    # ``eval``, which was unnecessary and hid typos until runtime.
    # Poster image URL.
    item['poster'] = response.xpath(
        '//a[@class="movie-post"]/img/@src').extract_first()
    # Movie title.
    item['name'] = response.xpath('//h1')[0].xpath('string(.)').extract_first()
    # Genre and score.
    item['movie_type'] = labelled_cell('类型')
    item['score'] = labelled_cell('评分')
    # Director: value cell of the first table row.
    item['director'] = response.xpath(
        '//div[@class="col-xs-8"]/table//tr[1]/td[2]'
    )[0].xpath('string(.)').extract_first()
    # Screenwriter and cast (optional rows).
    item['editor'] = optional_cell('编剧')
    item['actor'] = optional_cell('主演')
    # Region and release date.
    item['area'] = labelled_cell('地区')
    item['publish_time'] = labelled_cell('上映时间')
    # Introduction: first text node of the first paragraph.
    item['info'] = response.xpath(
        '//div[@class="col-xs-12 movie-introduce"]/p/text()'
    ).extract_first()
    # Hand the item over to the engine.
    yield item
def parse_item(self, response):
    """Yield one ``MovieprojectItem`` per movie card found on a listing page.

    A card is any ``div`` whose class contains ``col-xs-1-5``.  Missing
    nodes give ``None`` for ``post``, ``name`` and ``_type`` and ``''`` for
    ``score``.

    Args:
        response: Listing-page response exposing ``xpath()``.

    Yields:
        A ``MovieprojectItem`` with ``post``, ``name``, ``score`` and
        ``_type`` set.
    """
    # Locate every movie card, then read the fields of each one.
    div_list = response.xpath('//div[contains(@class,"col-xs-1-5")]')
    for odiv in div_list:
        item = MovieprojectItem()
        item['post'] = odiv.xpath('.//img/@data-original').extract_first()
        item['name'] = odiv.xpath('.//img/@alt').extract_first()
        # Default to '' so a card without a score does not raise
        # AttributeError on None; surrounding spaces and dashes are trimmed.
        score = odiv.xpath('.//h1/em/text()').extract_first(default='')
        item['score'] = score.strip(' -')
        # Type: the complete text content of the "otherinfo" div.
        item['_type'] = odiv.xpath(
            './/div[@class="otherinfo"]').xpath('string(.)').extract_first()
        yield item
def parse(self, response):
    """Read each movie table on a listing page and request its detail page.

    The name and summary available on the listing page are stored in a
    fresh ``MovieprojectItem`` that travels to ``self.parse_info`` through
    the request's ``meta['item']``.

    Args:
        response: Listing-page response exposing ``xpath()``.

    Yields:
        ``scrapy.Request`` objects for the detail pages.
    """
    # The path must start with ``//``: a bare ``div[...]`` step is evaluated
    # from the document root and never matches, so nothing was scraped.
    # NOTE(review): container class ``co_content8`` and the descendant
    # ``//table`` step are assumed from the dytt8 listing markup - confirm
    # against a live page.
    table_list = response.xpath('//div[@class="co_content8"]/ul//table')
    for table in table_list:
        href = table.xpath('.//a[@class="ulink"]/@href').extract_first()
        if href is None:
            # No link to follow; concatenating None would raise TypeError
            # and abort the remaining tables.
            continue
        item = MovieprojectItem()
        item['name'] = table.xpath(
            './/a[@class="ulink"]/text()').extract_first()
        item['movie_info'] = table.xpath(
            './/tr[last()]/td/text()').extract_first()
        movie_url = 'http://www.dytt8.net' + href
        yield scrapy.Request(url=movie_url,
                             callback=self.parse_info,
                             meta={'item': item})
def parse_detail(self, response):
    """Print and yield the title and description found on a detail page.

    Args:
        response: Detail-page response exposing ``xpath()``.

    Yields:
        A ``MovieprojectItem`` with ``name`` and ``desc`` set.
    """
    # Movie title.
    name = response.xpath(
        '/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
    # Movie description.  ``//text()`` selects several text nodes, so all
    # of them are collected and joined; ``extract_first()`` kept only the
    # first fragment and made ``''.join`` fail with TypeError when the node
    # was missing.
    desc = ''.join(
        response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()'
        ).extract())
    print(f"电影名称:{name}\n电影简介:{desc}")
    item = MovieprojectItem()
    item['name'] = name
    item['desc'] = desc
    yield item
def parse(self, response):
    """Read each movie card on a listing page and request its detail page.

    The fields available on the listing page are stored in a fresh
    ``MovieprojectItem`` that travels to ``self.parse_detail`` through the
    request's ``meta['item']``.  Cards without a detail link are skipped.

    Args:
        response: Listing-page response exposing ``xpath()`` and
            ``urljoin()``.

    Yields:
        ``scrapy.Request`` objects for the detail pages.
    """
    # Locate every movie card, then read the fields of each one.
    div_list = response.xpath('//div[contains(@class,"col-xs-1-5")]')
    for odiv in div_list:
        # Link to the detail page; without it no request can be built.
        detail_url = odiv.xpath('.//h1/a/@href').extract_first()
        if detail_url is None:
            continue
        item = MovieprojectItem()
        item['post'] = odiv.xpath('.//img/@data-original').extract_first()
        item['name'] = odiv.xpath('.//img/@alt').extract_first()
        # Default to '' so a card without a score does not raise
        # AttributeError on None; surrounding spaces and dashes are trimmed.
        score = odiv.xpath('.//h1/em/text()').extract_first(default='')
        item['score'] = score.strip(' -')
        # Type: the complete text content of the "otherinfo" div.
        item['_type'] = odiv.xpath(
            './/div[@class="otherinfo"]').xpath('string(.)').extract_first()
        # Request the detail page and pass the item along through meta.
        # ``urljoin`` leaves absolute links untouched and resolves relative
        # ones, which ``scrapy.Request`` would otherwise reject.
        yield scrapy.Request(url=response.urljoin(detail_url),
                             callback=self.parse_detail,
                             meta={'item': item})
def parse_item(self, response):
    """Yield one ``MovieprojectItem`` per movie card found on a listing page.

    Prints a row of 100 asterisks first.  A card is any ``div`` whose
    class starts with ``col-xs-1-5``; a card lacking one of the queried
    nodes raises ``IndexError``.

    Args:
        response: Listing-page response exposing ``xpath()``.

    Yields:
        A ``MovieprojectItem`` with ``poster``, ``name``, ``score`` and
        ``movie_type`` set.
    """
    print('*' * 100)
    cards = response.xpath('//div[starts-with(@class,"col-xs-1-5")]')
    for card in cards:
        movie = MovieprojectItem()

        # Poster image URL.
        poster_urls = card.xpath('.//img/@data-original').extract()
        movie['poster'] = poster_urls[0]

        # Movie title.
        titles = card.xpath('.//h1/a/text()').extract()
        movie['name'] = titles[0]

        # Movie score.
        scores = card.xpath('.//h1/em/text()').extract()
        movie['score'] = scores[0]

        # Movie type: complete text of the first "otherinfo" div.
        other_info = card.xpath('.//div[@class="otherinfo"]')[0]
        movie['movie_type'] = other_info.xpath('string(.)').extract()[0]

        yield movie
def parse(self, response):
    """Request every movie's detail page, then the next listing page.

    Each card's listing-page fields are stored in a fresh
    ``MovieprojectItem`` passed to ``self.parse_detail`` under
    ``meta['xxx']``.  After the cards, while ``self.page`` is at most 2,
    the counter is incremented and the listing URL built from
    ``self.url.format(self.page)`` is requested with this same callback.

    Args:
        response: Listing-page response exposing ``xpath()``.

    Yields:
        ``scrapy.Request`` objects for detail pages and, possibly, for the
        next listing page.
    """
    for card in response.xpath('//div[starts-with(@class,"col-xs-1-5")]'):
        movie = MovieprojectItem()
        movie['poster'] = card.xpath('.//img/@data-original').extract()[0]
        movie['name'] = card.xpath('.//h1/a/text()').extract()[0]
        movie['score'] = card.xpath('.//h1/em/text()').extract()[0]
        other_info = card.xpath('.//div[@class = "otherinfo"]')
        movie['movie_type'] = other_info.xpath('string(.)').extract()[0]

        link = card.xpath('.//h1/a/@href').extract()[0]
        yield scrapy.Request(
            url=link,
            callback=self.parse_detail,
            meta={'xxx': movie},
        )

    # Stop paginating once the page counter has passed 2.
    if self.page > 2:
        return
    self.page += 1
    next_page_url = self.url.format(self.page)
    yield scrapy.Request(url=next_page_url, callback=self.parse)
def parse(self, response):
    """Read each movie table on a listing page and request its detail page.

    Only the name and the summary are available on the listing page; they
    are stored in a fresh ``MovieprojectItem`` that travels to
    ``self.parse_info`` through the request's ``meta['item']``.  Tables
    without a link are skipped.

    Args:
        response: Listing-page response exposing ``xpath()``.

    Yields:
        ``scrapy.Request`` objects for the detail pages.
    """
    # Every movie is one table under the listing container.
    table_list = response.xpath('//div[@class="co_content8"]/ul//table')
    for table in table_list:
        # Relative queries must start with ``.//`` to stay inside the table.
        href = table.xpath('.//a[@class="ulink"]/@href').extract_first()
        if href is None:
            # No link to follow; concatenating None would raise TypeError
            # and abort the remaining tables.
            continue
        item = MovieprojectItem()
        item['name'] = table.xpath(
            './/a[@class="ulink"]/text()').extract_first()
        item['movie_info'] = table.xpath(
            './/tr[last()]/td/text()').extract_first()
        movie_url = 'http://www.dytt8.net' + href
        # The item is handed to the next callback via ``meta``, a dict
        # whose values are read back by key on the receiving side.
        yield scrapy.Request(url=movie_url,
                             callback=self.parse_info,
                             meta={'item': item})
def parse_item(self, response):
    """Skeleton callback: one detail-page request per movie block.

    NOTE(review): the XPath expressions ('xxx' and '') look like
    placeholders that still have to be filled in - confirm before use.
    As written, every query inside the loop runs against ``response``
    rather than the loop variable ``odiv``, and the raw ``xpath()``
    results are stored without being extracted.

    Args:
        response: Listing-page response exposing ``xpath()``.

    Yields:
        ``scrapy.Request`` objects for ``self.parse_detail`` carrying the
        item in ``meta['item']``.
    """
    # First locate the list of movie divs.
    div_list = response.xpath('xxx')
    for odiv in div_list:
        item = MovieprojectItem()
        # Movie poster.
        item['post'] = response.xpath('xxx')
        # Movie type.
        item['_type'] = response.xpath('')
        # Movie name.
        item['name'] = response.xpath('')
        # Movie score.
        item['score'] = response.xpath('')
        # Link to the movie's detail page.
        detail_url = response.xpath('')
        yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
def parse_item(self, response):
    """Extract every movie field from a detail page and yield the item.

    Table values are located through the label text of their row.  The
    screenwriter and cast rows are optional and become ``''`` when absent,
    each independently of the other; any other missing node raises
    ``IndexError``, exactly as before.

    Args:
        response: Detail-page response exposing ``xpath()``.

    Yields:
        The populated ``MovieprojectItem``.

    Raises:
        IndexError: if the title, the introduction or a mandatory table
            row is missing.
    """
    def labelled_cell(label):
        """Return the text of ``td[2]`` in the row whose label span contains *label*."""
        cells = response.xpath(
            '//span[contains(text(),"%s")]/../../td[2]' % label)
        # ``string(.)`` always produces exactly one value.
        return cells[0].xpath('string(.)').extract_first()

    def optional_cell(label):
        """Like ``labelled_cell`` but return '' when the row does not exist."""
        try:
            return labelled_cell(label)
        except IndexError:
            # Only the "row not found" case is tolerated; the original bare
            # ``except`` would also have hidden unrelated errors.
            return ''

    item = MovieprojectItem()
    item['poster'] = response.xpath(
        '//a[@class="movie-post"]/img/@src').extract_first()
    item['name'] = response.xpath('//h1')[0].xpath(
        'string(.)').extract_first()
    item['movie_type'] = labelled_cell('类型')
    item['score'] = labelled_cell('评分')
    item['director'] = labelled_cell('导演')

    # Handled separately so that a missing cast row no longer wipes out a
    # screenwriter that was found.
    item['editor'] = optional_cell('编剧')

    # Drop the trailing "show all" link text.  ``rstrip(' 显示全部')``
    # treated its argument as a set of characters and could also eat the
    # end of a name made of those characters, so the suffix is removed
    # explicitly instead.
    actor = optional_cell('主演').rstrip(' ')
    show_all = '显示全部'
    if actor.endswith(show_all):
        actor = actor[:-len(show_all)].rstrip(' ')
    item['actor'] = actor

    item['area'] = labelled_cell('地区')
    item['publish_time'] = labelled_cell('上映时间')
    # Introduction: first paragraph text without surrounding ideographic
    # spaces.
    item['info'] = response.xpath(
        '//div[@class="col-xs-12 movie-introduce"]/p/text()')[0].extract(
        ).strip('\u3000')
    yield item
def parse(self, response):
    """Parse the listing page and request every movie's detail page.

    A movie card is any ``div`` whose class starts with ``col-xs-1-5``.
    The fields available on the listing page are stored in a fresh
    ``MovieprojectItem`` passed to ``self.parse_detail`` under
    ``meta['xxx']``.  A card lacking one of the queried nodes raises
    ``IndexError``.

    Args:
        response: Listing-page response exposing ``xpath()``.

    Yields:
        ``scrapy.Request`` objects for the detail pages.
    """
    # Fields whose value is simply the first match of an XPath, in the
    # order they are filled in: poster URL, title, score.
    first_match_fields = (
        ('poster', './/img/@data-original'),
        ('name', './/h1/a/text()'),
        ('score', './/h1/em/text()'),
    )
    for card in response.xpath('//div[starts-with(@class,"col-xs-1-5")]'):
        movie = MovieprojectItem()
        for field, query in first_match_fields:
            movie[field] = card.xpath(query).extract()[0]

        # Movie type: complete text of the first "otherinfo" div.
        other_info = card.xpath('.//div[@class="otherinfo"]')[0]
        movie['movie_type'] = other_info.xpath('string(.)').extract()[0]

        # Follow the link to the detail page, carrying the item along.
        link = card.xpath('.//h1/a/@href').extract()[0]
        yield scrapy.Request(
            url=link,
            callback=self.parse_detail,
            meta={'xxx': movie},
        )