Exemple #1
0
    def parse_item(self, response):
        """Parse a movie detail page and yield a single ``MaoyanItem``.

        Scalar fields hold the first matching text node, or ``None`` when the
        node is absent. ``country`` and ``movie_time`` are the first and second
        pieces of the header's second ``<li>`` text split on ``/``; they fall
        back to ``''`` when that text or the separator is missing instead of
        raising.

        :param response: response of the detail page being parsed.
        :return: generator yielding one ``MaoyanItem``.
        """
        sel = Selector(response)
        # Common prefixes of the absolute paths used below.
        brief = '/html/body/div[3]/div/div[2]/div[1]'
        stats = '/html/body/div[3]/div/div[2]/div[3]'

        def first(path):
            """Return the first match of *path* on the page, or ``None``."""
            return sel.xpath(path).extract_first()

        # Tolerate a missing node and a missing '/' separator.
        movie_publish = first(brief + '/ul/li[2]/text()')
        publish_parts = (movie_publish or '').split('/')
        country = publish_parts[0].strip()
        movie_time = publish_parts[1].strip() if len(publish_parts) > 1 else ''

        director = first('//li[@class="celebrity "]/div/a/text()')
        if director is not None:
            director = director.strip()

        actor = [
            name.strip() for name in sel.xpath(
                '//li[@class="celebrity actor"]/div/a/text()').extract()
        ]

        item = MaoyanItem()
        item['movie_name'] = first(brief + '/h3/text()')
        item['movie_ename'] = first(brief + '/div/text()')
        item['movie_type'] = first(brief + '/ul/li[1]/text()')
        item['country'] = country
        item['movie_time'] = movie_time
        item['online_time'] = first(brief + '/ul/li[3]/text()')
        item['movie_star'] = first(stats + '/div[1]/div/span/span/text()')
        item['movie_total_price'] = first(
            stats + '/div[2]/div/span[1]/text()')
        item['img'] = first('/html/body/div[3]/div/div[1]//img/@src')
        item['director'] = director
        item['director_src'] = first(
            '//li[@class="celebrity "]/a/img/@data-src')
        item['actor'] = actor
        item['actor_src'] = sel.xpath(
            '//li[@class="celebrity actor"]/a/img/@data-src').extract()
        item['introduction'] = first(
            './/div[@class="mod-content"]/span/text()')

        yield item
Exemple #2
0
 def parse2(self, response):
     """Yield one ``MaoyanItem`` built from a detail page.

     ``title`` is the stripped ``<h1 class="name">`` text, ``mtype`` the
     comma-joined anchor texts of the first ``li.ellipsis`` and ``mtime`` the
     stripped first text node of the third ``li.ellipsis``.
     """
     page = Selector(response=response)
     record = MaoyanItem()

     heading = page.xpath('//h1[@class="name"]/text()').extract_first()
     record['title'] = heading.strip()

     rows = page.xpath('//li[@class="ellipsis"]')
     genre_names = rows[0].xpath('./a/text()').extract()
     record['mtype'] = ','.join(genre_names)
     third_row_text = rows[2].xpath('./text()').extract_first()
     record['mtime'] = third_row_text.strip()

     yield record
Exemple #3
0
    def parse(self, response):
        """Yield one ``MaoyanItem`` for each of the first ten hover panels.

        Each item carries ``title``, ``filmtype`` and ``releasetime``; the
        latter two are a label taken from a ``<span>`` concatenated with the
        second text node of the same ``<div>``.

        :param response: response of the listing page.
        :return: generator of ``MaoyanItem`` objects.
        """
        print(response.text)
        movies = Selector(
            response=response).xpath('//div[@class="movie-hover-info"]')[:10]

        for movie in movies:
            # Movie title.
            title = movie.xpath(
                './div[1]/span[1]/text()').extract_first().strip()
            print(title)
            # Movie genre: label plus value.
            filmtype_title = movie.xpath(
                './div[2]/span/text()').extract_first().strip()
            filmtype = movie.xpath('./div[2]/text()').extract()[1].strip()
            print(filmtype)
            # Release time: label plus value.
            rt_title = movie.xpath(
                './div[4]/span/text()').extract_first().strip()
            releasetime = movie.xpath('./div[4]/text()').extract()[1].strip()
            print(releasetime)

            # A fresh item per movie: re-yielding one shared instance lets
            # later iterations overwrite items still held downstream.
            item = MaoyanItem()
            item['title'] = title
            item['filmtype'] = filmtype_title + filmtype
            item['releasetime'] = rt_title + releasetime
            yield item
Exemple #4
0
 def parse_item(self, response):
     """Yield one ``MaoyanItem`` whose fields are lists of matched text nodes.

     Every field stores the full ``extract()`` result (a list, possibly
     empty) of its absolute XPath on the detail page.
     """
     page = Selector(response)
     brief = '/html/body/div[3]/div/div[2]/div[1]'
     # (item key, XPath) pairs, in the order the fields are populated.
     field_paths = (
         ('movie_name', brief + '/h3/text()'),
         ('movie_ename', brief + '/div/text()'),
         ('movie_type', brief + '/ul/li[1]/text()'),
         ('movie_publish', brief + '/ul/li[2]/text()'),
         ('movie_time', brief + '/ul/li[3]/text()'),
         ('movie_star',
          '/html/body/div[3]/div/div[2]/div[3]/div[1]/div/span/span/text()'),
     )

     record = MaoyanItem()
     for key, path in field_paths:
         record[key] = page.xpath(path).extract()
     yield record
    def parse(self, response):
        """Yield one item per ``<dd>`` of the board list, then queue more pages.

        After the items, a request is yielded for each
        ``board/4?offset=N`` URL with ``N`` in 10, 20, ..., 90, using this
        method as the callback.

        :param response: response of a board page.
        :return: generator of ``MaoyanItem`` and ``Request`` objects.
        """
        selector = Selector(response)
        for info in selector.xpath('//dl[@class="board-wrapper"]/dd'):
            title = info.xpath('div/div/div[1]/p[1]/a/text()').extract()[0]
            actor = info.xpath(
                'div/div/div[1]/p[2]/text()').extract()[0].strip()
            # Named release_time so the stdlib ``time`` module is not shadowed.
            release_time = info.xpath(
                'div/div/div[1]/p[3]/text()').extract()[0]
            # The score is split across two <i> nodes; concatenate them.
            score = (info.xpath('div/div/div[2]/p/i[1]/text()').extract()[0] +
                     info.xpath('div/div/div[2]/p/i[2]/text()').extract()[0])

            # A fresh item per row: re-yielding one shared instance lets
            # later rows overwrite items still held downstream.
            item = MaoyanItem()
            item['title'] = title
            item['actor'] = actor
            item['time'] = release_time
            item['score'] = score

            yield item

        for offset in range(10, 100, 10):
            url = 'http://maoyan.com/board/4?offset={}'.format(offset)
            yield Request(url, callback=self.parse)
Exemple #6
0
    def parse(self, response):
        """Yield one column-oriented item for the page, then queue more pages.

        Each field of the single item is the list of all matches of its
        XPath. Afterwards a request is yielded for every
        ``board/4?offset=N`` with ``N`` in 10, 20, ..., 90, sent with a fixed
        ``User-Agent`` header and this method as the callback.
        """
        # (item key, XPath) pairs, in the order the fields are populated.
        column_paths = (
            ('rank', '//dl[@class="board-wrapper"]/dd/i/text()'),
            ('title', '//div[@class="movie-item-info"]/p/a/text()'),
            ('link', '//dl[@class="board-wrapper"]/dd/a/@href'),
            ('star', '//p[@class="star"]/text()'),
            ('times', '//p[@class="releasetime"]/text()'),
            ('score1', '//p[@class="score"]//i[@class="integer"]/text()'),
            ('score2', '//p[@class="score"]//i[@class="fraction"]/text()'),
        )
        board = MaoyanItem()
        for key, path in column_paths:
            board[key] = response.xpath(path).extract()

        yield board

        user_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
        )
        for offset in range(10, 100, 10):
            yield Request(
                "http://maoyan.com/board/4?offset=" + str(offset),
                callback=self.parse,
                headers={'User-Agent': user_agent})
Exemple #7
0
 def parse(self, response):
     """Yield one ``MaoyanItem`` per ``<dd>`` under ``div.main``.

     ``response.xpath(...)`` returns a list of selectors; on it,
     ``extract_first()`` gives a single string and ``extract()`` gives the
     list of all matched strings.

     :param response: response of the listing page.
     :return: generator of ``MaoyanItem`` objects.
     """
     dd_list = response.xpath('//div[@class="main"]/dl/dd')
     for dd in dd_list:
         movie_title = dd.xpath(
             './/p[@class="name"]/a/@title').extract_first()
         movie_actor = dd.xpath(
             './/p[@class="star"]/text()').extract_first().strip()
         date = dd.xpath(
             './/p[@class="releasetime"]/text()').extract_first()
         # The score is spread over several <i> nodes; join them.
         scores = ''.join(
             dd.xpath('.//p[@class="score"]/i/text()').extract())
         detail = dd.xpath('.//p[@class="name"]/a/@href').extract_first()

         # A fresh item per <dd>: re-yielding one shared instance lets
         # later rows overwrite items still held downstream.
         item = MaoyanItem()
         item['movie_title'] = movie_title
         item['movie_actor'] = movie_actor
         item['date'] = date
         item['detail'] = detail
         item['scores'] = scores
         yield item
Exemple #8
0
    def parse(self, response):
        """Yield one item per ``<dd>`` of the board, then follow the pager.

        After all items, the last ``href`` found in ``ul.list-pager`` is
        followed once with this method as the callback. Nothing is followed
        when the pager has no links.

        :param response: response of a board page.
        :return: generator of ``MaoyanItem`` objects and at most one request.
        """
        movies = response.xpath('//dl[@class="board-wrapper"]/dd')
        print(response.text)
        for movie in movies:
            # A fresh item per movie: re-yielding one shared instance lets
            # later rows overwrite items still held downstream.
            item = MaoyanItem()
            item['ranking'] = movie.xpath('.//i/text()').extract()[0]
            item['name'] = movie.xpath(
                './/p[@class="name"]/a/text()').extract()[0]
            item['star'] = movie.xpath(
                './/p[@class="star"]/text()').extract()[0]
            item['releasetime'] = movie.xpath(
                './/p[@class="releasetime"]/text()').extract()[0]
            yield item

        # Pagination is a per-page concern, so it is scheduled once after
        # the loop instead of once per movie.
        pager_links = response.xpath(
            '//ul[@class="list-pager"]/li/a/@href').extract()
        if pager_links:
            yield response.follow(pager_links[-1], callback=self.parse)
    def parse(self, response):
        """Yield one ``MaoyanItem`` per ``<dd>`` under ``div.main``.

        ``extract_first()`` returns a single string, while ``extract()``
        returns the list of all matched strings.

        :param response: response of the listing page.
        :return: generator of ``MaoyanItem`` objects.
        """
        dd_list = response.xpath('//div[@class="main"]/dl/dd')
        for dd in dd_list:
            movie_title = dd.xpath(
                './/p[@class="name"]/a/@title').extract_first()
            movie_actor = dd.xpath(
                './/p[@class="star"]/text()').extract_first().strip()
            movie_date = dd.xpath(
                './/p[@class="releasetime"]/text()').extract_first()
            # The integer and fractional parts of the score are separate
            # nodes on the page, so they are joined here.
            movie_score = "".join(
                dd.xpath('.//p[@class="score"]/i/text()').extract())
            movie_detail = dd.xpath(
                './/p[@class="name"]/a/@href').extract_first()

            # A fresh item per <dd>: re-yielding one shared instance lets
            # later rows overwrite items still held downstream.
            item = MaoyanItem()
            item['movie_title'] = movie_title
            item['movie_actor'] = movie_actor
            item['movie_date'] = movie_date
            item['movie_score'] = movie_score
            item['movie_detail'] = movie_detail
            yield item
Exemple #10
0
 def parse(self, response):
     """Request the detail page of every ``div.movie-item-info`` block.

     For each block a ``MaoyanItem`` holding ``title`` and ``link`` is
     created and handed to ``self.parse2`` through the request's ``meta``.
     """
     info_blocks = Selector(
         response=response).xpath('//div[@class="movie-item-info"]')
     for block in info_blocks:
         partial = MaoyanItem()
         # extract_first() returns the first match of the selector list.
         raw_title = block.xpath(
             './p[@class="name"]/a/@title').extract_first()
         partial['title'] = raw_title.strip()
         raw_href = block.xpath(
             './p[@class="name"]/a/@href').extract_first()
         partial['link'] = 'https://maoyan.com' + raw_href.strip()
         yield scrapy.Request(url=partial['link'],
                              meta={'item': partial},
                              callback=self.parse2)
Exemple #11
0
    def parse(self, response):
        """Yield an item for each of the first ten movie blocks on the page.

        Pages with fewer than ten matching blocks yield one item per block
        instead of raising ``IndexError``.

        :param response: response of the listing page.
        :return: generator of ``MaoyanItem`` objects.
        """
        # Movie list.
        tags = Selector(
            response=response).xpath('//div[@class="movie-item film-channel"]')

        # Slicing, unlike indexing 0..9, is safe when fewer than ten match.
        for tag in tags[:10]:
            # Movie name.
            title = tag.xpath(
                './/span[contains(@class,"name")]/text()').extract_first()

            # Text nodes of the hover panel. The XPath also returns
            # whitespace-only nodes, which is why indices 1 and 5 are used.
            hover_texts = tag.xpath(
                './/span[@class="hover-tag"]/../text()').extract()
            types = hover_texts[1].strip('\n').strip()
            times = hover_texts[5].strip('\n').strip()

            item = MaoyanItem()
            item['title'] = title
            item['types'] = types
            item['times'] = times

            yield item
Exemple #12
0
    def get_parse(self, response):
        """Yield one ``MaoyanItem`` per ``<dd>`` of the board list.

        Every value is the stripped first match of its XPath; ``score`` is
        the concatenation of the two ``<i>`` texts of the score paragraph.
        """

        def first_text(node, path):
            """Return the stripped first match of *path* under *node*."""
            return node.xpath(path).extract()[0].strip()

        for entry in response.xpath('//*[@id="app"]/div/div/div[1]/dl/dd'):
            sort = first_text(entry, './i/text()')
            title = first_text(entry, './a/@title')
            actor = first_text(entry, './div/div/div[1]/p[2]/text()')
            releasetime = first_text(entry, './div/div/div[1]/p[3]/text()')
            integer = first_text(entry, './div/div/div[2]/p/i[1]/text()')
            fraction = first_text(entry, './div/div/div[2]/p/i[2]/text()')

            print(sort, title, actor, releasetime, integer, fraction)

            record = MaoyanItem()
            record['sort'] = sort
            record['name'] = title
            record['actor'] = actor
            record['releasetime'] = releasetime
            record['score'] = integer + fraction

            yield record
Exemple #13
0
    def parse2(self, response):
        """Yield one ``MaoyanItem`` with name, categories and date of a movie.

        ``movie_cates`` is the ``/``-joined stripped text of every
        ``a.text-link`` in the first ``li.ellipsis``; ``movie_date`` is the
        first ten characters of the third ``li.ellipsis`` text.
        """
        page = Selector(response=response)

        # Movie name.
        name = page.xpath('//h1[@class="name"]/text()').extract_first()

        # Movie categories.
        cate_links = page.xpath('//li[@class="ellipsis"][1]').xpath(
            './a[@class="text-link"]')
        cates = '/'.join(
            link.xpath('text()').extract_first().strip()
            for link in cate_links)

        # Release date.
        date_text = page.xpath(
            '//li[@class="ellipsis"][3]/text()').extract_first()
        release_date = date_text[:10]

        # Store everything on the item.
        record = MaoyanItem()
        record['movie_name'] = name
        record['movie_cates'] = cates
        record['movie_date'] = release_date
        yield record
Exemple #14
0
    def parse(self, response):
        """Yield an item for each of the first ten ``div.movie-hover-info``.

        ``fname`` is the first text match of the first hover title,
        ``ftype`` and ``fdate`` are the second text nodes of their blocks.
        """
        panels = Selector(
            response=response).xpath('//div[@class="movie-hover-info"]')

        # Only the first ten panels are processed.
        for panel in panels[:10]:
            entry = MaoyanItem()
            names = panel.xpath(
                './/div[@class="movie-hover-title"][1]/span[1]/text()'
            ).extract()
            kinds = panel.xpath(
                './/div[@class="movie-hover-title"][2]/text()').extract()
            dates = panel.xpath(
                './/div[@class="movie-hover-title movie-hover-brief"]/text()'
            ).extract()

            entry['fname'] = names[0]
            entry['ftype'] = kinds[1]
            entry['fdate'] = dates[1]

            yield entry
Exemple #15
0
 def parse(self, response):
     """Yield one ``MaoyanItem`` per anchor text inside the movie info blocks."""
     anchor_texts = response.xpath(
         '//div[@class="movie-item-info"]//a/text()')
     for text in anchor_texts.extract():
         entry = MaoyanItem()
         entry['name'] = text
         yield entry
Exemple #16
0
 def parse(self, response):
     """Request the detail page linked from one fixed position in the list.

     The response URL and body are printed for debugging. For every node
     matched by the absolute XPath below, a ``MaoyanItem`` with ``title``
     and ``link`` is created and passed to ``self.parse2`` through the
     request's ``meta``.
     """
     print(response.url)
     print(response.text)
     print(response.url)
     matched = Selector(response=response).xpath(
         '//*[@id="app"]/div/div[2]/div[2]/dl/dd[2]/div[2]')
     for node in matched:
         partial = MaoyanItem()
         title_sel = node.xpath('./a/text()')
         href_sel = node.xpath('./a/@href')
         partial['title'] = title_sel.extract_first().strip()
         # The absolute URL is built from the spider's first allowed domain.
         host = self.allowed_domains[0]
         partial['link'] = 'https://' + host + href_sel.extract_first().strip()
         yield scrapy.Request(url=partial['link'],
                              meta={'item': partial},
                              callback=self.parse2)
    def parse(self, response):
        """Yield an item for each of the first ten ``div.movie-hover-info``.

        Each item carries ``film_name``, ``film_type`` and ``plan_time``,
        all read from the panel being iterated.

        :param response: response of the listing page.
        :return: generator of ``MaoyanItem`` objects.
        """
        # Movie panels; only the first ten are processed.
        tags = Selector(
            response=response).xpath('//div[@class="movie-hover-info"]')
        for tag in tags[:10]:
            details = tag.xpath('./div[@class="movie-hover-title"]')
            # Movie name.
            film_name = details[0].xpath(
                './span[contains(@class,"name")]/text()').get()
            # Movie type: second text node, second line of it.
            film_type = details[1].xpath('./text()')[1].get()
            film_type = film_type.split('\n')[1].strip()
            # Release time, read from the current panel. Reading it from
            # tags[0] gave every movie the first movie's date.
            plan_time = tag.xpath(
                './div[@class="movie-hover-title movie-hover-brief"]/text()'
            )[1].get()
            plan_time = plan_time.split('\n')[1].strip()
            print(film_name, film_type, plan_time)

            # A fresh item per panel: re-yielding one shared instance lets
            # later panels overwrite items still held downstream.
            item = MaoyanItem()
            item['film_name'] = film_name
            item['film_type'] = film_type
            item['plan_time'] = plan_time

            yield item
Exemple #18
0
    def parse(self, response):
        """Yield one ``MaoyanItem`` per entry of ``data.list`` in a JSON body.

        Each listed key of an entry is copied to the item under the same
        name prefixed with ``b_``; ``b_currentTime`` is the local time at
        which the entry was processed.

        :param response: response whose body is the JSON document.
        :return: generator of ``MaoyanItem`` objects.
        :raises KeyError: if ``data``, ``list`` or a listed key is missing.
        """
        # Keys copied verbatim from each entry to the ``b_``-prefixed field.
        copied_keys = (
            'movieName', 'avgSeatView', 'avgShowView', 'avgViewBox',
            'boxInfo', 'movieId', 'releaseInfo', 'seatRate', 'showInfo',
            'showRate', 'splitAvgViewBox', 'splitBoxInfo', 'splitBoxRate',
            'splitSumBoxInfo', 'sumBoxInfo', 'viewInfo', 'viewInfoV2',
        )
        # ``response.text`` replaces the deprecated ``body_as_unicode()``.
        payload = json.loads(response.text)
        for row in payload['data']['list']:
            # A fresh item per entry: re-yielding one shared instance lets
            # later entries overwrite items still held downstream.
            item = MaoyanItem()
            for key in copied_keys:
                item['b_' + key] = row[key]
            item['b_currentTime'] = datetime.datetime.now()
            yield item
Exemple #19
0
    def detail_parse2(self, response):
        """Yield one ``MaoyanItem`` per comment container on a detail page.

        Each item carries the comment's user, text, approval count and time,
        the page-level movie name, and ``c_box``: the ``stonefont`` text
        passed through ``self.modify_data`` followed by its unit. The font
        file named in the page source is handed to ``self.get_font`` before
        the value is converted.
        """

        def first_css(scope, query):
            """Return the first match of CSS *query* within *scope*."""
            return scope.css(query).extract()[0]

        comment_nodes = response.css(
            'div[class="comment-list-container"] ul li[class="comment-container "]'
        )
        movie_title = first_css(
            response,
            'div[class="movie-brief-container"] h3[class="name"]::text')

        for node in comment_nodes:
            record = MaoyanItem()
            record['c_user'] = first_css(
                node, 'div[class="user"] span[class="name"]::text')
            record['c_movie'] = movie_title
            record['c_comment'] = first_css(
                node, 'div[class="comment-content"]::text')
            record['c_good'] = first_css(
                node, 'div[class="approve "] span[class="num"]::text')
            record['c_time'] = first_css(
                node, 'div[class="time"]::attr(title)')

            # Font file referenced by the page, handed to get_font first.
            font_names = re.findall(
                r'vfile.meituan.net/colorstone/(\w+\.woff)', response.text)
            self.get_font(font_names[0])

            raw_box = first_css(
                response,
                'div[class="movie-index-content box"] span[class="stonefont"]::text'
            )
            unit = first_css(
                response,
                'div[class="movie-index-content box"] span[class="unit"]::text'
            )
            record['c_box'] = self.modify_data(raw_box) + unit
            yield record
Exemple #20
0
    def parse_maoyan(self, response):
        """Yield one ``MaoyanItem`` describing the movie on a detail page.

        Every XPath-based field is the first match of its expression, so a
        missing node raises ``IndexError``. Score and reviewer count come
        from ``get_realtext.web(response.url)``.
        """

        def first(path):
            """Return the first match of *path* on the page."""
            return response.xpath(path).extract()[0]

        record = MaoyanItem()

        # Id: the part after ':' in the data-val attribute, minus the brace.
        data_val = first('/html/body/div[3]/div/div[2]/div[2]/div/@data-val')
        record['id'] = data_val.split(':')[1].rstrip('}')
        # Title taken from the first <h3>.
        record['ztitle'] = first('//h3/text()')
        # Release time.
        record['release_time'] = first('//li[@class="ellipsis"][3]/text()')
        # Country / region: first whitespace-separated token.
        record['mv_country'] = first(
            '//li[@class="ellipsis"][2]/text()').split()[0]
        # Director.
        record['mv_director'] = first('//a[@class="name"]/text()').strip()

        # Leading actors: first three actor entries, backslash-separated.
        actor_path = '//li[@class="celebrity actor"][{}]//a[@class="name"]/text()'
        stars = [first(actor_path.format(pos)).strip() for pos in (1, 2, 3)]
        record['mv_star'] = '\\'.join(stars)

        # Scriptwriter.
        record['mv_scriptwriter'] = first(
            '//li[@class="celebrity "][1]//a[@class="name"]/text()').strip()

        # Image link.
        record['mv_image'] = first('//img[@class="avatar"]/@src')

        # Score and number of reviewers.
        realtext = get_realtext.web(response.url)
        record['mv_score'] = realtext[0]
        record['mv_numbers'] = realtext[1]

        # Synopsis.
        record['mv_introduction'] = first(
            '//span[@class="dra"]/text()').strip()

        # Movie type.
        record['mv_type'] = first('//li[@class="ellipsis"][1]/text()')

        yield record
 def parse(self, response):
     """Yield one ``MaoyanItem`` per ``<dd>`` of the ``board-wrapper`` list.

     ``title`` is the ``title`` attribute of the entry's direct ``<a>``.
     ``link`` is that anchor's ``href`` prefixed with the site root, or
     ``None`` when the entry has no such anchor.

     :param response: response of the board page.
     :return: generator of ``MaoyanItem`` objects.
     """
     sel = Selector(response)
     # The predicate must test @class. A bare string literal such as
     # ["board-wrapper"] is always true and would match every <dl>.
     for movie in sel.xpath('//dl[@class="board-wrapper"]/dd'):
         href = movie.xpath('a/@href').extract_first()
         # A fresh item per entry: re-yielding one shared instance lets
         # later entries overwrite items still held downstream.
         item = MaoyanItem()
         item['title'] = movie.xpath('a/@title').extract_first()
         item['link'] = ('https://www.maoyan.com' + href
                         if href is not None else None)
         yield item
Exemple #22
0
 def parse(self, response):
     """Request the detail page of every movie block in the listing.

     Blocks without a direct ``<a href>`` are skipped. Each request uses
     ``self.parse2`` as its callback.

     :param response: response of the listing page.
     :return: generator of ``scrapy.Request`` objects.
     """
     # Not named ``re``, to avoid shadowing the stdlib module name.
     movie_blocks = response.xpath(
         '//dl//div[@class="movie-item film-channel"]')
     for block in movie_blocks:
         href = block.xpath('./a/@href').extract_first()
         if href is None:
             # No link to follow; concatenating None would raise TypeError.
             continue
         yield scrapy.Request(url='https://maoyan.com' + href,
                              callback=self.parse2)
         # NOTE(review): this blocking sleep is kept from the original, but
         # it stalls the whole crawler; Scrapy's DOWNLOAD_DELAY setting is
         # the usual way to throttle requests.
         sleep(2)
Exemple #23
0
 def parse(self, response):
     """Request the detail page of every movie title block.

     For each block a ``MaoyanItem`` with ``title`` and ``link`` is created
     and passed to ``self.parse2`` through the request's ``meta``. Blocks
     without an ``href`` are skipped.

     :param response: response of the listing page.
     :return: generator of ``scrapy.Request`` objects.
     """
     movies = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     for movie in movies:
         href = movie.xpath('./a/@href').extract_first()
         if href is None:
             continue
         item = MaoyanItem()
         # Movie name.
         item['title'] = movie.xpath('./a/text()').extract_first().strip()
         # ``link`` was read below without ever being set, which raised
         # KeyError; resolve the anchor's href against the page URL.
         item['link'] = response.urljoin(href.strip())
         yield scrapy.Request(url=item['link'],
                              meta={'item': item},
                              callback=self.parse2)
Exemple #24
0
 def parse2(self, response):
     """Yield one ``MaoyanItem`` per ``div.movie-brief-container``.

     ``title`` is the ``<h1>`` text, ``film_type`` the first anchor text of
     the first ``<li>`` and ``film_date`` the text of the third ``<li>``;
     each is ``None`` when its node is missing.

     :param response: response of the detail page.
     :return: generator of ``MaoyanItem`` objects.
     """
     containers = response.xpath('//div[@class="movie-brief-container"]')
     for movie in containers:
         item = MaoyanItem()
         # The method is ``xpath``; the ``xpatn`` typo raised AttributeError.
         item['title'] = movie.xpath('./h1/text()').extract_first()
         item['film_type'] = movie.xpath(
             './ul/li[1]/a[1]/text()').extract_first()
         item['film_date'] = movie.xpath(
             './ul/li[3]/text()').extract_first()
         yield item
Exemple #25
0
 def parse_info(self, response):
     """Yield a single ``MaoyanItem`` filled from the ``cmts`` list of a JSON body.

     For every field declared on the item, the value of that key is copied
     from each comment dict that has it, so the last such comment wins. A
     missing or null ``cmts`` yields an empty item instead of raising.

     :param response: response whose text is the JSON document.
     :return: generator yielding one ``MaoyanItem``.
     """
     result = json.loads(response.text)
     item = MaoyanItem()
     # ``or []`` covers both an absent key and an explicit null.
     cmts_list = result.get('cmts') or []
     for field in item.fields:
         for cmts in cmts_list:
             if field in cmts:
                 item[field] = cmts.get(field)
     yield item
 def getInfo(self, response):
     """Yield one ``MaoyanItem`` per ``div.movie-brief-container``.

     ``film_name`` is the ``<h1>`` text, ``film_type`` the comma-joined
     anchor texts of the first ``<li>`` (stripped) and ``plan_date`` the
     text of the third ``<li>``.
     """
     brief_nodes = Selector(response).xpath(
         '//div[@class="movie-brief-container"]')
     for brief in brief_nodes:
         record = MaoyanItem()
         record['film_name'] = brief.xpath('./h1/text()').extract_first()
         genres = brief.xpath('./ul/li[1]/a/text()').extract()
         joined_genres = ','.join(genres)
         record['film_type'] = joined_genres.strip()
         record['plan_date'] = brief.xpath(
             './ul/li[3]/text()').extract_first()
         yield record
Exemple #27
0
 def parse2(self, response):
     """Yield one ``MaoyanItem`` with name, category and date of a movie.

     Each field is the first match of its XPath inside
     ``div.movie-brief-container``, or ``None`` when nothing matches.

     :param response: response of the detail page.
     :return: generator yielding one ``MaoyanItem``.
     """
     brief = '//div[@class="movie-brief-container"]'
     item = MaoyanItem()
     item['name'] = response.xpath(brief + '/h1/text()').get()
     item['category'] = response.xpath(brief + '/ul/li/a/text()').get()
     # ``get`` must be called; without the parentheses the bound method
     # itself was stored in the item.
     item['date'] = response.xpath(
         brief + '/ul/li[@class="ellipsis"][3]/text()').get()
     yield item
Exemple #28
0
 def parse_tag(self, respnse):
     """Yield one ``MaoyanItem`` describing a cinema page.

     ``price`` is a random choice from ``self.price_range``; the other
     fields are the first match of their XPath (``city`` is
     whitespace-normalized by the XPath itself).

     :param respnse: response of the cinema page (name kept for callers).
     :return: generator yielding one ``MaoyanItem``.
     """
     item = MaoyanItem()
     item['cinema_name'] = respnse.xpath(
         '//h3[@class="name text-ellipsis"]/text()').extract_first()
     item['price'] = random.choice(self.price_range)
     item['address'] = respnse.xpath(
         '//div[@class="address text-ellipsis"]/text()').extract_first()
     # '///div' is not a valid XPath location path; '//div' is.
     item['city'] = respnse.xpath(
         'normalize-space(//div[@class="city-name"]/text())'
     ).extract_first()
     yield item
Exemple #29
0
 def parse(self, response):
     """Request the detail pages of the first ten movie title blocks.

     For each block a ``MaoyanItem`` holding only ``link`` is created and
     passed to ``self.parse2`` through the request's ``meta``.
     """
     title_blocks = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     for block in title_blocks[:10]:
         partial = MaoyanItem()
         href = block.xpath('./a/@href').extract_first().strip()
         detail_url = 'https://maoyan.com' + href
         partial['link'] = detail_url
         yield scrapy.Request(url=detail_url,
                              meta={'item': partial},
                              callback=self.parse2)
 def parse(self, response):
     """Request the detail pages of the first ten ``div.movie-item-hover``.

     An empty ``MaoyanItem`` is passed to ``self.parse2`` through each
     request's ``meta``. Blocks without a direct ``<a href>`` are skipped.

     :param response: response of the listing page.
     :return: generator of ``scrapy.Request`` objects.
     """
     movies = Selector(
         response=response).xpath('//div[@class="movie-item-hover"]')
     for movie in movies[0:10]:
         movie_href = movie.xpath('./a/@href').extract_first()
         if movie_href is None:
             # No link to follow; concatenating None would raise TypeError.
             continue
         item = MaoyanItem()
         yield scrapy.Request(url='https://maoyan.com' + movie_href,
                              meta={'item': item},
                              callback=self.parse2)