def parse_movie(self, response):
    """Yield a ``ResourceMovie`` for every download link on an xl720 page.

    ``movie_id``, ``title`` and ``year`` are read from ``response.meta``.
    The IMDb id comes from the last ``div#info`` text node that carries an
    " IMDb链接: tt<digits>" label; it stays ``0`` when no such node exists.
    When the page has no download links nothing is yielded and a warning is
    logged instead.
    """
    movie_id = response.meta['movie_id']
    title = response.meta['title']
    year = response.meta['year']
    resource_list = response.xpath('//div[@class="download-link"]/a')
    if not resource_list:
        self.logger.warning(
            'get xl720\'s movie failed,movie_id:{},movie_name:{}'.format(
                movie_id, title))
        return

    # Scan the info block bottom-up and keep the first IMDb label found.
    imdb_id = 0
    info_list = response.xpath('//div[@id="info"]/text()').getall()
    for info in reversed(info_list):
        # Raw string: "\d" is an invalid escape sequence in a plain literal.
        imdb = re.search(r' IMDb链接: tt(\d+)', info)
        if imdb is not None:
            imdb_id = imdb.group(1)
            break

    for resource in resource_list:
        url = resource.xpath('@href').get()
        name_origin = resource.xpath('text()').get()
        item_resource = ResourceMovie()
        item_resource['id_movie_douban'] = 0
        item_resource['id_movie_imdb'] = imdb_id
        item_resource['id_website_resource'] = 104
        item_resource['id_type_resource'] = config.parse_type(name_origin)
        item_resource['name_zh'] = title
        item_resource['create_year'] = year
        item_resource['name_origin'] = name_origin
        item_resource['url_resource'] = url
        yield item_resource
        print('-------------------------')
        print(item_resource)
    self.logger.info(
        'get xl720\'s movie success,movie_id:{},movie_name:{}'.format(
            movie_id, title))
# --- Example no. 2 ---
 def parse_movie(self, response):
     """Yield a ``ResourceMovie`` for every table cell on a hao6v movie page.

     ``movie_id``, ``movie_name`` and ``year`` are read from ``response.meta``.
     Cells whose text mentions "网盘" are emitted as type ``102``; the first
     run of four ASCII letters/digits in the text is used as the extraction
     code (left empty when there is none). Every other cell is named after
     its link text and typed by ``config.parse_type``. When the page has no
     cells nothing is yielded and a warning is logged.
     """
     movie_id = response.meta['movie_id']
     movie_name = response.meta['movie_name']
     year = response.meta['year']
     resource_list = response.xpath('//tbody/tr/td')
     if not resource_list:
         self.logger.warning(
             'get hao6v\'s movie failed,movie_id:{},movie_name:{}'.format(
                 movie_id, movie_name))
         return

     for resource in resource_list:
         url = resource.xpath('a/@href').get()
         text = ''.join(resource.xpath('text()').getall())
         if '网盘' in text:
             # The code may be absent; never call .group() on a failed match.
             code = re.search('[a-zA-Z0-9]{4}', text)
             name_origin = '网盘提取码:{}'.format(
                 code.group() if code is not None else '')
             type_id = 102
         else:
             name_origin = resource.xpath('a/text()').get()
             type_id = config.parse_type(name_origin)
         item_resource = ResourceMovie()
         item_resource['id_movie_douban'] = 0
         item_resource['id_movie_imdb'] = 0
         item_resource['id_website_resource'] = 105
         item_resource['id_type_resource'] = type_id
         item_resource['name_zh'] = movie_name
         item_resource['create_year'] = year
         item_resource['name_origin'] = name_origin
         item_resource['url_resource'] = url
         yield item_resource
         print('-------------------------')
         print(item_resource)
     self.logger.info(
         'get hao6v\'s movie success,movie_id:{},movie_name:{}'.format(
             movie_id, movie_name))
# --- Example no. 3 ---
 def parse_movie_list(self, response):
     """Emit one ``ResourceMovie`` per entry of a zxzjs listing page and follow on.

     ``type_id``, ``page_id`` and ``year`` come from ``response.meta``. After
     the entries have been yielded the crawl either stops (stop condition
     below) or requests the next page of the same year; when the page has no
     "下一页" link it requests page 1 of the previous year instead. An empty
     listing yields nothing and logs a warning.
     """
     meta = response.meta
     type_id = meta['type_id']
     page_id = meta['page_id']
     year = meta['year']

     # Movie list
     anchors = response.xpath(
         '//ul[@class="stui-vodlist clearfix"]/li/div[@class="stui-vodlist__box"]/a'
     )
     if not anchors:
         self.logger.warning(
             'get zxzjs\'s movie list failed,type:{},page:{},year:{}'.
             format(type_id, page_id, year))
         return

     for anchor in anchors:
         href = anchor.xpath('@href').get()
         movie_name = anchor.xpath('@title').get()
         entry = ResourceMovie()
         entry['id_movie_douban'] = 0
         entry['id_movie_imdb'] = 0
         entry['id_website_resource'] = 107
         entry['id_type_resource'] = 101
         entry['name_zh'] = movie_name
         entry['create_year'] = year
         entry['name_origin'] = movie_name
         entry['url_resource'] = '{}{}'.format(config.URL_ZXZJS, href)
         yield entry
         print('-------------------------')
         print(entry)
     self.logger.info(
         'get zxzjs\'s movie list success,type:{},page:{},year:{}'.
         format(type_id, page_id, year))

     # Crawl finished / latest movies only
     if year < self.end_year or (self.type == self.type_new
                                 and year < self.new_min_year
                                 and page_id > self.new_max_page):
         return

     # Next page
     next_href = response.xpath('//a[text()="下一页"]/@href').get()
     if next_href is not None:
         following = {
             'type_id': type_id,
             'page_id': page_id + 1,
             'year': year
         }
     else:
         # No further page for this year: restart at page 1 of the year before.
         previous_year = year - 1
         next_href = '/vodshow/{}-----------{}.html'.format(
             type_id, previous_year)
         following = {
             'type_id': type_id,
             'page_id': 1,
             'year': previous_year
         }
     yield scrapy.Request(url='{}{}'.format(config.URL_ZXZJS, next_href),
                          meta=following,
                          callback=self.parse_movie_list)
 def parse_movie_list(self, response):
     """Emit one ``ResourceMovie`` per entry of a goudaitv listing page.

     ``type`` and ``page_id`` come from ``response.meta``. The movie id is the
     first run of digits in the entry's href and is used to build the player
     URL; entries without such an id are skipped with a warning. The creation
     year is the first four-digit run in the last matching info line and
     defaults to ``0``. Afterwards the next page is requested, unless the
     spider is in "latest only" mode and ``page_id`` exceeds
     ``self.new_max_pages``. An empty listing yields nothing and logs a
     warning.
     """
     type_id = response.meta['type']
     page_id = response.meta['page_id']
     # Movie list
     movie_list = response.xpath('//a[@class="link-hover"]')
     if not movie_list:
         self.logger.warning(
             'get goudaitv\'s movie list failed,type:{},page:{}'.format(
                 type_id, page_id))
         return

     for movie in movie_list:
         href = movie.xpath('@href').get()
         id_match = re.search(r'\d+', href) if href is not None else None
         if id_match is None:
             # Without a numeric id the player URL cannot be built.
             self.logger.warning(
                 'skip goudaitv\'s movie without id,href:{}'.format(href))
             continue
         movie_id = id_match.group()
         name = movie.xpath('@title').get()
         create_year = 0
         info_list = movie.xpath('span[@class="lzbz"]/p/text()').getall()
         for info in reversed(info_list):
             year = re.search(r'\d{4}', info)
             if year is not None:
                 create_year = year.group()
                 break
         item_resource = ResourceMovie()
         item_resource['id_movie_douban'] = 0
         item_resource['id_movie_imdb'] = 0
         item_resource['id_website_resource'] = 106
         item_resource['id_type_resource'] = 101
         item_resource['name_zh'] = name
         item_resource['create_year'] = create_year
         item_resource['name_origin'] = name
         item_resource[
             'url_resource'] = '{0}/videoplayer/{1}.html?{1}-1-1'.format(
                 config.URL_GOUDAITV, movie_id)
         yield item_resource
         print('-------------------------')
         print(item_resource)
     self.logger.info(
         'get goudaitv\'s movie list success,type:{},page:{}'.format(
             type_id, page_id))
     # Latest movies only
     if self.type == self.type_new and page_id > self.new_max_pages:
         return
     # Next page
     yield scrapy.Request(url='{}/v/{}-{}.html'.format(
         config.URL_GOUDAITV, type_id, page_id + 1),
                          meta={
                              'type': type_id,
                              'page_id': page_id + 1
                          },
                          callback=self.parse_movie_list)
 def parse_movie(self, response):
     """Yield a ``ResourceMovie`` for every list entry on a btbtdy movie page.

     ``movie_id``, ``title`` and ``year`` are read from ``response.meta``.
     Each ``div.p_list`` section is typed by ``config.parse_type`` applied to
     its heading (``100`` when it has no heading); the section type decides
     where an entry's name and URL are read from. Missing names and URLs are
     emitted as ``''``. When the page has no sections nothing is yielded and
     a warning is logged.
     """
     movie_id = response.meta['movie_id']
     title = response.meta['title']
     year = response.meta['year']
     section_list = response.xpath('//div[@class="p_list"]')
     # A query without matches gives an empty (falsy) result, never None, so
     # the failure case has to be detected through truthiness.
     if not section_list:
         self.logger.warning(
             'get btbtdy\'s movie failed,movie_id:{},movie_name:{}'.format(
                 movie_id, title))
         return

     for section in section_list:
         section_title = section.xpath('h2/text()').get()
         type_id = config.parse_type(
             section_title) if section_title is not None else 100
         for resource in section.xpath('.//li'):
             # Online resources
             if type_id == 101:
                 name_origin = resource.xpath('a/text()').get()
                 href = resource.xpath('a/@href').get()
                 # Only prefix real links, otherwise the URL ends in "None".
                 url = '{}{}'.format(config.URL_BTBTDY,
                                     href) if href is not None else None
             # Net-disk resources
             elif type_id == 102:
                 name_origin = resource.xpath('span/text()').get()
                 url = resource.xpath('a/@href').get()
             # Other resources
             else:
                 name_origin = resource.xpath('a/text()').get()
                 url = resource.xpath('span/a/@href').get()
             item_resource = ResourceMovie()
             item_resource['id_movie_douban'] = 0
             item_resource['id_movie_imdb'] = 0
             item_resource['id_website_resource'] = 103
             item_resource['id_type_resource'] = config.parse_type(
                 name_origin) if type_id == 100 else type_id
             item_resource['name_zh'] = title
             item_resource['create_year'] = year
             item_resource[
                 'name_origin'] = name_origin if name_origin is not None else ''
             item_resource['url_resource'] = url if url is not None else ''
             yield item_resource
             print('-------------------------')
             print(item_resource)
     self.logger.info(
         'get btbtdy\'s movie success,movie_id:{},movie_name:{}'.format(
             movie_id, title))
 def parse_movie(self, response):
     """Yield a ``ResourceMovie`` for every titled link on a loldytt movie page.

     ``movie_id`` is read from ``response.meta``; the movie title is taken
     from the page heading. The creation year (from a ``YYYY-MM-DD`` date) and
     the IMDb id (from an "IMDb链接: tt<digits>" label) are looked up once in
     the ``#juqing`` text and shared by all items; both default to ``0``.
     When the page has no heading nothing is yielded and a warning is logged.
     """
     movie_id = response.meta['movie_id']
     title = response.xpath('//h1/a/text()').get()
     if title is None:
         self.logger.warning(
             'get loldytt\'s movie failed,movie_id:{}'.format(movie_id))
         return

     # Year and IMDb id describe the page, not a single link: parse them once
     # instead of once per resource.
     description = response.xpath('//*[@id="juqing"]//text()').getall()
     create_year = 0
     imdb_id = 0
     found_year = False
     found_imdb = False
     for detail in description:
         if found_year and found_imdb:
             break
         year = re.search(r'(\d{4})-\d{2}-\d{2}', detail)
         imdb = re.search(r'IMDb链接: tt(\d+)', detail)
         if year is not None:
             create_year = year.group(1)
             found_year = True
         if imdb is not None:
             imdb_id = imdb.group(1)
             found_imdb = True

     resource_list = response.xpath('//div[@id="liebiao"]//a[@title]')
     for resource in resource_list:
         url = resource.xpath('@href').get()
         name_origin = resource.xpath('text()').get()
         item_resource = ResourceMovie()
         item_resource['id_movie_douban'] = 0
         item_resource['id_movie_imdb'] = imdb_id
         item_resource['id_website_resource'] = 102
         item_resource['id_type_resource'] = config.parse_type(name_origin)
         item_resource['name_zh'] = title
         item_resource['create_year'] = create_year
         item_resource['name_origin'] = name_origin
         item_resource['url_resource'] = url
         yield item_resource
         print('-------------------------')
         print(item_resource)
     self.logger.info(
         'get loldytt\'s movie success,movie_id:{},movie_name:{}'.
         format(movie_id, title))
 def parse_movie(self, response):
     """Yield a ``ResourceMovie`` for every play/download URL on a dy2018 page.

     ``movie_id`` is read from ``response.meta``. The movie name is the text
     inside the 《…》 marks of the page heading, falling back to the stripped
     heading when there are no marks. The creation year is searched in the
     first six text nodes of ``div#Zoom`` and defaults to ``0``. URLs that
     contain ``config.URL_DY2018`` get type ``101``, all others ``100``. When
     the page has no heading nothing is yielded and a warning is logged.
     """
     movie_id = response.meta['movie_id']
     title = response.xpath('//h1/text()').get()
     if title is None:
         self.logger.warning(
             'get dy2018\'s movie failed,movie_id:{}'.format(movie_id))
         return

     name_match = re.search('《(.*)》', title)
     name = name_match.group(1) if name_match is not None else title.strip()

     # The year belongs to the page, so look it up once for all URLs. Any
     # whitespace (ASCII or full-width) is accepted inside the label.
     create_year = 0
     year_maybe = response.xpath('//div[@id="Zoom"]/text()').getall()
     for text in year_maybe[:6]:
         year_match = re.search(r'年\s*代\s*(\d+)', text)
         if year_match is not None:
             create_year = year_match.group(1)
             break

     online_list = response.xpath(
         '//div[@class="player_list"]//a/@href').getall()
     offline_list = response.xpath('//td[@style]/a/@href').getall()
     for url in online_list + offline_list:
         item_resource = ResourceMovie()
         item_resource['id_movie_douban'] = 0
         item_resource['id_movie_imdb'] = 0
         item_resource['id_website_resource'] = 101
         item_resource[
             'id_type_resource'] = 101 if config.URL_DY2018 in url else 100
         item_resource['name_zh'] = name
         item_resource['create_year'] = create_year
         item_resource['name_origin'] = title
         item_resource['url_resource'] = url
         yield item_resource
         print('-------------------------')
         print(item_resource)
     self.logger.info(
         'get dy2018\'s movie success,movie_id:{},movie_name:{}'.format(
             movie_id, name))
    def parse(self, response):
        """Parse a Douban movie page into movie, people, rating and review items.

        Reads the movie id from ``response.meta['id']`` and the update date
        from ``self.today``. Yields, in order: the ``MovieDouban`` item, its
        trailer, aliases, celebrity links, genre links, the rating, awards,
        reviews (with the reviewing user and the user/movie/review link
        items) and resources. Values that are missing from the page fall back
        to ``0`` or ``''``, and list entries that lack their identifying link
        are skipped, so one incomplete section does not abort the whole page.
        Logs a warning and yields nothing when the page has no ``div#content``.
        """

        def first_group(pattern, text, group=0, default=''):
            """Return ``group`` of the first ``pattern`` match, else ``default``."""
            if text is None:
                return default
            match = re.search(pattern, text)
            return match.group(group) if match is not None else default

        movie_id = response.meta['id']
        if not response.xpath('//div[@id="content"]'):
            self.logger.warning(
                'get douban movie failed,id:{}'.format(movie_id))
            return

        info = response.xpath('//div[@id="info"]')
        title = response.xpath('//h1/span[1]/text()').get()
        type_list = info.xpath('span[@property="v:genre"]/text()').getall()

        imdb_xp = info.xpath(
            'span[text()="IMDb链接:"]/following-sibling::a/text()').get()
        id_movie_imdb = first_group(r'tt(\d+)', imdb_xp, 1, 0)

        year_xp = response.xpath('//h1/span[@class="year"]/text()').get()
        start_year = first_group(r'[(](\d+)[)]', year_xp, 1, 0)

        # The heading starts with the Chinese name; the rest is the original.
        name_zh = first_group(r'[\u4e00-\u9fff():\d\s]*', title).strip()
        name_origin = first_group(r'[\u4e00-\u9fff()\d\s]*(.*)', title,
                                  1).strip()

        runtime = info.xpath('span[@property="v:runtime"]/@content').get()

        url_poster_xp = response.xpath('//a[@class="nbgnbg"]/img/@src').get()
        url_poster = first_group(r'[ps](\d+)', url_poster_xp, 1)

        summary_list_xp = response.xpath(
            '//span[@property="v:summary"]/text()').getall()
        summary = ''.join(part.strip() for part in summary_list_xp)

        see_list = response.xpath(
            '//div[@class="subject-others-interests-ft"]/a/text()').getall()

        # Douban movie
        item_movie = MovieDouban()
        item_movie['id'] = movie_id
        # Video type 1: unknown 2: movie 3: TV series 4: short film
        item_movie['id_type_video'] = 2
        # The path must be relative to the info block; an absolute "/span"
        # is resolved from the document root and never matches.
        if '集数:' in info.xpath('span/text()').getall():
            item_movie['id_type_video'] = 3
        elif '短片' in type_list:
            item_movie['id_type_video'] = 4
        item_movie['id_movie_imdb'] = id_movie_imdb
        item_movie['start_year'] = start_year
        item_movie['name_zh'] = name_zh
        item_movie['name_origin'] = name_origin
        item_movie['runtime'] = runtime if runtime is not None else 0
        item_movie['url_poster'] = url_poster
        item_movie['summary'] = summary
        item_movie['have_seen'] = 0
        item_movie['wanna_see'] = 0
        for see in see_list:
            seen = re.match(r'(\d+)人看过', see)
            if seen is not None:
                item_movie['have_seen'] = seen.group(1)
            wish = re.match(r'(\d+)人想看', see)
            if wish is not None:
                item_movie['wanna_see'] = wish.group(1)
        item_movie['update_date'] = self.today
        print('--------------------------------------')
        print(item_movie)
        yield item_movie

        # Movie trailer (id 0 when the page links no trailer)
        trailer_xp = response.xpath(
            '//li[@class="label-trailer"]/a/@href').get()
        item_trailer = TrailerMovieDouban()
        item_trailer['id'] = first_group(r'\d+', trailer_xp, 0, 0)
        item_trailer['id_movie_douban'] = movie_id
        item_trailer['url_video'] = ''
        yield item_trailer

        # Movie aliases
        alias_label = info.xpath('span[text()="又名:"]').get()
        if alias_label is not None:
            # The alias text node sits further from the end of the info
            # block when an IMDb link follows it.
            alias_position = 1 if imdb_xp is None else 3
            alias_text = info.xpath(
                'text()[last()-{}]'.format(alias_position)).get()
            alias_list = alias_text.split(
                '/') if alias_text is not None else []
            for alias in alias_list:
                item_alias = AliasMovieDouban()
                item_alias['id_movie_douban'] = movie_id
                item_alias['name_alias'] = alias.strip()
                print('--------------------------------------')
                print(item_alias)
                yield item_alias

        # Movie celebrities
        count = 0
        for celebrity in info.xpath('.//span/a'):
            celebrity_id = first_group(r'\d+',
                                       celebrity.xpath('@href').get(), 0,
                                       None)
            if celebrity_id is None:
                # Link without a numeric celebrity id: nothing to reference.
                continue
            rel = celebrity.xpath('@rel').get()
            # Profession 2: director 3: screenwriter 4: leading actor
            item_movie_to_celebrity = MovieDoubanToCelebrityDouban()
            item_movie_to_celebrity['id_movie_douban'] = movie_id
            item_movie_to_celebrity['id_celebrity_douban'] = celebrity_id
            if rel == 'v:starring':
                # Leading actors keep their billing order.
                count += 1
                item_movie_to_celebrity['id_profession'] = 4
                item_movie_to_celebrity['sort'] = count
            elif rel == 'v:directedBy':
                item_movie_to_celebrity['id_profession'] = 2
                item_movie_to_celebrity['sort'] = 0
            else:
                item_movie_to_celebrity['id_profession'] = 3
                item_movie_to_celebrity['sort'] = 0
            print('celebrity --------------------------------------')
            print(item_movie_to_celebrity)
            yield item_movie_to_celebrity

        # Movie genres (genres unknown to the config are ignored)
        for type_name in type_list:
            if type_name not in config.TYPE_MOVIE_LIST:
                continue
            item_movie_to_type = MovieDoubanToTypeMovie()
            item_movie_to_type['id_movie_douban'] = movie_id
            item_movie_to_type[
                'id_type_movie'] = config.TYPE_MOVIE_LIST.index(type_name)
            print('--------------------------------------')
            print(item_movie_to_type)
            yield item_movie_to_type

        # Movie rating (skipped when absent or marked "no rating yet")
        is_score = response.xpath('//div[@class="rating_sum"]/text()').get()
        if is_score is not None and '暂无评分' not in is_score:
            score = response.xpath('//div[@rel="v:rating"]')
            item_score = RateMovieDouban()
            item_score['id'] = movie_id
            item_score['score'] = score.xpath('div/strong/text()').get()
            item_score['vote'] = score.xpath(
                './/span[@property="v:votes"]/text()').get()
            vote_list = score.xpath(
                './/span[@class="rating_per"]/text()').getall()
            # The percentages are listed from five stars down.
            for index, vote in enumerate(vote_list):
                item_score['score{}'.format(5 - index)] = first_group(
                    r'(.*)%', vote, 1, 0)
            print('--------------------------------------')
            print(item_score)
            yield item_score

        # Movie tags
        # NOTE(review): tag items are built and printed but never yielded,
        # exactly as before. Confirm whether they should reach the pipeline.
        tag_list = response.xpath(
            '//div[@class="tags-body"]/a/text()').getall()
        for tag in tag_list:
            item_tag_movie = TagMovie()
            item_tag_movie['id_movie_douban'] = movie_id
            item_tag_movie['name_zh'] = tag
            print('--------------------------------------')
            print(item_tag_movie)

        # Movie awards
        for award in response.xpath('//div[@class="mod"]/ul'):
            award_title = award.xpath('li[1]/a/text()').get()
            award_href = award.xpath('li[1]/a/@href').get()
            type_award = award.xpath('li[2]/text()').get()
            celebrity_award = award.xpath('li[3]/a/@href').get()
            href_parts = award_href.split(
                '/') if award_href is not None else []
            name_award = first_group(r'第\d+届(.*)', award_title, 1, None)
            if len(href_parts) < 5 or name_award is None or type_award is None:
                # Not a complete award entry: skip it instead of failing.
                continue
            id_award = href_parts[4]
            item_award = AwardMovie()
            item_award['id'] = id_award
            item_award['name_zh'] = name_award
            yield item_award
            item_movie_to_award = MovieDoubanToAwardMovie()
            item_movie_to_award['id_movie_douban'] = movie_id
            item_movie_to_award['id_award_movie'] = id_award
            item_movie_to_award['id_celebrity_douban'] = first_group(
                r'\d+', celebrity_award, 0, 0)
            item_movie_to_award['type_award'] = type_award.split('(提名)')[0]
            item_movie_to_award['award_th'] = first_group(r'\d+', award_title)
            # NOTE(review): this stores 0 when the label contains "提名" and
            # 1 otherwise, exactly as before. Confirm the intended meaning
            # of the field.
            item_movie_to_award[
                'is_nominated'] = 0 if '提名' in type_award else 1
            print('--------------------------------------')
            print(item_movie_to_award)
            yield item_movie_to_award

        # Movie reviews
        for review in response.xpath('//div[@class="main review-item"]'):
            user_href = review.xpath('header/a[@class="name"]/@href').get()
            user_parts = user_href.split('/') if user_href is not None else []
            if len(user_parts) < 5:
                # No author link: the review cannot be tied to a user.
                continue
            user_id = user_parts[4]

            item_user = UserDouban()
            item_user['id'] = user_id
            item_user['name_zh'] = review.xpath(
                'header/a[@class="name"]/text()').get()
            yield item_user

            date_xp = review.xpath('header/span[@content]/text()').get()
            review_title = review.xpath(
                'div[@class="main-bd"]/h2/a/text()').get()
            review_id = first_group(
                r'\d+',
                review.xpath('div[@class="main-bd"]/h2/a/@href').get(), 0, 0)
            agree_vote = (review.xpath(
                './/a[@title="有用"]/span/text()').get() or '').strip()
            against_vote = (review.xpath(
                './/a[@title="没用"]/span/text()').get() or '').strip()
            create_datetime = 0
            if date_xp is not None:
                try:
                    create_datetime = int(
                        time.mktime(
                            time.strptime(date_xp.strip(),
                                          '%Y-%m-%d %H:%M:%S')))
                except ValueError:
                    # Unexpected date format: keep the 0 default.
                    create_datetime = 0

            item_review = ReviewMovieDouban()
            item_review['id'] = review_id
            item_review['agree_vote'] = agree_vote if agree_vote != '' else 0
            item_review[
                'against_vote'] = against_vote if against_vote != '' else 0
            item_review['create_datetime'] = create_datetime
            item_review['title'] = review_title
            item_review['content'] = ''.join(
                review.xpath('.//div[@class="short-content"]/text()').
                getall()).strip().strip('()').strip()
            yield item_review
            print('------------')
            print(item_review)
            item_user_review = UserDoubanToReviewMovieDouban()
            item_user_review['id_user_douban'] = user_id
            item_user_review['id_review_movie_douban'] = review_id
            yield item_user_review
            item_movie_review = MovieDoubanToReviewMovieDouban()
            item_movie_review['id_movie_douban'] = movie_id
            item_movie_review['id_review_movie_douban'] = review_id
            yield item_movie_review

            # The star rating is encoded as a number in the span's class.
            score_xp = review.xpath('header/span[@title]/@class').get()
            score = int(first_group(r'\d+', score_xp, 0, 0))

            item_user_movie = UserDoubanToMovieDouban()
            item_user_movie['id_user_douban'] = user_id
            item_user_movie['id_movie_douban'] = movie_id
            item_user_movie['score'] = score / 5
            item_user_movie['is_wish'] = 0
            item_user_movie['is_seen'] = 1
            yield item_user_movie

        # Movie resources
        for resource in response.xpath('//ul[@class="bs"]/li'):
            website = (resource.xpath('a/text()').get() or '').strip()
            type_name = (resource.xpath('..//span/text()').get()
                         or '').strip()
            url_resource = resource.xpath('a/@href').get()
            item_resource = ResourceMovie()
            item_resource['id_movie_douban'] = movie_id
            item_resource['id_movie_imdb'] = id_movie_imdb
            # Websites and types unknown to the config map to id 1.
            item_resource[
                'id_website_resource'] = config.WEBSITE_RESOURCE_LIST.index(
                    website) if website in config.WEBSITE_RESOURCE_LIST else 1
            item_resource[
                'id_type_resource'] = config.TYPE_RESOURCE_LIST.index(
                    type_name) if type_name in config.TYPE_RESOURCE_LIST else 1
            item_resource['name_zh'] = name_zh
            item_resource['create_year'] = start_year
            item_resource['name_origin'] = name_zh
            item_resource['url_resource'] = first_group(
                r'https://www\.douban\.com/link2/\?url=(.*)', url_resource, 1)
            print('--------------------')
            print(item_resource)
            yield item_resource
        self.logger.info('get douban movie success,id:{}'.format(movie_id))