Exemple #1
0
 def parse(self, response):
     """Tag every entry of the JSON response and persist the batch.

     The response body is decoded as UTF-8 JSON and its ``data`` entry is
     read. Each element gets its ``classify`` key set to
     ``self.tags[self.tagNum]``; the batch is printed, passed to
     ``SaveData.save_media_data``, and ``self.tagNum`` / ``self.num`` are
     recorded through ``SaveData.save_history_data``.
     """
     payload = json.loads(response.body.decode('utf-8'))
     movies = payload['data']
     for movie in movies:
         # Looked up per element, so an empty batch never touches self.tags.
         movie['classify'] = self.tags[self.tagNum]
     print("影片Outer")
     print(movies)
     SaveData().save_media_data(movies)
     SaveData().save_history_data(self.tagNum, self.num)
Exemple #2
0
 def start_requests(self):
     """Yield the celebrity-page request for the row at the stored history value.

     Reads element ``[0][1]`` of ``SaveData.query_media_history(self.type)``
     and uses it to query one row through ``SaveData.query_media_data``.
     Element ``[1]`` of that row is placed in the celebrity URL.

     Yields:
         A single ``Request`` handled by ``self.parse_actor_detail``, or
         nothing when the query returns no row (or an empty row).
     """
     history = SaveData().query_media_history(self.type)[0][1]
     rows = SaveData().query_media_data(history, 1)
     if not rows:
         # The emptiness check used to run only after rows[0] had been taken,
         # so an empty result raised IndexError instead of ending cleanly.
         return
     dataItem = rows[0]
     if len(dataItem) > 0:
         acturl = 'https://movie.douban.com/celebrity/' + str(
             dataItem[1]) + '/'
         yield Request(acturl, callback=self.parse_actor_detail)
Exemple #3
0
 def start_requests(self):
     """Yield the celebrity-page request for the row at the 'ACTOR' history value.

     Reads element ``[0][1]`` of ``SaveData.query_media_history('ACTOR')`` and
     uses it to query one row through ``SaveData.query_media_data``. The row
     is accessed by the keys ``'id'`` and ``'classify'``.

     Yields:
         A single ``Request`` handled by ``self.parse_actor_detail`` whose
         ``meta`` carries ``classify`` and ``id``, or nothing when the query
         returns no row (or an empty row).
     """
     history = SaveData().query_media_history('ACTOR')[0][1]
     rows = SaveData().query_media_data(history, 1)
     if not rows:
         # The emptiness check used to run only after rows[0] had been taken,
         # so an empty result raised IndexError instead of ending cleanly.
         return
     dataItem = rows[0]
     if len(dataItem) > 0:
         # NOTE(review): the URL was previously built from `actorId`, a name
         # never defined in this method (NameError at runtime). The row's
         # 'id' is assumed to be the celebrity id -- confirm against the
         # SaveData schema.
         url = 'https://movie.douban.com/celebrity/' + str(
             dataItem['id']) + '/'
         yield Request(url,
                       meta={
                           'classify': dataItem['classify'],
                           'id': dataItem['id']
                       },
                       callback=self.parse_actor_detail)
Exemple #4
0
 def parse(self, response):
     """Yield one ``DoubanMovieItem`` holding a movie id and its picture ids.

     The movie row is the first result of ``SaveData.query_media_data`` for
     the value at ``[0][1]`` of ``SaveData.query_media_history(self.type)``.
     Element ``[1]`` of that row, as a string, is both the item's ``movieId``
     and the argument to ``SaveData.query_media_pic_id``.

     Only a single row is handled per call; the history is not advanced here.
     """
     item = DoubanMovieItem()
     position = SaveData().query_media_history(self.type)[0][1]
     movieRow = SaveData().query_media_data(position, 1)[0]
     picIds = SaveData().query_media_pic_id(str(movieRow[1]))
     item['movieId'] = str(movieRow[1])
     item['moviePicIds'] = picIds
     yield item
Exemple #5
0
 def start_requests(self):
     """Yield the first picture-page request for the row at the stored history value.

     Reads element ``[0][1]`` of ``SaveData.query_media_history(self.type)``
     and uses it to query one row through ``SaveData.query_media_data``. The
     URL comes from ``self.get_pic_url(row[1], 0)``.

     Yields:
         A single ``Request`` handled by ``self.parse_movie_pic`` whose
         ``meta`` carries ``movieId`` (``str(row[1])``), ``title``
         (``row[2]``) and ``picNum`` (30), or nothing when the query returns
         no row (or an empty row).
     """
     history = SaveData().query_media_history(self.type)[0][1]
     rows = SaveData().query_media_data(history, 1)
     if not rows:
         # The emptiness check used to run only after rows[0] had been taken,
         # so an empty result raised IndexError instead of ending cleanly.
         return
     dataItem = rows[0]
     if len(dataItem) > 0:
         picUrl = self.get_pic_url(dataItem[1], 0)
         yield Request(picUrl,
                       meta={
                           'movieId': str(dataItem[1]),
                           'title': dataItem[2],
                           'picNum': 30
                       },
                       callback=self.parse_movie_pic)
 def start_requests(self):
     """Yield the detail-page request for the row at the stored history value.

     Reads element ``[0][1]`` of ``SaveData.query_media_history(self.type)``
     and uses it to query one row through ``SaveData.query_media_data``.
     Element ``[4]`` of the row is used as the request URL.

     Yields:
         A single ``Request`` handled by ``self.parse_movie_detail`` whose
         ``meta`` carries ``classify`` (``row[5]``), ``id`` (``str(row[1])``),
         ``title`` (``row[2]``) and ``history`` (the history value plus one),
         or nothing when the query returns no row (or an empty row).
     """
     history = SaveData().query_media_history(self.type)[0][1]
     rows = SaveData().query_media_data(history, 1)
     if not rows:
         # The emptiness check used to run only after rows[0] had been taken,
         # so an empty result raised IndexError instead of ending cleanly.
         return
     dataItem = rows[0]
     if len(dataItem) > 0:
         yield Request(dataItem[4],
                       meta={
                           'classify': dataItem[5],
                           'id': str(dataItem[1]),
                           'title': dataItem[2],
                           'history': history + 1
                       },
                       callback=self.parse_movie_detail)
 def start_requests(self):
     """Yield the awards-page request for the row at the stored history value.

     Reads element ``[0][1]`` of ``SaveData.query_media_history(self.type)``
     and uses it to query one row through ``SaveData.query_media_data``. The
     URL is element ``[4]`` of the row with ``'awards/'`` appended.

     Yields:
         A single ``Request`` handled by ``self.parse_movie_reward`` whose
         ``meta`` carries ``history`` (the history value plus one), ``id``
         (``str(row[1])``) and ``title`` (``row[2]``), or nothing when the
         query returns no row (or an empty row).
     """
     history = SaveData().query_media_history(self.type)[0][1]
     rows = SaveData().query_media_data(history, 1)
     if not rows:
         # The emptiness check used to run only after rows[0] had been taken,
         # so an empty result raised IndexError instead of ending cleanly.
         return
     dataItem = rows[0]
     if len(dataItem) > 0:
         awardUrl = dataItem[4] + 'awards/'
         yield Request(awardUrl,
                       meta={
                           'history': history + 1,
                           'id': str(dataItem[1]),
                           'title': dataItem[2]
                       },
                       callback=self.parse_movie_reward)
Exemple #8
0
 def parse_movie_pic(self, response):
     """Store the picture ids on a poster page and schedule the next request.

     On HTTP 200 with at least one ``data-id`` found, the ids are saved via
     ``SaveData.save_media_pic_id`` and the next page of the same movie is
     requested (``picNum`` grows by 30 per page).

     On HTTP 200 with no ids, on 404, and on 301/302, the crawl moves on to
     the following movie: the row at "history + 1" is queried, the history is
     advanced with ``SaveData.update_media_history``, and the first picture
     page of that row is requested. 301/302 additionally sleeps 10 seconds.
     Any other status yields nothing.

     Expects ``response.meta`` to carry ``movieId``, ``title`` and ``picNum``.
     """
     status = response.status
     if status == 200:
         picIds = response.selector.xpath(
             "//ul[@class='poster-col3 clearfix']/li/attribute::data-id"
         ).extract()
         if len(picIds) != 0:
             print("图片id")
             dataList = [{
                 'id': response.meta['movieId'],
                 'title': response.meta['title'],
                 'picId': picId
             } for picId in picIds]
             SaveData().save_media_pic_id(dataList)
             picUrl = self.get_pic_url(response.meta['movieId'],
                                       response.meta['picNum'])
             yield Request(picUrl,
                           meta={
                               'movieId': response.meta['movieId'],
                               'title': response.meta['title'],
                               'picNum': response.meta['picNum'] + 30
                           },
                           callback=self.parse_movie_pic)
             return
         # An empty page means this movie has no more pictures: fall through
         # to the "next movie" handling below.
     elif status not in (404, 301, 302):
         return

     # Shared "move on to the next movie" path (previously copied into three
     # branches). Statement order is kept: query, advance history, then sleep.
     history = SaveData().query_media_history(self.type)[0][1]
     rows = SaveData().query_media_data(history + 1, 1)
     SaveData().update_media_history(self.type)
     if status in (301, 302):
         # NOTE(review): time.sleep blocks the whole process while waiting;
         # kept because the original code relied on it.
         time.sleep(10)
     if not rows:
         # No following row: stop cleanly. Indexing rows[0] before checking
         # used to raise IndexError here.
         return
     dataItem = rows[0]
     if len(dataItem) > 0:
         picUrl = self.get_pic_url(dataItem[1], 0)
         yield Request(picUrl,
                       meta={
                           'movieId': str(dataItem[1]),
                           'title': dataItem[2],
                           'picNum': 30
                       },
                       callback=self.parse_movie_pic)
Exemple #9
0
 def start_requests(self):
     """Restore the saved position and yield the first listing request.

     The first row returned by ``SaveData.query_history_data`` supplies
     ``self.tagNum`` (element 0) and ``self.num`` (element 1). The URL then
     comes from ``self.get_url()`` and the response is handled by
     ``self.parse``.
     """
     saved = SaveData().query_history_data()
     firstRow = saved[0]
     self.tagNum = firstRow[0]
     self.num = firstRow[1]
     yield Request(self.get_url(), callback=self.parse)
    def parse_movie_detail(self, response):
        """Scrape a movie detail page, persist it, then request the next movie.

        On HTTP 200 the page fields are collected into one dict and stored
        through ``SaveData``: the detail record, the recommendations, and the
        director / writer / performer lists when they are non-empty. On 200,
        404, 301 and 302 the history is advanced with
        ``SaveData.update_media_history``; 301/302 additionally sleeps 10
        seconds. Any other status yields nothing.

        Afterwards the row at ``response.meta['history']`` is queried and, if
        one exists, its URL (element ``[4]``) is requested with ``history``
        incremented by one.

        Expects ``response.meta`` to carry ``id``, ``title``, ``classify`` and
        ``history``.
        """
        status = response.status
        if status == 200:
            selector = response.selector
            meta = response.meta

            def joined(xpath, sep=''):
                # Concatenate every string matched by the XPath.
                return sep.join(selector.xpath(xpath).extract())

            def after_label(label):
                # First text node following the <span> whose text has label.
                return joined(
                    "//span[contains(text(), '%s')]/following::text()[1]" %
                    label)

            def people(label):
                # Celebrity links listed next to the <span> whose text
                # contains label, as {'id', 'name'} dicts.
                anchors = selector.xpath(
                    "//span[contains(text(),'%s')]/../span[@class='attrs']/a"
                    % label)
                return [{
                    'id': ''.join(
                        anchor.xpath('./attribute::href').extract()).replace(
                            '/celebrity/', '').replace('/', ''),
                    'name': ''.join(anchor.xpath('./text()').extract())
                } for anchor in anchors]

            movieDetail = {}
            # ID (ALL)
            movieDetail['id'] = meta['id']
            # Title (ALL)
            movieDetail['title'] = meta['title']
            # Full title as shown on the page (ALL)
            movieDetail['titleLong'] = joined(
                '//span[@property="v:itemreviewed"]/text()')
            # Year (ALL): characters 1-4 of the text, e.g. "(2019)" -> "2019"
            movieDetail['year'] = joined("//span[@class='year']/text()")[1:5]
            # Classification (ALL)
            movieDetail['classify'] = meta['classify']

            # Directors, writers, lead performers (ALL)
            directorList = people('导演')
            movieDetail['directorList'] = directorList
            writerList = people('编剧')
            movieDetail['writerList'] = writerList
            performerList = people('主演')
            movieDetail['performerList'] = performerList

            # Genres (ALL), comma separated
            movieDetail['type'] = joined("//span[@property='v:genre']/text()",
                                         ',')
            # Country / region of production (ALL)
            movieDetail['country'] = after_label('制片国家/地区')
            # Language (ALL)
            movieDetail['language'] = after_label('语言')
            # Release dates (ALL), comma separated
            movieDetail['releaseDate'] = joined(
                "//span[@property='v:initialReleaseDate']/text()", ',')
            # Runtime (ALONG)
            movieDetail['runtime'] = joined(
                "//span[@property='v:runtime']/text()")
            # Season (MULTI)
            movieDetail['season'] = joined(
                "//select[@id='season']/option[@selected='selected']/text()")
            # Episode count (MULTI), with spaces and newlines removed
            movieDetail['episodes'] = after_label('集数').replace(
                ' ', '').replace('\n', '')
            # Runtime per episode (MULTI)
            movieDetail['alongRuntime'] = after_label('单集片长')
            # Alternative titles (ALL)
            movieDetail['alias'] = after_label('又名')
            # IMDb id (ALL)
            movieDetail['imdbId'] = joined(
                "//span[contains(text(), 'IMDb链接')]/following::a[1]/text()")
            # Score (ALL)
            movieDetail['score'] = joined(
                "//strong[@class='ll rating_num']/text()")

            # Rating breakdown (ALL): share per star level plus vote count
            rating = {}
            for stars in (5, 4, 3, 2, 1):
                rating['stars%d' % stars] = joined(
                    "//span[@class='stars%d starstop']"
                    "/../span[@class='rating_per']/text()" % stars)
            rating['peopleNum'] = joined("//span[@property='v:votes']/text()")
            movieDetail['rating'] = rating

            # Tags (ALL), comma separated
            movieDetail['tags'] = joined("//div[@class='tags-body']/a/text()",
                                         ',')
            # Synopsis (ALL)
            movieDetail['report'] = joined(
                "//span[@property='v:summary']/text()")

            # Recommendations (ALL)
            recommendList = [{
                'id': ''.join(
                    link.xpath('./attribute::href').extract()).replace(
                        'https://movie.douban.com/subject/',
                        '').replace('/?from=subject-page', ''),
                'name': ''.join(link.xpath('./text()').extract())
            } for link in selector.xpath(
                "//div[@class='recommendations-bd']/dl/dd/a")]
            movieDetail['recommendList'] = recommendList

            print("影片Detail")
            SaveData().save_media_detail(movieDetail)
            SaveData().save_media_recommend(recommendList, meta['id'],
                                            meta['title'])
            for members, role in ((directorList, "DIRECTOR"),
                                  (writerList, "WRITER"),
                                  (performerList, "PERFORMER")):
                if len(members) > 0:
                    SaveData().save_media_attr(members, meta['id'],
                                               meta['title'], role)
            SaveData().update_media_history(self.type)
        elif status == 404:
            SaveData().update_media_history(self.type)
        elif status in (301, 302):
            SaveData().update_media_history(self.type)
            # NOTE(review): time.sleep blocks the whole process while
            # waiting; kept because the original code relied on it.
            time.sleep(10)
        else:
            return

        # Shared "request the next movie" tail (previously copied into all
        # three branches).
        rows = SaveData().query_media_data(response.meta['history'], 1)
        if not rows:
            # No further row: stop cleanly. Indexing rows[0] before checking
            # used to raise IndexError here.
            return
        dataItem = rows[0]
        if len(dataItem) > 0:
            yield Request(dataItem[4],
                          meta={
                              'classify': dataItem[5],
                              'id': str(dataItem[1]),
                              'title': dataItem[2],
                              'history': response.meta['history'] + 1
                          },
                          callback=self.parse_movie_detail)
    def parse_movie_reward(self, response):
        """Scrape a movie's awards page, persist it, then request the next one.

        On HTTP 200 every ``div.awards`` section is turned into a dict with
        ``name``, ``year`` and ``awardList``; each award holds its ``name``
        and an ``awardUserList`` of ``{'id', 'name'}`` dicts. The result goes
        through ``self.get_award_sql`` and ``SaveData.save_media_award``, and
        the history is advanced with ``SaveData.update_media_history``.

        On 404 only the history is advanced. On 301/302 the method sleeps 10
        seconds and does not advance the history. Any other status yields
        nothing.

        Afterwards the row at ``response.meta['history']`` is queried and, if
        one exists, its awards URL (element ``[4]`` plus ``'awards/'``) is
        requested with ``history`` incremented by one.

        Expects ``response.meta`` to carry ``id``, ``title`` and ``history``.
        """
        status = response.status
        if status == 200:
            movieReward = {}
            movieReward['id'] = response.meta['id']
            movieReward['title'] = response.meta['title']
            awardTypeList = []
            for awardTypeTag in response.selector.xpath(
                    "//div[@class='awards']"):
                awardType = {}
                awardType['name'] = ''.join(
                    awardTypeTag.xpath("./div/h2/a/text()").extract())
                # Characters 2-5 of the heading's <span> text hold the year.
                awardType['year'] = ''.join(
                    awardTypeTag.xpath("./div/h2/span/text()").extract())[2:6]
                awardList = []
                for awardTag in awardTypeTag.xpath("./ul"):
                    # First <li> is the award name, second lists the winners.
                    cells = awardTag.xpath("./li")
                    award = {}
                    award['name'] = ''.join(
                        cells[0].xpath("./text()").extract())
                    award['awardUserList'] = [{
                        'id': ''.join(
                            userTag.xpath(
                                "./attribute::href").extract()).replace(
                                    'https://movie.douban.com/celebrity/',
                                    '').replace('/', ''),
                        'name': ''.join(userTag.xpath("./text()").extract())
                    } for userTag in cells[1].xpath("./a")]
                    awardList.append(award)
                awardType['awardList'] = awardList
                awardTypeList.append(awardType)
            movieReward['awardTypeList'] = awardTypeList
            dataList = self.get_award_sql(movieReward)
            SaveData().save_media_award(dataList)
            SaveData().update_media_history(self.type)
            print("影片Reward")
        elif status == 404:
            SaveData().update_media_history(self.type)
        elif status in (301, 302):
            # The history is deliberately left untouched on redirects (the
            # update call was already disabled in the original code).
            # NOTE(review): time.sleep blocks the whole process while
            # waiting; kept because the original code relied on it.
            time.sleep(10)
        else:
            return

        # Shared "request the next movie" tail (previously copied into all
        # three branches).
        rows = SaveData().query_media_data(response.meta['history'], 1)
        if not rows:
            # No further row: stop cleanly. Indexing rows[0] before checking
            # used to raise IndexError here.
            return
        dataItem = rows[0]
        if len(dataItem) > 0:
            awardUrl = dataItem[4] + 'awards/'
            yield Request(awardUrl,
                          meta={
                              'id': str(dataItem[1]),
                              'title': dataItem[2],
                              'history': response.meta['history'] + 1
                          },
                          callback=self.parse_movie_reward)