def parse(self, response):
    """Decode a JSON listing response, tag its entries and persist them.

    The response body is decoded as UTF-8 JSON and its ``'data'`` member is
    taken as the list of movie entries.  Every entry gets a ``'classify'``
    key set to ``self.tags[self.tagNum]``; the list is then printed, saved
    through ``SaveData.save_media_data`` and the crawl position
    (``self.tagNum``, ``self.num``) is stored via
    ``SaveData.save_history_data``.
    """
    payload = json.loads(response.body.decode('utf-8'))
    data = payload['data']

    # The tag lookup stays inside the loop so that an empty listing never
    # touches self.tags at all.
    for entry in data:
        entry['classify'] = self.tags[self.tagNum]

    print("影片Outer")
    print(data)

    SaveData().save_media_data(data)
    SaveData().save_history_data(self.tagNum, self.num)
def start_requests(self):
    """Yield the request for the celebrity page of the current media row.

    The stored history value for ``self.type`` is used as the index passed
    to ``SaveData.query_media_data``; element ``[1]`` of the returned row is
    placed in the celebrity URL.  Nothing is yielded when the row is empty.
    """
    progress = SaveData().query_media_history(self.type)[0][1]
    # NOTE(review): the [0] index runs before the emptiness check below, so
    # an empty query result would raise IndexError -- confirm what
    # query_media_data returns when there is no more data.
    row = SaveData().query_media_data(progress, 1)[0]
    if len(row) == 0:
        return

    celebrity_url = ''.join(
        ['https://movie.douban.com/celebrity/', str(row[1]), '/'])
    yield Request(celebrity_url, callback=self.parse_actor_detail)
def start_requests(self):
    """Yield the request for the celebrity page of the current 'ACTOR' row.

    The stored history value for ``'ACTOR'`` is used as the index passed to
    ``SaveData.query_media_data``.  The row is accessed by key
    (``'classify'``, ``'id'``); both values are forwarded to
    ``parse_actor_detail`` through ``meta``.  Nothing is yielded when the
    row is empty.
    """
    history = SaveData().query_media_history('ACTOR')[0][1]
    mediaDataList = SaveData().query_media_data(history, 1)[0]
    if len(mediaDataList) > 0:
        dataItem = mediaDataList
        # The URL is built from the same id that is forwarded as meta['id'];
        # no other identifier is available in this scope.
        # NOTE(review): assumes the row's 'id' is the Douban celebrity id --
        # confirm against query_media_data.
        actor_id = dataItem['id']
        url = 'https://movie.douban.com/celebrity/' + str(actor_id) + '/'
        yield Request(url,
                      meta={
                          'classify': dataItem['classify'],
                          'id': dataItem['id']
                      },
                      callback=self.parse_actor_detail)
def parse(self, response):
    """Yield one ``DoubanMovieItem`` for the current media row.

    ``response`` is not inspected.  The item carries ``'movieId'`` (element
    ``[1]`` of the current row, as a string) and ``'moviePicIds'`` (whatever
    ``SaveData.query_media_pic_id`` returns for that id).  Only a single row
    is handled per call and the stored history is not advanced here.
    """
    item = DoubanMovieItem()

    progress = SaveData().query_media_history(self.type)[0][1]
    row = SaveData().query_media_data(progress, 1)[0]

    movie_id = str(row[1])
    pic_ids = SaveData().query_media_pic_id(movie_id)

    item['movieId'] = movie_id
    item['moviePicIds'] = pic_ids
    yield item
def start_requests(self):
    """Yield the first picture-listing request for the current media row.

    The URL comes from ``self.get_pic_url(row[1], 0)``.  ``meta`` forwards
    the movie id (as a string), the title (``row[2]``) and ``picNum`` = 30,
    which ``parse_movie_pic`` passes back to ``get_pic_url`` for the
    following page.  Nothing is yielded when the row is empty.
    """
    progress = SaveData().query_media_history(self.type)[0][1]
    # NOTE(review): [0] is applied before the emptiness check -- an empty
    # query result would raise IndexError; confirm the helper's contract.
    row = SaveData().query_media_data(progress, 1)[0]
    if len(row) == 0:
        return

    first_page_url = self.get_pic_url(row[1], 0)
    request_meta = {
        'movieId': str(row[1]),
        'title': row[2],
        'picNum': 30,
    }
    yield Request(first_page_url,
                  meta=request_meta,
                  callback=self.parse_movie_pic)
def start_requests(self):
    """Yield the detail-page request for the current media row.

    The request URL is ``row[4]``.  ``meta`` forwards ``classify``
    (``row[5]``), ``id`` (``row[1]`` as a string), ``title`` (``row[2]``)
    and ``history`` -- the stored history value plus one, which
    ``parse_movie_detail`` uses to look up the following row.  Nothing is
    yielded when the row is empty.
    """
    progress = SaveData().query_media_history(self.type)[0][1]
    row = SaveData().query_media_data(progress, 1)[0]
    if len(row) == 0:
        return

    detail_url = row[4]
    request_meta = {
        'classify': row[5],
        'id': str(row[1]),
        'title': row[2],
        'history': progress + 1,
    }
    yield Request(detail_url,
                  meta=request_meta,
                  callback=self.parse_movie_detail)
def start_requests(self):
    """Yield the awards-page request for the current media row.

    The URL is the row's page URL (``row[4]``) with ``'awards/'`` appended.
    ``meta`` forwards ``history`` (stored history value plus one), ``id``
    (``row[1]`` as a string) and ``title`` (``row[2]``) to
    ``parse_movie_reward``.  Nothing is yielded when the row is empty.
    """
    progress = SaveData().query_media_history(self.type)[0][1]
    row = SaveData().query_media_data(progress, 1)[0]
    if len(row) == 0:
        return

    awards_url = row[4] + 'awards/'
    request_meta = {
        'history': progress + 1,
        'id': str(row[1]),
        'title': row[2],
    }
    yield Request(awards_url,
                  meta=request_meta,
                  callback=self.parse_movie_reward)
def parse_movie_pic(self, response):
    """Store the picture ids found on a listing page and request what follows.

    Behaviour by ``response.status``:

    * 200 with picture ids on the page -- save one record per id
      (``id``, ``title``, ``picId``) through ``SaveData.save_media_pic_id``
      and request the next page of the same movie, with ``picNum`` raised
      by 30.
    * 200 without picture ids, or 404 -- move on to the next movie.
    * 301 / 302 -- move on to the next movie after sleeping 10 seconds.
    * any other status -- yield nothing.

    Expects ``response.meta`` to carry ``movieId``, ``title`` and ``picNum``.
    """

    def advance_to_next_movie(pause_seconds=0):
        """Advance the stored history and build the next movie's request.

        Returns the first picture-page ``Request`` of the next movie, or
        ``None`` when the next row is empty.
        """
        history = SaveData().query_media_history(self.type)[0][1]
        # NOTE(review): [0] is applied before the emptiness check below, so
        # an empty query result would raise IndexError -- confirm what
        # query_media_data returns once the data is exhausted.
        next_row = SaveData().query_media_data(history + 1, 1)[0]
        SaveData().update_media_history(self.type)
        if pause_seconds:
            # NOTE(review): if this runs inside an event-loop based crawler
            # (e.g. Scrapy), time.sleep stalls every in-flight request.
            time.sleep(pause_seconds)
        if len(next_row) > 0:
            return Request(self.get_pic_url(next_row[1], 0),
                           meta={
                               'movieId': str(next_row[1]),
                               'title': next_row[2],
                               'picNum': 30
                           },
                           callback=self.parse_movie_pic)
        return None

    status = response.status
    if status == 200:
        pic_ids = response.selector.xpath(
            "//ul[@class='poster-col3 clearfix']/li/attribute::data-id"
        ).extract()
        if len(pic_ids) != 0:
            print("图片id")
            movie_id = response.meta['movieId']
            title = response.meta['title']
            records = [{
                'id': movie_id,
                'title': title,
                'picId': pic_id
            } for pic_id in pic_ids]
            SaveData().save_media_pic_id(records)
            # More pictures may follow: stay on this movie, next page.
            follow_up = Request(self.get_pic_url(movie_id,
                                                 response.meta['picNum']),
                                meta={
                                    'movieId': movie_id,
                                    'title': title,
                                    'picNum': response.meta['picNum'] + 30
                                },
                                callback=self.parse_movie_pic)
        else:
            # An empty page marks the end of this movie's pictures.
            follow_up = advance_to_next_movie()
    elif status == 404:
        follow_up = advance_to_next_movie()
    elif status in (301, 302):
        follow_up = advance_to_next_movie(pause_seconds=10)
    else:
        follow_up = None

    if follow_up is not None:
        yield follow_up
def start_requests(self):
    """Restore the crawl position and yield the first listing request.

    The first row returned by ``SaveData.query_history_data`` supplies
    ``self.tagNum`` (element 0) and ``self.num`` (element 1).  The URL is
    then obtained from ``self.get_url()`` and handled by ``self.parse``.
    """
    saved_position = SaveData().query_history_data()[0]
    self.tagNum = saved_position[0]
    self.num = saved_position[1]

    # get_url is called only after both attributes above are assigned.
    yield Request(self.get_url(), callback=self.parse)
def parse_movie_detail(self, response):
    """Scrape one movie detail page, persist it, then request the next movie.

    Behaviour by ``response.status``:

    * 200 -- extract the fields listed in the body from the page, store them
      through ``SaveData`` (detail record, recommendations and the
      director / writer / performer attributes), advance the stored history
      and yield the request for the next row.
    * 404 -- advance the stored history and yield the request for the next
      row.
    * 301 / 302 -- advance the stored history, sleep 10 seconds, then yield
      the request for the next row.
    * any other status -- yield nothing.

    Expects ``response.meta`` to carry ``id``, ``title``, ``classify`` and
    ``history`` (the index handed to ``SaveData.query_media_data`` to fetch
    the next row).
    """

    def joined(xpath, sep=''):
        """Join every string matched by *xpath* on the page with *sep*."""
        return sep.join(response.selector.xpath(xpath).extract())

    def text_after_label(label):
        """Return the first text node following the span containing *label*."""
        return joined(
            "//span[contains(text(), '%s')]/following::text()[1]" % label)

    def people(label):
        """Collect ``{'id', 'name'}`` dicts for the links listed under *label*."""
        anchors = response.selector.xpath(
            "//span[contains(text(),'%s')]/../span[@class='attrs']/a" % label)
        found = []
        for anchor in anchors:
            href = ''.join(anchor.xpath('./attribute::href').extract())
            found.append({
                # href looks like /celebrity/<id>/ ; keep only the id part.
                'id': href.replace('/celebrity/', '').replace('/', ''),
                'name': ''.join(anchor.xpath('./text()').extract()),
            })
        return found

    def next_request():
        """Build the detail request for the next row, or None if it is empty."""
        # NOTE(review): [0] is applied before the emptiness check, so an
        # empty query result would raise IndexError -- confirm what
        # query_media_data returns once the data is exhausted.
        row = SaveData().query_media_data(response.meta['history'], 1)[0]
        if len(row) > 0:
            return Request(row[4],
                           meta={
                               'classify': row[5],
                               'id': str(row[1]),
                               'title': row[2],
                               'history': response.meta['history'] + 1
                           },
                           callback=self.parse_movie_detail)
        return None

    status = response.status
    if status == 200:
        movie_id = response.meta['id']
        title = response.meta['title']

        movieDetail = {}
        # ID (ALL)
        movieDetail['id'] = movie_id
        # Title (ALL)
        movieDetail['title'] = title
        # Full title as shown on the page (ALL)
        movieDetail['titleLong'] = joined(
            '//span[@property="v:itemreviewed"]/text()')
        # Year (ALL): characters 1..4 of the text -- presumably the page
        # shows it wrapped in parentheses, e.g. "(2019)".
        movieDetail['year'] = joined("//span[@class='year']/text()")[1:5]
        # Classification (ALL)
        movieDetail['classify'] = response.meta['classify']
        # Directors (ALL)
        directorList = people('导演')
        movieDetail['directorList'] = directorList
        # Writers (ALL)
        writerList = people('编剧')
        movieDetail['writerList'] = writerList
        # Leading performers (ALL)
        performerList = people('主演')
        movieDetail['performerList'] = performerList
        # Genres, comma separated (ALL)
        movieDetail['type'] = joined("//span[@property='v:genre']/text()",
                                     ',')
        # Production country / region (ALL)
        movieDetail['country'] = text_after_label('制片国家/地区')
        # Language (ALL)
        movieDetail['language'] = text_after_label('语言')
        # Release dates, comma separated (ALL)
        movieDetail['releaseDate'] = joined(
            "//span[@property='v:initialReleaseDate']/text()", ',')
        # Runtime (ALONG)
        movieDetail['runtime'] = joined(
            "//span[@property='v:runtime']/text()")
        # Season (MULTI)
        movieDetail['season'] = joined(
            "//select[@id='season']/option[@selected='selected']/text()")
        # Number of episodes (MULTI), with spaces and newlines stripped
        movieDetail['episodes'] = text_after_label('集数').replace(
            ' ', '').replace('\n', '')
        # Runtime of a single episode (MULTI)
        movieDetail['alongRuntime'] = text_after_label('单集片长')
        # Alternative titles (ALL)
        movieDetail['alias'] = text_after_label('又名')
        # IMDb id (ALL)
        movieDetail['imdbId'] = joined(
            "//span[contains(text(), 'IMDb链接')]/following::a[1]/text()")
        # Score (ALL)
        movieDetail['score'] = joined(
            "//strong[@class='ll rating_num']/text()")
        # Rating breakdown (ALL): share per star level, then the vote count
        rating = {}
        for stars in (5, 4, 3, 2, 1):
            rating['stars%d' % stars] = joined(
                "//span[@class='stars%d starstop']"
                "/../span[@class='rating_per']/text()" % stars)
        rating['peopleNum'] = joined("//span[@property='v:votes']/text()")
        movieDetail['rating'] = rating
        # Tags, comma separated (ALL)
        movieDetail['tags'] = joined("//div[@class='tags-body']/a/text()",
                                     ',')
        # Synopsis (ALL)
        movieDetail['report'] = joined(
            "//span[@property='v:summary']/text()")
        # Recommendations (ALL)
        recommendList = []
        for anchor in response.selector.xpath(
                "//div[@class='recommendations-bd']/dl/dd/a"):
            href = ''.join(anchor.xpath('./attribute::href').extract())
            recommendList.append({
                # Reduce the subject URL to the bare subject id.
                'id': href.replace('https://movie.douban.com/subject/',
                                   '').replace('/?from=subject-page', ''),
                'name': ''.join(anchor.xpath('./text()').extract()),
            })
        movieDetail['recommendList'] = recommendList

        print("影片Detail")
        SaveData().save_media_detail(movieDetail)
        SaveData().save_media_recommend(recommendList, movie_id, title)
        # Attribute rows are only written for roles that have entries.
        for members, role in ((directorList, "DIRECTOR"),
                              (writerList, "WRITER"),
                              (performerList, "PERFORMER")):
            if len(members) > 0:
                SaveData().save_media_attr(members, movie_id, title, role)
        SaveData().update_media_history(self.type)
    elif status == 404:
        # Missing page: skip this movie.
        SaveData().update_media_history(self.type)
    elif status in (301, 302):
        # Redirected: skip this movie and pause before continuing.
        # NOTE(review): if this runs inside an event-loop based crawler
        # (e.g. Scrapy), time.sleep stalls every in-flight request.
        SaveData().update_media_history(self.type)
        time.sleep(10)
    else:
        return

    follow_up = next_request()
    if follow_up is not None:
        yield follow_up
def parse_movie_reward(self, response):
    """Scrape one movie's awards page, persist it, then request the next one.

    Behaviour by ``response.status``:

    * 200 -- collect every ``div.awards`` section into a nested structure
      (award type -> awards -> recipients), convert it with
      ``self.get_award_sql``, store the result through
      ``SaveData.save_media_award``, advance the stored history and yield
      the awards request for the next row.
    * 404 -- advance the stored history and yield the request for the next
      row.
    * 301 / 302 -- sleep 10 seconds and yield the request for the next row;
      the stored history is NOT advanced in this branch.
    * any other status -- yield nothing.

    Expects ``response.meta`` to carry ``id``, ``title`` and ``history``
    (the index handed to ``SaveData.query_media_data`` for the next row).
    """

    def next_request():
        """Build the awards request for the next row, or None if it is empty."""
        # NOTE(review): [0] is applied before the emptiness check, so an
        # empty query result would raise IndexError -- confirm what
        # query_media_data returns once the data is exhausted.
        row = SaveData().query_media_data(response.meta['history'], 1)[0]
        if len(row) > 0:
            return Request(row[4] + 'awards/',
                           meta={
                               'id': str(row[1]),
                               'title': row[2],
                               'history': response.meta['history'] + 1
                           },
                           callback=self.parse_movie_reward)
        return None

    status = response.status
    if status == 200:
        movieReward = {}
        movieReward['id'] = response.meta['id']
        movieReward['title'] = response.meta['title']

        awardTypeList = []
        for section in response.selector.xpath("//div[@class='awards']"):
            awardType = {}
            awardType['name'] = ''.join(
                section.xpath("./div/h2/a/text()").extract())
            # Characters 2..5 of the span text -- presumably the year sits
            # behind a two-character prefix such as " (".
            awardType['year'] = ''.join(
                section.xpath("./div/h2/span/text()").extract())[2:6]

            awardList = []
            for award_tag in section.xpath("./ul"):
                # First <li> holds the award name, second <li> the
                # recipient links.
                entries = award_tag.xpath("./li")
                award = {}
                award['name'] = ''.join(
                    entries[0].xpath("./text()").extract())
                awardUserList = []
                for link in entries[1].xpath("./a"):
                    href = ''.join(link.xpath("./attribute::href").extract())
                    awardUserList.append({
                        # Reduce the celebrity URL to the bare id.
                        'id': href.replace(
                            'https://movie.douban.com/celebrity/',
                            '').replace('/', ''),
                        'name': ''.join(link.xpath("./text()").extract()),
                    })
                award['awardUserList'] = awardUserList
                awardList.append(award)

            awardType['awardList'] = awardList
            awardTypeList.append(awardType)
        movieReward['awardTypeList'] = awardTypeList

        dataList = self.get_award_sql(movieReward)
        SaveData().save_media_award(dataList)
        SaveData().update_media_history(self.type)
        print("影片Reward")
    elif status == 404:
        # Missing page: skip this movie.
        SaveData().update_media_history(self.type)
    elif status in (301, 302):
        # NOTE(review): unlike the 404 branch, the stored history is left
        # untouched here while the next request still targets the following
        # row -- confirm this asymmetry is intended.
        # NOTE(review): if this runs inside an event-loop based crawler
        # (e.g. Scrapy), time.sleep stalls every in-flight request.
        time.sleep(10)
    else:
        return

    follow_up = next_request()
    if follow_up is not None:
        yield follow_up