def saveTvPlotList(self, plotList):
    """Insert one GRAB_MEDIA_INFO row per TV episode description.

    Args:
        plotList: sequence of dicts, or None. Each dict is read with ``.get``
            for the keys ``mediaSourceCode``, ``cnName``,
            ``currentSeasonNumber``, ``episodeTitle``, ``informationSources``
            and ``parentSourceCode``.

    Returns:
        None. Nothing is done when ``plotList`` is None.

    The first failing episode stops the remaining ones; the error is printed,
    not raised.
    """
    if plotList is None:
        return
    try:
        for plotDict in plotList:
            grabMediaInfo = GrabMediaInfo()
            grabMediaInfo.mediaId = DBUtil().createPK("GRAB_MEDIA_INFO")
            grabMediaInfo.grabTime = DBUtil().systemDateTime()
            # Values copied straight from the scraped dict.
            grabMediaInfo.mediaSourceCode = plotDict.get("mediaSourceCode")
            grabMediaInfo.cnName = plotDict.get("cnName")
            grabMediaInfo.currentSeasonNumber = plotDict.get(
                "currentSeasonNumber")
            grabMediaInfo.episodeTitle = plotDict.get("episodeTitle")
            grabMediaInfo.informationSources = plotDict.get(
                "informationSources")
            grabMediaInfo.parentSourceCode = plotDict.get("parentSourceCode")
            # Fixed defaults for a freshly grabbed episode.
            grabMediaInfo.cleanStatus = 0
            grabMediaInfo.cleanAfterId = 0
            grabMediaInfo.mediaYear = 0
            grabMediaInfo.mediaTimes = 0
            grabMediaInfo.totalSeason = 1
            grabMediaInfo.currentSeason = 1
            grabMediaInfo.save(using="grab", force_insert=True)
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # can still stop the crawler.
        print("e.message:", str(e), "保存电视剧集出错")
def getRemoteFile(self, dataType, currentStamp, inforrationSources,
                  headImgUrl):
    """Build the remote file path for an image and pick the URL to download.

    Args:
        dataType: string of the form ``<level1><data_type_separator><level2>``.
        currentStamp: value embedded (via ``str``) in the remote path.
        inforrationSources: information-source code; for source ``0`` the
            ``s_ratio_celebrity`` URL segment is swapped for
            ``l_ratio_celebrity``.
        headImgUrl: image URL as scraped.

    Returns:
        ``(remoteFile, webFileUrl)``. ``remoteFile`` is ``""`` unless
        ``dataType`` is a star/grab or meta/grab type.
    """
    webFileUrl = headImgUrl
    if inforrationSources == 0:
        webFileUrl = headImgUrl.replace("s_ratio_celebrity",
                                        "l_ratio_celebrity")

    typeParts = dataType.split(data_type_separator)
    remoteFile = ""
    # Star and meta images share the same path layout; only the category
    # segment differs.
    for category in (data_type_1_star, data_type_1_meta):
        if typeParts[0] == category and typeParts[1] == data_type_2_grab:
            remoteFile = url_separator.join([
                str(inforrationSources),
                str(ipDict[localIp]),
                category,
                str(currentStamp),
                DBUtil().find_last(webFileUrl, url_separator),
            ])
            break
    return remoteFile, webFileUrl
def addMataImg(self, media_source_code, poster_img_url, ftpUrl, photoWidth,
               photoHeight):
    """Insert a GRAB_POSTER row for a media poster (information source 0).

    Args:
        media_source_code: source id of the media the poster belongs to.
        poster_img_url: original poster URL.
        ftpUrl: location of the uploaded copy.
        photoWidth: image width.
        photoHeight: image height.

    Errors are printed, not raised.
    """
    try:
        grabPoster = GrabPoster()
        # Sequence id
        grabPoster.posterId = DBUtil().createPK("GRAB_POSTER")
        # Media source id
        grabPoster.mediaSourceCode = media_source_code
        # Information source
        grabPoster.informationSources = 0
        # Poster URL and its FTP copy
        grabPoster.posterUrl = poster_img_url
        grabPoster.posterFtpUrl = ftpUrl
        # Image dimensions
        grabPoster.posterWidth = photoWidth
        grabPoster.posterHeight = photoHeight
        # Grab time
        grabPoster.grabTime = DateUtils.getSysTimeFormat("%Y-%m-%d %H:%M:%S")
        # Cleaning status / id after cleaning
        grabPoster.cleanStatus = 0
        grabPoster.cleanAfterId = 0
        # Display status
        grabPoster.displayStatus = 1
        grabPoster.save(using="grab", force_insert=True)
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e))
def parsSubset(self, divDate, grabMediaInfo):
    """Walk the sub-item list of a page and copy each item onto grabMediaInfo.

    Args:
        divDate: HTML fragment (anything ``str()`` can render) containing
            ``p-item-info`` elements.
        grabMediaInfo: object whose attributes are overwritten for every
            item found. Nothing is saved here; the values are only printed.
    """
    soup = BeautifulSoup(str(divDate), 'html.parser')
    for divitem in soup.find_all(attrs={'class': 'p-item-info'}):
        itemA = divitem.find(attrs={'class': 'item-title'}).find("a")
        print(itemA)
        cName = itemA.text
        itemUrl = itemA["href"]
        # Drop the last character of the URL, then split on dots.
        itemStr = itemUrl[:-1].split(".")

        # NOTE(review): mediaSourceCode is overwritten further down, so from
        # the second item on this "parent" is the previous item's code.
        # Confirm that is intended.
        grabMediaInfo.parentSourceCode = grabMediaInfo.mediaSourceCode
        grabMediaInfo.cnName = cName
        grabMediaInfo.grabWebUrl = itemUrl
        grabMediaInfo.mediaIntroData = itemStr

        mediaSourceCode = 0
        for item in itemStr:
            if "id" in item:
                grabMediaInfo.mediaId = DBUtil().createPK("GRAB_MEDIA_INFO")
                # The code is the third "_"-separated field of the segment.
                mediaSourceCode = item.split("_")[2]
                grabMediaInfo.mediaSourceCode = mediaSourceCode

        # Raw string: "\D" in a normal literal is an invalid escape sequence.
        currentSeasonNumber = re.sub(r"\D", "", cName)
        grabMediaInfo.currentSeason = currentSeasonNumber
        print(mediaSourceCode, itemUrl, cName, currentSeasonNumber)
def saveMediaPoster(self, mediaInfo):
    """Insert a GRAB_POSTER row from a scraped media dict.

    Args:
        mediaInfo: dict read with ``.get`` for ``mediaSourceCode``,
            ``posterImgUrl``, ``posterWidth``, ``posterHeight`` and
            ``posterFtpUrl``.

    The information source comes from ``self.informationSources``.
    Errors are printed, not raised.
    """
    try:
        grabPoster = GrabPoster()
        grabPoster.posterId = DBUtil().createPK("GRAB_POSTER")
        grabPoster.informationSources = self.informationSources
        # Values copied from the scraped dict.
        grabPoster.mediaSourceCode = mediaInfo.get("mediaSourceCode")
        grabPoster.posterUrl = mediaInfo.get("posterImgUrl")
        grabPoster.posterWidth = mediaInfo.get("posterWidth")
        grabPoster.posterHeight = mediaInfo.get("posterHeight")
        grabPoster.posterFtpUrl = mediaInfo.get("posterFtpUrl")
        # Fixed defaults for a freshly grabbed poster.
        grabPoster.displayStatus = 1
        grabPoster.cleanStatus = 0
        grabPoster.cleanAfterId = 0
        grabPoster.grabTime = DBUtil().systemDateTime()
        grabPoster.save(using="grab", force_insert=True)
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e), "保存媒资图片出错")
def analyzeMediaStarInfoAndSave(self, data, grabStarId):
    """Save the media a star appears in, as listed on the star's page.

    Args:
        data: parsed page exposing BeautifulSoup-style ``find``/``find_all``.
        grabStarId: source code of the star the works belong to.

    Every ``/lib/m_`` link found yields one GrabMediaStarInfo row with
    information source 2 (iQiyi) and media-star type 1 (actor).
    Errors are printed, not raised; a failing work does not stop the others.
    """
    # All entries are treated as actor credits (0 director, 1 actor, 2 writer).
    mediaStarType = 1
    try:
        workTypes = data.find(attrs={"id": "block-F"}).find_all(
            attrs={"class": "piclist-scroll piclist-scroll-h203"})
        for workType in workTypes:
            workList = workType.find(attrs={"class": "wrapper-cols"})
            workList = workList.find(attrs={"class": "wrapper-piclist"})
            workList = workList.find(
                attrs={"class": "site-piclist site-piclist-155203"})
            for workInfo in workList.find_all("li"):
                try:
                    metaHref = workInfo.find(
                        attrs={"class": "site-piclist_pic"}).find("a")["href"]
                    if "/lib/m_" not in metaHref:
                        continue
                    mediaSourceCode = self.getMiddleStr(
                        metaHref, "http://www.iqiyi.com/lib/m_", ".html")
                    grabMediaStarInfo = GrabMediaStarInfo()
                    grabMediaStarInfo.starSourceCode = grabStarId
                    # Information source: 2 means iQiyi
                    grabMediaStarInfo.informationSources = 2
                    grabMediaStarInfo.grabTime = DBUtil().systemDateTime()
                    # Cleaning status / id after cleaning
                    grabMediaStarInfo.cleanStatus = 0
                    grabMediaStarInfo.cleanAfterId = 0
                    grabMediaStarInfo.mediaStarType = mediaStarType
                    grabMediaStarInfo.mediaSourceCode = mediaSourceCode
                    # Cleaned star / media source ids
                    grabMediaStarInfo.cleanStarSourceId = 0
                    grabMediaStarInfo.cleanMediaSourceId = 0
                    grabMediaStarInfo.save(using="grab")
                except Exception as e:
                    print("e.message:", str(e), "解析明星媒资关联媒资code或保存至数据库出现异常")
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e), "解析明星媒资关联媒资类型出现异常")
def queryItemStar(self, soup):
    """Parse a Douban celebrity JSON payload and save it as a GrabStarInfo.

    Args:
        soup: object whose ``text`` attribute is a JSON document with the
            keys ``avatars`` (containing ``large``), ``name``, ``gender``,
            ``id``, ``name_en`` and ``born_place``.

    Raises:
        ValueError: ``soup.text`` is not valid JSON.
        KeyError: one of the keys listed above is missing.

    Database errors are printed, not raised. After a successful save the
    star's photos are fetched through ``self.grabStarPhoto``.
    """
    js_dict = json.loads(soup.text)
    # Largest avatar available
    img = js_dict['avatars']['large']
    name = js_dict['name']
    gender = js_dict['gender']
    # Douban id of the person
    grabStarId = js_dict['id']
    name_en = js_dict['name_en']
    born_place = js_dict['born_place']

    grabStarInfo = GrabStarInfo()
    grabStarInfo.starSourceCode = grabStarId
    grabStarInfo.chName = str(name)
    grabStarInfo.enName = str(name_en)
    # Sex code: 1 for male, 0 otherwise
    grabStarInfo.starSex = 1 if gender == "男" else 0
    grabStarInfo.starNationality = str(born_place)
    # Information source: 0 means Douban
    grabStarInfo.informationSources = 0
    grabStarInfo.grabTime = DBUtil().systemDateTime()
    # Cleaning status / id after cleaning
    grabStarInfo.cleanStatus = 0
    grabStarInfo.cleanAfterId = 0
    grabStarInfo.headImgUrl = str(img)
    try:
        grabStarInfo.save(using="grab")
        print("入库")
        self.grabStarPhoto(grabStarId)
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e))
def saveGrabStarPhoto(self, starId, strGrabStarCode, inforrationSources,
                      webFileUrl, photoHeight, photoWidth, remoteFile,
                      cleanAfterId, displayStatus):
    """Save one star photo record to the "grab" database.

    Args:
        starId: stored as the record's ``photoId``.
        strGrabStarCode: source code of the star.
        inforrationSources: information-source code.
        webFileUrl: original photo URL.
        photoHeight: photo height.
        photoWidth: photo width.
        remoteFile: location of the uploaded copy (stored as posterFtpUrl).
        cleanAfterId: id after cleaning.
        displayStatus: display status.
    """
    photo = GrabStarPhoto()
    fieldValues = (
        ("photoId", starId),
        ("starSourceCode", strGrabStarCode),
        ("informationSources", inforrationSources),
        ("posterFtpUrl", remoteFile),
        ("photoUrl", webFileUrl),
        ("photoHeight", photoHeight),
        ("photoWidth", photoWidth),
        ("grabTime", DBUtil().systemDateTime()),
        # A new photo always starts with cleaning status 0.
        ("cleanStatus", 0),
        ("cleanAfterId", cleanAfterId),
        ("displayStatus", displayStatus),
    )
    for fieldName, fieldValue in fieldValues:
        setattr(photo, fieldName, fieldValue)
    photo.save(using="grab")
def analyzeStarPrizeHtml(self, data, grabStarId):
    """Parse a star's awards page and store the awards as one string.

    Args:
        data: parsed page exposing BeautifulSoup-style ``find_all``.
        grabStarId: source code of the star.

    The result is laid out as:
    ``<year><sep1><award><sep3><award>...<sep4><year><sep1>...`` where each
    award is ``<prize><sep2><category><sep2><work>`` and ``sepN`` are the
    module-level ``award_separator_N`` constants.

    The row is saved only when a matching GrabStarInfo (information source 0)
    exists. Errors are printed, not raised.
    """
    try:
        yearSections = []
        for awardsBlock in data.find_all(attrs={"class": "awards"}):
            yearText = str(awardsBlock.find("h2").text)
            entries = []
            for awardNode in awardsBlock.find_all(attrs={"class": "award"}):
                cells = awardNode.find_all("li")
                # The first cell may hold several child nodes; concatenate
                # their strings.
                prize = "".join(child.string for child in cells[0])
                entries.append(prize + award_separator_2 + cells[1].string +
                               award_separator_2 + cells[2].string)
            yearSections.append(yearText + award_separator_1 +
                                award_separator_3.join(entries))
        receivedAwards = award_separator_4.join(yearSections)

        grabStarDynamic = GrabStarDynamic()
        grabStarDynamic.starSourceCode = grabStarId
        # Information source
        grabStarDynamic.informationSources = 0
        grabStarDynamic.grabTime = DBUtil().systemDateTime()
        # Cleaning status / id after cleaning
        grabStarDynamic.cleanStatus = 0
        grabStarDynamic.cleanAfterId = 0
        grabStarDynamic.receivedAwards = receivedAwards

        grabStar = GrabStarInfo.objects.using("grab").values("starId").filter(
            starSourceCode=grabStarId, informationSources=0)
        if grabStar:
            grabStarDynamic.starId = grabStar[0]["starId"]
            grabStarDynamic.save(using="grab")
            print("保存或更新明星starId:", grabStarDynamic.starId, "奖项成功")
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e))
def doubanSearch(request):
    """HTTP view: run the Douban crawler for the ``key`` query parameter.

    Args:
        request: request object exposing ``GET.get``.

    Returns:
        HttpResponse with a JSON body. ``resultCode`` is ``"1"`` when the
        search ran without error and ``"0"`` otherwise; ``resultDesc``
        explains a missing ``key``.
    """
    data = {"resultCode": "0", "resultDesc": "", "total": "", "dataList": []}
    key = request.GET.get("key")
    if DBUtil().isBlank(key):
        data["resultDesc"] = "缺少参数"
    else:
        try:
            # Run the crawl exactly once, inside the try: the previous code
            # ran it twice, and the first run was unprotected.
            print(DoubanMeta().searchMetaByKey(key))
            data["resultCode"] = "1"
        except Exception:
            print("Error: 豆瓣搜索错误")
    return HttpResponse(json.dumps(data, ensure_ascii=False),
                        content_type="application/json")
def saveMediaStarInfo(self, starList):
    """Insert or complete media/star relation rows.

    Args:
        starList: sequence of dicts, or None. Each dict is read with ``.get``
            for ``mediaSourceCode``, ``starSourceCode``, ``mediaStarType``,
            ``informationSources`` and ``roleName``.

    For every entry:
      * if the relation already exists, only an empty role name of an actor
        (type 1) is filled in;
      * otherwise a new row is inserted.

    Errors are printed per entry and do not stop the remaining entries.
    """
    if not starList:
        # None or empty: nothing to save (len(None) used to raise here).
        return
    for mediaStrarDict in starList:
        try:
            existingRows = GrabMediaStarInfo.objects.using("grab").filter(
                mediaSourceCode=mediaStrarDict.get("mediaSourceCode"),
                starSourceCode=mediaStrarDict.get("starSourceCode"),
                mediaStarType=mediaStrarDict.get("mediaStarType"),
                informationSources=mediaStrarDict.get("informationSources"))
            if len(existingRows) > 0:
                grabMediaStarInfo = existingRows[0]
                # Only fill in the role name of an actor that has none yet.
                if (mediaStrarDict.get("mediaStarType") == 1
                        and not grabMediaStarInfo.roleName):
                    grabMediaStarInfo.roleName = mediaStrarDict.get("roleName")
                    grabMediaStarInfo.save(using="grab")
            else:
                grabMediaStarInfo = GrabMediaStarInfo()
                grabMediaStarInfo.mediaSourceCode = mediaStrarDict.get(
                    "mediaSourceCode")
                grabMediaStarInfo.starSourceCode = mediaStrarDict.get(
                    "starSourceCode")
                grabMediaStarInfo.informationSources = mediaStrarDict.get(
                    "informationSources")
                grabMediaStarInfo.mediaStarType = mediaStrarDict.get(
                    "mediaStarType")
                grabMediaStarInfo.roleName = mediaStrarDict.get("roleName")
                grabMediaStarInfo.grabTime = DBUtil().systemDateTime()
                grabMediaStarInfo.cleanStatus = 0
                grabMediaStarInfo.cleanAfterId = 0
                grabMediaStarInfo.cleanStarSourceId = 0
                grabMediaStarInfo.cleanMediaSourceId = 0
                grabMediaStarInfo.save(using="grab", force_insert=True)
        except Exception as e:
            # Exception rather than BaseException so that Ctrl-C /
            # SystemExit are not swallowed.
            print("e.message:", str(e), "保存媒资与演员关系异常")
    return
def analyzeStarRelationAndSave(self, data, grabStarId, isUpdate):
    """Parse the people related to a star and save each relation.

    Two page layouts are handled:
      * a "center-star" diagram, whose ``<p>`` labels (the first one is the
        star itself) pair up with up to six positioned "sub-star" blocks;
      * otherwise a "relateStar_list" of ``<li>`` entries.

    Args:
        data: parsed page exposing BeautifulSoup-style ``find``.
        grabStarId: source code of the star.
        isUpdate: when true, an existing relation row (same star, related
            person and information source 2) gets its relation text updated
            and ``cleanStatus`` set to 2 instead of a new row being inserted.

    Errors are printed, not raised; a failing entry does not stop the others.
    """
    # CSS class of the block that holds the related person's link, keyed by
    # the index of the matching <p> label.
    positionClasses = {
        1: "sub-star left-top-star",
        2: "sub-star right-top-star",
        3: "sub-star right-center-star",
        4: "sub-star right-bottom-star",
        5: "sub-star left-bottom-star",
        6: "sub-star left-center-star",
    }

    def persistRelation(starRelation, relationCode):
        # Insert the relation, or update the existing row when isUpdate is set.
        grabStarRelation = GrabStarRelation()
        grabStarRelation.starSourceCode = grabStarId
        grabStarRelation.starRelation = starRelation
        # Information source: 2 means iQiyi
        grabStarRelation.informationSources = 2
        grabStarRelation.grabTime = DBUtil().systemDateTime()
        # Cleaning status / id after cleaning
        grabStarRelation.cleanStatus = 0
        grabStarRelation.cleanAfterId = 0
        # Cleaned star source id / cleaned related-person source id
        grabStarRelation.cleanStarSouceId = 0
        grabStarRelation.cleanStarRalationId = 0
        grabStarRelation.relationSourceCode = relationCode
        if isUpdate:
            existing = GrabStarRelation.objects.using("grab").filter(
                starSourceCode=grabStarId,
                relationSourceCode=relationCode,
                informationSources=2)
            if existing.count() > 0:
                existing.update(starRelation=grabStarRelation.starRelation,
                                cleanStatus=2)
                return
        grabStarRelation.save(using="grab")

    try:
        centerStar = data.find(attrs={"class": "center-star"})
        if centerStar:
            relations = centerStar.find_all("p")
            # Start at 1: the first label is the star itself.
            for i in range(1, len(relations)):
                try:
                    relationCode = ""
                    if i in positionClasses:
                        # Characters 27..35 of the href hold the star code.
                        relationCode = data.find(
                            attrs={"class": positionClasses[i]}
                        ).find("a")["href"][27:36]
                    persistRelation(relations[i].text, relationCode)
                except Exception as e:
                    print("e.message:", str(e), "解析明星人物关系code出现异常")
        else:
            relatedList = data.find(attrs={
                "class": "relateStar_list relateStar_hidePopUl clearfix"
            })
            if relatedList:
                for relationInfo in relatedList.find_all("li"):
                    try:
                        # attrs must be a dict; the previous
                        # {"class", "..."} was a set literal.
                        titleNode = relationInfo.find(
                            attrs={"class": "relateStar_info"}
                        ).find(attrs={
                            "class": "relateStar_title relateStar_relationNam"
                        })
                        # href looks like "//www.iqiyi.com/lib/s_206307105.html"
                        # (no "http:" prefix), hence the different offsets.
                        relationCode = titleNode.find("a")["href"][22:31]
                        starRelation = titleNode.find("span").find_all(
                            "em")[1].text
                        persistRelation(starRelation, relationCode)
                    except Exception as e:
                        print("e.message:", str(e), "解析相关明星信息出现异常")
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e), "解析明星人物关系类型出现异常")
def analyzeMediaStarHtml(self, data, grabStarId):
    """Save the works listed on a star's page together with the star's role.

    Args:
        data: parsed page exposing BeautifulSoup-style ``find``.
        grabStarId: source code of the star.

    Each ``<h6>`` under ``#wrapper #content .article`` gives the media code
    (from the link's href) and a role text (last ``<span>``). One
    GrabMediaStarInfo with information source 0 is saved for every role
    keyword found in that text. Errors are printed, not raised.
    """
    # Role keyword -> media-star type:
    # 0 director, 1 actor, 2 writer, 3 producer, 4 voice, 5 composer, 6 self
    roleTypes = (
        ('导演', 0),
        ('演员', 1),
        ('编剧', 2),
        ('制片', 3),
        ('配音', 4),
        ('作曲', 5),
        ('自己', 6),
    )
    try:
        headings = data.find(attrs={"id": "wrapper"}).find(
            attrs={"id": "content"}).find(
                attrs={"class": "article"}).find_all("h6")
        for info in headings:
            name = info.find("a")
            nian = info.find_all("span")
            code = re.findall(r'(\w*[0-9]+)\w*', name["href"])[0]
            msType = nian[-1].text

            grabMediaStarInfo = GrabMediaStarInfo()
            grabMediaStarInfo.mediaSourceCode = code
            grabMediaStarInfo.starSourceCode = grabStarId
            # Information source
            grabMediaStarInfo.informationSources = 0
            grabMediaStarInfo.grabTime = DBUtil().systemDateTime()
            # Cleaning status / id after cleaning
            grabMediaStarInfo.cleanStatus = 0
            grabMediaStarInfo.cleanAfterId = 0
            try:
                # NOTE(review): the same object is saved once per matching
                # role; if its primary key is assigned on first save, later
                # saves overwrite that row rather than adding one per role.
                # Confirm that is intended.
                for keyword, mediaStarType in roleTypes:
                    if keyword in msType:
                        grabMediaStarInfo.mediaStarType = mediaStarType
                        grabMediaStarInfo.save(using="grab")
            except Exception as e:
                print("e.message翻译类型:", str(e))
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message获取列表:", str(e))
def parsYkDetailsContext(self, ykDetailsHtml, ykDetailsUrl):
    """Parse a media detail page and save it as a GrabMediaInfo (source 1).

    Args:
        ykDetailsHtml: parsed page exposing BeautifulSoup-style ``find``.
        ykDetailsUrl: page URL; the media source code is the part after the
            first ``_`` in the last path segment's base name.

    Returns:
        The populated GrabMediaInfo. It is returned even when saving failed
        (the failure is printed).

    Raises:
        AttributeError, IndexError, KeyError, TypeError: when the page does
        not have the expected structure; parsing is not guarded.
    """
    mediaSourceCode = ykDetailsUrl.split("/")[-1].split(".")[0].split("_")[1]
    basicData = ykDetailsHtml.find(attrs={'class': 'mod mod-new'})
    mediaDirector, productionContry, subordinateType = self.parsMediaDirector(
        basicData)

    # Poster image address
    imgUrl = basicData.find(attrs={'class': 'yk-pack p-list'}).find(
        "img")["src"]

    baseContext = basicData.find("ul")
    title = baseContext.find(attrs={'class': 'p-row p-title'})
    mediaType = self.parseMediaType(title.find_all("a")[0].text)
    titleText = title.text.split(":")[1]
    # The name is the title text without its last 6 characters.
    name = titleText[:-6]
    # Release year: the 4 characters before the last 2 of the title text.
    mediaYear = titleText[:-2][-4:]
    # Alternate name
    alternateName = baseContext.find(attrs={'class': 'p-alias'}).text
    # Release time: second "pub" element, without its 5-character prefix.
    releaseTimes = baseContext.find_all(attrs={'class': 'pub'})
    releaseTime = releaseTimes[1].text[5:] if len(releaseTimes) > 1 else ""
    # NOTE(review): the page's 'star-num' rating is not stored by this method.

    # Leading actors; stays an empty list when the page lists none.
    mediaActor = []
    mediaActorLi = baseContext.find(attrs={'class': 'p-performer'})
    if mediaActorLi is not None:
        mediaActor = [link.text for link in mediaActorLi.find_all("a")]

    # Introduction
    mediaIntro = basicData.find(
        attrs={'class': 'p-row p-intro'}).find_all("span")[1].text

    # Created outside the try so the final return can never hit an unbound
    # name.
    grabMediaInfo = GrabMediaInfo()
    try:
        grabMediaInfo.mediaId = DBUtil().createPK("GRAB_MEDIA_INFO")
        grabMediaInfo.mediaSourceCode = mediaSourceCode
        grabMediaInfo.informationSources = 1
        grabMediaInfo.mediaType = mediaType
        grabMediaInfo.cnName = name
        grabMediaInfo.grabWebUrl = ykDetailsUrl
        grabMediaInfo.mediaWebCode = ""
        grabMediaInfo.alternateName = alternateName
        grabMediaInfo.mediaYear = mediaYear
        grabMediaInfo.mediaLanguage = ""
        grabMediaInfo.mediaTimes = 0
        grabMediaInfo.mediaIntro = mediaIntro
        grabMediaInfo.subordinateType = subordinateType
        grabMediaInfo.productionContry = productionContry
        # Not processed yet
        grabMediaInfo.currentSeasonNumber = 1
        grabMediaInfo.posterImgUrl = imgUrl
        grabMediaInfo.mediaDirector = mediaDirector
        grabMediaInfo.mediaActor = mediaActor
        grabMediaInfo.releaseTime = releaseTime
        grabMediaInfo.totalSeason = 1
        grabMediaInfo.currentSeason = 1
        grabMediaInfo.grabTime = DBUtil().systemDateTime()
        grabMediaInfo.cleanStatus = 0
        grabMediaInfo.cleanAfterId = 0
        grabMediaInfo.cleanParentMediaId = 0
        grabMediaInfo.save(using="grab", force_insert=True)
        # Cast and crew information
        self.parsMediaFilmHtml(ykDetailsHtml)
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e), "保存媒资信息出现异常")
    return grabMediaInfo
def parseSubset(self, mediaSourceCode):
    """Download the episode list of a media item and save every episode.

    Args:
        mediaSourceCode: source code of the parent media; used to build the
            list URL.

    The response is expected to look like ``<name>=<json>``. Each non-null
    entry of the JSON's ``data`` list becomes one GrabMediaInfo row.
    Errors are printed, not raised; a failing episode does not stop the
    others.
    """
    try:
        url = "http://cache.video.iqiyi.com/jp/sdvlst/3/" + str(
            mediaSourceCode) + "/?categoryId=3&sourceId=" + str(
                mediaSourceCode)
        print(url)
        jsonStr = HttpSpiderUtils().spiderHtmlUrl(url)
        # Split on the first "=" only: the JSON itself may contain "="
        # (for example inside URLs).
        jsonObj = json.loads(jsonStr.split("=", 1)[1])
        data = jsonObj['data']
        if data is None:
            # No episode list; stop instead of failing on len(None).
            print(data)
            return
        for subset in data:
            if subset is None:
                continue
            try:
                grabMediaInfo = GrabMediaInfo()
                grabMediaInfo.mediaId = DBUtil().createPK("GRAB_MEDIA_INFO")
                tvYear = subset["tvYear"]
                if tvYear is not None:
                    # First four characters are taken as the year.
                    grabMediaInfo.mediaYear = tvYear[0:4]
                else:
                    tvYear = 0
                grabMediaInfo.releaseTime = tvYear
                grabMediaInfo.mediaSourceCode = subset["tvId"]
                grabMediaInfo.informationSources = self.informationSources
                grabMediaInfo.mediaType = ""
                grabMediaInfo.cnName = subset["tvSbtitle"]
                grabMediaInfo.mediaIntro = subset["aDesc"]
                # NOTE(review): "timeLength" is stored in mediaLanguage —
                # confirm this is the intended column.
                grabMediaInfo.mediaLanguage = subset["timeLength"]
                # Parent media id
                grabMediaInfo.parentSourceCode = subset["faqipuid"]
                grabMediaInfo.episodeTitle = subset["tvFocus"]
                grabMediaInfo.mediaActor = subset["mActors"]
                grabMediaInfo.grabWebUrl = url
                grabMediaInfo.totalSeason = 1
                grabMediaInfo.currentSeason = 1
                grabMediaInfo.grabTime = DBUtil().systemDateTime()
                grabMediaInfo.cleanStatus = 0
                grabMediaInfo.cleanAfterId = 0
                grabMediaInfo.cleanParentMediaId = 0
                grabMediaInfo.posterImgUrl = ""
                grabMediaInfo.mediaTag = ""
                grabMediaInfo.save(using="grab", force_insert=True)
            except Exception as e:
                print("e.message:", str(e), "保存子集失败")
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e), "保存子集失败")
def queryItemMata(self, response):
    """Parse a Douban subject JSON document and save the media it describes.

    Args:
        response: JSON text with the keys ``id``, ``title``, ``genres``,
            ``aka``, ``subtype``, ``images`` (containing ``large``),
            ``summary``, ``year``, ``share_url``, ``rating`` (containing
            ``average``), ``seasons_count``, ``current_season``,
            ``episodes_count``, ``directors``, ``casts`` and ``countries``.

    Saves, in order: the GrabMediaInfo row (information source 0), the
    poster (via ``addMataImg``) and the rating (via ``addMataPF``).
    Any error is printed and followed by a 10 second pause; nothing is
    raised.
    """

    def valueOrBlank(value):
        # Empty sequences/strings are stored as "".
        return value if len(value) > 0 else ""

    def valueOrOne(value):
        # Missing season/episode counters default to "1".
        return "1" if value == "" or value is None else value

    # Genre -> assisting media type for TV subjects; first match wins.
    tvAssistTypes = (
        ('真人秀', 'variety'),
        ('音乐', 'variety'),
        ('歌舞', 'variety'),
        ('脱口秀', 'variety'),
        ('动画', 'manga'),
        ('儿童', 'manga'),
        ('纪录片', 'documentary'),
    )

    try:
        json_dict = json.loads(response)
        media_source_code = json_dict['id']
        cn_name = json_dict['title']
        subordinate_type = valueOrBlank(json_dict['genres'])
        alternate_name = valueOrBlank(json_dict['aka'])
        media_type = valueOrBlank(json_dict['subtype'])
        poster_img_url = json_dict['images']['large']
        media_intro = json_dict['summary']
        media_year = json_dict['year']
        if media_year == "" or media_year is None:
            media_year = 0
        grab_web_url = json_dict['share_url']
        pingfen = json_dict['rating']['average']
        total_season = valueOrOne(json_dict['seasons_count'])
        current_season = valueOrOne(json_dict['current_season'])
        current_season_number = valueOrOne(json_dict['episodes_count'])

        # Director / cast names; "" when the list is empty.
        directors = json_dict['directors']
        dy = [person['name'] for person in directors] if len(
            directors) > 0 else ""
        casts = json_dict['casts']
        zy = [person['name'] for person in casts] if len(casts) > 0 else ""

        countries = json_dict['countries']
        production_contry = countries[0] if len(countries) > 0 else ''

        # ----- media base information -----
        grabMediaInfo = GrabMediaInfo()
        grabMediaInfo.mediaId = DBUtil().createPK("GRAB_MEDIA_INFO")
        grabMediaInfo.mediaSourceCode = media_source_code
        # Information source
        grabMediaInfo.informationSources = 0
        # Parent id: Douban has no episodes, so it defaults to 0.
        grabMediaInfo.parentSourceCode = 0
        grabMediaInfo.cleanParentMediaId = 0
        grabMediaInfo.mediaType = media_type
        grabMediaInfo.cnName = cn_name
        grabMediaInfo.alternateName = alternate_name
        grabMediaInfo.mediaYear = media_year
        grabMediaInfo.mediaIntro = media_intro
        grabMediaInfo.subordinateType = subordinate_type
        grabMediaInfo.productionContry = production_contry
        # Total seasons / current season / episodes in the current season
        grabMediaInfo.totalSeason = int(total_season)
        grabMediaInfo.currentSeason = int(current_season)
        grabMediaInfo.currentSeasonNumber = int(current_season_number)
        grabMediaInfo.posterImgUrl = poster_img_url
        grabMediaInfo.mediaDirector = dy
        grabMediaInfo.mediaActor = str(zy)
        # Cleaning status / id after cleaning
        grabMediaInfo.cleanStatus = 0
        grabMediaInfo.cleanAfterId = 0
        grabMediaInfo.grabTime = DateUtils.getSysTimeFormat(
            "%Y-%m-%d %H:%M:%S")
        grabMediaInfo.grabWebUrl = grab_web_url

        # Assisting media type
        if media_type == 'tv':
            grabMediaInfo.mediaAssistType = next(
                (assistType for genre, assistType in tvAssistTypes
                 if genre in subordinate_type), 'tv')
        elif '纪录片' in subordinate_type:
            grabMediaInfo.mediaAssistType = 'documentary'
        else:
            grabMediaInfo.mediaAssistType = 'movie'

        grabMediaInfo.save(using="grab", force_insert=True)

        # ----- poster -----
        ftpUrl, photoHeight, photoWidth = DoubanMZAnalysis().parseImageUpload(
            poster_img_url)
        self.addMataImg(media_source_code, poster_img_url, ftpUrl, photoWidth,
                        photoHeight)
        # ----- rating -----
        self.addMataPF(media_source_code, grabMediaInfo.mediaId, pingfen)
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e))
        time.sleep(10)
def saveIqiyiMedia(self, mediaInfo):
    """Save a scraped media item with its dynamic data, poster and cast.

    Args:
        mediaInfo: dict read with ``.get``. Besides the media fields it may
            carry ``mediaScore`` and ``starList``.

    Steps, in order:
      1. insert the GrabMediaInfo row and crawl its episodes
         (``IqiyiMetaVariety().parseSubset``); a failure here is printed and
         the remaining steps still run;
      2. insert the GrabMediaDynamic row;
      3. save the poster (``saveMediaPoster``);
      4. save the media/star relations (``saveMediaStarInfo``).

    Errors are printed, not raised.
    """
    # Fields copied as-is when present in mediaInfo.
    optionalFields = ("alternateName", "mediaLanguage", "mediaIntro",
                      "subordinateType", "productionContry", "posterImgUrl",
                      "mediaDirector", "releaseTime")
    # Integer fields and the value used when mediaInfo has none.
    integerFields = (("mediaYear", 0), ("mediaTimes", 0),
                     ("currentSeasonNumber", 1))
    try:
        try:
            grabMediaInfo = GrabMediaInfo()
            grabMediaInfo.mediaId = DBUtil().createPK("GRAB_MEDIA_INFO")
            grabMediaInfo.mediaSourceCode = mediaInfo.get("mediaSourceCode")
            grabMediaInfo.informationSources = self.informationSources
            grabMediaInfo.mediaType = mediaInfo.get("mediaType")
            grabMediaInfo.cnName = mediaInfo.get("cnName")
            grabMediaInfo.grabWebUrl = mediaInfo.get("grabWebUrl")
            grabMediaInfo.mediaWebCode = mediaInfo.get("mediaWebCode")
            for fieldName in optionalFields:
                fieldValue = mediaInfo.get(fieldName)
                if fieldValue is not None:
                    setattr(grabMediaInfo, fieldName, fieldValue)
            for fieldName, fallback in integerFields:
                fieldValue = mediaInfo.get(fieldName)
                setattr(grabMediaInfo, fieldName,
                        fallback if fieldValue is None else int(fieldValue))
            # The cast arrives under "protagonist" and is stored as text.
            if mediaInfo.get("protagonist") is not None:
                grabMediaInfo.mediaActor = str(mediaInfo.get("protagonist"))
            grabMediaInfo.totalSeason = 1
            grabMediaInfo.currentSeason = 1
            grabMediaInfo.grabTime = DBUtil().systemDateTime()
            grabMediaInfo.cleanStatus = 0
            grabMediaInfo.cleanAfterId = 0
            grabMediaInfo.cleanParentMediaId = 0
            grabMediaInfo.save(using="grab", force_insert=True)
            IqiyiMetaVariety().parseSubset(mediaInfo.get("mediaSourceCode"))
        except Exception as e:
            print("e.message:", str(e), "保存媒资信息出现异常")

        # Media dynamic information
        grabMediaDynamic = GrabMediaDynamic()
        grabMediaDynamic.mediaId = grabMediaInfo.mediaId
        grabMediaDynamic.mediaSourceCode = grabMediaInfo.mediaSourceCode
        grabMediaDynamic.informationSources = grabMediaInfo.informationSources
        mediaScore = mediaInfo.get("mediaScore")
        grabMediaDynamic.mediaScore = 0 if mediaScore is None else mediaScore
        grabMediaDynamic.grabTime = DBUtil().systemDateTime()
        grabMediaDynamic.cleanStatus = 0
        grabMediaDynamic.cleanAfterId = 0
        # NOTE(review): awards were first set from mediaInfo["receivedAwards"]
        # and then overwritten with ""; only the effective value is kept.
        # Confirm the awards are meant to be dropped.
        grabMediaDynamic.receivedAwards = ""
        grabMediaDynamic.mediaMoney = 0
        grabMediaDynamic.save(using="grab", force_insert=True)

        # Poster
        self.saveMediaPoster(mediaInfo)
        # Media / star relations
        self.saveMediaStarInfo(mediaInfo.get("starList"))
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e), "保存媒资信息出现异常")
    return
def analyzeStarInfoAndSave(self, data, grabStarId, starName, isUpdate):
    """Parse a star's detail page and insert or update the GrabStarInfo row.

    Args:
        data: parsed page exposing BeautifulSoup-style ``find``.
        grabStarId: source code of the star.
        starName: Chinese name of the star.
        isUpdate: when true, the existing row (same code, information
            source 2) is updated and its ``cleanStatus`` set to 2; otherwise
            a new row is saved and the star's photos are fetched.

    A parsing error is printed and whatever was parsed up to that point is
    still written. Database errors are printed, not raised.
    """
    grabStarInfo = GrabStarInfo()
    try:
        grabStarInfo.starSourceCode = grabStarId
        grabStarInfo.chName = str(starName)
        # Information source: 2 means iQiyi
        grabStarInfo.informationSources = 2
        grabStarInfo.grabTime = DBUtil().systemDateTime()
        # Cleaning status / id after cleaning
        grabStarInfo.cleanStatus = 0
        grabStarInfo.cleanAfterId = 0
        # Head image URL
        grabStarInfo.headImgUrl = data.find(attrs={
            "class": "result_pic"
        }).find("img")["src"]
        # Height and weight default to 0.
        grabStarInfo.starHeight = 0
        grabStarInfo.starWeight = 0

        topicItem = data.find(attrs={
            "class": "mx_topic-item"
        }).find(attrs={"class": "clearfix"})
        starWeightHtml = topicItem.find(attrs={
            "itemprop": "weight"
        }).text.replace("体重:", "").strip()
        if not DBUtil().isBlank(starWeightHtml) and starWeightHtml != "-":
            grabStarInfo.starWeight = int(starWeightHtml.replace("kg", ""))
        grabStarInfo.starKariera = topicItem.find(attrs={
            "itemprop": "jobTitle"
        }).text.replace("职业:", "").strip().replace("\n", "").replace(" ", "")

        # Recommended works, joined with ";"
        if data.find(attrs={"class": "works-title textOverflow"}):
            works = data.find(attrs={
                "class": "site-piclist site-piclist-13777"
            }).find_all("li")
            grabStarInfo.representativeWorks = ";".join(
                work.find(attrs={"class": "site-piclist_info"}).find(
                    attrs={"class": "site-piclist_info_title"}).find(
                        "a")["title"] for work in works)

        # Brief introduction
        grabStarInfo.briefIntroduction = data.find(attrs={
            "class": "introduce-info"
        }).text.replace(" ", "")

        # Left column, by position: 0 English name, 1 sex, 2 height,
        # 3 birth date.
        leftBasicInfoValues = data.find(attrs={
            "class": "basicInfo-block basicInfo-left"
        }).find_all("dd")
        for i, valueNode in enumerate(leftBasicInfoValues):
            text = valueNode.text.strip()
            if i == 0:
                grabStarInfo.enName = text
            elif i == 1:
                grabStarInfo.starSex = 1 if text == "男" else 0
            elif i == 2:
                if not DBUtil().isBlank(text) and text != "-":
                    grabStarInfo.starHeight = text.replace("cm", "")
            elif i == 3:
                grabStarInfo.birthDate = text

        # Right column, by position: 0 alias, 2 nationality, 3 star sign,
        # 6 hobbies and interests.
        rightFieldByIndex = {
            0: "anotherName",
            2: "starNationality",
            3: "starSign",
            6: "hobbiesInterests",
        }
        rightBasicInfoValues = data.find(attrs={
            "class": "basicInfo-block basicInfo-right"
        }).find_all("dd")
        for i, valueNode in enumerate(rightBasicInfoValues):
            if i in rightFieldByIndex:
                setattr(grabStarInfo, rightFieldByIndex[i],
                        valueNode.text.strip())
    except Exception as e:
        print("e.message:", str(e), "解析明星详情出现异常")

    # Write to the database
    try:
        if isUpdate:
            GrabStarInfo.objects.using("grab").filter(
                starSourceCode=grabStarId, informationSources=2).update(
                    headImgUrl=grabStarInfo.headImgUrl,
                    starHeight=grabStarInfo.starHeight,
                    starWeight=grabStarInfo.starWeight,
                    starKariera=grabStarInfo.starKariera,
                    representativeWorks=grabStarInfo.representativeWorks,
                    briefIntroduction=grabStarInfo.briefIntroduction,
                    enName=grabStarInfo.enName,
                    starSex=grabStarInfo.starSex,
                    birthDate=grabStarInfo.birthDate,
                    anotherName=grabStarInfo.anotherName,
                    starNationality=grabStarInfo.starNationality,
                    starSign=grabStarInfo.starSign,
                    hobbiesInterests=grabStarInfo.hobbiesInterests,
                    cleanStatus=2)
            print("更新明星详情成功")
        else:
            grabStarInfo.save(using="grab")
            print("保存明星详情成功")
            # Photos are fetched right after a fresh insert.
            self.grabStarPhotoByCodeAndSource(grabStarId, 2)
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e), "保存明星详情出现异常")
def analyzeStarAwardAndSave(self, data, grabStarId, isUpdate):
    """Parse a star's awards and store them as one string.

    Args:
        data: parsed page exposing BeautifulSoup-style ``find``.
        grabStarId: source code of the star.
        isUpdate: when true, the saved row gets ``cleanStatus`` 2 and the
            ``cleanAfterId`` of the matching GrabStarInfo.

    Nothing happens when the page has no awards section. The row is saved
    only when a matching GrabStarInfo (information source 2) exists.
    Errors are printed, not raised.
    """
    try:
        awardsRoot = data.find(
            attrs={"class": "m-getPrice-tab j-starAward-all"})
        if awardsRoot:
            receivedAwards = ""
            allAwards = awardsRoot.find_all(
                attrs={"class": "getPrice-detail-cont"})
            lastYearIndex = len(allAwards) - 1
            for i, yearBlock in enumerate(allAwards):
                year = yearBlock.find(attrs={
                    "class": "getPrice-tab-title"
                }).find("span").text
                receivedAwards = receivedAwards + year + award_separator_1
                sameAwardInfos = yearBlock.find_all(
                    attrs={"class": "getPrice-info-table"})
                lastTableIndex = len(sameAwardInfos) - 1
                for k, awardTable in enumerate(sameAwardInfos):
                    awardInfos = awardTable.find_all("li")
                    lastRowIndex = len(awardInfos) - 1
                    for j, awardRow in enumerate(awardInfos):
                        tabPrTime = awardRow.find(
                            attrs={"class": "tabPr-time"}).text
                        tabPrName = awardRow.find(
                            attrs={"class": "tabPr-name"}).text
                        tabPrProj = awardRow.find(
                            attrs={"class": "tabPr-proj"}).text
                        # The result cell carries one of two class values.
                        resultNode = awardRow.find(attrs={
                            "class": "tabPr-result tabPr-result"
                        }) or awardRow.find(attrs={
                            "class": "tabPr-result tabPr-succes"
                        })
                        tabPrResult = resultNode.text if resultNode else ""
                        receivedAwards = (receivedAwards + tabPrTime +
                                          tabPrName + award_separator_2 +
                                          tabPrProj +
                                          award_result_separator_left +
                                          tabPrResult +
                                          award_result_separator_right)
                        # Separator after every award except the last row of
                        # the last table of the year.
                        if k != lastTableIndex or j != lastRowIndex:
                            receivedAwards += award_separator_3
                if i != lastYearIndex:
                    receivedAwards += award_separator_4

            grabStarDynamic = GrabStarDynamic()
            grabStarDynamic.starSourceCode = grabStarId
            # Information source: 2 means iQiyi
            grabStarDynamic.informationSources = 2
            grabStarDynamic.grabTime = DBUtil().systemDateTime()
            # Cleaning status / id after cleaning
            grabStarDynamic.cleanStatus = 0
            grabStarDynamic.cleanAfterId = 0
            grabStarDynamic.receivedAwards = receivedAwards

            grabStar = GrabStarInfo.objects.using("grab").values(
                "starId", "cleanAfterId").filter(starSourceCode=grabStarId,
                                                 informationSources=2)
            if grabStar:
                if isUpdate:
                    grabStarDynamic.cleanStatus = 2
                    grabStarDynamic.cleanAfterId = grabStar[0]["cleanAfterId"]
                grabStarDynamic.starId = grabStar[0]["starId"]
                grabStarDynamic.save(using="grab")
                print("保存或更新明星starId:", grabStarDynamic.starId, "奖项成功")
    except Exception as e:
        # Exception rather than BaseException so that Ctrl-C / SystemExit
        # are not swallowed.
        print("e.message:", str(e), "解析明星奖项出现异常")