def updateMediaItem(self, metadata, kinoPoiskId):
    """
    Refreshes the metadata object from the KinoPoisk pages of a title.

    The title page is requested first; when it cannot be loaded nothing is
    changed. Otherwise the metadata is reset and the parsing steps run in a
    fixed order. An exception raised by any step is logged and skips the
    steps that follow it.
    """
    titlePageUrl = S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
    titlePage = common.getElementFromHttpRequest(titlePageUrl, S.ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
        # The title page failed to load - leave the metadata untouched.
        return

    LOGGER.Debug('SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
    try:
        self.resetMediaMetadata(metadata)
        # Title, original title, ratings, and more.
        self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)
        # Studio.
        self.parseStudioPageData(metadata, kinoPoiskId)
        # Actors, etc.
        self.parseCastPageData(titlePage, metadata, kinoPoiskId)
        # Posters.
        self.parsePostersPageData(metadata, kinoPoiskId)
        # Background art (stills).
        self.parseStillsPageData(metadata, kinoPoiskId)
    except:
        common.logException('failed to update metadata for id %s' % kinoPoiskId)
def parseImageDataFromPhotoTableTag(page, thumbnailList, isPoster, maxImagesToParse):
    """
    Collects image thumbnails from the photo tables of a page.

    Each anchor found in a "fotos" table is parsed into a thumbnail and
    scored. When PREFS.imageChoice equals common.IMAGE_CHOICE_BEST,
    thumbnails scoring below common.IMAGE_SCORE_BEST_THRESHOLD are dropped.
    Accepted thumbnails are appended to thumbnailList (modified in place),
    and parsing stops as soon as the remaining budget reaches zero.

    Returns the remaining budget, i.e. maxImagesToParse minus the number of
    thumbnails appended by this call.
    """
    photoAnchors = page.xpath('//table[@class="fotos" or @class="fotos fotos1" or @class="fotos fotos2"]/tr/td/a')
    nextIndex = len(thumbnailList)
    budget = maxImagesToParse
    for photoAnchor in photoAnchors:
        thumb = None
        try:
            thumb = parseImageDataFromAnchorElement(photoAnchor, nextIndex)
            # The index advances whenever parsing did not raise.
            nextIndex += 1
        except:
            common.logException('unable to parse image URLs')

        if thumb is None:
            Log.Debug('no URLs - skipping an image')
            continue

        common.scoreThumbnailResult(thumb, isPoster)
        if (PREFS.imageChoice == common.IMAGE_CHOICE_BEST
                and thumb.score < common.IMAGE_SCORE_BEST_THRESHOLD):
            # Only the best images were requested and this one is not good enough.
            continue

        thumbnailList.append(thumb)
        Log.Debug('GOT URLs for an image: index=%d, thumb="%s", full="%s" (%sx%s)' %
            (thumb.index, str(thumb.thumbImgUrl), str(thumb.fullImgUrl),
             str(thumb.fullImgWidth), str(thumb.fullImgHeight)))
        budget -= 1
        if not budget:
            break
    return budget
def getPosterThumbnailBigOrSmall(kinoPoiskId):
    """
    Builds the poster thumbnail of a title, preferring the big image.

    The big thumbnail URL is requested; it is used only when a response
    arrives whose 'content-type' header is exactly 'image/jpeg'. In every
    other case (no response, another content type, or any error while
    fetching) the default small thumbnail is returned instead.

    Returns a common.Thumbnail with index 0; the big one gets an initial
    score of 1000, the small one a score of 0.
    """
    Log.Debug(' * parsing thumbnail...')
    bigThumb = None
    try:
        bigThumbUrl = KINOPOISK_MOVIE_BIG_THUMBNAIL % kinoPoiskId
        response = common.getResponseFromHttpRequest(bigThumbUrl)
        if response is not None:
            if response.headers['content-type'] == 'image/jpeg':
                Log.Debug(' * found BIG thumb')
                bigThumb = common.Thumbnail(
                    None,
                    bigThumbUrl,
                    KINOPOISK_MOVIE_THUMBNAIL_DEFAULT_WIDTH,
                    KINOPOISK_MOVIE_THUMBNAIL_DEFAULT_HEIGHT,
                    0,      # Index.
                    1000)   # Big thumb should have the highest initial score.
            else:
                Log.Debug(' * BIG thumb is NOT found')
    except:
        Log.Debug(' * UNABLE to fetch BIG thumb')
        if IS_DEBUG:
            common.logException('failed to fetch BIG thumb')

    if bigThumb is not None:
        return bigThumb

    # There is no big thumbnail - fall back to the small one.
    Log.Debug(' * adding default (SMALL) thumb')
    return common.Thumbnail(
        None,
        KINOPOISK_MOVIE_THUMBNAIL % kinoPoiskId,
        KINOPOISK_MOVIE_THUMBNAIL_WIDTH,
        KINOPOISK_MOVIE_THUMBNAIL_HEIGHT,
        0,   # Index.
        0)   # Initial score.
def updateMediaItem(self, metadata, kinoPoiskId, lang):
    """
    Refreshes the metadata object from the KinoPoisk pages of a title.

    Nothing is changed when the title page cannot be loaded. Otherwise the
    parsing steps run in a fixed order; an exception raised by any step is
    logged and skips the steps that follow it.
    """
    titlePageUrl = S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
    titlePage = common.getElementFromHttpRequest(titlePageUrl, S.ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
        # The title page failed to load - leave the metadata untouched.
        return

    LOGGER.Debug('SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
    try:
        # Title, original title, ratings, and more.
        self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)

        # Look the movie up on TMDb to supplement our results with more data.
        # IMPORTANT: this must happen after parseInfoTableTagAndUpdateMetadata,
        # which populates the title and the year on the metadata object.
        # NOTE(review): the helper is named "...ImdbTitleId" while its result
        # is used as a TMDb id - confirm which id it actually returns.
        tmdbId = self.searchForImdbTitleId(metadata.title, metadata.year)

        # Studio.
        self.parseStudioPageData(metadata, kinoPoiskId)
        # Actors, etc.
        self.parseCastPageData(titlePage, metadata, kinoPoiskId)
        # Posters and background art.
        self.updateImagesMetadata(metadata, kinoPoiskId, tmdbId, lang)
    except:
        common.logException('failed to update metadata for id %s' % kinoPoiskId)
def parseYearInfo(infoRowElem, metadata):
    """
    Reads the release year from an info table row and stores it on metadata.

    The text of the first anchor inside the row is converted to an integer
    and assigned to metadata.year. A conversion failure is logged and leaves
    metadata.year unchanged; a row without anchor text is ignored.
    """
    yearNodes = infoRowElem.xpath('.//a/text()')
    if not len(yearNodes):
        return

    rawYear = yearNodes[0]
    Log.Debug(' ... parsed year: %s' % rawYear)
    try:
        metadata.year = int(rawYear)
    except:
        common.logException('unable to parse year')
def parseRatingInfo(page, metadata, kinoPoiskId):
    """
    Reads the rating from the "block_rating" element of a page.

    The first matching text node is converted to a float and assigned to
    metadata.rating. A conversion failure is logged; a page without the
    rating node is ignored. kinoPoiskId is not used by this function.
    """
    ratingNodes = page.xpath('.//*[@id="block_rating"]/div[1]/div[1]/a/span[1]/text()')
    if not len(ratingNodes):
        return

    try:
        parsedRating = float(ratingNodes[0])
        Log.Debug(' ... parsed rating "%s"' % str(parsedRating))
        metadata.rating = parsedRating
    except:
        common.logException('unable to parse rating')
def parseDurationInfo(infoRowElem, metadata):
    """
    Reads the movie duration from the "time" cell of an info table row.

    The first group captured by MATCHER_MOVIE_DURATION is converted to an
    integer, multiplied by 1000 and assigned to metadata.duration. Failures
    are logged; a row without the cell, or text that does not match, is
    ignored.
    """
    timeNodes = infoRowElem.xpath('./td[@class="time"]/text()')
    if len(timeNodes) == 0:
        return

    try:
        match = MATCHER_MOVIE_DURATION.search(timeNodes[0])
        if match is None:
            return
        # NOTE(review): the unit captured by MATCHER_MOVIE_DURATION is not
        # visible here; confirm that "* 1000" yields the unit expected by
        # metadata.duration.
        duration = int(match.groups(1)[0]) * 1000
        Log.Debug(' ... parsed duration: "%s"' % str(duration))
        metadata.duration = duration
    except:
        common.logException('unable to parse duration')
def parseOriginallyAvailableInfo(infoRowElem, metadata):
    """
    Reads the original release date from an info table row.

    The text of the first anchor is expected to hold three whitespace
    separated parts: day, month name and year. The month name is translated
    through RU_MONTH, the day is left-padded to two digits, and the resulting
    "year-month-day" string is parsed and stored as a date in
    metadata.originally_available_at. Any failure is logged.
    """
    dateNodes = infoRowElem.xpath('.//a/text()')
    if not len(dateNodes):
        return

    try:
        day, monthName, year = dateNodes[0].split()
        dateText = '-'.join((year, RU_MONTH[monthName], day.rjust(2, '0')))
        originalDate = Datetime.ParseDate(dateText).date()
        Log.Debug(' ... parsed originally available date: "%s"' % str(originalDate))
        metadata.originally_available_at = originalDate
    except:
        common.logException('unable to parse originally available date')
def updateMediaItem(self, metadata, kinoPoiskId):
    """
    Refreshes the metadata object from the KinoPoisk pages of a title.

    The title page is requested first; when it cannot be loaded nothing is
    changed. Otherwise the metadata is reset and the parsing steps run in a
    fixed order. An exception raised by any step is logged and skips the
    steps that follow it.
    """
    titlePageUrl = S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
    titlePage = common.getElementFromHttpRequest(titlePageUrl, S.ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
        # The title page failed to load - leave the metadata untouched.
        return

    LOGGER.Debug('SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
    try:
        self.resetMediaMetadata(metadata)
        # Title, original title, ratings, and more.
        self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)
        # Studio.
        self.parseStudioPageData(metadata, kinoPoiskId)
        # Actors, etc.
        self.parseCastPageData(titlePage, metadata, kinoPoiskId)
        # Posters.
        self.parsePostersPageData(metadata, kinoPoiskId)
        # Background art (stills).
        self.parseStillsPageData(metadata, kinoPoiskId)
    except:
        common.logException('failed to update metadata for id %s' % kinoPoiskId)
def updateMediaItem(self, metadata, kinoPoiskId):
    """
    Refreshes the metadata object from the KinoPoisk pages of a title.

    Nothing is changed when the title page cannot be loaded. Otherwise the
    metadata is reset and the module-level parsing functions run in a fixed
    order. An exception raised by any step is logged and skips the steps
    that follow it.
    """
    titlePageUrl = KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
    titlePage = common.getElementFromHttpRequest(titlePageUrl, ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
        return

    Log.Debug('got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
    try:
        resetMediaMetadata(metadata)
        # Title (in Russian).
        parseTitleInfo(titlePage, metadata)
        # Original title (in the original language).
        parseOriginalTitleInfo(titlePage, metadata)
        # Summary.
        parseSummaryInfo(titlePage, metadata)
        # Rating.
        parseRatingInfo(titlePage, metadata, kinoPoiskId)
        parseInfoTableTagAndUpdateMetadata(titlePage, metadata)
        # Studio.
        parseStudioInfo(metadata, kinoPoiskId)
        # Actors, etc.
        parsePeoplePageInfo(titlePage, metadata, kinoPoiskId)
        # Posters.
        parsePostersInfo(metadata, kinoPoiskId)
        # Background art.
        parseBackgroundArtInfo(metadata, kinoPoiskId)
    except:
        common.logException('failed to update metadata for id %s' % kinoPoiskId)
def updateMediaItem(self, metadata, kinoPoiskId, lang):
    """
    Refreshes the metadata object from the KinoPoisk pages of a title.

    Nothing is changed when the title page cannot be loaded. Otherwise the
    parsing steps run in a fixed order; an exception raised by any step is
    logged and skips the steps that follow it.
    """
    titlePageUrl = S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
    titlePage = common.getElementFromHttpRequest(titlePageUrl, S.ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
        # The title page failed to load - leave the metadata untouched.
        return

    LOGGER.Debug('SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
    try:
        # Title, original title, ratings, and more.
        self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)

        # Look the movie up on TMDb to supplement our results with more data.
        # IMPORTANT: this must happen after parseInfoTableTagAndUpdateMetadata,
        # which populates the title and the year on the metadata object.
        # NOTE(review): the helper is named "...ImdbTitleId" while its result
        # is used as a TMDb id - confirm which id it actually returns.
        tmdbId = self.searchForImdbTitleId(metadata.title, metadata.year)

        # Studio.
        self.parseStudioPageData(metadata, kinoPoiskId)
        # Actors, etc.
        self.parseCastPageData(titlePage, metadata, kinoPoiskId)
        # Posters and background art.
        self.updateImagesMetadata(metadata, kinoPoiskId, tmdbId, lang)
    except:
        common.logException('failed to update metadata for id %s' % kinoPoiskId)
def fetchImageDataPages(urlTemplate, kinoPoiskId, maxPages):
    """
    Fetches up to maxPages image gallery pages of a title.

    urlTemplate is formatted with a (kinoPoiskId, pageNumber) tuple. The
    first page is always requested. When it loads and maxPages is greater
    than 1, the number of the last page is read from the page navigator and
    the following pages are requested in order, never going past maxPages.
    Pages that fail to load are skipped.

    Returns the list of page elements that were loaded (possibly empty).
    """
    pages = []
    firstPage = common.getElementFromHttpRequest(urlTemplate % (kinoPoiskId, 1), ENCODING_KINOPOISK_PAGE)
    if firstPage is None:
        return pages
    pages.append(firstPage)
    if maxPages <= 1:
        return pages

    # The last "arrow" anchor of the navigator links to the final page.
    anchorElems = firstPage.xpath('//div[@class="navigator"]/ul/li[@class="arr"]/a')
    if not len(anchorElems):
        return pages

    nav = parseXpathElementValue(anchorElems[-1], './attribute::href')
    # NOTE(review): presumably the helper yields None when the attribute is
    # missing; re.search() would raise on it, so treat that as "no more pages".
    match = re.search(r'page\/(\d+?)\/$', nav) if nav is not None else None
    if match is None:
        return pages

    try:
        # Bound the range up front so that the page limit holds even when
        # the page at the limit itself fails to load.
        lastPageIndex = min(int(match.group(1)), maxPages)
        for pageIndex in range(2, lastPageIndex + 1):
            page = common.getElementFromHttpRequest(urlTemplate % (kinoPoiskId, pageIndex), ENCODING_KINOPOISK_PAGE)
            if page is not None:
                pages.append(page)
    except Exception:
        common.logException('unable to parse image art page')
    return pages
def updateImageMetadata(pages, metadata, maxImages, isPoster, thumb):
    """
    Selects the best images and stores their previews on the metadata object.

    The candidate list starts with thumb (when given). If more than one
    image is wanted, further candidates are parsed from the passed pages.
    Candidates are ordered by descending score, cut to maxImages, and a
    preview of each is stored in metadata.posters (isPoster is true) or
    metadata.art, keyed by the full image URL. Keys are finally validated
    against the list of images that were stored successfully.
    """
    candidates = [] if thumb is None else [thumb]

    if maxImages > 1:
        # Parse URLs from the passed pages; allow a couple of extras to choose from.
        budget = maxImages - len(candidates) + 2
        for page in pages:
            budget = parseImageDataFromPhotoTableTag(page, candidates, isPoster, budget)
            if not budget:
                break

    # Best score first, then chop off the extraneous images.
    candidates.sort(key=lambda t: t.score, reverse=True)
    candidates = candidates[:maxImages]
    if IS_DEBUG:
        common.printImageSearchResults(candidates)

    imagesContainer = metadata.posters if isPoster else metadata.art
    validNames = []
    for result in candidates:
        # Prefer the thumbnail URL for the preview; fall back to the full image.
        previewUrl = result.fullImgUrl if result.thumbImgUrl is None else result.thumbImgUrl
        try:
            # The sort order is the number of images stored so far.
            imagesContainer[result.fullImgUrl] = Proxy.Preview(
                HTTP.Request(previewUrl), sort_order=len(validNames))
            validNames.append(result.fullImgUrl)
        except:
            common.logException('Error generating preview for: "%s".' % str(previewUrl))
    imagesContainer.validate_keys(validNames)
def parsePeoplePageInfo(titlePage, metadata, kinoPoiskId):
    """
    Parses people - mostly actors - and adds the actors to the metadata.

    Actors listed on the main title page are the "main" actors. The dedicated
    'people' page is then walked to find their role names and, when
    PREFS.getAllActors is set, the remaining ("other") actors. Actors are
    added in this order: main actors found on the 'people' page, main actors
    not found there (without a role), other actors.

    When the 'people' page cannot be loaded, the main actors are added
    without roles. Directors and writers sections are recognized while
    walking the page, but only actors are added by this function.
    """
    # First, parse actors from the main title page.
    parseAllActors = PREFS.getAllActors
    actorsMap = parseActorsInfoIntoMap(titlePage)
    mainActors = []
    otherActors = []

    # Now, parse a dedicated 'people' page.
    page = common.getElementFromHttpRequest(KINOPOISK_PEOPLE % kinoPoiskId, ENCODING_KINOPOISK_PAGE)
    if page is None:
        Log.Debug('NO people page')
        for actorName in actorsMap:
            addActorToMetadata(metadata, actorName, None)
        return

    # The page is a flat sequence of elements: a <table> is a section header
    # that selects the person type, the <div> elements after it are people.
    personType = None
    peopleTags = page.xpath('//div[@id="content_block"]/table/tr/td/div[@class="block_left"]/*')
    for peopleTagElem in peopleTags:
        try:
            if peopleTagElem.tag == 'table':
                personType = None
                tagElems = peopleTagElem.xpath('./tr/td[@style="padding-left:20px;border-bottom:2px solid #f60;font-size:16px"]/text()')
                if len(tagElems):
                    tagName = tagElems[0]
                    if tagName == u'Актеры':
                        personType = 'actor'
                    elif tagName in (u'Директора фильма', u'Режиссеры'):
                        personType = 'director'
                    elif tagName == u'Сценаристы':
                        personType = 'writer'
                    elif tagName in (u'Операторы', u'Монтажеры', u'Композиторы', u'Художники'):
                        # Skip these tags for now.
                        personType = None
                        Log.Debug('skipping an unsupported tag "%s"' % tagName)
                    else:
                        Log.Debug('skipping an unknown tag "%s"' % tagName)
            elif peopleTagElem.tag == 'div':
                personNameElems = peopleTagElem.xpath('./div/div/div[@class="name"]/a/text()')
                personName = None
                if len(personNameElems):
                    personName = personNameElems[0]
                # An entry without a name can not be matched or added as an actor.
                if personType == 'actor' and personName is not None:
                    actorRoleElems = peopleTagElem.xpath('./div/div/div[@class="role"]/text()')
                    if len(actorRoleElems):
                        # No str() here: on Python 2 it would implicitly encode
                        # the text and can fail on non-ASCII role names.
                        roleName = actorRoleElems[0].strip().strip('. ')
                        if personName in actorsMap:
                            Log.Debug(' . . . . parsed main actor "%s" with role "%s"' % (personName, roleName))
                            mainActors.append((personName, roleName))
                            del actorsMap[personName]
                        elif parseAllActors:
                            Log.Debug(' . . . . parsed other actor "%s" with role "%s"' % (personName, roleName))
                            otherActors.append((personName, roleName))
            else:
                personType = None
        except Exception:
            common.logException('unable to parse a people tag')

    # Adding main actors that were found on the 'people' page.
    for personName, roleName in mainActors:
        addActorToMetadata(metadata, personName, roleName)
    # Adding main actors that were NOT found on the 'people' page.
    for actorName in actorsMap:
        addActorToMetadata(metadata, actorName, None)
    # Adding other actors if requested.
    for personName, roleName in otherActors:
        addActorToMetadata(metadata, personName, roleName)
def search(self, results, media, lang, manual=False):
    """
    Searches for matches on KinoPoisk using the title and year passed via
    the media object. All matches are saved in a list of results as
    MetadataSearchResult objects. For each result, we determine a page id,
    title, year, and the score (how good we think the match is on the scale
    of 1 - 100). The results are sorted by descending score at the end.
    """
    Log.Debug('SEARCH START <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    mediaName = media.name
    mediaYear = media.year
    Log.Debug('searching for name="%s", year="%s", guid="%s", hash="%s"...' %
        (str(mediaName), str(mediaYear), str(media.guid), str(media.hash)))

    # Fetch the search results page.
    Log.Debug('quering kinopoisk...')
    quotedName = urllib.quote(mediaName.encode(ENCODING_KINOPOISK_PAGE))
    page = common.getElementFromHttpRequest(KINOPOISK_SEARCH % quotedName, ENCODING_KINOPOISK_PAGE)
    Log.Debug('Loading page "%s"' % quotedName)

    if page is None:
        Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
    else:
        # The page was loaded - take the list of all movie titles from it.
        Log.Debug('got a kinopoisk page to parse...')
        divInfoElems = page.xpath('//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..')
        itemIndex = 0
        altTitle = None
        if len(divInfoElems):
            Log.Debug('found %d results' % len(divInfoElems))
            for divInfoElem in divInfoElems:
                # Start every result with no alternative title, so that a
                # failed parse can not reuse the value of the previous result.
                altTitle = None
                try:
                    anchorFilmElem = divInfoElem.xpath('./a[contains(@href,"/level/1/film/")]/attribute::href')
                    if len(anchorFilmElem):
                        # Parse kinopoisk movie title id, title and year.
                        match = re.search(r'\/film\/(.+?)\/', anchorFilmElem[0])
                        if match is None:
                            Log.Error('unable to parse movie title id')
                        else:
                            kinoPoiskId = match.group(1)
                            title = common.getXpathRequiredNode(divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()')
                            year = common.getXpathOptionalNode(divInfoElem, './/span[@class="year"]/text()')
                            # Try to parse the alternative (original) title. Ignore failures.
                            # This is a <span> below the title <a> tag.
                            try:
                                altTitle = common.getXpathOptionalNode(divInfoElem, '../span[1]/text()')
                                if altTitle is not None:
                                    altTitle = altTitle.split(',')[0].strip()
                            except Exception:
                                # Best effort only: score the result without an alternative title.
                                altTitle = None
                            score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex)
                            results.Append(MetadataSearchResult(id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
                    else:
                        Log.Warn('unable to find film anchor elements for title "%s"' % mediaName)
                except Exception:
                    common.logException('failed to parse div.info container')
                # The position on the page is passed to the scoring function.
                itemIndex += 1
        else:
            Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
            # TODO(zhenya): investigate whether we need this clause at all (haven't seen this happening).
            # If no title text was found there, the site presumably returned the movie page directly.
            # try:
            #     title = page.xpath('//h1[@class="moviename-big"]/text()')[0].strip()
            #     kinoPoiskId = re.search('\/film\/(.+?)\/', page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0]).groups(0)[0]
            #     year = page.xpath('//a[contains(@href,"year")]/text()')[0].strip()
            #     score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex)
            #     results.Append(MetadataSearchResult(id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
            # except:
            #     common.logException('failed to parse a KinoPoisk page')

    # Sort results according to their score.
    results.Sort('score', descending=True)
    if IS_DEBUG:
        common.printSearchResults(results)
    Log.Debug('SEARCH END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')