def updateMediaItem(self, metadata, kinoPoiskId):
    """Refill `metadata` from the KinoPoisk pages of title `kinoPoiskId`.

    The title page is requested first; if it cannot be loaded, nothing is
    touched. Otherwise the metadata object is passed through
    resetMediaMetadata and then through the individual page parsers (info
    table, studio, cast, posters, stills). An error raised by any of those
    steps is reported through common.logException and is not propagated.
    """
    titlePageUrl = S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
    titlePage = common.getElementFromHttpRequest(
        titlePageUrl, S.ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
        # Don't update anything if the title page failed to load.
        return

    LOGGER.Debug(
        'SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
    try:
        self.resetMediaMetadata(metadata)
        # Title, original title, ratings, and more.
        self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)
        # Studio.
        self.parseStudioPageData(metadata, kinoPoiskId)
        # Actors, etc.
        self.parseCastPageData(titlePage, metadata, kinoPoiskId)
        # Posters.
        self.parsePostersPageData(metadata, kinoPoiskId)
        # Background art (stills).
        self.parseStillsPageData(metadata, kinoPoiskId)
    except:
        common.logException('failed to update metadata for id %s' % kinoPoiskId)
def updateMediaItem(self, metadata, kinoPoiskId, lang):
    """Refill `metadata` from KinoPoisk, supplemented by a TMDb lookup.

    The title page for `kinoPoiskId` is requested first; if it cannot be
    loaded, nothing is touched. Otherwise the info table, studio and cast
    parsers are run, and the images step receives the id returned by
    searchForImdbTitleId together with `lang`. An error raised by any step
    is reported through common.logException and is not propagated.
    """
    titlePageUrl = S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
    titlePage = common.getElementFromHttpRequest(
        titlePageUrl, S.ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
        # Don't update anything if the title page failed to load.
        return

    LOGGER.Debug(
        'SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
    try:
        # Title, original title, ratings, and more.
        self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)

        # Look the movie up on TMDb to supplement our results with more data.
        # IMPORTANT: this has to come after the info table step, which (per
        # the original note) populates metadata.title and metadata.year.
        tmdbId = self.searchForImdbTitleId(metadata.title, metadata.year)

        # Studio.
        self.parseStudioPageData(metadata, kinoPoiskId)
        # Actors, etc.
        self.parseCastPageData(titlePage, metadata, kinoPoiskId)
        # Posters and background art.
        self.updateImagesMetadata(metadata, kinoPoiskId, tmdbId, lang)
    except:
        common.logException('failed to update metadata for id %s' % kinoPoiskId)
def parseImageDataFromAnchorElement(anchorElem, index):
    """Build a common.Thumbnail from an image anchor element.

    The anchor's nested <img> supplies the thumbnail URL; the anchor's href
    points to a proxy page whose <img id="image"> supplies the full size URL
    and dimensions. When no full size URL can be found the thumbnail URL is
    used in its place.

    Returns None when neither URL is available, otherwise a Thumbnail
    created with `index` and an initial score of 0.
    """
    fullSizeUrl = None
    fullSizeDimensions = (None, None)
    proxyPageHref = anchorElem.get('href')

    # Thumbnail: the <img> directly inside the anchor.
    thumbSizeUrl = None
    thumbImg = parseXpathElementValue(anchorElem, './img')
    if thumbImg is not None:
        thumbSrc = thumbImg.get('src')
        if thumbSrc is not None:
            thumbSizeUrl = ensureAbsoluteUrl(thumbSrc)

    # Full size image: lives on the page the anchor links to.
    if proxyPageHref is not None:
        proxyPage = common.getElementFromHttpRequest(
            ensureAbsoluteUrl(proxyPageHref), ENCODING_KINOPOISK_PAGE)
        if proxyPage is not None:
            fullImg = parseXpathElementValue(proxyPage, '//img[@id="image"]')
            if fullImg is not None:
                fullSizeUrl = fullImg.get('src')
                fullSizeDimensions = parseImageElemDimensions(fullImg)

    if fullSizeUrl is None:
        if thumbSizeUrl is None:
            # Nothing usable was found at all.
            return None
        # If we have no full size image URL, we can use the thumb's.
        Log.Debug('found no full size image, will use the thumbnail')
        fullSizeUrl = thumbSizeUrl

    return common.Thumbnail(
        thumbSizeUrl,
        ensureAbsoluteUrl(fullSizeUrl),
        fullSizeDimensions[0],
        fullSizeDimensions[1],
        index,
        0)  # Initial score.
def searchForImdbTitles(mediaName, mediaYear, lang):
    """ Queries TMDb for the given media name and returns scored matches.

      The name is lower-cased and its spaces are replaced with '%20' before
      it is substituted into TMDB_GETINFO. Every <movie> element of the
      response becomes a dict with the keys 'id', 'name', 'year' and 'score';
      elements that fail to parse are skipped with a warning.

      `lang` is accepted but not used by this function.

      Returns a list of match dicts; the list is empty when no page could be
      loaded.
    """
    mediaName = mediaName.lower()
    requestUrl = TMDB_GETINFO % mediaName.replace(' ', '%20')
    page = common.getElementFromHttpRequest(requestUrl, TMDB_PAGE_ENCODING)
    matches = []
    if page is None:
        Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)
        return matches

    for movieElem in page.xpath('//movies/movie'):
        try:
            imdbId = common.getXpathRequiredText(movieElem, './imdb_id/text()')
            title = common.getXpathRequiredText(movieElem, './name/text()')
            altTitle = common.getXpathOptionalText(
                movieElem, './alternative_name/text()')
            releaseDate = common.getXpathOptionalText(
                movieElem, './released/text()')
            year = common.getReOptionalGroup(MATCHER_RELEASED, releaseDate, 0)
            # The item index only advances for successfully parsed movies, so
            # it always equals the number of matches collected so far.
            score = common.scoreMediaTitleMatch(
                mediaName, mediaYear, title, altTitle, year, len(matches))
            matches.append(
                {'id': imdbId, 'name': title, 'year': year, 'score': score})
        except:
            Log.Warn('failed to parse movie element')
    return matches
def parseStudioInfo(metadata, kinoPoiskId):
    """Set metadata.studio from the KinoPoisk studio page of a title.

    Requests the KINOPOISK_STUDIO page for `kinoPoiskId` and collects the
    link texts from the table rows that follow the u'Производство:'
    ("Production:") header cell. Only the first studio found is stored;
    `metadata` is left untouched when the page cannot be loaded or lists no
    studios.
    """
    page = common.getElementFromHttpRequest(
        KINOPOISK_STUDIO % kinoPoiskId, ENCODING_KINOPOISK_PAGE)
    # Compare against None explicitly: truth-testing a parsed element reports
    # whether it has children (and is deprecated), not whether the request
    # succeeded. NOTE(review): assumes `page` is an lxml/ElementTree element,
    # as suggested by the .xpath() call below.
    if page is None:
        return

    studios = page.xpath(
        u'//table/tr/td[b="Производство:"]/../following-sibling::tr/td/a/text()')
    if studios:
        # Take only the first studio.
        studio = studios[0].strip()
        Log.Debug(' ... parsed studio: %s' % studio)
        metadata.studio = studio
def fetchImageDataPages(urlTemplate, kinoPoiskId, maxPages):
    """Fetch up to `maxPages` paginated image listing pages for a title.

    `urlTemplate` is formatted with (kinoPoiskId, pageIndex). The first page
    is always requested; when `maxPages` is greater than one, the number of
    the last page is read from the last "arr" link of the page navigator and
    pages 2..min(last, maxPages) are requested as well. Pages that fail to
    load are skipped.

    Returns the list of successfully loaded pages (possibly empty).
    """
    pages = []
    page = common.getElementFromHttpRequest(
        urlTemplate % (kinoPoiskId, 1), ENCODING_KINOPOISK_PAGE)
    if page is None:
        return pages
    pages.append(page)

    if maxPages <= 1:
        return pages

    anchorElems = page.xpath('//div[@class="navigator"]/ul/li[@class="arr"]/a')
    if not anchorElems:
        return pages

    # The last navigator arrow links to the final page: ".../page/<N>/".
    nav = parseXpathElementValue(anchorElems[-1], './attribute::href')
    if nav is None:
        # No href to inspect; re.search would raise on None.
        return pages

    match = re.search(r'page/(\d+?)/$', nav)
    if match is None:
        return pages

    try:
        # Cap the range itself so the limit holds even when the page with
        # index `maxPages` fails to load.
        lastPageIndex = min(int(match.group(1)), maxPages)
        for pageIndex in range(2, lastPageIndex + 1):
            page = common.getElementFromHttpRequest(
                urlTemplate % (kinoPoiskId, pageIndex), ENCODING_KINOPOISK_PAGE)
            if page is not None:
                pages.append(page)
    except Exception:
        common.logException('unable to parse image art page')
    return pages
def updateMediaItem(self, metadata, kinoPoiskId):
    """Update `metadata` using the KinoPoisk pages of title `kinoPoiskId`.

    Nothing happens when the title page cannot be loaded. Otherwise
    resetMediaMetadata is called on the metadata object, followed by the
    info table, studio, cast, posters and stills parsers. Errors raised by
    those steps are logged via common.logException and swallowed.
    """
    titlePage = common.getElementFromHttpRequest(
        S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId,
        S.ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
        # Don't update anything if the title page failed to load.
        return

    successMessage = 'SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId
    LOGGER.Debug(successMessage)
    try:
        self.resetMediaMetadata(metadata)
        # Title, original title, ratings, and more.
        self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)
        # Studio.
        self.parseStudioPageData(metadata, kinoPoiskId)
        # Actors, etc.
        self.parseCastPageData(titlePage, metadata, kinoPoiskId)
        # Posters.
        self.parsePostersPageData(metadata, kinoPoiskId)
        # Background art (stills).
        self.parseStillsPageData(metadata, kinoPoiskId)
    except:
        common.logException('failed to update metadata for id %s' % kinoPoiskId)
def updateMediaItem(self, metadata, kinoPoiskId):
    """Update `metadata` from the KinoPoisk pages of title `kinoPoiskId`.

    Nothing happens when the title page cannot be loaded. Otherwise the
    metadata object is passed to resetMediaMetadata and then to each of the
    module-level parsers in turn. Errors raised along the way are logged via
    common.logException and swallowed.
    """
    titlePageUrl = KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
    titlePage = common.getElementFromHttpRequest(
        titlePageUrl, ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
        return

    Log.Debug('got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
    try:
        resetMediaMetadata(metadata)

        # Data taken from the already loaded title page.
        parseTitleInfo(titlePage, metadata)          # Title (in Russian).
        parseOriginalTitleInfo(titlePage, metadata)  # Title in the original language.
        parseSummaryInfo(titlePage, metadata)        # Summary.
        parseRatingInfo(titlePage, metadata, kinoPoiskId)  # Rating.
        parseInfoTableTagAndUpdateMetadata(titlePage, metadata)

        # Data that takes the title id (and so may load further pages).
        parseStudioInfo(metadata, kinoPoiskId)                 # Studio.
        parsePeoplePageInfo(titlePage, metadata, kinoPoiskId)  # Actors, etc.
        parsePostersInfo(metadata, kinoPoiskId)                # Posters.
        parseBackgroundArtInfo(metadata, kinoPoiskId)          # Background art.
    except:
        common.logException('failed to update metadata for id %s' % kinoPoiskId)
def updateMediaItem(self, metadata, kinoPoiskId, lang):
    """Update `metadata` from KinoPoisk and pass a TMDb id to the images step.

    Nothing happens when the title page for `kinoPoiskId` cannot be loaded.
    Otherwise the info table, studio and cast parsers are run, and
    updateImagesMetadata is called with the id returned by
    searchForImdbTitleId and with `lang`. Errors raised by any step are
    logged via common.logException and swallowed.
    """
    titlePage = common.getElementFromHttpRequest(
        S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId,
        S.ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
        # Don't update anything if the title page failed to load.
        return

    successMessage = 'SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId
    LOGGER.Debug(successMessage)
    try:
        # Title, original title, ratings, and more.
        self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)

        # Search for the movie on TMDb to supplement our results with more
        # data. IMPORTANT: keep this after the info table step, which (per
        # the original note) populates the title and the year on `metadata`.
        tmdbId = self.searchForImdbTitleId(metadata.title, metadata.year)

        # Studio.
        self.parseStudioPageData(metadata, kinoPoiskId)
        # Actors, etc.
        self.parseCastPageData(titlePage, metadata, kinoPoiskId)
        # Posters and background art.
        self.updateImagesMetadata(metadata, kinoPoiskId, tmdbId, lang)
    except:
        common.logException('failed to update metadata for id %s' % kinoPoiskId)
def searchForImdbTitles(mediaName, mediaYear, lang):
    """ Looks a media name up on TMDb and returns the scored candidates.

      The lower-cased name (spaces replaced with '%20') is substituted into
      TMDB_GETINFO and the resulting page is requested. For each <movie>
      element a dict with the keys 'id', 'name', 'year' and 'score' is
      produced; a movie element that cannot be parsed is skipped with a
      warning.

      `lang` is accepted but not used by this function.

      Returns the list of match dicts (empty when the page failed to load).
    """
    mediaName = mediaName.lower()
    encodedName = mediaName.replace(' ', '%20')
    page = common.getElementFromHttpRequest(
        TMDB_GETINFO % encodedName, TMDB_PAGE_ENCODING)
    matches = []
    if page is None:
        Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)
        return matches

    for movieElem in page.xpath('//movies/movie'):
        try:
            imdbId = common.getXpathRequiredText(movieElem, './imdb_id/text()')
            title = common.getXpathRequiredText(movieElem, './name/text()')
            altTitle = common.getXpathOptionalText(
                movieElem, './alternative_name/text()')
            releaseDate = common.getXpathOptionalText(
                movieElem, './released/text()')
            year = common.getReOptionalGroup(MATCHER_RELEASED, releaseDate, 0)
            # Only successfully parsed movies advance the item index, so it is
            # the same as the number of matches gathered so far.
            itemIndex = len(matches)
            score = common.scoreMediaTitleMatch(
                mediaName, mediaYear, title, altTitle, year, itemIndex)
            match = {
                'id': imdbId,
                'name': title,
                'year': year,
                'score': score
            }
            matches.append(match)
        except:
            Log.Warn('failed to parse movie element')
    return matches
def parsePeoplePageInfo(titlePage, metadata, kinoPoiskId):
    """ Parses people (mostly actors) and adds them to the metadata.

      Main actors are first collected from the title page via
      parseActorsInfoIntoMap. The dedicated 'people' page is then walked to
      find role names for them and, when PREFS.getAllActors is set, to
      collect the remaining actors too. If the 'people' page cannot be
      loaded, the main actors are added without roles.

      Actors are added in this order: main actors found on the 'people'
      page, main actors not found there (role None), then other actors.
    """
    # First, parse actors from the main title page.
    parseAllActors = PREFS.getAllActors
    actorsMap = parseActorsInfoIntoMap(titlePage)
    mainActors = []
    otherActors = []

    # Now, parse a dedicated 'people' page.
    page = common.getElementFromHttpRequest(
        KINOPOISK_PEOPLE % kinoPoiskId, ENCODING_KINOPOISK_PAGE)
    if page is None:
        Log.Debug('NO people page')
        for actorName in actorsMap.keys():
            addActorToMetadata(metadata, actorName, None)
        return

    # Section headers seen on the page, grouped by how they are handled.
    directorTags = (u'Директора фильма', u'Режиссеры')
    unsupportedTags = (u'Операторы', u'Монтажеры', u'Композиторы', u'Художники')

    # A <table> element is a section header that decides the person type of
    # the <div> entries following it; any other element clears the type.
    personType = None
    peopleTags = page.xpath(
        '//div[@id="content_block"]/table/tr/td/div[@class="block_left"]/*')
    for peopleTagElem in peopleTags:
        try:
            if peopleTagElem.tag == 'table':
                personType = None
                tagElems = peopleTagElem.xpath(
                    './tr/td[@style="padding-left:20px;border-bottom:2px solid #f60;font-size:16px"]/text()')
                if len(tagElems):
                    tagName = tagElems[0]
                    if tagName == u'Актеры':
                        personType = 'actor'
                    elif tagName in directorTags:
                        personType = 'director'
                    elif tagName == u'Сценаристы':
                        personType = 'writer'
                    elif tagName in unsupportedTags:
                        # Skip these tags for now.
                        personType = None
                        Log.Debug('skipping an unsupported tag "%s"' % tagName)
                    else:
                        Log.Debug('skipping an unknown tag "%s"' % tagName)
            elif peopleTagElem.tag == 'div':
                personNameElems = peopleTagElem.xpath(
                    './div/div/div[@class="name"]/a/text()')
                personName = personNameElems[0] if len(personNameElems) else None
                if personType == 'actor':
                    actorRoleElems = peopleTagElem.xpath(
                        './div/div/div[@class="role"]/text()')
                    if len(actorRoleElems):
                        roleName = str(actorRoleElems[0]).strip().strip('. ')
                        actorEntry = (personName, roleName)
                        if personName in actorsMap:
                            Log.Debug(' . . . . parsed main actor "%s" with role "%s"' % (personName, roleName))
                            mainActors.append(actorEntry)
                            # Whatever stays in the map was not found on this page.
                            del actorsMap[personName]
                        elif parseAllActors:
                            Log.Debug(' . . . . parsed other actor "%s" with role "%s"' % (personName, roleName))
                            otherActors.append(actorEntry)
            else:
                personType = None
        except:
            common.logException('unable to parse a people tag')

    # Adding main actors that were found on the 'people' page.
    for personName, roleName in mainActors:
        addActorToMetadata(metadata, personName, roleName)

    # Adding main actors that were NOT found on the 'people' page.
    for actorName in actorsMap.keys():
        addActorToMetadata(metadata, actorName, None)

    # Adding other actors if requested.
    for personName, roleName in otherActors:
        addActorToMetadata(metadata, personName, roleName)
def search(self, results, media, lang, manual=False):
    """ Searches for matches on KinoPoisk using the title and year passed via
      the media object.

      All matches are appended to `results` as MetadataSearchResult objects.
      For each result we determine a page id, title, year, and the score
      (how good we think the match is on the scale of 1 - 100). The results
      are sorted by score, best first, before returning.

      `manual` is accepted but not used by this method.
    """
    Log.Debug('SEARCH START <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    mediaName = media.name
    mediaYear = media.year
    Log.Debug('searching for name="%s", year="%s", guid="%s", hash="%s"...' %
              (str(mediaName), str(mediaYear), str(media.guid), str(media.hash)))

    # Fetch the search page. The query is quoted once and reused for logging.
    Log.Debug('quering kinopoisk...')
    quotedName = urllib.quote(mediaName.encode(ENCODING_KINOPOISK_PAGE))
    page = common.getElementFromHttpRequest(
        KINOPOISK_SEARCH % quotedName, ENCODING_KINOPOISK_PAGE)
    Log.Debug('Loading page "%s"' % quotedName)

    if page is None:
        Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
    else:
        # The page was received; take the list of all movie titles from it.
        Log.Debug('got a kinopoisk page to parse...')
        divInfoElems = page.xpath(
            '//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..')
        if len(divInfoElems):
            Log.Debug('found %d results' % len(divInfoElems))
            # The index advances for every result, parsed or not, because it
            # is the result's position on the page and feeds into the score.
            for itemIndex, divInfoElem in enumerate(divInfoElems):
                try:
                    anchorFilmElem = divInfoElem.xpath(
                        './a[contains(@href,"/level/1/film/")]/attribute::href')
                    if not len(anchorFilmElem):
                        Log.Warn('unable to find film anchor elements for title "%s"' % mediaName)
                        continue

                    # Parse kinopoisk movie title id, title and year.
                    match = re.search(r'/film/(.+?)/', anchorFilmElem[0])
                    if match is None:
                        Log.Error('unable to parse movie title id')
                        continue

                    kinoPoiskId = match.group(1)
                    title = common.getXpathRequiredNode(
                        divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()')
                    year = common.getXpathOptionalNode(
                        divInfoElem, './/span[@class="year"]/text()')

                    # Try to parse the alternative (original) title. This is a
                    # <span> below the title <a> tag. Failures are ignored, but
                    # the value is reset for every result so that a failure
                    # never reuses the previous result's alternative title.
                    altTitle = None
                    try:
                        altTitle = common.getXpathOptionalNode(
                            divInfoElem, '../span[1]/text()')
                        if altTitle is not None:
                            altTitle = altTitle.split(',')[0].strip()
                    except Exception:
                        altTitle = None

                    score = common.scoreMediaTitleMatch(
                        mediaName, mediaYear, title, altTitle, year, itemIndex)
                    results.Append(MetadataSearchResult(
                        id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
                except Exception:
                    common.logException('failed to parse div.info container')
        else:
            Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
            # TODO(zhenya): investigate whether we need this clause at all
            # (haven't seen this happening).
            # If no title text was found, the site has (hopefully) given us
            # the movie page directly.
            # try:
            #   title = page.xpath('//h1[@class="moviename-big"]/text()')[0].strip()
            #   kinoPoiskId = re.search('\/film\/(.+?)\/', page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0]).groups(0)[0]
            #   year = page.xpath('//a[contains(@href,"year")]/text()')[0].strip()
            #   score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex)
            #   results.Append(MetadataSearchResult(id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
            # except:
            #   common.logException('failed to parse a KinoPoisk page')

    # Sort results according to their score.
    results.Sort('score', descending=True)
    if IS_DEBUG:
        common.printSearchResults(results)
    Log.Debug('SEARCH END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')