def localTest_scoreMediaTitleMatch(self):
    """Check common.scoreMediaTitleMatch against two fixed inputs."""
    # Each case pairs the expected score with the scorer's positional
    # arguments, in the order they are passed to the scorer.
    cases = (
        (92, ('Gladiatory Rima', '2012', u'Гладиаторы Рима',
              'Gladiatori di Roma', '2012', 3)),
        (94, (u'Кавказская пленница', '1966',
              u'Кавказская пленница, или Новые приключения Шурика',
              None, '1966', 0)),
    )
    for expectedScore, scorerArgs in cases:
        actualScore = common.scoreMediaTitleMatch(*scorerArgs)
        self._assertEquals(expectedScore, actualScore, 'Wrong score.')
def queryKinoPoisk(self, mediaName, mediaYear):
    """Search the KinoPoisk mobile site for a movie title.

    Results are returned in page order (no sorting is done here).

    @param mediaName: media title to search for.
    @param mediaYear: media year; only forwarded to the title scorer.
    @return: list of [kinoPoiskId, title, year, score] lists; empty when the
        search page could not be fetched.
    """
    results = []
    try:
        encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
    except UnicodeError:
        # The name cannot be converted to the page encoding; fall back to
        # UTF-8 so the query can still be issued.
        encodedName = urllib.quote(mediaName.encode('utf-8'))

    page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % encodedName)
    if page is None:
        self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
        return results

    # The page was fetched; extract the list of all movie titles from it.
    self.log.Debug('got a KinoPoisk query results page to parse...')
    entryMatcher = re.compile(
        r'<span><a href="http://m\.kinopoisk\.ru/movie/(\d+)/">(.+?)</a><br />(.+?)</span>')
    # "<title>, <4-digit year>" at the end of the anchor text.
    titleYearMatcher = re.compile(r'^(.+?),\s*(\d{4})$')
    entries = entryMatcher.findall(page)

    # Inspect query results titles and score them.
    self.log.Debug('found %d results (div info tags)' % len(entries))
    for itemIndex, (itemKinoPoiskId, itemTitleitemYear, itemAltTitle) in enumerate(entries):
        # NOTE(review): as written this removes every space from the
        # alternative title; possibly a non-breaking space or an HTML entity
        # was intended - confirm against the upstream file.
        itemAltTitle = itemAltTitle.replace(' ', '')

        # Only treat the tail as a year when it really is four digits; a
        # plain comma inside the title must not truncate the title.
        titleYear = titleYearMatcher.match(itemTitleitemYear.strip())
        if titleYear is None:
            itemYear = None
            itemTitle = itemTitleitemYear
        else:
            itemTitle, itemYear = titleYear.groups()

        itemScore = common.scoreMediaTitleMatch(
            mediaName, mediaYear, itemTitle, itemAltTitle, itemYear, itemIndex)
        results.append([itemKinoPoiskId, itemTitle, itemYear, itemScore])
    return results
def searchForImdbTitles(mediaName, mediaYear, lang):
    """Look up movie title matches for a media name on TMDb.

    @param mediaName: media title; it is lower-cased and its spaces are
        replaced with '%20' before being placed into the request URL.
    @param mediaYear: media year; only forwarded to the title scorer.
    @param lang: not used by this function.
    @return: list of dicts with 'id' (IMDb id), 'name', 'year' and 'score'
        keys in page order; empty when no page was fetched.
    """
    mediaName = mediaName.lower()
    page = common.getElementFromHttpRequest(
        TMDB_GETINFO % mediaName.replace(' ', '%20'), TMDB_PAGE_ENCODING)
    matches = []
    if page is None:
        Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)
        return matches

    itemIndex = 0
    for movieElem in page.xpath('//movies/movie'):
        try:
            imdbId = common.getXpathRequiredText(movieElem, './imdb_id/text()')
            title = common.getXpathRequiredText(movieElem, './name/text()')
            altTitle = common.getXpathOptionalText(movieElem, './alternative_name/text()')
            releaseDate = common.getXpathOptionalText(movieElem, './released/text()')
            year = common.getReOptionalGroup(MATCHER_RELEASED, releaseDate, 0)
            score = common.scoreMediaTitleMatch(
                mediaName, mediaYear, title, altTitle, year, itemIndex)
            matches.append({'id': imdbId, 'name': title, 'year': year, 'score': score})
            # Only successfully parsed elements advance the result position.
            itemIndex += 1
        except Exception:
            # Skip a malformed element and keep going; unlike a bare except
            # this lets KeyboardInterrupt/SystemExit propagate.
            Log.Warn('failed to parse movie element')
    return matches
def searchForImdbTitles(self, mediaName, mediaYear, lang='ru', searchByTitleOnly=False):
    """Search TMDb for titles matching a media name and return scored results.

    @param mediaName: media name string; lower-cased before querying.
    @param mediaYear: (optional) year passed as the search year filter unless
        searchByTitleOnly is set; always forwarded to the title scorer.
    @param lang: (optional) language value passed to the search call.
    @param searchByTitleOnly: when true, an empty year filter is sent.
    @return: list of dicts ('id', 'title', 'year', 'score') ordered by
        descending score.
    """
    mediaName = mediaName.lower()
    search = Search(self.httpUtils)
    response = search.movie(
        query=mediaName.encode(S.TMDB_PAGE_ENCODING),
        year='' if searchByTitleOnly else mediaYear,
        language=lang)

    results = []
    if response is None:
        self.log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)
    else:
        for entry in search.results:
            try:
                tmdbId = str(entry['id'])
                title = entry['title']
                altTitle = entry['original_title']
                year = common.getReOptionalGroup(MATCHER_RELEASED, entry['release_date'], 0)
                # The position passed to the scorer counts only entries that
                # were parsed successfully, i.e. the current size of results.
                score = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, title, altTitle, year, len(results))
                results.append({'id': tmdbId, 'title': title, 'year': year, 'score': score})
            except:
                self.log.Warn('failed to parse movie element')

    orderedResults = sorted(results, key=lambda item: item['score'], reverse=True)
    if self.isDebug:
        self.log.Debug('Search produced %d results:' % len(orderedResults))
        for index, result in enumerate(orderedResults):
            self.log.Debug(
                ' ... %d: id="%s", title="%s", year="%s", score="%d".'
                % (index, result['id'], result['title'], str(result['year']), result['score']))
    return orderedResults
def queryKinoPoisk(self, mediaName, mediaYear):
    """Search the KinoPoisk mobile site for a movie title.

    Results are returned in page order (no sorting is done here).

    @param mediaName: media title to search for.
    @param mediaYear: media year; used for logging and forwarded to the
        title scorer.
    @return: list of [kinoPoiskId, title, year, score] lists; empty when the
        search page could not be fetched.
    """
    results = []
    try:
        encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
    except UnicodeError:
        # The name cannot be converted to the page encoding; fall back to
        # UTF-8 so the query can still be issued.
        encodedName = urllib.quote(mediaName.encode('utf-8'))

    page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % encodedName)
    if page is None:
        self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
        return results

    # The page was fetched; extract the list of all movie titles from it.
    self.log.Debug('**** Finding %s (%s) got a KinoPoisk...' % (mediaName, mediaYear))
    entryMatcher = re.compile(
        r'<span><a href="http://m\.kinopoisk\.ru/movie/(\d+)/">(.+?)</a><br />(.+?)</span>')
    # Compiled once (raw string) instead of on every loop iteration.
    titleYearMatcher = re.compile(r'^(.+?), (\d\d\d\d)$')
    entries = entryMatcher.findall(page)

    # Inspect query results titles and score them.
    self.log.Debug('found %d results (div info tags)' % len(entries))
    for itemIndex, (itemKinoPoiskId, itemTitleitemYear, itemAltTitle) in enumerate(entries):
        # NOTE(review): as written this removes every space from the
        # alternative title; possibly a non-breaking space or an HTML entity
        # was intended - confirm against the upstream file.
        itemAltTitle = itemAltTitle.replace(' ', '')

        # An explicit "no match" test replaces the former exception-driven
        # fallback; entries without a trailing year keep the full text.
        titleYear = titleYearMatcher.match(itemTitleitemYear.strip())
        if titleYear is None:
            itemYear = None
            itemTitle = itemTitleitemYear
        else:
            itemTitle, itemYear = titleYear.groups()

        itemScore = common.scoreMediaTitleMatch(
            mediaName, mediaYear, itemTitle, itemAltTitle, itemYear, itemIndex)
        results.append([itemKinoPoiskId, itemTitle, itemYear, itemScore])
    return results
def searchForImdbTitles(self, mediaName, mediaYear, lang='ru', searchByTitleOnly=False):
    """Query TMDb for a media name and return the matches, best score first.

    @param mediaName: media name string; lower-cased before querying.
    @param mediaYear: (optional) year used as the search year filter unless
        searchByTitleOnly is set; always forwarded to the title scorer.
    @param lang: (optional) language value passed to the search call.
    @param searchByTitleOnly: when true, an empty year filter is sent.
    @return: list of dicts ('id', 'title', 'year', 'score') ordered by
        descending score.
    """
    if searchByTitleOnly:
        yearFilter = ''
    else:
        yearFilter = mediaYear

    mediaName = mediaName.lower()
    search = Search(self.httpUtils)
    response = search.movie(
        query=mediaName.encode(S.TMDB_PAGE_ENCODING),
        year=yearFilter,
        language=lang)

    found = []
    if response is not None:
        position = 0
        for movie in search.results:
            try:
                movieId = str(movie['id'])
                movieTitle = movie['title']
                originalTitle = movie['original_title']
                releaseYear = common.getReOptionalGroup(
                    MATCHER_RELEASED, movie['release_date'], 0)
                movieScore = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, movieTitle, originalTitle, releaseYear, position)
                found.append({
                    'id': movieId,
                    'title': movieTitle,
                    'year': releaseYear,
                    'score': movieScore,
                })
                # Entries that failed to parse do not consume a position.
                position += 1
            except:
                self.log.Warn('failed to parse movie element')
    else:
        self.log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)

    # In-place stable sort of the local list: highest score first.
    found.sort(key=lambda item: item['score'], reverse=True)
    if self.isDebug:
        self.log.Debug('Search produced %d results:' % len(found))
        for rank, match in enumerate(found):
            self.log.Debug(
                ' ... %d: id="%s", title="%s", year="%s", score="%d".'
                % (rank, match['id'], match['title'], str(match['year']), match['score']))
    return found
def searchForImdbTitles(mediaName, mediaYear, lang):
    """Fetch TMDb matches for a media name and score each one.

    @param mediaName: media title; lower-cased, and spaces become '%20' in
        the request URL.
    @param mediaYear: media year; only forwarded to the title scorer.
    @param lang: not used by this function.
    @return: list of dicts with 'id', 'name', 'year' and 'score' keys.
    """
    mediaName = mediaName.lower()
    requestUrl = TMDB_GETINFO % mediaName.replace(' ', '%20')
    page = common.getElementFromHttpRequest(requestUrl, TMDB_PAGE_ENCODING)

    matches = []
    if page is None:
        Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)
        return matches

    for movieElem in page.xpath('//movies/movie'):
        try:
            imdbId = common.getXpathRequiredText(movieElem, './imdb_id/text()')
            title = common.getXpathRequiredText(movieElem, './name/text()')
            altTitle = common.getXpathOptionalText(movieElem, './alternative_name/text()')
            releaseDate = common.getXpathOptionalText(movieElem, './released/text()')
            year = common.getReOptionalGroup(MATCHER_RELEASED, releaseDate, 0)
            # The scorer's position argument counts only elements parsed so
            # far without error, which equals the current number of matches.
            score = common.scoreMediaTitleMatch(
                mediaName, mediaYear, title, altTitle, year, len(matches))
            matches.append({'id': imdbId, 'name': title, 'year': year, 'score': score})
        except:
            Log.Warn('failed to parse movie element')
    return matches
def localTest_scoreMediaTitleMatch(self):
    """Verify the scores common.scoreMediaTitleMatch gives for two inputs."""
    failureMessage = 'Wrong score.'

    # Latin-script query scored against a Cyrillic title that also has an
    # alternative title, at result position 3.
    latinQueryScore = common.scoreMediaTitleMatch(
        'Gladiatory Rima', '2012',
        u'Гладиаторы Рима', 'Gladiatori di Roma', '2012', 3)
    self._assertEquals(92, latinQueryScore, failureMessage)

    # Query that is the leading part of a longer title, with no alternative
    # title, at result position 0.
    prefixQueryScore = common.scoreMediaTitleMatch(
        u'Кавказская пленница', '1966',
        u'Кавказская пленница, или Новые приключения Шурика', None, '1966', 0)
    self._assertEquals(94, prefixQueryScore, failureMessage)
def queryKinoPoisk(self, mediaName, mediaYear):
    """Search KinoPoisk for a movie title.

    Results are returned in the order they appear on the page (no sorting
    is done here).

    @return: list of [kinoPoiskId, title, year, score] lists.
    """
    results = []
    page = self.httpUtils.requestAndParseHtmlPage(
        S.KINOPOISK_SEARCH_SIMPLE
        % urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE)))
    if page is None:
        self.log.Warn(
            ' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
        return results

    # The page was fetched; extract the list of all movie titles from it.
    self.log.Debug('got a KinoPoisk query results page to parse...')
    infoDivs = page.xpath(
        '//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]')

    if not infoDivs:
        # No result entries: presumably the site answered with the movie
        # page itself, so try to read a single title from it.
        # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
        self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
        try:
            pageTitle = common.getXpathOptionalText(
                page, '//h1[@class="moviename-big"]/text()')
            if pageTitle is not None:
                filmHref = page.xpath(
                    './/link[contains(@href, "/film/")]/attribute::href')[0]
                pageFilmId = re.search('\/film\/(.+?)\/', filmHref).groups(0)[0]
                pageYear = common.parseYearFromString(
                    page.xpath('//a[contains(@href,"year")]/text()')[0])
                # TODO: parse original title (None is passed until then).
                pageScore = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, pageTitle, None, pageYear, 0)
                results.append([pageFilmId, pageTitle, pageYear, pageScore])
        except:
            self.logException('failed to parse a KinoPoisk query results page')
        return results

    # Inspect query results titles and score them.
    self.log.Debug('found %d results (div info tags)' % len(infoDivs))
    for itemIndex, infoDiv in enumerate(infoDivs):
        try:
            filmHrefs = infoDiv.xpath(
                './/a[contains(@href,"/level/1/film/")]/attribute::href')
            if not filmHrefs:
                self.log.Warn(
                    'unable to find film anchor elements for title "%s"' % mediaName)
                continue

            # Parse kinopoisk movie title id, title and year.
            idMatch = re.search('\/film\/(.+?)\/', filmHrefs[0])
            if idMatch is None:
                self.log.Error('unable to parse movie title id')
                continue
            filmId = idMatch.groups(1)[0]
            filmTitle = common.getXpathRequiredText(
                infoDiv, './/a[contains(@href,"/level/1/film/")]/text()')
            filmYear = common.parseYearFromString(
                common.getXpathOptionalText(infoDiv, './/span[@class="year"]/text()'))

            filmAltTitle = None
            try:
                # Best-effort parse of the alternative (original) title from
                # the <span> below the title <a> tag; failures are ignored.
                rawAltTitle = common.getXpathOptionalText(
                    infoDiv, './/span[@class="gray"]/text()')
                if rawAltTitle is not None:
                    # Strip any non alpha character in front (unfortunately
                    # this may also remove a leading part of a movie title
                    # if it starts with a digit).
                    rawAltTitle = MATCHER_LEADING_NONALPHA.sub('', rawAltTitle).rstrip()
                    if len(rawAltTitle) > 0:
                        filmAltTitle = rawAltTitle
            except:
                pass

            filmScore = common.scoreMediaTitleMatch(
                mediaName, mediaYear, filmTitle, filmAltTitle, filmYear, itemIndex)
            results.append([filmId, filmTitle, filmYear, filmScore])
        except:
            self.logException('failed to parse div.info container')
    return results
def search(self, results, media, lang, manual=False):
    """Search KinoPoisk for matches of the title and year carried by media.

    Every match is appended to results as a MetadataSearchResult holding a
    page id, title, year and score (how good the match is believed to be).
    The results are sorted by descending score before returning.

    @param results: container that receives matches through Append() and is
        sorted in place through Sort().
    @param media: object providing name, year, guid and hash attributes.
    @param lang: language value stored on every MetadataSearchResult.
    @param manual: not used by this method.
    """
    Log.Debug('SEARCH START <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    mediaName = media.name
    mediaYear = media.year
    Log.Debug('searching for name="%s", year="%s", guid="%s", hash="%s"...' %
              (str(mediaName), str(mediaYear), str(media.guid), str(media.hash)))

    # Fetch the search page. The encoded name is computed once and reused
    # for both the request and the log line.
    Log.Debug('quering kinopoisk...')
    encodedName = urllib.quote(mediaName.encode(ENCODING_KINOPOISK_PAGE))
    page = common.getElementFromHttpRequest(
        KINOPOISK_SEARCH % encodedName, ENCODING_KINOPOISK_PAGE)
    Log.Debug('Loading page "%s"' % encodedName)

    if page is None:
        Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
    else:
        # The page was fetched; extract the list of all movie titles from it.
        Log.Debug('got a kinopoisk page to parse...')
        # The trailing '/..' selects the parent of each matching anchor.
        divInfoElems = page.xpath(
            '//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..')
        if len(divInfoElems):
            Log.Debug('found %d results' % len(divInfoElems))
            for itemIndex, divInfoElem in enumerate(divInfoElems):
                try:
                    anchorFilmElem = divInfoElem.xpath(
                        './a[contains(@href,"/level/1/film/")]/attribute::href')
                    if not len(anchorFilmElem):
                        Log.Warn('unable to find film anchor elements for title "%s"' % mediaName)
                        continue

                    # Parse kinopoisk movie title id, title and year.
                    match = re.search(r'\/film\/(.+?)\/', anchorFilmElem[0])
                    if match is None:
                        Log.Error('unable to parse movie title id')
                        continue
                    kinoPoiskId = match.group(1)
                    title = common.getXpathRequiredNode(
                        divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()')
                    year = common.getXpathOptionalNode(
                        divInfoElem, './/span[@class="year"]/text()')

                    # Best-effort parse of the alternative (original) title,
                    # a <span> below the title <a> tag. It is reset for every
                    # entry so a failure cannot leak the previous entry's
                    # (or a half-processed) value into this one.
                    altTitle = None
                    try:
                        altTitleCandidate = common.getXpathOptionalNode(
                            divInfoElem, '../span[1]/text()')
                        if altTitleCandidate is not None:
                            altTitle = altTitleCandidate.split(',')[0].strip()
                    except Exception:
                        altTitle = None

                    score = common.scoreMediaTitleMatch(
                        mediaName, mediaYear, title, altTitle, year, itemIndex)
                    results.Append(MetadataSearchResult(
                        id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
                except Exception:
                    common.logException('failed to parse div.info container')
        else:
            Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
            # TODO(zhenya): investigate if a fallback is needed here (haven't
            # seen this happening). A disabled fallback used to treat the
            # response as a single movie page, reading the title from
            # h1.moviename-big, the id from a "/film/" link and the year
            # from a "year" anchor.

    # Sort results according to their score.
    results.Sort('score', descending=True)
    if IS_DEBUG:
        common.printSearchResults(results)
    Log.Debug('SEARCH END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
def fetchAndParseSearchResults(self, mediaName, mediaYear):
    """Search for movie titles on KinoPoisk and score every result.

    @param mediaName Movie title parsed from a filename.
    @param mediaYear Movie year parsed from a filename.
    @return List of [kinoPoiskId, title, year, score] lists.
    """
    self.log.Info('Quering kinopoisk...')
    results = []
    encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
    self.log.Debug('Loading page "%s"' % encodedName)
    page = self.httpUtils.requestAndParseHtmlPage(
        S.KINOPOISK_SEARCH_SIMPLE % encodedName)
    if page is None:
        self.log.Warn(
            ' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
        return results

    # The page was fetched; extract the list of all movie titles from it.
    self.log.Debug('got a kinopoisk results page to parse...')
    # Pick all divs with class "info" that have specific children (/p/a/ etc).
    infoDivs = page.xpath(
        '//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]')

    if not len(infoDivs):
        self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
        # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
        # No title entries were found, so presumably the site returned the
        # movie page directly (one hopes).
        try:
            pageTitle = common.getXpathOptionalText(
                page, '//h1[@class="moviename-big"]/text()')
            if pageTitle is not None:
                filmHref = page.xpath(
                    './/link[contains(@href, "/film/")]/attribute::href')[0]
                pageFilmId = re.search('\/film\/(.+?)\/', filmHref).groups(0)[0]
                pageYear = page.xpath('//a[contains(@href,"year")]/text()')[0].strip()
                # TODO: parse original title (None is passed until then).
                pageScore = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, pageTitle, None, pageYear, 0)
                results.append([pageFilmId, pageTitle, pageYear, pageScore])
        except:
            self.logException('failed to parse a KinoPoisk page')
        return results

    self.log.Debug('found %d results (div info tags)' % len(infoDivs))
    # The index advances for every entry, including skipped or failed ones.
    for itemIndex, infoDiv in enumerate(infoDivs):
        try:
            filmHrefs = infoDiv.xpath(
                './/a[contains(@href,"/level/1/film/")]/attribute::href')
            if not len(filmHrefs):
                self.log.Warn(
                    'unable to find film anchor elements for title "%s"' % mediaName)
                continue

            # Parse kinopoisk movie title id, title and year.
            idMatch = re.search('\/film\/(.+?)\/', filmHrefs[0])
            if idMatch is None:
                self.log.Error('unable to parse movie title id')
                continue
            filmId = idMatch.groups(1)[0]
            filmTitle = common.getXpathRequiredText(
                infoDiv, './/a[contains(@href,"/level/1/film/")]/text()')
            filmYear = common.getXpathOptionalText(
                infoDiv, './/span[@class="year"]/text()')

            # Best-effort parse of the alternative (original) title from the
            # <span> below the title <a> tag; failures are ignored.
            filmAltTitle = None
            try:
                rawAltTitle = common.getXpathOptionalText(
                    infoDiv, './/span[@class="gray"]/text()')
                if rawAltTitle is not None:
                    # Strip any non alpha character in front (unfortunately
                    # this may also remove a leading part of a movie title
                    # if it starts with a digit).
                    rawAltTitle = MATCHER_LEADING_NONALPHA.sub('', rawAltTitle).rstrip()
                    if len(rawAltTitle) > 0:
                        filmAltTitle = rawAltTitle
            except:
                pass

            self.log.Debug(
                ' ... kinoPoiskId="%s"; title="%s"; year="%s"...'
                % (filmId, filmTitle, str(filmYear)))
            filmScore = common.scoreMediaTitleMatch(
                mediaName, mediaYear, filmTitle, filmAltTitle, filmYear, itemIndex)
            results.append([filmId, filmTitle, filmYear, filmScore])
        except:
            self.logException('failed to parse div.info container')
    return results
def fetchAndParseSearchResultsFull(self, mediaName, mediaYear):
    """Search for movie titles using the S.KINOPOISK_SEARCH URL template.

    @param mediaName: media title to search for.
    @param mediaYear: media year; only forwarded to the title scorer.
    @return: list of [kinoPoiskId, title, year, score] lists in page order;
        empty when nothing could be fetched or parsed.
    """
    self.log.Info('Quering kinopoisk...')
    results = []
    encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
    self.log.Debug('Loading page "%s"' % encodedName)
    page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH % encodedName)
    if page is None:
        self.log.Warn(
            ' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
        return results

    # The page was fetched; extract the list of all movie titles from it.
    self.log.Debug('got a kinopoisk page to parse...')
    # The trailing '/..' selects the parent of each matching anchor.
    divInfoElems = page.xpath(
        '//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..')

    if not len(divInfoElems):
        self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
        # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
        # No title entries were found, so presumably the site returned the
        # movie page directly (one hopes).
        try:
            titleNodes = page.xpath('//h1[@class="moviename-big"]/text()')
            # Without a movie heading this is simply an empty search result,
            # not a parse failure, so nothing is logged as an exception.
            if titleNodes:
                title = titleNodes[0].strip()
                kinoPoiskId = re.search(
                    r'\/film\/(.+?)\/',
                    page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0]
                ).group(1)
                year = page.xpath('//a[contains(@href,"year")]/text()')[0].strip()
                score = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, title, None, year, 0)
                results.append([kinoPoiskId, title, year, score])
        except Exception:
            self.logException('failed to parse a KinoPoisk page')
        return results

    self.log.Debug('found %d results' % len(divInfoElems))
    # The index advances for every entry, including skipped or failed ones.
    for itemIndex, divInfoElem in enumerate(divInfoElems):
        try:
            anchorFilmElem = divInfoElem.xpath(
                './a[contains(@href,"/level/1/film/")]/attribute::href')
            if not len(anchorFilmElem):
                self.log.Warn(
                    'unable to find film anchor elements for title "%s"' % mediaName)
                continue

            # Parse kinopoisk movie title id, title and year.
            match = re.search(r'\/film\/(.+?)\/', anchorFilmElem[0])
            if match is None:
                self.log.Error('unable to parse movie title id')
                continue
            kinoPoiskId = match.group(1)
            title = common.getXpathRequiredText(
                divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()')
            year = common.getXpathOptionalText(
                divInfoElem, './/span[@class="year"]/text()')

            # Best-effort parse of the alternative (original) title, a
            # <span> below the title <a> tag. It is reset for every entry so
            # a failure cannot leak the previous entry's (or a
            # half-processed) value into this one.
            altTitle = None
            try:
                altTitleCandidate = common.getXpathOptionalText(
                    divInfoElem, '../span[1]/text()')
                if altTitleCandidate is not None:
                    altTitle = altTitleCandidate.split(',')[0].strip()
            except Exception:
                altTitle = None

            score = common.scoreMediaTitleMatch(
                mediaName, mediaYear, title, altTitle, year, itemIndex)
            results.append([kinoPoiskId, title, year, score])
        except Exception:
            self.logException('failed to parse div.info container')
    return results
def fetchAndParseSearchResults(self, mediaName, mediaYear):
    """Query KinoPoisk for a movie title and score each listed result.

    @param mediaName Movie title parsed from a filename.
    @param mediaYear Movie year parsed from a filename.
    @return List of [kinoPoiskId, title, year, score] lists.
    """
    self.log.Info("Quering kinopoisk...")
    results = []
    encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
    self.log.Debug('Loading page "%s"' % encodedName)
    page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % encodedName)
    if page is None:
        self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
        return results

    # The page was fetched; extract the list of all movie titles from it.
    self.log.Debug("got a kinopoisk results page to parse...")
    # Pick all divs with class "info" that have specific children (/p/a/ etc).
    resultDivs = page.xpath(
        '//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]')

    if len(resultDivs):
        self.log.Debug("found %d results (div info tags)" % len(resultDivs))
        # The position advances for every div, including skipped or failed ones.
        for position, resultDiv in enumerate(resultDivs):
            try:
                hrefs = resultDiv.xpath(
                    './/a[contains(@href,"/level/1/film/")]/attribute::href')
                if not len(hrefs):
                    self.log.Warn(
                        'unable to find film anchor elements for title "%s"' % mediaName)
                    continue

                # Parse kinopoisk movie title id, title and year.
                hrefMatch = re.search("\/film\/(.+?)\/", hrefs[0])
                if hrefMatch is None:
                    self.log.Error("unable to parse movie title id")
                    continue
                entryId = hrefMatch.groups(1)[0]
                entryTitle = common.getXpathRequiredText(
                    resultDiv, './/a[contains(@href,"/level/1/film/")]/text()')
                entryYear = common.getXpathOptionalText(
                    resultDiv, './/span[@class="year"]/text()')

                # Best-effort parse of the alternative (original) title from
                # the <span> below the title <a> tag; failures are ignored.
                entryAltTitle = None
                try:
                    candidate = common.getXpathOptionalText(
                        resultDiv, './/span[@class="gray"]/text()')
                    if candidate is not None:
                        # Strip any non alpha character in front
                        # (unfortunately this may also remove a leading part
                        # of a movie title if it starts with a digit).
                        candidate = MATCHER_LEADING_NONALPHA.sub("", candidate).rstrip()
                        if len(candidate) > 0:
                            entryAltTitle = candidate
                except:
                    pass

                self.log.Debug(
                    ' ... kinoPoiskId="%s"; title="%s"; year="%s"...'
                    % (entryId, entryTitle, str(entryYear)))
                entryScore = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, entryTitle, entryAltTitle, entryYear, position)
                results.append([entryId, entryTitle, entryYear, entryScore])
            except:
                self.logException("failed to parse div.info container")
        return results

    self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
    # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
    # No title entries were found, so presumably the site returned the movie
    # page directly (one hopes).
    try:
        movieTitle = common.getXpathOptionalText(page, '//h1[@class="moviename-big"]/text()')
        if movieTitle is not None:
            linkHref = page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0]
            movieId = re.search("\/film\/(.+?)\/", linkHref).groups(0)[0]
            movieYear = page.xpath('//a[contains(@href,"year")]/text()')[0].strip()
            # TODO: parse original title (None is passed until then).
            movieScore = common.scoreMediaTitleMatch(
                mediaName, mediaYear, movieTitle, None, movieYear, 0)
            results.append([movieId, movieTitle, movieYear, movieScore])
    except:
        self.logException("failed to parse a KinoPoisk page")
    return results
def fetchAndParseSearchResultsFull(self, mediaName, mediaYear):
    """Search for movie titles using the S.KINOPOISK_SEARCH URL template.

    @param mediaName: media title to search for.
    @param mediaYear: media year; only forwarded to the title scorer.
    @return: list of [kinoPoiskId, title, year, score] lists in page order;
        empty when nothing could be fetched or parsed.
    """
    self.log.Info("Quering kinopoisk...")
    results = []
    encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
    self.log.Debug('Loading page "%s"' % encodedName)
    page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH % encodedName)
    if page is None:
        self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
        return results

    # The page was fetched; extract the list of all movie titles from it.
    self.log.Debug("got a kinopoisk page to parse...")
    # The trailing '/..' selects the parent of each matching anchor.
    divInfoElems = page.xpath(
        '//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..'
    )

    if len(divInfoElems):
        self.log.Debug("found %d results" % len(divInfoElems))
        # The index advances for every entry, including skipped or failed ones.
        for itemIndex, divInfoElem in enumerate(divInfoElems):
            try:
                anchorFilmElem = divInfoElem.xpath(
                    './a[contains(@href,"/level/1/film/")]/attribute::href')
                if not len(anchorFilmElem):
                    self.log.Warn(
                        'unable to find film anchor elements for title "%s"' % mediaName)
                    continue

                # Parse kinopoisk movie title id, title and year.
                match = re.search(r"\/film\/(.+?)\/", anchorFilmElem[0])
                if match is None:
                    self.log.Error("unable to parse movie title id")
                    continue
                kinoPoiskId = match.group(1)
                title = common.getXpathRequiredText(
                    divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()')
                year = common.getXpathOptionalText(
                    divInfoElem, './/span[@class="year"]/text()')
                altTitle = self_altTitle = None
                # Best-effort parse of the alternative (original) title, a
                # <span> below the title <a> tag. The value starts at None
                # for every entry so a failure cannot leak the previous
                # entry's (or a half-processed) value into this one.
                try:
                    altTitleCandidate = common.getXpathOptionalText(
                        divInfoElem, "../span[1]/text()")
                    if altTitleCandidate is not None:
                        altTitle = altTitleCandidate.split(",")[0].strip()
                except Exception:
                    altTitle = self_altTitle

                score = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, title, altTitle, year, itemIndex)
                results.append([kinoPoiskId, title, year, score])
            except Exception:
                self.logException("failed to parse div.info container")
        return results

    self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
    # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
    # No title entries were found, so presumably the site returned the movie
    # page directly (one hopes).
    try:
        titleNodes = page.xpath('//h1[@class="moviename-big"]/text()')
        # Without a movie heading this is simply an empty search result, not
        # a parse failure, so nothing is logged as an exception.
        if titleNodes:
            title = titleNodes[0].strip()
            kinoPoiskId = re.search(
                r"\/film\/(.+?)\/",
                page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0]
            ).group(1)
            year = page.xpath('//a[contains(@href,"year")]/text()')[0].strip()
            score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, None, year, 0)
            results.append([kinoPoiskId, title, year, score])
    except Exception:
        self.logException("failed to parse a KinoPoisk page")
    return results
def queryKinoPoisk(self, mediaName, mediaYear):
    """Look a movie up on KinoPoisk and score every listed title.

    Title results are returned in page order (no sorting is done here).

    @return: list of [kinoPoiskId, title, year, score] lists.
    """
    results = []
    encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
    page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % encodedName)
    if page is None:
        self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
        return results

    # The page was fetched; extract the list of all movie titles from it.
    self.log.Debug('got a KinoPoisk query results page to parse...')
    entries = page.xpath(
        '//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]')

    if len(entries):
        # Inspect query results titles and score them.
        self.log.Debug('found %d results (div info tags)' % len(entries))
        for position, entry in enumerate(entries):
            try:
                anchorHrefs = entry.xpath(
                    './/a[contains(@href,"/level/1/film/")]/attribute::href')
                if not len(anchorHrefs):
                    self.log.Warn(
                        'unable to find film anchor elements for title "%s"' % mediaName)
                    continue

                # Parse kinopoisk movie title id, title and year.
                hrefMatch = re.search('\/film\/(.+?)\/', anchorHrefs[0])
                if hrefMatch is None:
                    self.log.Error('unable to parse movie title id')
                    continue
                entryId = hrefMatch.groups(1)[0]
                entryTitle = common.getXpathRequiredText(
                    entry, './/a[contains(@href,"/level/1/film/")]/text()')
                yearText = common.getXpathOptionalText(
                    entry, './/span[@class="year"]/text()')
                entryYear = common.parseYearFromString(yearText)

                entryAltTitle = None
                try:
                    # Best-effort parse of the alternative (original) title
                    # from the <span> below the title <a> tag.
                    candidate = common.getXpathOptionalText(
                        entry, './/span[@class="gray"]/text()')
                    if candidate is not None:
                        # Strip any non alpha character in front
                        # (unfortunately this may also remove a leading part
                        # of a movie title if it starts with a digit).
                        candidate = MATCHER_LEADING_NONALPHA.sub('', candidate).rstrip()
                        if len(candidate) > 0:
                            entryAltTitle = candidate
                except:
                    pass

                entryScore = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, entryTitle, entryAltTitle, entryYear, position)
                results.append([entryId, entryTitle, entryYear, entryScore])
            except:
                self.logException('failed to parse div.info container')
        return results

    # No title entries were found, so presumably the site returned the movie
    # page directly (one hopes).
    # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
    self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
    try:
        movieTitle = common.getXpathOptionalText(page, '//h1[@class="moviename-big"]/text()')
        if movieTitle is not None:
            linkHref = page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0]
            movieId = re.search('\/film\/(.+?)\/', linkHref).groups(0)[0]
            movieYear = common.parseYearFromString(
                page.xpath('//a[contains(@href,"year")]/text()')[0])
            # TODO: parse original title (None is passed until then).
            movieScore = common.scoreMediaTitleMatch(
                mediaName, mediaYear, movieTitle, None, movieYear, 0)
            results.append([movieId, movieTitle, movieYear, movieScore])
    except:
        self.logException('failed to parse a KinoPoisk query results page')
    return results