Example #1
0
 def updateMediaItem(self, metadata, kinoPoiskId):
     """Refresh ``metadata`` from the KinoPoisk pages of ``kinoPoiskId``.

     The title page is requested first; when no page element comes back
     the metadata object is left untouched. Otherwise the metadata is reset
     and re-populated step by step. Any exception raised by one of the
     steps is logged through ``common.logException`` and not propagated.
     """
     pageUrl = S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
     page = common.getElementFromHttpRequest(pageUrl,
                                             S.ENCODING_KINOPOISK_PAGE)
     if page is None:
         # Don't update anything if the title page failed to load.
         return

     LOGGER.Debug(
         'SUCCESS: got a KinoPoisk page for movie title id: "%s"' %
         kinoPoiskId)
     try:
         self.resetMediaMetadata(metadata)
         # Title, original title, ratings, and more.
         self.parseInfoTableTagAndUpdateMetadata(page, metadata)
         # Studio.
         self.parseStudioPageData(metadata, kinoPoiskId)
         # Actors and other cast members.
         self.parseCastPageData(page, metadata, kinoPoiskId)
         # Posters.
         self.parsePostersPageData(metadata, kinoPoiskId)
         # Background art (stills).
         self.parseStillsPageData(metadata, kinoPoiskId)
     except:
         common.logException(
             'failed to update metadata for id %s' % kinoPoiskId)
Example #2
0
    def updateMediaItem(self, metadata, kinoPoiskId, lang):
        """Populate ``metadata`` for the KinoPoisk title ``kinoPoiskId``.

        Nothing is changed when the title page request yields no page
        element. Otherwise the info table, studio, cast and images are
        parsed in turn; ``lang`` is only forwarded to
        ``updateImagesMetadata``. Exceptions raised by any step are logged
        through ``common.logException`` and not propagated.
        """
        pageUrl = S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
        page = common.getElementFromHttpRequest(pageUrl,
                                                S.ENCODING_KINOPOISK_PAGE)
        if page is None:
            # Don't update anything if the title page failed to load.
            return

        LOGGER.Debug(
            'SUCCESS: got a KinoPoisk page for movie title id: "%s"' %
            kinoPoiskId)
        try:
            # Title, original title, ratings, and more.
            self.parseInfoTableTagAndUpdateMetadata(page, metadata)

            # Look the movie up on TMDb to supplement our results with more
            # data. IMPORTANT: this has to happen after
            # parseInfoTableTagAndUpdateMetadata, which populates the title
            # and the year on the metadata object.
            tmdbId = self.searchForImdbTitleId(metadata.title, metadata.year)

            # Studio.
            self.parseStudioPageData(metadata, kinoPoiskId)
            # Actors and other cast members.
            self.parseCastPageData(page, metadata, kinoPoiskId)
            # Posters and background art.
            self.updateImagesMetadata(metadata, kinoPoiskId, tmdbId, lang)
        except:
            common.logException(
                'failed to update metadata for id %s' % kinoPoiskId)
Example #3
0
def parseImageDataFromAnchorElement(anchorElem, index):
  """ Builds a common.Thumbnail from an image anchor element.

      The anchor's nested <img> supplies the thumbnail URL, and the page the
      anchor's href points to is loaded to find the full size image (the
      <img id="image"> element) together with its dimensions. When no full
      size URL is found the thumbnail URL is used in its place.

      Returns None when neither URL could be determined; otherwise a
      common.Thumbnail created with the given index and an initial score of 0.
  """
  proxyPageHref = anchorElem.get('href')

  # Thumbnail: the <img> directly inside the anchor.
  thumbUrl = None
  thumbImgElem = parseXpathElementValue(anchorElem, './img')
  if thumbImgElem is not None:
    thumbSrc = thumbImgElem.get('src')
    if thumbSrc is not None:
      thumbUrl = ensureAbsoluteUrl(thumbSrc)

  # Full size image: lives on a separate proxy page.
  fullUrl = None
  dimensions = None, None
  if proxyPageHref is not None:
    proxyPage = common.getElementFromHttpRequest(
      ensureAbsoluteUrl(proxyPageHref), ENCODING_KINOPOISK_PAGE)
    if proxyPage is not None:
      fullImgElem = parseXpathElementValue(proxyPage, '//img[@id="image"]')
      if fullImgElem is not None:
        fullUrl = fullImgElem.get('src')
        dimensions = parseImageElemDimensions(fullImgElem)

  if fullUrl is None:
    if thumbUrl is None:
      # Nothing usable was found at all.
      return None
    # No full size image URL - fall back to the thumbnail's.
    Log.Debug('found no full size image, will use the thumbnail')
    fullUrl = thumbUrl

  initialScore = 0
  return common.Thumbnail(
    thumbUrl,
    ensureAbsoluteUrl(fullUrl),
    dimensions[0],
    dimensions[1],
    index,
    initialScore)
Example #4
0
def searchForImdbTitles(mediaName, mediaYear, lang):
  """ Searches TMDb for movies matching the given media name.

      Args:
        mediaName: title to search for; it is lower-cased before querying.
        mediaYear: year handed to common.scoreMediaTitleMatch for scoring.
        lang: not used by this function.

      Returns:
        A list of dicts with the keys 'id' (IMDb id), 'name', 'year' and
        'score', one per successfully parsed movie element. The list is empty
        (never None) when the page could not be loaded.
  """
  mediaName = mediaName.lower()
  # NOTE(review): only spaces are escaped here; any other reserved URL
  # character in the name is sent as-is.
  page = common.getElementFromHttpRequest(TMDB_GETINFO % mediaName.replace(' ', '%20'), TMDB_PAGE_ENCODING)
  matches = []
  if page is None:
    Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)
    return matches

  # itemIndex is the position among successfully parsed elements only; it is
  # not advanced for elements that fail to parse.
  itemIndex = 0
  for movieElem in page.xpath('//movies/movie'):
    try:
      imdbId = common.getXpathRequiredText(movieElem, './imdb_id/text()')
      title = common.getXpathRequiredText(movieElem, './name/text()')
      altTitle = common.getXpathOptionalText(movieElem, './alternative_name/text()')
      releaseDate = common.getXpathOptionalText(movieElem, './released/text()')
      year = common.getReOptionalGroup(MATCHER_RELEASED, releaseDate, 0)
      score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex)
      matches.append({'id': imdbId, 'name': title, 'year': year, 'score': score})
      itemIndex += 1
    except Exception:
      Log.Warn('failed to parse movie element')

  return matches
Example #5
0
def parseStudioInfo(metadata, kinoPoiskId):
  """ Loads the studio page of a title and stores the first studio found.

      Args:
        metadata: object whose 'studio' attribute is set when a studio is found.
        kinoPoiskId: KinoPoisk title id substituted into KINOPOISK_STUDIO.

      Nothing is changed when the page cannot be loaded or lists no studio.
  """
  page = common.getElementFromHttpRequest(KINOPOISK_STUDIO % kinoPoiskId, ENCODING_KINOPOISK_PAGE)
  # Compare against None explicitly: the truthiness of an element object can
  # depend on whether it has children (lxml) rather than on whether the
  # request succeeded.
  if page is None:
    return
  studios = page.xpath(u'//table/tr/td[b="Производство:"]/../following-sibling::tr/td/a/text()')
  if studios:
    # Take only the first studio.
    studio = studios[0].strip()
    Log.Debug(' ... parsed studio: %s' % studio)
    metadata.studio = studio
Example #6
0
def fetchImageDataPages(urlTemplate, kinoPoiskId, maxPages):
  """ Fetches up to maxPages consecutive image listing pages for a title.

      Args:
        urlTemplate: format string taking (kinoPoiskId, pageIndex).
        kinoPoiskId: KinoPoisk title id.
        maxPages: highest page index that may be requested.

      Returns:
        A list of the page elements that loaded successfully, in page order.
        The list is empty when the first page cannot be loaded.
  """
  pages = []
  page = common.getElementFromHttpRequest(urlTemplate % (kinoPoiskId, 1), ENCODING_KINOPOISK_PAGE)
  if page is None:
    return pages
  pages.append(page)
  if maxPages <= 1:
    return pages

  # The last "arr" navigator link is expected to point at the final page.
  anchorElems = page.xpath('//div[@class="navigator"]/ul/li[@class="arr"]/a')
  if not anchorElems:
    return pages
  nav = parseXpathElementValue(anchorElems[-1], './attribute::href')
  if nav is None:
    # No href to parse; without this guard re.search would raise on None.
    return pages
  match = re.search(r'page\/(\d+?)\/$', nav)
  if match is None:
    return pages

  try:
    # Cap the range up front so the limit holds even when the page at index
    # maxPages itself fails to load.
    lastPageIndex = min(int(match.group(1)), maxPages)
    for pageIndex in range(2, lastPageIndex + 1):
      page = common.getElementFromHttpRequest(urlTemplate % (kinoPoiskId, pageIndex), ENCODING_KINOPOISK_PAGE)
      if page is not None:
        pages.append(page)
  except Exception:
    common.logException('unable to parse image art page')
  return pages
Example #7
0
 def updateMediaItem(self, metadata, kinoPoiskId):
   """ Refreshes the metadata object from the KinoPoisk pages of a title.

       When the title page request yields no page element, metadata is left
       untouched. Otherwise it is reset and re-populated step by step; any
       exception raised by a step is logged via common.logException and
       not propagated.
   """
   pageUrl = S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
   page = common.getElementFromHttpRequest(pageUrl, S.ENCODING_KINOPOISK_PAGE)
   if page is None:
     # Don't update anything if the title page failed to load.
     return

   LOGGER.Debug('SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
   try:
     self.resetMediaMetadata(metadata)
     # Title, original title, ratings, and more.
     self.parseInfoTableTagAndUpdateMetadata(page, metadata)
     # Studio.
     self.parseStudioPageData(metadata, kinoPoiskId)
     # Actors and other cast members.
     self.parseCastPageData(page, metadata, kinoPoiskId)
     # Posters.
     self.parsePostersPageData(metadata, kinoPoiskId)
     # Background art (stills).
     self.parseStillsPageData(metadata, kinoPoiskId)
   except:
     common.logException('failed to update metadata for id %s' % kinoPoiskId)
Example #8
0
 def updateMediaItem(self, metadata, kinoPoiskId):
   """ Refreshes the metadata object from the KinoPoisk pages of a title.

       When the title page request yields no page element, metadata is left
       untouched. Otherwise it is reset and filled in by the module-level
       parse* helpers, in order; any exception raised by a helper is logged
       via common.logException and not propagated.
   """
   pageUrl = KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
   page = common.getElementFromHttpRequest(pageUrl, ENCODING_KINOPOISK_PAGE)
   if page is None:
     return

   Log.Debug('got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
   try:
     resetMediaMetadata(metadata)
     # Title (in Russian).
     parseTitleInfo(page, metadata)
     # Original title (in the original language).
     parseOriginalTitleInfo(page, metadata)
     # Summary.
     parseSummaryInfo(page, metadata)
     # Rating.
     parseRatingInfo(page, metadata, kinoPoiskId)
     parseInfoTableTagAndUpdateMetadata(page, metadata)
     # Studio.
     parseStudioInfo(metadata, kinoPoiskId)
     # Actors and other people.
     parsePeoplePageInfo(page, metadata, kinoPoiskId)
     # Posters.
     parsePostersInfo(metadata, kinoPoiskId)
     # Background art.
     parseBackgroundArtInfo(metadata, kinoPoiskId)
   except:
     common.logException('failed to update metadata for id %s' % kinoPoiskId)
Example #9
0
  def updateMediaItem(self, metadata, kinoPoiskId, lang):
    """ Populates the metadata object for the given KinoPoisk title id.

        Nothing is changed when the title page request yields no page element.
        Otherwise the info table, studio, cast and images are parsed in turn;
        lang is only forwarded to updateImagesMetadata. Exceptions raised by
        any step are logged via common.logException and not propagated.
    """
    pageUrl = S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId
    page = common.getElementFromHttpRequest(pageUrl, S.ENCODING_KINOPOISK_PAGE)
    if page is None:
      # Don't update anything if the title page failed to load.
      return

    LOGGER.Debug('SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
    try:
      # Title, original title, ratings, and more.
      self.parseInfoTableTagAndUpdateMetadata(page, metadata)

      # Look the movie up on TMDb to supplement our results with more data.
      # IMPORTANT: this has to happen after parseInfoTableTagAndUpdateMetadata,
      # which populates the title and the year on the metadata object.
      tmdbId = self.searchForImdbTitleId(metadata.title, metadata.year)

      # Studio.
      self.parseStudioPageData(metadata, kinoPoiskId)
      # Actors and other cast members.
      self.parseCastPageData(page, metadata, kinoPoiskId)
      # Posters and background art.
      self.updateImagesMetadata(metadata, kinoPoiskId, tmdbId, lang)
    except:
      common.logException('failed to update metadata for id %s' % kinoPoiskId)
Example #10
0
def searchForImdbTitles(mediaName, mediaYear, lang):
    """Search TMDb for movies matching the given media name.

    Args:
        mediaName: title to search for; it is lower-cased before querying.
        mediaYear: year handed to common.scoreMediaTitleMatch for scoring.
        lang: not used by this function.

    Returns:
        A list of dicts with the keys 'id' (IMDb id), 'name', 'year' and
        'score', one per successfully parsed movie element. The list is
        empty (never None) when the page could not be loaded.
    """
    mediaName = mediaName.lower()
    # NOTE(review): only spaces are escaped here; any other reserved URL
    # character in the name is sent as-is.
    page = common.getElementFromHttpRequest(
        TMDB_GETINFO % mediaName.replace(' ', '%20'), TMDB_PAGE_ENCODING)
    matches = []
    if page is None:
        Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)
        return matches

    # itemIndex is the position among successfully parsed elements only; it
    # is not advanced for elements that fail to parse.
    itemIndex = 0
    for movieElem in page.xpath('//movies/movie'):
        try:
            imdbId = common.getXpathRequiredText(movieElem,
                                                 './imdb_id/text()')
            title = common.getXpathRequiredText(movieElem, './name/text()')
            altTitle = common.getXpathOptionalText(
                movieElem, './alternative_name/text()')
            releaseDate = common.getXpathOptionalText(
                movieElem, './released/text()')
            year = common.getReOptionalGroup(MATCHER_RELEASED, releaseDate,
                                             0)
            score = common.scoreMediaTitleMatch(mediaName, mediaYear,
                                                title, altTitle, year,
                                                itemIndex)
            matches.append({
                'id': imdbId,
                'name': title,
                'year': year,
                'score': score
            })
            itemIndex += 1
        except Exception:
            Log.Warn('failed to parse movie element')

    return matches
Example #11
0
def parsePeoplePageInfo(titlePage, metadata, kinoPoiskId):
  """ Adds the actors of a title to the metadata object.

      Actors are first collected from the main title page (titlePage) and
      then matched against the dedicated 'people' page of the title, which
      also carries role names.

      Actors are added to metadata in this order:
        1. title page actors that were also found on the people page
           (with their role),
        2. title page actors that were NOT found on the people page
           (with no role),
        3. all remaining people page actors, only when PREFS.getAllActors
           is truthy.

      When the people page cannot be loaded, every title page actor is added
      with no role and the function returns.
  """
  # First, parse actors from the main title page.
  parseAllActors = PREFS.getAllActors
  actorsMap = parseActorsInfoIntoMap(titlePage)
  mainActors = []
  otherActors = []

  # Now, parse a dedicated 'people' page.
  page = common.getElementFromHttpRequest(KINOPOISK_PEOPLE % kinoPoiskId, ENCODING_KINOPOISK_PAGE)
  if page is None:
    Log.Debug('NO people page')
    for actorName in actorsMap.keys():
      addActorToMetadata(metadata, actorName, None)
    return
  # The children of the block are walked in document order: a <table> acts as
  # a section header that selects personType for the <div> entries after it.
  personType = None
  peopleTags = page.xpath('//div[@id="content_block"]/table/tr/td/div[@class="block_left"]/*')
  for peopleTagElem in peopleTags:
    try:
      if peopleTagElem.tag == 'table':
        # A new section starts; forget the previous section's type.
        personType = None
        tagElems = peopleTagElem.xpath('./tr/td[@style="padding-left:20px;border-bottom:2px solid #f60;font-size:16px"]/text()')
        if len(tagElems):
          tagName = tagElems[0]
          # Only 'actor' is acted upon below; 'director' and 'writer' are
          # recorded but not used anywhere in this function.
          if tagName == u'Актеры':
            personType = 'actor'
          elif tagName == u'Директора фильма' or tagName == u'Режиссеры':
            personType = 'director'
          elif tagName == u'Сценаристы':
            personType = 'writer'
          elif tagName == u'Операторы' or \
               tagName == u'Монтажеры' or \
               tagName == u'Композиторы' or \
               tagName == u'Художники':
            # Skip these tags for now.
            personType = None
            Log.Debug('skipping an unsupported tag "%s"' % tagName)
          else:
            Log.Debug('skipping an unknown tag "%s"' % tagName)
      elif peopleTagElem.tag == 'div':
        # A person entry belonging to the current section.
        personNameElems = peopleTagElem.xpath('./div/div/div[@class="name"]/a/text()')
        personName = None
        if len(personNameElems):
          personName = personNameElems[0]
        if personType == 'actor':
          actorRoleElems = peopleTagElem.xpath('./div/div/div[@class="role"]/text()')
          # Actors without a role element are not collected here.
          if len(actorRoleElems):
            # NOTE(review): str() on a non-ASCII unicode role name may raise
            # under Python 2's default ASCII codec, in which case the except
            # below logs it and this actor is skipped - confirm.
            roleName = str(actorRoleElems[0]).strip().strip('. ')
            if personName in actorsMap:
              Log.Debug(' . . . . parsed main actor "%s" with role "%s"' % (personName, roleName))
              mainActors.append((personName, roleName))
              # Remove the match so only unmatched title page actors remain.
              del actorsMap[personName]
            elif parseAllActors:
              Log.Debug(' . . . . parsed other actor "%s" with role "%s"' % (personName, roleName))
              otherActors.append((personName, roleName))
      else:
        # Any other element ends the current section.
        personType = None
    except:
      # A failure in one tag is logged and the loop moves on to the next tag.
      common.logException('unable to parse a people tag')

  # Adding main actors that were found on the 'people' page.
  for personName, roleName in mainActors:
    addActorToMetadata(metadata, personName, roleName)
  # Adding main actors that were NOT found on the 'people' page.
  for actorName in actorsMap.keys():
    addActorToMetadata(metadata, actorName, None)
  # Adding other actors if requested.
  for personName, roleName in otherActors:
    addActorToMetadata(metadata, personName, roleName)
Example #12
0
  def search(self, results, media, lang, manual=False):
    """ Searches for matches on KinoPoisk using the title and year
        passed via the media object. All matches are appended to results
        as MetadataSearchResult objects. For each result, we determine a
        page id, title, year, and the score (how good we think the match
        is), as computed by common.scoreMediaTitleMatch.

        Args:
          results: container the matches are appended to; it is sorted by
            score (descending) before returning.
          media: object providing name, year, guid and hash.
          lang: language stored on every MetadataSearchResult.
          manual: not used by this method.
    """
    Log.Debug('SEARCH START <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    mediaName = media.name
    mediaYear = media.year
    # The values are formatted directly instead of through str(): str() on a
    # non-ASCII unicode name can raise under Python 2, while %s formatting
    # handles both byte and unicode strings.
    Log.Debug('searching for name="%s", year="%s", guid="%s", hash="%s"...' %
        (mediaName, mediaYear, media.guid, media.hash))
    # Fetch the search results page.
    Log.Debug('quering kinopoisk...')

    quotedName = urllib.quote(mediaName.encode(ENCODING_KINOPOISK_PAGE))
    page = common.getElementFromHttpRequest(KINOPOISK_SEARCH % quotedName, ENCODING_KINOPOISK_PAGE)
    Log.Debug('Loading page "%s"' % quotedName)

    if page is None:
      Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
    else:
      # The page was loaded; collect all movie title entries from it.
      Log.Debug('got a kinopoisk page to parse...')
      divInfoElems = page.xpath('//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..')
      if divInfoElems:
        Log.Debug('found %d results' % len(divInfoElems))
        # itemIndex is the position on the results page; it advances for every
        # entry, including the ones that fail to parse.
        for itemIndex, divInfoElem in enumerate(divInfoElems):
          try:
            anchorFilmElem = divInfoElem.xpath('./a[contains(@href,"/level/1/film/")]/attribute::href')
            if not anchorFilmElem:
              Log.Warn('unable to find film anchor elements for title "%s"' % mediaName)
              continue
            # Parse kinopoisk movie title id, title and year.
            match = re.search(r'\/film\/(.+?)\/', anchorFilmElem[0])
            if match is None:
              Log.Error('unable to parse movie title id')
              continue
            kinoPoiskId = match.group(1)
            title = common.getXpathRequiredNode(divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()')
            year = common.getXpathOptionalNode(divInfoElem, './/span[@class="year"]/text()')
            # Try to parse the alternative (original) title. Ignore failures.
            # This is a <span> below the title <a> tag.
            # Reset per result so a failure here cannot reuse the alternative
            # title of a previous result when scoring.
            altTitle = None
            try:
              altTitle = common.getXpathOptionalNode(divInfoElem, '../span[1]/text()')
              if altTitle is not None:
                altTitle = altTitle.split(',')[0].strip()
            except Exception:
              Log.Debug('unable to parse an alternative title, ignoring')
            score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex)
            results.Append(MetadataSearchResult(id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
          except Exception:
            common.logException('failed to parse div.info container')
      else:
        Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
        # TODO(zhenya): investigate whether we need this clause at all (haven't seen this happening).
        # If no title entries were found, the site may have returned the movie page directly.
        # try:
        #   title = page.xpath('//h1[@class="moviename-big"]/text()')[0].strip()
        #   kinoPoiskId = re.search('\/film\/(.+?)\/', page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0]).groups(0)[0]
        #   year = page.xpath('//a[contains(@href,"year")]/text()')[0].strip()
        #   score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex)
        #   results.Append(MetadataSearchResult(id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
        # except:
        #   common.logException('failed to parse a KinoPoisk page')

    # Sort results according to their score.
    results.Sort('score', descending=True)
    if IS_DEBUG:
      common.printSearchResults(results)
    Log.Debug('SEARCH END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')