Ejemplo n.º 1
0
 def updateMediaItem(self, metadata, kinoPoiskId):
     """Refresh `metadata` for a title from its KinoPoisk pages.

     Requests the title page for `kinoPoiskId`.  When that request
     yields None the metadata object is left untouched.  Otherwise the
     metadata is reset and the info-table, studio, cast, posters and
     stills parsers are run in that order.  Any exception raised by those
     steps is reported through common.logException and not re-raised.
     """
     titlePage = common.getElementFromHttpRequest(
         S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId,
         S.ENCODING_KINOPOISK_PAGE)
     if titlePage is None:
         # Don't update anything if the title page failed to load.
         return

     LOGGER.Debug(
         'SUCCESS: got a KinoPoisk page for movie title id: "%s"' %
         kinoPoiskId)
     try:
         self.resetMediaMetadata(metadata)
         # Title, original title, ratings, and more.
         self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)
         # Studio.
         self.parseStudioPageData(metadata, kinoPoiskId)
         # Actors, etc.
         self.parseCastPageData(titlePage, metadata, kinoPoiskId)
         # Posters.
         self.parsePostersPageData(metadata, kinoPoiskId)
         # Background art (stills).
         self.parseStillsPageData(metadata, kinoPoiskId)
     except:
         common.logException(
             'failed to update metadata for id %s' % kinoPoiskId)
Ejemplo n.º 2
0
def parseImageDataFromPhotoTableTag(page, thumbnailList, isPoster, maxImagesToParse):
  """Collect image thumbnails from the photo table(s) of a gallery page.

  Every anchor found in a "fotos" table is handed to
  parseImageDataFromAnchorElement; each thumbnail that parses is scored and
  appended to `thumbnailList` (mutated in place).  When the image-choice
  preference equals common.IMAGE_CHOICE_BEST, thumbnails scoring below
  common.IMAGE_SCORE_BEST_THRESHOLD are dropped.

  `maxImagesToParse` is decremented for every appended thumbnail and the
  loop stops as soon as it reaches zero.  The remaining budget is returned
  so that the caller can carry it over to the next page.
  """
  photoAnchors = page.xpath('//table[@class="fotos" or @class="fotos fotos1" or @class="fotos fotos2"]/tr/td/a')
  nextIndex = len(thumbnailList)
  for photoAnchor in photoAnchors:
    thumb = None
    try:
      thumb = parseImageDataFromAnchorElement(photoAnchor, nextIndex)
      # The index only advances when the anchor was parsed without an error.
      nextIndex += 1
    except:
      common.logException('unable to parse image URLs')

    if thumb is None:
      Log.Debug('no URLs - skipping an image')
      continue

    common.scoreThumbnailResult(thumb, isPoster)
    bestOnly = PREFS.imageChoice == common.IMAGE_CHOICE_BEST
    if bestOnly and thumb.score < common.IMAGE_SCORE_BEST_THRESHOLD:
      # Not good enough for the "best images only" preference.
      continue

    thumbnailList.append(thumb)
    Log.Debug('GOT URLs for an image: index=%d, thumb="%s", full="%s" (%sx%s)' %
        (thumb.index, str(thumb.thumbImgUrl), str(thumb.fullImgUrl),
        str(thumb.fullImgWidth), str(thumb.fullImgHeight)))
    maxImagesToParse -= 1
    if not maxImagesToParse:
      break
  return maxImagesToParse
Ejemplo n.º 3
0
def getPosterThumbnailBigOrSmall(kinoPoiskId):
  """Return the main poster thumbnail for a title, preferring the big one.

  The big thumbnail URL is requested first.  If a response comes back and
  its 'content-type' header equals 'image/jpeg', a Thumbnail pointing at
  the big image is returned with an initial score of 1000.  In every other
  case (no response, another content type, or any exception while fetching)
  a Thumbnail for the small image with an initial score of 0 is returned.
  """
  Log.Debug(' * parsing thumbnail...')
  bigThumb = None
  try:
    bigUrl = KINOPOISK_MOVIE_BIG_THUMBNAIL % kinoPoiskId
    response = common.getResponseFromHttpRequest(bigUrl)
    if response is not None:
      isJpeg = 'image/jpeg' == response.headers['content-type']
      if not isJpeg:
        Log.Debug(' * BIG thumb is NOT found')
      else:
        Log.Debug(' * found BIG thumb')
        bigThumb = common.Thumbnail(
          None,
          bigUrl,
          KINOPOISK_MOVIE_THUMBNAIL_DEFAULT_WIDTH,
          KINOPOISK_MOVIE_THUMBNAIL_DEFAULT_HEIGHT,
          0,     # Index.
          1000)  # Big thumb should have the highest initial score.
  except:
    Log.Debug(' * UNABLE to fetch BIG thumb')
    if IS_DEBUG:
      common.logException('failed to fetch BIG thumb')

  if bigThumb is not None:
    return bigThumb

  # There is no big thumbnail, so fall back to the small one.
  Log.Debug(' * adding default (SMALL) thumb')
  return common.Thumbnail(
    None,
    KINOPOISK_MOVIE_THUMBNAIL % kinoPoiskId,
    KINOPOISK_MOVIE_THUMBNAIL_WIDTH,
    KINOPOISK_MOVIE_THUMBNAIL_HEIGHT,
    0,  # Index.
    0)  # Initial score.
Ejemplo n.º 4
0
    def updateMediaItem(self, metadata, kinoPoiskId, lang):
        """Refresh `metadata` for a title from its KinoPoisk pages.

        Requests the title page for `kinoPoiskId`.  When that request
        yields None the metadata object is left untouched.  Otherwise the
        info table is parsed, a supplementary title id is looked up from
        the parsed title and year, and the studio, cast and image parsers
        are run.  Any exception raised by those steps is reported through
        common.logException and not re-raised.
        """
        titlePage = common.getElementFromHttpRequest(
            S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId,
            S.ENCODING_KINOPOISK_PAGE)
        if titlePage is None:
            # Don't update anything if the title page failed to load.
            return

        LOGGER.Debug(
            'SUCCESS: got a KinoPoisk page for movie title id: "%s"' %
            kinoPoiskId)
        try:
            # Title, original title, ratings, and more.
            self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)

            # Search for a movie on TMDb to supplement our results with
            # more data.  IMPORTANT: this has to happen after
            # parseInfoTableTagAndUpdateMetadata, which populates the
            # title and the year on the metadata object.
            tmdbId = self.searchForImdbTitleId(
                metadata.title, metadata.year)

            # Studio.
            self.parseStudioPageData(metadata, kinoPoiskId)
            # Actors, etc.
            self.parseCastPageData(titlePage, metadata, kinoPoiskId)
            # Posters and background art.
            self.updateImagesMetadata(metadata, kinoPoiskId, tmdbId, lang)
        except:
            common.logException(
                'failed to update metadata for id %s' % kinoPoiskId)
Ejemplo n.º 5
0
def parseYearInfo(infoRowElem, metadata):
  """Read the release year from an info-table row into `metadata.year`.

  Uses the first anchor text found under `infoRowElem`.  Nothing happens
  when the row has no anchor text; a value that cannot be converted to an
  integer is reported through common.logException and otherwise ignored.
  """
  yearNodes = infoRowElem.xpath('.//a/text()')
  if not len(yearNodes):
    return

  rawYear = yearNodes[0]
  Log.Debug(' ... parsed year: %s' % rawYear)
  try:
    metadata.year = int(rawYear)
  except:
    common.logException('unable to parse year')
Ejemplo n.º 6
0
def parseRatingInfo(page, metadata, kinoPoiskId):
  """Read the rating from the "block_rating" element into `metadata.rating`.

  Nothing happens when the rating text is not present on `page`.  A value
  that cannot be converted to a float is reported through
  common.logException and otherwise ignored.  `kinoPoiskId` is accepted
  but not used by this function.
  """
  ratingNodes = page.xpath('.//*[@id="block_rating"]/div[1]/div[1]/a/span[1]/text()')
  if not len(ratingNodes):
    return

  try:
    parsedRating = float(ratingNodes[0])
    Log.Debug(' ... parsed rating "%s"' % str(parsedRating))
    metadata.rating = parsedRating
  except:
    common.logException('unable to parse rating')
Ejemplo n.º 7
0
def parseDurationInfo(infoRowElem, metadata):
  """Read the running time from an info-table row into `metadata.duration`.

  The text of the row's "time" cell is searched with
  MATCHER_MOVIE_DURATION; the first captured group is converted to an
  integer and multiplied by 1000 before being stored.  Nothing happens when
  the cell is missing or the matcher finds nothing.  Errors are reported
  through common.logException and otherwise ignored.
  """
  timeCells = infoRowElem.xpath('./td[@class="time"]/text()')
  if not len(timeCells) > 0:
    return

  try:
    found = MATCHER_MOVIE_DURATION.search(timeCells[0])
    if found is None:
      return
    # NOTE(review): groups(1) supplies 1 as the default for groups that did
    # not participate in the match; the unit captured by the matcher is not
    # visible from here - confirm that "* 1000" produces the expected unit.
    capturedValue = found.groups(1)[0]
    duration = int(capturedValue) * 1000
    Log.Debug(' ... parsed duration: "%s"' % str(duration))
    metadata.duration = duration
  except:
    common.logException('unable to parse duration')
Ejemplo n.º 8
0
def parseOriginallyAvailableInfo(infoRowElem, metadata):
  """Read the release date from an info-table row.

  The first anchor text under `infoRowElem` is expected to consist of three
  whitespace-separated tokens: day, month name and year.  The month name is
  translated through RU_MONTH, a single-character day is zero-padded, and
  the resulting "year-month-day" string is parsed with Datetime.ParseDate.
  The date part is stored in `metadata.originally_available_at`.

  Nothing happens when the row has no anchor text.  Any failure is reported
  through common.logException and otherwise ignored.
  """
  dateNodes = infoRowElem.xpath('.//a/text()')
  if not len(dateNodes):
    return

  try:
    (day, monthName, year) = dateNodes[0].split()
    if len(day) == 1:
      day = '0' + day
    month = RU_MONTH[monthName]
    releaseDate = Datetime.ParseDate('-'.join((year, month, day))).date()
    Log.Debug(' ... parsed originally available date: "%s"' % str(releaseDate))
    metadata.originally_available_at = releaseDate
  except:
    common.logException('unable to parse originally available date')
Ejemplo n.º 9
0
 def updateMediaItem(self, metadata, kinoPoiskId):
   """Refresh `metadata` for a title from its KinoPoisk pages.

   Requests the title page for `kinoPoiskId`.  When that request yields
   None the metadata object is left untouched.  Otherwise the metadata is
   reset and the info-table, studio, cast, posters and stills parsers are
   run in that order.  Any exception raised by those steps is reported
   through common.logException and not re-raised.
   """
   titlePage = common.getElementFromHttpRequest(
     S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId,
     S.ENCODING_KINOPOISK_PAGE)
   if titlePage is None:
     # Don't update anything if the title page failed to load.
     return

   LOGGER.Debug('SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
   try:
     self.resetMediaMetadata(metadata)
     # Title, original title, ratings, and more.
     self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)
     # Studio.
     self.parseStudioPageData(metadata, kinoPoiskId)
     # Actors, etc.
     self.parseCastPageData(titlePage, metadata, kinoPoiskId)
     # Posters.
     self.parsePostersPageData(metadata, kinoPoiskId)
     # Background art (stills).
     self.parseStillsPageData(metadata, kinoPoiskId)
   except:
     common.logException('failed to update metadata for id %s' % kinoPoiskId)
Ejemplo n.º 10
0
 def updateMediaItem(self, metadata, kinoPoiskId):
   """Refresh `metadata` for a title from its KinoPoisk pages.

   Requests the title page for `kinoPoiskId`.  When that request yields
   None the metadata object is left untouched.  Otherwise the metadata is
   reset and the individual parsers (title, original title, summary,
   rating, info table, studio, people, posters, background art) are run in
   that order.  Any exception raised by those steps is reported through
   common.logException and not re-raised.
   """
   titlePage = common.getElementFromHttpRequest(
     KINOPOISK_TITLE_PAGE_URL % kinoPoiskId, ENCODING_KINOPOISK_PAGE)
   if titlePage is None:
     # Nothing to parse without the title page.
     return

   Log.Debug('got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
   try:
     resetMediaMetadata(metadata)
     # Title (in Russian).
     parseTitleInfo(titlePage, metadata)
     # Original title (in the original language).
     parseOriginalTitleInfo(titlePage, metadata)
     # Summary.
     parseSummaryInfo(titlePage, metadata)
     # Rating.
     parseRatingInfo(titlePage, metadata, kinoPoiskId)
     parseInfoTableTagAndUpdateMetadata(titlePage, metadata)
     # Studio.
     parseStudioInfo(metadata, kinoPoiskId)
     # Actors, etc.
     parsePeoplePageInfo(titlePage, metadata, kinoPoiskId)
     # Posters.
     parsePostersInfo(metadata, kinoPoiskId)
     # Background art.
     parseBackgroundArtInfo(metadata, kinoPoiskId)
   except:
     common.logException('failed to update metadata for id %s' % kinoPoiskId)
Ejemplo n.º 11
0
  def updateMediaItem(self, metadata, kinoPoiskId, lang):
    """Refresh `metadata` for a title from its KinoPoisk pages.

    Requests the title page for `kinoPoiskId`.  When that request yields
    None the metadata object is left untouched.  Otherwise the info table
    is parsed, a supplementary title id is looked up from the parsed title
    and year, and the studio, cast and image parsers are run.  Any
    exception raised by those steps is reported through
    common.logException and not re-raised.
    """
    titlePage = common.getElementFromHttpRequest(
      S.KINOPOISK_TITLE_PAGE_URL % kinoPoiskId, S.ENCODING_KINOPOISK_PAGE)
    if titlePage is None:
      # Don't update anything if the title page failed to load.
      return

    LOGGER.Debug('SUCCESS: got a KinoPoisk page for movie title id: "%s"' % kinoPoiskId)
    try:
      # Title, original title, ratings, and more.
      self.parseInfoTableTagAndUpdateMetadata(titlePage, metadata)

      # Search for a movie on TMDb to supplement our results with more data.
      # IMPORTANT: this has to happen after
      # parseInfoTableTagAndUpdateMetadata, which populates the title and
      # the year on the metadata object.
      tmdbId = self.searchForImdbTitleId(metadata.title, metadata.year)

      # Studio.
      self.parseStudioPageData(metadata, kinoPoiskId)
      # Actors, etc.
      self.parseCastPageData(titlePage, metadata, kinoPoiskId)
      # Posters and background art.
      self.updateImagesMetadata(metadata, kinoPoiskId, tmdbId, lang)
    except:
      common.logException('failed to update metadata for id %s' % kinoPoiskId)
Ejemplo n.º 12
0
def fetchImageDataPages(urlTemplate, kinoPoiskId, maxPages):
  """Fetch up to `maxPages` pages of a paginated image gallery.

  `urlTemplate` is formatted with `(kinoPoiskId, pageIndex)`.  The first
  page is always requested; when it cannot be loaded an empty list is
  returned.  If more than one page is allowed, the number of the last page
  is read from the href of the last "arr" navigator link on the first page
  and pages 2..min(lastPage, maxPages) are requested as well.  Pages that
  fail to load are skipped.

  Returns the list of loaded page elements, in page order.  Errors raised
  while fetching the additional pages are reported through
  common.logException and the pages collected so far are returned.
  """
  pages = []
  page = common.getElementFromHttpRequest(urlTemplate % (kinoPoiskId, 1), ENCODING_KINOPOISK_PAGE)
  if page is None:
    return pages
  pages.append(page)
  if maxPages <= 1:
    return pages

  anchorElems = page.xpath('//div[@class="navigator"]/ul/li[@class="arr"]/a')
  if not len(anchorElems):
    # No navigator on the page, so there is nothing else to fetch.
    return pages

  nav = parseXpathElementValue(anchorElems[-1], './attribute::href')
  if not nav:
    # NOTE(review): presumably the helper may return None/empty when the
    # attribute is missing; without an href there is no page count to read.
    return pages

  match = re.search(r'page\/(\d+?)\/$', nav)
  if match is None:
    return pages

  try:
    # Bound the range itself, so the limit also holds when the page at
    # index `maxPages` fails to load.
    lastPageIndex = min(int(match.group(1)), maxPages)
    for pageIndex in range(2, lastPageIndex + 1):
      page = common.getElementFromHttpRequest(urlTemplate % (kinoPoiskId, pageIndex), ENCODING_KINOPOISK_PAGE)
      if page is not None:
        pages.append(page)
  except:
    common.logException('unable to parse image art page')
  return pages
Ejemplo n.º 13
0
def updateImageMetadata(pages, metadata, maxImages, isPoster, thumb):
  """Pick the best images from `pages` and store them on `metadata`.

  The candidate list starts with `thumb` (when it is not None).  If more
  than one image is requested, thumbnails are additionally parsed from the
  passed pages until the parsing budget (`maxImages` minus the candidates
  already present, plus two extras) is used up.  Candidates are then sorted
  by score, highest first, and cut down to `maxImages`.

  The selected images are written to `metadata.posters` when `isPoster` is
  true and to `metadata.art` otherwise, keyed by the full image URL.  The
  preview is requested from the thumbnail URL, or from the full image URL
  when there is no thumbnail URL.  Images whose preview cannot be generated
  are logged and skipped.  Finally the container is validated against the
  keys that were stored.
  """
  candidates = []
  if thumb is not None:
    candidates.append(thumb)

  if maxImages > 1:
    # Parsing URLs from the passed pages; ask for a couple of extras so
    # that there is something to choose from.
    budget = maxImages - len(candidates) + 2
    for page in pages:
      budget = parseImageDataFromPhotoTableTag(page, candidates, isPoster, budget)
      if not budget:
        break

  # Order results by score and chop off the extraneous images.
  candidates.sort(key=lambda t: t.score, reverse=True)
  candidates = candidates[0:maxImages]
  if IS_DEBUG:
    common.printImageSearchResults(candidates)

  # Now, walk over the top N (<max) results and update metadata.
  imagesContainer = metadata.posters if isPoster else metadata.art
  validNames = []
  for result in candidates:
    previewUrl = result.fullImgUrl if result.thumbImgUrl is None else result.thumbImgUrl
    try:
      # The sort order is the number of images stored successfully so far.
      imagesContainer[result.fullImgUrl] = Proxy.Preview(
        HTTP.Request(previewUrl), sort_order=len(validNames))
      validNames.append(result.fullImgUrl)
    except:
      common.logException('Error generating preview for: "%s".' % str(previewUrl))
  imagesContainer.validate_keys(validNames)
Ejemplo n.º 14
0
def parsePeoplePageInfo(titlePage, metadata, kinoPoiskId):
  """ Parses people - mostly actors - for a title and adds them to metadata.

      The main actors are first read from the title page into a map.  The
      dedicated 'people' page is then walked element by element: a <table>
      element announces the type of the persons that follow, and each <div>
      element describes one person.  Only actors are collected here.

      Actors are added to metadata in this order:
        1. main actors found on the 'people' page (with their roles);
        2. main actors NOT found on the 'people' page (without a role);
        3. all other actors, if PREFS.getAllActors is set.

      When the 'people' page cannot be loaded, the main actors from the
      title page are added without roles.  Person entries that carry no
      name are skipped.  Errors while parsing a single element are reported
      through common.logException and parsing continues with the next one.
  """
  # First, parse actors from the main title page.
  parseAllActors = PREFS.getAllActors
  actorsMap = parseActorsInfoIntoMap(titlePage)
  mainActors = []
  otherActors = []

  # Now, parse a dedicated 'people' page.
  page = common.getElementFromHttpRequest(KINOPOISK_PEOPLE % kinoPoiskId, ENCODING_KINOPOISK_PAGE)
  if page is None:
    Log.Debug('NO people page')
    for actorName in actorsMap.keys():
      addActorToMetadata(metadata, actorName, None)
    return

  personType = None
  peopleTags = page.xpath('//div[@id="content_block"]/table/tr/td/div[@class="block_left"]/*')
  for peopleTagElem in peopleTags:
    try:
      if peopleTagElem.tag == 'table':
        # A section header: it decides the type of the persons that follow.
        personType = None
        tagElems = peopleTagElem.xpath('./tr/td[@style="padding-left:20px;border-bottom:2px solid #f60;font-size:16px"]/text()')
        if len(tagElems):
          tagName = tagElems[0]
          if tagName == u'Актеры':
            personType = 'actor'
          elif tagName in (u'Директора фильма', u'Режиссеры'):
            personType = 'director'
          elif tagName == u'Сценаристы':
            personType = 'writer'
          elif tagName in (u'Операторы', u'Монтажеры', u'Композиторы', u'Художники'):
            # Skip these tags for now.
            Log.Debug('skipping an unsupported tag "%s"' % tagName)
          else:
            Log.Debug('skipping an unknown tag "%s"' % tagName)
      elif peopleTagElem.tag == 'div':
        if personType != 'actor':
          # Only actors are collected from this page.
          continue
        personNameElems = peopleTagElem.xpath('./div/div/div[@class="name"]/a/text()')
        if not len(personNameElems):
          # Without a name there is nothing meaningful to add to metadata.
          Log.Debug(' . . . . skipping a person entry without a name')
          continue
        personName = personNameElems[0]
        actorRoleElems = peopleTagElem.xpath('./div/div/div[@class="role"]/text()')
        if len(actorRoleElems):
          # NOTE(review): str() on non-ASCII text depends on the default
          # encoding under Python 2 - confirm it is safe in this runtime.
          roleName = str(actorRoleElems[0]).strip().strip('. ')
          if personName in actorsMap:
            Log.Debug(' . . . . parsed main actor "%s" with role "%s"' % (personName, roleName))
            mainActors.append((personName, roleName))
            # Remove it, so that only the main actors that were not found
            # on this page remain in the map.
            del actorsMap[personName]
          elif parseAllActors:
            Log.Debug(' . . . . parsed other actor "%s" with role "%s"' % (personName, roleName))
            otherActors.append((personName, roleName))
      else:
        personType = None
    except:
      common.logException('unable to parse a people tag')

  # Adding main actors that were found on the 'people' page.
  for personName, roleName in mainActors:
    addActorToMetadata(metadata, personName, roleName)
  # Adding main actors that were NOT found on the 'people' page.
  for actorName in actorsMap.keys():
    addActorToMetadata(metadata, actorName, None)
  # Adding other actors if requested.
  for personName, roleName in otherActors:
    addActorToMetadata(metadata, personName, roleName)
Ejemplo n.º 15
0
  def search(self, results, media, lang, manual=False):
    """ Searches for matches on KinoPoisk using the title and year
        passed via the media object. All matches are saved in a list of results
        as MetadataSearchResult objects. For each result, we determine a
        page id, title, year, and the score (how good we think the match
        is on the scale of 1 - 100).

        `results` is appended to and sorted in place (best score first).
        A failure to parse a single search result is reported through
        common.logException and does not stop the remaining results from
        being processed. `manual` is accepted but not used here.
    """
    Log.Debug('SEARCH START <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    mediaName = media.name
    mediaYear = media.year
    Log.Debug('searching for name="%s", year="%s", guid="%s", hash="%s"...' %
        (str(mediaName), str(mediaYear), str(media.guid), str(media.hash)))
    # Fetch the search results page.
    Log.Debug('quering kinopoisk...')

    quotedName = urllib.quote(mediaName.encode(ENCODING_KINOPOISK_PAGE))
    Log.Debug('Loading page "%s"' % quotedName)
    page = common.getElementFromHttpRequest(KINOPOISK_SEARCH % quotedName, ENCODING_KINOPOISK_PAGE)

    if page is None:
      Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
    else:
      # The page was loaded: take the list of all movie titles from it.
      Log.Debug('got a kinopoisk page to parse...')
      divInfoElems = page.xpath('//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..')
      if len(divInfoElems):
        Log.Debug('found %d results' % len(divInfoElems))
        # The position of a result on the page is passed to the scorer.
        for itemIndex, divInfoElem in enumerate(divInfoElems):
          try:
            anchorFilmElem = divInfoElem.xpath('./a[contains(@href,"/level/1/film/")]/attribute::href')
            if not len(anchorFilmElem):
              Log.Warn('unable to find film anchor elements for title "%s"' % mediaName)
              continue

            # Parse kinopoisk movie title id, title and year.
            match = re.search(r'\/film\/(.+?)\/', anchorFilmElem[0])
            if match is None:
              Log.Error('unable to parse movie title id')
              continue

            kinoPoiskId = match.group(1)
            title = common.getXpathRequiredNode(divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()')
            year = common.getXpathOptionalNode(divInfoElem, './/span[@class="year"]/text()')

            # Try to parse the alternative (original) title. Ignore failures.
            # This is a <span> below the title <a> tag.
            # It is reset for every result, so that a failure here cannot
            # leak the alternative title of a previous result into scoring.
            altTitle = None
            try:
              altTitle = common.getXpathOptionalNode(divInfoElem, '../span[1]/text()')
              if altTitle is not None:
                altTitle = altTitle.split(',')[0].strip()
            except:
              altTitle = None
              Log.Debug('unable to parse the alternative title - ignoring')

            score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex)
            results.Append(MetadataSearchResult(id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
          except:
            common.logException('failed to parse div.info container')
      else:
        Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
        # TODO(zhenya): investigate whether we need to handle the case when
        # the site answers with the movie page itself instead of a list of
        # search results (haven't seen this happening).

    # Sort results according to their score.
    results.Sort('score', descending=True)
    if IS_DEBUG:
      common.printSearchResults(results)
    Log.Debug('SEARCH END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')