Example #1
0
    def parseImageDataFromAnchorElement(self, anchorElem, index):
        """ Builds a Thumbnail from an anchor element of a posters page.

        The thumbnail URL is read from the src attribute of the anchor's
        nested <img>. The page the anchor links to is then fetched and its
        <img id="image"> element supplies the full size URL and the
        dimensions. When no full size URL is found, the thumbnail URL is
        used in its place.

        @param anchorElem anchor element wrapping the poster's thumbnail.
        @param index passed through to the common.Thumbnail constructor.
        @return common.Thumbnail or None if neither a thumbnail nor a full
                size image URL could be found.
        """
        fullSizeUrl = None
        dimensions = None, None

        # Read thumbnail image url from the <img> src tag attribute.
        thumbUrl = common.getXpathOptionalNode(anchorElem,
                                               './img/attribute::src')
        if thumbUrl is not None:
            thumbUrl = ensureAbsoluteUrl(thumbUrl.strip())

        # Fetch and parse individual page with image. An anchor without an
        # href has no page to fetch, so the request is skipped and the
        # thumbnail fallback below takes over instead of failing on None.
        posterPage = None
        posterPageHref = anchorElem.get('href')
        if posterPageHref is None:
            self.log.Debug('anchor has no href, skipping image page fetch.')
        else:
            posterPageUrl = ensureAbsoluteUrl(posterPageHref.strip())
            self.log.Debug('fetching image page: "%s".' % posterPageUrl)
            posterPage = self.httpUtils.requestAndParseHtmlPage(posterPageUrl)

        if posterPage is not None:
            imageElem = common.getXpathOptionalNode(posterPage,
                                                    '//img[@id="image"]')
            if imageElem is not None:
                fullSizeUrl = imageElem.get('src')
                dimensions = parseImageElemDimensions(imageElem)

        # If we have no full size image URL, we could use the thumb's.
        if fullSizeUrl is None and thumbUrl is not None:
            self.log.Debug(
                ' - found no full size image, will use the thumbnail')
            fullSizeUrl = thumbUrl

        # After the fallback above, a missing full size URL implies that
        # the thumbnail URL is missing as well.
        if fullSizeUrl is None:
            return None

        thumb = common.Thumbnail(thumbUrl, ensureAbsoluteUrl(fullSizeUrl),
                                 dimensions[0], dimensions[1], index, 0,
                                 'null')
        if self.isDebug:
            self.log.Debug(' ... parsed a thumbnail:')
            # Parenthesized single argument prints the same text whether
            # print is a statement or a function.
            print('    ' + str(thumb))
        return thumb
Example #2
0
def searchForImdbTitles(mediaName, mediaYear, lang):
  """ Queries TMDB for movies matching the given media name.

      The lower-cased media name (spaces replaced with %20) is substituted
      into TMDB_GETINFO. Every //movies/movie element of the returned page
      is scored against the media name and year with
      common.scoreMediaTitleMatch, using the element's position as the
      item index.

      @param mediaName media name to look up.
      @param mediaYear media year, passed to the scoring function.
      @param lang not used by this function.
      @return list of dicts with 'id', 'name', 'year' and 'score' keys;
              empty when the request produced no page.
  """
  mediaName = mediaName.lower()
  requestUrl = TMDB_GETINFO % mediaName.replace(' ', '%20')
  page = common.getElementFromHttpRequest(requestUrl, TMDB_PAGE_ENCODING)
  if page is None:
    Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)
    return []

  found = []
  for position, movieNode in enumerate(page.xpath('//movies/movie')):
    # The id and the name are mandatory; the remaining fields are optional.
    imdbId = common.getXpathRequiredNode(movieNode, './imdb_id/text()')
    title = common.getXpathRequiredNode(movieNode, './name/text()')
    altTitle = common.getXpathOptionalNode(movieNode, './alternative_name/text()')
    released = common.getXpathOptionalNode(movieNode, './released/text()')
    year = common.getReOptionalGroup(MATCHER_RELEASED, released, 0)
    score = common.scoreMediaTitleMatch(
        mediaName, mediaYear, title, altTitle, year, position)
    entry = {'id': imdbId, 'name': title, 'year': year, 'score': score}
    found.append(entry)
  return found
  def parseImageDataFromAnchorElement(self, anchorElem, index):
    """ Builds a Thumbnail from an anchor element of a posters page.

        The thumbnail URL is read from the src attribute of the anchor's
        nested <img>. The page the anchor links to is then fetched and its
        <img id="image"> element supplies the full size URL and the
        dimensions. When no full size URL is found, the thumbnail URL is
        used in its place.

        @param anchorElem anchor element wrapping the poster's thumbnail.
        @param index passed through to the common.Thumbnail constructor.
        @return common.Thumbnail or None if neither a thumbnail nor a full
                size image URL could be found.
    """
    fullSizeUrl = None
    dimensions = None, None

    # Read thumbnail image url from the <img> src tag attribute.
    thumbUrl = common.getXpathOptionalNode(anchorElem, './img/attribute::src')
    if thumbUrl is not None:
      thumbUrl = ensureAbsoluteUrl(thumbUrl.strip())

    # Fetch and parse individual page with image. An anchor without an href
    # has no page to fetch, so the request is skipped and the thumbnail
    # fallback below takes over instead of failing on None.
    posterPage = None
    posterPageHref = anchorElem.get('href')
    if posterPageHref is None:
      self.log.Debug('anchor has no href, skipping image page fetch.')
    else:
      posterPageUrl = ensureAbsoluteUrl(posterPageHref.strip())
      self.log.Debug('fetching image page: "%s".' % posterPageUrl)
      posterPage = self.httpUtils.requestAndParseHtmlPage(posterPageUrl)

    if posterPage is not None:
      imageElem = common.getXpathOptionalNode(posterPage, '//img[@id="image"]')
      if imageElem is not None:
        fullSizeUrl = imageElem.get('src')
        dimensions = parseImageElemDimensions(imageElem)

    # If we have no full size image URL, we could use the thumb's.
    if fullSizeUrl is None and thumbUrl is not None:
      self.log.Debug(' - found no full size image, will use the thumbnail')
      fullSizeUrl = thumbUrl

    # After the fallback above, a missing full size URL implies that the
    # thumbnail URL is missing as well.
    if fullSizeUrl is None:
      return None

    thumb = common.Thumbnail(thumbUrl, ensureAbsoluteUrl(fullSizeUrl),
      dimensions[0], dimensions[1], index, 0, 'null')
    if self.isDebug:
      self.log.Debug(' ... parsed a thumbnail:')
      # Parenthesized single argument prints the same text whether print is
      # a statement or a function.
      print('    ' + str(thumb))
    return thumb
Example #4
0
  def search(self, results, media, lang, manual=False):
    """ Searches for matches on KinoPoisk using the title and year
        passed via the media object. All matches are saved in a list of results
        as MetadataSearchResult objects. For each result, we determine a
        page id, title, year, and the score (how good we think the match
        is on the scale of 1 - 100).

        @param results container the matches are appended to; it is sorted
               by score (descending) before the method returns.
        @param media object providing name, year, guid and hash.
        @param lang language stored on every appended result.
        @param manual not used by this method.
    """
    Log.Debug('SEARCH START <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    mediaName = media.name
    mediaYear = media.year
    Log.Debug('searching for name="%s", year="%s", guid="%s", hash="%s"...' %
        (str(mediaName), str(mediaYear), str(media.guid), str(media.hash)))
    # Fetch the search results page.
    Log.Debug('quering kinopoisk...')

    # Quote the encoded name once; it is used for the request and the log.
    quotedName = urllib.quote(mediaName.encode(ENCODING_KINOPOISK_PAGE))
    page = common.getElementFromHttpRequest(KINOPOISK_SEARCH % quotedName, ENCODING_KINOPOISK_PAGE)
    Log.Debug('Loading page "%s"' % quotedName)

    if page is None:
      Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
    else:
      # The page was fetched; collect every film title entry from it.
      Log.Debug('got a kinopoisk page to parse...')
      divInfoElems = page.xpath('//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..')
      if divInfoElems:
        Log.Debug('found %d results' % len(divInfoElems))
        # The index advances for every entry, including those that fail to
        # parse, so it always reflects the position on the page.
        for itemIndex, divInfoElem in enumerate(divInfoElems):
          try:
            anchorFilmElem = divInfoElem.xpath('./a[contains(@href,"/level/1/film/")]/attribute::href')
            if not anchorFilmElem:
              Log.Warn('unable to find film anchor elements for title "%s"' % mediaName)
              continue

            # Parse kinopoisk movie title id, title and year.
            match = re.search(r'/film/(.+?)/', anchorFilmElem[0])
            if match is None:
              Log.Error('unable to parse movie title id')
              continue

            kinoPoiskId = match.group(1)
            title = common.getXpathRequiredNode(divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()')
            year = common.getXpathOptionalNode(divInfoElem, './/span[@class="year"]/text()')

            # Try to parse the alternative (original) title. Ignore failures.
            # This is a <span> below the title <a> tag. The value is reset
            # for every entry so that a failure here can never reuse the
            # alternative title of a previous result.
            altTitle = None
            try:
              altTitle = common.getXpathOptionalNode(divInfoElem, '../span[1]/text()')
              if altTitle is not None:
                altTitle = altTitle.split(',')[0].strip()
            except Exception:
              altTitle = None
              Log.Debug('failed to parse alternative title, ignoring it')

            score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex)
            results.Append(MetadataSearchResult(id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
          except Exception:
            common.logException('failed to parse div.info container')
      else:
        Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
        # TODO(zhenya): investigate whether we need this clause at all
        # (haven't seen this happening).
        # A disabled fallback used to live here. It assumed that, when no
        # title entries are found, the site returned the film page directly,
        # and it parsed the title from h1.moviename-big, the film id from a
        # <link> whose href contains "/film/" and the year from an <a> whose
        # href contains "year".

    # Sort results according to their score.
    results.Sort('score', descending=True)
    if IS_DEBUG:
      common.printSearchResults(results)
    Log.Debug('SEARCH END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')