def searchForImdbTitles(mediaName, mediaYear, lang): """ Given media name and a candidate title, returns the title result score penalty. """ mediaName = mediaName.lower() page = common.getElementFromHttpRequest(TMDB_GETINFO % mediaName.replace(' ', '%20'), TMDB_PAGE_ENCODING) matches = [] if page is None: Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName) else: movieElems = page.xpath('//movies/movie') itemIndex = 0 for movieElem in movieElems: try: imdbId = common.getXpathRequiredText(movieElem, './imdb_id/text()') title = common.getXpathRequiredText(movieElem, './name/text()') altTitle = common.getXpathOptionalText(movieElem, './alternative_name/text()') releaseDate = common.getXpathOptionalText(movieElem, './released/text()') year = common.getReOptionalGroup(MATCHER_RELEASED, releaseDate, 0) score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex) matches.append({'id': imdbId, 'name': title, 'year': year, 'score': score}) itemIndex += 1 except: Log.Warn('failed to parse movie element') return matches
def parseStringFromText(self, data, elem, path, key, sanitizeChars=None, isInteger=False, isFloat=False): try: item = common.getXpathOptionalText(elem, path) if item is None: self.log.Warn(' ooo unable to parse "%s"' % key) else: item = sanitizeString(item).encode('utf8').strip() if sanitizeChars is not None: item = item.strip(sanitizeChars) if isInteger: item = removeWhiteSpace(item) data[key] = int(item) elif isFloat: item = removeWhiteSpace(item) data[key] = float(item) else: data[key] = item self.log.Debug(' ... parsed "%s": "%s"' % (key, item)) except: self.logException(' ### unable to parse string for key "%s"' % key)
def parseContentRatingAltInfo(self, data, infoRowElem): spanText = common.getXpathOptionalText(infoRowElem, "./td/span/text()") if spanText is not None: try: match = re.search(".*?(\d+).*?$", spanText) if match is not None: contentRating = match.groups(1)[0] data["contentRatingAlt"] = contentRating + "+" self.log.Debug(' ... parsed content rating alt "%s"' % contentRating) except: self.logException(" ### unable to parse content rating alt")
def parseContentRatingAltInfo(self, data, infoRowElem): spanText = common.getXpathOptionalText(infoRowElem, './td/span/text()') if spanText is not None: try: match = re.search('.*?(\d+).*?$', spanText) if match is not None: contentRating = match.groups(1)[0] data['contentRatingAlt'] = contentRating + '+' self.log.Debug(' ... parsed content rating alt "%s"' % contentRating) except: self.logException(' ### unable to parse content rating alt')
def searchForImdbTitles(mediaName, mediaYear, lang): """ Given media name and a candidate title, returns the title result score penalty. """ mediaName = mediaName.lower() page = common.getElementFromHttpRequest( TMDB_GETINFO % mediaName.replace(' ', '%20'), TMDB_PAGE_ENCODING) matches = [] if page is None: Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName) else: movieElems = page.xpath('//movies/movie') itemIndex = 0 for movieElem in movieElems: try: imdbId = common.getXpathRequiredText(movieElem, './imdb_id/text()') title = common.getXpathRequiredText(movieElem, './name/text()') altTitle = common.getXpathOptionalText( movieElem, './alternative_name/text()') releaseDate = common.getXpathOptionalText( movieElem, './released/text()') year = common.getReOptionalGroup(MATCHER_RELEASED, releaseDate, 0) score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex) matches.append({ 'id': imdbId, 'name': title, 'year': year, 'score': score }) itemIndex += 1 except: Log.Warn('failed to parse movie element') return matches
def parseStringFromText(self, data, elem, path, key, sanitizeChars=None, isInteger=False, isFloat=False): try: item = common.getXpathOptionalText(elem, path) if item is None: self.log.Warn(' ooo unable to parse "%s"' % key) else: item = sanitizeString(item).encode("utf8").strip() if sanitizeChars is not None: item = item.strip(sanitizeChars) if isInteger: item = removeWhiteSpace(item) data[key] = int(item) elif isFloat: item = removeWhiteSpace(item) data[key] = float(item) else: data[key] = item self.log.Debug(' ... parsed "%s": "%s"' % (key, item)) except: self.logException(' ### unable to parse string for key "%s"' % key)
def queryKinoPoisk(self, mediaName, mediaYear): """ Ищет фильм на кинопоиске. Returns title results as they are returned (no sorting is done here!). """ results = [] encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE)) page = self.httpUtils.requestAndParseHtmlPage( S.KINOPOISK_SEARCH_SIMPLE % encodedName) if page is None: self.log.Warn( ' ### nothing was found on kinopoisk for media name "%s"' % mediaName) return results # Страница получена, берем с нее перечень всех названий фильмов. self.log.Debug('got a KinoPoisk query results page to parse...') divInfoElems = page.xpath( '//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]' ) # Если не нашли там текст названия, значит сайт сразу дал нам страницу с фильмом (хочется верить). # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening). if not len(divInfoElems): self.log.Warn( 'nothing was found on kinopoisk for media name "%s"' % mediaName) try: itemTitle = common.getXpathOptionalText( page, '//h1[@class="moviename-big"]/text()') if itemTitle is not None: itemKinoPoiskId = re.search( '\/film\/(.+?)\/', page.xpath( './/link[contains(@href, "/film/")]/attribute::href' )[0]).groups(0)[0] itemYear = common.parseYearFromString( page.xpath('//a[contains(@href,"year")]/text()')[0]) itemAltTitle = None # TODO: parse original title. itemScore = common.scoreMediaTitleMatch( mediaName, mediaYear, itemTitle, itemAltTitle, itemYear, 0) results.append( [itemKinoPoiskId, itemTitle, itemYear, itemScore]) except: self.logException( 'failed to parse a KinoPoisk query results page') return results # Inspect query results titles and score them. itemIndex = -1 self.log.Debug('found %d results (div info tags)' % len(divInfoElems)) for divInfoElem in divInfoElems: itemIndex += 1 try: anchorFilmElem = divInfoElem.xpath( './/a[contains(@href,"/level/1/film/")]/attribute::href') if not len(anchorFilmElem): self.log.Warn( 'unable to find film anchor elements for title "%s"' % mediaName) continue # Parse kinopoisk movie title id, title and year. match = re.search('\/film\/(.+?)\/', anchorFilmElem[0]) if match is None: self.log.Error('unable to parse movie title id') continue itemKinoPoiskId = match.groups(1)[0] itemTitle = common.getXpathRequiredText( divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()') itemYear = common.parseYearFromString( common.getXpathOptionalText( divInfoElem, './/span[@class="year"]/text()')) itemAltTitle = None try: # Try to parse the alternative (original) title. Ignore failures. # This is a <span> below the title <a> tag. altTitleCandidate = common.getXpathOptionalText( divInfoElem, './/span[@class="gray"]/text()') if altTitleCandidate is not None: # Strip any non alpha character in front (unfortunately, this may also remove a leading part # of a movie title if it starts with a digit). altTitleCandidate = MATCHER_LEADING_NONALPHA.sub( '', altTitleCandidate).rstrip() if len(altTitleCandidate) > 0: itemAltTitle = altTitleCandidate except: pass # self.log.Debug(' ... kinoPoiskId="%s"; title="%s"; year="%s"...' % (itemKinoPoiskId, itemTitle, str(itemYear))) itemScore = common.scoreMediaTitleMatch( mediaName, mediaYear, itemTitle, itemAltTitle, itemYear, itemIndex) results.append( [itemKinoPoiskId, itemTitle, itemYear, itemScore]) except: self.logException('failed to parse div.info container') return results
def fetchAndParseSearchResults(self, mediaName, mediaYear): """ Searches for movie titles on KinoPoisk. @param mediaName Movie title parsed from a filename. @param mediaName Movie year parsed from a filename. @return Array of tuples: [kinoPoiskId, title, year, score] """ self.log.Info('Quering kinopoisk...') results = [] encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE)) self.log.Debug('Loading page "%s"' % encodedName) page = self.httpUtils.requestAndParseHtmlPage( S.KINOPOISK_SEARCH_SIMPLE % encodedName) if page is None: self.log.Warn( ' ### nothing was found on kinopoisk for media name "%s"' % mediaName) else: # Если страница получена, берем с нее перечень всех названий фильмов. self.log.Debug('got a kinopoisk results page to parse...') # Pick all divs with class "info" that have specific children (/p/a/ etc). # divInfoElems = page.xpath('//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..') divInfoElems = page.xpath( '//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]' ) itemIndex = 0 if len(divInfoElems): self.log.Debug('found %d results (div info tags)' % len(divInfoElems)) for divInfoElem in divInfoElems: try: anchorFilmElem = divInfoElem.xpath( './/a[contains(@href,"/level/1/film/")]/attribute::href' ) if len(anchorFilmElem): # Parse kinopoisk movie title id, title and year. match = re.search('\/film\/(.+?)\/', anchorFilmElem[0]) if match is None: self.log.Error( 'unable to parse movie title id') else: kinoPoiskId = match.groups(1)[0] title = common.getXpathRequiredText( divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()' ) year = common.getXpathOptionalText( divInfoElem, './/span[@class="year"]/text()') # Try to parse the alternative (original) title. Ignore failures. # This is a <span> below the title <a> tag. altTitle = None try: altTitleCandidate = common.getXpathOptionalText( divInfoElem, './/span[@class="gray"]/text()') if altTitleCandidate is not None: # Strip any non alpha character in front (unfortunately, this may also remove a leading part # of a movie title if it starts with a digit). altTitleCandidate = MATCHER_LEADING_NONALPHA.sub( '', altTitleCandidate).rstrip() if len(altTitleCandidate) > 0: altTitle = altTitleCandidate except: pass self.log.Debug( ' ... kinoPoiskId="%s"; title="%s"; year="%s"...' % (kinoPoiskId, title, str(year))) score = common.scoreMediaTitleMatch( mediaName, mediaYear, title, altTitle, year, itemIndex) results.append( [kinoPoiskId, title, year, score]) else: self.log.Warn( 'unable to find film anchor elements for title "%s"' % mediaName) except: self.logException('failed to parse div.info container') itemIndex += 1 else: self.log.Warn( 'nothing was found on kinopoisk for media name "%s"' % mediaName) # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening). # Если не нашли там текст названия, значит сайт сразу дал нам страницу с фильмом (хочется верить =) try: title = common.getXpathOptionalText( page, '//h1[@class="moviename-big"]/text()') if title is not None: kinoPoiskId = re.search( '\/film\/(.+?)\/', page.xpath( './/link[contains(@href, "/film/")]/attribute::href' )[0]).groups(0)[0] year = page.xpath( '//a[contains(@href,"year")]/text()')[0].strip() altTitle = None # TODO: parse original title. score = common.scoreMediaTitleMatch( mediaName, mediaYear, title, altTitle, year, itemIndex) results.append([kinoPoiskId, title, year, score]) except: self.logException('failed to parse a KinoPoisk page') return results
def fetchAndParseSearchResultsFull(self, mediaName, mediaYear): self.log.Info('Quering kinopoisk...') results = [] encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE)) self.log.Debug('Loading page "%s"' % encodedName) page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH % encodedName) if page is None: self.log.Warn( ' ### nothing was found on kinopoisk for media name "%s"' % mediaName) else: # Если страница получена, берем с нее перечень всех названий фильмов. self.log.Debug('got a kinopoisk page to parse...') divInfoElems = page.xpath( '//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..' ) itemIndex = 0 altTitle = None if len(divInfoElems): self.log.Debug('found %d results' % len(divInfoElems)) for divInfoElem in divInfoElems: try: anchorFilmElem = divInfoElem.xpath( './a[contains(@href,"/level/1/film/")]/attribute::href' ) if len(anchorFilmElem): # Parse kinopoisk movie title id, title and year. match = re.search('\/film\/(.+?)\/', anchorFilmElem[0]) if match is None: self.log.Error( 'unable to parse movie title id') else: kinoPoiskId = match.groups(1)[0] title = common.getXpathRequiredText( divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()' ) year = common.getXpathOptionalText( divInfoElem, './/span[@class="year"]/text()') # Try to parse the alternative (original) title. Ignore failures. # This is a <span> below the title <a> tag. try: altTitle = common.getXpathOptionalText( divInfoElem, '../span[1]/text()') if altTitle is not None: altTitle = altTitle.split( ',')[0].strip() except: pass score = common.scoreMediaTitleMatch( mediaName, mediaYear, title, altTitle, year, itemIndex) results.append( [kinoPoiskId, title, year, score]) else: self.log.Warn( 'unable to find film anchor elements for title "%s"' % mediaName) except: self.logException('failed to parse div.info container') itemIndex += 1 else: self.log.Warn( 'nothing was found on kinopoisk for media name "%s"' % mediaName) # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening). # Если не нашли там текст названия, значит сайт сразу дал нам страницу с фильмом (хочется верить =) try: title = page.xpath( '//h1[@class="moviename-big"]/text()')[0].strip() kinoPoiskId = re.search( '\/film\/(.+?)\/', page.xpath( './/link[contains(@href, "/film/")]/attribute::href' )[0]).groups(0)[0] year = page.xpath( '//a[contains(@href,"year")]/text()')[0].strip() score = common.scoreMediaTitleMatch( mediaName, mediaYear, title, altTitle, year, itemIndex) results.append([kinoPoiskId, title, year, score]) except: self.logException('failed to parse a KinoPoisk page') return results
def fetchAndParseSearchResults(self, mediaName, mediaYear): """ Searches for movie titles on KinoPoisk. @param mediaName Movie title parsed from a filename. @param mediaName Movie year parsed from a filename. @return Array of tuples: [kinoPoiskId, title, year, score] """ self.log.Info("Quering kinopoisk...") results = [] encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE)) self.log.Debug('Loading page "%s"' % encodedName) page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % encodedName) if page is None: self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName) else: # Если страница получена, берем с нее перечень всех названий фильмов. self.log.Debug("got a kinopoisk results page to parse...") # Pick all divs with class "info" that have specific children (/p/a/ etc). # divInfoElems = page.xpath('//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..') divInfoElems = page.xpath('//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]') itemIndex = 0 if len(divInfoElems): self.log.Debug("found %d results (div info tags)" % len(divInfoElems)) for divInfoElem in divInfoElems: try: anchorFilmElem = divInfoElem.xpath('.//a[contains(@href,"/level/1/film/")]/attribute::href') if len(anchorFilmElem): # Parse kinopoisk movie title id, title and year. match = re.search("\/film\/(.+?)\/", anchorFilmElem[0]) if match is None: self.log.Error("unable to parse movie title id") else: kinoPoiskId = match.groups(1)[0] title = common.getXpathRequiredText( divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()' ) year = common.getXpathOptionalText(divInfoElem, './/span[@class="year"]/text()') # Try to parse the alternative (original) title. Ignore failures. # This is a <span> below the title <a> tag. altTitle = None try: altTitleCandidate = common.getXpathOptionalText( divInfoElem, './/span[@class="gray"]/text()' ) if altTitleCandidate is not None: # Strip any non alpha character in front (unfortunately, this may also remove a leading part # of a movie title if it starts with a digit). altTitleCandidate = MATCHER_LEADING_NONALPHA.sub("", altTitleCandidate).rstrip() if len(altTitleCandidate) > 0: altTitle = altTitleCandidate except: pass self.log.Debug( ' ... kinoPoiskId="%s"; title="%s"; year="%s"...' % (kinoPoiskId, title, str(year)) ) score = common.scoreMediaTitleMatch( mediaName, mediaYear, title, altTitle, year, itemIndex ) results.append([kinoPoiskId, title, year, score]) else: self.log.Warn('unable to find film anchor elements for title "%s"' % mediaName) except: self.logException("failed to parse div.info container") itemIndex += 1 else: self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName) # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening). # Если не нашли там текст названия, значит сайт сразу дал нам страницу с фильмом (хочется верить =) try: title = common.getXpathOptionalText(page, '//h1[@class="moviename-big"]/text()') if title is not None: kinoPoiskId = re.search( "\/film\/(.+?)\/", page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0] ).groups(0)[0] year = page.xpath('//a[contains(@href,"year")]/text()')[0].strip() altTitle = None # TODO: parse original title. score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex) results.append([kinoPoiskId, title, year, score]) except: self.logException("failed to parse a KinoPoisk page") return results
def fetchAndParseSearchResultsFull(self, mediaName, mediaYear): self.log.Info("Quering kinopoisk...") results = [] encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE)) self.log.Debug('Loading page "%s"' % encodedName) page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH % encodedName) if page is None: self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName) else: # Если страница получена, берем с нее перечень всех названий фильмов. self.log.Debug("got a kinopoisk page to parse...") divInfoElems = page.xpath( '//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..' ) itemIndex = 0 altTitle = None if len(divInfoElems): self.log.Debug("found %d results" % len(divInfoElems)) for divInfoElem in divInfoElems: try: anchorFilmElem = divInfoElem.xpath('./a[contains(@href,"/level/1/film/")]/attribute::href') if len(anchorFilmElem): # Parse kinopoisk movie title id, title and year. match = re.search("\/film\/(.+?)\/", anchorFilmElem[0]) if match is None: self.log.Error("unable to parse movie title id") else: kinoPoiskId = match.groups(1)[0] title = common.getXpathRequiredText( divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()' ) year = common.getXpathOptionalText(divInfoElem, './/span[@class="year"]/text()') # Try to parse the alternative (original) title. Ignore failures. # This is a <span> below the title <a> tag. try: altTitle = common.getXpathOptionalText(divInfoElem, "../span[1]/text()") if altTitle is not None: altTitle = altTitle.split(",")[0].strip() except: pass score = common.scoreMediaTitleMatch( mediaName, mediaYear, title, altTitle, year, itemIndex ) results.append([kinoPoiskId, title, year, score]) else: self.log.Warn('unable to find film anchor elements for title "%s"' % mediaName) except: self.logException("failed to parse div.info container") itemIndex += 1 else: self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName) # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening). # Если не нашли там текст названия, значит сайт сразу дал нам страницу с фильмом (хочется верить =) try: title = page.xpath('//h1[@class="moviename-big"]/text()')[0].strip() kinoPoiskId = re.search( "\/film\/(.+?)\/", page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0] ).groups(0)[0] year = page.xpath('//a[contains(@href,"year")]/text()')[0].strip() score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex) results.append([kinoPoiskId, title, year, score]) except: self.logException("failed to parse a KinoPoisk page") return results
def queryKinoPoisk(self, mediaName, mediaYear): """ Ищет фильм на кинопоиске. Returns title results as they are returned (no sorting is done here!). """ results = [] encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE)) page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % encodedName) if page is None: self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName) return results # Страница получена, берем с нее перечень всех названий фильмов. self.log.Debug('got a KinoPoisk query results page to parse...') divInfoElems = page.xpath('//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]') # Если не нашли там текст названия, значит сайт сразу дал нам страницу с фильмом (хочется верить). # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening). if not len(divInfoElems): self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName) try: itemTitle = common.getXpathOptionalText(page, '//h1[@class="moviename-big"]/text()') if itemTitle is not None: itemKinoPoiskId = re.search('\/film\/(.+?)\/', page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0]).groups(0)[0] itemYear = common.parseYearFromString(page.xpath('//a[contains(@href,"year")]/text()')[0]) itemAltTitle = None # TODO: parse original title. itemScore = common.scoreMediaTitleMatch(mediaName, mediaYear, itemTitle, itemAltTitle, itemYear, 0) results.append([itemKinoPoiskId, itemTitle, itemYear, itemScore]) except: self.logException('failed to parse a KinoPoisk query results page') return results # Inspect query results titles and score them. itemIndex = -1 self.log.Debug('found %d results (div info tags)' % len(divInfoElems)) for divInfoElem in divInfoElems: itemIndex += 1 try: anchorFilmElem = divInfoElem.xpath('.//a[contains(@href,"/level/1/film/")]/attribute::href') if not len(anchorFilmElem): self.log.Warn('unable to find film anchor elements for title "%s"' % mediaName) continue # Parse kinopoisk movie title id, title and year. match = re.search('\/film\/(.+?)\/', anchorFilmElem[0]) if match is None: self.log.Error('unable to parse movie title id') continue itemKinoPoiskId = match.groups(1)[0] itemTitle = common.getXpathRequiredText(divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()') itemYear = common.parseYearFromString(common.getXpathOptionalText(divInfoElem, './/span[@class="year"]/text()')) itemAltTitle = None try: # Try to parse the alternative (original) title. Ignore failures. # This is a <span> below the title <a> tag. altTitleCandidate = common.getXpathOptionalText(divInfoElem, './/span[@class="gray"]/text()') if altTitleCandidate is not None: # Strip any non alpha character in front (unfortunately, this may also remove a leading part # of a movie title if it starts with a digit). altTitleCandidate = MATCHER_LEADING_NONALPHA.sub('', altTitleCandidate).rstrip() if len(altTitleCandidate) > 0: itemAltTitle = altTitleCandidate except: pass # self.log.Debug(' ... kinoPoiskId="%s"; title="%s"; year="%s"...' % (itemKinoPoiskId, itemTitle, str(itemYear))) itemScore = common.scoreMediaTitleMatch(mediaName, mediaYear, itemTitle, itemAltTitle, itemYear, itemIndex) results.append([itemKinoPoiskId, itemTitle, itemYear, itemScore]) except: self.logException('failed to parse div.info container') return results