def _getitem(self, key): """Handle special keys.""" if 'episode of' in self.data: if key == 'long imdb episode title': return build_title(self.data) elif key == 'series title': return self._getSeriesTitle(self.data['episode of']) elif key == 'canonical series title': ser_title = self._getSeriesTitle(self.data['episode of']) return canonicalTitle(ser_title) elif key == 'smart canonical series title': ser_title = self._getSeriesTitle(self.data['episode of']) return self.smartCanonicalTitle(ser_title) elif key == 'episode title': return self.data.get('title', '') elif key == 'canonical episode title': return canonicalTitle(self.data.get('title', '')) elif key == 'smart canonical episode title': return self.smartCanonicalTitle(self.data.get('title', '')) if 'title' in self.data: if key == 'title': return self.data['title'] elif key == 'long imdb title': return build_title(self.data) elif key == 'canonical title': return canonicalTitle(self.data['title']) elif key == 'smart canonical title': return self.smartCanonicalTitle(self.data['title']) elif key == 'long imdb canonical title': return build_title(self.data, canonical=True) elif key == 'smart long imdb canonical title': return build_title(self.data, canonical=True, lang=self.guessLanguage()) if key == 'full-size cover url': return self.get_fullsizeURL() return None
def _getitem(self, key): """Handle special keys.""" if self.data.has_key('episode of'): if key == 'long imdb episode title': return build_title(self.data, canonical=0) elif key == 'series title': ser_title = self.data['episode of'].get('canonical title') or \ self.data['episode of']['title'] return normalizeTitle(ser_title) elif key == 'canonical series title': ser_title = self.data['episode of'].get('canonical title') or \ self.data['episode of']['title'] return ser_title elif key == 'episode title': return normalizeTitle(self.data.get('title', u'')) elif key == 'canonical episode title': return self.data.get('title', u'') if self.data.has_key('title'): if key == 'title': return normalizeTitle(self.data['title']) elif key == 'long imdb title': return build_title(self.data, canonical=0) elif key == 'canonical title': return self.data['title'] elif key == 'long imdb canonical title': return build_title(self.data, canonical=1) return None
def _getitem(self, key): """Handle special keys.""" if self.data.has_key('episode of'): if key == 'long imdb episode title': return build_title(self.data) elif key == 'series title': return self.data['episode of']['title'] elif key == 'canonical series title': ser_title = self.data['episode of']['title'] return canonicalTitle(ser_title) elif key == 'smart canonical series title': ser_title = self.data['episode of']['title'] return self.smartCanonicalTitle(ser_title) elif key == 'episode title': return self.data.get('title', u'') elif key == 'canonical episode title': return canonicalTitle(self.data.get('title', u'')) elif key == 'smart canonical episode title': return self.smartCanonicalTitle(self.data.get('title', u'')) if self.data.has_key('title'): if key == 'title': return self.data['title'] elif key == 'long imdb title': return build_title(self.data) elif key == 'canonical title': return canonicalTitle(self.data['title']) elif key == 'smart canonical title': return self.smartCanonicalTitle(self.data['title']) elif key == 'long imdb canonical title': return build_title(self.data, canonical=1) elif key == 'smart long imdb canonical title': return build_title(self.data, canonical=1, lang=self.guessLanguage()) return None
def titleVariations(title, fromPtdf=0):
    """Build the title variations used by searches.

    If fromPtdf is true, the input is assumed to be in the plain text
    data files format.

    Returns a (title1, title2, title3) tuple:
      title1 -- the canonical title;
      title2 -- title1 without its trailing article, or title1 unchanged;
      title3 -- the long imdb canonical title, or u'' when the input was
                just a title.
    """
    if fromPtdf or re_year_index.search(title):
        # A (year[/imdbIndex]) indication is present (or implied by the
        # ptdf format): treat the input as a long imdb canonical name.
        parsed = analyze_title(title, canonical=1)
        canonical = parsed['title']
        is_episode = parsed['kind'] == 'episode'
        if is_episode:
            canonical = normalizeTitle(canonical)
        if fromPtdf and not is_episode:
            # The ptdf input already is the long imdb canonical name.
            long_title = title
        else:
            long_title = build_title(parsed, canonical=1, ptdf=1)
    else:
        # Just a title.
        canonical = canonicalTitle(title)
        long_title = u''
    no_article = u''
    if canonical:
        no_article = canonical
        pieces = canonical.split(u', ')
        if pieces[-1].lower() in _unicodeArticles:
            no_article = u', '.join(pieces[:-1])
    return canonical, no_article, long_title
def titleVariations(title, fromPtdf=0):
    """Build the title variations used by searches.

    If fromPtdf is true, the input is assumed to be in the plain text
    data files format.

    Returns a (title1, title2, title3) tuple:
      title1 -- the canonical title;
      title2 -- title1 without its trailing article, or title1 unchanged;
      title3 -- the long imdb canonical title, or u'' when the input was
                just a title.
    """
    if fromPtdf or re_year_index.search(title):
        # A (year[/imdbIndex]) indication is present (or implied by the
        # ptdf format): treat the input as a long imdb canonical name.
        parsed = analyze_title(title, canonical=1)
        canonical = parsed['title']
        is_episode = parsed['kind'] == 'episode'
        if is_episode:
            canonical = normalizeTitle(canonical)
        if fromPtdf and not is_episode:
            # The ptdf input already is the long imdb canonical name.
            long_title = title
        else:
            long_title = build_title(parsed, canonical=1, ptdf=1)
    else:
        # Just a title.
        canonical = canonicalTitle(title)
        long_title = u''
    no_article = u''
    if canonical:
        no_article = canonical
        pieces = canonical.split(u', ')
        if pieces[-1].lower() in _articles:
            no_article = u', '.join(pieces[:-1])
    return canonical, no_article, long_title
def isSameTitle(self, other):
    """Tell whether this object and `other` are the same movie.

    Two objects match when they are of a compatible class and either
    their long imdb titles are equal, or they come from the same access
    system and share a movieID that is not None.
    """
    # XXX: obsolete?
    if not isinstance(other, self.__class__):
        return False
    both_titled = 'title' in self.data and 'title' in other.data
    if both_titled:
        mine = build_title(self.data, canonical=False)
        theirs = build_title(other.data, canonical=False)
        if mine == theirs:
            return True
    same_system = self.accessSystem == other.accessSystem
    if same_system and self.movieID is not None:
        return bool(self.movieID == other.movieID)
    return False
def isSameTitle(self, other):
    """Return true if this and the compared object have the same
    long imdb title and/or movieID.

    Returns 1 when `other` is of a compatible class and either the long
    imdb titles are equal, or both objects come from the same access
    system and share a movieID that is not None; returns 0 otherwise.
    """
    # XXX: obsolete?
    if not isinstance(other, self.__class__):
        return 0
    # `in` replaces dict.has_key(), which was removed in Python 3; the two
    # are equivalent on Python 2.
    if 'title' in self.data and 'title' in other.data and \
            build_title(self.data, canonical=0) == \
            build_title(other.data, canonical=0):
        return 1
    if self.accessSystem == other.accessSystem and \
            self.movieID is not None and self.movieID == other.movieID:
        return 1
    return 0
def get_imdbID(self, mop):
    """Return the imdbID for the given Movie, Person, Character or
    Company object.

    If the object was retrieved through a different access system than
    this one, an IMDb instance for that access system performs the
    lookup.  When the object carries its own ID, that ID is translated;
    otherwise the imdbID is looked up from the object's title or name.

    Raises IMDbError if `mop` is none of the supported classes.
    """
    imdbID = None
    if mop.accessSystem == self.accessSystem:
        aSystem = self
    else:
        aSystem = IMDb(mop.accessSystem)
    if isinstance(mop, Movie.Movie):
        if mop.movieID is not None:
            imdbID = aSystem.get_imdbMovieID(mop.movieID)
        else:
            imdbID = aSystem.title2imdbID(
                build_title(mop, canonical=0, ptdf=0, appendKind=False),
                mop['kind'])
    elif isinstance(mop, Person.Person):
        if mop.personID is not None:
            imdbID = aSystem.get_imdbPersonID(mop.personID)
        else:
            imdbID = aSystem.name2imdbID(build_name(mop, canonical=1))
    elif isinstance(mop, Character.Character):
        if mop.characterID is not None:
            imdbID = aSystem.get_imdbCharacterID(mop.characterID)
        else:
            # canonical=0 ?
            imdbID = aSystem.character2imdbID(build_name(mop, canonical=1))
    elif isinstance(mop, Company.Company):
        if mop.companyID is not None:
            imdbID = aSystem.get_imdbCompanyID(mop.companyID)
        else:
            imdbID = aSystem.company2imdbID(build_company_name(mop))
    else:
        # The message lists every accepted class; it previously omitted
        # Company even though Company objects are handled above.
        raise IMDbError('object ' + repr(mop) +
                        ' is not a Movie, Person, Character or Company'
                        ' instance')
    return imdbID
def get_imdbMovieID(self, movieID):
    """Translate a movieID in an imdbID.

    If not in the database, try an Exact Primary Title search on IMDb;
    return None if it's unable to get the imdbID.

    An imdbID stored in the database is returned as a zero-padded,
    7-digit string; otherwise the value returned by title2imdbID() is
    passed through.
    """
    try:
        movie = Title.get(movieID)
    except NotFoundError:
        return None
    imdbID = movie.imdbID
    if imdbID is not None:
        return '%07d' % imdbID
    m_dict = get_movie_data(movie.id, self._kind)
    titline = build_title(m_dict, canonical=1, ptdf=1)
    imdbID = self.title2imdbID(titline)
    # If the imdbID was retrieved from the web and was not in the
    # database, update the database (ignoring errors, because it's
    # possible that the current user has no update privileges).
    if imdbID is not None:
        try:
            movie.imdbID = int(imdbID)
        except Exception:
            # Deliberately best-effort: a failed write must not prevent
            # returning the imdbID.  `Exception` (rather than a bare
            # except) lets KeyboardInterrupt and SystemExit propagate.
            pass
    return imdbID
def get_imdbID(self, mop):
    """Return the imdbID for the given Movie, Person, Character or
    Company object.

    If the object was retrieved through a different access system than
    this one, an IMDb instance for that access system performs the
    lookup.  When the object carries its own ID, that ID is translated;
    otherwise the imdbID is looked up from the object's title or name.

    Raises IMDbError if `mop` is none of the supported classes.
    """
    imdbID = None
    if mop.accessSystem == self.accessSystem:
        aSystem = self
    else:
        aSystem = IMDb(mop.accessSystem)
    if isinstance(mop, Movie.Movie):
        if mop.movieID is not None:
            imdbID = aSystem.get_imdbMovieID(mop.movieID)
        else:
            imdbID = aSystem.title2imdbID(
                build_title(mop, canonical=0, ptdf=0, appendKind=False),
                mop['kind'])
    elif isinstance(mop, Person.Person):
        if mop.personID is not None:
            imdbID = aSystem.get_imdbPersonID(mop.personID)
        else:
            imdbID = aSystem.name2imdbID(build_name(mop, canonical=1))
    elif isinstance(mop, Character.Character):
        if mop.characterID is not None:
            imdbID = aSystem.get_imdbCharacterID(mop.characterID)
        else:
            # canonical=0 ?
            imdbID = aSystem.character2imdbID(build_name(mop, canonical=1))
    elif isinstance(mop, Company.Company):
        if mop.companyID is not None:
            imdbID = aSystem.get_imdbCompanyID(mop.companyID)
        else:
            imdbID = aSystem.company2imdbID(build_company_name(mop))
    else:
        # The message lists every accepted class; it previously omitted
        # Company even though Company objects are handled above.
        raise IMDbError('object ' + repr(mop) +
                        ' is not a Movie, Person, Character or Company'
                        ' instance')
    return imdbID
def _search_movie(self, title, results, _episodes=False): title = title.strip() if not title: return [] # Search for these title variations. if not _episodes: title1, title2, title3 = titleVariations(title, fromPtdf=1) else: title1 = normalizeTitle(title) title2 = '' title3 = '' # XXX: only a guess: results are shrinked, to exclude Adult # titles and to remove duplicated entries. resultsST = results * 3 res = _scan_titles('%stitles.key' % self.__db, title1, title2, title3, resultsST, _episodes) res[:] = [x[1] for x in res] # Check for adult movies. if not self.doAdult: newlist = [] for entry in res: genres = getMovieMisc(movieID=entry[0], dataF='%s%s.data' % (self.__db, 'genres'), indexF='%s%s.index' % (self.__db, 'genres'), attrIF='%sattributes.index' % self.__db, attrKF='%sattributes.key' % self.__db) if 'Adult' not in genres: newlist.append(entry) res[:] = newlist # Get the real name, if this is an AKA. # XXX: duplicated code! new_res = [] seen_MID = [] for idx, (movieID, r) in enumerate(res): # Remove duplicates. # XXX: find a way to prefer titles with an AKA? Or prefer # the original title? if movieID in seen_MID: continue else: seen_MID.append(movieID) realMID = self._get_real_movieID(movieID) if movieID == realMID: new_res.append((movieID, r)) continue if realMID in seen_MID: continue else: seen_MID.append(realMID) aka_title = build_title(r, canonical=0) real_title = getLabel(realMID, '%stitles.index' % self.__db, '%stitles.key' % self.__db) if aka_title == real_title: new_res.append((realMID, r)) continue new_r = analyze_title(real_title, canonical=1) new_r['akas'] = [aka_title] new_res.append((realMID, new_r)) if results > 0: new_res[:] = new_res[:results] return new_res
def _findRefs(self, o, trefs, nrefs):
    """Find titles or names references in strings.

    Walks `o` recursively (strings, lists, tuples and dictionary
    values) and, for every title or name reference matched by
    re_titleRef / re_nameRef that can be resolved to an ID, stores a
    Movie or Person object in `trefs` / `nrefs`, keyed by the rebuilt
    title or name and by its alternative spellings.

    Both dictionaries are updated in place and returned as a
    (trefs, nrefs) tuple.
    """
    if isinstance(o, (unicode, str)):
        for title in re_titleRef.findall(o):
            a_title = analyze_title(title, canonical=0)
            rtitle = build_title(a_title, ptdf=1)
            # `in` replaces dict.has_key(), removed in Python 3.
            if rtitle in trefs:
                continue
            movieID = self._getTitleID(rtitle)
            if movieID is None:
                # Retry with the reference exactly as it was written.
                movieID = self._getTitleID(title)
            if movieID is None:
                continue
            m = Movie(title=rtitle, movieID=movieID,
                      accessSystem=self.accessSystem)
            trefs[rtitle] = m
            rtitle2 = canonicalTitle(a_title.get('title', u''))
            if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                trefs[rtitle2] = m
            if title != rtitle:
                trefs[title] = m
        for name in re_nameRef.findall(o):
            a_name = analyze_name(name, canonical=1)
            rname = build_name(a_name, canonical=1)
            if rname in nrefs:
                continue
            personID = self._getNameID(rname)
            if personID is None:
                # Retry with the reference exactly as it was written.
                personID = self._getNameID(name)
            if personID is None:
                continue
            p = Person(name=rname, personID=personID,
                       accessSystem=self.accessSystem)
            nrefs[rname] = p
            rname2 = normalizeName(a_name.get('name', u''))
            if rname2 and rname2 != rname:
                nrefs[rname2] = p
            if name != rname and name != rname2:
                nrefs[name] = p
    elif isinstance(o, (list, tuple)):
        for item in o:
            self._findRefs(item, trefs, nrefs)
    elif isinstance(o, dict):
        for value in o.values():
            self._findRefs(value, trefs, nrefs)
    return (trefs, nrefs)
def get_imdbID(self, mop):
    """Return the imdbID for the given Movie object.

    The lookup is delegated to this access system when `mop` comes from
    it, otherwise to an IMDb instance for `mop`'s own access system.
    A movie with a movieID is translated directly; one without is
    looked up by its title and kind.

    Raises IMDbError if `mop` is not a Movie instance.
    """
    same_system = mop.accessSystem == self.accessSystem
    aSystem = self if same_system else IMDb(mop.accessSystem)
    if not isinstance(mop, Movie.Movie):
        raise IMDbError('object ' + repr(mop) + ' is not a Movie')
    if mop.movieID is not None:
        return aSystem.get_imdbMovieID(mop.movieID)
    long_title = build_title(mop, canonical=0, ptdf=0, appendKind=False)
    return aSystem.title2imdbID(long_title, mop['kind'])
def _findRefs(self, o, trefs, nrefs):
    """Find titles or names references in strings.

    Walks `o` recursively (strings, lists, tuples and dictionary
    values) and, for every title or name reference matched by
    re_titleRef / re_nameRef that can be resolved to an ID, stores a
    Movie or Person object in `trefs` / `nrefs`, keyed by the rebuilt
    canonical title or name and by its alternative spellings.

    Both dictionaries are updated in place and returned as a
    (trefs, nrefs) tuple.
    """
    if isinstance(o, (UnicodeType, StringType)):
        for title in re_titleRef.findall(o):
            a_title = analyze_title(title, canonical=1)
            rtitle = build_title(a_title, canonical=1, ptdf=1)
            # `in` replaces dict.has_key(), removed in Python 3.
            if rtitle in trefs:
                continue
            movieID = self._getTitleID(rtitle)
            if movieID is None:
                # Retry with the reference exactly as it was written.
                movieID = self._getTitleID(title)
            if movieID is None:
                continue
            m = Movie(title=rtitle, movieID=movieID,
                      accessSystem=self.accessSystem)
            trefs[rtitle] = m
            rtitle2 = canonicalTitle(a_title.get('title', u''))
            if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                trefs[rtitle2] = m
            if title != rtitle:
                trefs[title] = m
        for name in re_nameRef.findall(o):
            a_name = analyze_name(name, canonical=1)
            rname = build_name(a_name, canonical=1)
            if rname in nrefs:
                continue
            personID = self._getNameID(rname)
            if personID is None:
                # Retry with the reference exactly as it was written.
                personID = self._getNameID(name)
            if personID is None:
                continue
            p = Person(name=rname, personID=personID,
                       accessSystem=self.accessSystem)
            nrefs[rname] = p
            rname2 = normalizeName(a_name.get('name', u''))
            if rname2 and rname2 != rname:
                nrefs[rname2] = p
            if name != rname and name != rname2:
                nrefs[name] = p
    elif isinstance(o, (ListType, TupleType)):
        for item in o:
            self._findRefs(item, trefs, nrefs)
    elif isinstance(o, DictType):
        for value in o.values():
            self._findRefs(value, trefs, nrefs)
    return (trefs, nrefs)
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""

    # Parser used when the server answers with a single page instead of
    # a list of results (a "direct hit").
    _BaseParser = DOMBasicMovieParser
    # Lower-cased <title> tag that identifies a list-of-results page.
    _notDirectHitTitle = '<title>find - imdb</title>'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()",
                      'akas': "./i//text()"
                  },
                  # Every result becomes an (imdbID, title dict, akas)
                  # tuple.
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      custom_analyze_title(x.get('info') or u''),
                      x.get('akas')
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[@class='result_text']",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the html to be parsed.

        A list-of-results page is returned (almost) unchanged; a direct
        hit is parsed with _BaseParser and rewritten as a one-row
        results page.  Returns u'' when a direct hit can't be parsed.
        """
        if self._notDirectHitTitle in html_string[:10240].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                # XXX (HTU): does this still apply?
                html_string = html_string.replace('(TV mini-series)',
                                                  '(mini)')
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td class="result_text"><a href="%s">%s</a></td>' % \
            (link, title)
        return new_html

    def postprocess_data(self, data):
        """Truncate the results and fold the AKAs into the title dicts.

        Every (imdbID, title dict, akas) tuple becomes an
        (imdbID, title dict) tuple; for movies, a non-None aka is stored
        (without its surrounding quotes) under the dict's 'akas' key.
        """
        # `in` replaces dict.has_key(), removed in Python 3.
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            # Entries lacking an ID or a title are dropped here, so no
            # further emptiness check is needed inside the loop.
            data['data'] = [x for x in data['data'] if x[0] and x[1]]
            for idx, datum in enumerate(data['data']):
                if not isinstance(datum, tuple):
                    continue
                if datum[2] is not None and self._linkPrefix == '/title/tt':
                    # XXX (HTU): couldn't find a result with multiple akas
                    datum[1]['akas'] = [datum[2][1:-1]]  # remove the quotes
                data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        return data
def _search_movie(self, title, results, _episodes=False):
    """Search the database for titles phonetically matching `title`.

    The title is analyzed, stripped of its trailing article, normalized
    and reduced to a soundex code; that code drives the conditions used
    to select candidate rows from the Title and AkaTitle tables.

    Returns an empty list for an empty title.  Raises
    IMDbDataAccessError if the database queries raise NotFoundError.

    NOTE(review): the portion visible here ends right after the database
    queries; title1/title2/title3 and qr are not used in view, so they
    are presumably consumed by code following this point -- confirm.
    """
    title = title.strip()
    if not title:
        return []
    title_dict = analyze_title(title, canonical=1)
    s_title = title_dict['title']
    if not s_title:
        return []
    episodeOf = title_dict.get('episode of')
    if not episodeOf:
        if not _episodes:
            # Drop a trailing article ("Title, The" -> "Title") before
            # computing the soundex code.
            s_title_split = s_title.split(', ')
            if len(s_title_split) > 1 and \
                    s_title_split[-1].lower() in _articles:
                s_title_rebuilt = ', '.join(s_title_split[:-1])
                if s_title_rebuilt:
                    s_title = s_title_rebuilt
    else:
        # The input names its own series: the episode-specific branch
        # below is used instead of the generic "episodes only" one.
        _episodes = False
    s_title = normalizeTitle(s_title)
    if isinstance(s_title, UnicodeType):
        # Non-ASCII characters are dropped before calling soundex().
        s_title = s_title.encode('ascii', 'ignore')

    soundexCode = soundex(s_title)

    # XXX: improve the search restricting the kindID if the
    #      "kind" of the input differs from "movie"?
    condition = conditionAka = None
    if _episodes:
        condition = AND(Title.q.phoneticCode == soundexCode,
                        Title.q.kindID == self._kindRev['episode'])
        conditionAka = AND(AkaTitle.q.phoneticCode == soundexCode,
                           AkaTitle.q.kindID == self._kindRev['episode'])
    elif title_dict['kind'] == 'episode' and episodeOf is not None:
        # Search the series first, then restrict the episodes to the
        # series that were found.
        series_title = build_title(episodeOf, canonical=1)
        # XXX: is it safe to get "results" results?
        #      Too many?  Too few?
        serRes = results
        if serRes < 3 or serRes > 10:
            serRes = 10
        searchSeries = self._search_movie(series_title, serRes)
        seriesIDs = [result[0] for result in searchSeries]
        if seriesIDs:
            condition = AND(Title.q.phoneticCode == soundexCode,
                            IN(Title.q.episodeOfID, seriesIDs),
                            Title.q.kindID == self._kindRev['episode'])
            conditionAka = AND(
                AkaTitle.q.phoneticCode == soundexCode,
                IN(AkaTitle.q.episodeOfID, seriesIDs),
                AkaTitle.q.kindID == self._kindRev['episode'])
        else:
            # XXX: bad situation: we have found no matching series;
            #      try searching everything (both episodes and
            #      non-episodes) for the title.
            # NOTE(review): seriesIDs is empty in this branch, yet it is
            # still passed to IN(); this looks at odds with the intent
            # stated above -- confirm.
            condition = AND(Title.q.phoneticCode == soundexCode,
                            IN(Title.q.episodeOfID, seriesIDs))
            conditionAka = AND(AkaTitle.q.phoneticCode == soundexCode,
                               IN(AkaTitle.q.episodeOfID, seriesIDs))
    if condition is None:
        # XXX: excludes episodes?
        condition = AND(Title.q.kindID != self._kindRev['episode'],
                        Title.q.phoneticCode == soundexCode)
        conditionAka = AND(AkaTitle.q.kindID != self._kindRev['episode'],
                           AkaTitle.q.phoneticCode == soundexCode)

    # Up to 3 variations of the title are searched, plus the
    # long imdb canonical title, if provided.
    if not _episodes:
        title1, title2, title3 = titleVariations(title)
    else:
        title1 = title
        title2 = ''
        title3 = ''
    try:
        # Candidates from the main titles and from the AKA titles are
        # collected as (movieID, movie data) tuples.
        qr = [(q.id, get_movie_data(q.id, self._kind))
              for q in Title.select(condition)]
        q2 = [(q.movieID, get_movie_data(q.id, self._kind, fromAka=1))
              for q in AkaTitle.select(conditionAka)]
        qr += q2
    except NotFoundError, e:
        raise IMDbDataAccessError, \
            'unable to search the database: "%s"' % str(e)
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""

    # Parser used when the server answers with a single page instead of
    # a list of results (a "direct hit").
    _BaseParser = DOMBasicMovieParser
    # Lower-cased start of the <title> tag of a list-of-results page.
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()",
                      #'akas': ".//div[@class='_imdbpyAKA']//text()"
                      'akas': ".//p[@class='find-aka']//text()"
                  },
                  # Every result becomes an (imdbID, title dict, akas)
                  # tuple.
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      custom_analyze_title(x.get('info') or u''),
                      x.get('akas')
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the html to be parsed.

        A list-of-results page is returned after some movie-only
        substitutions; a direct hit is parsed with _BaseParser and
        rewritten as a one-row results page.  Returns u'' when a direct
        hit can't be parsed.
        """
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)',
                                                  '(mini)')
                # Prefix every AKA with '::' so that postprocess_data
                # can split them apart.
                html_string = html_string.replace('<p class="find-aka">',
                                                  '<p class="find-aka">::')
                #html_string = _reAKAStitles.sub(
                #        r'<div class="_imdbpyAKA">\1::</div>\2', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % \
            (link, title)
        return new_html

    def postprocess_data(self, data):
        """Truncate the results and fold the AKAs into the title dicts.

        Every (imdbID, title dict, akas) tuple becomes an
        (imdbID, title dict) tuple; a non-None akas string is split on
        '::' and stored as a list under the dict's 'akas' key.
        """
        # `in` replaces dict.has_key(), removed in Python 3.
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            for idx, datum in enumerate(data['data']):
                if datum[2] is not None:
                    # A list is built explicitly: on Python 3 filter()
                    # returns an iterator, which must not be stored in
                    # the result.
                    akas = [a for a in datum[2].split('::') if a]
                    if self._linkPrefix == '/title/tt':
                        akas = [a.replace('" - ', '::').rstrip()
                                for a in akas]
                        akas = [a.replace('aka "', '', 1).lstrip()
                                for a in akas]
                    datum[1]['akas'] = akas
                data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        return data
cDb = CompanyName.get(mdata[1]) cDbTxt = cDb.name if cDb.countryCode: cDbTxt += ' %s' % cDb.countryCode company = Company(name=cDbTxt, companyID=mdata[1], notes=mdata[2] or u'', accessSystem=self.accessSystem) res.setdefault(sect, []).append(company) # AKA titles. akat = [(get_movie_data(at.id, self._kind, fromAka=1), at.note) for at in AkaTitle.select(AkaTitle.q.movieID == movieID)] if akat: res['akas'] = [] for td, note in akat: nt = build_title(td, canonical=1, ptdf=1) if note: net = self._changeAKAencoding(note, nt) if net is not None: nt = net nt += '::%s' % note if nt not in res['akas']: res['akas'].append(nt) # Complete cast/crew. compcast = [ (self._compcast[cc.subjectID], self._compcast[cc.statusID]) for cc in CompleteCast.select(CompleteCast.q.movieID == movieID) ] if compcast: for entry in compcast: val = unicode(entry[1]) res[u'complete %s' % entry[0]] = val # Movie connections.
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""

    # Parser used when the server answers with a single page instead of
    # a list of results (a "direct hit").
    _BaseParser = DOMBasicMovieParser
    # Lower-cased start of the <title> tag of a list-of-results page.
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x, canonical=True)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()"
                  },
                  # Every result becomes an (imdbID, title dict) tuple.
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      analyze_title(x.get('info') or u'', canonical=1)
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the html to be parsed.

        A list-of-results page is returned after some movie-only
        substitutions; a direct hit is parsed with _BaseParser and
        rewritten as a one-row results page.  Returns u'' when a direct
        hit can't be parsed.
        """
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)',
                                                  '(mini)')
                html_string = _reAKAS.sub('</td>', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % \
            (link, title)
        return new_html

    def postprocess_data(self, data):
        """Make sure a 'data' list exists and truncate it to at most
        self.results entries, when that attribute is set."""
        # `in` replaces dict.has_key(), removed in Python 3.
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        return data

    def add_refs(self, data):
        return data
def scan_titles(titles_list, title1, title2, title3, results=0,
                searchingEpisode=0, onlyEpisodes=0, ro_thresold=None):
    """Scan a list of titles, searching for best matches against
    the given variations.

    titles_list yields (id, title dict) pairs; title1, title2 and title3
    are the variations to match (title3 may be empty).  An entry is kept
    when its best similarity ratio reaches the threshold (ro_thresold,
    or 0.6 when that is None).

    Returns a list of (ratio, (id, title dict)) tuples sorted from the
    best match to the worst, truncated to `results` entries when
    `results` is greater than zero.
    """
    if ro_thresold is not None:
        RO_THRESHOLD = ro_thresold
    else:
        RO_THRESHOLD = 0.6
    sm1 = SequenceMatcher()
    sm2 = SequenceMatcher()
    sm3 = SequenceMatcher()
    sm1.set_seq1(title1.lower())
    # NOTE(review): sm2 is primed with set_seq2 while sm1 and sm3 use
    # set_seq1; kept as found -- confirm against ratcliff().
    sm2.set_seq2(title2.lower())
    if title3:
        sm3.set_seq1(title3.lower())
        # A long title ending with '}' switches to episode matching.
        if title3[-1] == '}':
            searchingEpisode = 1
    hasArt = 0
    if title2 != title1:
        hasArt = 1
    resd = {}
    for i, t_data in titles_list:
        if onlyEpisodes:
            if t_data.get('kind') != 'episode':
                continue
            til = t_data['title']
            # endswith() is safe on an empty title, unlike til[-1].
            if til.endswith(')'):
                dateIdx = til.rfind('(')
                if dateIdx != -1:
                    til = til[:dateIdx].rstrip()
            if not til:
                continue
            ratio = ratcliff(title1, til, sm1)
            if ratio >= RO_THRESHOLD:
                resd[i] = (ratio, (i, t_data))
            continue
        if searchingEpisode:
            if t_data.get('kind') != 'episode':
                continue
        elif t_data.get('kind') == 'episode':
            continue
        til = t_data['title']
        # XXX: on Symbian, here we get a str; not sure this is the
        #      right place to fix it.
        if isinstance(til, str):
            til = unicode(til, 'latin1', 'ignore')
        # Distance with the canonical title (with or without article).
        #   titleS      -> titleR
        #   titleS, the -> titleR, the
        if not searchingEpisode:
            til = canonicalTitle(til)
            ratios = [ratcliff(title1, til, sm1) + 0.05]
            # til2 is til without the article, if present.
            til2 = til
            tils = til2.split(', ')
            matchHasArt = 0
            if tils[-1].lower() in _unicodeArticles:
                til2 = ', '.join(tils[:-1])
                matchHasArt = 1
            if hasArt and not matchHasArt:
                #   titleS[, the]  -> titleR
                ratios.append(ratcliff(title2, til, sm2))
            elif matchHasArt and not hasArt:
                #   titleS  -> titleR[, the]
                ratios.append(ratcliff(title1, til2, sm1))
        else:
            ratios = [0.0]
        if title3:
            # Distance with the long imdb canonical title.
            ratios.append(
                ratcliff(title3,
                         build_title(t_data, canonical=1, ptdf=1),
                         sm3) + 0.1)
        ratio = max(ratios)
        if ratio >= RO_THRESHOLD:
            # Keep only the best ratio seen for every id.
            if i not in resd or ratio > resd[i][0]:
                resd[i] = (ratio, (i, t_data))
    # sorted() also works where dict.values() is not a list (Python 3).
    res = sorted(resd.values(), reverse=True)
    if results > 0:
        res = res[:results]
    return res
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""

    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'info': ".//text()",
                'akas': "./i//text()"
            },
            # Every result becomes an (imdbID, title dict, akas) tuple.
            postprocess=lambda x: (
                analyze_imdbid(x.get('link') or ''),
                custom_analyze_title(x.get('info') or ''),
                x.get('akas')
            )
        )
    ]

    extractors = [
        Extractor(
            label='search',
            path="//td[@class='result_text']",
            attrs=_attrs
        )
    ]

    def _init(self):
        self.url = ''

    def _reset(self):
        self.url = ''

    def postprocess_data(self, data):
        """Truncate the results and fold the AKAs into the title dicts.

        Every (imdbID, title dict, akas) tuple becomes an
        (imdbID, title dict) tuple; for movies, a non-None aka is stored
        (without its surrounding quotes) under the dict's 'akas' key.
        """
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            # Entries lacking an ID or a title are dropped here, so no
            # further emptiness check is needed inside the loop.
            data['data'] = [x for x in data['data'] if x[0] and x[1]]
            for idx, datum in enumerate(data['data']):
                if not isinstance(datum, tuple):
                    continue
                if datum[2] is not None and self._linkPrefix == '/title/tt':
                    # XXX (HTU): couldn't find a result with multiple akas
                    datum[1]['akas'] = [datum[2][1:-1]]  # remove the quotes
                data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        return data
def scan_titles(titles_list, title1, title2, title3, results=0,
                searchingEpisode=0, onlyEpisodes=0, ro_thresold=None):
    """Scan a list of titles, searching for best matches against
    the given variations.

    titles_list yields (id, title dict) pairs; title1, title2 and title3
    are the variations to match (title3 may be empty).  An entry is kept
    when its best similarity ratio reaches the threshold (ro_thresold,
    or 0.6 when that is None).

    Returns a list of (ratio, (id, title dict)) tuples sorted from the
    best match to the worst, truncated to `results` entries when
    `results` is greater than zero.
    """
    if ro_thresold is not None:
        RO_THRESHOLD = ro_thresold
    else:
        RO_THRESHOLD = 0.6
    sm1 = SequenceMatcher()
    sm2 = SequenceMatcher()
    sm3 = SequenceMatcher()
    sm1.set_seq1(title1.lower())
    # NOTE(review): sm2 is primed with set_seq2 while sm1 and sm3 use
    # set_seq1; kept as found -- confirm against ratcliff().
    sm2.set_seq2(title2.lower())
    if title3:
        sm3.set_seq1(title3.lower())
        # A long title ending with '}' switches to episode matching.
        if title3[-1] == '}':
            searchingEpisode = 1
    hasArt = 0
    if title2 != title1:
        hasArt = 1
    resd = {}
    for i, t_data in titles_list:
        if onlyEpisodes:
            if t_data.get('kind') != 'episode':
                continue
            til = t_data['title']
            # endswith() is safe on an empty title, unlike til[-1].
            if til.endswith(')'):
                dateIdx = til.rfind('(')
                if dateIdx != -1:
                    til = til[:dateIdx].rstrip()
            if not til:
                continue
            ratio = ratcliff(title1, til, sm1)
            if ratio >= RO_THRESHOLD:
                resd[i] = (ratio, (i, t_data))
            continue
        if searchingEpisode:
            if t_data.get('kind') != 'episode':
                continue
        elif t_data.get('kind') == 'episode':
            continue
        til = t_data['title']
        # XXX: on Symbian, here we get a str; not sure this is the
        #      right place to fix it.
        if isinstance(til, str):
            til = unicode(til, 'latin1', 'ignore')
        # Distance with the canonical title (with or without article).
        #   titleS      -> titleR
        #   titleS, the -> titleR, the
        if not searchingEpisode:
            ratios = [ratcliff(title1, til, sm1) + 0.05]
            # til2 is til without the article, if present.
            til2 = til
            tils = til2.split(', ')
            matchHasArt = 0
            if tils[-1].lower() in _articles:
                til2 = ', '.join(tils[:-1])
                matchHasArt = 1
            if hasArt and not matchHasArt:
                #   titleS[, the]  -> titleR
                ratios.append(ratcliff(title2, til, sm2))
            elif matchHasArt and not hasArt:
                #   titleS  -> titleR[, the]
                ratios.append(ratcliff(title1, til2, sm1))
        else:
            ratios = [0.0]
        if title3:
            # Distance with the long imdb canonical title.
            ratios.append(
                ratcliff(title3,
                         build_title(t_data, canonical=1, ptdf=1),
                         sm3) + 0.1)
        ratio = max(ratios)
        if ratio >= RO_THRESHOLD:
            # Keep only the best ratio seen for every id.
            if i not in resd or ratio > resd[i][0]:
                resd[i] = (ratio, (i, t_data))
    # sorted() also works where dict.values() is not a list (Python 3).
    res = sorted(resd.values(), reverse=True)
    if results > 0:
        res = res[:results]
    return res