def _search_character(self, name, results):
    """Search for a character; return a list of (characterID, info) tuples.

    Fix: the original reassigned the ``name`` parameter to the list of
    <title> matches, so every log message below printed the raw parsed
    list (or []) instead of the character name actually searched for.
    """
    cont = subXMLRefs(self._get_search_content("ch", name, results))
    # Page <title>: used to distinguish a direct hit from a result list.
    title = _findBetween(cont, "<title>", "</title>", maxRes=1)
    res = []
    if not title:
        self._mobile_logger.error("no title tag searching character %s", name)
        return res
    nl = title[0].lower()
    if not nl.startswith("find - imdb"):
        # a direct hit!
        hit_name = _unHtml(title[0]).replace("(Character)", "").strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], "/character/ch", "/", maxRes=1)
        if not (pid and hit_name):
            self._mobile_logger.error("no direct hit name/characterID for"
                                      " character %s", name)
            return res
        res[:] = [(str(pid[0]), analyze_name(hit_name))]
    else:
        # Result list: one entry per "result_text" cell.
        lis = _findBetween(cont, '<td class="result_text"',
                           ["<small", "</td>", "<br"])
        for li in lis:
            li = "<%s" % li
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                self._mobile_logger.debug("no name/characterID"
                                          " parsing %s searching for"
                                          " character %s", li, name)
                continue
            res.append((str(pid[0]), analyze_name(pname)))
    return res
def _search_character(self, name, results):
    """Search for characters matching ``name``; return a list of
    (characterID, info) tuples.

    Fix: the title test contained the same ``nl.startswith('imdb
    search')`` clause twice; the duplicate was removed (behavior is
    unchanged).
    """
    cont = subXMLRefs(self._get_search_content('char', name, results))
    name = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not name:
        return res
    nl = name[0].lower()
    if not (nl.startswith('imdb search') or nl.startswith('imdb character')):
        # XXX: a direct hit!
        name = _unHtml(name[0]).replace('(Character)', '').strip()
        pidtag = _getTagsWith(cont, '/character/ch', maxRes=1)
        pid = None
        if pidtag:
            pid = re_imdbID.findall(pidtag[0])
        if not (pid and name):
            return res
        res[:] = [(str(pid[0]), analyze_name(name, canonical=0))]
    else:
        # Gather the "Popular Characters" and "Characters" sections.
        sects = _findBetween(cont, '<b>Popular Characters</b>', '</table>')
        sects += _findBetween(cont, '<b>Characters', '</table>')
        for sect in sects:
            lis = _findBetween(sect, '<a href="/character/',
                               ['<small', '</td>', '<br'])
            for li in lis:
                li = '<%s' % li
                pid = re_imdbID.findall(li)
                pname = _unHtml(li)
                if not (pid and pname):
                    continue
                res.append((str(pid[0]), analyze_name(pname, canonical=0)))
    return res
def _search_character(self, name, results):
    """Search for a character; return a list of (characterID, info) tuples.

    Fix: the ``name`` parameter was previously clobbered by the parsed
    <title> list, so the log messages printed that list instead of the
    character name being searched.
    """
    cont = subXMLRefs(self._get_search_content('ch', name, results))
    # Page <title>: used to distinguish a direct hit from a result list.
    title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not title:
        self._mobile_logger.error('no title tag searching character %s', name)
        return res
    nl = title[0].lower()
    if not nl.startswith('find - imdb'):
        # a direct hit!
        hit_name = _unHtml(title[0]).replace('(Character)', '').strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], '/character/ch', '/', maxRes=1)
        if not (pid and hit_name):
            self._mobile_logger.error('no direct hit name/characterID for'
                                      ' character %s', name)
            return res
        res[:] = [(str(pid[0]), analyze_name(hit_name))]
    else:
        lis = _findBetween(cont, '<td class="result_text"',
                           ['<small', '</td>', '<br'])
        for li in lis:
            li = '<%s' % li
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                self._mobile_logger.debug('no name/characterID'
                                          ' parsing %s searching for'
                                          ' character %s', li, name)
                continue
            res.append((str(pid[0]), analyze_name(pname)))
    return res
def _search_person(self, name, results):
    """Search for people; return a list of (personID, info) tuples.

    Fixes: the ``name`` parameter was previously reassigned to the
    parsed <title> list, so every log message printed that list
    instead of the searched name; dead commented-out code removed.
    """
    cont = subXMLRefs(self._get_search_content('nm', name, results))
    title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not title:
        self._mobile_logger.warn('no title tag searching for name %s', name)
        return res
    nl = title[0].lower()
    if not nl.startswith('imdb name'):
        # a direct hit!
        hit_name = _unHtml(title[0])
        hit_name = hit_name.replace('- Filmography by type', '').strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
        if not (pid and hit_name):
            self._mobile_logger.error('no direct hit name/personID for'
                                      ' name %s', name)
            return res
        res[:] = [(str(pid[0]), analyze_name(hit_name, canonical=1))]
    else:
        lis = _findBetween(cont, 'td valign="top">', '</td>',
                           maxRes=results * 3)
        for li in lis:
            akas = _findBetween(li, '<em>"', '"</em>')
            # Truncate the cell at the first metadata separator.
            for sep in ['<small', '<br> aka', '<br> birth name']:
                sepIdx = li.find(sep)
                if sepIdx != -1:
                    li = li[:sepIdx]
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                self._mobile_logger.debug('no name/personID parsing'
                                          ' %s searching for name %s',
                                          li, name)
                continue
            resd = analyze_name(pname, canonical=1)
            if akas:
                resd['akas'] = akas
            res.append((str(pid[0]), resd))
    return res
def get_person_main(self, personID):
    """Return the main data (name, biography, akas) for the given
    personID, read from the local plain-text data files."""
    infosets = ('main', 'biography', 'other works')
    nl = getLabel(personID, '%snames.index' % self.__db,
                  '%snames.key' % self.__db)
    # No name, no party.
    if nl is None:
        raise IMDbDataAccessError, 'unable to get personID "%s"' % personID
    res = analyze_name(nl)
    res.update(getBio(personID, '%sbiographies.index' % self.__db,
                      '%sbiographies.data' % self.__db))
    akas = getAkaNames(personID,
                       '%saka-names.data' % self.__db,
                       '%snames.index' % self.__db,
                       '%snames.key' % self.__db)
    if akas:
        res['akas'] = akas
    # XXX: horrible hack!  The getBio() function is not able to
    # retrieve the movieID!
    # A cleaner solution, would be to NOT return Movies object
    # at first, from the getBio() function.
    # XXX: anyway, this is no more needed, since "guest appearances"
    # are gone with the new tv series episodes support.
    if res.has_key('notable tv guest appearances'):
        # Re-resolve every guest-appearance Movie to its movieID,
        # dropping entries whose title cannot be found.
        nl = []
        for m in res['notable tv guest appearances']:
            movieID = self._getTitleID(m.get('long imdb canonical title'))
            if movieID is None:
                continue
            m.movieID = movieID
            nl.append(m)
        if nl:
            nl.sort()
            res['notable tv guest appearances'][:] = nl
        else:
            del res['notable tv guest appearances']
    trefs, nrefs = self._extractRefs(res)
    return {'data': res, 'info sets': infosets,
            'titlesRefs': trefs, 'namesRefs': nrefs}
def set_name(self, name):
    """Set the name of the character."""
    try:
        parsed = analyze_name(name, canonical=False)
    except IMDbParserError:
        # Unparsable names are silently ignored.
        return
    self.data.update(parsed)
class DOMBasicPersonParser(DOMBasicMovieParser):
    """Simply get the name of a person and the imdbID.

    It's used by the DOMHTMLSearchPersonParser class to return a result
    for a direct match (when a search on IMDb results in a single
    person, the web server sends directly the movie page."""
    # Clean the raw title text, then parse it into canonical name parts.
    _titleFunct = lambda self, x: analyze_name(_cleanName(x), canonical=1)
class DOMBasicCharacterParser(DOMBasicMovieParser):
    """Simply get the name of a character and the imdbID.

    It's used by the DOMHTMLSearchCharacterParser class to return a result
    for a direct match (when a search on IMDb results in a single
    character, the web server sends directly the movie page."""
    # Characters keep the plain (non-canonical) name form.
    _titleFunct = lambda self, x: analyze_name(x or u'', canonical=False)
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons."""
    _BaseParser = DOMBasicPersonParser
    # Lower-cased marker telling a result-list page from a direct hit.
    _notDirectHitTitle = '<title>imdb name'
    _titleBuilder = lambda self, x: build_name(x, canonical=True)
    _linkPrefix = '/name/nm'
    # One result row: link, display name, index text and any akas.
    _attrs = [Attribute(key='data', multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'name': "./a[1]/text()",
                            'index': "./text()[1]",
                            'akas': ".//div[@class='_imdbpyAKA']/text()"
                        },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            analyze_name((x.get('name') or u'') +
                                         (x.get('index') or u''),
                                         canonical=1),
                            x.get('akas')
                        ))]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
                  attrs=_attrs)
    ]

    def preprocess_string(self, html_string):
        # On result-list pages, wrap aka spans in a marker <div> so the
        # extractor above can pick them up.
        if self._notDirectHitTitle in html_string[:1024].lower():
            html_string = _reAKASp.sub(
                r'\1<div class="_imdbpyAKA">\2::</div>\3',
                html_string)
        return DOMHTMLSearchMovieParser.preprocess_string(self, html_string)
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons."""
    _titleBuilder = lambda self, x: build_name(x, canonical=True)
    _linkPrefix = '/name/nm'
    # One result row: link, display name, index text and any akas.
    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'name': "./a[1]/text()",
                'index': "./text()[1]",
                'akas': ".//div[@class='_imdbpyAKA']/text()"
            },
            postprocess=lambda x: (
                analyze_imdbid(x.get('link') or ''),
                analyze_name((x.get('name') or '') + (x.get('index') or ''),
                             canonical=1),
                x.get('akas')
            )
        )
    ]
    extractors = [
        Extractor(
            label='search',
            path="//td[@class='result_text']/a[starts-with(@href, '/name/nm')]/..",
            attrs=_attrs
        )
    ]
def _search_person(self, name, results):
    """Search for people; return a list of (personID, info) tuples.

    Fixes: the ``name`` parameter was previously clobbered with the
    parsed <title> list, so every log message printed that list
    instead of the searched name; dead commented-out code removed.
    """
    cont = subXMLRefs(self._get_search_content('nm', name, results))
    title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not title:
        self._mobile_logger.warn('no title tag searching for name %s', name)
        return res
    nl = title[0].lower()
    if not nl.startswith('find - imdb'):
        # a direct hit!
        hit_name = _unHtml(title[0])
        hit_name = hit_name.replace('- Filmography by type', '').strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
        if not (pid and hit_name):
            self._mobile_logger.error('no direct hit name/personID for'
                                      ' name %s', name)
            return res
        res[:] = [(str(pid[0]), analyze_name(hit_name, canonical=1))]
    else:
        lis = _findBetween(cont, 'td class="result_text">', '</td>',
                           maxRes=results * 3)
        for li in lis:
            akas = _findBetween(li, '<em>"', '"</em>')
            # Truncate the cell at the first metadata separator.
            for sep in ['<small', '<br> aka', '<br> birth name']:
                sepIdx = li.find(sep)
                if sepIdx != -1:
                    li = li[:sepIdx]
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                self._mobile_logger.debug('no name/personID parsing'
                                          ' %s searching for name %s',
                                          li, name)
                continue
            resd = analyze_name(pname, canonical=1)
            if akas:
                resd['akas'] = akas
            res.append((str(pid[0]), resd))
    return res
class DOMBasicPersonParser(DOMBasicMovieParser):
    """Simply get the name of a person and the imdbID.

    It's used by the DOMHTMLSearchPersonParser class to return a result
    for a direct match (when a search on IMDb results in a single
    person, the web server sends directly the movie page."""
    # XPath fragments the base class uses to locate the name and link.
    _titleAttrPath = ".//text()"
    _linkPath = "//a[starts-with(@href, '/name/nm')]"
    # Parse the collected text into canonical name components.
    _titleFunct = lambda self, x: analyze_name(x or u'', canonical=1)
def end_title(self):
    """Close the <title> element and store its parsed content."""
    self._in_title = False
    stripped = self._title.strip()
    self._title = stripped
    if not stripped:
        return
    if self.kind == 'character':
        # Characters keep the plain name, minus the "(Character)" marker.
        cleaned = stripped.replace('(Character)', '').strip()
        self._title = cleaned
        self._data['name'] = cleaned
    else:
        # Everything else goes through the canonical name parser.
        self._data.update(analyze_name(stripped, canonical=1))
def set_name(self, name):
    """Set the name of the character."""
    # XXX: convert name to unicode, if it's a plain string?
    try:
        d = analyze_name(name, canonical=0)
        self.data.update(d)
    except Exception:
        # Was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        # TODO: catch only the parser error raised by analyze_name
        # and issue a warning.
        pass
def do_input(self, attrs):
    """Process an <input> tag; grab the value of the hidden
    "primary" field, which holds the person's name."""
    itype = self.get_attr_value(attrs, 'type')
    if not (itype and itype.lower() == 'hidden'):
        return
    if self.get_attr_value(attrs, 'name') != 'primary':
        return
    ivalue = self.get_attr_value(attrs, 'value')
    if ivalue is None:
        return
    # It's hard to catch the correct 'Surname, Name' from the
    # title, so if the "credited alongside another name" form
    # is found, use it.
    self._data.update(analyze_name(ivalue, canonical=0))
def _readNamesKeyFile(keyFile): """Iterate over the given file, returning tuples suited for the common.locsql.scan_names function.""" try: kf = open(keyFile, 'r') except IOError, e: raise IMDbDataAccessError, str(e) for line in kf: ls = line.split('|') if not ls[0]: continue named = analyze_name(latin2utf(ls[0])) yield (long(ls[1], 16), named) kf.close()
def start_a(self, attrs):
    """Handle an <a> tag: when the character biography link is seen,
    build the final (characterID, info) result."""
    href = self.get_attr_value(attrs, 'href')
    if not href:
        return
    href = href.lower()
    # Only the ".../character/chNNNNNNN/bio" link carries the ID.
    if '/character/ch' not in href or not href.endswith('bio'):
        return
    rpid = self.re_imdbID.findall(href)
    if not (rpid and self._name):
        return
    cname = self._name.replace('(Character)', '').strip()
    result = [(str(rpid[-1]), analyze_name(cname, canonical=0))]
    self.reset()
    self._result = result
def _search_person(self, name, results):
    """Search for people matching ``name``; return a list of
    (personID, name-info) tuples."""
    ##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name})
    ##params = 'q=%s&nm=on&mx=%s' % (urllib.quote_plus(name), str(results))
    ##cont = self._mretrieve(imdbURL_search % params)
    cont = subXMLRefs(self._get_search_content('nm', name, results))
    name = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not name:
        return res
    nl = name[0].lower()
    if not nl.startswith('imdb name'):
        # XXX: a direct hit!
        name = _unHtml(name[0])
        # Easiest way: the board link (for person who already have
        # messages in the board).
        pidtag = _getTagsWith(cont, '/board/nest/', maxRes=1)
        pid = None
        if pidtag:
            pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
        if not (pid and name):
            # Otherwise, the 'credited alongside' for the name,
            # and the biography link for the personID.
            nametag = _getTagsWith(cont, 'NAME="primary"', maxRes=1)
            if not nametag:
                return res
            nametag = _findBetween(nametag[0], 'VALUE="', '"', maxRes=1)
            if not nametag:
                return res
            name = unquote(nametag[0])
            pid = _findBetween(cont, '/name/nm', ('/', '"', '>'), maxRes=1)
            if not pid:
                return res
        if not (pid and name):
            return res
        res[:] = [(str(pid[0]), analyze_name(name, canonical=1))]
    else:
        # Result-list page: one entry per table cell.
        lis = _findBetween(cont, 'td valign="top">',
                           ['<small', '</td>', '<br> aka'])
        for li in lis:
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                continue
            res.append((str(pid[0]), analyze_name(pname, canonical=1)))
    return res
def _search_person(self, name, results):
    """Search for people matching ``name``; return a list of
    (personID, name-info) tuples."""
    ##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name})
    ##params = 'q=%s&nm=on&mx=%s' % (urllib.quote_plus(name), str(results))
    ##cont = self._mretrieve(imdbURL_search % params)
    cont = subXMLRefs(self._get_search_content('nm', name, results))
    name = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not name:
        return res
    nl = name[0].lower()
    if not nl.startswith('imdb name'):
        # XXX: a direct hit!
        name = _unHtml(name[0])
        # Easiest way: the board link (for person who already have
        # messages in the board).
        pidtag = _getTagsWith(cont, '/board/nest/', maxRes=1)
        pid = None
        if pidtag:
            pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
        if not (pid and name):
            # Otherwise, the 'credited alongside' for the name,
            # and the biography link for the personID.
            nametag = _getTagsWith(cont, 'NAME="primary"', maxRes=1)
            if not nametag:
                return res
            nametag = _findBetween(nametag[0], 'VALUE="', '"', maxRes=1)
            if not nametag:
                return res
            name = unquote(nametag[0])
            pid = _findBetween(cont, '/name/nm', '/bio', maxRes=1)
            if not pid:
                return res
        if not (pid and name):
            return res
        res[:] = [(str(pid[0]), analyze_name(name, canonical=1))]
    else:
        # Result-list page: one entry per table cell.
        lis = _findBetween(cont, 'td valign="top">',
                           ['<small', '</td>', '<br'])
        for li in lis:
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                continue
            res.append((str(pid[0]), analyze_name(pname, canonical=1)))
    return res
def _search_character(self, name, results):
    """Scan the local characters file for names matching ``name``."""
    query = name.strip()
    if not query:
        return []
    norm = normalizeName(analyze_name(query)['name'])
    parts = norm.split()
    # Also try the "Surname Rest-of-name" rotation when there is more
    # than one word, unless it is identical to the normalized name.
    rotated = u''
    if len(parts) > 1:
        rotated = '%s %s' % (parts[-1], ' '.join(parts[:-1]))
        if norm == rotated:
            rotated = u''
    matches = _scan_names('%scharacters.key' % self.__db, norm, rotated,
                          u'', results, _scan_character=1)
    matches[:] = [item[1] for item in matches]
    return matches
def _getNameID(self, name):
    """Given a long imdb canonical name, returns a personID or
    None if not found.

    Fix: the result count was previously queried twice (one call was
    bound to an unused local); the COUNT query is now issued once."""
    nd = analyze_name(name)
    res = Name.select(
        AND(Name.q.name == self.toUTF8(nd['name']),
            self._buildNULLCondition(Name.q.imdbIndex,
                                     nd.get('imdbIndex'))))
    try:
        # Only accept an unambiguous, single match.
        if res.count() != 1:
            return None
    except (UnicodeDecodeError, TypeError):
        return None
    return res[0].id
def _findRefs(self, o, trefs, nrefs):
    """Find titles or names references in strings.

    Recursively walks strings, lists/tuples and dicts, filling the
    ``trefs`` (title references) and ``nrefs`` (name references)
    dictionaries in place; returns (trefs, nrefs)."""
    if isinstance(o, (unicode, str)):
        for title in re_titleRef.findall(o):
            a_title = analyze_title(title, canonical=0)
            rtitle = build_title(a_title, ptdf=1)
            if trefs.has_key(rtitle):
                continue
            # Try the rebuilt title first, then the raw match.
            movieID = self._getTitleID(rtitle)
            if movieID is None:
                movieID = self._getTitleID(title)
            if movieID is None:
                continue
            m = Movie(title=rtitle, movieID=movieID,
                      accessSystem=self.accessSystem)
            trefs[rtitle] = m
            # Also index the canonical variant and the raw match.
            rtitle2 = canonicalTitle(a_title.get('title', u''))
            if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                trefs[rtitle2] = m
            if title != rtitle:
                trefs[title] = m
        for name in re_nameRef.findall(o):
            a_name = analyze_name(name, canonical=1)
            rname = build_name(a_name, canonical=1)
            if nrefs.has_key(rname):
                continue
            # Try the rebuilt name first, then the raw match.
            personID = self._getNameID(rname)
            if personID is None:
                personID = self._getNameID(name)
            if personID is None:
                continue
            p = Person(name=rname, personID=personID,
                       accessSystem=self.accessSystem)
            nrefs[rname] = p
            # Also index the normalized variant and the raw match.
            rname2 = normalizeName(a_name.get('name', u''))
            if rname2 and rname2 != rname:
                nrefs[rname2] = p
            if name != rname and name != rname2:
                nrefs[name] = p
    elif isinstance(o, (list, tuple)):
        for item in o:
            self._findRefs(item, trefs, nrefs)
    elif isinstance(o, dict):
        for value in o.values():
            self._findRefs(value, trefs, nrefs)
    return (trefs, nrefs)
def start_a(self, attrs):
    """Handle an <a> tag; extract the personID from the message
    board link of a direct-hit page."""
    href = self.get_attr_value(attrs, 'href')
    if not href:
        return
    href = href.lower()
    # XXX: Since July 2004, IMDb has removed the "pageflicker",
    # so we've to gather the imdbID from the "IMDb message board"
    # link.
    if not (href.startswith('/name/nm') and '/board' in href):
        return
    rpid = self.re_imdbID.findall(href)
    if not (rpid and self._name):
        return
    n = self._name.strip()
    # Skip the search-results page title.
    if 'IMDb Name' in n and 'Search' in n:
        return
    result = [(str(rpid[-1]), analyze_name(n, canonical=1))]
    self.reset()
    self._result = result
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """A parser for the name search page."""
    rules = [
        Rule(
            key='data',
            extractor=Rules(
                foreach='//td[@class="result_text"]',
                rules=[
                    Rule(
                        key='link',
                        extractor=Path('./a/@href', reduce=reducers.first)
                    ),
                    Rule(
                        key='name',
                        extractor=Path('./a/text()')
                    ),
                    Rule(
                        key='index',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='akas',
                        extractor=Path(foreach='./i', path='./text()')
                    ),
                    Rule(
                        key='headshot',
                        extractor=Path('../td[@class="primary_photo"]/a/img/@src')
                    )
                ],
                # Per row: (imdbID, parsed name info, akas, headshot URL).
                transform=lambda x: (
                    analyze_imdbid(x.get('link')),
                    analyze_name(x.get('name', '') + x.get('index', ''),
                                 canonical=1),
                    x.get('akas'),
                    x.get('headshot')
                )
            )
        )
    ]

    def _init(self):
        # Tell the base class which image key the headshot goes under.
        super(DOMHTMLSearchPersonParser, self)._init()
        self.img_type = 'headshot'
def _findRefs(self, o, trefs, nrefs):
    """Find titles or names references in strings.

    Recursively walks strings, lists/tuples and dicts, filling the
    ``trefs`` (title references) and ``nrefs`` (name references)
    dictionaries in place; returns (trefs, nrefs)."""
    if isinstance(o, (UnicodeType, StringType)):
        for title in re_titleRef.findall(o):
            a_title = analyze_title(title, canonical=1)
            rtitle = build_title(a_title, canonical=1, ptdf=1)
            if trefs.has_key(rtitle):
                continue
            # Try the rebuilt title first, then the raw match.
            movieID = self._getTitleID(rtitle)
            if movieID is None:
                movieID = self._getTitleID(title)
            if movieID is None:
                continue
            m = Movie(title=rtitle, movieID=movieID,
                      accessSystem=self.accessSystem)
            trefs[rtitle] = m
            # Also index the canonical variant and the raw match.
            rtitle2 = canonicalTitle(a_title.get('title', u''))
            if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                trefs[rtitle2] = m
            if title != rtitle:
                trefs[title] = m
        for name in re_nameRef.findall(o):
            a_name = analyze_name(name, canonical=1)
            rname = build_name(a_name, canonical=1)
            if nrefs.has_key(rname):
                continue
            # Try the rebuilt name first, then the raw match.
            personID = self._getNameID(rname)
            if personID is None:
                personID = self._getNameID(name)
            if personID is None:
                continue
            p = Person(name=rname, personID=personID,
                       accessSystem=self.accessSystem)
            nrefs[rname] = p
            # Also index the normalized variant and the raw match.
            rname2 = normalizeName(a_name.get('name', u''))
            if rname2 and rname2 != rname:
                nrefs[rname2] = p
            if name != rname and name != rname2:
                nrefs[name] = p
    elif isinstance(o, (ListType, TupleType)):
        for item in o:
            self._findRefs(item, trefs, nrefs)
    elif isinstance(o, DictType):
        for value in o.values():
            self._findRefs(value, trefs, nrefs)
    return (trefs, nrefs)
def _search_character(self, name, results): name = name.strip() if not name: return [] s_name = analyze_name(name)['name'] if not s_name: return [] if isinstance(s_name, UnicodeType): s_name = s_name.encode('ascii', 'ignore') s_name = normalizeName(s_name) soundexCode = soundex(s_name) surname = s_name.split(' ')[-1] surnameSoundex = soundex(surname) name2 = '' soundexName2 = None nsplit = s_name.split() if len(nsplit) > 1: name2 = '%s %s' % (nsplit[-1], ' '.join(nsplit[:-1])) if s_name == name2: name2 = '' else: soundexName2 = soundex(name2) # If the soundex is None, compare only with the first # phoneticCode column. if soundexCode is not None: if soundexName2 is not None: condition = OR( surnameSoundex == CharName.q.surnamePcode, IN(CharName.q.namePcodeNf, [soundexCode, soundexName2]), IN(CharName.q.surnamePcode, [soundexCode, soundexName2])) else: condition = OR( surnameSoundex == CharName.q.surnamePcode, IN(soundexCode, [CharName.q.namePcodeNf, CharName.q.surnamePcode])) else: condition = ISNULL(Name.q.namePcodeNf) try: qr = [(q.id, { 'name': q.name, 'imdbIndex': q.imdbIndex }) for q in CharName.select(condition)] except NotFoundError, e: raise IMDbDataAccessError, \ 'unable to search the database: "%s"' % str(e)
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """A parser for the name search page."""
    rules = [
        Rule(key='data',
             extractor=Rules(
                 foreach='//td[@class="result_text"]',
                 rules=[
                     Rule(key='link',
                          extractor=Path('./a/@href', reduce=reducers.first)),
                     Rule(key='name',
                          extractor=Path('./a/text()')),
                     Rule(key='index',
                          extractor=Path('./text()')),
                     Rule(key='akas',
                          extractor=Path(foreach='./i', path='./text()'))
                 ],
                 # Per row: (imdbID, parsed name info, akas).
                 transform=lambda x: (analyze_imdbid(x.get('link')),
                                      analyze_name(x.get('name', '') +
                                                   x.get('index', ''),
                                                   canonical=1),
                                      x.get('akas'))))
    ]
def get_character_main(self, characterID, results=1000):
    """Return name and filmography for the given characterID, read
    from the local plain-text data files."""
    infosets = self.get_character_infoset()
    name = getCharacterName(characterID,
                            '%scharacters.index' % self.__db,
                            '%scharacters.data' % self.__db)
    if not name:
        raise IMDbDataAccessError, \
            'unable to get characterID "%s"' % characterID
    res = analyze_name(name, canonical=1)
    filmography = getCharacterFilmography(characterID,
                                          '%scharacters.index' % self.__db,
                                          '%scharacters.data' % self.__db,
                                          '%stitles.index' % self.__db,
                                          '%stitles.key' % self.__db,
                                          '%snames.index' % self.__db,
                                          '%snames.key' % self.__db,
                                          limit=results)
    if filmography:
        # Merge multiple appearances of the same person, then sort.
        filmography = merge_roles(filmography)
        filmography.sort()
        res['filmography'] = filmography
    return {'data': res, 'info sets': infosets}
def nameVariations(name, fromPtdf=0):
    """Build name variations useful for searches; if fromPtdf is true,
    the input is assumed to be in the plain text data files format.

    Returns (canonical, normalized-if-different, canonical-with-index).
    """
    name1 = name2 = name3 = u''
    has_index = fromPtdf or re_nameIndex.search(name)
    if has_index:
        # The name carries an (imdbIndex).
        namedict = analyze_name(name, canonical=1)
        name1 = namedict['name']
        if fromPtdf:
            # Keep the original string only when it really has an index.
            if namedict.has_key('imdbIndex'):
                name3 = name
        else:
            name3 = build_name(namedict, canonical=1)
    else:
        name1 = canonicalName(name)
        name3 = u''
    # The normal-order form, kept only when it differs from name1.
    name2 = normalizeName(name1)
    if name1 == name2:
        name2 = u''
    return name1, name2, name3
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons."""
    _linkPrefix = '/name/nm'
    rules = [
        Rule(
            key='data',
            extractor=Rules(
                foreach=
                '//td[@class="result_text"]/a[starts-with(@href, "/name/nm")]/..',
                rules=[
                    Rule(key='link', extractor=Path('./a[1]/@href')),
                    Rule(key='name', extractor=Path('./a[1]/text()')),
                    Rule(key='index', extractor=Path('./text()[1]')),
                    Rule(key='akas',
                         extractor=Path('.//div[@class="_imdbpyAKA"]/text()'))
                ],
                # Per row: (imdbID, parsed name info, akas).
                transform=lambda x: (analyze_imdbid(x.get('link') or ''),
                                     analyze_name((x.get('name') or '') +
                                                  (x.get('index') or ''),
                                                  canonical=1),
                                     x.get('akas'))))
    ]
def _search_person(self, name, results):
    """Search the SQL database for people matching ``name`` using
    soundex phonetic codes (both the main and the aka-names table)."""
    name = name.strip()
    if not name:
        return []
    s_name = analyze_name(name)['name']
    if not s_name:
        return []
    if isinstance(s_name, UnicodeType):
        s_name = s_name.encode('ascii', 'ignore')
    soundexCode = soundex(s_name)
    name1, name2, name3 = nameVariations(name)
    # If the soundex is None, compare only with the first
    # phoneticCode column.
    if soundexCode is not None:
        condition = IN(soundexCode, [Name.q.namePcodeCf,
                                     Name.q.namePcodeNf,
                                     Name.q.surnamePcode])
        conditionAka = IN(soundexCode, [AkaName.q.namePcodeCf,
                                        AkaName.q.namePcodeNf,
                                        AkaName.q.surnamePcode])
    else:
        condition = ISNULL(Name.q.namePcodeCf)
        conditionAka = ISNULL(AkaName.q.namePcodeCf)
    try:
        # Candidates from the main names table, then from the akas.
        qr = [(q.id, {'name': q.name, 'imdbIndex': q.imdbIndex})
              for q in Name.select(condition)]
        qr += [(q.personID, {'name': q.name, 'imdbIndex': q.imdbIndex})
               for q in AkaName.select(conditionAka)]
    except NotFoundError, e:
        raise IMDbDataAccessError, \
            'unable to search the database: "%s"' % str(e)
def _search_person(self, name, results):
    """Scan the local names file for people matching ``name``,
    resolving aka entries to the real personID and removing
    duplicates."""
    name = name.strip()
    if not name:
        return []
    name1, name2, name3 = nameVariations(name)
    res = _scan_names('%snames.key' % self.__db, name1, name2, name3,
                      results)
    res[:] = [x[1] for x in res]
    new_res = []
    seen_PID = []
    for idx, (personID, r) in enumerate(res):
        # Remove duplicates.
        # XXX: find a way to prefer names with an AKA?  Or prefer
        # the original name?
        if personID in seen_PID:
            continue
        else:
            seen_PID.append(personID)
        realPID = self._get_real_personID(personID)
        if personID == realPID:
            # Not an aka entry: keep it as-is.
            new_res.append((personID, r))
            continue
        if realPID in seen_PID:
            continue
        else:
            seen_PID.append(realPID)
        # Aka entry: attach the aka spelling to the real person.
        aka_name = build_name(r, canonical=1)
        real_name = getLabel(realPID, '%snames.index' % self.__db,
                             '%snames.key' % self.__db)
        if aka_name == real_name:
            new_res.append((realPID, r))
            continue
        new_r = analyze_name(real_name, canonical=1)
        new_r['akas'] = [aka_name]
        new_res.append((realPID, new_r))
    if results > 0:
        new_res[:] = new_res[:results]
    return new_res
def _parseBiography(biol):
    """Parse the biographies.data file.

    ``biol`` is the list of raw lines for one person; returns a dict
    keyed by info-set name ('mini biography', 'birth date', ...)."""
    res = {}
    # NOTE(review): this first assignment is immediately overwritten by
    # _parseBioBy() below and looks like a dead store — confirm that
    # _parseList() has no needed side effects before removing it.
    bio = ' '.join(_parseList(biol, 'BG', mline=0))
    bio = _parseBioBy(biol)
    if bio:
        res['mini biography'] = bio
    # Single-line tagged entries ("XX: value" / "XX: * value").
    for x in biol:
        x4 = x[:4]
        x6 = x[:6]
        if x4 == 'DB: ':
            # Birth date (and optional notes).
            date, notes = date_and_notes(x[4:])
            if date:
                res['birth date'] = date
            if notes:
                res['birth notes'] = notes
        elif x4 == 'DD: ':
            # Death date (and optional notes).
            date, notes = date_and_notes(x[4:])
            if date:
                res['death date'] = date
            if notes:
                res['death notes'] = notes
        elif x6 == 'SP: * ':
            res.setdefault('spouse', []).append(x[6:].strip())
        elif x4 == 'RN: ':
            # Birth name, stored in canonical form.
            n = x[4:].strip()
            if not n:
                continue
            rn = build_name(analyze_name(n, canonical=1), canonical=1)
            res['birth name'] = rn
        elif x6 == 'AT: * ':
            res.setdefault('articles', []).append(x[6:].strip())
        elif x4 == 'HT: ':
            res['height'] = x[4:].strip()
        elif x6 == 'PT: * ':
            res.setdefault('pictorials', []).append(x[6:].strip())
        elif x6 == 'CV: * ':
            res.setdefault('magazine covers', []).append(x[6:].strip())
        elif x4 == 'NK: ':
            res.setdefault('nick names', []).append(normalizeName(x[4:]))
        elif x6 == 'PI: * ':
            res.setdefault('portrayed', []).append(x[6:].strip())
        elif x6 == 'SA: * ':
            sal = x[6:].strip().replace(' -> ', '::')
            res.setdefault('salary history', []).append(sal)
    # Multi-line tagged sections.
    trl = _parseList(biol, 'TR')
    if trl:
        res['trivia'] = trl
    quotes = _parseList(biol, 'QU')
    if quotes:
        res['quotes'] = quotes
    otherworks = _parseList(biol, 'OW')
    if otherworks:
        res['other works'] = otherworks
    books = _parseList(biol, 'BO')
    if books:
        res['books'] = books
    agent = _parseList(biol, 'AG')
    if agent:
        res['agent address'] = agent
    wherenow = _parseList(biol, 'WN')
    if wherenow:
        res['where now'] = wherenow[0]
    biomovies = _parseList(biol, 'BT')
    if biomovies:
        res['biographical movies'] = biomovies
    guestapp = _buildGuests([x[6:].strip() for x in biol if x[:6] == 'GA: * '])
    if guestapp:
        res['notable tv guest appearances'] = guestapp
    tm = _parseList(biol, 'TM')
    if tm:
        res['trademarks'] = tm
    interv = _parseList(biol, 'IT')
    if interv:
        res['interviews'] = interv
    return res
def set_name(self, name):
    """Set the name of the person."""
    parsed = analyze_name(name, canonical=True)
    self.data.update(parsed)
def set_name(self, name):
    """Set the name of the person."""
    # XXX: convert name to unicode, if it's a plain string?
    parsed = analyze_name(name, canonical=1)
    self.data.update(parsed)
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.

    The page should be provided as a string, as taken from
    the www.imdb.com server.

    The final result will be a dictionary, with a key for every relevant
    section.

    Example::

        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True

    # Matches an imdbIndex disambiguator such as "(I)" or "(IV)".
    _name_imdb_index = re.compile(r'\([IVXLCDM]+\)')

    # Extraction rules for the "Born:" section of the page.
    _birth_rules = [
        Rule(key='birth date',
             extractor=Path('.//time[@itemprop="birthDate"]/@datetime')),
        Rule(key='birth place',
             extractor=Path('.//a[starts-with(@href, "/search/name?birth_place=")]/text()'))
    ]

    # Extraction rules for the "Died:" section of the page.
    _death_rules = [
        Rule(key='death date',
             extractor=Path('.//time[@itemprop="deathDate"]/@datetime')),
        Rule(key='death place',
             extractor=Path('.//a[starts-with(@href, "/search/name?death_place=")]/text()'))
    ]

    # Per-row extraction rules applied to every entry of the
    # filmography ("filmo-row" divs).
    _film_rules = [
        Rule(key='link', extractor=Path('./b/a[1]/@href')),
        Rule(key='title', extractor=Path('./b/a[1]/text()')),
        Rule(key='notes', extractor=Path('./b/following-sibling::text()')),
        Rule(key='year', extractor=Path('./span[@class="year_column"]/text()')),
        Rule(key='status', extractor=Path('./a[@class="in_production"]/text()')),
        Rule(key='rolesNoChar', extractor=Path('.//br/following-sibling::text()')),
        Rule(key='chrRoles', extractor=Path('./a[@imdbpyname]/@imdbpyname'))
    ]

    rules = [
        # Page header: the person's name, parsed into its components.
        Rule(key='name',
             extractor=Path('//h1[@class="header"]//text()',
                            transform=lambda x: analyze_name(x, canonical=1))),
        # Optional disambiguation index next to the name (e.g. "(I)").
        Rule(key='name_index',
             extractor=Path('//h1[@class="header"]/span[1]/text()')),
        Rule(key='birth info',
             extractor=Rules(section='//div[h4="Born:"]',
                             rules=_birth_rules)),
        Rule(key='death info',
             extractor=Rules(section='//div[h4="Died:"]',
                             rules=_death_rules,)),
        Rule(key='headshot',
             extractor=Path('//td[@id="img_primary"]/div[@class="image"]/a/img/@src')),
        Rule(key='akas',
             extractor=Path('//div[h4="Alternate Names:"]/text()',
                            transform=lambda x: x.strip().split('  '))),
        # Filmography: one sub-list per "filmo-head-*" section header,
        # keyed by the (lowercased) section name; every row is turned
        # into a Movie object via build_movie.
        Rule(
            key='filmography',
            extractor=Rules(
                foreach='//div[starts-with(@id, "filmo-head-")]',
                rules=[
                    Rule(
                        key=Path('./a[@name]/text()',
                                 transform=lambda x: x.lower().replace(': ', ' ')),
                        extractor=Rules(
                            foreach='./following-sibling::div[1]/div[starts-with(@class, "filmo-row")]',
                            rules=_film_rules,
                            transform=lambda x: build_movie(
                                x.get('title') or '',
                                year=x.get('year'),
                                movieID=analyze_imdbid(x.get('link') or ''),
                                rolesNoChar=(x.get('rolesNoChar') or '').strip(),
                                chrRoles=(x.get('chrRoles') or '').strip(),
                                additionalNotes=x.get('notes'),
                                status=x.get('status') or None)))
                ])),
        # Titles still in development ("devitem" entries).
        Rule(key='in development',
             extractor=Rules(
                 foreach='//div[starts-with(@class,"devitem")]',
                 rules=[
                     Rule(key='link', extractor=Path('./a/@href')),
                     Rule(key='title', extractor=Path('./a/text()'))
                 ],
                 transform=lambda x: build_movie(
                     x.get('title') or '',
                     movieID=analyze_imdbid(x.get('link') or ''),
                     roleID=(x.get('roleID') or '').split('/'),
                     status=x.get('status') or None)))
    ]

    # HTML fixes applied to the page before parsing.
    preprocessors = [('<div class="clear"/> </div>', ''), ('<br/>', '<br />')]

    def postprocess_data(self, data):
        """Flatten and normalize the parsed data dictionary."""
        # Merge the sub-dictionary produced by the 'name' rule into the
        # top-level result.
        for key in ['name']:
            if (key in data) and isinstance(data[key], dict):
                subdata = data[key]
                del data[key]
                data.update(subdata)
        # Drop empty date values.
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        # Turn "(IV)"-style disambiguators into the imdbIndex key,
        # stripping the surrounding parentheses.
        name_index = (data.get('name_index') or '').strip()
        if name_index:
            if self._name_imdb_index.match(name_index):
                data['imdbIndex'] = name_index[1:-1]
            del data['name_index']
        # XXX: the code below is for backwards compatibility
        # probably could be removed
        for key in list(data.keys()):
            # Collapse per-section actor/actress/self lists into a
            # single list under the plain key.
            if key.startswith('actor '):
                if 'actor' not in data:
                    data['actor'] = []
                data['actor'].extend(data[key])
                del data[key]
            if key.startswith('actress '):
                if 'actress' not in data:
                    data['actress'] = []
                data['actress'].extend(data[key])
                del data[key]
            if key.startswith('self '):
                if 'self' not in data:
                    data['self'] = []
                data['self'].extend(data[key])
                del data[key]
            # Legacy key names for birth/death places.
            if key == 'birth place':
                data['birth notes'] = data[key]
                del data[key]
            if key == 'death place':
                data['death notes'] = data[key]
                del data[key]
        return data
# latin_1 encoded strings. name1, name2, name3 = [x.encode('latin_1', 'replace') for x in name1, name2, name3] try: sn = search_name(keyFile, name1, name2, name3, results, _scan_character) except IOError, e: if _scan_character: import warnings warnings.warn('Unable to access characters information: %s' % e) return [] else: raise res = [] for x in sn: tmpd = analyze_name(latin2utf(x[2])) res.append((x[0], (x[1], tmpd))) return res except ImportError: import warnings warnings.warn('Unable to import the cutils.search_name function.' ' Searching names using the "local" data access system' ' will be REALLY slow.') from imdb.parser.common.locsql import scan_names def _readNamesKeyFile(keyFile): """Iterate over the given file, returning tuples suited for the common.locsql.scan_names function.""" try: kf = open(keyFile, 'r') except IOError, e: raise IMDbDataAccessError, str(e)
def get_person_main(self, personID, _parseChr=False):
    """Fetch and parse the main details page of a person (or, when
    _parseChr is true, of a character), returning a dictionary with
    a 'data' key (the parsed information) and an 'info sets' key.

    Raises IMDbDataAccessError if the page has no <title> tag.
    """
    # Characters and persons live at different URLs.
    if not _parseChr:
        url = imdbURL_person_main % personID + 'maindetails'
    else:
        url = imdbURL_character_main % personID
    s = self._mretrieve(url)
    r = {}
    name = _findBetween(s, '<title>', '</title>', maxRes=1)
    if not name:
        # No title tag: the page could not be parsed at all.
        if _parseChr: w = 'characterID'
        else: w = 'personID'
        raise IMDbDataAccessError, 'unable to get %s "%s"' % (w, personID)
    name = _unHtml(name[0])
    if _parseChr:
        name = name.replace('(Character)', '').strip()
    # Characters keep their natural name order; persons are canonicalized.
    r = analyze_name(name, canonical=not _parseChr)
    # Birth and death dates (with optional notes).
    for dKind in ('birth', 'death'):
        date = _findBetween(s, '<h5>Date of %s:</h5>' % dKind.capitalize(),
                            ('<a class', '</div>', '<br/><br/>'), maxRes=1)
        if date:
            date = _unHtml(date[0])
            if date:
                date, notes = date_and_notes(date)
                if date:
                    r['%s date' % dKind] = date
                if notes:
                    r['%s notes' % dKind] = notes
    # Alternate names; the separator can be either ' | ' or ' / '.
    akas = _findBetween(s, 'Alternate Names:</h5>', ('</div>', '<br/><br/>'),
                        maxRes=1)
    if akas:
        akas = akas[0]
        if akas.find(' | ') != -1:
            akas = _unHtml(akas).split(' | ')
        else:
            akas = _unHtml(akas).split(' / ')
        if akas: r['akas'] = akas
    # Headshot image URL.
    hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
    if hs:
        hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
        if hs: r['headshot'] = hs[0]
    # Build a list of tuples such [('hrefLink', 'section name')]
    workkind = _findBetween(s, '<div class="strip jump">', '</div>', maxRes=1)
    if workkind:
        workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
    else:
        # Assume there's only one section and/or there are no
        # section links, for some reason.
        workkind[:] = _findBetween(s, '<h5><a name=', '</a></h5>')
        workkind[:] = [x.lstrip('"').rstrip(':').lower() for x in workkind]
    ws = []
    for work in workkind:
        # Each entry is 'anchor">Section Name'; split it in two.
        wsplit = work.split('">', 1)
        if len(wsplit) == 2:
            ws.append((wsplit[0], wsplit[1].lower()))
    # XXX: I think "guest appearances" are gone.
    if s.find('<a href="#guest-appearances"') != -1:
        ws.append(('guest-appearances', 'notable tv guest appearances'))
    if _parseChr:
        ws.append(('filmography', 'filmography'))
    # Parse every filmography section.
    for sect, sectName in ws:
        raws = u''
        # Everything between the current section link and the end
        # of the <ol> tag.
        if _parseChr and sect == 'filmography':
            inisect = s.find('<div class="filmo">')
        else:
            inisect = s.find('<a name="%s' % sect)
        if inisect != -1:
            endsect = s[inisect:].find('</ol>')
            if endsect != -1: raws = s[inisect:inisect+endsect]
        if not raws: continue
        mlist = _findBetween(raws, '<li>', ('</li>', '<br>', '<br/>'))
        for m in mlist:
            # For every movie in the current section.
            movieID = re_imdbID.findall(m)
            if not movieID: continue
            # Locate the role/character text; persons use ' .... ',
            # characters use ' Played by '.
            if not _parseChr:
                chrIndx = m.find(' .... ')
            else:
                chrIndx = m.find(' Played by ')
            chids = []
            if chrIndx != -1:
                chrtxt = m[chrIndx+6:]
                if _parseChr:
                    # Skip the remainder of the ' Played by ' marker.
                    chrtxt = chrtxt[5:]
                # A role can be played by multiple characters/persons,
                # separated by ' / '.
                for ch in chrtxt.split(' / '):
                    chid = re_imdbID.findall(ch)
                    if not chid:
                        chids.append(None)
                    else:
                        chids.append(chid[-1])
            if not chids:
                chids = None
            elif len(chids) == 1:
                chids = chids[0]
            movieID = str(movieID[0])
            # Search the status.
            stidx = m.find('<i>')
            status = u''
            if stidx != -1:
                stendidx = m.rfind('</i>')
                if stendidx != -1:
                    status = _unHtml(m[stidx+3:stendidx])
                    # Remove the status text from the movie string.
                    m = m.replace(m[stidx+3:stendidx], '')
            m = _unHtml(m)
            if not m: continue
            movie = build_movie(m, movieID=movieID, status=status,
                                roleID=chids, modFunct=self._defModFunct,
                                accessSystem=self.accessSystem,
                                _parsingCharacter=_parseChr)
            r.setdefault(sectName, []).append(movie)
    # If available, take the always correct name from a form.
    itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
    if not itag:
        itag = _getTagsWith(s, 'name="primary"', maxRes=1)
    if itag:
        vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
        if not vtag:
            vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
        if vtag:
            try:
                # The form value is URL-quoted latin_1 text.
                vtag = unquote(str(vtag[0]))
                vtag = unicode(vtag, 'latin_1')
                r.update(analyze_name(vtag, canonical=0))
            except UnicodeEncodeError:
                pass
    # Primary photo URL, if present.
    photo = _findBetween(s, '<div class="photo">', '</div>', maxRes=1)
    image_url = ''
    if (len(photo)>0):
        img = _findBetween(photo[0], '<img', '/a>', maxRes=1)
        if (len(img)>0):
            image_url = _findBetween(img[0],' src="', '"', maxRes=1)[0]
    r['image_url'] = image_url
    return {'data': r, 'info sets': ('main', 'filmography')}
def get_person_main(self, personID, _parseChr=False):
    """Fetch and parse the main details page of a person (or, when
    _parseChr is true, of a character), returning a dictionary with
    a 'data' key (the parsed information) and an 'info sets' key.

    Raises IMDbDataAccessError if the page has no <title> tag.
    """
    # Characters and persons live at different URLs.
    if not _parseChr:
        url = self.urls['person_main'] % personID + 'maindetails'
    else:
        url = self.urls['character_main'] % personID
    s = self._mretrieve(url)
    r = {}
    name = _findBetween(s, '<title>', '</title>', maxRes=1)
    if not name:
        # No title tag: the page could not be parsed at all.
        if _parseChr: w = 'characterID'
        else: w = 'personID'
        raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
    # Strip the site suffix and filmography banners from the title.
    name = _unHtml(name[0].replace(' - IMDb', ''))
    if _parseChr:
        name = name.replace('(Character)', '').strip()
        name = name.replace('- Filmography by type', '').strip()
    else:
        name = name.replace('- Filmography by', '').strip()
    # Characters keep their natural name order; persons are canonicalized.
    r = analyze_name(name, canonical=not _parseChr)
    # Birth ("Born:") and death ("Died:") information.
    for dKind in ('Born', 'Died'):
        date = _findBetween(s, '%s:</h4>' % dKind.capitalize(),
                            ('<div class', '</div>', '<br/><br/>'), maxRes=1)
        if date:
            date = _unHtml(date[0])
            if date:
                #date, notes = date_and_notes(date)
                # TODO: fix to handle real names.
                # Split "DATE in PLACE" into date and notes parts.
                date_notes = date.split(' in ', 1)
                notes = u''
                date = date_notes[0]
                if len(date_notes) == 2:
                    notes = date_notes[1]
                dtitle = 'birth'
                if dKind == 'Died':
                    dtitle = 'death'
                if date:
                    r['%s date' % dtitle] = date
                if notes:
                    r['%s notes' % dtitle] = notes
    # Alternate names; the separator can be either ' | ' or ' / '.
    akas = _findBetween(s, 'Alternate Names:</h4>', ('</div>', '<br/><br/>'),
                        maxRes=1)
    if akas:
        akas = akas[0]
        if akas:
            akas = _unHtml(akas)
        if akas.find(' | ') != -1:
            akas = akas.split(' | ')
        else:
            akas = akas.split(' / ')
        if akas: r['akas'] = filter(None, [x.strip() for x in akas])
    # Headshot: try the image_src link tags first, then the old anchor.
    hs = _findBetween(s, "rel='image_src'", '>', maxRes=1)
    if not hs:
        hs = _findBetween(s, 'rel="image_src"', '>', maxRes=1)
    if not hs:
        hs = _findBetween(s, '<a name="headshot"', '</a>', maxRes=1)
    if hs:
        hsl = _findBetween(hs[0], "href='", "'", maxRes=1)
        if not hsl:
            hsl = _findBetween(hs[0], 'href="', '"', maxRes=1)
        # Skip the generic IMDb share logo used as a placeholder.
        if hsl and 'imdb-share-logo' not in hsl[0]:
            r['headshot'] = hsl[0]
    # Build a list of tuples such [('hrefLink', 'section name')]
    workkind = _findBetween(s, 'id="jumpto_', '</a>')
    ws = []
    for work in workkind:
        # The anchor/label separator can be '" >' or '">'.
        sep = '" >'
        if '">' in work: sep = '">'
        wsplit = work.split(sep, 1)
        if len(wsplit) == 2:
            sect = wsplit[0]
            if '"' in sect:
                sect = sect[:sect.find('"')]
            ws.append((sect, wsplit[1].lower()))
    # XXX: I think "guest appearances" are gone.
    if s.find('<a href="#guest-appearances"') != -1:
        ws.append(('guest-appearances', 'notable tv guest appearances'))
    #if _parseChr:
    #    ws.append(('filmography', 'filmography'))
    # Parse every filmography section.
    for sect, sectName in ws:
        raws = u''
        if sectName == 'self': sect = 'Self'
        # Everything between the current section link and the end
        # of the <ol> tag.
        if _parseChr and sect == 'filmography':
            inisect = s.find('<div class="filmo">')
        else:
            inisect = s.find('<a name="%s' % sect)
        if inisect != -1:
            # The section ends at the next section head or article div.
            endsect = s[inisect:].find('<div id="filmo-head-')
            if endsect == -1:
                endsect = s[inisect:].find('<div class="article"')
            if endsect != -1: raws = s[inisect:inisect+endsect]
        #if not raws: continue
        mlist = _findBetween(raws, '<div class="filmo-row',
                             ('<div class="clear"/>',))
        for m in mlist:
            # Drop the remainder of the opening tag attributes.
            fCB = m.find('>')
            if fCB != -1:
                m = m[fCB+1:].lstrip()
            # Remove per-episode sub-entries from series rows.
            m = re_filmo_episodes.sub('', m)
            # For every movie in the current section.
            movieID = re_imdbID.findall(m)
            if not movieID:
                self._mobile_logger.debug('no movieID in %s', m)
                continue
            m = m.replace('<br/>', ' .... ', 1)
            # Locate the role/character text; persons use ' .... ',
            # characters use ' Played by '.
            if not _parseChr:
                chrIndx = m.find(' .... ')
            else:
                chrIndx = m.find(' Played by ')
            chids = []
            if chrIndx != -1:
                chrtxt = m[chrIndx+6:]
                if _parseChr:
                    # Skip the remainder of the ' Played by ' marker.
                    chrtxt = chrtxt[5:]
                # Multiple characters/persons are separated by ' / '.
                for ch in chrtxt.split(' / '):
                    chid = re_imdbID.findall(ch)
                    if not chid:
                        chids.append(None)
                    else:
                        chids.append(chid[-1])
            if not chids:
                chids = None
            elif len(chids) == 1:
                chids = chids[0]
            movieID = str(movieID[0])
            # Search the status.
            stidx = m.find('<i>')
            status = u''
            if stidx != -1:
                stendidx = m.rfind('</i>')
                if stendidx != -1:
                    status = _unHtml(m[stidx+3:stendidx])
                    # Remove the status text from the movie string.
                    m = m.replace(m[stidx+3:stendidx], '')
            # Extract and strip the year column, if present.
            year = _findBetween(m, 'year_column">', '</span>', maxRes=1)
            if year:
                year = year[0]
                m = m.replace('<span class="year_column">%s</span>' % year, '')
            else:
                year = None
            m = _unHtml(m)
            if not m:
                self._mobile_logger.warn('no title for movieID %s', movieID)
                continue
            movie = build_movie(m, movieID=movieID, status=status,
                                roleID=chids, modFunct=self._defModFunct,
                                accessSystem=self.accessSystem,
                                _parsingCharacter=_parseChr, year=year)
            # Normalize section names like 'actor: something'.
            sectName = sectName.split(':')[0]
            r.setdefault(sectName, []).append(movie)
    # If available, take the always correct name from a form.
    itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
    if not itag:
        itag = _getTagsWith(s, 'name="primary"', maxRes=1)
    if itag:
        vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
        if not vtag:
            vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
        if vtag:
            try:
                # The form value is URL-quoted latin_1 text.
                vtag = unquote(str(vtag[0]))
                vtag = unicode(vtag, 'latin_1')
                r.update(analyze_name(vtag))
            except UnicodeEncodeError:
                pass
    return {'data': r, 'info sets': ('main', 'filmography')}