class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(label='name',
                  # note the extra trailing space in class
                  path="//h1/span[@class='display-title ']",
                  attrs=Attribute(key='name',
                                  path="./text()",
                                  postprocess=lambda x: \
                                          analyze_company_name(x, stripNotes=True))),

        # One filmography section per <b><a name=...> anchor; the movie
        # list is the <ol> that follows the anchor's parent.
        Extractor(label='filmography',
                  group="//b/a[@name]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../following-sibling::ol[1]/li",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      'link': "./a[1]/@href",
                                      'title': "./a[1]/text()",
                                      'year': "./text()[1]"
                                  },
                                  postprocess=lambda x: build_movie(u'%s %s' % \
                                          (x.get('title'), x.get('year').strip()),
                                          movieID=analyze_imdbid(x.get('link') or u''),
                                          _parsingCompany=True))),
    ]

    # Insert a closing </p> before every section anchor so that the
    # following-sibling lookups above work on a well-formed tree.
    preprocessors = [
        (re.compile('(<b><a name=)', re.I), r'</p>\1')
    ]

    def postprocess_data(self, data):
        """Rename section keys to their plural forms.

        NOTE: iterating data.keys() while deleting entries relies on
        Python 2 keys() returning a list copy.
        """
        for key in data.keys():
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data[key]
                del data[key]
        return data
def _init(self):
    """Build extractors from subclass-provided hooks.

    Relies on the subclass defining _titleAttrPath, _titleFunct and
    _linkPath (not visible in this chunk) — presumably overridden per
    page type; confirm against the subclasses.
    """
    # Normalize the mini-series marker before parsing so downstream
    # title analysis sees the canonical "(mini)" form.
    self.preprocessors += [('<span class="tv-extra">TV mini-series</span>',
                            '<span class="tv-extra">(mini)</span>')]
    self.extractors = [
        Extractor(label='title',
                  path="//h1",
                  attrs=Attribute(key='title',
                                  path=self._titleAttrPath,
                                  postprocess=self._titleFunct)),
        Extractor(label='link',
                  path=self._linkPath,
                  attrs=Attribute(key='link',
                                  path="./@href",
                                  # Strip pro.imdb.com prefixes before
                                  # extracting the numeric ID.
                                  postprocess=lambda x: \
                                      analyze_imdbid((x or u'').replace(
                                          'http://pro.imdb.com', ''))
                                  ))
    ]
class DOMHTMLPersonSalesParser(DOMParserBase):
    """Parser for the "merchandising links" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLPersonSalesParser()
        result = sparser.parse(sales_html_string)
    """
    extractors = [
        Extractor(label='merchandising links',
                  group="//span[@class='merch_title']",
                  group_key=".//text()",
                  path="./following-sibling::table[1]/" \
                          "/td[@class='w_rowtable_colshop']//tr[1]",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      'link': "./td[2]/a[1]/@href",
                                      'text': "./td[1]/img[1]/@alt",
                                      'cover': "./ancestor::td[1]/../" \
                                              "td[1]/a[1]/img[1]/@src",
                                  },
                                  postprocess=lambda x: _parse_merchandising_link(x))),
    ]

    # Self-closing <a name="..."/> anchors confuse the tree builder;
    # turn them back into open/close pairs.
    preprocessors = [(re.compile('(<a name="[^"]+" )/>', re.I), r'\1></a>')]

    def postprocess_data(self, data):
        # Collect everything under a single section key.
        if len(data) == 0:
            return {}
        return {'merchandising links': data}
class DOMHTMLOtherWorksParser(DOMParserBase):
    """Parser for the "other works" and "agent" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        owparser = DOMHTMLOtherWorksParser()
        result = owparser.parse(otherworks_html_string)
    """
    _defGetRefs = True
    kind = 'other works'

    # XXX: looks like the 'agent' page is no more public.
    extractors = [
        Extractor(label='other works',
                  path="//h5[text()='Other works']/" \
                          "following-sibling::div[1]",
                  # key='self.kind' — presumably evaluated at runtime against
                  # the instance so subclasses can override `kind`; confirm
                  # against the Attribute implementation.
                  attrs=Attribute(key='self.kind',
                                  path=".//text()",
                                  postprocess=lambda x: x.strip().split('\n\n')))
    ]

    # Wrap each section in a div and turn double <br/>s into blank lines
    # so items can be split on '\n\n' above.
    preprocessors = [
        (re.compile('(<h5>[^<]+</h5>)', re.I),
         r'</div>\1<div class="_imdbpy">'),
        (re.compile('(</table>\n</div>\s+)</div>', re.I), r'\1'),
        (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
        (re.compile('<br/><br/>', re.I), r'\n\n')
    ]
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons."""
    _BaseParser = DOMBasicPersonParser
    _notDirectHitTitle = '<title>imdb name'
    _titleBuilder = lambda self, x: build_name(x, canonical=True)
    _linkPrefix = '/name/nm'

    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'name': "./a[1]/text()",
                            'index': "./text()[1]",
                            # filled in by preprocess_string below
                            'akas': ".//div[@class='_imdbpyAKA']/text()"
                        },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            analyze_name((x.get('name') or u'') + \
                                         (x.get('index') or u''),
                                         canonical=1),
                            x.get('akas')
                        ))]

    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
                  attrs=_attrs)
    ]

    def preprocess_string(self, html_string):
        # On a results page (not a direct hit), wrap AKA text in a
        # recognizable div so the 'akas' xpath above can find it.
        if self._notDirectHitTitle in html_string[:1024].lower():
            html_string = _reAKASp.sub(
                r'\1<div class="_imdbpyAKA">\2::</div>\3',
                html_string)
        return DOMHTMLSearchMovieParser.preprocess_string(self, html_string)
class DOMHTMLSearchMovieKeywordParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, searching for movies with the
    given keyword."""
    _notDirectHitTitle = '<title>best'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': "./a[1]//text()",
                      'ynote': "./span[@class='desc']/text()",
                      'outline': "./span[@class='outline']//text()"
                  },
                  # All pieces are guarded with `or u''` since any xpath
                  # may come back None.
                  postprocess=lambda x: (analyze_imdbid(x.get('link') or u''),
                                         custom_analyze_title4kwd(
                                             x.get('info') or u'',
                                             x.get('ynote') or u'',
                                             x.get('outline') or u'')))
    ]

    extractors = [Extractor(label='search',
                            path="//td[3]/a[starts-with(@href, " \
                                    "'/title/tt')]/..",
                            attrs=_attrs)]
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons."""
    _BaseParser = DOMBasicPersonParser
    _notDirectHitTitle = '<title>imdb name'
    _titleBuilder = lambda self, x: build_name(x, canonical=True)
    _linkPrefix = '/name/nm'

    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'name': "./a[1]/text()",
                'index': "./text()[1]"
            },
            # FIX: guard 'name' with `or u''` — the xpath can yield None
            # and concatenating None raises TypeError; the other search
            # parsers in this file already guard this way.
            postprocess=lambda x: (analyze_imdbid(x.get('link') or u''),
                                   analyze_name((x.get('name') or u'') +
                                                (x.get('index') or u''),
                                                canonical=1)))
    ]

    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
                  attrs=_attrs)
    ]
class DOMHTMLCharacterQuotesParser(DOMParserBase):
    """Parser for the "quotes" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLCharacterQuotesParser()
        result = qparser.parse(character_quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='introduction',
                  group="//h5",
                  group_key="./a/text()",
                  path="./following-sibling::div[1]",
                  attrs=Attribute(key=None,
                                  path=".//text()",
                                  # NOTE(review): the two .replace() calls
                                  # look like no-ops (same search and
                                  # replacement text); whitespace was
                                  # probably collapsed — the originals
                                  # likely squeezed runs of spaces after
                                  # ':'.  Confirm against upstream source.
                                  postprocess=lambda x: x.strip().replace(': ',
                                      ': ').replace(': ', ': ').split('||'))),
    ]

    # Turn <br/> separators into '::' / '||' markers the postprocess
    # lambdas split on.
    preprocessors = [
        (re.compile('(</h5>)', re.I), r'\1<div>'),
        (re.compile('\s*<br/><br/>\s*', re.I), r'||'),
        (re.compile('\|\|\s*(<hr/>)', re.I), r'</div>\1'),
        (re.compile('\s*<br/>\s*', re.I), r'::')
    ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        return {'quotes': data}
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    """Parse the html results page for a company search."""
    _BaseParser = DOMBasicCompanyParser
    _notDirectHitTitle = '<title>find - imdb'
    _titleBuilder = lambda self, x: build_company_name(x)
    _linkPrefix = '/company/co'

    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'name': "./a[1]/text()",
                'notes': "./text()[1]"
            },
            # FIX: guard 'link' and 'name' with `or u''` — either xpath
            # can come back None, which would break analyze_imdbid and
            # raise TypeError on the string concatenation (the sibling
            # search parsers already guard this way).
            postprocess=lambda x: (analyze_imdbid(x.get('link') or u''),
                                   analyze_company_name(
                                       (x.get('name') or u'') +
                                       (x.get('notes') or u''),
                                       stripNotes=True)))
    ]

    extractors = [
        Extractor(
            label='search',
            path="//td[@class='result_text']/a[starts-with(@href, '/company/co')]/..",
            attrs=_attrs)
    ]
class DOMHTMLCharacterBioParser(DOMParserBase):
    """Parser for the "biography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterBioParser()
        result = bparser.parse(character_biography_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='introduction',
                  # '_intro' div is synthesized by the preprocessors below.
                  path="//div[@id='_intro']",
                  attrs=Attribute(key='introduction',
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),

        Extractor(label='biography',
                  # '_biography' spans are synthesized by the preprocessors.
                  path="//span[@class='_biography']",
                  attrs=Attribute(key='biography',
                                  multi=True,
                                  path={
                                      'info': "./preceding-sibling::h4[1]//text()",
                                      'text': ".//text()"
                                  },
                                  postprocess=lambda x: u'%s::%s' % (
                                      x.get('info').strip(),
                                      x.get('text').replace('\n', ' ').replace(
                                          '||', '\n\n').strip()))),
    ]

    # Rewrite the page so each biography section is delimited by
    # synthetic div/span markers the extractors above can target.
    preprocessors = [
        (re.compile('(<div id="swiki.2.3.1">)', re.I),
         r'\1<div id="_intro">'),
        (re.compile('(<a name="history">)\s*(<table .*?</table>)',
                    re.I | re.DOTALL),
         r'</div>\2\1</a>'),
        (re.compile('(<a name="[^"]+">)(<h4>)', re.I),
         r'</span>\1</a>\2'),
        (re.compile('(</h4>)</a>', re.I),
         r'\1<span class="_biography">'),
        (re.compile('<br/><br/>', re.I), r'||'),
        (re.compile('\|\|\n', re.I), r'</span>'),
    ]
class DOMHTMLSeriesParser(DOMParserBase):
    """Parser for the "by TV series" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSeriesParser()
        result = sparser.parse(filmoseries_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(label='series',
                  group="//div[@class='filmo']/span[1]",
                  # group_key is a node (the series link), not text:
                  # postprocess_data re-parses it below.
                  group_key="./a[1]",
                  path="./following-sibling::ol[1]/li/a[1]",
                  attrs=Attribute(
                      key=None,
                      multi=True,
                      path={
                          'link': "./@href",
                          'title': "./text()",
                          'info': "./following-sibling::text()",
                          'role': "./following-sibling::i[1]/text()",
                          'roleA': "./following-sibling::a[1]/text()",
                          'roleAID': "./following-sibling::a[1]/@href"
                      },
                      postprocess=lambda x: _build_episode(
                          x.get('link'), x.get('title'),
                          (x.get('info') or u'').strip(),
                          x.get('role'), x.get('roleA'), x.get('roleAID'))))
    ]

    def postprocess_data(self, data):
        """Re-parse each group key (a serialized <a> node) into a Movie
        object and attach it to every episode as 'episode of'."""
        if len(data) == 0:
            return {}
        nd = {}
        for key in data.keys():
            dom = self.get_dom(key)
            link = self.xpath(dom, "//a/@href")[0]
            # [1:-1] strips the quotes around the series title.
            title = self.xpath(dom, "//a/text()")[0][1:-1]
            series = Movie(movieID=analyze_imdbid(link),
                           data=analyze_title(title),
                           accessSystem=self._as, modFunct=self._modFunct)
            nd[series] = []
            for episode in data[key]:
                # XXX: should we create a copy of 'series', to avoid
                #      circular references?
                episode['episode of'] = series
                nd[series].append(episode)
        return {'episodes': nd}
def _init(self):
    """Build the extractor for a ranked-chart page.

    Uses subclass-provided attributes (not visible in this chunk):
    `label` (the extractor label) and `ranktext` (the key under which
    the rank column is stored) — confirm against the subclasses.
    """
    self.extractors = [
        Extractor(label=self.label,
                  path="//div[@id='main']//table//tr",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      self.ranktext: "./td[1]//text()",
                                      'rating': "./td[2]//text()",
                                      'title': "./td[3]//text()",
                                      'movieID': "./td[3]//a/@href",
                                      'votes': "./td[4]//text()"
                                  }))
    ]
class DOMHTMLSearchKeywordParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, searching for keywords similar to
    the one given."""
    _BaseParser = DOMBasicKeywordParser
    _notDirectHitTitle = '<title>imdb keyword'
    # Keywords are plain strings; no canonicalization needed.
    _titleBuilder = lambda self, x: x
    _linkPrefix = '/keyword/'

    _attrs = [Attribute(key='data',
                        multi=True,
                        path="./a[1]/text()")]

    extractors = [Extractor(label='search',
                            path="//td[3]/a[starts-with(@href, " \
                                    "'/keyword/')]/..",
                            attrs=_attrs)]
class DOMHTMLCharacterQuotesParser(DOMParserBase):
    """Parser for the "quotes" page of a given character.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLCharacterQuotesParser()
        result = qparser.parse(character_quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='charquotes',
                  group="//h5",
                  group_key="./a/text()",
                  path="./following-sibling::div[1]",
                  attrs=Attribute(
                      key=None,
                      path={
                          'txt': ".//text()",
                          'movieID': ".//a[1]/@href"
                      },
                      # NOTE(review): the two .replace() calls look like
                      # no-ops (identical search/replacement); whitespace
                      # was probably collapsed in this copy — confirm
                      # against upstream source.
                      postprocess=lambda x: (analyze_imdbid(x['movieID']),
                                             x['txt'].strip().replace(
                                                 ': ', ': ').replace(
                                                 ': ', ': ').split('||'))))
    ]

    # Turn <br/> separators into '::' / '||' markers the postprocess
    # lambda splits on.
    preprocessors = [(re.compile('(</h5>)', re.I), r'\1<div>'),
                     (re.compile('\s*<br/><br/>\s*', re.I), r'||'),
                     (re.compile('\|\|\s*(<hr/>)', re.I), r'</div>\1'),
                     (re.compile('\s*<br/>\s*', re.I), r'::')]

    def postprocess_data(self, data):
        """Turn each (movieID, quotes) pair into a Movie-keyed mapping."""
        if not data:
            return {}
        newData = {}
        for title in data:
            movieID, quotes = data[title]
            if movieID is None:
                movie = title
            else:
                movie = Movie(title=title, movieID=movieID,
                              accessSystem=self._as, modFunct=self._modFunct)
            newData[movie] = [quote.split('::') for quote in quotes]
        return {'quotes': newData}
class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
    """Parse the html results page for a character search."""
    _BaseParser = DOMBasicCharacterParser
    _notDirectHitTitle = '<title>imdb search'
    _titleBuilder = lambda self, x: build_name(x, canonical=False)
    _linkPrefix = '/character/ch'

    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'name': "./a[1]/text()"
                        },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            {'name': x.get('name')}
                        ))]

    extractors = [Extractor(label='search',
                            path="//td[3]/a[starts-with(@href, " \
                                    "'/character/ch')]/..",
                            attrs=_attrs)]
class DOMHTMLOtherWorksParser(DOMParserBase):
    """Parser for the "other works" and "agent" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        owparser = DOMHTMLOtherWorksParser()
        result = owparser.parse(otherworks_html_string)
    """
    _defGetRefs = True
    kind = 'other works'

    extractors = [
        Extractor(label='other works',
                  path="//li[@class='ipl-zebra-list__item']",
                  attrs=Attribute(key='other works',
                                  path=".//text()",
                                  multi=True,
                                  postprocess=lambda x: x.strip()))
    ]
class DOMHTMLPersonGenresParser(DOMParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    kind = 'genres'
    _containsObjects = True

    extractors = [
        Extractor(label='genres',
                  group="//b/a[@name]/following-sibling::a[1]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../../following-sibling::ol[1]/li//a[1]",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      'link': "./@href",
                                      'title': "./text()",
                                      'info': "./following-sibling::text()"
                                  },
                                  # FIX: guard 'title' and 'info' with
                                  # `or u''` — either xpath can yield None,
                                  # and `None + ...` / `None.split` would
                                  # raise (the other parsers in this file
                                  # already guard this way).
                                  postprocess=lambda x: \
                                          build_movie(
                                              (x.get('title') or u'') + \
                                              (x.get('info') or u'').split('[')[0],
                                              analyze_imdbid(x.get('link')))))
    ]

    def postprocess_data(self, data):
        """Wrap the collected sections under self.kind."""
        if len(data) == 0:
            return {}
        return {self.kind: data}
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""
    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>find - imdb</title>'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()",
                      'akas': "./i//text()"
                  },
                  postprocess=lambda x: (analyze_imdbid(x.get('link') or u''),
                                         custom_analyze_title(x.get('info') or u''),
                                         x.get('akas')))
    ]

    extractors = [
        Extractor(label='search',
                  path="//td[@class='result_text']",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the results page unchanged (modulo mini-series fixup),
        or — for a direct hit — synthesize a one-result page."""
        if self._notDirectHitTitle in html_string[:10240].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                # XXX (HTU): does this still apply?
                html_string = html_string.replace('(TV mini-series)', '(mini)')
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        # # Tries to cope with companies for which links to pro.imdb.com
        # # are missing.
        # link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td class="result_text"><a href="%s">%s</a></td>' % (link,
                                                                         title)
        return new_html

    def postprocess_data(self, data):
        if not data.has_key('data'):
            data['data'] = []
        # `results` (if set by the caller) caps the number of entries.
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            data['data'] = [x for x in data['data'] if x[0] and x[1]]
            for idx, datum in enumerate(data['data']):
                if not isinstance(datum, tuple):
                    continue
                if not datum[0] and datum[1]:
                    continue
                if datum[2] is not None:
                    #akas = filter(None, datum[2].split('::'))
                    if self._linkPrefix == '/title/tt':
                        # XXX (HTU): couldn't find a result with multiple akas
                        aka = datum[2]
                        akas = [aka[1:-1]]  # remove the quotes
                        #akas = [a.replace('" - ', '::').rstrip() for a in akas]
                        #akas = [a.replace('aka "', '', 1).replace('aka "',
                        #'', 1).lstrip() for a in akas]
                    # NOTE(review): when _linkPrefix is not '/title/tt',
                    # `akas` is unbound here — possible NameError for
                    # subclasses whose tuples carry a third element;
                    # confirm the intended nesting.
                    datum[1]['akas'] = akas
                    data['data'][idx] = (datum[0], datum[1])
                else:
                    data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        # References are not collected on search results pages.
        return data
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True

    _birth_attrs = [Attribute(key='birth date',
                              path={
                                  'day': "./a[starts-with(@href, " \
                                          "'/OnThisDay?')]/text()",
                                  'year': "./a[starts-with(@href, " \
                                          "'/BornInYear?')]/text()"
                              },
                              postprocess=lambda x: build_date(x)),
                    Attribute(key='birth notes',
                              path="./a[starts-with(@href, '/BornWhere?')]/text()")]

    _death_attrs = [Attribute(key='death date',
                              path={
                                  'day': "./a[starts-with(@href, " \
                                          "'/OnThisDay?')]/text()",
                                  'year': "./a[starts-with(@href, " \
                                          "'/DiedInYear?')]/text()"
                              },
                              postprocess=lambda x: build_date(x)),
                    Attribute(key='death notes',
                              path="./text()",
                              # TODO: check if this slicing is always correct
                              postprocess=lambda x: x.strip()[2:])]

    _film_attrs = [
        Attribute(key=None,
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'title': ".//text()",
                      'status': "./i/a//text()",
                      # '_imdbpyrole' divs are synthesized by _manageRoles.
                      'roleID': "./div[@class='_imdbpyrole']/@roleid"
                  },
                  postprocess=lambda x: build_movie(
                      x.get('title') or u'',
                      movieID=analyze_imdbid(x.get('link') or u''),
                      roleID=(x.get('roleID') or u'').split('/'),
                      status=x.get('status') or None))
    ]

    extractors = [
        Extractor(label='page title',
                  path="//title",
                  attrs=Attribute(
                      key='name',
                      path="./text()",
                      postprocess=lambda x: analyze_name(x, canonical=1))),
        Extractor(label='birth info',
                  path="//div[h5='Date of Birth:']",
                  attrs=_birth_attrs),
        Extractor(label='death info',
                  path="//div[h5='Date of Death:']",
                  attrs=_death_attrs),
        Extractor(label='headshot',
                  path="//a[@name='headshot']",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),
        Extractor(label='akas',
                  path="//div[h5='Alternate Names:']",
                  attrs=Attribute(
                      key='akas',
                      path="./text()",
                      # NOTE(review): separator reconstructed as ' | '
                      # (the literal spanned a mangled line break in this
                      # copy) — confirm against upstream source.
                      postprocess=lambda x: x.strip().split(' | '))),
        Extractor(label='filmography',
                  group="//div[@class='filmo'][h5]",
                  group_key="./h5/a[@name]/text()",
                  # [:-1] drops the trailing ':' from the section title.
                  group_key_normalize=lambda x: x.lower()[:-1],
                  path="./ol/li",
                  attrs=_film_attrs)
    ]

    preprocessors = [
        # XXX: check that this doesn't cut "status" or other info...
        (re.compile(r'<br>(\.\.\.| ?).+?</li>', re.I | re.M | re.S),
         '</li>'),
        (_reRoles, _manageRoles)
    ]
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True
    # Matches an imdbIndex disambiguator like "(I)", "(XII)".
    _name_imdb_index = re.compile(r'\([IVXLCDM]+\)')

    _birth_attrs = [Attribute(key='birth date',
                              path='.//time[@itemprop="birthDate"]/@datetime'),
                    Attribute(key='birth place',
                              path=".//a[starts-with(@href, " \
                                      "'/search/name?birth_place=')]/text()")]

    _death_attrs = [Attribute(key='death date',
                              path='.//time[@itemprop="deathDate"]/@datetime'),
                    Attribute(key='death place',
                              path=".//a[starts-with(@href, " \
                                      "'/search/name?death_place=')]/text()")]

    _film_attrs = [
        Attribute(key=None,
                  multi=True,
                  path={
                      'link': "./b/a[1]/@href",
                      'title': "./b/a[1]/text()",
                      'notes': "./b/following-sibling::text()",
                      'year': "./span[@class='year_column']/text()",
                      'status': "./a[@class='in_production']/text()",
                      'rolesNoChar': './/br/following-sibling::text()',
                      # @imdbpyname attributes are synthesized by the
                      # character-link preprocessor below.
                      'chrRoles': "./a[@imdbpyname]/@imdbpyname",
                      'roleID': "./a[starts-with(@href, '/character/')]/@href"
                  },
                  postprocess=lambda x: build_movie(
                      x.get('title') or u'',
                      year=x.get('year'),
                      movieID=analyze_imdbid(x.get('link') or u''),
                      rolesNoChar=(x.get('rolesNoChar') or u'').strip(),
                      chrRoles=(x.get('chrRoles') or u'').strip(),
                      additionalNotes=x.get('notes'),
                      roleID=(x.get('roleID') or u''),
                      status=x.get('status') or None))
    ]

    extractors = [
        Extractor(label='name',
                  path="//h1[@class='header']",
                  attrs=Attribute(key='name',
                                  path=".//text()",
                                  postprocess=lambda x: analyze_name(x,
                                                                     canonical=1))),
        Extractor(label='name_index',
                  path="//h1[@class='header']/span[1]",
                  attrs=Attribute(key='name_index',
                                  path="./text()")),
        Extractor(label='birth info',
                  path="//div[h4='Born:']",
                  attrs=_birth_attrs),
        Extractor(label='death info',
                  path="//div[h4='Died:']",
                  attrs=_death_attrs),
        Extractor(label='headshot',
                  path="//td[@id='img_primary']/div[@class='image']/a",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),
        Extractor(label='akas',
                  path="//div[h4='Alternate Names:']",
                  attrs=Attribute(key='akas',
                                  path="./text()",
                                  # NOTE(review): separator reconstructed
                                  # as two spaces (whitespace was collapsed
                                  # in this copy) — confirm upstream.
                                  postprocess=lambda x: x.strip().split('  '))),
        Extractor(label='filmography',
                  group="//div[starts-with(@id, 'filmo-head-')]",
                  group_key="./a[@name]/text()",
                  group_key_normalize=lambda x: x.lower().replace(': ', ' '),
                  path="./following-sibling::div[1]" \
                          "/div[starts-with(@class, 'filmo-row')]",
                  attrs=_film_attrs),
        Extractor(label='indevelopment',
                  path="//div[starts-with(@class,'devitem')]",
                  attrs=Attribute(key='in development',
                                  multi=True,
                                  path={
                                      'link': './a/@href',
                                      'title': './a/text()'
                                  },
                                  postprocess=lambda x: build_movie(
                                      x.get('title') or u'',
                                      movieID=analyze_imdbid(x.get('link') or u''),
                                      roleID=(x.get('roleID') or u'').split('/'),
                                      status=x.get('status') or None)))
    ]

    preprocessors = [
        ('<div class="clear"/> </div>', ''),
        ('<br/>', '<br />'),
        # Copy each character link's text into an @imdbpyname attribute,
        # so _film_attrs can read role names from the attribute.
        (re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'),
         r'\1 imdbpyname="\2@@">\2</a>')
    ]

    def postprocess_data(self, data):
        """Normalize extracted keys: imdbIndex, merged filmography
        categories and birth/death notes."""
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        name_index = (data.get('name_index') or '').strip()
        if name_index:
            if self._name_imdb_index.match(name_index):
                # Strip the surrounding parentheses.
                data['imdbIndex'] = name_index[1:-1]
            del data['name_index']
        # XXX: the code below is for backwards compatibility
        # probably could be removed
        for key in data.keys():
            if key.startswith('actor '):
                if not data.has_key('actor'):
                    data['actor'] = []
                data['actor'].extend(data[key])
                del data[key]
            if key.startswith('actress '):
                if not data.has_key('actress'):
                    data['actress'] = []
                data['actress'].extend(data[key])
                del data[key]
            if key.startswith('self '):
                if not data.has_key('self'):
                    data['self'] = []
                data['self'].extend(data[key])
                del data[key]
            if key == 'birth place':
                data['birth notes'] = data[key]
                del data[key]
            if key == 'death place':
                data['death notes'] = data[key]
                del data[key]
        return data
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
    """Parser for the "biography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterMaindetailsParser()
        result = bparser.parse(character_biography_html_string)
    """
    _containsObjects = True

    _film_attrs = [Attribute(key=None,
                             multi=True,
                             path={
                                 'link': "./a[1]/@href",
                                 'title': ".//text()",
                                 'status': "./i/a//text()",
                                 'roleID': "./a/@href"
                             },
                             postprocess=lambda x: build_movie(
                                 x.get('title') or u'',
                                 movieID=analyze_imdbid(x.get('link') or u''),
                                 roleID=_personIDs.findall(x.get('roleID') or u''),
                                 status=x.get('status') or None,
                                 _parsingCharacter=True))]

    extractors = [
        Extractor(label='title',
                  path="//title",
                  attrs=Attribute(key='name',
                                  path="./text()",
                                  postprocess=lambda x: \
                                      x.replace(' (Character)', '').strip())),
        Extractor(label='headshot',
                  path="//a[@name='headshot']",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),
        Extractor(label='akas',
                  path="//div[h5='Alternate Names:']",
                  attrs=Attribute(key='akas',
                                  path="./text()",
                                  postprocess=lambda x: x.strip().split(' / '))),
        # Filmography entries without a section header...
        Extractor(label='filmography',
                  path="//div[@class='filmo'][not(h5)]/ol/li",
                  attrs=_film_attrs),
        # ...and the ones grouped under <h5> section headers.
        Extractor(label='filmography sections',
                  group="//div[@class='filmo'][h5]",
                  group_key="./h5/a/text()",
                  # [:-1] drops the trailing ':' from the section title.
                  group_key_normalize=lambda x: x.lower()[:-1],
                  path="./ol/li",
                  attrs=_film_attrs),
    ]

    preprocessors = [
        # Check that this doesn't cut "status"...
        (re.compile(r'<br>(\.\.\.| ).+?</li>', re.I | re.M), '</li>')
    ]
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""
    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'info': ".//text()",
                            #'akas': ".//div[@class='_imdbpyAKA']//text()"
                            'akas': ".//p[@class='find-aka']//text()"
                        },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            custom_analyze_title(x.get('info') or u''),
                            x.get('akas')
                        ))]

    extractors = [Extractor(label='search',
                            path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                            attrs=_attrs)]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the results page (with mini-series/AKA markers fixed up),
        or — for a direct hit — synthesize a one-result page."""
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)', '(mini)')
                # Prefix AKA paragraphs with '::' so they split cleanly.
                html_string = html_string.replace('<p class="find-aka">',
                                                  '<p class="find-aka">::')
            #html_string = _reAKAStitles.sub(
            #        r'<div class="_imdbpyAKA">\1::</div>\2', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        # # Tries to cope with companies for which links to pro.imdb.com
        # # are missing.
        # link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                       title)
        return new_html

    def postprocess_data(self, data):
        if not data.has_key('data'):
            data['data'] = []
        # `results` (if set by the caller) caps the number of entries.
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            for idx, datum in enumerate(data['data']):
                if datum[2] is not None:
                    # NOTE: Python 2 filter() returning a list.
                    akas = filter(None, datum[2].split('::'))
                    if self._linkPrefix == '/title/tt':
                        akas = [a.replace('" - ', '::').rstrip() for a in akas]
                        akas = [a.replace('aka "', '', 1).lstrip() for a in akas]
                    datum[1]['akas'] = akas
                    data['data'][idx] = (datum[0], datum[1])
                else:
                    data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        # References are not collected on search results pages.
        return data
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    _birth_attrs = [Attribute(key='birth date',
                              path={
                                  'day': "./a[starts-with(@href, " \
                                          "'/search/name?birth_monthday=')]/text()",
                                  'year': "./a[starts-with(@href, " \
                                          "'/search/name?birth_year=')]/text()"
                              },
                              postprocess=build_date),
                    Attribute(key='birth notes',
                              path="./a[starts-with(@href, " \
                                      "'/search/name?birth_place=')]/text()")]

    _death_attrs = [Attribute(key='death date',
                              path={
                                  'day': "./a[starts-with(@href, " \
                                          "'/search/name?death_monthday=')]/text()",
                                  # the year link really uses 'death_date='
                                  'year': "./a[starts-with(@href, " \
                                          "'/search/name?death_date=')]/text()"
                              },
                              postprocess=build_date),
                    Attribute(key='death notes',
                              path="./text()",
                              # TODO: check if this slicing is always correct
                              postprocess=lambda x: u''.join(x).strip()[2:])]

    extractors = [
        Extractor(label='headshot',
                  path="//a[@name='headshot']",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),
        Extractor(label='birth info',
                  path="//table[@id='overviewTable']" \
                          "//td[text()='Date of Birth']/following-sibling::td[1]",
                  attrs=_birth_attrs),
        Extractor(label='death info',
                  path="//table[@id='overviewTable']" \
                          "//td[text()='Date of Death']/following-sibling::td[1]",
                  attrs=_death_attrs),
        # NOTE(review): 'Nickenames' looks like a typo for the page label
        # ('Nickname(s)') — confirm against the live page before changing,
        # since the xpath must match IMDb's exact text.
        Extractor(label='nick names',
                  path="//table[@id='overviewTable']" \
                          "//td[text()='Nickenames']/following-sibling::td[1]",
                  attrs=Attribute(key='nick names',
                                  path="./text()",
                                  joiner='|',
                                  postprocess=lambda x: [n.strip().replace(' (',
                                      '::(', 1) for n in x.split('|')
                                      if n.strip()])),
        Extractor(label='birth name',
                  path="//table[@id='overviewTable']" \
                          "//td[text()='Birth Name']/following-sibling::td[1]",
                  attrs=Attribute(key='birth name',
                                  path="./text()",
                                  postprocess=lambda x: canonicalName(x.strip()))),
        Extractor(label='height',
                  path="//table[@id='overviewTable']" \
                          "//td[text()='Height']/following-sibling::td[1]",
                  attrs=Attribute(key='height',
                                  path="./text()",
                                  postprocess=lambda x: x.strip())),
        # The count() trick selects only the divs directly following the
        # 'mini_bio' anchor (i.e. the bio paragraphs, not later sections).
        Extractor(label='mini biography',
                  path="//a[@name='mini_bio']/following-sibling::" \
                          "div[1 = count(preceding-sibling::a[1] | " \
                          "../a[@name='mini_bio'])]",
                  attrs=Attribute(key='mini biography',
                                  multi=True,
                                  path={
                                      'bio': ".//text()",
                                      'by': ".//a[@name='ba']//text()"
                                  },
                                  postprocess=lambda x: "%s::%s" % \
                                      ((x.get('bio') or u'').split(
                                          '- IMDb Mini Biography By:')[0].strip(),
                                       (x.get('by') or u'').strip() or u'Anonymous'))),
        Extractor(label='spouse',
                  path="//div[h5='Spouse']/table/tr",
                  attrs=Attribute(key='spouse',
                                  multi=True,
                                  path={
                                      'name': "./td[1]//text()",
                                      'info': "./td[2]//text()"
                                  },
                                  postprocess=lambda x: ("%s::%s" % \
                                      (x.get('name').strip(),
                                       (x.get('info') or u'').strip())).strip(':'))),
        Extractor(label='trade mark',
                  path="//div[h5='Trade Mark']/p",
                  attrs=Attribute(key='trade mark',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(label='trivia',
                  path="//div[h5='Trivia']/p",
                  attrs=Attribute(key='trivia',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(label='quotes',
                  path="//div[h5='Personal Quotes']/p",
                  attrs=Attribute(key='quotes',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(label='salary',
                  path="//div[h5='Salary']/table/tr",
                  attrs=Attribute(key='salary history',
                                  multi=True,
                                  path={
                                      'title': "./td[1]//text()",
                                      'info': "./td[2]/text()",
                                  },
                                  postprocess=lambda x: "%s::%s" % \
                                      (x.get('title').strip(),
                                       x.get('info').strip()))),
        Extractor(label='where now',
                  path="//div[h5='Where Are They Now']/p",
                  attrs=Attribute(key='where now',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
    ]

    # Wrap each <h5> section in its own div so the section xpaths above
    # select cleanly delimited content.
    preprocessors = [
        (re.compile('(<h5>)', re.I),
         r'</div><div class="_imdbpy">\1'),
        (re.compile('(</table>\n</div>\s+)</div>', re.I + re.DOTALL),
         r'\1'),
        (re.compile('(<div id="tn15bot">)'),
         r'</div>\1'),
        (re.compile('\.<br><br>([^\s])', re.I),
         r'. \1')
    ]

    def postprocess_data(self, data):
        # Drop empty birth/death dates produced by partial matches.
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        return data
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""
    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x, canonical=True)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()"
                  },
                  postprocess=lambda x: (analyze_imdbid(x.get('link') or u''),
                                         analyze_title(x.get('info') or u'',
                                                       canonical=1)))
    ]

    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the results page (mini-series normalized, AKAs cut),
        or — for a direct hit — synthesize a one-result page."""
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)', '(mini)')
                html_string = _reAKAS.sub('</td>', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        # # Tries to cope with companies for which links to pro.imdb.com
        # # are missing.
        # link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                       title)
        return new_html

    def postprocess_data(self, data):
        if not data.has_key('data'):
            data['data'] = []
        # `results` (if set by the caller) caps the number of entries.
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        return data

    def add_refs(self, data):
        # References are not collected on search results pages.
        return data
class DOMHTMLResumeParser(DOMParserBase):
    """Parser for the "resume" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        resumeparser = DOMHTMLResumeParser()
        result = resumeparser.parse(resume_html_string)
    """
    _defGetRefs = True

    extractors = [
        # Sections laid out as <b>title</b> + free text list items;
        # parsed as (title, description) pairs.
        Extractor(
            label='info',
            group="//div[@class='section_box']",
            group_key="./h3/text()",
            group_key_normalize=lambda x: x.lower().replace(' ', '_'),
            path="./ul[@class='resume_section_multi_list']//li",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'title': ".//b//text()",
                    'desc': ".//text()",
                },
                postprocess=lambda x: (x.get('title'),
                                       x.get('desc').strip().replace('\n',
                                                                     ' ')))),
        # Plain lists (the class is injected by the preprocessor below).
        Extractor(
            label='other_info',
            group="//div[@class='section_box']",
            group_key="./h3/text()",
            group_key_normalize=lambda x: x.lower().replace(' ', '_'),
            path="./ul[@class='_imdbpy']//li",
            attrs=Attribute(
                key=None,
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip().replace('\n', ' '))),
        # Credits tables: up to three columns per row.
        Extractor(
            label='credits',
            group="//div[@class='section_box']",
            group_key="./h3/text()",
            group_key_normalize=lambda x: x.lower().replace(' ', '_'),
            path="./table[@class='credits']//tr",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    '0': ".//td[1]//text()",
                    '1': ".//td[2]//text()",
                    '2': ".//td[3]//text()",
                },
                postprocess=lambda x: [x.get('0'), x.get('1'), x.get('2')])),
        Extractor(
            label='mini_info',
            path="//div[@class='center']",
            attrs=Attribute(
                key='mini_info',
                path=".//text()",
                postprocess=lambda x: x.strip().replace('\n', ' '))),
        Extractor(
            label='name',
            path="//div[@class='center']/h1[@id='preview_user_name']",
            attrs=Attribute(
                key='name',
                path=".//text()",
                postprocess=lambda x: x.strip().replace('\n', ' '))),
        Extractor(
            label='resume_bio',
            path="//div[@id='resume_rendered_html']//p",
            attrs=Attribute(key='resume_bio', multi=True, path=".//text()")),
    ]

    # Tag plain <ul> lists so the 'other_info' extractor can find them.
    preprocessors = [
        (re.compile('(<ul>)', re.I), r'<ul class="_imdbpy">\1'),
    ]

    def postprocess_data(self, data):
        """Clean up each parsed section and wrap everything under a
        single 'resume' key."""
        # Iterate over a snapshot of the keys: entries are deleted below.
        for key in list(data.keys()):
            if data[key] == '':
                # Drop empty sections.  The 'continue' is essential: the
                # lookups below would otherwise raise KeyError on the
                # just-deleted key.
                del data[key]
                continue
            if key in ('mini_info', 'name', 'resume_bio'):
                if key == 'resume_bio':
                    data[key] = "".join(data[key]).strip()
                continue
            if len(data[key][0]) == 3:
                # Credits rows: drop the None placeholders left by
                # missing columns.
                for item in data[key]:
                    item[:] = [x for x in item if x is not None]
                continue
            if len(data[key][0]) == 2:
                # (title, description) pairs: convert to a dictionary,
                # stripping the repeated 'Title:' prefix from the value.
                new_key = {}
                for item in data[key]:
                    if item[0] is None:
                        continue
                    if ':' in item[0]:
                        if item[1].replace(item[0], '')[1:].strip() == '':
                            # Nothing left besides the title itself.
                            continue
                        new_key[item[0].strip().replace(':', '')] = \
                            item[1].replace(item[0], '')[1:].strip()
                    else:
                        new_key[item[0]] = item[1]
                data[key] = new_key
        return {'resume': data}
class DOMHTMLBioParser(DOMParserBase): """Parser for the "biography" page of a given person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: bioparser = DOMHTMLBioParser() result = bioparser.parse(biography_html_string) """ _defGetRefs = True _birth_attrs = [Attribute(key='birth date', path={ 'day': "./a[starts-with(@href, " \ "'/OnThisDay?')]/text()", 'year': "./a[starts-with(@href, " \ "'/BornInYear?')]/text()" }, postprocess=lambda x: build_date(x)), Attribute(key='birth notes', path="./a[starts-with(@href, '/BornWhere?')]/text()")] _death_attrs = [Attribute(key='death date', path={ 'day': "./a[starts-with(@href, " \ "'/OnThisDay?')]/text()", 'year': "./a[starts-with(@href, " \ "'/DiedInYear?')]/text()" }, postprocess=lambda x: build_date(x)), Attribute(key='death notes', path="./text()", # TODO: check if this slicing is always correct postprocess=lambda x: u''.join(x).strip()[2:])] extractors = [ Extractor(label='birth info', path="//div[h5='Date of Birth']", attrs=_birth_attrs), Extractor(label='death info', path="//div[h5='Date of Death']", attrs=_death_attrs), Extractor(label='nick names', path="//div[h5='Nickname']", attrs=Attribute(key='nick names', path="./text()", joiner='|', postprocess=lambda x: [n.strip().replace(' (', '::(', 1) for n in x.split('|') if n.strip()])), Extractor(label='birth name', path="//div[h5='Birth Name']", attrs=Attribute(key='birth name', path="./text()", postprocess=lambda x: canonicalName(x.strip()))), Extractor(label='height', path="//div[h5='Height']", attrs=Attribute(key='height', path="./text()", postprocess=lambda x: x.strip())), Extractor(label='mini biography', path="//div[h5='Mini Biography']", attrs=Attribute(key='mini biography', multi=True, path={ 'bio': "./p//text()", 'by': "./b/following-sibling::a/text()" }, postprocess=lambda x: "%s::%s" % \ (x.get('bio').strip(), (x.get('by') or u'').strip() or 
u'Anonymous'))), Extractor(label='spouse', path="//div[h5='Spouse']/table/tr", attrs=Attribute(key='spouse', multi=True, path={ 'name': "./td[1]//text()", 'info': "./td[2]//text()" }, postprocess=lambda x: "%s::%s" % \ (x.get('name').strip(), x.get('info').strip()))), Extractor(label='trade mark', path="//div[h5='Trade Mark']/p", attrs=Attribute(key='trade mark', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='trivia', path="//div[h5='Trivia']/p", attrs=Attribute(key='trivia', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='quotes', path="//div[h5='Personal Quotes']/p", attrs=Attribute(key='quotes', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='salary', path="//div[h5='Salary']/table/tr", attrs=Attribute(key='salary history', multi=True, path={ 'title': "./td[1]//text()", 'info': "./td[2]/text()", }, postprocess=lambda x: "%s::%s" % \ (x.get('title').strip(), x.get('info').strip()))), Extractor(label='where now', path="//div[h5='Where Are They Now']/p", attrs=Attribute(key='where now', multi=True, path=".//text()", postprocess=lambda x: x.strip())), ] preprocessors = [(re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'), (re.compile('(</table>\n</div>\s+)</div>', re.I + re.DOTALL), r'\1'), (re.compile('(<div id="tn15bot">)'), r'</div>\1'), (re.compile('\.<br><br>([^\s])', re.I), r'. \1')]