def get_person_biography(self, personID):
    """Fetch and parse the "bio" page of the given person.

    personID -- ID of the person whose biography page is retrieved.

    Returns a {'data': {...}} dictionary with one key per relevant
    biography section.
    """
    cont = self._mretrieve(imdbURL_person_main % personID + 'bio')
    d = {}
    # Spouses live in their own table; collect them before the generic
    # '<h5>'-delimited sections below.
    spouses = _findBetween(cont, 'Spouse</h5>', ('</table>', '</dd>'),
                           maxRes=1)
    if spouses:
        sl = []
        for spouse in spouses[0].split('</tr>'):
            if spouse.count('</td>') > 1:
                # Separate the spouse name from the marriage notes.
                spouse = spouse.replace('</td>', '::</td>', 1)
            spouse = _unHtml(spouse)
            spouse = spouse.replace(':: ', '::').strip()
            if spouse:
                sl.append(spouse)
        if sl:
            d['spouse'] = sl
    # Every other section comes as a '<h5>Title</h5>data' chunk.
    misc_sects = _findBetween(cont, '<h5>', '<br/>')
    misc_sects[:] = [x.split('</h5>') for x in misc_sects]
    misc_sects[:] = [x for x in misc_sects if len(x) == 2]
    for sect, data in misc_sects:
        sect = sect.lower().replace(':', '').strip()
        # FIX: use the 'in' operator instead of the Python 2-only
        # dict.has_key() method.
        if sect in d:
            continue
        # Normalize section titles to the keys used elsewhere.
        if sect == 'salary':
            sect = 'salary history'
        elif sect == 'spouse':
            # Already collected above.
            continue
        elif sect == 'nickname':
            sect = 'nick names'
        elif sect == 'where are they now':
            sect = 'where now'
        elif sect == 'personal quotes':
            sect = 'quotes'
        data = data.replace('</p><p>', '::')
        data = data.replace('<br><br>', ' ')  # for multi-paragraphs 'bio'
        data = data.replace('</td> <td valign="top">', '@@@@')
        data = data.replace('</td> </tr>', '::')
        data = _unHtml(data)
        data = [x.strip() for x in data.split('::')]
        data[:] = [x.replace('@@@@', '::') for x in data if x]
        # FIX: skip sections that are empty after cleanup, so the
        # data[0] accesses below can't raise IndexError.
        if not data:
            continue
        if sect == 'height':
            data = data[0]
        elif sect == 'birth name':
            data = canonicalName(data[0])
        elif sect == 'date of birth':
            date, notes = date_and_notes(data[0])
            if date:
                d['birth date'] = date
            if notes:
                d['birth notes'] = notes
            continue
        elif sect == 'date of death':
            date, notes = date_and_notes(data[0])
            if date:
                d['death date'] = date
            if notes:
                d['death notes'] = notes
            continue
        elif sect == 'mini biography':
            ndata = []
            for bio in data:
                byidx = bio.rfind('IMDb Mini Biography By')
                if byidx != -1:
                    # Move the author in front: 'author::text'.
                    bio = u'%s::%s' % (bio[byidx + 23:].lstrip(),
                                       bio[:byidx].rstrip())
                ndata.append(bio)
            data[:] = ndata
        d[sect] = data
    return {'data': d}
def _add_items(self):
    """Flush the section collected so far into self._bio_data."""
    # Add a new section in the biography.
    if self._in_content and self._sect_name and self._sect_data:
        sect = self._sect_name.strip().lower()
        # XXX: to get rid of the last colons and normalize section names.
        if sect[-1] == ':':
            sect = sect[:-1]
        if sect == 'salary':
            sect = 'salary history'
        elif sect == 'nickname':
            sect = 'nick names'
        elif sect == 'where are they now':
            sect = 'where now'
        elif sect == 'personal quotes':
            sect = 'quotes'
        elif sect == 'date of birth':
            sect = 'birth date'
        elif sect == 'date of death':
            sect = 'death date'
        data = self._sect_data.strip()
        # '::' separates the individual items of a section.
        d_split = data.split('::')
        # NOTE: relies on Python 2's list-returning filter();
        # slice assignment keeps d_split a list either way.
        d_split[:] = filter(None, [x.strip() for x in d_split])
        # Do some transformation on some special cases.
        if sect == 'salary history':
            newdata = []
            for j in d_split:
                # '@@@@' was the placeholder separating title and salary.
                j = filter(None, [x.strip() for x in j.split('@@@@')])
                newdata.append('::'.join(j))
            d_split[:] = newdata
        elif sect == 'nick names':
            d_split[:] = [normalizeName(x) for x in d_split]
        elif sect == 'birth name':
            d_split = canonicalName(d_split[0])
        elif sect == 'height':
            d_split = d_split[0]
        elif sect == 'spouse':
            d_split[:] = [x.replace(' (', '::(', 1).replace(' ::', '::')
                          for x in d_split]
        # Birth/death date are in both maindetails and bio pages;
        # it's safe to collect both of them.
        if sect == 'birth date':
            date, notes = date_and_notes(d_split[0])
            if date:
                self._bio_data['birth date'] = date
            if notes:
                self._bio_data['birth notes'] = notes
        elif sect == 'death date':
            date, notes = date_and_notes(d_split[0])
            if date:
                self._bio_data['death date'] = date
            if notes:
                self._bio_data['death notes'] = notes
        elif d_split:
            # Multiple items are added separately (e.g.: 'trivia' is
            # a list of strings).
            self._bio_data[sect] = d_split
        # Reset the accumulator state for the next section.
        self._sect_name = u''
        self._sect_data = u''
        self._in_sect = 0
def person_to_dict(person):
    """Convert a Person object into a plain dictionary.

    person -- an object exposing a 'data' mapping (with a 'name' key)
              and a 'personID' attribute.

    Returns a dict with 'canonical_name', 'name' and 'person_id' keys,
    or None if the required information is missing or malformed.
    """
    try:
        person_dict = {'canonical_name': utils.canonicalName(person.data['name']),
                       'name': utils.normalizeName(person.data['name']),
                       'person_id': int(person.personID)}
    # FIX: a bare 'except:' also swallows SystemExit and
    # KeyboardInterrupt; catch only genuine runtime errors.
    except Exception:
        person_dict = None
    return person_dict
def scan_names(name_list, name, results=0, ro_threshold=RO_THRESHOLD):
    """Scan a list of names, searching for best matches against
    some variations.
    :param name_list: list of (personID, {person_data}) tuples
    :type name_list: list
    :param name: searched name
    :type name: str
    :results: returns at most as much results (all, if 0)
    :type results: int
    :param ro_threshold: ignore results with a score lower than this value
    :type ro_threshold: float
    :returns: list of results sorted by similarity
    :rtype: list"""
    canonical_name = canonicalName(name).replace(',', '')
    matcher_plain = SequenceMatcher()
    matcher_canon = SequenceMatcher()
    matcher_plain.set_seq1(name.lower())
    matcher_canon.set_seq1(canonical_name.lower())
    best_by_id = {}
    for person_id, person_data in name_list:
        candidate = person_data['name']
        # Score against both the searched name and its canonical form;
        # the plain match gets a small bonus.
        score = max(
            ratcliff(name, candidate, matcher_plain) + 0.1,
            ratcliff(name, canonicalName(candidate).replace(',', ''),
                     matcher_canon)
        )
        if score < ro_threshold:
            continue
        previous = best_by_id.get(person_id)
        # Keep only the best score seen for each personID.
        if previous is None or score > previous[0]:
            best_by_id[person_id] = (score, (person_id, person_data))
    matches = sorted(best_by_id.values(), reverse=True)
    if results > 0:
        matches = matches[:results]
    return matches
def _getitem(self, key):
    """Handle special keys."""
    # Name-derived keys are only meaningful when a name is present.
    if 'name' in self.data:
        if key == 'long imdb name':
            return build_name(self.data, canonical=False)
        if key == 'long imdb canonical name':
            return build_name(self.data, canonical=True)
        if key == 'name':
            return normalizeName(self.data['name'])
        if key == 'canonical name':
            return canonicalName(self.data['name'])
    if key == 'full-size headshot':
        return self.get_fullsizeURL()
    return None
def scan_names(name_list, name, results=0, ro_threshold=RO_THRESHOLD):
    """Scan a list of names, searching for best matches against
    some variations.
    :param name_list: list of (personID, {person_data}) tuples
    :type name_list: list
    :param name: searched name
    :type name: str
    :results: returns at most as much results (all, if 0)
    :type results: int
    :param ro_threshold: ignore results with a score lower than this value
    :type ro_threshold: float
    :returns: list of results sorted by similarity
    :rtype: list"""
    cname = canonicalName(name).replace(',', '')
    sm_direct = SequenceMatcher()
    sm_direct.set_seq1(name.lower())
    sm_canon = SequenceMatcher()
    sm_canon.set_seq1(cname.lower())
    scored = {}
    for pid, pdata in name_list:
        other = pdata['name']
        # Distance against the name as-is (with a small bonus) and
        # against its canonical form.
        direct_ratio = ratcliff(name, other, sm_direct) + 0.1
        canon_ratio = ratcliff(name, canonicalName(other).replace(',', ''),
                               sm_canon)
        ratio = direct_ratio if direct_ratio >= canon_ratio else canon_ratio
        # Record only entries above the threshold, keeping the best
        # ratio per personID.
        if ratio >= ro_threshold and (pid not in scored
                                      or ratio > scored[pid][0]):
            scored[pid] = (ratio, (pid, pdata))
    ordered = list(scored.values())
    ordered.sort(reverse=True)
    return ordered[:results] if results > 0 else ordered
def name_soundexes(name):
    """Return three soundex codes for the given name.

    :param name: person name
    :type name: str
    :returns: tuple of soundex codes:
        (S(Name Surname), S(Surname Name), S(Surname))
    :rtype: tuple
    """
    if not name:
        return None, None, None
    code_direct = soundex(name)
    cname = canonicalName(name)
    code_canonical = soundex(cname)
    # Drop the canonical code when it adds no information.
    if code_canonical == code_direct:
        code_canonical = None
    # The surname is the part before the first ', ' of the
    # canonical form.
    code_surname = soundex(cname.split(', ')[0])
    if code_surname and (code_surname == code_direct
                         or code_surname == code_canonical):
        code_surname = None
    return code_direct, code_canonical, code_surname
def nameVariations(name, fromPtdf=0):
    """Build name variations useful for searches; if fromPtdf is true,
    the input is assumed to be in the plain text data files format.

    Returns a (name1, name2, name3) tuple:
    name1 -- the name in canonical format;
    name2 -- the name in normal format, or u'' if equal to name1;
    name3 -- the canonical name with the imdbIndex, or u'' if absent.
    """
    name1 = name2 = name3 = u''
    if fromPtdf or re_nameIndex.search(name):
        # We've a name with an (imdbIndex)
        namedict = analyze_name(name, canonical=1)
        # name1 is the name in the canonical format.
        name1 = namedict['name']
        # name3 is the canonical name with the imdbIndex.
        if fromPtdf:
            # FIX: use the 'in' operator instead of the Python 2-only
            # dict.has_key() method.
            if 'imdbIndex' in namedict:
                name3 = name
        else:
            name3 = build_name(namedict, canonical=1)
    else:
        # name1 is the name in the canonical format.
        name1 = canonicalName(name)
        name3 = u''
    # name2 is the name in the normal format, if it differs from name1.
    name2 = normalizeName(name1)
    if name1 == name2:
        name2 = u''
    return name1, name2, name3
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    # XPath rules applied inside the "Born" cell of the overview table.
    _birth_rules = [
        Rule(key='birth date',
             extractor=Path('./time[@itemprop="birthDate"]/@datetime')),
        Rule(
            key='birth notes',
            extractor=Path(
                './a[starts-with(@href, "/search/name?birth_place=")]/text()'))
    ]
    # XPath rules applied inside the "Died" cell of the overview table.
    _death_rules = [
        Rule(key='death date',
             extractor=Path('./time[@itemprop="deathDate"]/@datetime')),
        Rule(key='death cause',
             extractor=Path(
                 './text()',
                 # The leading two characters are presumably a separator
                 # (e.g. '( ') — TODO confirm against a live page.
                 transform=lambda x: ''.join(x).strip()[2:].lstrip())),
        Rule(key='death notes',
             extractor=Path('..//text()',
                            # Collapse whitespace and keep the last line.
                            transform=lambda x: _re_spaces.sub(
                                ' ', (x or '').strip().split('\n')[-1])))
    ]
    rules = [
        Rule(key='headshot',
             extractor=Path('//img[@class="poster"]/@src')),
        Rule(key='birth info',
             extractor=Rules(section='//table[@id="overviewTable"]'
                                     '//td[text()="Born"]/following-sibling::td[1]',
                             rules=_birth_rules)),
        Rule(key='death info',
             extractor=Rules(section='//table[@id="overviewTable"]'
                                     '//td[text()="Died"]/following-sibling::td[1]',
                             rules=_death_rules)),
        Rule(
            key='nick names',
            extractor=Path(
                '//table[@id="overviewTable"]'
                '//td[starts-with(text(), "Nickname")]/following-sibling::td[1]/text()',
                # Join with '|' and re-split to flatten multiple text nodes;
                # ' (' marks the start of a nickname's notes.
                reduce=lambda xs: '|'.join(xs),
                transform=lambda x: [
                    n.strip().replace(' (', '::(', 1)
                    for n in x.split('|') if n.strip()
                ])),
        Rule(key='birth name',
             extractor=Path(
                 '//table[@id="overviewTable"]'
                 '//td[text()="Birth Name"]/following-sibling::td[1]/text()',
                 transform=lambda x: canonicalName(x.strip()))),
        Rule(key='height',
             extractor=Path(
                 '//table[@id="overviewTable"]'
                 '//td[text()="Height"]/following-sibling::td[1]/text()',
                 transform=str.strip)),
        Rule(
            key='mini biography',
            extractor=Rules(
                # Select only the div(s) immediately following the
                # mini_bio anchor.
                foreach='//a[@name="mini_bio"]/following-sibling::'
                        'div[1 = count(preceding-sibling::a[1] | ../a[@name="mini_bio"])]',
                rules=[
                    Rule(key='bio', extractor=Path('.//text()')),
                    Rule(key='by', extractor=Path('.//a[@name="ba"]//text()'))
                ],
                # 'author::text' format; anonymous when no author link.
                transform=lambda x: "%s::%s" % (
                    (x.get('bio') or '').split('- IMDb Mini Biography By:')[0].strip(),
                    (x.get('by') or '').strip() or 'Anonymous'))),
        Rule(
            key='spouse',
            extractor=Rules(
                foreach='//a[@name="spouse"]/following::table[1]//tr',
                rules=[
                    Rule(key='name', extractor=Path('./td[1]//text()')),
                    Rule(key='info', extractor=Path('./td[2]//text()'))
                ],
                # 'name::notes', with stray leading/trailing ':' removed.
                transform=lambda x: ("%s::%s" % (
                    x.get('name').strip(),
                    (_re_spaces.sub(' ', x.get('info') or '')).strip())).strip(':'))),
        Rule(
            key='trade mark',
            extractor=Path(
                foreach='//div[@class="_imdbpyh4"]/h4[starts-with(text(), "Trade Mark")]'
                        '/.././div[contains(@class, "soda")]',
                path='.//text()',
                transform=str.strip)),
        Rule(key='trivia',
             extractor=Path(
                 foreach='//div[@class="_imdbpyh4"]/h4[starts-with(text(), "Trivia")]'
                         '/.././div[contains(@class, "soda")]',
                 path='.//text()',
                 transform=str.strip)),
        Rule(
            key='quotes',
            extractor=Path(
                foreach='//div[@class="_imdbpyh4"]/h4[starts-with(text(), "Personal Quotes")]'
                        '/.././div[contains(@class, "soda")]',
                path='.//text()',
                transform=str.strip)),
        Rule(key='salary history',
             extractor=Rules(
                 foreach='//a[@name="salary"]/following::table[1]//tr',
                 rules=[
                     Rule(key='title', extractor=Path('./td[1]//text()')),
                     Rule(key='info', extractor=Path('./td[2]//text()'))
                 ],
                 transform=lambda x: "%s::%s" % (
                     x.get('title').strip(),
                     _re_spaces.sub(' ', (x.get('info') or '')).strip())))
    ]

    # Regex rewrites applied to the raw HTML before parsing, to wrap
    # sections in easily-addressable container divs.
    preprocessors = [
        (re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
        (re.compile('(<h4)', re.I), r'</div><div class="_imdbpyh4">\1'),
        (re.compile('(</table>\n</div>\s+)</div>', re.I + re.DOTALL), r'\1'),
        (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
        (re.compile('\.<br><br>([^\s])', re.I), r'. \1')
    ]

    def postprocess_data(self, data):
        """Flatten the nested birth/death dicts and drop empty values."""
        for key in ['birth info', 'death info']:
            if key in data and isinstance(data[key], dict):
                subdata = data[key]
                del data[key]
                data.update(subdata)
        for what in 'birth date', 'death date', 'death cause':
            if what in data and not data[what]:
                del data[what]
        return data
def get_person_biography(self, personID):
    """Fetch and parse the "bio" page of the given person.

    personID -- ID of the person whose biography page is retrieved.

    Returns a {'data': {...}} dictionary with one key per relevant
    biography section.
    """
    cont = self._mretrieve(self.urls['person_main'] % personID + 'bio')
    d = {}
    # Spouses live in their own table; collect them before the generic
    # '<h5>'-delimited sections below.
    spouses = _findBetween(cont, 'Spouse</h5>', ('</table>', '</dd>'),
                           maxRes=1)
    if spouses:
        sl = []
        for spouse in spouses[0].split('</tr>'):
            if spouse.count('</td>') > 1:
                # Separate the spouse name from the marriage notes.
                spouse = spouse.replace('</td>', '::</td>', 1)
            spouse = _unHtml(spouse)
            spouse = spouse.replace(':: ', '::').strip()
            if spouse:
                sl.append(spouse)
        if sl:
            d['spouse'] = sl
    # Nicknames also have a dedicated block.
    nnames = _findBetween(cont, '<h5>Nickname</h5>',
                          ('<br/> <br/>', '<h5>'), maxRes=1)
    if nnames:
        nnames = nnames[0]
        if nnames:
            nnames = [x.strip().replace(' (', '::(', 1)
                      for x in nnames.split('<br/>')]
            if nnames:
                d['nick names'] = nnames
    # Every other section comes as a '<h5>Title</h5>data' chunk.
    misc_sects = _findBetween(cont, '<h5>', '<br/>')
    misc_sects[:] = [x.split('</h5>') for x in misc_sects]
    misc_sects[:] = [x for x in misc_sects if len(x) == 2]
    for sect, data in misc_sects:
        sect = sect.lower().replace(':', '').strip()
        # FIX: use the 'in' operator instead of the Python 2-only
        # dict.has_key() method.  Multiple mini biographies are merged.
        if sect in d and sect != 'mini biography':
            continue
        elif sect in ('spouse', 'nickname'):
            # Already collected above.
            continue
        # Normalize section titles to the keys used elsewhere.
        if sect == 'salary':
            sect = 'salary history'
        elif sect == 'where are they now':
            sect = 'where now'
        elif sect == 'personal quotes':
            sect = 'quotes'
        data = data.replace('</p><p>', '::')
        data = data.replace('<br><br>', ' ')  # for multi-paragraphs 'bio'
        data = data.replace('</td> <td valign="top">', '@@@@')
        data = data.replace('</td> </tr>', '::')
        data = _unHtml(data)
        data = [x.strip() for x in data.split('::')]
        data[:] = [x.replace('@@@@', '::') for x in data if x]
        # FIX: skip sections that are empty after cleanup, so the
        # data[0]/ndata[0] accesses below can't raise IndexError.
        if not data:
            continue
        if sect == 'height':
            data = data[0]
        elif sect == 'birth name':
            data = canonicalName(data[0])
        elif sect == 'date of birth':
            date, notes = date_and_notes(data[0])
            if date:
                d['birth date'] = date
            if notes:
                d['birth notes'] = notes
            continue
        elif sect == 'date of death':
            date, notes = date_and_notes(data[0])
            if date:
                d['death date'] = date
            if notes:
                d['death notes'] = notes
            continue
        elif sect == 'mini biography':
            ndata = []
            for bio in data:
                byidx = bio.rfind('IMDb Mini Biography By')
                if byidx != -1:
                    bioAuth = bio[:byidx].rstrip()
                    bioText = bio[byidx + 23:].lstrip()
                else:
                    bioAuth = 'Anonymous'
                    # FIX: only slice the text when the marker was found;
                    # with byidx == -1, bio[byidx + 23:] used to chop the
                    # first 22 characters off anonymous biographies.
                    bioText = bio
                bio = u'%s::%s' % (bioAuth, bioText)
                ndata.append(bio)
            data[:] = ndata
            if 'mini biography' in d:
                d['mini biography'].append(ndata[0])
                continue
        d[sect] = data
    return {'data': d}
def get_person_biography(self, personID):
    """Fetch and parse the "bio" page of the given person.

    personID -- ID of the person whose biography page is retrieved.

    Returns a {"data": {...}} dictionary with one key per relevant
    biography section.
    """
    cont = self._mretrieve(self.urls["person_main"] % personID + "bio")
    d = {}
    # Spouses live in their own table; collect them before the generic
    # "<h5>"-delimited sections below.
    spouses = _findBetween(cont, "Spouse</h5>", ("</table>", "</dd>"), maxRes=1)
    if spouses:
        sl = []
        for spouse in spouses[0].split("</tr>"):
            if spouse.count("</td>") > 1:
                # Separate the spouse name from the marriage notes.
                spouse = spouse.replace("</td>", "::</td>", 1)
            spouse = _unHtml(spouse)
            spouse = spouse.replace(":: ", "::").strip()
            if spouse:
                sl.append(spouse)
        if sl:
            d["spouse"] = sl
    # Nicknames also have a dedicated block.
    nnames = _findBetween(cont, "<h5>Nickname</h5>", ("<br/> <br/>", "<h5>"), maxRes=1)
    if nnames:
        nnames = nnames[0]
        if nnames:
            nnames = [x.strip().replace(" (", "::(", 1) for x in nnames.split("<br/>")]
            if nnames:
                d["nick names"] = nnames
    # Every other section comes as a "<h5>Title</h5>data" chunk.
    misc_sects = _findBetween(cont, "<h5>", "<br/>")
    misc_sects[:] = [x.split("</h5>") for x in misc_sects]
    misc_sects[:] = [x for x in misc_sects if len(x) == 2]
    for sect, data in misc_sects:
        sect = sect.lower().replace(":", "").strip()
        # FIX: use the 'in' operator instead of the Python 2-only
        # dict.has_key() method.  Multiple mini biographies are merged.
        if sect in d and sect != "mini biography":
            continue
        elif sect in ("spouse", "nickname"):
            # Already collected above.
            continue
        # Normalize section titles to the keys used elsewhere.
        if sect == "salary":
            sect = "salary history"
        elif sect == "where are they now":
            sect = "where now"
        elif sect == "personal quotes":
            sect = "quotes"
        data = data.replace("</p><p>", "::")
        data = data.replace("<br><br>", " ")  # for multi-paragraphs 'bio'
        data = data.replace('</td> <td valign="top">', "@@@@")
        data = data.replace("</td> </tr>", "::")
        data = _unHtml(data)
        data = [x.strip() for x in data.split("::")]
        data[:] = [x.replace("@@@@", "::") for x in data if x]
        # FIX: skip sections that are empty after cleanup, so the
        # data[0]/ndata[0] accesses below can't raise IndexError.
        if not data:
            continue
        if sect == "height":
            data = data[0]
        elif sect == "birth name":
            data = canonicalName(data[0])
        elif sect == "date of birth":
            date, notes = date_and_notes(data[0])
            if date:
                d["birth date"] = date
            if notes:
                d["birth notes"] = notes
            continue
        elif sect == "date of death":
            date, notes = date_and_notes(data[0])
            if date:
                d["death date"] = date
            if notes:
                d["death notes"] = notes
            continue
        elif sect == "mini biography":
            ndata = []
            for bio in data:
                byidx = bio.rfind("IMDb Mini Biography By")
                if byidx != -1:
                    bioAuth = bio[:byidx].rstrip()
                    bioText = bio[byidx + 23 :].lstrip()
                else:
                    bioAuth = "Anonymous"
                    # FIX: only slice the text when the marker was found;
                    # with byidx == -1, bio[byidx + 23:] used to chop the
                    # first 22 characters off anonymous biographies.
                    bioText = bio
                bio = u"%s::%s" % (bioAuth, bioText)
                ndata.append(bio)
            data[:] = ndata
            if "mini biography" in d:
                d["mini biography"].append(ndata[0])
                continue
        d[sect] = data
    return {"data": d}
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    # Attributes extracted from the "Born" cell of the overview table.
    _birth_attrs = [
        Attribute(key='birth date',
                  path="./time[@itemprop='birthDate']/@datetime"),
        Attribute(
            key='birth notes',
            path="./a[starts-with(@href, '/search/name?birth_place=')]/text()")
    ]
    # Attributes extracted from the "Died" cell of the overview table.
    _death_attrs = [
        Attribute(key='death date',
                  path="./time[@itemprop='deathDate']/@datetime"),
        Attribute(
            key='death notes',
            path="./text()",
            # TODO: check if this slicing is always correct
            postprocess=lambda x: ''.join(x).strip()[2:])
    ]
    extractors = [
        Extractor(label='headshot',
                  path="//img[@class='poster']",
                  attrs=Attribute(key='headshot', path="./@src")),
        Extractor(label='birth info',
                  path="//table[@id='overviewTable']"
                       "//td[text()='Born']/following-sibling::td[1]",
                  attrs=_birth_attrs),
        Extractor(label='death info',
                  path="//table[@id='overviewTable']"
                       "//td[text()='Died']/following-sibling::td[1]",
                  attrs=_death_attrs),
        Extractor(
            label='nick names',
            path="//table[@id='overviewTable']"
                 "//td[starts-with(text(), 'Nickname')]/following-sibling::td[1]",
            # Join text nodes with '|' and re-split; ' (' marks the start
            # of a nickname's notes.
            attrs=Attribute(key='nick names',
                            path="./text()",
                            joiner='|',
                            postprocess=lambda x: [
                                n.strip().replace(' (', '::(', 1)
                                for n in x.split('|') if n.strip()
                            ])),
        Extractor(label='birth name',
                  path="//table[@id='overviewTable']"
                       "//td[text()='Birth Name']/following-sibling::td[1]",
                  attrs=Attribute(
                      key='birth name',
                      path="./text()",
                      postprocess=lambda x: canonicalName(x.strip()))),
        Extractor(
            label='height',
            path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
            attrs=Attribute(key='height',
                            path="./text()",
                            postprocess=lambda x: x.strip())),
        Extractor(
            label='mini biography',
            # Select only the div(s) immediately following the mini_bio
            # anchor.
            path="//a[@name='mini_bio']/following-sibling::"
                 "div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
            attrs=Attribute(key='mini biography',
                            multi=True,
                            path={
                                'bio': ".//text()",
                                'by': ".//a[@name='ba']//text()"
                            },
                            # 'author::text'; anonymous when no author link.
                            postprocess=lambda x: "%s::%s" % (
                                (x.get('bio') or '').split('- IMDb Mini Biography By:')[0].strip(),
                                (x.get('by') or '').strip() or 'Anonymous'))),
        Extractor(
            label='spouse',
            path="//div[h5='Spouse']/table/tr",
            attrs=Attribute(key='spouse',
                            multi=True,
                            path={
                                'name': "./td[1]//text()",
                                'info': "./td[2]//text()"
                            },
                            # 'name::notes', stray ':' removed at the ends.
                            postprocess=lambda x: ("%s::%s" % (
                                x.get('name').strip(),
                                (x.get('info') or '').strip())).strip(':'))),
        Extractor(label='trade mark',
                  path="//div[h5='Trade Mark']/p",
                  attrs=Attribute(key='trade mark',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(label='trivia',
                  path="//div[h5='Trivia']/p",
                  attrs=Attribute(key='trivia',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(label='quotes',
                  path="//div[h5='Personal Quotes']/p",
                  attrs=Attribute(key='quotes',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(
            label='salary',
            path="//div[h5='Salary']/table/tr",
            attrs=Attribute(key='salary history',
                            multi=True,
                            path={
                                'title': "./td[1]//text()",
                                'info': "./td[2]/text()",
                            },
                            postprocess=lambda x: "%s::%s" % (
                                x.get('title').strip(),
                                x.get('info').strip()))),
        Extractor(label='where now',
                  path="//div[h5='Where Are They Now']/p",
                  attrs=Attribute(key='where now',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip()))
    ]

    # Regex rewrites applied to the raw HTML before parsing, to wrap
    # sections in easily-addressable container divs.
    preprocessors = [
        (re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
        (re.compile('(</table>\n</div>\s+)</div>', re.I + re.DOTALL), r'\1'),
        (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
        (re.compile('\.<br><br>([^\s])', re.I), r'. \1')
    ]

    def postprocess_data(self, data):
        """Drop empty birth/death date values."""
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        return data
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    # Attributes for the 'Date of Birth' section; day and year come from
    # separate links and are recombined by build_date().
    _birth_attrs = [Attribute(key='birth date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/OnThisDay?')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/BornInYear?')]/text()"
                            },
                        postprocess=lambda x: build_date(x)),
                    Attribute(key='birth notes',
                        path="./a[starts-with(@href, '/BornWhere?')]/text()")]
    # Attributes for the 'Date of Death' section.
    _death_attrs = [Attribute(key='death date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/OnThisDay?')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/DiedInYear?')]/text()"
                            },
                        postprocess=lambda x: build_date(x)),
                    Attribute(key='death notes',
                        path="./text()",
                        # TODO: check if this slicing is always correct
                        postprocess=lambda x: u''.join(x).strip()[2:])]
    extractors = [
            Extractor(label='birth info',
                        path="//div[h5='Date of Birth']",
                        attrs=_birth_attrs),
            Extractor(label='death info',
                        path="//div[h5='Date of Death']",
                        attrs=_death_attrs),
            # Join text nodes with '|' and re-split; ' (' marks the start
            # of a nickname's notes.
            Extractor(label='nick names',
                        path="//div[h5='Nickname']",
                        attrs=Attribute(key='nick names',
                            path="./text()",
                            joiner='|',
                            postprocess=lambda x: [n.strip().replace(' (',
                                    '::(', 1) for n in x.split('|')
                                    if n.strip()])),
            Extractor(label='birth name',
                        path="//div[h5='Birth Name']",
                        attrs=Attribute(key='birth name',
                            path="./text()",
                            postprocess=lambda x: canonicalName(x.strip()))),
            Extractor(label='height',
                        path="//div[h5='Height']",
                        attrs=Attribute(key='height',
                            path="./text()",
                            postprocess=lambda x: x.strip())),
            # 'author::text' format; anonymous when no author link.
            Extractor(label='mini biography',
                        path="//div[h5='Mini Biography']",
                        attrs=Attribute(key='mini biography',
                            multi=True,
                            path={
                                'bio': "./p//text()",
                                'by': "./b/following-sibling::a/text()"
                                },
                            postprocess=lambda x: "%s::%s" % \
                                (x.get('bio').strip(),
                                (x.get('by') or u'').strip() or u'Anonymous'))),
            Extractor(label='spouse',
                        path="//div[h5='Spouse']/table/tr",
                        attrs=Attribute(key='spouse',
                            multi=True,
                            path={
                                'name': "./td[1]//text()",
                                'info': "./td[2]//text()"
                                },
                            postprocess=lambda x: "%s::%s" % \
                                (x.get('name').strip(),
                                x.get('info').strip()))),
            Extractor(label='trade mark',
                        path="//div[h5='Trade Mark']/p",
                        attrs=Attribute(key='trade mark',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='trivia',
                        path="//div[h5='Trivia']/p",
                        attrs=Attribute(key='trivia',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='quotes',
                        path="//div[h5='Personal Quotes']/p",
                        attrs=Attribute(key='quotes',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='salary',
                        path="//div[h5='Salary']/table/tr",
                        attrs=Attribute(key='salary history',
                            multi=True,
                            path={
                                'title': "./td[1]//text()",
                                'info': "./td[2]/text()",
                                },
                            postprocess=lambda x: "%s::%s" % \
                                (x.get('title').strip(),
                                x.get('info').strip()))),
            Extractor(label='where now',
                        path="//div[h5='Where Are They Now']/p",
                        attrs=Attribute(key='where now',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            ]

    # Regex rewrites applied to the raw HTML before parsing, to wrap
    # sections in easily-addressable container divs.
    preprocessors = [
        (re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
        (re.compile('(</table>\n</div>\s+)</div>', re.I + re.DOTALL), r'\1'),
        (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
        (re.compile('\.<br><br>([^\s])', re.I), r'. \1')
    ]
def _add_items(self):
    """Flush the section collected so far into self._bio_data."""
    # Add a new section in the biography.
    if self._in_content and self._sect_name and self._sect_data:
        sect = self._sect_name.strip().lower()
        # XXX: to get rid of the last colons and normalize section names.
        if sect[-1] == ':':
            sect = sect[:-1]
        if sect == 'salary':
            sect = 'salary history'
        elif sect == 'nickname':
            sect = 'nick names'
        elif sect == 'where are they now':
            sect = 'where now'
        elif sect == 'personal quotes':
            sect = 'quotes'
        elif sect == 'date of birth':
            sect = 'birth date'
        elif sect == 'date of death':
            sect = 'death date'
        data = self._sect_data.strip()
        # '::' separates the individual items of a section.
        d_split = data.split('::')
        # NOTE: relies on Python 2's list-returning filter();
        # slice assignment keeps d_split a list either way.
        d_split[:] = filter(None, [x.strip() for x in d_split])
        # Do some transformation on some special cases.
        if sect == 'salary history':
            newdata = []
            for j in d_split:
                # '@@@@' was the placeholder separating title and salary.
                j = filter(None, [x.strip() for x in j.split('@@@@')])
                newdata.append('::'.join(j))
            d_split[:] = newdata
        elif sect == 'nick names':
            d_split[:] = [normalizeName(x) for x in d_split]
        elif sect == 'birth name':
            d_split = canonicalName(d_split[0])
        elif sect == 'height':
            d_split = d_split[0]
        elif sect == 'spouse':
            d_split[:] = [
                x.replace(' (', '::(', 1).replace(' ::', '::')
                for x in d_split
            ]
        # Birth/death date are in both maindetails and bio pages;
        # it's safe to collect both of them.
        if sect == 'birth date':
            date, notes = date_and_notes(d_split[0])
            if date:
                self._bio_data['birth date'] = date
            if notes:
                self._bio_data['birth notes'] = notes
        elif sect == 'death date':
            date, notes = date_and_notes(d_split[0])
            if date:
                self._bio_data['death date'] = date
            if notes:
                self._bio_data['death notes'] = notes
        elif d_split:
            # Multiple items are added separately (e.g.: 'trivia' is
            # a list of strings).
            self._bio_data[sect] = d_split
        # Reset the accumulator state for the next section.
        self._sect_name = u''
        self._sect_data = u''
        self._in_sect = 0