Example #1
 def get_person_biography(self, personID):
     cont = self._mretrieve(imdbURL_person_main % personID + 'bio')
     d = {}
     spouses = _findBetween(cont,
                            'Spouse</h5>', ('</table>', '</dd>'),
                            maxRes=1)
     if spouses:
         sl = []
         for spouse in spouses[0].split('</tr>'):
             if spouse.count('</td>') > 1:
                 spouse = spouse.replace('</td>', '::</td>', 1)
             spouse = _unHtml(spouse)
             spouse = spouse.replace(':: ', '::').strip()
             if spouse: sl.append(spouse)
         if sl: d['spouse'] = sl
     misc_sects = _findBetween(cont, '<h5>', '<br/>')
     misc_sects[:] = [x.split('</h5>') for x in misc_sects]
     misc_sects[:] = [x for x in misc_sects if len(x) == 2]
     for sect, data in misc_sects:
         sect = sect.lower().replace(':', '').strip()
         if sect in d: continue
         if sect == 'salary': sect = 'salary history'
         elif sect == 'spouse': continue
         elif sect == 'nickname': sect = 'nick names'
         elif sect == 'where are they now': sect = 'where now'
         elif sect == 'personal quotes': sect = 'quotes'
         data = data.replace('</p><p>', '::')
         data = data.replace('<br><br>', ' ')  # for multi-paragraphs 'bio'
         data = data.replace('</td> <td valign="top">', '@@@@')
         data = data.replace('</td> </tr>', '::')
         data = _unHtml(data)
         data = [x.strip() for x in data.split('::')]
         data[:] = [x.replace('@@@@', '::') for x in data if x]
         if sect == 'height' and data: data = data[0]
         elif sect == 'birth name': data = canonicalName(data[0])
         elif sect == 'date of birth':
             date, notes = date_and_notes(data[0])
             if date:
                 d['birth date'] = date
             if notes:
                 d['birth notes'] = notes
             continue
         elif sect == 'date of death':
             date, notes = date_and_notes(data[0])
             if date:
                 d['death date'] = date
             if notes:
                 d['death notes'] = notes
             continue
         elif sect == 'mini biography':
             ndata = []
             for bio in data:
                 byidx = bio.rfind('IMDb Mini Biography By')
                 if byidx != -1:
                     bio = u'%s::%s' % (bio[byidx + 23:].lstrip(),
                                        bio[:byidx].rstrip())
                 ndata.append(bio)
             data[:] = ndata
         d[sect] = data
     return {'data': d}
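
The method above flattens every biography section into a single dictionary, using '::' as IMDbPY's separator between an item and its notes (author, place, and so on). A rough sketch of the shape of the returned value; every key and value here is purely illustrative, not taken from a real page:

    example_result = {
        'data': {
            'birth date': '1 January 1970',        # from 'Date of Birth'
            'birth notes': 'Springfield, USA',     # birth place, when present
            'height': '1.78 m',                    # single string, not a list
            'spouse': ['Jane Doe::(1 May 1990 - present)'],
            'mini biography': ['Author Name::Biography text ...'],
            'quotes': ['First quote.', 'Second quote.'],
        }
    }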
Example #2
 def _add_items(self):
     # Add a new section in the biography.
     if self._in_content and self._sect_name and self._sect_data:
         sect = self._sect_name.strip().lower()
         # XXX: strip the trailing colon and normalize section names.
         if sect[-1] == ':':
             sect = sect[:-1]
         if sect == 'salary':
             sect = 'salary history'
         elif sect == 'nickname':
             sect = 'nick names'
         elif sect == 'where are they now':
             sect = 'where now'
         elif sect == 'personal quotes':
             sect = 'quotes'
         elif sect == 'date of birth':
             sect = 'birth date'
         elif sect == 'date of death':
             sect = 'death date'
         data = self._sect_data.strip()
         d_split = data.split('::')
         d_split[:] = filter(None, [x.strip() for x in d_split])
         # Do some transformation on some special cases.
         if sect == 'salary history':
             newdata = []
             for j in d_split:
                 j = filter(None, [x.strip() for x in j.split('@@@@')])
                 newdata.append('::'.join(j))
             d_split[:] = newdata
         elif sect == 'nick names':
             d_split[:] = [normalizeName(x) for x in d_split]
         elif sect == 'birth name':
             d_split = canonicalName(d_split[0])
         elif sect == 'height':
             d_split = d_split[0]
         elif sect == 'spouse':
             d_split[:] = [x.replace(' (', '::(', 1).replace(' ::', '::')
                             for x in d_split]
         # Birth/death date are in both maindetails and bio pages;
         # it's safe to collect both of them.
         if sect == 'birth date':
             date, notes = date_and_notes(d_split[0])
             if date:
                 self._bio_data['birth date'] = date
             if notes:
                 self._bio_data['birth notes'] = notes
         elif sect == 'death date':
             date, notes = date_and_notes(d_split[0])
             if date:
                 self._bio_data['death date'] = date
             if notes:
                 self._bio_data['death notes'] = notes
         elif d_split:
             # Multiple items are added separately (e.g.: 'trivia' is
             # a list of strings).
             self._bio_data[sect] = d_split
     self._sect_name = u''
     self._sect_data = u''
     self._in_sect = 0
Example #3
File: imdb.py Project: tijlk/qmdb
 def person_to_dict(person):
     try:
         person_dict = {'canonical_name': utils.canonicalName(person.data['name']),
                        'name': utils.normalizeName(person.data['name']),
                        'person_id': int(person.personID)}
     except Exception:  # e.g. a missing 'name' key or a malformed personID
         person_dict = None
     return person_dict
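
person_to_dict leans on the canonicalName/normalizeName pair (assumed here to be imdb.utils): canonicalName moves the surname in front ('Surname, Name') and normalizeName undoes it. A minimal, hedged sketch of the helper in use, with a hand-made stand-in for the Person object; the personID and the commented results are only what one would expect:

    from imdb import utils

    class FakePerson:
        # Illustrative stand-in for imdb.Person.Person, exposing only the
        # two attributes person_to_dict actually touches.
        personID = '0000154'
        data = {'name': 'Mel Gibson'}

    print(utils.canonicalName('Mel Gibson'))   # expected: 'Gibson, Mel'
    print(utils.normalizeName('Gibson, Mel'))  # expected: 'Mel Gibson'
    print(person_to_dict(FakePerson()))
    # expected: {'canonical_name': 'Gibson, Mel', 'name': 'Mel Gibson', 'person_id': 154}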
Example #4
def scan_names(name_list, name, results=0, ro_threshold=RO_THRESHOLD):
    """Scan a list of names, searching for best matches against some variations.

    :param name_list: list of (personID, {person_data}) tuples
    :type name_list: list
    :param name: searched name
    :type name: str
    :param results: return at most this many results (all, if 0)
    :type results: int
    :param ro_threshold: ignore results with a score lower than this value
    :type ro_threshold: float
    :returns: list of results sorted by similarity
    :rtype: list"""
    canonical_name = canonicalName(name).replace(',', '')
    sm1 = SequenceMatcher()
    sm2 = SequenceMatcher()
    sm1.set_seq1(name.lower())
    sm2.set_seq1(canonical_name.lower())
    resd = {}
    for i, n_data in name_list:
        nil = n_data['name']
        # Compare against both the original and the canonical name.
        ratios = [
            ratcliff(name, nil, sm1) + 0.1,
            ratcliff(name,
                     canonicalName(nil).replace(',', ''), sm2)
        ]
        ratio = max(ratios)
        if ratio >= ro_threshold:
            if i in resd:
                if ratio > resd[i][0]:
                    resd[i] = (ratio, (i, n_data))
            else:
                resd[i] = (ratio, (i, n_data))
    res = list(resd.values())
    res.sort()
    res.reverse()
    if results > 0:
        res[:] = res[:results]
    return res
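
A small, hedged usage sketch for scan_names; the candidate list is invented, and the exact scores depend on ratcliff and RO_THRESHOLD as defined in the surrounding module:

    # (personID, {person_data}) tuples, as described in the docstring.
    candidates = [
        (1, {'name': 'Mel Gibson'}),
        (2, {'name': 'Tyrese Gibson'}),
        (3, {'name': 'Jason Mewes'}),
    ]

    # Each result is a (ratio, (personID, person_data)) tuple, best match first.
    for ratio, (person_id, info) in scan_names(candidates, 'Mel Gibson', results=2):
        print(round(ratio, 3), person_id, info['name'])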
Example #5
 def _getitem(self, key):
     """Handle special keys."""
     if 'name' in self.data:
         if key == 'name':
             return normalizeName(self.data['name'])
         elif key == 'canonical name':
             return canonicalName(self.data['name'])
         elif key == 'long imdb name':
             return build_name(self.data, canonical=False)
         elif key == 'long imdb canonical name':
             return build_name(self.data, canonical=True)
     if key == 'full-size headshot':
         return self.get_fullsizeURL()
     return None
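
The special keys handled above are what make expressions like person['canonical name'] work on a Person object. A hedged sketch against the public imdb package (requires network access; the commented values are only what one would expect):

    from imdb import IMDb

    ia = IMDb()
    person = ia.search_person('Mel Gibson')[0]
    print(person['name'])            # normalized, e.g. 'Mel Gibson'
    print(person['canonical name'])  # canonical, e.g. 'Gibson, Mel'
    print(person['long imdb name'])  # adds the imdbIndex, when there is one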
Example #6
def name_soundexes(name):
    """Return three soundex codes for the given name.
    :param name: person name
    :type name: str
    :returns: tuple of soundex codes: (S(Name Surname), S(Surname Name), S(Surname))
    :rtype: tuple
    """
    if not name:
        return None, None, None
    s1 = soundex(name)
    canonical_name = canonicalName(name)
    s2 = soundex(canonical_name)
    if s1 == s2:
        s2 = None
    s3 = soundex(canonical_name.split(', ')[0])
    if s3 and s3 in (s1, s2):
        s3 = None
    return s1, s2, s3
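
A brief sketch of how name_soundexes might be called; the concrete codes depend on the soundex implementation imported by this module, so only the shape of the result is shown:

    s_name, s_canonical, s_surname = name_soundexes('Mel Gibson')
    # s_name      -> soundex of 'Mel Gibson'
    # s_canonical -> soundex of 'Gibson, Mel', or None if it equals s_name
    # s_surname   -> soundex of 'Gibson', or None if it repeats one of the above

    print(name_soundexes(''))  # -> (None, None, None)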
Example #7
def nameVariations(name, fromPtdf=0):
    """Build name variations useful for searches; if fromPtdf is true,
    the input is assumed to be in the plain text data files format."""
    name1 = name2 = name3 = u''
    if fromPtdf or re_nameIndex.search(name):
        # We have a name with an (imdbIndex).
        namedict = analyze_name(name, canonical=1)
        # name1 is the name in the canonical format.
        name1 = namedict['name']
        # name3 is the canonical name with the imdbIndex.
        if fromPtdf:
            if 'imdbIndex' in namedict:
                name3 = name
        else:
            name3 = build_name(namedict, canonical=1)
    else:
        # name1 is the name in the canonical format.
        name1 = canonicalName(name)
        name3 = u''
    # name2 is the name in the normal format, if it differs from name1.
    name2 = normalizeName(name1)
    if name1 == name2: name2 = u''
    return name1, name2, name3
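
A hedged sketch of the variations nameVariations is expected to produce; the second call assumes the plain text data files format with a roman-numeral imdbIndex:

    # No imdbIndex: canonical form, normal form, empty third variation.
    print(nameVariations('Mel Gibson'))
    # expected: (u'Gibson, Mel', u'Mel Gibson', u'')

    # Ptdf input with an imdbIndex: the third variation keeps the index.
    print(nameVariations('Gibson, Mel (I)', fromPtdf=1))
    # expected: (u'Gibson, Mel', u'Mel Gibson', u'Gibson, Mel (I)')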
Example #8
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    _birth_rules = [
        Rule(key='birth date',
             extractor=Path('./time[@itemprop="birthDate"]/@datetime')),
        Rule(
            key='birth notes',
            extractor=Path(
                './a[starts-with(@href, "/search/name?birth_place=")]/text()'))
    ]

    _death_rules = [
        Rule(key='death date',
             extractor=Path('./time[@itemprop="deathDate"]/@datetime')),
        Rule(key='death cause',
             extractor=Path(
                 './text()',
                 transform=lambda x: ''.join(x).strip()[2:].lstrip())),
        Rule(key='death notes',
             extractor=Path('..//text()',
                            transform=lambda x: _re_spaces.sub(
                                ' ', (x or '').strip().split('\n')[-1])))
    ]

    rules = [
        Rule(key='headshot', extractor=Path('//img[@class="poster"]/@src')),
        Rule(key='birth info',
             extractor=Rules(section='//table[@id="overviewTable"]'
                             '//td[text()="Born"]/following-sibling::td[1]',
                             rules=_birth_rules)),
        Rule(key='death info',
             extractor=Rules(section='//table[@id="overviewTable"]'
                             '//td[text()="Died"]/following-sibling::td[1]',
                             rules=_death_rules)),
        Rule(
            key='nick names',
            extractor=Path(
                '//table[@id="overviewTable"]'
                '//td[starts-with(text(), "Nickname")]/following-sibling::td[1]/text()',
                reduce=lambda xs: '|'.join(xs),
                transform=lambda x: [
                    n.strip().replace(' (', '::(', 1) for n in x.split('|')
                    if n.strip()
                ])),
        Rule(key='birth name',
             extractor=Path(
                 '//table[@id="overviewTable"]'
                 '//td[text()="Birth Name"]/following-sibling::td[1]/text()',
                 transform=lambda x: canonicalName(x.strip()))),
        Rule(key='height',
             extractor=Path(
                 '//table[@id="overviewTable"]'
                 '//td[text()="Height"]/following-sibling::td[1]/text()',
                 transform=str.strip)),
        Rule(
            key='mini biography',
            extractor=Rules(
                foreach='//a[@name="mini_bio"]/following-sibling::'
                'div[1 = count(preceding-sibling::a[1] | ../a[@name="mini_bio"])]',
                rules=[
                    Rule(key='bio', extractor=Path('.//text()')),
                    Rule(key='by', extractor=Path('.//a[@name="ba"]//text()'))
                ],
                transform=lambda x: "%s::%s" %
                ((x.get('bio') or '').split('- IMDb Mini Biography By:')[0].
                 strip(), (x.get('by') or '').strip() or 'Anonymous'))),
        Rule(
            key='spouse',
            extractor=Rules(
                foreach='//a[@name="spouse"]/following::table[1]//tr',
                rules=[
                    Rule(key='name',
                         extractor=Path('./td[1]//text()')),
                    Rule(key='info', extractor=Path('./td[2]//text()'))
                ],
                transform=lambda x:
                ("%s::%s" %
                 (x.get('name').strip(),
                  (_re_spaces.sub(' ',
                                  x.get('info') or '')).strip())).strip(':'))),
        Rule(
            key='trade mark',
            extractor=Path(
                foreach=
                '//div[@class="_imdbpyh4"]/h4[starts-with(text(), "Trade Mark")]'
                '/.././div[contains(@class, "soda")]',
                path='.//text()',
                transform=str.strip)),
        Rule(key='trivia',
             extractor=Path(
                 foreach=
                 '//div[@class="_imdbpyh4"]/h4[starts-with(text(), "Trivia")]'
                 '/.././div[contains(@class, "soda")]',
                 path='.//text()',
                 transform=str.strip)),
        Rule(
            key='quotes',
            extractor=Path(
                foreach=
                '//div[@class="_imdbpyh4"]/h4[starts-with(text(), "Personal Quotes")]'
                '/.././div[contains(@class, "soda")]',
                path='.//text()',
                transform=str.strip)),
        Rule(key='salary history',
             extractor=Rules(
                 foreach='//a[@name="salary"]/following::table[1]//tr',
                 rules=[
                     Rule(key='title', extractor=Path('./td[1]//text()')),
                     Rule(key='info', extractor=Path('./td[2]//text()'))
                 ],
                 transform=lambda x: "%s::%s" % (x.get('title').strip(
                 ), _re_spaces.sub(' ', (x.get('info') or '')).strip())))
    ]

    preprocessors = [(re.compile('(<h5>)',
                                 re.I), r'</div><div class="_imdbpy">\1'),
                     (re.compile('(<h4)',
                                 re.I), r'</div><div class="_imdbpyh4">\1'),
                     (re.compile(r'(</table>\n</div>\s+)</div>',
                                  re.I | re.DOTALL), r'\1'),
                     (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
                     (re.compile(r'\.<br><br>([^\s])', re.I), r'. \1')]

    def postprocess_data(self, data):
        for key in ['birth info', 'death info']:
            if key in data and isinstance(data[key], dict):
                subdata = data[key]
                del data[key]
                data.update(subdata)
        for what in 'birth date', 'death date', 'death cause':
            if what in data and not data[what]:
                del data[what]
        return data
Example #9
 def get_person_biography(self, personID):
     cont = self._mretrieve(self.urls['person_main'] % personID + 'bio')
     d = {}
     spouses = _findBetween(cont, 'Spouse</h5>', ('</table>', '</dd>'),
                             maxRes=1)
     if spouses:
         sl = []
         for spouse in spouses[0].split('</tr>'):
             if spouse.count('</td>') > 1:
                 spouse = spouse.replace('</td>', '::</td>', 1)
             spouse = _unHtml(spouse)
             spouse = spouse.replace(':: ', '::').strip()
             if spouse: sl.append(spouse)
         if sl: d['spouse'] = sl
     nnames = _findBetween(cont, '<h5>Nickname</h5>', ('<br/> <br/>','<h5>'),
                             maxRes=1)
     if nnames:
         nnames = nnames[0]
         if nnames:
             nnames = [x.strip().replace(' (', '::(', 1)
                         for x in nnames.split('<br/>')]
             if nnames:
                 d['nick names'] = nnames
     misc_sects = _findBetween(cont, '<h5>', '<br/>')
     misc_sects[:] = [x.split('</h5>') for x in misc_sects]
     misc_sects[:] = [x for x in misc_sects if len(x) == 2]
     for sect, data in misc_sects:
         sect = sect.lower().replace(':', '').strip()
         if sect in d and sect != 'mini biography': continue
         elif sect in ('spouse', 'nickname'): continue
         if sect == 'salary': sect = 'salary history'
         elif sect == 'where are they now': sect = 'where now'
         elif sect == 'personal quotes': sect = 'quotes'
         data = data.replace('</p><p>', '::')
         data = data.replace('<br><br>', ' ') # for multi-paragraphs 'bio'
         data = data.replace('</td> <td valign="top">', '@@@@')
         data = data.replace('</td> </tr>', '::')
         data = _unHtml(data)
         data = [x.strip() for x in data.split('::')]
         data[:] = [x.replace('@@@@', '::') for x in data if x]
         if sect == 'height' and data: data = data[0]
         elif sect == 'birth name': data = canonicalName(data[0])
         elif sect == 'date of birth':
             date, notes = date_and_notes(data[0])
             if date:
                 d['birth date'] = date
             if notes:
                 d['birth notes'] = notes
             continue
         elif sect == 'date of death':
             date, notes = date_and_notes(data[0])
             if date:
                 d['death date'] = date
             if notes:
                 d['death notes'] = notes
             continue
         elif sect == 'mini biography':
             ndata = []
             for bio in data:
                 byidx = bio.rfind('IMDb Mini Biography By')
                 if byidx != -1:
                     bioAuth = bio[:byidx].rstrip()
                     # keep only the text after the 'IMDb Mini Biography By:' label.
                     bio = bio[byidx + 23:].lstrip()
                 else:
                     bioAuth = 'Anonymous'
                 bio = u'%s::%s' % (bioAuth, bio)
                 ndata.append(bio)
             data[:] = ndata
             if 'mini biography' in d:
                 d['mini biography'].append(ndata[0])
                 continue
         d[sect] = data
     return {'data': d}
Example #10
 def get_person_biography(self, personID):
     cont = self._mretrieve(self.urls["person_main"] % personID + "bio")
     d = {}
     spouses = _findBetween(cont, "Spouse</h5>", ("</table>", "</dd>"), maxRes=1)
     if spouses:
         sl = []
         for spouse in spouses[0].split("</tr>"):
             if spouse.count("</td>") > 1:
                 spouse = spouse.replace("</td>", "::</td>", 1)
             spouse = _unHtml(spouse)
             spouse = spouse.replace(":: ", "::").strip()
             if spouse:
                 sl.append(spouse)
         if sl:
             d["spouse"] = sl
     nnames = _findBetween(cont, "<h5>Nickname</h5>", ("<br/> <br/>", "<h5>"), maxRes=1)
     if nnames:
         nnames = nnames[0]
         if nnames:
             nnames = [x.strip().replace(" (", "::(", 1) for x in nnames.split("<br/>")]
             if nnames:
                 d["nick names"] = nnames
     misc_sects = _findBetween(cont, "<h5>", "<br/>")
     misc_sects[:] = [x.split("</h5>") for x in misc_sects]
     misc_sects[:] = [x for x in misc_sects if len(x) == 2]
     for sect, data in misc_sects:
         sect = sect.lower().replace(":", "").strip()
         if d.has_key(sect) and sect != "mini biography":
             continue
         elif sect in ("spouse", "nickname"):
             continue
         if sect == "salary":
             sect = "salary history"
         elif sect == "where are they now":
             sect = "where now"
         elif sect == "personal quotes":
             sect = "quotes"
         data = data.replace("</p><p>", "::")
         data = data.replace("<br><br>", " ")  # for multi-paragraphs 'bio'
         data = data.replace('</td> <td valign="top">', "@@@@")
         data = data.replace("</td> </tr>", "::")
         data = _unHtml(data)
         data = [x.strip() for x in data.split("::")]
         data[:] = [x.replace("@@@@", "::") for x in data if x]
         if sect == "height" and data:
             data = data[0]
         elif sect == "birth name":
             data = canonicalName(data[0])
         elif sect == "date of birth":
             date, notes = date_and_notes(data[0])
             if date:
                 d["birth date"] = date
             if notes:
                 d["birth notes"] = notes
             continue
         elif sect == "date of death":
             date, notes = date_and_notes(data[0])
             if date:
                 d["death date"] = date
             if notes:
                 d["death notes"] = notes
             continue
         elif sect == "mini biography":
             ndata = []
             for bio in data:
                 byidx = bio.rfind("IMDb Mini Biography By")
                 if byidx != -1:
                     bioAuth = bio[:byidx].rstrip()
                 else:
                     bioAuth = "Anonymous"
                 bio = u"%s::%s" % (bioAuth, bio[byidx + 23 :].lstrip())
                 ndata.append(bio)
             data[:] = ndata
             if "mini biography" in d:
                 d["mini biography"].append(ndata[0])
                 continue
         d[sect] = data
     return {"data": d}
Example #11
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    _birth_attrs = [
        Attribute(key='birth date',
                  path="./time[@itemprop='birthDate']/@datetime"),
        Attribute(
            key='birth notes',
            path="./a[starts-with(@href, '/search/name?birth_place=')]/text()")
    ]

    _death_attrs = [
        Attribute(key='death date',
                  path="./time[@itemprop='deathDate']/@datetime"),
        Attribute(
            key='death notes',
            path="./text()",
            # TODO: check if this slicing is always correct
            postprocess=lambda x: ''.join(x).strip()[2:])
    ]

    extractors = [
        Extractor(label='headshot',
                  path="//img[@class='poster']",
                  attrs=Attribute(key='headshot', path="./@src")),
        Extractor(label='birth info',
                  path="//table[@id='overviewTable']"
                  "//td[text()='Born']/following-sibling::td[1]",
                  attrs=_birth_attrs),
        Extractor(label='death info',
                  path="//table[@id='overviewTable']"
                  "//td[text()='Died']/following-sibling::td[1]",
                  attrs=_death_attrs),
        Extractor(
            label='nick names',
            path="//table[@id='overviewTable']"
            "//td[starts-with(text(), 'Nickname')]/following-sibling::td[1]",
            attrs=Attribute(key='nick names',
                            path="./text()",
                            joiner='|',
                            postprocess=lambda x: [
                                n.strip().replace(' (', '::(', 1)
                                for n in x.split('|') if n.strip()
                            ])),
        Extractor(label='birth name',
                  path="//table[@id='overviewTable']"
                  "//td[text()='Birth Name']/following-sibling::td[1]",
                  attrs=Attribute(
                      key='birth name',
                      path="./text()",
                      postprocess=lambda x: canonicalName(x.strip()))),
        Extractor(
            label='height',
            path=
            "//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
            attrs=Attribute(key='height',
                            path="./text()",
                            postprocess=lambda x: x.strip())),
        Extractor(
            label='mini biography',
            path="//a[@name='mini_bio']/following-sibling::"
            "div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
            attrs=Attribute(key='mini biography',
                            multi=True,
                            path={
                                'bio': ".//text()",
                                'by': ".//a[@name='ba']//text()"
                            },
                            postprocess=lambda x: "%s::%s" %
                            ((x.get('bio') or '').split(
                                '- IMDb Mini Biography By:')[0].strip(),
                             (x.get('by') or '').strip() or 'Anonymous'))),
        Extractor(
            label='spouse',
            path="//div[h5='Spouse']/table/tr",
            attrs=Attribute(key='spouse',
                            multi=True,
                            path={
                                'name': "./td[1]//text()",
                                'info': "./td[2]//text()"
                            },
                            postprocess=lambda x:
                            ("%s::%s" %
                             (x.get('name').strip(),
                              (x.get('info') or '').strip())).strip(':'))),
        Extractor(label='trade mark',
                  path="//div[h5='Trade Mark']/p",
                  attrs=Attribute(key='trade mark',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(label='trivia',
                  path="//div[h5='Trivia']/p",
                  attrs=Attribute(key='trivia',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(label='quotes',
                  path="//div[h5='Personal Quotes']/p",
                  attrs=Attribute(key='quotes',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(
            label='salary',
            path="//div[h5='Salary']/table/tr",
            attrs=Attribute(key='salary history',
                            multi=True,
                            path={
                                'title': "./td[1]//text()",
                                'info': "./td[2]/text()",
                            },
                            postprocess=lambda x: "%s::%s" %
                            (x.get('title').strip(), x.get('info').strip()))),
        Extractor(label='where now',
                  path="//div[h5='Where Are They Now']/p",
                  attrs=Attribute(key='where now',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip()))
    ]

    preprocessors = [(re.compile('(<h5>)',
                                 re.I), r'</div><div class="_imdbpy">\1'),
                     (re.compile(r'(</table>\n</div>\s+)</div>',
                                  re.I | re.DOTALL), r'\1'),
                     (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
                     (re.compile(r'\.<br><br>([^\s])', re.I), r'. \1')]

    def postprocess_data(self, data):
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        return data
Example #12
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    _birth_attrs = [Attribute(key='birth date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/OnThisDay?')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/BornInYear?')]/text()"
                            },
                        postprocess=lambda x: build_date(x)),
                    Attribute(key='birth notes',
                        path="./a[starts-with(@href, '/BornWhere?')]/text()")]
    _death_attrs = [Attribute(key='death date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/OnThisDay?')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/DiedInYear?')]/text()"
                            },
                        postprocess=lambda x: build_date(x)),
                    Attribute(key='death notes',
                        path="./text()",
                        # TODO: check if this slicing is always correct
                        postprocess=lambda x: u''.join(x).strip()[2:])]
    extractors = [
            Extractor(label='birth info',
                        path="//div[h5='Date of Birth']",
                        attrs=_birth_attrs),
            Extractor(label='death info',
                        path="//div[h5='Date of Death']",
                        attrs=_death_attrs),
            Extractor(label='nick names',
                        path="//div[h5='Nickname']",
                        attrs=Attribute(key='nick names',
                            path="./text()",
                            joiner='|',
                            postprocess=lambda x: [n.strip().replace(' (',
                                    '::(', 1) for n in x.split('|')
                                    if n.strip()])),
            Extractor(label='birth name',
                        path="//div[h5='Birth Name']",
                        attrs=Attribute(key='birth name',
                            path="./text()",
                            postprocess=lambda x: canonicalName(x.strip()))),
            Extractor(label='height',
                        path="//div[h5='Height']",
                        attrs=Attribute(key='height',
                            path="./text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='mini biography',
                        path="//div[h5='Mini Biography']",
                        attrs=Attribute(key='mini biography',
                            multi=True,
                            path={
                                'bio': "./p//text()",
                                'by': "./b/following-sibling::a/text()"
                                },
                            postprocess=lambda x: "%s::%s" % \
                                (x.get('bio').strip(),
                                (x.get('by') or u'').strip() or u'Anonymous'))),
            Extractor(label='spouse',
                        path="//div[h5='Spouse']/table/tr",
                        attrs=Attribute(key='spouse',
                            multi=True,
                            path={
                                'name': "./td[1]//text()",
                                'info': "./td[2]//text()"
                                },
                            postprocess=lambda x: "%s::%s" % \
                                            (x.get('name').strip(),
                                                x.get('info').strip()))),
            Extractor(label='trade mark',
                        path="//div[h5='Trade Mark']/p",
                        attrs=Attribute(key='trade mark',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='trivia',
                        path="//div[h5='Trivia']/p",
                        attrs=Attribute(key='trivia',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='quotes',
                        path="//div[h5='Personal Quotes']/p",
                        attrs=Attribute(key='quotes',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='salary',
                        path="//div[h5='Salary']/table/tr",
                        attrs=Attribute(key='salary history',
                            multi=True,
                            path={
                                'title': "./td[1]//text()",
                                'info': "./td[2]/text()",
                                },
                            postprocess=lambda x: "%s::%s" % \
                                    (x.get('title').strip(),
                                        x.get('info').strip()))),
            Extractor(label='where now',
                        path="//div[h5='Where Are They Now']/p",
                        attrs=Attribute(key='where now',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            ]

    preprocessors = [(re.compile('(<h5>)',
                                 re.I), r'</div><div class="_imdbpy">\1'),
                     (re.compile(r'(</table>\n</div>\s+)</div>',
                                  re.I | re.DOTALL), r'\1'),
                     (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
                     (re.compile(r'\.<br><br>([^\s])', re.I), r'. \1')]