Ejemplo n.º 1
0
 def get_person_biography(self, personID):
     """Fetch the 'bio' page of a person and parse it into a dictionary.

     personID is interpolated into imdbURL_person_main to build the page
     URL, which is downloaded through self._mretrieve.

     Returns {'data': d}, where d maps section names (normalized, e.g.
     'salary history', 'nick names', 'where now', 'quotes') to the parsed
     values; birth/death sections are stored as '<kind> date' and
     '<kind> notes'.
     """
     cont = self._mretrieve(imdbURL_person_main % personID + 'bio')
     d = {}
     # The spouse table is handled on its own: every table row is an entry
     # and '::' is inserted after the first cell of multi-cell rows.
     spouses = _findBetween(cont,
                            'Spouse</h5>', ('</table>', '</dd>'),
                            maxRes=1)
     if spouses:
         sl = []
         for spouse in spouses[0].split('</tr>'):
             if spouse.count('</td>') > 1:
                 spouse = spouse.replace('</td>', '::</td>', 1)
             spouse = _unHtml(spouse)
             spouse = spouse.replace(':: ', '::').strip()
             if spouse: sl.append(spouse)
         if sl: d['spouse'] = sl
     # Every other section is a '<h5>title</h5>content' chunk; chunks that
     # do not split into exactly (title, content) are discarded.
     misc_sects = _findBetween(cont, '<h5>', '<br/>')
     misc_sects[:] = [x.split('</h5>') for x in misc_sects]
     misc_sects[:] = [x for x in misc_sects if len(x) == 2]
     for sect, data in misc_sects:
         sect = sect.lower().replace(':', '').strip()
         # 'in' is used instead of dict.has_key, which Python 3 removed.
         if sect in d: continue
         if sect == 'salary': sect = 'salary history'
         elif sect == 'spouse': continue
         elif sect == 'nickname': sect = 'nick names'
         elif sect == 'where are they now': sect = 'where now'
         elif sect == 'personal quotes': sect = 'quotes'
         data = data.replace('</p><p>', '::')
         data = data.replace('<br><br>', ' ')  # for multi-paragraphs 'bio'
         # '@@@@' temporarily protects the cell separator, so that the
         # split on '::' below only breaks the data on row boundaries.
         data = data.replace('</td> <td valign="top">', '@@@@')
         data = data.replace('</td> </tr>', '::')
         data = _unHtml(data)
         data = [x.strip() for x in data.split('::')]
         data[:] = [x.replace('@@@@', '::') for x in data if x]
         if not data and sect in ('birth name', 'date of birth',
                                  'date of death'):
             # These sections read data[0]; with nothing left after the
             # HTML stripping there is no value to store.
             continue
         if sect == 'height' and data: data = data[0]
         elif sect == 'birth name': data = canonicalName(data[0])
         elif sect in ('date of birth', 'date of death'):
             kind = sect[len('date of '):]  # 'birth' or 'death'
             date, notes = date_and_notes(data[0])
             if date:
                 d['%s date' % kind] = date
             if notes:
                 d['%s notes' % kind] = notes
             continue
         elif sect == 'mini biography':
             ndata = []
             for bio in data:
                 byidx = bio.rfind('IMDb Mini Biography By')
                 if byidx != -1:
                     # Move what follows the marker in front of the text;
                     # +23 skips the 22-character marker and the single
                     # character after it (presumably a colon - TODO
                     # confirm against the page markup).
                     bio = u'%s::%s' % (bio[byidx + 23:].lstrip(),
                                        bio[:byidx].rstrip())
                 ndata.append(bio)
             data[:] = ndata
         d[sect] = data
     return {'data': d}
Ejemplo n.º 2
0
 def do_br(self, attrs):
     """Handle a <br> tag, which may close the section being collected.

     The text gathered in self._cur_txt is stripped and, when a named
     post-section is open, stored in self._data ('birth date'/'birth notes',
     'death date'/'death notes' or 'akas'). The collected text is always
     reset afterwards. The attrs argument is not used.
     """
     if self._stop_here or not self._in_content:
         return
     # Inside li tags in filmography, some useless information after a br.
     self._seen_br = True
     text = self._cur_txt.strip()
     self._cur_txt = text
     if not self._in_post_section or not self._section or not text:
         self._in_post_section = False
         self._cur_txt = u''
         return
     # We're at the end of a section.
     date_keys = {
         'birth date': ('birth date', 'birth notes'),
         'death date': ('death date', 'death notes'),
     }
     section = self._section
     if section in date_keys:
         date_key, notes_key = date_keys[section]
         date, notes = date_and_notes(text)
         if date:
             self._data[date_key] = date
         if notes:
             self._data[notes_key] = notes
     elif section == 'akas':
         # Character pages separate the names with slashes, the other
         # pages with pipes.
         if self.kind == 'character':
             separator = ' / '
         else:
             separator = ' | '
         names = text.split(separator)
         if names:
             self._data['akas'] = names
     # XXX: not providing an 'else', we're deliberately ignoring
     #      other sections.
     self._in_post_section = False
     if self.kind == 'character':
         # XXX: I'm not confident this is the best place for this...
         self._section = 'filmography'
     self._cur_txt = u''
Ejemplo n.º 3
0
 def do_br(self, attrs):
     """Process a <br> tag found in the content.

     When a post-section with a name and some text is open, the stripped
     text is saved in self._data for the 'birth date', 'death date' and
     'akas' sections (any other section is ignored); in every case the
     post-section flag and the current text are reset. attrs is unused.
     """
     if self._stop_here or not self._in_content: return
     # Inside li tags in filmography, some useless information after a br.
     self._seen_br = True
     self._cur_txt = self._cur_txt.strip()
     if self._in_post_section and self._section and self._cur_txt:
         # We're at the end of a section.
         for prefix in ('birth', 'death'):
             if self._section != prefix + ' date':
                 continue
             date, notes = date_and_notes(self._cur_txt)
             if date:
                 self._data[prefix + ' date'] = date
             if notes:
                 self._data[prefix + ' notes'] = notes
             break
         else:
             # Not a date section.
             # XXX: not providing a final 'else', we're deliberately
             #      ignoring other sections.
             if self._section == 'akas':
                 sep = ' | '
                 if self.kind == 'character':
                     sep = ' / '
                 akas = self._cur_txt.split(sep)
                 if akas: self._data['akas'] = akas
         if self.kind == 'character':
             # XXX: I'm not confident this is the best place for this...
             self._section = 'filmography'
     # Common epilogue: the post-section is closed and the text discarded.
     self._in_post_section = False
     self._cur_txt = u''
Ejemplo n.º 4
0
 def _add_items(self):
     # Add a new section in the biography.
     if self._in_content and self._sect_name and self._sect_data:
         sect = self._sect_name.strip().lower()
         # XXX: to get rid of the last colons and normalize section names.
         if sect[-1] == ':':
             sect = sect[:-1]
         if sect == 'salary':
             sect = 'salary history'
         elif sect == 'nickname':
             sect = 'nick names'
         elif sect == 'where are they now':
             sect = 'where now'
         elif sect == 'personal quotes':
             sect = 'quotes'
         elif sect == 'date of birth':
             sect = 'birth date'
         elif sect == 'date of death':
             sect = 'death date'
         data = self._sect_data.strip()
         d_split = data.split('::')
         d_split[:] = filter(None, [x.strip() for x in d_split])
         # Do some transformation on some special cases.
         if sect == 'salary history':
             newdata = []
             for j in d_split:
                 j = filter(None, [x.strip() for x in j.split('@@@@')])
                 newdata.append('::'.join(j))
             d_split[:] = newdata
         elif sect == 'nick names':
             d_split[:] = [normalizeName(x) for x in d_split]
         elif sect == 'birth name':
             d_split = canonicalName(d_split[0])
         elif sect == 'height':
             d_split = d_split[0]
         elif sect == 'spouse':
             d_split[:] = [x.replace(' (', '::(', 1).replace(' ::', '::')
                             for x in d_split]
         # Birth/death date are in both maindetails and bio pages;
         # it's safe to collect both of them.
         if sect == 'birth date':
             date, notes = date_and_notes(d_split[0])
             if date:
                 self._bio_data['birth date'] = date
             if notes:
                 self._bio_data['birth notes'] = notes
         elif sect == 'death date':
             date, notes = date_and_notes(d_split[0])
             if date:
                 self._bio_data['death date'] = date
             if notes:
                 self._bio_data['death notes'] = notes
         elif d_split:
             # Multiple items are added separately (e.g.: 'trivia' is
             # a list of strings).
             self._bio_data[sect] = d_split
     self._sect_name = u''
     self._sect_data = u''
     self._in_sect = 0
Ejemplo n.º 5
0
def _parseBiography(biol):
    """Parse the biographies.data file.

    biol is the collection of lines of a single entry; it is iterated
    more than once, so it must be re-iterable (e.g. a list). The first
    4 or 6 characters of every line are the tag selecting the field.

    Returns a dictionary with the collected information; a key is present
    only when the corresponding data was found.
    """
    res = {}
    bio = _parseBioBy(biol)
    if bio: res['mini biography'] = bio

    # Tags carrying a date, mapped to the (date, notes) keys they fill.
    date_keys = {
        'DB: ': ('birth date', 'birth notes'),
        'DD: ': ('death date', 'death notes'),
    }
    # 'XX: * ' tags whose stripped payload is appended to a list.
    list_keys = {
        'SP: * ': 'spouse',
        'AT: * ': 'articles',
        'PT: * ': 'pictorials',
        'CV: * ': 'magazine covers',
        'PI: * ': 'portrayed',
    }
    for x in biol:
        x4 = x[:4]
        x6 = x[:6]
        if x4 in date_keys:
            date_key, notes_key = date_keys[x4]
            date, notes = date_and_notes(x[4:])
            if date:
                res[date_key] = date
            if notes:
                res[notes_key] = notes
        elif x6 in list_keys:
            res.setdefault(list_keys[x6], []).append(x[6:].strip())
        elif x4 == 'RN: ':
            n = x[4:].strip()
            if not n: continue
            rn = build_name(analyze_name(n, canonical=1), canonical=1)
            res['birth name'] = rn
        elif x4 == 'HT: ':
            res['height'] = x[4:].strip()
        elif x4 == 'NK: ':
            res.setdefault('nick names', []).append(normalizeName(x[4:]))
        elif x6 == 'SA: * ':
            # ' -> ' separates the fields of a salary entry.
            sal = x[6:].strip().replace(' -> ', '::')
            res.setdefault('salary history', []).append(sal)

    # Sections extracted as a whole by _parseList.
    trl = _parseList(biol, 'TR')
    if trl: res['trivia'] = trl
    quotes = _parseList(biol, 'QU')
    if quotes: res['quotes'] = quotes
    otherworks = _parseList(biol, 'OW')
    if otherworks: res['other works'] = otherworks
    books = _parseList(biol, 'BO')
    if books: res['books'] = books
    agent = _parseList(biol, 'AG')
    if agent: res['agent address'] = agent
    wherenow = _parseList(biol, 'WN')
    # Only the first 'WN' item is kept.
    if wherenow: res['where now'] = wherenow[0]
    biomovies = _parseList(biol, 'BT')
    if biomovies: res['biographical movies'] = biomovies
    guestapp = _buildGuests([x[6:].strip() for x in biol if x[:6] == 'GA: * '])
    if guestapp: res['notable tv guest appearances'] = guestapp
    tm = _parseList(biol, 'TM')
    if tm: res['trademarks'] = tm
    interv = _parseList(biol, 'IT')
    if interv: res['interviews'] = interv
    return res
Ejemplo n.º 6
0
 def get_person_biography(self, personID):
     """Fetch the 'bio' page of a person and parse it into a dictionary.

     personID is interpolated into self.urls['person_main'] to build the
     page URL, which is downloaded through self._mretrieve.

     Returns {'data': d}, where d maps section names (normalized, e.g.
     'salary history', 'where now', 'quotes', 'nick names') to the parsed
     values; birth/death sections are stored as '<kind> date' and
     '<kind> notes', and 'mini biography' accumulates one entry for every
     biography section found.
     """
     cont = self._mretrieve(self.urls['person_main'] % personID + 'bio')
     d = {}
     # The spouse table is handled on its own: every table row is an entry
     # and '::' is inserted after the first cell of multi-cell rows.
     spouses = _findBetween(cont, 'Spouse</h5>', ('</table>', '</dd>'),
                             maxRes=1)
     if spouses:
         sl = []
         for spouse in spouses[0].split('</tr>'):
             if spouse.count('</td>') > 1:
                 spouse = spouse.replace('</td>', '::</td>', 1)
             spouse = _unHtml(spouse)
             spouse = spouse.replace(':: ', '::').strip()
             if spouse: sl.append(spouse)
         if sl: d['spouse'] = sl
     # Nicknames are one per line; a trailing '(...)' remark is separated
     # from the name with '::'.
     nnames = _findBetween(cont, '<h5>Nickname</h5>', ('<br/> <br/>','<h5>'),
                             maxRes=1)
     if nnames:
         nnames = nnames[0]
         if nnames:
             nnames = [x.strip().replace(' (', '::(', 1)
                         for x in nnames.split('<br/>')]
             if nnames:
                 d['nick names'] = nnames
     # Every other section is a '<h5>title</h5>content' chunk; chunks that
     # do not split into exactly (title, content) are discarded.
     misc_sects = _findBetween(cont, '<h5>', '<br/>')
     misc_sects[:] = [x.split('</h5>') for x in misc_sects]
     misc_sects[:] = [x for x in misc_sects if len(x) == 2]
     for sect, data in misc_sects:
         sect = sect.lower().replace(':', '').strip()
         # 'in' is used instead of dict.has_key, which Python 3 removed.
         if sect in d and sect != 'mini biography': continue
         elif sect in ('spouse', 'nickname'): continue
         if sect == 'salary': sect = 'salary history'
         elif sect == 'where are they now': sect = 'where now'
         elif sect == 'personal quotes': sect = 'quotes'
         data = data.replace('</p><p>', '::')
         data = data.replace('<br><br>', ' ') # for multi-paragraphs 'bio'
         # '@@@@' temporarily protects the cell separator, so that the
         # split on '::' below only breaks the data on row boundaries.
         data = data.replace('</td> <td valign="top">', '@@@@')
         data = data.replace('</td> </tr>', '::')
         data = _unHtml(data)
         data = [x.strip() for x in data.split('::')]
         data[:] = [x.replace('@@@@', '::') for x in data if x]
         if not data and sect in ('birth name', 'date of birth',
                                  'date of death'):
             # These sections read data[0]; with nothing left after the
             # HTML stripping there is no value to store.
             continue
         if sect == 'height' and data: data = data[0]
         elif sect == 'birth name': data = canonicalName(data[0])
         elif sect in ('date of birth', 'date of death'):
             kind = sect[len('date of '):]  # 'birth' or 'death'
             date, notes = date_and_notes(data[0])
             if date:
                 d['%s date' % kind] = date
             if notes:
                 d['%s notes' % kind] = notes
             continue
         elif sect == 'mini biography':
             ndata = []
             for bio in data:
                 byidx = bio.rfind('IMDb Mini Biography By')
                 if byidx != -1:
                     bioAuth = bio[:byidx].rstrip()
                     # +23 skips the 22-character marker and the single
                     # character after it (presumably a colon - TODO
                     # confirm against the page markup).
                     bio = bio[byidx+23:].lstrip()
                 else:
                     # Without the marker the text is kept whole: slicing
                     # it at byidx+23 (i.e. at 22) would cut its beginning.
                     bioAuth = 'Anonymous'
                 ndata.append(u'%s::%s' % (bioAuth, bio))
             data[:] = ndata
             if 'mini biography' in d:
                 # Only the first entry of a further section is added.
                 if ndata:
                     d['mini biography'].append(ndata[0])
                 continue
         d[sect] = data
     return {'data': d}
Ejemplo n.º 7
0
 def get_person_main(self, personID, _parseChr=False):
     """Fetch and parse the main page of a person (or of a character).

     personID is interpolated into imdbURL_person_main (plus
     'maindetails') or, when _parseChr is true, into
     imdbURL_character_main; the page is downloaded with self._mretrieve.

     Returns {'data': r, 'info sets': ('main', 'filmography')}, where r
     starts as the result of analyze_name() on the page title and is
     enriched with birth/death date and notes, 'akas', 'headshot' and one
     list of movies for every filmography section found.

     Raises IMDbDataAccessError when the page has no <title>.
     """
     if not _parseChr:
         url = imdbURL_person_main % personID + 'maindetails'
     else:
         url = imdbURL_character_main % personID
     s = self._mretrieve(url)
     name = _findBetween(s, '<title>', '</title>', maxRes=1)
     if not name:
         if _parseChr: w = 'characterID'
         else: w = 'personID'
         raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
     name = _unHtml(name[0])
     # Remove the fixed decorations of the page title.
     if _parseChr:
         name = name.replace('(Character)', '').strip()
         name = name.replace('- Filmography by type', '').strip()
     else:
         name = name.replace('- Filmography by', '').strip()
     r = analyze_name(name, canonical=not _parseChr)
     for dKind in ('birth', 'death'):
         date = _findBetween(s,
                             '<h5>Date of %s:</h5>' % dKind.capitalize(),
                             ('<a class', '</div>', '<br/><br/>'),
                             maxRes=1)
         if date:
             date = _unHtml(date[0])
             if date:
                 date, notes = date_and_notes(date)
                 if date:
                     r['%s date' % dKind] = date
                 if notes:
                     r['%s notes' % dKind] = notes
     akas = _findBetween(s,
                         'Alternate Names:</h5>', ('</div>', '<br/><br/>'),
                         maxRes=1)
     if akas:
         akas = akas[0]
         # The names are separated by pipes or, failing that, by slashes.
         if akas.find(' | ') != -1:
             akas = _unHtml(akas).split(' | ')
         else:
             akas = _unHtml(akas).split(' / ')
         if akas: r['akas'] = akas
     hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
     if hs:
         hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
         if hs: r['headshot'] = hs[0]
     # Build a list of tuples such [('hrefLink', 'section name')]
     workkind = _findBetween(s,
                             '<div class="strip jump">',
                             '</div>',
                             maxRes=1)
     if workkind:
         workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
     else:
         # Assume there's only one section and/or there are no
         # section links, for some reason.
         workkind[:] = _findBetween(s, '<h5><a name=', '</a></h5>')
         workkind[:] = [x.lstrip('"').rstrip(':').lower() for x in workkind]
     ws = []
     for work in workkind:
         wsplit = work.split('">', 1)
         if len(wsplit) == 2:
             sect = wsplit[0]
             # Drop whatever follows the anchor name inside the tag.
             if '"' in sect:
                 sect = sect[:sect.find('"')]
             ws.append((sect, wsplit[1].lower()))
     # XXX: I think "guest appearances" are gone.
     if s.find('<a href="#guest-appearances"') != -1:
         ws.append(('guest-appearances', 'notable tv guest appearances'))
     for sect, sectName in ws:
         raws = u''
         # Everything between the current section link and the end
         # of the <ol> tag.
         if _parseChr and sect == 'filmography':
             inisect = s.find('<div class="filmo">')
         else:
             inisect = s.find('<a name="%s' % sect)
         if inisect != -1:
             endsect = s[inisect:].find('</ol>')
             if endsect != -1: raws = s[inisect:inisect + endsect]
         if not raws: continue
         mlist = _findBetween(raws, '<li>', ('</li>', '<br>', '<br/>'))
         for m in mlist:
             # For every movie in the current section.
             movieID = re_imdbID.findall(m)
             if not movieID:
                 self._mobile_logger.debug('no movieID in %s', m)
                 continue
             if not _parseChr:
                 chrIndx = m.find(' .... ')
             else:
                 chrIndx = m.find(' Played by ')
             chids = []
             if chrIndx != -1:
                 # ' .... ' is 6 characters long; ' Played by ' is 11,
                 # hence the 5 more characters skipped for characters.
                 chrtxt = m[chrIndx + 6:]
                 if _parseChr:
                     chrtxt = chrtxt[5:]
                 for ch in chrtxt.split(' / '):
                     chid = re_imdbID.findall(ch)
                     if not chid:
                         chids.append(None)
                     else:
                         chids.append(chid[-1])
             # No role: None; a single role: its ID; otherwise the list.
             if not chids:
                 chids = None
             elif len(chids) == 1:
                 chids = chids[0]
             movieID = str(movieID[0])
             # Search the status.
             stidx = m.find('<i>')
             status = u''
             if stidx != -1:
                 stendidx = m.rfind('</i>')
                 if stendidx != -1:
                     status = _unHtml(m[stidx + 3:stendidx])
                     m = m.replace(m[stidx + 3:stendidx], '')
             m = _unHtml(m)
             if not m:
                 self._mobile_logger.warn('no title for movieID %s',
                                          movieID)
                 continue
             movie = build_movie(m,
                                 movieID=movieID,
                                 status=status,
                                 roleID=chids,
                                 modFunct=self._defModFunct,
                                 accessSystem=self.accessSystem,
                                 _parsingCharacter=_parseChr)
             r.setdefault(sectName, []).append(movie)
     # If available, take the always correct name from a form.
     itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
     if not itag:
         itag = _getTagsWith(s, 'name="primary"', maxRes=1)
     if itag:
         vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
         if not vtag:
             vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
         if vtag:
             try:
                 vtag = unquote(str(vtag[0]))
                 vtag = unicode(vtag, 'latin_1')
                 r.update(analyze_name(vtag))
             except UnicodeEncodeError:
                 # Best effort: keep the name taken from the title.
                 pass
     return {'data': r, 'info sets': ('main', 'filmography')}
Ejemplo n.º 8
0
 def get_person_biography(self, personID):
     """Fetch the "bio" page of a person and parse it into a dictionary.

     personID is interpolated into self.urls["person_main"] to build the
     page URL, which is downloaded through self._mretrieve.

     Returns {"data": d}, where d maps section names (normalized, e.g.
     "salary history", "where now", "quotes", "nick names") to the parsed
     values; birth/death sections are stored as "<kind> date" and
     "<kind> notes", and "mini biography" accumulates one entry for every
     biography section found.
     """
     cont = self._mretrieve(self.urls["person_main"] % personID + "bio")
     d = {}
     # The spouse table is handled on its own: every table row is an entry
     # and "::" is inserted after the first cell of multi-cell rows.
     spouses = _findBetween(cont, "Spouse</h5>", ("</table>", "</dd>"), maxRes=1)
     if spouses:
         sl = []
         for spouse in spouses[0].split("</tr>"):
             if spouse.count("</td>") > 1:
                 spouse = spouse.replace("</td>", "::</td>", 1)
             spouse = _unHtml(spouse)
             spouse = spouse.replace(":: ", "::").strip()
             if spouse:
                 sl.append(spouse)
         if sl:
             d["spouse"] = sl
     # Nicknames are one per line; a trailing "(...)" remark is separated
     # from the name with "::".
     nnames = _findBetween(cont, "<h5>Nickname</h5>", ("<br/> <br/>", "<h5>"), maxRes=1)
     if nnames:
         nnames = nnames[0]
         if nnames:
             nnames = [x.strip().replace(" (", "::(", 1) for x in nnames.split("<br/>")]
             if nnames:
                 d["nick names"] = nnames
     # Every other section is a "<h5>title</h5>content" chunk; chunks that
     # do not split into exactly (title, content) are discarded.
     misc_sects = _findBetween(cont, "<h5>", "<br/>")
     misc_sects[:] = [x.split("</h5>") for x in misc_sects]
     misc_sects[:] = [x for x in misc_sects if len(x) == 2]
     date_sections = {"date of birth": "birth", "date of death": "death"}
     for sect, data in misc_sects:
         sect = sect.lower().replace(":", "").strip()
         # "in" is used instead of dict.has_key, which Python 3 removed.
         if sect in d and sect != "mini biography":
             continue
         elif sect in ("spouse", "nickname"):
             continue
         if sect == "salary":
             sect = "salary history"
         elif sect == "where are they now":
             sect = "where now"
         elif sect == "personal quotes":
             sect = "quotes"
         data = data.replace("</p><p>", "::")
         data = data.replace("<br><br>", " ")  # for multi-paragraphs 'bio'
         # "@@@@" temporarily protects the cell separator, so that the
         # split on "::" below only breaks the data on row boundaries.
         data = data.replace('</td> <td valign="top">', "@@@@")
         data = data.replace("</td> </tr>", "::")
         data = _unHtml(data)
         data = [x.strip() for x in data.split("::")]
         data[:] = [x.replace("@@@@", "::") for x in data if x]
         if sect == "height" and data:
             data = data[0]
         elif sect == "birth name":
             if not data:
                 # Nothing left after the HTML stripping.
                 continue
             data = canonicalName(data[0])
         elif sect in date_sections:
             if data:
                 kind = date_sections[sect]
                 date, notes = date_and_notes(data[0])
                 if date:
                     d["%s date" % kind] = date
                 if notes:
                     d["%s notes" % kind] = notes
             continue
         elif sect == "mini biography":
             ndata = []
             for bio in data:
                 byidx = bio.rfind("IMDb Mini Biography By")
                 if byidx == -1:
                     # Without the marker the text is kept whole: slicing
                     # it at byidx + 23 (i.e. at 22) would cut its start.
                     ndata.append(u"%s::%s" % ("Anonymous", bio))
                     continue
                 bioAuth = bio[:byidx].rstrip()
                 # + 23 skips the 22-character marker and the single
                 # character after it (presumably a colon - TODO confirm
                 # against the page markup).
                 ndata.append(u"%s::%s" % (bioAuth, bio[byidx + 23 :].lstrip()))
             data[:] = ndata
             if "mini biography" in d:
                 # Only the first entry of a further section is added.
                 if ndata:
                     d["mini biography"].append(ndata[0])
                 continue
         d[sect] = data
     return {"data": d}
Ejemplo n.º 9
0
 def get_person_main(self, personID, _parseChr=False):
     """Fetch and parse the main page of a person (or of a character).

     personID is interpolated into imdbURL_person_main (plus
     'maindetails') or, when _parseChr is true, into
     imdbURL_character_main; the page is downloaded with self._mretrieve.

     Returns {'data': r, 'info sets': ('main', 'filmography')}, where r
     starts as the result of analyze_name() on the page title and is
     enriched with birth/death date and notes, 'akas', 'headshot', one
     list of movies for every filmography section found and 'image_url'
     (an empty string when no photo is found).

     Raises IMDbDataAccessError when the page has no <title>.
     """
     if not _parseChr:
         url = imdbURL_person_main % personID + 'maindetails'
     else:
         url = imdbURL_character_main % personID
     s = self._mretrieve(url)
     name = _findBetween(s, '<title>', '</title>', maxRes=1)
     if not name:
         if _parseChr: w = 'characterID'
         else: w = 'personID'
         raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
     name = _unHtml(name[0])
     if _parseChr:
         name = name.replace('(Character)', '').strip()
     r = analyze_name(name, canonical=not _parseChr)
     for dKind in ('birth', 'death'):
         date = _findBetween(s, '<h5>Date of %s:</h5>' % dKind.capitalize(),
                             ('<a class', '</div>', '<br/><br/>'), maxRes=1)
         if date:
             date = _unHtml(date[0])
             if date:
                 date, notes = date_and_notes(date)
                 if date:
                     r['%s date' % dKind] = date
                 if notes:
                     r['%s notes' % dKind] = notes
     akas = _findBetween(s, 'Alternate Names:</h5>', ('</div>',
                         '<br/><br/>'), maxRes=1)
     if akas:
         akas = akas[0]
         # The names are separated by pipes or, failing that, by slashes.
         if akas.find(' | ') != -1:
             akas = _unHtml(akas).split(' | ')
         else:
             akas = _unHtml(akas).split(' / ')
         if akas: r['akas'] = akas
     hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
     if hs:
         hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
         if hs: r['headshot'] = hs[0]
     # Build a list of tuples such [('hrefLink', 'section name')]
     workkind = _findBetween(s, '<div class="strip jump">', '</div>',
                             maxRes=1)
     if workkind:
         workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
     else:
         # Assume there's only one section and/or there are no
         # section links, for some reason.
         workkind[:] = _findBetween(s, '<h5><a name=', '</a></h5>')
         workkind[:] = [x.lstrip('"').rstrip(':').lower() for x in workkind]
     ws = []
     for work in workkind:
         wsplit = work.split('">', 1)
         if len(wsplit) == 2:
             ws.append((wsplit[0], wsplit[1].lower()))
     # XXX: I think "guest appearances" are gone.
     if s.find('<a href="#guest-appearances"') != -1:
         ws.append(('guest-appearances', 'notable tv guest appearances'))
     if _parseChr:
         ws.append(('filmography', 'filmography'))
     for sect, sectName in ws:
         raws = u''
         # Everything between the current section link and the end
         # of the <ol> tag.
         if _parseChr and sect == 'filmography':
             inisect = s.find('<div class="filmo">')
         else:
             inisect = s.find('<a name="%s' % sect)
         if inisect != -1:
             endsect = s[inisect:].find('</ol>')
             if endsect != -1: raws = s[inisect:inisect+endsect]
         if not raws: continue
         mlist = _findBetween(raws, '<li>', ('</li>', '<br>', '<br/>'))
         for m in mlist:
             # For every movie in the current section.
             movieID = re_imdbID.findall(m)
             if not movieID: continue
             if not _parseChr:
                 chrIndx = m.find(' .... ')
             else:
                 chrIndx = m.find(' Played by ')
             chids = []
             if chrIndx != -1:
                 # ' .... ' is 6 characters long; ' Played by ' is 11,
                 # hence the 5 more characters skipped for characters.
                 chrtxt = m[chrIndx+6:]
                 if _parseChr:
                     chrtxt = chrtxt[5:]
                 for ch in chrtxt.split(' / '):
                     chid = re_imdbID.findall(ch)
                     if not chid:
                         chids.append(None)
                     else:
                         chids.append(chid[-1])
             # No role: None; a single role: its ID; otherwise the list.
             if not chids:
                 chids = None
             elif len(chids) == 1:
                 chids = chids[0]
             movieID = str(movieID[0])
             # Search the status.
             stidx = m.find('<i>')
             status = u''
             if stidx != -1:
                 stendidx = m.rfind('</i>')
                 if stendidx != -1:
                     status = _unHtml(m[stidx+3:stendidx])
                     m = m.replace(m[stidx+3:stendidx], '')
             m = _unHtml(m)
             if not m: continue
             movie = build_movie(m, movieID=movieID, status=status,
                                 roleID=chids, modFunct=self._defModFunct,
                                 accessSystem=self.accessSystem,
                                 _parsingCharacter=_parseChr)
             r.setdefault(sectName, []).append(movie)
     # If available, take the always correct name from a form.
     itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
     if not itag:
         itag = _getTagsWith(s, 'name="primary"', maxRes=1)
     if itag:
         vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
         if not vtag:
             vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
         if vtag:
             try:
                 vtag = unquote(str(vtag[0]))
                 vtag = unicode(vtag, 'latin_1')
                 r.update(analyze_name(vtag, canonical=0))
             except UnicodeEncodeError:
                 # Best effort: keep the name taken from the title.
                 pass

     # URL of the picture in the 'photo' box; every lookup may come back
     # empty, in which case the URL stays an empty string.
     image_url = ''
     photo = _findBetween(s, '<div class="photo">', '</div>', maxRes=1)
     if photo:
         img = _findBetween(photo[0], '<img', '/a>', maxRes=1)
         if img:
             src = _findBetween(img[0], ' src="', '"', maxRes=1)
             if src:
                 image_url = src[0]
     r['image_url'] = image_url

     return {'data': r, 'info sets': ('main', 'filmography')}
Ejemplo n.º 10
0
 def _add_items(self):
     # Add a new section in the biography.
     if self._in_content and self._sect_name and self._sect_data:
         sect = self._sect_name.strip().lower()
         # XXX: to get rid of the last colons and normalize section names.
         if sect[-1] == ':':
             sect = sect[:-1]
         if sect == 'salary':
             sect = 'salary history'
         elif sect == 'nickname':
             sect = 'nick names'
         elif sect == 'where are they now':
             sect = 'where now'
         elif sect == 'personal quotes':
             sect = 'quotes'
         elif sect == 'date of birth':
             sect = 'birth date'
         elif sect == 'date of death':
             sect = 'death date'
         data = self._sect_data.strip()
         d_split = data.split('::')
         d_split[:] = filter(None, [x.strip() for x in d_split])
         # Do some transformation on some special cases.
         if sect == 'salary history':
             newdata = []
             for j in d_split:
                 j = filter(None, [x.strip() for x in j.split('@@@@')])
                 newdata.append('::'.join(j))
             d_split[:] = newdata
         elif sect == 'nick names':
             d_split[:] = [normalizeName(x) for x in d_split]
         elif sect == 'birth name':
             d_split = canonicalName(d_split[0])
         elif sect == 'height':
             d_split = d_split[0]
         elif sect == 'spouse':
             d_split[:] = [
                 x.replace(' (', '::(', 1).replace(' ::', '::')
                 for x in d_split
             ]
         # Birth/death date are in both maindetails and bio pages;
         # it's safe to collect both of them.
         if sect == 'birth date':
             date, notes = date_and_notes(d_split[0])
             if date:
                 self._bio_data['birth date'] = date
             if notes:
                 self._bio_data['birth notes'] = notes
         elif sect == 'death date':
             date, notes = date_and_notes(d_split[0])
             if date:
                 self._bio_data['death date'] = date
             if notes:
                 self._bio_data['death notes'] = notes
         elif d_split:
             # Multiple items are added separately (e.g.: 'trivia' is
             # a list of strings).
             self._bio_data[sect] = d_split
     self._sect_name = u''
     self._sect_data = u''
     self._in_sect = 0
Ejemplo n.º 11
0
def _parseBiography(biol):
    """Parse the biographies.data file.

    biol is the collection of lines of a single entry (it is scanned
    several times, so it must be re-iterable); every line starts with a
    tag such as 'DB: ' or 'SP: * ' that selects the field it belongs to.

    Returns a dictionary holding only the information actually found.
    """
    res = {}
    bio = _parseBioBy(biol)
    if bio: res['mini biography'] = bio

    for x in biol:
        if x.startswith('DB: ') or x.startswith('DD: '):
            if x.startswith('DB: '):
                kind = 'birth'
            else:
                kind = 'death'
            date, notes = date_and_notes(x[4:])
            if date:
                res['%s date' % kind] = date
            if notes:
                res['%s notes' % kind] = notes
        elif x.startswith('SP: * '):
            res.setdefault('spouse', []).append(x[6:].strip())
        elif x.startswith('RN: '):
            n = x[4:].strip()
            if not n: continue
            rn = build_name(analyze_name(n, canonical=1), canonical=1)
            res['birth name'] = rn
        elif x.startswith('AT: * '):
            res.setdefault('articles', []).append(x[6:].strip())
        elif x.startswith('HT: '):
            res['height'] = x[4:].strip()
        elif x.startswith('PT: * '):
            res.setdefault('pictorials', []).append(x[6:].strip())
        elif x.startswith('CV: * '):
            res.setdefault('magazine covers', []).append(x[6:].strip())
        elif x.startswith('NK: '):
            res.setdefault('nick names', []).append(normalizeName(x[4:]))
        elif x.startswith('PI: * '):
            res.setdefault('portrayed', []).append(x[6:].strip())
        elif x.startswith('SA: * '):
            # ' -> ' separates the fields of a salary entry.
            sal = x[6:].strip().replace(' -> ', '::')
            res.setdefault('salary history', []).append(sal)

    # Sections extracted as a whole by _parseList, stored as returned.
    for prefix, key in (('TR', 'trivia'), ('QU', 'quotes'),
                        ('OW', 'other works'), ('BO', 'books'),
                        ('AG', 'agent address')):
        items = _parseList(biol, prefix)
        if items: res[key] = items
    wherenow = _parseList(biol, 'WN')
    # Only the first 'WN' item is kept.
    if wherenow: res['where now'] = wherenow[0]
    biomovies = _parseList(biol, 'BT')
    if biomovies: res['biographical movies'] = biomovies
    guestapp = _buildGuests([x[6:].strip() for x in biol if x[:6] == 'GA: * '])
    if guestapp: res['notable tv guest appearances'] = guestapp
    for prefix, key in (('TM', 'trademarks'), ('IT', 'interviews')):
        items = _parseList(biol, prefix)
        if items: res[key] = items
    return res