コード例 #1
0
ファイル: __init__.py プロジェクト: GodZZila/SickRage
 def _search_character(self, name, results):
     """Search for characters matching *name* and parse the result page.

     The page obtained through
     ``self._get_search_content("ch", name, results)`` is scanned with
     string helpers.  When its <title> does not start with "find - imdb"
     the page is handled as a direct hit and a single result is built from
     the title and the rel="canonical" tag; otherwise every
     ``<td class="result_text"`` fragment is parsed.

     Returns a list of ``(characterID, analyze_name(...))`` tuples; the
     list is empty when nothing usable was found.
     """
     cont = subXMLRefs(self._get_search_content("ch", name, results))
     # Keep the searched-for name in ``name``: rebinding it to the <title>
     # lookup made the log messages below print the parsed page title
     # (or an empty list) instead of what was being searched.
     titles = _findBetween(cont, "<title>", "</title>", maxRes=1)
     res = []
     if not titles:
         self._mobile_logger.error("no title tag searching character %s", name)
         return res
     if not titles[0].lower().startswith("find - imdb"):
         # a direct hit!
         cname = _unHtml(titles[0]).replace("(Character)", "").strip()
         pid = None
         pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if pidtag:
             pid = _findBetween(pidtag[0], "/character/ch", "/", maxRes=1)
         if not (pid and cname):
             self._mobile_logger.error(
                 "no direct hit name/characterID for character %s", name
             )
             return res
         res[:] = [(str(pid[0]), analyze_name(cname))]
     else:
         lis = _findBetween(cont, '<td class="result_text"', ["<small", "</td>", "<br"])
         for li in lis:
             # A leading "<" is put back before parsing the fragment
             # (presumably the start marker is not part of the match --
             # TODO confirm against _findBetween).
             li = "<%s" % li
             pid = re_imdbID.findall(li)
             pname = _unHtml(li)
             if not (pid and pname):
                 self._mobile_logger.debug(
                     "no name/characterID parsing %s searching for character %s",
                     li,
                     name,
                 )
                 continue
             res.append((str(pid[0]), analyze_name(pname)))
     return res
コード例 #2
0
 def _search_character(self, name, results):
     """Search for characters matching *name*.

     The page obtained through
     ``self._get_search_content('char', name, results)`` is inspected: if
     its <title> starts with one of the search-page prefixes, the
     character sections are parsed; otherwise the page is treated as a
     direct hit.

     Returns a list of ``(characterID, analyze_name(...))`` tuples,
     possibly empty.
     """
     page = subXMLRefs(self._get_search_content('char', name, results))
     titles = _findBetween(page, '<title>', '</title>', maxRes=1)
     if not titles:
         return []
     search_prefixes = ('imdb search', 'imdb  search', 'imdb character')
     if titles[0].lower().startswith(search_prefixes):
         # A result list: walk both character sections of the page.
         found = []
         sections = _findBetween(page, '<b>Popular Characters</b>', '</table>')
         sections += _findBetween(page, '<b>Characters', '</table>')
         for section in sections:
             chunks = _findBetween(section, '<a href="/character/',
                                   ['<small', '</td>', '<br'])
             for chunk in chunks:
                 chunk = '<%s' % chunk
                 ids = re_imdbID.findall(chunk)
                 text = _unHtml(chunk)
                 if ids and text:
                     found.append((str(ids[0]),
                                   analyze_name(text, canonical=0)))
         return found
     # XXX: a direct hit!
     char_name = _unHtml(titles[0]).replace('(Character)', '').strip()
     tags = _getTagsWith(page, '/character/ch', maxRes=1)
     ids = None
     if tags:
         ids = re_imdbID.findall(tags[0])
     if not (ids and char_name):
         return []
     return [(str(ids[0]), analyze_name(char_name, canonical=0))]
コード例 #3
0
ファイル: __init__.py プロジェクト: conwetlab/ezweb-gadgets
 def _search_character(self, name, results):
     """Return the characters found searching for *name*.

     Uses the content of
     ``self._get_search_content('char', name, results)``.  A <title>
     starting with a search-page prefix means a list of matches is
     parsed; any other title is handled as a direct hit.

     The result is a (possibly empty) list of
     ``(characterID, analyze_name(...))`` tuples.
     """
     html = subXMLRefs(self._get_search_content('char', name, results))
     title = _findBetween(html, '<title>', '</title>', maxRes=1)
     matches = []
     if not title:
         return matches
     lowered = title[0].lower()
     is_list_page = lowered.startswith(
         ('imdb search', 'imdb  search', 'imdb character'))
     if is_list_page:
         blocks = _findBetween(html, '<b>Popular Characters</b>', '</table>')
         blocks += _findBetween(html, '<b>Characters', '</table>')
         for block in blocks:
             for item in _findBetween(block, '<a href="/character/',
                                      ['<small', '</td>', '<br']):
                 item = '<%s' % item
                 cid = re_imdbID.findall(item)
                 cname = _unHtml(item)
                 if not (cid and cname):
                     continue
                 matches.append((str(cid[0]),
                                 analyze_name(cname, canonical=0)))
         return matches
     # XXX: a direct hit!
     cname = _unHtml(title[0]).replace('(Character)', '').strip()
     link_tag = _getTagsWith(html, '/character/ch', maxRes=1)
     cid = re_imdbID.findall(link_tag[0]) if link_tag else None
     if cid and cname:
         matches[:] = [(str(cid[0]), analyze_name(cname, canonical=0))]
     return matches
コード例 #4
0
 def _search_character(self, name, results):
     """Search for characters matching *name* and parse the result page.

     The page obtained through
     ``self._get_search_content('ch', name, results)`` is scanned with
     string helpers.  A <title> not starting with 'find - imdb' is handled
     as a direct hit (one result, built from the title and the
     rel="canonical" tag); otherwise each ``<td class="result_text"``
     fragment is parsed.

     Returns a list of ``(characterID, analyze_name(...))`` tuples; empty
     when nothing usable was found.
     """
     cont = subXMLRefs(self._get_search_content('ch', name, results))
     # ``name`` is deliberately left untouched: it used to be overwritten
     # with the <title> lookup, so the log messages reported the page
     # title (or an empty list) rather than the searched name.
     titles = _findBetween(cont, '<title>', '</title>', maxRes=1)
     res = []
     if not titles:
         self._mobile_logger.error('no title tag searching character %s',
                                   name)
         return res
     if not titles[0].lower().startswith('find - imdb'):
         # a direct hit!
         cname = _unHtml(titles[0]).replace('(Character)', '').strip()
         pid = None
         pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if pidtag:
             pid = _findBetween(pidtag[0], '/character/ch', '/', maxRes=1)
         if not (pid and cname):
             self._mobile_logger.error('no direct hit name/characterID for'
                                       ' character %s', name)
             return res
         res[:] = [(str(pid[0]), analyze_name(cname))]
     else:
         lis = _findBetween(cont, '<td class="result_text"',
                            ['<small', '</td>', '<br'])
         for li in lis:
             li = '<%s' % li
             pid = re_imdbID.findall(li)
             pname = _unHtml(li)
             if not (pid and pname):
                 self._mobile_logger.debug('no name/characterID'
                                           ' parsing %s searching for'
                                           ' character %s', li, name)
                 continue
             res.append((str(pid[0]), analyze_name(pname)))
     return res
コード例 #5
0
 def _search_person(self, name, results):
     """Search for persons matching *name* and parse the result page.

     The page obtained through
     ``self._get_search_content('nm', name, results)`` is scanned with
     string helpers.  A <title> not starting with 'imdb name' is handled
     as a direct hit; otherwise up to ``results * 3`` table cells are
     parsed, and the quoted ``<em>`` fragments of each cell are stored
     under the 'akas' key of that result.

     Returns a list of ``(personID, analyze_name(...))`` tuples; empty
     when nothing usable was found.
     """
     cont = subXMLRefs(self._get_search_content('nm', name, results))
     # ``name`` keeps the searched-for value: it used to be overwritten
     # with the <title> lookup, which made the log messages print the page
     # title (or an empty list) instead.
     titles = _findBetween(cont, '<title>', '</title>', maxRes=1)
     res = []
     if not titles:
         self._mobile_logger.warn('no title tag searching for name %s',
                                  name)
         return res
     if not titles[0].lower().startswith('imdb name'):
         # a direct hit!
         pname = _unHtml(titles[0])
         pname = pname.replace('- Filmography by type', '').strip()
         pid = None
         pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if pidtag:
             pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
         if not (pid and pname):
             self._mobile_logger.error('no direct hit name/personID for'
                                       ' name %s', name)
             return res
         res[:] = [(str(pid[0]), analyze_name(pname, canonical=1))]
     else:
         lis = _findBetween(cont,
                            'td valign="top">',
                            '</td>',
                            maxRes=results * 3)
         for li in lis:
             # Collect the akas before the cell is truncated below.
             akas = _findBetween(li, '<em>"', '"</em>')
             for sep in ['<small', '<br> aka', '<br> birth name']:
                 sepIdx = li.find(sep)
                 if sepIdx != -1:
                     li = li[:sepIdx]
             pid = re_imdbID.findall(li)
             pname = _unHtml(li)
             if not (pid and pname):
                 self._mobile_logger.debug('no name/personID parsing'
                                           ' %s searching for name %s', li,
                                           name)
                 continue
             resd = analyze_name(pname, canonical=1)
             if akas:
                 resd['akas'] = akas
             res.append((str(pid[0]), resd))
     return res
コード例 #6
0
 def get_person_main(self, personID):
     """Collect the main information about the person *personID*.

     The name comes from getLabel(), and is enriched with the output of
     getBio() and getAkaNames().

     Returns a dict with the keys 'data', 'info sets', 'titlesRefs' and
     'namesRefs'.

     Raises IMDbDataAccessError when getLabel() returns None.
     """
     infosets = ('main', 'biography', 'other works')
     nl = getLabel(personID, '%snames.index' % self.__db,
                     '%snames.key' % self.__db)
     # No name, no party.
     if nl is None:
         # Call form of raise: valid on both Python 2 and Python 3.
         raise IMDbDataAccessError('unable to get personID "%s"' % personID)
     res = analyze_name(nl)
     res.update(getBio(personID, '%sbiographies.index' % self.__db,
                 '%sbiographies.data' % self.__db))
     akas = getAkaNames(personID,
                 '%saka-names.data' % self.__db,
                 '%snames.index' % self.__db,
                 '%snames.key' % self.__db)
     if akas:
         res['akas'] = akas
     # XXX: horrible hack!  The getBio() function is not able to
     #      retrieve the movieID!
     #      A cleaner solution, would be to NOT return Movies object
     #      at first, from the getBio() function.
     # XXX: anyway, this is no more needed, since "guest appearances"
     #      are gone with the new tv series episodes support.
     if 'notable tv guest appearances' in res:
         nl = []
         for m in res['notable tv guest appearances']:
             movieID = self._getTitleID(m.get('long imdb canonical title'))
             if movieID is None:
                 continue
             m.movieID = movieID
             nl.append(m)
         if nl:
             nl.sort()
             # Replace the content in place, keeping the same list object.
             res['notable tv guest appearances'][:] = nl
         else:
             del res['notable tv guest appearances']
     trefs, nrefs = self._extractRefs(res)
     return {'data': res, 'info sets': infosets,
             'titlesRefs': trefs, 'namesRefs': nrefs}
コード例 #7
0
ファイル: Character.py プロジェクト: alberanid/imdbpy
 def set_name(self, name):
     """Set the name of the character.

     The name is parsed with analyze_name(canonical=False) and merged
     into self.data; an IMDbParserError is silently ignored.
     """
     try:
         self.data.update(analyze_name(name, canonical=False))
     except IMDbParserError:
         # Best effort: an unparsable name leaves self.data untouched.
         return
コード例 #8
0
 def set_name(self, name):
     """Set the name of the character.

     Merges the result of analyze_name(name, canonical=False) into
     self.data.  Nothing happens if an IMDbParserError is raised.
     """
     try:
         parsed = analyze_name(name, canonical=False)
         self.data.update(parsed)
     except IMDbParserError:
         # Deliberately ignored: the stored data is left as it was.
         return None
コード例 #9
0
class DOMBasicPersonParser(DOMBasicMovieParser):
    """Simply get the name of a person and the imdbID.

    The DOMHTMLSearchPersonParser class uses it to return a result for a
    direct match, i.e. when a search on IMDb results in a single person
    and the web server sends the page directly.
    """

    def _titleFunct(self, x):
        # Pass the text through _cleanName() and parse the outcome with
        # canonical=1.
        return analyze_name(_cleanName(x), canonical=1)
コード例 #10
0
class DOMBasicCharacterParser(DOMBasicMovieParser):
    """Simply get the name of a character and the imdbID.

    The DOMHTMLSearchCharacterParser class uses it to return a result for
    a direct match, i.e. when a search on IMDb results in a single
    character and the web server sends the page directly.
    """

    def _titleFunct(self, x):
        # A false value (e.g. None) is replaced by an empty unicode string
        # before being parsed with canonical=False.
        return analyze_name(x or u'', canonical=False)
コード例 #11
0
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parser for the html page shown by the IMDb web server when the
    "new search system" is used to search for persons."""
    _BaseParser = DOMBasicPersonParser
    _notDirectHitTitle = '<title>imdb name'

    def _titleBuilder(self, x):
        return build_name(x, canonical=True)

    _linkPrefix = '/name/nm'

    # Every matched node is turned into an (imdbID, name data, akas) tuple.
    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'name': "./a[1]/text()",
                'index': "./text()[1]",
                'akas': ".//div[@class='_imdbpyAKA']/text()"
            },
            postprocess=lambda info: (
                analyze_imdbid(info.get('link') or u''),
                # The name and the text following the link are joined
                # before being parsed.
                analyze_name(
                    (info.get('name') or u'') + (info.get('index') or u''),
                    canonical=1),
                info.get('akas')
            )
        )
    ]
    extractors = [
        Extractor(
            label='search',
            path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
            attrs=_attrs
        )
    ]

    def preprocess_string(self, html_string):
        """Rewrite the AKA fragments of a result list, then delegate to
        DOMHTMLSearchMovieParser.preprocess_string()."""
        # Only the first 1024 characters are inspected to decide whether
        # the page is a list of results rather than a direct hit.
        head = html_string[:1024].lower()
        if self._notDirectHitTitle in head:
            # Wrap what _reAKASp matches in a div that the 'akas' path
            # of _attrs can select.
            html_string = _reAKASp.sub(
                r'\1<div class="_imdbpyAKA">\2::</div>\3', html_string)
        return DOMHTMLSearchMovieParser.preprocess_string(self, html_string)
コード例 #12
0
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parser for the html page shown by the IMDb web server when the
    "new search system" is used to search for persons."""

    def _titleBuilder(self, x):
        return build_name(x, canonical=True)

    _linkPrefix = '/name/nm'

    # Every matched node is turned into an (imdbID, name data, akas) tuple.
    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'name': "./a[1]/text()",
                'index': "./text()[1]",
                'akas': ".//div[@class='_imdbpyAKA']/text()"
            },
            postprocess=lambda info: (
                analyze_imdbid(info.get('link') or ''),
                # The name and the text following the link are joined
                # before being parsed.
                analyze_name(
                    (info.get('name') or '') + (info.get('index') or ''),
                    canonical=1
                ),
                info.get('akas')
            )
        )
    ]

    # Result cells holding a link that starts with '/name/nm'.
    extractors = [
        Extractor(
            label='search',
            path="//td[@class='result_text']/a[starts-with(@href, '/name/nm')]/..",
            attrs=_attrs
        )
    ]
コード例 #13
0
ファイル: __init__.py プロジェクト: 070499/repo-scripts
 def get_person_main(self, personID):
     """Return the main information set for the person *personID*.

     The name is read with getLabel(); the data from getBio() and
     getAkaNames() is then added to it.

     Returns a dict with the keys 'data', 'info sets', 'titlesRefs' and
     'namesRefs'.

     Raises IMDbDataAccessError when getLabel() returns None.
     """
     infosets = ('main', 'biography', 'other works')
     nl = getLabel(personID, '%snames.index' % self.__db,
                     '%snames.key' % self.__db)
     # No name, no party.
     if nl is None:
         # The call form of raise works on both Python 2 and Python 3.
         raise IMDbDataAccessError('unable to get personID "%s"' % personID)
     res = analyze_name(nl)
     res.update(getBio(personID, '%sbiographies.index' % self.__db,
                 '%sbiographies.data' % self.__db))
     akas = getAkaNames(personID,
                 '%saka-names.data' % self.__db,
                 '%snames.index' % self.__db,
                 '%snames.key' % self.__db)
     if akas:
         res['akas'] = akas
     # XXX: horrible hack!  The getBio() function is not able to
     #      retrieve the movieID!
     #      A cleaner solution, would be to NOT return Movies object
     #      at first, from the getBio() function.
     # XXX: anyway, this is no more needed, since "guest appearances"
     #      are gone with the new tv series episodes support.
     if 'notable tv guest appearances' in res:
         nl = []
         for m in res['notable tv guest appearances']:
             movieID = self._getTitleID(m.get('long imdb canonical title'))
             if movieID is None:
                 continue
             m.movieID = movieID
             nl.append(m)
         if nl:
             nl.sort()
             # Replace the content in place, keeping the same list object.
             res['notable tv guest appearances'][:] = nl
         else:
             del res['notable tv guest appearances']
     trefs, nrefs = self._extractRefs(res)
     return {'data': res, 'info sets': infosets,
             'titlesRefs': trefs, 'namesRefs': nrefs}
コード例 #14
0
ファイル: __init__.py プロジェクト: Elettronik/SickRage
 def _search_person(self, name, results):
     """Search for persons matching *name* and parse the result page.

     The page obtained through
     ``self._get_search_content('nm', name, results)`` is scanned with
     string helpers.  A <title> not starting with 'find - imdb' is handled
     as a direct hit; otherwise up to ``results * 3`` result cells are
     parsed, and the quoted ``<em>`` fragments of each cell are stored
     under the 'akas' key of that result.

     Returns a list of ``(personID, analyze_name(...))`` tuples; empty
     when nothing usable was found.
     """
     cont = subXMLRefs(self._get_search_content('nm', name, results))
     # ``name`` keeps the searched-for value: it used to be overwritten
     # with the <title> lookup, which made the log messages print the page
     # title (or an empty list) instead.
     titles = _findBetween(cont, '<title>', '</title>', maxRes=1)
     res = []
     if not titles:
         self._mobile_logger.warn('no title tag searching for name %s', name)
         return res
     if not titles[0].lower().startswith('find - imdb'):
         # a direct hit!
         pname = _unHtml(titles[0])
         pname = pname.replace('- Filmography by type', '').strip()
         pid = None
         pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if pidtag:
             pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
         if not (pid and pname):
             self._mobile_logger.error('no direct hit name/personID for'
                                       ' name %s', name)
             return res
         res[:] = [(str(pid[0]), analyze_name(pname, canonical=1))]
     else:
         lis = _findBetween(cont, 'td class="result_text">', '</td>',
                            maxRes=results*3)
         for li in lis:
             # Collect the akas before the cell is truncated below.
             akas = _findBetween(li, '<em>"', '"</em>')
             for sep in ['<small', '<br> aka', '<br> birth name']:
                 sepIdx = li.find(sep)
                 if sepIdx != -1:
                     li = li[:sepIdx]
             pid = re_imdbID.findall(li)
             pname = _unHtml(li)
             if not (pid and pname):
                 self._mobile_logger.debug('no name/personID parsing'
                                           ' %s searching for name %s', li,
                                           name)
                 continue
             resd = analyze_name(pname, canonical=1)
             if akas:
                 resd['akas'] = akas
             res.append((str(pid[0]), resd))
     return res
コード例 #15
0
class DOMBasicPersonParser(DOMBasicMovieParser):
    """Simply get the name of a person and the imdbID.

    The DOMHTMLSearchPersonParser class uses it to return a result for a
    direct match, i.e. when a search on IMDb results in a single person
    and the web server sends the page directly.
    """
    _titleAttrPath = ".//text()"
    _linkPath = "//a[starts-with(@href, '/name/nm')]"

    def _titleFunct(self, x):
        # A false value (e.g. None) is replaced by an empty unicode string
        # before being parsed with canonical=1.
        return analyze_name(x or u'', canonical=1)
コード例 #16
0
 def end_title(self):
     """Finalize the collected title text.

     Clears the in-title flag and strips self._title.  A non-empty title
     is then stored in self._data: for the 'character' kind the
     '(Character)' marker is removed and the rest saved under 'name';
     for any other kind the output of analyze_name() is merged in.
     """
     self._in_title = False
     title = self._title.strip()
     self._title = title
     if not title:
         return
     if self.kind == 'character':
         title = title.replace('(Character)', '').strip()
         self._title = title
         self._data['name'] = title
         return
     self._data.update(analyze_name(title, canonical=1))
コード例 #17
0
 def end_title(self):
     """Handle the end of the title: store what was collected.

     The in-title flag is reset and self._title is stripped.  If some
     text is left, a 'character' gets it (without the '(Character)'
     marker) under the 'name' key of self._data; every other kind gets
     the output of analyze_name() merged into self._data.
     """
     self._in_title = False
     self._title = self._title.strip()
     if not self._title:
         return
     if self.kind != 'character':
         self._data.update(analyze_name(self._title, canonical=1))
         return
     cleaned = self._title.replace('(Character)', '').strip()
     self._title = cleaned
     self._data['name'] = cleaned
コード例 #18
0
 def set_name(self, name):
     """Set the name of the character.

     The name is parsed with analyze_name(canonical=0) and the result is
     merged into self.data.  Any ordinary error raised while doing so is
     ignored, leaving self.data as it was.
     """
     # XXX: convert name to unicode, if it's a plain string?
     try:
         d = analyze_name(name, canonical=0)
         self.data.update(d)
     except Exception:
         # Still best effort, but a bare "except:" would also swallow
         # KeyboardInterrupt and SystemExit.
         # TODO: catch only IMDbPYParserError and issue a warning.
         pass
コード例 #19
0
ファイル: Character.py プロジェクト: Elettronik/SickRage
 def set_name(self, name):
     """Set the name of the character.

     The name is parsed with analyze_name(canonical=0) and the result is
     merged into self.data.  Any ordinary error raised while doing so is
     ignored, leaving self.data as it was.
     """
     # XXX: convert name to unicode, if it's a plain string?
     try:
         d = analyze_name(name, canonical=0)
         self.data.update(d)
     except Exception:
         # Still best effort, but a bare "except:" would also swallow
         # KeyboardInterrupt and SystemExit.
         # TODO: catch only IMDbPYParserError and issue a warning.
         pass
コード例 #20
0
 def do_input(self, attrs):
     """Handle an <input> tag, looking for the hidden 'primary' field.

     Only an input whose type is 'hidden' (case-insensitive), whose name
     is 'primary' and which has a value is used: the value is parsed with
     analyze_name(canonical=0) and merged into self._data.
     """
     input_type = self.get_attr_value(attrs, 'type')
     if input_type is None or input_type.lower() != 'hidden':
         return
     input_name = self.get_attr_value(attrs, 'name')
     if input_name != 'primary':
         return
     primary = self.get_attr_value(attrs, 'value')
     if primary is None:
         return
     # It's hard to catch the correct 'Surname, Name' from the
     # title, so if the "credited alongside another name" form
     # is found, use it.
     parsed = analyze_name(primary, canonical=0)
     self._data.update(parsed)
コード例 #21
0
ファイル: __init__.py プロジェクト: 070499/repo-scripts
 def _readNamesKeyFile(keyFile):
     """Iterate over the given file, returning tuples suited for
     the common.locsql.scan_names function.

     Every line is split on '|'; lines whose first field is empty are
     skipped.  For the others a ``(number, name data)`` tuple is yielded,
     where the number is the second field read as hexadecimal and the
     name data is ``analyze_name(latin2utf(first_field))``.

     Being a generator, the file is opened when the iteration starts;
     IMDbDataAccessError is raised at that point if it cannot be opened.
     """
     try:
         kf = open(keyFile, 'r')
     except IOError as e:
         raise IMDbDataAccessError(str(e))
     try:
         for line in kf:
             ls = line.split('|')
             if not ls[0]:
                 continue
             named = analyze_name(latin2utf(ls[0]))
             yield (long(ls[1], 16), named)
     finally:
         # Close the file even when the consumer stops iterating early or
         # a line cannot be parsed; it used to be closed only after a
         # complete, error-free scan.
         kf.close()
コード例 #22
0
 def _readNamesKeyFile(keyFile):
     """Iterate over the given file, returning tuples suited for
     the common.locsql.scan_names function.

     Every line is split on '|'; lines whose first field is empty are
     skipped.  For the others a ``(number, name data)`` tuple is yielded,
     where the number is the second field read as hexadecimal and the
     name data is ``analyze_name(latin2utf(first_field))``.

     Being a generator, the file is opened when the iteration starts;
     IMDbDataAccessError is raised at that point if it cannot be opened.
     """
     try:
         kf = open(keyFile, 'r')
     except IOError as e:
         raise IMDbDataAccessError(str(e))
     try:
         for line in kf:
             ls = line.split('|')
             if not ls[0]:
                 continue
             named = analyze_name(latin2utf(ls[0]))
             yield (long(ls[1], 16), named)
     finally:
         # Close the file even when the consumer stops iterating early or
         # a line cannot be parsed; it used to be closed only after a
         # complete, error-free scan.
         kf.close()
コード例 #23
0
 def do_input(self, attrs):
     """Pick the person name from the hidden 'primary' <input> field.

     The tag is ignored unless its type is 'hidden' (case-insensitive),
     its name is 'primary' and a value is present; in that case the value
     is parsed with analyze_name(canonical=0) and merged into self._data.
     """
     kind = self.get_attr_value(attrs, 'type')
     if kind is None or kind.lower() != 'hidden':
         return
     field = self.get_attr_value(attrs, 'name')
     if field is None or field != 'primary':
         return
     value = self.get_attr_value(attrs, 'value')
     if value is not None:
         # It's hard to catch the correct 'Surname, Name' from the
         # title, so if the "credited alongside another name" form
         # is found, use it.
         self._data.update(analyze_name(value, canonical=0))
コード例 #24
0
 def start_a(self, attrs):
     """Handle an <a> tag, looking for the link to a character's bio.

     When the lowercased href contains '/character/ch' and ends with
     'bio', an ID is found in it and self._name is set, the parser is
     reset and self._result becomes a one-item list holding
     ``(characterID, analyze_name(name, canonical=0))``.
     """
     link = self.get_attr_value(attrs, 'href')
     if not link:
         return
     link = link.lower()
     if '/character/ch' not in link or not link.endswith('bio'):
         return
     ids = self.re_imdbID.findall(link)
     if not (ids and self._name):
         return
     char_name = self._name.replace('(Character)', '').strip()
     # The last ID found in the link is the one that is used.
     found = [(str(ids[-1]), analyze_name(char_name, canonical=0))]
     # reset() is called before storing the result, not after.
     self.reset()
     self._result = found
コード例 #25
0
ファイル: __init__.py プロジェクト: conwetlab/ezweb-gadgets
 def _search_person(self, name, results):
     """Search for persons matching *name*.

     The page obtained through
     ``self._get_search_content('nm', name, results)`` is inspected: a
     <title> starting with 'imdb name' means a list of matches is parsed,
     any other title is handled as a direct hit.

     Returns a (possibly empty) list of ``(personID, analyze_name(...))``
     tuples.
     """
     page = subXMLRefs(self._get_search_content('nm', name, results))
     titles = _findBetween(page, '<title>', '</title>', maxRes=1)
     found = []
     if not titles:
         return found
     if titles[0].lower().startswith('imdb name'):
         # A list of results: one table cell per person.
         cells = _findBetween(page, 'td valign="top">',
                              ['<small', '</td>', '<br> aka'])
         for cell in cells:
             ids = re_imdbID.findall(cell)
             text = _unHtml(cell)
             if ids and text:
                 found.append((str(ids[0]),
                               analyze_name(text, canonical=1)))
         return found
     # XXX: a direct hit!
     person = _unHtml(titles[0])
     # Easiest way: the board link (for person who already have
     # messages in the board).
     board_tags = _getTagsWith(page, '/board/nest/', maxRes=1)
     ids = None
     if board_tags:
         ids = _findBetween(board_tags[0], '/name/nm', '/', maxRes=1)
     if not (ids and person):
         # Otherwise, the 'credited alongside' for the name,
         # and the biography link for the personID.
         primary = _getTagsWith(page, 'NAME="primary"', maxRes=1)
         if not primary:
             return found
         values = _findBetween(primary[0], 'VALUE="', '"', maxRes=1)
         if not values:
             return found
         person = unquote(values[0])
         ids = _findBetween(page, '/name/nm', ('/', '"', '>'), maxRes=1)
         if not ids:
             return found
     if ids and person:
         found.append((str(ids[0]), analyze_name(person, canonical=1)))
     return found
コード例 #26
0
 def _search_person(self, name, results):
     """Return the persons found searching for *name*.

     Uses the content of
     ``self._get_search_content('nm', name, results)``.  A <title>
     starting with 'imdb name' means a list of matches is parsed; any
     other title is handled as a direct hit.

     The result is a (possibly empty) list of
     ``(personID, analyze_name(...))`` tuples.
     """
     html = subXMLRefs(self._get_search_content('nm', name, results))
     title = _findBetween(html, '<title>', '</title>', maxRes=1)
     matches = []
     if not title:
         return matches
     if title[0].lower().startswith('imdb name'):
         # A list of results: one table cell per person.
         for cell in _findBetween(html, 'td valign="top">',
                                  ['<small', '</td>', '<br']):
             pids = re_imdbID.findall(cell)
             label = _unHtml(cell)
             if not (pids and label):
                 continue
             matches.append((str(pids[0]),
                             analyze_name(label, canonical=1)))
         return matches
     # XXX: a direct hit!
     label = _unHtml(title[0])
     # Easiest way: the board link (for person who already have
     # messages in the board).
     board = _getTagsWith(html, '/board/nest/', maxRes=1)
     pids = None
     if board:
         pids = _findBetween(board[0], '/name/nm', '/', maxRes=1)
     if not (pids and label):
         # Otherwise, the 'credited alongside' for the name,
         # and the biography link for the personID.
         primary = _getTagsWith(html, 'NAME="primary"', maxRes=1)
         if not primary:
             return matches
         primary = _findBetween(primary[0], 'VALUE="', '"', maxRes=1)
         if not primary:
             return matches
         label = unquote(primary[0])
         pids = _findBetween(html, '/name/nm', '/bio', maxRes=1)
         if not pids:
             return matches
     if not (pids and label):
         return matches
     matches[:] = [(str(pids[0]), analyze_name(label, canonical=1))]
     return matches
コード例 #27
0
ファイル: __init__.py プロジェクト: 070499/repo-scripts
 def _search_character(self, name, results):
     """Search the characters key file for *name*.

     The stripped name is normalized and passed to _scan_names() together
     with a variant that has the last word moved to the front (only when
     there are at least two words and the variant differs).  Every item
     returned by _scan_names() is replaced, in place, by its second
     element.

     Returns an empty list for a blank name.
     """
     name = name.strip()
     if not name:
         return []
     primary = normalizeName(analyze_name(name)['name'])
     words = primary.split()
     swapped = u''
     if len(words) > 1:
         candidate = '%s %s' % (words[-1], ' '.join(words[:-1]))
         if candidate != primary:
             swapped = candidate
     key_file = '%scharacters.key' % self.__db
     res = _scan_names(key_file, primary, swapped, u'', results,
                       _scan_character=1)
     # Same list object, stripped down to the second element of each item.
     res[:] = [entry[1] for entry in res]
     return res
コード例 #28
0
 def _search_character(self, name, results):
     """Look up *name* in the characters key file.

     A blank name gives an empty list.  Otherwise the normalized name is
     handed to _scan_names(), along with a second form where the last
     word comes first (left empty when the name is a single word or the
     two forms coincide).  The list from _scan_names() is then reduced,
     in place, to the second element of each item.
     """
     name = name.strip()
     if not name:
         return []
     s_name = normalizeName(analyze_name(name)['name'])
     parts = s_name.split()
     if len(parts) > 1:
         alt_name = '%s %s' % (parts[-1], ' '.join(parts[:-1]))
         if alt_name == s_name:
             alt_name = u''
     else:
         alt_name = u''
     hits = _scan_names('%scharacters.key' % self.__db,
                        s_name, alt_name, u'', results, _scan_character=1)
     hits[:] = [hit[1] for hit in hits]
     return hits
コード例 #29
0
ファイル: __init__.py プロジェクト: conwetlab/ezweb-gadgets
 def _getNameID(self, name):
     """Given a long imdb canonical name, returns a personID or
     None if not found.

     None is also returned when the selection does not contain exactly
     one row, or when counting the rows raises UnicodeDecodeError or
     TypeError.
     """
     nd = analyze_name(name)
     res = Name.select(
         AND(
             Name.q.name == self.toUTF8(nd['name']),
             self._buildNULLCondition(Name.q.imdbIndex,
                                      nd.get('imdbIndex'))))
     try:
         # Count once: the count used to be computed twice, with the first
         # result stored in an unused variable.
         if res.count() != 1:
             return None
     except (UnicodeDecodeError, TypeError):
         return None
     return res[0].id
コード例 #30
0
 def _findRefs(self, o, trefs, nrefs):
     """Find titles or names references in strings.

     *o* can be a string, a list/tuple or a dict; lists, tuples and dict
     values are visited recursively, anything else is ignored.  For every
     match of re_titleRef / re_nameRef that can be resolved to an ID, a
     Movie / Person object is stored in *trefs* / *nrefs* under the
     rebuilt title/name and its alternative spellings.

     Both dictionaries are modified in place; the ``(trefs, nrefs)``
     tuple is returned.
     """
     if isinstance(o, (unicode, str)):
         for title in re_titleRef.findall(o):
             a_title = analyze_title(title, canonical=0)
             rtitle = build_title(a_title, ptdf=1)
             # "in" replaces dict.has_key(), which is gone in Python 3.
             if rtitle in trefs:
                 continue
             # Try the rebuilt title first, then the text as found.
             movieID = self._getTitleID(rtitle)
             if movieID is None:
                 movieID = self._getTitleID(title)
             if movieID is None:
                 continue
             m = Movie(title=rtitle,
                       movieID=movieID,
                       accessSystem=self.accessSystem)
             trefs[rtitle] = m
             rtitle2 = canonicalTitle(a_title.get('title', u''))
             if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                 trefs[rtitle2] = m
             if title != rtitle:
                 trefs[title] = m
         for name in re_nameRef.findall(o):
             a_name = analyze_name(name, canonical=1)
             rname = build_name(a_name, canonical=1)
             if rname in nrefs:
                 continue
             # Try the rebuilt name first, then the text as found.
             personID = self._getNameID(rname)
             if personID is None:
                 personID = self._getNameID(name)
             if personID is None:
                 continue
             p = Person(name=rname,
                        personID=personID,
                        accessSystem=self.accessSystem)
             nrefs[rname] = p
             rname2 = normalizeName(a_name.get('name', u''))
             if rname2 and rname2 != rname:
                 nrefs[rname2] = p
             if name != rname and name != rname2:
                 nrefs[name] = p
     elif isinstance(o, (list, tuple)):
         for item in o:
             self._findRefs(item, trefs, nrefs)
     elif isinstance(o, dict):
         for value in o.values():
             self._findRefs(value, trefs, nrefs)
     return (trefs, nrefs)
コード例 #31
0
 def start_a(self, attrs):
     """Handle an <a> tag, looking for a person's message board link.

     When the lowercased href starts with '/name/nm' and contains
     '/board', an ID is found in it and self._name is set, the parser is
     reset and self._result becomes a one-item list holding
     ``(personID, analyze_name(name, canonical=1))``.  Nothing is done if
     the collected name contains both 'IMDb Name' and 'Search'.
     """
     link = self.get_attr_value(attrs, 'href')
     if not link:
         return
     link = link.lower()
     # XXX: Since July 2004, IMDb has removed the "pageflicker",
     #      so we've to gather the imdbID from the "IMDb message board"
     #      link.
     if not link.startswith('/name/nm') or '/board' not in link:
         return
     ids = self.re_imdbID.findall(link)
     if not (ids and self._name):
         return
     person = self._name.strip()
     if 'IMDb Name' in person and 'Search' in person:
         return
     # The last ID found in the link is the one that is used.
     found = [(str(ids[-1]), analyze_name(person, canonical=1))]
     # reset() is called before storing the result, not after.
     self.reset()
     self._result = found
コード例 #32
0
 def start_a(self, attrs):
     """Look for a person's message board link in an <a> tag.

     The href is lowercased; it must start with '/name/nm' and contain
     '/board'.  If an ID is found in it and self._name is set (and does
     not contain both 'IMDb Name' and 'Search'), the parser is reset and
     self._result is set to ``[(personID, analyze_name(name, canonical=1))]``.
     """
     href = self.get_attr_value(attrs, 'href')
     if not href:
         return
     href = href.lower()
     # XXX: Since July 2004, IMDb has removed the "pageflicker",
     #      so we've to gather the imdbID from the "IMDb message board"
     #      link.
     is_board_link = href.startswith('/name/nm') and '/board' in href
     if not is_board_link:
         return
     matches = self.re_imdbID.findall(href)
     if matches and self._name:
         stripped = self._name.strip()
         if stripped.find('IMDb Name') != -1 and stripped.find('Search') != -1:
             return
         person_id = str(matches[-1])
         name_data = analyze_name(stripped, canonical=1)
         # reset() is called before storing the result, not after.
         self.reset()
         self._result = [(person_id, name_data)]
コード例 #33
0
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parser for the page listing the results of a name search."""

    rules = [
        Rule(key='data',
             extractor=Rules(
                 # One result per table cell of class "result_text".
                 foreach='//td[@class="result_text"]',
                 rules=[
                     Rule(key='link',
                          extractor=Path('./a/@href', reduce=reducers.first)),
                     Rule(key='name', extractor=Path('./a/text()')),
                     # Text found directly inside the cell; it is appended
                     # to the name before the name is analyzed.
                     Rule(key='index', extractor=Path('./text()')),
                     Rule(key='akas',
                          extractor=Path(foreach='./i', path='./text()')),
                     # Image source read from the "primary_photo" cell
                     # reached through the parent of the current cell.
                     Rule(key='headshot',
                          extractor=Path(
                              '../td[@class="primary_photo"]/a/img/@src'))
                 ],
                 # Every result becomes the tuple
                 # (analyze_imdbid(link), analyze_name(name + index),
                 #  akas, headshot).
                 transform=lambda row: (
                     analyze_imdbid(row.get('link')),
                     analyze_name(row.get('name', '') + row.get('index', ''),
                                  canonical=1),
                     row.get('akas'),
                     row.get('headshot'))))
    ]

    def _init(self):
        """Run the parent ``_init`` and set the image type to 'headshot'."""
        super(DOMHTMLSearchPersonParser, self)._init()
        self.img_type = 'headshot'
コード例 #34
0
ファイル: locsql.py プロジェクト: conwetlab/ezweb-gadgets
 def _findRefs(self, o, trefs, nrefs):
     """Find titles or names references in strings.

     *o* may be a string, a list/tuple or a dictionary; containers are
     walked recursively (for dictionaries only the values are visited).
     Every reference found is stored in *trefs* (titles -> Movie) or
     *nrefs* (names -> Person); both dictionaries are modified in place
     and returned as the tuple ``(trefs, nrefs)``.
     """
     if isinstance(o, (UnicodeType, StringType)):
         for title in re_titleRef.findall(o):
             a_title = analyze_title(title, canonical=1)
             rtitle = build_title(a_title, canonical=1, ptdf=1)
             # ``in`` replaces dict.has_key(), which Python 3 removed.
             if rtitle in trefs: continue
             # Try the rebuilt title first, then the text as it was found.
             movieID = self._getTitleID(rtitle)
             if movieID is None:
                 movieID = self._getTitleID(title)
             if movieID is None:
                 continue
             m = Movie(title=rtitle, movieID=movieID,
                         accessSystem=self.accessSystem)
             # Register the same object under every spelling of the title.
             trefs[rtitle] = m
             rtitle2 = canonicalTitle(a_title.get('title', u''))
             if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                 trefs[rtitle2] = m
             if title != rtitle:
                 trefs[title] = m
         for name in re_nameRef.findall(o):
             a_name = analyze_name(name, canonical=1)
             rname = build_name(a_name, canonical=1)
             if rname in nrefs: continue
             # Try the rebuilt name first, then the text as it was found.
             personID = self._getNameID(rname)
             if personID is None:
                 personID = self._getNameID(name)
             if personID is None: continue
             p = Person(name=rname, personID=personID,
                         accessSystem=self.accessSystem)
             # Register the same object under every spelling of the name.
             nrefs[rname] = p
             rname2 = normalizeName(a_name.get('name', u''))
             if rname2 and rname2 != rname:
                 nrefs[rname2] = p
             if name != rname and name != rname2:
                 nrefs[name] = p
     elif isinstance(o, (ListType, TupleType)):
         for item in o:
             self._findRefs(item, trefs, nrefs)
     elif isinstance(o, DictType):
         for value in o.values():
             self._findRefs(value, trefs, nrefs)
     return (trefs, nrefs)
コード例 #35
0
ファイル: __init__.py プロジェクト: conwetlab/ezweb-gadgets
 def _search_character(self, name, results):
     """Query the CharName table for characters phonetically close to *name*.

     Returns an empty list when *name* is blank or yields no usable name.
     Raises IMDbDataAccessError when the query raises NotFoundError.
     """
     name = name.strip()
     if not name: return []
     s_name = analyze_name(name)['name']
     if not s_name: return []
     if isinstance(s_name, UnicodeType):
         s_name = s_name.encode('ascii', 'ignore')
     s_name = normalizeName(s_name)
     soundexCode = soundex(s_name)
     surname = s_name.split(' ')[-1]
     surnameSoundex = soundex(surname)
     # name2 is the name with its last word moved to the front; it is
     # dropped when that makes no difference.
     name2 = ''
     soundexName2 = None
     nsplit = s_name.split()
     if len(nsplit) > 1:
         name2 = '%s %s' % (nsplit[-1], ' '.join(nsplit[:-1]))
         if s_name == name2:
             name2 = ''
         else:
             soundexName2 = soundex(name2)
     # If the soundex is None, compare only with the first
     # phoneticCode column.
     if soundexCode is not None:
         if soundexName2 is not None:
             condition = OR(
                 surnameSoundex == CharName.q.surnamePcode,
                 IN(CharName.q.namePcodeNf, [soundexCode, soundexName2]),
                 IN(CharName.q.surnamePcode, [soundexCode, soundexName2]))
         else:
             condition = OR(
                 surnameSoundex == CharName.q.surnamePcode,
                 IN(soundexCode,
                    [CharName.q.namePcodeNf, CharName.q.surnamePcode]))
     else:
         # The query below selects from CharName, so the condition must
         # refer to a CharName column as well (it used to refer to Name).
         condition = ISNULL(CharName.q.namePcodeNf)
     try:
         qr = [(q.id, {
             'name': q.name,
             'imdbIndex': q.imdbIndex
         }) for q in CharName.select(condition)]
     except NotFoundError as e:
         raise IMDbDataAccessError(
                 'unable to search the database: "%s"' % str(e))
コード例 #36
0
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parser for the page listing the results of a name search."""

    rules = [
        Rule(
            key='data',
            extractor=Rules(
                # One result per table cell of class "result_text".
                foreach='//td[@class="result_text"]',
                rules=[
                    Rule(
                        key='link',
                        extractor=Path('./a/@href', reduce=reducers.first)
                    ),
                    Rule(
                        key='name',
                        extractor=Path('./a/text()')
                    ),
                    # Text found directly inside the cell; it is appended
                    # to the name before the name is analyzed.
                    Rule(
                        key='index',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='akas',
                        extractor=Path(foreach='./i', path='./text()')
                    )
                ],
                # Every result becomes the tuple
                # (analyze_imdbid(link), analyze_name(name + index), akas).
                transform=lambda row: (
                    analyze_imdbid(row.get('link')),
                    analyze_name(
                        row.get('name', '') + row.get('index', ''),
                        canonical=1
                    ),
                    row.get('akas')
                )
            )
        )
    ]
コード例 #37
0
 def get_character_main(self, characterID, results=1000):
     """Collect the main information about a character.

     Returns a dictionary with the parsed name (plus the filmography,
     when there is one) under 'data' and the value returned by
     ``get_character_infoset()`` under 'info sets'.  *results* is passed
     to getCharacterFilmography as its ``limit``.

     Raises IMDbDataAccessError when no name is found for *characterID*.
     """
     infosets = self.get_character_infoset()
     # Paths of the data files, all built from the same prefix.
     chars_index = '%scharacters.index' % self.__db
     chars_data = '%scharacters.data' % self.__db
     name = getCharacterName(characterID, chars_index, chars_data)
     if not name:
         # Call form of ``raise``: valid on both Python 2 and Python 3.
         raise IMDbDataAccessError(
             'unable to get characterID "%s"' % characterID)
     res = analyze_name(name, canonical=1)
     filmography = getCharacterFilmography(characterID,
                                           chars_index,
                                           chars_data,
                                           '%stitles.index' % self.__db,
                                           '%stitles.key' % self.__db,
                                           '%snames.index' % self.__db,
                                           '%snames.key' % self.__db,
                                           limit=results)
     if filmography:
         filmography = merge_roles(filmography)
         filmography.sort()
         res['filmography'] = filmography
     return {'data': res, 'info sets': infosets}
コード例 #38
0
ファイル: __init__.py プロジェクト: 070499/repo-scripts
 def get_character_main(self, characterID, results=1000):
     """Collect the main information about a character.

     Returns a dictionary with the parsed name (plus the filmography,
     when there is one) under 'data' and the value returned by
     ``get_character_infoset()`` under 'info sets'.  *results* is passed
     to getCharacterFilmography as its ``limit``.

     Raises IMDbDataAccessError when no name is found for *characterID*.
     """
     infosets = self.get_character_infoset()
     # Paths of the data files, all built from the same prefix.
     chars_index = '%scharacters.index' % self.__db
     chars_data = '%scharacters.data' % self.__db
     name = getCharacterName(characterID, chars_index, chars_data)
     if not name:
         # Call form of ``raise``: valid on both Python 2 and Python 3.
         raise IMDbDataAccessError(
             'unable to get characterID "%s"' % characterID)
     res = analyze_name(name, canonical=1)
     filmography = getCharacterFilmography(characterID,
                                           chars_index,
                                           chars_data,
                                           '%stitles.index' % self.__db,
                                           '%stitles.key' % self.__db,
                                           '%snames.index' % self.__db,
                                           '%snames.key' % self.__db,
                                           limit=results)
     if filmography:
         filmography = merge_roles(filmography)
         filmography.sort()
         res['filmography'] = filmography
     return {'data': res, 'info sets': infosets}
コード例 #39
0
def nameVariations(name, fromPtdf=0):
    """Build name variations useful for searches; if fromPtdf is true,
    the input is assumed to be in the plain text data files format.

    Returns the tuple ``(name1, name2, name3)``:

    * name1 -- the name in the canonical format;
    * name2 -- the name in the normal format, or an empty string when
      it is equal to name1;
    * name3 -- the canonical name with the imdbIndex, or an empty
      string when it does not apply.
    """
    name1 = name2 = name3 = u''
    if fromPtdf or re_nameIndex.search(name):
        # We've a name with an (imdbIndex)
        namedict = analyze_name(name, canonical=1)
        name1 = namedict['name']
        if fromPtdf:
            # ``in`` replaces dict.has_key(), which Python 3 removed.
            if 'imdbIndex' in namedict:
                name3 = name
        else:
            name3 = build_name(namedict, canonical=1)
    else:
        # No index: name3 keeps its initial empty value.
        name1 = canonicalName(name)
    name2 = normalizeName(name1)
    if name1 == name2: name2 = u''
    return name1, name2, name3
コード例 #40
0
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons."""
    _linkPrefix = '/name/nm'

    rules = [
        Rule(key='data',
             extractor=Rules(
                 # Select the "result_text" cells through their links to
                 # a name page, then step back up to the cell itself.
                 foreach='//td[@class="result_text"]'
                         '/a[starts-with(@href, "/name/nm")]/..',
                 rules=[
                     Rule(key='link', extractor=Path('./a[1]/@href')),
                     Rule(key='name', extractor=Path('./a[1]/text()')),
                     # First text node of the cell; it is appended to the
                     # name before the name is analyzed.
                     Rule(key='index', extractor=Path('./text()[1]')),
                     Rule(key='akas',
                          extractor=Path(
                              './/div[@class="_imdbpyAKA"]/text()'))
                 ],
                 # Missing values are replaced by empty strings before
                 # being handed to the analyze_* helpers.
                 transform=lambda row: (
                     analyze_imdbid(row.get('link') or ''),
                     analyze_name(
                         (row.get('name') or '') + (row.get('index') or ''),
                         canonical=1),
                     row.get('akas'))))
    ]
コード例 #41
0
ファイル: locsql.py プロジェクト: conwetlab/ezweb-gadgets
def nameVariations(name, fromPtdf=0):
    """Build name variations useful for searches; if fromPtdf is true,
    the input is assumed to be in the plain text data files format.

    Returns the tuple ``(name1, name2, name3)``:

    * name1 -- the name in the canonical format;
    * name2 -- the name in the normal format, or an empty string when
      it is equal to name1;
    * name3 -- the canonical name with the imdbIndex, or an empty
      string when it does not apply.
    """
    name1 = name2 = name3 = u''
    if fromPtdf or re_nameIndex.search(name):
        # We've a name with an (imdbIndex)
        namedict = analyze_name(name, canonical=1)
        name1 = namedict['name']
        if fromPtdf:
            # ``in`` replaces dict.has_key(), which Python 3 removed.
            if 'imdbIndex' in namedict:
                name3 = name
        else:
            name3 = build_name(namedict, canonical=1)
    else:
        # No index: name3 keeps its initial empty value.
        name1 = canonicalName(name)
    name2 = normalizeName(name1)
    if name1 == name2: name2 = u''
    return name1, name2, name3
コード例 #42
0
ファイル: __init__.py プロジェクト: conwetlab/ezweb-gadgets
    def _search_person(self, name, results):
        """Query the Name and AkaName tables for persons phonetically
        close to *name*.

        Returns an empty list when *name* is blank or yields no usable
        name.  Raises IMDbDataAccessError when a query raises
        NotFoundError.
        """
        name = name.strip()
        if not name: return []
        s_name = analyze_name(name)['name']
        if not s_name: return []
        if isinstance(s_name, UnicodeType):
            s_name = s_name.encode('ascii', 'ignore')
        soundexCode = soundex(s_name)
        name1, name2, name3 = nameVariations(name)

        # If the soundex is None, compare only with the first
        # phoneticCode column.
        if soundexCode is not None:
            condition = IN(
                soundexCode,
                [Name.q.namePcodeCf, Name.q.namePcodeNf, Name.q.surnamePcode])
            conditionAka = IN(soundexCode, [
                AkaName.q.namePcodeCf, AkaName.q.namePcodeNf,
                AkaName.q.surnamePcode
            ])
        else:
            condition = ISNULL(Name.q.namePcodeCf)
            conditionAka = ISNULL(AkaName.q.namePcodeCf)

        try:
            # Rows from Name are keyed by their own id, rows from AkaName
            # by the personID they carry.
            qr = [(q.id, {
                'name': q.name,
                'imdbIndex': q.imdbIndex
            }) for q in Name.select(condition)]
            qr += [(q.personID, {
                'name': q.name,
                'imdbIndex': q.imdbIndex
            }) for q in AkaName.select(conditionAka)]
        except NotFoundError as e:
            # Call form of ``raise``: valid on both Python 2 and Python 3.
            raise IMDbDataAccessError(
                    'unable to search the database: "%s"' % str(e))
コード例 #43
0
 def _search_person(self, name, results):
     """Search the names key file for *name*.

     Returns a list of ``(personID, name data)`` tuples with no
     duplicated personID.  A match whose personID differs from the one
     returned by ``_get_real_personID`` is reported under that real
     personID, with the matched name listed under 'akas' when it differs
     from the real name.  When *results* is greater than zero the list
     is cut to that many items.
     """
     name = name.strip()
     if not name: return []
     name1, name2, name3 = nameVariations(name)
     res =  _scan_names('%snames.key' % self.__db,
                         name1, name2, name3, results)
     # Keep only the second item of every scanned entry.
     res[:] = [x[1] for x in res]
     new_res = []
     # A set gives constant-time membership tests.
     # NOTE(review): assumes person IDs are hashable (ints/strings).
     seen_PID = set()
     for personID, r in res:
         # Remove duplicates.
         # XXX: find a way to prefer names with an AKA?  Or prefer
         #      the original name?
         if personID in seen_PID:
             continue
         seen_PID.add(personID)
         realPID = self._get_real_personID(personID)
         if personID == realPID:
             new_res.append((personID, r))
             continue
         if realPID in seen_PID:
             continue
         seen_PID.add(realPID)
         aka_name = build_name(r, canonical=1)
         real_name = getLabel(realPID, '%snames.index' % self.__db,
                             '%snames.key' % self.__db)
         if aka_name == real_name:
             new_res.append((realPID, r))
             continue
         # Report the real name, keeping the matched one as an AKA.
         new_r = analyze_name(real_name, canonical=1)
         new_r['akas'] = [aka_name]
         new_res.append((realPID, new_r))
     if results > 0: new_res[:] = new_res[:results]
     return new_res
コード例 #44
0
ファイル: personParser.py プロジェクト: 070499/repo-scripts
def _parseBiography(biol):
    """Parse the biographies.data file.

    *biol* is a sequence of lines, each starting with a two-letter code
    followed by ': '.  Returns a dictionary with the collected
    information; keys are present only for the values that were found.
    """
    res = {}
    # The mini biography comes from _parseBioBy alone; the 'BG' list that
    # used to be joined here was overwritten before ever being read.
    bio = _parseBioBy(biol)
    if bio: res['mini biography'] = bio

    for x in biol:
        # Prefixes are either 4 characters ('XX: ') or 6 ('XX: * ').
        x4 = x[:4]
        x6 = x[:6]
        if x4 == 'DB: ':
            date, notes = date_and_notes(x[4:])
            if date:
                res['birth date'] = date
            if notes:
                res['birth notes'] = notes
        elif x4 == 'DD: ':
            date, notes = date_and_notes(x[4:])
            if date:
                res['death date'] = date
            if notes:
                res['death notes'] = notes
        elif x6 == 'SP: * ':
            res.setdefault('spouse', []).append(x[6:].strip())
        elif x4 == 'RN: ':
            n = x[4:].strip()
            if not n: continue
            rn = build_name(analyze_name(n, canonical=1), canonical=1)
            res['birth name'] = rn
        elif x6 == 'AT: * ':
            res.setdefault('articles', []).append(x[6:].strip())
        elif x4 == 'HT: ':
            res['height'] = x[4:].strip()
        elif x6 == 'PT: * ':
            res.setdefault('pictorials', []).append(x[6:].strip())
        elif x6 == 'CV: * ':
            res.setdefault('magazine covers', []).append(x[6:].strip())
        elif x4 == 'NK: ':
            res.setdefault('nick names', []).append(normalizeName(x[4:]))
        elif x6 == 'PI: * ':
            res.setdefault('portrayed', []).append(x[6:].strip())
        elif x6 == 'SA: * ':
            sal = x[6:].strip().replace(' -> ', '::')
            res.setdefault('salary history', []).append(sal)

    # Sections gathered with _parseList, one call per code.
    trl = _parseList(biol, 'TR')
    if trl: res['trivia'] = trl
    quotes = _parseList(biol, 'QU')
    if quotes: res['quotes'] = quotes
    otherworks = _parseList(biol, 'OW')
    if otherworks: res['other works'] = otherworks
    books = _parseList(biol, 'BO')
    if books: res['books'] = books
    agent = _parseList(biol, 'AG')
    if agent: res['agent address'] = agent
    wherenow = _parseList(biol, 'WN')
    # Only the first 'WN' entry is kept.
    if wherenow: res['where now'] = wherenow[0]
    biomovies = _parseList(biol, 'BT')
    if biomovies: res['biographical movies'] = biomovies
    guestapp = _buildGuests([x[6:].strip() for x in biol if x[:6] == 'GA: * '])
    if guestapp: res['notable tv guest appearances'] = guestapp
    tm = _parseList(biol, 'TM')
    if tm: res['trademarks'] = tm
    interv = _parseList(biol, 'IT')
    if interv: res['interviews'] = interv
    return res
コード例 #45
0
ファイル: Person.py プロジェクト: alberanid/imdbpy
 def set_name(self, name):
     """Analyze *name* and merge the resulting fields into ``self.data``."""
     self.data.update(analyze_name(name, canonical=True))
コード例 #46
0
 def set_name(self, name):
     """Analyze *name* and merge the resulting fields into ``self.data``."""
     self.data.update(analyze_name(name, canonical=True))
コード例 #47
0
ファイル: Person.py プロジェクト: Black0wL/webtechproject
 def set_name(self, name):
     """Analyze *name* and merge the resulting fields into ``self.data``."""
     # XXX: convert name to unicode, if it's a plain string?
     self.data.update(analyze_name(name, canonical=1))
コード例 #48
0
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parse the "categorized" (maindetails) page of a person.

    The page has to be passed in as a string, as served by
    www.imdb.com; the outcome is a dictionary holding one key for
    every relevant section.

    Example::

        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True
    # An imdbIndex in round brackets, e.g. "(II)".
    _name_imdb_index = re.compile(r'\([IVXLCDM]+\)')

    _birth_rules = [
        Rule(
            key='birth date',
            extractor=Path('.//time[@itemprop="birthDate"]/@datetime')
        ),
        Rule(
            key='birth place',
            extractor=Path(
                './/a[starts-with(@href, "/search/name?birth_place=")]/text()'
            )
        )
    ]

    _death_rules = [
        Rule(
            key='death date',
            extractor=Path('.//time[@itemprop="deathDate"]/@datetime')
        ),
        Rule(
            key='death place',
            extractor=Path(
                './/a[starts-with(@href, "/search/name?death_place=")]/text()'
            )
        )
    ]

    # Rules applied to every row of a filmography section.
    _film_rules = [
        Rule(
            key='link',
            extractor=Path('./b/a[1]/@href')
        ),
        Rule(
            key='title',
            extractor=Path('./b/a[1]/text()')
        ),
        Rule(
            key='notes',
            extractor=Path('./b/following-sibling::text()')
        ),
        Rule(
            key='year',
            extractor=Path('./span[@class="year_column"]/text()')
        ),
        Rule(
            key='status',
            extractor=Path('./a[@class="in_production"]/text()')
        ),
        Rule(
            key='rolesNoChar',
            extractor=Path('.//br/following-sibling::text()')
        ),
        Rule(
            key='chrRoles',
            extractor=Path('./a[@imdbpyname]/@imdbpyname')
        )
    ]

    rules = [
        Rule(
            key='name',
            extractor=Path(
                '//h1[@class="header"]//text()',
                transform=lambda text: analyze_name(text, canonical=1)
            )
        ),
        Rule(
            key='name_index',
            extractor=Path('//h1[@class="header"]/span[1]/text()')
        ),
        Rule(
            key='birth info',
            extractor=Rules(section='//div[h4="Born:"]', rules=_birth_rules)
        ),
        Rule(
            key='death info',
            extractor=Rules(section='//div[h4="Died:"]', rules=_death_rules)
        ),
        Rule(
            key='headshot',
            extractor=Path(
                '//td[@id="img_primary"]/div[@class="image"]/a/img/@src'
            )
        ),
        Rule(
            key='akas',
            extractor=Path(
                '//div[h4="Alternate Names:"]/text()',
                transform=lambda text: text.strip().split('  ')
            )
        ),
        Rule(
            key='filmography',
            extractor=Rules(
                foreach='//div[starts-with(@id, "filmo-head-")]',
                rules=[
                    Rule(
                        # The key is the lower-cased heading of the
                        # section, with ': ' turned into a space.
                        key=Path(
                            './a[@name]/text()',
                            transform=lambda text:
                                text.lower().replace(': ', ' ')
                        ),
                        extractor=Rules(
                            foreach='./following-sibling::div[1]'
                                    '/div[starts-with(@class, "filmo-row")]',
                            rules=_film_rules,
                            transform=lambda row: build_movie(
                                row.get('title') or '',
                                year=row.get('year'),
                                movieID=analyze_imdbid(row.get('link') or ''),
                                rolesNoChar=(
                                    row.get('rolesNoChar') or ''
                                ).strip(),
                                chrRoles=(row.get('chrRoles') or '').strip(),
                                additionalNotes=row.get('notes'),
                                status=row.get('status') or None
                            )
                        )
                    )
                ]
            )
        ),
        Rule(
            key='in development',
            extractor=Rules(
                foreach='//div[starts-with(@class,"devitem")]',
                rules=[
                    Rule(key='link', extractor=Path('./a/@href')),
                    Rule(key='title', extractor=Path('./a/text()'))
                ],
                transform=lambda row: build_movie(
                    row.get('title') or '',
                    movieID=analyze_imdbid(row.get('link') or ''),
                    roleID=(row.get('roleID') or '').split('/'),
                    status=row.get('status') or None
                )
            )
        )
    ]

    preprocessors = [('<div class="clear"/> </div>', ''), ('<br/>', '<br />')]

    def postprocess_data(self, data):
        """Tidy up the dictionary built by the rules and return it.

        *data* is modified in place: the dictionary stored under 'name'
        is merged into the top level, empty birth/death dates are
        dropped, 'name_index' is turned into 'imdbIndex' and some keys
        are merged or renamed.
        """
        # Flatten the dictionary produced for the 'name' key.
        name_info = data.get('name')
        if isinstance(name_info, dict):
            del data['name']
            data.update(name_info)
        # Get rid of the dates that turned out to be empty.
        for date_key in ('birth date', 'death date'):
            if date_key in data and not data[date_key]:
                del data[date_key]
        # 'name_index' holds the index in brackets: store it without them.
        name_index = (data.get('name_index') or '').strip()
        if name_index:
            if self._name_imdb_index.match(name_index):
                data['imdbIndex'] = name_index[1:-1]
            del data['name_index']
        # XXX: the code below is for backwards compatibility
        # probably could be removed
        renamed = {'birth place': 'birth notes', 'death place': 'death notes'}
        for key in list(data):
            if key in renamed:
                data[renamed[key]] = data[key]
                del data[key]
                continue
            # Keys such as "actor <something>" are merged into "actor".
            for role in ('actor', 'actress', 'self'):
                if key.startswith(role + ' '):
                    merged = data.setdefault(role, [])
                    merged.extend(data[key])
                    del data[key]
                    break
        return data
コード例 #49
0
ファイル: __init__.py プロジェクト: 070499/repo-scripts
        # latin_1 encoded strings.
        name1, name2, name3 = [x.encode('latin_1', 'replace')
                                for x in name1, name2, name3]
        try:
            sn = search_name(keyFile, name1, name2, name3, results,
                    _scan_character)
        except IOError, e:
            if _scan_character:
                import warnings
                warnings.warn('Unable to access characters information: %s' % e)
                return []
            else:
                raise
        res = []
        for x in sn:
            tmpd = analyze_name(latin2utf(x[2]))
            res.append((x[0], (x[1], tmpd)))
        return res
except ImportError:
    import warnings
    warnings.warn('Unable to import the cutils.search_name function.'
                    '  Searching names using the "local" data access system'
                    ' will be REALLY slow.')

    from imdb.parser.common.locsql import scan_names

    def _readNamesKeyFile(keyFile):
        """Iterate over the given file, returning tuples suited for
        the common.locsql.scan_names function."""
        try: kf = open(keyFile, 'r')
        except IOError, e: raise IMDbDataAccessError, str(e)
コード例 #50
0
ファイル: __init__.py プロジェクト: conwetlab/ezweb-gadgets
 def get_person_main(self, personID, _parseChr=False):
     """Scrape the main page of a person or, if _parseChr is true, of a
     character.

     Returns a dictionary with the collected values under 'data' and the
     tuple ('main', 'filmography') under 'info sets'.  'data' always has
     an 'image_url' key, which is an empty string when no image is found.

     Raises IMDbDataAccessError when the page has no <title>.
     """
     if not _parseChr:
         url = imdbURL_person_main % personID + 'maindetails'
     else:
         url = imdbURL_character_main % personID
     s = self._mretrieve(url)
     name = _findBetween(s, '<title>', '</title>', maxRes=1)
     if not name:
         if _parseChr: w = 'characterID'
         else: w = 'personID'
         # Call form of ``raise``: valid on both Python 2 and Python 3.
         raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
     name = _unHtml(name[0])
     if _parseChr:
         name = name.replace('(Character)', '').strip()
     r = analyze_name(name, canonical=not _parseChr)
     for dKind in ('birth', 'death'):
         date = _findBetween(s, '<h5>Date of %s:</h5>' % dKind.capitalize(),
                             ('<a class', '</div>', '<br/><br/>'), maxRes=1)
         if date:
             date = _unHtml(date[0])
             if date:
                 date, notes = date_and_notes(date)
                 if date:
                     r['%s date' % dKind] = date
                 if notes:
                     r['%s notes' % dKind] = notes
     akas = _findBetween(s, 'Alternate Names:</h5>', ('</div>',
                         '<br/><br/>'), maxRes=1)
     if akas:
         akas = akas[0]
         # The names are separated by either ' | ' or ' / '.
         if akas.find(' | ') != -1:
             akas = _unHtml(akas).split(' | ')
         else:
             akas = _unHtml(akas).split(' / ')
         if akas: r['akas'] = akas
     hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
     if hs:
         hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
         if hs: r['headshot'] = hs[0]
     # Build a list of tuples such [('hrefLink', 'section name')]
     workkind = _findBetween(s, '<div class="strip jump">', '</div>',
                             maxRes=1)
     if workkind:
         workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
     else:
         # Assume there's only one section and/or there are no
         # section links, for some reason.
         workkind[:] = _findBetween(s, '<h5><a name=', '</a></h5>')
         workkind[:] = [x.lstrip('"').rstrip(':').lower() for x in workkind]
     ws = []
     for work in workkind:
         wsplit = work.split('">', 1)
         if len(wsplit) == 2:
             ws.append((wsplit[0], wsplit[1].lower()))
     # XXX: I think "guest appearances" are gone.
     if s.find('<a href="#guest-appearances"') != -1:
         ws.append(('guest-appearances', 'notable tv guest appearances'))
     if _parseChr:
         ws.append(('filmography', 'filmography'))
     for sect, sectName in ws:
         raws = u''
         # Everything between the current section link and the end
         # of the <ol> tag.
         if _parseChr and sect == 'filmography':
             inisect = s.find('<div class="filmo">')
         else:
             inisect = s.find('<a name="%s' % sect)
         if inisect != -1:
             endsect = s[inisect:].find('</ol>')
             if endsect != -1: raws = s[inisect:inisect+endsect]
         if not raws: continue
         mlist = _findBetween(raws, '<li>', ('</li>', '<br>', '<br/>'))
         for m in mlist:
             # For every movie in the current section.
             movieID = re_imdbID.findall(m)
             if not movieID: continue
             if not _parseChr:
                 chrIndx = m.find(' .... ')
             else:
                 chrIndx = m.find(' Played by ')
             chids = []
             if chrIndx != -1:
                 # ' .... ' is 6 characters long; ' Played by ' is 11,
                 # hence the 5 more characters skipped for characters.
                 chrtxt = m[chrIndx+6:]
                 if _parseChr:
                     chrtxt = chrtxt[5:]
                 for ch in chrtxt.split(' / '):
                     chid = re_imdbID.findall(ch)
                     if not chid:
                         chids.append(None)
                     else:
                         chids.append(chid[-1])
             # No IDs -> None; a single ID -> the ID itself, not a list.
             if not chids:
                 chids = None
             elif len(chids) == 1:
                 chids = chids[0]
             movieID = str(movieID[0])
             # Search the status.
             stidx = m.find('<i>')
             status = u''
             if stidx != -1:
                 stendidx = m.rfind('</i>')
                 if stendidx != -1:
                     status = _unHtml(m[stidx+3:stendidx])
                     m = m.replace(m[stidx+3:stendidx], '')
             m = _unHtml(m)
             if not m: continue
             movie = build_movie(m, movieID=movieID, status=status,
                                 roleID=chids, modFunct=self._defModFunct,
                                 accessSystem=self.accessSystem,
                                 _parsingCharacter=_parseChr)
             r.setdefault(sectName, []).append(movie)
     # If available, take the always correct name from a form.
     itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
     if not itag:
         itag = _getTagsWith(s, 'name="primary"', maxRes=1)
     if itag:
         vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
         if not vtag:
             vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
         if vtag:
             try:
                 vtag = unquote(str(vtag[0]))
                 vtag = unicode(vtag, 'latin_1')
                 r.update(analyze_name(vtag, canonical=0))
             except UnicodeEncodeError:
                 # Best effort: keep the name taken from the <title>.
                 pass

     photo = _findBetween(s, '<div class="photo">', '</div>', maxRes=1)
     image_url = ''
     if photo:
         img = _findBetween(photo[0], '<img', '/a>', maxRes=1)
         if img:
             # An <img> with no src attribute must not raise IndexError:
             # the URL is simply left empty.
             src = _findBetween(img[0], ' src="', '"', maxRes=1)
             if src:
                 image_url = src[0]
     r['image_url'] = image_url

     return {'data': r, 'info sets': ('main', 'filmography')}
コード例 #51
0
ファイル: __init__.py プロジェクト: Elettronik/SickRage
 def get_person_main(self, personID, _parseChr=False):
     """Retrieve and parse the main page of a person (or of a character).

     The page is downloaded with ``self._mretrieve`` and scraped with
     plain substring searches (``_findBetween`` / ``_getTagsWith``).

     personID -- value interpolated into the ``person_main`` URL template
                 (or into ``character_main`` when *_parseChr* is true).
     _parseChr -- when true, the page is parsed as a character page: the
                  name is not put in canonical form, and the role text is
                  searched after ' Played by ' instead of after ' .... '.

     Returns a dictionary with two keys: 'data', holding the parsed
     information, and 'info sets', always ('main', 'filmography').
     'data' starts as the result of ``analyze_name`` and may gain
     'birth date', 'birth notes', 'death date', 'death notes', 'akas',
     'headshot' and one list of ``build_movie`` results per filmography
     section found in the page.

     Raises IMDbDataAccessError if the page has no <title> tag.
     """
     if not _parseChr:
         url = self.urls['person_main'] % personID + 'maindetails'
     else:
         url = self.urls['character_main'] % personID
     s = self._mretrieve(url)
     name = _findBetween(s, '<title>', '</title>', maxRes=1)
     if not name:
         if _parseChr: w = 'characterID'
         else: w = 'personID'
         raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
     name = _unHtml(name[0].replace(' - IMDb', ''))
     if _parseChr:
         name = name.replace('(Character)', '').strip()
         name = name.replace('- Filmography by type', '').strip()
     else:
         name = name.replace('- Filmography by', '').strip()
     r = analyze_name(name, canonical=not _parseChr)
     # Birth and death information: the text before the first ' in ' is
     # stored as the date, whatever follows it as the notes.
     for dKind, dtitle in (('Born', 'birth'), ('Died', 'death')):
         date = _findBetween(s, '%s:</h4>' % dKind,
                             ('<div class', '</div>', '<br/><br/>'), maxRes=1)
         if not date:
             continue
         date = _unHtml(date[0])
         if not date:
             continue
         #date, notes = date_and_notes(date)
         # TODO: fix to handle real names.
         date_notes = date.split(' in ', 1)
         date = date_notes[0]
         notes = u''
         if len(date_notes) == 2:
             notes = date_notes[1]
         if date:
             r['%s date' % dtitle] = date
         if notes:
             r['%s notes' % dtitle] = notes
     akas = _findBetween(s, 'Alternate Names:</h4>', ('</div>',
                         '<br/><br/>'), maxRes=1)
     if akas:
         akas = akas[0]
         if akas:
             akas = _unHtml(akas)
         # Alternate names are separated either by ' | ' or by ' / '.
         if akas.find(' | ') != -1:
             akas = akas.split(' | ')
         else:
             akas = akas.split(' / ')
         # str.split never returns an empty list, so drop the blank
         # entries *before* testing; otherwise an empty 'akas' list
         # would be stored when the field contains no real name.
         akas = [x for x in [a.strip() for a in akas] if x]
         if akas: r['akas'] = akas
     # Headshot: try the image_src link (single or double quoted), then
     # the "headshot" anchor.
     hs = _findBetween(s, "rel='image_src'", '>', maxRes=1)
     if not hs:
         hs = _findBetween(s, 'rel="image_src"', '>', maxRes=1)
     if not hs:
         hs = _findBetween(s, '<a name="headshot"', '</a>', maxRes=1)
     if hs:
         hsl = _findBetween(hs[0], "href='", "'", maxRes=1)
         if not hsl:
             hsl = _findBetween(hs[0], 'href="', '"', maxRes=1)
         # The generic site logo is not a real headshot.
         if hsl and 'imdb-share-logo' not in hsl[0]:
             r['headshot'] = hsl[0]
     # Build a list of tuples such [('hrefLink', 'section name')]
     workkind = _findBetween(s, 'id="jumpto_', '</a>')
     ws = []
     for work in workkind:
         sep = '" >'
         if '">' in work:
             sep = '">'
         wsplit = work.split(sep, 1)
         if len(wsplit) == 2:
             sect = wsplit[0]
             if '"' in sect:
                 sect = sect[:sect.find('"')]
             ws.append((sect, wsplit[1].lower()))
     # XXX: I think "guest appearances" are gone.
     if s.find('<a href="#guest-appearances"') != -1:
         ws.append(('guest-appearances', 'notable tv guest appearances'))
     #if _parseChr:
     #    ws.append(('filmography', 'filmography'))
     for sect, sectName in ws:
         raws = u''
         if sectName == 'self':
             sect = 'Self'
         # Key under which the movies of this section are stored: the
         # section name up to the first ':'.  It does not depend on the
         # single movie, so it is computed once per section.
         sectKey = sectName.split(':')[0]
         # Everything between the current section link and the start of
         # the next section header (or of the next "article" div).
         if _parseChr and sect == 'filmography':
             inisect = s.find('<div class="filmo">')
         else:
             inisect = s.find('<a name="%s' % sect)
         if inisect != -1:
             endsect = s[inisect:].find('<div id="filmo-head-')
             if endsect == -1:
                 endsect = s[inisect:].find('<div class="article"')
             if endsect != -1: raws = s[inisect:inisect+endsect]
         #if not raws: continue
         mlist = _findBetween(raws, '<div class="filmo-row',
                 ('<div class="clear"/>',))
         for m in mlist:
             # Skip the remainder of the opening <div ...> tag.
             fCB = m.find('>')
             if fCB != -1:
                 m = m[fCB+1:].lstrip()
             m = re_filmo_episodes.sub('', m)
             # For every movie in the current section.
             movieID = re_imdbID.findall(m)
             if not movieID:
                 self._mobile_logger.debug('no movieID in %s', m)
                 continue
             m = m.replace('<br/>', ' .... ', 1)
             if not _parseChr:
                 chrIndx = m.find(' .... ')
             else:
                 chrIndx = m.find(' Played by ')
             chids = []
             if chrIndx != -1:
                 # ' .... ' is 6 characters long; ' Played by ' is 11,
                 # hence the 5 further characters skipped for characters.
                 chrtxt = m[chrIndx+6:]
                 if _parseChr:
                     chrtxt = chrtxt[5:]
                 for ch in chrtxt.split(' / '):
                     chid = re_imdbID.findall(ch)
                     if not chid:
                         chids.append(None)
                     else:
                         chids.append(chid[-1])
             # No role -> None; a single role -> the bare ID;
             # more roles -> the whole list.
             if not chids:
                 chids = None
             elif len(chids) == 1:
                 chids = chids[0]
             movieID = str(movieID[0])
             # Search the status.
             stidx = m.find('<i>')
             status = u''
             if stidx != -1:
                 stendidx = m.rfind('</i>')
                 if stendidx != -1:
                     status = _unHtml(m[stidx+3:stendidx])
                     m = m.replace(m[stidx+3:stendidx], '')
             year = _findBetween(m, 'year_column">', '</span>', maxRes=1)
             if year:
                 year = year[0]
                 m = m.replace('<span class="year_column">%s</span>' % year,
                         '')
             else:
                 year = None
             m = _unHtml(m)
             if not m:
                 self._mobile_logger.warning('no title for movieID %s',
                                             movieID)
                 continue
             movie = build_movie(m, movieID=movieID, status=status,
                                 roleID=chids, modFunct=self._defModFunct,
                                 accessSystem=self.accessSystem,
                                 _parsingCharacter=_parseChr, year=year)
             r.setdefault(sectKey, []).append(movie)
     # If available, take the always correct name from a form.
     itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
     if not itag:
         itag = _getTagsWith(s, 'name="primary"', maxRes=1)
     if itag:
         vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
         if not vtag:
             vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
         if vtag:
             try:
                 vtag = unquote(str(vtag[0]))
                 vtag = unicode(vtag, 'latin_1')
                 r.update(analyze_name(vtag))
             except UnicodeEncodeError:
                 # Best effort: keep the name parsed from the <title>.
                 pass
     return {'data': r, 'info sets': ('main', 'filmography')}