Esempio n. 1
0
 def _search_character(self, name, results):
     """Search for characters matching *name*.

     The search page is fetched with self._get_search_content('ch', ...)
     and parsed by hand.  If the page title does not start with
     'find - imdb' the page is treated as a direct hit on a single
     character; otherwise every '<td class="result_text"' cell is parsed.

     name -- the name being searched for.
     results -- passed through to _get_search_content.

     Returns a (possibly empty) list of (characterID, data) tuples, where
     data is whatever analyze_name() returns for the parsed name.
     """
     cont = subXMLRefs(self._get_search_content('ch', name, results))
     # Keep the parsed page title apart from the searched name, so that
     # log messages can still report what was being searched for.
     page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
     res = []
     if not page_title:
         self._mobile_logger.error('no title tag searching character %s',
                                   name)
         return res
     nl = page_title[0].lower()
     if not nl.startswith('find - imdb'):
         # a direct hit!
         hit_name = _unHtml(page_title[0]).replace('(Character)', '').strip()
         pid = None
         pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if pidtag:
             pid = _findBetween(pidtag[0], '/character/ch', '/', maxRes=1)
         if not (pid and hit_name):
             self._mobile_logger.error('no direct hit name/characterID for'
                                       ' character %s', name)
             return res
         res.append((str(pid[0]), analyze_name(hit_name)))
     else:
         lis = _findBetween(cont, '<td class="result_text"',
                            ['<small', '</td>', '<br'])
         for li in lis:
             # Restore the '<' consumed by the start marker, so that the
             # fragment is again a complete tag for _unHtml.
             li = '<%s' % li
             pid = re_imdbID.findall(li)
             pname = _unHtml(li)
             if not (pid and pname):
                 self._mobile_logger.debug('no name/characterID'
                                           ' parsing %s searching for'
                                           ' character %s', li, name)
                 continue
             res.append((str(pid[0]), analyze_name(pname)))
     return res
Esempio n. 2
0
 def _search_character(self, name, results):
     """Search for characters matching *name*.

     The search page is fetched with self._get_search_content("ch", ...)
     and parsed by hand.  If the page title does not start with
     "find - imdb" the page is treated as a direct hit on a single
     character; otherwise every '<td class="result_text"' cell is parsed.

     name -- the name being searched for.
     results -- passed through to _get_search_content.

     Returns a (possibly empty) list of (characterID, data) tuples, where
     data is whatever analyze_name() returns for the parsed name.
     """
     cont = subXMLRefs(self._get_search_content("ch", name, results))
     # Keep the parsed page title apart from the searched name, so that
     # log messages can still report what was being searched for.
     page_title = _findBetween(cont, "<title>", "</title>", maxRes=1)
     res = []
     if not page_title:
         self._mobile_logger.error("no title tag searching character %s", name)
         return res
     nl = page_title[0].lower()
     if not nl.startswith("find - imdb"):
         # a direct hit!
         hit_name = _unHtml(page_title[0]).replace("(Character)", "").strip()
         pid = None
         pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if pidtag:
             pid = _findBetween(pidtag[0], "/character/ch", "/", maxRes=1)
         if not (pid and hit_name):
             self._mobile_logger.error("no direct hit name/characterID for character %s", name)
             return res
         res.append((str(pid[0]), analyze_name(hit_name)))
     else:
         lis = _findBetween(cont, '<td class="result_text"', ["<small", "</td>", "<br"])
         for li in lis:
             # Restore the "<" consumed by the start marker, so that the
             # fragment is again a complete tag for _unHtml.
             li = "<%s" % li
             pid = re_imdbID.findall(li)
             pname = _unHtml(li)
             if not (pid and pname):
                 self._mobile_logger.debug(
                     "no name/characterID parsing %s searching for character %s", li, name
                 )
                 continue
             res.append((str(pid[0]), analyze_name(pname)))
     return res
Esempio n. 3
0
 def _search_character(self, name, results):
     """Return (characterID, data) tuples for a character search.

     The page obtained from self._get_search_content('char', ...) is
     either a list of results (its title starts with one of the known
     search-page prefixes) or a direct hit on a single character.
     The data element of each tuple is the value returned by
     analyze_name(..., canonical=0).
     """
     page = subXMLRefs(self._get_search_content('char', name, results))
     titles = _findBetween(page, '<title>', '</title>', maxRes=1)
     if not titles:
         return []
     lowered = titles[0].lower()
     search_prefixes = ('imdb search', 'imdb  search', 'imdb character')
     if not lowered.startswith(search_prefixes):
         # Direct hit: the title holds the name, the first tag linking
         # to /character/ch holds the ID.
         char_name = _unHtml(titles[0]).replace('(Character)', '').strip()
         tags = _getTagsWith(page, '/character/ch', maxRes=1)
         ids = re_imdbID.findall(tags[0]) if tags else None
         if not (ids and char_name):
             return []
         return [(str(ids[0]), analyze_name(char_name, canonical=0))]
     # Result list: collect the links in both characters sections.
     found = []
     sections = _findBetween(page, '<b>Popular Characters</b>', '</table>')
     sections.extend(_findBetween(page, '<b>Characters', '</table>'))
     for section in sections:
         links = _findBetween(section, '<a href="/character/',
                              ['<small', '</td>', '<br'])
         for link in links:
             fragment = '<%s' % link
             ids = re_imdbID.findall(fragment)
             label = _unHtml(fragment)
             if ids and label:
                 found.append((str(ids[0]),
                               analyze_name(label, canonical=0)))
     return found
Esempio n. 4
0
 def _search_movie(self, title, results):
     """Return (movieID, data) tuples for a title search.

     The page obtained from self._get_search_content('tt', ...) is a
     list of results when its title starts with 'imdb title', and a
     direct hit on a single movie otherwise.  The data element of each
     tuple is the value returned by analyze_title(..., canonical=1).
     """
     page = subXMLRefs(self._get_search_content('tt', title, results))
     titles = _findBetween(page, '<title>', '</title>', maxRes=1)
     if not titles:
         return []
     if titles[0].lower().startswith('imdb title'):
         # Result list: one candidate per 'td valign="top">' cell.
         matches = []
         cells = _findBetween(page, 'td valign="top">', ['</td>', '<small>'])
         for cell in cells:
             ids = re_imdbID.findall(cell)
             text = _unHtml(cell)
             if ids and text:
                 matches.append((str(ids[0]),
                                 analyze_title(text, canonical=1)))
         return matches
     # Direct hit: the ID is in the value of the "arg" (or "auto") tag.
     hit_title = _unHtml(titles[0])
     id_tag = _getTagsWith(page, 'name="arg"', maxRes=1)
     if not id_tag:
         id_tag = _getTagsWith(page, 'name="auto"', maxRes=1)
     movie_id = None
     if id_tag:
         movie_id = _findBetween(id_tag[0], 'value="', '"', maxRes=1)
         # A non-numeric value is searched for an embedded ID.
         if movie_id and not movie_id[0].isdigit():
             movie_id = re_imdbID.findall(movie_id[0])
     if not (movie_id and hit_title):
         return []
     return [(str(movie_id[0]), analyze_title(hit_title, canonical=1))]
Esempio n. 5
0
 def _search_character(self, name, results):
     """Look up characters and return a list of (characterID, data).

     A page whose title starts with 'imdb search', 'imdb  search' or
     'imdb character' is parsed as a list of results; any other page is
     taken as a direct hit.  Each data element is the value returned by
     analyze_name(..., canonical=0).
     """
     html = subXMLRefs(self._get_search_content('char', name, results))
     title_tag = _findBetween(html, '<title>', '</title>', maxRes=1)
     if not title_tag:
         return []
     low_title = title_tag[0].lower()
     if low_title.startswith(('imdb search', 'imdb  search',
                              'imdb character')):
         # Result list: walk the links of both characters sections.
         hits = []
         blocks = _findBetween(html, '<b>Popular Characters</b>', '</table>')
         blocks.extend(_findBetween(html, '<b>Characters', '</table>'))
         for block in blocks:
             for piece in _findBetween(block, '<a href="/character/',
                                       ['<small', '</td>', '<br']):
                 piece = '<%s' % piece
                 cid = re_imdbID.findall(piece)
                 cname = _unHtml(piece)
                 if not (cid and cname):
                     continue
                 hits.append((str(cid[0]), analyze_name(cname, canonical=0)))
         return hits
     # Direct hit: name from the title, ID from the first tag that
     # links to /character/ch.
     cname = _unHtml(title_tag[0]).replace('(Character)', '').strip()
     id_tag = _getTagsWith(html, '/character/ch', maxRes=1)
     cid = None
     if id_tag:
         cid = re_imdbID.findall(id_tag[0])
     if cid and cname:
         return [(str(cid[0]), analyze_name(cname, canonical=0))]
     return []
Esempio n. 6
0
 def _search_movie(self, title, results):
     """Search for movies matching *title*.

     The search page is fetched with self._get_search_content('tt', ...).
     If its title does not start with 'imdb title' the page is treated
     as a direct hit on a single movie; otherwise every
     'td valign="top">' cell is parsed as a result, including its
     "aka" titles.

     title -- the title being searched for.
     results -- passed to _get_search_content; results*3 also caps the
                number of table cells examined.

     Returns a (possibly empty) list of (movieID, data) tuples, where
     data is whatever analyze_title() returns, with an extra 'akas' key
     when alternative titles were found.
     """
     cont = subXMLRefs(self._get_search_content('tt', title, results))
     # Keep the parsed page title apart from the searched title, so that
     # log messages can still report what was being searched for.
     page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
     res = []
     if not page_title:
         self._mobile_logger.error('no title tag searching for movie %s',
                                   title)
         return res
     tl = page_title[0].lower()
     if not tl.startswith('imdb title'):
         # a direct hit!
         hit_title = _unHtml(page_title[0])
         mid = None
         midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if midtag:
             mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
         if not (mid and hit_title):
             self._mobile_logger.error('no direct hit title/movieID for'
                                       ' title %s', title)
             return res
         if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
             hit_title += ' (mini)'
         res.append((str(mid[0]), analyze_title(hit_title)))
     else:
         # XXX: this results*3 prevents some recursion errors, but...
         #      it's not exactly understandable (i.e.: why 'results' is
         #      not enough to get all the results?)
         lis = _findBetween(cont, 'td valign="top">', '</td>',
                            maxRes=results*3)
         for li in lis:
             akaIdx = li.find('aka <em>')
             akas = []
             if akaIdx != -1:
                 # Everything from the first "aka" on is a <br>-separated
                 # list of alternative titles; the rest is the title.
                 akas = [_unHtml(x) for x in li[akaIdx:].split('<br>')]
                 li = li[:akaIdx]
             for idx, aka in enumerate(akas):
                 aka = aka.replace('" - ', '::')
                 if aka.startswith('aka "'):
                     aka = aka[5:]
                 # endswith() rather than aka[-1]: a fragment may be
                 # empty, and indexing it would raise IndexError.
                 if aka.endswith('"'):
                     aka = aka[:-1]
                 akas[idx] = aka
             imdbid = re_imdbID.findall(li)
             mtitle = _unHtml(li)
             if not (imdbid and mtitle):
                 self._mobile_logger.debug('no title/movieID parsing'
                                           ' %s searching for title %s', li,
                                           title)
                 continue
             mtitle = mtitle.replace('(TV mini-series)', '(mini)')
             resd = analyze_title(mtitle)
             if akas:
                 resd['akas'] = akas
             res.append((str(imdbid[0]), resd))
     return res
Esempio n. 7
0
 def _search_movie(self, title, results):
     """Search for movies matching *title*.

     The search page is fetched with self._get_search_content('tt', ...).
     If its title does not start with 'imdb title' the page is treated
     as a direct hit on a single movie; otherwise every
     'td valign="top">' cell is parsed as a result, with the alternative
     titles matched by re_makas collected separately.

     title -- the title being searched for.
     results -- passed to _get_search_content; results*3 also caps the
                number of table cells examined.

     Returns a (possibly empty) list of (movieID, data) tuples, where
     data is whatever analyze_title() returns, with an extra 'akas' key
     when alternative titles were found.
     """
     cont = subXMLRefs(self._get_search_content('tt', title, results))
     # Keep the parsed page title apart from the searched title, so that
     # log messages can still report what was being searched for.
     page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
     res = []
     if not page_title:
         self._mobile_logger.error('no title tag searching for movie %s',
                                   title)
         return res
     tl = page_title[0].lower()
     if not tl.startswith('imdb title'):
         # a direct hit!
         hit_title = _unHtml(page_title[0])
         mid = None
         midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if midtag:
             mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
         if not (mid and hit_title):
             self._mobile_logger.error('no direct hit title/movieID for'
                                       ' title %s', title)
             return res
         if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
             hit_title += ' (mini)'
         res.append((str(mid[0]), analyze_title(hit_title)))
     else:
         # XXX: this results*3 prevents some recursion errors, but...
         #      it's not exactly understandable (i.e.: why 'results' is
         #      not enough to get all the results?)
         lis = _findBetween(cont,
                            'td valign="top">',
                            '</td>',
                            maxRes=results * 3)
         for li in lis:
             akas = re_makas.findall(li)
             for idx, aka in enumerate(akas):
                 aka = aka.replace('" - ', '::', 1)
                 aka = _unHtml(aka)
                 if aka.startswith('aka "'):
                     aka = aka[5:].strip()
                 # endswith() rather than aka[-1]: the text may be empty
                 # at this point, and indexing it would raise IndexError.
                 if aka.endswith('"'):
                     aka = aka[:-1]
                 akas[idx] = aka
             imdbid = re_imdbID.findall(li)
             # Strip the akas from the cell before extracting the title.
             li = re_makas.sub('', li)
             mtitle = _unHtml(li)
             if not (imdbid and mtitle):
                 self._mobile_logger.debug('no title/movieID parsing'
                                           ' %s searching for title %s', li,
                                           title)
                 continue
             mtitle = mtitle.replace('(TV mini-series)', '(mini)')
             resd = analyze_title(mtitle)
             if akas:
                 resd['akas'] = akas
             res.append((str(imdbid[0]), resd))
     return res
Esempio n. 8
0
 def _search_movie(self, title, results):
     """Search for movies matching *title*.

     The search page is fetched with self._get_search_content("tt", ...).
     If its title does not start with "find - imdb" the page is treated
     as a direct hit on a single movie; otherwise every
     'td class="result_text">' cell is parsed as a result, with the
     alternative titles matched by re_makas collected separately.

     title -- the title being searched for.
     results -- passed to _get_search_content; results*3 also caps the
                number of table cells examined.

     Returns a (possibly empty) list of (movieID, data) tuples, where
     data is whatever analyze_title() returns, with an extra "akas" key
     when alternative titles were found.
     """
     cont = subXMLRefs(self._get_search_content("tt", title, results))
     # Keep the parsed page title apart from the searched title, so that
     # log messages can still report what was being searched for.
     page_title = _findBetween(cont, "<title>", "</title>", maxRes=1)
     res = []
     if not page_title:
         self._mobile_logger.error("no title tag searching for movie %s", title)
         return res
     tl = page_title[0].lower()
     if not tl.startswith("find - imdb"):
         # a direct hit!
         hit_title = _unHtml(page_title[0])
         mid = None
         midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if midtag:
             mid = _findBetween(midtag[0], "/title/tt", "/", maxRes=1)
         if not (mid and hit_title):
             self._mobile_logger.error("no direct hit title/movieID for title %s", title)
             return res
         if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
             hit_title += " (mini)"
         res.append((str(mid[0]), analyze_title(hit_title)))
     else:
         # XXX: this results*3 prevents some recursion errors, but...
         #      it's not exactly understandable (i.e.: why 'results' is
         #      not enough to get all the results?)
         lis = _findBetween(cont, 'td class="result_text">', "</td>", maxRes=results * 3)
         for li in lis:
             akas = re_makas.findall(li)
             for idx, aka in enumerate(akas):
                 aka = aka.replace('" - ', "::", 1)
                 aka = _unHtml(aka)
                 if aka.startswith('aka "'):
                     aka = aka[5:].strip()
                 # endswith() rather than aka[-1]: the text may be empty
                 # at this point, and indexing it would raise IndexError.
                 if aka.endswith('"'):
                     aka = aka[:-1]
                 akas[idx] = aka
             imdbid = re_imdbID.findall(li)
             # Strip the akas from the cell before extracting the title.
             li = re_makas.sub("", li)
             mtitle = _unHtml(li)
             if not (imdbid and mtitle):
                 self._mobile_logger.debug("no title/movieID parsing %s searching for title %s", li, title)
                 continue
             mtitle = mtitle.replace("(TV mini-series)", "(mini)")
             resd = analyze_title(mtitle)
             if akas:
                 resd["akas"] = akas
             res.append((str(imdbid[0]), resd))
     return res
Esempio n. 9
0
 def _search_person(self, name, results):
     """Search for people matching *name*.

     The search page is fetched with self._get_search_content('nm', ...).
     If its title does not start with 'imdb name' the page is treated
     as a direct hit on a single person; otherwise every
     'td valign="top">' cell is parsed as a result.

     name -- the name being searched for.
     results -- passed to _get_search_content; results*3 also caps the
                number of table cells examined.

     Returns a (possibly empty) list of (personID, data) tuples, where
     data is whatever analyze_name(..., canonical=1) returns, with an
     extra 'akas' key when alternative names were found.
     """
     cont = subXMLRefs(self._get_search_content('nm', name, results))
     # Keep the parsed page title apart from the searched name, so that
     # log messages can still report what was being searched for.
     page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
     res = []
     if not page_title:
         # warning() is used because Logger.warn() is a deprecated alias.
         self._mobile_logger.warning('no title tag searching for name %s',
                                     name)
         return res
     nl = page_title[0].lower()
     if not nl.startswith('imdb name'):
         # a direct hit!
         hit_name = _unHtml(page_title[0])
         hit_name = hit_name.replace('- Filmography by type', '').strip()
         pid = None
         pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if pidtag:
             pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
         if not (pid and hit_name):
             self._mobile_logger.error('no direct hit name/personID for'
                                       ' name %s', name)
             return res
         res.append((str(pid[0]), analyze_name(hit_name, canonical=1)))
     else:
         lis = _findBetween(cont,
                            'td valign="top">',
                            '</td>',
                            maxRes=results * 3)
         for li in lis:
             akas = _findBetween(li, '<em>"', '"</em>')
             # Cut the cell at each separator found, leaving only the
             # text that precedes all of them.
             for sep in ['<small', '<br> aka', '<br> birth name']:
                 sepIdx = li.find(sep)
                 if sepIdx != -1:
                     li = li[:sepIdx]
             pid = re_imdbID.findall(li)
             pname = _unHtml(li)
             if not (pid and pname):
                 self._mobile_logger.debug('no name/personID parsing'
                                           ' %s searching for name %s', li,
                                           name)
                 continue
             resd = analyze_name(pname, canonical=1)
             if akas:
                 resd['akas'] = akas
             res.append((str(pid[0]), resd))
     return res
Esempio n. 10
0
 def _search_person(self, name, results):
     """Search for people matching *name*.

     The search page is fetched with self._get_search_content('nm', ...).
     If its title does not start with 'find - imdb' the page is treated
     as a direct hit on a single person; otherwise every
     'td class="result_text">' cell is parsed as a result.

     name -- the name being searched for.
     results -- passed to _get_search_content; results*3 also caps the
                number of table cells examined.

     Returns a (possibly empty) list of (personID, data) tuples, where
     data is whatever analyze_name(..., canonical=1) returns, with an
     extra 'akas' key when alternative names were found.
     """
     cont = subXMLRefs(self._get_search_content('nm', name, results))
     # Keep the parsed page title apart from the searched name, so that
     # log messages can still report what was being searched for.
     page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
     res = []
     if not page_title:
         # warning() is used because Logger.warn() is a deprecated alias.
         self._mobile_logger.warning('no title tag searching for name %s',
                                     name)
         return res
     nl = page_title[0].lower()
     if not nl.startswith('find - imdb'):
         # a direct hit!
         hit_name = _unHtml(page_title[0])
         hit_name = hit_name.replace('- Filmography by type', '').strip()
         pid = None
         pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
         if pidtag:
             pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
         if not (pid and hit_name):
             self._mobile_logger.error('no direct hit name/personID for'
                                       ' name %s', name)
             return res
         res.append((str(pid[0]), analyze_name(hit_name, canonical=1)))
     else:
         lis = _findBetween(cont, 'td class="result_text">', '</td>',
                            maxRes=results*3)
         for li in lis:
             akas = _findBetween(li, '<em>"', '"</em>')
             # Cut the cell at each separator found, leaving only the
             # text that precedes all of them.
             for sep in ['<small', '<br> aka', '<br> birth name']:
                 sepIdx = li.find(sep)
                 if sepIdx != -1:
                     li = li[:sepIdx]
             pid = re_imdbID.findall(li)
             pname = _unHtml(li)
             if not (pid and pname):
                 self._mobile_logger.debug('no name/personID parsing'
                                           ' %s searching for name %s', li,
                                           name)
                 continue
             resd = analyze_name(pname, canonical=1)
             if akas:
                 resd['akas'] = akas
             res.append((str(pid[0]), resd))
     return res
Esempio n. 11
0
 def _search_movie(self, title, results):
     """Search for movies matching *title*.

     The search page is fetched with self._get_search_content('tt', ...).
     If its title does not start with 'imdb title' the page is treated
     as a direct hit on a single movie; otherwise every
     'td valign="top">' cell is examined.  A cell that does not yield
     both an ID and a title is only scanned for an '<img src="' URL,
     which is then attached to the results that follow it.

     title -- the title being searched for.
     results -- passed through to _get_search_content.

     Returns a (possibly empty) list of (movieID, data) tuples, where
     data is whatever analyze_title(..., canonical=1) returns; in the
     list-of-results case it also carries an 'image' key.
     """
     cont = subXMLRefs(self._get_search_content('tt', title, results))
     title = _findBetween(cont, '<title>', '</title>', maxRes=1)
     res = []
     if not title:
         return res
     tl = title[0].lower()
     if not tl.startswith('imdb title'):
         # XXX: a direct hit!
         title = _unHtml(title[0])
         midtag = _getTagsWith(cont, 'name="arg"', maxRes=1)
         if not midtag:
             midtag = _getTagsWith(cont, 'name="auto"', maxRes=1)
         mid = None
         if midtag:
             mid = _findBetween(midtag[0], 'value="', '"', maxRes=1)
             # A non-numeric value is searched for an embedded ID.
             if mid and not mid[0].isdigit():
                 mid = re_imdbID.findall(mid[0])
         if not (mid and title):
             return res
         if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
             title += ' (mini)'
         res.append((str(mid[0]), analyze_title(title, canonical=1)))
     else:
         cont = _reAKAS.sub('</td>', cont)
         lis = _findBetween(cont, 'td valign="top">', ['</td>', '</small>'])
         # Image URL taken from the most recent cell that was not a
         # title.  It must exist before the loop: when the very first
         # cell is a valid title there is no earlier cell to set it.
         img = ''
         for li in lis:
             imdbid = re_imdbID.findall(li)
             mtitle = _unHtml(li)
             if not (imdbid and mtitle):
                 found = _findBetween(li, '<img src="', ['" '])
                 img = found[0] if found else ''
                 continue
             mtitle = mtitle.replace('(TV mini-series)', '(mini)')
             movie = (str(imdbid[0]), analyze_title(mtitle, canonical=1))
             movie[1]['image'] = img
             res.append(movie)
     return res
Esempio n. 12
0
 def _search_person(self, name, results):
     """Return (personID, data) tuples for a name search.

     The page obtained from self._get_search_content('nm', ...) is a
     list of results when its title starts with 'imdb name', and a
     direct hit on a single person otherwise.  The data element of each
     tuple is the value returned by analyze_name(..., canonical=1).
     """
     page = subXMLRefs(self._get_search_content('nm', name, results))
     titles = _findBetween(page, '<title>', '</title>', maxRes=1)
     if not titles:
         return []
     if titles[0].lower().startswith('imdb name'):
         # Result list: one candidate per 'td valign="top">' cell.
         people = []
         for cell in _findBetween(page, 'td valign="top">',
                                  ['<small', '</td>', '<br']):
             ids = re_imdbID.findall(cell)
             label = _unHtml(cell)
             if ids and label:
                 people.append((str(ids[0]),
                                analyze_name(label, canonical=1)))
         return people
     # Direct hit.  First try the message board link, which exists for
     # people who already have messages in the board.
     hit_name = _unHtml(titles[0])
     board_tag = _getTagsWith(page, '/board/nest/', maxRes=1)
     person_id = None
     if board_tag:
         person_id = _findBetween(board_tag[0], '/name/nm', '/', maxRes=1)
     if not (person_id and hit_name):
         # Fall back to the 'credited alongside' field for the name and
         # to the biography link for the personID.
         primary = _getTagsWith(page, 'NAME="primary"', maxRes=1)
         if not primary:
             return []
         values = _findBetween(primary[0], 'VALUE="', '"', maxRes=1)
         if not values:
             return []
         hit_name = unquote(values[0])
         person_id = _findBetween(page, '/name/nm', '/bio', maxRes=1)
         if not person_id:
             return []
     if not (person_id and hit_name):
         return []
     return [(str(person_id[0]), analyze_name(hit_name, canonical=1))]
Esempio n. 13
0
 def _search_person(self, name, results):
     """Look up people and return a list of (personID, data) tuples.

     A page whose title starts with 'imdb name' is parsed as a list of
     results; any other page is taken as a direct hit on one person.
     Each data element is the value returned by
     analyze_name(..., canonical=1).
     """
     html = subXMLRefs(self._get_search_content('nm', name, results))
     title_tag = _findBetween(html, '<title>', '</title>', maxRes=1)
     if not title_tag:
         return []
     if title_tag[0].lower().startswith('imdb name'):
         # Result list: one candidate per 'td valign="top">' cell.
         hits = []
         rows = _findBetween(html, 'td valign="top">',
                             ['<small', '</td>', '<br> aka'])
         for row in rows:
             nm_id = re_imdbID.findall(row)
             nm_text = _unHtml(row)
             if not (nm_id and nm_text):
                 continue
             hits.append((str(nm_id[0]), analyze_name(nm_text, canonical=1)))
         return hits
     # Direct hit.  The message board link is the easiest source of the
     # personID, but only exists for people with messages in the board.
     person = _unHtml(title_tag[0])
     nm_id = None
     board = _getTagsWith(html, '/board/nest/', maxRes=1)
     if board:
         nm_id = _findBetween(board[0], '/name/nm', '/', maxRes=1)
     if not (nm_id and person):
         # Otherwise take the name from the 'credited alongside' field
         # and the personID from the first /name/nm link in the page.
         field = _getTagsWith(html, 'NAME="primary"', maxRes=1)
         if not field:
             return []
         field_value = _findBetween(field[0], 'VALUE="', '"', maxRes=1)
         if not field_value:
             return []
         person = unquote(field_value[0])
         nm_id = _findBetween(html, '/name/nm', ('/', '"', '>'), maxRes=1)
         if not nm_id:
             return []
     if nm_id and person:
         return [(str(nm_id[0]), analyze_name(person, canonical=1))]
     return []
Esempio n. 14
0
 def _clean_html(self, html):
     """Normalize the retrieved html.

     Every match of re_spaces is replaced with a single space, the
     '&nbsp;&raquo;' sequences are dropped, and the outcome is passed
     through subXMLRefs.
     """
     collapsed = re_spaces.sub(' ', html)
     without_raquo = collapsed.replace('&nbsp;&raquo;', '')
     return subXMLRefs(without_raquo)
Esempio n. 15
0
 def _clean_html(self, html):
     """Normalize the retrieved html.

     Every match of re_spaces is replaced with a single space, and the
     outcome is passed through subXMLRefs.
     """
     return subXMLRefs(re_spaces.sub(' ', html))
Esempio n. 16
0
 def _clean_html(self, html):
     """Normalize the retrieved html.

     Matches of re_spaces become a single space; the text is then handed
     to subXMLRefs and its result is returned.
     """
     single_spaced = re_spaces.sub(' ', html)
     return subXMLRefs(single_spaced)