def _search_character(self, name, results):
    """Search for characters matching *name*.

    Fetches the 'ch' search page through self._get_search_content and
    returns a list of (characterID, data) tuples, where characterID is a
    string and data is whatever analyze_name returns for the parsed name.
    An empty list is returned when nothing usable can be parsed.
    """
    cont = subXMLRefs(self._get_search_content('ch', name, results))
    # Keep the searched name in 'name' so that log messages report what
    # was actually searched, not the parsed <title> list.
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        self._mobile_logger.error('no title tag searching character %s',
                                  name)
        return res
    if not page_title[0].lower().startswith('find - imdb'):
        # a direct hit!  The page title is not the one of a results list.
        cname = _unHtml(page_title[0]).replace('(Character)', '').strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], '/character/ch', '/', maxRes=1)
        if not (pid and cname):
            self._mobile_logger.error('no direct hit name/characterID for'
                                      ' character %s', name)
            return res
        res.append((str(pid[0]), analyze_name(cname)))
    else:
        lis = _findBetween(cont, '<td class="result_text"',
                           ['<small', '</td>', '<br'])
        for li in lis:
            # The start marker was stripped by _findBetween; put back an
            # opening '<' before extracting the ID and the text.
            li = '<%s' % li
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                self._mobile_logger.debug('no name/characterID'
                                          ' parsing %s searching for'
                                          ' character %s', li, name)
                continue
            res.append((str(pid[0]), analyze_name(pname)))
    return res
def _search_character(self, name, results):
    """Search for characters matching *name*.

    The "ch" search page is retrieved via self._get_search_content; the
    result is a list of (characterID, data) tuples: characterID is a
    string, data is the value returned by analyze_name.  When the page
    cannot be parsed an empty list is returned.
    """
    cont = subXMLRefs(self._get_search_content("ch", name, results))
    # 'name' is left untouched (it is the searched string) so the log
    # messages below mention the query instead of the parsed title list.
    titles = _findBetween(cont, "<title>", "</title>", maxRes=1)
    res = []
    if not titles:
        self._mobile_logger.error("no title tag searching character %s", name)
        return res
    if not titles[0].lower().startswith("find - imdb"):
        # a direct hit!
        char_name = _unHtml(titles[0]).replace("(Character)", "").strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], "/character/ch", "/", maxRes=1)
        if not (pid and char_name):
            self._mobile_logger.error(
                "no direct hit name/characterID for character %s", name
            )
            return res
        res.append((str(pid[0]), analyze_name(char_name)))
    else:
        lis = _findBetween(
            cont, '<td class="result_text"', ["<small", "</td>", "<br"]
        )
        for li in lis:
            # Restore a leading "<" in front of the fragment, since the
            # start marker is not part of what _findBetween returned.
            li = "<%s" % li
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                self._mobile_logger.debug(
                    "no name/characterID parsing %s searching for character %s",
                    li,
                    name,
                )
                continue
            res.append((str(pid[0]), analyze_name(pname)))
    return res
def _search_character(self, name, results):
    """Search for characters matching *name*.

    Retrieves the 'char' search page via self._get_search_content and
    returns a list of (characterID, data) tuples; data is the value
    returned by analyze_name(..., canonical=0).  Returns an empty list
    if the page has no <title> or nothing can be parsed.
    """
    cont = subXMLRefs(self._get_search_content('char', name, results))
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        return res
    nl = page_title[0].lower()
    # NOTE(review): the previous code tested 'imdb search' twice; the
    # second test may once have been a spacing variant of that prefix --
    # confirm against upstream before relying on this.
    if not nl.startswith(('imdb search', 'imdb character')):
        # XXX: a direct hit!
        cname = _unHtml(page_title[0]).replace('(Character)', '').strip()
        pidtag = _getTagsWith(cont, '/character/ch', maxRes=1)
        pid = None
        if pidtag:
            pid = re_imdbID.findall(pidtag[0])
        if not (pid and cname):
            return res
        res.append((str(pid[0]), analyze_name(cname, canonical=0)))
    else:
        # Results are collected from both the "Popular Characters" and
        # the "Characters..." tables.
        sects = _findBetween(cont, '<b>Popular Characters</b>', '</table>')
        sects += _findBetween(cont, '<b>Characters', '</table>')
        for sect in sects:
            lis = _findBetween(sect, '<a href="/character/',
                               ['<small', '</td>', '<br'])
            for li in lis:
                # Put back an opening '<' in front of the fragment.
                li = '<%s' % li
                pid = re_imdbID.findall(li)
                pname = _unHtml(li)
                if not (pid and pname):
                    continue
                res.append((str(pid[0]), analyze_name(pname, canonical=0)))
    return res
def _search_movie(self, title, results):
    """Search for movies matching *title*.

    Returns a list of (movieID, data) tuples, with movieID as a string
    and data being the value of analyze_title(..., canonical=1); the list
    is empty when the search page cannot be parsed.
    """
    page = subXMLRefs(self._get_search_content('tt', title, results))
    page_titles = _findBetween(page, '<title>', '</title>', maxRes=1)
    if not page_titles:
        return []

    if page_titles[0].lower().startswith('imdb title'):
        # A list of results: one candidate per matching table cell.
        found = []
        for cell in _findBetween(page, 'td valign="top">',
                                 ['</td>', '<small>']):
            cell_ids = re_imdbID.findall(cell)
            cell_text = _unHtml(cell)
            if cell_ids and cell_text:
                found.append((str(cell_ids[0]),
                              analyze_title(cell_text, canonical=1)))
        return found

    # XXX: a direct hit!  The movieID is read from the 'arg' form field
    # or, when that is missing, from the 'auto' one.
    movie_title = _unHtml(page_titles[0])
    id_tags = (_getTagsWith(page, 'name="arg"', maxRes=1)
               or _getTagsWith(page, 'name="auto"', maxRes=1))
    movie_ids = None
    if id_tags:
        movie_ids = _findBetween(id_tags[0], 'value="', '"', maxRes=1)
        if movie_ids and not movie_ids[0].isdigit():
            # The value is not a bare number: look for an ID inside it.
            movie_ids = re_imdbID.findall(movie_ids[0])
    if not movie_ids or not movie_title:
        return []
    return [(str(movie_ids[0]), analyze_title(movie_title, canonical=1))]
def _search_movie(self, title, results):
    """Search for movies matching *title*.

    Returns a list of (movieID, data) tuples: movieID is a string, data
    is the value returned by analyze_title, with an extra 'akas' key
    when alternative titles were found in a results row.  An empty list
    is returned when the search page cannot be parsed.
    """
    cont = subXMLRefs(self._get_search_content('tt', title, results))
    # 'title' keeps the searched string, so that log messages report the
    # query rather than the parsed <title> list.
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        self._mobile_logger.error('no title tag searching for movie %s',
                                  title)
        return res
    if not page_title[0].lower().startswith('imdb title'):
        # a direct hit!
        mtitle = _unHtml(page_title[0])
        mid = None
        midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if midtag:
            mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
        if not (mid and mtitle):
            self._mobile_logger.error('no direct hit title/movieID for'
                                      ' title %s', title)
            return res
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            mtitle += ' (mini)'
        res.append((str(mid[0]), analyze_title(mtitle)))
    else:
        # XXX: this results*3 prevents some recursion errors, but...
        #      it's not exactly understandable (i.e.: why 'results' is
        #      not enough to get all the results?)
        lis = _findBetween(cont, 'td valign="top">', '</td>',
                           maxRes=results*3)
        for li in lis:
            akaIdx = li.find('aka <em>')
            akas = []
            if akaIdx != -1:
                akas = [_unHtml(x) for x in li[akaIdx:].split('<br>')]
                li = li[:akaIdx]
            for idx, aka in enumerate(akas):
                aka = aka.replace('" - ', '::')
                if aka.startswith('aka "'):
                    aka = aka[5:]
                # endswith() instead of aka[-1]: an empty fragment must
                # not raise IndexError.
                if aka.endswith('"'):
                    aka = aka[:-1]
                akas[idx] = aka
            imdbid = re_imdbID.findall(li)
            mtitle = _unHtml(li)
            if not (imdbid and mtitle):
                self._mobile_logger.debug('no title/movieID parsing'
                                          ' %s searching for title %s',
                                          li, title)
                continue
            mtitle = mtitle.replace('(TV mini-series)', '(mini)')
            resd = analyze_title(mtitle)
            if akas:
                resd['akas'] = akas
            res.append((str(imdbid[0]), resd))
    return res
def _search_movie(self, title, results):
    """Search for movies matching *title*.

    Returns a list of (movieID, data) tuples: movieID is a string, data
    is the value returned by analyze_title, with an 'akas' key added
    when re_makas matched something in the results row.  An empty list
    is returned when the search page cannot be parsed.
    """
    cont = subXMLRefs(self._get_search_content('tt', title, results))
    # The parsed <title> goes in its own variable: 'title' must still be
    # the searched string when it is used in log messages.
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        self._mobile_logger.error('no title tag searching for movie %s',
                                  title)
        return res
    if not page_title[0].lower().startswith('imdb title'):
        # a direct hit!
        mtitle = _unHtml(page_title[0])
        mid = None
        midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if midtag:
            mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
        if not (mid and mtitle):
            self._mobile_logger.error('no direct hit title/movieID for'
                                      ' title %s', title)
            return res
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            mtitle += ' (mini)'
        res.append((str(mid[0]), analyze_title(mtitle)))
    else:
        # XXX: this results*3 prevents some recursion errors, but...
        #      it's not exactly understandable (i.e.: why 'results' is
        #      not enough to get all the results?)
        lis = _findBetween(cont, 'td valign="top">', '</td>',
                           maxRes=results * 3)
        for li in lis:
            akas = re_makas.findall(li)
            for idx, aka in enumerate(akas):
                aka = aka.replace('" - ', '::', 1)
                aka = _unHtml(aka)
                if aka.startswith('aka "'):
                    aka = aka[5:].strip()
                # endswith() is safe on an empty string, while aka[-1]
                # would raise IndexError.
                if aka.endswith('"'):
                    aka = aka[:-1]
                akas[idx] = aka
            # The ID is searched before the AKAs are removed from the row.
            imdbid = re_imdbID.findall(li)
            li = re_makas.sub('', li)
            mtitle = _unHtml(li)
            if not (imdbid and mtitle):
                self._mobile_logger.debug('no title/movieID parsing'
                                          ' %s searching for title %s',
                                          li, title)
                continue
            mtitle = mtitle.replace('(TV mini-series)', '(mini)')
            resd = analyze_title(mtitle)
            if akas:
                resd['akas'] = akas
            res.append((str(imdbid[0]), resd))
    return res
def _search_movie(self, title, results):
    """Search for movies matching *title*.

    Returns a list of (movieID, data) tuples: movieID is a string, data
    is the value returned by analyze_title, with an "akas" key added
    when re_makas matched something in the results row.  An empty list
    is returned when the search page cannot be parsed.
    """
    cont = subXMLRefs(self._get_search_content("tt", title, results))
    # "title" is never reassigned: log messages must show the searched
    # string, not the list parsed out of the <title> tag.
    page_title = _findBetween(cont, "<title>", "</title>", maxRes=1)
    res = []
    if not page_title:
        self._mobile_logger.error("no title tag searching for movie %s", title)
        return res
    if not page_title[0].lower().startswith("find - imdb"):
        # a direct hit!
        mtitle = _unHtml(page_title[0])
        mid = None
        midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if midtag:
            mid = _findBetween(midtag[0], "/title/tt", "/", maxRes=1)
        if not (mid and mtitle):
            self._mobile_logger.error(
                "no direct hit title/movieID for title %s", title
            )
            return res
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            mtitle += " (mini)"
        res.append((str(mid[0]), analyze_title(mtitle)))
    else:
        # XXX: this results*3 prevents some recursion errors, but...
        #      it's not exactly understandable (i.e.: why 'results' is
        #      not enough to get all the results?)
        lis = _findBetween(
            cont, 'td class="result_text">', "</td>", maxRes=results * 3
        )
        for li in lis:
            akas = re_makas.findall(li)
            for idx, aka in enumerate(akas):
                aka = aka.replace('" - ', "::", 1)
                aka = _unHtml(aka)
                if aka.startswith('aka "'):
                    aka = aka[5:].strip()
                # Use endswith(): indexing aka[-1] raises IndexError when
                # the string is empty.
                if aka.endswith('"'):
                    aka = aka[:-1]
                akas[idx] = aka
            # The ID is searched before the AKAs are removed from the row.
            imdbid = re_imdbID.findall(li)
            li = re_makas.sub("", li)
            mtitle = _unHtml(li)
            if not (imdbid and mtitle):
                self._mobile_logger.debug(
                    "no title/movieID parsing %s searching for title %s",
                    li,
                    title,
                )
                continue
            mtitle = mtitle.replace("(TV mini-series)", "(mini)")
            resd = analyze_title(mtitle)
            if akas:
                resd["akas"] = akas
            res.append((str(imdbid[0]), resd))
    return res
def _search_person(self, name, results):
    """Search for people matching *name*.

    Returns a list of (personID, data) tuples: personID is a string,
    data is the value returned by analyze_name(..., canonical=1), with
    an 'akas' key added when alternative names were found in a results
    row.  An empty list is returned when the page cannot be parsed.
    """
    cont = subXMLRefs(self._get_search_content('nm', name, results))
    # 'name' remains the searched string, so that it can be reported in
    # log messages; the parsed <title> is kept separately.
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        # Logger.warn() is a deprecated alias of Logger.warning().
        self._mobile_logger.warning('no title tag searching for name %s',
                                    name)
        return res
    if not page_title[0].lower().startswith('imdb name'):
        # a direct hit!
        pname = _unHtml(page_title[0])
        pname = pname.replace('- Filmography by type', '').strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
        if not (pid and pname):
            self._mobile_logger.error('no direct hit name/personID for'
                                      ' name %s', name)
            return res
        res.append((str(pid[0]), analyze_name(pname, canonical=1)))
    else:
        lis = _findBetween(cont, 'td valign="top">', '</td>',
                           maxRes=results * 3)
        for li in lis:
            # AKAs are collected from the whole row, before it is cut.
            akas = _findBetween(li, '<em>"', '"</em>')
            # Cut the row at each separator found, leaving only the part
            # with the link and the name.
            for sep in ['<small', '<br> aka', '<br> birth name']:
                sepIdx = li.find(sep)
                if sepIdx != -1:
                    li = li[:sepIdx]
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                self._mobile_logger.debug('no name/personID parsing'
                                          ' %s searching for name %s',
                                          li, name)
                continue
            resd = analyze_name(pname, canonical=1)
            if akas:
                resd['akas'] = akas
            res.append((str(pid[0]), resd))
    return res
def _search_person(self, name, results):
    """Search for people matching *name*.

    Returns a list of (personID, data) tuples: personID is a string,
    data is the value returned by analyze_name(..., canonical=1), with
    an 'akas' key added when alternative names were found in a results
    row.  An empty list is returned when the page cannot be parsed.
    """
    cont = subXMLRefs(self._get_search_content('nm', name, results))
    # Do not overwrite 'name': it is the searched string and it is what
    # the log messages are meant to show.
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        # Logger.warn() is a deprecated alias of Logger.warning().
        self._mobile_logger.warning('no title tag searching for name %s',
                                    name)
        return res
    if not page_title[0].lower().startswith('find - imdb'):
        # a direct hit!
        pname = _unHtml(page_title[0])
        pname = pname.replace('- Filmography by type', '').strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
        if not (pid and pname):
            self._mobile_logger.error('no direct hit name/personID for'
                                      ' name %s', name)
            return res
        res.append((str(pid[0]), analyze_name(pname, canonical=1)))
    else:
        lis = _findBetween(cont, 'td class="result_text">', '</td>',
                           maxRes=results*3)
        for li in lis:
            # AKAs are collected from the whole row, before it is cut.
            akas = _findBetween(li, '<em>"', '"</em>')
            # Cut the row at each separator found, leaving only the part
            # with the link and the name.
            for sep in ['<small', '<br> aka', '<br> birth name']:
                sepIdx = li.find(sep)
                if sepIdx != -1:
                    li = li[:sepIdx]
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                self._mobile_logger.debug('no name/personID parsing'
                                          ' %s searching for name %s',
                                          li, name)
                continue
            resd = analyze_name(pname, canonical=1)
            if akas:
                resd['akas'] = akas
            res.append((str(pid[0]), resd))
    return res
def _search_movie(self, title, results):
    """Search for movies matching *title*.

    Returns a list of (movieID, data) tuples: movieID is a string, data
    is the value returned by analyze_title(..., canonical=1).  For the
    entries of a results list, data also gets an 'image' key, set to the
    image URL taken from the closest preceding cell that was not a title
    (or '' if there was none).  An empty list is returned when the page
    cannot be parsed.
    """
    cont = subXMLRefs(self._get_search_content('tt', title, results))
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        return res
    if not page_title[0].lower().startswith('imdb title'):
        # XXX: a direct hit!  The movieID is read from the 'arg' form
        #      field or, when that is missing, from the 'auto' one.
        mtitle = _unHtml(page_title[0])
        midtag = _getTagsWith(cont, 'name="arg"', maxRes=1)
        if not midtag:
            midtag = _getTagsWith(cont, 'name="auto"', maxRes=1)
        mid = None
        if midtag:
            mid = _findBetween(midtag[0], 'value="', '"', maxRes=1)
            if mid and not mid[0].isdigit():
                mid = re_imdbID.findall(mid[0])
        if not (mid and mtitle):
            return res
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            mtitle += ' (mini)'
        res.append((str(mid[0]), analyze_title(mtitle, canonical=1)))
    else:
        cont = _reAKAS.sub('</td>', cont)
        lis = _findBetween(cont, 'td valign="top">', ['</td>', '</small>'])
        # 'img' must exist even if the very first cell is already a
        # valid title; otherwise reading it raises UnboundLocalError.
        img = ''
        for li in lis:
            imdbid = re_imdbID.findall(li)
            mtitle = _unHtml(li)
            if not (imdbid and mtitle):
                # Not a title cell: remember its image (if any) for the
                # next title cell.
                # NOTE(review): presumably this is the thumbnail cell
                # that precedes each title -- confirm on a real page.
                imgs = _findBetween(li, '<img src="', ['" '])
                if imgs:
                    img = imgs[0]
                else:
                    img = ''
                continue
            mtitle = mtitle.replace('(TV mini-series)', '(mini)')
            movie = (str(imdbid[0]), analyze_title(mtitle, canonical=1))
            movie[1]['image'] = img
            res.append(movie)
    return res
def _search_person(self, name, results):
    """Search for people matching *name*.

    Returns a list of (personID, data) tuples, with personID as a string
    and data being the value of analyze_name(..., canonical=1); the list
    is empty when the search page cannot be parsed.
    """
    page = subXMLRefs(self._get_search_content('nm', name, results))
    page_titles = _findBetween(page, '<title>', '</title>', maxRes=1)
    if not page_titles:
        return []

    if page_titles[0].lower().startswith('imdb name'):
        # A list of results: one candidate per matching table cell.
        matches = []
        for cell in _findBetween(page, 'td valign="top">',
                                 ['<small', '</td>', '<br']):
            cell_ids = re_imdbID.findall(cell)
            cell_text = _unHtml(cell)
            if cell_ids and cell_text:
                matches.append((str(cell_ids[0]),
                                analyze_name(cell_text, canonical=1)))
        return matches

    # XXX: a direct hit!
    person_name = _unHtml(page_titles[0])
    # First attempt: the link to the message board (available only for
    # people who already have messages there).
    board_tags = _getTagsWith(page, '/board/nest/', maxRes=1)
    person_ids = None
    if board_tags:
        person_ids = _findBetween(board_tags[0], '/name/nm', '/', maxRes=1)
    if not (person_ids and person_name):
        # Second attempt: the name comes from the 'credited alongside'
        # field, the personID from the link to the biography.
        primary_tags = _getTagsWith(page, 'NAME="primary"', maxRes=1)
        if not primary_tags:
            return []
        quoted_names = _findBetween(primary_tags[0], 'VALUE="', '"',
                                    maxRes=1)
        if not quoted_names:
            return []
        person_name = unquote(quoted_names[0])
        person_ids = _findBetween(page, '/name/nm', '/bio', maxRes=1)
        if not person_ids:
            return []
    if not (person_ids and person_name):
        return []
    return [(str(person_ids[0]), analyze_name(person_name, canonical=1))]
def _search_person(self, name, results):
    """Search for people matching *name*.

    Returns a list of (personID, data) tuples, with personID as a string
    and data being the value of analyze_name(..., canonical=1); the list
    is empty when the search page cannot be parsed.
    """
    page = subXMLRefs(self._get_search_content('nm', name, results))
    page_titles = _findBetween(page, '<title>', '</title>', maxRes=1)
    if not page_titles:
        return []

    if page_titles[0].lower().startswith('imdb name'):
        # A list of results: one candidate per matching table cell.
        matches = []
        for cell in _findBetween(page, 'td valign="top">',
                                 ['<small', '</td>', '<br> aka']):
            cell_ids = re_imdbID.findall(cell)
            cell_text = _unHtml(cell)
            if cell_ids and cell_text:
                matches.append((str(cell_ids[0]),
                                analyze_name(cell_text, canonical=1)))
        return matches

    # XXX: a direct hit!
    person_name = _unHtml(page_titles[0])
    # First attempt: the link to the message board (available only for
    # people who already have messages there).
    board_tags = _getTagsWith(page, '/board/nest/', maxRes=1)
    person_ids = None
    if board_tags:
        person_ids = _findBetween(board_tags[0], '/name/nm', '/', maxRes=1)
    if not (person_ids and person_name):
        # Second attempt: the name comes from the 'credited alongside'
        # field, the personID from the first '/name/nm' link in the page.
        primary_tags = _getTagsWith(page, 'NAME="primary"', maxRes=1)
        if not primary_tags:
            return []
        quoted_names = _findBetween(primary_tags[0], 'VALUE="', '"',
                                    maxRes=1)
        if not quoted_names:
            return []
        person_name = unquote(quoted_names[0])
        person_ids = _findBetween(page, '/name/nm', ('/', '"', '>'),
                                  maxRes=1)
        if not person_ids:
            return []
    if not (person_ids and person_name):
        return []
    return [(str(person_ids[0]), analyze_name(person_name, canonical=1))]
def _clean_html(self, html):
    """Return the retrieved html in a normalized form.

    Every match of re_spaces is replaced with a single space, the
    ' »' sequences are dropped and the result is passed to subXMLRefs.
    """
    # NOTE(review): the first character of the ' »' literal may be a
    # non-breaking space rather than a plain one -- confirm against the
    # original bytes of the file.
    without_arrows = re_spaces.sub(' ', html).replace(' »', '')
    return subXMLRefs(without_arrows)
def _clean_html(self, html):
    """Return the retrieved html in a normalized form.

    Every match of re_spaces is replaced with a single space and the
    result is passed to subXMLRefs.
    """
    return subXMLRefs(re_spaces.sub(' ', html))