def lookup_nyt_review(content):
    """Search the NYT movie site for ``content`` and return its review URL.

    The search title is derived from ``content.simple_name()``. Each search
    result whose title matches it (per ``common.title_match``) is scanned
    for a link labelled "N.Y.Times Review".

    Returns:
        The review URL (the link's href prefixed with the NYT movies host)
        of the first matching result that has such a link, or None when the
        search page could not be fetched or no result qualifies.
    """
    name = content.simple_name().encode('utf-8')
    title, _ = common.detect_title_year(name)

    query = urllib.urlencode({'query': title})
    url = 'http://movies.nytimes.com/gst/movies/msearch.html?%s' % query
    _, page = common.get_page(url)
    if not page:
        logging.error("Couldn't get NYT search page for '%s'" % content)
        return None

    doc = B(page)
    entertainment_results = doc.findChild(
        'div', attrs={'id': 'entertainment_results'})
    results_container = (entertainment_results.findChild('ol')
                         if entertainment_results else None)
    # Only the direct <li> children are search results.
    results = (results_container.findChildren('li', recursive=False)
               if results_container else [])

    for result in results:
        title_header = result.findChild('h3')
        title_link = title_header.findChild('a') if title_header else None
        nyt_title = title_link.string if title_link else None
        if not nyt_title:
            logging.warning("Couldn't find title node for '%s'" % title)
            continue

        # Replace non-breaking spaces with plain spaces before the title is
        # encoded and parsed.
        nyt_title = nyt_title.replace(u'\xa0', ' ')
        nyt_title = nyt_title.encode('utf-8')
        nyt_title, _ = common.detect_title_year(nyt_title)

        if not common.title_match(title, nyt_title):
            try:
                logging.warning(
                    "Skipping NYT title '%s' because it didn't match '%s'"
                    % (nyt_title, title))
            except Exception:
                # A failure while formatting the log line must not abort
                # the lookup (and must never drop into a debugger).
                logging.exception("Couldn't log skipped NYT title")
            continue

        extra_links = result.findChild('ul')
        if extra_links:
            for link in extra_links.findChildren('a'):
                if link.string == "N.Y.Times Review":
                    return 'http://movies.nytimes.com%s' % link.get('href')

    return None
def rottentomatoes_find_id(title, year=None, imdb_id=None):
    """Search Rotten Tomatoes for ``title`` and return the matching movie ID.

    Args:
        title: Title to search for; it is encoded as latin1 for the query.
        year: Optional release year. When given, a result is accepted only
            if the text of its year span equals this value.
        imdb_id: Currently unused by this function.

    Returns:
        The ``id`` group captured by ``rottentomatoes_id_pattern`` from the
        first acceptable result link, or None when nothing matches or the
        lookup fails.
    """
    # Find the content by search.
    url = u"http://www.rottentomatoes.com/search/movie.php?%s"
    title_latin1 = title.encode('latin1')
    data = {'searchby': 'movies', 'search': title_latin1}

    # Matches a trailing "(...)" group, used to peel AKAs off a title.
    endparen_pattern = re.compile(r"\(([^\(\)]+)\)$")
    # Same group plus any whitespace before it, used to strip it out.
    trailing_aka_pattern = re.compile(r"\s*\(([^\(\)]+)\)$")

    try:
        url = url % urllib.urlencode(data)
        logging.info("Executing RT regular search for '%s' at '%s'"
                     % (title, url))
        _, page = common.get_page(url)
        if not page:
            # Other lookups in this file treat an empty page as a failed
            # fetch; do the same here instead of parsing nothing.
            logging.error("Couldn't lookup RT ID for '%s (%s)'"
                          % (title, year))
            return None

        # BeautifulSoup can't handle hex entities. Massage them into
        # decimal, on top of (not instead of) the default massage rules.
        hexentity_massage = copy.copy(B.MARKUP_MASSAGE)
        hexentity_massage.append(
            (re.compile('&#x([^;]+);'),
             lambda m: '&#%d;' % int(m.group(1), 16)))
        document = B(page, convertEntities=B.HTML_ENTITIES,
                     markupMassage=hexentity_massage)

        results_ul = document.findChild(
            'ul', attrs={'id': re.compile('movie_results_ul')})
        results = (results_ul.findAll(
            'li', attrs={'class': re.compile('media_block')})
            if results_ul else None)
        if results is None:
            logging.error("Couldn't lookup RT ID for '%s (%s)'"
                          % (title, year))
            return None

        for result_node in results:
            # Scope in on the content div, because otherwise we get the
            # poster image.
            content_div = result_node.findChild(
                'div', attrs={'class': re.compile('media_block_content')})
            link = (content_div.findChild(
                'a', attrs={'href': rottentomatoes_id_pattern})
                if content_div else None)
            link_title = link.string if link else None
            if not link_title:
                logging.error("Couldn't find RT result link title. Skipping")
                continue

            # Try the original title first.
            titles = [link_title]

            # Rotten Tomatoes annoyingly embeds the AKAs in the title in
            # parens following the head title. For example:
            #  - Batoru rowaiaru II: Chinkonka (Battle Royale II)
            #  - Battle Royale (Batoru Rowaiaru)
            endparen_match = endparen_pattern.search(link_title)
            while endparen_match:
                titles.append(endparen_match.group(1))
                # Strip out the ending (title) and any spaces before it.
                link_title = trailing_aka_pattern.sub('', link_title)
                endparen_match = endparen_pattern.search(link_title)
                # Add the final version of the title with the AKAs removed
                # to the title list.
                if not endparen_match:
                    titles.append(link_title)

            found_title = None
            for aka in titles:
                if common.title_match(title, aka):
                    logging.info("Found RT title match '%s' for '%s'"
                                 % (aka, title))
                    found_title = aka
                    break
                try:
                    logging.warning(
                        u"Skipping RT title '%s' because it didn't match '%s'"
                        % (aka, title))
                except Exception:
                    # Formatting the log line failed; report it and carry on.
                    traceback.print_exc()
            if not found_title:
                continue

            span_year = result_node.findChild(
                'span', attrs={'class': re.compile('movie_year')})
            # A result without a year span keeps link_year as None instead
            # of crashing on None.strip().
            link_year = None
            if span_year and span_year.string:
                link_year = unicode(span_year.string).strip(' ()')
            # NOTE(review): link_year is text, so `year` presumably has to
            # be a string as well for this to ever match - confirm callers.
            if year and link_year != year:
                logging.info("Link '%s's year '%s' doesn't match '%s'."
                             % (link_title, link_year, year))
                continue

            # Get RT ID
            link_href = link.get('href')
            link_match = rottentomatoes_id_pattern.match(link_href)
            if not link_match:
                # The link was selected with this pattern, but validate
                # explicitly rather than with an assert (stripped under -O).
                logging.error("Couldn't parse RT ID from link '%s'. Skipping"
                              % link_href)
                continue
            return link_match.groupdict()['id']
    except Exception:
        traceback.print_exc()
        logging.error("Couldn't lookup RT ID for '%s (%s)'" % (title, year))
    return None
imdb_id_match = re.match('/title/(?P<imdb_id>tt[0-9]+)/*', imdb_uri) if not imdb_id_match: continue extras['imdb_id'] = imdb_id_match.groupdict()['imdb_id'] imdb_name = link.get('title') imdb_title, imdb_year = common.detect_title_year(imdb_name) imdb_title = imdb_title.encode('utf-8') extras['imdb_canonical_title'] = imdb_name extras['imdb_title'] = imdb_name if imdb_year is not None: extras['imdb_year'] = imdb_year if not common.title_match(title, imdb_title): logging.info("Skipping IMDB title '%s' because it didn't match '%s'" % (imdb_title, title)) continue thumb_node = result_node.findChild('td', attrs={'class':'image'}) thumb_image = thumb_node.findChild('img') if thumb_node is not None else None if thumb_image: extras['imdb_thumb_uri'] = thumb_image.get('src') extras['imdb_thumb_width'] = thumb_image.get('width') extras['imdb_thumb_height'] = thumb_image.get('height') runtime_node = result_node.findChild('span', attrs={'class': 'runtime'}) if runtime_node: runtime_match = re.match("(?P<length>\d+) mins.", runtime_node.string) if runtime_match: extras['imdb_length'] = int(runtime_match.groupdict()['length'])
def imdb_find_id(title, year=None):
    """Search IMDb for ``title`` and return the matching title ID.

    Args:
        title: UTF-8 encoded title; it is decoded, then re-encoded as latin1
            for the search query.
        year: Optional release year. When given, a result is accepted only
            if the four-digit year following its link equals this value.

    Returns:
        The ``imdb_id`` group of ``imdb_title_pattern`` when IMDb redirects
        straight to a title page; otherwise the seven digits from the first
        acceptable result link; or None when nothing matches or the lookup
        fails.
    """
    # NOTE(review): the search path below returns the digits without the
    # "tt" prefix; confirm imdb_title_pattern's imdb_id group has the same
    # shape, otherwise the two return paths disagree.
    title = title.decode('utf8')
    url = u'http://www.imdb.com/find?%s'
    data = {'s': 'tt', 'q': title.encode('latin1')}

    result_href_pattern = re.compile(r'^/title/tt\d{7}/$')
    year_pattern = re.compile(r'\((?P<year>\d{4}).*?\)')
    id_pattern = re.compile(r'^/title/tt(?P<imdb_id>\d{7})/')

    try:
        url = url % urllib.urlencode(data)
        logging.info("Executing IMDB regular search for '%s' at '%s'"
                     % (title, url))
        result_url, page = common.get_page(url)

        if result_url:
            result_url = result_url.replace('http://www.imdb.com', '')
            result_url_match = imdb_title_pattern.match(result_url)
            if result_url_match:
                # IMDb saw fit to redirect us to the thing we searched for.
                # Let's trust them?
                logging.info("IMDb redirected us to '%s', trusting them."
                             % result_url)
                return result_url_match.groupdict()['imdb_id']

        if not page:
            # Other lookups in this file treat an empty page as a failed
            # fetch; do the same here instead of parsing nothing.
            logging.error("Couldn't get IMDB regular search for '%s'" % title)
            return None

        # BeautifulSoup can't handle hex entities. Massage them into
        # decimal, on top of (not instead of) the default massage rules.
        hexentity_massage = copy.copy(B.MARKUP_MASSAGE)
        hexentity_massage.append(
            (re.compile('&#x([^;]+);'),
             lambda m: '&#%d;' % int(m.group(1), 16)))
        document = B(page, convertEntities=B.HTML_ENTITIES,
                     markupMassage=hexentity_massage)

        links = document.findAll('a', attrs={'href': result_href_pattern})
        for link in links:
            link_title = link.string
            if not link_title:
                continue
            if not common.title_match(title, link_title):
                logging.info("Skipping IMDB link title '%s' because it "
                             "didn't match '%s'" % (link_title, title))
                continue

            # The year is read from the text node right after the link.
            link_year = link.nextSibling
            if not isinstance(link_year, basestring):
                continue
            link_year_match = year_pattern.match(link_year.strip())
            link_year = (link_year_match.groupdict()['year']
                         if link_year_match else None)
            if not link_year:
                continue
            if year and link_year != year:
                logging.info("Link '%s's year '%s' doesn't match '%s'."
                             % (link_title, link_year, year))
                continue

            imdb_url = link.get('href')
            imdb_match = id_pattern.match(imdb_url)
            if not imdb_match:
                # The nodes were selected with an equivalent regex, but
                # validate explicitly rather than with an assert (stripped
                # under -O).
                logging.error("Couldn't parse IMDB ID from link '%s'"
                              % imdb_url)
                continue
            logging.info("Found match for '%s (%s)': '%s (%s)'"
                         % (title, year, link_title, link_year))
            return imdb_match.groupdict()['imdb_id']

        logging.error("Found no matches for '%s'" % title)
    except Exception:
        logging.error("Couldn't get IMDB regular search for '%s'" % title)
        traceback.print_exc()
    return None
def lookup_metacritic_metadata(content):
    """Search Metacritic for ``content`` and return metadata for the match.

    The search URL is chosen from ``content.kind`` (movie vs. TV) and the
    title derived from ``content.simple_name()``. The first result whose
    title matches (per ``common.title_match``) and whose link yields an ID
    is used.

    Returns:
        A dict with ``mc_uri`` and ``mc_id``, plus ``mc_status`` and (when
        the score parses as an integer) ``mc_score`` if the result shows a
        metascore. None when the page could not be fetched or no result
        qualifies.
    """
    name = content.simple_name()
    title, _ = common.detect_title_year(name)

    tv_search = 'http://www.metacritic.com/search/tv/%s/results'
    url_kind_map = {
        models.KIND_MOVIE: 'http://www.metacritic.com/search/movie/%s/results',
        models.KIND_SERIES: tv_search,
        models.KIND_TV: tv_search,
        models.KIND_SEASON: tv_search,
    }
    url = url_kind_map[content.kind]

    # Remove special characters that the regular metacritic search seems to
    # remove anyway, then join the remaining words with '+'.
    title_utf8 = title.encode('utf-8')
    title_stripped = re.sub(r'[!@#$%^&*();.,?]', '', title_utf8).strip()
    title_stripped = re.sub(r'[:\-\s]', '+', title_stripped)
    url = url % title_stripped

    logging.info("Trying to search: %s" % url)
    _, page = common.get_page(url)
    if not page:
        logging.error("Couldn't get metacritic page for '%s'" % content)
        return None

    # Checked in this order; the first class fragment present wins.
    score_statuses = ('outstanding', 'favorable', 'mixed',
                      'unfavorable', 'terrible', 'tbd')

    doc = B(page)
    # Get results
    results = doc.findAll('li', attrs={'class': re.compile('result')})
    for result in results:
        title_node = result.findChild(
            'h3', attrs={'class': re.compile('product_title')})
        title_link = title_node.findChild('a') if title_node else None
        mc_title = title_link.string if title_link else None
        if not title_link or not mc_title:
            logging.warning("Could't find MC title link for result.")
            continue

        mc_title = mc_title.strip()
        if not common.title_match(title, mc_title):
            try:
                logging.warning(u"Skipping MC title '%s' because it didn't "
                                "match '%s'" % (mc_title, title))
            except Exception:
                # Formatting the log line failed; report it and carry on.
                traceback.print_exc()
            continue
        logging.info("Found a matching title, '%s' for '%s'"
                     % (mc_title, title))

        mc_url = title_link.get('href')
        id_match = re.match(r'/(?P<type>movie|tv)/(?P<mc_id>.*)', mc_url)
        if not id_match:
            logging.warning("Could't find MC id from link '%s'." % mc_url)
            continue

        metadata = {
            'mc_uri': mc_url,
            'mc_id': id_match.groupdict()['mc_id'],
        }

        metascore_node = result.findChild(
            'span', attrs={'class': re.compile('metascore')})
        metascore = metascore_node.string if metascore_node else None
        if metascore:
            metascore_class = metascore_node.get('class')
            score = 'unknown'
            for status in score_statuses:
                if 'score_' + status in metascore_class:
                    score = status
                    break
            metadata['mc_status'] = score
            try:
                metadata['mc_score'] = int(metascore)
            except (TypeError, ValueError):
                # Non-numeric score text; keep the status, skip the number.
                logging.error("Couldn't convert metascore '%s' to integer."
                              % metascore)
        return metadata

    return None