def lookup_nyt_review(content):
    """Search the NYT movie-search page for a review link for *content*.

    Returns the absolute URL of the N.Y. Times review for the first search
    result whose title matches, or None when the search page cannot be
    fetched or no result matches.
    """
    name = content.simple_name().encode('utf-8')
    title, year = common.detect_title_year(name)
    url = 'http://movies.nytimes.com/gst/movies/msearch.html?%s'
    data = {'query': title}
    url = url % urllib.urlencode(data)
    _, page = common.get_page(url)
    if not page:
        logging.error("Couldn't get NYT search page for '%s'" % content)
        return None
    doc = B(page)
    entertainment_results = doc.findChild(
        'div', attrs={'id': 'entertainment_results'})
    results_container = entertainment_results.findChild(
        'ol') if entertainment_results else None
    results = results_container.findChildren(
        'li', recursive=False) if results_container else []
    for result in results:
        title_header = result.findChild('h3')
        title_link = title_header.findChild('a') if title_header else None
        nyt_title = title_link.string if title_link else None
        if not nyt_title:
            logging.warning("Couldn't find title node for '%s'" % title)
            continue
        # NYT pads titles with non-breaking spaces; normalize before matching.
        nyt_title = nyt_title.replace(u'\xa0', ' ')
        nyt_title = nyt_title.encode('utf-8')
        nyt_title, nyt_year = common.detect_title_year(nyt_title)
        if not common.title_match(title, nyt_title):
            try:
                logging.warning(
                    "Skipping NYT title '%s' because it didn't match '%s'"
                    % (nyt_title, title))
            except Exception:
                # BUG FIX: this handler previously dropped into pdb and
                # printed the exception — debugger code left in production.
                # Some titles contain characters that break log formatting;
                # a logging failure must never abort the lookup.
                logging.exception("Failed to log NYT title mismatch for '%s'"
                                  % title)
            continue
        extra_links = result.findChild('ul')
        if extra_links:
            for link in extra_links.findChildren('a'):
                if link.string == "N.Y.Times Review":
                    return 'http://movies.nytimes.com%s' % link.get('href')
    return None
def imdb_metadata_search(content):
    """Attempt to lookup the IMDB page for a ContentNode if we do not know
    its IMDB ID.  Parses the results of the IMDB Advanced Search page.

    Deprecated.
    """
    name = content.simple_name().encode('utf-8')
    title, year = common.detect_title_year(name)
    logging.info("Finding IMDB ID for content named '%s'" % name)
    if year is None:
        logging.info("Couldn't split '%s' into title/year, skipping IMDb ID detection." % name)
        return None
    year = int(year)
    # Search one year either side of the detected year to tolerate
    # off-by-one release dates in filenames.
    years = "%d,%d" % (year - 1, year + 1)
    # BUG FIX: removed dead assignment of the legacy /List URL that was
    # immediately overwritten by the advanced-search URL below.
    url = u'http://www.imdb.com/search/title?'
    data = {'title': title, 'release_date': years}
    try:
        url = url + urllib.urlencode(data)
    except Exception:
        logging.error("Could not URL encode %s" % str(data))
        return None
def update_rottentomatoes_metadata(node, force=False):
    """Fetch and store Rotten Tomatoes metadata for *node*.

    Args:
      node: content node with a .metadata relation.
      force: when True, rescrape even if metadata is already present.

    Returns True on success (or when usable metadata already exists),
    False when the RT id or page metadata cannot be found or an
    unexpected error occurs.
    """
    try:
        logging.info("Looking up RT metadata for '%s'" % node)
        rt = node.metadata.rotten_tomatoes
        if rt and not force:
            logging.info("RT metadata already present for '%s'. Skipping." % node)
            return True
        name = node.simple_name()
        title, year = common.detect_title_year(name)
        rt_id = rt.rt_id if rt else None
        if not rt_id:
            imdb_id = node.metadata.imdb.imdb_id if node.metadata.imdb else None
            rt_id = rottentomatoes_find_id(title, year, imdb_id=imdb_id)
            if not rt_id:
                return False
        elif not force:
            logging.info("RT metadata already found for '%s', skipping." % node)
            return True
        # If we already have an RT node with this RT id and we're not erasing
        # existing data, we don't need to rescrape the page.
        if not force:
            try:
                rt = models.RottenTomatoesMetadata.objects.get(rt_id=rt_id)
            except models.RottenTomatoesMetadata.DoesNotExist:
                # We don't have it, so continue with the scraping.
                pass
            else:
                logging.info("Found existing RottenTomatoesMetadata for '%s'"
                             % node)
                # BUG FIX: the attribute is 'rotten_tomatoes' (read at the top
                # of this function and written at the bottom); the previous
                # 'rottentomatoes' name silently failed to link the record.
                node.metadata.rotten_tomatoes = rt
                node.metadata.save()
                node.save()
                return True
        (rt, _) = models.RottenTomatoesMetadata.objects.get_or_create(rt_id=rt_id)
        rt.rt_uri = u'http://www.rottentomatoes.com/m/%s/' % rt_id
        rt.save()
        metadata = rottentomatoes_parse_page(rt.rt_id)
        if metadata is None:
            logging.error("Could not find metadata for '%s'" % node)
            return False
        if 'rt_thumb_uri' in metadata:
            rt.thumb_uri = metadata['rt_thumb_uri']
            try:
                rt.thumb_width = int(metadata['rt_thumb_width'])
                rt.thumb_height = int(metadata['rt_thumb_height'])
            except (KeyError, TypeError, ValueError):
                # Thumb dimensions are optional; ignore missing or malformed
                # values rather than failing the whole update.
                pass
        if 'rt_top_percent' in metadata:
            rt.top_critics_percent = metadata['rt_top_percent']
            rt.top_critics_fresh = metadata['rt_top_fresh']
        if 'rt_all_percent' in metadata:
            rt.all_critics_percent = metadata['rt_all_percent']
            rt.all_critics_fresh = metadata['rt_all_fresh']
        # TODO(XXX) handle rt_directors and rt_actors
        rt.save()
        node.metadata.rotten_tomatoes = rt
        node.metadata.save()
        node.save()
        return True
    except Exception as ex:
        traceback.print_exc()
        logging.error("Could not update metadata for '%s'. Got exception: %s"
                      % (node, ex))
        return False
def update_imdb_metadata(node, force=False, erase=False):
    """Fetch and store IMDb metadata (genres, cast, cover image, etc.) for
    *node*.

    Args:
      node: content node with a .metadata relation.
      force: rescrape the IMDb page even if metadata already exists.
      erase: re-run id detection and overwrite existing data.

    Returns True on success (or when usable metadata already exists),
    False when the IMDb id or page metadata cannot be found or an
    unexpected error occurs.
    """
    try:
        name = node.simple_name().encode('utf-8')
        title, year = common.detect_title_year(name)
        imdb = node.metadata.imdb
        imdb_id = imdb.imdb_id if imdb else None
        if not imdb_id or erase:
            imdb_id = imdb_find_id(title, year)
            if not imdb_id:
                return False
        elif not force:
            logging.info("IMDb metadata already found for '%s', skipping."
                         % node)
            return True
        # If we already have an IMDB node with this IMDB id and we're not
        # erasing existing data, we don't need to rescrape the page.
        if not force and not erase:
            try:
                imdb = models.IMDBMetadata.objects.get(imdb_id=imdb_id)
            except models.IMDBMetadata.DoesNotExist:
                # We don't have it, so continue with the scraping.
                pass
            else:
                logging.info("Found existing IMDBMetadata for '%s'" % node)
                node.metadata.imdb = imdb
                node.metadata.save()
                node.save()
                return True
        (imdb, _) = models.IMDBMetadata.objects.get_or_create(imdb_id=imdb_id)
        imdb.imdb_id = imdb_id
        imdb.imdb_uri = u'http://www.imdb.com/title/tt%s/' % imdb_id
        imdb.save()
        fetched = imdb_parse_page_metadata(imdb_id)
        if not fetched:
            logging.error("Couldn't lookup IMDB metadata for '%s'" % node)
            return False
        if 'imdb_genres' in fetched:
            for genre in fetched['imdb_genres']:
                (gnode, _) = models.Genre.objects.get_or_create(name=genre)
                gnode.save()
                imdb.genres.add(gnode)
        if 'imdb_directors' in fetched:
            for director in fetched['imdb_directors']:
                (dnode, _) = models.Director.objects.get_or_create(name=director)
                dnode.save()
                imdb.directors.add(dnode)
        if 'imdb_actors' in fetched:
            for pos, (actor, role) in enumerate(fetched['imdb_actors']):
                (anode, _) = models.Actor.objects.get_or_create(name=actor)
                anode.save()
                # Billing position is stored 1-based.
                (rnode, _) = models.Role.objects.get_or_create(
                    actor=anode, imdb=imdb, role=role, bill_pos=pos + 1)
                rnode.save()
        # TODO Handle imdb_releasedate, imdb_writers, imdb_tagline
        if 'imdb_cover_uri' in fetched:
            cover_uri = fetched['imdb_cover_uri']
            cover_width = fetched['imdb_cover_width']
            _, ext = os.path.splitext(cover_uri)
            saved_name = "%s_%s%s" % (imdb_id, cover_width, ext)
            storage_path = os.path.join(imdb.thumb_image.field.upload_to,
                                        saved_name)
            storage_abs_path = os.path.join(settings.MEDIA_ROOT, storage_path)
            try:
                if is_valid_image(storage_abs_path):
                    logging.info("IMDb thumb image already exists.")
                    imdb.thumb_image = storage_path
                    # Force Django to read the image width and height.
                    # Sometimes it tries to be lazy about reading this data,
                    # which can cause PIL-related exceptions during template
                    # rendering.
                    if imdb.thumb_width == 0 or imdb.thumb_height == 0:
                        raise ValueError("Invalid image width and height")
                else:
                    saved_name, _ = urllib.urlretrieve(cover_uri,
                                                       storage_abs_path)
                    assert (os.path.realpath(saved_name) ==
                            os.path.realpath(storage_abs_path))
                    if is_valid_image(saved_name):
                        # Store the source URI used
                        imdb.thumb_uri = cover_uri
                        imdb.thumb_image = storage_path
                        # thumb_width and thumb_height are filled automatically
            except Exception:
                logging.error("Couldn't lookup IMDb cover from given URI: %s"
                              % fetched['imdb_cover_uri'])
                # BUG FIX: print_exc()'s first parameter is a traceback depth
                # limit, not an exception instance; passing 'e' misused it.
                traceback.print_exc()
            else:
                logging.info("Fetched thumbnail from: %s"
                             % fetched['imdb_cover_uri'])
        if 'imdb_outline' in fetched:
            imdb.plot_outline = fetched['imdb_outline']
        if 'imdb_runtime' in fetched:
            imdb.length = fetched['imdb_runtime']
        if 'imdb_rating' in fetched:
            imdb.rating = fetched['imdb_rating']
        imdb.save()
        node.metadata.imdb = imdb
        node.metadata.save()
        node.save()
        return True
    except Exception as ex:
        # NOTE(review): the outer 'except' was lost in the mangled source;
        # reconstructed to mirror update_rottentomatoes_metadata — confirm
        # against the original file.
        traceback.print_exc()
        logging.error("Could not update metadata for '%s'. Got exception: %s"
                      % (node, ex))
        return False
# NOTE(review): this is a fragment of a result-parsing loop — the enclosing
# 'for' statement and the 'name'/'title' bindings are outside this view
# (the 'continue' statements require a surrounding loop).  Presumably it is
# the body of the IMDb advanced-search result loop used by
# imdb_metadata_search; confirm against the full file.
# Collect per-result fields into 'extras' for the caller.
extras = {}
link = result_node.findChild('a')
if link is None:
    logging.error("Could not get link node of result for '%s', skipping." % name)
    continue
extras['imdb_uri'] = imdb_uri = link.get('href')
# Extract the ttNNNNNNN id from the result's /title/ link.
imdb_id_match = re.match('/title/(?P<imdb_id>tt[0-9]+)/*', imdb_uri)
if not imdb_id_match:
    continue
extras['imdb_id'] = imdb_id_match.groupdict()['imdb_id']
imdb_name = link.get('title')
imdb_title, imdb_year = common.detect_title_year(imdb_name)
imdb_title = imdb_title.encode('utf-8')
extras['imdb_canonical_title'] = imdb_name
extras['imdb_title'] = imdb_name
if imdb_year is not None:
    extras['imdb_year'] = imdb_year
# Skip results whose parsed title doesn't match the content's title.
if not common.title_match(title, imdb_title):
    logging.info("Skipping IMDB title '%s' because it didn't match '%s'" % (imdb_title, title))
    continue
# Optional thumbnail image from the result row's image cell.
thumb_node = result_node.findChild('td', attrs={'class':'image'})
thumb_image = thumb_node.findChild('img') if thumb_node is not None else None
if thumb_image:
    extras['imdb_thumb_uri'] = thumb_image.get('src')
def lookup_metacritic_metadata(content):
    """Search Metacritic for *content* and return its metadata.

    Returns a dict with 'mc_uri', 'mc_id', 'mc_status' and (when the score
    parses) 'mc_score' for the first search result whose title matches, or
    None when the search page cannot be fetched or no result matches.
    """
    metadata = {}
    name = content.simple_name()
    title, year = common.detect_title_year(name)
    url_kind_map = {
        models.KIND_MOVIE: 'http://www.metacritic.com/search/movie/%s/results',
        models.KIND_SERIES: 'http://www.metacritic.com/search/tv/%s/results',
        models.KIND_TV: 'http://www.metacritic.com/search/tv/%s/results',
        models.KIND_SEASON: 'http://www.metacritic.com/search/tv/%s/results',
    }
    url = url_kind_map[content.kind]
    # Remove special characters that the regular metacritic search seems to
    # remove anyway, then join the remaining words with '+'.
    title_utf8 = title.encode('utf-8')
    title_stripped = re.sub(r'[!@#$%^&*();.,?]', '', title_utf8).strip()
    title_stripped = re.sub(r'[:\-\s]', '+', title_stripped)
    url = url % title_stripped
    logging.info("Trying to search: %s" % url)
    _, page = common.get_page(url)
    if not page:
        logging.error("Couldn't get metacritic page for '%s'" % content)
        return None
    doc = B(page)
    # Walk the search results for the first title match.
    results = doc.findAll('li', attrs={'class': re.compile('result')})
    for result in results:
        title_node = result.findChild(
            'h3', attrs={'class': re.compile('product_title')})
        title_link = title_node.findChild('a') if title_node else None
        mc_title = title_link.string if title_link else None
        if not title_link or not mc_title:
            # BUG FIX: corrected "Could't" typo in the log message.
            logging.warning("Couldn't find MC title link for result.")
            continue
        mc_title = mc_title.strip()
        if not common.title_match(title, mc_title):
            try:
                logging.warning(u"Skipping MC title '%s' because it didn't "
                                "match '%s'" % (mc_title, title))
            except Exception:
                # BUG FIX: print_exc()'s first parameter is a traceback depth
                # limit, not an exception instance; passing 'e' misused it.
                # Titles with odd characters can break log formatting; never
                # let that abort the lookup.
                traceback.print_exc()
            continue
        logging.info("Found a matching title, '%s' for '%s'" % (mc_title, title))
        mc_url = title_link.get('href')
        id_match = re.match(r'/(?P<type>movie|tv)/(?P<mc_id>.*)', mc_url)
        if not id_match:
            logging.warning("Couldn't find MC id from link '%s'." % mc_url)
            continue
        metadata['mc_uri'] = mc_url
        metadata['mc_id'] = id_match.groupdict()['mc_id']
        metascore_node = result.findChild(
            'span', attrs={'class': re.compile('metascore')})
        metascore = metascore_node.string if metascore_node else None
        if metascore:
            # Map the CSS class on the score node to a coarse status.
            metascore_class = metascore_node.get('class')
            score = 'unknown'
            if 'score_outstanding' in metascore_class:
                score = 'outstanding'
            elif 'score_favorable' in metascore_class:
                score = 'favorable'
            elif 'score_mixed' in metascore_class:
                score = 'mixed'
            elif 'score_unfavorable' in metascore_class:
                score = 'unfavorable'
            elif 'score_terrible' in metascore_class:
                score = 'terrible'
            elif 'score_tbd' in metascore_class:
                score = 'tbd'
            metadata['mc_status'] = score
            try:
                metadata['mc_score'] = int(metascore)
            except (TypeError, ValueError):
                logging.error("Couldn't convert metascore '%s' to integer."
                              % metascore)
        return metadata