Exemple #1
0
def lookup_nyt_review(content):
    """Find the New York Times review URL for ``content``.

    Queries the NYT movie search page with the title detected from
    ``content.simple_name()`` and walks the entries of the
    ``entertainment_results`` list in page order, skipping entries whose
    title does not match according to ``common.title_match``.

    Args:
        content: object exposing ``simple_name()``.

    Returns:
        The absolute URL of the "N.Y.Times Review" link of the first matching
        result that has one, or None when the search page could not be
        fetched or no such link was found.
    """
    name = content.simple_name().encode('utf-8')
    title, _year = common.detect_title_year(name)

    url = 'http://movies.nytimes.com/gst/movies/msearch.html?%s'
    url = url % urllib.urlencode({'query': title})
    _, page = common.get_page(url)

    if not page:
        logging.error("Couldn't get NYT search page for '%s'" % content)
        return None

    doc = B(page)

    # Each lookup tolerates a missing parent so that an unexpected page
    # layout yields "no results" instead of an AttributeError.
    entertainment_results = doc.findChild(
        'div', attrs={'id': 'entertainment_results'})
    results_container = entertainment_results.findChild(
        'ol') if entertainment_results else None
    results = results_container.findChildren(
        'li', recursive=False) if results_container else []

    for result in results:
        title_header = result.findChild('h3')
        title_link = title_header.findChild('a') if title_header else None
        nyt_title = title_link.string if title_link else None

        if not nyt_title:
            logging.warning("Couldn't find title node for '%s'" % title)
            continue

        # The page pads titles with non-breaking spaces; normalise them
        # before handing the text to the title/year detector.
        nyt_title = nyt_title.replace(u'\xa0', ' ')
        nyt_title = nyt_title.encode('utf-8')

        nyt_title, _nyt_year = common.detect_title_year(nyt_title)

        if not common.title_match(title, nyt_title):
            # Lazy %-args: the logging module formats the message itself and
            # reports formatting errors through its own error handling, so
            # an odd encoding can no longer raise here (the original dropped
            # into pdb in that case).
            logging.warning(
                "Skipping NYT title '%s' because it didn't match '%s'",
                nyt_title, title)
            continue

        extra_links = result.findChild('ul')
        if extra_links:
            for link in extra_links.findChildren('a'):
                if link.string == "N.Y.Times Review":
                    return 'http://movies.nytimes.com%s' % link.get('href')

    return None
Exemple #2
0
def imdb_metadata_search(content):
    """Attempt to lookup the IMDB page for a ContentNode if we do not know its
    IMDB ID. Parses the results of the IMDB Advanced Search page. Deprecated

    The search is restricted to release years within one year of the year
    detected in the content name. Returns None when no year can be detected
    or when the query cannot be URL-encoded.
    """

    name = content.simple_name().encode('utf-8')
    title, year = common.detect_title_year(name)

    logging.info("Finding IMDB ID for content named '%s'" % name)

    if year is None:
        logging.info("Couldn't split '%s' into title/year, skipping IMDb ID detection." % name)
        return None

    year = int(year)

    # Allow one year of slack on either side of the detected year.
    years = "%d,%d" % (year - 1, year + 1)

    url = u'http://www.imdb.com/search/title?'

    data = {'title': title,
            'release_date': years}

    try:
        url = url + urllib.urlencode(data)
    except Exception:
        logging.error("Could not URL encode %s" % str(data))
        return None
Exemple #3
0
def update_rottentomatoes_metadata(node, force=False):
    """Attach Rotten Tomatoes metadata to ``node``, scraping it if necessary.

    Args:
        node: content node exposing ``simple_name()``, ``save()`` and a
            ``metadata`` object with ``rotten_tomatoes`` and ``imdb``
            attributes.
        force: when true, re-scrape even if the node already carries RT
            metadata or a stored record with the same RT id exists.

    Returns:
        True when the node ends up with RT metadata (already present, reused
        from an existing record, or freshly scraped); False when the RT id
        or page metadata could not be found, or when any exception occurred
        (the exception is logged, not propagated).
    """
    try:
        logging.info("Looking up RT metadata for '%s'" % node)
        rt = node.metadata.rotten_tomatoes

        if rt and not force:
            logging.info("RT metadata already present for '%s'. Skipping." % node)
            return True

        name = node.simple_name()
        title, year = common.detect_title_year(name)

        # Past this point either there is no RT record yet or force is set,
        # so a known rt_id always means "re-scrape".
        rt_id = rt.rt_id if rt else None
        if not rt_id:
            imdb_id = node.metadata.imdb.imdb_id if node.metadata.imdb else None
            rt_id = rottentomatoes_find_id(title, year, imdb_id=imdb_id)
            if not rt_id:
                return False

        # If we already have an RT node with this RT id and we're not erasing
        # existing data, we don't need to rescrape the page.
        if not force:
            try:
                rt = models.RottenTomatoesMetadata.objects.get(rt_id=rt_id)
            except models.RottenTomatoesMetadata.DoesNotExist:
                # We don't have it, so continue with the scraping.
                pass
            else:
                logging.info("Found exsting RottenTomatoesMetadata for '%s'" % node)
                # Same attribute that is read above and written after a
                # scrape; the original wrote to 'rottentomatoes' here, which
                # left the node unlinked.
                node.metadata.rotten_tomatoes = rt
                node.metadata.save()
                node.save()
                return True

        (rt, _) = models.RottenTomatoesMetadata.objects.get_or_create(rt_id=rt_id)
        rt.rt_uri = u'http://www.rottentomatoes.com/m/%s/' % rt_id
        rt.save()

        metadata = rottentomatoes_parse_page(rt.rt_id)

        if metadata is None:
            logging.error("Could not find metadata for '%s'" % node)
            return False

        if 'rt_thumb_uri' in metadata:
            rt.thumb_uri = metadata['rt_thumb_uri']
            # Dimensions are best-effort: keep the URI even when they are
            # missing or not numeric.
            try:
                rt.thumb_width = int(metadata['rt_thumb_width'])
                rt.thumb_height = int(metadata['rt_thumb_height'])
            except (KeyError, TypeError, ValueError):
                pass

        if 'rt_top_percent' in metadata:
            rt.top_critics_percent = metadata['rt_top_percent']
            rt.top_critics_fresh = metadata['rt_top_fresh']

        if 'rt_all_percent' in metadata:
            rt.all_critics_percent = metadata['rt_all_percent']
            rt.all_critics_fresh = metadata['rt_all_fresh']

        #TODO(XXX) handle rt_directors and rt_actors

        rt.save()
        node.metadata.rotten_tomatoes = rt
        node.metadata.save()
        node.save()
        return True
    except Exception as ex:
        traceback.print_exc()
        logging.error("Could not update metadata for '%s'. Got exception: %s" % (node, ex))
        # Report failure explicitly, like every other failure path above.
        return False
Exemple #4
0
def update_imdb_metadata(node, force=False, erase=False):
    # Attach IMDb metadata to `node`, looking up the IMDb id and scraping the
    # title page when needed.
    #
    #   node:  content node exposing simple_name(), save() and
    #          metadata.imdb.
    #   force: re-scrape even when the node already has an IMDb id or a
    #          stored IMDBMetadata record with that id exists.
    #   erase: look the IMDb id up again even if one is known, and do not
    #          reuse a stored IMDBMetadata record.
    #
    # Returns True when metadata is already present, reused or freshly
    # stored; False when no IMDb id is found or the page yields no metadata.
    #
    # NOTE(review): the handler for the `try:` below is not visible in this
    # excerpt, so behavior on exceptions cannot be documented from here.
    try:
        name = node.simple_name().encode('utf-8')
        title, year = common.detect_title_year(name)
        imdb = node.metadata.imdb
        imdb_id = imdb.imdb_id if imdb else None

        if not imdb_id or erase:
            imdb_id = imdb_find_id(title, year)
            if not imdb_id:
                return False
        elif not force:
            logging.info("IMDb metadata already found for '%s', skipping." % node)
            return True

        # If we already have an IMDB node with this IMDB id and we're not
        # erasing existing data, we don't need to rescrape the page.
        if not force and not erase:
            try:
                imdb = models.IMDBMetadata.objects.get(imdb_id=imdb_id)
            except models.IMDBMetadata.DoesNotExist:
                # We don't have it, so continue with the scraping.
                pass
            else:
                logging.info("Found exsting IMDBMetadata for '%s'" % node)
                node.metadata.imdb = imdb
                node.metadata.save()
                node.save()
                return True

        # Create (or fetch) the record and persist id/URI before scraping.
        (imdb, _) = models.IMDBMetadata.objects.get_or_create(imdb_id=imdb_id)
        imdb.imdb_id = imdb_id
        imdb.imdb_uri = u'http://www.imdb.com/title/tt%s/' % imdb_id
        imdb.save()

        fetched = imdb_parse_page_metadata(imdb_id)

        if not fetched:
            logging.error("Couldn't lookup IMDB metadata for '%s'" % node)
            return False

        if 'imdb_genres' in fetched:
            for genre in fetched['imdb_genres']:
                (gnode, _) = models.Genre.objects.get_or_create(name=genre)
                gnode.save()
                imdb.genres.add(gnode)

        if 'imdb_directors' in fetched:
            for director in fetched['imdb_directors']:
                (dnode, _) = models.Director.objects.get_or_create(name=director)
                dnode.save()
                imdb.directors.add(dnode)

        if 'imdb_actors' in fetched:
            # Entries are (actor, role) pairs; bill_pos is the 1-based
            # position of the pair in the fetched list.
            for pos, (actor, role) in enumerate(fetched['imdb_actors']):
                (anode, _) = models.Actor.objects.get_or_create(name=actor)
                anode.save()
                (rnode, _) = models.Role.objects.get_or_create(actor=anode,
                                                               imdb=imdb,
                                                               role=role,
                                                               bill_pos=pos+1)
                rnode.save()


        # TODO Handle
        # imdb_releasedate
        # imdb_writers
        # imdb_tagline

        if 'imdb_cover_uri' in fetched:
            # The local file name is built from the IMDb id, the cover width
            # and the extension of the cover URI.
            cover_uri = fetched['imdb_cover_uri']
            cover_width = fetched['imdb_cover_width']
            _, ext = os.path.splitext(cover_uri)
            saved_name = "%s_%s%s" %  (imdb_id, cover_width, ext)

            # Path relative to MEDIA_ROOT (stored on the model) and its
            # absolute counterpart (used for file access).
            storage_path = os.path.join(imdb.thumb_image.field.upload_to, saved_name)
            storage_abs_path = os.path.join(settings.MEDIA_ROOT, storage_path)

            try:
                if is_valid_image(storage_abs_path):
                    logging.info("IMDb thumb image already exists.")
                    imdb.thumb_image = storage_path
                    # Force Django to read the image width and height.
                    # Sometimes it tries to be lazy about reading this data,
                    # which can cause PIL-related exceptions during template
                    # rendering.
                    # The ValueError is caught by the handler below and
                    # logged as a cover lookup failure.
                    if imdb.thumb_width == 0 or imdb.thumb_height == 0:
                        raise ValueError("Invalid image width and height")
                else:
                    # Download the cover to the target path and check that
                    # the file was written where it was requested.
                    saved_name, _ = urllib.urlretrieve(cover_uri, storage_abs_path)
                    assert (os.path.realpath(saved_name) ==
                            os.path.realpath(storage_abs_path))
                    if is_valid_image(saved_name):
                        # Store the source URI used
                        imdb.thumb_uri = cover_uri
                        imdb.thumb_image = storage_path
                        # thumb_width and thumb_height are filled automatically
            except Exception, e:
                # A cover failure is logged and the rest of the update
                # continues.
                logging.error("Couldn't lookup IMDb cover from given URI: %s" %
                              fetched['imdb_cover_uri'])
                # NOTE(review): the first positional parameter of
                # traceback.print_exc is `limit`, so `e` is passed as the
                # limit here; traceback.print_exc() was probably intended.
                traceback.print_exc(e)
            else:
                logging.info("Fetched thumbnail from: %s" %
                             fetched['imdb_cover_uri'])

        if 'imdb_outline' in fetched:
            imdb.plot_outline = fetched['imdb_outline']

        if 'imdb_runtime' in fetched:
            imdb.length = fetched['imdb_runtime']

        if 'imdb_rating' in fetched:
            imdb.rating = fetched['imdb_rating']

        # Disabled code path kept from the search-based lookup
        # (imdb_metadata_search):
        # fetched = imdb_metadata_search(node)
        # if not fetched:
        #     logging.error("Couldn't lookup IMDB metadata for '%s'" % node)
        #     return False
        # # If success, guaranteed to have the imdb_id, imdb_uri, and imdb_canonical_title
        # imdb.imdb_id = fetched['imdb_id']
        # imdb.imdb_uri = fetched['imdb_uri']
        # imdb.imdb_canonical_title = fetched['imdb_canonical_title']
        # imdb.save()

        #     # Might have these as well. Don't want to overwrite existing stuff.
        # if 'imdb_year' in fetched:
        #     imdb.release_year = fetched['imdb_year']
        # if 'imdb_thumb_uri' in fetched:
        #     imdb.thumb_uri = fetched['imdb_thumb_uri']
        #     imdb.thumb_uri_width = fetched.get('imdb_thumb_width', 0)
        #     imdb.thumb_uri_height = fetched.get('imdb_thumb_height', 0)

        # if 'imdb_genres' in fetched:
        #     for genre in fetched['imdb_genres']:
        #         (gnode, _) = models.Genre.objects.get_or_create(name=genre)
        #         gnode.save()
        #         imdb.genres.add(gnode)

        # Persist the scraped fields and link the record to the node.
        imdb.save()
        node.metadata.imdb = imdb
        node.metadata.save()
        node.save()
        return True
Exemple #5
0
        extras = {}

        link = result_node.findChild('a')
        if link is None:
            logging.error("Could not get link node of result for '%s', skipping." % name)
            continue

        extras['imdb_uri'] = imdb_uri = link.get('href')
        imdb_id_match = re.match('/title/(?P<imdb_id>tt[0-9]+)/*', imdb_uri)
        if not imdb_id_match:
            continue

        extras['imdb_id'] = imdb_id_match.groupdict()['imdb_id']

        imdb_name = link.get('title')
        imdb_title, imdb_year = common.detect_title_year(imdb_name)
        imdb_title = imdb_title.encode('utf-8')

        extras['imdb_canonical_title'] = imdb_name
        extras['imdb_title'] = imdb_name
        if imdb_year is not None:
            extras['imdb_year'] = imdb_year

        if not common.title_match(title, imdb_title):
            logging.info("Skipping IMDB title '%s' because it didn't match '%s'" % (imdb_title, title))
            continue

        thumb_node = result_node.findChild('td', attrs={'class':'image'})
        thumb_image = thumb_node.findChild('img') if thumb_node is not None else None
        if thumb_image:
            extras['imdb_thumb_uri'] = thumb_image.get('src')
def lookup_metacritic_metadata(content):
    """Search Metacritic for ``content`` and describe the first matching hit.

    The search URL is chosen from ``content.kind`` (movie search for movies,
    TV search for series, TV and season kinds). Results are examined in page
    order; the first one whose title matches according to
    ``common.title_match`` and whose link yields a Metacritic id is returned.

    Args:
        content: object exposing ``simple_name()`` and ``kind``.

    Returns:
        A dict with ``mc_uri`` and ``mc_id`` and, when the result shows a
        metascore, ``mc_status`` plus ``mc_score`` (only if the score text is
        an integer). None when the search page could not be fetched or no
        result matched.

    Raises:
        KeyError: if ``content.kind`` is not one of the supported kinds.
    """
    metadata = {}
    name = content.simple_name()
    title, _year = common.detect_title_year(name)

    url_kind_map = { models.KIND_MOVIE: 'http://www.metacritic.com/search/movie/%s/results',
                     models.KIND_SERIES: 'http://www.metacritic.com/search/tv/%s/results',
                     models.KIND_TV: 'http://www.metacritic.com/search/tv/%s/results',
                     models.KIND_SEASON: 'http://www.metacritic.com/search/tv/%s/results' }

    url = url_kind_map[content.kind]

    # Remove special characters that the regular metacritic search seems to
    # remove anyway, then join the remaining words with '+'.
    title_utf8 = title.encode('utf-8')
    title_stripped = re.sub(r'[!@#$%^&*();.,?]', '', title_utf8).strip()
    title_stripped = re.sub(r'[:\-\s]', '+', title_stripped)

    url = url % title_stripped
    logging.info("Trying to search: %s" % url)
    _, page = common.get_page(url)

    if not page:
        logging.error("Couldn't get metacritic page for '%s'" % content)
        return None

    doc = B(page)

    # Status names in the order they are tested against the score element's
    # CSS class ("score_<status>").
    status_names = ('outstanding', 'favorable', 'mixed',
                    'unfavorable', 'terrible', 'tbd')

    # Get results
    results = doc.findAll('li', attrs={'class': re.compile('result')})

    for result in results:
        title_node = result.findChild('h3', attrs={'class': re.compile('product_title')})
        title_link = title_node.findChild('a') if title_node else None
        mc_title = title_link.string if title_link else None

        if not title_link or not mc_title:
            logging.warning("Could't find MC title link for result.")
            continue

        mc_title = mc_title.strip()

        if not common.title_match(title, mc_title):
            # Lazy %-args: the logging module formats the message itself and
            # reports formatting errors through its own error handling, so
            # no guard is needed around this call.
            logging.warning(u"Skipping MC title '%s' because it didn't "
                            "match '%s'", mc_title, title)
            continue

        logging.info("Found a matching title, '%s' for '%s'" % (mc_title, title))

        mc_url = title_link.get('href')
        id_match = re.match('/(?P<type>movie|tv)/(?P<mc_id>.*)', mc_url)
        if not id_match:
            logging.warning("Could't find MC id from link '%s'." % mc_url)
            continue

        metadata['mc_uri'] = mc_url
        metadata['mc_id'] = id_match.group('mc_id')

        metascore_node = result.findChild('span', attrs={'class': re.compile('metascore')})
        metascore = metascore_node.string if metascore_node else None

        if metascore:
            metascore_class = metascore_node.get('class') or ''
            score = 'unknown'
            for status in status_names:
                if 'score_' + status in metascore_class:
                    score = status
                    break

            metadata['mc_status'] = score

            # The score text is not always numeric, so a failed conversion
            # only drops 'mc_score'.
            try:
                metadata['mc_score'] = int(metascore)
            except (TypeError, ValueError):
                logging.error("Couldn't convert metascore '%s' to integer." % metascore)

        return metadata

    return None