Ejemplo n.º 1
0
def lookup_nyt_review(content):
    """Find the URL of the New York Times review for a piece of content.

    Runs a query on the NYT movie search page using the title detected from
    ``content.simple_name()`` and scans the entertainment results for the
    first entry whose title matches and that carries an
    "N.Y.Times Review" link.

    Args:
        content: object exposing ``simple_name()``; the name is UTF-8
            encoded before the title is detected from it.

    Returns:
        The absolute review URL as a string, or None when the search page
        could not be fetched or no matching result has a review link.
    """
    name = content.simple_name().encode('utf-8')
    # The detected year is not used to filter NYT results.
    title, _ = common.detect_title_year(name)

    url = 'http://movies.nytimes.com/gst/movies/msearch.html?%s'
    url = url % urllib.urlencode({'query': title})
    _, page = common.get_page(url)

    if not page:
        logging.error("Couldn't get NYT search page for '%s'" % content)
        return None

    doc = B(page)

    # Walk div#entertainment_results > ol > li, tolerating a missing level.
    entertainment_results = doc.findChild(
        'div', attrs={'id': 'entertainment_results'})
    results_container = (entertainment_results.findChild('ol')
                         if entertainment_results else None)
    results = (results_container.findChildren('li', recursive=False)
               if results_container else [])

    for result in results:
        title_header = result.findChild('h3')
        title_link = title_header.findChild('a') if title_header else None
        nyt_title = title_link.string if title_link else None

        if not nyt_title:
            logging.warning("Couldn't find title node for '%s'" % title)
            continue

        # Replace non-breaking spaces with plain ones and work on UTF-8
        # bytes, like the searched title, before detecting title/year.
        nyt_title = nyt_title.replace(u'\xa0', ' ')
        nyt_title = nyt_title.encode('utf-8')

        nyt_title, _ = common.detect_title_year(nyt_title)

        if not common.title_match(title, nyt_title):
            try:
                logging.warning(
                    "Skipping NYT title '%s' because it didn't match '%s'" %
                    (nyt_title, title))
            except Exception:
                # Logging is best-effort: a formatting failure must never
                # stop the lookup (and must never drop into a debugger).
                logging.exception(
                    "Couldn't log skipped NYT title %r", nyt_title)
            continue

        extra_links = result.findChild('ul')
        if extra_links:
            for link in extra_links.findChildren('a'):
                if link.string == "N.Y.Times Review":
                    return 'http://movies.nytimes.com%s' % link.get('href')

    return None
Ejemplo n.º 2
0
def rottentomatoes_find_id(title, year=None, imdb_id=None):
    """Search Rotten Tomatoes for a movie and return its RT identifier.

    Fetches the RT movie search page for ``title`` and returns the ID of the
    first result whose title (or one of the AKAs RT embeds in trailing
    parentheses) matches according to ``common.title_match`` and, when
    ``year`` is given, whose listed year equals it.

    Args:
        title: title to search for; it is encoded to latin1 for the query,
            so it must be representable in that encoding.
        year: optional year, compared for equality with the year text
            scraped from the result (spaces and parentheses stripped).
            NOTE(review): presumably a string; verify against callers.
        imdb_id: accepted but not used by this function.

    Returns:
        The ``id`` group captured by ``rottentomatoes_id_pattern`` from the
        matching result link, or None when nothing matched or any error
        occurred while fetching or parsing (errors are logged, not raised).
    """
    # Find the content by search.
    url = u"http://www.rottentomatoes.com/search/movie.php?%s"
    title_latin1 = title.encode('latin1')
    data = {'searchby': 'movies',
            'search': title_latin1}

    # Trailing "(...)" group holding one AKA at the end of a result title.
    aka_pattern = re.compile(r"\(([^\(\)]+)\)$")

    try:
        url = url % urllib.urlencode(data)
        logging.info("Executing RT regular search for '%s' at '%s'" % (title, url))
        _, page = common.get_page(url)

        # BeautifulSoup can't handle hex entities. Massage them into decimal,
        # in addition to the library's default massage rules.
        hexentityMassage = copy.copy(B.MARKUP_MASSAGE)
        hexentityMassage.append((re.compile('&#x([^;]+);'),
                                 lambda m: '&#%d;' % int(m.group(1), 16)))

        document = B(page, convertEntities=B.HTML_ENTITIES,
                     markupMassage=hexentityMassage)

        results_ul = document.findChild('ul', attrs={'id': re.compile('movie_results_ul')})
        results = (results_ul.findAll('li', attrs={'class': re.compile('media_block')})
                   if results_ul else None)

        if results is None:
            logging.error("Couldn't lookup RT ID for '%s (%s)'" % (title, year))
            return None

        for result_node in results:
            # Scope in on the content div, because otherwise we get the poster
            # image. A result without that div is skipped like one without a
            # usable link.
            content_div = result_node.findChild(
                'div', attrs={'class': re.compile('media_block_content')})
            link = (content_div.findChild('a', attrs={'href': rottentomatoes_id_pattern})
                    if content_div else None)

            link_title = link.string if link else None
            if not link_title:
                logging.error("Couldn't find RT result link title. Skipping")
                continue

            # Try the original title first.
            titles = [link_title]

            # Rotten Tomatoes annoyingly embeds the AKAs in the title in parens following the head title.
            # For example:
            # - Batoru rowaiaru II: Chinkonka (Battle Royale II)
            # - Battle Royale (Batoru Rowaiaru)
            endparen_match = aka_pattern.search(link_title)

            while endparen_match:
                titles.append(endparen_match.groups()[0])
                # Strip out the ending (title) and any spaces before it.
                link_title = re.sub(r"\s*\(([^\(\)]+)\)$", '', link_title)
                endparen_match = aka_pattern.search(link_title)

                # Add the final version of the title with the AKAs removed to
                # the title list.
                if not endparen_match:
                    titles.append(link_title)

            found_title = None
            for aka in titles:
                if common.title_match(title, aka):
                    logging.info("Found RT title match '%s' for '%s'" % (aka, title))
                    found_title = aka
                    break
                try:
                    logging.warning(u"Skipping RT title '%s' because it didn't match '%s'" % (aka, title))
                except Exception:
                    # Best-effort logging only; keep trying the other AKAs.
                    traceback.print_exc()

            if not found_title:
                continue

            span_year = result_node.findChild('span', attrs={'class': re.compile('movie_year')})
            # The year is optional in the markup; only strip it when present
            # so that a missing year cannot abort the whole search.
            link_year = (unicode(span_year.string).strip(' ()')
                         if span_year and span_year.string else None)

            if year and link_year != year:
                logging.info("Link '%s's year '%s' doesn't match '%s'." %
                             (link_title, link_year, year))
                continue

            # Get RT ID. Checked explicitly rather than with an assert, which
            # would be stripped when running under -O.
            link_match = rottentomatoes_id_pattern.match(link.get('href'))
            if not link_match:
                logging.error("Couldn't find RT result link title. Skipping")
                continue
            return link_match.groupdict()['id']
    except Exception:
        # Any fetch/parse failure is reported and turned into a None result.
        traceback.print_exc()
        logging.error("Couldn't lookup RT ID for '%s (%s)'" % (title, year))

    return None
Ejemplo n.º 3
0
        imdb_id_match = re.match('/title/(?P<imdb_id>tt[0-9]+)/*', imdb_uri)
        if not imdb_id_match:
            continue

        extras['imdb_id'] = imdb_id_match.groupdict()['imdb_id']

        imdb_name = link.get('title')
        imdb_title, imdb_year = common.detect_title_year(imdb_name)
        imdb_title = imdb_title.encode('utf-8')

        extras['imdb_canonical_title'] = imdb_name
        extras['imdb_title'] = imdb_name
        if imdb_year is not None:
            extras['imdb_year'] = imdb_year

        if not common.title_match(title, imdb_title):
            logging.info("Skipping IMDB title '%s' because it didn't match '%s'" % (imdb_title, title))
            continue

        thumb_node = result_node.findChild('td', attrs={'class':'image'})
        thumb_image = thumb_node.findChild('img') if thumb_node is not None else None
        if thumb_image:
            extras['imdb_thumb_uri'] = thumb_image.get('src')
            extras['imdb_thumb_width'] = thumb_image.get('width')
            extras['imdb_thumb_height'] = thumb_image.get('height')

        runtime_node = result_node.findChild('span', attrs={'class': 'runtime'})
        if runtime_node:
            runtime_match = re.match("(?P<length>\d+) mins.", runtime_node.string)
            if runtime_match:
                extras['imdb_length'] = int(runtime_match.groupdict()['length'])
Ejemplo n.º 4
0
def imdb_find_id(title, year=None):
    """Search IMDb for a title and return its IMDb identifier.

    If IMDb redirects the search straight to a title page (the result URL
    matches ``imdb_title_pattern``), that page's ID is returned. Otherwise
    the result links of the form ``/title/tt<7 digits>/`` are scanned for the
    first one whose text matches ``title`` and which is followed by a
    "(YYYY...)" year; when ``year`` is given the years must be equal.

    Args:
        title: UTF-8 encoded title; it is decoded and then re-encoded to
            latin1 for the query, so it must be representable in latin1.
        year: optional year compared for equality with the four-digit year
            string parsed from the page.
            NOTE(review): presumably a string; verify against callers.

    Returns:
        The IMDb ID, or None when nothing matched or an error occurred
        (errors are logged, not raised).
        NOTE(review): the search path returns only the seven digits (no
        "tt" prefix); whether the redirect path includes the prefix depends
        on ``imdb_title_pattern``, which is defined elsewhere - confirm both
        paths agree.
    """
    title = title.decode('utf8')

    url = u'http://www.imdb.com/find?%s'
    data = {'s': 'tt',
            'q': title.encode('latin1')}

    try:
        url = url % urllib.urlencode(data)
        logging.info("Executing IMDB regular search for '%s' at '%s'" % (title, url))
        result_url, page = common.get_page(url)

        result_url = result_url.replace('http://www.imdb.com', '')
        result_url_match = imdb_title_pattern.match(result_url)
        if result_url_match:
            # IMDb saw fit to redirect us to the thing we searched for. Let's
            # trust them?
            logging.info("IMDb redirected us to '%s', trusting them." % result_url)
            return result_url_match.groupdict()['imdb_id']

        # BeautifulSoup can't handle hex entities. Massage them into decimal,
        # in addition to the library's default massage rules.
        hexentityMassage = copy.copy(B.MARKUP_MASSAGE)
        hexentityMassage.append((re.compile('&#x([^;]+);'),
                                 lambda m: '&#%d;' % int(m.group(1), 16)))

        document = B(page, convertEntities=B.HTML_ENTITIES,
                     markupMassage=hexentityMassage)

        links = document.findAll('a', attrs={'href': re.compile(r'^/title/tt\d{7}/$')})
        for link in links:
            link_title = link.string
            if not link_title:
                continue

            if not common.title_match(title, link_title):
                logging.info("Skipping IMDB link title '%s' because it didn't match '%s'" % (link_title, title))
                continue

            # The year is expected as plain text right after the link; skip
            # links followed by another tag or by nothing.
            link_year = link.nextSibling

            if not isinstance(link_year, basestring):
                continue

            link_year_match = re.match(r'\((?P<year>\d{4}).*?\)', link_year.strip())
            link_year = link_year_match.groupdict()['year'] if link_year_match else None

            if not link_year:
                continue

            if year and link_year != year:
                logging.info("Link '%s's year '%s' doesn't match '%s'." % (link_title, link_year, year))
                continue

            imdb_url = link.get('href')
            imdb_match = re.match(r'^/title/tt(?P<imdb_id>\d{7})/', imdb_url)
            logging.info("Found match for '%s (%s)': '%s (%s)'" % (title, year, link_title, link_year))
            # We know this because the nodes were selected with this regex.
            assert imdb_match
            return imdb_match.groupdict()['imdb_id']
        logging.error("Found no matches for '%s'" % title)
    except Exception:
        # Any fetch/parse failure is reported and turned into a None result.
        logging.error("Couldn't get IMDB regular search for '%s'" % title)
        traceback.print_exc()

    return None
Ejemplo n.º 5
0
def lookup_metacritic_metadata(content):
    """Look up Metacritic metadata for a piece of content.

    Builds a Metacritic search URL from ``content.kind`` and the title
    detected from ``content.simple_name()``, then returns the metadata of
    the first search result whose title matches.

    Args:
        content: object exposing ``simple_name()`` and ``kind``; ``kind``
            must be one of the ``models.KIND_*`` values mapped below.

    Returns:
        A dict with ``mc_uri`` and ``mc_id`` and, when a metascore is shown,
        ``mc_status`` plus ``mc_score`` (only if the score is an integer).
        None when the page could not be fetched or no result matched.

    Raises:
        KeyError: if ``content.kind`` has no search URL mapping.
    """
    metadata = {}
    name = content.simple_name()
    # The detected year is not used to filter Metacritic results.
    title, _ = common.detect_title_year(name)

    tv_search_url = 'http://www.metacritic.com/search/tv/%s/results'
    url_kind_map = {
        models.KIND_MOVIE: 'http://www.metacritic.com/search/movie/%s/results',
        models.KIND_SERIES: tv_search_url,
        models.KIND_TV: tv_search_url,
        models.KIND_SEASON: tv_search_url,
    }

    url = url_kind_map[content.kind]

    # Remove special characters that the regular metacritic search seems to
    # remove anyway, then join the remaining words with '+'.
    title_utf8 = title.encode('utf-8')
    title_stripped = re.sub('[!@#$%^&*();.,?]', '', title_utf8).strip()
    title_stripped = re.sub(r'[:\-\s]', '+', title_stripped)

    url = url % title_stripped
    logging.info("Trying to search: %s" % url)
    _, page = common.get_page(url)

    if not page:
        logging.error("Couldn't get metacritic page for '%s'" % content)
        return None

    doc = B(page)

    # Get results
    results = doc.findAll('li', attrs={'class': re.compile('result')})

    for result in results:
        title_node = result.findChild('h3', attrs={'class': re.compile('product_title')})
        title_link = title_node.findChild('a') if title_node else None
        mc_title = title_link.string if title_link else None

        if not title_link or not mc_title:
            logging.warning("Could't find MC title link for result.")
            continue

        mc_title = mc_title.strip()

        if not common.title_match(title, mc_title):
            try:
                logging.warning(u"Skipping MC title '%s' because it didn't "
                                "match '%s'" % (mc_title, title))
            except Exception:
                # Best-effort logging only; keep scanning the results.
                traceback.print_exc()
            continue

        logging.info("Found a matching title, '%s' for '%s'" % (mc_title, title))

        mc_url = title_link.get('href')
        id_match = re.match('/(?P<type>movie|tv)/(?P<mc_id>.*)', mc_url)
        if not id_match:
            logging.warning("Could't find MC id from link '%s'." % mc_url)
            continue

        metadata['mc_uri'] = mc_url
        metadata['mc_id'] = id_match.groupdict()['mc_id']

        metascore_node = result.findChild('span', attrs={'class': re.compile('metascore')})
        metascore = metascore_node.string if metascore_node else None

        if metascore:
            # The status is encoded as a 'score_<status>' token in the node's
            # class attribute; the first token found (in this order) wins.
            metascore_class = metascore_node.get('class')
            score = 'unknown'
            for status in ('outstanding', 'favorable', 'mixed',
                           'unfavorable', 'terrible', 'tbd'):
                if 'score_' + status in metascore_class:
                    score = status
                    break

            metadata['mc_status'] = score

            try:
                metadata['mc_score'] = int(metascore)
            except (TypeError, ValueError):
                # Non-numeric score text: keep the status, omit the score.
                logging.error("Couldn't convert metascore '%s' to integer." % metascore)

        # Only the first matching result is used.
        return metadata

    return None