コード例 #1
0
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Every element with the class "titleColumn" is searched for its first
    <a> element, and that anchor's href is resolved against the address of
    the index page. Cells that contain no anchor, or whose anchor has no
    href, are skipped instead of contributing a bogus entry.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the absolute URL (scheme,
        domain and path) of a movie's page on IMDB, in the order the
        matching cells were returned by the DOM lookup.
    """
    movie_urls = []
    dom = DOM(url.download(cached=True))

    for cell in dom.get_elements_by_classname("titleColumn"):
        # Find the link by tag rather than by child position: the position
        # depends on how many text/whitespace nodes precede the anchor.
        anchors = cell.get_elements_by_tagname("a")
        if not anchors:
            continue

        href = anchors[0].attrs.get("href", "")
        if not href:
            # An empty href would presumably resolve to the index page
            # itself, which is not a movie page.
            continue

        # `abs` is called with a `base` keyword, so it must be the
        # URL-resolving helper imported by this file, not the builtin.
        # The redirected address is preferred when one is reported.
        movie_urls.append(abs(href, base=url.redirect or url.string))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls