Beispiel #1
0
def showlist(limiter=None):
    """ Return the list of information about all TV shows offered by Hulu.
        @param limiter -- int -- The maximum number of shows to extract.
        @return [[String]]
    """
    HULU = "https://www.hulu.com/start/more_content?&page=1&sort=alpha&video_type=tv"
    shows = list()  # List of show information offered by Hulu.
    soup = scraping.brewsoup(HULU)  # BeautifulSoup4 object from HULU url.
    # Total number of result pages to scrape, read from the first page.
    total = int(soup.find(class_="total").get_text())
    for page in range(1, 1 + total):
        # Distinct names for the listing-page url and each show's url --
        # the original reused `url` for both, which was confusing.
        page_url = HULU.replace("page=1", "page={}".format(page))
        soup = scraping.brewsoup(page_url)
        table = soup.find("table")
        for row in table.find_all("tr"):
            for cell in row.find_all("td"):
                title = cell.find(class_="channel-results-show")
                if title:  # Some td cells are empty. Must avoid scraping them.
                    name = title.find(class_="beaconid")
                    show_url = name["href"].replace("hulu.com/",
                                                    "hulu.com/shows/info/")
                    shows.append(showinfo(show_url))
                    if limiter and limiter <= len(shows):
                        return shows  # For testing small number of extractions
        print(len(shows))  # Visual feedback so I know computer is not frozen.
    return shows
Beispiel #2
0
def showinfo(url):
    """ Return show info corresponding to the show's url.
        @param url -- String -- the url of the show.
        @return [String]
        Format of returned list:
            [Name, Description, Episodes/Seasons, "ACTORS", actors...]
    """
    soup = scraping.brewsoup(url)
    name = soup.find(class_="hero__headline").get_text().strip()
    desc = soup.find("p", class_="block-container__copy").get_text().strip()
    counts = soup.find(
        "h3", class_="section-header section-header--border").get_text().strip()
    # "ACTORS" marks where the cast list begins in the flat result list.
    info = [name, desc, counts, "ACTORS"]
    cast = soup.find(class_="slider slider--cast js-slider")
    if cast:  # Some shows have no cast/characters section at all.
        info.extend(actor.get_text().strip()
                    for actor in cast.find_all(class_="promo__copy"))
    return info
Beispiel #3
0
def showinfo(url):
    """ Return show info corresponding to the show's url.
        @param url -- String -- the url of the show.
        @return [String]
    """
    soup = scraping.brewsoup(url)
    sidebar = soup.find(id="sidebar_elements").find_all(
        "li", class_="large-margin-bottom")
    desc = soup.find(class_="description")
    extra = {"Name": soup.find("h1", class_="ellipsis").get_text().strip()}
    # Long descriptions are tucked into a nested "more" element.
    more = desc.find(class_="more")
    extra["Desc"] = (more if more else desc).get_text().strip()
    # The last sidebar section holds "Key: Value" rows; normalize whitespace.
    for item in sidebar[-1].find_all("li"):
        key, _, val = item.get_text().partition(":")
        extra[key.strip()] = " ".join(val.strip().split())
    return [
        extra.get("Name"),       # Name of Show
        extra.get("Tags"),       # Tags associated with show
        extra.get("Videos"),     # Number of episodes/videos
        extra.get("Year"),       # Year of airing
        extra.get("Publisher"),  # Publisher
        extra.get("Desc"),       # Description of show
    ]
Beispiel #4
0
def showinfo(url):
    """ Return show info corresponding to the show's url.
        @param url -- String -- the url of the show.
        @return [String]
    """
    soup = scraping.brewsoup(url)
    # The page body is a Python-dict-like literal; capitalize the JSON
    # booleans so ast.literal_eval can parse it.
    # NOTE(review): this blanket replace would also mangle "true"/"false"
    # occurring inside string values -- verify against real pages.
    text = soup.get_text()
    for old, new in (("false", "False"), ("true", "True")):
        text = text.replace(old, new)
    info = ast.literal_eval(text)
    return [
        info["name"],            # Name of Show
        info["channel"],         # Genre of Show
        info["seasons_count"],   # Number of Seasons
        info["episodes_count"],  # Number of Episodes
        info["description"],    # Hulu's Description
    ]
Beispiel #5
0
def showlist(limiter=None):
    """ Return the list of information about all TV shows offered by Showtime.
        @param limiter -- int -- The maximum number of shows to extract.
        @return [[String]]
    """
    STIME = "http://www.sho.com"
    shows = []  # One info list per show.
    soup = scraping.brewsoup("{}/series".format(STIME))  # Page of all shows.
    grid = soup.find(class_="section section--gradient section--pad-more")
    for poster in grid.find_all(class_="promo__link"):
        if limiter and limiter <= len(shows):
            return shows  # Cap the extraction count (handy for testing).
        shows.append(showinfo("{0}{1}".format(STIME, poster["href"])))
    return shows
Beispiel #6
0
def showlist(limiter=None):
    """ Return the list of information about all TV shows offered by Crunchyroll.
        @param limiter -- int -- the maximum number of shows to extract
        @return [[String]]
    """
    CROLL = "http://www.crunchyroll.com/videos/anime/alpha?group=all"
    shows = []  # List of show information offered by Crunchyroll.
    soup = scraping.brewsoup(CROLL)  # BeautifulSoup4 object for the index.
    container = soup.find(class_="videos-column-container cf")
    for column in container.find_all(class_="videos-column left"):
        for listing in column.find_all("ul"):
            for entry in listing.find_all("li"):
                link = entry.find("a")["href"]
                info = showinfo("http://www.crunchyroll.com{}".format(link))
                shows.append(info)
                if limiter and limiter <= len(shows):
                    return shows  # For testing small number extractions
                # Progress feedback so I know the computer is not frozen.
                print(info[0].encode("utf-8"))
    return shows