def showlist(limiter=None):
    """Return the list of information about all TV shows offered by Hulu.

    @param limiter -- int -- The maximum number of shows to extract.
    @return [[String]]
    """
    HULU = "https://www.hulu.com/start/more_content?&page=1&sort=alpha&video_type=tv"
    results = []  # Accumulated show-info lists.
    first = scraping.brewsoup(HULU)  # BeautifulSoup4 object for page 1.
    pages = int(first.find(class_="total").get_text())  # Total pages to scrape.
    for num in range(1, pages + 1):
        page_soup = scraping.brewsoup(HULU.replace("page=1", "page={}".format(num)))
        grid = page_soup.find("table")
        for row in grid.find_all("tr"):
            for cell in row.find_all("td"):
                entry = cell.find(class_="channel-results-show")
                if not entry:
                    continue  # Some td cells are empty; skip them.
                beacon = entry.find(class_="beaconid")
                show_url = beacon["href"].replace("hulu.com/", "hulu.com/shows/info/")
                results.append(showinfo(show_url))
                if limiter and limiter <= len(results):
                    return results  # For testing small number of extractions
        print(len(results))  # Visual feedback so I know computer is not frozen.
    return results
def showinfo(url):
    """Return show info corresponding to the show's url.

    @param url -- String -- the url of the show.
    @return [String]
    Format of returned list:
        [Name, Description, Episodes/Seasons, "ACTORS", actors...]
    Example:
        ["Spongebob", "Square man lives on wild side."
         "Episodes: 10, Seasons: 3", "ACTORS",
         "Will Smith", "Daniel Z", "Oprah Winfrey"]
    """
    soup = scraping.brewsoup(url)
    details = [
        soup.find(class_="hero__headline").get_text().strip(),  # Name
        soup.find("p", class_="block-container__copy").get_text().strip(),  # Description
        soup.find("h3", class_="section-header section-header--border").get_text().strip(),  # Episodes/Seasons
        "ACTORS",  # Marker separating the actors from the fields above.
    ]
    cast = soup.find(class_="slider slider--cast js-slider")  # Cast/characters strip.
    if cast:  # Some shows did not have any cast or characters.
        details.extend(
            actor.get_text().strip()
            for actor in cast.find_all(class_="promo__copy"))
    return details
def showinfo(url):
    """Return show info corresponding to the show's url.

    @param url -- String -- the url of the show.
    @return [String]
    """
    soup = scraping.brewsoup(url)
    sidebar = soup.find(id="sidebar_elements").find_all(
        "li", class_="large-margin-bottom")
    blurb = soup.find(class_="description")
    fields = dict()  # Extra information keyed by its sidebar label.
    fields["Name"] = soup.find("h1", class_="ellipsis").get_text().strip()
    more = blurb.find(class_="more")
    # Prefer the expanded "more" description when the page provides one.
    fields["Desc"] = (more if more else blurb).get_text().strip()
    for item in sidebar[-1].find_all("li"):
        # Split on the first ":" only; the value may itself contain colons.
        label, _, value = item.get_text().partition(":")
        fields[label.strip()] = " ".join(value.strip().split())
    return [
        fields.get("Name"),       # Name of Show
        fields.get("Tags"),       # Tags associated with show
        fields.get("Videos"),     # Number of episodes/videos
        fields.get("Year"),       # Year of airing
        fields.get("Publisher"),  # Publisher
        fields.get("Desc"),       # Description of show
    ]
def showinfo(url):
    """Return show info corresponding to the show's url.

    @param url -- String -- the url of the show.
    @return [String]
    """
    import json  # Local import: only this helper parses the JSON payload.

    soup = scraping.brewsoup(url)
    # Hulu serves the show info as a JSON document (the lowercase
    # "true"/"false" tokens the old code patched over are JSON booleans).
    # Parse it with json.loads instead of text.replace("false", "False") +
    # ast.literal_eval: json handles true/false/null natively and cannot
    # corrupt a description that happens to contain the word "true" or
    # "false", which the blanket replace() silently mangled.
    info = json.loads(soup.get_text())
    return [
        info["name"],            # Name of Show
        info["channel"],         # Genre of Show
        info["seasons_count"],   # Number of Seasons
        info["episodes_count"],  # Number of Episodes
        info["description"],     # Hulu's Description
    ]
def showlist(limiter=None):
    """Return the list of information about all TV shows offered by Showtime.

    @param limiter -- int -- The maximum number of shows to extract.
    @return [[String]]
    """
    STIME = "http://www.sho.com"
    collected = []  # One show-info list per show.
    index = scraping.brewsoup("{}/series".format(STIME))  # Page of all shows.
    grid = index.find(
        class_="section section--gradient section--pad-more")  # Table of all shows.
    for poster in grid.find_all(class_="promo__link"):
        if limiter and limiter <= len(collected):
            return collected  # Limit number of shows for testing purposes.
        collected.append(showinfo("{0}{1}".format(STIME, poster["href"])))
    return collected
def showlist(limiter=None):
    """Return the list of information about all TV shows offered by Crunchyroll.

    @param limiter -- int -- the maximum number of shows to extract
    @return [[String]]
    """
    CROLL = "http://www.crunchyroll.com/videos/anime/alpha?group=all"
    shows = list()  # List of show information offered by Crunchyroll.
    soup = scraping.brewsoup(CROLL)  # BeautifulSoup4 object from Crunchyroll url.
    table = soup.find(class_="videos-column-container cf")
    for column in table.find_all(class_="videos-column left"):
        for cell in column.find_all("ul"):
            for show in cell.find_all("li"):
                url = "http://www.crunchyroll.com{}".format(
                    show.find("a")["href"])
                shows.append(showinfo(url))
                if limiter and limiter <= len(shows):
                    return shows  # For testing small number extractions
        # Visual feedback so I know computer is not frozen.  Print the running
        # count (same style as the Hulu scraper) instead of
        # info[0].encode("utf-8"): the old form raised NameError when a column
        # yielded no shows, and printed a b'...' bytes repr under Python 3.
        print(len(shows))
    return shows