# NOTE(review): this text references `cast_temp`, `operator`, `r` and `soup`
# without defining/importing the first three here -- presumably they are set
# up elsewhere in the module; confirm before running.


def get_cast(element):
    """Count how often each parent node holds a table/div/td tag and rank them.

    For every direct child of ``element`` whose tag name is one of the
    container tags listed below, every ``table``/``div``/``td`` tag found in
    the module-level ``soup`` bumps the counter stored in the module-level
    ``cast_temp`` under ``str(tag.parent)``.

    Args:
        element: Parsed node exposing ``name`` and ``children``.

    Returns:
        The ``(parent_markup, count)`` pairs of ``cast_temp`` as a list sorted
        by ascending count, so the most frequent parent is the last entry.
    """
    container_tags = ["table", "div", "td", "ul", "span", "section"]
    if element.name is not None:
        for child in element.children:
            if child.name not in container_tags:
                continue
            for tag in soup.find_all(["table", "div", "td"]):
                # Serialise the parent once; it is used as the counter key.
                parent_key = str(tag.parent)
                cast_temp[parent_key] = cast_temp.get(parent_key, 0) + 1
    return sorted(cast_temp.items(), key=operator.itemgetter(1))


def hasKey(hashMap, key):
    """Return True when ``key`` is present in ``hashMap``, else False."""
    return key in hashMap


# The driver stays at module level (not wrapped in a function) because
# get_cast reads ``soup`` as a module global.
if __name__ == "__main__":
    # Escaped backslash keeps the exact same path without relying on the
    # invalid "\s" escape sequence.
    with open('extractor\\sites.txt') as f:
        lines = f.readlines()

    for site in lines:
        # readlines() keeps the trailing newline; strip it and skip blank rows.
        site = site.strip()
        if not site:
            continue
        soup = r.get_link(site)
        a = get_cast(soup.find("body"))
        print(a[-1])
# Pulls title, creator, rating, cast, characters, synopsis and genres out of
# one fixed IMDb title page and prints most of them.
import reader as r

soup = r.get_link("http://www.imdb.com/title/tt0413573/?ref_=nv_sr_1")

title = soup.find("h1", {"itemprop": "name"}).text
creator = soup.find("span", {"itemprop": "creator"}).text
print(title)

rating = soup.find("span", {"itemprop": "ratingValue"}).text
print(rating)

# Both the actor names and the character cells live in the cast table.
cast_list_table = soup.find("table", {"class": "cast_list"})

cast_table = cast_list_table.find_all("span", {"class": "itemprop"})
for name_span in cast_table:
    actor = name_span.text
    print(actor)

character_list = cast_list_table.find_all("td", {"class": "character"})
for character_cell in character_list:
    actor_name = character_cell.find("div").a.text
    print(actor_name)

resume = soup.find("div", {"itemprop": "description"}).text
print(resume)

genres = soup.find("div", {"itemprop": "genre"}).find_all("a")
genre = [link.text for link in genres]
print(genre)

details = soup.find("div", {"id": "titleDetails"}).find_all("div")
# Collects title, synopsis, genre, cast and critic score from one fixed
# Rotten Tomatoes TV page.
import reader as r

soup = r.get_link("https://www.rottentomatoes.com/tv/lost_in_space/")

title = soup.find("h1", {"class": "title"}).text
resume = soup.find("div", {"id": "movieSynopsis"}).text
# Text of the element that contains the "Genre:" label cell.
genre = soup.find("td", text="Genre:").parent.text

cast = soup.find_all("div", {"class": "cast-item media inlineBlock "})
cast_list = []
for item in cast:
    actor = item.find("div").find("a").text.strip()
    role_text = item.find("span", {"class": "characters subtle smaller"}).text
    # Remove only the first "as " (the label); replacing every occurrence
    # would also eat it from inside names such as "Thomas Anderson".
    character = role_text.replace("as ", "", 1)
    cast_list.append([actor, character])

rate = soup.find("div", {"class": "critic-score meter"}).span.text
# Collects title, overview, creators, cast and first genre from one fixed
# TMDb TV page.
import reader as r

soup = r.get_link("https://www.themoviedb.org/tv/1418-the-big-bang-theory")

title = soup.find("div", {"class": "title"}).text
resume = soup.find("div", {"class": "overview"}).text

creator = soup.find_all("li", {"class": "profile"})
creator_list = [profile.text.strip() for profile in creator]

people_scroller = soup.find("ol", {"class": "people scroller"})
cast = people_scroller.find_all("li")
# Each entry's stripped text is split on newlines into a list of parts.
cast_list = [entry.text.strip().split("\n") for entry in cast]

genres_section = soup.find("section", {"class": "genres right_column"})
genre = genres_section.find("li").text
# Collects title, description and cast entries from one fixed TV Guide page.
import reader as r

soup = r.get_link("http://www.tvguide.com/tvshows/the-flash/644014/")

# Title and description both sit inside the masthead wrapper.
masthead = soup.find(
    "div", {"class": "tvobject-masthead-wrapper content-wrapper"}
)
title = masthead.find("h1").text.strip()
description_box = masthead.find(
    "div", {"class": "tvobject-masthead-description"}
)
resume = description_box.text.strip()

cast_section = soup.find("div", {"data-section-id": "cast"})
cast = cast_section.find("div", {"class": "row"}).find_all("div")
cast_list = [entry.text.strip() for entry in cast]
# Collects title, country, language, genres, description, cast and season
# count from one fixed Trakt show page; prints the genres.
import reader as r

soup = r.get_link("https://trakt.tv/shows/the-big-bang-theory")

title_box = soup.find("div", {
    "class": "col-md-10 col-md-offset-2 col-sm-9 col-sm-offset-3 mobile-title"
})
title = title_box.text
country = soup.find("li", {"itemprop": "countryOfOrigin"}).text

# The label's parent text includes the label itself, so the label is removed.
language = soup.find("label", text="Language").parent.text
language = language.replace("Language", "")

genres_text = soup.find("label", text="Genres").parent.text
genre = genres_text.replace("Genres", "")
print(genre)

description = soup.find("div", {"itemprop": "description"}).text

list_actors = soup.find_all("li", {"itemprop": "actor"})
cast_list = [
    [
        actor_item.find("h4", itemprop="name").text,
        actor_item.find("h4", {"class": "character"}).text,
    ]
    for actor_item in list_actors
]

numberOfSeasons = soup.find("span", {"class": "season-count"}).text